#!/usr/bin/env python3
"""
OCR Profile Test Script

Tests all PDF files against expected_receipts.json and reports PASS/FAIL
for each field.

Usage:
    python scripts/test_all_profiles.py [--pdf FILENAME] [--verbose] [--timeout N]

Options:
    --pdf FILENAME   Test only a specific PDF file
    --verbose        Show detailed output for each field
    --timeout N      Timeout in seconds for OCR (default: 60)
"""

import argparse
import json
import os
import sys
import time
from datetime import datetime, timedelta, timezone
from decimal import Decimal
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

try:
    import requests
    from jose import jwt
except ImportError:
    # Do not exit at import time: the comparison helpers below need neither
    # package, and exiting here would make them impossible to import (for
    # example from a unit test). main() reports the problem instead.
    requests = None
    jwt = None

# Configuration
API_BASE = os.getenv("API_BASE", "http://localhost:8000")
# NOTE(review): the fallback secret is a hard-coded default; it is only used
# when JWT_SECRET_KEY is not set in the environment.
JWT_SECRET = os.getenv("JWT_SECRET_KEY", "GENERATE_NEW_SECRET_FOR_PRODUCTION3334!")
EXPECTED_FILE = "tests/ocr-validation/expected_receipts.json"
PDF_DIR = "docs/data-entry"

# Field groups understood by compare_values().
_NUMERIC_FIELDS = ('total', 'card', 'numerar', 'total_tva')
_CUI_FIELDS = ('cui_furnizor', 'cui_client')
_TEXT_FIELDS = ('furnizor', 'numar_bon', 'data_bon')

# Fields that are reported but never turn a PASS into a FAIL
# (OCR date detection is tricky).
_ADVISORY_FIELDS = ('date',)


def _require_dependencies() -> None:
    """Exit with status 1 and an install hint if requests/python-jose are missing."""
    if requests is None or jwt is None:
        print("Error: Required packages not installed.")
        print("Run: pip install python-jose requests")
        sys.exit(1)


def create_jwt_token() -> str:
    """Create a test JWT token for API authentication.

    The token is signed with JWT_SECRET using HS256 and expires one hour
    after it is issued.
    """
    now = datetime.now(timezone.utc)
    # Valid permissions: read, write, delete, admin, reports, export
    # (from PermissionType enum)
    payload = {
        "username": "TEST_PROFILES",
        "user_id": 1,
        "companies": ["604"],
        "permissions": ["read", "write", "admin"],  # Use valid PermissionType values only
        "exp": now + timedelta(hours=1),
        "iat": now,
        "type": "access",
    }
    return jwt.encode(payload, JWT_SECRET, algorithm="HS256")


def load_expected_receipts() -> Dict[str, Dict]:
    """Load expected values from JSON file, indexed by filename.

    Reads EXPECTED_FILE and returns a mapping from each receipt's
    ``filename`` entry to the full receipt dict found under ``receipts``.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if a receipt entry has no ``filename`` key.
    """
    with open(EXPECTED_FILE, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Index by filename for easy lookup
    return {r['filename']: r for r in data.get('receipts', [])}


def submit_ocr(pdf_path: str, token: str, timeout: int = 60) -> Optional[Dict]:
    """Submit a PDF to OCR API and wait for result.

    Args:
        pdf_path: Path of the PDF file to upload.
        token: Bearer token sent in the Authorization header.
        timeout: Overall number of seconds to keep polling for the job.

    Returns:
        The job's ``result`` dict when the job completes, otherwise None
        (HTTP error, missing job id, job error, timeout or any exception).
        Every failure is printed; nothing is raised.
    """
    headers = {"Authorization": f"Bearer {token}"}
    filename = os.path.basename(pdf_path)

    # Best-effort boundary: one broken file must not abort the whole run,
    # so every failure is reported and mapped to None.
    try:
        with open(pdf_path, "rb") as f:
            files = {"file": (filename, f, "application/pdf")}
            response = requests.post(
                f"{API_BASE}/api/data-entry/ocr/extract?engine=doctr_plus",
                files=files,
                headers=headers,
                timeout=30,
            )

        if response.status_code != 200:
            print(f" ❌ HTTP Error: {response.status_code}")
            return None

        job_id = response.json().get("job_id")
        if not job_id:
            print(" ❌ No job_id in response")
            return None

        # Poll for completion
        start_time = time.time()
        while time.time() - start_time < timeout:
            poll_response = requests.get(
                f"{API_BASE}/api/data-entry/ocr/jobs/{job_id}/wait?timeout=30",
                headers=headers,
                timeout=35,
            )
            if poll_response.status_code == 200:
                job_result = poll_response.json()
                status = job_result.get("status")
                if status == "completed":
                    return job_result.get("result", {})
                if status == "error":
                    print(f" ❌ OCR Error: {job_result.get('error', 'Unknown')}")
                    return None
            time.sleep(2)

        print(" ❌ Timeout waiting for OCR")
        return None
    except Exception as e:
        print(f" ❌ Exception: {e}")
        return None


def normalize_cui(cui: Optional[str]) -> Optional[str]:
    """Normalize CUI for comparison (remove RO prefix, whitespace, leading zeros).

    Returns None for a falsy input. Otherwise the value is upper-cased, all
    whitespace is dropped, a leading ``RO`` is removed and leading zeros are
    stripped while keeping at least one digit.
    """
    if not cui:
        return None
    text = "".join(str(cui).upper().split())
    # Only a *leading* "RO" is the country prefix; leave the rest untouched.
    if text.startswith("RO"):
        text = text[2:]
    # Remove leading zeros but keep at least one digit
    return text.lstrip("0") or "0"


def compare_values(extracted: Any, expected: Any, field: str,
                   tolerance: float = 0.02) -> Tuple[bool, str]:
    """
    Compare extracted vs expected value.

    The comparison strategy depends on ``field``:

    * numeric fields (total, card, numerar, total_tva): pass when the
      absolute difference is within ``tolerance`` or within 1% of the
      expected magnitude;
    * CUI fields: compared after normalize_cui();
    * data_bon: the extracted value is cut at ``T`` before comparing;
    * furnizor: passes when at least half of the expected keywords occur
      in the extracted name (case-insensitive);
    * anything else: plain string equality.

    Returns (passed: bool, message: str)
    """
    # Handle None cases
    if expected is None:
        return (True, "N/A (no expected value)")
    if extracted is None:
        return (False, f"Missing (expected: {expected})")

    # Numeric comparison with tolerance
    if field in _NUMERIC_FIELDS:
        try:
            ext_val = float(extracted) if extracted else 0.0
            exp_val = float(expected) if expected else 0.0
        except (TypeError, ValueError):
            return (False, f"Invalid numeric: {extracted}")

        if exp_val == 0:
            if ext_val == 0:
                return (True, "0.0 ✓")
            return (False, f"{ext_val} (expected: 0.0)")

        diff = abs(ext_val - exp_val)
        # Use the magnitude of the expected value: dividing by a negative
        # number would yield a negative percentage that always "passes".
        pct_diff = diff / abs(exp_val) * 100
        if diff <= tolerance or pct_diff <= 1.0:  # Within tolerance or 1%
            return (True, f"{ext_val} ✓")
        return (False, f"{ext_val} (expected: {exp_val}, diff: {diff:.2f})")

    # CUI comparison (normalize both)
    if field in _CUI_FIELDS:
        ext_norm = normalize_cui(str(extracted)) if extracted else None
        exp_norm = normalize_cui(str(expected)) if expected else None
        if ext_norm == exp_norm:
            return (True, f"{extracted} ✓")
        return (False, f"{extracted} (expected: {expected})")

    # String comparison
    if field in _TEXT_FIELDS:
        ext_str = str(extracted).strip() if extracted else ""
        exp_str = str(expected).strip() if expected else ""

        # For dates, compare YYYY-MM-DD format
        if field == 'data_bon':
            # Extract date from datetime if present
            if 'T' in ext_str:
                ext_str = ext_str.split('T')[0]
            if ext_str == exp_str:
                return (True, f"{extracted} ✓")
            return (False, f"{extracted} (expected: {expected})")

        # Partial match for vendor names (OCR can have errors)
        if field == 'furnizor':
            ext_upper = ext_str.upper()
            exp_tokens = exp_str.upper().split()
            # Prefer the longer, more distinctive words. If the expected
            # name has none (e.g. "OMV"), fall back to all of its words so
            # the check cannot pass vacuously.
            keywords = [w for w in exp_tokens if len(w) > 3] or exp_tokens
            matches = sum(1 for w in keywords if w in ext_upper)
            if matches >= len(keywords) * 0.5:  # 50% of words match
                return (True, f"{ext_str} ✓")
            return (False, f"{ext_str} (expected: {exp_str})")

        if ext_str == exp_str:
            return (True, f"{extracted} ✓")
        return (False, f"{extracted} (expected: {expected})")

    # Default comparison
    if str(extracted) == str(expected):
        return (True, f"{extracted} ✓")
    return (False, f"{extracted} (expected: {expected})")


def _sum_field(entries: List[Dict], key: str) -> float:
    """Sum ``entry[key]`` over entries as floats, counting missing/None as 0."""
    return sum(float(entry.get(key) or 0) for entry in entries)


def compare_tva(extracted_tva: List[Dict], expected_tva: List[Dict]) -> Tuple[bool, str]:
    """Compare TVA entries.

    Extracted entries are summed over their ``amount`` key, expected entries
    over their ``value`` key; the totals must agree within 0.05.

    Returns (passed: bool, message: str)
    """
    if not expected_tva:
        if not extracted_tva:
            return (True, "No TVA (non-VAT payer) ✓")
        ext_sum = _sum_field(extracted_tva, 'amount')
        return (False, f"Extracted TVA {ext_sum} but expected none")

    if not extracted_tva:
        exp_sum = _sum_field(expected_tva, 'value')
        return (False, f"No TVA extracted (expected: {exp_sum})")

    # Compare total TVA amount
    ext_sum = _sum_field(extracted_tva, 'amount')
    exp_sum = _sum_field(expected_tva, 'value')
    if abs(ext_sum - exp_sum) <= 0.05:  # 5 bani tolerance
        return (True, f"TVA={ext_sum:.2f} ✓")
    return (False, f"TVA={ext_sum:.2f} (expected: {exp_sum:.2f})")


def compare_payment(extracted: List[Dict], expected_card: float,
                    expected_numerar: float) -> Tuple[bool, str]:
    """Compare payment methods.

    Extracted payments are totalled per method (``CARD`` / ``NUMERAR``,
    case-insensitive; other methods are ignored) and each total must be
    within 0.02 of the expected amount. A None expected amount, method or
    amount is treated as 0 / empty.

    Returns (passed: bool, message: str)
    """
    # The expected JSON may carry null for an unused payment method.
    expected_card = float(expected_card or 0.0)
    expected_numerar = float(expected_numerar or 0.0)

    totals = {'CARD': 0.0, 'NUMERAR': 0.0}
    for payment in (extracted or []):
        method = str(payment.get('method') or '').upper()
        if method in totals:
            totals[method] += float(payment.get('amount') or 0)
    ext_card = totals['CARD']
    ext_numerar = totals['NUMERAR']

    card_ok = abs(ext_card - expected_card) <= 0.02
    numerar_ok = abs(ext_numerar - expected_numerar) <= 0.02

    if card_ok and numerar_ok:
        parts = []
        if expected_card > 0:
            parts.append(f"CARD={ext_card:.2f}")
        if expected_numerar > 0:
            parts.append(f"NUMERAR={ext_numerar:.2f}")
        return (True, f"{', '.join(parts) or 'No payment'} ✓")
    return (False, f"CARD={ext_card:.2f} NUMERAR={ext_numerar:.2f} (expected: CARD={expected_card}, NUMERAR={expected_numerar})")


def test_pdf(pdf_filename: str, expected: Dict, token: str,
             verbose: bool = False, timeout: int = 60) -> Dict:
    """Test a single PDF file against expected values.

    Args:
        pdf_filename: File name looked up inside PDF_DIR.
        expected: Expected values for this receipt.
        token: Bearer token passed to submit_ocr().
        verbose: Print every field even when the receipt passes.
        timeout: OCR timeout in seconds, passed to submit_ocr().

    Returns:
        A dict with ``filename``, ``status`` (PASS, FAIL, SKIP or ERROR) and
        ``fields``; SKIP/ERROR results also carry ``reason``, PASS/FAIL
        results also carry the raw ``extracted`` OCR result.
    """
    pdf_path = os.path.join(PDF_DIR, pdf_filename)
    if not os.path.exists(pdf_path):
        return {
            'filename': pdf_filename,
            'status': 'SKIP',
            'reason': 'File not found',
            'fields': {},
        }

    print(f"\n 📄 Testing: {pdf_filename}")

    # Submit OCR
    result = submit_ocr(pdf_path, token, timeout)
    if not result:
        return {
            'filename': pdf_filename,
            'status': 'ERROR',
            'reason': 'OCR extraction failed',
            'fields': {},
        }

    # Compare fields
    fields: Dict[str, Dict[str, Any]] = {}

    def record(name: str, outcome: Tuple[bool, str]) -> None:
        passed, msg = outcome
        fields[name] = {'passed': passed, 'message': msg}

    record('total', compare_values(result.get('amount'), expected.get('total'), 'total'))
    record('tva', compare_tva(result.get('tva_entries', []), expected.get('tva_details', [])))
    record('payment', compare_payment(
        result.get('payment_methods', []),
        expected.get('card', 0.0),
        expected.get('numerar', 0.0),
    ))
    record('cui_furnizor', compare_values(
        result.get('cui'), expected.get('cui_furnizor'), 'cui_furnizor'))
    # CUI client (optional)
    if expected.get('cui_client'):
        record('cui_client', compare_values(
            result.get('client_cui'), expected.get('cui_client'), 'cui_client'))
    record('date', compare_values(
        result.get('receipt_date'), expected.get('data_bon'), 'data_bon'))

    # Don't fail on date mismatch (OCR date detection is tricky)
    all_passed = all(
        info['passed']
        for name, info in fields.items()
        if name not in _ADVISORY_FIELDS
    )

    # Print results
    status = 'PASS' if all_passed else 'FAIL'
    status_icon = '✅' if all_passed else '❌'
    print(f" {status_icon} {status}")

    if verbose or not all_passed:
        for field_name, field_result in fields.items():
            icon = '✓' if field_result['passed'] else '✗'
            print(f" {icon} {field_name}: {field_result['message']}")

    return {
        'filename': pdf_filename,
        'status': status,
        'fields': fields,
        'extracted': result,
    }


# The name matches pytest's default "test_*" pattern, but this is a helper
# that takes plain arguments, not a pytest test; keep pytest from collecting it.
test_pdf.__test__ = False


def main():
    """Run the OCR checks and exit non-zero if any receipt failed or errored."""
    _require_dependencies()

    parser = argparse.ArgumentParser(description="Test OCR profiles against expected values")
    parser.add_argument("--pdf", help="Test only a specific PDF file")
    parser.add_argument("--verbose", "-v", action="store_true", help="Show detailed output")
    parser.add_argument("--timeout", type=int, default=60, help="OCR timeout in seconds")
    args = parser.parse_args()

    print("\n" + "=" * 70)
    print(" OCR Profile Test - All PDFs vs expected_receipts.json")
    print("=" * 70)

    # Load expected values
    try:
        expected_receipts = load_expected_receipts()
        print(f"\n📋 Loaded {len(expected_receipts)} expected receipts")
    except Exception as e:
        print(f"❌ Failed to load expected_receipts.json: {e}")
        sys.exit(1)

    # Create JWT token
    token = create_jwt_token()
    print("🔑 JWT token created")

    # Determine which PDFs to test
    if args.pdf:
        pdfs_to_test = [args.pdf]
    else:
        # Test all PDFs in expected_receipts
        pdfs_to_test = list(expected_receipts)

    print(f"📁 Testing {len(pdfs_to_test)} PDF files")

    # Run tests
    results = []
    passed = 0
    failed = 0
    skipped = 0
    errors = 0

    for pdf_filename in pdfs_to_test:
        expected = expected_receipts.get(pdf_filename, {})
        if not expected:
            print(f"\n ⚠️ {pdf_filename}: No expected values in JSON")
            skipped += 1
            continue

        result = test_pdf(pdf_filename, expected, token, args.verbose, args.timeout)
        results.append(result)

        if result['status'] == 'PASS':
            passed += 1
        elif result['status'] == 'FAIL':
            failed += 1
        elif result['status'] == 'SKIP':
            skipped += 1
        else:
            errors += 1

    # Print summary
    print("\n" + "=" * 70)
    print(" SUMMARY")
    print("=" * 70)
    print(f" ✅ Passed: {passed}")
    print(f" ❌ Failed: {failed}")
    print(f" ⏭️ Skipped: {skipped}")
    print(f" 💥 Errors: {errors}")
    print(f" 📊 Total: {len(pdfs_to_test)}")
    print("=" * 70)

    # List failures
    if failed > 0:
        print("\n❌ FAILED TESTS:")
        for r in results:
            if r['status'] == 'FAIL':
                print(f" - {r['filename']}")
                for field, info in r['fields'].items():
                    if not info['passed']:
                        print(f" • {field}: {info['message']}")

    # Exit code: an OCR error means the receipt was never verified, so it
    # must not be reported as success either.
    sys.exit(0 if failed == 0 and errors == 0 else 1)


if __name__ == "__main__":
    main()