Files
roa2web-service-auto/tests/ocr-validation/test_receipts_sequential.py
Marius Mutu 495790411f feat(ocr): Add docTR OCR engine with metrics infrastructure
Add docTR as primary OCR engine with 2-tier sequential processing,
OCR metrics tracking, and simplified engine selection.

Features:
- docTR OCR engine with light+medium preprocessing tiers
- doctr_plus mode with early exit optimization (~65% fast path)
- OCR metrics dashboard with per-engine statistics
- User OCR preference persistence
- Parallel worker pool for OCR processing
- Cross-validation for extraction quality

Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-02 05:37:16 +02:00

229 lines
7.9 KiB
Python

#!/usr/bin/env python3
"""Test each receipt sequentially and report results."""
import json
import os
import sys
import time
import requests
from datetime import datetime, timedelta
from jose import jwt
# Runtime configuration. Each value can be overridden through an environment
# variable so the script is not tied to one developer machine; the defaults
# are the original hard-coded values.

# Base URL of the API that exposes the OCR extraction endpoints.
API_BASE = os.getenv("OCR_API_BASE", "http://localhost:8000")

# Folder containing the receipt PDFs that are uploaded for extraction.
PDF_FOLDER = os.getenv(
    "OCR_PDF_FOLDER",
    "/mnt/e/proiecte/ab-worktrees/doctr-ocr-metrics/docs/data-entry",
)

# JSON file with the expected values for each receipt.
EXPECTED_FILE = os.getenv(
    "OCR_EXPECTED_FILE",
    "/mnt/e/proiecte/ab-worktrees/doctr-ocr-metrics/tests/ocr-validation/expected_receipts.json",
)
def get_jwt_token():
    """Create a short-lived test JWT for API authentication.

    The signing key is read from the ``JWT_SECRET_KEY`` environment variable,
    falling back to a placeholder literal when the variable is unset. The
    token is signed with HS256 and expires one hour after it is issued.

    Returns:
        The encoded token as returned by ``jwt.encode``.
    """
    # Function-local import: the module header only imports datetime/timedelta.
    from datetime import timezone

    secret_key = os.getenv('JWT_SECRET_KEY', 'generate_with_secrets_token_urlsafe_32')

    # Use a timezone-aware "now": datetime.utcnow() is deprecated since
    # Python 3.12 and yields a naive value that is easily mistaken for
    # local time.
    now = datetime.now(timezone.utc)
    expire = now + timedelta(hours=1)

    # Fixed test identity; NOTE(review): presumably these claims must match
    # what the API's auth layer expects - confirm against the server.
    payload = {
        "username": "MARIUS",
        "user_id": 1,
        "companies": ["604"],
        "permissions": ["read", "write"],
        "exp": expire,
        "iat": now,
        "type": "access",
    }
    return jwt.encode(payload, secret_key, algorithm="HS256")
def _normalize_total(val):
    """Return *val* as a float (',' accepted as decimal separator), or None if missing/unparseable."""
    if val is None:
        return None
    try:
        return float(str(val).replace(',', '.'))
    except ValueError:
        # A garbled amount is a failed extraction, not a harness error.
        return None


def _normalize_cui(val):
    """Return *val* upper-cased with 'RO' and spaces removed, or None if missing."""
    if val is None:
        return None
    return str(val).upper().replace('RO', '').replace(' ', '').strip()


def test_receipt(pdf_path: str, expected: dict, headers: dict) -> dict:
    """Run one receipt through the OCR API and compare it with expected values.

    The PDF is uploaded to the extract endpoint with the ``doctr_plus``
    engine, the resulting job is polled until it completes, and the extracted
    total, date and CUI are compared with the ``total``, ``data_bon`` and
    ``cui_furnizor`` entries of *expected*.

    Args:
        pdf_path: Path of the PDF file to upload.
        expected: Expected values for this receipt.
        headers: HTTP headers sent with every request (e.g. Authorization).

    Returns:
        A dict with the keys ``filename``, ``success``, ``time_ms``,
        ``error``, ``extracted``, ``matches`` and ``issues``. ``error`` is
        set when the request/job itself failed; ``issues`` lists field
        mismatches. Exceptions are captured in ``error`` rather than raised.
    """
    filename = os.path.basename(pdf_path)
    result = {
        "filename": filename,
        "success": False,
        "time_ms": 0,
        "error": None,
        "extracted": {},
        "matches": {},
        "issues": []
    }
    start_time = time.time()

    def elapsed_ms():
        return int((time.time() - start_time) * 1000)

    try:
        with open(pdf_path, "rb") as f:
            files = {"file": (filename, f, "application/pdf")}
            response = requests.post(
                f"{API_BASE}/api/data-entry/ocr/extract?engine=doctr_plus",
                files=files,
                headers=headers,
                timeout=120
            )

        if response.status_code != 200:
            result["error"] = f"HTTP {response.status_code}"
            result["time_ms"] = elapsed_ms()
            return result

        job_id = response.json().get("job_id")
        if not job_id:
            # Without an id there is nothing to poll; fail fast instead of
            # requesting ".../jobs/None/wait" sixty times.
            result["error"] = "No job_id in response"
            result["time_ms"] = elapsed_ms()
            return result

        # Poll for completion: at most 60 polls. Each poll asks the wait
        # endpoint to block for up to 30 s, so the upper bound is well above
        # 60 s of wall time.
        job_result = {}  # stays empty if no poll ever returns HTTP 200
        for _ in range(60):
            poll_response = requests.get(
                f"{API_BASE}/api/data-entry/ocr/jobs/{job_id}/wait?timeout=30",
                headers=headers,
                timeout=35
            )
            if poll_response.status_code == 200:
                job_result = poll_response.json()
                status = job_result.get("status")
                if status == "completed":
                    break
                if status == "error":
                    result["error"] = job_result.get("error", "Unknown error")
                    result["time_ms"] = elapsed_ms()
                    return result
            time.sleep(1)

        result["time_ms"] = elapsed_ms()

        if job_result.get("status") != "completed":
            result["error"] = f"Timeout - status: {job_result.get('status')}"
            return result

        # Extract fields (correct field names from API). "or {}" also covers
        # a payload where "result" is present but null.
        extraction = job_result.get("result") or {}
        result["extracted"] = {
            "total": extraction.get("amount"),  # API uses "amount" not "total"
            "date": extraction.get("receipt_date"),  # API uses "receipt_date" not "date"
            "cui": extraction.get("cui"),
            "tva_total": extraction.get("tva_total"),
            "confidence": extraction.get("overall_confidence")
        }

        # Compare with expected (use correct field names from expected_receipts.json)
        exp_total = expected.get("total")
        exp_date = expected.get("data_bon")
        exp_cui = expected.get("cui_furnizor")

        # Normalize for comparison
        ext_total = _normalize_total(result["extracted"]["total"])
        ext_cui = _normalize_cui(result["extracted"]["cui"])
        exp_cui_norm = _normalize_cui(exp_cui)
        exp_total_norm = _normalize_total(exp_total)

        # Explicit None checks: a total of 0.0 is a real value and must still
        # be compared.
        if ext_total is not None and exp_total_norm is not None:
            result["matches"]["total"] = abs(ext_total - exp_total_norm) < 0.01
        else:
            result["matches"]["total"] = None
        result["matches"]["date"] = result["extracted"]["date"] == exp_date if exp_date else None
        result["matches"]["cui"] = ext_cui == exp_cui_norm if exp_cui else None

        # Check for issues: a field counts only when an expected value exists;
        # a missing extraction (match is None) is reported as a mismatch.
        if exp_total_norm is not None and not result["matches"]["total"]:
            result["issues"].append(f"TOTAL: got {result['extracted']['total']}, expected {exp_total}")
        if exp_date and not result["matches"]["date"]:
            result["issues"].append(f"DATE: got {result['extracted']['date']}, expected {exp_date}")
        if exp_cui and not result["matches"]["cui"]:
            result["issues"].append(f"CUI: got {result['extracted']['cui']}, expected {exp_cui}")

        result["success"] = len(result["issues"]) == 0
    except Exception as e:
        # Deliberately broad: one broken receipt (I/O, network, bad JSON)
        # must be recorded as an error without aborting the whole run.
        result["error"] = str(e)
        result["time_ms"] = elapsed_ms()

    return result
def main():
    """Test every single-page receipt sequentially and print a report.

    Loads the expected data from ``EXPECTED_FILE``, creates a JWT, runs
    ``test_receipt`` for each single-page entry whose PDF exists in
    ``PDF_FOLDER``, printing one status line per receipt, and finally prints
    a summary with counts, timing statistics, slow receipts, failures and
    errors. Exits with status 1 if no token could be created.
    """
    # Load expected data
    with open(EXPECTED_FILE, encoding="utf-8") as f:
        expected_data = json.load(f)

    # Handle both formats: list or dict with "receipts" key
    if isinstance(expected_data, dict) and "receipts" in expected_data:
        all_receipts = expected_data["receipts"]
    else:
        all_receipts = expected_data

    # Get JWT token
    token = get_jwt_token()
    if not token:
        print("ERROR: Could not get JWT token")
        sys.exit(1)
    headers = {"Authorization": f"Bearer {token}"}

    # Filter single-page receipts (entries without "pages" count as one page)
    receipts = [r for r in all_receipts if r.get("pages", 1) == 1]

    print(f"\n{'='*60}")
    print(f"Testing {len(receipts)} single-page receipts with doctr_plus")
    print(f"{'='*60}\n")

    results = []
    times = []  # durations of receipts that ran to completion (OK or FAIL)

    for i, receipt in enumerate(receipts, 1):
        filename = receipt["filename"]
        pdf_path = os.path.join(PDF_FOLDER, filename)

        if not os.path.exists(pdf_path):
            print(f"[{i:02d}/{len(receipts)}] SKIP: {filename} (not found)")
            continue

        print(f"[{i:02d}/{len(receipts)}] Testing: {filename[:50]}...", end=" ", flush=True)
        result = test_receipt(pdf_path, receipt, headers)
        results.append(result)

        if result["error"]:
            print(f"ERROR ({result['time_ms']}ms): {result['error']}")
        elif result["success"]:
            # The confidence key may hold None when the API omitted it;
            # formatting None with ":.2f" would raise TypeError.
            confidence = result["extracted"].get("confidence")
            if isinstance(confidence, (int, float)):
                conf_text = f"{confidence:.2f}"
            else:
                conf_text = "n/a"
            print(f"OK ({result['time_ms']}ms) conf={conf_text}")
            times.append(result["time_ms"])
        else:
            print(f"FAIL ({result['time_ms']}ms): {'; '.join(result['issues'])}")
            times.append(result["time_ms"])

    # Summary
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")

    successful = [r for r in results if r["success"]]
    failed = [r for r in results if not r["success"] and not r["error"]]
    errors = [r for r in results if r["error"]]

    # Guard the percentage: results is empty when every PDF was skipped or
    # no single-page receipt exists.
    success_pct = len(successful) * 100 / len(results) if results else 0.0

    print(f"Total: {len(results)}")
    print(f"Success: {len(successful)} ({success_pct:.1f}%)")
    print(f"Failed: {len(failed)}")
    print(f"Errors: {len(errors)}")

    if times:
        avg_time = sum(times) / len(times)
        print(f"\nTiming: avg={avg_time:.0f}ms, min={min(times)}ms, max={max(times)}ms")

        # Flag slow ones: more than twice the average duration
        slow_threshold = avg_time * 2
        slow = [r for r in results if r["time_ms"] > slow_threshold and not r["error"]]
        if slow:
            print(f"\nSlow receipts (>{slow_threshold:.0f}ms):")
            for r in slow:
                print(f" - {r['filename']}: {r['time_ms']}ms")

    if failed:
        print("\nFailed receipts:")
        for r in failed:
            print(f" - {r['filename']}: {'; '.join(r['issues'])}")

    if errors:
        print("\nError receipts:")
        for r in errors:
            print(f" - {r['filename']}: {r['error']}")
# Run the sequential receipt check only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()