feat(ocr): Add docTR OCR engine with metrics infrastructure

Add docTR as the primary OCR engine with 2-tier sequential processing, OCR metrics tracking, and simplified engine selection.

Features:
- docTR OCR engine with light + medium preprocessing tiers
- doctr_plus mode with early-exit optimization (~65% fast path)
- OCR metrics dashboard with per-engine statistics
- User OCR preference persistence
- Parallel worker pool for OCR processing
- Cross-validation for extraction quality

Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-02 05:37:16 +02:00
parent 74f7aefc26
commit 495790411f
75 changed files with 23349 additions and 1311 deletions
--- a/tests/ocr-validation/get_raw_ocr_text.py
+++ b/tests/ocr-validation/get_raw_ocr_text.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+"""
+Quick script to print the raw OCR text and extracted fields for specific receipts.
+
+Usage: python get_raw_ocr_text.py [receipt_path ...]
+
+With no arguments, two default sample receipts are processed.
+"""
+import sys
+import os
+import time
+import requests
+from pathlib import Path
+from datetime import datetime, timedelta
+
+# Add project root to path
+# project_root is the directory two levels above this script's own directory.
+project_root = Path(__file__).parent.parent.parent
+# Both entries are inserted at index 0, so 'backend' ends up ahead of the
+# project root in the module search order.
+sys.path.insert(0, str(project_root))
+sys.path.insert(0, str(project_root / 'backend'))
+
+from jose import jwt
+
+# Base URL that the OCR submit and job-status requests below are built on.
+# NOTE(review): hard-coded to a local server on port 8000 -- confirm this
+# matches the environment the script is run against.
+API_BASE = "http://localhost:8000/api/data-entry"
+
+def create_test_token() -> str:
+    """Create a short-lived test JWT for authenticating against the OCR API.
+
+    The signing key is read from the ``JWT_SECRET_KEY`` environment variable;
+    when it is unset, the placeholder string hard-coded below is used instead.
+    NOTE(review): presumably the fallback matches the backend's sample
+    configuration -- confirm, otherwise the API will reject the token.
+
+    Returns:
+        An HS256-signed token carrying a fixed test identity
+        (``ocr_test_user``, id 999) that expires one hour after issuance.
+    """
+    # Local import: the module-level import only brings in datetime/timedelta.
+    from datetime import timezone
+
+    secret_key = os.getenv('JWT_SECRET_KEY', 'generate_with_secrets_token_urlsafe_32')
+    # Timezone-aware "now": datetime.utcnow() is deprecated since Python 3.12
+    # and returns a naive value that is easy to misread as local time.
+    now = datetime.now(timezone.utc)
+    expire = now + timedelta(hours=1)
+
+    payload = {
+        "username": "ocr_test_user",
+        "user_id": 999,
+        "companies": ["TEST"],
+        "permissions": ["read", "write"],
+        "exp": expire,
+        "iat": now,
+        "type": "access",
+    }
+
+    return jwt.encode(payload, secret_key, algorithm="HS256")
+
+
+def get_raw_ocr_text(file_path: str, token: str) -> dict:
+    """Submit a receipt to the OCR API, wait for the job, and print its output.
+
+    The file is uploaded to the ``/ocr/extract`` endpoint with the
+    ``doctr_plus`` engine, then the job status endpoint is polled every two
+    seconds for up to two minutes. On completion the raw OCR text of each pass
+    (truncated to 3000 characters) and the extracted fields are printed.
+
+    Args:
+        file_path: Path of the receipt file to upload.
+        token: Bearer token sent in the ``Authorization`` header.
+
+    Returns:
+        The job's ``result`` payload on success, otherwise a dict with a
+        single ``"error"`` key describing what went wrong (missing file, HTTP
+        or network failure, missing job id, failed job, or timeout).
+    """
+    path = Path(file_path)
+    if not path.exists():
+        return {"error": f"File not found: {file_path}"}
+
+    # Per-request timeout (seconds) so a stalled server cannot hang the script.
+    request_timeout = 60
+
+    print(f"\n{'='*60}")
+    print(f"Processing: {path.name}")
+    print(f"{'='*60}")
+
+    headers = {'Authorization': f'Bearer {token}'}
+
+    # Submit OCR job; the file only needs to stay open for the upload itself.
+    try:
+        with open(path, 'rb') as f:
+            files = {'file': (path.name, f, 'application/pdf')}
+            response = requests.post(
+                f"{API_BASE}/ocr/extract?engine=doctr_plus",
+                files=files,
+                headers=headers,
+                timeout=request_timeout,
+            )
+    except (OSError, requests.RequestException) as exc:
+        # Keep the function's contract: failures are reported, not raised.
+        return {"error": f"Submit failed: {exc}"}
+
+    if response.status_code != 200:
+        return {"error": f"Submit failed: {response.status_code} - {response.text}"}
+
+    job_id = response.json().get('job_id')
+    print(f"Job ID: {job_id}")
+    if not job_id:
+        # Without an id the status URL below would become ".../jobs/None".
+        return {"error": f"Submit failed: no job_id in response - {response.text}"}
+
+    # Poll for completion. A monotonic clock keeps the deadline immune to
+    # wall-clock adjustments while the script is waiting.
+    max_wait = 120
+    start = time.monotonic()
+
+    while time.monotonic() - start < max_wait:
+        try:
+            status_response = requests.get(
+                f"{API_BASE}/ocr/jobs/{job_id}",
+                headers=headers,
+                timeout=request_timeout,
+            )
+        except requests.RequestException as exc:
+            return {"error": f"Status check failed: {exc}"}
+
+        if status_response.status_code != 200:
+            return {"error": f"Status check failed: {status_response.status_code}"}
+
+        status = status_response.json()
+        job_status = status.get('status')
+
+        if job_status == 'completed':
+            # "or" also covers an explicit JSON null, which .get(key, default)
+            # would pass through as None.
+            result = status.get('result') or {}
+
+            # Print raw texts
+            raw_texts = result.get('raw_texts') or []
+            print(f"\n--- RAW OCR TEXT ({len(raw_texts)} passes) ---\n")
+
+            for pass_number, raw_text in enumerate(raw_texts, start=1):
+                print(f"\n=== Pass {pass_number} ===")
+                # Slicing already returns the whole string when it is shorter.
+                print(raw_text[:3000])
+                print(f"\n[Text length: {len(raw_text)} chars]")
+
+            # Print extracted fields
+            print("\n--- EXTRACTED FIELDS ---")
+            print(f"TOTAL: {result.get('amount')}")
+            print(f"DATE: {result.get('receipt_date')}")
+            print(f"CUI: {result.get('cui')}")
+            print(f"TVA Total: {result.get('tva_total')}")
+            print(f"TVA Entries: {result.get('tva_entries')}")
+            print(f"Confidence: {result.get('overall_confidence')}")
+            print(f"Engine: {result.get('ocr_engine')}")
+
+            return result
+
+        if job_status == 'failed':
+            return {"error": f"OCR failed: {status.get('error')}"}
+
+        print(f"  Status: {job_status}, waiting...")
+        time.sleep(2)
+
+    return {"error": "Timeout waiting for OCR"}
+
+if __name__ == "__main__":
+    # Create test token
+    token = create_test_token()
+    print("Using JWT token for authentication")
+
+    if len(sys.argv) < 2:
+        # Default: the two sample receipts. They are resolved relative to the
+        # repository root (two levels above this script's directory) instead
+        # of one developer's absolute checkout path, so the defaults work on
+        # any machine. resolve() keeps this correct when __file__ is relative.
+        # NOTE(review): assumes the samples live in docs/data-entry at the
+        # repository root, as the previous hard-coded paths suggest -- confirm.
+        sample_dir = Path(__file__).resolve().parents[2] / "docs" / "data-entry"
+        receipts = [
+            str(sample_dir / "brick igiena 1 sept.pdf"),
+            str(sample_dir / "brick igiena, electrice consumabile 604.pdf"),
+        ]
+    else:
+        receipts = sys.argv[1:]
+
+    # Process every receipt; an error on one does not stop the others.
+    for receipt in receipts:
+        result = get_raw_ocr_text(receipt, token)
+        if "error" in result:
+            print(f"ERROR: {result['error']}")