feat(ocr): Add docTR OCR engine with metrics infrastructure
Add docTR as primary OCR engine with 2-tier sequential processing, OCR metrics tracking, and simplified engine selection. Features: - docTR OCR engine with light+medium preprocessing tiers - doctr_plus mode with early exit optimization (~65% fast path) - OCR metrics dashboard with per-engine statistics - User OCR preference persistence - Parallel worker pool for OCR processing - Cross-validation for extraction quality Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
127
tests/ocr-validation/get_raw_ocr_text.py
Normal file
127
tests/ocr-validation/get_raw_ocr_text.py
Normal file
@@ -0,0 +1,127 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Quick script to get raw OCR text for specific receipts.
|
||||
Usage: python get_raw_ocr_text.py [receipt_path ...]
|
||||
"""
|
||||
import sys
|
||||
import os
|
||||
import time
|
||||
import requests
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
# Add project root to path
|
||||
project_root = Path(__file__).parent.parent.parent
|
||||
sys.path.insert(0, str(project_root))
|
||||
sys.path.insert(0, str(project_root / 'backend'))
|
||||
|
||||
from jose import jwt
|
||||
|
||||
API_BASE = "http://localhost:8000/api/data-entry"
|
||||
|
||||
def create_test_token() -> str:
    """Create a test JWT token for API authentication.

    The token is signed with HS256 using the ``JWT_SECRET_KEY`` environment
    variable (falling back to a placeholder string when it is unset) and
    expires one hour after it is issued.

    Returns:
        The encoded JWT as returned by ``jwt.encode``.
    """
    # Local import: the file-level import only brings in datetime/timedelta.
    from datetime import timezone

    secret_key = os.getenv('JWT_SECRET_KEY', 'generate_with_secrets_token_urlsafe_32')

    # Timezone-aware "now" instead of the deprecated, naive datetime.utcnow().
    now = datetime.now(timezone.utc)
    expire = now + timedelta(hours=1)

    payload = {
        "username": "ocr_test_user",
        "user_id": 999,
        "companies": ["TEST"],
        "permissions": ["read", "write"],
        "exp": expire,
        "iat": now,
        "type": "access",
    }

    return jwt.encode(payload, secret_key, algorithm="HS256")
|
||||
|
||||
|
||||
def _print_ocr_result(result: dict) -> None:
    """Print every raw OCR pass (first 3000 chars each) and the extracted fields."""
    # `or []` also covers a present-but-null "raw_texts" value.
    raw_texts = result.get('raw_texts') or []
    print(f"\n--- RAW OCR TEXT ({len(raw_texts)} passes) ---\n")

    for pass_number, raw_text in enumerate(raw_texts, start=1):
        print(f"\n=== Pass {pass_number} ===")
        # Slicing already returns the whole string when it is shorter.
        print(raw_text[:3000])
        print(f"\n[Text length: {len(raw_text)} chars]")

    print("\n--- EXTRACTED FIELDS ---")
    print(f"TOTAL: {result.get('amount')}")
    print(f"DATE: {result.get('receipt_date')}")
    print(f"CUI: {result.get('cui')}")
    print(f"TVA Total: {result.get('tva_total')}")
    print(f"TVA Entries: {result.get('tva_entries')}")
    print(f"Confidence: {result.get('overall_confidence')}")
    print(f"Engine: {result.get('ocr_engine')}")


def get_raw_ocr_text(file_path: str, token: str) -> dict:
    """Submit a file to the OCR API, wait for the job, and print its raw text.

    The file is uploaded to ``{API_BASE}/ocr/extract`` with the
    ``doctr_plus`` engine. The job is then polled at
    ``{API_BASE}/ocr/jobs/<job_id>`` every 2 seconds for up to 120 seconds.
    On completion the raw OCR passes and the extracted fields are printed.

    Args:
        file_path: Path of the receipt file to upload.
        token: Bearer token sent in the ``Authorization`` header.

    Returns:
        The job's ``result`` dict on success, otherwise a dict of the form
        ``{"error": <message>}`` (missing file, HTTP/network failure,
        failed job, or timeout).
    """
    # Local import: mimetypes is not imported at file level.
    import mimetypes

    path = Path(file_path)
    # is_file() also rejects directories, which open() could not read anyway.
    if not path.is_file():
        return {"error": f"File not found: {file_path}"}

    # Submit OCR job
    print(f"\n{'='*60}")
    print(f"Processing: {path.name}")
    print(f"{'='*60}")

    headers = {'Authorization': f'Bearer {token}'}
    # Send the real content type for image receipts; keep PDF as the fallback.
    content_type = mimetypes.guess_type(path.name)[0] or 'application/pdf'

    try:
        with open(path, 'rb') as f:
            files = {'file': (path.name, f, content_type)}
            # The upload must happen while the file handle is still open.
            response = requests.post(
                f"{API_BASE}/ocr/extract?engine=doctr_plus",
                files=files,
                headers=headers,
                timeout=60,
            )
    except (OSError, requests.RequestException) as exc:
        return {"error": f"Submit failed: {exc}"}

    if response.status_code != 200:
        return {"error": f"Submit failed: {response.status_code} - {response.text}"}

    try:
        result = response.json()
    except ValueError:
        return {"error": f"Submit failed: invalid JSON response - {response.text}"}

    job_id = result.get('job_id')
    if not job_id:
        # Without a job id the status URL would be ".../jobs/None".
        return {"error": f"Submit failed: no job_id in response - {response.text}"}
    print(f"Job ID: {job_id}")

    # Poll for completion
    max_wait = 120
    # Monotonic clock: elapsed time is unaffected by system clock changes.
    deadline = time.monotonic() + max_wait

    while time.monotonic() < deadline:
        try:
            status_response = requests.get(
                f"{API_BASE}/ocr/jobs/{job_id}",
                headers=headers,
                timeout=30,
            )
        except requests.RequestException as exc:
            return {"error": f"Status check failed: {exc}"}

        if status_response.status_code != 200:
            return {"error": f"Status check failed: {status_response.status_code}"}

        try:
            status = status_response.json()
        except ValueError:
            return {"error": "Status check failed: invalid JSON response"}

        job_status = status.get('status')

        if job_status == 'completed':
            # `or {}` also covers a present-but-null "result" value.
            result = status.get('result') or {}
            _print_ocr_result(result)
            return result

        if job_status == 'failed':
            return {"error": f"OCR failed: {status.get('error')}"}

        print(f"  Status: {job_status}, waiting...")
        time.sleep(2)

    return {"error": "Timeout waiting for OCR"}
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: run OCR on each receipt given on the command line (or on
    # the built-in default receipts when none is given) and report failures.

    # Create test token
    token = create_test_token()
    print("Using JWT token for authentication")

    if len(sys.argv) < 2:
        # Default: process the two receipts user wants to see
        receipts = [
            "/mnt/e/proiecte/ab-worktrees/doctr-ocr-metrics/docs/data-entry/brick igiena 1 sept.pdf",
            "/mnt/e/proiecte/ab-worktrees/doctr-ocr-metrics/docs/data-entry/brick igiena, electrice consumabile 604.pdf",
        ]
    else:
        receipts = sys.argv[1:]

    failures = 0
    for receipt in receipts:
        result = get_raw_ocr_text(receipt, token)
        if "error" in result:
            print(f"ERROR: {result['error']}")
            failures += 1

    # Exit non-zero so callers (shell, CI) can detect that a receipt failed.
    if failures:
        sys.exit(1)
|
||||
Reference in New Issue
Block a user