Add docTR as primary OCR engine with 2-tier sequential processing, OCR metrics tracking, and simplified engine selection.

Features:
- docTR OCR engine with light+medium preprocessing tiers
- doctr_plus mode with early exit optimization (~65% fast path)
- OCR metrics dashboard with per-engine statistics
- User OCR preference persistence
- Parallel worker pool for OCR processing
- Cross-validation for extraction quality

Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
128 lines
4.1 KiB
Python
128 lines
4.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Quick script to get raw OCR text for specific receipts.
|
|
Usage: python get_raw_ocr_text.py [<receipt_path> ...]

With no arguments, the built-in default receipt paths are processed.
|
|
"""
|
|
import sys
|
|
import os
|
|
import time
|
|
import requests
|
|
from pathlib import Path
|
|
from datetime import datetime, timedelta
|
|
|
|
# Make the project root and its backend/ directory importable.
# Both entries go to the very front of sys.path, with backend/ ahead of the root.
project_root = Path(__file__).parent.parent.parent
sys.path[:0] = [str(project_root / 'backend'), str(project_root)]
|
|
|
|
from jose import jwt
|
|
|
|
# Base URL of the data-entry API; the OCR endpoints used by this script
# (/ocr/extract and /ocr/jobs/<job_id>) are appended to it.
API_BASE = "http://localhost:8000/api/data-entry"
|
|
|
|
def create_test_token() -> str:
    """Create a test JWT token for API authentication.

    The token is signed with HS256 using the ``JWT_SECRET_KEY`` environment
    variable (falling back to a placeholder secret when it is unset) and
    expires one hour after it is issued.

    Returns:
        The encoded JWT as returned by ``jwt.encode``.
    """
    # Local import: the module header only imports datetime and timedelta.
    from datetime import timezone

    secret_key = os.getenv('JWT_SECRET_KEY', 'generate_with_secrets_token_urlsafe_32')

    # Timezone-aware "now": datetime.utcnow() is deprecated and returns a
    # naive value that is easy to misinterpret as local time.
    now = datetime.now(timezone.utc)
    expire = now + timedelta(hours=1)

    payload = {
        "username": "ocr_test_user",
        "user_id": 999,
        "companies": ["TEST"],
        "permissions": ["read", "write"],
        "exp": expire,
        "iat": now,
        "type": "access",
    }

    return jwt.encode(payload, secret_key, algorithm="HS256")
|
|
|
|
|
|
def get_raw_ocr_text(file_path: str, token: str) -> dict:
    """Submit a file to the OCR API, wait for the job, and print its raw text.

    The file is uploaded to ``{API_BASE}/ocr/extract`` with the
    ``doctr_plus`` engine, then ``{API_BASE}/ocr/jobs/<job_id>`` is polled
    until the job reports ``completed`` or ``failed``, or until the wait
    budget runs out. On completion the raw OCR passes (first 3000 characters
    of each) and the extracted fields are printed.

    Args:
        file_path: Path of the file to upload.
        token: Bearer token sent in the ``Authorization`` header.

    Returns:
        The job's ``result`` dict on success, otherwise a dict with a single
        ``"error"`` key describing what went wrong (missing file, HTTP or
        connection failure, failed job, or timeout).
    """
    path = Path(file_path)
    if not path.exists():
        return {"error": f"File not found: {file_path}"}

    request_timeout = 60  # seconds; keeps a stalled server from hanging the script
    max_wait = 120        # total seconds to wait for the OCR job
    poll_interval = 2     # seconds between status checks
    preview_chars = 3000  # cap on printed characters per OCR pass

    # Submit OCR job
    print(f"\n{'='*60}")
    print(f"Processing: {path.name}")
    print(f"{'='*60}")

    headers = {'Authorization': f'Bearer {token}'}

    try:
        # The upload must happen while the file handle is still open.
        with open(path, 'rb') as f:
            files = {'file': (path.name, f, 'application/pdf')}
            response = requests.post(
                f"{API_BASE}/ocr/extract?engine=doctr_plus",
                files=files,
                headers=headers,
                timeout=request_timeout,
            )
    except (OSError, requests.RequestException) as exc:
        return {"error": f"Submit failed: {exc}"}

    if response.status_code != 200:
        return {"error": f"Submit failed: {response.status_code} - {response.text}"}

    job_id = response.json().get('job_id')
    print(f"Job ID: {job_id}")
    if job_id is None:
        # Without an id there is nothing to poll.
        return {"error": "Submit response did not include a job_id"}

    # Poll for completion. A monotonic clock keeps the deadline immune to
    # wall-clock adjustments.
    deadline = time.monotonic() + max_wait

    while time.monotonic() < deadline:
        try:
            status_response = requests.get(
                f"{API_BASE}/ocr/jobs/{job_id}",
                headers=headers,
                timeout=request_timeout,
            )
        except requests.RequestException as exc:
            return {"error": f"Status check failed: {exc}"}

        if status_response.status_code != 200:
            return {"error": f"Status check failed: {status_response.status_code}"}

        status = status_response.json()
        job_status = status.get('status')

        if job_status == 'completed':
            # Tolerate an explicit null as well as a missing key.
            result = status.get('result') or {}

            # Print raw texts
            raw_texts = result.get('raw_texts') or []
            print(f"\n--- RAW OCR TEXT ({len(raw_texts)} passes) ---\n")

            for i, raw_text in enumerate(raw_texts):
                text = raw_text or ""
                print(f"\n=== Pass {i+1} ===")
                print(text[:preview_chars])
                print(f"\n[Text length: {len(text)} chars]")

            # Print extracted fields
            print("\n--- EXTRACTED FIELDS ---")
            print(f"TOTAL: {result.get('amount')}")
            print(f"DATE: {result.get('receipt_date')}")
            print(f"CUI: {result.get('cui')}")
            print(f"TVA Total: {result.get('tva_total')}")
            print(f"TVA Entries: {result.get('tva_entries')}")
            print(f"Confidence: {result.get('overall_confidence')}")
            print(f"Engine: {result.get('ocr_engine')}")

            return result

        if job_status == 'failed':
            return {"error": f"OCR failed: {status.get('error')}"}

        print(f"  Status: {job_status}, waiting...")
        time.sleep(poll_interval)

    return {"error": "Timeout waiting for OCR"}
|
|
|
|
if __name__ == "__main__":
    # Mint the token used to authenticate every request below.
    token = create_test_token()
    print("Using JWT token for authentication")

    # Receipts come from the command line; with no arguments, fall back to
    # the two built-in sample receipts.
    receipts = sys.argv[1:]
    if not receipts:
        receipts = [
            "/mnt/e/proiecte/ab-worktrees/doctr-ocr-metrics/docs/data-entry/brick igiena 1 sept.pdf",
            "/mnt/e/proiecte/ab-worktrees/doctr-ocr-metrics/docs/data-entry/brick igiena, electrice consumabile 604.pdf",
        ]

    # Process each receipt in turn; a failure is reported and the loop continues.
    for receipt in receipts:
        result = get_raw_ocr_text(receipt, token)
        if "error" in result:
            print(f"ERROR: {result['error']}")
|