Files
roa2web-service-auto/tests/ocr-validation/get_raw_ocr_text.py
Claude Agent 62f86250cc refactor(docs): consolidate and cleanup documentation
- Delete 9 deprecated/obsolete docs (~6,300 lines removed)
- Move test PDFs to tests/fixtures/ocr-samples/
- Create docs/DEPLOYMENT.md as principal guide
- Create tests/ocr-validation/README.md
- Update all refs for ultrathin monolith architecture
- Update OCR tests to use relative paths

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-22 09:14:51 +00:00

129 lines
4.1 KiB
Python

#!/usr/bin/env python3
"""
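Quick script to print the raw OCR text and extracted fields for specific receipts.
Usage: python get_raw_ocr_text.py [receipt_path ...]
With no arguments, the sample receipts in tests/fixtures/ocr-samples/ are processed.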
"""
import sys
import os
import time
import requests
from pathlib import Path
from datetime import datetime, timedelta
# Make the project root (three directory levels above this file) and its
# backend/ directory importable. Both are inserted at the front of sys.path,
# so they take precedence over installed packages of the same name.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
sys.path.insert(0, str(project_root / 'backend'))
# JWT encoder used by create_test_token() to sign the test token.
from jose import jwt
# Base URL of the data-entry API; every request made by this script goes to
# a server listening on localhost:8000.
API_BASE = "http://localhost:8000/api/data-entry"
def create_test_token() -> str:
    """Create a short-lived test JWT for authenticating against the API.

    The token is signed with HS256 using the ``JWT_SECRET_KEY`` environment
    variable, falling back to a placeholder secret when the variable is not
    set. Presumably the API must be configured with the same secret for the
    token to be accepted -- verify against the backend configuration.

    Returns:
        The encoded JWT, issued now and expiring one hour from now.
    """
    # Function-scope import: the module-level import only brings in
    # ``datetime`` and ``timedelta``.
    from datetime import timezone

    secret_key = os.getenv('JWT_SECRET_KEY', 'generate_with_secrets_token_urlsafe_32')

    # ``datetime.utcnow()`` is deprecated (Python 3.12+) and yields a naive
    # datetime; an aware UTC timestamp encodes to the same epoch seconds
    # without relying on the deprecated call.
    issued_at = datetime.now(timezone.utc)
    expires_at = issued_at + timedelta(hours=1)

    payload = {
        "username": "ocr_test_user",
        "user_id": 999,
        "companies": ["TEST"],
        "permissions": ["read", "write"],
        "exp": expires_at,
        "iat": issued_at,
        "type": "access",
    }
    return jwt.encode(payload, secret_key, algorithm="HS256")
def _print_ocr_result(result: dict) -> None:
    """Print the raw OCR passes and the extracted fields of a finished job."""
    # ``or []`` also covers an explicit ``null`` for raw_texts in the payload.
    raw_texts = result.get('raw_texts') or []
    print(f"\n--- RAW OCR TEXT ({len(raw_texts)} passes) ---\n")
    for pass_number, raw_text in enumerate(raw_texts, start=1):
        print(f"\n=== Pass {pass_number} ===")
        # Cap very long passes at 3000 characters; slicing is a no-op for
        # shorter strings, so no length check is needed.
        print(raw_text[:3000])
        print(f"\n[Text length: {len(raw_text)} chars]")

    print("\n--- EXTRACTED FIELDS ---")
    print(f"TOTAL: {result.get('amount')}")
    print(f"DATE: {result.get('receipt_date')}")
    print(f"CUI: {result.get('cui')}")
    print(f"TVA Total: {result.get('tva_total')}")
    print(f"TVA Entries: {result.get('tva_entries')}")
    print(f"Confidence: {result.get('overall_confidence')}")
    print(f"Engine: {result.get('ocr_engine')}")


def get_raw_ocr_text(
    file_path: str,
    token: str,
    *,
    engine: str = "doctr_plus",
    max_wait: float = 120,
    poll_interval: float = 2,
    request_timeout: float = 30,
) -> dict:
    """Submit a file to the OCR endpoint, wait for the job and print its text.

    The file is uploaded to ``{API_BASE}/ocr/extract`` and the returned job
    is polled at ``{API_BASE}/ocr/jobs/<job_id>`` until it reports
    ``completed`` or ``failed``, or until ``max_wait`` seconds have elapsed.
    On completion the raw OCR passes and the extracted fields are printed.

    Args:
        file_path: Path of the receipt to upload. It is always sent with the
            ``application/pdf`` content type.
        token: Bearer token placed in the ``Authorization`` header.
        engine: Value of the ``engine`` query parameter of the submit call.
        max_wait: Maximum number of seconds to keep polling the job.
        poll_interval: Seconds to sleep between two status checks.
        request_timeout: Timeout in seconds applied to every HTTP request, so
            an unresponsive server cannot block the script indefinitely.

    Returns:
        The job's ``result`` dict on success. On any failure (missing file,
        network error, non-200 response, failed job, timeout) a dict of the
        form ``{"error": "<message>"}`` is returned instead of raising.
    """
    path = Path(file_path)
    # is_file() also rejects directories, which would otherwise pass an
    # exists() check and then blow up in open().
    if not path.is_file():
        return {"error": f"File not found: {file_path}"}

    # Submit OCR job
    print(f"\n{'='*60}")
    print(f"Processing: {path.name}")
    print(f"{'='*60}")

    headers = {'Authorization': f'Bearer {token}'}

    try:
        with open(path, 'rb') as upload:
            files = {'file': (path.name, upload, 'application/pdf')}
            response = requests.post(
                f"{API_BASE}/ocr/extract",
                params={"engine": engine},
                files=files,
                headers=headers,
                timeout=request_timeout,
            )
    except (OSError, requests.RequestException) as exc:
        # Unreadable file, connection refused, timeout, ...
        return {"error": f"Submit failed: {exc}"}

    if response.status_code != 200:
        return {"error": f"Submit failed: {response.status_code} - {response.text}"}

    try:
        job_id = response.json().get('job_id')
    except ValueError:
        return {"error": f"Submit failed: response is not valid JSON - {response.text}"}
    if not job_id:
        # Without a job id there is nothing to poll.
        return {"error": f"Submit failed: no job_id in response - {response.text}"}
    print(f"Job ID: {job_id}")

    # Poll for completion. A monotonic clock keeps the deadline immune to
    # wall-clock adjustments.
    deadline = time.monotonic() + max_wait
    while time.monotonic() < deadline:
        try:
            status_response = requests.get(
                f"{API_BASE}/ocr/jobs/{job_id}",
                headers=headers,
                timeout=request_timeout,
            )
        except requests.RequestException as exc:
            return {"error": f"Status check failed: {exc}"}

        if status_response.status_code != 200:
            return {"error": f"Status check failed: {status_response.status_code}"}

        try:
            status = status_response.json()
        except ValueError:
            return {"error": "Status check failed: response is not valid JSON"}

        job_status = status.get('status')
        if job_status == 'completed':
            # ``or {}`` also covers an explicit ``null`` result.
            result = status.get('result') or {}
            _print_ocr_result(result)
            return result
        if job_status == 'failed':
            return {"error": f"OCR failed: {status.get('error')}"}

        print(f" Status: {job_status}, waiting...")
        time.sleep(poll_interval)

    return {"error": "Timeout waiting for OCR"}
def main() -> int:
    """Run OCR on the receipts named on the command line and report errors.

    When no paths are given, the two sample receipts under
    ``tests/fixtures/ocr-samples`` (relative to this file's parent
    directory) are processed instead. Every receipt is attempted even if an
    earlier one fails.

    Returns:
        ``0`` when every receipt was processed without error, ``1`` when at
        least one receipt produced an error, so callers and shell scripts
        can detect failures from the exit status.
    """
    # Create test token
    token = create_test_token()
    print("Using JWT token for authentication")

    receipts = sys.argv[1:]
    if not receipts:
        # Default: process sample receipts from fixtures
        fixtures_dir = Path(__file__).parent.parent / "fixtures" / "ocr-samples"
        receipts = [
            str(fixtures_dir / "brick igiena 1 sept.pdf"),
            str(fixtures_dir / "brick igiena, electrice consumabile 604.pdf"),
        ]

    failures = 0
    for receipt in receipts:
        result = get_raw_ocr_text(receipt, token)
        if "error" in result:
            print(f"ERROR: {result['error']}")
            failures += 1

    return 1 if failures else 0


if __name__ == "__main__":
    sys.exit(main())