#!/usr/bin/env python3
"""
Quick script to get raw OCR text for specific receipts.

Usage:
    python get_raw_ocr_text.py [FILE ...]

Each FILE is uploaded to the data-entry OCR endpoint, the resulting job is
polled until it completes, and the raw OCR passes plus the extracted fields
are printed. With no arguments, the two sample receipts under
``fixtures/ocr-samples`` are processed.

The process exits with status 1 if any receipt could not be processed.
"""

import mimetypes
import os
import sys
import time
from datetime import datetime, timedelta, timezone
from pathlib import Path

import requests

# Add project root to path
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
sys.path.insert(0, str(project_root / 'backend'))

from jose import jwt  # noqa: E402  (kept after the sys.path setup above)

API_BASE = "http://localhost:8000/api/data-entry"

# Only this many characters of each raw OCR pass are echoed to the console.
MAX_RAW_TEXT_CHARS = 3000


def create_test_token() -> str:
    """Create a test JWT token for API authentication.

    The token is signed with HS256 using the ``JWT_SECRET_KEY`` environment
    variable and expires one hour after it is issued.

    Returns:
        The encoded JWT as a string.
    """
    # NOTE(review): when JWT_SECRET_KEY is unset this falls back to a
    # placeholder string, so the token is only accepted if the backend is
    # configured with the same value. Local testing only.
    secret_key = os.getenv('JWT_SECRET_KEY', 'generate_with_secrets_token_urlsafe_32')

    # Timezone-aware "now": datetime.utcnow() is deprecated and returns a
    # naive value that is easy to misinterpret as local time.
    now = datetime.now(timezone.utc)
    expire = now + timedelta(hours=1)

    payload = {
        "username": "ocr_test_user",
        "user_id": 999,
        "companies": ["TEST"],
        "permissions": ["read", "write"],
        "exp": expire,
        "iat": now,
        "type": "access",
    }
    return jwt.encode(payload, secret_key, algorithm="HS256")


def _print_ocr_result(result: dict) -> None:
    """Print the raw OCR passes and the extracted fields of a finished job."""
    # `or []` also covers a key that is present but null.
    raw_texts = result.get('raw_texts') or []
    print(f"\n--- RAW OCR TEXT ({len(raw_texts)} passes) ---\n")
    for pass_number, raw_text in enumerate(raw_texts, start=1):
        print(f"\n=== Pass {pass_number} ===")
        # Slicing already handles texts shorter than the limit.
        print(raw_text[:MAX_RAW_TEXT_CHARS])
        print(f"\n[Text length: {len(raw_text)} chars]")

    print("\n--- EXTRACTED FIELDS ---")
    print(f"TOTAL: {result.get('amount')}")
    print(f"DATE: {result.get('receipt_date')}")
    print(f"CUI: {result.get('cui')}")
    print(f"TVA Total: {result.get('tva_total')}")
    print(f"TVA Entries: {result.get('tva_entries')}")
    print(f"Confidence: {result.get('overall_confidence')}")
    print(f"Engine: {result.get('ocr_engine')}")


def get_raw_ocr_text(
    file_path: str,
    token: str,
    *,
    engine: str = "doctr_plus",
    max_wait: float = 120,
    poll_interval: float = 2,
    request_timeout: float = 60,
) -> dict:
    """Submit file to OCR and get raw text.

    Uploads the file to ``{API_BASE}/ocr/extract``, then polls
    ``{API_BASE}/ocr/jobs/<job_id>`` until the job reports ``completed`` or
    ``failed`` or until ``max_wait`` seconds have elapsed. On completion the
    raw texts and extracted fields are printed.

    Args:
        file_path: Path of the receipt file to upload.
        token: Bearer token sent in the ``Authorization`` header.
        engine: Value of the ``engine`` query parameter.
        max_wait: Maximum number of seconds to keep polling.
        poll_interval: Seconds to sleep between status checks.
        request_timeout: Timeout, in seconds, applied to each HTTP request.

    Returns:
        The job's ``result`` dict on success, otherwise a dict with a single
        ``"error"`` key describing what went wrong. Network and JSON decoding
        failures are reported through that dict rather than raised.
    """
    path = Path(file_path)
    # is_file() rather than exists(): a directory cannot be opened for upload.
    if not path.is_file():
        return {"error": f"File not found: {file_path}"}

    # Submit OCR job
    print(f"\n{'='*60}")
    print(f"Processing: {path.name}")
    print(f"{'='*60}")

    headers = {'Authorization': f'Bearer {token}'}
    # Guess the MIME type from the file name; fall back to the previous
    # hard-coded value when the extension is unknown.
    content_type = mimetypes.guess_type(path.name)[0] or 'application/pdf'

    try:
        with open(path, 'rb') as f:
            files = {'file': (path.name, f, content_type)}
            response = requests.post(
                f"{API_BASE}/ocr/extract",
                params={'engine': engine},
                files=files,
                headers=headers,
                timeout=request_timeout,
            )
    except (OSError, requests.RequestException) as exc:
        return {"error": f"Submit failed: {exc}"}

    if response.status_code != 200:
        return {"error": f"Submit failed: {response.status_code} - {response.text}"}

    try:
        job_id = response.json().get('job_id')
    except ValueError:
        return {"error": f"Submit returned invalid JSON: {response.text}"}

    print(f"Job ID: {job_id}")
    if job_id is None:
        # Without an id there is nothing to poll.
        return {"error": "Submit response did not include a job_id"}

    # Poll for completion. A monotonic clock keeps the deadline immune to
    # wall-clock adjustments.
    deadline = time.monotonic() + max_wait
    while time.monotonic() < deadline:
        try:
            status_response = requests.get(
                f"{API_BASE}/ocr/jobs/{job_id}",
                headers=headers,
                timeout=request_timeout,
            )
        except requests.RequestException as exc:
            return {"error": f"Status check failed: {exc}"}

        if status_response.status_code != 200:
            return {"error": f"Status check failed: {status_response.status_code}"}

        try:
            status = status_response.json()
        except ValueError:
            return {"error": f"Status check returned invalid JSON: {status_response.text}"}

        job_status = status.get('status')

        if job_status == 'completed':
            # `or {}` also covers a result that is present but null.
            result = status.get('result') or {}
            _print_ocr_result(result)
            return result

        if job_status == 'failed':
            return {"error": f"OCR failed: {status.get('error')}"}

        print(f" Status: {job_status}, waiting...")
        time.sleep(poll_interval)

    return {"error": "Timeout waiting for OCR"}


def main(argv=None) -> int:
    """Process the receipts named on the command line.

    Args:
        argv: Receipt paths; defaults to ``sys.argv[1:]``. When empty, the
            sample receipts from the fixtures directory are used.

    Returns:
        0 if every receipt was processed, 1 if any of them reported an error.
    """
    args = sys.argv[1:] if argv is None else list(argv)

    # Create test token
    token = create_test_token()
    print("Using JWT token for authentication")

    if args:
        receipts = args
    else:
        # Default: process sample receipts from fixtures
        fixtures_dir = Path(__file__).parent.parent / "fixtures" / "ocr-samples"
        receipts = [
            str(fixtures_dir / "brick igiena 1 sept.pdf"),
            str(fixtures_dir / "brick igiena, electrice consumabile 604.pdf"),
        ]

    failed = False
    for receipt in receipts:
        result = get_raw_ocr_text(receipt, token)
        if "error" in result:
            print(f"ERROR: {result['error']}")
            failed = True

    return 1 if failed else 0


if __name__ == "__main__":
    sys.exit(main())