# roa2web-service-auto/tests/ocr-validation/get_raw_ocr_text.py

#!/usr/bin/env python3
"""
Quick script to get raw OCR text for specific receipts.
Usage: python get_raw_ocr_text.py <receipt_path>
"""
import sys
import os
import time
import requests
from pathlib import Path
from datetime import datetime, timedelta

# Make the project importable when this script is run directly.
# project_root is three `.parent` hops up from this file. Both inserts use
# index 0, so afterwards the 'backend' directory is first on sys.path,
# immediately followed by the project root.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
sys.path.insert(0, str(project_root / 'backend'))

from jose import jwt

# Base URL that the OCR endpoint paths in this script are appended to;
# it points at a server on localhost, port 8000.
API_BASE = "http://localhost:8000/api/data-entry"

def create_test_token() -> str:
    """Create a test JWT access token for API authentication.

    The signing secret is read from the ``JWT_SECRET_KEY`` environment
    variable; when it is unset a placeholder string is used instead.
    NOTE(review): the token is presumably only accepted if this secret
    matches the one the backend is configured with -- confirm.

    Returns:
        The HS256-signed token, valid for one hour from the time of the call.
    """
    # Local import: the file-level import only brings in datetime/timedelta.
    from datetime import timezone

    secret_key = os.getenv('JWT_SECRET_KEY', 'generate_with_secrets_token_urlsafe_32')

    # Timezone-aware "now": datetime.utcnow() is deprecated since Python 3.12
    # and yields a naive value that is easy to misinterpret as local time.
    issued_at = datetime.now(timezone.utc)
    expires_at = issued_at + timedelta(hours=1)

    payload = {
        "username": "ocr_test_user",
        "user_id": 999,
        "companies": ["TEST"],
        "permissions": ["read", "write"],
        "exp": expires_at,
        "iat": issued_at,
        "type": "access",
    }

    return jwt.encode(payload, secret_key, algorithm="HS256")


def _print_ocr_result(result: dict) -> None:
    """Print the raw OCR passes and the extracted fields of a finished job."""
    # A null "raw_texts" in the response is treated like an empty list.
    raw_texts = result.get('raw_texts') or []
    print(f"\n--- RAW OCR TEXT ({len(raw_texts)} passes) ---\n")

    for pass_no, raw_text in enumerate(raw_texts, start=1):
        print(f"\n=== Pass {pass_no} ===")
        # Slicing already leaves shorter texts untouched; cap output at 3000 chars.
        print(raw_text[:3000])
        print(f"\n[Text length: {len(raw_text)} chars]")

    print("\n--- EXTRACTED FIELDS ---")
    print(f"TOTAL: {result.get('amount')}")
    print(f"DATE: {result.get('receipt_date')}")
    print(f"CUI: {result.get('cui')}")
    print(f"TVA Total: {result.get('tva_total')}")
    print(f"TVA Entries: {result.get('tva_entries')}")
    print(f"Confidence: {result.get('overall_confidence')}")
    print(f"Engine: {result.get('ocr_engine')}")


def get_raw_ocr_text(file_path: str, token: str) -> dict:
    """Submit a file to the OCR API, wait for the job and print its raw text.

    The file is uploaded to ``{API_BASE}/ocr/extract`` with the ``doctr_plus``
    engine, then ``{API_BASE}/ocr/jobs/<job_id>`` is polled every 2 seconds
    for up to 120 seconds. On completion the raw text passes and extracted
    fields are printed to stdout.

    Args:
        file_path: Path of the receipt file to upload.
        token: JWT sent as a ``Bearer`` token in the Authorization header.

    Returns:
        The job's ``result`` dict when the job completes. In every failure
        case (missing file, I/O or HTTP error, failed job, timeout) a dict
        of the form ``{"error": "<message>"}`` is returned instead of raising.
    """
    path = Path(file_path)
    # is_file() also rejects directories, which would otherwise crash in open().
    if not path.is_file():
        return {"error": f"File not found: {file_path}"}

    print(f"\n{'='*60}")
    print(f"Processing: {path.name}")
    print(f"{'='*60}")

    headers = {'Authorization': f'Bearer {token}'}
    max_wait = 120        # overall polling budget, in seconds
    poll_interval = 2     # pause between status checks, in seconds
    request_timeout = 30  # per status request, so one stalled call cannot hang forever

    # Submit OCR job
    try:
        with open(path, 'rb') as f:
            # NOTE(review): the content type is always sent as application/pdf,
            # whatever the file's real type is -- confirm the server ignores it
            # for image uploads.
            files = {'file': (path.name, f, 'application/pdf')}
            response = requests.post(
                f"{API_BASE}/ocr/extract?engine=doctr_plus",
                files=files,
                headers=headers,
                timeout=max_wait,
            )

        if response.status_code != 200:
            return {"error": f"Submit failed: {response.status_code} - {response.text}"}

        submitted = response.json()
    except (requests.RequestException, OSError, ValueError) as exc:
        # Connection problems, unreadable files and non-JSON bodies are
        # reported the same way as every other failure of this function.
        return {"error": f"Submit failed: {exc}"}

    job_id = submitted.get('job_id')
    print(f"Job ID: {job_id}")
    if not job_id:
        # Without an id there is nothing to poll.
        return {"error": f"Submit response contained no job_id: {submitted}"}

    # Poll for completion; a monotonic clock is unaffected by system clock changes.
    deadline = time.monotonic() + max_wait
    while time.monotonic() < deadline:
        try:
            status_response = requests.get(
                f"{API_BASE}/ocr/jobs/{job_id}",
                headers=headers,
                timeout=request_timeout,
            )
            if status_response.status_code != 200:
                return {"error": f"Status check failed: {status_response.status_code}"}
            status = status_response.json()
        except (requests.RequestException, ValueError) as exc:
            return {"error": f"Status check failed: {exc}"}

        job_status = status.get('status')

        if job_status == 'completed':
            # A null "result" in the response is treated like an empty dict.
            result = status.get('result') or {}
            _print_ocr_result(result)
            return result

        if job_status == 'failed':
            return {"error": f"OCR failed: {status.get('error')}"}

        print(f"  Status: {job_status}, waiting...")
        time.sleep(poll_interval)

    return {"error": "Timeout waiting for OCR"}

if __name__ == "__main__":
    # One token is created up front and reused for every receipt.
    auth_token = create_test_token()
    print("Using JWT token for authentication")

    cli_paths = sys.argv[1:]
    if cli_paths:
        targets = cli_paths
    else:
        # No paths given on the command line: fall back to the two sample
        # receipts stored under fixtures/ocr-samples.
        samples_dir = Path(__file__).parent.parent / "fixtures" / "ocr-samples"
        sample_names = (
            "brick igiena 1 sept.pdf",
            "brick igiena, electrice consumabile 604.pdf",
        )
        targets = [str(samples_dir / sample_name) for sample_name in sample_names]

    for target in targets:
        outcome = get_raw_ocr_text(target, auth_token)
        # Failures come back as a dict with an "error" key rather than as exceptions.
        if "error" in outcome:
            print(f"ERROR: {outcome['error']}")