roa2web-service-auto/backend/modules/data_entry/routers/ocr.py

"""
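OCR API endpoints with async job queue support.

Endpoints:
- POST /extract - Submit OCR job (returns job_id immediately)
- GET /jobs/{job_id} - Get job status and result
- GET /jobs/{job_id}/wait - Long-poll until the job status changes or a timeout expires
- GET /queue/status - Get queue statistics
- GET /status - Check OCR service availability
- GET /engines - List enabled OCR engines and available processing modes
- POST /extract-attachment/{attachment_id} - Run OCR on an existing attachment (always synchronous)

For backwards compatibility, we also support sync mode via query param:
- POST /extract?sync=true - Process synchronously (blocks until complete)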
"""

import os
import tempfile
from datetime import datetime
from decimal import Decimal
from pathlib import Path
from typing import Optional

from fastapi import APIRouter, HTTPException, UploadFile, File, Depends, Query, Response
from sqlalchemy.ext.asyncio import AsyncSession

from backend.modules.data_entry.db.database import get_session
from backend.modules.data_entry.db.crud.attachment import AttachmentCRUD
from backend.modules.data_entry.services.ocr_service import ocr_service
from backend.modules.data_entry.services.ocr_engine import OCREngine
from backend.modules.data_entry.services.ocr.job_queue import job_queue, OCRJobStatus as JobStatus
from backend.modules.data_entry.services.ocr.job_worker import estimate_wait_time
from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine
from backend.modules.data_entry.schemas.ocr import (
    OCRResponse,
    OCRStatusResponse,
    ExtractionData,
    TvaEntry,
    PaymentMethod,
    # New job queue schemas
    OCREngineChoice,
    OCRJobStatus,
    OCRJobSubmitResponse,
    OCRJobResponse,
    OCRQueueStatusResponse,
)

# Auth integration
from shared.auth.dependencies import get_current_user
from shared.auth.models import CurrentUser

router = APIRouter()


# ============================================================================
# OCR Job Queue Endpoints (NEW)
# ============================================================================

@router.post("/extract", response_model=OCRJobSubmitResponse)
async def submit_ocr_job(
    file: UploadFile = File(...),
    engine: OCREngineChoice = Query(default=OCREngineChoice.doctr_plus, description="OCR engine to use"),
    sync: bool = Query(default=False, description="If true, process synchronously (blocks)"),
    current_user: CurrentUser = Depends(get_current_user)
):
    """
    Submit an OCR job for processing.

    By default, returns immediately with a job_id. Poll GET /jobs/{job_id} for result.

    Use ?sync=true for synchronous processing (blocks until complete).
    This is for backwards compatibility but not recommended for production.

    Args:
        file: Image or PDF file (max 10MB)
        engine: OCR engine choice (tesseract, doctr, doctr_plus, paddleocr)
        sync: If true, process synchronously (legacy mode)
        current_user: Authenticated user; the username is stored on the job

    Returns:
        OCRJobSubmitResponse with job_id, queue_position, estimated_wait

    Raises:
        HTTPException: 400 if the content type is not JPG/PNG/PDF or the file
            exceeds 10MB; 500 if the job could not be created
    """
    allowed_types = ('image/jpeg', 'image/png', 'application/pdf')
    max_bytes = 10 * 1024 * 1024

    if file.content_type not in allowed_types:
        raise HTTPException(
            status_code=400,
            detail=f"File type not supported: {file.content_type}. Allowed: JPG, PNG, PDF"
        )

    # Read at most one byte past the limit: that is enough to detect an
    # oversized upload without pulling an arbitrarily large file into memory.
    content = await file.read(max_bytes + 1)

    # Check file size (10MB limit)
    if len(content) > max_bytes:
        raise HTTPException(
            status_code=400,
            detail="File too large. Maximum size is 10MB."
        )

    # Sync mode - use legacy processing (blocks)
    if sync:
        return await _process_sync(content, file, engine, current_user)

    # Async mode - create job and return immediately
    try:
        job = await job_queue.create_job(
            file_bytes=content,
            mime_type=file.content_type,
            engine=engine.value,
            username=current_user.username,
            original_filename=file.filename
        )

        # A falsy position (None or 0) is reported as position 1
        queue_position = await job_queue.get_queue_position(job.id)
        estimated_wait = estimate_wait_time(queue_position or 1)

        return OCRJobSubmitResponse(
            job_id=job.id,
            status=OCRJobStatus.pending,
            queue_position=queue_position or 1,
            estimated_wait_seconds=estimated_wait,
            created_at=job.created_at or datetime.utcnow()
        )

    except Exception as e:
        # Keep the original exception as __cause__ so server logs show the
        # real failure behind the generic 500.
        raise HTTPException(
            status_code=500,
            detail=f"Failed to create OCR job: {str(e)}"
        ) from e


@router.get("/jobs/{job_id}", response_model=OCRJobResponse)
async def get_job_status(
    job_id: str,
    session: AsyncSession = Depends(get_session),
    current_user: CurrentUser = Depends(get_current_user)
):
    """
    Return the current state of an OCR job without waiting.

    Clients that poll repeatedly should prefer GET /jobs/{job_id}/wait
    (long-polling).

    Args:
        job_id: Job UUID from POST /extract response
        session: Database session, passed on to the fuzzy CUI matching step
        current_user: Authenticated user (dependency)

    Returns:
        OCRJobResponse with status, queue_position, and result (if completed)

    Raises:
        HTTPException: 404 when the queue has no job with this id
    """
    job = await job_queue.get_job(job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    # Queue metrics are only filled in for jobs that have not finished yet
    position = None
    eta_seconds = None
    if job.status == JobStatus.pending:
        position = await job_queue.get_queue_position(job_id)
        eta_seconds = estimate_wait_time(position or 1)
    elif job.status == JobStatus.processing:
        position = 0
        # Rough estimate: half of an average run is still to go
        average = await job_queue.get_average_processing_time()
        eta_seconds = int(average * 0.5)

    # Only a completed job with a stored result yields extraction data
    result_data = None
    if job.status == JobStatus.completed and job.result:
        converted = _dict_to_extraction_data(job.result)
        result_data = await _apply_fuzzy_cui_matching(converted, session)
        # Debug: log suggested_payment_mode being returned
        print(f"[OCR Router] Returning job {job_id} with suggested_payment_mode={result_data.suggested_payment_mode}", flush=True)

    response_status = OCRJobStatus(job.status.value)
    created = job.created_at or datetime.utcnow()

    return OCRJobResponse(
        job_id=job.id,
        status=response_status,
        queue_position=position,
        estimated_wait_seconds=eta_seconds,
        created_at=created,
        started_at=job.started_at,
        completed_at=job.completed_at,
        queue_wait_ms=job.queue_wait_ms,
        ocr_time_ms=job.ocr_time_ms,
        processing_time_ms=job.processing_time_ms,
        result=result_data,
        error=job.error_message
    )


@router.get("/jobs/{job_id}/wait", response_model=OCRJobResponse)
async def wait_for_job_status(
    job_id: str,
    response: Response,
    timeout: int = Query(default=30, ge=1, le=60, description="Max wait time in seconds"),
    wait_for_terminal: bool = Query(default=False, description="If true, only return on completed/failed"),
    _t: Optional[int] = Query(default=None, description="Cache-busting timestamp (ignored)"),
    session: AsyncSession = Depends(get_session),
    current_user: CurrentUser = Depends(get_current_user)
):
    """
    Long-poll for OCR job status change.

    Waits until:
    - Job status changes (default behavior - returns on any status change)
    - Job reaches terminal state (if wait_for_terminal=true)
    - Timeout expires (returns current status)

    Recommended client timeout: timeout + 5 seconds

    Args:
        job_id: Job UUID from POST /extract response
        response: Outgoing response; used only to set no-cache headers
        timeout: Max wait time in seconds (1-60, default 30)
        wait_for_terminal: If true, wait until completed/failed only
        _t: Cache-busting value sent by clients; never read
        session: Database session forwarded to get_job_status
        current_user: Authenticated user forwarded to get_job_status

    Returns:
        OCRJobResponse with status, queue_position, and result (if completed)

    Raises:
        HTTPException: 404 if the job does not exist (or disappears while waiting)
    """
    import asyncio
    import time

    # Prevent caching - critical for long-polling
    response.headers["Cache-Control"] = "no-store, no-cache, must-revalidate, max-age=0"
    response.headers["Pragma"] = "no-cache"
    response.headers["Expires"] = "0"

    # Interval between internal status checks
    poll_interval = 0.5

    # Use the monotonic clock for the deadline: wall-clock time can jump
    # (NTP sync, manual changes), which would shorten or extend the wait.
    start_time = time.monotonic()
    deadline = start_time + timeout
    last_status = None
    iteration = 0

    print(f"[OCR Wait] Starting long-poll for job {job_id}, timeout={timeout}s, wait_for_terminal={wait_for_terminal}", flush=True)

    while time.monotonic() < deadline:
        iteration += 1
        job = await job_queue.get_job(job_id)

        if not job:
            print(f"[OCR Wait] Job {job_id} not found after {iteration} iterations", flush=True)
            raise HTTPException(status_code=404, detail="Job not found")

        # Return immediately if job completed or failed (terminal states)
        if job.status in [JobStatus.completed, JobStatus.failed]:
            elapsed = time.monotonic() - start_time
            print(f"[OCR Wait] Job {job_id} {job.status.value} after {elapsed:.1f}s ({iteration} iterations)", flush=True)
            return await get_job_status(job_id, session, current_user)

        # Return on status change (unless wait_for_terminal is set).
        # The first iteration only records the baseline status.
        if not wait_for_terminal and last_status is not None and job.status != last_status:
            elapsed = time.monotonic() - start_time
            print(f"[OCR Wait] Job {job_id} status changed {last_status.value}->{job.status.value} after {elapsed:.1f}s", flush=True)
            return await get_job_status(job_id, session, current_user)

        last_status = job.status

        # Sleep until the next check, but never past the deadline so the
        # requested timeout is not overshot by a full polling interval.
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        await asyncio.sleep(min(poll_interval, remaining))

    # Timeout - return current status
    elapsed = time.monotonic() - start_time
    print(f"[OCR Wait] Job {job_id} timeout after {elapsed:.1f}s ({iteration} iterations), status={last_status.value if last_status else 'unknown'}", flush=True)
    return await get_job_status(job_id, session, current_user)


@router.get("/queue/status", response_model=OCRQueueStatusResponse)
async def get_queue_status(
    current_user: CurrentUser = Depends(get_current_user)
):
    """
    Report how busy the OCR queue currently is.

    Returns:
        OCRQueueStatusResponse with the pending and processing job counts
        and the average processing time in seconds
    """
    queue_stats = await job_queue.get_queue_stats()

    # Map the queue's stat keys onto the response field names
    pending_count = queue_stats["pending"]
    processing_count = queue_stats["processing"]
    average_seconds = queue_stats["average_time_seconds"]

    return OCRQueueStatusResponse(
        pending_jobs=pending_count,
        processing_jobs=processing_count,
        average_time_seconds=average_seconds,
    )


# ============================================================================
# Legacy Endpoints (backwards compatibility)
# ============================================================================

@router.get("/status", response_model=OCRStatusResponse)
async def get_ocr_status():
    """Report whether any OCR engine is available, and which ones."""
    engines = OCREngine.get_available_engines()
    has_engine = len(engines) > 0

    # The message either lists the usable engines or explains what to install
    status_message = (
        f"OCR service ready with engines: {', '.join(engines)}"
        if has_engine
        else "No OCR engines available. Install PaddleOCR or Tesseract."
    )

    return OCRStatusResponse(available=has_engine, engines=engines, message=status_message)


@router.get("/engines")
async def get_available_engines():
    """
    Describe which OCR engines are enabled according to environment variables.

    Tesseract and PaddleOCR are switched by OCR_ENABLE_TESSERACT and
    OCR_ENABLE_PADDLEOCR (enabled unless set to something other than "true",
    case-insensitive); doctr and doctr_plus are always reported as enabled.
    The default mode is read from OCR_DEFAULT_ENGINE ("doctr_plus" if unset).

    Returns:
        Dict with "engines" (name -> enabled flag), "available_modes"
        (enabled engine names), "default_mode" and "memory_estimate_mb".
        Frontend should use this to filter the engine selection dropdown.
    """
    def env_flag(variable: str) -> bool:
        # Unset variables count as enabled; only the literal "true" enables
        return os.getenv(variable, "true").lower() == "true"

    paddle_on = env_flag("OCR_ENABLE_PADDLEOCR")
    tesseract_on = env_flag("OCR_ENABLE_TESSERACT")

    # Insertion order here is also the order of the reported modes
    engine_flags = {
        "tesseract": tesseract_on,
        "doctr": True,  # Always available (primary engine)
        "doctr_plus": True,  # Always available (recommended)
        "paddleocr": paddle_on,
    }
    enabled_modes = [name for name, enabled in engine_flags.items() if enabled]

    memory_estimates = {
        "tesseract": 50,
        "doctr": 600,
        "doctr_plus": 600,
        "paddleocr": 800,
    }

    return {
        "engines": engine_flags,
        "available_modes": enabled_modes,
        "default_mode": os.getenv("OCR_DEFAULT_ENGINE", "doctr_plus"),
        "memory_estimate_mb": memory_estimates,
    }


@router.post("/extract-attachment/{attachment_id}", response_model=OCRResponse)
async def extract_from_attachment(
    attachment_id: int,
    engine: OCREngineChoice = Query(default=OCREngineChoice.doctr_plus),
    session: AsyncSession = Depends(get_session),
    current_user: CurrentUser = Depends(get_current_user)
):
    """
    Run OCR on a file that was uploaded earlier as an attachment.

    Processing is always synchronous: the request blocks until OCR finishes.

    NOTE(review): `engine` is accepted but not passed on to
    ocr_service.process_image in this function - confirm whether that is intended.

    Args:
        attachment_id: Id of the stored attachment to re-process
        engine: Requested OCR engine (see note above)
        session: Database session for the attachment lookup and CUI matching
        current_user: Authenticated user (dependency)

    Returns:
        OCRResponse with the extracted data

    Raises:
        HTTPException: 404 if the attachment record or its file is missing,
            400 if the mime type is not JPG/PNG/PDF, 422 if OCR reports failure
    """
    record = await AttachmentCRUD.get_by_id(session, attachment_id)
    if not record:
        raise HTTPException(status_code=404, detail="Attachment not found")

    stored_file = AttachmentCRUD.get_file_path(record)
    if not stored_file.exists():
        raise HTTPException(status_code=404, detail="File not found on disk")

    # Only images and PDFs can be sent through OCR
    ocr_mime_types = ('image/jpeg', 'image/png', 'application/pdf')
    if record.mime_type not in ocr_mime_types:
        raise HTTPException(
            status_code=400,
            detail=f"File type not supported for OCR: {record.mime_type}"
        )

    # TODO: Could use job queue here too, but keeping sync for now
    success, message, result = await ocr_service.process_image(
        stored_file, record.mime_type
    )
    if not success:
        raise HTTPException(status_code=422, detail=message)

    # Convert to the API schema, then apply fuzzy CUI matching
    extracted = await _apply_fuzzy_cui_matching(
        _result_to_extraction_data(result), session
    )
    return OCRResponse(success=True, message=message, data=extracted)


# ============================================================================
# Helper Functions
# ============================================================================

async def _apply_fuzzy_cui_matching(
    extraction_data: ExtractionData,
    session: AsyncSession
) -> ExtractionData:
    """
    Try to correct the extracted CUI by fuzzy-matching against known suppliers.

    Matching is attempted ONLY when the CUI is missing or fails its checksum.
    A CUI with at least 6 digits and a valid checksum is trusted as-is.
    Any error raised during matching is printed and otherwise ignored, so the
    input is returned unchanged in that case.

    Args:
        extraction_data: ExtractionData whose CUI may be corrected in place
        session: AsyncSession for database lookups

    Returns:
        The same ExtractionData object, with `cui` (and `partner_name`, if it
        was empty) updated when a different CUI was matched
    """
    from backend.modules.data_entry.services.ocr.validation import CUIChecksumRule

    # Neither a CUI nor a vendor name: there is nothing to match on
    if not (extraction_data.cui or extraction_data.partner_name):
        return extraction_data

    # A CUI that passes the checksum is trusted and left untouched
    if extraction_data.cui:
        digits = CUIChecksumRule.extract_digits(extraction_data.cui)
        checksum_ok = len(digits) >= 6 and CUIChecksumRule.validate_checksum(digits)
        if checksum_ok:
            print(f"[Fuzzy Match] CUI {extraction_data.cui} has valid checksum, skipping fuzzy match", flush=True)
            return extraction_data

    # CUI missing or invalid checksum - try fuzzy matching (best effort)
    try:
        match = await OCRValidationEngine.fuzzy_match_supplier(
            cui=extraction_data.cui,
            vendor_name=extraction_data.partner_name,
            db_session=session
        )
        if not match:
            return extraction_data

        corrected_cui, supplier_name = match
        if corrected_cui == extraction_data.cui:
            return extraction_data

        print(f"[Fuzzy Match] Corrected: {extraction_data.cui} -> {corrected_cui} ({supplier_name})", flush=True)
        extraction_data.cui = corrected_cui
        # Fill in the vendor name only when OCR did not find one
        if not extraction_data.partner_name:
            extraction_data.partner_name = supplier_name
    except Exception as exc:
        print(f"[Fuzzy Match] Error: {exc}", flush=True)

    return extraction_data


async def _process_sync(
    content: bytes,
    file: UploadFile,
    engine: OCREngineChoice,
    current_user: CurrentUser
) -> OCRJobSubmitResponse:
    """
    Process OCR synchronously (legacy mode).

    Writes the upload to a temporary file, runs ocr_service.process_image on
    it and, on success, returns a synthetic OCRJobSubmitResponse marked as
    completed so the endpoint keeps a single response shape.

    NOTE(review): the extraction result is computed but NOT included in the
    returned response, and `engine` / `current_user` are not used here.
    Confirm how sync clients are expected to obtain the extracted data.

    Args:
        content: Raw bytes of the uploaded file
        file: The upload; supplies the filename (for the suffix) and content type
        engine: Requested OCR engine (currently unused, see note)
        current_user: Authenticated user (currently unused, see note)

    Returns:
        OCRJobSubmitResponse with a "sync-" prefixed id and status completed

    Raises:
        HTTPException: 422 if the OCR service reports failure
    """
    import hashlib

    # Get file extension; anything unexpected falls back to .jpg
    suffix = Path(file.filename).suffix.lower() if file.filename else '.jpg'
    if suffix not in ['.jpg', '.jpeg', '.png', '.pdf']:
        suffix = '.jpg'

    # delete=False: the file must outlive the handle so OCR can open it by
    # path; it is removed explicitly in the finally block below.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    tmp_path = Path(tmp.name)

    try:
        # The write happens inside the try so a failed write (e.g. disk
        # full) still cleans up the temp file.
        with tmp:
            tmp.write(content)

        success, message, _result = await ocr_service.process_image(
            tmp_path, file.content_type
        )

        if not success:
            raise HTTPException(status_code=422, detail=message)

        # Create a fake job response to maintain API compatibility.
        # The id is derived from a SHA-256 digest of the content: the built-in
        # hash() is salted per process and may be negative, so it is neither
        # stable across workers/restarts nor a clean identifier.
        content_digest = hashlib.sha256(content).hexdigest()[:16]

        return OCRJobSubmitResponse(
            job_id="sync-" + content_digest,
            status=OCRJobStatus.completed,
            queue_position=0,
            estimated_wait_seconds=0,
            created_at=datetime.utcnow()
        )

    finally:
        # Clean up temp file; it may already be gone, which is fine
        try:
            os.unlink(tmp_path)
        except FileNotFoundError:
            pass


def _result_to_extraction_data(result) -> ExtractionData:
    """Build the ExtractionData API schema from an ExtractionResult object."""
    # VAT entries arrive as dicts; wrap each one in a TvaEntry
    tva_rows = []
    for entry in result.tva_entries or []:
        tva_rows.append(
            TvaEntry(code=entry.get('code'), percent=entry['percent'], amount=entry['amount'])
        )

    # Payment methods arrive as dicts; amounts are normalised to Decimal
    payments = []
    for payment in result.payment_methods or []:
        payments.append(
            PaymentMethod(method=payment['method'], amount=Decimal(str(payment['amount'])))
        )

    # Any card payment suggests the 'banca' payment mode; otherwise no hint
    mode_hint = 'banca' if any(p.method == 'CARD' for p in payments) else None

    # Attributes copied to the schema unchanged, under the same name
    copied_names = (
        'receipt_type', 'receipt_number', 'receipt_series', 'receipt_date',
        'amount', 'partner_name', 'cui', 'description', 'tva_total',
        'address', 'items_count',
        'client_name', 'client_cui', 'client_address',
        'confidence_amount', 'confidence_date', 'confidence_vendor',
        'overall_confidence',
        'raw_text', 'ocr_engine', 'processing_time_ms',
        'needs_manual_review', 'validation_warnings', 'validation_errors',
        'inter_ocr_ratios',
    )
    copied = {name: getattr(result, name) for name in copied_names}

    return ExtractionData(
        tva_entries=tva_rows,
        payment_methods=payments,
        suggested_payment_mode=mode_hint,
        # These two attributes may be absent on the result, hence the defaults
        confidence_client=getattr(result, 'confidence_client', 0.0),
        raw_texts=getattr(result, 'raw_texts', []),
        **copied,
    )


def _dict_to_extraction_data(data: dict) -> ExtractionData:
    """
    Convert result dict (from job queue) to ExtractionData schema.

    Inside VAT and payment entries, a value that is explicitly None is
    treated like a missing key and replaced by the same default (0 for
    amounts and percent, 'NUMERAR' for the payment method). Without this,
    `Decimal(str(None))` raises decimal.InvalidOperation. Top-level fields
    are passed through as found, apart from the date and Decimal conversions.

    Args:
        data: Result dict as stored on the job

    Returns:
        ExtractionData built from the dict; non-dict items inside
        `tva_entries` / `payment_methods` are skipped
    """
    from datetime import date

    def _or_default(value, default):
        # dict.get(key, default) only covers a missing key, not a null value
        return default if value is None else value

    # Parse date if string; an unparseable date becomes None
    receipt_date = data.get('receipt_date')
    if isinstance(receipt_date, str):
        try:
            receipt_date = date.fromisoformat(receipt_date)
        except (ValueError, TypeError):
            receipt_date = None

    # Convert tva_entries (the key may be missing or None)
    tva_entries_schema = [
        TvaEntry(
            code=e.get('code'),
            percent=_or_default(e.get('percent'), 0),
            amount=Decimal(str(_or_default(e.get('amount'), 0)))
        )
        for e in (data.get('tva_entries') or [])
        if isinstance(e, dict)
    ]

    # Convert payment_methods (the key may be missing or None)
    payment_methods_list = [
        PaymentMethod(
            method=_or_default(pm.get('method'), 'NUMERAR'),
            amount=Decimal(str(_or_default(pm.get('amount'), 0)))
        )
        for pm in (data.get('payment_methods') or [])
        if isinstance(pm, dict)
    ]

    # Convert amount and tva_total to Decimal, keeping None as None
    amount = data.get('amount')
    if amount is not None:
        amount = Decimal(str(amount))

    tva_total = data.get('tva_total')
    if tva_total is not None:
        tva_total = Decimal(str(tva_total))

    return ExtractionData(
        receipt_type=data.get('receipt_type', 'bon_fiscal'),
        receipt_number=data.get('receipt_number'),
        receipt_series=data.get('receipt_series'),
        receipt_date=receipt_date,
        amount=amount,
        partner_name=data.get('partner_name'),
        cui=data.get('cui'),
        description=data.get('description'),
        tva_entries=tva_entries_schema,
        tva_total=tva_total,
        address=data.get('address'),
        items_count=data.get('items_count'),
        payment_methods=payment_methods_list,
        suggested_payment_mode=data.get('suggested_payment_mode'),
        client_name=data.get('client_name'),
        client_cui=data.get('client_cui'),
        client_address=data.get('client_address'),
        confidence_amount=data.get('confidence_amount', 0.0),
        confidence_date=data.get('confidence_date', 0.0),
        confidence_vendor=data.get('confidence_vendor', 0.0),
        confidence_client=data.get('confidence_client', 0.0),
        overall_confidence=data.get('overall_confidence', 0.0),
        raw_text=data.get('raw_text', ''),
        raw_texts=data.get('raw_texts', []),
        ocr_engine=data.get('ocr_engine', ''),
        processing_time_ms=data.get('processing_time_ms', 0),
        needs_manual_review=data.get('needs_manual_review'),
        validation_warnings=data.get('validation_warnings', []),
        validation_errors=data.get('validation_errors', []),
        inter_ocr_ratios=data.get('inter_ocr_ratios', {}),
    )