feat(ocr): Add docTR OCR engine with metrics infrastructure

Add docTR as primary OCR engine with 2-tier sequential processing, OCR metrics tracking, and simplified engine selection. Features: - docTR OCR engine with light+medium preprocessing tiers - doctr_plus mode with early exit optimization (~65% fast path) - OCR metrics dashboard with per-engine statistics - User OCR preference persistence - Parallel worker pool for OCR processing - Cross-validation for extraction quality Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-02 05:37:16 +02:00
parent 74f7aefc26
commit 495790411f
75 changed files with 23349 additions and 1311 deletions
--- a/backend/modules/data_entry/services/ocr/job_worker.py
+++ b/backend/modules/data_entry/services/ocr/job_worker.py
@@ -2,7 +2,7 @@
 OCR Job Worker - Background Task for Queue Processing

 Runs as an asyncio background task in FastAPI.
-Continuously polls the job queue and processes OCR requests.
+Continuously polls the job queue and processes OCR requests IN PARALLEL.

 Architecture:
    FastAPI startup
@@ -12,18 +12,19 @@ Architecture:
    asyncio.create_task(_job_worker_loop())
        ↓
    while True:
-        job = job_queue.get_next_pending()
-        if job:
-            result = ocr_worker_pool.submit_task(...)
-            job_queue.update_status(...)
-        await asyncio.sleep(0.5)
+        # Process up to OCR_WORKERS jobs concurrently
+        jobs = get_pending_jobs(limit=available_slots)
+        for job in jobs:
+            asyncio.create_task(_process_job(job))
+        await asyncio.sleep(0.1)
 """

 import asyncio
 import logging
+import os
 import time
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Set

 from .job_queue import job_queue, OCRJobStatus, OCRJob
 from .ocr_worker_pool import ocr_worker_pool
@@ -34,47 +35,76 @@ logger = logging.getLogger(__name__)
 _job_worker_task: Optional[asyncio.Task] = None
 _cleanup_task: Optional[asyncio.Task] = None
 _shutdown_event: Optional[asyncio.Event] = None
+_active_tasks: Set[asyncio.Task] = set()  # Track active job tasks
+_concurrency_semaphore: Optional[asyncio.Semaphore] = None  # Limit concurrent jobs

 # Configuration
-POLL_INTERVAL_SECONDS = 0.5  # How often to check for new jobs
+POLL_INTERVAL_SECONDS = 0.1  # How often to check for new jobs (faster for parallel)
 CLEANUP_INTERVAL_SECONDS = 3600  # Clean expired jobs every hour
 OCR_TIMEOUT_SECONDS = 120  # Max time for OCR processing


 async def _job_worker_loop() -> None:
    """
-    Main worker loop - processes jobs from queue.
+    Main worker loop - processes jobs from queue IN PARALLEL.

-    Runs continuously until shutdown. Polls queue every 0.5s
-    and submits jobs to worker pool for processing.
+    Runs continuously until shutdown. Uses semaphore to limit
+    concurrent jobs to OCR_WORKERS count. Launches jobs as
+    background tasks without waiting for completion.
    """
-    global _shutdown_event
+    global _shutdown_event, _active_tasks, _concurrency_semaphore

-    logger.info("[JobWorker] Starting worker loop...")
+    # Get max concurrent jobs from env (matches worker pool size)
+    max_concurrent = int(os.getenv('OCR_WORKERS', '2'))
+    _concurrency_semaphore = asyncio.Semaphore(max_concurrent)
+    _active_tasks = set()
+
+    logger.info(f"[JobWorker] Starting PARALLEL worker loop (max_concurrent={max_concurrent})...")
    _shutdown_event = asyncio.Event()

    consecutive_errors = 0
-    max_consecutive_errors = 5
+    max_consecutive_errors = 10

    while not _shutdown_event.is_set():
        try:
-            # Get next pending job
-            job = await job_queue.get_next_pending()
-
-            if job:
-                consecutive_errors = 0  # Reset error counter on success
-                await _process_job(job)
-            else:
-                # No jobs - wait before polling again
+            # Clean up completed tasks
+            done_tasks = {t for t in _active_tasks if t.done()}
+            for task in done_tasks:
+                _active_tasks.discard(task)
+                # Check for exceptions
                try:
-                    await asyncio.wait_for(
-                        _shutdown_event.wait(),
-                        timeout=POLL_INTERVAL_SECONDS
-                    )
-                    if _shutdown_event.is_set():
-                        break
-                except asyncio.TimeoutError:
-                    pass  # Normal timeout, continue loop
+                    task.result()
+                except Exception as e:
+                    logger.error(f"[JobWorker] Task failed: {e}")
+
+            # Check if we have capacity for more jobs
+            active_count = len(_active_tasks)
+            available_slots = max_concurrent - active_count
+
+            if available_slots > 0:
+                # Get next pending job
+                job = await job_queue.get_next_pending()
+
+                if job:
+                    consecutive_errors = 0
+                    # Launch job processing as background task
+                    task = asyncio.create_task(_process_job_with_semaphore(job))
+                    _active_tasks.add(task)
+                    logger.debug(f"[JobWorker] Launched job {job.id} (active={len(_active_tasks)}/{max_concurrent})")
+                else:
+                    # No pending jobs - wait briefly
+                    try:
+                        await asyncio.wait_for(
+                            _shutdown_event.wait(),
+                            timeout=POLL_INTERVAL_SECONDS
+                        )
+                        if _shutdown_event.is_set():
+                            break
+                    except asyncio.TimeoutError:
+                        pass
+            else:
+                # At capacity - wait for a slot to free up
+                await asyncio.sleep(POLL_INTERVAL_SECONDS)

        except asyncio.CancelledError:
            logger.info("[JobWorker] Worker loop cancelled")
@@ -88,27 +118,46 @@ async def _job_worker_loop() -> None:
                logger.error("[JobWorker] Too many consecutive errors, stopping worker")
                break

-            # Backoff on errors
            await asyncio.sleep(min(consecutive_errors * 2, 30))

+    # Wait for active tasks to complete on shutdown
+    if _active_tasks:
+        logger.info(f"[JobWorker] Waiting for {len(_active_tasks)} active tasks to complete...")
+        await asyncio.gather(*_active_tasks, return_exceptions=True)
+
    logger.info("[JobWorker] Worker loop stopped")


+async def _process_job_with_semaphore(job: OCRJob) -> None:
+    """
+    Process job with semaphore to limit concurrency.
+
+    Acquires semaphore before processing, releases after.
+    This ensures we don't exceed OCR_WORKERS concurrent jobs.
+    """
+    global _concurrency_semaphore
+
+    async with _concurrency_semaphore:
+        await _process_job(job)
+
+
 async def _process_job(job: OCRJob) -> None:
    """
    Process a single OCR job.

-    Reads file, submits to worker pool, updates job status.
+    Reads file, submits to worker pool, updates job status,
+    and saves metrics for analytics.

    Args:
        job: OCRJob to process
    """
    logger.info(f"[JobWorker] Processing job {job.id}: engine={job.engine}, file={Path(job.file_path).name}")
    start_time = time.time()
+    file_size = 0
+    file_type = "image/jpeg"

    try:
-        # Mark as processing
-        await job_queue.update_status(job.id, OCRJobStatus.processing)
+        # Note: Job already marked as 'processing' atomically in get_next_pending()

        # Read file bytes
        file_path = Path(job.file_path)
@@ -118,6 +167,10 @@ async def _process_job(job: OCRJob) -> None:
        with open(file_path, 'rb') as f:
            file_bytes = f.read()

+        file_size = len(file_bytes)
+        # Determine file type from job or extension
+        file_type = getattr(job, 'mime_type', 'image/jpeg') or 'image/jpeg'
+
        # Submit to worker pool
        result = await ocr_worker_pool.submit_task(
            image_bytes=file_bytes,
@@ -132,14 +185,43 @@ async def _process_job(job: OCRJob) -> None:
            # Job completed successfully
            extraction = result.get("extraction", {})

+            # Include raw_texts for analysis (from all OCR engine passes)
+            extraction['raw_texts'] = result.get("raw_texts", [])
+
+            # Extract actual OCR processing time from extraction result
+            ocr_time_ms = extraction.get('processing_time_ms', 0)
+
+            # Debug: log suggested_payment_mode
+            spm = extraction.get('suggested_payment_mode')
+            logger.info(f"[JobWorker] Job {job.id} extraction has suggested_payment_mode={spm}")
+
            await job_queue.update_status(
                job_id=job.id,
                status=OCRJobStatus.completed,
                result=extraction,
-                processing_time_ms=elapsed_ms
+                processing_time_ms=elapsed_ms,
+                ocr_time_ms=ocr_time_ms
            )

-            logger.info(f"[JobWorker] Job {job.id} completed in {elapsed_ms}ms")
+            logger.info(f"[JobWorker] Job {job.id} completed in {elapsed_ms}ms (ocr: {ocr_time_ms}ms)")
+
+            # Save metrics for successful job
+            await _save_job_metrics(
+                job_id=job.id,
+                username=job.created_by or 'unknown',
+                engine_requested=job.engine,
+                engine_used=extraction.get('ocr_engine', job.engine),
+                processing_time_ms=elapsed_ms,
+                file_size_bytes=file_size,
+                file_type=file_type,
+                original_filename=job.original_filename,
+                success=True,
+                overall_confidence=extraction.get('overall_confidence', 0.0),
+                fields_extracted=_count_extracted_fields(extraction),
+                needs_manual_review=extraction.get('needs_manual_review'),
+                validation_warnings_count=len(extraction.get('validation_warnings', [])),
+                validation_errors_count=len(extraction.get('validation_errors', [])),
+            )

        else:
            # Job failed
@@ -154,6 +236,20 @@ async def _process_job(job: OCRJob) -> None:

            logger.warning(f"[JobWorker] Job {job.id} failed after {elapsed_ms}ms: {error_msg}")

+            # Save metrics for failed job
+            await _save_job_metrics(
+                job_id=job.id,
+                username=job.created_by or 'unknown',
+                engine_requested=job.engine,
+                engine_used=job.engine,
+                processing_time_ms=elapsed_ms,
+                file_size_bytes=file_size,
+                file_type=file_type,
+                original_filename=job.original_filename,
+                success=False,
+                error_message=error_msg,
+            )
+
    except Exception as e:
        elapsed_ms = int((time.time() - start_time) * 1000)

@@ -166,6 +262,20 @@ async def _process_job(job: OCRJob) -> None:
            processing_time_ms=elapsed_ms
        )

+        # Save metrics for error job
+        await _save_job_metrics(
+            job_id=job.id,
+            username=job.created_by or 'unknown',
+            engine_requested=job.engine,
+            engine_used=job.engine,
+            processing_time_ms=elapsed_ms,
+            file_size_bytes=file_size,
+            file_type=file_type,
+            original_filename=job.original_filename,
+            success=False,
+            error_message=str(e),
+        )
+
    finally:
        # Cleanup file after processing
        try:
@@ -340,3 +450,96 @@ def estimate_wait_time(queue_position: int) -> int:

    # Estimate: position * average_time
    return int(queue_position * avg_time)
+
+
+# ============================================================================
+# Metrics Helper Functions
+# ============================================================================
+
+async def _save_job_metrics(
+    job_id: str,
+    username: str,
+    engine_requested: str,
+    engine_used: str,
+    processing_time_ms: int = 0,
+    file_size_bytes: int = 0,
+    file_type: str = "image/jpeg",
+    original_filename: Optional[str] = None,
+    success: bool = True,
+    error_message: Optional[str] = None,
+    overall_confidence: float = 0.0,
+    fields_extracted: int = 0,
+    needs_manual_review: Optional[bool] = None,
+    validation_warnings_count: int = 0,
+    validation_errors_count: int = 0,
+) -> None:
+    """
+    Save OCR job metrics to database for analytics.
+
+    Called after each job completes (success or failure).
+    Errors are logged but don't affect job processing.
+    """
+    try:
+        from backend.modules.data_entry.db.database import get_db_session
+        from backend.modules.data_entry.db.crud.ocr_settings import OCRMetricsCRUD
+
+        async with await get_db_session() as session:
+            await OCRMetricsCRUD.create(
+                session=session,
+                job_id=job_id,
+                username=username,
+                engine_requested=engine_requested,
+                engine_used=engine_used,
+                processing_time_ms=processing_time_ms,
+                file_size_bytes=file_size_bytes,
+                file_type=file_type,
+                original_filename=original_filename,
+                success=success,
+                error_message=error_message,
+                overall_confidence=overall_confidence,
+                fields_extracted=fields_extracted,
+                needs_manual_review=needs_manual_review,
+                validation_warnings_count=validation_warnings_count,
+                validation_errors_count=validation_errors_count,
+            )
+            logger.debug(f"[JobWorker] Saved metrics for job {job_id}")
+
+    except Exception as e:
+        # Log but don't fail - metrics are nice-to-have
+        logger.warning(f"[JobWorker] Failed to save metrics for job {job_id}: {e}")
+
+
+def _count_extracted_fields(extraction: dict) -> int:
+    """
+    Count number of successfully extracted fields from OCR result.
+
+    Counts non-None values in key fields.
+    """
+    key_fields = [
+        'receipt_number',
+        'receipt_date',
+        'amount',
+        'partner_name',
+        'cui',
+        'tva_total',
+        'address',
+        'items_count',
+    ]
+
+    count = 0
+    for field in key_fields:
+        value = extraction.get(field)
+        if value is not None and value != '' and value != []:
+            count += 1
+
+    # Also count TVA entries if present
+    tva_entries = extraction.get('tva_entries', [])
+    if tva_entries and len(tva_entries) > 0:
+        count += 1
+
+    # Count payment methods if present
+    payment_methods = extraction.get('payment_methods', [])
+    if payment_methods and len(payment_methods) > 0:
+        count += 1
+
+    return count