feat(ocr): Add docTR OCR engine with metrics infrastructure

Add docTR as the primary OCR engine with 2-tier sequential processing, OCR metrics tracking, and simplified engine selection.

Features:
- docTR OCR engine with light + medium preprocessing tiers
- doctr_plus mode with early-exit optimization (~65% fast path)
- OCR metrics dashboard with per-engine statistics
- User OCR preference persistence
- Parallel worker pool for OCR processing
- Cross-validation for extraction quality

Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-02 05:37:16 +02:00
parent 74f7aefc26
commit 495790411f
75 changed files with 23349 additions and 1311 deletions
--- a/backend/modules/data_entry/services/ocr/job_queue.py
+++ b/backend/modules/data_entry/services/ocr/job_queue.py
@@ -13,13 +13,14 @@ Schema:
        status TEXT NOT NULL,      -- pending, processing, completed, failed
        file_path TEXT NOT NULL,   -- Path to uploaded file
        mime_type TEXT NOT NULL,
-        engine TEXT DEFAULT 'auto',
+        engine TEXT DEFAULT 'doctr_plus',
        created_at TIMESTAMP,
        started_at TIMESTAMP,
        completed_at TIMESTAMP,
        result_json TEXT,          -- JSON extraction result
        error_message TEXT,
-        processing_time_ms INTEGER,
+        processing_time_ms INTEGER,  -- Total job time (started_at to completed_at)
+        ocr_time_ms INTEGER,         -- Actual OCR engine processing time
        created_by TEXT,           -- Username
        original_filename TEXT,
        expires_at TIMESTAMP
@@ -74,17 +75,26 @@ class OCRJob:
    status: OCRJobStatus
    file_path: str
    mime_type: str
-    engine: str = "auto"
+    engine: str = "doctr_plus"
    created_at: Optional[datetime] = None
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    result_json: Optional[str] = None
    error_message: Optional[str] = None
-    processing_time_ms: Optional[int] = None
+    processing_time_ms: Optional[int] = None  # Total job time (started_at to completed_at)
+    ocr_time_ms: Optional[int] = None  # Actual OCR engine processing time
    created_by: Optional[str] = None
    original_filename: Optional[str] = None
    expires_at: Optional[datetime] = None

+    @property
+    def queue_wait_ms(self) -> Optional[int]:
+        """Calculate queue wait time (created_at to started_at)."""
+        if self.created_at and self.started_at:
+            delta = self.started_at - self.created_at
+            return int(delta.total_seconds() * 1000)
+        return None
+
    @property
    def result(self) -> Optional[Dict]:
        """Parse result_json to dict."""
@@ -143,19 +153,27 @@ class OCRJobQueue:
                    status TEXT NOT NULL DEFAULT 'pending',
                    file_path TEXT NOT NULL,
                    mime_type TEXT NOT NULL,
-                    engine TEXT DEFAULT 'auto',
+                    engine TEXT DEFAULT 'doctr_plus',
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    started_at TIMESTAMP,
                    completed_at TIMESTAMP,
                    result_json TEXT,
                    error_message TEXT,
                    processing_time_ms INTEGER,
+                    ocr_time_ms INTEGER,
                    created_by TEXT,
                    original_filename TEXT,
                    expires_at TIMESTAMP
                )
            ''')

+            # Migration: add ocr_time_ms column if it doesn't exist
+            try:
+                await db.execute('ALTER TABLE ocr_jobs ADD COLUMN ocr_time_ms INTEGER')
+                logger.info("[OCRJobQueue] Added ocr_time_ms column to existing table")
+            except Exception:
+                pass  # Column already exists
+
            # Index for efficient queue queries
            await db.execute('''
                CREATE INDEX IF NOT EXISTS idx_ocr_jobs_status
@@ -177,7 +195,7 @@ class OCRJobQueue:
        self,
        file_bytes: bytes,
        mime_type: str,
-        engine: str = "auto",
+        engine: str = "doctr_plus",
        username: Optional[str] = None,
        original_filename: Optional[str] = None
    ) -> OCRJob:
@@ -189,7 +207,7 @@ class OCRJobQueue:
        Args:
            file_bytes: Raw file bytes
            mime_type: MIME type of file
-            engine: OCR engine ('auto', 'paddleocr', 'tesseract')
+            engine: OCR engine ('tesseract', 'doctr', 'doctr_plus', 'paddleocr')
            username: Username of requester
            original_filename: Original filename from upload

@@ -301,24 +319,52 @@ class OCRJobQueue:

    async def get_next_pending(self) -> Optional[OCRJob]:
        """
-        Get the next pending job (oldest first).
+        Get the next pending job (oldest first) and atomically mark it as processing.
+
+        This prevents race conditions in parallel processing - only one worker
+        can claim each job.

        Returns:
            Next OCRJob to process or None if queue empty
        """
        await self.initialize()

-        async with aiosqlite.connect(str(self.db_path)) as db:
-            db.row_factory = aiosqlite.Row
-            async with db.execute('''
-                SELECT * FROM ocr_jobs
-                WHERE status = 'pending'
-                ORDER BY created_at ASC
-                LIMIT 1
-            ''') as cursor:
-                row = await cursor.fetchone()
-                if row:
-                    return self._row_to_job(row)
+        now = datetime.utcnow()
+
+        async with self._lock:  # Serialize access to prevent race conditions
+            async with aiosqlite.connect(str(self.db_path)) as db:
+                db.row_factory = aiosqlite.Row
+
+                # Get the next pending job
+                async with db.execute('''
+                    SELECT * FROM ocr_jobs
+                    WHERE status = 'pending'
+                    ORDER BY created_at ASC
+                    LIMIT 1
+                ''') as cursor:
+                    row = await cursor.fetchone()
+                    if not row:
+                        return None
+
+                    job_id = row['id']
+
+                # Atomically mark as processing
+                await db.execute('''
+                    UPDATE ocr_jobs
+                    SET status = 'processing', started_at = ?
+                    WHERE id = ? AND status = 'pending'
+                ''', (now.isoformat(), job_id))
+                await db.commit()
+
+                # Fetch the updated job
+                async with db.execute(
+                    'SELECT * FROM ocr_jobs WHERE id = ?',
+                    (job_id,)
+                ) as cursor:
+                    updated_row = await cursor.fetchone()
+                    if updated_row:
+                        return self._row_to_job(updated_row)
+
        return None

    async def update_status(
@@ -327,7 +373,8 @@ class OCRJobQueue:
        status: OCRJobStatus,
        result: Optional[Dict] = None,
        error: Optional[str] = None,
-        processing_time_ms: Optional[int] = None
+        processing_time_ms: Optional[int] = None,
+        ocr_time_ms: Optional[int] = None
    ) -> bool:
        """
        Update job status.
@@ -337,7 +384,8 @@ class OCRJobQueue:
            status: New status
            result: Extraction result dict (for completed)
            error: Error message (for failed)
-            processing_time_ms: Processing time
+            processing_time_ms: Total job processing time (started_at to completed_at)
+            ocr_time_ms: Actual OCR engine processing time

        Returns:
            True if update successful
@@ -359,18 +407,18 @@ class OCRJobQueue:
        elif status == OCRJobStatus.completed:
            query = '''
                UPDATE ocr_jobs
-                SET status = ?, completed_at = ?, result_json = ?, processing_time_ms = ?
+                SET status = ?, completed_at = ?, result_json = ?, processing_time_ms = ?, ocr_time_ms = ?
                WHERE id = ?
            '''
-            params = (status.value, now.isoformat(), result_json, processing_time_ms, job_id)
+            params = (status.value, now.isoformat(), result_json, processing_time_ms, ocr_time_ms, job_id)

        elif status == OCRJobStatus.failed:
            query = '''
                UPDATE ocr_jobs
-                SET status = ?, completed_at = ?, error_message = ?, processing_time_ms = ?
+                SET status = ?, completed_at = ?, error_message = ?, processing_time_ms = ?, ocr_time_ms = ?
                WHERE id = ?
            '''
-            params = (status.value, now.isoformat(), error, processing_time_ms, job_id)
+            params = (status.value, now.isoformat(), error, processing_time_ms, ocr_time_ms, job_id)

        else:
            query = 'UPDATE ocr_jobs SET status = ? WHERE id = ?'
@@ -542,13 +590,14 @@ class OCRJobQueue:
            status=OCRJobStatus(row['status']),
            file_path=row['file_path'],
            mime_type=row['mime_type'],
-            engine=row['engine'] or 'auto',
+            engine=row['engine'] or 'doctr_plus',
            created_at=parse_datetime(row['created_at']),
            started_at=parse_datetime(row['started_at']),
            completed_at=parse_datetime(row['completed_at']),
            result_json=row['result_json'],
            error_message=row['error_message'],
            processing_time_ms=row['processing_time_ms'],
+            ocr_time_ms=row['ocr_time_ms'] if 'ocr_time_ms' in row.keys() else None,
            created_by=row['created_by'],
            original_filename=row['original_filename'],
            expires_at=parse_datetime(row['expires_at']),