feat(ocr): Add docTR OCR engine with metrics infrastructure

Add docTR as primary OCR engine with 2-tier sequential processing, OCR metrics tracking, and simplified engine selection.

Features:
- docTR OCR engine with light+medium preprocessing tiers
- doctr_plus mode with early exit optimization (~65% fast path)
- OCR metrics dashboard with per-engine statistics
- User OCR preference persistence
- Parallel worker pool for OCR processing
- Cross-validation for extraction quality

Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-02 05:37:16 +02:00
parent 74f7aefc26
commit 495790411f
75 changed files with 23349 additions and 1311 deletions
--- a/backend/modules/data_entry/services/ocr_engine.py
+++ b/backend/modules/data_entry/services/ocr_engine.py
@@ -1,4 +1,4 @@
-"""OCR engine wrapper for PaddleOCR and Tesseract."""
+"""OCR engine wrapper for PaddleOCR, docTR, and Tesseract."""

 import os
 import logging
@@ -9,9 +9,8 @@ from typing import List, Optional, Tuple

 import numpy as np

-# Setup logging
+# Setup logging (respects LOG_LEVEL env var set in main.py)
 logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO)  # Ensure logs are visible

 # Disable PaddleOCR model source check for faster startup (PaddleX 3.x)
 os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
@@ -19,6 +18,7 @@ os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
 # Lazy imports - these will be imported on first use
 PaddleOCR = None  # Will be imported lazily
 pytesseract = None  # Will be imported lazily
+doctr_ocr_predictor = None  # Will be imported lazily

 # Check availability without importing heavy libraries
 def _check_paddle_available() -> bool:
@@ -37,8 +37,17 @@ def _check_tesseract_available() -> bool:
    except Exception:
        return False

+def _check_doctr_available() -> bool:
+    """Return True if the ``doctr`` package can be located, without importing it."""
+    try:
+        from importlib.util import find_spec
+        doctr_spec = find_spec("doctr")
+    except Exception:
+        # Any lookup failure is treated the same as "not installed".
+        return False
+    return doctr_spec is not None
+
 PADDLE_AVAILABLE = _check_paddle_available()
 TESSERACT_AVAILABLE = _check_tesseract_available()
+DOCTR_AVAILABLE = _check_doctr_available()


@dataclass
@@ -59,6 +68,11 @@ class OCREngine:
        self._paddle_ready = threading.Event()  # Signals when PaddleOCR is FULLY ready
        self._paddle_init_lock = threading.Lock()

+        self._doctr = None
+        self._doctr_init_started = False
+        self._doctr_ready = threading.Event()  # Signals when docTR is FULLY ready
+        self._doctr_init_lock = threading.Lock()
+
    def _init_paddle_lazy(self):
        """Lazy initialize PaddleOCR on first use (avoids slow startup)."""
        global PaddleOCR
@@ -94,6 +108,78 @@ class OCREngine:
        # Signal that initialization is complete (success or failure)
        self._paddle_ready.set()

+    def _init_doctr_lazy(self):
+        """Lazily import docTR and build the predictor on first use.
+
+        Only the first caller performs the initialization; any later caller
+        returns immediately without waiting. ``self._doctr_ready`` is set once
+        the attempt has finished, whether it succeeded or not, and
+        ``self._doctr`` is ``None`` after a failed attempt.
+        """
+        global doctr_ocr_predictor
+
+        # Claim the initialization under the lock so it runs at most once.
+        with self._doctr_init_lock:
+            if self._doctr_init_started:
+                return  # Already initializing or done
+            self._doctr_init_started = True
+
+        if not DOCTR_AVAILABLE:
+            # Nothing to load; release any waiters right away.
+            self._doctr_ready.set()
+            return
+
+        try:
+            print("Importing docTR (first use, may take ~10-15 seconds)...", flush=True)
+            from doctr.models import ocr_predictor
+
+            print("Initializing docTR engine (PyTorch backend)...", flush=True)
+            # Initialize docTR predictor with pretrained models
+            # Uses db_resnet50 for detection and crnn_vgg16_bn for recognition
+            self._doctr = ocr_predictor(
+                det_arch='db_resnet50',
+                reco_arch='crnn_vgg16_bn',
+                pretrained=True,
+                assume_straight_pages=True,
+                straighten_pages=False,
+                preserve_aspect_ratio=True,
+            )
+            doctr_ocr_predictor = self._doctr
+            print("docTR initialized successfully with PyTorch backend", flush=True)
+        except Exception as e:
+            print(f"Warning: Failed to initialize docTR: {e}", flush=True)
+            self._doctr = None
+        finally:
+            # Signal completion on every exit path (success, failure, or an
+            # error not caught above) so waiters never block for the full timeout.
+            self._doctr_ready.set()
+
+    def wait_for_doctr(self, timeout: float = 30.0) -> bool:
+        """
+        Block until docTR initialization has finished or the timeout expires.
+
+        Initialization is started from here if no one has started it yet.
+
+        Args:
+            timeout: Maximum number of seconds to wait on the ready event
+                (default 30s)
+
+        Returns:
+            True when a docTR predictor is loaded; False when docTR is not
+            installed, initialization failed, or the wait timed out
+        """
+        if not DOCTR_AVAILABLE:
+            return False
+        if self._doctr is not None:
+            return True  # Predictor already loaded, nothing to wait for
+
+        # Kick off initialization ourselves when nobody has yet.
+        if not self._doctr_init_started:
+            self._init_doctr_lazy()
+
+        print(f"[OCR] Waiting for docTR to be ready (max {timeout}s)...", flush=True)
+        began = time.time()
+        signalled = self._doctr_ready.wait(timeout=timeout)
+        elapsed = time.time() - began
+
+        # The event is also set after a failed init, so check the predictor too.
+        if not signalled or self._doctr is None:
+            print(f"[OCR] docTR not ready after {elapsed:.1f}s (timeout or failed)", flush=True)
+            return False
+
+        print(f"[OCR] docTR ready after {elapsed:.1f}s", flush=True)
+        return True
+
+    def is_doctr_ready(self) -> bool:
+        """Return True if a docTR predictor is currently loaded; never blocks."""
+        predictor = self._doctr
+        return predictor is not None
+
    def wait_for_paddle(self, timeout: float = 30.0) -> bool:
        """
        Wait for PaddleOCR to be fully initialized.
@@ -239,6 +325,84 @@ class OCREngine:
        logger.info(f"[Tesseract] Done: {len(text)} chars, conf: {avg_conf:.2%}")
        return OCRResult(text=text, confidence=avg_conf, boxes=[], engine="tesseract")

+    def _doctr_recognize(self, image: np.ndarray) -> OCRResult:
+        """Recognize text in ``image`` using docTR, falling back to Tesseract.
+
+        Args:
+            image: Input image array. 2-D arrays are treated as grayscale,
+                4-channel arrays as RGBA, and 3-channel arrays are always
+                assumed to be BGR (OpenCV order); each is converted to RGB
+                before being handed to docTR. The caller's array is not
+                rebound, so the Tesseract fallback sees the original input.
+
+        Returns:
+            An OCRResult whose text is the recognized lines joined by newlines,
+            whose confidence is the mean of the per-line mean word confidences,
+            and whose boxes hold one entry per recognized word. If docTR is not
+            ready or raises, and Tesseract is available, the Tesseract result
+            is returned instead.
+
+        Raises:
+            RuntimeError: docTR is not ready and Tesseract is not available.
+            Exception: the docTR error is re-raised when Tesseract is not
+                available.
+        """
+        # Wait for docTR to be fully ready
+        if not self.wait_for_doctr(timeout=30.0):
+            logger.warning("[docTR] Not ready, falling back to Tesseract")
+            if TESSERACT_AVAILABLE:
+                return self._tesseract_recognize(image)
+            raise RuntimeError("docTR not ready and Tesseract not available")
+
+        try:
+            logger.info(f"[docTR] Processing image, shape: {image.shape}")
+
+            # docTR requires RGB images. Convert into a separate variable so
+            # the fallback below still receives the unmodified input image.
+            import cv2
+            rgb_image = image
+            if len(image.shape) == 2:
+                # Convert grayscale to RGB
+                rgb_image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
+                logger.info(f"[docTR] Converted grayscale to RGB, new shape: {rgb_image.shape}")
+            elif image.shape[2] == 4:
+                # Convert RGBA to RGB
+                rgb_image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
+                logger.info(f"[docTR] Converted RGBA to RGB, new shape: {rgb_image.shape}")
+            elif image.shape[2] == 3:
+                # No detection is possible here: 3-channel input is assumed to
+                # be BGR (from OpenCV) and is always swapped to RGB.
+                rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+                logger.info(f"[docTR] Converted BGR to RGB, shape: {rgb_image.shape}")
+
+            # Process image with docTR
+            logger.info("[docTR] Running prediction...")
+
+            # docTR expects a document (list of pages as numpy arrays)
+            result = self._doctr([rgb_image])
+
+            if not result or not result.pages:
+                logger.warning("[docTR] No results returned")
+                return OCRResult(text="", confidence=0.0, boxes=[], engine="doctr")
+
+            # Extract text from all pages
+            all_texts = []
+            all_confidences = []
+            boxes = []
+
+            for page in result.pages:
+                for block in page.blocks:
+                    for line in block.lines:
+                        words = line.words
+                        line_text = ' '.join(word.value for word in words)
+                        # Guard against a line with no words (avoid dividing by zero)
+                        line_confidence = sum(w.confidence for w in words) / len(words) if words else 0.0
+                        all_texts.append(line_text)
+                        all_confidences.append(line_confidence)
+
+                        # Store word-level boxes
+                        for word in words:
+                            boxes.append({
+                                'text': word.value,
+                                'confidence': float(word.confidence),
+                                'box': word.geometry  # (xmin, ymin), (xmax, ymax)
+                            })
+
+            text_result = '\n'.join(all_texts)
+            avg_conf = sum(all_confidences) / len(all_confidences) if all_confidences else 0.0
+
+            logger.info(f"[docTR] SUCCESS - Found {len(all_texts)} text lines, avg confidence: {avg_conf:.2%}")
+            logger.debug(f"[docTR] Raw text preview: {text_result[:200]}...")
+
+            return OCRResult(
+                text=text_result,
+                confidence=float(avg_conf),
+                boxes=boxes,
+                engine="doctr"
+            )
+
+        except Exception as e:
+            logger.error(f"[docTR] ERROR: {e}, falling back to Tesseract")
+            if TESSERACT_AVAILABLE:
+                # Pass the original input, not the RGB-converted copy.
+                return self._tesseract_recognize(image)
+            raise
+
    def recognize_dual(self, image: np.ndarray) -> Tuple[OCRResult, Optional[OCRResult]]:
        """
        Run both OCR engines and return both results.
@@ -286,10 +450,27 @@ class OCREngine:

    @staticmethod
    def get_available_engines() -> List[str]:
-        """Return list of available OCR engines."""
+        """
+        Return list of available OCR engines.
+
+        Respects OCR_ENABLE_PADDLEOCR and OCR_ENABLE_TESSERACT from .env.
+        Engines that are disabled via .env are not returned even if installed.
+        docTR has no such switch in this method: 'doctr' and 'doctr_plus' are
+        both returned whenever docTR is installed.
+
+        Available engines: tesseract, doctr, doctr_plus, paddleocr
+        """
+        # Check .env settings
+        # Only the literal value "true" (any letter case) enables an engine;
+        # an unset variable defaults to enabled, any other value disables it.
+        paddle_enabled = os.getenv("OCR_ENABLE_PADDLEOCR", "true").lower() == "true"
+        tesseract_enabled = os.getenv("OCR_ENABLE_TESSERACT", "true").lower() == "true"
+
        engines = []
-        if PADDLE_AVAILABLE:
-            engines.append('paddleocr')
-        if TESSERACT_AVAILABLE:
+
+        # Base engines (only if installed AND enabled)
+        if TESSERACT_AVAILABLE and tesseract_enabled:
            engines.append('tesseract')
+        # docTR is gated on installation only; no environment flag is consulted.
+        if DOCTR_AVAILABLE:
+            engines.append('doctr')
+            engines.append('doctr_plus')  # docTR with 2-tier sequential + early exit
+        if PADDLE_AVAILABLE and paddle_enabled:
+            engines.append('paddleocr')
+
        return engines