feat(ocr): Implement persistent worker pool with SQLite job queue

Major OCR infrastructure improvements: - Add persistent SQLite-based job queue for OCR tasks - Implement worker pool with process isolation and auto-restart - Add OCR engine selector dropdown (Tesseract/PaddleOCR) in upload zone - Optimize Tesseract preprocessing based on benchmark results (8x faster) - Add recognize_cif_optimized() with multi-strategy CIF extraction - Add Romanian CIF checksum validation - Increase Telegram long polling timeout from 10s to 30s Squashed commits: - feat(ocr): Implement persistent worker pool with SQLite job queue - feat(ocr): Add OCR engine selector dropdown to upload zone - perf(telegram): Increase long polling timeout from 10s to 30s - perf(ocr): Optimize Tesseract preprocessing based on benchmark results 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 12:32:12 +02:00
parent 00f826f7ed
commit 74f7aefc26
23 changed files with 3616 additions and 209 deletions
--- a/backend/modules/data_entry/services/image_preprocessor.py
+++ b/backend/modules/data_entry/services/image_preprocessor.py
@@ -234,50 +234,76 @@ class ImagePreprocessor:

        return result

-    def preprocess_for_tesseract(self, image: np.ndarray) -> np.ndarray:
+    def preprocess_for_tesseract(self, image: np.ndarray, binarize: bool = False,
+                                   padding: int = 0, clahe_clip: float = 1.5) -> np.ndarray:
        """
-        Tesseract-optimized preprocessing.
-        Tesseract works best with:
-        - Clean black text on white background (binarized)
-        - High DPI (scale up small images)
-        - Otsu thresholding (better than adaptive for clean documents)
-        """
-        # 0. Add safety padding to protect edge content during deskew rotation
-        image = self._add_safety_padding(image)
+        Tesseract-optimized preprocessing (based on comprehensive benchmark).

-        # 1. Grayscale
+        BENCHMARK FINDINGS:
+        - DPI 200 is optimal (not 300!)
+        - Padding 40px fixes left margin truncation issues
+        - CLAHE 1.5 for most receipts, 2.0 for difficult ones
+        - NO deskew, NO denoising for clear PDFs
+
+        Recommended usage:
+        - Simple receipts: padding=0, clahe_clip=1.5
+        - Complex receipts: padding=40, clahe_clip=1.5
+        - Difficult/faded: padding=40, clahe_clip=2.0, binarize=True
+
+        Args:
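+            image: Input image. 3-channel input is converted with RGB2GRAY, so RGB order (e.g. from pdf2image) is expected; BGR arrays from OpenCV should be converted to RGB first. 2-D grayscale input is used as-is.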
+            binarize: Apply Otsu binarization (for faded receipts)
+            padding: White padding in pixels (40px recommended for edge protection)
+            clahe_clip: CLAHE clip limit (1.5 normal, 2.0 for difficult)
+
+        Returns:
+            Preprocessed grayscale image (a 0/255 Otsu-thresholded image when binarize=True)
+        """
+        # 1. Grayscale (assumes RGB channel order; BGR input would get its R/B weights swapped)
        if len(image.shape) == 3:
-            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+            gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        else:
            gray = image.copy()

-        # 2. Scale for optimal Tesseract (target ~2000px width for receipts)
-        height, width = gray.shape
-        if width < 2000:
-            scale = 2000 / width
-            gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
-        elif width > 3000:
-            scale = 3000 / width
-            gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
+        # 2. Add padding if specified (protects against left margin truncation)
+        if padding > 0:
+            gray = cv2.copyMakeBorder(
+                gray, padding, padding, padding, padding,
+                cv2.BORDER_CONSTANT, value=255
+            )

-        # 3. Deskew
-        gray = self._deskew(gray)
-
-        # 4. Strong contrast enhancement
-        clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
+        # 3. CLAHE contrast enhancement
+        clahe = cv2.createCLAHE(clipLimit=clahe_clip, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)

-        # 5. Denoise before binarization
-        denoised = cv2.fastNlMeansDenoising(enhanced, h=10, templateWindowSize=7, searchWindowSize=21)
+        # NO deskew, NO denoising - these DEGRADE quality on clear PDFs!

-        # 6. Otsu binarization (better than adaptive for clean PDFs)
-        _, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+        if not binarize:
+            return enhanced

-        # 7. Light morphological cleanup
-        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
-        cleaned = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
+        # Binarization only for faded receipts
+        _, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

-        return cleaned
+        # Ensure correct polarity: if the result is mostly black (mean < 127), invert it
+        if np.mean(binary) < 127:
+            binary = 255 - binary
+
+        return binary
+
+    def preprocess_for_tesseract_padded(self, image: np.ndarray) -> np.ndarray:
+        """
+        Tesseract preprocessing with optimal padding (40px).
+
+        Best for complex receipts where left margin gets truncated.
+        """
+        return self.preprocess_for_tesseract(image, padding=40)
+
+    def preprocess_for_tesseract_faded(self, image: np.ndarray) -> np.ndarray:
+        """
+        Tesseract preprocessing for FADED thermal receipts.
+        Uses binarization to recover faded text.
+        """
+        return self.preprocess_for_tesseract(image, binarize=True)

    def get_all_variants(self, image: np.ndarray) -> List[np.ndarray]:
        """