feat: Improve OCR adaptive pipeline with early exit and better pattern matching

- Add adaptive 3-step OCR pipeline with early exit when all 5 fields found
- Add pattern for "C. I. F." with spaces (OCR artifact from PaddleOCR)
- Add pattern for YYYY. MM. DD date format with spaces (OMV/Petrom receipts)
- Add pattern for "OTAL TAXE" with T cut off and reversed amount position
- Make TVA rate pattern more flexible (code letter optional, handle "-21%")
- Replace logger.info with print(flush=True) for better debugging visibility
- Improve OCRPreview.vue to show extraction progress and raw OCR text

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-13 01:54:52 +02:00
parent 6c3dd89f6d
commit 9f06482681
9 changed files with 952 additions and 116 deletions
--- a/data-entry-app/backend/app/services/image_preprocessor.py
+++ b/data-entry-app/backend/app/services/image_preprocessor.py
@@ -23,37 +23,57 @@ class ImagePreprocessor:
            raise ValueError(f"Could not load image: {path}")
        return image

-    def pdf_to_images(self, path: Path, dpi: int = 400) -> List[np.ndarray]:
+    def pdf_to_images(self, path: Path, dpi: int = 300) -> List[np.ndarray]:
        """
-        Convert PDF to images with high DPI for better OCR.
+        Convert PDF to images.

        Args:
            path: Path to PDF file
-            dpi: Resolution (400 recommended for receipts, higher = better quality but slower)
+            dpi: Resolution (300 = fast & good quality, 400 = better but slower)
        """
        if not PDF_AVAILABLE:
            raise RuntimeError("pdf2image not available. Install with: pip install pdf2image")
-        # Use 400 DPI for better text recognition on thermal receipts
        images = pdf2image.convert_from_path(str(path), dpi=dpi)
        return [np.array(img) for img in images]

    def preprocess(self, image: np.ndarray, high_quality: bool = True) -> np.ndarray:
        """
-        Apply preprocessing pipeline for thermal receipt images.
+        Apply LIGHT preprocessing - better for clear PDFs.
+        Heavy binarization can destroy text on clear images.
+        """
+        return self.preprocess_light(image)

-        Pipeline:
-        1. Convert to grayscale
-        2. Resize if too small (min 1500px width for high quality)
-        3. Deskew (straighten rotated text)
-        4. Contrast enhancement (CLAHE)
-        5. Denoise (Non-local means)
-        6. Sharpening (for clearer text edges)
-        7. Adaptive thresholding (binarization)
-        8. Morphological operations (connect broken chars)
+    def preprocess_light(self, image: np.ndarray) -> np.ndarray:
+        """
+        Light preprocessing for CLEAR images (PDFs, good scans).
+        Preserves original quality, only enhances contrast.
+        """
+        # 1. Grayscale
+        if len(image.shape) == 3:
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        else:
+            gray = image.copy()

-        Args:
-            image: Input image (BGR or grayscale)
-            high_quality: If True, apply more aggressive preprocessing
+        # 2. Resize if too small
+        height, width = gray.shape
+        if width < 1500:
+            scale = 1500 / width
+            gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
+
+        # 3. Deskew
+        gray = self._deskew(gray)
+
+        # 4. Light contrast enhancement only
+        clahe = cv2.createCLAHE(clipLimit=1.5, tileGridSize=(8, 8))
+        enhanced = clahe.apply(gray)
+
+        # NO binarization, NO morphological ops - preserve original quality
+        return enhanced
+
+    def preprocess_heavy(self, image: np.ndarray) -> np.ndarray:
+        """
+        Heavy preprocessing for FADED thermal receipts.
+        Aggressive binarization to recover faded text.
        """
        # 1. Grayscale
        if len(image.shape) == 3:
@@ -63,57 +83,48 @@ class ImagePreprocessor:

        # 2. Resize if too small (larger = better OCR)
        height, width = gray.shape
-        min_width = 1500 if high_quality else 1000
-        if width < min_width:
-            scale = min_width / width
-            gray = cv2.resize(
-                gray, None, fx=scale, fy=scale,
-                interpolation=cv2.INTER_CUBIC
-            )
+        if width < 1500:
+            scale = 1500 / width
+            gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)

        # 3. Deskew
        gray = self._deskew(gray)

-        # 4. Contrast enhancement with CLAHE (Contrast Limited Adaptive Histogram Equalization)
+        # 4. Contrast enhancement with CLAHE
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)

-        # 5. Denoise (slightly less aggressive to preserve text details)
-        denoised = cv2.fastNlMeansDenoising(
-            enhanced, h=8,  # Lower h = preserve more details
-            templateWindowSize=7,
-            searchWindowSize=21
-        )
+        # 5. Denoise
+        denoised = cv2.fastNlMeansDenoising(enhanced, h=8, templateWindowSize=7, searchWindowSize=21)

-        # 6. Sharpening to enhance text edges
-        if high_quality:
-            # Unsharp mask for better text clarity
-            gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0)
-            sharpened = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0)
-        else:
-            sharpened = denoised
+        # 6. Sharpening
+        gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0)
+        sharpened = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0)

-        # 7. Adaptive thresholding with optimized parameters
+        # 7. Adaptive thresholding (binarization)
        binary = cv2.adaptiveThreshold(
            sharpened, 255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
-            blockSize=11,  # Smaller block = better for small text
-            C=5  # Lower C = darker result, better for faded receipts
+            blockSize=11, C=5
        )

        # 8. Morphological operations
-        # Close small gaps in characters
        kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
        result = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel_close)

-        # Optional: Remove small noise spots
-        if high_quality:
-            kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
-            result = cv2.morphologyEx(result, cv2.MORPH_OPEN, kernel_open)
-
        return result

+    def get_all_variants(self, image: np.ndarray) -> List[np.ndarray]:
+        """
+        Generate 2 preprocessing variants for OCR (fast mode).
+        Returns: [light_processed, heavy_processed]
+        """
+        return [
+            self.preprocess_light(image),
+            self.preprocess_heavy(image),
+        ]
+
    def _deskew(self, image: np.ndarray) -> np.ndarray:
        """Correct image rotation/skew using Hough lines."""
        edges = cv2.Canny(image, 50, 150, apertureSize=3)