feat: Add OCR integration for automatic receipt data extraction

Implement Tesseract-based OCR to automatically extract vendor name, date, total amount, and VAT from uploaded receipt images/PDFs, reducing manual data entry and improving accuracy.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-12 11:48:29 +02:00
parent 5960154094
commit 41ae97180e
16 changed files with 2773 additions and 32 deletions
--- a/data-entry-app/backend/app/services/image_preprocessor.py
+++ b/data-entry-app/backend/app/services/image_preprocessor.py
@@ -0,0 +1,176 @@
+"""Image preprocessing for optimal OCR results."""
+
+from pathlib import Path
+from typing import List
+
+import numpy as np
+import cv2
+
+try:
+    import pdf2image
+    PDF_AVAILABLE = True
+except ImportError:
+    PDF_AVAILABLE = False
+
+
+class ImagePreprocessor:
+    """Preprocess receipt images for OCR.
+
+    Images are exchanged in OpenCV's BGR channel order: both loaders return
+    BGR arrays, which is what :meth:`preprocess` expects for colour input.
+    """
+
+    def load_image(self, path: Path) -> np.ndarray:
+        """Load an image file with OpenCV.
+
+        Args:
+            path: Location of the image file.
+
+        Returns:
+            The decoded image as returned by ``cv2.imread``.
+
+        Raises:
+            ValueError: If OpenCV cannot read or decode the file.
+        """
+        image = cv2.imread(str(path))
+        # cv2.imread signals failure by returning None instead of raising.
+        if image is None:
+            raise ValueError(f"Could not load image: {path}")
+        return image
+
+    def pdf_to_images(self, path: Path, dpi: int = 300) -> List[np.ndarray]:
+        """Render each page of a PDF to a BGR image array.
+
+        Args:
+            path: Location of the PDF file.
+            dpi: Rendering resolution passed to ``pdf2image``.
+
+        Returns:
+            One array per page, in page order.
+
+        Raises:
+            RuntimeError: If the optional ``pdf2image`` dependency is missing.
+        """
+        if not PDF_AVAILABLE:
+            raise RuntimeError("pdf2image not available. Install with: pip install pdf2image")
+        pages = pdf2image.convert_from_path(str(path), dpi=dpi)
+        # pdf2image yields PIL images in RGB order; convert to BGR so the
+        # result matches load_image() and the BGR2GRAY step in preprocess().
+        return [
+            cv2.cvtColor(np.array(page.convert("RGB")), cv2.COLOR_RGB2BGR)
+            for page in pages
+        ]
+
+    def preprocess(self, image: np.ndarray) -> np.ndarray:
+        """
+        Apply preprocessing pipeline for thermal receipt images.
+
+        Pipeline:
+        1. Convert to grayscale
+        2. Resize if too small (min 1000px width)
+        3. Deskew (straighten rotated text)
+        4. Denoise (Non-local means)
+        5. Adaptive thresholding (binarization)
+        6. Morphological close (connect broken chars)
+
+        Args:
+            image: Grayscale (2-D), BGR (3-channel) or BGRA (4-channel) image.
+
+        Returns:
+            The binarized single-channel image.
+
+        Raises:
+            ValueError: If the image is empty or has an unsupported shape.
+        """
+        # 1. Grayscale
+        gray = self._to_gray(image)
+
+        # 2. Resize if too small
+        width = gray.shape[1]
+        if width < 1000:
+            scale = 1000 / width
+            gray = cv2.resize(
+                gray, None, fx=scale, fy=scale,
+                interpolation=cv2.INTER_CUBIC
+            )
+
+        # 3. Deskew
+        gray = self._deskew(gray)
+
+        # 4. Denoise
+        denoised = cv2.fastNlMeansDenoising(
+            gray, h=10,
+            templateWindowSize=7,
+            searchWindowSize=21
+        )
+
+        # 5. Adaptive thresholding
+        binary = cv2.adaptiveThreshold(
+            denoised, 255,
+            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY,
+            blockSize=15, C=8
+        )
+
+        # 6. Morphological close
+        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
+        return cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
+
+    def _to_gray(self, image: np.ndarray) -> np.ndarray:
+        """Return a single-channel copy of a grayscale, BGR or BGRA image."""
+        if image is None or image.size == 0:
+            raise ValueError("Cannot preprocess an empty image")
+        if image.ndim == 2:
+            return image.copy()
+        channels = image.shape[2] if image.ndim == 3 else 0
+        if channels == 1:
+            return image[:, :, 0].copy()
+        if channels == 3:
+            return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        if channels == 4:
+            # Alpha is dropped; only the colour channels feed the OCR steps.
+            return cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
+        raise ValueError(f"Unsupported image shape: {image.shape}")
+
+    def _deskew(self, image: np.ndarray) -> np.ndarray:
+        """Correct image rotation/skew using Hough lines.
+
+        Takes the median angle of the detected near-horizontal segments and
+        rotates the image about its centre by that angle.  The input is
+        returned as-is when no usable segment is found or the median skew is
+        below 0.5 degrees.
+        """
+        edges = cv2.Canny(image, 50, 150, apertureSize=3)
+        lines = cv2.HoughLinesP(
+            edges, 1, np.pi / 180,
+            threshold=100, minLineLength=100, maxLineGap=10
+        )
+
+        if lines is None:
+            return image
+
+        # Each detected segment is (x1, y1, x2, y2).
+        segments = np.asarray(lines, dtype=np.float64).reshape(-1, 4)
+        dx = segments[:, 2] - segments[:, 0]
+        dy = segments[:, 3] - segments[:, 1]
+        angles = np.degrees(np.arctan2(dy, dx))
+        # A segment has no direction: fold angles into [-90, 90) so one
+        # reported right-to-left (near +/-180) still counts as horizontal.
+        angles = (angles + 90.0) % 180.0 - 90.0
+        angles = angles[np.abs(angles) < 45]
+
+        if angles.size == 0:
+            return image
+
+        median_angle = float(np.median(angles))
+        if abs(median_angle) < 0.5:
+            return image
+
+        h, w = image.shape[:2]
+        center = (w // 2, h // 2)
+        M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
+        return cv2.warpAffine(
+            image, M, (w, h),
+            flags=cv2.INTER_CUBIC,
+            borderMode=cv2.BORDER_REPLICATE
+        )