feat: Add OCR integration for automatic receipt data extraction

Implement Tesseract-based OCR to automatically extract vendor name, date, total amount, and VAT from uploaded receipt images/PDFs, reducing manual data entry and improving accuracy.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-12 11:48:29 +02:00
parent 5960154094
commit 41ae97180e
16 changed files with 2773 additions and 32 deletions
--- a/data-entry-app/backend/app/services/ocr_service.py
+++ b/data-entry-app/backend/app/services/ocr_service.py
@@ -0,0 +1,110 @@
+"""Main OCR service coordinating preprocessing, recognition, and extraction."""
+
+import os
+# Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import
+os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
+
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+from typing import Optional, Tuple
+
+from app.services.ocr_engine import OCREngine
+from app.services.ocr_extractor import ReceiptExtractor, ExtractionResult
+from app.services.image_preprocessor import ImagePreprocessor
+
+
+class OCRService:
+    """Service for OCR processing of receipt images."""
+
+    _executor = ThreadPoolExecutor(max_workers=2)
+
+    def __init__(self):
+        self.preprocessor = ImagePreprocessor()
+        self.ocr_engine = OCREngine()
+        self.extractor = ReceiptExtractor()
+
+    async def process_image(
+        self,
+        image_path: Path,
+        mime_type: str
+    ) -> Tuple[bool, str, Optional[ExtractionResult]]:
+        """
+        Process receipt image and extract structured data.
+
+        Args:
+            image_path: Path to the image file
+            mime_type: MIME type of the file
+
+        Returns:
+            Tuple of (success, message, extraction_result)
+        """
+        try:
+            loop = asyncio.get_event_loop()
+            result = await loop.run_in_executor(
+                self._executor,
+                self._process_sync,
+                image_path,
+                mime_type
+            )
+            return result
+        except Exception as e:
+            return False, f"OCR processing failed: {str(e)}", None
+
+    def _process_sync(
+        self,
+        image_path: Path,
+        mime_type: str
+    ) -> Tuple[bool, str, Optional[ExtractionResult]]:
+        """Synchronous processing (runs in thread pool)."""
+
+        # Handle PDF
+        if mime_type == 'application/pdf':
+            try:
+                images = self.preprocessor.pdf_to_images(image_path)
+                if not images:
+                    return False, "Failed to extract images from PDF", None
+                image = images[0]  # Process first page only
+            except RuntimeError as e:
+                return False, str(e), None
+        else:
+            try:
+                image = self.preprocessor.load_image(image_path)
+            except ValueError as e:
+                return False, str(e), None
+
+        # Preprocess image
+        processed = self.preprocessor.preprocess(image)
+
+        # Perform OCR
+        try:
+            ocr_result = self.ocr_engine.recognize(processed)
+        except RuntimeError as e:
+            return False, str(e), None
+
+        if not ocr_result.text:
+            return False, "No text detected in image", None
+
+        # Extract structured fields
+        extraction = self.extractor.extract(ocr_result.text)
+
+        # Build result message
+        fields_found = []
+        if extraction.amount:
+            fields_found.append("amount")
+        if extraction.receipt_date:
+            fields_found.append("date")
+        if extraction.partner_name:
+            fields_found.append("vendor")
+        if extraction.cui:
+            fields_found.append("CUI")
+        if extraction.receipt_number:
+            fields_found.append("number")
+
+        message = f"OCR processing successful. Found: {', '.join(fields_found) or 'no fields'}"
+
+        return True, message, extraction
+
+
+# Shared module-level instance, constructed once when this module is imported
+ocr_service = OCRService()