feat: Add OCR integration for automatic receipt data extraction

Implement Tesseract-based OCR to automatically extract vendor name, date, total amount, and VAT from uploaded receipt images/PDFs, reducing manual data entry and improving accuracy.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-12 11:48:29 +02:00
parent 5960154094
commit 41ae97180e
16 changed files with 2773 additions and 32 deletions
--- a/data-entry-app/backend/app/services/ocr_extractor.py
+++ b/data-entry-app/backend/app/services/ocr_extractor.py
@@ -0,0 +1,231 @@
+"""Extract structured fields from OCR text (Romanian receipts)."""
+
+import re
+from datetime import date, datetime
+from decimal import Decimal, InvalidOperation
+from typing import Optional, Tuple
+from dataclasses import dataclass, field
+
+
+@dataclass
+class ExtractionResult:
+    """Structured extraction result from receipt."""
+    receipt_type: str = 'bon_fiscal'
+    receipt_number: Optional[str] = None
+    receipt_series: Optional[str] = None
+    receipt_date: Optional[date] = None
+    amount: Optional[Decimal] = None
+    partner_name: Optional[str] = None
+    cui: Optional[str] = None
+    description: Optional[str] = None
+
+    confidence_amount: float = 0.0
+    confidence_date: float = 0.0
+    confidence_vendor: float = 0.0
+    raw_text: str = ""
+
+    @property
+    def overall_confidence(self) -> float:
+        """Calculate weighted overall confidence score."""
+        weights = {'amount': 0.4, 'date': 0.3, 'vendor': 0.3}
+        return round(
+            self.confidence_amount * weights['amount'] +
+            self.confidence_date * weights['date'] +
+            self.confidence_vendor * weights['vendor'],
+            2
+        )
+
+
+class ReceiptExtractor:
+    """Extract receipt fields using pattern matching for Romanian receipts."""
+
+    # Total amount patterns (most specific first)
+    TOTAL_PATTERNS = [
+        (r'TOTAL\s*:?\s*([\d\s.,]+)\s*(?:RON|LEI)?', 0.95),
+        (r'TOTAL\s+(?:RON|LEI)\s*([\d\s.,]+)', 0.95),
+        (r'DE\s+PLATA\s*:?\s*([\d\s.,]+)', 0.90),
+        (r'SUMA\s*:?\s*([\d\s.,]+)', 0.85),
+        (r'PLATA\s+CARD\s*:?\s*([\d\s.,]+)', 0.85),
+        (r'NUMERAR\s*:?\s*([\d\s.,]+)', 0.80),
+    ]
+
+    # Date patterns
+    DATE_PATTERNS = [
+        (r'DATA\s*:?\s*(\d{2}[./]\d{2}[./]\d{4})', 0.95),
+        (r'(\d{2}[./]\d{2}[./]\d{4})\s+\d{2}:\d{2}', 0.90),
+        (r'(\d{2}[./]\d{2}[./]\d{4})', 0.80),
+        (r'(\d{4}[./]\d{2}[./]\d{2})', 0.75),  # YYYY.MM.DD format
+    ]
+
+    # Receipt number patterns
+    NUMBER_PATTERNS = [
+        (r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95),
+        (r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95),
+        (r'CHITANTA\s+NR\.?\s*:?\s*(\d+)', 0.95),
+        (r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90),
+        (r'NR\.?\s*:?\s*(\d{4,})', 0.70),
+    ]
+
+    # CUI (fiscal code) patterns
+    CUI_PATTERNS = [
+        (r'C\.?U\.?I\.?\s*:?\s*(?:RO)?(\d{6,10})', 0.95),
+        (r'C\.?I\.?F\.?\s*:?\s*(?:RO)?(\d{6,10})', 0.95),
+        (r'COD\s+FISCAL\s*:?\s*(?:RO)?(\d{6,10})', 0.90),
+        (r'(?:RO)?(\d{6,10})\s*-?\s*(?:J|CUI)', 0.80),
+    ]
+
+    # Series patterns
+    SERIES_PATTERNS = [
+        (r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90),
+        (r'([A-Z]{2,4})\s+NR\.?\s*\d+', 0.80),
+    ]
+
+    def extract(self, text: str) -> ExtractionResult:
+        """Extract all fields from OCR text."""
+        result = ExtractionResult()
+        result.raw_text = text
+        text_upper = text.upper()
+
+        # Extract fields
+        result.amount, result.confidence_amount = self._extract_amount(text_upper)
+        result.receipt_date, result.confidence_date = self._extract_date(text_upper)
+        result.receipt_number, _ = self._extract_number(text_upper)
+        result.receipt_series, _ = self._extract_series(text_upper)
+        result.partner_name, result.confidence_vendor = self._extract_vendor(text)
+        result.cui, _ = self._extract_cui(text_upper)
+
+        # Detect receipt type
+        result.receipt_type = self._detect_receipt_type(text_upper)
+
+        return result
+
+    def _extract_amount(self, text: str) -> Tuple[Optional[Decimal], float]:
+        """Extract total amount from text."""
+        for pattern, confidence in self.TOTAL_PATTERNS:
+            match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
+            if match:
+                try:
+                    amount_str = re.sub(r'[^\d.,]', '', match.group(1))
+                    # Handle Romanian number format (1.234,56)
+                    amount_str = self._normalize_number(amount_str)
+                    amount = Decimal(amount_str)
+                    if amount > 0:
+                        return amount, confidence
+                except (InvalidOperation, ValueError):
+                    continue
+        return None, 0.0
+
+    def _normalize_number(self, num_str: str) -> str:
+        """Normalize Romanian number format to standard decimal."""
+        # Remove spaces
+        num_str = num_str.replace(' ', '')
+
+        # Handle comma as decimal separator
+        if ',' in num_str and '.' in num_str:
+            # Romanian format: 1.234,56
+            num_str = num_str.replace('.', '').replace(',', '.')
+        elif ',' in num_str:
+            # Could be 1,50 or 1,234
+            parts = num_str.split(',')
+            if len(parts) == 2 and len(parts[1]) <= 2:
+                # Decimal comma: 1,50
+                num_str = num_str.replace(',', '.')
+            else:
+                # Thousands comma: 1,234
+                num_str = num_str.replace(',', '')
+        elif '.' in num_str:
+            parts = num_str.split('.')
+            if len(parts) > 2:
+                # Multiple dots: 1.234.567 -> 1234567
+                num_str = ''.join(parts[:-1]) + '.' + parts[-1]
+
+        return num_str
+
+    def _extract_date(self, text: str) -> Tuple[Optional[date], float]:
+        """Extract receipt date from text."""
+        for pattern, confidence in self.DATE_PATTERNS:
+            match = re.search(pattern, text)
+            if match:
+                try:
+                    date_str = match.group(1).replace('/', '.')
+
+                    # Try DD.MM.YYYY format first
+                    try:
+                        parsed = datetime.strptime(date_str, '%d.%m.%Y').date()
+                    except ValueError:
+                        # Try YYYY.MM.DD format
+                        parsed = datetime.strptime(date_str, '%Y.%m.%d').date()
+
+                    # Validate date range
+                    today = date.today()
+                    if parsed <= today and parsed.year >= 2020:
+                        return parsed, confidence
+                except ValueError:
+                    continue
+        return None, 0.0
+
+    def _extract_number(self, text: str) -> Tuple[Optional[str], float]:
+        """Extract receipt number from text."""
+        for pattern, confidence in self.NUMBER_PATTERNS:
+            match = re.search(pattern, text, re.IGNORECASE)
+            if match:
+                return match.group(1), confidence
+        return None, 0.0
+
+    def _extract_series(self, text: str) -> Tuple[Optional[str], float]:
+        """Extract receipt series from text."""
+        for pattern, confidence in self.SERIES_PATTERNS:
+            match = re.search(pattern, text, re.IGNORECASE)
+            if match:
+                return match.group(1).upper(), confidence
+        return None, 0.0
+
+    def _extract_vendor(self, text: str) -> Tuple[Optional[str], float]:
+        """Extract vendor/partner name from text."""
+        lines = text.split('\n')
+        skip_keywords = [
+            'BON', 'FISCAL', 'TOTAL', 'DATA', 'NR', 'ORA',
+            'SUBTOTAL', 'TVA', 'PLATA', 'CARD', 'NUMERAR',
+            'RON', 'LEI', 'CHITANTA', 'REST'
+        ]
+
+        for i, line in enumerate(lines[:7]):  # Check first 7 lines
+            line = line.strip()
+
+            # Skip empty lines
+            if not line:
+                continue
+
+            # Skip lines that are just numbers
+            if re.match(r'^[\d.,\s]+$', line):
+                continue
+
+            # Skip lines with keywords
+            if any(kw in line.upper() for kw in skip_keywords):
+                continue
+
+            # Clean the line
+            vendor = re.sub(r'[^\w\s.,&-]', '', line).strip()
+
+            if len(vendor) >= 3:
+                # Confidence decreases for lines further down
+                confidence = max(0.3, 0.8 - (i * 0.1))
+                return vendor, confidence
+
+        return None, 0.0
+
+    def _extract_cui(self, text: str) -> Tuple[Optional[str], float]:
+        """Extract CUI (fiscal identification code) from text."""
+        for pattern, confidence in self.CUI_PATTERNS:
+            match = re.search(pattern, text, re.IGNORECASE)
+            if match:
+                cui = match.group(1)
+                if 6 <= len(cui) <= 10:
+                    return cui, confidence
+        return None, 0.0
+
+    def _detect_receipt_type(self, text: str) -> str:
+        """Detect receipt type from text content."""
+        if 'CHITANTA' in text or 'CHITANȚĂ' in text:
+            return 'chitanta'
+        return 'bon_fiscal'