fix(ocr): Fix store profile extraction patterns and module loading

Major fixes to OCR store profiles for Romanian receipt extraction: - Fix ProfileRegistry module path resolution (was loading 0 profiles) - Add multiline TVA extraction for Brick, Electrobering, Gama Ink - Add "CARTE CREDIT" payment detection for OMV/SOCAR gas stations - Handle OCR artifacts: TVA→TUA, "-"→"4", I→L in CUI markers - Add client CUI patterns for Brick receipts - Add profile selection logging to ocr_extractor.py - Create test script for all 29 PDFs (test_all_profiles.py) Test results: 13/29 passing (improved from 9/29) Remaining failures are primarily OCR quality issues. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-07 09:40:58 +00:00
parent 099556213d
commit 28f259cd05
13 changed files with 1531 additions and 257 deletions
--- a/backend/modules/data_entry/services/ocr/profiles/brick.py
+++ b/backend/modules/data_entry/services/ocr/profiles/brick.py
@@ -2,11 +2,16 @@
 BRICK (Five-Holding) store profile for OCR extraction.

 Five-Holding S.A. operates BRICK stores with standard receipt format.
+
+Receipt structure:
+- TVA format: "TOTAL TVA A - 21%" with amount on next line
+- Payment: "CARD" on separate line (amount from TOTAL LEI)
+- Client CUI: "CLIENT C.U.L./C.IF. :ROXXXXXXX" (OCR reads I as L)
 """

 import re
 from decimal import Decimal, InvalidOperation
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Tuple, Optional

 from .base import BaseStoreProfile
 from . import ProfileRegistry
@@ -15,32 +20,60 @@ from . import ProfileRegistry
@ProfileRegistry.register
 class BrickProfile(BaseStoreProfile):
    """
-    FIVE-HOLDING S.A. (BRICK) - standard TVA format.
+    FIVE-HOLDING S.A. (BRICK) - standard TVA format with client CUI.

    Key characteristics:
-    - Standard TVA format
-    - Single TVA rate typically
-    - No client CUI on receipts
+    - Standard TVA format with rate code (A, B, etc.)
+    - TVA amount on separate line after percentage
+    - CARD payment indicated by keyword (amount derived from total)
+    - Client CUI in format: CLIENT C.U.L./C.IF.
+    - OCR often reads "I" as "L" in CUI markers
    """

    CUI_LIST = ["10562600"]
-    NAME_PATTERNS = ["BRICK", "FIVE-HOLDING", "FIVE HOLDING", "BR1CK"]  # OCR variants
+    NAME_PATTERNS = ["BRICK", "FIVE-HOLDING", "FIVE HOLDING", "BR1CK", "F1VE"]
    STORE_NAME = "FIVE-HOLDING S.A."

-    # Standard TVA patterns (flexible - accepts any rate)
+    # BRICK TVA patterns (amount often on separate line)
    TVA_PATTERNS = [
-        # "TVA A: XX% = YY,YY" or "TVA-A XX% YY,YY"
-        r'TVA\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
-        # "A - XX,XX% = YY,YY"
-        r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,]+)',
-        # Simple: "TVA XX% YY,YY"
-        r'TVA\s+(\d{1,2})\s*%\s*([\d.,]+)',
+        # "TOTAL TVA A - 21%" with amount on next line (captured as multiline)
+        r'TOTAL\s+TVA\s*([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*\n?\s*([\d.,]+)',
+        # "OTAL IVAA 21%" - OCR error variant
+        r'O?TAL\s+[IT]VA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%\s*\n?\s*([\d.,]+)',
+        # "TOTAL TVA A 21%" without separator
+        r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%\s*\n?\s*([\d.,]+)',
+        # "TVA A: XX% = YY,YY" - inline format
+        r'TVA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
+    ]
+
+    # TOTAL TVA BON pattern (fallback)
+    TOTAL_TVA_BON_PATTERN = r'TOTAL\s+TVA\s*BON\s*\n?\s*([\d.,]+)'
+
+    # Client CUI patterns - specific to Brick (handles OCR L/I confusion)
+    CLIENT_CUI_PATTERNS = [
+        # "CLIENT C.U.L./C.IF. :R01879855" - exact OCR format (I->L)
+        (r'CLIENT\s+C\.?U\.?[LI1]\.?\s*/?\s*C\.?[LI1]\.?F\.?\s*:?\s*(R?O?\d{6,10})', 0.99),
+        # "CLIENT C.U.I./C.I.F.: RO1879855" - standard format
+        (r'CLIENT\s+C\.?U\.?I\.?\s*/?\s*C\.?I\.?F\.?\s*:?\s*(R?O?\s*\d{6,10})', 0.98),
+        # "CIF CLIENT: XXXXXXX" - alternative format
+        (r'CIF\s+CLIENT\s*:?\s*(R?O?\s*\d{6,10})', 0.95),
+    ]
+
+    # Client markers for Brick
+    CLIENT_MARKERS = [
+        r'CLIENT\s+C\.?U\.?[LI1]',
+        r'CLIENT\s+C\.?I\.?F',
+        r'CIF\s+CLIENT',
    ]

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Extract BRICK-specific TVA entries.

+        BRICK receipts show TVA in multi-line format:
+        "TOTAL TVA A - 21%"
+        "32.31"
+
        Args:
            text: Raw OCR text from receipt

@@ -48,11 +81,12 @@ class BrickProfile(BaseStoreProfile):
            List of TVA entries with code, percent, and amount
        """
        entries = []
+        text_upper = text.upper()
        seen = set()

-        # Try coded patterns first
-        for pattern in self.TVA_PATTERNS[:2]:
-            for match in re.finditer(pattern, text, re.IGNORECASE):
+        # Try coded patterns first (with multiline support)
+        for pattern in self.TVA_PATTERNS:
+            for match in re.finditer(pattern, text_upper, re.IGNORECASE | re.MULTILINE):
                try:
                    code = match.group(1).upper()
                    percent = int(match.group(2))
@@ -67,35 +101,182 @@ class BrickProfile(BaseStoreProfile):
                                'amount': amount
                            })
                            seen.add(entry_key)
+                            return entries  # Brick usually has single TVA rate
                except (ValueError, InvalidOperation, IndexError):
                    continue

-        # Fallback to simple format
-        if not entries:
-            simple_pattern = self.TVA_PATTERNS[2]
-            for match in re.finditer(simple_pattern, text, re.IGNORECASE):
-                try:
-                    percent = int(match.group(1))
-                    amount = self._parse_decimal(match.group(2))
-
-                    if amount and amount > 0:
-                        entries.append({
-                            'code': 'A',
-                            'percent': percent,
-                            'amount': amount
-                        })
-                        break
-                except (ValueError, InvalidOperation):
-                    continue
+        # Fallback: "TOTAL TVA BON" with amount on next line
+        match = re.search(self.TOTAL_TVA_BON_PATTERN, text_upper, re.IGNORECASE | re.MULTILINE)
+        if match:
+            try:
+                amount = self._parse_decimal(match.group(1))
+                if amount and amount > 0:
+                    entries.append({
+                        'code': 'A',
+                        'percent': 19,  # Default rate
+                        'amount': amount
+                    })
+            except (ValueError, InvalidOperation):
+                pass

        return entries

+    def extract_payment_methods(self, text: str) -> List[dict]:
+        """
+        Extract BRICK-specific payment methods.
+
+        BRICK receipts show payment method on separate line:
+        "TOTAL LEI"
+        "21.18"
+        "CARD"
+        "0.00"  <- REST (change)
+
+        When CARD appears with REST=0, full amount was paid by card.
+
+        Args:
+            text: Raw OCR text from receipt
+
+        Returns:
+            List of payment methods with method, amount, and confidence
+        """
+        payments = []
+        text_upper = text.upper()
+        lines = text_upper.split('\n')
+
+        # Find TOTAL LEI amount
+        total_amount = None
+        for i, line in enumerate(lines):
+            if 'TOTAL' in line and 'LEI' in line:
+                # Amount is likely on next line
+                if i + 1 < len(lines):
+                    amount_str = lines[i + 1].strip()
+                    total_amount = self._parse_decimal(amount_str)
+                    break
+            # Also try inline: "TOTAL LEI 21.18"
+            match = re.search(r'TOTAL\s+LEI\s*([\d.,]+)', line)
+            if match:
+                total_amount = self._parse_decimal(match.group(1))
+                break
+
+        if not total_amount:
+            # Fallback to generic total extraction
+            total_amount, _ = self.extract_total(text)
+
+        if not total_amount:
+            return []
+
+        # Check for CARD or NUMERAR keywords
+        has_card = any('CARD' in line for line in lines)
+        has_numerar = any('NUMERAR' in line for line in lines)
+
+        # Find REST amount to determine actual card amount
+        rest_amount = Decimal('0')
+        for i, line in enumerate(lines):
+            if 'REST' in line:
+                # REST amount is on next line or same line
+                match = re.search(r'REST\s*([\d.,]+)', line)
+                if match:
+                    rest_amount = self._parse_decimal(match.group(1)) or Decimal('0')
+                elif i + 1 < len(lines):
+                    rest_amount = self._parse_decimal(lines[i + 1].strip()) or Decimal('0')
+                break
+
+        if has_card:
+            # Card payment = total - rest
+            card_amount = total_amount - rest_amount
+            if card_amount > 0:
+                payments.append({
+                    'method': 'CARD',
+                    'amount': card_amount,
+                    'confidence': 0.95
+                })
+
+        if has_numerar:
+            # If both card and cash, need more complex logic
+            # For now, assume numerar is the rest if card is present
+            if not has_card:
+                payments.append({
+                    'method': 'NUMERAR',
+                    'amount': total_amount,
+                    'confidence': 0.95
+                })
+            elif rest_amount > 0:
+                payments.append({
+                    'method': 'NUMERAR',
+                    'amount': rest_amount,
+                    'confidence': 0.90
+                })
+
+        # If no explicit payment keyword but REST=0, assume card
+        if not payments and rest_amount == 0:
+            # Check for any payment indicators
+            for line in lines:
+                if 'CARD' in line or 'DEBIT' in line or 'CREDIT' in line:
+                    payments.append({
+                        'method': 'CARD',
+                        'amount': total_amount,
+                        'confidence': 0.90
+                    })
+                    break
+
+        # FALLBACK: If still no payment found but we have total amount,
+        # assume CARD for business receipts (Brick stores usually accept card)
+        # This handles cases where OCR fails to capture payment method
+        if not payments and total_amount and total_amount > 0:
+            # Check if this is a fiscal receipt (BON FISCAL)
+            is_fiscal = 'BON FISCAL' in text_upper or 'FISCAL' in text_upper
+            if is_fiscal:
+                payments.append({
+                    'method': 'CARD',
+                    'amount': total_amount,
+                    'confidence': 0.70  # Lower confidence for inferred payment
+                })
+
+        return payments
+
+    def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
+        """
+        Extract client CUI from BRICK receipt.
+
+        BRICK uses format: "CLIENT C.U.L./C.IF. :R01879855"
+        Note: OCR often reads "I" as "L" in these markers.
+
+        Args:
+            text: Raw OCR text from receipt
+
+        Returns:
+            Tuple of (cui, confidence) or (None, 0.0)
+        """
+        text_upper = text.upper()
+
+        # Check for Brick client markers
+        has_client = any(
+            re.search(marker, text_upper, re.IGNORECASE)
+            for marker in self.CLIENT_MARKERS
+        )
+
+        if not has_client:
+            return (None, 0.0)
+
+        # Try Brick-specific patterns
+        for pattern, confidence in self.CLIENT_CUI_PATTERNS:
+            match = re.search(pattern, text_upper, re.IGNORECASE)
+            if match:
+                cui = match.group(1)
+                # Clean up: remove RO prefix, spaces
+                cui_digits = re.sub(r'[^0-9]', '', cui)
+                if 6 <= len(cui_digits) <= 10:
+                    return (cui_digits, confidence)
+
+        return (None, 0.0)
+
    def get_validation_hints(self) -> Dict[str, Any]:
        """Return BRICK-specific validation hints."""
        return {
            "has_multi_rate_tva": False,
-            "card_equals_total": False,
-            "has_client_cui": False,
+            "card_equals_total": True,  # Card amount equals total when REST=0
+            "has_client_cui": True,  # Brick receipts CAN have client CUI
            "has_efactura": False,
            "is_non_vat_payer": False,
+            "tva_on_separate_line": True,  # TVA amount on next line
        }