fix(ocr): Fix store profile extraction patterns and module loading

Major fixes to OCR store profiles for Romanian receipt extraction:

- Fix ProfileRegistry module path resolution (was loading 0 profiles)
- Add multiline TVA extraction for Brick, Electrobering, Gama Ink
- Add "CARTE CREDIT" payment detection for OMV/SOCAR gas stations
- Handle OCR artifacts: TVA→TUA, "-"→"4", I→L in CUI markers
- Add client CUI patterns for Brick receipts
- Add profile selection logging to ocr_extractor.py
- Create test script for all 29 PDFs (test_all_profiles.py)

Test results: 13/29 passing (improved from 9/29)
Remaining failures are primarily OCR quality issues.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-07 09:40:58 +00:00
parent 099556213d
commit 28f259cd05
13 changed files with 1531 additions and 257 deletions
--- a/backend/modules/data_entry/services/ocr/profiles/electrobering.py
+++ b/backend/modules/data_entry/services/ocr/profiles/electrobering.py
@@ -2,11 +2,16 @@
 ELECTROBERING S.R.L. store profile for OCR extraction.

 Electronics and home supplies store.
+
+Receipt structure:
+- TVA format: "TOTAL TVA A - - 19%" with amount on next line
+- "TOTAL TVA BON" with total TVA amount
+- Client CUI: "CIF CLIENT: XXXXXXX"
 """

 import re
 from decimal import Decimal, InvalidOperation
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Tuple, Optional

 from .base import BaseStoreProfile
 from . import ProfileRegistry
@@ -15,11 +20,11 @@ from . import ProfileRegistry
@ProfileRegistry.register
 class ElectroberingProfile(BaseStoreProfile):
    """
-    ELECTROBERING S.R.L. - standard TVA profile.
+    ELECTROBERING S.R.L. - standard TVA profile with multiline support.

    Key characteristics:
-    - Standard TVA format (single rate, any percentage)
-    - Electronics and home supplies
+    - TVA format with rate on one line, amount on next
+    - Double-dash separators common (OCR artifact)
    - May have client CUI for B2B purchases
    - CARD payment typical
    """
@@ -28,19 +33,28 @@ class ElectroberingProfile(BaseStoreProfile):
    NAME_PATTERNS = ["ELECTROBERING", "ELECTR0BERING", "ELECTROBERING SRL"]
    STORE_NAME = "ELECTROBERING S.R.L."

-    # Standard TVA patterns (flexible - accepts any rate)
+    # ELECTROBERING TVA patterns (handles double-dash and multiline)
    TVA_PATTERNS = [
-        # "TVA A: XX% = YY,YY" or "TVA-A XX% YY,YY"
-        r'TVA\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
-        # "A - XX,XX% = YY,YY"
-        r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,]+)',
-        # "TVA XX% YY,YY" (simple format without code)
-        r'TVA\s+(\d{1,2})\s*%\s*([\d.,]+)',
+        # "TOTAL TVA A - - 19%" with amount on next line
+        r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%',
+        # "TOTAL TVA A 19%" without separator
+        r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%',
+        # Standard: "TVA A: XX% = YY,YY"
+        r'TVA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
    ]

+    # TOTAL TVA BON pattern (fallback)
+    TOTAL_TVA_BON_PATTERN = r'TOTAL\s+TVA\s+BON'
+
    def extract_tva_entries(self, text: str) -> List[dict]:
        """
-        Extract TVA entries from receipt text.
+        Extract ELECTROBERING-specific TVA entries.
+
+        ELECTROBERING receipts show TVA in multi-line format:
+        "TOTAL TVA A - - 19%"
+        "5.59"
+        "TOTAL TVA BON"
+        "5.59"

        Args:
            text: Raw OCR text from receipt
@@ -49,45 +63,61 @@ class ElectroberingProfile(BaseStoreProfile):
            List of TVA entries with code, percent, and amount
        """
        entries = []
-        seen = set()
+        text_upper = text.upper()
+        lines = text_upper.split('\n')

-        # Try coded patterns first
-        for pattern in self.TVA_PATTERNS[:2]:
-            for match in re.finditer(pattern, text, re.IGNORECASE):
-                try:
-                    code = match.group(1).upper()
-                    percent = int(match.group(2))
-                    amount = self._parse_decimal(match.group(3))
-
-                    if amount and amount > 0:
-                        entry_key = (code, percent)
-                        if entry_key not in seen:
-                            entries.append({
-                                'code': code,
-                                'percent': percent,
-                                'amount': amount
-                            })
-                            seen.add(entry_key)
-                except (ValueError, InvalidOperation, IndexError):
-                    continue
-
-        # Fallback to simple format
-        if not entries:
-            simple_pattern = self.TVA_PATTERNS[2]
-            for match in re.finditer(simple_pattern, text, re.IGNORECASE):
-                try:
-                    percent = int(match.group(1))
-                    amount = self._parse_decimal(match.group(2))
+        # Find TVA rate line and get amount from next line
+        for i, line in enumerate(lines):
+            # Match "TOTAL TVA A - - 19%" or "TOTAL TVA A 19%"
+            match = re.search(r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%', line)
+            if match:
+                code = match.group(1)
+                percent = int(match.group(2))

+                # Amount should be on next line
+                if i + 1 < len(lines):
+                    amount_str = lines[i + 1].strip()
+                    amount = self._parse_decimal(amount_str)
                    if amount and amount > 0:
                        entries.append({
-                            'code': 'A',
+                            'code': code,
                            'percent': percent,
                            'amount': amount
                        })
-                        break
+                        return entries
+
+        # Fallback: Find TOTAL TVA BON and get amount
+        for i, line in enumerate(lines):
+            if re.search(self.TOTAL_TVA_BON_PATTERN, line):
+                # Amount should be on next line
+                if i + 1 < len(lines):
+                    amount_str = lines[i + 1].strip()
+                    amount = self._parse_decimal(amount_str)
+                    if amount and amount > 0:
+                        entries.append({
+                            'code': 'A',
+                            'percent': 19,  # Default Romanian TVA rate
+                            'amount': amount
+                        })
+                        return entries
+
+        # Last fallback: inline format "TVA A: XX% = YY,YY"
+        for pattern in [self.TVA_PATTERNS[2]]:
+            match = re.search(pattern, text_upper, re.IGNORECASE)
+            if match and len(match.groups()) >= 3:
+                try:
+                    code = match.group(1)
+                    percent = int(match.group(2))
+                    amount = self._parse_decimal(match.group(3))
+                    if amount and amount > 0:
+                        entries.append({
+                            'code': code,
+                            'percent': percent,
+                            'amount': amount
+                        })
+                        return entries
                except (ValueError, InvalidOperation):
-                    continue
+                    pass

        return entries

@@ -99,4 +129,5 @@ class ElectroberingProfile(BaseStoreProfile):
            "has_client_cui": True,  # May have client CUI for B2B
            "has_efactura": False,
            "is_non_vat_payer": False,
+            "tva_on_separate_line": True,
        }