feat(ocr): Add modular store profiles with hot-reload support

## Store Profiles System

- Add ProfileRegistry for CUI-based profile lookup
- Add BaseStoreProfile with generic extraction patterns
- Implement hot-reload via POST /api/data-entry/ocr/profiles/reload

## 12 Store Profiles

- LIDL: Multi-rate TVA (A, B, C, D codes)
- OMV, SOCAR: B2B with client CUI, YYYY.MM.DD dates
- BRICK, DEDEMAN: Standard TVA, e-factura support
- KINETERRA, BEST PRINT: Non-VAT payers (returns [])
- STEPOUT MARKET: TVA 5% (books/reduced rate)
- UNLIMITED KEYS: NUMERAR payment detection
- GAMA INK, ELECTROBERING, PICTUS VELUM: Standard TVA

## Flexible TVA Patterns

- All patterns use (\d{1,2})% to accept any rate
- Supports historical (19%, 9%, 5%) and current (21%, 11%)

## Payment Methods Fix

- Fixed base.py to support multiple payments of same type
- Changed deduplication from method-only to (method, amount) tuple
- Returns separate entries for split payments

## Tools

- Add generate_store_profile.py for automatic profile generation
- Analyzes PDFs via OCR API and detects patterns

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-06 23:07:07 +00:00
parent 67b0082df0
commit 099556213d
25 changed files with 3707 additions and 114 deletions
--- a/backend/modules/data_entry/services/ocr_extractor.py
+++ b/backend/modules/data_entry/services/ocr_extractor.py
@@ -7,6 +7,7 @@ from typing import Optional, Tuple, List
 from dataclasses import dataclass, field

 from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine
+from backend.modules.data_entry.services.ocr.profiles import ProfileRegistry


@dataclass
@@ -63,6 +64,57 @@ class ExtractionResult:
 class ReceiptExtractor:
    """Extract receipt fields using pattern matching for Romanian receipts."""

+    # =========================================================================
+    # DEPRECATED: STORE_PROFILES dict - USE ProfileRegistry INSTEAD
+    # =========================================================================
+    # Store profiles are now managed by ProfileRegistry in:
+    #   backend/modules/data_entry/services/ocr/profiles/
+    #
+    # This dict is kept for reference only. All extraction logic now uses:
+    #   ProfileRegistry.get_profile(cui)
+    #
+    # See: backend/modules/data_entry/services/ocr/profiles/README.md
+    # =========================================================================
+    STORE_PROFILES = {
+        # Lidl - multi-rate TVA (A+B), specific format without hyphen/colon
+        "22891860": {
+            "name": "LIDL DISCOUNT S.R.L.",
+            "tva_pattern": "lidl",
+            "tva_format": "TVA {code} {percent}% {amount}",
+            "has_multi_rate_tva": True,
+            "card_equals_total": True,
+        },
+        # OMV Petrom - single TVA rate, client CUI included
+        "11201891": {
+            "name": "OMV PETROM MARKETING S.R.L.",
+            "tva_pattern": "standard",
+            "has_client_cui": True,
+        },
+        # FIVE-HOLDING (BRICK) - standard format
+        "10562600": {
+            "name": "FIVE-HOLDING S.A.",
+            "tva_pattern": "standard",
+        },
+        # Dedeman - e-factura format
+        "2816464": {
+            "name": "DEDEMAN SRL",
+            "tva_pattern": "standard",
+            "has_efactura": True,
+        },
+        # SOCAR Petroleum
+        "12546600": {
+            "name": "SOCAR PETROLEUM S.A.",
+            "tva_pattern": "standard",
+            "has_client_cui": True,
+        },
+        # Kineterra - non-VAT payer
+        "31180432": {
+            "name": "KINETERRA CONCEPT SRL",
+            "tva_pattern": "none",
+            "is_non_vat_payer": True,
+        },
+    }
+
    # Total amount patterns (most specific first)
    # Romanian receipts use various formats: TOTAL LEI, TOTAL:, TOTAL RON, etc.
    # OCR often produces errors, so patterns must be tolerant
@@ -394,48 +446,101 @@ class ReceiptExtractor:
        result.raw_text = text
        text_upper = text.upper()

-        # Extract core fields
-        result.amount, result.confidence_amount = self._extract_amount(text_upper)
-        result.receipt_date, result.confidence_date = self._extract_date(text_upper)
-        result.receipt_number, _ = self._extract_number(text_upper)
-        result.receipt_series, _ = self._extract_series(text_upper)
+        # =========================================================================
+        # STEP 1: Extract vendor info FIRST to find store profile
+        # =========================================================================
        result.partner_name, result.confidence_vendor = self._extract_vendor(text)
        result.cui, _ = self._extract_cui(text_upper, text)
-        # Normalize CUI: fix R0 → RO OCR error and validate format
        result.cui = OCRValidationEngine.normalize_cui(result.cui)

-        # Extract additional fields - Multiple TVA entries
-        result.tva_entries, result.tva_total = self._extract_tva_entries(text_upper)
+        # Lookup store-specific profile for enhanced extraction accuracy
+        store_profile = ProfileRegistry.get_profile(result.cui) if result.cui else None
+        if store_profile:
+            print(f"[Profile] Using {store_profile.__class__.__name__} for CUI {result.cui}", flush=True)
+
+        # =========================================================================
+        # STEP 2: Extract ALL fields using profile (if available) or generic
+        # =========================================================================
+        if store_profile:
+            # Profile-specific extraction (higher accuracy for known stores)
+            result.amount, result.confidence_amount = store_profile.extract_total(text_upper)
+            result.receipt_date, result.confidence_date = store_profile.extract_date(text_upper)
+            result.receipt_number, _ = store_profile.extract_receipt_number(text_upper)
+            result.tva_entries = store_profile.extract_tva_entries(text_upper)
+            result.tva_total = sum(e['amount'] for e in result.tva_entries) if result.tva_entries else None
+            result.payment_methods = store_profile.extract_payment_methods(text_upper)
+
+            # Client data extraction via profile (CUI + name)
+            profile_client_cui, cui_confidence = store_profile.extract_client_cui(text_upper)
+            profile_client_name, name_confidence = store_profile.extract_client_name(text)
+
+            if profile_client_cui or profile_client_name:
+                # Use profile extraction results
+                result.client_cui = OCRValidationEngine.normalize_cui(profile_client_cui) if profile_client_cui else None
+                result.client_name = profile_client_name
+                result.confidence_client = max(cui_confidence, name_confidence)
+                # Address still via generic (no profile method)
+                _, _, client_address, _ = self._extract_client_data(text_upper, text)
+                result.client_address = client_address
+            else:
+                # Fallback to generic client extraction
+                client_name, client_cui, client_address, confidence = self._extract_client_data(text_upper, text)
+                result.client_name = client_name
+                result.client_cui = OCRValidationEngine.normalize_cui(client_cui)
+                result.client_address = client_address
+                result.confidence_client = confidence
+
+            print(f"[Profile] Extracted: total={result.amount}, date={result.receipt_date}, "
+                  f"TVA entries={len(result.tva_entries)}, payments={len(result.payment_methods)}", flush=True)
+        else:
+            # Generic extraction for unknown stores
+            result.amount, result.confidence_amount = self._extract_amount(text_upper)
+            result.receipt_date, result.confidence_date = self._extract_date(text_upper)
+            result.receipt_number, _ = self._extract_number(text_upper)
+            result.tva_entries, result.tva_total = self._extract_tva_entries(text_upper)
+            result.payment_methods = self._extract_payment_methods(text_upper)
+
+            # Generic client extraction
+            client_name, client_cui, client_address, confidence = self._extract_client_data(text_upper, text)
+            result.client_name = client_name
+            result.client_cui = OCRValidationEngine.normalize_cui(client_cui)
+            result.client_address = client_address
+            result.confidence_client = confidence
+
+        # Series extraction (no profile method, always generic)
+        result.receipt_series, _ = self._extract_series(text_upper)
+
+        # =========================================================================
+        # STEP 3: Debug logging and validation
+        # =========================================================================
        if not result.tva_entries:
            print(f"[TVA Debug] No TVA found. Checking patterns...", flush=True)
-            # Debug: show what patterns see
            normalized = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text_upper)
            taxe_match = re.search(r'T?OTAL\s+TAXE', normalized, re.IGNORECASE)
            rev_match = re.search(r'([\d.,]+)\s*T?OTAL\s+TAXE', normalized, re.IGNORECASE)
            print(f"[TVA Debug] 'OTAL TAXE' found: {bool(taxe_match)}, reversed: {rev_match.group(1) if rev_match else None}", flush=True)

-        # Log TVA vs TOTAL for debugging (validation happens in ocr_service._final_validation)
-        # NOTE: We NO LONGER clear TVA here - the service will recalculate TOTAL from TVA if needed
+        # Log TVA vs TOTAL for debugging
        if result.tva_total and result.amount:
            if result.tva_total > result.amount:
                print(f"[TVA Extraction] TVA ({result.tva_total}) > TOTAL ({result.amount}) - will be corrected in final validation", flush=True)
            elif result.tva_total > result.amount * Decimal('0.5'):
                print(f"[TVA Extraction] Warning: TVA ({result.tva_total}) is > 50% of TOTAL ({result.amount}) - suspicious", flush=True)

+        # Additional generic extractions
        result.items_count = self._extract_items_count(text_upper)
        result.address = self._extract_address(text_upper)
-        result.payment_methods = self._extract_payment_methods(text_upper)

-        # Validate payment methods against extracted amount
-        # If payment sum >> amount, clear invalid payments (likely OCR error)
+        # =========================================================================
+        # STEP 4: Validate and post-process
+        # =========================================================================
        # Save original payment methods before validation (for payment mode detection)
        original_payment_methods = result.payment_methods.copy() if result.payment_methods else []

+        # Validate payment methods against extracted amount
        result.payment_methods = self._validate_payment_methods(result.payment_methods, result.amount)

        # Auto-suggest payment_mode based on detected payment methods
-        # Use ORIGINAL payment_methods to detect CARD even if validation cleared them
-        # (e.g., CARD 318.16 is valid even if total validation failed)
        payment_methods_for_mode = result.payment_methods if result.payment_methods else original_payment_methods
        if payment_methods_for_mode:
            card_amount = sum(
@@ -447,17 +552,9 @@ class ReceiptExtractor:
                result.suggested_payment_mode = 'banca'
                print(f"[Payment Mode] CARD detected ({card_amount}), suggesting 'banca'", flush=True)
            else:
-                # Only cash payments detected
                result.suggested_payment_mode = 'numerar'
                print(f"[Payment Mode] Cash only detected, suggesting 'numerar'", flush=True)

-        # Extract client data (B2B receipts)
-        client_name, client_cui, client_address, confidence_client = self._extract_client_data(text_upper, text)
-        result.client_name = client_name
-        result.client_cui = OCRValidationEngine.normalize_cui(client_cui)  # Fix R0 → RO OCR error
-        result.client_address = client_address
-        result.confidence_client = confidence_client
-
        # Detect receipt type
        result.receipt_type = self._detect_receipt_type(text_upper)

@@ -620,6 +717,40 @@ class ReceiptExtractor:

        return num_str

+    def _calculate_multi_rate_tva_total(self, tva_entries: List[dict]) -> Optional[Decimal]:
+        """
+        Calculate implied total from ALL TVA entries (multi-rate support).
+
+        Formula for each entry: total_for_entry = tva * (100 + rate) / rate
+        Final total = sum of all entry totals (rounded once, after summing).
+
+        Entries that cannot contribute are skipped instead of aborting the
+        whole calculation: missing/zero amount, missing, non-numeric, zero or
+        negative rate, and non-finite (NaN/Infinity) values.
+
+        Example for Lidl (TVA A 21% = 7.71, TVA B 11% = 2.13):
+            Entry A: 7.71 * 121 / 21 = 44.42
+            Entry B: 2.13 * 111 / 11 = 21.49
+            Total: 44.42 + 21.49 = 65.92 (after rounding the exact sum)
+
+        Args:
+            tva_entries: dicts read via the 'percent' and 'amount' keys
+                ('code' is only used in the log line)
+
+        Returns:
+            Implied total Decimal quantized to 0.01, or None if calculation not possible
+        """
+        if not tva_entries:
+            return None
+
+        total = Decimal('0')
+        for entry in tva_entries:
+            rate = entry.get('percent', 0)
+            tva_amount = entry.get('amount')
+            if not tva_amount:
+                continue
+
+            try:
+                # Convert BEFORE comparing: a None or string rate must be rejected
+                # here rather than raise TypeError on a bare `rate > 0`.
+                rate_dec = Decimal(str(rate))
+                tva_dec = Decimal(str(tva_amount))
+            except (InvalidOperation, ValueError, TypeError):
+                continue
+
+            # NaN/Infinity would poison the running sum and make the final
+            # `total > 0` comparison raise, so drop them together with rate <= 0.
+            if not (rate_dec.is_finite() and tva_dec.is_finite()) or rate_dec <= 0:
+                continue
+
+            # Formula: total_for_entry = tva * (100 + rate) / rate
+            entry_total = tva_dec * (100 + rate_dec) / rate_dec
+            total += entry_total
+            print(f"[Multi-rate TVA] Entry {entry.get('code', '?')}: tva={tva_amount}, rate={rate}% -> implied={entry_total:.2f}", flush=True)
+
+        return total.quantize(Decimal('0.01')) if total > 0 else None
+
    def _cross_validate_and_calculate_amount(
        self,
        amount: Optional[Decimal],
@@ -634,12 +765,11 @@ class ReceiptExtractor:
        Returns: (amount, confidence, source_description)

        Logic:
-        1. If amount is valid (>0) with high confidence (>=0.8), use it directly
-        2. Calculate payment_sum = CARD + NUMERAR + other methods
-        3. Calculate tva_implied_total = tva_total * (100 + rate) / rate
-        4. Cross-validate: if payment_sum matches extracted amount, boost confidence
-        5. If amount is 0/None, use payment_sum as total
-        6. If payment_sum is 0, try to calculate from TVA
+        1. Collect all available sources: extracted amount, payment sum, TVA-implied total
+        2. Find consensus: 2+ sources within 3% tolerance
+        3. If consensus found, use the higher-confidence source value
+        4. If extracted differs >10% from all others, it's an outlier - correct it
+        5. If no consensus possible, fallback to individual validations
        """
        # Calculate payment methods sum
        payment_sum = Decimal('0')
@@ -652,43 +782,73 @@ class ReceiptExtractor:
                except (InvalidOperation, ValueError, TypeError):
                    continue

-        # Calculate TVA-implied total: total = tva * (100 + rate) / rate
-        tva_implied_total = None
-        if tva_entries:
-            # Use the main TVA entry (typically the largest or first one)
-            main_entry = tva_entries[0]
-            rate = main_entry.get('percent', 19)
-            tva_amount = main_entry.get('amount')
-            if tva_amount and rate > 0:
-                try:
-                    tva_dec = Decimal(str(tva_amount))
-                    # total = tva * (100 + rate) / rate
-                    tva_implied_total = (tva_dec * Decimal(100 + rate) / Decimal(rate)).quantize(Decimal('0.01'))
-                except (InvalidOperation, ValueError, TypeError):
-                    pass
+        # Calculate TVA-implied total using ALL entries (multi-rate fix)
+        tva_implied_total = self._calculate_multi_rate_tva_total(tva_entries)

-        # Case 1: Amount is valid with high confidence - validate against TVA and payments
+        # Multi-source consensus approach (3% tolerance for multi-rate TVA rounding)
+        CONSENSUS_TOLERANCE = 3.0  # 3% tolerance
+
+        # Collect all available sources with their confidences
+        sources = []
+        if amount and amount > 0:
+            sources.append(('extracted', float(amount), confidence_amount))
+        if payment_sum > 0:
+            sources.append(('payment', float(payment_sum), 0.92))  # Payment is very reliable
+        if tva_implied_total and tva_implied_total > 0:
+            sources.append(('tva_calc', float(tva_implied_total), 0.88))  # TVA calc is reliable
+
+        print(f"[Cross-Validation] Sources: {[(s[0], f'{s[1]:.2f}', f'{s[2]:.2f}') for s in sources]}", flush=True)
+
+        # Find consensus: 2+ sources within tolerance
+        if len(sources) >= 2:
+            for i, (name1, val1, conf1) in enumerate(sources):
+                for name2, val2, conf2 in sources[i+1:]:
+                    if val1 <= 0 or val2 <= 0:
+                        continue
+                    diff_pct = abs(val1 - val2) / max(val1, val2) * 100
+                    if diff_pct <= CONSENSUS_TOLERANCE:
+                        # Consensus found! Use value from higher-confidence source
+                        if conf1 >= conf2:
+                            consensus_val, consensus_conf = val1, conf1
+                        else:
+                            consensus_val, consensus_conf = val2, conf2
+                        # Boost confidence for consensus
+                        consensus_conf = min(0.98, consensus_conf + 0.05)
+                        print(f"[Cross-Validation] Consensus: {name1}={val1:.2f} ≈ {name2}={val2:.2f} (diff={diff_pct:.1f}%)", flush=True)
+                        return Decimal(str(round(consensus_val, 2))), consensus_conf, f"consensus ({name1}+{name2})"
+
+        # No consensus - check if extracted is an outlier (differs >10% from all others)
+        if amount and amount > 0 and len(sources) >= 2:
+            other_sources = [s for s in sources if s[0] != 'extracted']
+            if other_sources:
+                extracted_val = float(amount)
+                all_differ = all(
+                    abs(extracted_val - s[1]) / max(extracted_val, s[1]) * 100 > 10
+                    for s in other_sources if s[1] > 0
+                )
+                if all_differ:
+                    # Extracted differs significantly from all others - use the best other source
+                    best_other = max(other_sources, key=lambda s: s[2])
+                    print(f"[Cross-Validation] Extracted outlier: {extracted_val:.2f} differs >10% from all others, using {best_other[0]}={best_other[1]:.2f}", flush=True)
+                    return Decimal(str(round(best_other[1], 2))), best_other[2], f"corrected (extracted outlier, using {best_other[0]})"
+
+        # Fallback: Case 1 - Amount valid with high confidence
        if amount and amount > 0 and confidence_amount >= 0.8:
-            # First check TVA-implied total (most reliable when TVA is extracted correctly)
+            # Check TVA-implied total
            if tva_implied_total and tva_implied_total > 0:
                tva_diff_percent = abs(float(amount) - float(tva_implied_total)) / float(tva_implied_total) * 100
-                if tva_diff_percent <= 1:
-                    # Near-perfect TVA match - highest confidence
+                if tva_diff_percent <= 3:
                    return amount, min(0.98, confidence_amount + 0.05), "extracted (validated by TVA)"
                elif tva_diff_percent > 10:
-                    # Significant mismatch - TVA-implied total is more reliable
-                    # This catches cases where wrong TOTAL line was extracted (e.g., REST, SUBTOTAL)
                    print(f"[Cross-Validation] Amount mismatch with TVA: extracted={amount}, tva_implied={tva_implied_total} (diff={tva_diff_percent:.1f}%)", flush=True)
                    return tva_implied_total, 0.90, "calculated from TVA (extracted amount mismatch)"

            # Cross-validate with payment methods
            if payment_sum > 0 and abs(amount - payment_sum) <= Decimal('0.02'):
-                # Perfect match - boost confidence
                return amount, min(0.98, confidence_amount + 0.05), "extracted (validated by payment methods)"
            elif payment_sum > 0:
                payment_diff_percent = abs(float(amount) - float(payment_sum)) / float(payment_sum) * 100
                if payment_diff_percent > 10:
-                    # Significant mismatch - payment sum is more reliable
                    print(f"[Cross-Validation] Amount mismatch with payments: extracted={amount}, payments={payment_sum} (diff={payment_diff_percent:.1f}%)", flush=True)
                    return payment_sum, 0.88, "calculated from payment methods (extracted amount mismatch)"

@@ -696,29 +856,22 @@ class ReceiptExtractor:

        # Case 2: Amount exists but low confidence - try to validate/correct
        if amount and amount > 0:
-            # First check TVA-implied total (most reliable)
            if tva_implied_total and tva_implied_total > 0:
                tva_diff_percent = abs(float(amount) - float(tva_implied_total)) / float(tva_implied_total) * 100
-                if tva_diff_percent <= 2:
-                    # Close match - boost confidence
+                if tva_diff_percent <= 3:
                    return amount, 0.88, "extracted (validated by TVA)"
                elif tva_diff_percent > 10:
-                    # Significant mismatch - use TVA-implied total
                    print(f"[Cross-Validation] Amount mismatch with TVA: extracted={amount}, tva_implied={tva_implied_total} (diff={tva_diff_percent:.1f}%)", flush=True)
                    return tva_implied_total, 0.85, "calculated from TVA"

-            # Check if payment methods sum matches
            if payment_sum > 0:
                payment_diff_percent = abs(float(amount) - float(payment_sum)) / float(payment_sum) * 100
-                if payment_diff_percent <= 0.5:
-                    # Close match - boost confidence
+                if payment_diff_percent <= 1:
                    return amount, 0.90, "extracted (validated by payment methods)"
                elif payment_diff_percent > 10:
-                    # Mismatch - prefer payment_sum as it's more reliable
                    print(f"[Cross-Validation] Amount mismatch: extracted={amount}, payments={payment_sum}", flush=True)
                    return payment_sum, 0.85, "calculated from payment methods"

-            # No validation possible - return as-is
            return amount, confidence_amount, "extracted (unvalidated)"

        # Case 3: Amount is 0 or None - calculate from payment methods
@@ -946,6 +1099,28 @@ class ReceiptExtractor:

        return name

+    def _get_store_profile(self, cui: Optional[str]) -> Optional[dict]:
+        """
+        Return the validation-hints dict of the store profile registered for a CUI.
+
+        DEPRECATED: call ProfileRegistry.get_profile() directly when the profile
+        object itself is needed. This wrapper only exists for backward
+        compatibility with callers that expect a plain dict.
+
+        Args:
+            cui: The CUI extracted from receipt (with or without RO prefix)
+
+        Returns:
+            The dict produced by the profile's get_validation_hints(), with an
+            extra 'name' key set to the profile's STORE_NAME, or None when the
+            registry has no profile for this CUI
+        """
+        matched = ProfileRegistry.get_profile(cui)
+        if not matched:
+            return None
+
+        # The hints dict is extended in place with the store name before returning
+        legacy_hints = matched.get_validation_hints()
+        legacy_hints['name'] = matched.STORE_NAME
+        print(f"[Store Profile] Found profile for {cui}: {matched.STORE_NAME}", flush=True)
+        return legacy_hints
+
    def _extract_cui(self, text_upper: str, original_text: str) -> Tuple[Optional[str], float]:
        """
        Extract vendor CUI (fiscal identification code) from text.
@@ -1020,11 +1195,114 @@ class ReceiptExtractor:
        # Default to bon_fiscal if neither found
        return 'bon_fiscal'

+    def _try_pattern_lidl(self, text: str) -> List[dict]:
+        """
+        Collect TVA entries written Lidl-style, i.e. code, rate and amount on
+        one line with no hyphen/colon separator: "TVA A 21,00% 7.71".
+
+        Typical Lidl receipt excerpt:
+            TOTAL TVA 9,84
+            TVA A 21,00% 7,71
+            TVA B 11,00% 2,13
+
+        Only the integer part of the rate is kept, and each (code, percent)
+        pair is reported once - the first positive amount seen for it wins.
+
+        Returns list of TVA entries found, each shaped as
+            {'code': 'A', 'percent': 21, 'amount': Decimal('7.71')}
+        """
+        # Tried in this order; the order decides which match wins on duplicates.
+        # Handles: "TVA A 21,00% 7,71", "TVA B 11,00% 2,13", "TUA A 21% 7.71"
+        lidl_patterns = (
+            # Same line: "TVA A  21,00%   7.71" (with various spacing)
+            r'T[VU][AR]\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)',
+            # Same line with backslash (OCR artifact): "TVA A \21,00% 7.71"
+            r'T[VU][AR]\s+([A-D])\s+\\?(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)',
+            # IVA variant
+            r'IVA\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)',
+        )
+
+        found: List[dict] = []
+        seen_keys = set()
+
+        for regex in lidl_patterns:
+            for hit in re.finditer(regex, text, re.IGNORECASE):
+                try:
+                    code = hit.group(1).upper()
+                    percent = int(hit.group(2))
+                    amount = Decimal(self._normalize_number(hit.group(3)))
+
+                    key = (code, percent)
+                    # Non-positive amounts and already-recorded rates are ignored
+                    if amount <= 0 or key in seen_keys:
+                        continue
+
+                    found.append({'code': code, 'percent': percent, 'amount': amount})
+                    seen_keys.add(key)
+                    print(f"[TVA Lidl] Found: TVA {code} {percent}% = {amount}", flush=True)
+                except (ValueError, InvalidOperation):
+                    # Unparsable rate/amount text: skip this match only
+                    continue
+
+        return found
+
+    def _select_best_tva_candidate(
+        self,
+        candidates: List[tuple],
+        tva_bon_total: Optional[Decimal]
+    ) -> Tuple[List[dict], Optional[Decimal]]:
+        """
+        Pick the winning TVA candidate by score.
+
+        Score components:
+        1. +100 when the candidate sum is within max(0.02, 2% of TOTAL TVA BON)
+           of TOTAL TVA BON (only checked when both values are truthy)
+        2. +10 per entry (favours multi-rate results)
+        3. +5 * pattern confidence (tiebreaker)
+
+        On equal scores the candidate listed first wins.
+
+        Args:
+            candidates: List of (pattern_name, confidence, entries, sum)
+            tva_bon_total: Authoritative TOTAL TVA BON value (if extracted)
+
+        Returns:
+            (best_entries, best_sum), or ([], None) when there are no candidates
+        """
+        if not candidates:
+            return [], None
+
+        ranking = []
+        for label, pattern_conf, found_entries, found_sum in candidates:
+            entry_count = len(found_entries)
+
+            # Bonus for agreeing with the authoritative total printed on the receipt
+            match_bonus = 0.0
+            if tva_bon_total and found_sum:
+                allowed_gap = max(Decimal('0.02'), tva_bon_total * Decimal('0.02'))  # 2% tolerance
+                if abs(found_sum - tva_bon_total) <= allowed_gap:
+                    match_bonus += 100
+                    print(f"[TVA Select] {label}: sum {found_sum} matches tva_bon_total {tva_bon_total}", flush=True)
+
+            points = match_bonus + entry_count * 10 + pattern_conf * 5
+
+            ranking.append((points, label, pattern_conf, found_entries, found_sum))
+            print(f"[TVA Select] Candidate {label}: score={points:.1f}, entries={entry_count}, sum={found_sum}", flush=True)
+
+        # Stable descending sort: among equal scores the earlier candidate stays first
+        ordered = sorted(ranking, key=lambda item: item[0], reverse=True)
+        top_points, top_label, _, top_entries, top_sum = ordered[0]
+        print(f"[TVA Select] Winner: {top_label} (score={top_points:.1f})", flush=True)
+
+        return top_entries, top_sum
+
    def _extract_tva_entries(self, text: str) -> Tuple[List[dict], Optional[Decimal]]:
        """
        Extract multiple TVA (VAT) entries from text.
        Romanian receipts can have multiple TVA rates (A=19%, B=9%, C=5%, D=0%).

+        Uses CANDIDATE COLLECTION approach:
+        - Try ALL patterns and collect candidates
+        - Select best candidate based on matching TOTAL TVA BON
+
        Returns (tva_entries, tva_total) where tva_entries is a list of:
            {'code': 'A', 'percent': 19, 'amount': Decimal('15.20')}
        """
@@ -1054,6 +1332,22 @@ class ReceiptExtractor:
        # Also normalize comma followed by space to comma (for "21, 00%" -> "21,00%")
        normalized_text = re.sub(r'(\d+),\s+(\d{2})\s*%', r'\1.\2%', normalized_text)

+        # Extract TOTAL TVA BON/TOTAL TVA first as the authoritative reference
+        tva_bon_total = self._extract_total_tva_bon(normalized_text)
+        print(f"[TVA Debug] TOTAL TVA BON: {tva_bon_total}", flush=True)
+
+        # CANDIDATE COLLECTION APPROACH: Try all patterns, collect candidates, select best
+        all_candidates = []  # List of (pattern_name, confidence, entries, sum)
+
+        # === LIDL-STYLE PATTERNS (NEW) ===
+        # Lidl format: "TVA A 21,00% 7.71" or "TVA B 11,00% 2.13" (no hyphen/colon)
+        # This pattern handles multi-rate TVA receipts
+        lidl_entries = self._try_pattern_lidl(normalized_text)
+        if lidl_entries:
+            lidl_sum = sum(e['amount'] for e in lidl_entries)
+            all_candidates.append(('lidl', 0.96, lidl_entries, lidl_sum))
+            print(f"[TVA Debug] Lidl pattern: {len(lidl_entries)} entries, sum={lidl_sum}", flush=True)
+
        # Pattern 0a: First try to get TVA from "TOTAL TAXE:" which is most reliable
        # Format: "TOTAL TAXE: 55,22" - this is always the TVA amount
        # OCR may cut "T" producing "OTAL TAXE:" instead of "TOTAL TAXE:"
@@ -1372,10 +1666,21 @@ class ReceiptExtractor:
                    except (ValueError, InvalidOperation):
                        continue

-        # Extract TOTAL TVA BON as reference (separate from individual entries)
-        tva_bon_total = self._extract_total_tva_bon(normalized_text)
+        # Add existing extraction results to candidates (if any)
+        if tva_entries:
+            entries_sum = sum(entry['amount'] for entry in tva_entries)
+            all_candidates.append(('standard', 0.90, tva_entries, entries_sum))
+            print(f"[TVA Debug] Standard patterns: {len(tva_entries)} entries, sum={entries_sum}", flush=True)

-        # Calculate sum from entries
+        # === CANDIDATE SELECTION ===
+        # Select best candidate using TOTAL TVA BON as authoritative reference
+        if all_candidates:
+            best_entries, best_sum = self._select_best_tva_candidate(all_candidates, tva_bon_total)
+            if best_entries:
+                tva_entries = best_entries
+                entries_sum = best_sum
+
+        # Calculate sum from entries (if not set by candidate selection)
        entries_sum = None
        if tva_entries:
            entries_sum = sum(entry['amount'] for entry in tva_entries)