# roa2web-service-auto/backend/modules/data_entry/services/ocr_extractor.py

"""Extract structured fields from OCR text (Romanian receipts)."""

import re
from datetime import date, datetime
from decimal import Decimal, InvalidOperation
from typing import Optional, Tuple, List
from dataclasses import dataclass, field

from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine
from backend.modules.data_entry.services.ocr.profiles import ProfileRegistry


@dataclass
class ExtractionResult:
    """Everything the extractor pulled out of a single receipt's OCR text."""
    # --- Receipt identity and headline values ---
    receipt_type: str = 'bon_fiscal'
    receipt_number: Optional[str] = None
    receipt_series: Optional[str] = None
    receipt_date: Optional[date] = None
    amount: Optional[Decimal] = None
    partner_name: Optional[str] = None
    cui: Optional[str] = None
    description: Optional[str] = None

    # --- Secondary fields; a receipt may carry several TVA entries ---
    tva_entries: List[dict] = field(default_factory=list)  # each entry: {code, percent, amount}
    tva_total: Optional[Decimal] = None
    address: Optional[str] = None
    items_count: Optional[int] = None
    payment_methods: List[dict] = field(default_factory=list)  # e.g. [{"method":"CARD","amount":Decimal}]
    suggested_payment_mode: Optional[str] = None  # 'banca' when CARD was detected, 'numerar' when cash only

    # --- Buyer information (present on B2B receipts) ---
    client_name: Optional[str] = None
    client_cui: Optional[str] = None
    client_address: Optional[str] = None

    # --- Per-field confidence scores and run metadata ---
    confidence_amount: float = 0.0
    confidence_date: float = 0.0
    confidence_vendor: float = 0.0
    confidence_client: float = 0.0
    raw_text: str = ""
    ocr_engine: str = ""  # name of the OCR engine used: paddleocr or tesseract
    processing_time_ms: int = 0  # duration of processing, in milliseconds

    # --- Validation tracking (added by the bon-ocr-validation feature) ---
    needs_manual_review: Optional[bool] = None  # None = not validated, False = ok, True = needs review
    validation_warnings: List[str] = field(default_factory=list)
    validation_errors: List[str] = field(default_factory=list)
    confidence_adjustments: dict[str, float] = field(default_factory=dict)  # field name -> penalty
    inter_ocr_ratios: dict[str, float] = field(default_factory=dict)  # field name -> ratio

    @property
    def overall_confidence(self) -> float:
        """Return the weighted blend of the amount, date and vendor confidences.

        The amount contributes 40%, the date and the vendor 30% each, and the
        sum is rounded to two decimals. ``confidence_client`` is not included.
        """
        amount_part = self.confidence_amount * 0.4
        date_part = self.confidence_date * 0.3
        vendor_part = self.confidence_vendor * 0.3
        # Same left-to-right addition order as the weights are listed above.
        return round(amount_part + date_part + vendor_part, 2)


class ReceiptExtractor:
    """Extract receipt fields using pattern matching for Romanian receipts."""

    # =========================================================================
    # DEPRECATED: STORE_PROFILES dict - USE ProfileRegistry INSTEAD
    # =========================================================================
    # Store profiles are now managed by ProfileRegistry in:
    #   backend/modules/data_entry/services/ocr/profiles/
    #
    # This dict is kept for reference only. All extraction logic now uses:
    #   ProfileRegistry.get_profile(cui)
    #
    # See: backend/modules/data_entry/services/ocr/profiles/README.md
    # =========================================================================
    STORE_PROFILES = {
        # Lidl - multi-rate TVA (A+B), specific format without hyphen/colon
        "22891860": {
            "name": "LIDL DISCOUNT S.R.L.",
            "tva_pattern": "lidl",
            "tva_format": "TVA {code} {percent}% {amount}",
            "has_multi_rate_tva": True,
            "card_equals_total": True,
        },
        # OMV Petrom - single TVA rate, client CUI included
        "11201891": {
            "name": "OMV PETROM MARKETING S.R.L.",
            "tva_pattern": "standard",
            "has_client_cui": True,
        },
        # FIVE-HOLDING (BRICK) - standard format
        "10562600": {
            "name": "FIVE-HOLDING S.A.",
            "tva_pattern": "standard",
        },
        # Dedeman - e-factura format
        "2816464": {
            "name": "DEDEMAN SRL",
            "tva_pattern": "standard",
            "has_efactura": True,
        },
        # SOCAR Petroleum
        "12546600": {
            "name": "SOCAR PETROLEUM S.A.",
            "tva_pattern": "standard",
            "has_client_cui": True,
        },
        # Kineterra - non-VAT payer
        "31180432": {
            "name": "KINETERRA CONCEPT SRL",
            "tva_pattern": "none",
            "is_non_vat_payer": True,
        },
    }

    # Total amount patterns (most specific first)
    # Romanian receipts use various formats: TOTAL LEI, TOTAL:, TOTAL RON, etc.
    # OCR often produces errors, so patterns must be tolerant
    TOTAL_PATTERNS = [
        # Most common: TOTAL LEI followed by amount (with OCR-tolerant variations)
        # Handles: TOTAL LEI, TOTAL. LE!, T0TAL LEI, TOTAL LE1, etc.
        (r'T[O0]TAL[.\s]+L[E3][I1!]\s*:?\s*([\d\s.,]+)', 0.98),  # OCR-tolerant: TOTAL. LE!, T0TAL LEI
        (r'TOTAL\s+LEI\s*([\d\s.,]+)', 0.98),  # Standard clean pattern
        (r'[OT]?OTAL\s+LEI\s*([\d\s.,]+)', 0.95),  # OCR may miss first letter
        # Standard patterns
        (r'TOTAL\s*:?\s*([\d\s.,]+)\s*(?:RON|LEI)?', 0.95),
        (r'TOTAL\s+(?:RON|LEI)\s*([\d\s.,]+)', 0.95),
        # SUBTOTAL when TOTAL not found
        (r'SUBTOTAL\s*([\d\s.,]+)', 0.90),
        (r'[SB]?UBTOTAL\s*([\d\s.,]+)', 0.88),  # OCR variations
        # Payment methods
        (r'DE\s+PLATA\s*:?\s*([\d\s.,]+)', 0.90),
        (r'SUMA\s*:?\s*([\d\s.,]+)', 0.85),
        (r'PLATA\s+CARD\s*:?\s*([\d\s.,]+)', 0.85),
        (r'NUMERAR\s*:?\s*([\d\s.,]+)', 0.80),
        (r'REST\s*:?\s*([\d\s.,]+)', 0.70),  # Sometimes total is near REST
    ]

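    # Fallback: when OCR does not capture the "TOTAL" keyword, the total is recovered
    # in code rather than by a pattern in this list: _extract_amount looks for the
    # largest standalone amount after the product lines (likely the total).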

    # Date patterns - support dash, dot, and slash separators
    # OCR may produce DRTA instead of DATA, DAIA, etc.
    # OCR may also add spaces/commas in dates: "27. 10, 2025" instead of "27.10.2025"
    DATE_PATTERNS = [
        # DATA/DRTA: DD-MM-YYYY, also with '.' or '/' separators (OCR tolerant; D[AR]TA does not match DAIA)
        (r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
        (r'DATA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
        # Date followed by ORA (time) - OCR may produce 0RA
        (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+[O0]RA\s*:?\s*\d{2}:\d{2}', 0.95),
        # Date followed by time without ORA keyword
        (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+\d{2}:\d{2}', 0.90),
        # Standalone date
        (r'(\d{2}[-./]\d{2}[-./]\d{4})', 0.80),
        # YYYY-MM-DD format (less common)
        (r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75),
    ]

    # OCR-corrupted date patterns with spaces/commas
    # Handles: "27. 10, 2025", "27, 10. 2025", "2025. 08. 14", etc.
    DATE_PATTERNS_OCR_SPACES = [
        # YYYY. MM. DD format with spaces (OMV/Petrom receipts) - with time
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.92, 'ymd'),
        # YYYY. MM. DD format with spaces (standalone)
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.85, 'ymd'),
        # DD. MM, YYYY or DD, MM. YYYY (with time following)
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
        # DD. MM, YYYY or DD, MM. YYYY (standalone)
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
    ]

    # Receipt number patterns - Romanian fiscal receipt formats
    # OCR may produce N instead of : or other errors
    NUMBER_PATTERNS = [
        # NDS format (common in Romanian POS)
        (r'NDS\s*:?\s*(\d+)', 0.98),
        # C3POS terminal format - OCR may have N instead of : (C3POS-CT2N1360760)
        (r'C3POS[-A-Z0-9]*[N:](\d{6,7})', 0.98),  # CT2N1360760 format
        (r'C3POS.*?(\d{6,7})\b', 0.95),  # Any C3POS followed by 6-7 digit number
        (r'CT2[N:]\s*(\d{6,})', 0.95),  # CT2N prefix
        # BF (Bon Fiscal) number - high priority
        # Format: "Z:0864 BF:0018" - extract only the number after BF:
        (r'BF\s*:\s*(\d{4,})', 0.96),  # BF: with colon (most specific)
        (r'BF\s+(\d{4,})', 0.93),  # BF followed by space and number
        # NIVS format
        (r'NIVS\s*:?\s*(\d+)', 0.95),
        # Standard NR BON formats
        (r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95),
        (r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95),
        (r'CHITANTA\s+NR\.?\s*:?\s*(\d+)', 0.95),
        # Document number
        (r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90),
        # ID BF format
        (r'ID\s*BF\s*:?\s*(\d+)', 0.90),
        # TD format (transaction ID)
        (r'TD\s*:?\s*(\d+)', 0.85),
        # 6-8 digit number (typical receipt number length)
        (r'\b(\d{6,8})\b', 0.70),
        # Generic long number at end (fallback)
        (r'NR\.?\s*:?\s*(\d{4,})', 0.65),
    ]

    # CUI (fiscal code) patterns - IMPORTANT: exclude CLIENT CUI
    # CIF = Cod de Identificare Fiscală (vendor's tax ID)
    # CLIENT C.U.I. = client's tax ID (should be ignored)
    # OCR errors: R0 instead of RO, C1F instead of CIF
    CUI_PATTERNS = [
        # CIF at start of line (definitely vendor) - tolerant to OCR errors
        # NOTE: Capture full CUI including RO prefix: (R[O0]?\d{6,10}) or ((?:R[O0])?\d{6,10})
        (r'^CIF\s*:?\s*(R[O0]?\d{6,10})', 0.98),
        (r'^CIF\s*:?\s*(\d{6,10})', 0.97),  # Without RO prefix
        (r'^C[I1]F\s*:?\s*(R[O0]?\d{6,10})', 0.95),  # C1F OCR error
        (r'^C[I1]F\s*:?\s*(\d{6,10})', 0.94),  # C1F without RO
        # CIF not preceded by CLIENT (negative lookbehind)
        (r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(R[O0]?\d{6,10})', 0.95),
        (r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(\d{6,10})', 0.94),
        # Standalone CIF: format with OCR tolerance
        (r'\bC[I1]F\s*:?\s*(R[O0]?\d{6,10})\b', 0.90),
        (r'\bC[I1]F\s*:?\s*(\d{6,10})\b', 0.89),
        # COD FISCAL (vendor)
        (r'COD\s+FISCAL\s*:?\s*(R[O0]?\d{6,10})', 0.90),
        (r'COD\s+FISCAL\s*:?\s*(\d{6,10})', 0.89),
        # C. I. F. format with SPACES (OCR artifact) - "C. I. F. : R011201891"
        # Also handles double colon from OMV/Petrom: "C. I.F.: : RO11201891"
        (r'C\.\s*I\.\s*F\.?\s*[:\s]+(R[O0]?\d{6,10})', 0.92),
        (r'C\.\s*I\.\s*F\.?\s*[:\s]+(\d{6,10})', 0.91),
        # C.I.F. format (with dots, no spaces)
        (r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.88),
        (r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(\d{6,10})', 0.87),
        # CUI format (less specific, use with caution)
        (r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.85),
        (r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(\d{6,10})', 0.84),
        # Lidl format: "Cod Identificare fiscala: RO..." (OCR corrupts to "Ced Identificanfliscalar")
        # Matches: "Identificare fiscala", "Identificanfliscalar", "Identificoan/Fljscales"
        (r'[IC](?:od|ed)\s*Identific[a-z/]*\s*(R[O0]\d{6,10})', 0.90),
        # Generic: anything with "fiscal" followed by RO + digits
        (r'fiscal[a-z]*\s*:?\s*(R[O0]\d{6,10})', 0.85),
    ]

    # Pattern for CIF NUMBER appearing BEFORE "C.I.F." label (reversed format)
    # Common in some receipts: "RO11201891\nC. I. F." - number on line before label
    # IMPORTANT: Capture the full CUI including RO prefix
    CUI_REVERSED_PATTERNS = [
        # RO/R0 + 6-10 digits on line immediately before C.I.F./CIF label
        # Capture the FULL CUI including RO prefix
        (r'(R[O0]\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.98),
        # Just digits before C.I.F. label (neplatitor TVA - no RO prefix)
        (r'(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.95),
    ]

    # Series patterns - be strict to avoid false matches
    SERIES_PATTERNS = [
        (r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90),
        # Z: format from Romanian fiscal receipts (must be at start of line or after space)
        (r'(?:^|\s)Z\s*:\s*(\d{4})', 0.85),
        # BF series with explicit marker
        (r'(?:^|\s)BF\s*:\s*(\d{4})', 0.85),
    ]

    # TVA (VAT) patterns - OCR may produce TUA, TVR, IVA, etc.
    # All patterns are case-insensitive (re.IGNORECASE applied in extraction)
    TVA_PATTERNS = [
        # TOTAL TVA BON format (OCR tolerant: TUA, TVR, IVA)
        (r'TOTAL\s+(?:T[VU][AR]|IVA)\s+BON\s*:?\s*([\d\s.,]+)', 0.98),
        (r'T[O0]TAL\s+(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.95),
        # IVA variant (Spanish/Portuguese influence, some receipts)
        (r'TOTAL\s+IVA\s*:?\s*([\d\s.,]+)', 0.95),
        (r'IVA\s+[A-D]?\s*[-:]?\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.93),
        # TVA with percentage (OCR tolerant)
        (r'T[VU][AR]\s+(?:A\s*[-:]?\s*)?(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.95),
        (r'T[VU][AR]\s+[A-Z]\s*[-:]\s*(\d{1,2})\s*%\s*([\d\s.,]+)', 0.93),
        # 5% TVA rate (books, newspapers - TVA C)
        (r'T[VU][AR]\s*[C5]\s*[-:]\s*5\s*%\s*:?\s*([\d\s.,]+)', 0.93),
        (r'(?:T[VU][AR]|IVA)\s+5\s*%\s*:?\s*([\d\s.,]+)', 0.92),
        # Garbled OCR: T0TAL, TVAI, TUAI, etc.
        (r'T[O0]T[AE]L\s+(?:T[VUAI]+[AR]?|IVA)\s*:?\s*([\d\s.,]+)', 0.88),
        # OCR corruption: "TA F 194" (TVA with V→F or space), "T A 19%"
        # Handles: "TOTAL TA F 194" where TVA became "TA F"
        (r'TOTAL\s+TA\s*[F\s]?\s*\d*\s*:?\s*([\d\s.,]+)', 0.85),
        (r'TA\s+[FA-Z]?\s*\d{1,2}\s*%?\s*:?\s*([\d\s.,]+)', 0.82),
        # NOTE: Removed a problematic pattern for "TUA F" (OCR noise) that was incorrectly
        # matching percentage values like "TVA A\n19,00%". The simple TVA/IVA fallback
        # directly below (the 12th entry in this list) handles these cases.
        # Simple TVA/IVA pattern - this is the reliable fallback
        (r'(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.85),
        # Standalone percentage line near TVA
        (r'(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.75),
    ]

    # Payment method patterns - appears after TOTAL LEI, before TOTAL TVA
    # Format: "CARD: 50.00" or "NUMERAR 100.00" or "PLATA CARD: 50.00"
    # OMV/Petrom uses "CARTE CREDIT" or "CARTE CREDIT 318, 16"
    PAYMENT_METHOD_PATTERNS = [
        # CARTE CREDIT with amount on same line (OMV/Petrom receipts)
        # Handles: "CARTE CREDIT 318, 16" with OCR spaces in number
        (r'CARTE\s+CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.98),
        # CARTE CREDIT with amount on next line (OCR may split lines)
        # Handles: "CARTE CREDIT\n318, 16"
        (r'CARTE\s+CREDIT\s*:?\s*\n\s*([\d\s.,]+)', 'CARD', 0.97),
        # CARD with amount (high confidence)
        # Also handles OCR artifacts like "CARD F 100.00" where F is noise
        (r'(?:PLATA\s+)?CARD\s*[:\sA-Z]?\s*([\d\s.,]+)', 'CARD', 0.95),
        # NUMERAR (cash) with amount
        (r'NUMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.95),
        # CASH alternative spelling
        (r'CASH\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.90),
        # Truncation recovery patterns (for OCR left-margin truncation issues)
        # IMPROVED: More restrictive - require max 6 digits before decimals
        # to avoid matching CUI numbers like RO10562600 → RD10562600
        # "RD" = truncated "CARD" (only 2 chars visible)
        (r'(?:^|\n|\s)RD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.70),
        # "ARD" = truncated "CARD" (3 chars visible)
        (r'(?:^|\n|\s)ARD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.75),
        # "MERAR" = truncated "NUMERAR"
        (r'(?:^|\n|\s)MERAR\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'NUMERAR', 0.70),
    ]

    # Maximum reasonable payment amount for a receipt (100,000 LEI)
    # Amounts larger than this are likely OCR errors (e.g., CUI parsed as amount)
    MAX_REASONABLE_PAYMENT = Decimal('100000')

    # Items count patterns - OCR may produce OZ instead of POZ, etc.
    # Number may be on separate line before or after the label
    # IMPORTANT: Must be specific to avoid matching product quantities like "50BUC"
    ITEMS_COUNT_PATTERNS = [
        # NR. POZ. ART. IN BON: 17 (Romanian format with dots and spaces)
        # OCR tolerant: OZ instead of POZ, 0 instead of O (ART must be read correctly; ARI is not matched)
        (r'NR\.?\s*P?[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*(\d+)', 0.98),
        # Number on line BEFORE "OZ. ART. IN BON:" - OCR sometimes reorders
        (r'(\d{1,2})\s*\n\s*[O0]Z\.?\s*ART', 0.95),
        # Number may be on next line after label
        (r'[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.93),
        (r'NR\.?\s*(?:P?[O0]Z\.?)?\s*ART(?:ICOLE)?\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.90),
        # Simpler patterns - but more specific
        (r'ARTIC[O0]LE\s*:?\s*(\d+)', 0.88),
        # POZ at start of line or after colon (not in product descriptions)
        (r'(?:^|\s|:)P?[O0]Z\.?\s*(?:ART)?\.?\s*(?:IN\s+BON)?\s*:?\s*(\d{1,3})(?:\s|$)', 0.85),
    ]

    # Address patterns (Romanian format)
    ADDRESS_PATTERNS = [
        # Street patterns
        (r'(STR\.?\s+[A-Z0-9\s.,]+(?:NR\.?\s*\d+)?)', 0.90),
        # Full address with JUD (county)
        (r'(JUD\.?\s+[A-Z]+,?\s*(?:MUN\.?|OR\.?|COM\.?)?\s*[A-Z]+)', 0.85),
    ]

    # Client/Buyer patterns (for B2B receipts)
    # CLIENT, CUMPARATOR, BENEFICIAR sections
    # Variations: "CIF CLIENT:", "CLIENT C.U.I/C.I.F.", "CLIENT C. U. I./ C. I.F."
    CLIENT_SECTION_MARKERS = [
        # Reversed format: CIF/CUI before CLIENT
        r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:',  # CIF CLIENT:
        r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:',  # CUI CLIENT:
        # Corrupted CLIENT after CIF: "CIF a IENT:", "CIF LIENT:", "CIF CL IENT:"
        r'C[I1]F\s+[A-Z\s]{0,6}IENT\s*:',  # "CIF a IENT:", "CIF CL IENT:", "CIF LIENT:"
        r'C[I1]F\s+LIENT\s*:',  # "CIF LIENT:" (missing C from CLIENT)
        # CLIENT followed by C.U.I./C.I.F. (all variations with/without spaces and dots)
        # Handles: CLIENT C.U.I/C.I.F., CLIENT C. U. I./ C. I.F., CLIENT CUI/CIF
        r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/?\s*C?\.?\s*[I1]?\.?\s*F?\.?\s*:',
        r'CLIENT\s+C\.?\s*[UI1]\.?\s*[IF1]\.?\s*:',  # CLIENT CUI: or CLIENT CIF:
        r'CLIENT\s*:',
        # CUMPARATOR variants
        r'CUMPARATOR\s+C\.?\s*[UI1]\.?\s*[IF1]\.?\s*:',  # CUMPARATOR CUI: or CIF:
        r'CUMPARATOR\s*:',
        r'BENEFICIAR\s*:',
        r'CUMP[AĂ]R[AĂ]TOR\s*:',
        r'DATE\s+CLIENT',
        r'LIENT\s*:',  # OCR truncation
    ]

    # Client CUI patterns (explicitly after CLIENT marker)
    # OCR errors: R0 instead of RO, C1F instead of CIF, 1 instead of I
    CLIENT_CUI_PATTERNS = [
        # NEW: CUI on line BEFORE CLIENT marker (docTR/OCR may output value before label)
        # Pattern: "RO1879855\nCLIENT C.U.I./C.I.F.:" - CUI on line before CLIENT label
        (r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*U\.?\s*[I1]\.?', 0.99),
        (r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*[I1]\.?\s*F\.?', 0.99),
        # Same but with optional colon after RO number
        (r'(R[O0]\d{6,10})\s*:?\s*\n\s*CLIENT', 0.98),
        # "CIF I CLIENT:" or "CIF IDENTIFICARE CLIENT:" format (OCR may insert extra chars)
        # Common OCR artifact: "CIF I CLIENT: R01879855"
        (r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]\d{6,10})', 0.98),
        (r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.97),
        # CIF CLIENT: R01879856 (reversed format - CIF/CUI before CLIENT)
        (r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
        (r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
        (r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
        (r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
        # CLIENT C.U.I/C.I.F. or CLIENT C. U. I./ C. I.F. (slash variant - all spacing)
        # Most flexible pattern for slash variants
        (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.97),
        (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.97),
        # OCR artifact: doubled letters like "C.U U. I." or "C.I I.F." (docTR sometimes duplicates)
        (r'CLIENT\s+C\.?\s*U\.?\s*U?\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
        (r'CLIENT\s+C\.?\s*U\.?\s*U?\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
        # CLIENT C.U.I. or CLIENT CUI or CLIENT CIF (without slash)
        (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
        (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
        (r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
        (r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
        # Corrupted CLIENT after CIF: "CIF a IENT:", "CIF LIENT:", "CIF L IENT:", "CIF C IENT:"
        # OCR often corrupts "CLIENT" when it appears after "CIF"
        (r'CIF\s+[a-zA-Z\s]{2,8}IENT\s*:?\s*(R[O0]?\d{6,10})', 0.93),  # "CIF a IENT:", "CIF CL IENT:"
        (r'CIF\s+[a-zA-Z\s]{2,8}IENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
        (r'CIF\s+LIENT\s*:?\s*(R[O0]?\d{6,10})', 0.92),  # "CIF LIENT:" (missing C)
        (r'CIF\s+LIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.92),
        # CUMPARATOR variants
        (r'CUMPARATOR\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
        (r'CUMPARATOR\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
        # CUMPARATOR with CUI/CIF on next line: "CUMPARATOR: NAME\nCIF: 12345678"
        (r'CUMPARATOR\s*:.*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
        (r'CUMPARATOR\s*:.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),  # F or T (OCR error)
        # CUMPARATOR with CUI/CIF two lines down: "CUMPARATOR: NAME\nADDRESS\nCIF: 12345678"
        (r'CUMPARATOR\s*:.*\n.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
        # CUI/CIF on line immediately after CLIENT marker
        (r'CLIENT\s*:\s*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
        (r'CLIENT\s*:\s*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),  # F or T (OCR error)
        # CUI/CIF after client name: "CLIENT: COMPANY SRL\nCUI: 12345678"
        (r'CLIENT\s*:.*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
        (r'CLIENT\s*:.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),  # CIF/CIT after name
    ]

    # Vendor name indicators (lines containing these are likely vendor names)
    # These should be company type suffixes, not generic words
    # Patterns must handle OCR spaces: "S. R. L." as well as "S.R.L."
    VENDOR_INDICATORS = [
        r'\bS\.?\s*R\.?\s*L\.?\b',      # S.R.L. or S. R. L.
        r'\bS\.?\s*A\.?\b',              # S.A. or S. A.
        r'\bS\.?\s*N\.?\s*C\.?\b',      # S.N.C. or S. N. C.
        r'\bS\.?\s*C\.?\s*S\.?\b',      # S.C.S. or S. C. S.
        r'\bI\.?\s*I\.?\b',              # I.I. or I. I.
        r'\bP\.?\s*F\.?\s*A\.?\b',      # P.F.A. or P. F. A.
        # S.C. alone is too short and generic - only match if followed by company name
        r'\bS\.?\s*C\.?\s+[A-Z]',       # S.C. followed by company name
        r'HOLDING',
        r'COMPANY',
        r'GROUP',
        # Removed: MAGAZIN, MARKET, SHOP - too generic, match store welcome messages
    ]

    def extract(self, text: str) -> ExtractionResult:
        """Run the whole extraction pipeline over one receipt's OCR text.

        The vendor name and CUI are read first so that a store profile can be
        looked up by CUI. When a profile is found it supplies the total, date,
        receipt number, TVA entries, payment methods and client CUI/name;
        otherwise the generic extractors of this class are used. Series, item
        count, address and receipt type always come from the generic
        extractors. Finally the payment methods are validated against the
        amount and the amount itself is cross-validated.

        Args:
            text: OCR output. Most extractors receive an upper-cased copy; the
                vendor, CUI and client extractors also get the original.

        Returns:
            The populated ExtractionResult; ``raw_text`` holds ``text`` unchanged.
        """
        result = ExtractionResult(raw_text=text)
        upper_text = text.upper()

        def fill_client_from_generic() -> None:
            # Generic client block: name, normalised CUI, address and confidence.
            name, raw_client_cui, address, score = self._extract_client_data(upper_text, text)
            result.client_name = name
            result.client_cui = OCRValidationEngine.normalize_cui(raw_client_cui)
            result.client_address = address
            result.confidence_client = score

        # ---------------------------------------------------------------------
        # Step 1 - vendor identity; the CUI is the key for the profile lookup
        # ---------------------------------------------------------------------
        vendor_name, vendor_confidence = self._extract_vendor(text)
        result.partner_name = vendor_name
        result.confidence_vendor = vendor_confidence
        vendor_cui, _ = self._extract_cui(upper_text, text)
        result.cui = OCRValidationEngine.normalize_cui(vendor_cui)

        profile = None
        if result.cui:
            profile = ProfileRegistry.get_profile(result.cui)

        # ---------------------------------------------------------------------
        # Step 2 - main fields, from the store profile when one is registered
        # ---------------------------------------------------------------------
        if profile:
            print(f"[Profile] Using {profile.__class__.__name__} for CUI {result.cui}", flush=True)

            result.amount, result.confidence_amount = profile.extract_total(upper_text)
            result.receipt_date, result.confidence_date = profile.extract_date(upper_text)
            result.receipt_number, _ = profile.extract_receipt_number(upper_text)

            entries = profile.extract_tva_entries(upper_text)
            result.tva_entries = entries
            if entries:
                result.tva_total = sum(entry['amount'] for entry in entries)
            else:
                result.tva_total = None
            result.payment_methods = profile.extract_payment_methods(upper_text)

            # Buyer CUI and name as seen by the profile.
            buyer_cui, cui_confidence = profile.extract_client_cui(upper_text)
            buyer_name, name_confidence = profile.extract_client_name(text)

            if not (buyer_cui or buyer_name):
                # The profile found nothing: use the generic client extraction.
                fill_client_from_generic()
            else:
                result.client_cui = (
                    OCRValidationEngine.normalize_cui(buyer_cui) if buyer_cui else None
                )
                result.client_name = buyer_name
                result.confidence_client = max(cui_confidence, name_confidence)
                # Profiles expose no address method, so only the address is
                # taken from the generic extraction here.
                _, _, generic_address, _ = self._extract_client_data(upper_text, text)
                result.client_address = generic_address

            print(f"[Profile] Extracted: total={result.amount}, date={result.receipt_date}, "
                  f"TVA entries={len(result.tva_entries)}, payments={len(result.payment_methods)}", flush=True)
        else:
            # Unknown store: everything comes from the generic extractors.
            result.amount, result.confidence_amount = self._extract_amount(upper_text)
            result.receipt_date, result.confidence_date = self._extract_date(upper_text)
            result.receipt_number, _ = self._extract_number(upper_text)
            result.tva_entries, result.tva_total = self._extract_tva_entries(upper_text)
            result.payment_methods = self._extract_payment_methods(upper_text)
            fill_client_from_generic()

        # The series has no profile method, so it is always generic.
        result.receipt_series, _ = self._extract_series(upper_text)

        # ---------------------------------------------------------------------
        # Step 3 - diagnostics printed to stdout
        # ---------------------------------------------------------------------
        if not result.tva_entries:
            print(f"[TVA Debug] No TVA found. Checking patterns...", flush=True)
            # Re-join amounts that OCR split as "12, 34" before probing.
            normalized = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', upper_text)
            taxe_found = re.search(r'T?OTAL\s+TAXE', normalized, re.IGNORECASE) is not None
            reversed_hit = re.search(r'([\d.,]+)\s*T?OTAL\s+TAXE', normalized, re.IGNORECASE)
            reversed_value = reversed_hit.group(1) if reversed_hit else None
            print(f"[TVA Debug] 'OTAL TAXE' found: {taxe_found}, reversed: {reversed_value}", flush=True)

        # Report implausible TVA totals; nothing is corrected at this point.
        tva_total, total = result.tva_total, result.amount
        if tva_total and total:
            if tva_total > total:
                print(f"[TVA Extraction] TVA ({tva_total}) > TOTAL ({total}) - will be corrected in final validation", flush=True)
            elif tva_total > total * Decimal('0.5'):
                print(f"[TVA Extraction] Warning: TVA ({tva_total}) is > 50% of TOTAL ({total}) - suspicious", flush=True)

        # Remaining generic extractions.
        result.items_count = self._extract_items_count(upper_text)
        result.address = self._extract_address(upper_text)

        # ---------------------------------------------------------------------
        # Step 4 - validation and post-processing
        # ---------------------------------------------------------------------
        # Keep the unvalidated list: it is the fallback for the payment-mode
        # suggestion when validation leaves no payment method.
        methods_before_validation = result.payment_methods.copy() if result.payment_methods else []
        result.payment_methods = self._validate_payment_methods(result.payment_methods, result.amount)

        methods_for_mode = result.payment_methods or methods_before_validation
        if methods_for_mode:
            card_amount = sum(
                entry.get('amount', Decimal('0'))
                for entry in methods_for_mode
                if entry.get('method') == 'CARD'
            )
            paid_by_card = card_amount > 0
            result.suggested_payment_mode = 'banca' if paid_by_card else 'numerar'
            if paid_by_card:
                print(f"[Payment Mode] CARD detected ({card_amount}), suggesting 'banca'", flush=True)
            else:
                print(f"[Payment Mode] Cash only detected, suggesting 'numerar'", flush=True)

        result.receipt_type = self._detect_receipt_type(upper_text)

        # Reverse TVA validation: a failure is only logged here.
        if result.tva_entries and result.amount:
            tva_ok, _expected_total, message = self._validate_tva_reverse(result.tva_entries, result.amount)
            if not tva_ok:
                print(f"[TVA Reverse Validation] {message}", flush=True)

        # Cross-check the amount against payment methods and TVA.
        checked_amount, checked_confidence, source = self._cross_validate_and_calculate_amount(
            result.amount,
            result.confidence_amount,
            result.payment_methods,
            result.tva_entries,
            result.tva_total,
        )
        if checked_amount != result.amount:
            print(f"[Cross-Validation] Amount updated: {result.amount} -> {checked_amount} (source: {source})", flush=True)
        result.amount = checked_amount
        result.confidence_amount = checked_confidence

        return result

    def _extract_amount(self, text: str) -> Tuple[Optional[Decimal], float]:
        """Extract total amount from text."""
        # PRE-FILTER: Remove lines containing REST (rest = change, not total)
        # When paid by card, there's no change - exact amount is paid
        lines = text.split('\n')
        filtered_lines = []
        for line in lines:
            # Skip lines with REST pattern (change amount, not total)
            if re.search(r'\bREST\b', line, re.IGNORECASE):
                continue
            filtered_lines.append(line)
        text = '\n'.join(filtered_lines)

        # First try standard patterns (TOTAL, SUBTOTAL, etc.)
        for pattern, confidence in self.TOTAL_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
            if match:
                try:
                    # IMPORTANT: Call _normalize_number FIRST to handle "190 60" → "190.60"
                    # before stripping other characters
                    amount_str = match.group(1).strip()
                    amount_str = self._normalize_number(amount_str)
                    # Now remove any remaining non-numeric chars (except decimal point)
                    amount_str = re.sub(r'[^\d.]', '', amount_str)
                    amount = Decimal(amount_str)
                    if amount > 0:
                        return amount, confidence
                except (InvalidOperation, ValueError):
                    continue

        # Strategy 2: Find amounts AFTER product lines end
        # Products have pattern: "X BUC/ROLA X price = price"
        # Total appears after all products
        product_pattern = r'\d\s+(?:BUC|ROLA|ROLN|ROL)\s+X'
        product_matches = list(re.finditer(product_pattern, text, re.IGNORECASE))
        if product_matches:
            # Get text after the last product line
            last_product_pos = product_matches[-1].end()
            after_products = text[last_product_pos:]

            # Find standalone amounts on their own line after products
            line_amount_pattern = r'^[\s]*(\d{2,4}[.,]\s*\d{2})[\s]*$'
            standalone_amounts = []
            for match in re.finditer(line_amount_pattern, after_products, re.MULTILINE):
                try:
                    amount_str = match.group(1).replace(' ', '')
                    amount_str = self._normalize_number(amount_str)
                    amount = Decimal(amount_str)
                    if amount > 10:  # Filter out small values
                        standalone_amounts.append(amount)
                except (InvalidOperation, ValueError):
                    continue

            if standalone_amounts:
                # The largest standalone amount after products is likely the total
                max_amount = max(standalone_amounts)
                # Higher confidence if it appears multiple times
                count = standalone_amounts.count(max_amount)
                confidence = 0.85 if count >= 2 else 0.75
                return max_amount, confidence

        # Strategy 3: Find the most repeated large amount
        # Normalize spaces in numbers (OCR may produce "186. 16")
        normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text)
        amount_pattern = r'(\d{2,4}[.,]\d{2})\b'
        amounts = re.findall(amount_pattern, normalized_text)
        if amounts:
            from collections import Counter
            amount_counts = Counter(amounts)
            # Filter amounts that appear 2+ times and are > 20
            candidates = []
            for amt_str, count in amount_counts.items():
                try:
                    amt = Decimal(self._normalize_number(amt_str))
                    if count >= 2 and amt > 20:
                        candidates.append((amt, count))
                except (InvalidOperation, ValueError):
                    continue

            if candidates:
                # Return the LARGEST amount that appears multiple times
                candidates.sort(key=lambda x: x[0], reverse=True)
                return candidates[0][0], 0.65

        # Last resort: Find any standalone large amount
        line_amount_pattern = r'^[\s]*(\d{2,4}[.,]\s*\d{2})[\s]*$'
        for match in re.finditer(line_amount_pattern, text, re.MULTILINE):
            try:
                amount_str = match.group(1).replace(' ', '')
                amount_str = self._normalize_number(amount_str)
                amount = Decimal(amount_str)
                if amount > 50:  # Higher threshold for fallback
                    return amount, 0.50
            except (InvalidOperation, ValueError):
                continue

        return None, 0.0

    def _normalize_number(self, num_str: str) -> str:
        """Normalize Romanian number format to standard decimal."""
        # OCR often reads "." as " " (space) - handle "190 60" as "190.60"
        # Pattern: digits + space + exactly 2 digits at end
        space_decimal_match = re.match(r'^(\d+)\s+(\d{2})$', num_str.strip())
        if space_decimal_match:
            num_str = f"{space_decimal_match.group(1)}.{space_decimal_match.group(2)}"
        else:
            # Handle "1 234 56" pattern (thousands + decimal with spaces)
            # Match: digits + space(s) + digits + space + 2 digits
            multi_space_match = re.match(r'^([\d\s]+?)\s+(\d{2})$', num_str.strip())
            if multi_space_match:
                integer_part = multi_space_match.group(1).replace(' ', '')
                decimal_part = multi_space_match.group(2)
                num_str = f"{integer_part}.{decimal_part}"
            else:
                # Remove remaining spaces (thousands separators)
                num_str = num_str.replace(' ', '')

        # Handle comma as decimal separator
        if ',' in num_str and '.' in num_str:
            # Romanian format: 1.234,56
            num_str = num_str.replace('.', '').replace(',', '.')
        elif ',' in num_str:
            # Could be 1,50 or 1,234
            parts = num_str.split(',')
            if len(parts) == 2 and len(parts[1]) <= 2:
                # Decimal comma: 1,50
                num_str = num_str.replace(',', '.')
            else:
                # Thousands comma: 1,234
                num_str = num_str.replace(',', '')
        elif '.' in num_str:
            parts = num_str.split('.')
            if len(parts) > 2:
                # Multiple dots: 1.234.567 -> 1234567
                num_str = ''.join(parts[:-1]) + '.' + parts[-1]

        return num_str

    def _calculate_multi_rate_tva_total(self, tva_entries: List[dict]) -> Optional[Decimal]:
        """
        Calculate implied total from ALL TVA entries (multi-rate support).

        Formula for each entry: total_for_entry = tva * (100 + rate) / rate
        Final total = sum of all entry totals

        Example for Lidl (TVA A 21% = 7.71, TVA B 11% = 2.13):
            Entry A: 7.71 * 121 / 21 = 44.45
            Entry B: 2.13 * 111 / 11 = 21.49
            Total: 44.45 + 21.49 = 65.94 ≈ 65.86 (within tolerance)

        Returns:
            Implied total Decimal, or None if calculation not possible
        """
        if not tva_entries:
            return None

        total = Decimal('0')
        for entry in tva_entries:
            rate = entry.get('percent', 0)
            tva_amount = entry.get('amount')
            if tva_amount and rate > 0:
                try:
                    tva_dec = Decimal(str(tva_amount))
                    # Formula: total_for_entry = tva * (100 + rate) / rate
                    entry_total = tva_dec * Decimal(100 + rate) / Decimal(rate)
                    total += entry_total
                    print(f"[Multi-rate TVA] Entry {entry.get('code', '?')}: tva={tva_amount}, rate={rate}% -> implied={entry_total:.2f}", flush=True)
                except (InvalidOperation, ValueError, TypeError):
                    continue

        return total.quantize(Decimal('0.01')) if total > 0 else None

    def _cross_validate_and_calculate_amount(
        self,
        amount: Optional[Decimal],
        confidence_amount: float,
        payment_methods: List[dict],
        tva_entries: List[dict],
        tva_total: Optional[Decimal]
    ) -> Tuple[Optional[Decimal], float, str]:
        """
        Cross-validate the extracted total against payment methods and TVA,
        and replace or derive it when the other sources disagree.

        Args:
            amount: Total extracted from the text (may be None).
            confidence_amount: Confidence attached to ``amount``.
            payment_methods: Dicts whose ``amount`` values are summed.
            tva_entries: TVA entries passed to ``_calculate_multi_rate_tva_total``.
            tva_total: Not read anywhere in this method.

        Returns: (amount, confidence, source_description)

        Logic:
        1. Collect all available sources: extracted amount (caller's
           confidence), payment sum (0.92), TVA-implied total (0.88).
        2. Find consensus: the first pair of sources, in that order, whose
           difference is within 3% of the larger value.
        3. If consensus found, use the value of the higher-confidence source
           (ties favor the earlier source) and add 0.05 confidence, capped at 0.98.
        4. If extracted differs >10% from all others, it's an outlier - replace
           it with the other source that has the highest confidence.
        5. If no consensus possible, fallback to individual validations
           (Cases 1-4 below).
        """
        # Calculate payment methods sum
        # Entries with a missing/zero or unparseable amount are ignored.
        payment_sum = Decimal('0')
        if payment_methods:
            for pm in payment_methods:
                try:
                    pm_amount = pm.get('amount')
                    if pm_amount:
                        payment_sum += Decimal(str(pm_amount))
                except (InvalidOperation, ValueError, TypeError):
                    continue

        # Calculate TVA-implied total using ALL entries (multi-rate fix)
        tva_implied_total = self._calculate_multi_rate_tva_total(tva_entries)

        # Multi-source consensus approach (3% tolerance for multi-rate TVA rounding)
        CONSENSUS_TOLERANCE = 3.0  # 3% tolerance

        # Collect all available sources with their confidences
        # Each source is a (name, value_as_float, confidence) tuple.
        sources = []
        if amount and amount > 0:
            sources.append(('extracted', float(amount), confidence_amount))
        if payment_sum > 0:
            sources.append(('payment', float(payment_sum), 0.92))  # Payment is very reliable
        if tva_implied_total and tva_implied_total > 0:
            sources.append(('tva_calc', float(tva_implied_total), 0.88))  # TVA calc is reliable

        print(f"[Cross-Validation] Sources: {[(s[0], f'{s[1]:.2f}', f'{s[2]:.2f}') for s in sources]}", flush=True)

        # Find consensus: 2+ sources within tolerance
        # Pairs are visited in insertion order and the first agreeing pair returns.
        if len(sources) >= 2:
            for i, (name1, val1, conf1) in enumerate(sources):
                for name2, val2, conf2 in sources[i+1:]:
                    if val1 <= 0 or val2 <= 0:
                        continue
                    # Difference relative to the larger of the two values
                    diff_pct = abs(val1 - val2) / max(val1, val2) * 100
                    if diff_pct <= CONSENSUS_TOLERANCE:
                        # Consensus found! Use value from higher-confidence source
                        if conf1 >= conf2:
                            consensus_val, consensus_conf = val1, conf1
                        else:
                            consensus_val, consensus_conf = val2, conf2
                        # Boost confidence for consensus
                        consensus_conf = min(0.98, consensus_conf + 0.05)
                        print(f"[Cross-Validation] Consensus: {name1}={val1:.2f} ≈ {name2}={val2:.2f} (diff={diff_pct:.1f}%)", flush=True)
                        # The value went through float; round to 2 decimals before rebuilding a Decimal
                        return Decimal(str(round(consensus_val, 2))), consensus_conf, f"consensus ({name1}+{name2})"

        # No consensus - check if extracted is an outlier (differs >10% from all others)
        if amount and amount > 0 and len(sources) >= 2:
            other_sources = [s for s in sources if s[0] != 'extracted']
            if other_sources:
                extracted_val = float(amount)
                all_differ = all(
                    abs(extracted_val - s[1]) / max(extracted_val, s[1]) * 100 > 10
                    for s in other_sources if s[1] > 0
                )
                if all_differ:
                    # Extracted differs significantly from all others - use the best other source
                    best_other = max(other_sources, key=lambda s: s[2])
                    print(f"[Cross-Validation] Extracted outlier: {extracted_val:.2f} differs >10% from all others, using {best_other[0]}={best_other[1]:.2f}", flush=True)
                    return Decimal(str(round(best_other[1], 2))), best_other[2], f"corrected (extracted outlier, using {best_other[0]})"

        # Fallback: Case 1 - Amount valid with high confidence
        # NOTE: the percentages below are relative to the OTHER source
        # (tva_implied_total / payment_sum), not to the larger value as in the
        # consensus check above.
        if amount and amount > 0 and confidence_amount >= 0.8:
            # Check TVA-implied total
            if tva_implied_total and tva_implied_total > 0:
                tva_diff_percent = abs(float(amount) - float(tva_implied_total)) / float(tva_implied_total) * 100
                if tva_diff_percent <= 3:
                    return amount, min(0.98, confidence_amount + 0.05), "extracted (validated by TVA)"
                elif tva_diff_percent > 10:
                    print(f"[Cross-Validation] Amount mismatch with TVA: extracted={amount}, tva_implied={tva_implied_total} (diff={tva_diff_percent:.1f}%)", flush=True)
                    return tva_implied_total, 0.90, "calculated from TVA (extracted amount mismatch)"

            # Cross-validate with payment methods
            # Here a match means an absolute difference of at most 0.02.
            if payment_sum > 0 and abs(amount - payment_sum) <= Decimal('0.02'):
                return amount, min(0.98, confidence_amount + 0.05), "extracted (validated by payment methods)"
            elif payment_sum > 0:
                payment_diff_percent = abs(float(amount) - float(payment_sum)) / float(payment_sum) * 100
                if payment_diff_percent > 10:
                    print(f"[Cross-Validation] Amount mismatch with payments: extracted={amount}, payments={payment_sum} (diff={payment_diff_percent:.1f}%)", flush=True)
                    return payment_sum, 0.88, "calculated from payment methods (extracted amount mismatch)"

            # Differences between the accept and reject thresholds leave the amount untouched
            return amount, confidence_amount, "extracted"

        # Case 2: Amount exists but low confidence - try to validate/correct
        if amount and amount > 0:
            if tva_implied_total and tva_implied_total > 0:
                tva_diff_percent = abs(float(amount) - float(tva_implied_total)) / float(tva_implied_total) * 100
                if tva_diff_percent <= 3:
                    return amount, 0.88, "extracted (validated by TVA)"
                elif tva_diff_percent > 10:
                    print(f"[Cross-Validation] Amount mismatch with TVA: extracted={amount}, tva_implied={tva_implied_total} (diff={tva_diff_percent:.1f}%)", flush=True)
                    return tva_implied_total, 0.85, "calculated from TVA"

            if payment_sum > 0:
                payment_diff_percent = abs(float(amount) - float(payment_sum)) / float(payment_sum) * 100
                if payment_diff_percent <= 1:
                    return amount, 0.90, "extracted (validated by payment methods)"
                elif payment_diff_percent > 10:
                    print(f"[Cross-Validation] Amount mismatch: extracted={amount}, payments={payment_sum}", flush=True)
                    return payment_sum, 0.85, "calculated from payment methods"

            return amount, confidence_amount, "extracted (unvalidated)"

        # Case 3: Amount is 0 or None - calculate from payment methods
        if payment_sum > 0:
            print(f"[Cross-Validation] Amount not found, using payment sum: {payment_sum}", flush=True)
            return payment_sum, 0.85, "calculated from payment methods"

        # Case 4: Try TVA-implied total as last resort
        if tva_implied_total and tva_implied_total > 0:
            print(f"[Cross-Validation] Amount not found, using TVA-implied total: {tva_implied_total}", flush=True)
            return tva_implied_total, 0.70, "calculated from TVA"

        # Nothing worked - return original
        return amount, confidence_amount, "not found"

    def _extract_date(self, text: str) -> Tuple[Optional[date], float]:
        """Extract receipt date from text."""
        # First try standard patterns (clean dates)
        for pattern, confidence in self.DATE_PATTERNS:
            match = re.search(pattern, text)
            if match:
                try:
                    # Normalize separators to dots
                    date_str = match.group(1).replace('/', '.').replace('-', '.')

                    # Try DD.MM.YYYY format first
                    try:
                        parsed = datetime.strptime(date_str, '%d.%m.%Y').date()
                    except ValueError:
                        # Try YYYY.MM.DD format
                        parsed = datetime.strptime(date_str, '%Y.%m.%d').date()

                    # Validate date range
                    today = date.today()
                    if parsed <= today and parsed.year >= 2020:
                        return parsed, confidence
                except ValueError:
                    continue

        # Then try OCR-corrupted patterns (dates with spaces/commas)
        # Handles: "27. 10, 2025", "27, 10. 2025", "2025. 08. 14", etc.
        for pattern, confidence, fmt in self.DATE_PATTERNS_OCR_SPACES:
            match = re.search(pattern, text)
            if match:
                try:
                    if fmt == 'ymd':
                        # YYYY. MM. DD format (OMV/Petrom)
                        year = match.group(1)
                        month = match.group(2)
                        day = match.group(3)
                    else:
                        # DD. MM. YYYY format (default)
                        day = match.group(1)
                        month = match.group(2)
                        year = match.group(3)

                    date_str = f"{day}.{month}.{year}"
                    parsed = datetime.strptime(date_str, '%d.%m.%Y').date()

                    # Validate date range
                    today = date.today()
                    if parsed <= today and parsed.year >= 2020:
                        return parsed, confidence
                except ValueError:
                    continue

        return None, 0.0

    def _extract_number(self, text: str) -> Tuple[Optional[str], float]:
        """Extract receipt number from text."""
        for pattern, confidence in self.NUMBER_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1), confidence
        return None, 0.0

    def _extract_series(self, text: str) -> Tuple[Optional[str], float]:
        """Extract receipt series from text."""
        for pattern, confidence in self.SERIES_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1).upper(), confidence
        return None, 0.0

    def _extract_vendor(self, text: str) -> Tuple[Optional[str], float]:
        """
        Extract vendor/partner name from text.
        Uses multiple strategies:
        1. Look for lines with company type indicators (S.R.L., S.A., etc.)
        2. Look for company name + SRL on separate lines
        3. Look for lines near CIF
        4. Use first valid line as fallback
        """
        lines = text.split('\n')
        skip_keywords = [
            'BON', 'FISCAL', 'TOTAL', 'DATA', 'NR', 'ORA',
            'SUBTOTAL', 'TVA', 'PLATA', 'CARD', 'NUMERAR',
            'RON', 'LEI', 'CHITANTA', 'REST', 'CLIENT',
            'OPERATOR', 'CASIER', 'POS', 'AMEF', 'BINE ATI VENIT',
            'VA RUGAM', 'PASTRATI', 'VOCEA', 'TIPARIT',
            'DETERGENT', 'PROSOP', 'HARTIE', 'SACI', 'SPRAY',
            'BUC', 'ROLA', 'CUMPARATOR', 'MAGAZIN', 'BRICK',
            'NIVS', 'BENZINA', 'PETROM', 'OMV'
        ]

        # Strategy 0: Look for company name followed by SRL/SA on next line
        # Pattern: "COMPANY NAME\nSRL" or "COMPANY NAME\nS.R.L."
        for i, line in enumerate(lines[:15]):
            line = line.strip()
            if not line or len(line) < 3:
                continue

            line_upper = line.upper()

            # Skip lines with skip keywords
            if any(kw in line_upper for kw in skip_keywords):
                continue

            # Check if next line is standalone SRL, S.R.L., SA, S.A., etc.
            if i + 1 < len(lines):
                next_line = lines[i + 1].strip().upper()
                # Match standalone company type suffix
                if re.match(r'^S\.?\s*R\.?\s*L\.?$', next_line) or \
                   re.match(r'^S\.?\s*A\.?$', next_line) or \
                   re.match(r'^S\.?\s*N\.?\s*C\.?$', next_line) or \
                   re.match(r'^P\.?\s*F\.?\s*A\.?$', next_line) or \
                   re.match(r'^I\.?\s*I\.?$', next_line):
                    # Combine: "COMPANY NAME" + " " + "SRL"
                    vendor = self._clean_vendor_name(f"{line} {next_line}")
                    if vendor and len(vendor) >= 5:
                        return vendor, 0.95

        # Strategy 1: Look for lines with vendor indicators (S.R.L., S.A., HOLDING, etc.)
        for i, line in enumerate(lines[:15]):  # Check first 15 lines
            line = line.strip()
            if not line or len(line) < 3:
                continue

            line_upper = line.upper()

            # Skip lines with skip keywords (CUMPARATOR, CLIENT, etc.)
            if any(kw in line_upper for kw in skip_keywords):
                continue

            # Check for vendor indicators
            for indicator in self.VENDOR_INDICATORS:
                if re.search(indicator, line_upper):
                    # Found a company name indicator
                    vendor = self._clean_vendor_name(line)
                    if vendor and len(vendor) >= 3:
                        # High confidence for lines with company indicators
                        return vendor, 0.95

        # Strategy 2: Look for lines right before or after CIF
        for i, line in enumerate(lines[:15]):
            line_upper = line.upper()
            if 'CIF' in line_upper and 'CLIENT' not in line_upper:
                # Check line before
                if i > 0:
                    prev_line = lines[i-1].strip()
                    if prev_line and len(prev_line) >= 3:
                        if not any(kw in prev_line.upper() for kw in skip_keywords):
                            vendor = self._clean_vendor_name(prev_line)
                            if vendor:
                                return vendor, 0.85

        # Strategy 3: First valid line as fallback
        for i, line in enumerate(lines[:10]):
            line = line.strip()

            # Skip empty lines
            if not line or len(line) < 3:
                continue

            # Skip lines that are just numbers or codes
            if re.match(r'^[\d.,\s:]+$', line):
                continue

            # Skip lines with barcodes/product codes
            if re.match(r'^[A-Z]*\d{6,}', line):
                continue

            # Skip lines with keywords
            if any(kw in line.upper() for kw in skip_keywords):
                continue

            # Clean the line
            vendor = self._clean_vendor_name(line)

            if vendor and len(vendor) >= 3:
                # Confidence decreases for lines further down
                confidence = max(0.3, 0.7 - (i * 0.05))
                return vendor, confidence

        return None, 0.0

    def _clean_vendor_name(self, name: str) -> Optional[str]:
        """Clean and normalize vendor name."""
        if not name:
            return None

        # Remove common OCR artifacts
        name = re.sub(r'[^\w\s.,&\-()]', ' ', name)
        # Normalize whitespace
        name = re.sub(r'\s+', ' ', name).strip()

        name_upper = name.upper()

        # Skip if it looks like an address line only
        # Note: SC (Scara/staircase) is tricky because S.C. also means "Societate Comercială" (company)
        # Only reject SC when followed by a number (staircase), not when followed by company name
        # Pattern: STR, JUD, MUN, NR, BL, ET, AP are always address prefixes
        #          SC is only address when followed by digit (e.g., "SC 2", "SC. 5")
        if re.match(r'^(STR|JUD|MUN|NR|BL|ET|AP)\.?\s', name_upper):
            return None

        # SC followed by digit = staircase (address), reject
        # SC followed by letter/company name = "Societate Comercială", keep
        if re.match(r'^S\.?\s*C\.?\s+\d', name_upper):
            return None

        # Skip if too short after cleaning
        if len(name) < 3:
            return None

        return name

    def _get_store_profile(self, cui: Optional[str]) -> Optional[dict]:
        """
        Look up the store profile registered for a CUI and return its hints.

        DEPRECATED: call ProfileRegistry.get_profile() directly when the
        profile object itself is needed. This wrapper remains for backward
        compatibility and returns the validation-hints dict instead.

        Args:
            cui: The CUI extracted from receipt (with or without RO prefix)

        Returns:
            The profile's validation hints with an added ``'name'`` key
            (the profile's STORE_NAME), or None when no profile is found.
        """
        matched = ProfileRegistry.get_profile(cui)
        if not matched:
            return None

        # Hints dict plus the store name, as legacy callers expect
        validation_hints = matched.get_validation_hints()
        validation_hints['name'] = matched.STORE_NAME
        print(f"[Store Profile] Found profile for {cui}: {matched.STORE_NAME}", flush=True)
        return validation_hints

    def _extract_cui(self, text_upper: str, original_text: str) -> Tuple[Optional[str], float]:
        """
        Extract vendor CUI (fiscal identification code) from text.
        Excludes CLIENT CUI which appears as 'CLIENT C.U.I./C.I.F.:...'
        """
        def get_cui_digit_count(cui: str) -> int:
            """Get the count of digits in CUI (excluding RO/R0 prefix)."""
            cui_upper = cui.upper().strip()
            if cui_upper.startswith('RO') or cui_upper.startswith('R0'):
                return len(cui_upper) - 2
            return len(cui_upper)

        # Strategy 0: Check for reversed format (CIF NUMBER on line BEFORE "C.I.F." label)
        # This is common in some receipts: "RO11201891\nC. I. F."
        for pattern, confidence in self.CUI_REVERSED_PATTERNS:
            match = re.search(pattern, text_upper, re.IGNORECASE | re.MULTILINE)
            if match:
                cui = match.group(1)
                digit_count = get_cui_digit_count(cui)
                if 6 <= digit_count <= 10:
                    # Verify this is not the CLIENT CUI by checking context
                    start = match.start()
                    # Check 50 chars before the match for CLIENT keyword
                    context_start = max(0, start - 50)
                    context = text_upper[context_start:start]
                    if 'CLIENT' not in context and 'LIENT' not in context:
                        return cui, confidence

        # Strategy 1: Try to find CIF on a line that doesn't contain CLIENT
        lines = text_upper.split('\n')
        for line in lines:
            # Skip lines that contain CLIENT (these are buyer's CUI, not vendor's)
            if 'CLIENT' in line or 'CUMPARATOR' in line or 'LIENT' in line:
                continue

            # Look for CIF in this line
            for pattern, confidence in self.CUI_PATTERNS:
                match = re.search(pattern, line, re.IGNORECASE | re.MULTILINE)
                if match:
                    cui = match.group(1)
                    digit_count = get_cui_digit_count(cui)
                    if 6 <= digit_count <= 10:
                        return cui, confidence

        # Strategy 2: Fallback - search entire text but exclude CLIENT patterns
        for pattern, confidence in self.CUI_PATTERNS:
            # Find all matches
            for match in re.finditer(pattern, text_upper, re.IGNORECASE | re.MULTILINE):
                cui = match.group(1)
                digit_count = get_cui_digit_count(cui)
                if 6 <= digit_count <= 10:
                    # Check if this match is preceded by CLIENT in the same line
                    start = match.start()
                    line_start = text_upper.rfind('\n', 0, start) + 1
                    line_text = text_upper[line_start:start]
                    if 'CLIENT' not in line_text and 'LIENT' not in line_text:
                        return cui, confidence

        return None, 0.0

    def _detect_receipt_type(self, text: str) -> str:
        """Detect receipt type from text content.

        BON FISCAL variants: "BON FISCAL", "BON FISCAL.", "BON  FISCAL"
        CHITANTA variants: "CHITANTA", "CHITANȚĂ"
        """
        # Check for explicit BON FISCAL first (handles OCR spacing variations)
        if re.search(r'BON\s+FISCAL', text):
            return 'bon_fiscal'
        if 'CHITANTA' in text or 'CHITANȚĂ' in text:
            return 'chitanta'
        # Default to bon_fiscal if neither found
        return 'bon_fiscal'

    def _try_pattern_lidl(self, text: str) -> List[dict]:
        """
        Try Lidl-style TVA pattern: "TVA A 21,00% 7.71" (no hyphen/colon separator).

        Lidl receipts format:
            TOTAL TVA 9,84
            TVA A 21,00% 7,71
            TVA B 11,00% 2,13

        Returns list of TVA entries found.
        """
        entries = []
        seen = set()

        # Pattern: TVA/TUA/IVA + code (A-D) + percent + amount (on same line)
        # Handles: "TVA A 21,00% 7,71", "TVA B 11,00% 2,13", "TUA A 21% 7.71"
        lidl_patterns = [
            # Same line: "TVA A  21,00%   7.71" (with various spacing)
            r'T[VU][AR]\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)',
            # Same line with backslash (OCR artifact): "TVA A \21,00% 7.71"
            r'T[VU][AR]\s+([A-D])\s+\\?(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)',
            # IVA variant
            r'IVA\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)',
        ]

        for pattern in lidl_patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                try:
                    code = match.group(1).upper()
                    percent = int(match.group(2))
                    amount_str = self._normalize_number(match.group(3))
                    amount = Decimal(amount_str)

                    if amount > 0:
                        entry_key = (code, percent)
                        if entry_key not in seen:
                            entries.append({
                                'code': code,
                                'percent': percent,
                                'amount': amount
                            })
                            seen.add(entry_key)
                            print(f"[TVA Lidl] Found: TVA {code} {percent}% = {amount}", flush=True)
                except (ValueError, InvalidOperation):
                    continue

        return entries

    def _select_best_tva_candidate(
        self,
        candidates: List[tuple],
        tva_bon_total: Optional[Decimal]
    ) -> Tuple[List[dict], Optional[Decimal]]:
        """
        Select the best TVA candidate from collected candidates.

        Selection criteria (priority order):
        1. Sum matches TOTAL TVA BON (highest priority)
        2. More entries = better (for multi-rate receipts)
        3. Pattern confidence as tiebreaker

        Args:
            candidates: List of (pattern_name, confidence, entries, sum)
            tva_bon_total: Authoritative TOTAL TVA BON value (if extracted)

        Returns:
            (best_entries, best_sum)
        """
        if not candidates:
            return [], None

        # Score each candidate
        scored = []
        for name, confidence, entries, sum_val in candidates:
            score = 0.0

            # Criterion 1: Sum matches TOTAL TVA BON (highest priority)
            if tva_bon_total and sum_val:
                tolerance = max(Decimal('0.02'), tva_bon_total * Decimal('0.02'))  # 2% tolerance
                if abs(sum_val - tva_bon_total) <= tolerance:
                    score += 100  # High bonus for matching authoritative total
                    print(f"[TVA Select] {name}: sum {sum_val} matches tva_bon_total {tva_bon_total}", flush=True)

            # Criterion 2: More entries (for multi-rate receipts)
            score += len(entries) * 10

            # Criterion 3: Pattern confidence
            score += confidence * 5

            scored.append((score, name, confidence, entries, sum_val))
            print(f"[TVA Select] Candidate {name}: score={score:.1f}, entries={len(entries)}, sum={sum_val}", flush=True)

        # Sort by score descending
        scored.sort(key=lambda x: x[0], reverse=True)
        best = scored[0]
        print(f"[TVA Select] Winner: {best[1]} (score={best[0]:.1f})", flush=True)

        return best[3], best[4]

    def _extract_tva_entries(self, text: str) -> Tuple[List[dict], Optional[Decimal]]:
        """
        Extract multiple TVA (VAT) entries from text.
        Romanian receipts can have multiple TVA rates (A=19%, B=9%, C=5%, D=0%).

        Flow:
        1. If any non-VAT-payer marker (NEPLATITOR / SCUTIT DE TVA and OCR
           variants) is found, return a single 0% 'D' entry immediately.
        2. Normalize OCR-split decimals, then read TOTAL TVA BON as the
           reference value.
        3. Build up to two candidates:
           - 'lidl' (confidence 0.96): result of _try_pattern_lidl.
           - 'standard' (confidence 0.90): result of the pattern cascade
             0a..4 below. Most cascade steps only run while no entry has been
             found yet; Pattern 0a and Pattern 1 are not guarded that way.
             Entries are de-duplicated on (code, percent) via seen_entries.
        4. Pick the best candidate with _select_best_tva_candidate, run the
           entries through _validate_and_correct_tva and sort them by code.

        Args:
            text: OCR text of the receipt.

        Returns:
            (tva_entries, tva_total) where tva_entries is a list of:
            {'code': 'A', 'percent': 19, 'amount': Decimal('15.20')}
        """
        tva_entries = []
        seen_entries = set()  # (code, percent) keys already added, to avoid duplicates

        # Check for non-VAT payer (NEPLATITOR DE TVA) - TVA = 0
        # OCR variants: NEPLATTOR, NEPLATITOR, NEPLATOR, NEPLATTOR, ANEPLATHTOR, MEPLATITOR, etc.
        # Also handles: "TOTAL NEPLATITOR TVA", "(NEPLATITOR DE TVA)"
        non_vat_patterns = [
            # Main pattern - flexible for OCR errors: NEPLAT + any chars + OR/R
            r'NEPLAT\w*OR',           # NEPLATITOR, NEPLATTOR, NEPLATOR
            r'[ANM]EPLAT\w*O?R',      # OCR errors: ANEPLATHTOR, MEPLATITOR
            r'TOTAL\s+NEPLAT',        # TOTAL NEPLATITOR...
            r'TOTAL\s+[ANM]EPLAT',    # TOTAL ANEPLAT... (OCR error)
            r'SCUTIT\s*(?:DE\s+)?T[VU]A',  # SCUTIT DE TVA
            r'NEPLAT\w*\s+T[VU]A',    # NEPLATITOR TVA
            r'NEPLAT\w*\s+DE\s+T',    # NEPLATITOR DE T... (truncated)
        ]
        for pattern in non_vat_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                # Non-VAT payer - return TVA = 0
                # (short-circuits everything below: no candidates, no validation)
                return [{'code': 'D', 'percent': 0, 'amount': Decimal('0.00')}], Decimal('0.00')

        # Normalize spaces in numbers first (OCR may produce "32. 31" or "49, 58")
        # Note: the separator is always rewritten to '.', even when it was ','.
        normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text)
        # Also normalize comma followed by space to comma (for "21, 00%" -> "21,00%")
        # NOTE(review): the substitution above already rewrites "21, 00" to "21.00",
        # so this second pass looks redundant - confirm before removing.
        normalized_text = re.sub(r'(\d+),\s+(\d{2})\s*%', r'\1.\2%', normalized_text)

        # Extract TOTAL TVA BON/TOTAL TVA first as the authoritative reference
        tva_bon_total = self._extract_total_tva_bon(normalized_text)
        print(f"[TVA Debug] TOTAL TVA BON: {tva_bon_total}", flush=True)

        # CANDIDATE COLLECTION APPROACH: Try all patterns, collect candidates, select best
        all_candidates = []  # List of (pattern_name, confidence, entries, sum)

        # === LIDL-STYLE PATTERNS (NEW) ===
        # Lidl format: "TVA A 21,00% 7.71" or "TVA B 11,00% 2.13" (no hyphen/colon)
        # This pattern handles multi-rate TVA receipts
        lidl_entries = self._try_pattern_lidl(normalized_text)
        if lidl_entries:
            lidl_sum = sum(e['amount'] for e in lidl_entries)
            all_candidates.append(('lidl', 0.96, lidl_entries, lidl_sum))
            print(f"[TVA Debug] Lidl pattern: {len(lidl_entries)} entries, sum={lidl_sum}", flush=True)

        # Pattern 0a: First try to get TVA from "TOTAL TAXE:" which is most reliable
        # Format: "TOTAL TAXE: 55,22" - this is always the TVA amount
        # OCR may cut "T" producing "OTAL TAXE:" instead of "TOTAL TAXE:"
        # OCR may also put amount BEFORE "OTAL TAXE": "55,22OTAL TAXE:"
        total_taxe_pattern = r'T?OTAL\s+TAXE\s*:?\s*([\d\s.,]+)'
        taxe_match = re.search(total_taxe_pattern, normalized_text, re.IGNORECASE)

        # Also try pattern where amount comes BEFORE "OTAL TAXE" (OCR line break issue)
        if not taxe_match:
            reversed_taxe_pattern = r'([\d.,]+)\s*T?OTAL\s+TAXE'
            taxe_match = re.search(reversed_taxe_pattern, normalized_text, re.IGNORECASE)

        if taxe_match:
            # Also need to find the TVA rate from the table
            # Pattern handles: "A-21%", "-21,00%", "21%" etc.
            # The rate is taken from the FIRST percentage found anywhere in the text.
            rate_pattern = r'([A-D])?\s*[-:]?\s*(\d{1,2})[.,]?\s*\d{0,2}\s*%'
            rate_match = re.search(rate_pattern, normalized_text, re.IGNORECASE)
            if rate_match:
                try:
                    code = rate_match.group(1).upper() if rate_match.group(1) else 'A'  # Default to A if missing
                    percent = int(rate_match.group(2))
                    amount_str = taxe_match.group(1).replace(' ', '')
                    amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
                    amount = Decimal(amount_str)
                    if amount > 0:
                        entry_key = (code, percent)
                        if entry_key not in seen_entries:
                            tva_entries.append({
                                'code': code,
                                'percent': percent,
                                'amount': amount
                            })
                            seen_entries.add(entry_key)
                except (ValueError, InvalidOperation):
                    # Unparseable amount: leave tva_entries empty so later patterns run
                    pass

        # Pattern 0b: Table format "A-21,00%  285,66  49,58" (code-percent  base  tva_amount)
        # This format appears after a TVA header line like "TVA  TOTAL  VALDARE"
        # The TVA amount position depends on header: VALDARE last = TVA last, VALOARE middle = TVA middle
        if not tva_entries:
            table_pattern = r'([A-D])\s*[-:]\s*(\d{1,2})[.,]\s*\d{2}\s*%\s*([\d\s.,]+)\s+([\d\s.,]+)'
            for match in re.finditer(table_pattern, normalized_text, re.IGNORECASE):
                try:
                    code = match.group(1).upper()
                    percent = int(match.group(2))
                    amount1_str = match.group(3).replace(' ', '')
                    amount2_str = match.group(4).replace(' ', '')
                    amount1 = Decimal(self._normalize_number(re.sub(r'[^\d.,]', '', amount1_str)))
                    amount2 = Decimal(self._normalize_number(re.sub(r'[^\d.,]', '', amount2_str)))

                    # Determine which is TVA: the smaller amount is usually TVA
                    # (TVA is a fraction of the total, so it's always smaller)
                    tva_amount = min(amount1, amount2)

                    if tva_amount > 0:
                        entry_key = (code, percent)
                        if entry_key not in seen_entries:
                            tva_entries.append({
                                'code': code,
                                'percent': percent,
                                'amount': tva_amount
                            })
                            seen_entries.add(entry_key)
                except (ValueError, InvalidOperation):
                    continue

        # Pattern 0c: REVERSED FORMAT "5.00% TUA*B" followed by amount on next line
        # This handles receipts where percentage comes BEFORE TVA code (e.g., books with 5% rate)
        # Matches: "5.00% TUA*B", "5% TVA B", "5.00% TVA", "9% TUA", "5% IVA"
        if not tva_entries:
            # Pattern: PERCENT% + TVA/IVA + optional code, then amount on next line
            reversed_tva_pattern = r'(\d{1,2})[.,]?\d{0,2}\s*%\s*(?:T[VU][AR]|IVA)\s*\*?([A-D])?'
            for match in re.finditer(reversed_tva_pattern, normalized_text, re.IGNORECASE):
                try:
                    percent = int(match.group(1))
                    # Fall back to the percent->code mapping when no letter was captured
                    code = (match.group(2) or self._get_tva_code_from_percent(percent)).upper()

                    # Look for amount on the next line(s) after the match
                    after_match = normalized_text[match.end():]
                    # Find standalone number (amount) - skip empty lines
                    amount_match = re.search(r'^[\s\n]*([\d]+[.,]\d{2})\b', after_match)
                    if amount_match:
                        amount_str = self._normalize_number(amount_match.group(1))
                        amount = Decimal(amount_str)
                        if amount > 0:
                            entry_key = (code, percent)
                            if entry_key not in seen_entries:
                                tva_entries.append({
                                    'code': code,
                                    'percent': percent,
                                    'amount': amount
                                })
                                seen_entries.add(entry_key)
                except (ValueError, InvalidOperation):
                    continue

        # Pattern 0d: "TOTAL TUA:", "TOTAL TVA:", "TOTAL IVA:" with amount (OCR variants)
        if not tva_entries:
            total_tva_simple = r'TOTAL\s+(?:T[VU][AR]|IVA)\s*:?\s*([\d.,]+)'
            match = re.search(total_tva_simple, normalized_text, re.IGNORECASE)
            if match:
                try:
                    amount_str = self._normalize_number(match.group(1))
                    amount = Decimal(amount_str)
                    if amount > 0:
                        # Try to find the rate in nearby text
                        # (rate detection runs on the original text, not normalized_text)
                        percent = self._detect_tva_percent(text)
                        if percent:
                            code = self._get_tva_code_from_percent(percent)
                            entry_key = (code, percent)
                            if entry_key not in seen_entries:
                                tva_entries.append({
                                    'code': code,
                                    'percent': percent,
                                    'amount': amount
                                })
                                seen_entries.add(entry_key)
                except (ValueError, InvalidOperation):
                    pass

        # Pattern 0e: Multiline "TOTAL TUA\n198\n30.43" where:
        #   - "TOTAL TUA" on one line
        #   - "198" or similar (corrupted "19%") on next line (optional)
        #   - "30.43" (TVA amount) on following line
        # OCR often splits this across multiple lines
        if not tva_entries:
            multiline_tva = r'TOTAL\s+(?:T[VU][AR]L?|TU[AR]L|IVA)\s*\n\s*\d*\s*\n?\s*([\d]+[.,]\d{2})\b'
            match = re.search(multiline_tva, normalized_text, re.IGNORECASE)
            if match:
                try:
                    amount_str = self._normalize_number(match.group(1))
                    amount = Decimal(amount_str)
                    if amount > 0:
                        percent = self._detect_tva_percent(text)
                        if percent:
                            code = self._get_tva_code_from_percent(percent)
                            entry_key = (code, percent)
                            if entry_key not in seen_entries:
                                tva_entries.append({
                                    'code': code,
                                    'percent': percent,
                                    'amount': amount
                                })
                                seen_entries.add(entry_key)
                except (ValueError, InvalidOperation):
                    pass

        # Pattern 1: "TVA A - 19%: 15.20" or "TVAA - 21% 32.31" or "IVA A - 19%" (with code)
        # OCR tolerant: TUA, TVR, IVA, etc.
        # Unlike the other steps, this one always runs (no "if not tva_entries" guard),
        # so it can add further (code, percent) entries to ones found above.
        pattern_with_code = r'(?:T[VU][AR]|IVA)\s*([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)'
        for match in re.finditer(pattern_with_code, normalized_text, re.IGNORECASE):
            try:
                code = match.group(1).upper()
                percent = int(match.group(2))
                amount_str = match.group(3).replace(' ', '')
                amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
                amount = Decimal(amount_str)
                if amount > 0:
                    entry_key = (code, percent)
                    if entry_key not in seen_entries:
                        tva_entries.append({
                            'code': code,
                            'percent': percent,
                            'amount': amount
                        })
                        seen_entries.add(entry_key)
            except (ValueError, InvalidOperation):
                continue

        # Pattern 2: "TVA - 21%: 32.31" or "IVA - 21%: 32.31" (without explicit code, code derived from percent)
        if not tva_entries:
            pattern_no_code = r'(?:T[VU][AR]|IVA)\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)'
            for match in re.finditer(pattern_no_code, normalized_text, re.IGNORECASE):
                try:
                    percent = int(match.group(1))
                    amount_str = match.group(2).replace(' ', '')
                    amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
                    amount = Decimal(amount_str)
                    if amount > 0:
                        # Determine code based on percent
                        code = self._get_tva_code_from_percent(percent)
                        entry_key = (code, percent)
                        if entry_key not in seen_entries:
                            tva_entries.append({
                                'code': code,
                                'percent': percent,
                                'amount': amount
                            })
                            seen_entries.add(entry_key)
                except (ValueError, InvalidOperation):
                    continue

        # Pattern 3: "TOTAL TVA A - 21%" or "TOTAL IVA" with amount on same line or "TOTAL TVA BON" with amount
        if not tva_entries:
            # First try: "TOTAL TVA A - 21%  32.31" or "TOTAL IVA A - 21% 32.31" (amount on same line)
            tva_with_amount = r'TOTAL\s+(?:T[VU][AR]|IVA)\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*([\d.,]+)'
            for match in re.finditer(tva_with_amount, normalized_text, re.IGNORECASE):
                try:
                    code = match.group(1).upper()
                    percent = int(match.group(2))
                    amount_str = self._normalize_number(match.group(3))
                    amount = Decimal(amount_str)
                    if amount > 0:
                        entry_key = (code, percent)
                        if entry_key not in seen_entries:
                            tva_entries.append({
                                'code': code,
                                'percent': percent,
                                'amount': amount
                            })
                            seen_entries.add(entry_key)
                except (ValueError, InvalidOperation):
                    continue

        # Pattern 3b: "TOTAL TVA A - 21%" or "TOTAL IVA A - 21%" on one line, look for "TOTAL TVA BON" amount
        if not tva_entries:
            tva_total_pattern = r'TOTAL\s+(?:T[VU][AR]|IVA)\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%'
            for match in re.finditer(tva_total_pattern, normalized_text, re.IGNORECASE):
                try:
                    code = match.group(1).upper()
                    percent = int(match.group(2))

                    # Look for "TOTAL TVA BON" or "TOTAL IVA BON" followed by amount
                    tva_bon_pattern = r'TOTAL\s+(?:T[VU][AR]|IVA)\s+BON[:\s]*([\d.,]+)'
                    tva_bon_match = re.search(tva_bon_pattern, normalized_text, re.IGNORECASE)
                    if tva_bon_match:
                        amount_str = self._normalize_number(tva_bon_match.group(1))
                        amount = Decimal(amount_str)
                        if amount > 0:
                            entry_key = (code, percent)
                            if entry_key not in seen_entries:
                                tva_entries.append({
                                    'code': code,
                                    'percent': percent,
                                    'amount': amount
                                })
                                seen_entries.add(entry_key)
                            # Positive amount handled (added or duplicate): skip the fallback below
                            continue

                    # Fallback: Amount after TOTAL TVA BON or TOTAL IVA BON on next line
                    tva_bon_pos = re.search(r'TOTAL\s+(?:T[VU][AR]|IVA)\s+BON', normalized_text, re.IGNORECASE)
                    if tva_bon_pos:
                        after_bon = normalized_text[tva_bon_pos.end():]
                        # Find first standalone number (likely TVA amount)
                        amount_match = re.search(r'[\s\n]*([\d]+[.,]\d{2})\s*\n', after_bon)
                        if amount_match:
                            amount_str = self._normalize_number(amount_match.group(1))
                            amount = Decimal(amount_str)
                            if amount > 0:
                                entry_key = (code, percent)
                                if entry_key not in seen_entries:
                                    tva_entries.append({
                                        'code': code,
                                        'percent': percent,
                                        'amount': amount
                                    })
                                    seen_entries.add(entry_key)
                except (ValueError, InvalidOperation):
                    continue

        # Pattern 3c: "TVAA - 21%" or "IVA A - 21%" on one line, amount on next line (simpler format)
        if not tva_entries:
            tva_line_pattern = r'(?:T[VU][AR]|IVA)\s*([A-D])?\s*[-:]\s*(\d{1,2})\s*%'
            for match in re.finditer(tva_line_pattern, normalized_text, re.IGNORECASE):
                try:
                    # Missing code letter defaults to 'A' here (not derived from percent)
                    code = (match.group(1) or 'A').upper()
                    percent = int(match.group(2))

                    # Look for amount on the next line or immediately after
                    after_tva = normalized_text[match.end():]
                    amount_match = re.search(r'^[\s\n]*([\d.,]+)', after_tva)
                    if amount_match:
                        amount_str = self._normalize_number(amount_match.group(1))
                        amount = Decimal(amount_str)
                        if amount > 0:
                            entry_key = (code, percent)
                            if entry_key not in seen_entries:
                                tva_entries.append({
                                    'code': code,
                                    'percent': percent,
                                    'amount': amount
                                })
                                seen_entries.add(entry_key)
                except (ValueError, InvalidOperation):
                    continue

        # Pattern 4: Use TVA_PATTERNS for fallback
        if not tva_entries:
            for pattern, _ in self.TVA_PATTERNS:
                match = re.search(pattern, normalized_text, re.IGNORECASE)
                if match:
                    try:
                        # Some patterns have 2 groups (percent, amount), others just amount
                        # NOTE(review): match.lastindex is None when no group participated;
                        # the comparison would then raise TypeError, which is not caught
                        # below - confirm every TVA_PATTERNS entry has a mandatory group.
                        if match.lastindex >= 2:
                            percent = int(match.group(1))
                            amount_str = match.group(2)
                        else:
                            amount_str = match.group(1)
                            # Try to detect percent from text
                            percent = self._detect_tva_percent(text)

                        amount_str = amount_str.replace(' ', '')
                        amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
                        amount = Decimal(amount_str)
                        if amount > 0 and percent:
                            code = self._get_tva_code_from_percent(percent)
                            entry_key = (code, percent)
                            if entry_key not in seen_entries:
                                tva_entries.append({
                                    'code': code,
                                    'percent': percent,
                                    'amount': amount
                                })
                                seen_entries.add(entry_key)
                                break  # Only use first match from fallback
                    except (ValueError, InvalidOperation):
                        continue

        # Add existing extraction results to candidates (if any)
        if tva_entries:
            entries_sum = sum(entry['amount'] for entry in tva_entries)
            all_candidates.append(('standard', 0.90, tva_entries, entries_sum))
            print(f"[TVA Debug] Standard patterns: {len(tva_entries)} entries, sum={entries_sum}", flush=True)

        # === CANDIDATE SELECTION ===
        # Select best candidate using TOTAL TVA BON as authoritative reference
        if all_candidates:
            best_entries, best_sum = self._select_best_tva_candidate(all_candidates, tva_bon_total)
            if best_entries:
                tva_entries = best_entries
                entries_sum = best_sum

        # Recompute the sum from the entries that were finally chosen.
        # NOTE(review): entries_sum is reset to None here unconditionally, so the
        # best_sum assigned just above is never used; the value is rebuilt below.
        entries_sum = None
        if tva_entries:
            entries_sum = sum(entry['amount'] for entry in tva_entries)

        # Validate and correct TVA values
        tva_entries, tva_total = self._validate_and_correct_tva(
            tva_entries, entries_sum, tva_bon_total
        )

        # Sort by code (A, B, C, D); entries without a code sort last ('Z')
        tva_entries.sort(key=lambda x: x.get('code', 'Z'))

        return tva_entries, tva_total

    def _get_tva_code_from_percent(self, percent: int) -> str:
        """Map TVA percentage to standard Romanian code.

        Romanian TVA rates changed in August 2025:
        - Standard rate: 19% → 21%
        - Reduced rate: 9% → 11%
        - Other rates (5%, 0%) remain unchanged

        Old rates (before Aug 2025):  New rates (from Aug 2025):
        - A = 19% (standard)          - A = 21% (standard)
        - B = 9%  (reduced)           - B = 11% (reduced)
        - C = 5%  (reduced)           - C = 5%  (reduced)
        - D = 0%  (exempt)            - D = 0%  (exempt)

        Both old and new rates are supported for historical receipts.
        """
        if percent in (19, 21):
            return 'A'  # Standard rate (19% old, 21% new from Aug 2025)
        elif percent in (9, 11):
            return 'B'  # Reduced rate (9% old, 11% new from Aug 2025)
        elif percent == 5:
            return 'C'  # Reduced rate (unchanged)
        elif percent == 0:
            return 'D'  # Exempt (unchanged)
        else:
            return 'A'  # Default to standard rate

    def _extract_total_tva_bon(self, text: str) -> Optional[Decimal]:
        """
        Extract TOTAL TVA BON value separately as the reference.
        This is the authoritative total TVA on the receipt.

        Handles OCR variations: TOTAL TVA BON, OTAL TUA BON, TOTAL IVA BON, etc.
        """
        # Pattern for TOTAL TVA BON or TOTAL IVA BON with amount after
        # OCR corruptions: TUAL (TVA+L merged), TVAL, TUAI, etc.
        patterns = [
            # Standard: TOTAL TVA BON: 14.92 or TOTAL IVA BON: 14.92
            # Handles: TUAL (TVA+L), TVAL, TUAI, etc. with optional trailing letters
            r'T?OTAL\s+(?:T[VU][AR]L?|TU[AR]L|IVA)\s+BON\s*:?\s*([\d]+[.,]\d{2})\b',
            # Amount before: 14.92 OTAL TUA BON (OCR line break)
            r'([\d]+[.,]\d{2})\s*\n?\s*T?OTAL\s+(?:T[VU][AR]L?|TU[AR]L|IVA)\s+BON',
            # Amount on next line after TOTAL TVA BON or TOTAL IVA BON
            r'T?OTAL\s+(?:T[VU][AR]L?|TU[AR]L|IVA)\s+BON\s*\n\s*([\d]+[.,]\d{2})\b',
        ]

        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                try:
                    amount_str = self._normalize_number(match.group(1))
                    amount = Decimal(amount_str)
                    if amount > 0:
                        return amount
                except (InvalidOperation, ValueError):
                    continue

        return None

    def _validate_and_correct_tva(
        self,
        tva_entries: List[dict],
        entries_sum: Optional[Decimal],
        tva_bon_total: Optional[Decimal]
    ) -> Tuple[List[dict], Optional[Decimal]]:
        """
        Validate and correct TVA values.

        Rules:
        1. TVA cannot be greater than TOTAL amount (will be validated at higher level)
        2. Sum of TVA A + TVA B + ... should equal TOTAL TVA BON
        3. If single entry and sum != tva_bon_total, use tva_bon_total
        4. Detect and fix OCR concatenation errors (e.g., 14.921492 from 14.92 + 14.92)
        """
        if not tva_entries:
            return tva_entries, tva_bon_total

        # Check for OCR concatenation errors in individual entries
        # Pattern: X.XX followed by another decimal (e.g., 14.921492 from 14.92 + 14.92)
        corrected_entries = []
        for entry in tva_entries:
            amount = entry['amount']
            amount_str = str(amount)

            # Check if amount looks like concatenated decimals
            # e.g., 14.921492 could be 14.92 + 14.92 incorrectly joined
            # or 32.3132.31 from 32.31 + 32.31
            if len(amount_str) > 6 and '.' in amount_str:
                int_part, dec_part = amount_str.split('.')

                # If decimal part > 2 digits, it's likely concatenation
                if len(dec_part) > 2:
                    # Try to extract the first valid decimal amount
                    # e.g., from 14.921492, extract 14.92
                    try:
                        corrected_amount = Decimal(f"{int_part}.{dec_part[:2]}")
                        print(f"[TVA Validation] Corrected concatenation error: {amount} -> {corrected_amount}", flush=True)
                        entry['amount'] = corrected_amount
                    except InvalidOperation:
                        pass

            corrected_entries.append(entry)

        tva_entries = corrected_entries

        # Recalculate sum after corrections
        entries_sum = sum(entry['amount'] for entry in tva_entries) if tva_entries else None

        # Validate sum against TOTAL TVA BON
        if tva_bon_total and entries_sum:
            # Allow small tolerance for rounding (0.02)
            tolerance = Decimal('0.02')
            difference = abs(entries_sum - tva_bon_total)

            if difference > tolerance:
                print(f"[TVA Validation] Sum mismatch: entries_sum={entries_sum}, tva_bon_total={tva_bon_total}", flush=True)

                # If single entry and sum doesn't match, use TOTAL TVA BON as reference
                if len(tva_entries) == 1:
                    print(f"[TVA Validation] Single entry - using TOTAL TVA BON as reference: {tva_bon_total}", flush=True)
                    tva_entries[0]['amount'] = tva_bon_total
                    entries_sum = tva_bon_total
                # If multiple entries and sum > tva_bon_total, likely double counting
                elif entries_sum > tva_bon_total:
                    # Check if one entry is the duplicate of another
                    amounts = [e['amount'] for e in tva_entries]
                    unique_amounts = set(amounts)
                    if len(unique_amounts) < len(amounts):
                        # Duplicate detected - likely TOTAL TVA BON counted as separate entry
                        print(f"[TVA Validation] Duplicate TVA detected, removing duplicates", flush=True)
                        # Keep only unique entries
                        seen = set()
                        unique_entries = []
                        for entry in tva_entries:
                            key = (entry.get('code'), entry['amount'])
                            if key not in seen:
                                seen.add(key)
                                unique_entries.append(entry)
                        tva_entries = unique_entries
                        entries_sum = sum(e['amount'] for e in tva_entries)

        # Final total
        tva_total = entries_sum if entries_sum else tva_bon_total

        return tva_entries, tva_total

    def _detect_tva_percent(self, text: str) -> Optional[int]:
        """Detect TVA percentage from text content.

        IMPORTANT: Prioritize rates found near TVA markers over rates found elsewhere.
        E.g., "REDUCERE 5%" should not override "TVA A 19%".
        Also handle OCR corruptions like "194" for "19%" in "TOTAL TA F 194".
        """
        import re as regex

        # First, look for percent NEAR TVA markers (most reliable)
        # This handles "TVA A 19%", "TVA 19,00%", "TOTAL TVA 19%"
        tva_context_patterns = [
            r'T[VU][AR]\s*[A-D]?\s*[-:]?\s*(19|21|11|9|5)[.,]?\s*\d{0,2}\s*%',
            r'IVA\s*[A-D]?\s*[-:]?\s*(19|21|11|9|5)[.,]?\s*\d{0,2}\s*%',
            # OCR corruption: "TOTAL TA F 194" where 194 = 19% (4 is artifact)
            r'TOTAL\s+T[VA][AR]?\s*[F\s]?\s*(19|21)\d\b',
        ]
        for pattern in tva_context_patterns:
            match = regex.search(pattern, text, regex.IGNORECASE)
            if match:
                rate = int(match.group(1))
                if rate in (19, 21, 11, 9, 5):
                    return rate

        # Fallback: Look for common Romanian TVA percentages anywhere
        # But EXCLUDE patterns near "REDUCERE", "DISCOUNT", "RED." (these are discounts, not TVA)
        # Clean text by removing discount context
        # Handle OCR corruptions: RED.CERE (C instead of U), RED CERE, REDUC, etc.
        text_no_discount = regex.sub(r'(?:REDUC|DISCOUNT|RED)[.\sA-Z]*\d+[.,]?\d*\s*%', '', text, flags=regex.IGNORECASE)

        # Now search in cleaned text (priority order: 19% > 21% > 11% > 9% > 5%)
        if regex.search(r'\b19[.,]?\s*\d{0,2}\s*%', text_no_discount):
            return 19
        elif regex.search(r'\b21[.,]?\s*\d{0,2}\s*%', text_no_discount):
            return 21
        elif regex.search(r'\b11[.,]?\s*\d{0,2}\s*%', text_no_discount):
            return 11
        elif regex.search(r'\b9[.,]?\s*\d{0,2}\s*%', text_no_discount):
            return 9
        elif regex.search(r'\b5[.,]?\s*\d{0,2}\s*%', text_no_discount):
            return 5

        # Default: If no percent found but we're in Romanian receipt context,
        # assume 19% (standard rate)
        if regex.search(r'T[VU][AR]|IVA', text, regex.IGNORECASE):
            return 19

        return None

    def _validate_tva_reverse(
        self,
        tva_entries: List[dict],
        total_amount: Optional[Decimal]
    ) -> Tuple[bool, Optional[Decimal], str]:
        """
        Reverse TVA validation: from TVA amount and rate, calculate expected total.

        Formula (CORRECT):
            For TVA that is INCLUDED in total (standard Romanian receipts):
            total = base + tva
            tva = base * rate/100
            Therefore: base = tva * 100 / rate
            And: total = base + tva = tva * 100 / rate + tva = tva * (100 + rate) / rate

        Returns (is_valid, expected_total, message)
        """
        if not tva_entries or not total_amount:
            return True, None, "Insufficient data for reverse validation"

        expected_total = Decimal('0')
        for entry in tva_entries:
            tva_amount = entry['amount']
            rate = Decimal(str(entry['percent']))

            print(f"[TVA Debug] Entry: amount={tva_amount}, rate={rate}%", flush=True)

            if rate > 0:
                # CORRECT formula: total = tva * (100 + rate) / rate
                # Example: tva=55.22, rate=21 → total = 55.22 * 121 / 21 = 318.16
                gross_for_entry = tva_amount * (Decimal('100') + rate) / rate
                expected_total += gross_for_entry
                print(f"[TVA Debug] Calculated gross: {gross_for_entry}", flush=True)
            else:
                # 0% TVA - can't calculate base, skip
                pass

        if expected_total == 0:
            return True, None, "Cannot calculate expected total (0% TVA only)"

        # Tolerance: max(0.50 RON, 1% of total)
        tolerance = max(Decimal('0.50'), total_amount * Decimal('0.01'))
        difference = abs(expected_total - total_amount)

        if difference <= tolerance:
            return True, expected_total, f"TVA reverse validation passed (expected: {expected_total}, actual: {total_amount}, diff: {difference})"
        else:
            return False, expected_total, f"TVA reverse validation WARNING: expected {expected_total}, actual {total_amount}, diff {difference}"

    def _extract_items_count(self, text: str) -> Optional[int]:
        """Extract number of items/articles from receipt."""
        for pattern, _ in self.ITEMS_COUNT_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                try:
                    count = int(match.group(1))
                    if 0 < count < 1000:  # Reasonable range
                        return count
                except ValueError:
                    continue
        return None

    def _extract_address(self, text: str) -> Optional[str]:
        """Extract vendor address from text."""
        lines = text.split('\n')
        address_parts = []

        for line in lines[:15]:  # Check first 15 lines
            line = line.strip()
            if not line:
                continue

            # Check for address patterns
            line_upper = line.upper()

            # JUD. (county) pattern
            if re.search(r'\bJUD\.?\s+', line_upper):
                address_parts.append(line)
                continue

            # STR. (street) pattern
            if re.search(r'\bSTR\.?\s+', line_upper):
                address_parts.append(line)
                continue

            # MUN./OR./COM. (city/town) pattern
            if re.search(r'\b(MUN|OR|COM)\.?\s+', line_upper):
                address_parts.append(line)
                continue

        if address_parts:
            # Join and clean address parts
            address = ', '.join(address_parts)
            # Clean up
            address = re.sub(r'\s+', ' ', address).strip()
            address = re.sub(r',\s*,', ',', address)
            return address if len(address) >= 5 else None

        return None

    def _extract_payment_methods(self, text: str) -> List[dict]:
        """
        Extract payment methods (CARD/NUMERAR) from receipt.
        These appear after TOTAL LEI and before TOTAL TVA section.

        Returns list of: {'method': 'CARD'/'NUMERAR', 'amount': Decimal}
        """
        payment_methods = []
        seen_methods = set()

        # Normalize spaces in numbers
        normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text)

        # Find the region between TOTAL LEI and TOTAL TVA
        total_lei_match = re.search(r'TOTAL\s+LEI\s*([\d\s.,]+)', normalized_text, re.IGNORECASE)
        total_tva_match = re.search(r'TOTAL\s+(?:T[VU][AR]|IVA)', normalized_text, re.IGNORECASE)

        # Define search region (after TOTAL LEI, before TOTAL TVA if exists)
        if total_lei_match:
            start_pos = total_lei_match.end()
            end_pos = total_tva_match.start() if total_tva_match else len(normalized_text)
            search_region = normalized_text[start_pos:end_pos]
        else:
            search_region = normalized_text  # Fallback to full text

        for pattern, method, confidence in self.PAYMENT_METHOD_PATTERNS:
            for match in re.finditer(pattern, search_region, re.IGNORECASE | re.MULTILINE):
                try:
                    amount_str = match.group(1).replace(' ', '')
                    amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
                    amount = Decimal(amount_str)
                    # Validate: amount must be positive and reasonable (< MAX_REASONABLE_PAYMENT)
                    # This prevents OCR errors like CUI being parsed as payment
                    if amount > 0 and amount < self.MAX_REASONABLE_PAYMENT and method not in seen_methods:
                        payment_methods.append({
                            'method': method,
                            'amount': amount
                        })
                        seen_methods.add(method)
                        print(f"[Payment] Found {method}: {amount} (pattern matched)", flush=True)
                    elif amount >= self.MAX_REASONABLE_PAYMENT:
                        print(f"[Payment] Rejected unreasonable amount {amount} for {method} (likely OCR error)", flush=True)
                except (InvalidOperation, ValueError):
                    continue

        return payment_methods

    def _validate_payment_methods(
        self, payment_methods: List[dict], total: Optional[Decimal]
    ) -> List[dict]:
        """
        Validate payment methods against extracted total.

        If payment sum is way larger than total (>10x), it's likely an OCR error
        (e.g., CUI number parsed as payment amount). Clear invalid payments.

        Args:
            payment_methods: List of {'method': str, 'amount': Decimal}
            total: Extracted total amount

        Returns:
            Validated payment methods (may be empty if all were invalid)
        """
        if not total or not payment_methods:
            return payment_methods

        payment_sum = sum(pm.get('amount', Decimal('0')) for pm in payment_methods)

        # If payment sum > 10x total, it's definitely an error
        if payment_sum > total * 10:
            print(f"[Payment Validation] Payment sum {payment_sum} >> Total {total} (>10x), clearing invalid payments", flush=True)
            return []

        # If payment sum > 2x total, it's suspicious but might be valid in some edge cases
        # Just log a warning
        if payment_sum > total * 2:
            print(f"[Payment Validation] Warning: Payment sum {payment_sum} > 2x Total {total}, possible OCR error", flush=True)

        return payment_methods

    def _extract_client_data(
        self, text_upper: str, original_text: str
    ) -> Tuple[Optional[str], Optional[str], Optional[str], float]:
        """
        Extract client/buyer data from B2B receipts.

        Returns (client_name, client_cui, client_address, confidence)
        """
        client_name = None
        client_cui = None
        client_address = None
        confidence = 0.0

        # Step 1: Find CLIENT section marker
        client_section_start = None
        for marker in self.CLIENT_SECTION_MARKERS:
            match = re.search(marker, text_upper, re.IGNORECASE)
            if match:
                client_section_start = match.start()
                break

        if client_section_start is None:
            # No client section found
            return None, None, None, 0.0

        # Step 2: Extract client CUI
        for pattern, conf in self.CLIENT_CUI_PATTERNS:
            match = re.search(pattern, text_upper, re.IGNORECASE | re.MULTILINE)
            if match:
                cui = match.group(1)
                if 6 <= len(cui) <= 10:
                    client_cui = cui
                    confidence = max(confidence, conf)
                    break

        # Step 3: Extract client name from CLIENT section
        # Look for company name after CLIENT: marker
        lines = original_text.split('\n')
        for i, line in enumerate(lines):
            line_upper = line.upper().strip()

            # Check if this line contains CLIENT marker
            if any(re.search(marker, line_upper) for marker in self.CLIENT_SECTION_MARKERS):
                # Check if name is on same line after ":"
                if ':' in line:
                    name_part = line.split(':', 1)[1].strip()
                    if name_part and len(name_part) >= 3:
                        # Skip if it looks like a CUI (R/RO followed by digits)
                        if re.match(r'^R[O0]?\d{6,10}$', name_part.upper()):
                            # This is a CUI, not a name - extract it if not already found
                            if not client_cui:
                                cui_digits = re.sub(r'[^0-9]', '', name_part)
                                if 6 <= len(cui_digits) <= 10:
                                    client_cui = cui_digits
                                    confidence = max(confidence, 0.90)
                            continue
                        # Check for company indicators
                        if any(re.search(ind, name_part.upper()) for ind in self.VENDOR_INDICATORS):
                            client_name = self._clean_vendor_name(name_part)
                            confidence = max(confidence, 0.95)
                            break
                        elif len(name_part) >= 5 and not name_part.isdigit():
                            client_name = self._clean_vendor_name(name_part)
                            confidence = max(confidence, 0.80)
                            break

                # Check next line for company name
                if i + 1 < len(lines):
                    next_line = lines[i + 1].strip()
                    next_upper = next_line.upper()

                    # Skip if it's a CUI/CIF line
                    if not re.search(r'C\.?\s*[UI]\.?\s*[IF]\.?', next_upper):
                        if any(re.search(ind, next_upper) for ind in self.VENDOR_INDICATORS):
                            client_name = self._clean_vendor_name(next_line)
                            confidence = max(confidence, 0.90)
                            break
                        elif len(next_line) >= 5 and not next_line.isdigit():
                            # Check if it looks like a company name
                            if not any(kw in next_upper for kw in ['CUI', 'CIF', 'COD', 'FISCAL']):
                                client_name = self._clean_vendor_name(next_line)
                                confidence = max(confidence, 0.75)
                                break

        # Step 4: Extract client address (if present after client section)
        if client_section_start:
            # Look for address patterns after client section
            client_region = text_upper[client_section_start:client_section_start + 500]
            for pattern, _ in self.ADDRESS_PATTERNS:
                match = re.search(pattern, client_region)
                if match:
                    client_address = match.group(1).strip()
                    break

        # Log extraction result
        if client_cui or client_name:
            print(f"[Client Extraction] Found: name={client_name}, cui={client_cui}, conf={confidence}", flush=True)

        return client_name, client_cui, client_address, confidence