feat(ocr): Add docTR OCR engine with metrics infrastructure

Add docTR as the primary OCR engine with 2-tier sequential processing, OCR metrics tracking, and simplified engine selection.

Features:
- docTR OCR engine with light+medium preprocessing tiers
- doctr_plus mode with early-exit optimization (~65% fast path)
- OCR metrics dashboard with per-engine statistics
- User OCR preference persistence
- Parallel worker pool for OCR processing
- Cross-validation for extraction quality

Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-02 05:37:16 +02:00
parent 74f7aefc26
commit 495790411f
75 changed files with 23349 additions and 1311 deletions
--- a/backend/modules/data_entry/services/ocr_extractor.py
+++ b/backend/modules/data_entry/services/ocr_extractor.py
@@ -6,6 +6,8 @@ from decimal import Decimal, InvalidOperation
 from typing import Optional, Tuple, List
 from dataclasses import dataclass, field

+from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine
+

@dataclass
 class ExtractionResult:
@@ -24,6 +26,7 @@ class ExtractionResult:
    address: Optional[str] = None
    items_count: Optional[int] = None
    payment_methods: List[dict] = field(default_factory=list)  # [{"method":"CARD","amount":Decimal}]
+    suggested_payment_mode: Optional[str] = None  # 'banca' if CARD detected, 'numerar' if cash only

    # Client data (for B2B receipts - buyer information)
    client_name: Optional[str] = None
@@ -125,8 +128,10 @@ class ReceiptExtractor:
        (r'C3POS[-A-Z0-9]*[N:](\d{6,7})', 0.98),  # CT2N1360760 format
        (r'C3POS.*?(\d{6,7})\b', 0.95),  # Any C3POS followed by 6-7 digit number
        (r'CT2[N:]\s*(\d{6,})', 0.95),  # CT2N prefix
-        # BF (Bon Fiscal) number
-        (r'BF\s*:?\s*(\d+)', 0.93),
+        # BF (Bon Fiscal) number - high priority
+        # Format: "Z:0864 BF:0018" - extract only the number after BF:
+        (r'BF\s*:\s*(\d{4,})', 0.96),  # BF: with colon (most specific)
+        (r'BF\s+(\d{4,})', 0.93),  # BF followed by space and number
        # NIVS format
        (r'NIVS\s*:?\s*(\d+)', 0.95),
        # Standard NR BON formats
@@ -151,28 +156,45 @@ class ReceiptExtractor:
    # OCR errors: R0 instead of RO, C1F instead of CIF
    CUI_PATTERNS = [
        # CIF at start of line (definitely vendor) - tolerant to OCR errors
-        (r'^CIF\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
-        (r'^C[I1]F\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),  # C1F OCR error
+        # NOTE: Capture full CUI including RO prefix: (R[O0]?\d{6,10}) or ((?:R[O0])?\d{6,10})
+        (r'^CIF\s*:?\s*(R[O0]?\d{6,10})', 0.98),
+        (r'^CIF\s*:?\s*(\d{6,10})', 0.97),  # Without RO prefix
+        (r'^C[I1]F\s*:?\s*(R[O0]?\d{6,10})', 0.95),  # C1F OCR error
+        (r'^C[I1]F\s*:?\s*(\d{6,10})', 0.94),  # C1F without RO
        # CIF not preceded by CLIENT (negative lookbehind)
-        (r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
+        (r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(R[O0]?\d{6,10})', 0.95),
+        (r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(\d{6,10})', 0.94),
        # Standalone CIF: format with OCR tolerance
-        (r'\bC[I1]F\s*:?\s*(?:R[O0])?(\d{6,10})\b', 0.90),
+        (r'\bC[I1]F\s*:?\s*(R[O0]?\d{6,10})\b', 0.90),
+        (r'\bC[I1]F\s*:?\s*(\d{6,10})\b', 0.89),
        # COD FISCAL (vendor)
-        (r'COD\s+FISCAL\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
+        (r'COD\s+FISCAL\s*:?\s*(R[O0]?\d{6,10})', 0.90),
+        (r'COD\s+FISCAL\s*:?\s*(\d{6,10})', 0.89),
        # C. I. F. format with SPACES (OCR artifact) - "C. I. F. : R011201891"
-        (r'C\.\s*I\.\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.92),
+        # Also handles double colon from OMV/Petrom: "C. I.F.: : RO11201891"
+        (r'C\.\s*I\.\s*F\.?\s*[:\s]+(R[O0]?\d{6,10})', 0.92),
+        (r'C\.\s*I\.\s*F\.?\s*[:\s]+(\d{6,10})', 0.91),
        # C.I.F. format (with dots, no spaces)
-        (r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.88),
+        (r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.88),
+        (r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(\d{6,10})', 0.87),
        # CUI format (less specific, use with caution)
-        (r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.85),
+        (r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.85),
+        (r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(\d{6,10})', 0.84),
+        # Lidl format: "Cod Identificare fiscala: RO..." (OCR corrupts to "Ced Identificanfliscalar")
+        # Matches: "Identificare fiscala", "Identificanfliscalar", "Identificoan/Fljscales"
+        (r'[IC](?:od|ed)\s*Identific[a-z/]*\s*(R[O0]\d{6,10})', 0.90),
+        # Generic: anything with "fiscal" followed by RO + digits
+        (r'fiscal[a-z]*\s*:?\s*(R[O0]\d{6,10})', 0.85),
    ]

    # Pattern for CIF NUMBER appearing BEFORE "C.I.F." label (reversed format)
-    # Common in some receipts: "R011201891\nC. I. F." - number on line before label
+    # Common in some receipts: "RO11201891\nC. I. F." - number on line before label
+    # IMPORTANT: Capture the full CUI including RO prefix
    CUI_REVERSED_PATTERNS = [
-        # RO + 8-10 digits on line immediately before C.I.F./CIF label
-        (r'(?:R[O0])(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.98),
-        # Just digits before C.I.F. label
+        # RO/R0 + 6-10 digits on line immediately before C.I.F./CIF label
+        # Capture the FULL CUI including RO prefix
+        (r'(R[O0]\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.98),
+        # Just digits before C.I.F. label (neplatitor TVA - no RO prefix)
        (r'(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.95),
    ]

@@ -185,38 +207,67 @@ class ReceiptExtractor:
        (r'(?:^|\s)BF\s*:\s*(\d{4})', 0.85),
    ]

-    # TVA (VAT) patterns - OCR may produce TUA, TVR, etc.
+    # TVA (VAT) patterns - OCR may produce TUA, TVR, IVA, etc.
+    # All patterns are case-insensitive (re.IGNORECASE applied in extraction)
    TVA_PATTERNS = [
-        # TOTAL TVA BON format (OCR tolerant: TUA, TVR)
-        (r'TOTAL\s+T[VU][AR]\s+BON\s*:?\s*([\d\s.,]+)', 0.98),
-        (r'T[O0]TAL\s+T[VU][AR]\s*:?\s*([\d\s.,]+)', 0.95),
+        # TOTAL TVA BON format (OCR tolerant: TUA, TVR, IVA)
+        (r'TOTAL\s+(?:T[VU][AR]|IVA)\s+BON\s*:?\s*([\d\s.,]+)', 0.98),
+        (r'T[O0]TAL\s+(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.95),
+        # IVA variant (Spanish/Portuguese influence, some receipts)
+        (r'TOTAL\s+IVA\s*:?\s*([\d\s.,]+)', 0.95),
+        (r'IVA\s+[A-D]?\s*[-:]?\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.93),
        # TVA with percentage (OCR tolerant)
        (r'T[VU][AR]\s+(?:A\s*[-:]?\s*)?(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.95),
        (r'T[VU][AR]\s+[A-Z]\s*[-:]\s*(\d{1,2})\s*%\s*([\d\s.,]+)', 0.93),
-        # Simple TVA pattern
-        (r'T[VU][AR]\s*:?\s*([\d\s.,]+)', 0.85),
+        # 5% TVA rate (books, newspapers - TVA C)
+        (r'T[VU][AR]\s*[C5]\s*[-:]\s*5\s*%\s*:?\s*([\d\s.,]+)', 0.93),
+        (r'(?:T[VU][AR]|IVA)\s+5\s*%\s*:?\s*([\d\s.,]+)', 0.92),
+        # Garbled OCR: T0TAL, TVAI, TUAI, etc.
+        (r'T[O0]T[AE]L\s+(?:T[VUAI]+[AR]?|IVA)\s*:?\s*([\d\s.,]+)', 0.88),
+        # OCR corruption: "TA F 194" (TVA with V→F or space), "T A 19%"
+        # Handles: "TOTAL TA F 194" where TVA became "TA F"
+        (r'TOTAL\s+TA\s*[F\s]?\s*\d*\s*:?\s*([\d\s.,]+)', 0.85),
+        (r'TA\s+[FA-Z]?\s*\d{1,2}\s*%?\s*:?\s*([\d\s.,]+)', 0.82),
+        # "TUA" with random letter after (OCR noise): "TUA F", "TUA I"
+        (r'T[VU]A\s+[A-Z]?\s*\d*\s*:?\s*([\d\s.,]+)', 0.83),
+        # Simple TVA/IVA pattern
+        (r'(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.85),
        # Standalone percentage line near TVA
        (r'(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.75),
    ]

    # Payment method patterns - appears after TOTAL LEI, before TOTAL TVA
    # Format: "CARD: 50.00" or "NUMERAR 100.00" or "PLATA CARD: 50.00"
+    # OMV/Petrom uses "CARTE CREDIT" or "CARTE CREDIT 318, 16"
    PAYMENT_METHOD_PATTERNS = [
+        # CARTE CREDIT with amount on same line (OMV/Petrom receipts)
+        # Handles: "CARTE CREDIT 318, 16" with OCR spaces in number
+        (r'CARTE\s+CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.98),
+        # CARTE CREDIT with amount on next line (OCR may split lines)
+        # Handles: "CARTE CREDIT\n318, 16"
+        (r'CARTE\s+CREDIT\s*:?\s*\n\s*([\d\s.,]+)', 'CARD', 0.97),
        # CARD with amount (high confidence)
-        (r'(?:PLATA\s+)?CARD\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
+        # Also handles OCR artifacts like "CARD F 100.00" where F is noise
+        (r'(?:PLATA\s+)?CARD\s*[:\sA-Z]?\s*([\d\s.,]+)', 'CARD', 0.95),
        # NUMERAR (cash) with amount
        (r'NUMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.95),
        # CASH alternative spelling
        (r'CASH\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.90),
        # Truncation recovery patterns (for OCR left-margin truncation issues)
+        # IMPROVED: More restrictive - require max 6 digits before decimals
+        # to avoid matching CUI numbers like RO10562600 → RD10562600
        # "RD" = truncated "CARD" (only 2 chars visible)
-        (r'\bRD\s*:?\s*([\d\s.,]+)', 'CARD', 0.70),
+        (r'(?:^|\n|\s)RD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.70),
        # "ARD" = truncated "CARD" (3 chars visible)
-        (r'\bARD\s*:?\s*([\d\s.,]+)', 'CARD', 0.75),
+        (r'(?:^|\n|\s)ARD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.75),
        # "MERAR" = truncated "NUMERAR"
-        (r'\bMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.70),
+        (r'(?:^|\n|\s)MERAR\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'NUMERAR', 0.70),
    ]

+    # Maximum reasonable payment amount for a receipt (100,000 LEI)
+    # Amounts larger than this are likely OCR errors (e.g., CUI parsed as amount)
+    MAX_REASONABLE_PAYMENT = Decimal('100000')
+
    # Items count patterns - OCR may produce OZ instead of POZ, etc.
    # Number may be on separate line before or after the label
    # IMPORTANT: Must be specific to avoid matching product quantities like "50BUC"
@@ -250,6 +301,9 @@ class ReceiptExtractor:
        # Reversed format: CIF/CUI before CLIENT
        r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:',  # CIF CLIENT:
        r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:',  # CUI CLIENT:
+        # Corrupted CLIENT after CIF: "CIF a IENT:", "CIF LIENT:", "CIF CL IENT:"
+        r'C[I1]F\s+[A-Z\s]{0,6}IENT\s*:',  # "CIF a IENT:", "CIF CL IENT:", "CIF LIENT:"
+        r'C[I1]F\s+LIENT\s*:',  # "CIF LIENT:" (missing C from CLIENT)
        # CLIENT followed by C.U.I./C.I.F. (all variations with/without spaces and dots)
        # Handles: CLIENT C.U.I/C.I.F., CLIENT C. U. I./ C. I.F., CLIENT CUI/CIF
        r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/?\s*C?\.?\s*[I1]?\.?\s*F?\.?\s*:',
@@ -267,6 +321,16 @@ class ReceiptExtractor:
    # Client CUI patterns (explicitly after CLIENT marker)
    # OCR errors: R0 instead of RO, C1F instead of CIF, 1 instead of I
    CLIENT_CUI_PATTERNS = [
+        # NEW: CUI on line BEFORE CLIENT marker (docTR/OCR may output value before label)
+        # Pattern: "RO1879855\nCLIENT C.U.I./C.I.F.:" - CUI on line before CLIENT label
+        (r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*U\.?\s*[I1]\.?', 0.99),
+        (r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*[I1]\.?\s*F\.?', 0.99),
+        # Same but with optional colon after RO number
+        (r'(R[O0]\d{6,10})\s*:?\s*\n\s*CLIENT', 0.98),
+        # "CIF I CLIENT:" or "CIF IDENTIFICARE CLIENT:" format (OCR may insert extra chars)
+        # Common OCR artifact: "CIF I CLIENT: R01879855"
+        (r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]\d{6,10})', 0.98),
+        (r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.97),
        # CIF CLIENT: R01879856 (reversed format - CIF/CUI before CLIENT)
        (r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
        (r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
@@ -276,19 +340,34 @@ class ReceiptExtractor:
        # Most flexible pattern for slash variants
        (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.97),
        (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.97),
+        # OCR artifact: doubled letters like "C.U U. I." or "C.I I.F." (docTR sometimes duplicates)
+        (r'CLIENT\s+C\.?\s*U\.?\s*U?\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
+        (r'CLIENT\s+C\.?\s*U\.?\s*U?\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
        # CLIENT C.U.I. or CLIENT CUI or CLIENT CIF (without slash)
        (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
        (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
        (r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
        (r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
+        # Corrupted CLIENT after CIF: "CIF a IENT:", "CIF LIENT:", "CIF L IENT:", "CIF C IENT:"
+        # OCR often corrupts "CLIENT" when it appears after "CIF"
+        (r'CIF\s+[a-zA-Z\s]{2,8}IENT\s*:?\s*(R[O0]?\d{6,10})', 0.93),  # "CIF a IENT:", "CIF CL IENT:"
+        (r'CIF\s+[a-zA-Z\s]{2,8}IENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
+        (r'CIF\s+LIENT\s*:?\s*(R[O0]?\d{6,10})', 0.92),  # "CIF LIENT:" (missing C)
+        (r'CIF\s+LIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.92),
        # CUMPARATOR variants
        (r'CUMPARATOR\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
        (r'CUMPARATOR\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
+        # CUMPARATOR with CUI/CIF on next line: "CUMPARATOR: NAME\nCIF: 12345678"
+        (r'CUMPARATOR\s*:.*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
+        (r'CUMPARATOR\s*:.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),  # F or T (OCR error)
+        # CUMPARATOR with CUI/CIF two lines down: "CUMPARATOR: NAME\nADDRESS\nCIF: 12345678"
+        (r'CUMPARATOR\s*:.*\n.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
        # CUI/CIF on line immediately after CLIENT marker
        (r'CLIENT\s*:\s*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
-        (r'CLIENT\s*:\s*\n\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
-        # CUI after client name: "CLIENT: COMPANY SRL\nCUI: 12345678"
-        (r'CLIENT\s*:.*\n.*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
+        (r'CLIENT\s*:\s*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),  # F or T (OCR error)
+        # CUI/CIF after client name: "CLIENT: COMPANY SRL\nCUI: 12345678"
+        (r'CLIENT\s*:.*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
+        (r'CLIENT\s*:.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),  # CIF/CIT after name
    ]

    # Vendor name indicators (lines containing these are likely vendor names)
@@ -322,6 +401,8 @@ class ReceiptExtractor:
        result.receipt_series, _ = self._extract_series(text_upper)
        result.partner_name, result.confidence_vendor = self._extract_vendor(text)
        result.cui, _ = self._extract_cui(text_upper, text)
+        # Normalize CUI: fix R0 → RO OCR error and validate format
+        result.cui = OCRValidationEngine.normalize_cui(result.cui)

        # Extract additional fields - Multiple TVA entries
        result.tva_entries, result.tva_total = self._extract_tva_entries(text_upper)
@@ -345,10 +426,35 @@ class ReceiptExtractor:
        result.address = self._extract_address(text_upper)
        result.payment_methods = self._extract_payment_methods(text_upper)

+        # Validate payment methods against extracted amount
+        # If payment sum >> amount, clear invalid payments (likely OCR error)
+        # Save original payment methods before validation (for payment mode detection)
+        original_payment_methods = result.payment_methods.copy() if result.payment_methods else []
+
+        result.payment_methods = self._validate_payment_methods(result.payment_methods, result.amount)
+
+        # Auto-suggest payment_mode based on detected payment methods
+        # Use ORIGINAL payment_methods to detect CARD even if validation cleared them
+        # (e.g., CARD 318.16 is valid even if total validation failed)
+        payment_methods_for_mode = result.payment_methods if result.payment_methods else original_payment_methods
+        if payment_methods_for_mode:
+            card_amount = sum(
+                pm.get('amount', Decimal('0'))
+                for pm in payment_methods_for_mode
+                if pm.get('method') == 'CARD'
+            )
+            if card_amount > 0:
+                result.suggested_payment_mode = 'banca'
+                print(f"[Payment Mode] CARD detected ({card_amount}), suggesting 'banca'", flush=True)
+            else:
+                # Only cash payments detected
+                result.suggested_payment_mode = 'numerar'
+                print(f"[Payment Mode] Cash only detected, suggesting 'numerar'", flush=True)
+
        # Extract client data (B2B receipts)
        client_name, client_cui, client_address, confidence_client = self._extract_client_data(text_upper, text)
        result.client_name = client_name
-        result.client_cui = client_cui
+        result.client_cui = OCRValidationEngine.normalize_cui(client_cui)  # Fix R0 → RO OCR error
        result.client_address = client_address
        result.confidence_client = confidence_client

@@ -378,13 +484,28 @@ class ReceiptExtractor:

    def _extract_amount(self, text: str) -> Tuple[Optional[Decimal], float]:
        """Extract total amount from text."""
+        # PRE-FILTER: Remove lines containing REST (rest = change, not total)
+        # When paid by card, there's no change - exact amount is paid
+        lines = text.split('\n')
+        filtered_lines = []
+        for line in lines:
+            # Skip lines with REST pattern (change amount, not total)
+            if re.search(r'\bREST\b', line, re.IGNORECASE):
+                continue
+            filtered_lines.append(line)
+        text = '\n'.join(filtered_lines)
+
        # First try standard patterns (TOTAL, SUBTOTAL, etc.)
        for pattern, confidence in self.TOTAL_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
            if match:
                try:
-                    amount_str = re.sub(r'[^\d.,]', '', match.group(1))
+                    # IMPORTANT: Call _normalize_number FIRST to handle "190 60" → "190.60"
+                    # before stripping other characters
+                    amount_str = match.group(1).strip()
                    amount_str = self._normalize_number(amount_str)
+                    # Now remove any remaining non-numeric chars (except decimal point)
+                    amount_str = re.sub(r'[^\d.]', '', amount_str)
                    amount = Decimal(amount_str)
                    if amount > 0:
                        return amount, confidence
@@ -461,8 +582,22 @@ class ReceiptExtractor:

    def _normalize_number(self, num_str: str) -> str:
        """Normalize Romanian number format to standard decimal."""
-        # Remove spaces
-        num_str = num_str.replace(' ', '')
+        # OCR often reads "." as " " (space) - handle "190 60" as "190.60"
+        # Pattern: digits + space + exactly 2 digits at end
+        space_decimal_match = re.match(r'^(\d+)\s+(\d{2})$', num_str.strip())
+        if space_decimal_match:
+            num_str = f"{space_decimal_match.group(1)}.{space_decimal_match.group(2)}"
+        else:
+            # Handle "1 234 56" pattern (thousands + decimal with spaces)
+            # Match: digits + space(s) + digits + space + 2 digits
+            multi_space_match = re.match(r'^([\d\s]+?)\s+(\d{2})$', num_str.strip())
+            if multi_space_match:
+                integer_part = multi_space_match.group(1).replace(' ', '')
+                decimal_part = multi_space_match.group(2)
+                num_str = f"{integer_part}.{decimal_part}"
+            else:
+                # Remove remaining spaces (thousands separators)
+                num_str = num_str.replace(' ', '')

        # Handle comma as decimal separator
        if ',' in num_str and '.' in num_str:
@@ -532,34 +667,57 @@ class ReceiptExtractor:
                except (InvalidOperation, ValueError, TypeError):
                    pass

-        # Case 1: Amount is valid with high confidence - just validate
+        # Case 1: Amount is valid with high confidence - validate against TVA and payments
        if amount and amount > 0 and confidence_amount >= 0.8:
-            # Cross-validate: check if it matches payment methods
+            # First check TVA-implied total (most reliable when TVA is extracted correctly)
+            if tva_implied_total and tva_implied_total > 0:
+                tva_diff_percent = abs(float(amount) - float(tva_implied_total)) / float(tva_implied_total) * 100
+                if tva_diff_percent <= 1:
+                    # Near-perfect TVA match - highest confidence
+                    return amount, min(0.98, confidence_amount + 0.05), "extracted (validated by TVA)"
+                elif tva_diff_percent > 10:
+                    # Significant mismatch - TVA-implied total is more reliable
+                    # This catches cases where wrong TOTAL line was extracted (e.g., REST, SUBTOTAL)
+                    print(f"[Cross-Validation] Amount mismatch with TVA: extracted={amount}, tva_implied={tva_implied_total} (diff={tva_diff_percent:.1f}%)", flush=True)
+                    return tva_implied_total, 0.90, "calculated from TVA (extracted amount mismatch)"
+
+            # Cross-validate with payment methods
            if payment_sum > 0 and abs(amount - payment_sum) <= Decimal('0.02'):
                # Perfect match - boost confidence
                return amount, min(0.98, confidence_amount + 0.05), "extracted (validated by payment methods)"
+            elif payment_sum > 0:
+                payment_diff_percent = abs(float(amount) - float(payment_sum)) / float(payment_sum) * 100
+                if payment_diff_percent > 10:
+                    # Significant mismatch - payment sum is more reliable
+                    print(f"[Cross-Validation] Amount mismatch with payments: extracted={amount}, payments={payment_sum} (diff={payment_diff_percent:.1f}%)", flush=True)
+                    return payment_sum, 0.88, "calculated from payment methods (extracted amount mismatch)"
+
            return amount, confidence_amount, "extracted"

        # Case 2: Amount exists but low confidence - try to validate/correct
        if amount and amount > 0:
+            # First check TVA-implied total (most reliable)
+            if tva_implied_total and tva_implied_total > 0:
+                tva_diff_percent = abs(float(amount) - float(tva_implied_total)) / float(tva_implied_total) * 100
+                if tva_diff_percent <= 2:
+                    # Close match - boost confidence
+                    return amount, 0.88, "extracted (validated by TVA)"
+                elif tva_diff_percent > 10:
+                    # Significant mismatch - use TVA-implied total
+                    print(f"[Cross-Validation] Amount mismatch with TVA: extracted={amount}, tva_implied={tva_implied_total} (diff={tva_diff_percent:.1f}%)", flush=True)
+                    return tva_implied_total, 0.85, "calculated from TVA"
+
            # Check if payment methods sum matches
            if payment_sum > 0:
-                if abs(amount - payment_sum) <= Decimal('0.02'):
-                    # Match - boost confidence
+                payment_diff_percent = abs(float(amount) - float(payment_sum)) / float(payment_sum) * 100
+                if payment_diff_percent <= 0.5:
+                    # Close match - boost confidence
                    return amount, 0.90, "extracted (validated by payment methods)"
-                else:
+                elif payment_diff_percent > 10:
                    # Mismatch - prefer payment_sum as it's more reliable
                    print(f"[Cross-Validation] Amount mismatch: extracted={amount}, payments={payment_sum}", flush=True)
                    return payment_sum, 0.85, "calculated from payment methods"

-            # Check TVA-implied total
-            if tva_implied_total:
-                if abs(amount - tva_implied_total) <= Decimal('0.50'):
-                    # Close match - use extracted amount
-                    return amount, 0.80, "extracted (validated by TVA)"
-                else:
-                    print(f"[Cross-Validation] TVA mismatch: extracted={amount}, tva_implied={tva_implied_total}", flush=True)
-
            # No validation possible - return as-is
            return amount, confidence_amount, "extracted (unvalidated)"

@@ -701,6 +859,10 @@ class ReceiptExtractor:

            line_upper = line.upper()

+            # Skip lines with skip keywords (CUMPARATOR, CLIENT, etc.)
+            if any(kw in line_upper for kw in skip_keywords):
+                continue
+
            # Check for vendor indicators
            for indicator in self.VENDOR_INDICATORS:
                if re.search(indicator, line_upper):
@@ -778,13 +940,21 @@ class ReceiptExtractor:
        Extract vendor CUI (fiscal identification code) from text.
        Excludes CLIENT CUI which appears as 'CLIENT C.U.I./C.I.F.:...'
        """
+        def get_cui_digit_count(cui: str) -> int:
+            """Get the count of digits in CUI (excluding RO/R0 prefix)."""
+            cui_upper = cui.upper().strip()
+            if cui_upper.startswith('RO') or cui_upper.startswith('R0'):
+                return len(cui_upper) - 2
+            return len(cui_upper)
+
        # Strategy 0: Check for reversed format (CIF NUMBER on line BEFORE "C.I.F." label)
-        # This is common in some receipts: "R011201891\nC. I. F."
+        # This is common in some receipts: "RO11201891\nC. I. F."
        for pattern, confidence in self.CUI_REVERSED_PATTERNS:
            match = re.search(pattern, text_upper, re.IGNORECASE | re.MULTILINE)
            if match:
                cui = match.group(1)
-                if 6 <= len(cui) <= 10:
+                digit_count = get_cui_digit_count(cui)
+                if 6 <= digit_count <= 10:
                    # Verify this is not the CLIENT CUI by checking context
                    start = match.start()
                    # Check 50 chars before the match for CLIENT keyword
@@ -805,7 +975,8 @@ class ReceiptExtractor:
                match = re.search(pattern, line, re.IGNORECASE | re.MULTILINE)
                if match:
                    cui = match.group(1)
-                    if 6 <= len(cui) <= 10:
+                    digit_count = get_cui_digit_count(cui)
+                    if 6 <= digit_count <= 10:
                        return cui, confidence

        # Strategy 2: Fallback - search entire text but exclude CLIENT patterns
@@ -813,7 +984,8 @@ class ReceiptExtractor:
            # Find all matches
            for match in re.finditer(pattern, text_upper, re.IGNORECASE | re.MULTILINE):
                cui = match.group(1)
-                if 6 <= len(cui) <= 10:
+                digit_count = get_cui_digit_count(cui)
+                if 6 <= digit_count <= 10:
                    # Check if this match is preceded by CLIENT in the same line
                    start = match.start()
                    line_start = text_upper.rfind('\n', 0, start) + 1
@@ -937,9 +1109,90 @@ class ReceiptExtractor:
                except (ValueError, InvalidOperation):
                    continue

-        # Pattern 1: "TVA A - 19%: 15.20" or "TVAA - 21% 32.31" (with code)
-        # OCR tolerant: TUA, TVR, etc.
-        pattern_with_code = r'T[VU][AR]\s*([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)'
+        # Pattern 0c: REVERSED FORMAT "5.00% TUA*B" followed by amount on next line
+        # This handles receipts where percentage comes BEFORE TVA code (e.g., books with 5% rate)
+        # Matches: "5.00% TUA*B", "5% TVA B", "5.00% TVA", "9% TUA", "5% IVA"
+        if not tva_entries:
+            # Pattern: PERCENT% + TVA/IVA + optional code, then amount on next line
+            reversed_tva_pattern = r'(\d{1,2})[.,]?\d{0,2}\s*%\s*(?:T[VU][AR]|IVA)\s*\*?([A-D])?'
+            for match in re.finditer(reversed_tva_pattern, normalized_text, re.IGNORECASE):
+                try:
+                    percent = int(match.group(1))
+                    code = (match.group(2) or self._get_tva_code_from_percent(percent)).upper()
+
+                    # Look for amount on the next line(s) after the match
+                    after_match = normalized_text[match.end():]
+                    # Find standalone number (amount) - skip empty lines
+                    amount_match = re.search(r'^[\s\n]*([\d]+[.,]\d{2})\b', after_match)
+                    if amount_match:
+                        amount_str = self._normalize_number(amount_match.group(1))
+                        amount = Decimal(amount_str)
+                        if amount > 0:
+                            entry_key = (code, percent)
+                            if entry_key not in seen_entries:
+                                tva_entries.append({
+                                    'code': code,
+                                    'percent': percent,
+                                    'amount': amount
+                                })
+                                seen_entries.add(entry_key)
+                except (ValueError, InvalidOperation):
+                    continue
+
+        # Pattern 0d: "TOTAL TUA:", "TOTAL TVA:", "TOTAL IVA:" with amount (OCR variants)
+        if not tva_entries:
+            total_tva_simple = r'TOTAL\s+(?:T[VU][AR]|IVA)\s*:?\s*([\d.,]+)'
+            match = re.search(total_tva_simple, normalized_text, re.IGNORECASE)
+            if match:
+                try:
+                    amount_str = self._normalize_number(match.group(1))
+                    amount = Decimal(amount_str)
+                    if amount > 0:
+                        # Try to find the rate in nearby text
+                        percent = self._detect_tva_percent(text)
+                        if percent:
+                            code = self._get_tva_code_from_percent(percent)
+                            entry_key = (code, percent)
+                            if entry_key not in seen_entries:
+                                tva_entries.append({
+                                    'code': code,
+                                    'percent': percent,
+                                    'amount': amount
+                                })
+                                seen_entries.add(entry_key)
+                except (ValueError, InvalidOperation):
+                    pass
+
+        # Pattern 0e: Multiline "TOTAL TUA\n198\n30.43" where:
+        #   - "TOTAL TUA" on one line
+        #   - "198" or similar (corrupted "19%") on next line (optional)
+        #   - "30.43" (TVA amount) on following line
+        # OCR often splits this across multiple lines
+        if not tva_entries:
+            multiline_tva = r'TOTAL\s+(?:T[VU][AR]L?|TU[AR]L|IVA)\s*\n\s*\d*\s*\n?\s*([\d]+[.,]\d{2})\b'
+            match = re.search(multiline_tva, normalized_text, re.IGNORECASE)
+            if match:
+                try:
+                    amount_str = self._normalize_number(match.group(1))
+                    amount = Decimal(amount_str)
+                    if amount > 0:
+                        percent = self._detect_tva_percent(text)
+                        if percent:
+                            code = self._get_tva_code_from_percent(percent)
+                            entry_key = (code, percent)
+                            if entry_key not in seen_entries:
+                                tva_entries.append({
+                                    'code': code,
+                                    'percent': percent,
+                                    'amount': amount
+                                })
+                                seen_entries.add(entry_key)
+                except (ValueError, InvalidOperation):
+                    pass
+
+        # Pattern 1: "TVA A - 19%: 15.20" or "TVAA - 21% 32.31" or "IVA A - 19%" (with code)
+        # OCR tolerant: TUA, TVR, IVA, etc.
+        pattern_with_code = r'(?:T[VU][AR]|IVA)\s*([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)'
        for match in re.finditer(pattern_with_code, normalized_text, re.IGNORECASE):
            try:
                code = match.group(1).upper()
@@ -959,9 +1212,9 @@ class ReceiptExtractor:
            except (ValueError, InvalidOperation):
                continue

-        # Pattern 2: "TVA - 21%: 32.31" (without explicit code, assume 'A')
+        # Pattern 2: "TVA - 21%: 32.31" or "IVA - 21%: 32.31" (without explicit code, assume 'A')
        if not tva_entries:
-            pattern_no_code = r'T[VU][AR]\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)'
+            pattern_no_code = r'(?:T[VU][AR]|IVA)\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)'
            for match in re.finditer(pattern_no_code, normalized_text, re.IGNORECASE):
                try:
                    percent = int(match.group(1))
@@ -982,10 +1235,10 @@ class ReceiptExtractor:
                except (ValueError, InvalidOperation):
                    continue

-        # Pattern 3: "TOTAL TVA A - 21%" with amount on same line or "TOTAL TVA BON" with amount
+        # Pattern 3: "TOTAL TVA A - 21%" or "TOTAL IVA" with amount on same line or "TOTAL TVA BON" with amount
        if not tva_entries:
-            # First try: "TOTAL TVA A - 21%  32.31" (amount on same line)
-            tva_with_amount = r'TOTAL\s+T[VU][AR]\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*([\d.,]+)'
+            # First try: "TOTAL TVA A - 21%  32.31" or "TOTAL IVA A - 21% 32.31" (amount on same line)
+            tva_with_amount = r'TOTAL\s+(?:T[VU][AR]|IVA)\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*([\d.,]+)'
            for match in re.finditer(tva_with_amount, normalized_text, re.IGNORECASE):
                try:
                    code = match.group(1).upper()
@@ -1004,16 +1257,16 @@ class ReceiptExtractor:
                except (ValueError, InvalidOperation):
                    continue

-        # Pattern 3b: "TOTAL TVA A - 21%" on one line, look for "TOTAL TVA BON" amount
+        # Pattern 3b: "TOTAL TVA A - 21%" or "TOTAL IVA A - 21%" on one line, look for "TOTAL TVA BON" amount
        if not tva_entries:
-            tva_total_pattern = r'TOTAL\s+T[VU][AR]\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%'
+            tva_total_pattern = r'TOTAL\s+(?:T[VU][AR]|IVA)\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%'
            for match in re.finditer(tva_total_pattern, normalized_text, re.IGNORECASE):
                try:
                    code = match.group(1).upper()
                    percent = int(match.group(2))

-                    # Look for "TOTAL TVA BON" followed by amount
-                    tva_bon_pattern = r'TOTAL\s+T[VU][AR]\s+BON[:\s]*([\d.,]+)'
+                    # Look for "TOTAL TVA BON" or "TOTAL IVA BON" followed by amount
+                    tva_bon_pattern = r'TOTAL\s+(?:T[VU][AR]|IVA)\s+BON[:\s]*([\d.,]+)'
                    tva_bon_match = re.search(tva_bon_pattern, normalized_text, re.IGNORECASE)
                    if tva_bon_match:
                        amount_str = self._normalize_number(tva_bon_match.group(1))
@@ -1029,8 +1282,8 @@ class ReceiptExtractor:
                                seen_entries.add(entry_key)
                            continue

-                    # Fallback: Amount after TOTAL TVA BON on next line
-                    tva_bon_pos = re.search(r'TOTAL\s+T[VU][AR]\s+BON', normalized_text, re.IGNORECASE)
+                    # Fallback: Amount after TOTAL TVA BON or TOTAL IVA BON on next line
+                    tva_bon_pos = re.search(r'TOTAL\s+(?:T[VU][AR]|IVA)\s+BON', normalized_text, re.IGNORECASE)
                    if tva_bon_pos:
                        after_bon = normalized_text[tva_bon_pos.end():]
                        # Find first standalone number (likely TVA amount)
@@ -1050,9 +1303,9 @@ class ReceiptExtractor:
                except (ValueError, InvalidOperation):
                    continue

-        # Pattern 3b: "TVAA - 21%" on one line, amount on next line (simpler format)
+        # Pattern 3c: "TVAA - 21%" or "IVA A - 21%" on one line, amount on next line (simpler format)
        if not tva_entries:
-            tva_line_pattern = r'T[VU][AR]\s*([A-D])?\s*[-:]\s*(\d{1,2})\s*%'
+            tva_line_pattern = r'(?:T[VU][AR]|IVA)\s*([A-D])?\s*[-:]\s*(\d{1,2})\s*%'
            for match in re.finditer(tva_line_pattern, normalized_text, re.IGNORECASE):
                try:
                    code = (match.group(1) or 'A').upper()
@@ -1158,16 +1411,18 @@ class ReceiptExtractor:
        Extract TOTAL TVA BON value separately as the reference.
        This is the authoritative total TVA on the receipt.

-        Handles OCR variations: TOTAL TVA BON, OTAL TUA BON, etc.
+        Handles OCR variations: TOTAL TVA BON, OTAL TUA BON, TOTAL IVA BON, etc.
        """
-        # Pattern for TOTAL TVA BON with amount after
+        # Pattern for TOTAL TVA BON or TOTAL IVA BON with amount after
+        # OCR corruptions: TUAL (TVA+L merged), TVAL, TUAI, etc.
        patterns = [
-            # Standard: TOTAL TVA BON: 14.92
-            r'T?OTAL\s+T[VU][AR]\s+BON\s*:?\s*([\d]+[.,]\d{2})\b',
+            # Standard: TOTAL TVA BON: 14.92 or TOTAL IVA BON: 14.92
+            # Handles: TUAL (TVA+L), TVAL, TUAI, etc. with optional trailing letters
+            r'T?OTAL\s+(?:T[VU][AR]L?|TU[AR]L|IVA)\s+BON\s*:?\s*([\d]+[.,]\d{2})\b',
            # Amount before: 14.92 OTAL TUA BON (OCR line break)
-            r'([\d]+[.,]\d{2})\s*\n?\s*T?OTAL\s+T[VU][AR]\s+BON',
-            # Amount on next line after TOTAL TVA BON
-            r'T?OTAL\s+T[VU][AR]\s+BON\s*\n\s*([\d]+[.,]\d{2})\b',
+            r'([\d]+[.,]\d{2})\s*\n?\s*T?OTAL\s+(?:T[VU][AR]L?|TU[AR]L|IVA)\s+BON',
+            # Amount on next line after TOTAL TVA BON or TOTAL IVA BON
+            r'T?OTAL\s+(?:T[VU][AR]L?|TU[AR]L|IVA)\s+BON\s*\n\s*([\d]+[.,]\d{2})\b',
        ]

        for pattern in patterns:
@@ -1271,18 +1526,52 @@ class ReceiptExtractor:
        return tva_entries, tva_total

    def _detect_tva_percent(self, text: str) -> Optional[int]:
-        """Detect TVA percentage from text content."""
-        # Look for common Romanian TVA percentages
-        if '19%' in text or '19 %' in text:
+        """Detect TVA percentage from text content.
+
+        IMPORTANT: Prioritize rates found near TVA markers over rates found elsewhere.
+        E.g., "REDUCERE 5%" should not override "TVA A 19%".
+        Also handle OCR corruptions like "194" for "19%" in "TOTAL TA F 194".
+        """
+        import re as regex
+
+        # First, look for percent NEAR TVA markers (most reliable)
+        # This handles "TVA A 19%", "TVA 19,00%", "TOTAL TVA 19%"
+        tva_context_patterns = [
+            r'T[VU][AR]\s*[A-D]?\s*[-:]?\s*(19|21|11|9|5)[.,]?\s*\d{0,2}\s*%',
+            r'IVA\s*[A-D]?\s*[-:]?\s*(19|21|11|9|5)[.,]?\s*\d{0,2}\s*%',
+            # OCR corruption: "TOTAL TA F 194" where 194 = 19% (4 is artifact)
+            r'TOTAL\s+T[VA][AR]?\s*[F\s]?\s*(19|21)\d\b',
+        ]
+        for pattern in tva_context_patterns:
+            match = regex.search(pattern, text, regex.IGNORECASE)
+            if match:
+                rate = int(match.group(1))
+                if rate in (19, 21, 11, 9, 5):
+                    return rate
+
+        # Fallback: Look for common Romanian TVA percentages anywhere
+        # But EXCLUDE patterns near "REDUCERE", "DISCOUNT", "RED." (these are discounts, not TVA)
+        # Clean text by removing discount context
+        # Handle OCR corruptions: RED.CERE (C instead of U), RED CERE, REDUC, etc.
+        text_no_discount = regex.sub(r'(?:REDUC|DISCOUNT|RED)[.\sA-Z]*\d+[.,]?\d*\s*%', '', text, flags=regex.IGNORECASE)
+
+        # Now search in cleaned text (priority order: 19% > 21% > 11% > 9% > 5%)
+        if regex.search(r'\b19[.,]?\s*\d{0,2}\s*%', text_no_discount):
            return 19
-        elif '21%' in text or '21 %' in text:
+        elif regex.search(r'\b21[.,]?\s*\d{0,2}\s*%', text_no_discount):
            return 21
-        elif '11%' in text or '11 %' in text:
+        elif regex.search(r'\b11[.,]?\s*\d{0,2}\s*%', text_no_discount):
            return 11
-        elif '9%' in text or '9 %' in text:
+        elif regex.search(r'\b9[.,]?\s*\d{0,2}\s*%', text_no_discount):
            return 9
-        elif '5%' in text or '5 %' in text:
+        elif regex.search(r'\b5[.,]?\s*\d{0,2}\s*%', text_no_discount):
            return 5
+
+        # Default: If no percent found but we're in Romanian receipt context,
+        # assume 19% (standard rate)
+        if regex.search(r'T[VU][AR]|IVA', text, regex.IGNORECASE):
+            return 19
+
        return None

    def _validate_tva_reverse(
@@ -1293,9 +1582,12 @@ class ReceiptExtractor:
        """
        Reverse TVA validation: from TVA amount and rate, calculate expected total.

-        Formula:
-            base = tva_amount / (rate/100)
-            expected_total = sum(base + tva_amount) for all entries
+        Formula (CORRECT):
+            For TVA that is INCLUDED in total (standard Romanian receipts):
+            total = base + tva
+            tva = base * rate/100
+            Therefore: base = tva * 100 / rate
+            And: total = base + tva = tva * 100 / rate + tva = tva * (100 + rate) / rate

        Returns (is_valid, expected_total, message)
        """
@@ -1307,10 +1599,14 @@ class ReceiptExtractor:
            tva_amount = entry['amount']
            rate = Decimal(str(entry['percent']))

+            print(f"[TVA Debug] Entry: amount={tva_amount}, rate={rate}%", flush=True)
+
            if rate > 0:
-                # Calculate base from TVA: base = tva / (rate/100)
-                base = tva_amount / (rate / Decimal('100'))
-                expected_total += base + tva_amount
+                # CORRECT formula: total = tva * (100 + rate) / rate
+                # Example: tva=55.22, rate=21 → total = 55.22 * 121 / 21 = 318.16
+                gross_for_entry = tva_amount * (Decimal('100') + rate) / rate
+                expected_total += gross_for_entry
+                print(f"[TVA Debug] Calculated gross: {gross_for_entry}", flush=True)
            else:
                # 0% TVA - can't calculate base, skip
                pass
@@ -1393,7 +1689,7 @@ class ReceiptExtractor:

        # Find the region between TOTAL LEI and TOTAL TVA
        total_lei_match = re.search(r'TOTAL\s+LEI\s*([\d\s.,]+)', normalized_text, re.IGNORECASE)
-        total_tva_match = re.search(r'TOTAL\s+T[VU][AR]', normalized_text, re.IGNORECASE)
+        total_tva_match = re.search(r'TOTAL\s+(?:T[VU][AR]|IVA)', normalized_text, re.IGNORECASE)

        # Define search region (after TOTAL LEI, before TOTAL TVA if exists)
        if total_lei_match:
@@ -1404,22 +1700,60 @@ class ReceiptExtractor:
            search_region = normalized_text  # Fallback to full text

        for pattern, method, confidence in self.PAYMENT_METHOD_PATTERNS:
-            for match in re.finditer(pattern, search_region, re.IGNORECASE):
+            for match in re.finditer(pattern, search_region, re.IGNORECASE | re.MULTILINE):
                try:
                    amount_str = match.group(1).replace(' ', '')
                    amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
                    amount = Decimal(amount_str)
-                    if amount > 0 and method not in seen_methods:
+                    # Validate: amount must be positive and reasonable (< MAX_REASONABLE_PAYMENT)
+                    # This prevents OCR errors like CUI being parsed as payment
+                    if amount > 0 and amount < self.MAX_REASONABLE_PAYMENT and method not in seen_methods:
                        payment_methods.append({
                            'method': method,
                            'amount': amount
                        })
                        seen_methods.add(method)
+                        print(f"[Payment] Found {method}: {amount} (pattern matched)", flush=True)
+                    elif amount >= self.MAX_REASONABLE_PAYMENT:
+                        print(f"[Payment] Rejected unreasonable amount {amount} for {method} (likely OCR error)", flush=True)
                except (InvalidOperation, ValueError):
                    continue

        return payment_methods

+    def _validate_payment_methods(
+        self, payment_methods: List[dict], total: Optional[Decimal]
+    ) -> List[dict]:
+        """
+        Validate payment methods against extracted total.
+
+        An amount above 10x the total is treated as an OCR error (e.g. a CUI
+        number parsed as a payment) and only that entry is dropped, so one
+        misread no longer discards the valid payments next to it. If the
+        remaining sum still exceeds 10x the total, all payments are cleared.
+
+        Args:
+            payment_methods: List of {'method': str, 'amount': Decimal}
+            total: Extracted total amount; None or 0 skips validation
+
+        Returns:
+            Validated payment methods (may be empty if all were invalid)
+        """
+        if not total or not payment_methods:
+            return payment_methods
+
+        limit = total * 10
+        valid = [pm for pm in payment_methods if (pm.get('amount') or Decimal('0')) <= limit]
+        if len(valid) < len(payment_methods):
+            print(f"[Payment Validation] Dropped {len(payment_methods) - len(valid)} payment(s) > 10x Total {total} (likely OCR error)", flush=True)
+        payment_sum = sum((pm.get('amount') or Decimal('0') for pm in valid), Decimal('0'))
+        if payment_sum > limit:
+            print(f"[Payment Validation] Payment sum {payment_sum} >> Total {total} (>10x), clearing invalid payments", flush=True)
+            return []
+        if payment_sum > total * 2:  # suspicious but valid in some edge cases: warn only
+            print(f"[Payment Validation] Warning: Payment sum {payment_sum} > 2x Total {total}, possible OCR error", flush=True)
+        return valid
+
    def _extract_client_data(
        self, text_upper: str, original_text: str
    ) -> Tuple[Optional[str], Optional[str], Optional[str], float]: