ocr extract

2026-01-07 14:34:07 +00:00
parent 22eca953ce
commit cc98d6f21f
24 changed files with 774 additions and 2346 deletions
--- a/backend/modules/data_entry/services/ocr/profiles/base.py
+++ b/backend/modules/data_entry/services/ocr/profiles/base.py
@@ -50,6 +50,9 @@ class BaseStoreProfile(ABC):
    # Store display name
    STORE_NAME: str = "Unknown Store"

+    # Flag for known non-VAT payer stores (skips TVA extraction)
+    IS_NON_VAT_PAYER: bool = False
+
    # -------------------------------------------------------------------------
    # Generic patterns - can be overridden in subclasses
    # -------------------------------------------------------------------------
@@ -100,18 +103,33 @@ class BaseStoreProfile(ABC):
    ]

    # Payment method patterns (pattern, method_type, confidence)
+    # Handles ALL payment types: CARD, NUMERAR, and card brand names
    PAYMENT_PATTERNS = [
+        # CARTE CREDIT variants (OMV/Petrom/Socar receipts)
        (r'CARTE\s+CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.98),
        (r'CARTE\s+CREDIT\s*:?\s*\n\s*([\d\s.,]+)', 'CARD', 0.97),
+        (r'CARTE\s+DE\s+CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.98),
+        (r'CARTE\s+DE\s+CREDIT\s*:?\s*\n\s*([\d\s.,]+)', 'CARD', 0.97),
+        # CARD standard
        (r'(?:PLATA\s+)?CARD\s*[:\sA-Z]?\s*([\d\s.,]+)', 'CARD', 0.95),
+        # Card brand names
+        (r'VISA\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
+        (r'MASTERCARD\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
+        (r'MAESTR[O0]\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
+        (r'CONTACTLESS\s*:?\s*([\d\s.,]+)', 'CARD', 0.90),
+        (r'DEBIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.90),
+        (r'CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.88),
+        # Cash variants
        (r'NUMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.95),
        (r'CASH\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.90),
+        # Truncation recovery patterns (for OCR left-margin issues)
        (r'(?:^|\n|\s)RD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.70),
        (r'(?:^|\n|\s)ARD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.75),
        (r'(?:^|\n|\s)MERAR\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'NUMERAR', 0.70),
    ]

    # Client section markers (for B2B receipts) - More flexible patterns
+    # Includes OCR corruption variants (LIENT, C IENT, L IENT)
    CLIENT_MARKERS = [
        r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT',    # "CIF CLIENT" (with or without colon)
        r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT',    # "CUI CLIENT"
@@ -121,24 +139,62 @@ class BaseStoreProfile(ABC):
        r'BENEFICIAR\s*:',                      # "BENEFICIAR:"
        r'CUMP[AĂ]R[AĂ]TOR',                   # "CUMPARATOR" without colon
        r'COD\s+FISCAL\s+CLIENT',              # "COD FISCAL CLIENT"
+        # OCR corruption patterns
+        r'C[I1]F\s+[A-Z\s]{0,6}IENT\s*:',      # "CIF a IENT:", "CIF CL IENT:", "CIF L IENT:"
+        r'C[I1]F\s+LIENT\s*:',                  # "CIF LIENT:" (missing C)
+        r'LIENT\s*:',                           # "LIENT:" (missing C and I/L)
+        # Brick-specific (I→L OCR error)
+        r'CLIENT\s+C\.?U\.?[LI1]\.?\s*/',       # "CLIENT C.U.L./" (I read as L)
    ]

-    # Client CUI patterns (pattern, confidence) - More flexible
+    # Client CUI patterns (pattern, confidence) - Comprehensive
+    # Handles: docTR reordering, doubled letters, corruption, CUMPARATOR, Brick L/I swap
    CLIENT_CUI_PATTERNS = [
-        # "CIF CLIENT:XXXXXXX" or "CIF CLIENT: ROXXXXXXX" - most common format
-        (r'C\.?[I1]\.?F\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.99),
-        # "CLIENT CIF: XXXXXXX"
-        (r'CLIENT\s+C\.?[I1]\.?F\.?\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
-        # "CUI CLIENT: XXXXXXX"
-        (r'C\.?U\.?[I1]\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
-        # "ROXXXXXXX" followed by CLIENT marker
-        (r'(R[O0]\d{6,10})\s*\n\s*CLIENT', 0.97),
-        # "C.I.F. CLIENT: XXXXXXX"
-        (r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.96),
-        # "CLIENT: ROXXXXXXX" or "CLIENT: XXXXXXX"
+        # === CUI on line BEFORE CLIENT marker (docTR/OCR reordering) ===
+        (r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*U\.?\s*[I1]\.?', 0.99),
+        (r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*[I1]\.?\s*F\.?', 0.99),
+        (r'(R[O0]\d{6,10})\s*:?\s*\n\s*CLIENT', 0.98),
+        # === "CIF I CLIENT:" format (OCR extra chars) ===
+        (r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]\d{6,10})', 0.98),
+        (r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.97),
+        # === CIF CLIENT: (reversed - CIF before CLIENT) ===
+        (r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
+        (r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
+        (r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
+        (r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
+        # === CLIENT C.U.I/C.I.F. (slash variants) ===
+        (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.97),
+        (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.97),
+        # === Doubled letters (docTR artifact: "C.U U.I") ===
+        (r'CLIENT\s+C\.?\s*U\.?\s*U?\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
+        (r'CLIENT\s+C\.?\s*U\.?\s*U?\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
+        # === CLIENT C.U.I. or CLIENT CUI (without slash) ===
+        (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
+        (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
+        (r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
+        (r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
+        # === Corrupted CLIENT after CIF (OCR errors) ===
+        (r'CIF\s+[a-zA-Z\s]{2,8}IENT\s*:?\s*(R[O0]?\d{6,10})', 0.93),
+        (r'CIF\s+[a-zA-Z\s]{2,8}IENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
+        (r'CIF\s+LIENT\s*:?\s*(R[O0]?\d{6,10})', 0.92),
+        (r'CIF\s+LIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.92),
+        # === CUMPARATOR variants ===
+        (r'CUMPARATOR\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
+        (r'CUMPARATOR\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
+        # CUMPARATOR with CUI/CIF on next line
+        (r'CUMPARATOR\s*:.*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
+        (r'CUMPARATOR\s*:.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
+        # CUMPARATOR with CUI/CIF two lines down
+        (r'CUMPARATOR\s*:.*\n.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
+        # === "CLIENT:" marker with the CUI label on the following line ===
+        (r'CLIENT\s*:\s*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
+        (r'CLIENT\s*:.*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
+        # === Standard fallback patterns ===
        (r'CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.90),
-        # "COD FISCAL CLIENT: XXXXXXX"
        (r'COD\s+FISCAL\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.95),
+        # === Brick-specific (I→L OCR error) ===
+        # Matches: "CLIENT C.U.L./C.IF. :R01879855"
+        (r'CLIENT\s+C\.?U\.?[LI1]\.?\s*/\s*C\.?[LI1]\.?F\.?\s*:?\s*(R?O?\d{6,10})', 0.99),
    ]

    # Company type indicators (for identifying company names)
@@ -158,15 +214,133 @@ class BaseStoreProfile(ABC):
    # Maximum reasonable payment amount (to filter OCR errors)
    MAX_PAYMENT = Decimal('100000')

+    # -------------------------------------------------------------------------
+    # TVA (VAT) patterns - ALL FORMATS unified
+    # OCR tolerant: T[VU][AR] matches TVA, TUA, TVR
+    # -------------------------------------------------------------------------
+    TVA_PATTERNS = [
+        # === FORMAT 1: INLINE with code and percentage (Lidl-style) ===
+        # "TVA A 21,00% 7.71" or "TVA B 11,00% 2.13"
+        (r'T[VU][AR]\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)', 0.98, 'inline'),
+        (r'T[VU][AR]\s+([A-D])\s+\\?(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)', 0.97, 'inline'),
+        (r'IVA\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)', 0.95, 'inline'),
+
+        # === FORMAT 2: REVERSED (Stepout-style) ===
+        # "5.00% TUA*B" - percentage BEFORE the TVA marker
+        (r'(\d{1,2})[.,]\d{0,2}\s*%\s*T[UV]A\*?([A-D])', 0.97, 'reversed'),
+
+        # === FORMAT 3: TABLE (OMV-style) ===
+        # "A-21,00%  285,66  49,58" (code-percent, base amount, VAT amount)
+        (r'([A-D])\s*[-:]\s*(\d{1,2})[.,\s]*\d{0,2}\s*%\s+([\d.,\s]+)\s+([\d.,\s]+)', 0.96, 'table'),
+        (r'TOTAL\s+TAXE\s*:?\s*([\d.,\s]+)', 0.95, 'taxe'),
+
+        # === FORMAT 4: MULTILINE (Brick/Electrobering) ===
+        # "TOTAL TVA A - 19%" on one line, amount on the next line
+        (r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%', 0.96, 'multiline'),
+        (r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%', 0.95, 'multiline'),
+        (r'TOTAL\s+T[VU][AR]\s*([A-D])?\s*[-:]?\s*(\d{1,2})\s*%', 0.94, 'multiline'),
+
+        # === FORMAT 5: STANDARD (from the extractor) ===
+        (r'TOTAL\s+(?:T[VU][AR]|IVA)\s+BON\s*:?\s*([\d\s.,]+)', 0.98, 'bon'),
+        (r'T[O0]TAL\s+(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.95, 'standard'),
+        (r'TOTAL\s+IVA\s*:?\s*([\d\s.,]+)', 0.95, 'standard'),
+        (r'T[VU][AR]\s+(?:A\s*[-:]?\s*)?(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.95, 'percent'),
+        (r'T[VU][AR]\s+[A-Z]\s*[-:]\s*(\d{1,2})\s*%\s*([\d\s.,]+)', 0.93, 'percent'),
+        (r'T[VU][AR]\s*[C5]\s*[-:]\s*5\s*%\s*:?\s*([\d\s.,]+)', 0.93, 'books'),
+        (r'(?:T[VU][AR]|IVA)\s+5\s*%\s*:?\s*([\d\s.,]+)', 0.92, 'books'),
+
+        # === FORMAT 6: CODED inline (with code A-D) ===
+        (r'T[UV]A\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,\s]+)', 0.95, 'coded'),
+        (r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,\s]+)', 0.93, 'coded'),
+
+        # === FALLBACK patterns ===
+        (r'T[O0]T[AE]L\s+(?:T[VUAI]+[AR]?|IVA)\s*:?\s*([\d\s.,]+)', 0.88, 'fallback'),
+        (r'TOTAL\s+TA\s*[F\s]?\s*\d*\s*:?\s*([\d\s.,]+)', 0.85, 'fallback'),
+        (r'(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.85, 'fallback'),
+        (r'TOTAL\s+T[UV]A\s*:?\s*([\d.,\s]+)', 0.90, 'standard'),
+    ]
+
+    # Non-VAT payer patterns - NEPLATITOR DE TVA
+    # OCR variants: NEPLATTOR, NEPLATITOR, NEPLATOR, ANEPLATHTOR, MEPLATITOR
+    NON_VAT_PATTERNS = [
+        r'NEPLAT\w*OR',           # NEPLATITOR, NEPLATTOR, NEPLATOR
+        r'[ANM]EPLAT\w*O?R',      # OCR errors: ANEPLATHTOR, MEPLATITOR
+        r'TOTAL\s+NEPLAT',        # TOTAL NEPLATITOR...
+        r'TOTAL\s+[ANM]EPLAT',    # TOTAL ANEPLAT... (OCR error)
+        r'SCUTIT\s*(?:DE\s+)?T[VU]A',  # SCUTIT DE TVA
+        r'NEPLAT\w*\s+T[VU]A',    # NEPLATITOR TVA
+        r'NEPLAT\w*\s+DE\s+T',    # NEPLATITOR DE T... (truncated)
+    ]
+
+    # CUI (fiscal code) patterns - VENDOR CUI (exclude CLIENT)
+    # OCR errors: R0 instead of RO, C1F instead of CIF
+    CUI_PATTERNS = [
+        # CIF at start of line (definitely vendor)
+        (r'^CIF\s*:?\s*(R[O0]?\d{6,10})', 0.98),
+        (r'^CIF\s*:?\s*(\d{6,10})', 0.97),
+        (r'^C[I1]F\s*:?\s*(R[O0]?\d{6,10})', 0.95),
+        (r'^C[I1]F\s*:?\s*(\d{6,10})', 0.94),
+        # CIF not preceded by CLIENT (negative lookbehind)
+        (r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(R[O0]?\d{6,10})', 0.95),
+        (r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(\d{6,10})', 0.94),
+        # Standalone CIF with word boundary
+        (r'\bC[I1]F\s*:?\s*(R[O0]?\d{6,10})\b', 0.90),
+        (r'\bC[I1]F\s*:?\s*(\d{6,10})\b', 0.89),
+        # COD FISCAL (vendor)
+        (r'COD\s+FISCAL\s*:?\s*(R[O0]?\d{6,10})', 0.90),
+        (r'COD\s+FISCAL\s*:?\s*(\d{6,10})', 0.89),
+        # C. I. F. format with SPACES (OCR artifact)
+        (r'C\.\s*I\.\s*F\.?\s*[:\s]+(R[O0]?\d{6,10})', 0.92),
+        (r'C\.\s*I\.\s*F\.?\s*[:\s]+(\d{6,10})', 0.91),
+        # C.I.F. format (with dots, no spaces)
+        (r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.88),
+        (r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(\d{6,10})', 0.87),
+        # CUI format
+        (r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.85),
+        (r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(\d{6,10})', 0.84),
+        # Lidl format: "Cod Identificare fiscala" (OCR corrupted)
+        (r'[IC](?:od|ed)\s*Identific[a-z/]*\s*(R[O0]\d{6,10})', 0.90),
+        # Generic: anything with "fiscal" followed by RO + digits
+        (r'fiscal[a-z]*\s*:?\s*(R[O0]\d{6,10})', 0.85),
+    ]
+
+    # CUI REVERSED format (number BEFORE label)
+    CUI_REVERSED_PATTERNS = [
+        (r'(R[O0]\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.98),
+        (r'(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.95),
+    ]
+
+    # Items count patterns - NR POZ ART IN BON
+    ITEMS_COUNT_PATTERNS = [
+        (r'NR\.?\s*P?[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*(\d+)', 0.98),
+        (r'(\d{1,2})\s*\n\s*[O0]Z\.?\s*ART', 0.95),
+        (r'[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.93),
+        (r'NR\.?\s*(?:P?[O0]Z\.?)?\s*ART(?:ICOLE)?\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.90),
+        (r'ARTIC[O0]LE\s*:?\s*(\d+)', 0.88),
+        (r'(?:^|\s|:)P?[O0]Z\.?\s*(?:ART)?\.?\s*(?:IN\s+BON)?\s*:?\s*(\d{1,3})(?:\s|$)', 0.85),
+    ]
+
+    # Series patterns - Romanian fiscal receipt series
+    SERIES_PATTERNS = [
+        (r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90),
+        (r'(?:^|\s)Z\s*:\s*(\d{4})', 0.85),
+        (r'(?:^|\s)BF\s*:\s*(\d{4})', 0.85),
+    ]
+
    # -------------------------------------------------------------------------
    # Extraction methods - override in subclasses as needed
    # -------------------------------------------------------------------------

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
-        Extract TVA entries from receipt text.
+        Extract TVA entries from receipt text - GENERIC implementation.

-        Override this method in subclasses to handle store-specific TVA formats.
+        Handles ALL formats:
+        - Multi-rate inline (Lidl): "TVA A 21% 7.71"
+        - Reversed (Stepout): "5.00% TUA*B"
+        - Table (OMV): "A-21,00% 285,66 49,58"
+        - Multiline: "TOTAL TVA A - 19%" + amount on next line
+        - Non-VAT payers: Returns empty list

        Args:
            text: Raw OCR text from receipt
@@ -174,12 +348,252 @@ class BaseStoreProfile(ABC):
        Returns:
            List of dicts with keys: code, percent, amount
        """
-        return []
+        entries = []
+        text_upper = text.upper()
+
+        # Step 1: Check for known non-VAT payer (by class flag or text detection)
+        if self.IS_NON_VAT_PAYER or self._is_non_vat_payer(text_upper):
+            return []  # No TVA entries for non-VAT payers
+
+        # Step 2: Collapse whitespace after a decimal separator ("12, 34" -> "12.34"); note \s+ also spans newlines
+        normalized = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text_upper)
+        lines = normalized.split('\n')
+
+        # Step 3: Try all formats, collect candidates
+        candidates = []
+
+        # Try inline multi-rate (Lidl-style)
+        candidates.extend(self._try_tva_inline(normalized))
+
+        # Try reversed format (Stepout-style)
+        candidates.extend(self._try_tva_reversed(normalized, lines))
+
+        # Try multiline format (Brick/Electrobering)
+        candidates.extend(self._try_tva_multiline(normalized, lines))
+
+        # Try table format (OMV-style)
+        candidates.extend(self._try_tva_table(normalized))
+
+        # Try standard/fallback patterns only when no other format produced a candidate
+        if not candidates:
+            candidates.extend(self._try_tva_standard(normalized))
+
+        # Step 4: Keep the first positive-amount entry per (code, percent) pair and return
+        seen = set()
+        for entry in candidates:
+            key = (entry.get('code', 'A'), entry.get('percent', 19))
+            if key not in seen and entry.get('amount') and entry['amount'] > 0:
+                entries.append(entry)
+                seen.add(key)
+
+        return entries
+
+    def _is_non_vat_payer(self, text: str) -> bool:
+        """Return True when any NON_VAT_PATTERNS regex matches ``text`` (case-insensitive search)."""
+        for pattern in self.NON_VAT_PATTERNS:
+            if re.search(pattern, text, re.IGNORECASE):
+                return True  # first hit is enough; remaining patterns are not tried
+        return False
+
+    def _try_tva_inline(self, text: str) -> List[dict]:
+        """Collect {'code', 'percent', 'amount'} dicts from every 'inline' TVA_PATTERNS match, e.g. 'TVA A 21,00% 7.71'"""
+        entries = []
+        # Only rows tagged 'inline' are used; `confidence` is unpacked but not read. Rows may overlap, so duplicates are possible.
+        for pattern, confidence, fmt in self.TVA_PATTERNS:
+            if fmt != 'inline':
+                continue
+            for match in re.finditer(pattern, text, re.IGNORECASE):
+                try:
+                    groups = match.groups()
+                    if len(groups) >= 3:  # expected layout: (code, percent, amount)
+                        code = groups[0].upper() if groups[0] else 'A'  # falls back to code 'A' when the group is empty
+                        percent = int(groups[1])  # integer part of the rate only
+                        amount = self._parse_decimal(self._clean_ocr_number(groups[2]))
+                        if amount and amount > 0:  # drop falsy or non-positive amounts
+                            entries.append({
+                                'code': code,
+                                'percent': percent,
+                                'amount': amount
+                            })
+                except (ValueError, InvalidOperation, IndexError):
+                    continue  # skip this match and keep scanning
+        return entries
+
+    def _try_tva_reversed(self, text: str, lines: List[str]) -> List[dict]:
+        """Scan ``lines`` for the reversed format '5.00% TUA*B 2.00' (rate BEFORE the TVA marker); ``text`` is not read."""
+        entries = []
+        # Uses its own regexes (not the 'reversed' row of TVA_PATTERNS): percent BEFORE TVA, amount on the same line or the next
+        for i, line in enumerate(lines):
+            # Try pattern with amount on SAME line: "5.00% TUA*B        2.00"
+            match = re.search(
+                r'(\d{1,2})[.,]?\d{0,2}\s*%\s*T[UV][AR]\s*\*?\s*([A-D])?\s+([\d\s.,]+)',
+                line, re.IGNORECASE
+            )
+            if match:
+                try:
+                    percent = int(match.group(1))  # integer part of the rate only
+                    code = match.group(2).upper() if match.group(2) else 'A'  # code letter is optional; default 'A'
+                    amount_str = match.group(3).strip()
+                    amount = self._parse_decimal(amount_str)  # NOTE(review): not passed through _clean_ocr_number - confirm
+                    if amount and amount > 0:
+                        entries.append({
+                            'code': code,
+                            'percent': percent,
+                            'amount': amount
+                        })
+                        continue  # entry recorded; skip the next-line fallback and move on to the following line
+                except (ValueError, InvalidOperation, IndexError):
+                    pass  # fall through to the next-line fallback below
+
+            # Fallback: marker ends the line, amount expected on the NEXT line
+            match = re.search(r'(\d{1,2})[.,]?\d{0,2}\s*%\s*T[UV][AR]\s*\*?\s*([A-D])?$', line, re.IGNORECASE)
+            if match:
+                try:
+                    percent = int(match.group(1))
+                    code = match.group(2).upper() if match.group(2) else 'A'
+                    if i + 1 < len(lines):  # nothing to read when the marker is on the last line
+                        amount_str = lines[i + 1].strip()  # the whole next line is parsed as the amount
+                        amount = self._parse_decimal(amount_str)
+                        if amount and amount > 0:
+                            entries.append({
+                                'code': code,
+                                'percent': percent,
+                                'amount': amount
+                            })
+                except (ValueError, InvalidOperation, IndexError):
+                    continue  # last statement of the loop body; proceeds to the next line
+        return entries
+
+    def _try_tva_multiline(self, text: str, lines: List[str]) -> List[dict]:
+        """Scan ``lines`` for 'TOTAL TVA A - 19%' with the amount on the next line; returns at most one entry (``text`` is not read)."""
+        entries = []
+        # Local pattern list (the 'multiline' rows of TVA_PATTERNS are not consulted); the last one tolerates a truncated "TOTAL"
+        multiline_patterns = [
+            r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%',
+            r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%',
+            r'TOTAL\s+T[VU][AR]\s*([A-D])?\s*[-:]?\s*(\d{1,2})\s*%',
+            r'O?TAL\s+[IT]VA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%',
+        ]
+        for i, line in enumerate(lines):
+            for pattern in multiline_patterns:
+                match = re.search(pattern, line, re.IGNORECASE)
+                if match:
+                    try:
+                        code = match.group(1).upper() if match.group(1) else 'A'  # code is optional in the third pattern
+                        percent = int(match.group(2))
+                        # The whole next line (stripped) is parsed as the amount
+                        if i + 1 < len(lines):
+                            amount_str = lines[i + 1].strip()
+                            amount = self._parse_decimal(amount_str)
+                            if amount and amount > 0:
+                                entries.append({
+                                    'code': code,
+                                    'percent': percent,
+                                    'amount': amount
+                                })
+                                return entries  # NOTE(review): stops at the first hit, so later rates are never collected - confirm
+                    except (ValueError, InvalidOperation, IndexError):
+                        continue  # try the next pattern on the same line
+        return entries
+
+    def _try_tva_table(self, text: str) -> List[dict]:
+        """Parse table rows like 'A-21,00% 285,66 49,58'; if none yield an entry, fall back to a single 'TOTAL TAXE' amount."""
+        entries = []
+        # Pattern: "A-21,00%  285,66  49,58" (code-percent base_amount tva_amount); group 3 (base) is captured but unused
+        table_pattern = r'([A-D])\s*[-:]\s*(\d{1,2})[.,\s]*\d{0,2}\s*%\s+([\d.,\s]+)\s+([\d.,\s]+)'
+        for match in re.finditer(table_pattern, text, re.IGNORECASE):
+            try:
+                code = match.group(1).upper()
+                percent = int(match.group(2))  # integer part of the rate only
+                # Group 4 is the TVA amount (last column in table)
+                tva_amount_str = self._clean_ocr_number(match.group(4))
+                tva_amount = self._parse_decimal(tva_amount_str)
+                if tva_amount and tva_amount > 0:
+                    entries.append({
+                        'code': code,
+                        'percent': percent,
+                        'amount': tva_amount
+                    })
+            except (ValueError, InvalidOperation, IndexError):
+                continue  # skip this row and keep scanning
+
+        # Fallback, only when no table row produced an entry: "TOTAL TAXE: 55,22"
+        if not entries:
+            taxe_match = re.search(r'TOTAL\s+TAXE\s*:?\s*([\d.,\s]+)', text, re.IGNORECASE)
+            if taxe_match:
+                try:
+                    amount_str = self._clean_ocr_number(taxe_match.group(1))
+                    amount = self._parse_decimal(amount_str)
+                    if amount and amount > 0:
+                        entries.append({
+                            'code': 'A',
+                            'percent': 19,  # hard-coded: the TOTAL TAXE line carries no code or rate. NOTE(review): confirm 19 is intended
+                            'amount': amount
+                        })
+                except (ValueError, InvalidOperation):
+                    pass  # unparsable total: return the empty list
+        return entries
+
+    def _try_tva_standard(self, text: str) -> List[dict]:
+        """Try the non-inline TVA_PATTERNS rows in list order and return the first positive amount as a one-entry list."""
+        entries = []
+        standard_fmts = ['standard', 'bon', 'percent', 'coded', 'fallback', 'books']  # format tags handled by this method
+        for pattern, confidence, fmt in self.TVA_PATTERNS:  # `confidence` is unpacked but not read
+            if fmt not in standard_fmts:
+                continue
+            match = re.search(pattern, text, re.IGNORECASE)  # only the first occurrence per pattern is examined
+            if match:
+                try:
+                    groups = match.groups()
+                    if len(groups) >= 2:
+                        # Could be (percent, amount) or (code, percent, amount); a letter in group 1 selects the latter
+                        if groups[0] and groups[0].isalpha():
+                            code = groups[0].upper()
+                            percent = int(groups[1]) if len(groups) > 1 else 19  # guard is always true inside this branch
+                            amount_str = groups[2] if len(groups) > 2 else None
+                        else:
+                            code = 'A'
+                            percent = int(groups[0]) if groups[0] and groups[0].isdigit() else 19  # 19 when no numeric rate was captured
+                            amount_str = groups[1] if len(groups) > 1 else groups[0]  # guard is always true inside this branch
+                        if amount_str:
+                            amount = self._parse_decimal(self._clean_ocr_number(amount_str))
+                            if amount and amount > 0:
+                                entries.append({
+                                    'code': code,
+                                    'percent': percent,
+                                    'amount': amount
+                                })
+                                return entries  # first usable match wins
+                    elif len(groups) == 1:
+                        # Amount-only pattern: code and rate default to 'A' / 19
+                        amount = self._parse_decimal(self._clean_ocr_number(groups[0]))
+                        if amount and amount > 0:
+                            entries.append({
+                                'code': 'A',
+                                'percent': 19,
+                                'amount': amount
+                            })
+                            return entries  # first usable match wins
+                except (ValueError, InvalidOperation, IndexError):
+                    continue  # move on to the next pattern
+        return entries
+
+    def _clean_ocr_number(self, value: str) -> str:
+        """Strip OCR whitespace from a number string (e.g., '55, 22' -> '55,22'); a falsy input yields ''."""
+        if not value:
+            return ""
+        value = re.sub(r'\s*([.,])\s*', r'\1', value)  # drop any whitespace around '.' and ','
+        value = value.replace(' ', '')  # drop remaining spaces; NOTE(review): this fuses space-separated numbers - confirm
+        return value

    def extract_total(self, text: str) -> Tuple[Optional[Decimal], float]:
        """
        Extract total amount from receipt text.

+        Supports both single-line and multiline formats:
+        - Single line: "TOTAL: 78.00", "SUMA TOTALA: 78.00"
+        - Multiline: "SUMA\nTOTALA:\n78.00" (common in thermal receipts)
+
        Args:
            text: Raw OCR text from receipt

@@ -187,7 +601,54 @@ class BaseStoreProfile(ABC):
            Tuple of (amount, confidence) or (None, 0.0)
        """
        text_upper = text.upper()
+        lines = text_upper.split('\n')

+        # =====================================================================
+        # STRATEGY 1: Multiline "SUMA TOTALA" pattern (thermal receipts)
+        # Format: SUMA on one line, TOTALA: on next, amount on third
+        # =====================================================================
+        for i, line in enumerate(lines):
+            line_clean = line.strip()
+
+            # Check for "SUMA"/"SUMĂ"/"SUM " (NOTE(review): [UU] is just U, so OCR variants SUNA/SUHA are not matched)
+            if re.search(r'S[UU]M[AĂ\s]', line_clean):
+                # Scan this line and the next 3 lines for "TOTALA" and amount
+                for j in range(i, min(i + 4, len(lines))):
+                    check_line = lines[j].strip()
+
+                    # Check for "TOTALA:" or "TOTALA -" followed by amount
+                    match = re.search(r'T[O0]TALA\s*[:\-]\s*([\d\s.,]+)', check_line)
+                    if match:
+                        amount = self._parse_decimal(self._clean_ocr_number(match.group(1)))
+                        if amount and amount > 0 and amount < self.MAX_PAYMENT:
+                            return (amount, 0.98)
+
+                    # Check for "TOTALA" without amount, amount on next line
+                    if re.search(r'T[O0]TALA\s*[:\-]?\s*$', check_line):
+                        if j + 1 < len(lines):
+                            amount_line = lines[j + 1].strip()
+                            amount = self._parse_decimal(amount_line)
+                            if amount and amount > 0 and amount < self.MAX_PAYMENT:
+                                return (amount, 0.97)
+
+            # Check for "SUMA TOTALA" on single line with amount
+            match = re.search(r'S[UU]M[AĂ]\s+T[O0]TALA\s*[:\-]\s*([\d\s.,]+)', line_clean)
+            if match:
+                amount = self._parse_decimal(self._clean_ocr_number(match.group(1)))
+                if amount and amount > 0 and amount < self.MAX_PAYMENT:
+                    return (amount, 0.98)
+
+            # Check for "SUMA TOTALA" without amount, amount on next line
+            if re.search(r'S[UU]M[AĂ]\s+T[O0]TALA\s*[:\-]?\s*$', line_clean):
+                if i + 1 < len(lines):
+                    next_line = lines[i + 1].strip()
+                    amount = self._parse_decimal(next_line)
+                    if amount and amount > 0 and amount < self.MAX_PAYMENT:
+                        return (amount, 0.96)
+
+        # =====================================================================
+        # STRATEGY 2: Standard single-line patterns
+        # =====================================================================
        for pattern, confidence in self.TOTAL_PATTERNS:
            match = re.search(pattern, text_upper)
            if match:
@@ -259,28 +720,76 @@ class BaseStoreProfile(ABC):
        """
        Extract payment methods (CARD/NUMERAR) from receipt.

-        Supports multiple payments of the same type (e.g., 2x CARD for split payments).
-        Each payment is returned as a separate entry with its amount.
+        Supports:
+        - Multiline patterns: "CARD\n78.00" (common in thermal receipts)
+        - Multiple payments (split CARD + NUMERAR)
+        - REST (change) detection to calculate actual CARD amount
+        - Keyword-only CARD/NUMERAR that infers from total
+        - Fallback for fiscal receipts without explicit payment

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of dicts: [{'method': 'CARD'/'NUMERAR', 'amount': Decimal, 'confidence': float}]
-            Multiple entries of same method type are allowed for split payments.
        """
        text_upper = text.upper()
+        lines = text_upper.split('\n')
        methods = []
-        # Track (method, amount) pairs to avoid exact duplicates from overlapping patterns
        seen_entries = set()

+        # =====================================================================
+        # STEP 0: Try MULTILINE patterns first (thermal receipts)
+        # Format: "CARD" on one line, amount on next line
+        # =====================================================================
+        for i, line in enumerate(lines):
+            line_clean = line.strip()
+
+            # Standalone CARD keyword (not part of MASTERCARD, etc.)
+            if re.match(r'^CARD\s*$', line_clean):
+                if i + 1 < len(lines):
+                    next_line = lines[i + 1].strip()
+                    # Must be a valid amount (not another keyword)
+                    if re.match(r'^[\d\s.,]+$', next_line):
+                        amount = self._parse_decimal(next_line)
+                        if amount and amount > 0 and amount < self.MAX_PAYMENT:
+                            entry_key = ('CARD', amount)
+                            if entry_key not in seen_entries:
+                                methods.append({
+                                    'method': 'CARD',
+                                    'amount': amount,
+                                    'confidence': 0.95
+                                })
+                                seen_entries.add(entry_key)
+
+            # Standalone NUMERAR keyword
+            if re.match(r'^NUMERAR\s*$', line_clean):
+                if i + 1 < len(lines):
+                    next_line = lines[i + 1].strip()
+                    if re.match(r'^[\d\s.,]+$', next_line):
+                        amount = self._parse_decimal(next_line)
+                        if amount and amount > 0 and amount < self.MAX_PAYMENT:
+                            entry_key = ('NUMERAR', amount)
+                            if entry_key not in seen_entries:
+                                methods.append({
+                                    'method': 'NUMERAR',
+                                    'amount': amount,
+                                    'confidence': 0.95
+                                })
+                                seen_entries.add(entry_key)
+
+        # If multiline extraction found methods, return them
+        if methods:
+            return methods
+
+        # =====================================================================
+        # STEP 1: Try pattern-based extraction with explicit amounts
+        # =====================================================================
        for pattern, method, confidence in self.PAYMENT_PATTERNS:
            for match in re.finditer(pattern, text_upper):
                try:
                    amount = self._parse_decimal(match.group(1))
                    if amount and amount > 0 and amount < self.MAX_PAYMENT:
-                        # Deduplicate by (method, amount) to avoid same entry from multiple patterns
-                        # But allow different amounts for same method (split payments)
                        entry_key = (method, amount)
                        if entry_key not in seen_entries:
                            methods.append({
@@ -292,6 +801,70 @@ class BaseStoreProfile(ABC):
                except (ValueError, InvalidOperation):
                    continue

+        # If we found explicit amounts, we're done
+        if methods:
+            return methods
+
+        # Step 2: Try keyword-only detection with REST logic
+        # Get total amount for inference
+        total_amount, _ = self.extract_total(text)
+        if not total_amount:
+            return []
+
+        # Check for payment keywords
+        has_card = any(kw in text_upper for kw in ['CARD', 'CARTE CREDIT', 'VISA', 'MASTERCARD', 'DEBIT', 'CREDIT', 'CONTACTLESS'])
+        has_numerar = any(kw in text_upper for kw in ['NUMERAR', 'CASH'])
+
+        # Find REST (change) amount; NOTE(review): the substring test also hits words like "RESTAURANT" - confirm intent
+        rest_amount = Decimal('0')
+        for i, line in enumerate(lines):
+            if 'REST' in line:
+                # REST on same line: "REST 0.00" or "REST: 0.00"
+                match = re.search(r'REST\s*:?\s*([\d.,]+)', line)
+                if match:
+                    rest_amount = self._parse_decimal(match.group(1)) or Decimal('0')
+                elif i + 1 < len(lines):
+                    # REST on separate line
+                    rest_amount = self._parse_decimal(lines[i + 1].strip()) or Decimal('0')
+                break
+
+        # Calculate payment amounts
+        if has_card:
+            card_amount = total_amount - rest_amount
+            if card_amount > 0:
+                methods.append({
+                    'method': 'CARD',
+                    'amount': card_amount,
+                    'confidence': 0.90
+                })
+
+        if has_numerar:
+            if has_card and rest_amount > 0:
+                # Mixed payment: NUMERAR is the change given back
+                methods.append({
+                    'method': 'NUMERAR',
+                    'amount': rest_amount,
+                    'confidence': 0.85
+                })
+            elif not has_card:
+                # Cash only
+                methods.append({
+                    'method': 'NUMERAR',
+                    'amount': total_amount,
+                    'confidence': 0.90
+                })
+
+        # Step 3: Fallback for fiscal receipts without explicit payment
+        if not methods and total_amount and total_amount > 0:
+            is_fiscal = 'BON FISCAL' in text_upper or 'FISCAL' in text_upper
+            if is_fiscal:
+                # Default to CARD for business purchases (most common)
+                methods.append({
+                    'method': 'CARD',
+                    'amount': total_amount,
+                    'confidence': 0.70
+                })
+
        return methods

    def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]: