"""
Base class for store-specific OCR extraction profiles.

Each store can have different receipt formats (TVA layout, total position, etc.).
Store profiles allow customizing extraction logic per-store for better accuracy.

Usage:
    from .base import BaseStoreProfile
    from . import ProfileRegistry

    @ProfileRegistry.register
    class LidlProfile(BaseStoreProfile):
        CUI_LIST = ["22891860"]
        NAME_PATTERNS = ["LIDL", "LDL"]

        def extract_tva_entries(self, text: str) -> Tuple[List[dict], float]:
            # Custom Lidl TVA extraction logic
            # Returns (entries_list, confidence_score)
            ...
"""

import re
from abc import ABC
from decimal import Decimal, InvalidOperation
from typing import List, Optional, Tuple, Dict, Any
from datetime import date


class BaseStoreProfile(ABC):
    """
    Abstract base class for store-specific extraction profiles.

    Each profile defines:
    - CUI_LIST: CUI codes that identify this store (without RO prefix)
    - NAME_PATTERNS: OCR-tolerant name patterns for fallback matching
    - Custom extraction methods for TVA, total, date, etc.

    The ProfileRegistry uses CUI_LIST to lookup profiles during extraction.
    """

    # -------------------------------------------------------------------------
    # Class attributes - override in subclasses
    # -------------------------------------------------------------------------

    # List of CUI codes (without RO prefix) that identify this store
    CUI_LIST: List[str] = []

    # OCR-tolerant name patterns for fallback matching
    NAME_PATTERNS: List[str] = []

    # Store display name
    STORE_NAME: str = "Unknown Store"

    # Flag for known non-VAT payer stores (skips TVA extraction)
    IS_NON_VAT_PAYER: bool = False

    # -------------------------------------------------------------------------
    # Generic patterns - can be overridden in subclasses
    # -------------------------------------------------------------------------

    # Total amount patterns (confidence-weighted)
    TOTAL_PATTERNS = [
        (r'T[O0]TAL[.\s]+L[E3][I1!]\s*:?\s*([\d\s.,]+)', 0.98),
        (r'TOTAL\s+LEI\s*([\d\s.,]+)', 0.98),
        (r'[OT]?OTAL\s+LEI\s*([\d\s.,]+)', 0.95),
        (r'TOTAL\s*:?\s*([\d\s.,]+)\s*(?:RON|LEI)?', 0.95),
        (r'TOTAL\s+(?:RON|LEI)\s*([\d\s.,]+)', 0.95),
        (r'SUBTOTAL\s*([\d\s.,]+)', 0.90),
        (r'DE\s+PLATA\s*:?\s*([\d\s.,]+)', 0.90),
        (r'SUMA\s*:?\s*([\d\s.,]+)', 0.85),
    ]

    # Date patterns (confidence-weighted)
    DATE_PATTERNS = [
        (r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
        (r'DATA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
        (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+[O0]RA\s*:?\s*\d{2}:\d{2}', 0.95),
        (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+\d{2}:\d{2}', 0.90),
        (r'(\d{2}[-./]\d{2}[-./]\d{4})', 0.80),
        (r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75),
    ]

    # Date patterns with OCR-introduced spaces (separate because format is different)
    DATE_PATTERNS_OCR_SPACES = [
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.92, 'ymd'),
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.85, 'ymd'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
    ]

    # Receipt number patterns (confidence-weighted)
    NUMBER_PATTERNS = [
        (r'NDS\s*:?\s*(\d+)', 0.98),
        (r'C3POS[-A-Z0-9]*[N:](\d{6,7})', 0.98),
        (r'C3POS.*?(\d{6,7})\b', 0.95),
        (r'BF\s*:\s*(\d{4,})', 0.96),
        (r'BF\s+(\d{4,})', 0.93),
        (r'NIVS\s*:?\s*(\d+)', 0.95),
        (r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95),
        (r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95),
        (r'CHITANTA\s+NR\.?\s*:?\s*(\d+)', 0.95),
        (r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90),
        (r'ID\s*BF\s*:?\s*(\d+)', 0.90),
    ]

    # Payment method patterns (pattern, method_type, confidence)
    # Handles ALL payment types: CARD, NUMERAR, and card brand names
    PAYMENT_PATTERNS = [
        # CARTE CREDIT variants (OMV/Petrom/Socar receipts)
        (r'CARTE\s+CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.98),
        (r'CARTE\s+CREDIT\s*:?\s*\n\s*([\d\s.,]+)', 'CARD', 0.97),
        (r'CARTE\s+DE\s+CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.98),
        (r'CARTE\s+DE\s+CREDIT\s*:?\s*\n\s*([\d\s.,]+)', 'CARD', 0.97),
        # CARD standard
        (r'(?:PLATA\s+)?CARD\s*[:\sA-Z]?\s*([\d\s.,]+)', 'CARD', 0.95),
        # Card brand names
        (r'VISA\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
        (r'MASTERCARD\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
        (r'MAESTR[O0]\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
        (r'CONTACTLESS\s*:?\s*([\d\s.,]+)', 'CARD', 0.90),
        (r'DEBIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.90),
        (r'CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.88),
        # Cash variants
        (r'NUMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.95),
        (r'CASH\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.90),
        # Truncation recovery patterns (for OCR left-margin issues)
        (r'(?:^|\n|\s)RD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.70),
        (r'(?:^|\n|\s)ARD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.75),
        (r'(?:^|\n|\s)MERAR\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'NUMERAR', 0.70),
    ]

    # Client section markers (for B2B receipts) - More flexible patterns
    # Includes OCR corruption variants (LIENT, C IENT, L IENT)
    CLIENT_MARKERS = [
        r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT',   # "CIF CLIENT" (with or without colon)
        r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT',   # "CUI CLIENT"
        r'CLIENT\s+C\.?\s*[UI1]\.?\s*[IF1]',  # "CLIENT CIF" / "CLIENT CUI"
        r'CLIENT\s*:',                        # "CLIENT:"
        r'CUMPARATOR\s*:',                    # "CUMPARATOR:"
        r'BENEFICIAR\s*:',                    # "BENEFICIAR:"
        r'CUMP[AĂ]R[AĂ]TOR',                  # "CUMPARATOR" without colon
        r'COD\s+FISCAL\s+CLIENT',             # "COD FISCAL CLIENT"
        # OCR corruption patterns
        r'C[I1]F\s+[A-Z\s]{0,6}IENT\s*:',     # "CIF a IENT:", "CIF CL IENT:", "CIF L IENT:"
        r'C[I1]F\s+LIENT\s*:',                # "CIF LIENT:" (missing C)
        r'LIENT\s*:',                         # "LIENT:" (missing C and I/L)
        # Brick-specific (I→L OCR error)
        r'CLIENT\s+C\.?U\.?[LI1]\.?\s*/',     # "CLIENT C.U.L./" (I read as L)
    ]

    # Client CUI patterns (pattern, confidence) - Comprehensive
    # Handles: docTR reordering, doubled letters, corruption, CUMPARATOR, Brick L/I swap
    CLIENT_CUI_PATTERNS = [
        # === CUI on line BEFORE CLIENT marker (docTR/OCR reordering) ===
        (r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*U\.?\s*[I1]\.?', 0.99),
        (r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*[I1]\.?\s*F\.?', 0.99),
        (r'(R[O0]\d{6,10})\s*:?\s*\n\s*CLIENT', 0.98),
        # === "CIF I CLIENT:" format (OCR extra chars) ===
        (r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]\d{6,10})', 0.98),
        (r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.97),
        # === CIF CLIENT: (reversed - CIF before CLIENT) ===
        (r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
        (r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
        (r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
        (r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
        # === CLIENT C.U.I/C.I.F. (slash variants) ===
        (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.97),
        (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.97),
        # === Doubled letters (docTR artifact: "C.U U.I") ===
        (r'CLIENT\s+C\.?\s*U\.?\s*U?\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
        (r'CLIENT\s+C\.?\s*U\.?\s*U?\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
        # === CLIENT C.U.I. or CLIENT CUI (without slash) ===
        (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
        (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
        (r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
        (r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
        # === Corrupted CLIENT after CIF (OCR errors) ===
        (r'CIF\s+[a-zA-Z\s]{2,8}IENT\s*:?\s*(R[O0]?\d{6,10})', 0.93),
        (r'CIF\s+[a-zA-Z\s]{2,8}IENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
        (r'CIF\s+LIENT\s*:?\s*(R[O0]?\d{6,10})', 0.92),
        (r'CIF\s+LIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.92),
        # === CUMPARATOR variants ===
        (r'CUMPARATOR\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
        (r'CUMPARATOR\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
        # CUMPARATOR with CUI/CIF on next line
        (r'CUMPARATOR\s*:.*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
        (r'CUMPARATOR\s*:.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
        # CUMPARATOR with CUI/CIF two lines down
        (r'CUMPARATOR\s*:.*\n.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
        # === CLIENT on next line ===
        (r'CLIENT\s*:\s*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
        (r'CLIENT\s*:.*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
        # === Standard fallback patterns ===
        (r'CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.90),
        (r'COD\s+FISCAL\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.95),
        # === Brick-specific (I→L OCR error) ===
        # Matches: "CLIENT C.U.L./C.IF. :R01879855"
        (r'CLIENT\s+C\.?U\.?[LI1]\.?\s*/\s*C\.?[LI1]\.?F\.?\s*:?\s*(R?O?\d{6,10})', 0.99),
    ]

    # Company type indicators (for identifying company names)
    COMPANY_INDICATORS = [
        r'\bS\.?\s*R\.?\s*L\.?\b',   # S.R.L. or S. R. L.
        r'\bS\.?\s*A\.?\b',          # S.A. or S. A.
        r'\bS\.?\s*N\.?\s*C\.?\b',   # S.N.C. or S. N. C.
        r'\bS\.?\s*C\.?\s*S\.?\b',   # S.C.S. or S. C. S.
        r'\bI\.?\s*I\.?\b',          # I.I. or I. I.
        r'\bP\.?\s*F\.?\s*A\.?\b',   # P.F.A. or P. F. A.
        r'\bS\.?\s*C\.?\s+[A-Z]',    # S.C. followed by company name
        r'HOLDING',
        r'COMPANY',
        r'GROUP',
    ]

    # Maximum reasonable payment amount (to filter OCR errors)
    MAX_PAYMENT = Decimal('100000')

    # -------------------------------------------------------------------------
    # TVA (VAT) patterns - ALL FORMATS unified
    # OCR tolerant: T[VU][AR] matches TVA, TUA, TVR
    # -------------------------------------------------------------------------
    TVA_PATTERNS = [
        # === FORMAT 1: INLINE with code and percent (Lidl-style) ===
        # "TVA A 21,00% 7.71" or "TVA B 11,00% 2.13"
        (r'T[VU][AR]\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)', 0.98, 'inline'),
        (r'T[VU][AR]\s+([A-D])\s+\\?(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)', 0.97, 'inline'),
        (r'IVA\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)', 0.95, 'inline'),
        # === FORMAT 2: REVERSED (Stepout-style) ===
        # "5.00% TUA*B" - percent BEFORE the TVA marker
        (r'(\d{1,2})[.,]\d{0,2}\s*%\s*T[UV]A\*?([A-D])', 0.97, 'reversed'),
        # === FORMAT 3: TABLE (OMV-style) ===
        # "A-21,00% 285,66 49,58" (code-percent base tva)
        (r'([A-D])\s*[-:]\s*(\d{1,2})[.,\s]*\d{0,2}\s*%\s+([\d.,\s]+)\s+([\d.,\s]+)', 0.96, 'table'),
        (r'TOTAL\s+TAXE\s*:?\s*([\d.,\s]+)', 0.95, 'taxe'),
        # === FORMAT 4: MULTILINE (Brick/Electrobering) ===
        # "TOTAL TVA A - 19%" on one line, amount on the next
        (r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%', 0.96, 'multiline'),
        (r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%', 0.95, 'multiline'),
        (r'TOTAL\s+T[VU][AR]\s*([A-D])?\s*[-:]?\s*(\d{1,2})\s*%', 0.94, 'multiline'),
        # === FORMAT 5: STANDARD (from the generic extractor) ===
        (r'TOTAL\s+(?:T[VU][AR]|IVA)\s+BON\s*:?\s*([\d\s.,]+)', 0.98, 'bon'),
        (r'T[O0]TAL\s+(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.95, 'standard'),
        (r'TOTAL\s+IVA\s*:?\s*([\d\s.,]+)', 0.95, 'standard'),
        (r'T[VU][AR]\s+(?:A\s*[-:]?\s*)?(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.95, 'percent'),
        (r'T[VU][AR]\s+[A-Z]\s*[-:]\s*(\d{1,2})\s*%\s*([\d\s.,]+)', 0.93, 'percent'),
        (r'T[VU][AR]\s*[C5]\s*[-:]\s*5\s*%\s*:?\s*([\d\s.,]+)', 0.93, 'books'),
        (r'(?:T[VU][AR]|IVA)\s+5\s*%\s*:?\s*([\d\s.,]+)', 0.92, 'books'),
        # === FORMAT 6: CODED inline (with code A-D) ===
        (r'T[UV]A\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,\s]+)', 0.95, 'coded'),
        (r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,\s]+)', 0.93, 'coded'),
        # === FALLBACK patterns ===
        (r'T[O0]T[AE]L\s+(?:T[VUAI]+[AR]?|IVA)\s*:?\s*([\d\s.,]+)', 0.88, 'fallback'),
        (r'TOTAL\s+TA\s*[F\s]?\s*\d*\s*:?\s*([\d\s.,]+)', 0.85, 'fallback'),
        (r'(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.85, 'fallback'),
        (r'TOTAL\s+T[UV]A\s*:?\s*([\d.,\s]+)', 0.90, 'standard'),
    ]

    # Non-VAT payer patterns - NEPLATITOR DE TVA
    # OCR variants: NEPLATTOR, NEPLATITOR, NEPLATOR, ANEPLATHTOR, MEPLATITOR
    NON_VAT_PATTERNS = [
        r'NEPLAT\w*OR',                # NEPLATITOR, NEPLATTOR, NEPLATOR
        r'[ANM]EPLAT\w*O?R',           # OCR errors: ANEPLATHTOR, MEPLATITOR
        r'TOTAL\s+NEPLAT',             # TOTAL NEPLATITOR...
        r'TOTAL\s+[ANM]EPLAT',         # TOTAL ANEPLAT... (OCR error)
        r'SCUTIT\s*(?:DE\s+)?T[VU]A',  # SCUTIT DE TVA
        r'NEPLAT\w*\s+T[VU]A',         # NEPLATITOR TVA
        r'NEPLAT\w*\s+DE\s+T',         # NEPLATITOR DE T... (truncated)
    ]

    # CUI (fiscal code) patterns - VENDOR CUI (exclude CLIENT)
    # OCR errors: R0 instead of RO, C1F instead of CIF
    CUI_PATTERNS = [
        # CIF at start of line (definitely vendor)
        (r'^CIF\s*:?\s*(R[O0]?\d{6,10})', 0.98),
        (r'^CIF\s*:?\s*(\d{6,10})', 0.97),
        (r'^C[I1]F\s*:?\s*(R[O0]?\d{6,10})', 0.95),
        (r'^C[I1]F\s*:?\s*(\d{6,10})', 0.94),
        # NOTE(review): the copy of this file that was reviewed is truncated
        # at this point. The original list continues with "CIF not preceded
        # by CLIENT" (negative lookbehind) entries that could not be
        # recovered; restore them from version control before merging.
    ]

    # -------------------------------------------------------------------------
    # Extraction methods - override in subclasses for store-specific formats
    # -------------------------------------------------------------------------

    def extract_tva_entries(self, text: str) -> Tuple[List[dict], float]:
        """
        Extract TVA entries from receipt text - GENERIC implementation.

        Handles ALL formats:
        - Multi-rate inline (Lidl): "TVA A 21% 7.71"
        - Reversed (Stepout): "5.00% TUA*B"
        - Table (OMV): "A-21,00% 285,66 49,58"
        - Multiline: "TOTAL TVA A - 19%" + amount on next line
        - Non-VAT payers: Returns empty list

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (entries, confidence). Each entry is a dict with keys
            ``code``, ``percent`` and ``amount``; confidence is the highest
            confidence among the formats that matched, or 0.0 when no entry
            survived.
        """
        text_upper = text.upper()

        # Step 1: Known non-VAT payer (class flag or text detection) -> no TVA
        if self.IS_NON_VAT_PAYER or self._is_non_vat_payer(text_upper):
            return ([], 0.0)

        # Step 2: Normalize OCR spaces inside numbers ("55, 22" -> "55.22")
        normalized = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text_upper)
        lines = normalized.split('\n')

        # Step 3: Try every specific format and pool the candidates
        candidates: List[dict] = []
        max_confidence = 0.0
        attempts = (
            self._try_tva_inline(normalized),              # Lidl-style
            self._try_tva_reversed(normalized, lines),     # Stepout-style
            self._try_tva_multiline(normalized, lines),    # Brick/Electrobering
            self._try_tva_table(normalized),               # OMV-style
        )
        for found, found_conf in attempts:
            candidates.extend(found)
            max_confidence = max(max_confidence, found_conf)

        # Generic patterns are only a fallback when nothing specific matched
        if not candidates:
            found, found_conf = self._try_tva_standard(normalized)
            candidates.extend(found)
            max_confidence = max(max_confidence, found_conf)

        # Step 4: Keep the first entry seen for each (code, percent) pair
        entries = []
        seen = set()
        for entry in candidates:
            key = (entry.get('code', 'A'), entry.get('percent', 19))
            if key not in seen and entry.get('amount') and entry['amount'] > 0:
                entries.append(entry)
                seen.add(key)

        return (entries, max_confidence if entries else 0.0)

    def _is_non_vat_payer(self, text: str) -> bool:
        """Return True when any NON_VAT_PATTERNS entry matches the text."""
        return any(
            re.search(pattern, text, re.IGNORECASE)
            for pattern in self.NON_VAT_PATTERNS
        )

    def _try_tva_inline(self, text: str) -> Tuple[List[dict], float]:
        """Try Lidl-style inline format: 'TVA A 21,00% 7.71'.

        Uses every TVA_PATTERNS entry tagged 'inline' and returns all matches
        (duplicates included; the caller deduplicates) together with the
        highest confidence among patterns that produced an entry.
        """
        entries = []
        max_confidence = 0.0

        for pattern, confidence, fmt in self.TVA_PATTERNS:
            if fmt != 'inline':
                continue
            for match in re.finditer(pattern, text, re.IGNORECASE):
                try:
                    groups = match.groups()
                    if len(groups) < 3:
                        continue
                    code = groups[0].upper() if groups[0] else 'A'
                    percent = int(groups[1])
                    amount = self._parse_decimal(self._clean_ocr_number(groups[2]))
                    if amount and amount > 0:
                        entries.append({
                            'code': code,
                            'percent': percent,
                            'amount': amount
                        })
                        max_confidence = max(max_confidence, confidence)
                except (ValueError, InvalidOperation, IndexError):
                    continue

        return (entries, max_confidence)

    def _try_tva_reversed(self, text: str, lines: List[str]) -> Tuple[List[dict], float]:
        """Try Stepout-style reversed format: '5.00% TUA*B 2.00' (rate BEFORE TVA marker).

        The amount may follow on the same line or sit alone on the next line.
        ``text`` is accepted for signature symmetry with the other helpers;
        only ``lines`` is scanned.
        """
        entries = []
        confidence = 0.97  # Default confidence for reversed format

        for i, line in enumerate(lines):
            # Amount on the SAME line: "5.00% TUA*B 2.00"
            match = re.search(
                r'(\d{1,2})[.,]?\d{0,2}\s*%\s*T[UV][AR]\s*\*?\s*([A-D])?\s+([\d\s.,]+)',
                line, re.IGNORECASE
            )
            if match:
                try:
                    percent = int(match.group(1))
                    code = match.group(2).upper() if match.group(2) else 'A'
                    amount_str = match.group(3).strip()
                    amount = self._parse_decimal(amount_str)
                    if amount and amount > 0:
                        entries.append({
                            'code': code,
                            'percent': percent,
                            'amount': amount
                        })
                        continue  # Check following lines for more entries
                except (ValueError, InvalidOperation, IndexError):
                    pass

            # Fallback: marker ends the line, amount on the NEXT line
            match = re.search(
                r'(\d{1,2})[.,]?\d{0,2}\s*%\s*T[UV][AR]\s*\*?\s*([A-D])?$',
                line, re.IGNORECASE
            )
            if match:
                try:
                    percent = int(match.group(1))
                    code = match.group(2).upper() if match.group(2) else 'A'
                    if i + 1 < len(lines):
                        amount_str = lines[i + 1].strip()
                        amount = self._parse_decimal(amount_str)
                        if amount and amount > 0:
                            entries.append({
                                'code': code,
                                'percent': percent,
                                'amount': amount
                            })
                except (ValueError, InvalidOperation, IndexError):
                    continue

        return (entries, confidence if entries else 0.0)

    def _try_tva_multiline(self, text: str, lines: List[str]) -> Tuple[List[dict], float]:
        """Try multiline format: 'TOTAL TVA A - 19%' + amount on next line.

        Returns as soon as the first valid entry is found. ``text`` is
        accepted for signature symmetry; only ``lines`` is scanned.
        """
        entries = []
        confidence = 0.95  # Default confidence for multiline format

        # "TOTAL TVA A - 19%" or "TOTAL TVA A 19%" on one line, amount on next
        multiline_patterns = [
            r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%',
            r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%',
            r'TOTAL\s+T[VU][AR]\s*([A-D])?\s*[-:]?\s*(\d{1,2})\s*%',
            r'O?TAL\s+[IT]VA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%',
        ]

        for i, line in enumerate(lines):
            for pattern in multiline_patterns:
                match = re.search(pattern, line, re.IGNORECASE)
                if not match:
                    continue
                try:
                    code = match.group(1).upper() if match.group(1) else 'A'
                    percent = int(match.group(2))
                    # Amount is on the next line
                    if i + 1 < len(lines):
                        amount_str = lines[i + 1].strip()
                        amount = self._parse_decimal(amount_str)
                        if amount and amount > 0:
                            entries.append({
                                'code': code,
                                'percent': percent,
                                'amount': amount
                            })
                            return (entries, confidence)
                except (ValueError, InvalidOperation, IndexError):
                    continue

        return (entries, 0.0)

    def _try_tva_table(self, text: str) -> Tuple[List[dict], float]:
        """Try OMV-style table format: 'A-21,00% 285,66 49,58'.

        Falls back to a single "TOTAL TAXE" amount (code 'A', rate 19, lower
        confidence) when no table row matched.
        """
        entries = []
        confidence = 0.96  # Default confidence for table format

        # "A-21,00% 285,66 49,58" (code-percent base_amount tva_amount)
        table_pattern = r'([A-D])\s*[-:]\s*(\d{1,2})[.,\s]*\d{0,2}\s*%\s+([\d.,\s]+)\s+([\d.,\s]+)'

        for match in re.finditer(table_pattern, text, re.IGNORECASE):
            try:
                code = match.group(1).upper()
                percent = int(match.group(2))
                # Group 4 is the TVA amount (last column in table)
                tva_amount_str = self._clean_ocr_number(match.group(4))
                tva_amount = self._parse_decimal(tva_amount_str)
                if tva_amount and tva_amount > 0:
                    entries.append({
                        'code': code,
                        'percent': percent,
                        'amount': tva_amount
                    })
            except (ValueError, InvalidOperation, IndexError):
                continue

        # Fallback: "TOTAL TAXE: 55,22"
        if not entries:
            taxe_match = re.search(r'TOTAL\s+TAXE\s*:?\s*([\d.,\s]+)', text, re.IGNORECASE)
            if taxe_match:
                try:
                    amount_str = self._clean_ocr_number(taxe_match.group(1))
                    amount = self._parse_decimal(amount_str)
                    if amount and amount > 0:
                        entries.append({
                            'code': 'A',
                            'percent': 19,  # Default rate
                            'amount': amount
                        })
                        confidence = 0.90  # Lower confidence for fallback
                except (ValueError, InvalidOperation):
                    pass

        return (entries, confidence if entries else 0.0)

    def _try_tva_standard(self, text: str) -> Tuple[List[dict], float]:
        """Try the generic TVA patterns as a last resort.

        Walks TVA_PATTERNS in order (restricted to the generic format tags)
        and returns the first pattern that yields a positive amount, as a
        one-element list with that pattern's confidence. Returns ([], 0.0)
        when nothing matched.
        """
        entries = []
        standard_fmts = ['standard', 'bon', 'percent', 'coded', 'fallback', 'books']

        for pattern, confidence, fmt in self.TVA_PATTERNS:
            if fmt not in standard_fmts:
                continue
            match = re.search(pattern, text, re.IGNORECASE)
            if not match:
                continue
            try:
                groups = match.groups()
                if len(groups) >= 2:
                    # Either (percent, amount) or (code, percent, amount)
                    if groups[0] and groups[0].isalpha():
                        code = groups[0].upper()
                        percent = int(groups[1])
                        amount_str = groups[2] if len(groups) > 2 else None
                    else:
                        code = 'A'
                        percent = int(groups[0]) if groups[0] and groups[0].isdigit() else 19
                        amount_str = groups[1]

                    if amount_str:
                        amount = self._parse_decimal(self._clean_ocr_number(amount_str))
                        if amount and amount > 0:
                            entries.append({
                                'code': code,
                                'percent': percent,
                                'amount': amount
                            })
                            return (entries, confidence)
                elif len(groups) == 1:
                    # Just an amount: assume code A at the default rate
                    amount = self._parse_decimal(self._clean_ocr_number(groups[0]))
                    if amount and amount > 0:
                        entries.append({
                            'code': 'A',
                            'percent': 19,
                            'amount': amount
                        })
                        return (entries, confidence)
            except (ValueError, InvalidOperation, IndexError):
                continue

        return (entries, 0.0)

    def _clean_ocr_number(self, value: str) -> str:
        """Remove OCR spaces from numbers (e.g., '55, 22' -> '55,22')."""
        if not value:
            return ""
        # Drop whitespace around separators first, then any remaining spaces
        value = re.sub(r'\s*([.,])\s*', r'\1', value)
        value = value.replace(' ', '')
        return value

    def extract_total(self, text: str) -> Tuple[Optional[Decimal], float]:
        """
        Extract total amount from receipt text.

        Supports both single-line and multiline formats:
        - Single line: "TOTAL: 78.00", "SUMA TOTALA: 78.00"
        - Multiline: "SUMA\\nTOTALA:\\n78.00" (common in thermal receipts)

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (amount, confidence) or (None, 0.0). Amounts must be
            positive and below MAX_PAYMENT to be accepted.
        """
        text_upper = text.upper()
        lines = text_upper.split('\n')

        # =====================================================================
        # STRATEGY 1: Multiline "SUMA TOTALA" pattern (thermal receipts)
        # Format: SUMA on one line, TOTALA: on next, amount on third
        # =====================================================================
        for i, line in enumerate(lines):
            line_clean = line.strip()

            # "SUMA" keyword; also accepts "SUM" followed by whitespace ("SUM A")
            if re.search(r'S[UU]M[AĂ\s]', line_clean):
                # Look at this line and the next 3 for "TOTALA" and the amount
                for j in range(i, min(i + 4, len(lines))):
                    check_line = lines[j].strip()

                    # "TOTALA:" or "TOTALA -" followed by the amount
                    match = re.search(r'T[O0]TALA\s*[:\-]\s*([\d\s.,]+)', check_line)
                    if match:
                        amount = self._parse_decimal(self._clean_ocr_number(match.group(1)))
                        if amount and amount > 0 and amount < self.MAX_PAYMENT:
                            return (amount, 0.98)

                    # "TOTALA" alone, amount on the following line
                    if re.search(r'T[O0]TALA\s*[:\-]?\s*$', check_line):
                        if j + 1 < len(lines):
                            amount_line = lines[j + 1].strip()
                            amount = self._parse_decimal(amount_line)
                            if amount and amount > 0 and amount < self.MAX_PAYMENT:
                                return (amount, 0.97)

            # "SUMA TOTALA" on a single line with the amount
            match = re.search(r'S[UU]M[AĂ]\s+T[O0]TALA\s*[:\-]\s*([\d\s.,]+)', line_clean)
            if match:
                amount = self._parse_decimal(self._clean_ocr_number(match.group(1)))
                if amount and amount > 0 and amount < self.MAX_PAYMENT:
                    return (amount, 0.98)

            # "SUMA TOTALA" without amount, amount on the next line
            if re.search(r'S[UU]M[AĂ]\s+T[O0]TALA\s*[:\-]?\s*$', line_clean):
                if i + 1 < len(lines):
                    next_line = lines[i + 1].strip()
                    amount = self._parse_decimal(next_line)
                    if amount and amount > 0 and amount < self.MAX_PAYMENT:
                        return (amount, 0.96)

        # =====================================================================
        # STRATEGY 2: Standard single-line patterns
        # =====================================================================
        for pattern, confidence in self.TOTAL_PATTERNS:
            match = re.search(pattern, text_upper)
            if match:
                amount = self._parse_decimal(match.group(1))
                if amount and amount > 0 and amount < self.MAX_PAYMENT:
                    return (amount, confidence)

        return (None, 0.0)

    def extract_date(self, text: str) -> Tuple[Optional[date], float]:
        """
        Extract receipt date from text.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (date, confidence) or (None, 0.0)
        """
        text_upper = text.upper()

        # Try standard patterns first
        for pattern, confidence in self.DATE_PATTERNS:
            match = re.search(pattern, text_upper)
            if match:
                parsed = self._parse_date(match.group(1))
                if parsed:
                    return (parsed, confidence)

        # Then patterns where OCR inserted spaces after the separators
        for pattern, confidence, fmt in self.DATE_PATTERNS_OCR_SPACES:
            match = re.search(pattern, text_upper)
            if not match:
                continue
            try:
                first, second, third = (int(match.group(n)) for n in (1, 2, 3))
                if fmt == 'ymd':
                    year, month, day = first, second, third
                else:  # dmy
                    day, month, year = first, second, third
                if 1 <= day <= 31 and 1 <= month <= 12 and 2000 <= year <= 2100:
                    # date() still rejects impossible days such as 31 February
                    return (date(year, month, day), confidence)
            except (ValueError, TypeError):
                continue

        return (None, 0.0)

    def extract_receipt_number(self, text: str) -> Tuple[Optional[str], float]:
        """
        Extract receipt number from text.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (number, confidence) or (None, 0.0). Numbers shorter
            than 3 characters are ignored.
        """
        text_upper = text.upper()

        for pattern, confidence in self.NUMBER_PATTERNS:
            match = re.search(pattern, text_upper)
            if match:
                number = match.group(1).strip()
                if number and len(number) >= 3:
                    return (number, confidence)

        return (None, 0.0)

    def extract_payment_methods(self, text: str) -> List[dict]:
        """
        Extract payment methods (CARD/NUMERAR) from receipt.

        Supports:
        - Multiline patterns: "CARD\\n78.00" (common in thermal receipts)
        - Multiple payments (split CARD + NUMERAR)
        - REST (change) detection to calculate actual CARD amount
        - Keyword-only CARD/NUMERAR that infers from total
        - Fallback for fiscal receipts without explicit payment

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of dicts:
            [{'method': 'CARD'/'NUMERAR', 'amount': Decimal, 'confidence': float}]
        """
        text_upper = text.upper()
        lines = text_upper.split('\n')
        methods = []
        seen_entries = set()

        # =====================================================================
        # STEP 0: Try MULTILINE patterns first (thermal receipts)
        # Format: keyword alone on one line, amount alone on the next line
        # =====================================================================
        for i, line in enumerate(lines):
            line_clean = line.strip()
            if i + 1 >= len(lines):
                continue
            next_line = lines[i + 1].strip()

            for method in ('CARD', 'NUMERAR'):
                # Standalone keyword only (not part of MASTERCARD, etc.)
                if not re.match(r'^' + method + r'\s*$', line_clean):
                    continue
                # Next line must be a bare amount, not another keyword
                if not re.match(r'^[\d\s.,]+$', next_line):
                    continue
                amount = self._parse_decimal(next_line)
                if amount and amount > 0 and amount < self.MAX_PAYMENT:
                    entry_key = (method, amount)
                    if entry_key not in seen_entries:
                        methods.append({
                            'method': method,
                            'amount': amount,
                            'confidence': 0.95
                        })
                        seen_entries.add(entry_key)

        # If multiline extraction found methods, return them
        if methods:
            return methods

        # =====================================================================
        # STEP 1: Try pattern-based extraction with explicit amounts
        # =====================================================================
        for pattern, method, confidence in self.PAYMENT_PATTERNS:
            for match in re.finditer(pattern, text_upper):
                try:
                    amount = self._parse_decimal(match.group(1))
                    if amount and amount > 0 and amount < self.MAX_PAYMENT:
                        entry_key = (method, amount)
                        if entry_key not in seen_entries:
                            methods.append({
                                'method': method,
                                'amount': amount,
                                'confidence': confidence
                            })
                            seen_entries.add(entry_key)
                except (ValueError, InvalidOperation):
                    continue

        # If we found explicit amounts, we're done
        if methods:
            return methods

        # Step 2: Keyword-only detection; amounts are inferred from the total
        total_amount, _ = self.extract_total(text)
        if not total_amount:
            return []

        has_card = any(
            kw in text_upper
            for kw in ['CARD', 'CARTE CREDIT', 'VISA', 'MASTERCARD',
                       'DEBIT', 'CREDIT', 'CONTACTLESS']
        )
        has_numerar = any(kw in text_upper for kw in ['NUMERAR', 'CASH'])

        # Find REST (change) amount. The keyword must stand alone: a plain
        # substring test would also hit words such as "RESTAURANT" and stop
        # the scan before the real REST line is reached.
        rest_keyword = r'(?<![A-Z])REST(?![A-Z])'
        rest_amount = Decimal('0')
        for i, line in enumerate(lines):
            if not re.search(rest_keyword, line):
                continue
            # REST on same line: "REST 0.00" or "REST: 0.00"
            match = re.search(rest_keyword + r'\s*:?\s*([\d.,]+)', line)
            if match:
                rest_amount = self._parse_decimal(match.group(1)) or Decimal('0')
            elif i + 1 < len(lines):
                # Amount on the line after the REST keyword
                rest_amount = self._parse_decimal(lines[i + 1].strip()) or Decimal('0')
            break

        # Calculate payment amounts
        if has_card:
            card_amount = total_amount - rest_amount
            if card_amount > 0:
                methods.append({
                    'method': 'CARD',
                    'amount': card_amount,
                    'confidence': 0.90
                })

        if has_numerar:
            if has_card and rest_amount > 0:
                # Mixed payment: NUMERAR is the change given back
                methods.append({
                    'method': 'NUMERAR',
                    'amount': rest_amount,
                    'confidence': 0.85
                })
            elif not has_card:
                # Cash only
                methods.append({
                    'method': 'NUMERAR',
                    'amount': total_amount,
                    'confidence': 0.90
                })

        # Step 3: Fallback for fiscal receipts without explicit payment
        if not methods and total_amount and total_amount > 0:
            # 'FISCAL' also covers 'BON FISCAL'
            if 'FISCAL' in text_upper:
                # Default to CARD for business purchases (most common)
                methods.append({
                    'method': 'CARD',
                    'amount': total_amount,
                    'confidence': 0.70
                })

        return methods

    def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
        """
        Extract client CUI from B2B receipts.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (cui, confidence) or (None, 0.0). The CUI is returned
            as digits only, without the RO country prefix.
        """
        text_upper = text.upper()

        # First check if there's a CLIENT section
        has_client_section = any(
            re.search(marker, text_upper, re.IGNORECASE)
            for marker in self.CLIENT_MARKERS
        )
        if not has_client_section:
            return (None, 0.0)

        for pattern, confidence in self.CLIENT_CUI_PATTERNS:
            match = re.search(pattern, text_upper, re.IGNORECASE | re.MULTILINE)
            if not match:
                continue
            cui = match.group(1)
            # Strip the RO prefix BEFORE keeping digits: OCR frequently reads
            # "RO" as "R0", and dropping non-digits alone would keep that
            # zero as a bogus leading digit of the CUI.
            cui = re.sub(r'^R[O0]?', '', cui)
            cui_digits = re.sub(r'[^0-9]', '', cui)
            if 6 <= len(cui_digits) <= 10:
                return (cui_digits, confidence)

        return (None, 0.0)

    def extract_client_name(self, text: str) -> Tuple[Optional[str], float]:
        """
        Extract client/buyer company name from B2B receipts.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (client_name, confidence) or (None, 0.0)
        """
        lines = text.split('\n')

        # Locate the first line that carries a CLIENT marker
        client_section_idx = None
        for i, line in enumerate(lines):
            line_upper = line.upper().strip()
            if any(re.search(marker, line_upper, re.IGNORECASE) for marker in self.CLIENT_MARKERS):
                client_section_idx = i
                break

        if client_section_idx is None:
            return (None, 0.0)

        line = lines[client_section_idx].strip()

        # Strategy 1: name on the marker line, after the first ":"
        if ':' in line:
            name_part = line.split(':', 1)[1].strip()
            if name_part and len(name_part) >= 3:
                name_upper = name_part.upper()
                # A bare CUI (RO + digits) is not a name; try the next strategy
                if not re.match(r'^R[O0]?\d{6,10}$', name_upper):
                    if any(re.search(ind, name_upper) for ind in self.COMPANY_INDICATORS):
                        return (self._clean_company_name(name_part), 0.95)
                    elif len(name_part) >= 5 and not name_part.isdigit():
                        return (self._clean_company_name(name_part), 0.80)

        # Strategy 2: name on the line right after the marker
        if client_section_idx + 1 < len(lines):
            next_line = lines[client_section_idx + 1].strip()
            next_upper = next_line.upper()

            # Skip if it's a CUI/CIF line or looks like a CUI
            if not re.search(r'C\.?\s*[UI]\.?\s*[IF]\.?', next_upper):
                if not re.match(r'^R[O0]?\d{6,10}$', next_upper):
                    if any(re.search(ind, next_upper) for ind in self.COMPANY_INDICATORS):
                        return (self._clean_company_name(next_line), 0.90)
                    elif len(next_line) >= 5 and not next_line.isdigit():
                        # Make sure it is not a CUI/CIF/COD keyword line
                        if not any(kw in next_upper for kw in ['CUI', 'CIF', 'COD', 'FISCAL']):
                            return (self._clean_company_name(next_line), 0.75)

        # Strategy 3: any of the next few lines with a company indicator
        search_end = min(client_section_idx + 5, len(lines))
        for i in range(client_section_idx + 1, search_end):
            candidate = lines[i].strip()
            candidate_upper = candidate.upper()

            # Skip CUI/CIF lines
            if re.search(r'C\.?\s*[UI]\.?\s*[IF]\.?', candidate_upper):
                continue
            if re.match(r'^R[O0]?\d{6,10}$', candidate_upper):
                continue

            if any(re.search(ind, candidate_upper) for ind in self.COMPANY_INDICATORS):
                return (self._clean_company_name(candidate), 0.85)

        return (None, 0.0)

    @staticmethod
    def _clean_company_name(name: str) -> str:
        """Clean company name for storage."""
        if not name:
            return ""
        # Collapse runs of whitespace
        name = re.sub(r'\s+', ' ', name).strip()
        # Remove trailing , ; : but keep periods (S.R.L., S.A., etc.)
        name = re.sub(r'[,;:]+$', '', name).strip()
        return name

    # -------------------------------------------------------------------------
    # Validation hints - override to customize validation behavior
    # -------------------------------------------------------------------------

    def get_validation_hints(self) -> Dict[str, Any]:
        """
        Return validation hints for this store.

        Returns:
            Dict with validation hints. Common keys:
            - has_multi_rate_tva: bool - Store uses multiple TVA rates
            - card_equals_total: bool - CARD payment equals total
            - has_client_cui: bool - Receipt includes client CUI
            - has_efactura: bool - Store uses e-factura format
            - is_non_vat_payer: bool - Store is not a VAT payer

            The base implementation returns an empty dict.
        """
        return {}

    # -------------------------------------------------------------------------
    # Helper methods - available to all subclasses
    # -------------------------------------------------------------------------

    @staticmethod
    def _normalize_number(text: str) -> str:
        """
        Normalize a number string for Decimal conversion.

        Handles Romanian formats: "1.234,56" -> "1234.56". Whichever of
        "," and "." appears last is taken as the decimal separator. Empty
        input yields "0".
        """
        if not text:
            return "0"

        # Remove spaces
        text = text.replace(" ", "")

        # Determine decimal separator from the right-most separator
        last_comma = text.rfind(",")
        last_dot = text.rfind(".")

        if last_comma > last_dot:
            # Comma is the decimal mark; dots are thousands separators
            text = text.replace(".", "").replace(",", ".")
        elif last_dot > last_comma:
            # Dot is the decimal mark; commas are thousands separators
            text = text.replace(",", "")
        else:
            # Only reachable when neither separator is present
            text = text.replace(",", ".")

        return text

    @staticmethod
    def _parse_decimal(text: str) -> Optional[Decimal]:
        """Parse a string to Decimal, handling various formats.

        Returns None when the normalized text is not a valid number.
        """
        try:
            normalized = BaseStoreProfile._normalize_number(text)
            return Decimal(normalized)
        except (InvalidOperation, ValueError, TypeError):
            return None

    @staticmethod
    def _parse_date(text: str) -> Optional[date]:
        """
        Parse date string in various formats.

        Supports: DD-MM-YYYY, DD/MM/YYYY, DD.MM.YYYY, YYYY-MM-DD.
        Returns None for malformed input or years outside 2000-2100.
        """
        if not text:
            return None

        # Normalize separators
        text = text.replace('/', '-').replace('.', '-')

        try:
            parts = text.split('-')
            if len(parts) != 3:
                return None

            # A 4-character first part means year-first ordering
            if len(parts[0]) == 4:
                year, month, day = int(parts[0]), int(parts[1]), int(parts[2])
            else:
                day, month, year = int(parts[0]), int(parts[1]), int(parts[2])

            # Validate ranges; date() rejects impossible day/month combinations
            if 1 <= day <= 31 and 1 <= month <= 12 and 2000 <= year <= 2100:
                return date(year, month, day)
        except (ValueError, TypeError, IndexError):
            pass

        return None

    @staticmethod
    def _clean_text(text: str) -> str:
        """Clean OCR text for pattern matching.

        Collapses every whitespace run (newlines included) to one space and
        removes remaining control characters.
        """
        if not text:
            return ""
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[\x00-\x09\x0b\x0c\x0e-\x1f\x7f]', '', text)
        return text.strip()

    # -------------------------------------------------------------------------
    # Magic methods
    # -------------------------------------------------------------------------

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__} CUI={self.CUI_LIST}>"

    def __str__(self) -> str:
        return f"{self.STORE_NAME} ({', '.join(self.CUI_LIST)})"