""" Base class for store-specific OCR extraction profiles. Each store can have different receipt formats (TVA layout, total position, etc.). Store profiles allow customizing extraction logic per-store for better accuracy. Usage: from .base import BaseStoreProfile from . import ProfileRegistry @ProfileRegistry.register class LidlProfile(BaseStoreProfile): CUI_LIST = ["22891860"] NAME_PATTERNS = ["LIDL", "LDL"] def extract_tva_entries(self, text: str) -> List[dict]: # Custom Lidl TVA extraction logic ... """ import re from abc import ABC from decimal import Decimal, InvalidOperation from typing import List, Optional, Tuple, Dict, Any from datetime import date class BaseStoreProfile(ABC): """ Abstract base class for store-specific extraction profiles. Each profile defines: - CUI_LIST: CUI codes that identify this store (without RO prefix) - NAME_PATTERNS: OCR-tolerant name patterns for fallback matching - Custom extraction methods for TVA, total, date, etc. The ProfileRegistry uses CUI_LIST to lookup profiles during extraction. """ # ------------------------------------------------------------------------- # Class attributes - override in subclasses # ------------------------------------------------------------------------- # List of CUI codes (without RO prefix) that identify this store CUI_LIST: List[str] = [] # OCR-tolerant name patterns for fallback matching NAME_PATTERNS: List[str] = [] # Store display name STORE_NAME: str = "Unknown Store" # ------------------------------------------------------------------------- # Generic patterns - can be overridden in subclasses # ------------------------------------------------------------------------- # Total amount patterns (confidence-weighted) TOTAL_PATTERNS = [ (r'T[O0]TAL[.\s]+L[E3][I1!]\s*:?\s*([\d\s.,]+)', 0.98), (r'TOTAL\s+LEI\s*([\d\s.,]+)', 0.98), (r'[OT]?OTAL\s+LEI\s*([\d\s.,]+)', 0.95), (r'TOTAL\s*:?\s*([\d\s.,]+)\s*(?:RON|LEI)?', 0.95), (r'TOTAL\s+(?:RON|LEI)\s*([\d\s.,]+)', 0.95), (r'SUBTOTAL\s*([\d\s.,]+)', 0.90), (r'DE\s+PLATA\s*:?\s*([\d\s.,]+)', 0.90), (r'SUMA\s*:?\s*([\d\s.,]+)', 0.85), ] # Date patterns (confidence-weighted) DATE_PATTERNS = [ (r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98), (r'DATA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98), (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+[O0]RA\s*:?\s*\d{2}:\d{2}', 0.95), (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+\d{2}:\d{2}', 0.90), (r'(\d{2}[-./]\d{2}[-./]\d{4})', 0.80), (r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75), ] # Date patterns with OCR-introduced spaces (separate because format is different) DATE_PATTERNS_OCR_SPACES = [ (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.92, 'ymd'), (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.85, 'ymd'), (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'), (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'), ] # Receipt number patterns (confidence-weighted) NUMBER_PATTERNS = [ (r'NDS\s*:?\s*(\d+)', 0.98), (r'C3POS[-A-Z0-9]*[N:](\d{6,7})', 0.98), (r'C3POS.*?(\d{6,7})\b', 0.95), (r'BF\s*:\s*(\d{4,})', 0.96), (r'BF\s+(\d{4,})', 0.93), (r'NIVS\s*:?\s*(\d+)', 0.95), (r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95), (r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95), (r'CHITANTA\s+NR\.?\s*:?\s*(\d+)', 0.95), (r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90), (r'ID\s*BF\s*:?\s*(\d+)', 0.90), ] # Payment method patterns (pattern, method_type, confidence) PAYMENT_PATTERNS = [ (r'CARTE\s+CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.98), (r'CARTE\s+CREDIT\s*:?\s*\n\s*([\d\s.,]+)', 'CARD', 0.97), (r'(?:PLATA\s+)?CARD\s*[:\sA-Z]?\s*([\d\s.,]+)', 'CARD', 0.95), (r'NUMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.95), (r'CASH\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.90), (r'(?:^|\n|\s)RD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.70), (r'(?:^|\n|\s)ARD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.75), (r'(?:^|\n|\s)MERAR\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'NUMERAR', 0.70), ] # Client section markers (for B2B receipts) - More flexible patterns CLIENT_MARKERS = [ r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT', # "CIF CLIENT" (with or without colon) r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT', # "CUI CLIENT" r'CLIENT\s+C\.?\s*[UI1]\.?\s*[IF1]', # "CLIENT CIF" / "CLIENT CUI" r'CLIENT\s*:', # "CLIENT:" r'CUMPARATOR\s*:', # "CUMPARATOR:" r'BENEFICIAR\s*:', # "BENEFICIAR:" r'CUMP[AĂ]R[AĂ]TOR', # "CUMPARATOR" without colon r'COD\s+FISCAL\s+CLIENT', # "COD FISCAL CLIENT" ] # Client CUI patterns (pattern, confidence) - More flexible CLIENT_CUI_PATTERNS = [ # "CIF CLIENT:XXXXXXX" or "CIF CLIENT: ROXXXXXXX" - most common format (r'C\.?[I1]\.?F\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.99), # "CLIENT CIF: XXXXXXX" (r'CLIENT\s+C\.?[I1]\.?F\.?\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98), # "CUI CLIENT: XXXXXXX" (r'C\.?U\.?[I1]\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98), # "ROXXXXXXX" followed by CLIENT marker (r'(R[O0]\d{6,10})\s*\n\s*CLIENT', 0.97), # "C.I.F. CLIENT: XXXXXXX" (r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.96), # "CLIENT: ROXXXXXXX" or "CLIENT: XXXXXXX" (r'CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.90), # "COD FISCAL CLIENT: XXXXXXX" (r'COD\s+FISCAL\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.95), ] # Company type indicators (for identifying company names) COMPANY_INDICATORS = [ r'\bS\.?\s*R\.?\s*L\.?\b', # S.R.L. or S. R. L. r'\bS\.?\s*A\.?\b', # S.A. or S. A. r'\bS\.?\s*N\.?\s*C\.?\b', # S.N.C. or S. N. C. r'\bS\.?\s*C\.?\s*S\.?\b', # S.C.S. or S. C. S. r'\bI\.?\s*I\.?\b', # I.I. or I. I. r'\bP\.?\s*F\.?\s*A\.?\b', # P.F.A. or P. F. A. r'\bS\.?\s*C\.?\s+[A-Z]', # S.C. followed by company name r'HOLDING', r'COMPANY', r'GROUP', ] # Maximum reasonable payment amount (to filter OCR errors) MAX_PAYMENT = Decimal('100000') # ------------------------------------------------------------------------- # Extraction methods - override in subclasses as needed # ------------------------------------------------------------------------- def extract_tva_entries(self, text: str) -> List[dict]: """ Extract TVA entries from receipt text. Override this method in subclasses to handle store-specific TVA formats. Args: text: Raw OCR text from receipt Returns: List of dicts with keys: code, percent, amount """ return [] def extract_total(self, text: str) -> Tuple[Optional[Decimal], float]: """ Extract total amount from receipt text. Args: text: Raw OCR text from receipt Returns: Tuple of (amount, confidence) or (None, 0.0) """ text_upper = text.upper() for pattern, confidence in self.TOTAL_PATTERNS: match = re.search(pattern, text_upper) if match: amount = self._parse_decimal(match.group(1)) if amount and amount > 0 and amount < self.MAX_PAYMENT: return (amount, confidence) return (None, 0.0) def extract_date(self, text: str) -> Tuple[Optional[date], float]: """ Extract receipt date from text. Args: text: Raw OCR text from receipt Returns: Tuple of (date, confidence) or (None, 0.0) """ text_upper = text.upper() # Try standard patterns first for pattern, confidence in self.DATE_PATTERNS: match = re.search(pattern, text_upper) if match: parsed = self._parse_date(match.group(1)) if parsed: return (parsed, confidence) # Try OCR-corrupted patterns with spaces for pattern, confidence, fmt in self.DATE_PATTERNS_OCR_SPACES: match = re.search(pattern, text_upper) if match: try: if fmt == 'ymd': year, month, day = int(match.group(1)), int(match.group(2)), int(match.group(3)) else: # dmy day, month, year = int(match.group(1)), int(match.group(2)), int(match.group(3)) if 1 <= day <= 31 and 1 <= month <= 12 and 2000 <= year <= 2100: return (date(year, month, day), confidence) except (ValueError, TypeError): continue return (None, 0.0) def extract_receipt_number(self, text: str) -> Tuple[Optional[str], float]: """ Extract receipt number from text. Args: text: Raw OCR text from receipt Returns: Tuple of (number, confidence) or (None, 0.0) """ text_upper = text.upper() for pattern, confidence in self.NUMBER_PATTERNS: match = re.search(pattern, text_upper) if match: number = match.group(1).strip() if number and len(number) >= 3: return (number, confidence) return (None, 0.0) def extract_payment_methods(self, text: str) -> List[dict]: """ Extract payment methods (CARD/NUMERAR) from receipt. Supports multiple payments of the same type (e.g., 2x CARD for split payments). Each payment is returned as a separate entry with its amount. Args: text: Raw OCR text from receipt Returns: List of dicts: [{'method': 'CARD'/'NUMERAR', 'amount': Decimal, 'confidence': float}] Multiple entries of same method type are allowed for split payments. """ text_upper = text.upper() methods = [] # Track (method, amount) pairs to avoid exact duplicates from overlapping patterns seen_entries = set() for pattern, method, confidence in self.PAYMENT_PATTERNS: for match in re.finditer(pattern, text_upper): try: amount = self._parse_decimal(match.group(1)) if amount and amount > 0 and amount < self.MAX_PAYMENT: # Deduplicate by (method, amount) to avoid same entry from multiple patterns # But allow different amounts for same method (split payments) entry_key = (method, amount) if entry_key not in seen_entries: methods.append({ 'method': method, 'amount': amount, 'confidence': confidence }) seen_entries.add(entry_key) except (ValueError, InvalidOperation): continue return methods def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]: """ Extract client CUI from B2B receipts. Args: text: Raw OCR text from receipt Returns: Tuple of (cui, confidence) or (None, 0.0) """ text_upper = text.upper() # First check if there's a CLIENT section has_client_section = any( re.search(marker, text_upper, re.IGNORECASE) for marker in self.CLIENT_MARKERS ) if not has_client_section: return (None, 0.0) # Try to extract CUI for pattern, confidence in self.CLIENT_CUI_PATTERNS: match = re.search(pattern, text_upper, re.IGNORECASE | re.MULTILINE) if match: cui = match.group(1) # Normalize: remove RO prefix for storage cui_digits = re.sub(r'[^0-9]', '', cui) if 6 <= len(cui_digits) <= 10: return (cui_digits, confidence) return (None, 0.0) def extract_client_name(self, text: str) -> Tuple[Optional[str], float]: """ Extract client/buyer company name from B2B receipts. Args: text: Raw OCR text from receipt Returns: Tuple of (client_name, confidence) or (None, 0.0) """ text_upper = text.upper() lines = text.split('\n') # First check if there's a CLIENT section client_section_idx = None for i, line in enumerate(lines): line_upper = line.upper().strip() if any(re.search(marker, line_upper, re.IGNORECASE) for marker in self.CLIENT_MARKERS): client_section_idx = i break if client_section_idx is None: return (None, 0.0) # Look for company name in CLIENT section line = lines[client_section_idx].strip() line_upper = line.upper() # Strategy 1: Check if name is on same line after ":" if ':' in line: name_part = line.split(':', 1)[1].strip() if name_part and len(name_part) >= 3: # Skip if it looks like a CUI (RO followed by digits) if re.match(r'^R[O0]?\d{6,10}$', name_part.upper()): pass # This is CUI, not name - continue to next strategy else: # Check for company indicators name_upper = name_part.upper() if any(re.search(ind, name_upper) for ind in self.COMPANY_INDICATORS): return (self._clean_company_name(name_part), 0.95) elif len(name_part) >= 5 and not name_part.isdigit(): return (self._clean_company_name(name_part), 0.80) # Strategy 2: Check next line for company name if client_section_idx + 1 < len(lines): next_line = lines[client_section_idx + 1].strip() next_upper = next_line.upper() # Skip if it's a CUI/CIF line or looks like CUI if not re.search(r'C\.?\s*[UI]\.?\s*[IF]\.?', next_upper): if not re.match(r'^R[O0]?\d{6,10}$', next_upper): if any(re.search(ind, next_upper) for ind in self.COMPANY_INDICATORS): return (self._clean_company_name(next_line), 0.90) elif len(next_line) >= 5 and not next_line.isdigit(): # Check it's not CUI/CIF/COD keywords if not any(kw in next_upper for kw in ['CUI', 'CIF', 'COD', 'FISCAL']): return (self._clean_company_name(next_line), 0.75) # Strategy 3: Look for any line with company indicators in CLIENT section region search_end = min(client_section_idx + 5, len(lines)) for i in range(client_section_idx + 1, search_end): line = lines[i].strip() line_upper = line.upper() # Skip CUI/CIF lines if re.search(r'C\.?\s*[UI]\.?\s*[IF]\.?', line_upper): continue if re.match(r'^R[O0]?\d{6,10}$', line_upper): continue if any(re.search(ind, line_upper) for ind in self.COMPANY_INDICATORS): return (self._clean_company_name(line), 0.85) return (None, 0.0) @staticmethod def _clean_company_name(name: str) -> str: """Clean company name for storage.""" if not name: return "" # Remove extra whitespace name = re.sub(r'\s+', ' ', name).strip() # Remove trailing punctuation except periods in S.R.L., S.A., etc. name = re.sub(r'[,;:]+$', '', name).strip() return name # ------------------------------------------------------------------------- # Validation hints - override to customize validation behavior # ------------------------------------------------------------------------- def get_validation_hints(self) -> Dict[str, Any]: """ Return validation hints for this store. Returns: Dict with validation hints. Common keys: - has_multi_rate_tva: bool - Store uses multiple TVA rates - card_equals_total: bool - CARD payment equals total - has_client_cui: bool - Receipt includes client CUI - has_efactura: bool - Store uses e-factura format - is_non_vat_payer: bool - Store is not a VAT payer """ return {} # ------------------------------------------------------------------------- # Helper methods - available to all subclasses # ------------------------------------------------------------------------- @staticmethod def _normalize_number(text: str) -> str: """ Normalize a number string for Decimal conversion. Handles Romanian formats: "1.234,56" -> "1234.56" """ if not text: return "0" # Remove spaces text = text.replace(" ", "") # Determine decimal separator last_comma = text.rfind(",") last_dot = text.rfind(".") if last_comma > last_dot: text = text.replace(".", "").replace(",", ".") elif last_dot > last_comma: text = text.replace(",", "") else: text = text.replace(",", ".") return text @staticmethod def _parse_decimal(text: str) -> Optional[Decimal]: """Parse a string to Decimal, handling various formats.""" try: normalized = BaseStoreProfile._normalize_number(text) return Decimal(normalized) except (InvalidOperation, ValueError, TypeError): return None @staticmethod def _parse_date(text: str) -> Optional[date]: """ Parse date string in various formats. Supports: DD-MM-YYYY, DD/MM/YYYY, DD.MM.YYYY, YYYY-MM-DD """ if not text: return None # Normalize separators text = text.replace('/', '-').replace('.', '-') try: parts = text.split('-') if len(parts) != 3: return None # Determine format based on first part length if len(parts[0]) == 4: # YYYY-MM-DD year, month, day = int(parts[0]), int(parts[1]), int(parts[2]) else: # DD-MM-YYYY day, month, year = int(parts[0]), int(parts[1]), int(parts[2]) # Validate ranges if 1 <= day <= 31 and 1 <= month <= 12 and 2000 <= year <= 2100: return date(year, month, day) except (ValueError, TypeError, IndexError): pass return None @staticmethod def _clean_text(text: str) -> str: """Clean OCR text for pattern matching.""" if not text: return "" text = re.sub(r'\s+', ' ', text) text = re.sub(r'[\x00-\x09\x0b\x0c\x0e-\x1f\x7f]', '', text) return text.strip() # ------------------------------------------------------------------------- # Magic methods # ------------------------------------------------------------------------- def __repr__(self) -> str: return f"<{self.__class__.__name__} CUI={self.CUI_LIST}>" def __str__(self) -> str: return f"{self.STORE_NAME} ({', '.join(self.CUI_LIST)})"