""" BRICK (Five-Holding) store profile for OCR extraction. Five-Holding S.A. operates BRICK stores with standard receipt format. Receipt structure: - TVA format: "TOTAL TVA A - 21%" with amount on next line - Payment: "CARD" on separate line (amount from TOTAL LEI) - Client CUI: "CLIENT C.U.L./C.IF. :ROXXXXXXX" (OCR reads I as L) """ import re from decimal import Decimal, InvalidOperation from typing import List, Dict, Any, Tuple, Optional from .base import BaseStoreProfile from . import ProfileRegistry @ProfileRegistry.register class BrickProfile(BaseStoreProfile): """ FIVE-HOLDING S.A. (BRICK) - standard TVA format with client CUI. Key characteristics: - Standard TVA format with rate code (A, B, etc.) - TVA amount on separate line after percentage - CARD payment indicated by keyword (amount derived from total) - Client CUI in format: CLIENT C.U.L./C.IF. - OCR often reads "I" as "L" in CUI markers """ CUI_LIST = ["10562600"] NAME_PATTERNS = ["BRICK", "FIVE-HOLDING", "FIVE HOLDING", "BR1CK", "F1VE"] STORE_NAME = "FIVE-HOLDING S.A." # BRICK TVA patterns (amount often on separate line) TVA_PATTERNS = [ # "TOTAL TVA A - 21%" with amount on next line (captured as multiline) r'TOTAL\s+TVA\s*([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*\n?\s*([\d.,]+)', # "OTAL IVAA 21%" - OCR error variant r'O?TAL\s+[IT]VA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%\s*\n?\s*([\d.,]+)', # "TOTAL TVA A 21%" without separator r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%\s*\n?\s*([\d.,]+)', # "TVA A: XX% = YY,YY" - inline format r'TVA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)', ] # TOTAL TVA BON pattern (fallback) TOTAL_TVA_BON_PATTERN = r'TOTAL\s+TVA\s*BON\s*\n?\s*([\d.,]+)' # Client CUI patterns - specific to Brick (handles OCR L/I confusion) CLIENT_CUI_PATTERNS = [ # "CLIENT C.U.L./C.IF. :R01879855" - exact OCR format (I->L) (r'CLIENT\s+C\.?U\.?[LI1]\.?\s*/?\s*C\.?[LI1]\.?F\.?\s*:?\s*(R?O?\d{6,10})', 0.99), # "CLIENT C.U.I./C.I.F.: RO1879855" - standard format (r'CLIENT\s+C\.?U\.?I\.?\s*/?\s*C\.?I\.?F\.?\s*:?\s*(R?O?\s*\d{6,10})', 0.98), # "CIF CLIENT: XXXXXXX" - alternative format (r'CIF\s+CLIENT\s*:?\s*(R?O?\s*\d{6,10})', 0.95), ] # Client markers for Brick CLIENT_MARKERS = [ r'CLIENT\s+C\.?U\.?[LI1]', r'CLIENT\s+C\.?I\.?F', r'CIF\s+CLIENT', ] def extract_tva_entries(self, text: str) -> List[dict]: """ Extract BRICK-specific TVA entries. BRICK receipts show TVA in multi-line format: "TOTAL TVA A - 21%" "32.31" Args: text: Raw OCR text from receipt Returns: List of TVA entries with code, percent, and amount """ entries = [] text_upper = text.upper() seen = set() # Try coded patterns first (with multiline support) for pattern in self.TVA_PATTERNS: for match in re.finditer(pattern, text_upper, re.IGNORECASE | re.MULTILINE): try: code = match.group(1).upper() percent = int(match.group(2)) amount = self._parse_decimal(match.group(3)) if amount and amount > 0: entry_key = (code, percent) if entry_key not in seen: entries.append({ 'code': code, 'percent': percent, 'amount': amount }) seen.add(entry_key) return entries # Brick usually has single TVA rate except (ValueError, InvalidOperation, IndexError): continue # Fallback: "TOTAL TVA BON" with amount on next line match = re.search(self.TOTAL_TVA_BON_PATTERN, text_upper, re.IGNORECASE | re.MULTILINE) if match: try: amount = self._parse_decimal(match.group(1)) if amount and amount > 0: entries.append({ 'code': 'A', 'percent': 19, # Default rate 'amount': amount }) except (ValueError, InvalidOperation): pass return entries def extract_payment_methods(self, text: str) -> List[dict]: """ Extract BRICK-specific payment methods. BRICK receipts show payment method on separate line: "TOTAL LEI" "21.18" "CARD" "0.00" <- REST (change) When CARD appears with REST=0, full amount was paid by card. Args: text: Raw OCR text from receipt Returns: List of payment methods with method, amount, and confidence """ payments = [] text_upper = text.upper() lines = text_upper.split('\n') # Find TOTAL LEI amount total_amount = None for i, line in enumerate(lines): if 'TOTAL' in line and 'LEI' in line: # Amount is likely on next line if i + 1 < len(lines): amount_str = lines[i + 1].strip() total_amount = self._parse_decimal(amount_str) break # Also try inline: "TOTAL LEI 21.18" match = re.search(r'TOTAL\s+LEI\s*([\d.,]+)', line) if match: total_amount = self._parse_decimal(match.group(1)) break if not total_amount: # Fallback to generic total extraction total_amount, _ = self.extract_total(text) if not total_amount: return [] # Check for CARD or NUMERAR keywords has_card = any('CARD' in line for line in lines) has_numerar = any('NUMERAR' in line for line in lines) # Find REST amount to determine actual card amount rest_amount = Decimal('0') for i, line in enumerate(lines): if 'REST' in line: # REST amount is on next line or same line match = re.search(r'REST\s*([\d.,]+)', line) if match: rest_amount = self._parse_decimal(match.group(1)) or Decimal('0') elif i + 1 < len(lines): rest_amount = self._parse_decimal(lines[i + 1].strip()) or Decimal('0') break if has_card: # Card payment = total - rest card_amount = total_amount - rest_amount if card_amount > 0: payments.append({ 'method': 'CARD', 'amount': card_amount, 'confidence': 0.95 }) if has_numerar: # If both card and cash, need more complex logic # For now, assume numerar is the rest if card is present if not has_card: payments.append({ 'method': 'NUMERAR', 'amount': total_amount, 'confidence': 0.95 }) elif rest_amount > 0: payments.append({ 'method': 'NUMERAR', 'amount': rest_amount, 'confidence': 0.90 }) # If no explicit payment keyword but REST=0, assume card if not payments and rest_amount == 0: # Check for any payment indicators for line in lines: if 'CARD' in line or 'DEBIT' in line or 'CREDIT' in line: payments.append({ 'method': 'CARD', 'amount': total_amount, 'confidence': 0.90 }) break # FALLBACK: If still no payment found but we have total amount, # assume CARD for business receipts (Brick stores usually accept card) # This handles cases where OCR fails to capture payment method if not payments and total_amount and total_amount > 0: # Check if this is a fiscal receipt (BON FISCAL) is_fiscal = 'BON FISCAL' in text_upper or 'FISCAL' in text_upper if is_fiscal: payments.append({ 'method': 'CARD', 'amount': total_amount, 'confidence': 0.70 # Lower confidence for inferred payment }) return payments def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]: """ Extract client CUI from BRICK receipt. BRICK uses format: "CLIENT C.U.L./C.IF. :R01879855" Note: OCR often reads "I" as "L" in these markers. Args: text: Raw OCR text from receipt Returns: Tuple of (cui, confidence) or (None, 0.0) """ text_upper = text.upper() # Check for Brick client markers has_client = any( re.search(marker, text_upper, re.IGNORECASE) for marker in self.CLIENT_MARKERS ) if not has_client: return (None, 0.0) # Try Brick-specific patterns for pattern, confidence in self.CLIENT_CUI_PATTERNS: match = re.search(pattern, text_upper, re.IGNORECASE) if match: cui = match.group(1) # Clean up: remove RO prefix, spaces cui_digits = re.sub(r'[^0-9]', '', cui) if 6 <= len(cui_digits) <= 10: return (cui_digits, confidence) return (None, 0.0) def get_validation_hints(self) -> Dict[str, Any]: """Return BRICK-specific validation hints.""" return { "has_multi_rate_tva": False, "card_equals_total": True, # Card amount equals total when REST=0 "has_client_cui": True, # Brick receipts CAN have client CUI "has_efactura": False, "is_non_vat_payer": False, "tva_on_separate_line": True, # TVA amount on next line }