""" OMV Petrom store profile for OCR extraction. OMV receipts typically include client CUI and use standard TVA format. Common at gas stations with fuel purchases. Date format: YYYY. MM. DD with spaces (e.g., "2025. 08. 14") OCR quirk: Numbers often have spaces before decimals (e.g., "55, 22" instead of "55,22") """ import re from datetime import date from decimal import Decimal, InvalidOperation from typing import List, Dict, Any, Tuple, Optional from .base import BaseStoreProfile from . import ProfileRegistry @ProfileRegistry.register class OMVProfile(BaseStoreProfile): """ OMV PETROM MARKETING S.R.L. - standard TVA with client CUI. Key characteristics: - Standard TVA format (usually single rate, any percentage) - Includes client CUI on receipt (for business purchases) - TVA table format: "A-XX, XX% base_amount tva_amount" (with OCR spaces) - Supports historical rates (19%) and current rates (21%) - Date format: YYYY. MM. DD (with spaces) - Client CUI format: "CLIENT C.U. I./C.I.F.: ROXXXXXXX" """ CUI_LIST = ["11201891"] NAME_PATTERNS = ["OMV", "PETROM", "OMV PETROM", "0MV"] # OCR variants STORE_NAME = "OMV PETROM MARKETING S.R.L." # OMV TVA table patterns (handles OCR spaces in numbers) # Format: "A-21, 00% 55, 22 318, 16" (rate, TVA amount, total) TVA_TABLE_PATTERNS = [ # "A-21, 00% 55, 22 318, 16" - with spaces in numbers r'([A-D])\s*[-:]\s*(\d{1,2})[.,\s]*\d{0,2}\s*%\s+([\d.,\s]+)\s+([\d.,\s]+)', # "TOTAL TAXE: 55, 22" - fallback to TOTAL TAXE r'TOTAL\s+TAXE\s*:?\s*([\d.,\s]+)', ] # Standard TVA pattern fallback TVA_STANDARD_PATTERN = r'TVA\s*:?\s*([\d.,]+)' # OMV specific: prioritize YYYY. MM. DD format with spaces DATE_PATTERNS_OCR_SPACES = [ # YYYY. MM. DD with time (OMV format) (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.98, 'ymd'), (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.95, 'ymd'), # Fallback to DD. MM. YYYY (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'), (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'), ] # Client CUI patterns for OMV (unique format) CLIENT_CUI_PATTERNS = [ # "CLIENT C.U. I./C.I.F.: RO1879855" (r'CLIENT\s+C\.?\s*U\.?\s*I\.?\s*/?\s*C\.?\s*I\.?\s*F\.?\s*:?\s*(R?O?\s*\d{6,10})', 0.99), # "C.U.I./C.I.F. CLIENT: XXXXXXX" (r'C\.?\s*U\.?\s*I\.?\s*/?\s*C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(R?O?\s*\d{6,10})', 0.98), # Fallback to simpler pattern (r'CLIENT\s*:?\s*(R?O?\s*\d{6,10})', 0.90), ] # Client markers for OMV CLIENT_MARKERS = [ r'CLIENT\s+C\.?\s*U\.?\s*I', r'CLIENT\s+C\.?\s*I\.?\s*F', r'NUME\s+CLIENT', r'CLIENT\s*:', ] def _clean_ocr_number(self, value: str) -> str: """Remove OCR spaces from numbers (e.g., '55, 22' -> '55,22').""" # Remove spaces around commas and periods value = re.sub(r'\s*([.,])\s*', r'\1', value) # Remove any remaining spaces value = value.replace(' ', '') return value def extract_tva_entries(self, text: str) -> List[dict]: """ Extract OMV-specific TVA entries. OMV receipts show TVA in table format with spaces in numbers. Format: "A-21, 00% 55, 22 318, 16" (rate, TVA amount, base) Args: text: Raw OCR text from receipt Returns: List of TVA entries with code, percent, and amount """ entries = [] text_upper = text.upper() # Try table format first: "A-21, 00% 55, 22 318, 16" table_pattern = self.TVA_TABLE_PATTERNS[0] for match in re.finditer(table_pattern, text_upper): try: code = match.group(1).upper() percent = int(match.group(2)) # Clean OCR spaces from amounts tva_amount_str = self._clean_ocr_number(match.group(3)) tva_amount = self._parse_decimal(tva_amount_str) if tva_amount and tva_amount > 0: entries.append({ 'code': code, 'percent': percent, 'amount': tva_amount }) return entries # OMV usually has single TVA rate except (ValueError, InvalidOperation, IndexError): continue # Fallback: "TOTAL TAXE: 55, 22" fallback_pattern = self.TVA_TABLE_PATTERNS[1] match = re.search(fallback_pattern, text_upper) if match: try: tva_amount_str = self._clean_ocr_number(match.group(1)) tva_amount = self._parse_decimal(tva_amount_str) if tva_amount and tva_amount > 0: entries.append({ 'code': 'A', 'percent': 19, # Standard rate, will be corrected by validation 'amount': tva_amount }) except (ValueError, InvalidOperation): pass return entries def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]: """ Extract client CUI from OMV receipt. OMV uses format: "CLIENT C.U. I./C.I.F.: RO1879855" Args: text: Raw OCR text from receipt Returns: Tuple of (cui, confidence) or (None, 0.0) """ text_upper = text.upper() # Check for OMV client markers has_client = any( re.search(marker, text_upper, re.IGNORECASE) for marker in self.CLIENT_MARKERS ) if not has_client: return (None, 0.0) # Try OMV-specific patterns for pattern, confidence in self.CLIENT_CUI_PATTERNS: match = re.search(pattern, text_upper, re.IGNORECASE) if match: cui = match.group(1) # Clean up: remove RO prefix, spaces cui_digits = re.sub(r'[^0-9]', '', cui) if 6 <= len(cui_digits) <= 10: return (cui_digits, confidence) return (None, 0.0) def extract_payment_methods(self, text: str) -> List[dict]: """ Extract OMV-specific payment methods. OMV receipts use "CARTE CREDIT" instead of "CARD". Payment amount equals TOTAL for gas station receipts. Args: text: Raw OCR text from receipt Returns: List of payment methods with method, amount, and confidence """ payments = [] text_upper = text.upper() # Get total amount first total_amount, _ = self.extract_total(text) if not total_amount: return [] # OMV payment patterns payment_indicators = [ ('CARTE CREDIT', 'CARD', 0.98), ('CARTE DE CREDIT', 'CARD', 0.98), ('CARD', 'CARD', 0.95), ('VISA', 'CARD', 0.95), ('MASTERCARD', 'CARD', 0.95), ('CONTACTLESS', 'CARD', 0.90), ('NUMERAR', 'NUMERAR', 0.95), ('CASH', 'NUMERAR', 0.90), ] for indicator, method, confidence in payment_indicators: if indicator in text_upper: payments.append({ 'method': method, 'amount': total_amount, 'confidence': confidence }) return payments # OMV usually has single payment method # Fallback: If no explicit payment but has BON FISCAL, assume CARD if 'BON FISCAL' in text_upper: payments.append({ 'method': 'CARD', 'amount': total_amount, 'confidence': 0.70 }) return payments def get_validation_hints(self) -> Dict[str, Any]: """Return OMV-specific validation hints.""" return { "has_multi_rate_tva": False, "card_equals_total": True, # Gas station: card equals total "has_client_cui": True, "has_efactura": False, "is_non_vat_payer": False, "tva_table_format": True, }