"""
Store profile for SOCAR Petroleum receipts (OCR extraction).

SOCAR receipts resemble OMV ones: a gas-station layout that can carry the
client's CUI. Dates may be printed as "YYYY. MM. DD", i.e. with spaces
after the separators.
"""

import re
from datetime import date
from decimal import Decimal, InvalidOperation
from typing import List, Dict, Any, Tuple, Optional

from .base import BaseStoreProfile
from . import ProfileRegistry


@ProfileRegistry.register
class SocarProfile(BaseStoreProfile):
    """
    Profile for SOCAR PETROLEUM S.A. receipts (standard TVA, client CUI).

    Receipt traits this profile is written for:
    - Standard TVA block, usually with a single rate
    - Client CUI printed on the receipt (business purchases)
    - Layout close to OMV/Petrom
    - Dates possibly written as YYYY. MM. DD (spaces after separators)
    """

    CUI_LIST = ["12546600"]
    NAME_PATTERNS = ["SOCAR", "S0CAR", "SOCAR PETROLEUM"]  # OCR variants
    STORE_NAME = "SOCAR PETROLEUM S.A."

    # TVA patterns for gas stations. Order matters: extract_tva_entries()
    # uses index 0 as the table format and index 1 as the fallback.
    TVA_PATTERNS = [
        # Table format: "A-19,00% 285,66 49,58"
        r'([A-D])\s*[-:]\s*(\d{1,2})[.,]\d{2}\s*%\s+([\d.,]+)\s+([\d.,]+)',
        # Simple format: "TVA 19% 49,58"
        r'TVA\s+(\d{1,2})\s*%\s*([\d.,]+)',
    ]

    # Date patterns tolerating whitespace after the separators, as in
    # "YYYY. MM. DD". Each item is (regex, number, field order); the number
    # is presumably a confidence score used by the caller -- TODO confirm.
    DATE_PATTERNS_OCR_SPACES = [
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.98, 'ymd'),
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.95, 'ymd'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
    ]

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Pull the TVA lines out of a SOCAR receipt.

        The table format (TVA_PATTERNS[0]) is tried first; each distinct
        (code, percent) pair is reported once, keeping its first occurrence.
        Only when the table format yields nothing is the simple format
        (TVA_PATTERNS[1]) consulted, and then just its first usable match
        is returned, under code 'A'.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of dicts with keys 'code', 'percent' and 'amount'; empty
            when no positive TVA amount could be read.
        """
        # Keyed by (code, percent); dict insertion order keeps the entries
        # in the order they were first seen on the receipt.
        by_rate: Dict[Tuple[str, int], dict] = {}

        for hit in re.finditer(self.TVA_PATTERNS[0], text, re.IGNORECASE):
            try:
                rate_code = hit.group(1).upper()
                rate = int(hit.group(2))
                # Group 4 is the last numeric column of the table row.
                vat = self._parse_decimal(hit.group(4))
                if not vat or not vat > 0:
                    continue
                by_rate.setdefault(
                    (rate_code, rate),
                    {'code': rate_code, 'percent': rate, 'amount': vat},
                )
            except (ValueError, InvalidOperation):
                # Unparseable row: skip it and keep scanning.
                continue

        if by_rate:
            return list(by_rate.values())

        # Nothing in table form: fall back to the simple "TVA 19% 49,58" form.
        for hit in re.finditer(self.TVA_PATTERNS[1], text, re.IGNORECASE):
            try:
                rate = int(hit.group(1))
                vat = self._parse_decimal(hit.group(2))
                if vat and vat > 0:
                    # The simple format carries no code letter; default to 'A'
                    # and stop at the first usable match.
                    return [{'code': 'A', 'percent': rate, 'amount': vat}]
            except (ValueError, InvalidOperation):
                continue

        return []

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return the validation hint flags for SOCAR receipts."""
        # Every flag is off except the client-CUI one.
        hints: Dict[str, Any] = dict.fromkeys(
            (
                "has_multi_rate_tva",
                "card_equals_total",
                "has_client_cui",
                "has_efactura",
                "is_non_vat_payer",
            ),
            False,
        )
        hints["has_client_cui"] = True
        return hints