roa2web-service-auto/backend/modules/data_entry/services/ocr/profiles/unlimited_keys.py

"""
UNLIMITED KEYS S.R.L. store profile for OCR extraction.

Key duplication service. Notable for CASH (NUMERAR) payments.
"""

import re
from decimal import Decimal, InvalidOperation
from typing import List, Dict, Any, Optional, Tuple

from .base import BaseStoreProfile
from . import ProfileRegistry


@ProfileRegistry.register
class UnlimitedKeysProfile(BaseStoreProfile):
    """
    UNLIMITED KEYS S.R.L. - standard TVA profile with NUMERAR payment.

    Key characteristics:
    - Standard TVA format (single rate, any percentage)
    - Key duplication service
    - NUMERAR (cash) payment common - different from most stores!
    - May also accept CARD
    - OCR often reads "TVA" as "TUA" - need OCR error variants

    All extractors upper-case the OCR text before matching and return their
    "nothing found" value for empty or missing text.
    """

    CUI_LIST = ["18993187"]
    NAME_PATTERNS = ["UNLIMITED KEYS", "UNLIMITED", "UNL1MITED", "UNLIMITED KEYS SRL"]
    STORE_NAME = "UNLIMITED KEYS S.R.L."

    # Standard TVA patterns - including OCR error variants (TVA -> TUA).
    # NOTE: extract_tva_entries() addresses these by position (index 3, index 4
    # and the first three as "coded" patterns), so the order must be preserved.
    TVA_PATTERNS = [
        # "TVA A: XX% = YY,YY" or "TVA-A XX% YY,YY" (including TUA OCR error)
        r'T[UV]A\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,\s]+)',
        # "A - XX,XX% = YY,YY"
        r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,\s]+)',
        # "TVA XX% YY,YY" (simple format, includes TUA)
        r'T[UV]A\s+(\d{1,2})\s*%\s*([\d.,\s]+)',
        # "XX.XX% TUA*A YY.YY" (OCR format with TUA*A or TUA)
        r'(\d{1,2})[.,]\d{0,2}\s*%\s*T[UV]A\*?[A-D]?\s*([\d.,\s]+)',
        # "TOTAL TUA: YY.YY" (total TVA amount only)
        r'TOTAL\s+T[UV]A\s*:?\s*([\d.,\s]+)',
    ]

    # Rate reported when only a "TOTAL TVA: YY.YY" line is readable. That line
    # carries no percentage, so this value is an assumption, not a reading.
    DEFAULT_TVA_PERCENT = 19

    # TOTAL patterns for UNLIMITED KEYS (handles "80 .00" format).
    # Each entry is (regex, confidence); entries are tried in order.
    TOTAL_PATTERNS = [
        # "SUMA TOTALA: 80 .00" (with space before decimal)
        (r'SUMA\s+TOTALA\s*:?\s*([\d\s.,]+)', 0.98),
        # "TOTALA: 80,00"
        (r'TOTALA\s*:?\s*([\d.,]+)', 0.95),
        # Standard TOTAL patterns from base class
        (r'TOTAL\s+(?:DE\s+PLATA|ACHITAT|LEI)\s*:?\s*([\d.,]+)', 0.95),
        (r'TOTAL\s*:?\s*([\d.,]+)', 0.90),
    ]

    # Payment patterns - NUMERAR is primary for this store.
    # Each entry is (regex, method name, confidence).
    PAYMENT_PATTERNS = [
        # "NUMERAR 80.00" or "NUMERAR: 80.00"
        (r'NUMERAR\s*:?\s*([\d.,\s]+)', 'NUMERAR', 0.98),
        # "CARD 80.00" or "CARD: 80.00"
        (r'CARD\s*:?\s*([\d.,\s]+)', 'CARD', 0.95),
    ]

    # Client CUI patterns - specific to this receipt format.
    # Each entry is (regex, confidence); entries are tried in order.
    CLIENT_CUI_PATTERNS = [
        # "CIF CLIENT:1879855" (exact format from OCR)
        (r'CIF\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.99),
        # "CLIENT CIF: ROXXXXXXX"
        (r'CLIENT\s+CIF\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
        # "C.I.F. CLIENT: XXXXXXX"
        (r'C\.?I\.?F\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
    ]

    # Override client markers to be less strict
    CLIENT_MARKERS = [
        r'CIF\s+CLIENT',
        r'CLIENT\s+CIF',
        r'C\.?I\.?F\.?\s+CLIENT',
        r'CLIENT\s*:',
    ]

    @staticmethod
    def _single_line_amount(raw: str) -> str:
        """
        Reduce a captured amount to a compact, single-line number string.

        The amount capture groups use ``\\s`` so that "80 .00" is accepted,
        but ``\\s`` also matches line breaks. Without truncation a capture
        such as "80 .00\\n19.00" would be glued into "80.0019.00".

        Args:
            raw: Text captured by an amount group

        Returns:
            The part of ``raw`` before the first line break, with all
            remaining whitespace removed
        """
        first_line = re.split(r'[\r\n]', raw, maxsplit=1)[0]
        return re.sub(r'\s+', '', first_line)

    def extract_total(self, text: str) -> Tuple[Optional[Decimal], float]:
        """
        Extract total amount from receipt text.

        Handles UNLIMITED KEYS format with space before decimal (e.g., "80 .00").
        TOTAL_PATTERNS are tried in order and the first positive amount wins.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (total_amount, confidence) or (None, 0.0)
        """
        if not text:
            return (None, 0.0)

        text_upper = text.upper()

        for pattern, confidence in self.TOTAL_PATTERNS:
            match = re.search(pattern, text_upper, re.IGNORECASE)
            if not match:
                continue
            try:
                amount = self._parse_decimal(
                    self._single_line_amount(match.group(1))
                )
            except (ValueError, InvalidOperation):
                # Unparseable capture: fall through to the next pattern.
                continue

            if amount and amount > 0:
                return (amount, confidence)

        return (None, 0.0)

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Extract TVA entries from receipt text.

        Handles OCR errors where TVA is read as TUA. At most one entry is
        returned; the sources are tried in this order:

        1. "XX.XX% TUA*A YY.YY" (TVA_PATTERNS[3])
        2. "TOTAL TUA: YY.YY" (TVA_PATTERNS[4]), reported with
           DEFAULT_TVA_PERCENT because the line carries no rate
        3. The coded/simple patterns (TVA_PATTERNS[:3])

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of TVA entries with code, percent, and amount
        """
        if not text:
            return []

        text_upper = text.upper()

        # Pattern 4: "XX.XX% TUA*A YY.YY" - common OCR format
        match = re.search(self.TVA_PATTERNS[3], text_upper)
        if match:
            try:
                percent = int(match.group(1))
                amount = self._parse_decimal(
                    self._single_line_amount(match.group(2))
                )
                if amount and amount > 0:
                    return [{
                        'code': 'A',
                        'percent': percent,
                        'amount': amount
                    }]
            except (ValueError, InvalidOperation, IndexError):
                pass

        # Pattern 5: "TOTAL TUA: YY.YY" - fallback to total TVA
        match = re.search(self.TVA_PATTERNS[4], text_upper)
        if match:
            try:
                amount = self._parse_decimal(
                    self._single_line_amount(match.group(1))
                )
                if amount and amount > 0:
                    # No rate is printed on this line, so the percent is the
                    # assumed default rather than a value read from the receipt.
                    return [{
                        'code': 'A',
                        'percent': self.DEFAULT_TVA_PERCENT,
                        'amount': amount
                    }]
            except (ValueError, InvalidOperation, IndexError):
                pass

        # Try coded patterns; the first match that parses to a positive amount wins.
        for pattern in self.TVA_PATTERNS[:3]:
            for match in re.finditer(pattern, text_upper, re.IGNORECASE):
                try:
                    groups = match.groups()
                    if len(groups) == 3:
                        # (code, percent, amount)
                        code = groups[0].upper()
                        percent = int(groups[1])
                        raw_amount = groups[2]
                    else:
                        # (percent, amount) - no code printed, assume 'A'
                        code = 'A'
                        percent = int(groups[0])
                        raw_amount = groups[1]

                    amount = self._parse_decimal(
                        self._single_line_amount(raw_amount)
                    )
                    if amount and amount > 0:
                        return [{
                            'code': code,
                            'percent': percent,
                            'amount': amount
                        }]
                except (ValueError, InvalidOperation, IndexError):
                    continue

        return []

    def extract_payment_methods(self, text: str) -> List[dict]:
        """
        Extract payment methods from receipt text.

        Handles NUMERAR (cash) as primary payment for this store. Every
        pattern in PAYMENT_PATTERNS is tried, so a receipt may yield both a
        NUMERAR and a CARD entry.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of payment methods with method, amount, and confidence
        """
        if not text:
            return []

        payments = []
        text_upper = text.upper()

        for pattern, method, confidence in self.PAYMENT_PATTERNS:
            match = re.search(pattern, text_upper, re.IGNORECASE)
            if not match:
                continue
            try:
                amount = self._parse_decimal(
                    self._single_line_amount(match.group(1))
                )
            except (ValueError, InvalidOperation):
                # Unparseable capture: skip this payment method.
                continue

            if amount and amount > 0:
                payments.append({
                    'method': method,
                    'amount': amount,
                    'confidence': confidence
                })

        return payments

    def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
        """
        Extract client CUI from receipt text.

        Handles "CIF CLIENT:1879855" format specific to this store. A leading
        "RO" prefix is dropped, including the OCR variant "R0", so the
        mis-read zero does not become part of the returned digits.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (cui, confidence) or (None, 0.0); cui contains digits only
        """
        if not text:
            return (None, 0.0)

        text_upper = text.upper()

        # Cheap early exit when the receipt shows no client marker at all
        has_client = any(
            re.search(marker, text_upper, re.IGNORECASE)
            for marker in self.CLIENT_MARKERS
        )
        if not has_client:
            return (None, 0.0)

        # Try client CUI patterns
        for pattern, confidence in self.CLIENT_CUI_PATTERNS:
            match = re.search(pattern, text_upper, re.IGNORECASE)
            if not match:
                continue

            # Strip the country prefix before collecting digits: in "R0 1879855"
            # the "0" is a mis-read "O", not the first digit of the CUI.
            without_prefix = re.sub(r'^R[O0]?', '', match.group(1))
            cui_digits = re.sub(r'[^0-9]', '', without_prefix)
            if 6 <= len(cui_digits) <= 10:
                return (cui_digits, confidence)

        return (None, 0.0)

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return UNLIMITED KEYS-specific validation hints."""
        return {
            "has_multi_rate_tva": False,
            "card_equals_total": False,  # May be NUMERAR (cash)
            "has_client_cui": True,  # May have client CUI
            "has_efactura": False,
            "is_non_vat_payer": False,
            "common_payment": "NUMERAR",  # Cash payments common
        }