# roa2web-service-auto/backend/modules/data_entry/services/ocr/profiles/base.py

"""
Base class for store-specific OCR extraction profiles.

Each store can have different receipt formats (TVA layout, total position, etc.).
Store profiles allow customizing extraction logic per-store for better accuracy.

Usage:
    from .base import BaseStoreProfile
    from . import ProfileRegistry

    @ProfileRegistry.register
    class LidlProfile(BaseStoreProfile):
        CUI_LIST = ["22891860"]
        NAME_PATTERNS = ["LIDL", "LDL"]

        def extract_tva_entries(self, text: str) -> List[dict]:
            # Custom Lidl TVA extraction logic
            ...
"""

import re
from abc import ABC
from decimal import Decimal, InvalidOperation
from typing import List, Optional, Tuple, Dict, Any
from datetime import date


class BaseStoreProfile(ABC):
    """
    Abstract base class for store-specific extraction profiles.

    Each profile defines:
    - CUI_LIST: CUI codes that identify this store (without RO prefix)
    - NAME_PATTERNS: OCR-tolerant name patterns for fallback matching
    - Custom extraction methods for TVA, total, date, etc.

    The ProfileRegistry uses CUI_LIST to lookup profiles during extraction.

    All regex tables below are matched by the extraction methods against the
    upper-cased receipt text, in list order, so earlier entries take priority
    over later ones regardless of the confidence value attached to them.

    Note: the class derives from ABC but declares no abstract methods in this
    file, so every extraction method has a generic default implementation.
    """

    # -------------------------------------------------------------------------
    # Class attributes - override in subclasses
    # -------------------------------------------------------------------------

    # List of CUI codes (without RO prefix) that identify this store
    CUI_LIST: List[str] = []

    # OCR-tolerant name patterns for fallback matching
    NAME_PATTERNS: List[str] = []

    # Store display name
    STORE_NAME: str = "Unknown Store"

    # -------------------------------------------------------------------------
    # Generic patterns - can be overridden in subclasses
    # -------------------------------------------------------------------------

    # Total amount patterns (confidence-weighted)
    # Each entry is (regex, confidence); group 1 captures the amount text.
    # NOTE(review): the "TOTAL..." entries are not anchored on a word boundary,
    # so they can also match inside "SUBTOTAL" before the dedicated SUBTOTAL
    # entry (listed later) is ever tried - confirm this is intended.
    TOTAL_PATTERNS = [
        (r'T[O0]TAL[.\s]+L[E3][I1!]\s*:?\s*([\d\s.,]+)', 0.98),
        (r'TOTAL\s+LEI\s*([\d\s.,]+)', 0.98),
        (r'[OT]?OTAL\s+LEI\s*([\d\s.,]+)', 0.95),
        (r'TOTAL\s*:?\s*([\d\s.,]+)\s*(?:RON|LEI)?', 0.95),
        (r'TOTAL\s+(?:RON|LEI)\s*([\d\s.,]+)', 0.95),
        (r'SUBTOTAL\s*([\d\s.,]+)', 0.90),
        (r'DE\s+PLATA\s*:?\s*([\d\s.,]+)', 0.90),
        (r'SUMA\s*:?\s*([\d\s.,]+)', 0.85),
    ]

    # Date patterns (confidence-weighted)
    # Each entry is (regex, confidence); group 1 captures the whole date string,
    # which is then handed to _parse_date.
    # NOTE(review): the second entry is subsumed by the first one, because
    # D[AR]TA already matches "DATA".
    DATE_PATTERNS = [
        (r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
        (r'DATA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
        (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+[O0]RA\s*:?\s*\d{2}:\d{2}', 0.95),
        (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+\d{2}:\d{2}', 0.90),
        (r'(\d{2}[-./]\d{2}[-./]\d{4})', 0.80),
        (r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75),
    ]

    # Date patterns with OCR-introduced spaces (separate because format is different)
    # Each entry is (regex, confidence, order); groups 1-3 capture the three
    # numeric fields and `order` ('ymd' or 'dmy') tells extract_date how to
    # assign them to year/month/day.
    DATE_PATTERNS_OCR_SPACES = [
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.92, 'ymd'),
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.85, 'ymd'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
    ]

    # Receipt number patterns (confidence-weighted)
    # Each entry is (regex, confidence); group 1 captures the digits of the number.
    NUMBER_PATTERNS = [
        (r'NDS\s*:?\s*(\d+)', 0.98),
        (r'C3POS[-A-Z0-9]*[N:](\d{6,7})', 0.98),
        (r'C3POS.*?(\d{6,7})\b', 0.95),
        (r'BF\s*:\s*(\d{4,})', 0.96),
        (r'BF\s+(\d{4,})', 0.93),
        (r'NIVS\s*:?\s*(\d+)', 0.95),
        (r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95),
        (r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95),
        (r'CHITANTA\s+NR\.?\s*:?\s*(\d+)', 0.95),
        (r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90),
        (r'ID\s*BF\s*:?\s*(\d+)', 0.90),
    ]

    # Payment method patterns (pattern, method_type, confidence)
    # Group 1 captures the paid amount. The last three entries (RD / ARD /
    # MERAR) presumably catch OCR-truncated "CARD" / "NUMERAR" labels, hence
    # their lower confidence - TODO confirm.
    PAYMENT_PATTERNS = [
        (r'CARTE\s+CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.98),
        (r'CARTE\s+CREDIT\s*:?\s*\n\s*([\d\s.,]+)', 'CARD', 0.97),
        (r'(?:PLATA\s+)?CARD\s*[:\sA-Z]?\s*([\d\s.,]+)', 'CARD', 0.95),
        (r'NUMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.95),
        (r'CASH\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.90),
        (r'(?:^|\n|\s)RD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.70),
        (r'(?:^|\n|\s)ARD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.75),
        (r'(?:^|\n|\s)MERAR\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'NUMERAR', 0.70),
    ]

    # Client section markers (for B2B receipts) - More flexible patterns
    # Plain regex strings (no confidence); a match of any of them means the
    # receipt has a client section.
    CLIENT_MARKERS = [
        r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT',    # "CIF CLIENT" (with or without colon)
        r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT',    # "CUI CLIENT"
        r'CLIENT\s+C\.?\s*[UI1]\.?\s*[IF1]',  # "CLIENT CIF" / "CLIENT CUI"
        r'CLIENT\s*:',                          # "CLIENT:"
        r'CUMPARATOR\s*:',                      # "CUMPARATOR:"
        r'BENEFICIAR\s*:',                      # "BENEFICIAR:"
        r'CUMP[AĂ]R[AĂ]TOR',                   # "CUMPARATOR" without colon
        r'COD\s+FISCAL\s+CLIENT',              # "COD FISCAL CLIENT"
    ]

    # Client CUI patterns (pattern, confidence) - More flexible
    # Group 1 captures the CUI including an optional RO/R0 prefix.
    # NOTE(review): the generic "CLIENT: ..." entry (0.90) is listed before the
    # "COD FISCAL CLIENT" entry (0.95) and matches the same text, so the latter
    # appears to be shadowed in practice - verify the intended order.
    CLIENT_CUI_PATTERNS = [
        # "CIF CLIENT:XXXXXXX" or "CIF CLIENT: ROXXXXXXX" - most common format
        (r'C\.?[I1]\.?F\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.99),
        # "CLIENT CIF: XXXXXXX"
        (r'CLIENT\s+C\.?[I1]\.?F\.?\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
        # "CUI CLIENT: XXXXXXX"
        (r'C\.?U\.?[I1]\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
        # "ROXXXXXXX" followed by CLIENT marker
        (r'(R[O0]\d{6,10})\s*\n\s*CLIENT', 0.97),
        # "C.I.F. CLIENT: XXXXXXX"
        (r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.96),
        # "CLIENT: ROXXXXXXX" or "CLIENT: XXXXXXX"
        (r'CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.90),
        # "COD FISCAL CLIENT: XXXXXXX"
        (r'COD\s+FISCAL\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.95),
    ]

    # Company type indicators (for identifying company names)
    # Plain regex strings searched in the upper-cased candidate line.
    COMPANY_INDICATORS = [
        r'\bS\.?\s*R\.?\s*L\.?\b',      # S.R.L. or S. R. L.
        r'\bS\.?\s*A\.?\b',              # S.A. or S. A.
        r'\bS\.?\s*N\.?\s*C\.?\b',      # S.N.C. or S. N. C.
        r'\bS\.?\s*C\.?\s*S\.?\b',      # S.C.S. or S. C. S.
        r'\bI\.?\s*I\.?\b',              # I.I. or I. I.
        r'\bP\.?\s*F\.?\s*A\.?\b',      # P.F.A. or P. F. A.
        r'\bS\.?\s*C\.?\s+[A-Z]',       # S.C. followed by company name
        r'HOLDING',
        r'COMPANY',
        r'GROUP',
    ]

    # Maximum reasonable payment amount (to filter OCR errors)
    # Used as an exclusive upper bound by extract_total and
    # extract_payment_methods.
    MAX_PAYMENT = Decimal('100000')

    # -------------------------------------------------------------------------
    # Extraction methods - override in subclasses as needed
    # -------------------------------------------------------------------------

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Return the TVA entries found in the receipt text.

        The generic profile does not recognise any TVA layout and therefore
        reports no entries; subclasses override this method to handle their
        store-specific TVA format.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of dicts with keys: code, percent, amount (always empty here)
        """
        no_entries: List[dict] = []
        return no_entries

    def extract_total(self, text: str) -> Tuple[Optional[Decimal], float]:
        """
        Extract total amount from receipt text.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (amount, confidence) or (None, 0.0)
        """
        text_upper = text.upper()

        for pattern, confidence in self.TOTAL_PATTERNS:
            match = re.search(pattern, text_upper)
            if match:
                amount = self._parse_decimal(match.group(1))
                if amount and amount > 0 and amount < self.MAX_PAYMENT:
                    return (amount, confidence)

        return (None, 0.0)

    def extract_date(self, text: str) -> Tuple[Optional[date], float]:
        """
        Extract receipt date from text.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (date, confidence) or (None, 0.0)
        """
        text_upper = text.upper()

        # Try standard patterns first
        for pattern, confidence in self.DATE_PATTERNS:
            match = re.search(pattern, text_upper)
            if match:
                parsed = self._parse_date(match.group(1))
                if parsed:
                    return (parsed, confidence)

        # Try OCR-corrupted patterns with spaces
        for pattern, confidence, fmt in self.DATE_PATTERNS_OCR_SPACES:
            match = re.search(pattern, text_upper)
            if match:
                try:
                    if fmt == 'ymd':
                        year, month, day = int(match.group(1)), int(match.group(2)), int(match.group(3))
                    else:  # dmy
                        day, month, year = int(match.group(1)), int(match.group(2)), int(match.group(3))

                    if 1 <= day <= 31 and 1 <= month <= 12 and 2000 <= year <= 2100:
                        return (date(year, month, day), confidence)
                except (ValueError, TypeError):
                    continue

        return (None, 0.0)

    def extract_receipt_number(self, text: str) -> Tuple[Optional[str], float]:
        """
        Find the receipt number in the OCR text.

        NUMBER_PATTERNS are tried in list order against the upper-cased text;
        only the first occurrence of each pattern is considered, and a
        candidate shorter than three characters is rejected.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (number, confidence) or (None, 0.0)
        """
        haystack = text.upper()

        for regex, score in self.NUMBER_PATTERNS:
            found = re.search(regex, haystack)
            if found is None:
                continue

            candidate = found.group(1).strip()
            if len(candidate) < 3:
                # Too short to be a plausible receipt number
                continue

            return (candidate, score)

        return (None, 0.0)

    def extract_payment_methods(self, text: str) -> List[dict]:
        """
        Extract payment methods (CARD/NUMERAR) from receipt.

        Supports multiple payments of the same type (e.g., 2x CARD for split payments).
        Each payment is returned as a separate entry with its amount.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of dicts: [{'method': 'CARD'/'NUMERAR', 'amount': Decimal, 'confidence': float}]
            Multiple entries of same method type are allowed for split payments.
        """
        text_upper = text.upper()
        methods = []
        # Track (method, amount) pairs to avoid exact duplicates from overlapping patterns
        seen_entries = set()

        for pattern, method, confidence in self.PAYMENT_PATTERNS:
            for match in re.finditer(pattern, text_upper):
                try:
                    amount = self._parse_decimal(match.group(1))
                    if amount and amount > 0 and amount < self.MAX_PAYMENT:
                        # Deduplicate by (method, amount) to avoid same entry from multiple patterns
                        # But allow different amounts for same method (split payments)
                        entry_key = (method, amount)
                        if entry_key not in seen_entries:
                            methods.append({
                                'method': method,
                                'amount': amount,
                                'confidence': confidence
                            })
                            seen_entries.add(entry_key)
                except (ValueError, InvalidOperation):
                    continue

        return methods

    def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
        """
        Extract client CUI from B2B receipts.

        The text must contain at least one CLIENT_MARKERS match; then
        CLIENT_CUI_PATTERNS are tried in list order and the first capture that
        yields 6 to 10 digits is returned without its country prefix.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (cui, confidence) or (None, 0.0)
        """
        text_upper = text.upper()

        # First check if there's a CLIENT section
        has_client_section = any(
            re.search(marker, text_upper, re.IGNORECASE)
            for marker in self.CLIENT_MARKERS
        )

        if not has_client_section:
            return (None, 0.0)

        # Try to extract CUI
        for pattern, confidence in self.CLIENT_CUI_PATTERNS:
            match = re.search(pattern, text_upper, re.IGNORECASE | re.MULTILINE)
            if not match:
                continue

            # Drop the country prefix first. The patterns accept "R0" as an
            # OCR misread of "RO", and that zero must not end up as a leading
            # digit of the stored CUI.
            without_prefix = re.sub(r'^\s*R[O0]', '', match.group(1))
            cui_digits = re.sub(r'[^0-9]', '', without_prefix)
            if 6 <= len(cui_digits) <= 10:
                return (cui_digits, confidence)

        return (None, 0.0)

    def extract_client_name(self, text: str) -> Tuple[Optional[str], float]:
        """
        Extract client/buyer company name from B2B receipts.

        The first line matching one of CLIENT_MARKERS opens the client
        section. The name is then looked for, in this order:

        1. on the marker line itself, after the first ":";
        2. on the line right after the marker line;
        3. on any of the following lines (up to four lines after the marker)
           that contains a company indicator.

        Values that look like a CUI (optional RO/R0 prefix, optional spacing,
        6-10 digits) are never returned as a name.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (client_name, confidence) or (None, 0.0)
        """
        lines = text.split('\n')

        # A bare CUI value: "RO12345678", "RO 12345678", "R012345678", "12345678"
        cui_value = re.compile(r'^(?:R[O0]?)?\s*\d{6,10}$')
        # A CUI/CIF label anywhere in the line
        cui_label = re.compile(r'C\.?\s*[UI]\.?\s*[IF]\.?')

        def has_company_indicator(candidate_upper: str) -> bool:
            return any(re.search(ind, candidate_upper) for ind in self.COMPANY_INDICATORS)

        # First check if there's a CLIENT section
        client_section_idx = None
        for i, line in enumerate(lines):
            line_upper = line.upper().strip()
            if any(re.search(marker, line_upper, re.IGNORECASE) for marker in self.CLIENT_MARKERS):
                client_section_idx = i
                break

        if client_section_idx is None:
            return (None, 0.0)

        # Strategy 1: Check if name is on same line after ":"
        marker_line = lines[client_section_idx].strip()
        if ':' in marker_line:
            name_part = marker_line.split(':', 1)[1].strip()
            name_upper = name_part.upper()
            # A CUI after the colon is not a name - fall through to the next strategy
            if len(name_part) >= 3 and not cui_value.match(name_upper):
                if has_company_indicator(name_upper):
                    return (self._clean_company_name(name_part), 0.95)
                if len(name_part) >= 5 and not name_part.isdigit():
                    return (self._clean_company_name(name_part), 0.80)

        # Strategy 2: Check next line for company name
        if client_section_idx + 1 < len(lines):
            next_line = lines[client_section_idx + 1].strip()
            next_upper = next_line.upper()

            # Skip if it's a CUI/CIF line or looks like CUI
            if not cui_label.search(next_upper) and not cui_value.match(next_upper):
                if has_company_indicator(next_upper):
                    return (self._clean_company_name(next_line), 0.90)
                if len(next_line) >= 5 and not next_line.isdigit():
                    # Check it's not CUI/CIF/COD keywords
                    if not any(kw in next_upper for kw in ('CUI', 'CIF', 'COD', 'FISCAL')):
                        return (self._clean_company_name(next_line), 0.75)

        # Strategy 3: Look for any line with company indicators in CLIENT section region
        search_end = min(client_section_idx + 5, len(lines))
        for i in range(client_section_idx + 1, search_end):
            candidate = lines[i].strip()
            candidate_upper = candidate.upper()

            # Skip CUI/CIF lines
            if cui_label.search(candidate_upper) or cui_value.match(candidate_upper):
                continue

            if has_company_indicator(candidate_upper):
                return (self._clean_company_name(candidate), 0.85)

        return (None, 0.0)

    @staticmethod
    def _clean_company_name(name: str) -> str:
        """Clean company name for storage."""
        if not name:
            return ""
        # Remove extra whitespace
        name = re.sub(r'\s+', ' ', name).strip()
        # Remove trailing punctuation except periods in S.R.L., S.A., etc.
        name = re.sub(r'[,;:]+$', '', name).strip()
        return name

    # -------------------------------------------------------------------------
    # Validation hints - override to customize validation behavior
    # -------------------------------------------------------------------------

    def get_validation_hints(self) -> Dict[str, Any]:
        """
        Describe store-specific validation behavior.

        The generic profile supplies no hints; subclasses override this
        method to return the ones that apply to their store.

        Returns:
            Dict with validation hints. Common keys:
            - has_multi_rate_tva: bool - Store uses multiple TVA rates
            - card_equals_total: bool - CARD payment equals total
            - has_client_cui: bool - Receipt includes client CUI
            - has_efactura: bool - Store uses e-factura format
            - is_non_vat_payer: bool - Store is not a VAT payer
        """
        hints: Dict[str, Any] = {}
        return hints

    # -------------------------------------------------------------------------
    # Helper methods - available to all subclasses
    # -------------------------------------------------------------------------

    @staticmethod
    def _normalize_number(text: str) -> str:
        """
        Normalize a number string for Decimal conversion.

        Handles Romanian formats: "1.234,56" -> "1234.56"
        """
        if not text:
            return "0"

        # Remove spaces
        text = text.replace(" ", "")

        # Determine decimal separator
        last_comma = text.rfind(",")
        last_dot = text.rfind(".")

        if last_comma > last_dot:
            text = text.replace(".", "").replace(",", ".")
        elif last_dot > last_comma:
            text = text.replace(",", "")
        else:
            text = text.replace(",", ".")

        return text

    @staticmethod
    def _parse_decimal(text: str) -> Optional[Decimal]:
        """Parse a string to Decimal, handling various formats."""
        try:
            normalized = BaseStoreProfile._normalize_number(text)
            return Decimal(normalized)
        except (InvalidOperation, ValueError, TypeError):
            return None

    @staticmethod
    def _parse_date(text: str) -> Optional[date]:
        """
        Parse date string in various formats.

        Supports: DD-MM-YYYY, DD/MM/YYYY, DD.MM.YYYY, YYYY-MM-DD
        """
        if not text:
            return None

        # Normalize separators
        text = text.replace('/', '-').replace('.', '-')

        try:
            parts = text.split('-')
            if len(parts) != 3:
                return None

            # Determine format based on first part length
            if len(parts[0]) == 4:
                # YYYY-MM-DD
                year, month, day = int(parts[0]), int(parts[1]), int(parts[2])
            else:
                # DD-MM-YYYY
                day, month, year = int(parts[0]), int(parts[1]), int(parts[2])

            # Validate ranges
            if 1 <= day <= 31 and 1 <= month <= 12 and 2000 <= year <= 2100:
                return date(year, month, day)
        except (ValueError, TypeError, IndexError):
            pass

        return None

    @staticmethod
    def _clean_text(text: str) -> str:
        """Clean OCR text for pattern matching."""
        if not text:
            return ""
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[\x00-\x09\x0b\x0c\x0e-\x1f\x7f]', '', text)
        return text.strip()

    # -------------------------------------------------------------------------
    # Magic methods
    # -------------------------------------------------------------------------

    def __repr__(self) -> str:
        """Return a debug representation: the concrete class name and its CUI list."""
        profile_cls = type(self).__name__
        return "<{} CUI={}>".format(profile_cls, self.CUI_LIST)

    def __str__(self) -> str:
        """Return the store display name followed by its comma-separated CUI codes."""
        cui_codes = ', '.join(self.CUI_LIST)
        return self.STORE_NAME + " (" + cui_codes + ")"