feat(ocr): Add modular store profiles with hot-reload support

## Store Profiles System

- Add ProfileRegistry for CUI-based profile lookup
- Add BaseStoreProfile with generic extraction patterns
- Implement hot-reload via POST /api/data-entry/ocr/profiles/reload

## 12 Store Profiles

- LIDL: Multi-rate TVA (A, B, C, D codes)
- OMV, SOCAR: B2B with client CUI, YYYY.MM.DD dates
- BRICK, DEDEMAN: Standard TVA, e-factura support
- KINETERRA, BEST PRINT: Non-VAT payers (returns [])
- STEPOUT MARKET: TVA 5% (books/reduced rate)
- UNLIMITED KEYS: NUMERAR payment detection
- GAMA INK, ELECTROBERING, PICTUS VELUM: Standard TVA

## Flexible TVA Patterns

- All patterns use (\d{1,2})% to accept any rate
- Supports historical (19%, 9%, 5%) and current (21%, 11%)

## Payment Methods Fix

- Fixed base.py to support multiple payments of same type
- Changed deduplication from method-only to (method, amount) tuple
- Returns separate entries for split payments

## Tools

- Add generate_store_profile.py for automatic profile generation
- Analyzes PDFs via OCR API and detects patterns

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-06 23:07:07 +00:00
parent 67b0082df0
commit 099556213d
25 changed files with 3707 additions and 114 deletions
--- a/backend/modules/data_entry/services/ocr/profiles/base.py
+++ b/backend/modules/data_entry/services/ocr/profiles/base.py
@@ -0,0 +1,515 @@
+"""
+Base class for store-specific OCR extraction profiles.
+
+Each store can have different receipt formats (TVA layout, total position, etc.).
+Store profiles allow customizing extraction logic per-store for better accuracy.
+
+Usage:
+    from .base import BaseStoreProfile
+    from . import ProfileRegistry
+
+    @ProfileRegistry.register
+    class LidlProfile(BaseStoreProfile):
+        CUI_LIST = ["22891860"]
+        NAME_PATTERNS = ["LIDL", "LDL"]
+
+        def extract_tva_entries(self, text: str) -> List[dict]:
+            # Custom Lidl TVA extraction logic
+            ...
+"""
+
+import re
+from abc import ABC
+from decimal import Decimal, InvalidOperation
+from typing import List, Optional, Tuple, Dict, Any
+from datetime import date
+
+
+class BaseStoreProfile(ABC):
+    """
+    Abstract base class for store-specific extraction profiles.
+
+    Each profile defines:
+    - CUI_LIST: CUI codes that identify this store (without RO prefix)
+    - NAME_PATTERNS: OCR-tolerant name patterns for fallback matching
+    - Custom extraction methods for TVA, total, date, etc.
+
+    The ProfileRegistry uses CUI_LIST to lookup profiles during extraction.
+    """
+
+    # -------------------------------------------------------------------------
+    # Class attributes - override in subclasses
+    # -------------------------------------------------------------------------
+
+    # CUI codes (digits only, without the RO prefix) that identify this store.
+    # Shown by __repr__ and joined into the __str__ output.
+    # NOTE: this default is one class-level list object shared by every
+    # subclass that does not override it - assign a new list in the subclass
+    # rather than appending to this one.
+    CUI_LIST: List[str] = []
+
+    # OCR-tolerant name patterns for fallback matching. Not read anywhere in
+    # this class; presumably consumed by the profile registry - verify there.
+    # The same shared-default caveat as CUI_LIST applies.
+    NAME_PATTERNS: List[str] = []
+
+    # Human-readable store name, used by __str__.
+    STORE_NAME: str = "Unknown Store"
+
+    # -------------------------------------------------------------------------
+    # Generic patterns - can be overridden in subclasses
+    # -------------------------------------------------------------------------
+
+    # Total amount patterns as (regex, confidence) pairs. extract_total() runs
+    # them against the upper-cased text in listed order and returns the first
+    # whose capture group parses to an amount strictly between 0 and
+    # MAX_PAYMENT, so position in this list acts as a priority.
+    # Classes such as [O0], [E3] and [I1!] accept characters that resemble the
+    # intended letter (presumably common OCR misreads).
+    # NOTE(review): the patterns are not anchored, so every TOTAL... entry also
+    # matches the tail of "SUBTOTAL"; a SUBTOTAL line that precedes the real
+    # total can therefore be returned at 0.95 before the dedicated SUBTOTAL
+    # entry (0.90) is ever consulted - confirm this is acceptable.
+    TOTAL_PATTERNS = [
+        (r'T[O0]TAL[.\s]+L[E3][I1!]\s*:?\s*([\d\s.,]+)', 0.98),
+        (r'TOTAL\s+LEI\s*([\d\s.,]+)', 0.98),
+        (r'[OT]?OTAL\s+LEI\s*([\d\s.,]+)', 0.95),
+        (r'TOTAL\s*:?\s*([\d\s.,]+)\s*(?:RON|LEI)?', 0.95),
+        (r'TOTAL\s+(?:RON|LEI)\s*([\d\s.,]+)', 0.95),
+        (r'SUBTOTAL\s*([\d\s.,]+)', 0.90),
+        (r'DE\s+PLATA\s*:?\s*([\d\s.,]+)', 0.90),
+        (r'SUMA\s*:?\s*([\d\s.,]+)', 0.85),
+    ]
+
+    # Date patterns as (regex, confidence) pairs, tried in listed order by
+    # extract_date(). Group 1 is the whole date string, which is handed to
+    # _parse_date(). All entries but the last expect day-month-year with a
+    # four-digit year; the last expects year-month-day.
+    # NOTE(review): D[AR]TA already matches "DATA", so the second entry can
+    # never match where the first did not.
+    DATE_PATTERNS = [
+        (r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
+        (r'DATA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
+        (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+[O0]RA\s*:?\s*\d{2}:\d{2}', 0.95),
+        (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+\d{2}:\d{2}', 0.90),
+        (r'(\d{2}[-./]\d{2}[-./]\d{4})', 0.80),
+        (r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75),
+    ]
+
+    # Fallback date patterns as (regex, confidence, layout) triples for dates
+    # whose components are separated by "." or "," optionally followed by
+    # whitespace. Year, month and day are captured as three separate groups;
+    # layout ('ymd' or 'dmy') tells extract_date() which group is which.
+    # Only consulted when no DATE_PATTERNS entry yields a valid date.
+    DATE_PATTERNS_OCR_SPACES = [
+        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.92, 'ymd'),
+        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.85, 'ymd'),
+        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
+        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
+    ]
+
+    # Receipt number patterns as (regex, confidence) pairs, tried in listed
+    # order by extract_receipt_number(); group 1 captures the digits. A capture
+    # shorter than 3 characters is rejected there and the next pattern is tried.
+    NUMBER_PATTERNS = [
+        (r'NDS\s*:?\s*(\d+)', 0.98),
+        (r'C3POS[-A-Z0-9]*[N:](\d{6,7})', 0.98),
+        (r'C3POS.*?(\d{6,7})\b', 0.95),
+        (r'BF\s*:\s*(\d{4,})', 0.96),
+        (r'BF\s+(\d{4,})', 0.93),
+        (r'NIVS\s*:?\s*(\d+)', 0.95),
+        (r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95),
+        (r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95),
+        (r'CHITANTA\s+NR\.?\s*:?\s*(\d+)', 0.95),
+        (r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90),
+        (r'ID\s*BF\s*:?\s*(\d+)', 0.90),
+    ]
+
+    # Payment patterns as (regex, method, confidence) triples; group 1 captures
+    # the amount. extract_payment_methods() collects every match of every
+    # pattern and keeps the first entry per (method, amount), so when two
+    # patterns hit the same payment the earlier entry's confidence is reported.
+    # The last three entries match the bare fragments "RD", "ARD" and "MERAR"
+    # after start-of-text, a newline or whitespace - presumably CARD / NUMERAR
+    # with leading characters lost by OCR (hence the lower confidence); verify
+    # against sample receipts.
+    PAYMENT_PATTERNS = [
+        (r'CARTE\s+CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.98),
+        (r'CARTE\s+CREDIT\s*:?\s*\n\s*([\d\s.,]+)', 'CARD', 0.97),
+        (r'(?:PLATA\s+)?CARD\s*[:\sA-Z]?\s*([\d\s.,]+)', 'CARD', 0.95),
+        (r'NUMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.95),
+        (r'CASH\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.90),
+        (r'(?:^|\n|\s)RD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.70),
+        (r'(?:^|\n|\s)ARD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.75),
+        (r'(?:^|\n|\s)MERAR\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'NUMERAR', 0.70),
+    ]
+
+    # Regexes that signal a client (buyer) section on B2B receipts.
+    # extract_client_cui() and extract_client_name() return (None, 0.0)
+    # immediately when none of them matches. Every entry requires a ":".
+    CLIENT_MARKERS = [
+        r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:',
+        r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:',
+        r'CLIENT\s+C\.?\s*[UI1]\.?\s*[IF1]\.?\s*:',
+        r'CLIENT\s*:',
+        r'CUMPARATOR\s*:',
+        r'BENEFICIAR\s*:',
+    ]
+
+    # Client CUI patterns as (regex, confidence) pairs, tried in listed order by
+    # extract_client_cui(). Group 1 captures the CUI including its R/RO prefix
+    # ("0" is accepted in place of "O"). The first two entries cover layouts
+    # where the CUI is printed on the line above the CLIENT label.
+    CLIENT_CUI_PATTERNS = [
+        (r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*U\.?\s*[I1]\.?', 0.99),
+        (r'(R[O0]\d{6,10})\s*:?\s*\n\s*CLIENT', 0.98),
+        (r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]\d{6,10})', 0.98),
+        (r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
+        (r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
+        (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.95),
+        (r'CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.90),
+    ]
+
+    # Company type indicators. extract_client_name() searches these (unanchored)
+    # in upper-cased candidate lines; a hit yields a higher confidence than a
+    # candidate accepted on length alone.
+    COMPANY_INDICATORS = [
+        r'\bS\.?\s*R\.?\s*L\.?\b',      # S.R.L. or S. R. L.
+        r'\bS\.?\s*A\.?\b',              # S.A. or S. A.
+        r'\bS\.?\s*N\.?\s*C\.?\b',      # S.N.C. or S. N. C.
+        r'\bS\.?\s*C\.?\s*S\.?\b',      # S.C.S. or S. C. S.
+        r'\bI\.?\s*I\.?\b',              # I.I. or I. I.
+        r'\bP\.?\s*F\.?\s*A\.?\b',      # P.F.A. or P. F. A.
+        r'\bS\.?\s*C\.?\s+[A-Z]',       # S.C. followed by company name
+        r'HOLDING',                      # plain substring match
+        r'COMPANY',                      # plain substring match
+        r'GROUP',                        # plain substring match
+    ]
+
+    # Exclusive upper bound for amounts accepted by extract_total() and
+    # extract_payment_methods(); anything at or above it is discarded
+    # (intended to filter OCR errors).
+    MAX_PAYMENT = Decimal('100000')
+
+    # -------------------------------------------------------------------------
+    # Extraction methods - override in subclasses as needed
+    # -------------------------------------------------------------------------
+
+    def extract_tva_entries(self, text: str) -> List[dict]:
+        """
+        Return the TVA (VAT) entries found on the receipt.
+
+        The base implementation knows no TVA layout and always reports nothing;
+        store profiles override it with their own parsing.
+
+        Args:
+            text: Raw OCR text from receipt
+
+        Returns:
+            List of dicts with keys: code, percent, amount (empty here)
+        """
+        return list()
+
+    def extract_total(self, text: str) -> Tuple[Optional[Decimal], float]:
+        """
+        Find the receipt total.
+
+        TOTAL_PATTERNS are applied to the upper-cased text in listed order.
+        Only the first match of each pattern is considered; the first one whose
+        capture parses to an amount strictly between 0 and MAX_PAYMENT wins.
+
+        Args:
+            text: Raw OCR text from receipt
+
+        Returns:
+            Tuple of (amount, confidence) or (None, 0.0)
+        """
+        haystack = text.upper()
+
+        for regex, score in self.TOTAL_PATTERNS:
+            hit = re.search(regex, haystack)
+            if hit is None:
+                continue
+
+            value = self._parse_decimal(hit.group(1))
+            if not value:
+                # Unparsable capture or a zero amount - try the next pattern.
+                continue
+            if 0 < value < self.MAX_PAYMENT:
+                return (value, score)
+
+        return (None, 0.0)
+
+    def extract_date(self, text: str) -> Tuple[Optional[date], float]:
+        """
+        Find the receipt date.
+
+        Two passes over the upper-cased text: DATE_PATTERNS first (whole date in
+        one group, parsed by _parse_date), then DATE_PATTERNS_OCR_SPACES (year,
+        month and day in separate groups). The first valid date is returned.
+
+        Args:
+            text: Raw OCR text from receipt
+
+        Returns:
+            Tuple of (date, confidence) or (None, 0.0)
+        """
+        haystack = text.upper()
+
+        # Pass 1: compact dates such as 05.01.2026 or 2026-01-05.
+        for regex, score in self.DATE_PATTERNS:
+            hit = re.search(regex, haystack)
+            if hit is None:
+                continue
+            candidate = self._parse_date(hit.group(1))
+            if candidate:
+                return (candidate, score)
+
+        # Pass 2: dates whose separators are followed by stray whitespace.
+        for regex, score, layout in self.DATE_PATTERNS_OCR_SPACES:
+            hit = re.search(regex, haystack)
+            if hit is None:
+                continue
+            try:
+                first, second, third = (int(hit.group(idx)) for idx in (1, 2, 3))
+                if layout == 'ymd':
+                    year, month, day = first, second, third
+                else:
+                    # Any other layout tag is read as day-month-year.
+                    day, month, year = first, second, third
+
+                in_range = 1 <= day <= 31 and 1 <= month <= 12 and 2000 <= year <= 2100
+                if in_range:
+                    # date() still rejects impossible days such as 31 February.
+                    return (date(year, month, day), score)
+            except (ValueError, TypeError):
+                continue
+
+        return (None, 0.0)
+
+    def extract_receipt_number(self, text: str) -> Tuple[Optional[str], float]:
+        """
+        Find the receipt number.
+
+        NUMBER_PATTERNS are applied to the upper-cased text in listed order;
+        the first capture of at least 3 characters is returned as a string.
+
+        Args:
+            text: Raw OCR text from receipt
+
+        Returns:
+            Tuple of (number, confidence) or (None, 0.0)
+        """
+        haystack = text.upper()
+
+        for regex, score in self.NUMBER_PATTERNS:
+            hit = re.search(regex, haystack)
+            if hit is None:
+                continue
+            candidate = hit.group(1).strip()
+            if len(candidate) < 3:
+                # Too short to be a plausible receipt number.
+                continue
+            return (candidate, score)
+
+        return (None, 0.0)
+
+    def extract_payment_methods(self, text: str) -> List[dict]:
+        """
+        Collect the payments (CARD / NUMERAR) listed on the receipt.
+
+        Every match of every PAYMENT_PATTERNS entry is examined. A payment is
+        kept when its amount lies strictly between 0 and MAX_PAYMENT and the
+        same (method, amount) pair has not been recorded yet, so split payments
+        with different amounts survive while the same line caught by two
+        overlapping patterns is reported once.
+
+        Args:
+            text: Raw OCR text from receipt
+
+        Returns:
+            List of dicts: [{'method': 'CARD'/'NUMERAR', 'amount': Decimal, 'confidence': float}]
+            in discovery order (pattern order, then position in the text).
+        """
+        haystack = text.upper()
+        payments = []
+        recorded = set()  # (method, amount) pairs already emitted
+
+        for regex, kind, score in self.PAYMENT_PATTERNS:
+            for hit in re.finditer(regex, haystack):
+                try:
+                    value = self._parse_decimal(hit.group(1))
+                    if not value:
+                        continue
+                    if not (0 < value < self.MAX_PAYMENT):
+                        continue
+
+                    key = (kind, value)
+                    if key in recorded:
+                        continue
+                    recorded.add(key)
+                    payments.append({
+                        'method': kind,
+                        'amount': value,
+                        'confidence': score
+                    })
+                except (ValueError, InvalidOperation):
+                    # e.g. comparing a non-finite Decimal - skip this match.
+                    continue
+
+        return payments
+
+    def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
+        """
+        Extract the client (buyer) CUI from a B2B receipt.
+
+        The text is upper-cased and checked against CLIENT_MARKERS; without a
+        client marker no CUI is reported. CLIENT_CUI_PATTERNS are then tried in
+        listed order and the first capture that normalizes to 6-10 digits wins.
+
+        Args:
+            text: Raw OCR text from receipt
+
+        Returns:
+            Tuple of (cui, confidence), where cui contains digits only (the
+            R/RO prefix is removed), or (None, 0.0) when nothing usable is found.
+        """
+        text_upper = text.upper()
+
+        # A client CUI is only meaningful inside a client section.
+        has_client_section = any(
+            re.search(marker, text_upper, re.IGNORECASE)
+            for marker in self.CLIENT_MARKERS
+        )
+        if not has_client_section:
+            return (None, 0.0)
+
+        for pattern, confidence in self.CLIENT_CUI_PATTERNS:
+            match = re.search(pattern, text_upper, re.IGNORECASE | re.MULTILINE)
+            if not match:
+                continue
+
+            # Remove the country prefix before keeping the digits. The patterns
+            # accept "R0" as an OCR misread of "RO"; stripping only non-digits
+            # would keep that zero and return "012345678" for "R012345678".
+            # A zero directly after "R" is therefore read as the letter O.
+            without_prefix = re.sub(r'^R[O0]?', '', match.group(1))
+            cui_digits = re.sub(r'[^0-9]', '', without_prefix)
+            if 6 <= len(cui_digits) <= 10:
+                return (cui_digits, confidence)
+
+        return (None, 0.0)
+
+    def extract_client_name(self, text: str) -> Tuple[Optional[str], float]:
+        """
+        Extract client/buyer company name from B2B receipts.
+
+        The first line matching any CLIENT_MARKERS regex anchors the search.
+        Three strategies are then tried in order, each falling through to the
+        next when it does not return:
+
+        1. Text after the first ":" on the marker line (0.95 with a company
+           indicator, 0.80 when merely 5+ characters and not all digits).
+        2. The line right after the marker line (0.90 / 0.75 on the same basis).
+        3. Any of the up to four lines after the marker line that contains a
+           company indicator (0.85).
+
+        Args:
+            text: Raw OCR text from receipt
+
+        Returns:
+            Tuple of (client_name, confidence) or (None, 0.0). The name is
+            taken from the original-case line and passed through
+            _clean_company_name.
+        """
+        # NOTE(review): text_upper is never read below; lines are upper-cased
+        # individually instead.
+        text_upper = text.upper()
+        lines = text.split('\n')
+
+        # Locate the first line that carries a client marker.
+        client_section_idx = None
+        for i, line in enumerate(lines):
+            line_upper = line.upper().strip()
+            if any(re.search(marker, line_upper, re.IGNORECASE) for marker in self.CLIENT_MARKERS):
+                client_section_idx = i
+                break
+
+        if client_section_idx is None:
+            return (None, 0.0)
+
+        # Look for company name in CLIENT section
+        line = lines[client_section_idx].strip()
+        # NOTE(review): this line_upper value is not read before being
+        # reassigned in Strategy 3.
+        line_upper = line.upper()
+
+        # Strategy 1: Check if name is on same line after ":"
+        if ':' in line:
+            # Only the first ":" splits, so later colons stay in the candidate.
+            name_part = line.split(':', 1)[1].strip()
+            if name_part and len(name_part) >= 3:
+                # Skip if it looks like a CUI (RO followed by digits)
+                if re.match(r'^R[O0]?\d{6,10}$', name_part.upper()):
+                    pass  # This is CUI, not name - continue to next strategy
+                else:
+                    # Check for company indicators
+                    name_upper = name_part.upper()
+                    if any(re.search(ind, name_upper) for ind in self.COMPANY_INDICATORS):
+                        return (self._clean_company_name(name_part), 0.95)
+                    elif len(name_part) >= 5 and not name_part.isdigit():
+                        return (self._clean_company_name(name_part), 0.80)
+
+        # Strategy 2: Check next line for company name
+        if client_section_idx + 1 < len(lines):
+            next_line = lines[client_section_idx + 1].strip()
+            next_upper = next_line.upper()
+
+            # Skip if it's a CUI/CIF line or looks like CUI.
+            # NOTE(review): this label regex is unanchored, so it also fires on
+            # names that merely contain such a letter run (e.g. "PACIFIC").
+            if not re.search(r'C\.?\s*[UI]\.?\s*[IF]\.?', next_upper):
+                if not re.match(r'^R[O0]?\d{6,10}$', next_upper):
+                    if any(re.search(ind, next_upper) for ind in self.COMPANY_INDICATORS):
+                        return (self._clean_company_name(next_line), 0.90)
+                    elif len(next_line) >= 5 and not next_line.isdigit():
+                        # Check it's not CUI/CIF/COD keywords
+                        if not any(kw in next_upper for kw in ['CUI', 'CIF', 'COD', 'FISCAL']):
+                            return (self._clean_company_name(next_line), 0.75)
+
+        # Strategy 3: Look for any line with company indicators in CLIENT section region
+        # (the window is the four lines after the marker line, clipped to the text).
+        search_end = min(client_section_idx + 5, len(lines))
+        for i in range(client_section_idx + 1, search_end):
+            line = lines[i].strip()
+            line_upper = line.upper()
+
+            # Skip CUI/CIF lines
+            if re.search(r'C\.?\s*[UI]\.?\s*[IF]\.?', line_upper):
+                continue
+            if re.match(r'^R[O0]?\d{6,10}$', line_upper):
+                continue
+
+            if any(re.search(ind, line_upper) for ind in self.COMPANY_INDICATORS):
+                return (self._clean_company_name(line), 0.85)
+
+        return (None, 0.0)
+
+    @staticmethod
+    def _clean_company_name(name: str) -> str:
+        """
+        Tidy a company name for storage.
+
+        Whitespace runs collapse to single spaces and trailing ",", ";" and ":"
+        are dropped; periods are kept so suffixes such as "S.R.L." survive.
+        A falsy name yields "".
+        """
+        if not name:
+            return ""
+        collapsed = re.sub(r'\s+', ' ', name).strip()
+        # After collapsing there are no newlines left, so trimming the
+        # punctuation from the right end is all that is needed.
+        return collapsed.rstrip(',;:').strip()
+
+    # -------------------------------------------------------------------------
+    # Validation hints - override to customize validation behavior
+    # -------------------------------------------------------------------------
+
+    def get_validation_hints(self) -> Dict[str, Any]:
+        """
+        Describe store-specific traits that validation should take into account.
+
+        The base profile declares none and returns a fresh empty dict on every
+        call; subclasses override this to return their own hints.
+
+        Returns:
+            Dict with validation hints. Common keys:
+            - has_multi_rate_tva: bool - Store uses multiple TVA rates
+            - card_equals_total: bool - CARD payment equals total
+            - has_client_cui: bool - Receipt includes client CUI
+            - has_efactura: bool - Store uses e-factura format
+            - is_non_vat_payer: bool - Store is not a VAT payer
+        """
+        return dict()
+
+    # -------------------------------------------------------------------------
+    # Helper methods - available to all subclasses
+    # -------------------------------------------------------------------------
+
+    @staticmethod
+    def _normalize_number(text: str) -> str:
+        """
+        Normalize a number string for Decimal conversion.
+
+        Handles Romanian and English formats:
+            "1.234,56"  -> "1234.56"
+            "1,234.56"  -> "1234.56"
+            "1 234,56"  -> "1234.56"
+            "1.234.567" -> "1234567"   (repeated separator = thousands groups)
+
+        The extraction regexes capture with [\\d\\s.,]+, so the input may carry
+        tabs, a trailing separator, or spill onto the following line. Only the
+        first non-empty line is used, all whitespace inside it is removed and
+        trailing "." / "," are dropped.
+
+        A single separator is always read as the decimal point ("1.234" stays
+        "1.234"), because the two readings cannot be told apart.
+
+        Args:
+            text: Raw captured number text
+
+        Returns:
+            A string intended for Decimal(); "0" for falsy input. Input that is
+            not a number is returned in a form Decimal() rejects.
+        """
+        if not text:
+            return "0"
+
+        stripped = text.strip()
+        if not stripped:
+            return ""
+
+        # A number never continues on the next line; anything after the first
+        # line break belongs to a different field.
+        first_line = stripped.splitlines()[0]
+        compact = re.sub(r'\s+', '', first_line).rstrip('.,')
+
+        last_comma = compact.rfind(",")
+        last_dot = compact.rfind(".")
+        if last_comma == last_dot:
+            # Both are -1: no separator present.
+            return compact
+
+        # The right-most separator is the decimal mark, the other one groups.
+        if last_comma > last_dot:
+            decimal_sep, group_sep = ",", "."
+        else:
+            decimal_sep, group_sep = ".", ","
+        compact = compact.replace(group_sep, "")
+
+        if compact.count(decimal_sep) > 1:
+            # The same separator repeated can only be thousands grouping, and
+            # only when every group after the first has exactly three digits.
+            grouped = r'[-+]?\d{1,3}(?:' + re.escape(decimal_sep) + r'\d{3})+'
+            if re.fullmatch(grouped, compact):
+                return compact.replace(decimal_sep, "")
+
+        return compact.replace(decimal_sep, ".")
+
+    @staticmethod
+    def _parse_decimal(text: str) -> Optional[Decimal]:
+        """
+        Convert captured number text to a Decimal.
+
+        The text is first passed through _normalize_number; anything Decimal()
+        cannot parse yields None instead of raising.
+        """
+        try:
+            return Decimal(BaseStoreProfile._normalize_number(text))
+        except (InvalidOperation, ValueError, TypeError):
+            return None
+
+    @staticmethod
+    def _parse_date(text: str) -> Optional[date]:
+        """
+        Turn a date string into a date object.
+
+        Accepts "-", "/" or "." as separator. A four-character first component
+        means YYYY-MM-DD; anything else is read as DD-MM-YYYY. Years outside
+        2000-2100 and impossible calendar dates yield None.
+        """
+        if not text:
+            return None
+
+        # Bring every separator to "-" so a single split suffices.
+        pieces = text.replace('.', '-').replace('/', '-').split('-')
+        if len(pieces) != 3:
+            return None
+
+        try:
+            first, middle, last = int(pieces[0]), int(pieces[1]), int(pieces[2])
+        except (ValueError, TypeError):
+            return None
+
+        year_first = len(pieces[0]) == 4
+        year = first if year_first else last
+        day = last if year_first else first
+        month = middle
+
+        if not (1 <= day <= 31 and 1 <= month <= 12 and 2000 <= year <= 2100):
+            return None
+
+        try:
+            return date(year, month, day)
+        except (ValueError, TypeError):
+            # e.g. 31 February passes the range check but is not a real date.
+            return None
+
+    @staticmethod
+    def _clean_text(text: str) -> str:
+        """
+        Prepare OCR text for pattern matching.
+
+        Whitespace runs (including line breaks) become single spaces, remaining
+        control characters are removed, and the result is trimmed. A falsy
+        input yields "".
+        """
+        if not text:
+            return ""
+        single_spaced = re.sub(r'\s+', ' ', text)
+        without_controls = re.sub(r'[\x00-\x09\x0b\x0c\x0e-\x1f\x7f]', '', single_spaced)
+        return without_controls.strip()
+
+    # -------------------------------------------------------------------------
+    # Magic methods
+    # -------------------------------------------------------------------------
+
+    def __repr__(self) -> str:
+        """Debug representation: concrete class name plus its CUI list."""
+        return "<{} CUI={}>".format(self.__class__.__name__, self.CUI_LIST)
+
+    def __str__(self) -> str:
+        """Display form: store name followed by its CUIs in parentheses."""
+        joined_cuis = ', '.join(self.CUI_LIST)
+        return "{} ({})".format(self.STORE_NAME, joined_cuis)