# roa2web-service-auto/data-entry-app/backend/app/services/ocr_extractor.py

"""Extract structured fields from OCR text (Romanian receipts)."""

import re
from datetime import date, datetime
from decimal import Decimal, InvalidOperation
from typing import Optional, Tuple, List
from dataclasses import dataclass, field


@dataclass
class ExtractionResult:
    """Fields parsed out of one receipt's OCR text, plus confidence scores.

    Every extracted field defaults to "not found" (``None`` / empty list);
    the three ``confidence_*`` scores default to ``0.0``.
    """

    # --- core receipt fields ---
    receipt_type: str = "bon_fiscal"
    receipt_number: Optional[str] = None
    receipt_series: Optional[str] = None
    receipt_date: Optional[date] = None
    amount: Optional[Decimal] = None
    partner_name: Optional[str] = None
    cui: Optional[str] = None
    description: Optional[str] = None

    # --- additional fields; a receipt may carry several TVA rates ---
    # Each entry is a dict shaped like {code, percent, amount}.
    tva_entries: List[dict] = field(default_factory=list)
    tva_total: Optional[Decimal] = None
    address: Optional[str] = None
    items_count: Optional[int] = None

    # --- per-field confidence scores and the text they came from ---
    confidence_amount: float = 0.0
    confidence_date: float = 0.0
    confidence_vendor: float = 0.0
    raw_text: str = ""

    @property
    def overall_confidence(self) -> float:
        """Return the weighted mean of the three confidences, rounded to 2 places.

        Weights: amount 0.4, date 0.3, vendor 0.3.
        """
        amount_share = self.confidence_amount * 0.4
        date_share = self.confidence_date * 0.3
        vendor_share = self.confidence_vendor * 0.3
        # Added left to right on purpose so float rounding stays stable.
        combined = amount_share + date_share + vendor_share
        return round(combined, 2)


class ReceiptExtractor:
    """Extract receipt fields using pattern matching for Romanian receipts."""

    # ------------------------------------------------------------------
    # Pattern tables. Unless noted otherwise each entry is a
    # (regex, confidence) pair and list order is priority order: the
    # extractor methods walk a table top to bottom and stop at the first
    # entry that yields a usable value.
    # ------------------------------------------------------------------

    # Total amount patterns (most specific first), consumed by _extract_amount.
    # Romanian receipts use various formats: TOTAL LEI, TOTAL:, TOTAL RON, etc.
    # OCR often produces errors, so patterns must be tolerant.
    # NOTE: the amount class [\d\s.,]+ includes whitespace, so a capture can
    # run across line breaks; the patterns are unanchored, so e.g. the plain
    # TOTAL pattern also matches the tail of "SUBTOTAL".
    TOTAL_PATTERNS = [
        # Most common: TOTAL LEI followed by amount
        (r'TOTAL\s+LEI\s*([\d\s.,]+)', 0.98),
        (r'[OT]?OTAL\s+LEI\s*([\d\s.,]+)', 0.95),  # OCR may miss first letter ("OTAL LEI")
        # Standard patterns
        (r'TOTAL\s*:?\s*([\d\s.,]+)\s*(?:RON|LEI)?', 0.95),
        (r'TOTAL\s+(?:RON|LEI)\s*([\d\s.,]+)', 0.95),
        # SUBTOTAL when TOTAL not found
        (r'SUBTOTAL\s*([\d\s.,]+)', 0.90),
        (r'[SB]?UBTOTAL\s*([\d\s.,]+)', 0.88),  # OCR variations ("UBTOTAL", "BUBTOTAL")
        # Payment methods
        (r'DE\s+PLATA\s*:?\s*([\d\s.,]+)', 0.90),
        (r'SUMA\s*:?\s*([\d\s.,]+)', 0.85),
        (r'PLATA\s+CARD\s*:?\s*([\d\s.,]+)', 0.85),
        (r'NUMERAR\s*:?\s*([\d\s.,]+)', 0.80),
        (r'REST\s*:?\s*([\d\s.,]+)', 0.70),  # Sometimes total is near REST
    ]

    # There is no table for the keyword-less fallbacks: when none of the
    # patterns above produce an amount, _extract_amount falls back to
    # heuristics coded inline (largest / repeated standalone amounts).
    # This handles cases where OCR doesn't capture the "TOTAL" keyword.

    # Date patterns, consumed by _extract_date - support dash, dot, and
    # slash separators.
    # OCR may produce DRTA instead of DATA; D[AR]TA covers DATA and DRTA only.
    DATE_PATTERNS = [
        # DATA/DRTA: DD-MM-YYYY (OCR tolerant)
        (r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
        # Plain DATA (already covered by the pattern above; kept as-is)
        (r'DATA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
        # Date followed by ORA (time) - OCR may produce 0RA
        (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+[O0]RA\s*:?\s*\d{2}:\d{2}', 0.95),
        # Date followed by time without ORA keyword
        (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+\d{2}:\d{2}', 0.90),
        # Standalone date
        (r'(\d{2}[-./]\d{2}[-./]\d{4})', 0.80),
        # YYYY-MM-DD format (less common)
        (r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75),
    ]

    # Receipt number patterns - Romanian fiscal receipt formats, consumed by
    # _extract_number (first matching pattern wins).
    # OCR may produce N instead of : or other errors.
    NUMBER_PATTERNS = [
        # NDS format (common in Romanian POS)
        (r'NDS\s*:?\s*(\d+)', 0.98),
        # C3POS terminal format - OCR may have N instead of : (C3POS-CT2N1360760)
        (r'C3POS[-A-Z0-9]*[N:](\d{6,7})', 0.98),  # CT2N1360760 format
        (r'C3POS.*?(\d{6,7})\b', 0.95),  # Any C3POS followed by 6-7 digit number
        (r'CT2[N:]\s*(\d{6,})', 0.95),  # CT2N prefix
        # BF (Bon Fiscal) number
        (r'BF\s*:?\s*(\d+)', 0.93),
        # NIVS format
        (r'NIVS\s*:?\s*(\d+)', 0.95),
        # Standard NR BON formats
        (r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95),
        (r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95),
        (r'CHITANTA\s+NR\.?\s*:?\s*(\d+)', 0.95),
        # Document number
        (r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90),
        # ID BF format
        (r'ID\s*BF\s*:?\s*(\d+)', 0.90),
        # TD format (transaction ID)
        (r'TD\s*:?\s*(\d+)', 0.85),
        # Any standalone 6-8 digit number (typical receipt number length)
        (r'\b(\d{6,8})\b', 0.70),
        # Generic "NR" followed by 4+ digits (last-resort fallback)
        (r'NR\.?\s*:?\s*(\d{4,})', 0.65),
    ]

    # CUI (fiscal code) patterns, consumed by _extract_cui - IMPORTANT:
    # exclude CLIENT CUI.
    # CIF = Cod de Identificare Fiscală (vendor's tax ID)
    # CLIENT C.U.I. = client's tax ID (should be ignored)
    # OCR errors: R0 instead of RO, C1F instead of CIF
    # Every pattern captures 6-10 digits, with an optional RO/R0 prefix skipped.
    CUI_PATTERNS = [
        # CIF at start of line (definitely vendor) - tolerant to OCR errors
        (r'^CIF\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
        (r'^C[I1]F\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),  # C1F OCR error
        # CIF not preceded by "CLIENT " / "LIENT " (negative lookbehind)
        (r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
        # Standalone CIF: format with OCR tolerance
        (r'\bC[I1]F\s*:?\s*(?:R[O0])?(\d{6,10})\b', 0.90),
        # COD FISCAL (vendor)
        (r'COD\s+FISCAL\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
        # C.I.F. format (with dots)
        (r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.88),
        # CUI format (less specific, use with caution)
        (r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.85),
    ]

    # Series patterns, consumed by _extract_series - be strict to avoid
    # false matches.
    SERIES_PATTERNS = [
        # SERIE followed by 1-4 letters
        (r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90),
        # Z: format from Romanian fiscal receipts (must be at start of line or after space)
        (r'(?:^|\s)Z\s*:\s*(\d{4})', 0.85),
        # BF series with explicit marker
        (r'(?:^|\s)BF\s*:\s*(\d{4})', 0.85),
    ]

    # TVA (VAT) patterns - OCR may produce TUA, TVR, etc.
    # Used by _extract_tva_entries only as its last fallback. Entries with
    # two capture groups are (percent, amount); entries with one group
    # capture the amount only.
    TVA_PATTERNS = [
        # TOTAL TVA BON format (OCR tolerant: TUA, TVR)
        (r'TOTAL\s+T[VU][AR]\s+BON\s*:?\s*([\d\s.,]+)', 0.98),
        (r'T[O0]TAL\s+T[VU][AR]\s*:?\s*([\d\s.,]+)', 0.95),
        # TVA with percentage (OCR tolerant)
        (r'T[VU][AR]\s+(?:A\s*[-:]?\s*)?(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.95),
        (r'T[VU][AR]\s+[A-Z]\s*[-:]\s*(\d{1,2})\s*%\s*([\d\s.,]+)', 0.93),
        # Simple TVA pattern
        (r'T[VU][AR]\s*:?\s*([\d\s.,]+)', 0.85),
        # Any "NN% amount" pair (no TVA keyword required by the regex itself)
        (r'(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.75),
    ]

    # Items count patterns, consumed by _extract_items_count (which ignores
    # the confidence value) - OCR may produce OZ instead of POZ, etc.
    # Number may be on separate line before or after the label.
    ITEMS_COUNT_PATTERNS = [
        # NR. POZ. ART. IN BON: 17 (Romanian format with dots and spaces)
        # OCR tolerant: OZ instead of POZ, 0 instead of O
        (r'NR\.?\s*P?[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*(\d+)', 0.98),
        # Number on line BEFORE "OZ. ART. IN BON:" - OCR sometimes reorders
        (r'(\d{1,2})\s*\n\s*[O0]Z\.?\s*ART', 0.95),
        # Number may be on next line after label
        (r'[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.93),
        (r'NR\.?\s*(?:P?[O0]Z\.?)?\s*ART(?:ICOLE)?\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.90),
        # Simpler patterns
        (r'ARTIC[O0]LE\s*:?\s*(\d+)', 0.88),
        (r'P?[O0]Z\s*:?\s*(\d+)', 0.85),
        # X articole/pozitii/buc
        (r'(\d+)\s*(?:ARTIC[O0]LE|P[O0]ZITII|BUC)', 0.80),
    ]

    # Address patterns (Romanian format).
    # NOTE(review): no method visible in this file reads this table;
    # _extract_address uses its own inline patterns - confirm whether it is
    # still needed.
    ADDRESS_PATTERNS = [
        # Street patterns
        (r'(STR\.?\s+[A-Z0-9\s.,]+(?:NR\.?\s*\d+)?)', 0.90),
        # Full address with JUD (county)
        (r'(JUD\.?\s+[A-Z]+,?\s*(?:MUN\.?|OR\.?|COM\.?)?\s*[A-Z]+)', 0.85),
    ]

    # Vendor name indicators (lines containing these are likely vendor names).
    # Plain regex strings (no confidence); _extract_vendor searches each one
    # against the upper-cased line.
    VENDOR_INDICATORS = [
        r'\bS\.?R\.?L\.?\b',      # S.R.L.
        r'\bS\.?A\.?\b',          # S.A.
        r'\bS\.?N\.?C\.?\b',      # S.N.C.
        r'\bS\.?C\.?S\.?\b',      # S.C.S.
        r'\bI\.?I\.?\b',          # I.I. (Individual)
        r'\bP\.?F\.?A\.?\b',      # P.F.A.
        r'\bS\.?C\.?\b',          # S.C.
        r'HOLDING',
        r'COMPANY',
        r'GROUP',
        r'MAGAZIN',
        r'MARKET',
        r'SHOP',
    ]

    def extract(self, text: str) -> ExtractionResult:
        """Run every field extractor over ``text`` and bundle the results.

        Most extractors receive the upper-cased text; the vendor extractor
        gets the original casing, and the CUI extractor gets both.
        Confidence is kept only for amount, date and vendor.
        """
        shouted = text.upper()

        # Core fields (confidence of number / series / CUI is discarded).
        amount, amount_conf = self._extract_amount(shouted)
        receipt_date, date_conf = self._extract_date(shouted)
        number = self._extract_number(shouted)[0]
        series = self._extract_series(shouted)[0]
        vendor, vendor_conf = self._extract_vendor(text)
        cui = self._extract_cui(shouted, text)[0]

        # Additional fields, including the per-rate TVA breakdown.
        tva_entries, tva_total = self._extract_tva_entries(shouted)
        items_count = self._extract_items_count(shouted)
        address = self._extract_address(shouted)

        return ExtractionResult(
            receipt_type=self._detect_receipt_type(shouted),
            receipt_number=number,
            receipt_series=series,
            receipt_date=receipt_date,
            amount=amount,
            partner_name=vendor,
            cui=cui,
            tva_entries=tva_entries,
            tva_total=tva_total,
            address=address,
            items_count=items_count,
            confidence_amount=amount_conf,
            confidence_date=date_conf,
            confidence_vendor=vendor_conf,
            raw_text=text,
        )

    def _extract_amount(self, text: str) -> Tuple[Optional[Decimal], float]:
        """Extract total amount from text."""
        # First try standard patterns (TOTAL, SUBTOTAL, etc.)
        for pattern, confidence in self.TOTAL_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
            if match:
                try:
                    amount_str = re.sub(r'[^\d.,]', '', match.group(1))
                    amount_str = self._normalize_number(amount_str)
                    amount = Decimal(amount_str)
                    if amount > 0:
                        return amount, confidence
                except (InvalidOperation, ValueError):
                    continue

        # Strategy 2: Find amounts AFTER product lines end
        # Products have pattern: "X BUC/ROLA X price = price"
        # Total appears after all products
        product_pattern = r'\d\s+(?:BUC|ROLA|ROLN|ROL)\s+X'
        product_matches = list(re.finditer(product_pattern, text, re.IGNORECASE))
        if product_matches:
            # Get text after the last product line
            last_product_pos = product_matches[-1].end()
            after_products = text[last_product_pos:]

            # Find standalone amounts on their own line after products
            line_amount_pattern = r'^[\s]*(\d{2,4}[.,]\s*\d{2})[\s]*$'
            standalone_amounts = []
            for match in re.finditer(line_amount_pattern, after_products, re.MULTILINE):
                try:
                    amount_str = match.group(1).replace(' ', '')
                    amount_str = self._normalize_number(amount_str)
                    amount = Decimal(amount_str)
                    if amount > 10:  # Filter out small values
                        standalone_amounts.append(amount)
                except (InvalidOperation, ValueError):
                    continue

            if standalone_amounts:
                # The largest standalone amount after products is likely the total
                max_amount = max(standalone_amounts)
                # Higher confidence if it appears multiple times
                count = standalone_amounts.count(max_amount)
                confidence = 0.85 if count >= 2 else 0.75
                return max_amount, confidence

        # Strategy 3: Find the most repeated large amount
        # Normalize spaces in numbers (OCR may produce "186. 16")
        normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text)
        amount_pattern = r'(\d{2,4}[.,]\d{2})\b'
        amounts = re.findall(amount_pattern, normalized_text)
        if amounts:
            from collections import Counter
            amount_counts = Counter(amounts)
            # Filter amounts that appear 2+ times and are > 20
            candidates = []
            for amt_str, count in amount_counts.items():
                try:
                    amt = Decimal(self._normalize_number(amt_str))
                    if count >= 2 and amt > 20:
                        candidates.append((amt, count))
                except (InvalidOperation, ValueError):
                    continue

            if candidates:
                # Return the LARGEST amount that appears multiple times
                candidates.sort(key=lambda x: x[0], reverse=True)
                return candidates[0][0], 0.65

        # Last resort: Find any standalone large amount
        line_amount_pattern = r'^[\s]*(\d{2,4}[.,]\s*\d{2})[\s]*$'
        for match in re.finditer(line_amount_pattern, text, re.MULTILINE):
            try:
                amount_str = match.group(1).replace(' ', '')
                amount_str = self._normalize_number(amount_str)
                amount = Decimal(amount_str)
                if amount > 50:  # Higher threshold for fallback
                    return amount, 0.50
            except (InvalidOperation, ValueError):
                continue

        return None, 0.0

    def _normalize_number(self, num_str: str) -> str:
        """Normalize Romanian number format to standard decimal."""
        # Remove spaces
        num_str = num_str.replace(' ', '')

        # Handle comma as decimal separator
        if ',' in num_str and '.' in num_str:
            # Romanian format: 1.234,56
            num_str = num_str.replace('.', '').replace(',', '.')
        elif ',' in num_str:
            # Could be 1,50 or 1,234
            parts = num_str.split(',')
            if len(parts) == 2 and len(parts[1]) <= 2:
                # Decimal comma: 1,50
                num_str = num_str.replace(',', '.')
            else:
                # Thousands comma: 1,234
                num_str = num_str.replace(',', '')
        elif '.' in num_str:
            parts = num_str.split('.')
            if len(parts) > 2:
                # Multiple dots: 1.234.567 -> 1234567
                num_str = ''.join(parts[:-1]) + '.' + parts[-1]

        return num_str

    def _extract_date(self, text: str) -> Tuple[Optional[date], float]:
        """Extract receipt date from text."""
        for pattern, confidence in self.DATE_PATTERNS:
            match = re.search(pattern, text)
            if match:
                try:
                    # Normalize separators to dots
                    date_str = match.group(1).replace('/', '.').replace('-', '.')

                    # Try DD.MM.YYYY format first
                    try:
                        parsed = datetime.strptime(date_str, '%d.%m.%Y').date()
                    except ValueError:
                        # Try YYYY.MM.DD format
                        parsed = datetime.strptime(date_str, '%Y.%m.%d').date()

                    # Validate date range
                    today = date.today()
                    if parsed <= today and parsed.year >= 2020:
                        return parsed, confidence
                except ValueError:
                    continue
        return None, 0.0

    def _extract_number(self, text: str) -> Tuple[Optional[str], float]:
        """Extract receipt number from text."""
        for pattern, confidence in self.NUMBER_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1), confidence
        return None, 0.0

    def _extract_series(self, text: str) -> Tuple[Optional[str], float]:
        """Extract receipt series from text."""
        for pattern, confidence in self.SERIES_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1).upper(), confidence
        return None, 0.0

    def _extract_vendor(self, text: str) -> Tuple[Optional[str], float]:
        """
        Extract vendor/partner name from text.
        Uses multiple strategies:
        1. Look for lines with company type indicators (S.R.L., S.A., etc.)
        2. Look for lines near CIF
        3. Use first valid line as fallback
        """
        lines = text.split('\n')
        skip_keywords = [
            'BON', 'FISCAL', 'TOTAL', 'DATA', 'NR', 'ORA',
            'SUBTOTAL', 'TVA', 'PLATA', 'CARD', 'NUMERAR',
            'RON', 'LEI', 'CHITANTA', 'REST', 'CLIENT',
            'OPERATOR', 'CASIER', 'POS', 'AMEF', 'BINE ATI VENIT',
            'VA RUGAM', 'PASTRATI', 'VOCEA', 'TIPARIT',
            'DETERGENT', 'PROSOP', 'HARTIE', 'SACI', 'SPRAY',
            'BUC', 'ROLA', 'CUMPARATOR'
        ]

        # Strategy 1: Look for lines with vendor indicators (S.R.L., S.A., HOLDING, etc.)
        for i, line in enumerate(lines[:15]):  # Check first 15 lines
            line = line.strip()
            if not line or len(line) < 3:
                continue

            line_upper = line.upper()

            # Check for vendor indicators
            for indicator in self.VENDOR_INDICATORS:
                if re.search(indicator, line_upper):
                    # Found a company name indicator
                    vendor = self._clean_vendor_name(line)
                    if vendor and len(vendor) >= 3:
                        # High confidence for lines with company indicators
                        return vendor, 0.95

        # Strategy 2: Look for lines right before or after CIF
        for i, line in enumerate(lines[:15]):
            line_upper = line.upper()
            if 'CIF' in line_upper and 'CLIENT' not in line_upper:
                # Check line before
                if i > 0:
                    prev_line = lines[i-1].strip()
                    if prev_line and len(prev_line) >= 3:
                        if not any(kw in prev_line.upper() for kw in skip_keywords):
                            vendor = self._clean_vendor_name(prev_line)
                            if vendor:
                                return vendor, 0.85

        # Strategy 3: First valid line as fallback
        for i, line in enumerate(lines[:10]):
            line = line.strip()

            # Skip empty lines
            if not line or len(line) < 3:
                continue

            # Skip lines that are just numbers or codes
            if re.match(r'^[\d.,\s:]+$', line):
                continue

            # Skip lines with barcodes/product codes
            if re.match(r'^[A-Z]*\d{6,}', line):
                continue

            # Skip lines with keywords
            if any(kw in line.upper() for kw in skip_keywords):
                continue

            # Clean the line
            vendor = self._clean_vendor_name(line)

            if vendor and len(vendor) >= 3:
                # Confidence decreases for lines further down
                confidence = max(0.3, 0.7 - (i * 0.05))
                return vendor, confidence

        return None, 0.0

    def _clean_vendor_name(self, name: str) -> Optional[str]:
        """Clean and normalize vendor name."""
        if not name:
            return None

        # Remove common OCR artifacts
        name = re.sub(r'[^\w\s.,&\-()]', ' ', name)
        # Normalize whitespace
        name = re.sub(r'\s+', ' ', name).strip()

        # Skip if it looks like an address line only
        if re.match(r'^(STR|JUD|MUN|NR|BL|SC|ET|AP)\.?\s', name.upper()):
            return None

        # Skip if too short after cleaning
        if len(name) < 3:
            return None

        return name

    def _extract_cui(self, text_upper: str, original_text: str) -> Tuple[Optional[str], float]:
        """
        Extract vendor CUI (fiscal identification code) from text.
        Excludes CLIENT CUI which appears as 'CLIENT C.U.I./C.I.F.:...'
        """
        # First, try to find CIF on a line that doesn't contain CLIENT
        lines = text_upper.split('\n')
        for line in lines:
            # Skip lines that contain CLIENT (these are buyer's CUI, not vendor's)
            if 'CLIENT' in line or 'CUMPARATOR' in line or 'LIENT' in line:
                continue

            # Look for CIF in this line
            for pattern, confidence in self.CUI_PATTERNS:
                match = re.search(pattern, line, re.IGNORECASE | re.MULTILINE)
                if match:
                    cui = match.group(1)
                    if 6 <= len(cui) <= 10:
                        return cui, confidence

        # Fallback: search entire text but exclude CLIENT patterns
        for pattern, confidence in self.CUI_PATTERNS:
            # Find all matches
            for match in re.finditer(pattern, text_upper, re.IGNORECASE | re.MULTILINE):
                cui = match.group(1)
                if 6 <= len(cui) <= 10:
                    # Check if this match is preceded by CLIENT in the same line
                    start = match.start()
                    line_start = text_upper.rfind('\n', 0, start) + 1
                    line_text = text_upper[line_start:start]
                    if 'CLIENT' not in line_text and 'LIENT' not in line_text:
                        return cui, confidence

        return None, 0.0

    def _detect_receipt_type(self, text: str) -> str:
        """Detect receipt type from text content."""
        if 'CHITANTA' in text or 'CHITANȚĂ' in text:
            return 'chitanta'
        return 'bon_fiscal'

    def _extract_tva_entries(self, text: str) -> Tuple[List[dict], Optional[Decimal]]:
        """
        Extract multiple TVA (VAT) entries from text.
        Romanian receipts can have multiple TVA rates (A=19%, B=9%, C=5%, D=0%).

        Returns (tva_entries, tva_total) where tva_entries is a list of:
            {'code': 'A', 'percent': 19, 'amount': Decimal('15.20')}
        """
        tva_entries = []
        seen_entries = set()  # To avoid duplicates

        # Normalize spaces in numbers first (OCR may produce "32. 31")
        normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text)

        # Pattern 1: "TVA A - 19%: 15.20" or "TVAA - 21% 32.31" (with code)
        # OCR tolerant: TUA, TVR, etc.
        pattern_with_code = r'T[VU][AR]\s*([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)'
        for match in re.finditer(pattern_with_code, normalized_text, re.IGNORECASE):
            try:
                code = match.group(1).upper()
                percent = int(match.group(2))
                amount_str = match.group(3).replace(' ', '')
                amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
                amount = Decimal(amount_str)
                if amount > 0:
                    entry_key = (code, percent)
                    if entry_key not in seen_entries:
                        tva_entries.append({
                            'code': code,
                            'percent': percent,
                            'amount': amount
                        })
                        seen_entries.add(entry_key)
            except (ValueError, InvalidOperation):
                continue

        # Pattern 2: "TVA - 21%: 32.31" (without explicit code, assume 'A')
        if not tva_entries:
            pattern_no_code = r'T[VU][AR]\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)'
            for match in re.finditer(pattern_no_code, normalized_text, re.IGNORECASE):
                try:
                    percent = int(match.group(1))
                    amount_str = match.group(2).replace(' ', '')
                    amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
                    amount = Decimal(amount_str)
                    if amount > 0:
                        # Determine code based on percent
                        code = self._get_tva_code_from_percent(percent)
                        entry_key = (code, percent)
                        if entry_key not in seen_entries:
                            tva_entries.append({
                                'code': code,
                                'percent': percent,
                                'amount': amount
                            })
                            seen_entries.add(entry_key)
                except (ValueError, InvalidOperation):
                    continue

        # Pattern 3: "TVAA - 21%" on one line, amount on next line
        if not tva_entries:
            tva_line_pattern = r'T[VU][AR]\s*([A-D])?\s*[-:]\s*(\d{1,2})\s*%'
            for match in re.finditer(tva_line_pattern, normalized_text, re.IGNORECASE):
                try:
                    code = (match.group(1) or 'A').upper()
                    percent = int(match.group(2))

                    # Look for amount on the next line or immediately after
                    after_tva = normalized_text[match.end():]
                    amount_match = re.search(r'^[\s\n]*([\d.,]+)', after_tva)
                    if amount_match:
                        amount_str = self._normalize_number(amount_match.group(1))
                        amount = Decimal(amount_str)
                        if amount > 0:
                            entry_key = (code, percent)
                            if entry_key not in seen_entries:
                                tva_entries.append({
                                    'code': code,
                                    'percent': percent,
                                    'amount': amount
                                })
                                seen_entries.add(entry_key)
                except (ValueError, InvalidOperation):
                    continue

        # Pattern 4: Use TVA_PATTERNS for fallback
        if not tva_entries:
            for pattern, _ in self.TVA_PATTERNS:
                match = re.search(pattern, normalized_text, re.IGNORECASE)
                if match:
                    try:
                        # Some patterns have 2 groups (percent, amount), others just amount
                        if match.lastindex >= 2:
                            percent = int(match.group(1))
                            amount_str = match.group(2)
                        else:
                            amount_str = match.group(1)
                            # Try to detect percent from text
                            percent = self._detect_tva_percent(text)

                        amount_str = amount_str.replace(' ', '')
                        amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
                        amount = Decimal(amount_str)
                        if amount > 0 and percent:
                            code = self._get_tva_code_from_percent(percent)
                            entry_key = (code, percent)
                            if entry_key not in seen_entries:
                                tva_entries.append({
                                    'code': code,
                                    'percent': percent,
                                    'amount': amount
                                })
                                seen_entries.add(entry_key)
                                break  # Only use first match from fallback
                    except (ValueError, InvalidOperation):
                        continue

        # Calculate total
        tva_total = None
        if tva_entries:
            tva_total = sum(entry['amount'] for entry in tva_entries)

        # Sort by code (A, B, C, D)
        tva_entries.sort(key=lambda x: x.get('code', 'Z'))

        return tva_entries, tva_total

    def _get_tva_code_from_percent(self, percent: int) -> str:
        """Map TVA percentage to standard Romanian code.

        Romanian TVA rates changed in August 2025:
        - Standard rate: 19% → 21%
        - Reduced rate: 9% → 11%
        - Other rates (5%, 0%) remain unchanged

        Old rates (before Aug 2025):  New rates (from Aug 2025):
        - A = 19% (standard)          - A = 21% (standard)
        - B = 9%  (reduced)           - B = 11% (reduced)
        - C = 5%  (reduced)           - C = 5%  (reduced)
        - D = 0%  (exempt)            - D = 0%  (exempt)

        Both old and new rates are supported for historical receipts.
        """
        if percent in (19, 21):
            return 'A'  # Standard rate (19% old, 21% new from Aug 2025)
        elif percent in (9, 11):
            return 'B'  # Reduced rate (9% old, 11% new from Aug 2025)
        elif percent == 5:
            return 'C'  # Reduced rate (unchanged)
        elif percent == 0:
            return 'D'  # Exempt (unchanged)
        else:
            return 'A'  # Default to standard rate

    def _detect_tva_percent(self, text: str) -> Optional[int]:
        """Detect TVA percentage from text content."""
        # Look for common Romanian TVA percentages
        if '19%' in text or '19 %' in text:
            return 19
        elif '21%' in text or '21 %' in text:
            return 21
        elif '11%' in text or '11 %' in text:
            return 11
        elif '9%' in text or '9 %' in text:
            return 9
        elif '5%' in text or '5 %' in text:
            return 5
        return None

    def _extract_items_count(self, text: str) -> Optional[int]:
        """Extract number of items/articles from receipt."""
        for pattern, _ in self.ITEMS_COUNT_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                try:
                    count = int(match.group(1))
                    if 0 < count < 1000:  # Reasonable range
                        return count
                except ValueError:
                    continue
        return None

    def _extract_address(self, text: str) -> Optional[str]:
        """Extract vendor address from text."""
        lines = text.split('\n')
        address_parts = []

        for line in lines[:15]:  # Check first 15 lines
            line = line.strip()
            if not line:
                continue

            # Check for address patterns
            line_upper = line.upper()

            # JUD. (county) pattern
            if re.search(r'\bJUD\.?\s+', line_upper):
                address_parts.append(line)
                continue

            # STR. (street) pattern
            if re.search(r'\bSTR\.?\s+', line_upper):
                address_parts.append(line)
                continue

            # MUN./OR./COM. (city/town) pattern
            if re.search(r'\b(MUN|OR|COM)\.?\s+', line_upper):
                address_parts.append(line)
                continue

        if address_parts:
            # Join and clean address parts
            address = ', '.join(address_parts)
            # Clean up
            address = re.sub(r'\s+', ' ', address).strip()
            address = re.sub(r',\s*,', ',', address)
            return address if len(address) >= 5 else None

        return None