# roa2web-service-auto/data-entry-app/backend/app/services/ocr_extractor.py

"""Extract structured fields from OCR text (Romanian receipts)."""

import re
from datetime import date, datetime
from decimal import Decimal, InvalidOperation
from typing import Optional, Tuple
from dataclasses import dataclass, field


@dataclass
class ExtractionResult:
    """Fields recovered from one receipt, with per-field confidence scores.

    Every extracted field defaults to ``None`` (nothing found) and every
    confidence defaults to ``0.0``.
    """
    # Document kind; defaults to a fiscal receipt.
    receipt_type: str = 'bon_fiscal'
    receipt_number: Optional[str] = None
    receipt_series: Optional[str] = None
    receipt_date: Optional[date] = None
    amount: Optional[Decimal] = None
    partner_name: Optional[str] = None
    cui: Optional[str] = None
    description: Optional[str] = None

    # Per-field confidence values combined by ``overall_confidence``.
    confidence_amount: float = 0.0
    confidence_date: float = 0.0
    confidence_vendor: float = 0.0
    # The OCR text the fields were extracted from.
    raw_text: str = ""

    @property
    def overall_confidence(self) -> float:
        """Return the weighted mean of the three confidences, rounded to 2 places.

        The amount counts for 40%, the date and the vendor for 30% each.
        """
        amount_share = self.confidence_amount * 0.4
        date_share = self.confidence_date * 0.3
        vendor_share = self.confidence_vendor * 0.3
        return round(amount_share + date_share + vendor_share, 2)


class ReceiptExtractor:
    """Extract receipt fields using pattern matching for Romanian receipts.

    Each ``*_PATTERNS`` table is a list of ``(regex, confidence)`` pairs tried
    in order; the confidence of the pattern that produced a value is returned
    together with that value.  All tables except the vendor heuristic are
    applied to upper-cased text by :meth:`extract`.
    """

    # Oldest year accepted as a plausible receipt date.
    MIN_RECEIPT_YEAR = 2020

    # Total amount patterns (most specific first).  ``\b`` keeps "SUBTOTAL"
    # from being read as the total.
    TOTAL_PATTERNS = [
        (r'\bTOTAL\s*:?\s*([\d\s.,]+)\s*(?:RON|LEI)?', 0.95),
        (r'\bTOTAL\s+(?:RON|LEI)\s*([\d\s.,]+)', 0.95),
        (r'DE\s+PLATA\s*:?\s*([\d\s.,]+)', 0.90),
        (r'SUMA\s*:?\s*([\d\s.,]+)', 0.85),
        (r'PLATA\s+CARD\s*:?\s*([\d\s.,]+)', 0.85),
        (r'NUMERAR\s*:?\s*([\d\s.,]+)', 0.80),
    ]

    # Date patterns
    DATE_PATTERNS = [
        (r'DATA\s*:?\s*(\d{2}[./]\d{2}[./]\d{4})', 0.95),
        (r'(\d{2}[./]\d{2}[./]\d{4})\s+\d{2}:\d{2}', 0.90),
        (r'(\d{2}[./]\d{2}[./]\d{4})', 0.80),
        (r'(\d{4}[./]\d{2}[./]\d{2})', 0.75),  # YYYY.MM.DD format
    ]

    # Receipt number patterns.  "CHITANTA" is also accepted with diacritics
    # (comma-below or cedilla T, breve A).
    NUMBER_PATTERNS = [
        (r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95),
        (r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95),
        (r'CHITAN[TȚŢ][AĂ]\s+NR\.?\s*:?\s*(\d+)', 0.95),
        (r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90),
        (r'NR\.?\s*:?\s*(\d{4,})', 0.70),
    ]

    # CUI (fiscal code) patterns.  The optional "RO" prefix may be separated
    # from the digits by whitespace ("RO 12345678").
    CUI_PATTERNS = [
        (r'C\.?U\.?I\.?\s*:?\s*(?:RO\s*)?(\d{6,10})', 0.95),
        (r'C\.?I\.?F\.?\s*:?\s*(?:RO\s*)?(\d{6,10})', 0.95),
        (r'COD\s+FISCAL\s*:?\s*(?:RO\s*)?(\d{6,10})', 0.90),
        (r'(?:RO)?(\d{6,10})\s*-?\s*(?:J|CUI)', 0.80),
    ]

    # Series patterns.  The letter look-arounds stop the series group from
    # matching the tail of a longer word (e.g. "SCAL" out of "FISCAL NR 12"),
    # and the labels "NR" / "BON" are never taken as a series.
    SERIES_PATTERNS = [
        (r'\bSERI[AE]\b\s*:?\s*(?!NR\b)([A-Z]{1,4})(?![^\W\d_])', 0.90),
        (r'(?<![^\W\d_])(?!BON\b)([A-Z]{2,4})\s+NR\.?\s*\d+', 0.80),
    ]

    # Header lines containing one of these whole words are not vendor names.
    # Whole-word matching keeps names such as "RESTAURANT ..." or "CORA"
    # from being discarded because they merely contain "REST" or "ORA".
    _VENDOR_SKIP_RE = re.compile(
        r'(?<![^\W\d_])'
        r'(?:BON|FISCAL|TOTAL|DATA|NR|ORA|SUBTOTAL|TVA|PLATA|CARD|NUMERAR'
        r'|RON|LEI|CHITAN[TȚŢ][AĂ]|REST)'
        r'(?![^\W\d_])'
    )

    def extract(self, text: str) -> 'ExtractionResult':
        """Extract all fields from OCR text.

        Args:
            text: Raw OCR output; ``None`` or an empty string is treated as
                empty text and yields a result with no fields set.

        Returns:
            An ``ExtractionResult`` holding the extracted values, the
            confidences for amount, date and vendor, and the raw text.
        """
        text = text or ''
        text_upper = text.upper()

        result = ExtractionResult()
        result.raw_text = text

        # Extract fields
        result.amount, result.confidence_amount = self._extract_amount(text_upper)
        result.receipt_date, result.confidence_date = self._extract_date(text_upper)
        result.receipt_number, _ = self._extract_number(text_upper)
        result.receipt_series, _ = self._extract_series(text_upper)
        # The vendor keeps the original letter case of the OCR text.
        result.partner_name, result.confidence_vendor = self._extract_vendor(text)
        result.cui, _ = self._extract_cui(text_upper)

        # Detect receipt type
        result.receipt_type = self._detect_receipt_type(text_upper)

        return result

    def _extract_amount(self, text: str) -> Tuple[Optional[Decimal], float]:
        """Extract total amount from text.

        Patterns are tried in table order and, within a pattern, every match
        is tried in text order; the first one that parses to a positive
        amount wins.

        Returns:
            ``(amount, confidence)``, or ``(None, 0.0)`` when nothing parses.
        """
        for pattern, confidence in self.TOTAL_PATTERNS:
            for match in re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE):
                # The capture group allows whitespace, which includes line
                # breaks; keep only the first line so digits from the next
                # receipt line are not glued onto the amount.
                captured_lines = match.group(1).splitlines()
                first_line = captured_lines[0] if captured_lines else ''
                amount_str = re.sub(r'[^\d.,]', '', first_line)
                # A match can consist of whitespace/punctuation only (e.g.
                # "TOTAL TVA ..."); move on to the next occurrence.
                if not any(ch.isdigit() for ch in amount_str):
                    continue
                try:
                    # Handle Romanian number format (1.234,56)
                    amount = Decimal(self._normalize_number(amount_str))
                except (InvalidOperation, ValueError):
                    continue
                if amount > 0:
                    return amount, confidence
        return None, 0.0

    def _normalize_number(self, num_str: str) -> str:
        """Normalize Romanian number format to standard decimal.

        Rules:
            * whitespace and trailing separators are dropped;
            * when both ``,`` and ``.`` occur, the one appearing last is the
              decimal separator (``1.234,56`` and ``1,234.56`` -> ``1234.56``);
            * a single ``.`` is kept as the decimal point;
            * otherwise the last group is the fraction when it has at most
              two digits (``1,50`` -> ``1.50``, ``1.234.56`` -> ``1234.56``)
              and a thousands group when longer (``1,234`` -> ``1234``,
              ``1.234.567`` -> ``1234567``).

        Returns:
            A string suitable for ``Decimal``; malformed input may still be
            rejected by ``Decimal``.
        """
        cleaned = ''.join(num_str.split()).rstrip('.,')
        comma_at = cleaned.rfind(',')
        dot_at = cleaned.rfind('.')

        if comma_at != -1 and dot_at != -1:
            if comma_at > dot_at:
                # Romanian format: 1.234,56
                return cleaned.replace('.', '').replace(',', '.')
            # English format: 1,234.56
            return cleaned.replace(',', '')

        if comma_at == -1 and dot_at == -1:
            return cleaned

        separator = ',' if comma_at != -1 else '.'
        groups = cleaned.split(separator)
        fraction = groups[-1]

        if separator == '.' and len(groups) == 2:
            # Single dot: already a standard decimal point.
            return cleaned
        if len(fraction) <= 2:
            # Last group is the fractional part; earlier separators (and any
            # leading dot-leader run) are dropped.
            return ''.join(groups[:-1]) + '.' + fraction
        # Every separator is a thousands separator.
        return ''.join(groups)

    def _extract_date(self, text: str) -> Tuple[Optional[date], float]:
        """Extract receipt date from text.

        Only the first match of each pattern is considered.  A date is
        accepted when it is not in the future and its year is at least
        ``MIN_RECEIPT_YEAR``.

        Returns:
            ``(date, confidence)``, or ``(None, 0.0)`` when no valid date
            is found.
        """
        for pattern, confidence in self.DATE_PATTERNS:
            match = re.search(pattern, text)
            if not match:
                continue

            date_str = match.group(1).replace('/', '.')

            # Try DD.MM.YYYY first, then YYYY.MM.DD.
            parsed = None
            for date_format in ('%d.%m.%Y', '%Y.%m.%d'):
                try:
                    parsed = datetime.strptime(date_str, date_format).date()
                    break
                except ValueError:
                    continue
            if parsed is None:
                continue

            # Validate date range
            if parsed <= date.today() and parsed.year >= self.MIN_RECEIPT_YEAR:
                return parsed, confidence
        return None, 0.0

    def _extract_number(self, text: str) -> Tuple[Optional[str], float]:
        """Extract receipt number from text.

        Returns:
            ``(digits, confidence)`` for the first pattern that matches,
            or ``(None, 0.0)``.
        """
        for pattern, confidence in self.NUMBER_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1), confidence
        return None, 0.0

    def _extract_series(self, text: str) -> Tuple[Optional[str], float]:
        """Extract receipt series from text.

        Returns:
            ``(series, confidence)`` with the series upper-cased,
            or ``(None, 0.0)``.
        """
        for pattern, confidence in self.SERIES_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1).upper(), confidence
        return None, 0.0

    def _extract_vendor(self, text: str) -> Tuple[Optional[str], float]:
        """Extract vendor/partner name from text.

        The first of the top seven lines that is non-empty, not purely
        numeric and free of receipt keywords is taken as the vendor.

        Returns:
            ``(name, confidence)`` where confidence starts at 0.8 for the
            first line and drops by 0.1 per line down to a floor of 0.3,
            or ``(None, 0.0)``.
        """
        for index, raw_line in enumerate(text.split('\n')[:7]):
            line = raw_line.strip()

            # Skip empty lines
            if not line:
                continue

            # Skip lines that are just numbers
            if re.match(r'^[\d.,\s]+$', line):
                continue

            # Skip lines with keywords (whole words only)
            if self._VENDOR_SKIP_RE.search(line.upper()):
                continue

            # Clean the line
            vendor = re.sub(r'[^\w\s.,&-]', '', line).strip()

            if len(vendor) >= 3:
                # Confidence decreases for lines further down
                confidence = max(0.3, 0.8 - (index * 0.1))
                return vendor, confidence

        return None, 0.0

    def _extract_cui(self, text: str) -> Tuple[Optional[str], float]:
        """Extract CUI (fiscal identification code) from text.

        Returns:
            ``(digits, confidence)`` for a 6-10 digit code, without the
            "RO" prefix, or ``(None, 0.0)``.
        """
        for pattern, confidence in self.CUI_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                cui = match.group(1)
                if 6 <= len(cui) <= 10:
                    return cui, confidence
        return None, 0.0

    def _detect_receipt_type(self, text: str) -> str:
        """Detect receipt type from text content.

        Expects upper-cased text.  Returns ``'chitanta'`` when the word
        CHITANTA appears (with or without diacritics), else ``'bon_fiscal'``.
        """
        if re.search(r'CHITAN[TȚŢ][AĂ]', text):
            return 'chitanta'
        return 'bon_fiscal'