feat: Improve OCR adaptive pipeline with early exit and better pattern matching

- Add adaptive 3-step OCR pipeline with early exit when all 5 fields are found
- Add pattern for "C. I. F." with spaces (OCR artifact from PaddleOCR)
- Add pattern for the YYYY. MM. DD date format with spaces (OMV/Petrom receipts)
- Add pattern for "OTAL TAXE" (leading T cut off by OCR) and for the amount appearing before the label
- Make TVA rate pattern more flexible (code letter optional, handle "-21%")
- Replace logger.info with print(flush=True) for better debugging visibility
- Improve OCRPreview.vue to show extraction progress and raw OCR text

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-13 01:54:52 +02:00
parent 6c3dd89f6d
commit 9f06482681
9 changed files with 952 additions and 116 deletions
--- a/data-entry-app/backend/app/services/ocr_extractor.py
+++ b/data-entry-app/backend/app/services/ocr_extractor.py
@@ -28,6 +28,8 @@ class ExtractionResult:
    confidence_date: float = 0.0
    confidence_vendor: float = 0.0
    raw_text: str = ""
+    ocr_engine: str = ""  # OCR engine used: paddleocr or tesseract
+    processing_time_ms: int = 0  # Processing time in milliseconds

    @property
    def overall_confidence(self) -> float:
@@ -70,6 +72,7 @@ class ReceiptExtractor:

    # Date patterns - support dash, dot, and slash separators
    # OCR may produce DRTA instead of DATA, DAIA, etc.
+    # OCR may also add spaces/commas in dates: "27. 10, 2025" instead of "27.10.2025"
    DATE_PATTERNS = [
        # DATA/DRTA/DAIA: DD-MM-YYYY (OCR tolerant)
        (r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
@@ -84,6 +87,19 @@ class ReceiptExtractor:
        (r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75),
    ]

+    # OCR-corrupted date patterns with spaces/commas
+    # Handles: "27. 10, 2025", "27, 10. 2025", "2025. 08. 14", etc.
+    DATE_PATTERNS_OCR_SPACES = [
+        # YYYY. MM. DD format with spaces (OMV/Petrom receipts) - with time
+        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.92, 'ymd'),
+        # YYYY. MM. DD format with spaces (standalone)
+        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.85, 'ymd'),
+        # DD. MM, YYYY or DD, MM. YYYY (with time following)
+        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
+        # DD. MM, YYYY or DD, MM. YYYY (standalone)
+        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
+    ]
+
    # Receipt number patterns - Romanian fiscal receipt formats
    # OCR may produce N instead of : or other errors
    NUMBER_PATTERNS = [
@@ -127,12 +143,23 @@ class ReceiptExtractor:
        (r'\bC[I1]F\s*:?\s*(?:R[O0])?(\d{6,10})\b', 0.90),
        # COD FISCAL (vendor)
        (r'COD\s+FISCAL\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
-        # C.I.F. format (with dots)
+        # C. I. F. format with SPACES (OCR artifact) - "C. I. F. : R011201891"
+        (r'C\.\s*I\.\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.92),
+        # C.I.F. format (with dots, no spaces)
        (r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.88),
        # CUI format (less specific, use with caution)
        (r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.85),
    ]

+    # Pattern for CIF NUMBER appearing BEFORE "C.I.F." label (reversed format)
+    # Common in some receipts: "R011201891\nC. I. F." - number on line before label
+    CUI_REVERSED_PATTERNS = [
+        # RO + 6-10 digits on the line immediately before the C.I.F./CIF label
+        (r'(?:R[O0])(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.98),
+        # Just digits before C.I.F. label
+        (r'(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.95),
+    ]
+
    # Series patterns - be strict to avoid false matches
    SERIES_PATTERNS = [
        (r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90),
@@ -158,6 +185,7 @@ class ReceiptExtractor:

    # Items count patterns - OCR may produce OZ instead of POZ, etc.
    # Number may be on separate line before or after the label
+    # IMPORTANT: Must be specific to avoid matching product quantities like "50BUC"
    ITEMS_COUNT_PATTERNS = [
        # NR. POZ. ART. IN BON: 17 (Romanian format with dots and spaces)
        # OCR tolerant: OZ instead of POZ, ARI instead of ART
@@ -167,11 +195,10 @@ class ReceiptExtractor:
        # Number may be on next line after label
        (r'[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.93),
        (r'NR\.?\s*(?:P?[O0]Z\.?)?\s*ART(?:ICOLE)?\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.90),
-        # Simpler patterns
+        # Simpler patterns - but more specific
        (r'ARTIC[O0]LE\s*:?\s*(\d+)', 0.88),
-        (r'P?[O0]Z\s*:?\s*(\d+)', 0.85),
-        # X articole/pozitii
-        (r'(\d+)\s*(?:ARTIC[O0]LE|P[O0]ZITII|BUC)', 0.80),
+        # POZ at start of line or after colon (not in product descriptions)
+        (r'(?:^|\s|:)P?[O0]Z\.?\s*(?:ART)?\.?\s*(?:IN\s+BON)?\s*:?\s*(\d{1,3})(?:\s|$)', 0.85),
    ]

    # Address patterns (Romanian format)
@@ -183,20 +210,21 @@ class ReceiptExtractor:
    ]

    # Vendor name indicators (lines containing these are likely vendor names)
+    # These should be company type suffixes, not generic words
+    # Patterns must handle OCR spaces: "S. R. L." as well as "S.R.L."
    VENDOR_INDICATORS = [
-        r'\bS\.?R\.?L\.?\b',      # S.R.L.
-        r'\bS\.?A\.?\b',          # S.A.
-        r'\bS\.?N\.?C\.?\b',      # S.N.C.
-        r'\bS\.?C\.?S\.?\b',      # S.C.S.
-        r'\bI\.?I\.?\b',          # I.I. (Individual)
-        r'\bP\.?F\.?A\.?\b',      # P.F.A.
-        r'\bS\.?C\.?\b',          # S.C.
+        r'\bS\.?\s*R\.?\s*L\.?\b',      # S.R.L. or S. R. L.
+        r'\bS\.?\s*A\.?\b',              # S.A. or S. A.
+        r'\bS\.?\s*N\.?\s*C\.?\b',      # S.N.C. or S. N. C.
+        r'\bS\.?\s*C\.?\s*S\.?\b',      # S.C.S. or S. C. S.
+        r'\bI\.?\s*I\.?\b',              # I.I. or I. I.
+        r'\bP\.?\s*F\.?\s*A\.?\b',      # P.F.A. or P. F. A.
+        # S.C. alone is too short and generic - only match if followed by company name
+        r'\bS\.?\s*C\.?\s+[A-Z]',       # S.C. followed by company name
        r'HOLDING',
        r'COMPANY',
        r'GROUP',
-        r'MAGAZIN',
-        r'MARKET',
-        r'SHOP',
+        # Removed: MAGAZIN, MARKET, SHOP - too generic, match store welcome messages
    ]

    def extract(self, text: str) -> ExtractionResult:
@@ -215,6 +243,14 @@ class ReceiptExtractor:

        # Extract additional fields - Multiple TVA entries
        result.tva_entries, result.tva_total = self._extract_tva_entries(text_upper)
+        if not result.tva_entries:
+            print(f"[TVA Debug] No TVA found. Checking patterns...", flush=True)
+            # Debug: show what patterns see
+            import re
+            normalized = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text_upper)
+            taxe_match = re.search(r'T?OTAL\s+TAXE', normalized, re.IGNORECASE)
+            rev_match = re.search(r'([\d.,]+)\s*T?OTAL\s+TAXE', normalized, re.IGNORECASE)
+            print(f"[TVA Debug] 'OTAL TAXE' found: {bool(taxe_match)}, reversed: {rev_match.group(1) if rev_match else None}", flush=True)
        result.items_count = self._extract_items_count(text_upper)
        result.address = self._extract_address(text_upper)

@@ -334,6 +370,7 @@ class ReceiptExtractor:

    def _extract_date(self, text: str) -> Tuple[Optional[date], float]:
        """Extract receipt date from text."""
+        # First try standard patterns (clean dates)
        for pattern, confidence in self.DATE_PATTERNS:
            match = re.search(pattern, text)
            if match:
@@ -354,6 +391,34 @@ class ReceiptExtractor:
                        return parsed, confidence
                except ValueError:
                    continue
+
+        # Then try OCR-corrupted patterns (dates with spaces/commas)
+        # Handles: "27. 10, 2025", "27, 10. 2025", "2025. 08. 14", etc.
+        for pattern, confidence, fmt in self.DATE_PATTERNS_OCR_SPACES:
+            match = re.search(pattern, text)
+            if match:
+                try:
+                    if fmt == 'ymd':
+                        # YYYY. MM. DD format (OMV/Petrom)
+                        year = match.group(1)
+                        month = match.group(2)
+                        day = match.group(3)
+                    else:
+                        # DD. MM. YYYY format (default)
+                        day = match.group(1)
+                        month = match.group(2)
+                        year = match.group(3)
+
+                    date_str = f"{day}.{month}.{year}"
+                    parsed = datetime.strptime(date_str, '%d.%m.%Y').date()
+
+                    # Validate date range
+                    today = date.today()
+                    if parsed <= today and parsed.year >= 2020:
+                        return parsed, confidence
+                except ValueError:
+                    continue
+
        return None, 0.0

    def _extract_number(self, text: str) -> Tuple[Optional[str], float]:
@@ -377,8 +442,9 @@ class ReceiptExtractor:
        Extract vendor/partner name from text.
        Uses multiple strategies:
        1. Look for lines with company type indicators (S.R.L., S.A., etc.)
-        2. Look for lines near CIF
-        3. Use first valid line as fallback
+        2. Look for company name + SRL on separate lines
+        3. Look for lines near CIF
+        4. Use first valid line as fallback
        """
        lines = text.split('\n')
        skip_keywords = [
@@ -388,9 +454,37 @@ class ReceiptExtractor:
            'OPERATOR', 'CASIER', 'POS', 'AMEF', 'BINE ATI VENIT',
            'VA RUGAM', 'PASTRATI', 'VOCEA', 'TIPARIT',
            'DETERGENT', 'PROSOP', 'HARTIE', 'SACI', 'SPRAY',
-            'BUC', 'ROLA', 'CUMPARATOR'
+            'BUC', 'ROLA', 'CUMPARATOR', 'MAGAZIN', 'BRICK',
+            'NIVS', 'BENZINA', 'PETROM', 'OMV'
        ]

+        # Strategy 0: Look for company name followed by SRL/SA on next line
+        # Pattern: "COMPANY NAME\nSRL" or "COMPANY NAME\nS.R.L."
+        for i, line in enumerate(lines[:15]):
+            line = line.strip()
+            if not line or len(line) < 3:
+                continue
+
+            line_upper = line.upper()
+
+            # Skip lines with skip keywords
+            if any(kw in line_upper for kw in skip_keywords):
+                continue
+
+            # Check if next line is standalone SRL, S.R.L., SA, S.A., etc.
+            if i + 1 < len(lines):
+                next_line = lines[i + 1].strip().upper()
+                # Match standalone company type suffix
+                if re.match(r'^S\.?\s*R\.?\s*L\.?$', next_line) or \
+                   re.match(r'^S\.?\s*A\.?$', next_line) or \
+                   re.match(r'^S\.?\s*N\.?\s*C\.?$', next_line) or \
+                   re.match(r'^P\.?\s*F\.?\s*A\.?$', next_line) or \
+                   re.match(r'^I\.?\s*I\.?$', next_line):
+                    # Combine: "COMPANY NAME" + " " + "SRL"
+                    vendor = self._clean_vendor_name(f"{line} {next_line}")
+                    if vendor and len(vendor) >= 5:
+                        return vendor, 0.95
+
        # Strategy 1: Look for lines with vendor indicators (S.R.L., S.A., HOLDING, etc.)
        for i, line in enumerate(lines[:15]):  # Check first 15 lines
            line = line.strip()
@@ -476,7 +570,22 @@ class ReceiptExtractor:
        Extract vendor CUI (fiscal identification code) from text.
        Excludes CLIENT CUI which appears as 'CLIENT C.U.I./C.I.F.:...'
        """
-        # First, try to find CIF on a line that doesn't contain CLIENT
+        # Strategy 0: Check for reversed format (CIF NUMBER on line BEFORE "C.I.F." label)
+        # This is common in some receipts: "R011201891\nC. I. F."
+        for pattern, confidence in self.CUI_REVERSED_PATTERNS:
+            match = re.search(pattern, text_upper, re.IGNORECASE | re.MULTILINE)
+            if match:
+                cui = match.group(1)
+                if 6 <= len(cui) <= 10:
+                    # Verify this is not the CLIENT CUI by checking context
+                    start = match.start()
+                    # Check 50 chars before the match for CLIENT keyword
+                    context_start = max(0, start - 50)
+                    context = text_upper[context_start:start]
+                    if 'CLIENT' not in context and 'LIENT' not in context:
+                        return cui, confidence
+
+        # Strategy 1: Try to find CIF on a line that doesn't contain CLIENT
        lines = text_upper.split('\n')
        for line in lines:
            # Skip lines that contain CLIENT (these are buyer's CUI, not vendor's)
@@ -491,7 +600,7 @@ class ReceiptExtractor:
                    if 6 <= len(cui) <= 10:
                        return cui, confidence

-        # Fallback: search entire text but exclude CLIENT patterns
+        # Strategy 2: Fallback - search entire text but exclude CLIENT patterns
        for pattern, confidence in self.CUI_PATTERNS:
            # Find all matches
            for match in re.finditer(pattern, text_upper, re.IGNORECASE | re.MULTILINE):
@@ -523,8 +632,94 @@ class ReceiptExtractor:
        tva_entries = []
        seen_entries = set()  # To avoid duplicates

-        # Normalize spaces in numbers first (OCR may produce "32. 31")
+        # Check for non-VAT payer (NEPLATITOR DE TVA) - TVA = 0
+        # OCR variants: NEPLATITOR, NEPLATTOR, NEPLATOR, ANEPLATHTOR, MEPLATITOR, etc.
+        # Also handles: "TOTAL NEPLATITOR TVA", "(NEPLATITOR DE TVA)"
+        non_vat_patterns = [
+            # Main pattern - flexible for OCR errors: NEPLAT + any chars + OR/R
+            r'NEPLAT\w*OR',           # NEPLATITOR, NEPLATTOR, NEPLATOR
+            r'[ANM]EPLAT\w*O?R',      # OCR errors: ANEPLATHTOR, MEPLATITOR
+            r'TOTAL\s+NEPLAT',        # TOTAL NEPLATITOR...
+            r'TOTAL\s+[ANM]EPLAT',    # TOTAL ANEPLAT... (OCR error)
+            r'SCUTIT\s*(?:DE\s+)?T[VU]A',  # SCUTIT DE TVA
+            r'NEPLAT\w*\s+T[VU]A',    # NEPLATITOR TVA
+            r'NEPLAT\w*\s+DE\s+T',    # NEPLATITOR DE T... (truncated)
+        ]
+        for pattern in non_vat_patterns:
+            if re.search(pattern, text, re.IGNORECASE):
+                # Non-VAT payer - return TVA = 0
+                return [{'code': 'D', 'percent': 0, 'amount': Decimal('0.00')}], Decimal('0.00')
+
+        # Normalize spaces in numbers first (OCR may produce "32. 31" or "49, 58")
        normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text)
+        # Also collapse "comma + space" inside percentages to a dot (e.g. "21, 00%" -> "21.00%")
+        normalized_text = re.sub(r'(\d+),\s+(\d{2})\s*%', r'\1.\2%', normalized_text)
+
+        # Pattern 0a: First try to get TVA from "TOTAL TAXE:" which is most reliable
+        # Format: "TOTAL TAXE: 55,22" - this is always the TVA amount
+        # OCR may cut "T" producing "OTAL TAXE:" instead of "TOTAL TAXE:"
+        # OCR may also put amount BEFORE "OTAL TAXE": "55,22OTAL TAXE:"
+        total_taxe_pattern = r'T?OTAL\s+TAXE\s*:?\s*([\d\s.,]+)'
+        taxe_match = re.search(total_taxe_pattern, normalized_text, re.IGNORECASE)
+
+        # Also try pattern where amount comes BEFORE "OTAL TAXE" (OCR line break issue)
+        if not taxe_match:
+            reversed_taxe_pattern = r'([\d.,]+)\s*T?OTAL\s+TAXE'
+            taxe_match = re.search(reversed_taxe_pattern, normalized_text, re.IGNORECASE)
+
+        if taxe_match:
+            # Also need to find the TVA rate from the table
+            # Pattern handles: "A-21%", "-21,00%", "21%" etc.
+            rate_pattern = r'([A-D])?\s*[-:]?\s*(\d{1,2})[.,]?\s*\d{0,2}\s*%'
+            rate_match = re.search(rate_pattern, normalized_text, re.IGNORECASE)
+            if rate_match:
+                try:
+                    code = rate_match.group(1).upper() if rate_match.group(1) else 'A'  # Default to A if missing
+                    percent = int(rate_match.group(2))
+                    amount_str = taxe_match.group(1).replace(' ', '')
+                    amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
+                    amount = Decimal(amount_str)
+                    if amount > 0:
+                        entry_key = (code, percent)
+                        if entry_key not in seen_entries:
+                            tva_entries.append({
+                                'code': code,
+                                'percent': percent,
+                                'amount': amount
+                            })
+                            seen_entries.add(entry_key)
+                except (ValueError, InvalidOperation):
+                    pass
+
+        # Pattern 0b: Table format "A-21,00%  285,66  49,58" (code-percent  base  tva_amount)
+        # This format appears after a TVA header line like "TVA  TOTAL  VALDARE"
+        # The TVA amount position depends on header: VALDARE last = TVA last, VALOARE middle = TVA middle
+        if not tva_entries:
+            table_pattern = r'([A-D])\s*[-:]\s*(\d{1,2})[.,]\s*\d{2}\s*%\s*([\d\s.,]+)\s+([\d\s.,]+)'
+            for match in re.finditer(table_pattern, normalized_text, re.IGNORECASE):
+                try:
+                    code = match.group(1).upper()
+                    percent = int(match.group(2))
+                    amount1_str = match.group(3).replace(' ', '')
+                    amount2_str = match.group(4).replace(' ', '')
+                    amount1 = Decimal(self._normalize_number(re.sub(r'[^\d.,]', '', amount1_str)))
+                    amount2 = Decimal(self._normalize_number(re.sub(r'[^\d.,]', '', amount2_str)))
+
+                    # Determine which is TVA: the smaller amount is usually TVA
+                    # (TVA is a fraction of the total, so it's always smaller)
+                    tva_amount = min(amount1, amount2)
+
+                    if tva_amount > 0:
+                        entry_key = (code, percent)
+                        if entry_key not in seen_entries:
+                            tva_entries.append({
+                                'code': code,
+                                'percent': percent,
+                                'amount': tva_amount
+                            })
+                            seen_entries.add(entry_key)
+                except (ValueError, InvalidOperation):
+                    continue

        # Pattern 1: "TVA A - 19%: 15.20" or "TVAA - 21% 32.31" (with code)
        # OCR tolerant: TUA, TVR, etc.
@@ -571,7 +766,75 @@ class ReceiptExtractor:
                except (ValueError, InvalidOperation):
                    continue

-        # Pattern 3: "TVAA - 21%" on one line, amount on next line
+        # Pattern 3: "TOTAL TVA A - 21%" with amount on same line or "TOTAL TVA BON" with amount
+        if not tva_entries:
+            # First try: "TOTAL TVA A - 21%  32.31" (amount on same line)
+            tva_with_amount = r'TOTAL\s+T[VU][AR]\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*([\d.,]+)'
+            for match in re.finditer(tva_with_amount, normalized_text, re.IGNORECASE):
+                try:
+                    code = match.group(1).upper()
+                    percent = int(match.group(2))
+                    amount_str = self._normalize_number(match.group(3))
+                    amount = Decimal(amount_str)
+                    if amount > 0:
+                        entry_key = (code, percent)
+                        if entry_key not in seen_entries:
+                            tva_entries.append({
+                                'code': code,
+                                'percent': percent,
+                                'amount': amount
+                            })
+                            seen_entries.add(entry_key)
+                except (ValueError, InvalidOperation):
+                    continue
+
+        # Pattern 3b: "TOTAL TVA A - 21%" on one line, look for "TOTAL TVA BON" amount
+        if not tva_entries:
+            tva_total_pattern = r'TOTAL\s+T[VU][AR]\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%'
+            for match in re.finditer(tva_total_pattern, normalized_text, re.IGNORECASE):
+                try:
+                    code = match.group(1).upper()
+                    percent = int(match.group(2))
+
+                    # Look for "TOTAL TVA BON" followed by amount
+                    tva_bon_pattern = r'TOTAL\s+T[VU][AR]\s+BON[:\s]*([\d.,]+)'
+                    tva_bon_match = re.search(tva_bon_pattern, normalized_text, re.IGNORECASE)
+                    if tva_bon_match:
+                        amount_str = self._normalize_number(tva_bon_match.group(1))
+                        amount = Decimal(amount_str)
+                        if amount > 0:
+                            entry_key = (code, percent)
+                            if entry_key not in seen_entries:
+                                tva_entries.append({
+                                    'code': code,
+                                    'percent': percent,
+                                    'amount': amount
+                                })
+                                seen_entries.add(entry_key)
+                            continue
+
+                    # Fallback: Amount after TOTAL TVA BON on next line
+                    tva_bon_pos = re.search(r'TOTAL\s+T[VU][AR]\s+BON', normalized_text, re.IGNORECASE)
+                    if tva_bon_pos:
+                        after_bon = normalized_text[tva_bon_pos.end():]
+                        # Find first standalone number (likely TVA amount)
+                        amount_match = re.search(r'[\s\n]*([\d]+[.,]\d{2})\s*\n', after_bon)
+                        if amount_match:
+                            amount_str = self._normalize_number(amount_match.group(1))
+                            amount = Decimal(amount_str)
+                            if amount > 0:
+                                entry_key = (code, percent)
+                                if entry_key not in seen_entries:
+                                    tva_entries.append({
+                                        'code': code,
+                                        'percent': percent,
+                                        'amount': amount
+                                    })
+                                    seen_entries.add(entry_key)
+                except (ValueError, InvalidOperation):
+                    continue
+
+        # Pattern 3c: "TVAA - 21%" on one line, amount on next line (simpler format)
        if not tva_entries:
            tva_line_pattern = r'T[VU][AR]\s*([A-D])?\s*[-:]\s*(\d{1,2})\s*%'
            for match in re.finditer(tva_line_pattern, normalized_text, re.IGNORECASE):