fix(ocr): Improve CUI matching and vendor name extraction

- Add CUI variant matching for Romanian fiscal codes (handles "RO22891860", "RO 22891860", and "22891860" formats) in both sync_service and validation
- Fix vendor name extraction to properly handle "SC." prefix (Societate Comercială) vs "SC" as staircase in addresses
- Remove problematic TVA pattern that was incorrectly matching percentage values
- Add docTR Plus engine option to dropdown with "(recomandat)" label

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-04 05:34:31 +02:00
parent f1f6760bef
commit 2f7ef55868
4 changed files with 61 additions and 9 deletions
--- a/backend/modules/data_entry/services/ocr_extractor.py
+++ b/backend/modules/data_entry/services/ocr_extractor.py
@@ -228,9 +228,9 @@ class ReceiptExtractor:
        # Handles: "TOTAL TA F 194" where TVA became "TA F"
        (r'TOTAL\s+TA\s*[F\s]?\s*\d*\s*:?\s*([\d\s.,]+)', 0.85),
        (r'TA\s+[FA-Z]?\s*\d{1,2}\s*%?\s*:?\s*([\d\s.,]+)', 0.82),
-        # "TUA" with random letter after (OCR noise): "TUA F", "TUA I"
-        (r'T[VU]A\s+[A-Z]?\s*\d*\s*:?\s*([\d\s.,]+)', 0.83),
-        # Simple TVA/IVA pattern
+        # NOTE: Removed problematic pattern for "TUA F" (OCR noise) that was matching
+        # percentage values like "TVA A\n19,00%" incorrectly. Pattern 12 handles these cases.
+        # Simple TVA/IVA pattern - this is the reliable fallback
        (r'(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.85),
        # Standalone percentage line near TVA
        (r'(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.75),
@@ -925,8 +925,19 @@ class ReceiptExtractor:
        # Normalize whitespace
        name = re.sub(r'\s+', ' ', name).strip()

+        name_upper = name.upper()
+
        # Skip if it looks like an address line only
-        if re.match(r'^(STR|JUD|MUN|NR|BL|SC|ET|AP)\.?\s', name.upper()):
+        # Note: SC (Scara/staircase) is tricky because S.C. also means "Societate Comercială" (company)
+        # Only reject SC when followed by a number (staircase), not when followed by company name
+        # Pattern: STR, JUD, MUN, NR, BL, ET, AP are always address prefixes
+        #          SC is only address when followed by digit (e.g., "SC 2", "SC. 5")
+        if re.match(r'^(STR|JUD|MUN|NR|BL|ET|AP)\.?\s', name_upper):
+            return None
+
+        # SC followed by digit = staircase (address), reject
+        # SC followed by letter/company name = "Societate Comercială", keep
+        if re.match(r'^S\.?\s*C\.?\s+\d', name_upper):
            return None

        # Skip if too short after cleaning