From 2f7ef55868b9a84a35f8da36992c7e200349dc96 Mon Sep 17 00:00:00 2001 From: Marius Mutu Date: Sun, 4 Jan 2026 05:34:31 +0200 Subject: [PATCH] fix(ocr): Improve CUI matching and vendor name extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add CUI variant matching for Romanian fiscal codes (handles "RO22891860", "RO 22891860", and "22891860" formats) in both sync_service and validation - Fix vendor name extraction to properly handle "SC." prefix (Societate Comercială) vs "SC" as staircase in addresses - Remove problematic TVA pattern that was incorrectly matching percentage values - Add docTR Plus engine option to dropdown with "(recomandat)" label 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .../data_entry/services/ocr/validation.py | 12 +++++-- .../data_entry/services/ocr_extractor.py | 19 ++++++++--- .../data_entry/services/sync_service.py | 33 +++++++++++++++++-- .../components/ocr/OCRUploadZone.vue | 6 +++- 4 files changed, 61 insertions(+), 9 deletions(-) diff --git a/backend/modules/data_entry/services/ocr/validation.py b/backend/modules/data_entry/services/ocr/validation.py index 1bc6f9b..99a3988 100644 --- a/backend/modules/data_entry/services/ocr/validation.py +++ b/backend/modules/data_entry/services/ocr/validation.py @@ -1129,14 +1129,21 @@ class OCRValidationEngine: # Helper to search database for CUI async def lookup_cui_in_db(digits: str) -> Optional[tuple[str, str]]: """Search both synced and local suppliers for CUI.""" - # Search patterns: with and without RO prefix - search_patterns = [digits, f"RO{digits}"] + # Search patterns: with and without RO prefix, with and without space + # Database may have: "22891860", "RO22891860", "RO 22891860" + search_patterns = [ + digits, # Just digits: 22891860 + f"RO{digits}", # With RO prefix: RO22891860 + f"RO {digits}", # With RO prefix and space: RO 22891860 + digits.lstrip('0'), # Handle leading zeros + ] # Search synced_suppliers first (more data) stmt = select(SyncedSupplier.fiscal_code, SyncedSupplier.name).where( or_( SyncedSupplier.fiscal_code == digits, SyncedSupplier.fiscal_code == f"RO{digits}", + SyncedSupplier.fiscal_code == f"RO {digits}", # With space SyncedSupplier.fiscal_code == digits.lstrip('0'), # Handle leading zeros ) ).limit(1) @@ -1150,6 +1157,7 @@ class OCRValidationEngine: or_( LocalSupplier.fiscal_code == digits, LocalSupplier.fiscal_code == f"RO{digits}", + LocalSupplier.fiscal_code == f"RO {digits}", # With space LocalSupplier.fiscal_code == digits.lstrip('0'), ) ).limit(1) diff --git a/backend/modules/data_entry/services/ocr_extractor.py b/backend/modules/data_entry/services/ocr_extractor.py index f2ac6a3..5a87190 100644 --- a/backend/modules/data_entry/services/ocr_extractor.py +++ b/backend/modules/data_entry/services/ocr_extractor.py @@ -228,9 +228,9 @@ class ReceiptExtractor: # Handles: "TOTAL TA F 194" where TVA became "TA F" (r'TOTAL\s+TA\s*[F\s]?\s*\d*\s*:?\s*([\d\s.,]+)', 0.85), (r'TA\s+[FA-Z]?\s*\d{1,2}\s*%?\s*:?\s*([\d\s.,]+)', 0.82), - # "TUA" with random letter after (OCR noise): "TUA F", "TUA I" - (r'T[VU]A\s+[A-Z]?\s*\d*\s*:?\s*([\d\s.,]+)', 0.83), - # Simple TVA/IVA pattern + # NOTE: Removed problematic pattern for "TUA F" (OCR noise) that was matching + # percentage values like "TVA A\n19,00%" incorrectly. Pattern 12 handles these cases. + # Simple TVA/IVA pattern - this is the reliable fallback (r'(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.85), # Standalone percentage line near TVA (r'(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.75), @@ -925,8 +925,19 @@ class ReceiptExtractor: # Normalize whitespace name = re.sub(r'\s+', ' ', name).strip() + name_upper = name.upper() + # Skip if it looks like an address line only - if re.match(r'^(STR|JUD|MUN|NR|BL|SC|ET|AP)\.?\s', name.upper()): + # Note: SC (Scara/staircase) is tricky because S.C. also means "Societate Comercială" (company) + # Only reject SC when followed by a number (staircase), not when followed by company name + # Pattern: STR, JUD, MUN, NR, BL, ET, AP are always address prefixes + # SC is only address when followed by digit (e.g., "SC 2", "SC. 5") + if re.match(r'^(STR|JUD|MUN|NR|BL|ET|AP)\.?\s', name_upper): + return None + + # SC followed by digit = staircase (address), reject + # SC followed by letter/company name = "Societate Comercială", keep + if re.match(r'^S\.?\s*C\.?\s+\d', name_upper): return None # Skip if too short after cleaning diff --git a/backend/modules/data_entry/services/sync_service.py b/backend/modules/data_entry/services/sync_service.py index 3f59694..405f66a 100644 --- a/backend/modules/data_entry/services/sync_service.py +++ b/backend/modules/data_entry/services/sync_service.py @@ -237,6 +237,31 @@ class SyncService: return synced, errors + @staticmethod + def _get_fiscal_code_variants(fiscal_code: str) -> list: + """ + Generate all possible variants of a Romanian fiscal code (CUI). + Database may store: "22891860", "RO22891860", "RO 22891860" + OCR may extract: "RO22891860" or "22891860" + """ + import re + # Extract just the digits + digits = re.sub(r'[^0-9]', '', fiscal_code) + if not digits: + return [fiscal_code] + + # Generate all variants + variants = [ + digits, # Just digits: 22891860 + f"RO{digits}", # With RO prefix: RO22891860 + f"RO {digits}", # With RO prefix and space: RO 22891860 + ] + # Also add the original if different + if fiscal_code not in variants: + variants.append(fiscal_code) + + return variants + @staticmethod async def search_supplier( session: AsyncSession, @@ -251,9 +276,11 @@ class SyncService: """ # 1. Search in synced suppliers if fiscal_code: + # Search all variants of the fiscal code (with/without RO, with/without space) + variants = SyncService._get_fiscal_code_variants(fiscal_code) stmt = select(SyncedSupplier).where( SyncedSupplier.company_id == company_id, - SyncedSupplier.fiscal_code == fiscal_code + SyncedSupplier.fiscal_code.in_(variants) ) elif name: stmt = select(SyncedSupplier).where( @@ -276,9 +303,11 @@ class SyncService: # 2. Search in local suppliers if fiscal_code: + # Search all variants of the fiscal code (with/without RO, with/without space) + variants = SyncService._get_fiscal_code_variants(fiscal_code) stmt = select(LocalSupplier).where( LocalSupplier.company_id == company_id, - LocalSupplier.fiscal_code == fiscal_code + LocalSupplier.fiscal_code.in_(variants) ) elif name: stmt = select(LocalSupplier).where( diff --git a/src/modules/data-entry/components/ocr/OCRUploadZone.vue b/src/modules/data-entry/components/ocr/OCRUploadZone.vue index b67352c..25fce18 100644 --- a/src/modules/data-entry/components/ocr/OCRUploadZone.vue +++ b/src/modules/data-entry/components/ocr/OCRUploadZone.vue @@ -113,6 +113,10 @@ const engineConfig = { label: 'docTR', desc: 'Rapid, bună acuratețe' }, + 'doctr_plus': { + label: 'docTR Plus', + desc: '2 treceri secvențiale · (recomandat)' + }, 'paddleocr': { label: 'PaddleOCR', desc: 'Cea mai bună calitate' @@ -123,7 +127,7 @@ const engineConfig = { }, 'hybrid': { label: 'Hybrid', - desc: 'docTR+Tess paralel · Recomandat' + desc: 'docTR+Tess paralel' }, 'hybrid-quality': { label: 'Hybrid Calitate',