fix(ocr): Improve CUI matching and vendor name extraction

- Add CUI variant matching for Romanian fiscal codes (handles "RO22891860",
  "RO 22891860", and "22891860" formats) in both sync_service and validation
- Fix vendor name extraction to distinguish the "SC." company prefix (Societate
  Comercială) from "SC" used as a staircase marker (scara) in addresses
- Remove problematic TVA pattern that was incorrectly matching percentage values
- Add docTR Plus engine option to dropdown with "(recomandat)" label

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-04 05:34:31 +02:00
parent f1f6760bef
commit 2f7ef55868
4 changed files with 61 additions and 9 deletions

View File

@@ -1129,14 +1129,21 @@ class OCRValidationEngine:
# Helper to search database for CUI
async def lookup_cui_in_db(digits: str) -> Optional[tuple[str, str]]:
"""Search both synced and local suppliers for CUI."""
# Search patterns: with and without RO prefix
search_patterns = [digits, f"RO{digits}"]
# Search patterns: with and without RO prefix, with and without space
# Database may have: "22891860", "RO22891860", "RO 22891860"
search_patterns = [
digits, # Just digits: 22891860
f"RO{digits}", # With RO prefix: RO22891860
f"RO {digits}", # With RO prefix and space: RO 22891860
digits.lstrip('0'), # Handle leading zeros
]
# Search synced_suppliers first (more data)
stmt = select(SyncedSupplier.fiscal_code, SyncedSupplier.name).where(
or_(
SyncedSupplier.fiscal_code == digits,
SyncedSupplier.fiscal_code == f"RO{digits}",
SyncedSupplier.fiscal_code == f"RO {digits}", # With space
SyncedSupplier.fiscal_code == digits.lstrip('0'), # Handle leading zeros
)
).limit(1)
@@ -1150,6 +1157,7 @@ class OCRValidationEngine:
or_(
LocalSupplier.fiscal_code == digits,
LocalSupplier.fiscal_code == f"RO{digits}",
LocalSupplier.fiscal_code == f"RO {digits}", # With space
LocalSupplier.fiscal_code == digits.lstrip('0'),
)
).limit(1)

View File

@@ -228,9 +228,9 @@ class ReceiptExtractor:
# Handles: "TOTAL TA F 194" where TVA became "TA F"
(r'TOTAL\s+TA\s*[F\s]?\s*\d*\s*:?\s*([\d\s.,]+)', 0.85),
(r'TA\s+[FA-Z]?\s*\d{1,2}\s*%?\s*:?\s*([\d\s.,]+)', 0.82),
# "TUA" with random letter after (OCR noise): "TUA F", "TUA I"
(r'T[VU]A\s+[A-Z]?\s*\d*\s*:?\s*([\d\s.,]+)', 0.83),
# Simple TVA/IVA pattern
# NOTE: Removed problematic pattern for "TUA F" (OCR noise) that was matching
# percentage values like "TVA A\n19,00%" incorrectly. Pattern 12 handles these cases.
# Simple TVA/IVA pattern - this is the reliable fallback
(r'(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.85),
# Standalone percentage line near TVA
(r'(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.75),
@@ -925,8 +925,19 @@ class ReceiptExtractor:
# Normalize whitespace
name = re.sub(r'\s+', ' ', name).strip()
name_upper = name.upper()
# Skip if it looks like an address line only
if re.match(r'^(STR|JUD|MUN|NR|BL|SC|ET|AP)\.?\s', name.upper()):
# Note: SC (Scara/staircase) is tricky because S.C. also means "Societate Comercială" (company)
# Only reject SC when followed by a number (staircase), not when followed by company name
# Pattern: STR, JUD, MUN, NR, BL, ET, AP are always address prefixes
# SC is only address when followed by digit (e.g., "SC 2", "SC. 5")
if re.match(r'^(STR|JUD|MUN|NR|BL|ET|AP)\.?\s', name_upper):
return None
# SC followed by digit = staircase (address), reject
# SC followed by letter/company name = "Societate Comercială", keep
if re.match(r'^S\.?\s*C\.?\s+\d', name_upper):
return None
# Skip if too short after cleaning

View File

@@ -237,6 +237,31 @@ class SyncService:
return synced, errors
@staticmethod
def _get_fiscal_code_variants(fiscal_code: str) -> list:
    """
    Generate the spellings under which a Romanian fiscal code (CUI) may be stored.

    Database may store: "22891860", "RO22891860", "RO 22891860"
    OCR may extract: "RO22891860" or "22891860"

    Args:
        fiscal_code: The fiscal code as extracted or entered; may carry an
            "RO" prefix, spaces or other non-digit characters.

    Returns:
        A list of candidate strings for an exact-match lookup. For each digit
        form (the digits as given, plus the zero-stripped digits when they
        differ) it holds the bare digits, "RO<digits>" and "RO <digits>".
        The original input is appended last when it is not already one of
        those. If the input contains no digits at all, the list holds only
        the original input.
    """
    import re

    # Keep only the digits: drops an "RO" prefix, spaces and any punctuation.
    digits = re.sub(r'[^0-9]', '', fiscal_code)
    if not digits:
        # Nothing numeric to normalise; fall back to matching the input as-is.
        return [fiscal_code]

    # Also try the digits without leading zeros, so this helper accepts the
    # same leading-zero inputs as the CUI lookup in the OCR validation engine.
    # An all-zero input would strip to "", which must never become a variant.
    digit_forms = [digits]
    without_zeros = digits.lstrip('0')
    if without_zeros and without_zeros != digits:
        digit_forms.append(without_zeros)

    variants = []
    for form in digit_forms:
        variants.extend((
            form,           # Just digits: 22891860
            f"RO{form}",    # With RO prefix: RO22891860
            f"RO {form}",   # With RO prefix and space: RO 22891860
        ))

    # Keep the caller's exact spelling too (e.g. lowercase "ro..." or other
    # separators) in case the database stores it verbatim.
    if fiscal_code not in variants:
        variants.append(fiscal_code)
    return variants
@staticmethod
async def search_supplier(
session: AsyncSession,
@@ -251,9 +276,11 @@ class SyncService:
"""
# 1. Search in synced suppliers
if fiscal_code:
# Search all variants of the fiscal code (with/without RO, with/without space)
variants = SyncService._get_fiscal_code_variants(fiscal_code)
stmt = select(SyncedSupplier).where(
SyncedSupplier.company_id == company_id,
SyncedSupplier.fiscal_code == fiscal_code
SyncedSupplier.fiscal_code.in_(variants)
)
elif name:
stmt = select(SyncedSupplier).where(
@@ -276,9 +303,11 @@ class SyncService:
# 2. Search in local suppliers
if fiscal_code:
# Search all variants of the fiscal code (with/without RO, with/without space)
variants = SyncService._get_fiscal_code_variants(fiscal_code)
stmt = select(LocalSupplier).where(
LocalSupplier.company_id == company_id,
LocalSupplier.fiscal_code == fiscal_code
LocalSupplier.fiscal_code.in_(variants)
)
elif name:
stmt = select(LocalSupplier).where(