fix(ocr): Improve CUI matching and vendor name extraction

- Add CUI variant matching for Romanian fiscal codes (handles "RO22891860",
  "RO 22891860", and "22891860" formats) in both sync_service and validation
- Fix vendor name extraction to properly handle "SC." prefix (Societate
  Comercială) vs "SC" as staircase in addresses
- Remove problematic TVA pattern that was incorrectly matching percentage values
- Add docTR Plus engine option to dropdown with "(recomandat)" label

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-04 05:34:31 +02:00
parent f1f6760bef
commit 2f7ef55868
4 changed files with 61 additions and 9 deletions

View File

@@ -228,9 +228,9 @@ class ReceiptExtractor:
# Handles: "TOTAL TA F 194" where TVA became "TA F"
(r'TOTAL\s+TA\s*[F\s]?\s*\d*\s*:?\s*([\d\s.,]+)', 0.85),
(r'TA\s+[FA-Z]?\s*\d{1,2}\s*%?\s*:?\s*([\d\s.,]+)', 0.82),
# "TUA" with random letter after (OCR noise): "TUA F", "TUA I"
(r'T[VU]A\s+[A-Z]?\s*\d*\s*:?\s*([\d\s.,]+)', 0.83),
# Simple TVA/IVA pattern
# NOTE: Removed the "TUA F" (OCR noise) pattern because it incorrectly matched
# percentage values such as "TVA A\n19,00%". Pattern 12 handles these cases.
# Simple TVA/IVA pattern - this is the reliable fallback
(r'(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.85),
# Standalone percentage line near TVA
(r'(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.75),
@@ -925,8 +925,19 @@ class ReceiptExtractor:
# Normalize whitespace
name = re.sub(r'\s+', ' ', name).strip()
name_upper = name.upper()
# Skip if it looks like an address line only
if re.match(r'^(STR|JUD|MUN|NR|BL|SC|ET|AP)\.?\s', name.upper()):
# Note: SC (Scara/staircase) is tricky because S.C. also means "Societate Comercială" (company)
# Only reject SC when followed by a number (staircase), not when followed by company name
# Pattern: STR, JUD, MUN, NR, BL, ET, AP are always address prefixes
# SC is only address when followed by digit (e.g., "SC 2", "SC. 5")
if re.match(r'^(STR|JUD|MUN|NR|BL|ET|AP)\.?\s', name_upper):
return None
# SC followed by digit = staircase (address), reject
# SC followed by letter/company name = "Societate Comercială", keep
if re.match(r'^S\.?\s*C\.?\s+\d', name_upper):
return None
# Skip if too short after cleaning