fix(ocr): Improve CUI matching and vendor name extraction
- Add CUI variant matching for Romanian fiscal codes (handles "RO22891860", "RO 22891860", and "22891860" formats) in both sync_service and validation
- Fix vendor name extraction to properly handle the "SC." prefix (Societate Comercială) vs. "SC" as a staircase marker in addresses
- Remove the problematic TVA pattern that was incorrectly matching percentage values
- Add the docTR Plus engine option to the dropdown with a "(recomandat)" label

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -228,9 +228,9 @@ class ReceiptExtractor:
|
||||
# Handles: "TOTAL TA F 194" where TVA became "TA F"
|
||||
(r'TOTAL\s+TA\s*[F\s]?\s*\d*\s*:?\s*([\d\s.,]+)', 0.85),
|
||||
(r'TA\s+[FA-Z]?\s*\d{1,2}\s*%?\s*:?\s*([\d\s.,]+)', 0.82),
|
||||
# "TUA" with random letter after (OCR noise): "TUA F", "TUA I"
|
||||
(r'T[VU]A\s+[A-Z]?\s*\d*\s*:?\s*([\d\s.,]+)', 0.83),
|
||||
# Simple TVA/IVA pattern
|
||||
# NOTE: Removed problematic pattern for "TUA F" (OCR noise) that was matching
|
||||
# percentage values like "TVA A\n19,00%" incorrectly. Pattern 12 handles these cases.
|
||||
# Simple TVA/IVA pattern - this is the reliable fallback
|
||||
(r'(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.85),
|
||||
# Standalone percentage line near TVA
|
||||
(r'(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.75),
|
||||
@@ -925,8 +925,19 @@ class ReceiptExtractor:
|
||||
# Normalize whitespace
|
||||
name = re.sub(r'\s+', ' ', name).strip()
|
||||
|
||||
name_upper = name.upper()
|
||||
|
||||
# Skip if it looks like an address line only
|
||||
if re.match(r'^(STR|JUD|MUN|NR|BL|SC|ET|AP)\.?\s', name.upper()):
|
||||
# Note: SC (Scara/staircase) is tricky because S.C. also means "Societate Comercială" (company)
|
||||
# Only reject SC when followed by a number (staircase), not when followed by company name
|
||||
# Pattern: STR, JUD, MUN, NR, BL, ET, AP are always address prefixes
|
||||
# SC is only address when followed by digit (e.g., "SC 2", "SC. 5")
|
||||
if re.match(r'^(STR|JUD|MUN|NR|BL|ET|AP)\.?\s', name_upper):
|
||||
return None
|
||||
|
||||
# SC followed by digit = staircase (address), reject
|
||||
# SC followed by letter/company name = "Societate Comercială", keep
|
||||
if re.match(r'^S\.?\s*C\.?\s+\d', name_upper):
|
||||
return None
|
||||
|
||||
# Skip if too short after cleaning
|
||||
|
||||
Reference in New Issue
Block a user