fix(ocr): Fix store profile extraction patterns and module loading
Major fixes to OCR store profiles for Romanian receipt extraction:

- Fix ProfileRegistry module path resolution (was loading 0 profiles)
- Add multiline TVA extraction for Brick, Electrobering, Gama Ink
- Add "CARTE CREDIT" payment detection for OMV/SOCAR gas stations
- Handle OCR artifacts: TVA→TUA, "-"→"4", I→L in CUI markers
- Add client CUI patterns for Brick receipts
- Add profile selection logging to ocr_extractor.py
- Create test script for all 29 PDFs (test_all_profiles.py)

Test results: 13/29 passing (improved from 9/29).
Remaining failures are primarily OCR quality issues.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -111,25 +111,34 @@ class BaseStoreProfile(ABC):
|
||||
(r'(?:^|\n|\s)MERAR\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'NUMERAR', 0.70),
|
||||
]
|
||||
|
||||
# Client section markers (for B2B receipts)
|
||||
# Client section markers (for B2B receipts) - More flexible patterns
|
||||
CLIENT_MARKERS = [
|
||||
r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:',
|
||||
r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:',
|
||||
r'CLIENT\s+C\.?\s*[UI1]\.?\s*[IF1]\.?\s*:',
|
||||
r'CLIENT\s*:',
|
||||
r'CUMPARATOR\s*:',
|
||||
r'BENEFICIAR\s*:',
|
||||
r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT', # "CIF CLIENT" (with or without colon)
|
||||
r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT', # "CUI CLIENT"
|
||||
r'CLIENT\s+C\.?\s*[UI1]\.?\s*[IF1]', # "CLIENT CIF" / "CLIENT CUI"
|
||||
r'CLIENT\s*:', # "CLIENT:"
|
||||
r'CUMPARATOR\s*:', # "CUMPARATOR:"
|
||||
r'BENEFICIAR\s*:', # "BENEFICIAR:"
|
||||
r'CUMP[AĂ]R[AĂ]TOR', # "CUMPARATOR" without colon
|
||||
r'COD\s+FISCAL\s+CLIENT', # "COD FISCAL CLIENT"
|
||||
]
|
||||
|
||||
# Client CUI patterns (pattern, confidence)
|
||||
# Client CUI patterns (pattern, confidence) - More flexible
|
||||
CLIENT_CUI_PATTERNS = [
|
||||
(r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*U\.?\s*[I1]\.?', 0.99),
|
||||
(r'(R[O0]\d{6,10})\s*:?\s*\n\s*CLIENT', 0.98),
|
||||
(r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]\d{6,10})', 0.98),
|
||||
(r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
|
||||
(r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
|
||||
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.95),
|
||||
(r'CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.90),
|
||||
# "CIF CLIENT:XXXXXXX" or "CIF CLIENT: ROXXXXXXX" - most common format
|
||||
(r'C\.?[I1]\.?F\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.99),
|
||||
# "CLIENT CIF: XXXXXXX"
|
||||
(r'CLIENT\s+C\.?[I1]\.?F\.?\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
|
||||
# "CUI CLIENT: XXXXXXX"
|
||||
(r'C\.?U\.?[I1]\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
|
||||
# "ROXXXXXXX" followed by CLIENT marker
|
||||
(r'(R[O0]\d{6,10})\s*\n\s*CLIENT', 0.97),
|
||||
# "C.I.F. CLIENT: XXXXXXX"
|
||||
(r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.96),
|
||||
# "CLIENT: ROXXXXXXX" or "CLIENT: XXXXXXX"
|
||||
(r'CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.90),
|
||||
# "COD FISCAL CLIENT: XXXXXXX"
|
||||
(r'COD\s+FISCAL\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.95),
|
||||
]
|
||||
|
||||
# Company type indicators (for identifying company names)
|
||||
|
||||
Reference in New Issue
Block a user