fix(ocr): Fix store profile extraction patterns and module loading
Major fixes to OCR store profiles for Romanian receipt extraction:

- Fix ProfileRegistry module path resolution (was loading 0 profiles)
- Add multiline TVA extraction for Brick, Electrobering, Gama Ink
- Add "CARTE CREDIT" payment detection for OMV/SOCAR gas stations
- Handle OCR artifacts: TVA→TUA, "-"→"4", I→L in CUI markers
- Add client CUI patterns for Brick receipts
- Add profile selection logging to ocr_extractor.py
- Create test script for all 29 PDFs (test_all_profiles.py)

Test results: 13/29 passing (improved from 9/29).
Remaining failures are primarily OCR quality issues.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -456,7 +456,9 @@ class ReceiptExtractor:
|
||||
# Lookup store-specific profile for enhanced extraction accuracy
|
||||
store_profile = ProfileRegistry.get_profile(result.cui) if result.cui else None
|
||||
if store_profile:
|
||||
print(f"[Profile] Using {store_profile.__class__.__name__} for CUI {result.cui}", flush=True)
|
||||
print(f"[Profile] ✅ Using {store_profile.STORE_NAME} ({store_profile.__class__.__name__}) for CUI {result.cui}", flush=True)
|
||||
else:
|
||||
print(f"[Profile] ⚠️ No profile found for CUI '{result.cui}' - using GENERIC extraction", flush=True)
|
||||
|
||||
# =========================================================================
|
||||
# STEP 2: Extract ALL fields using profile (if available) or generic
|
||||
@@ -490,8 +492,11 @@ class ReceiptExtractor:
|
||||
result.client_address = client_address
|
||||
result.confidence_client = confidence
|
||||
|
||||
# Log extraction results for debugging
|
||||
tva_summary = ", ".join([f"{e.get('percent', '?')}%={e.get('amount', '?')}" for e in result.tva_entries]) if result.tva_entries else "none"
|
||||
payment_summary = ", ".join([f"{p.get('method', '?')}={p.get('amount', '?')}" for p in result.payment_methods]) if result.payment_methods else "none"
|
||||
print(f"[Profile] Extracted: total={result.amount}, date={result.receipt_date}, "
|
||||
f"TVA entries={len(result.tva_entries)}, payments={len(result.payment_methods)}", flush=True)
|
||||
f"TVA=[{tva_summary}], payments=[{payment_summary}], client_cui={result.client_cui}", flush=True)
|
||||
else:
|
||||
# Generic extraction for unknown stores
|
||||
result.amount, result.confidence_amount = self._extract_amount(text_upper)
|
||||
@@ -507,6 +512,12 @@ class ReceiptExtractor:
|
||||
result.client_address = client_address
|
||||
result.confidence_client = confidence
|
||||
|
||||
# Log generic extraction results for debugging
|
||||
tva_summary = ", ".join([f"{e.get('percent', '?')}%={e.get('amount', '?')}" for e in result.tva_entries]) if result.tva_entries else "none"
|
||||
payment_summary = ", ".join([f"{p.get('method', '?')}={p.get('amount', '?')}" for p in result.payment_methods]) if result.payment_methods else "none"
|
||||
print(f"[Generic] Extracted: total={result.amount}, date={result.receipt_date}, "
|
||||
f"TVA=[{tva_summary}], payments=[{payment_summary}], client_cui={result.client_cui}", flush=True)
|
||||
|
||||
# Series extraction (no profile method, always generic)
|
||||
result.receipt_series, _ = self._extract_series(text_upper)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user