fix(ocr): Fix store profile extraction patterns and module loading

Major fixes to OCR store profiles for Romanian receipt extraction:

- Fix ProfileRegistry module path resolution (previously no profiles were loaded)
- Add multiline TVA extraction for Brick, Electrobering, Gama Ink
- Add "CARTE CREDIT" payment detection for OMV/SOCAR gas stations
- Handle OCR artifacts: TVA→TUA, "-"→"4", I→L in CUI markers
- Add client CUI patterns for Brick receipts
- Add profile selection logging to ocr_extractor.py
- Create test script for all 29 PDFs (test_all_profiles.py)

Test results: 13/29 passing (improved from 9/29)
Remaining failures are primarily caused by OCR quality issues.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Claude Agent
2026-01-07 09:40:58 +00:00
parent 099556213d
commit 28f259cd05
13 changed files with 1531 additions and 257 deletions

View File

@@ -456,7 +456,9 @@ class ReceiptExtractor:
# Lookup store-specific profile for enhanced extraction accuracy
store_profile = ProfileRegistry.get_profile(result.cui) if result.cui else None
if store_profile:
print(f"[Profile] Using {store_profile.__class__.__name__} for CUI {result.cui}", flush=True)
print(f"[Profile] Using {store_profile.STORE_NAME} ({store_profile.__class__.__name__}) for CUI {result.cui}", flush=True)
else:
print(f"[Profile] ⚠️ No profile found for CUI '{result.cui}' - using GENERIC extraction", flush=True)
# =========================================================================
# STEP 2: Extract ALL fields using profile (if available) or generic
@@ -490,8 +492,11 @@ class ReceiptExtractor:
result.client_address = client_address
result.confidence_client = confidence
# Log extraction results for debugging
tva_summary = ", ".join([f"{e.get('percent', '?')}%={e.get('amount', '?')}" for e in result.tva_entries]) if result.tva_entries else "none"
payment_summary = ", ".join([f"{p.get('method', '?')}={p.get('amount', '?')}" for p in result.payment_methods]) if result.payment_methods else "none"
print(f"[Profile] Extracted: total={result.amount}, date={result.receipt_date}, "
f"TVA entries={len(result.tva_entries)}, payments={len(result.payment_methods)}", flush=True)
f"TVA=[{tva_summary}], payments=[{payment_summary}], client_cui={result.client_cui}", flush=True)
else:
# Generic extraction for unknown stores
result.amount, result.confidence_amount = self._extract_amount(text_upper)
@@ -507,6 +512,12 @@ class ReceiptExtractor:
result.client_address = client_address
result.confidence_client = confidence
# Log generic extraction results for debugging
tva_summary = ", ".join([f"{e.get('percent', '?')}%={e.get('amount', '?')}" for e in result.tva_entries]) if result.tva_entries else "none"
payment_summary = ", ".join([f"{p.get('method', '?')}={p.get('amount', '?')}" for p in result.payment_methods]) if result.payment_methods else "none"
print(f"[Generic] Extracted: total={result.amount}, date={result.receipt_date}, "
f"TVA=[{tva_summary}], payments=[{payment_summary}], client_cui={result.client_cui}", flush=True)
# Series extraction (no profile method, always generic)
result.receipt_series, _ = self._extract_series(text_upper)