fix(ocr): Fix store profile extraction patterns and module loading

Major fixes to OCR store profiles for Romanian receipt extraction:

- Fix ProfileRegistry module path resolution (was loading 0 profiles)
- Add multiline TVA extraction for Brick, Electrobering, Gama Ink
- Add "CARTE CREDIT" payment detection for OMV/SOCAR gas stations
- Handle OCR artifacts: TVA→TUA, "-"→"4", I→L in CUI markers
- Add client CUI patterns for Brick receipts
- Add profile selection logging to ocr_extractor.py
- Create test script for all 29 PDFs (test_all_profiles.py)

Test results: 13/29 passing (improved from 9/29).
Remaining failures are primarily OCR quality issues.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-07 09:40:58 +00:00
parent 099556213d
commit 28f259cd05
13 changed files with 1531 additions and 257 deletions
--- a/backend/modules/data_entry/services/ocr_extractor.py
+++ b/backend/modules/data_entry/services/ocr_extractor.py
@@ -456,7 +456,9 @@ class ReceiptExtractor:
        # Lookup store-specific profile for enhanced extraction accuracy
        store_profile = ProfileRegistry.get_profile(result.cui) if result.cui else None
        if store_profile:
-            print(f"[Profile] Using {store_profile.__class__.__name__} for CUI {result.cui}", flush=True)
+            print(f"[Profile] ✅ Using {store_profile.STORE_NAME} ({store_profile.__class__.__name__}) for CUI {result.cui}", flush=True)
+        else:
+            print(f"[Profile] ⚠️ No profile found for CUI '{result.cui}' - using GENERIC extraction", flush=True)

        # =========================================================================
        # STEP 2: Extract ALL fields using profile (if available) or generic
@@ -490,8 +492,11 @@ class ReceiptExtractor:
                result.client_address = client_address
                result.confidence_client = confidence

+            # Log extraction results for debugging
+            tva_summary = ", ".join([f"{e.get('percent', '?')}%={e.get('amount', '?')}" for e in result.tva_entries]) if result.tva_entries else "none"
+            payment_summary = ", ".join([f"{p.get('method', '?')}={p.get('amount', '?')}" for p in result.payment_methods]) if result.payment_methods else "none"
            print(f"[Profile] Extracted: total={result.amount}, date={result.receipt_date}, "
-                  f"TVA entries={len(result.tva_entries)}, payments={len(result.payment_methods)}", flush=True)
+                  f"TVA=[{tva_summary}], payments=[{payment_summary}], client_cui={result.client_cui}", flush=True)
        else:
            # Generic extraction for unknown stores
            result.amount, result.confidence_amount = self._extract_amount(text_upper)
@@ -507,6 +512,12 @@ class ReceiptExtractor:
            result.client_address = client_address
            result.confidence_client = confidence

+            # Log generic extraction results for debugging
+            tva_summary = ", ".join([f"{e.get('percent', '?')}%={e.get('amount', '?')}" for e in result.tva_entries]) if result.tva_entries else "none"
+            payment_summary = ", ".join([f"{p.get('method', '?')}={p.get('amount', '?')}" for p in result.payment_methods]) if result.payment_methods else "none"
+            print(f"[Generic] Extracted: total={result.amount}, date={result.receipt_date}, "
+                  f"TVA=[{tva_summary}], payments=[{payment_summary}], client_cui={result.client_cui}", flush=True)
+
        # Series extraction (no profile method, always generic)
        result.receipt_series, _ = self._extract_series(text_upper)