feat(ocr): Add modular store profiles with hot-reload support

## Store Profiles System
- Add ProfileRegistry for CUI-based profile lookup
- Add BaseStoreProfile with generic extraction patterns
- Implement hot-reload via POST /api/data-entry/ocr/profiles/reload

## 12 Store Profiles
- LIDL: Multi-rate TVA (A, B, C, D codes)
- OMV, SOCAR: B2B with client CUI, YYYY.MM.DD dates
- BRICK, DEDEMAN: Standard TVA, e-factura support
- KINETERRA, BEST PRINT: Non-VAT payers (returns [])
- STEPOUT MARKET: TVA 5% (books/reduced rate)
- UNLIMITED KEYS: NUMERAR payment detection
- GAMA INK, ELECTROBERING, PICTUS VELUM: Standard TVA

## Flexible TVA Patterns
- All TVA patterns capture the rate with (\d{1,2})%, so any one- or two-digit rate is accepted
- Supports both historical rates (19%, 9%, 5%) and current rates (21%, 11%)

## Payment Methods Fix
- Fix base.py to support multiple payments of the same type
- Change the deduplication key from method only to a (method, amount) tuple
- Return separate entries for split payments

## Tools
- Add generate_store_profile.py for automatic profile generation
- Analyzes PDFs via OCR API and detects patterns

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Claude Agent
2026-01-06 23:07:07 +00:00
parent 67b0082df0
commit 099556213d
25 changed files with 3707 additions and 114 deletions

View File

@@ -7,6 +7,7 @@ from typing import Optional, Tuple, List
from dataclasses import dataclass, field
from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine
from backend.modules.data_entry.services.ocr.profiles import ProfileRegistry
@dataclass
@@ -63,6 +64,57 @@ class ExtractionResult:
class ReceiptExtractor:
"""Extract receipt fields using pattern matching for Romanian receipts."""
# =========================================================================
# DEPRECATED: STORE_PROFILES dict - USE ProfileRegistry INSTEAD
# =========================================================================
# Store profiles are now managed by ProfileRegistry in:
# backend/modules/data_entry/services/ocr/profiles/
#
# This dict is kept for reference only. All extraction logic now uses:
# ProfileRegistry.get_profile(cui)
#
# See: backend/modules/data_entry/services/ocr/profiles/README.md
# =========================================================================
STORE_PROFILES = {
# Lidl - multi-rate TVA (A+B), specific format without hyphen/colon
"22891860": {
"name": "LIDL DISCOUNT S.R.L.",
"tva_pattern": "lidl",
"tva_format": "TVA {code} {percent}% {amount}",
"has_multi_rate_tva": True,
"card_equals_total": True,
},
# OMV Petrom - single TVA rate, client CUI included
"11201891": {
"name": "OMV PETROM MARKETING S.R.L.",
"tva_pattern": "standard",
"has_client_cui": True,
},
# FIVE-HOLDING (BRICK) - standard format
"10562600": {
"name": "FIVE-HOLDING S.A.",
"tva_pattern": "standard",
},
# Dedeman - e-factura format
"2816464": {
"name": "DEDEMAN SRL",
"tva_pattern": "standard",
"has_efactura": True,
},
# SOCAR Petroleum
"12546600": {
"name": "SOCAR PETROLEUM S.A.",
"tva_pattern": "standard",
"has_client_cui": True,
},
# Kineterra - non-VAT payer
"31180432": {
"name": "KINETERRA CONCEPT SRL",
"tva_pattern": "none",
"is_non_vat_payer": True,
},
}
# Total amount patterns (most specific first)
# Romanian receipts use various formats: TOTAL LEI, TOTAL:, TOTAL RON, etc.
# OCR often produces errors, so patterns must be tolerant
@@ -394,48 +446,101 @@ class ReceiptExtractor:
result.raw_text = text
text_upper = text.upper()
# Extract core fields
result.amount, result.confidence_amount = self._extract_amount(text_upper)
result.receipt_date, result.confidence_date = self._extract_date(text_upper)
result.receipt_number, _ = self._extract_number(text_upper)
result.receipt_series, _ = self._extract_series(text_upper)
# =========================================================================
# STEP 1: Extract vendor info FIRST to find store profile
# =========================================================================
result.partner_name, result.confidence_vendor = self._extract_vendor(text)
result.cui, _ = self._extract_cui(text_upper, text)
# Normalize CUI: fix R0 → RO OCR error and validate format
result.cui = OCRValidationEngine.normalize_cui(result.cui)
# Extract additional fields - Multiple TVA entries
result.tva_entries, result.tva_total = self._extract_tva_entries(text_upper)
# Lookup store-specific profile for enhanced extraction accuracy
store_profile = ProfileRegistry.get_profile(result.cui) if result.cui else None
if store_profile:
print(f"[Profile] Using {store_profile.__class__.__name__} for CUI {result.cui}", flush=True)
# =========================================================================
# STEP 2: Extract ALL fields using profile (if available) or generic
# =========================================================================
if store_profile:
# Profile-specific extraction (higher accuracy for known stores)
result.amount, result.confidence_amount = store_profile.extract_total(text_upper)
result.receipt_date, result.confidence_date = store_profile.extract_date(text_upper)
result.receipt_number, _ = store_profile.extract_receipt_number(text_upper)
result.tva_entries = store_profile.extract_tva_entries(text_upper)
result.tva_total = sum(e['amount'] for e in result.tva_entries) if result.tva_entries else None
result.payment_methods = store_profile.extract_payment_methods(text_upper)
# Client data extraction via profile (CUI + name)
profile_client_cui, cui_confidence = store_profile.extract_client_cui(text_upper)
profile_client_name, name_confidence = store_profile.extract_client_name(text)
if profile_client_cui or profile_client_name:
# Use profile extraction results
result.client_cui = OCRValidationEngine.normalize_cui(profile_client_cui) if profile_client_cui else None
result.client_name = profile_client_name
result.confidence_client = max(cui_confidence, name_confidence)
# Address still via generic (no profile method)
_, _, client_address, _ = self._extract_client_data(text_upper, text)
result.client_address = client_address
else:
# Fallback to generic client extraction
client_name, client_cui, client_address, confidence = self._extract_client_data(text_upper, text)
result.client_name = client_name
result.client_cui = OCRValidationEngine.normalize_cui(client_cui)
result.client_address = client_address
result.confidence_client = confidence
print(f"[Profile] Extracted: total={result.amount}, date={result.receipt_date}, "
f"TVA entries={len(result.tva_entries)}, payments={len(result.payment_methods)}", flush=True)
else:
# Generic extraction for unknown stores
result.amount, result.confidence_amount = self._extract_amount(text_upper)
result.receipt_date, result.confidence_date = self._extract_date(text_upper)
result.receipt_number, _ = self._extract_number(text_upper)
result.tva_entries, result.tva_total = self._extract_tva_entries(text_upper)
result.payment_methods = self._extract_payment_methods(text_upper)
# Generic client extraction
client_name, client_cui, client_address, confidence = self._extract_client_data(text_upper, text)
result.client_name = client_name
result.client_cui = OCRValidationEngine.normalize_cui(client_cui)
result.client_address = client_address
result.confidence_client = confidence
# Series extraction (no profile method, always generic)
result.receipt_series, _ = self._extract_series(text_upper)
# =========================================================================
# STEP 3: Debug logging and validation
# =========================================================================
if not result.tva_entries:
print(f"[TVA Debug] No TVA found. Checking patterns...", flush=True)
# Debug: show what patterns see
normalized = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text_upper)
taxe_match = re.search(r'T?OTAL\s+TAXE', normalized, re.IGNORECASE)
rev_match = re.search(r'([\d.,]+)\s*T?OTAL\s+TAXE', normalized, re.IGNORECASE)
print(f"[TVA Debug] 'OTAL TAXE' found: {bool(taxe_match)}, reversed: {rev_match.group(1) if rev_match else None}", flush=True)
# Log TVA vs TOTAL for debugging (validation happens in ocr_service._final_validation)
# NOTE: We NO LONGER clear TVA here - the service will recalculate TOTAL from TVA if needed
# Log TVA vs TOTAL for debugging
if result.tva_total and result.amount:
if result.tva_total > result.amount:
print(f"[TVA Extraction] TVA ({result.tva_total}) > TOTAL ({result.amount}) - will be corrected in final validation", flush=True)
elif result.tva_total > result.amount * Decimal('0.5'):
print(f"[TVA Extraction] Warning: TVA ({result.tva_total}) is > 50% of TOTAL ({result.amount}) - suspicious", flush=True)
# Additional generic extractions
result.items_count = self._extract_items_count(text_upper)
result.address = self._extract_address(text_upper)
result.payment_methods = self._extract_payment_methods(text_upper)
# Validate payment methods against extracted amount
# If payment sum >> amount, clear invalid payments (likely OCR error)
# =========================================================================
# STEP 4: Validate and post-process
# =========================================================================
# Save original payment methods before validation (for payment mode detection)
original_payment_methods = result.payment_methods.copy() if result.payment_methods else []
# Validate payment methods against extracted amount
result.payment_methods = self._validate_payment_methods(result.payment_methods, result.amount)
# Auto-suggest payment_mode based on detected payment methods
# Use ORIGINAL payment_methods to detect CARD even if validation cleared them
# (e.g., CARD 318.16 is valid even if total validation failed)
payment_methods_for_mode = result.payment_methods if result.payment_methods else original_payment_methods
if payment_methods_for_mode:
card_amount = sum(
@@ -447,17 +552,9 @@ class ReceiptExtractor:
result.suggested_payment_mode = 'banca'
print(f"[Payment Mode] CARD detected ({card_amount}), suggesting 'banca'", flush=True)
else:
# Only cash payments detected
result.suggested_payment_mode = 'numerar'
print(f"[Payment Mode] Cash only detected, suggesting 'numerar'", flush=True)
# Extract client data (B2B receipts)
client_name, client_cui, client_address, confidence_client = self._extract_client_data(text_upper, text)
result.client_name = client_name
result.client_cui = OCRValidationEngine.normalize_cui(client_cui) # Fix R0 → RO OCR error
result.client_address = client_address
result.confidence_client = confidence_client
# Detect receipt type
result.receipt_type = self._detect_receipt_type(text_upper)
@@ -620,6 +717,40 @@ class ReceiptExtractor:
return num_str
def _calculate_multi_rate_tva_total(self, tva_entries: List[dict]) -> Optional[Decimal]:
"""
Calculate implied total from ALL TVA entries (multi-rate support).
Formula for each entry: total_for_entry = tva * (100 + rate) / rate
Final total = sum of all entry totals
Example for Lidl (TVA A 21% = 7.71, TVA B 11% = 2.13):
Entry A: 7.71 * 121 / 21 = 44.45
Entry B: 2.13 * 111 / 11 = 21.49
Total: 44.45 + 21.49 = 65.94 ≈ 65.86 (within tolerance)
Returns:
Implied total Decimal, or None if calculation not possible
"""
if not tva_entries:
return None
total = Decimal('0')
for entry in tva_entries:
rate = entry.get('percent', 0)
tva_amount = entry.get('amount')
if tva_amount and rate > 0:
try:
tva_dec = Decimal(str(tva_amount))
# Formula: total_for_entry = tva * (100 + rate) / rate
entry_total = tva_dec * Decimal(100 + rate) / Decimal(rate)
total += entry_total
print(f"[Multi-rate TVA] Entry {entry.get('code', '?')}: tva={tva_amount}, rate={rate}% -> implied={entry_total:.2f}", flush=True)
except (InvalidOperation, ValueError, TypeError):
continue
return total.quantize(Decimal('0.01')) if total > 0 else None
def _cross_validate_and_calculate_amount(
self,
amount: Optional[Decimal],
@@ -634,12 +765,11 @@ class ReceiptExtractor:
Returns: (amount, confidence, source_description)
Logic:
1. If amount is valid (>0) with high confidence (>=0.8), use it directly
2. Calculate payment_sum = CARD + NUMERAR + other methods
3. Calculate tva_implied_total = tva_total * (100 + rate) / rate
4. Cross-validate: if payment_sum matches extracted amount, boost confidence
5. If amount is 0/None, use payment_sum as total
6. If payment_sum is 0, try to calculate from TVA
1. Collect all available sources: extracted amount, payment sum, TVA-implied total
2. Find consensus: 2+ sources within 3% tolerance
3. If consensus found, use the higher-confidence source value
4. If extracted differs >10% from all others, it's an outlier - correct it
5. If no consensus possible, fallback to individual validations
"""
# Calculate payment methods sum
payment_sum = Decimal('0')
@@ -652,43 +782,73 @@ class ReceiptExtractor:
except (InvalidOperation, ValueError, TypeError):
continue
# Calculate TVA-implied total: total = tva * (100 + rate) / rate
tva_implied_total = None
if tva_entries:
# Use the main TVA entry (typically the largest or first one)
main_entry = tva_entries[0]
rate = main_entry.get('percent', 19)
tva_amount = main_entry.get('amount')
if tva_amount and rate > 0:
try:
tva_dec = Decimal(str(tva_amount))
# total = tva * (100 + rate) / rate
tva_implied_total = (tva_dec * Decimal(100 + rate) / Decimal(rate)).quantize(Decimal('0.01'))
except (InvalidOperation, ValueError, TypeError):
pass
# Calculate TVA-implied total using ALL entries (multi-rate fix)
tva_implied_total = self._calculate_multi_rate_tva_total(tva_entries)
# Case 1: Amount is valid with high confidence - validate against TVA and payments
# Multi-source consensus approach (3% tolerance for multi-rate TVA rounding)
CONSENSUS_TOLERANCE = 3.0 # 3% tolerance
# Collect all available sources with their confidences
sources = []
if amount and amount > 0:
sources.append(('extracted', float(amount), confidence_amount))
if payment_sum > 0:
sources.append(('payment', float(payment_sum), 0.92)) # Payment is very reliable
if tva_implied_total and tva_implied_total > 0:
sources.append(('tva_calc', float(tva_implied_total), 0.88)) # TVA calc is reliable
print(f"[Cross-Validation] Sources: {[(s[0], f'{s[1]:.2f}', f'{s[2]:.2f}') for s in sources]}", flush=True)
# Find consensus: 2+ sources within tolerance
if len(sources) >= 2:
for i, (name1, val1, conf1) in enumerate(sources):
for name2, val2, conf2 in sources[i+1:]:
if val1 <= 0 or val2 <= 0:
continue
diff_pct = abs(val1 - val2) / max(val1, val2) * 100
if diff_pct <= CONSENSUS_TOLERANCE:
# Consensus found! Use value from higher-confidence source
if conf1 >= conf2:
consensus_val, consensus_conf = val1, conf1
else:
consensus_val, consensus_conf = val2, conf2
# Boost confidence for consensus
consensus_conf = min(0.98, consensus_conf + 0.05)
print(f"[Cross-Validation] Consensus: {name1}={val1:.2f}{name2}={val2:.2f} (diff={diff_pct:.1f}%)", flush=True)
return Decimal(str(round(consensus_val, 2))), consensus_conf, f"consensus ({name1}+{name2})"
# No consensus - check if extracted is an outlier (differs >10% from all others)
if amount and amount > 0 and len(sources) >= 2:
other_sources = [s for s in sources if s[0] != 'extracted']
if other_sources:
extracted_val = float(amount)
all_differ = all(
abs(extracted_val - s[1]) / max(extracted_val, s[1]) * 100 > 10
for s in other_sources if s[1] > 0
)
if all_differ:
# Extracted differs significantly from all others - use the best other source
best_other = max(other_sources, key=lambda s: s[2])
print(f"[Cross-Validation] Extracted outlier: {extracted_val:.2f} differs >10% from all others, using {best_other[0]}={best_other[1]:.2f}", flush=True)
return Decimal(str(round(best_other[1], 2))), best_other[2], f"corrected (extracted outlier, using {best_other[0]})"
# Fallback: Case 1 - Amount valid with high confidence
if amount and amount > 0 and confidence_amount >= 0.8:
# First check TVA-implied total (most reliable when TVA is extracted correctly)
# Check TVA-implied total
if tva_implied_total and tva_implied_total > 0:
tva_diff_percent = abs(float(amount) - float(tva_implied_total)) / float(tva_implied_total) * 100
if tva_diff_percent <= 1:
# Near-perfect TVA match - highest confidence
if tva_diff_percent <= 3:
return amount, min(0.98, confidence_amount + 0.05), "extracted (validated by TVA)"
elif tva_diff_percent > 10:
# Significant mismatch - TVA-implied total is more reliable
# This catches cases where wrong TOTAL line was extracted (e.g., REST, SUBTOTAL)
print(f"[Cross-Validation] Amount mismatch with TVA: extracted={amount}, tva_implied={tva_implied_total} (diff={tva_diff_percent:.1f}%)", flush=True)
return tva_implied_total, 0.90, "calculated from TVA (extracted amount mismatch)"
# Cross-validate with payment methods
if payment_sum > 0 and abs(amount - payment_sum) <= Decimal('0.02'):
# Perfect match - boost confidence
return amount, min(0.98, confidence_amount + 0.05), "extracted (validated by payment methods)"
elif payment_sum > 0:
payment_diff_percent = abs(float(amount) - float(payment_sum)) / float(payment_sum) * 100
if payment_diff_percent > 10:
# Significant mismatch - payment sum is more reliable
print(f"[Cross-Validation] Amount mismatch with payments: extracted={amount}, payments={payment_sum} (diff={payment_diff_percent:.1f}%)", flush=True)
return payment_sum, 0.88, "calculated from payment methods (extracted amount mismatch)"
@@ -696,29 +856,22 @@ class ReceiptExtractor:
# Case 2: Amount exists but low confidence - try to validate/correct
if amount and amount > 0:
# First check TVA-implied total (most reliable)
if tva_implied_total and tva_implied_total > 0:
tva_diff_percent = abs(float(amount) - float(tva_implied_total)) / float(tva_implied_total) * 100
if tva_diff_percent <= 2:
# Close match - boost confidence
if tva_diff_percent <= 3:
return amount, 0.88, "extracted (validated by TVA)"
elif tva_diff_percent > 10:
# Significant mismatch - use TVA-implied total
print(f"[Cross-Validation] Amount mismatch with TVA: extracted={amount}, tva_implied={tva_implied_total} (diff={tva_diff_percent:.1f}%)", flush=True)
return tva_implied_total, 0.85, "calculated from TVA"
# Check if payment methods sum matches
if payment_sum > 0:
payment_diff_percent = abs(float(amount) - float(payment_sum)) / float(payment_sum) * 100
if payment_diff_percent <= 0.5:
# Close match - boost confidence
if payment_diff_percent <= 1:
return amount, 0.90, "extracted (validated by payment methods)"
elif payment_diff_percent > 10:
# Mismatch - prefer payment_sum as it's more reliable
print(f"[Cross-Validation] Amount mismatch: extracted={amount}, payments={payment_sum}", flush=True)
return payment_sum, 0.85, "calculated from payment methods"
# No validation possible - return as-is
return amount, confidence_amount, "extracted (unvalidated)"
# Case 3: Amount is 0 or None - calculate from payment methods
@@ -946,6 +1099,28 @@ class ReceiptExtractor:
return name
def _get_store_profile(self, cui: Optional[str]) -> Optional[dict]:
"""
Get store-specific profile by CUI.
DEPRECATED: Use ProfileRegistry.get_profile() directly for profile objects.
This method is kept for backward compatibility and returns validation hints dict.
Args:
cui: The CUI extracted from receipt (with or without RO prefix)
Returns:
Store profile validation hints dict or None if not found
"""
profile = ProfileRegistry.get_profile(cui)
if profile:
# Return validation hints for backward compatibility
hints = profile.get_validation_hints()
hints['name'] = profile.STORE_NAME
print(f"[Store Profile] Found profile for {cui}: {profile.STORE_NAME}", flush=True)
return hints
return None
def _extract_cui(self, text_upper: str, original_text: str) -> Tuple[Optional[str], float]:
"""
Extract vendor CUI (fiscal identification code) from text.
@@ -1020,11 +1195,114 @@ class ReceiptExtractor:
# Default to bon_fiscal if neither found
return 'bon_fiscal'
def _try_pattern_lidl(self, text: str) -> List[dict]:
    """
    Collect TVA entries written Lidl-style: "TVA A 21,00% 7.71"
    (code, percent and amount on one line, no hyphen/colon separator).

    Typical Lidl receipt lines:
        TOTAL TVA 9,84
        TVA A 21,00% 7,71
        TVA B 11,00% 2,13

    Only the first positive amount seen for each (code, percent) pair is kept.

    Returns:
        List of {'code', 'percent', 'amount'} dicts in discovery order.
    """
    # Each regex captures: code (A-D), integer part of the percent, amount.
    # Handles: "TVA A 21,00% 7,71", "TVA B 11,00% 2,13", "TUA A 21% 7.71"
    regexes = (
        # Same line: "TVA A 21,00% 7.71" (with various spacing)
        r'T[VU][AR]\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)',
        # Same line with backslash (OCR artifact): "TVA A \21,00% 7.71"
        r'T[VU][AR]\s+([A-D])\s+\\?(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)',
        # IVA variant
        r'IVA\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)',
    )

    found: List[dict] = []
    known_keys = set()

    for regex in regexes:
        for hit in re.finditer(regex, text, re.IGNORECASE):
            raw_code, raw_percent, raw_amount = hit.groups()
            try:
                code = raw_code.upper()
                percent = int(raw_percent)
                amount = Decimal(self._normalize_number(raw_amount))

                # Skip zero/negative amounts and already-recorded pairs
                if amount > 0 and (code, percent) not in known_keys:
                    found.append({'code': code, 'percent': percent, 'amount': amount})
                    known_keys.add((code, percent))
                    print(f"[TVA Lidl] Found: TVA {code} {percent}% = {amount}", flush=True)
            except (ValueError, InvalidOperation):
                continue

    return found
def _select_best_tva_candidate(
self,
candidates: List[tuple],
tva_bon_total: Optional[Decimal]
) -> Tuple[List[dict], Optional[Decimal]]:
"""
Select the best TVA candidate from collected candidates.
Selection criteria (priority order):
1. Sum matches TOTAL TVA BON (highest priority)
2. More entries = better (for multi-rate receipts)
3. Pattern confidence as tiebreaker
Args:
candidates: List of (pattern_name, confidence, entries, sum)
tva_bon_total: Authoritative TOTAL TVA BON value (if extracted)
Returns:
(best_entries, best_sum)
"""
if not candidates:
return [], None
# Score each candidate
scored = []
for name, confidence, entries, sum_val in candidates:
score = 0.0
# Criterion 1: Sum matches TOTAL TVA BON (highest priority)
if tva_bon_total and sum_val:
tolerance = max(Decimal('0.02'), tva_bon_total * Decimal('0.02')) # 2% tolerance
if abs(sum_val - tva_bon_total) <= tolerance:
score += 100 # High bonus for matching authoritative total
print(f"[TVA Select] {name}: sum {sum_val} matches tva_bon_total {tva_bon_total}", flush=True)
# Criterion 2: More entries (for multi-rate receipts)
score += len(entries) * 10
# Criterion 3: Pattern confidence
score += confidence * 5
scored.append((score, name, confidence, entries, sum_val))
print(f"[TVA Select] Candidate {name}: score={score:.1f}, entries={len(entries)}, sum={sum_val}", flush=True)
# Sort by score descending
scored.sort(key=lambda x: x[0], reverse=True)
best = scored[0]
print(f"[TVA Select] Winner: {best[1]} (score={best[0]:.1f})", flush=True)
return best[3], best[4]
def _extract_tva_entries(self, text: str) -> Tuple[List[dict], Optional[Decimal]]:
"""
Extract multiple TVA (VAT) entries from text.
Romanian receipts can have multiple TVA rates (A=19%, B=9%, C=5%, D=0%).
Uses CANDIDATE COLLECTION approach:
- Try ALL patterns and collect candidates
- Select best candidate based on matching TOTAL TVA BON
Returns (tva_entries, tva_total) where tva_entries is a list of:
{'code': 'A', 'percent': 19, 'amount': Decimal('15.20')}
"""
@@ -1054,6 +1332,22 @@ class ReceiptExtractor:
# Also normalize comma followed by space to comma (for "21, 00%" -> "21,00%")
normalized_text = re.sub(r'(\d+),\s+(\d{2})\s*%', r'\1.\2%', normalized_text)
# Extract TOTAL TVA BON/TOTAL TVA first as the authoritative reference
tva_bon_total = self._extract_total_tva_bon(normalized_text)
print(f"[TVA Debug] TOTAL TVA BON: {tva_bon_total}", flush=True)
# CANDIDATE COLLECTION APPROACH: Try all patterns, collect candidates, select best
all_candidates = [] # List of (pattern_name, confidence, entries, sum)
# === LIDL-STYLE PATTERNS (NEW) ===
# Lidl format: "TVA A 21,00% 7.71" or "TVA B 11,00% 2.13" (no hyphen/colon)
# This pattern handles multi-rate TVA receipts
lidl_entries = self._try_pattern_lidl(normalized_text)
if lidl_entries:
lidl_sum = sum(e['amount'] for e in lidl_entries)
all_candidates.append(('lidl', 0.96, lidl_entries, lidl_sum))
print(f"[TVA Debug] Lidl pattern: {len(lidl_entries)} entries, sum={lidl_sum}", flush=True)
# Pattern 0a: First try to get TVA from "TOTAL TAXE:" which is most reliable
# Format: "TOTAL TAXE: 55,22" - this is always the TVA amount
# OCR may cut "T" producing "OTAL TAXE:" instead of "TOTAL TAXE:"
@@ -1372,10 +1666,21 @@ class ReceiptExtractor:
except (ValueError, InvalidOperation):
continue
# Extract TOTAL TVA BON as reference (separate from individual entries)
tva_bon_total = self._extract_total_tva_bon(normalized_text)
# Add existing extraction results to candidates (if any)
if tva_entries:
entries_sum = sum(entry['amount'] for entry in tva_entries)
all_candidates.append(('standard', 0.90, tva_entries, entries_sum))
print(f"[TVA Debug] Standard patterns: {len(tva_entries)} entries, sum={entries_sum}", flush=True)
# Calculate sum from entries
# === CANDIDATE SELECTION ===
# Select best candidate using TOTAL TVA BON as authoritative reference
if all_candidates:
best_entries, best_sum = self._select_best_tva_candidate(all_candidates, tva_bon_total)
if best_entries:
tva_entries = best_entries
entries_sum = best_sum
# Calculate sum from entries (if not set by candidate selection)
entries_sum = None
if tva_entries:
entries_sum = sum(entry['amount'] for entry in tva_entries)