feat(ocr): Add modular store profiles with hot-reload support
## Store Profiles System
- Add ProfileRegistry for CUI-based profile lookup
- Add BaseStoreProfile with generic extraction patterns
- Implement hot-reload via POST /api/data-entry/ocr/profiles/reload
## 12 Store Profiles
- LIDL: Multi-rate TVA (A, B, C, D codes)
- OMV, SOCAR: B2B with client CUI, YYYY.MM.DD dates
- BRICK, DEDEMAN: Standard TVA, e-factura support
- KINETERRA, BEST PRINT: Non-VAT payers (returns [])
- STEPOUT MARKET: TVA 5% (books/reduced rate)
- UNLIMITED KEYS: NUMERAR payment detection
- GAMA INK, ELECTROBERING, PICTUS VELUM: Standard TVA
## Flexible TVA Patterns
- All patterns use (\d{1,2})% to accept any rate
- Supports historical (19%, 9%, 5%) and current (21%, 11%)
## Payment Methods Fix
- Fixed base.py to support multiple payments of same type
- Changed deduplication from method-only to (method, amount) tuple
- Returns separate entries for split payments
## Tools
- Add generate_store_profile.py for automatic profile generation
- Analyzes PDFs via OCR API and detects patterns
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -7,6 +7,7 @@ from typing import Optional, Tuple, List
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine
|
||||
from backend.modules.data_entry.services.ocr.profiles import ProfileRegistry
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -63,6 +64,57 @@ class ExtractionResult:
|
||||
class ReceiptExtractor:
|
||||
"""Extract receipt fields using pattern matching for Romanian receipts."""
|
||||
|
||||
# =========================================================================
|
||||
# DEPRECATED: STORE_PROFILES dict - USE ProfileRegistry INSTEAD
|
||||
# =========================================================================
|
||||
# Store profiles are now managed by ProfileRegistry in:
|
||||
# backend/modules/data_entry/services/ocr/profiles/
|
||||
#
|
||||
# This dict is kept for reference only. All extraction logic now uses:
|
||||
# ProfileRegistry.get_profile(cui)
|
||||
#
|
||||
# See: backend/modules/data_entry/services/ocr/profiles/README.md
|
||||
# =========================================================================
|
||||
# Legacy per-store hints keyed by vendor CUI (digits only, no "RO" prefix).
# Each value carries the store's legal name, the TVA pattern family to try,
# and optional boolean flags describing receipt peculiarities.
STORE_PROFILES = {
    "22891860": {
        # Lidl: several TVA rates on one receipt (codes A + B), printed
        # without a hyphen/colon separator.
        "name": "LIDL DISCOUNT S.R.L.",
        "tva_pattern": "lidl",
        "tva_format": "TVA {code} {percent}% {amount}",
        "has_multi_rate_tva": True,
        "card_equals_total": True,
    },
    "11201891": {
        # OMV Petrom: one TVA rate; the receipt also carries the client CUI.
        "name": "OMV PETROM MARKETING S.R.L.",
        "tva_pattern": "standard",
        "has_client_cui": True,
    },
    "10562600": {
        # FIVE-HOLDING (BRICK): plain standard layout.
        "name": "FIVE-HOLDING S.A.",
        "tva_pattern": "standard",
    },
    "2816464": {
        # Dedeman: e-factura layout.
        "name": "DEDEMAN SRL",
        "tva_pattern": "standard",
        "has_efactura": True,
    },
    "12546600": {
        # SOCAR Petroleum: receipt includes the client CUI.
        "name": "SOCAR PETROLEUM S.A.",
        "tva_pattern": "standard",
        "has_client_cui": True,
    },
    "31180432": {
        # Kineterra: vendor is not a VAT payer, so no TVA lines are expected.
        "name": "KINETERRA CONCEPT SRL",
        "tva_pattern": "none",
        "is_non_vat_payer": True,
    },
}
|
||||
|
||||
# Total amount patterns (most specific first)
|
||||
# Romanian receipts use various formats: TOTAL LEI, TOTAL:, TOTAL RON, etc.
|
||||
# OCR often produces errors, so patterns must be tolerant
|
||||
@@ -394,48 +446,101 @@ class ReceiptExtractor:
|
||||
result.raw_text = text
|
||||
text_upper = text.upper()
|
||||
|
||||
# Extract core fields
|
||||
result.amount, result.confidence_amount = self._extract_amount(text_upper)
|
||||
result.receipt_date, result.confidence_date = self._extract_date(text_upper)
|
||||
result.receipt_number, _ = self._extract_number(text_upper)
|
||||
result.receipt_series, _ = self._extract_series(text_upper)
|
||||
# =========================================================================
|
||||
# STEP 1: Extract vendor info FIRST to find store profile
|
||||
# =========================================================================
|
||||
result.partner_name, result.confidence_vendor = self._extract_vendor(text)
|
||||
result.cui, _ = self._extract_cui(text_upper, text)
|
||||
# Normalize CUI: fix R0 → RO OCR error and validate format
|
||||
result.cui = OCRValidationEngine.normalize_cui(result.cui)
|
||||
|
||||
# Extract additional fields - Multiple TVA entries
|
||||
result.tva_entries, result.tva_total = self._extract_tva_entries(text_upper)
|
||||
# Lookup store-specific profile for enhanced extraction accuracy
|
||||
store_profile = ProfileRegistry.get_profile(result.cui) if result.cui else None
|
||||
if store_profile:
|
||||
print(f"[Profile] Using {store_profile.__class__.__name__} for CUI {result.cui}", flush=True)
|
||||
|
||||
# =========================================================================
|
||||
# STEP 2: Extract ALL fields using profile (if available) or generic
|
||||
# =========================================================================
|
||||
if store_profile:
|
||||
# Profile-specific extraction (higher accuracy for known stores)
|
||||
result.amount, result.confidence_amount = store_profile.extract_total(text_upper)
|
||||
result.receipt_date, result.confidence_date = store_profile.extract_date(text_upper)
|
||||
result.receipt_number, _ = store_profile.extract_receipt_number(text_upper)
|
||||
result.tva_entries = store_profile.extract_tva_entries(text_upper)
|
||||
result.tva_total = sum(e['amount'] for e in result.tva_entries) if result.tva_entries else None
|
||||
result.payment_methods = store_profile.extract_payment_methods(text_upper)
|
||||
|
||||
# Client data extraction via profile (CUI + name)
|
||||
profile_client_cui, cui_confidence = store_profile.extract_client_cui(text_upper)
|
||||
profile_client_name, name_confidence = store_profile.extract_client_name(text)
|
||||
|
||||
if profile_client_cui or profile_client_name:
|
||||
# Use profile extraction results
|
||||
result.client_cui = OCRValidationEngine.normalize_cui(profile_client_cui) if profile_client_cui else None
|
||||
result.client_name = profile_client_name
|
||||
result.confidence_client = max(cui_confidence, name_confidence)
|
||||
# Address still via generic (no profile method)
|
||||
_, _, client_address, _ = self._extract_client_data(text_upper, text)
|
||||
result.client_address = client_address
|
||||
else:
|
||||
# Fallback to generic client extraction
|
||||
client_name, client_cui, client_address, confidence = self._extract_client_data(text_upper, text)
|
||||
result.client_name = client_name
|
||||
result.client_cui = OCRValidationEngine.normalize_cui(client_cui)
|
||||
result.client_address = client_address
|
||||
result.confidence_client = confidence
|
||||
|
||||
print(f"[Profile] Extracted: total={result.amount}, date={result.receipt_date}, "
|
||||
f"TVA entries={len(result.tva_entries)}, payments={len(result.payment_methods)}", flush=True)
|
||||
else:
|
||||
# Generic extraction for unknown stores
|
||||
result.amount, result.confidence_amount = self._extract_amount(text_upper)
|
||||
result.receipt_date, result.confidence_date = self._extract_date(text_upper)
|
||||
result.receipt_number, _ = self._extract_number(text_upper)
|
||||
result.tva_entries, result.tva_total = self._extract_tva_entries(text_upper)
|
||||
result.payment_methods = self._extract_payment_methods(text_upper)
|
||||
|
||||
# Generic client extraction
|
||||
client_name, client_cui, client_address, confidence = self._extract_client_data(text_upper, text)
|
||||
result.client_name = client_name
|
||||
result.client_cui = OCRValidationEngine.normalize_cui(client_cui)
|
||||
result.client_address = client_address
|
||||
result.confidence_client = confidence
|
||||
|
||||
# Series extraction (no profile method, always generic)
|
||||
result.receipt_series, _ = self._extract_series(text_upper)
|
||||
|
||||
# =========================================================================
|
||||
# STEP 3: Debug logging and validation
|
||||
# =========================================================================
|
||||
if not result.tva_entries:
|
||||
print(f"[TVA Debug] No TVA found. Checking patterns...", flush=True)
|
||||
# Debug: show what patterns see
|
||||
normalized = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text_upper)
|
||||
taxe_match = re.search(r'T?OTAL\s+TAXE', normalized, re.IGNORECASE)
|
||||
rev_match = re.search(r'([\d.,]+)\s*T?OTAL\s+TAXE', normalized, re.IGNORECASE)
|
||||
print(f"[TVA Debug] 'OTAL TAXE' found: {bool(taxe_match)}, reversed: {rev_match.group(1) if rev_match else None}", flush=True)
|
||||
|
||||
# Log TVA vs TOTAL for debugging (validation happens in ocr_service._final_validation)
|
||||
# NOTE: We NO LONGER clear TVA here - the service will recalculate TOTAL from TVA if needed
|
||||
# Log TVA vs TOTAL for debugging
|
||||
if result.tva_total and result.amount:
|
||||
if result.tva_total > result.amount:
|
||||
print(f"[TVA Extraction] TVA ({result.tva_total}) > TOTAL ({result.amount}) - will be corrected in final validation", flush=True)
|
||||
elif result.tva_total > result.amount * Decimal('0.5'):
|
||||
print(f"[TVA Extraction] Warning: TVA ({result.tva_total}) is > 50% of TOTAL ({result.amount}) - suspicious", flush=True)
|
||||
|
||||
# Additional generic extractions
|
||||
result.items_count = self._extract_items_count(text_upper)
|
||||
result.address = self._extract_address(text_upper)
|
||||
result.payment_methods = self._extract_payment_methods(text_upper)
|
||||
|
||||
# Validate payment methods against extracted amount
|
||||
# If payment sum >> amount, clear invalid payments (likely OCR error)
|
||||
# =========================================================================
|
||||
# STEP 4: Validate and post-process
|
||||
# =========================================================================
|
||||
# Save original payment methods before validation (for payment mode detection)
|
||||
original_payment_methods = result.payment_methods.copy() if result.payment_methods else []
|
||||
|
||||
# Validate payment methods against extracted amount
|
||||
result.payment_methods = self._validate_payment_methods(result.payment_methods, result.amount)
|
||||
|
||||
# Auto-suggest payment_mode based on detected payment methods
|
||||
# Use ORIGINAL payment_methods to detect CARD even if validation cleared them
|
||||
# (e.g., CARD 318.16 is valid even if total validation failed)
|
||||
payment_methods_for_mode = result.payment_methods if result.payment_methods else original_payment_methods
|
||||
if payment_methods_for_mode:
|
||||
card_amount = sum(
|
||||
@@ -447,17 +552,9 @@ class ReceiptExtractor:
|
||||
result.suggested_payment_mode = 'banca'
|
||||
print(f"[Payment Mode] CARD detected ({card_amount}), suggesting 'banca'", flush=True)
|
||||
else:
|
||||
# Only cash payments detected
|
||||
result.suggested_payment_mode = 'numerar'
|
||||
print(f"[Payment Mode] Cash only detected, suggesting 'numerar'", flush=True)
|
||||
|
||||
# Extract client data (B2B receipts)
|
||||
client_name, client_cui, client_address, confidence_client = self._extract_client_data(text_upper, text)
|
||||
result.client_name = client_name
|
||||
result.client_cui = OCRValidationEngine.normalize_cui(client_cui) # Fix R0 → RO OCR error
|
||||
result.client_address = client_address
|
||||
result.confidence_client = confidence_client
|
||||
|
||||
# Detect receipt type
|
||||
result.receipt_type = self._detect_receipt_type(text_upper)
|
||||
|
||||
@@ -620,6 +717,40 @@ class ReceiptExtractor:
|
||||
|
||||
return num_str
|
||||
|
||||
def _calculate_multi_rate_tva_total(self, tva_entries: List[dict]) -> Optional[Decimal]:
|
||||
"""
|
||||
Calculate implied total from ALL TVA entries (multi-rate support).
|
||||
|
||||
Formula for each entry: total_for_entry = tva * (100 + rate) / rate
|
||||
Final total = sum of all entry totals
|
||||
|
||||
Example for Lidl (TVA A 21% = 7.71, TVA B 11% = 2.13):
|
||||
Entry A: 7.71 * 121 / 21 = 44.45
|
||||
Entry B: 2.13 * 111 / 11 = 21.49
|
||||
Total: 44.45 + 21.49 = 65.94 ≈ 65.86 (within tolerance)
|
||||
|
||||
Returns:
|
||||
Implied total Decimal, or None if calculation not possible
|
||||
"""
|
||||
if not tva_entries:
|
||||
return None
|
||||
|
||||
total = Decimal('0')
|
||||
for entry in tva_entries:
|
||||
rate = entry.get('percent', 0)
|
||||
tva_amount = entry.get('amount')
|
||||
if tva_amount and rate > 0:
|
||||
try:
|
||||
tva_dec = Decimal(str(tva_amount))
|
||||
# Formula: total_for_entry = tva * (100 + rate) / rate
|
||||
entry_total = tva_dec * Decimal(100 + rate) / Decimal(rate)
|
||||
total += entry_total
|
||||
print(f"[Multi-rate TVA] Entry {entry.get('code', '?')}: tva={tva_amount}, rate={rate}% -> implied={entry_total:.2f}", flush=True)
|
||||
except (InvalidOperation, ValueError, TypeError):
|
||||
continue
|
||||
|
||||
return total.quantize(Decimal('0.01')) if total > 0 else None
|
||||
|
||||
def _cross_validate_and_calculate_amount(
|
||||
self,
|
||||
amount: Optional[Decimal],
|
||||
@@ -634,12 +765,11 @@ class ReceiptExtractor:
|
||||
Returns: (amount, confidence, source_description)
|
||||
|
||||
Logic:
|
||||
1. If amount is valid (>0) with high confidence (>=0.8), use it directly
|
||||
2. Calculate payment_sum = CARD + NUMERAR + other methods
|
||||
3. Calculate tva_implied_total = tva_total * (100 + rate) / rate
|
||||
4. Cross-validate: if payment_sum matches extracted amount, boost confidence
|
||||
5. If amount is 0/None, use payment_sum as total
|
||||
6. If payment_sum is 0, try to calculate from TVA
|
||||
1. Collect all available sources: extracted amount, payment sum, TVA-implied total
|
||||
2. Find consensus: 2+ sources within 3% tolerance
|
||||
3. If consensus found, use the higher-confidence source value
|
||||
4. If extracted differs >10% from all others, it's an outlier - correct it
|
||||
5. If no consensus possible, fallback to individual validations
|
||||
"""
|
||||
# Calculate payment methods sum
|
||||
payment_sum = Decimal('0')
|
||||
@@ -652,43 +782,73 @@ class ReceiptExtractor:
|
||||
except (InvalidOperation, ValueError, TypeError):
|
||||
continue
|
||||
|
||||
# Calculate TVA-implied total: total = tva * (100 + rate) / rate
|
||||
tva_implied_total = None
|
||||
if tva_entries:
|
||||
# Use the main TVA entry (typically the largest or first one)
|
||||
main_entry = tva_entries[0]
|
||||
rate = main_entry.get('percent', 19)
|
||||
tva_amount = main_entry.get('amount')
|
||||
if tva_amount and rate > 0:
|
||||
try:
|
||||
tva_dec = Decimal(str(tva_amount))
|
||||
# total = tva * (100 + rate) / rate
|
||||
tva_implied_total = (tva_dec * Decimal(100 + rate) / Decimal(rate)).quantize(Decimal('0.01'))
|
||||
except (InvalidOperation, ValueError, TypeError):
|
||||
pass
|
||||
# Calculate TVA-implied total using ALL entries (multi-rate fix)
|
||||
tva_implied_total = self._calculate_multi_rate_tva_total(tva_entries)
|
||||
|
||||
# Case 1: Amount is valid with high confidence - validate against TVA and payments
|
||||
# Multi-source consensus approach (3% tolerance for multi-rate TVA rounding)
|
||||
CONSENSUS_TOLERANCE = 3.0 # 3% tolerance
|
||||
|
||||
# Collect all available sources with their confidences
|
||||
sources = []
|
||||
if amount and amount > 0:
|
||||
sources.append(('extracted', float(amount), confidence_amount))
|
||||
if payment_sum > 0:
|
||||
sources.append(('payment', float(payment_sum), 0.92)) # Payment is very reliable
|
||||
if tva_implied_total and tva_implied_total > 0:
|
||||
sources.append(('tva_calc', float(tva_implied_total), 0.88)) # TVA calc is reliable
|
||||
|
||||
print(f"[Cross-Validation] Sources: {[(s[0], f'{s[1]:.2f}', f'{s[2]:.2f}') for s in sources]}", flush=True)
|
||||
|
||||
# Find consensus: 2+ sources within tolerance
|
||||
if len(sources) >= 2:
|
||||
for i, (name1, val1, conf1) in enumerate(sources):
|
||||
for name2, val2, conf2 in sources[i+1:]:
|
||||
if val1 <= 0 or val2 <= 0:
|
||||
continue
|
||||
diff_pct = abs(val1 - val2) / max(val1, val2) * 100
|
||||
if diff_pct <= CONSENSUS_TOLERANCE:
|
||||
# Consensus found! Use value from higher-confidence source
|
||||
if conf1 >= conf2:
|
||||
consensus_val, consensus_conf = val1, conf1
|
||||
else:
|
||||
consensus_val, consensus_conf = val2, conf2
|
||||
# Boost confidence for consensus
|
||||
consensus_conf = min(0.98, consensus_conf + 0.05)
|
||||
print(f"[Cross-Validation] Consensus: {name1}={val1:.2f} ≈ {name2}={val2:.2f} (diff={diff_pct:.1f}%)", flush=True)
|
||||
return Decimal(str(round(consensus_val, 2))), consensus_conf, f"consensus ({name1}+{name2})"
|
||||
|
||||
# No consensus - check if extracted is an outlier (differs >10% from all others)
|
||||
if amount and amount > 0 and len(sources) >= 2:
|
||||
other_sources = [s for s in sources if s[0] != 'extracted']
|
||||
if other_sources:
|
||||
extracted_val = float(amount)
|
||||
all_differ = all(
|
||||
abs(extracted_val - s[1]) / max(extracted_val, s[1]) * 100 > 10
|
||||
for s in other_sources if s[1] > 0
|
||||
)
|
||||
if all_differ:
|
||||
# Extracted differs significantly from all others - use the best other source
|
||||
best_other = max(other_sources, key=lambda s: s[2])
|
||||
print(f"[Cross-Validation] Extracted outlier: {extracted_val:.2f} differs >10% from all others, using {best_other[0]}={best_other[1]:.2f}", flush=True)
|
||||
return Decimal(str(round(best_other[1], 2))), best_other[2], f"corrected (extracted outlier, using {best_other[0]})"
|
||||
|
||||
# Fallback: Case 1 - Amount valid with high confidence
|
||||
if amount and amount > 0 and confidence_amount >= 0.8:
|
||||
# First check TVA-implied total (most reliable when TVA is extracted correctly)
|
||||
# Check TVA-implied total
|
||||
if tva_implied_total and tva_implied_total > 0:
|
||||
tva_diff_percent = abs(float(amount) - float(tva_implied_total)) / float(tva_implied_total) * 100
|
||||
if tva_diff_percent <= 1:
|
||||
# Near-perfect TVA match - highest confidence
|
||||
if tva_diff_percent <= 3:
|
||||
return amount, min(0.98, confidence_amount + 0.05), "extracted (validated by TVA)"
|
||||
elif tva_diff_percent > 10:
|
||||
# Significant mismatch - TVA-implied total is more reliable
|
||||
# This catches cases where wrong TOTAL line was extracted (e.g., REST, SUBTOTAL)
|
||||
print(f"[Cross-Validation] Amount mismatch with TVA: extracted={amount}, tva_implied={tva_implied_total} (diff={tva_diff_percent:.1f}%)", flush=True)
|
||||
return tva_implied_total, 0.90, "calculated from TVA (extracted amount mismatch)"
|
||||
|
||||
# Cross-validate with payment methods
|
||||
if payment_sum > 0 and abs(amount - payment_sum) <= Decimal('0.02'):
|
||||
# Perfect match - boost confidence
|
||||
return amount, min(0.98, confidence_amount + 0.05), "extracted (validated by payment methods)"
|
||||
elif payment_sum > 0:
|
||||
payment_diff_percent = abs(float(amount) - float(payment_sum)) / float(payment_sum) * 100
|
||||
if payment_diff_percent > 10:
|
||||
# Significant mismatch - payment sum is more reliable
|
||||
print(f"[Cross-Validation] Amount mismatch with payments: extracted={amount}, payments={payment_sum} (diff={payment_diff_percent:.1f}%)", flush=True)
|
||||
return payment_sum, 0.88, "calculated from payment methods (extracted amount mismatch)"
|
||||
|
||||
@@ -696,29 +856,22 @@ class ReceiptExtractor:
|
||||
|
||||
# Case 2: Amount exists but low confidence - try to validate/correct
|
||||
if amount and amount > 0:
|
||||
# First check TVA-implied total (most reliable)
|
||||
if tva_implied_total and tva_implied_total > 0:
|
||||
tva_diff_percent = abs(float(amount) - float(tva_implied_total)) / float(tva_implied_total) * 100
|
||||
if tva_diff_percent <= 2:
|
||||
# Close match - boost confidence
|
||||
if tva_diff_percent <= 3:
|
||||
return amount, 0.88, "extracted (validated by TVA)"
|
||||
elif tva_diff_percent > 10:
|
||||
# Significant mismatch - use TVA-implied total
|
||||
print(f"[Cross-Validation] Amount mismatch with TVA: extracted={amount}, tva_implied={tva_implied_total} (diff={tva_diff_percent:.1f}%)", flush=True)
|
||||
return tva_implied_total, 0.85, "calculated from TVA"
|
||||
|
||||
# Check if payment methods sum matches
|
||||
if payment_sum > 0:
|
||||
payment_diff_percent = abs(float(amount) - float(payment_sum)) / float(payment_sum) * 100
|
||||
if payment_diff_percent <= 0.5:
|
||||
# Close match - boost confidence
|
||||
if payment_diff_percent <= 1:
|
||||
return amount, 0.90, "extracted (validated by payment methods)"
|
||||
elif payment_diff_percent > 10:
|
||||
# Mismatch - prefer payment_sum as it's more reliable
|
||||
print(f"[Cross-Validation] Amount mismatch: extracted={amount}, payments={payment_sum}", flush=True)
|
||||
return payment_sum, 0.85, "calculated from payment methods"
|
||||
|
||||
# No validation possible - return as-is
|
||||
return amount, confidence_amount, "extracted (unvalidated)"
|
||||
|
||||
# Case 3: Amount is 0 or None - calculate from payment methods
|
||||
@@ -946,6 +1099,28 @@ class ReceiptExtractor:
|
||||
|
||||
return name
|
||||
|
||||
def _get_store_profile(self, cui: Optional[str]) -> Optional[dict]:
|
||||
"""
|
||||
Get store-specific profile by CUI.
|
||||
|
||||
DEPRECATED: Use ProfileRegistry.get_profile() directly for profile objects.
|
||||
This method is kept for backward compatibility and returns validation hints dict.
|
||||
|
||||
Args:
|
||||
cui: The CUI extracted from receipt (with or without RO prefix)
|
||||
|
||||
Returns:
|
||||
Store profile validation hints dict or None if not found
|
||||
"""
|
||||
profile = ProfileRegistry.get_profile(cui)
|
||||
if profile:
|
||||
# Return validation hints for backward compatibility
|
||||
hints = profile.get_validation_hints()
|
||||
hints['name'] = profile.STORE_NAME
|
||||
print(f"[Store Profile] Found profile for {cui}: {profile.STORE_NAME}", flush=True)
|
||||
return hints
|
||||
return None
|
||||
|
||||
def _extract_cui(self, text_upper: str, original_text: str) -> Tuple[Optional[str], float]:
|
||||
"""
|
||||
Extract vendor CUI (fiscal identification code) from text.
|
||||
@@ -1020,11 +1195,114 @@ class ReceiptExtractor:
|
||||
# Default to bon_fiscal if neither found
|
||||
return 'bon_fiscal'
|
||||
|
||||
def _try_pattern_lidl(self, text: str) -> List[dict]:
|
||||
"""
|
||||
Try Lidl-style TVA pattern: "TVA A 21,00% 7.71" (no hyphen/colon separator).
|
||||
|
||||
Lidl receipts format:
|
||||
TOTAL TVA 9,84
|
||||
TVA A 21,00% 7,71
|
||||
TVA B 11,00% 2,13
|
||||
|
||||
Returns list of TVA entries found.
|
||||
"""
|
||||
entries = []
|
||||
seen = set()
|
||||
|
||||
# Pattern: TVA/TUA/IVA + code (A-D) + percent + amount (on same line)
|
||||
# Handles: "TVA A 21,00% 7,71", "TVA B 11,00% 2,13", "TUA A 21% 7.71"
|
||||
lidl_patterns = [
|
||||
# Same line: "TVA A 21,00% 7.71" (with various spacing)
|
||||
r'T[VU][AR]\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)',
|
||||
# Same line with backslash (OCR artifact): "TVA A \21,00% 7.71"
|
||||
r'T[VU][AR]\s+([A-D])\s+\\?(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)',
|
||||
# IVA variant
|
||||
r'IVA\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)',
|
||||
]
|
||||
|
||||
for pattern in lidl_patterns:
|
||||
for match in re.finditer(pattern, text, re.IGNORECASE):
|
||||
try:
|
||||
code = match.group(1).upper()
|
||||
percent = int(match.group(2))
|
||||
amount_str = self._normalize_number(match.group(3))
|
||||
amount = Decimal(amount_str)
|
||||
|
||||
if amount > 0:
|
||||
entry_key = (code, percent)
|
||||
if entry_key not in seen:
|
||||
entries.append({
|
||||
'code': code,
|
||||
'percent': percent,
|
||||
'amount': amount
|
||||
})
|
||||
seen.add(entry_key)
|
||||
print(f"[TVA Lidl] Found: TVA {code} {percent}% = {amount}", flush=True)
|
||||
except (ValueError, InvalidOperation):
|
||||
continue
|
||||
|
||||
return entries
|
||||
|
||||
def _select_best_tva_candidate(
|
||||
self,
|
||||
candidates: List[tuple],
|
||||
tva_bon_total: Optional[Decimal]
|
||||
) -> Tuple[List[dict], Optional[Decimal]]:
|
||||
"""
|
||||
Select the best TVA candidate from collected candidates.
|
||||
|
||||
Selection criteria (priority order):
|
||||
1. Sum matches TOTAL TVA BON (highest priority)
|
||||
2. More entries = better (for multi-rate receipts)
|
||||
3. Pattern confidence as tiebreaker
|
||||
|
||||
Args:
|
||||
candidates: List of (pattern_name, confidence, entries, sum)
|
||||
tva_bon_total: Authoritative TOTAL TVA BON value (if extracted)
|
||||
|
||||
Returns:
|
||||
(best_entries, best_sum)
|
||||
"""
|
||||
if not candidates:
|
||||
return [], None
|
||||
|
||||
# Score each candidate
|
||||
scored = []
|
||||
for name, confidence, entries, sum_val in candidates:
|
||||
score = 0.0
|
||||
|
||||
# Criterion 1: Sum matches TOTAL TVA BON (highest priority)
|
||||
if tva_bon_total and sum_val:
|
||||
tolerance = max(Decimal('0.02'), tva_bon_total * Decimal('0.02')) # 2% tolerance
|
||||
if abs(sum_val - tva_bon_total) <= tolerance:
|
||||
score += 100 # High bonus for matching authoritative total
|
||||
print(f"[TVA Select] {name}: sum {sum_val} matches tva_bon_total {tva_bon_total}", flush=True)
|
||||
|
||||
# Criterion 2: More entries (for multi-rate receipts)
|
||||
score += len(entries) * 10
|
||||
|
||||
# Criterion 3: Pattern confidence
|
||||
score += confidence * 5
|
||||
|
||||
scored.append((score, name, confidence, entries, sum_val))
|
||||
print(f"[TVA Select] Candidate {name}: score={score:.1f}, entries={len(entries)}, sum={sum_val}", flush=True)
|
||||
|
||||
# Sort by score descending
|
||||
scored.sort(key=lambda x: x[0], reverse=True)
|
||||
best = scored[0]
|
||||
print(f"[TVA Select] Winner: {best[1]} (score={best[0]:.1f})", flush=True)
|
||||
|
||||
return best[3], best[4]
|
||||
|
||||
def _extract_tva_entries(self, text: str) -> Tuple[List[dict], Optional[Decimal]]:
|
||||
"""
|
||||
Extract multiple TVA (VAT) entries from text.
|
||||
Romanian receipts can have multiple TVA rates (A=19%, B=9%, C=5%, D=0%).
|
||||
|
||||
Uses CANDIDATE COLLECTION approach:
|
||||
- Try ALL patterns and collect candidates
|
||||
- Select best candidate based on matching TOTAL TVA BON
|
||||
|
||||
Returns (tva_entries, tva_total) where tva_entries is a list of:
|
||||
{'code': 'A', 'percent': 19, 'amount': Decimal('15.20')}
|
||||
"""
|
||||
@@ -1054,6 +1332,22 @@ class ReceiptExtractor:
|
||||
# Also normalize comma followed by space to comma (for "21, 00%" -> "21,00%")
|
||||
normalized_text = re.sub(r'(\d+),\s+(\d{2})\s*%', r'\1.\2%', normalized_text)
|
||||
|
||||
# Extract TOTAL TVA BON/TOTAL TVA first as the authoritative reference
|
||||
tva_bon_total = self._extract_total_tva_bon(normalized_text)
|
||||
print(f"[TVA Debug] TOTAL TVA BON: {tva_bon_total}", flush=True)
|
||||
|
||||
# CANDIDATE COLLECTION APPROACH: Try all patterns, collect candidates, select best
|
||||
all_candidates = [] # List of (pattern_name, confidence, entries, sum)
|
||||
|
||||
# === LIDL-STYLE PATTERNS (NEW) ===
|
||||
# Lidl format: "TVA A 21,00% 7.71" or "TVA B 11,00% 2.13" (no hyphen/colon)
|
||||
# This pattern handles multi-rate TVA receipts
|
||||
lidl_entries = self._try_pattern_lidl(normalized_text)
|
||||
if lidl_entries:
|
||||
lidl_sum = sum(e['amount'] for e in lidl_entries)
|
||||
all_candidates.append(('lidl', 0.96, lidl_entries, lidl_sum))
|
||||
print(f"[TVA Debug] Lidl pattern: {len(lidl_entries)} entries, sum={lidl_sum}", flush=True)
|
||||
|
||||
# Pattern 0a: First try to get TVA from "TOTAL TAXE:" which is most reliable
|
||||
# Format: "TOTAL TAXE: 55,22" - this is always the TVA amount
|
||||
# OCR may cut "T" producing "OTAL TAXE:" instead of "TOTAL TAXE:"
|
||||
@@ -1372,10 +1666,21 @@ class ReceiptExtractor:
|
||||
except (ValueError, InvalidOperation):
|
||||
continue
|
||||
|
||||
# Extract TOTAL TVA BON as reference (separate from individual entries)
|
||||
tva_bon_total = self._extract_total_tva_bon(normalized_text)
|
||||
# Add existing extraction results to candidates (if any)
|
||||
if tva_entries:
|
||||
entries_sum = sum(entry['amount'] for entry in tva_entries)
|
||||
all_candidates.append(('standard', 0.90, tva_entries, entries_sum))
|
||||
print(f"[TVA Debug] Standard patterns: {len(tva_entries)} entries, sum={entries_sum}", flush=True)
|
||||
|
||||
# Calculate sum from entries
|
||||
# === CANDIDATE SELECTION ===
|
||||
# Select best candidate using TOTAL TVA BON as authoritative reference
|
||||
if all_candidates:
|
||||
best_entries, best_sum = self._select_best_tva_candidate(all_candidates, tva_bon_total)
|
||||
if best_entries:
|
||||
tva_entries = best_entries
|
||||
entries_sum = best_sum
|
||||
|
||||
# Calculate sum from entries (if not set by candidate selection)
|
||||
entries_sum = None
|
||||
if tva_entries:
|
||||
entries_sum = sum(entry['amount'] for entry in tva_entries)
|
||||
|
||||
Reference in New Issue
Block a user