New unified receipt creation system with:
- UnifiedReceiptForm component with inline OCR preview and confidence indicators
- Compact upload zone with drag-drop and camera support
- TVA and Payment fields with dynamic add/remove
- Supplier dual-field with autocomplete and OCR hint
- Receipt form sections with collapsible auxiliary data

Backend OCR improvements:
- Add confidence_tva and confidence_payment to extraction results
- Update TVA extraction to return confidence scores
- Include TVA (15%) and payment (10%) in overall_confidence calculation

Also includes:
- CSS design system rules documentation
- Port check helper function for service scripts
- Expanded design tokens documentation in CLAUDE.md

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2234 lines
108 KiB
Python
2234 lines
108 KiB
Python
"""Extract structured fields from OCR text (Romanian receipts)."""
|
|
|
|
import re
|
|
from datetime import date, datetime
|
|
from decimal import Decimal, InvalidOperation
|
|
from typing import Optional, Tuple, List
|
|
from dataclasses import dataclass, field
|
|
|
|
from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine
|
|
from backend.modules.data_entry.services.ocr.profiles import ProfileRegistry
|
|
|
|
|
|
@dataclass
class ExtractionResult:
    """Everything parsed out of a single receipt's OCR text.

    Holds the extracted vendor, client, amount, TVA and payment fields, one
    confidence score per field group, OCR run metadata, and the bookkeeping
    filled in by validation. Every field has a default, so an empty result
    can be created and populated incrementally.
    """

    # --- Core receipt fields -------------------------------------------------
    receipt_type: str = 'bon_fiscal'
    receipt_number: Optional[str] = None
    receipt_series: Optional[str] = None
    receipt_date: Optional[date] = None
    amount: Optional[Decimal] = None
    partner_name: Optional[str] = None
    cui: Optional[str] = None
    description: Optional[str] = None

    # --- Extra extracted fields (several TVA entries per receipt allowed) ----
    tva_entries: List[dict] = field(default_factory=list)  # [{code, percent, amount}]
    tva_total: Optional[Decimal] = None
    address: Optional[str] = None
    items_count: Optional[int] = None
    payment_methods: List[dict] = field(default_factory=list)  # [{"method":"CARD","amount":Decimal}]
    suggested_payment_mode: Optional[str] = None  # 'banca' when CARD was detected, 'numerar' for cash only

    # --- Buyer details (B2B receipts) ----------------------------------------
    client_name: Optional[str] = None
    client_cui: Optional[str] = None
    client_address: Optional[str] = None

    # --- Per-field confidence scores and OCR run metadata --------------------
    confidence_amount: float = 0.0
    confidence_date: float = 0.0
    confidence_vendor: float = 0.0
    confidence_client: float = 0.0
    confidence_tva: float = 0.0
    confidence_payment: float = 0.0
    raw_text: str = ""
    ocr_engine: str = ""  # Name of the OCR engine used: paddleocr or tesseract
    processing_time_ms: int = 0  # Elapsed processing time, in milliseconds

    # --- Validation tracking (bon-ocr-validation feature) --------------------
    needs_manual_review: Optional[bool] = None  # None=not validated, False=ok, True=needs review
    validation_warnings: List[str] = field(default_factory=list)
    validation_errors: List[str] = field(default_factory=list)
    confidence_adjustments: dict[str, float] = field(default_factory=dict)  # field name -> penalty
    inter_ocr_ratios: dict[str, float] = field(default_factory=dict)  # field name -> ratio

    @property
    def overall_confidence(self) -> float:
        """Return the weighted mean of the field confidences, rounded to 2 places.

        Weights: amount 0.35, date 0.20, vendor 0.20, TVA 0.15, payment 0.10.
        ``confidence_client`` does not take part in the score.
        """
        # (score, weight) pairs, listed in the order they are accumulated.
        weighted_scores = (
            (self.confidence_amount, 0.35),   # receipt total - most important
            (self.confidence_date, 0.20),     # receipt date
            (self.confidence_vendor, 0.20),   # vendor identification
            (self.confidence_tva, 0.15),      # TVA extraction accuracy
            (self.confidence_payment, 0.10),  # payment method detection
        )
        (lead_score, lead_weight), *remaining = weighted_scores
        # Plain left-to-right float accumulation keeps rounding deterministic.
        combined = lead_score * lead_weight
        for score, weight in remaining:
            combined += score * weight
        return round(combined, 2)
|
|
|
|
|
|
class ReceiptExtractor:
|
|
"""Extract receipt fields using pattern matching for Romanian receipts."""
|
|
|
|
# =========================================================================
# DEPRECATED: STORE_PROFILES dict - USE ProfileRegistry INSTEAD
# =========================================================================
# Store profiles are now managed by ProfileRegistry in:
#   backend/modules/data_entry/services/ocr/profiles/
#
# This dict is kept for reference only. extract() resolves profiles with:
#   ProfileRegistry.get_profile(cui)
#
# See: backend/modules/data_entry/services/ocr/profiles/README.md
#
# Layout: each key is a vendor CUI written as a digit string (no "RO"
# prefix); each value is a dict with the vendor "name", a "tva_pattern" tag
# and optional boolean flags describing that store's receipt format.
# =========================================================================
STORE_PROFILES = {
    # Lidl - multi-rate TVA (A+B), specific format without hyphen/colon
    "22891860": {
        "name": "LIDL DISCOUNT S.R.L.",
        "tva_pattern": "lidl",
        "tva_format": "TVA {code} {percent}% {amount}",
        "has_multi_rate_tva": True,
        "card_equals_total": True,
    },
    # OMV Petrom - single TVA rate, client CUI included
    "11201891": {
        "name": "OMV PETROM MARKETING S.R.L.",
        "tva_pattern": "standard",
        "has_client_cui": True,
    },
    # FIVE-HOLDING (BRICK) - standard format
    "10562600": {
        "name": "FIVE-HOLDING S.A.",
        "tva_pattern": "standard",
    },
    # Dedeman - e-factura format
    "2816464": {
        "name": "DEDEMAN SRL",
        "tva_pattern": "standard",
        "has_efactura": True,
    },
    # SOCAR Petroleum
    "12546600": {
        "name": "SOCAR PETROLEUM S.A.",
        "tva_pattern": "standard",
        "has_client_cui": True,
    },
    # Kineterra - non-VAT payer
    "31180432": {
        "name": "KINETERRA CONCEPT SRL",
        "tva_pattern": "none",
        "is_non_vat_payer": True,
    },
}
|
|
|
|
# Total amount patterns (most specific first).
# Romanian receipts use various formats: TOTAL LEI, TOTAL:, TOTAL RON, etc.
# OCR often produces errors, so patterns must be tolerant.
#
# Each entry is (regex, confidence). _extract_amount() tries the entries in
# order with re.IGNORECASE | re.MULTILINE and returns the first match whose
# capture group 1 normalizes to a positive Decimal, together with that
# entry's confidence.
TOTAL_PATTERNS = [
    # Most common: TOTAL LEI followed by amount (with OCR-tolerant variations)
    # Handles: TOTAL LEI, TOTAL. LE!, T0TAL LEI, TOTAL LE1, etc.
    (r'T[O0]TAL[.\s]+L[E3][I1!]\s*:?\s*([\d\s.,]+)', 0.98),  # OCR-tolerant: TOTAL. LE!, T0TAL LEI
    (r'TOTAL\s+LEI\s*([\d\s.,]+)', 0.98),  # Standard clean pattern
    (r'[OT]?OTAL\s+LEI\s*([\d\s.,]+)', 0.95),  # OCR may miss first letter
    # Standard patterns
    (r'TOTAL\s*:?\s*([\d\s.,]+)\s*(?:RON|LEI)?', 0.95),
    (r'TOTAL\s+(?:RON|LEI)\s*([\d\s.,]+)', 0.95),
    # SUBTOTAL when TOTAL not found
    (r'SUBTOTAL\s*([\d\s.,]+)', 0.90),
    (r'[SB]?UBTOTAL\s*([\d\s.,]+)', 0.88),  # OCR variations
    # Amount-due and payment lines, used when no TOTAL/SUBTOTAL label matched
    (r'DE\s+PLATA\s*:?\s*([\d\s.,]+)', 0.90),
    (r'SUMA\s*:?\s*([\d\s.,]+)', 0.85),
    (r'PLATA\s+CARD\s*:?\s*([\d\s.,]+)', 0.85),
    (r'NUMERAR\s*:?\s*([\d\s.,]+)', 0.80),
    # Sometimes total is near REST.
    # NOTE(review): _extract_amount() drops every line containing the word
    # REST before matching, so this entry cannot match on that code path.
    (r'REST\s*:?\s*([\d\s.,]+)', 0.70),
]
|
|
|
|
# Fallback: Find the largest repeated amount (likely the total)
|
|
# This handles cases where OCR doesn't capture "TOTAL" keyword
|
|
|
|
# Date patterns - support dash, dot, and slash separators.
# OCR may misread the DATA label (e.g. DRTA).
# OCR may also add spaces/commas in dates: "27. 10, 2025" instead of
# "27.10.2025"; those cases are covered by DATE_PATTERNS_OCR_SPACES.
#
# Each entry is (regex, confidence); capture group 1 is the date text.
# Presumably consumed in list order by the date extractor, which is outside
# this excerpt - verify there.
DATE_PATTERNS = [
    # DATA/DRTA: DD-MM-YYYY (OCR tolerant)
    # NOTE(review): D[AR]TA matches DATA and DRTA only; a "DAIA" misread is
    # not covered by this pattern.
    (r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
    (r'DATA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
    # Date followed by ORA (time) - OCR may produce 0RA
    (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+[O0]RA\s*:?\s*\d{2}:\d{2}', 0.95),
    # Date followed by time without ORA keyword
    (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+\d{2}:\d{2}', 0.90),
    # Standalone date
    (r'(\d{2}[-./]\d{2}[-./]\d{4})', 0.80),
    # YYYY-MM-DD format (less common)
    (r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75),
]
|
|
|
|
# OCR-corrupted date patterns with spaces/commas between the components.
# Handles: "27. 10, 2025", "27, 10. 2025", "2025. 08. 14", etc.
#
# Each entry is (regex, confidence, order): the three capture groups hold
# the date components and `order` ('ymd' or 'dmy') names the sequence in
# which they appear.
DATE_PATTERNS_OCR_SPACES = [
    # YYYY. MM. DD format with spaces (OMV/Petrom receipts) - with time
    (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.92, 'ymd'),
    # YYYY. MM. DD format with spaces (standalone)
    (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.85, 'ymd'),
    # DD. MM, YYYY or DD, MM. YYYY (with time following)
    (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
    # DD. MM, YYYY or DD, MM. YYYY (standalone)
    (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
]
|
|
|
|
# Receipt number patterns - Romanian fiscal receipt formats.
# OCR may produce N instead of : or other errors.
#
# Each entry is (regex, confidence); capture group 1 is the receipt number
# digits. Entries run from label-anchored formats down to bare-number
# fallbacks with lower confidence.
NUMBER_PATTERNS = [
    # NDS format (common in Romanian POS)
    (r'NDS\s*:?\s*(\d+)', 0.98),
    # C3POS terminal format - OCR may have N instead of : (C3POS-CT2N1360760)
    (r'C3POS[-A-Z0-9]*[N:](\d{6,7})', 0.98),  # CT2N1360760 format
    (r'C3POS.*?(\d{6,7})\b', 0.95),  # Any C3POS followed by 6-7 digit number
    (r'CT2[N:]\s*(\d{6,})', 0.95),  # CT2N prefix
    # BF (Bon Fiscal) number - high priority
    # Format: "Z:0864 BF:0018" - extract only the number after BF:
    (r'BF\s*:\s*(\d{4,})', 0.96),  # BF: with colon (most specific)
    (r'BF\s+(\d{4,})', 0.93),  # BF followed by space and number
    # NIVS format
    (r'NIVS\s*:?\s*(\d+)', 0.95),
    # Standard NR BON formats
    (r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95),
    (r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95),
    (r'CHITANTA\s+NR\.?\s*:?\s*(\d+)', 0.95),
    # Document number
    (r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90),
    # ID BF format
    (r'ID\s*BF\s*:?\s*(\d+)', 0.90),
    # TD format (transaction ID)
    (r'TD\s*:?\s*(\d+)', 0.85),
    # Any standalone 6-8 digit number (typical receipt number length)
    (r'\b(\d{6,8})\b', 0.70),
    # Generic "NR" label followed by 4+ digits (lowest-confidence fallback)
    (r'NR\.?\s*:?\s*(\d{4,})', 0.65),
]
|
|
|
|
# CUI (fiscal code) patterns - IMPORTANT: exclude CLIENT CUI
# CIF = Cod de Identificare Fiscală (vendor's tax ID)
# CLIENT C.U.I. = client's tax ID (should be ignored)
# OCR errors: R0 instead of RO, C1F instead of CIF
#
# Each entry is (regex, confidence); capture group 1 is the vendor CUI.
# Entries come in pairs: one that keeps an R/RO/R0 prefix in the capture
# and one that captures bare digits.
CUI_PATTERNS = [
    # CIF at start of line (definitely vendor) - tolerant to OCR errors
    # NOTE: Capture full CUI including RO prefix: (R[O0]?\d{6,10}) or ((?:R[O0])?\d{6,10})
    # In R[O0]? only the O/0 is optional - the leading R is required.
    (r'^CIF\s*:?\s*(R[O0]?\d{6,10})', 0.98),
    (r'^CIF\s*:?\s*(\d{6,10})', 0.97),  # Without RO prefix
    (r'^C[I1]F\s*:?\s*(R[O0]?\d{6,10})', 0.95),  # C1F OCR error
    (r'^C[I1]F\s*:?\s*(\d{6,10})', 0.94),  # C1F without RO
    # CIF not preceded by "CLIENT " or the truncated "LIENT " (negative lookbehinds)
    (r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(R[O0]?\d{6,10})', 0.95),
    (r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(\d{6,10})', 0.94),
    # Standalone CIF: format with OCR tolerance
    (r'\bC[I1]F\s*:?\s*(R[O0]?\d{6,10})\b', 0.90),
    (r'\bC[I1]F\s*:?\s*(\d{6,10})\b', 0.89),
    # COD FISCAL (vendor)
    (r'COD\s+FISCAL\s*:?\s*(R[O0]?\d{6,10})', 0.90),
    (r'COD\s+FISCAL\s*:?\s*(\d{6,10})', 0.89),
    # C. I. F. format with SPACES (OCR artifact) - "C. I. F. : R011201891"
    # Also handles double colon from OMV/Petrom: "C. I.F.: : RO11201891"
    (r'C\.\s*I\.\s*F\.?\s*[:\s]+(R[O0]?\d{6,10})', 0.92),
    (r'C\.\s*I\.\s*F\.?\s*[:\s]+(\d{6,10})', 0.91),
    # C.I.F. format (with dots, no spaces)
    (r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.88),
    (r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(\d{6,10})', 0.87),
    # CUI format (less specific, use with caution)
    (r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.85),
    (r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(\d{6,10})', 0.84),
    # Lidl format: "Cod Identificare fiscala: RO..." (OCR corrupts to "Ced Identificanfliscalar")
    # Matches: "Identificare fiscala", "Identificanfliscalar", "Identificoan/Fljscales"
    (r'[IC](?:od|ed)\s*Identific[a-z/]*\s*(R[O0]\d{6,10})', 0.90),
    # Generic: anything with "fiscal" followed by RO + digits
    (r'fiscal[a-z]*\s*:?\s*(R[O0]\d{6,10})', 0.85),
]
|
|
|
|
# Patterns for a CIF NUMBER appearing BEFORE the "C.I.F." label (reversed format)
# Common in some receipts: "RO11201891\nC. I. F." - number on line before label
# IMPORTANT: Capture the full CUI including RO prefix
#
# Each entry is (regex, confidence); capture group 1 is the CUI found on the
# line above the label.
CUI_REVERSED_PATTERNS = [
    # RO/R0 + 6-10 digits on line immediately before C.I.F./CIF label
    # Capture the FULL CUI including RO prefix
    (r'(R[O0]\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.98),
    # Just digits before C.I.F. label (neplatitor TVA = non-VAT payer, no RO prefix)
    (r'(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.95),
]
|
|
|
|
# Series patterns - be strict to avoid false matches.
# Each entry is (regex, confidence); capture group 1 is the series value
# (1-4 letters after SERIE, or 4 digits after a Z:/BF: marker).
SERIES_PATTERNS = [
    (r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90),
    # Z: format from Romanian fiscal receipts (must be at start of line or after space)
    (r'(?:^|\s)Z\s*:\s*(\d{4})', 0.85),
    # BF series with explicit marker
    (r'(?:^|\s)BF\s*:\s*(\d{4})', 0.85),
]
|
|
|
|
# TVA (VAT) patterns - OCR may produce TUA, TVR, IVA, etc.
# All patterns are case-insensitive (re.IGNORECASE applied in extraction)
#
# Each entry is (regex, confidence). Capture-group layout differs between
# entries: those that spell out a "(\d{1,2})\s*%" rate capture
# (percent, amount) in groups 1 and 2; the others capture only the amount in
# group 1. The consumer must handle both shapes.
TVA_PATTERNS = [
    # TOTAL TVA BON format (OCR tolerant: TUA, TVR, IVA)
    (r'TOTAL\s+(?:T[VU][AR]|IVA)\s+BON\s*:?\s*([\d\s.,]+)', 0.98),
    (r'T[O0]TAL\s+(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.95),
    # IVA variant (Spanish/Portuguese influence, some receipts)
    (r'TOTAL\s+IVA\s*:?\s*([\d\s.,]+)', 0.95),
    (r'IVA\s+[A-D]?\s*[-:]?\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.93),
    # TVA with percentage (OCR tolerant)
    (r'T[VU][AR]\s+(?:A\s*[-:]?\s*)?(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.95),
    (r'T[VU][AR]\s+[A-Z]\s*[-:]\s*(\d{1,2})\s*%\s*([\d\s.,]+)', 0.93),
    # 5% TVA rate (books, newspapers - TVA C); the rate is literal, so only
    # the amount is captured
    (r'T[VU][AR]\s*[C5]\s*[-:]\s*5\s*%\s*:?\s*([\d\s.,]+)', 0.93),
    (r'(?:T[VU][AR]|IVA)\s+5\s*%\s*:?\s*([\d\s.,]+)', 0.92),
    # Garbled OCR: T0TAL, TVAI, TUAI, etc.
    (r'T[O0]T[AE]L\s+(?:T[VUAI]+[AR]?|IVA)\s*:?\s*([\d\s.,]+)', 0.88),
    # OCR corruption: "TA F 194" (TVA with V→F or space), "T A 19%"
    # Handles: "TOTAL TA F 194" where TVA became "TA F"
    (r'TOTAL\s+TA\s*[F\s]?\s*\d*\s*:?\s*([\d\s.,]+)', 0.85),
    (r'TA\s+[FA-Z]?\s*\d{1,2}\s*%?\s*:?\s*([\d\s.,]+)', 0.82),
    # NOTE: Removed problematic pattern for "TUA F" (OCR noise) that was matching
    # percentage values like "TVA A\n19,00%" incorrectly. Pattern 12 handles these cases.
    # Simple TVA/IVA pattern - this is the reliable fallback
    (r'(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.85),
    # Standalone percentage line near TVA
    (r'(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.75),
]
|
|
|
|
# Payment method patterns - appears after TOTAL LEI, before TOTAL TVA
# Format: "CARD: 50.00" or "NUMERAR 100.00" or "PLATA CARD: 50.00"
# OMV/Petrom uses "CARTE CREDIT" or "CARTE CREDIT 318, 16"
#
# Each entry is (regex, method, confidence): capture group 1 is the paid
# amount and `method` is the normalized label ('CARD' or 'NUMERAR').
PAYMENT_METHOD_PATTERNS = [
    # CARTE CREDIT with amount on same line (OMV/Petrom receipts)
    # Handles: "CARTE CREDIT 318, 16" with OCR spaces in number
    (r'CARTE\s+CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.98),
    # CARTE CREDIT with amount on next line (OCR may split lines)
    # Handles: "CARTE CREDIT\n318, 16"
    (r'CARTE\s+CREDIT\s*:?\s*\n\s*([\d\s.,]+)', 'CARD', 0.97),
    # CARD with amount (high confidence)
    # Also handles OCR artifacts like "CARD F 100.00" where F is noise
    (r'(?:PLATA\s+)?CARD\s*[:\sA-Z]?\s*([\d\s.,]+)', 'CARD', 0.95),
    # NUMERAR (cash) with amount
    (r'NUMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.95),
    # CASH alternative spelling
    (r'CASH\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.90),
    # Truncation recovery patterns (for OCR left-margin truncation issues)
    # IMPROVED: More restrictive - require max 6 digits before decimals
    # to avoid matching CUI numbers like RO10562600 → RD10562600
    # "RD" = truncated "CARD" (only 2 chars visible)
    (r'(?:^|\n|\s)RD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.70),
    # "ARD" = truncated "CARD" (3 chars visible)
    (r'(?:^|\n|\s)ARD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.75),
    # "MERAR" = truncated "NUMERAR"
    (r'(?:^|\n|\s)MERAR\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'NUMERAR', 0.70),
]
|
|
|
|
# Maximum reasonable payment amount for a receipt (100,000 LEI).
# Amounts larger than this are likely OCR errors (e.g., CUI parsed as amount).
# Stored as a Decimal so it compares directly against extracted amounts.
MAX_REASONABLE_PAYMENT = Decimal('100000')
|
|
|
|
# Items count patterns - OCR may produce OZ instead of POZ, etc.
# Number may be on separate line before or after the label
# IMPORTANT: Must be specific to avoid matching product quantities like "50BUC"
#
# Each entry is (regex, confidence); capture group 1 is the item count.
ITEMS_COUNT_PATTERNS = [
    # NR. POZ. ART. IN BON: 17 (Romanian format with dots and spaces)
    # OCR tolerant: OZ instead of POZ, 0 instead of O
    (r'NR\.?\s*P?[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*(\d+)', 0.98),
    # Number on line BEFORE "OZ. ART. IN BON:" - OCR sometimes reorders
    (r'(\d{1,2})\s*\n\s*[O0]Z\.?\s*ART', 0.95),
    # Number may be on next line after label
    (r'[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.93),
    (r'NR\.?\s*(?:P?[O0]Z\.?)?\s*ART(?:ICOLE)?\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.90),
    # Simpler patterns - but more specific
    (r'ARTIC[O0]LE\s*:?\s*(\d+)', 0.88),
    # POZ at start of line or after colon (not in product descriptions)
    (r'(?:^|\s|:)P?[O0]Z\.?\s*(?:ART)?\.?\s*(?:IN\s+BON)?\s*:?\s*(\d{1,3})(?:\s|$)', 0.85),
]
|
|
|
|
# Address patterns (Romanian format).
# Each entry is (regex, confidence); capture group 1 is the matched address
# fragment. The character classes are upper-case only, so they rely on
# upper-cased input or a case-insensitive match.
ADDRESS_PATTERNS = [
    # Street patterns: "STR." followed by name and optional "NR. <digits>"
    (r'(STR\.?\s+[A-Z0-9\s.,]+(?:NR\.?\s*\d+)?)', 0.90),
    # Full address with JUD (county), optionally followed by MUN./OR./COM.
    (r'(JUD\.?\s+[A-Z]+,?\s*(?:MUN\.?|OR\.?|COM\.?)?\s*[A-Z]+)', 0.85),
]
|
|
|
|
# Client/Buyer patterns (for B2B receipts)
# CLIENT, CUMPARATOR, BENEFICIAR sections
# Variations: "CIF CLIENT:", "CLIENT C.U.I/C.I.F.", "CLIENT C. U. I./ C. I.F."
#
# Plain regex strings (no confidence value); a match marks the position
# where the buyer section of the receipt starts.
CLIENT_SECTION_MARKERS = [
    # Reversed format: CIF/CUI before CLIENT
    r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:',  # CIF CLIENT:
    r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:',  # CUI CLIENT:
    # Corrupted CLIENT after CIF: "CIF a IENT:", "CIF LIENT:", "CIF CL IENT:"
    r'C[I1]F\s+[A-Z\s]{0,6}IENT\s*:',  # "CIF a IENT:", "CIF CL IENT:", "CIF LIENT:"
    r'C[I1]F\s+LIENT\s*:',  # "CIF LIENT:" (missing C from CLIENT)
    # CLIENT followed by C.U.I./C.I.F. (all variations with/without spaces and dots)
    # Handles: CLIENT C.U.I/C.I.F., CLIENT C. U. I./ C. I.F., CLIENT CUI/CIF
    r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/?\s*C?\.?\s*[I1]?\.?\s*F?\.?\s*:',
    r'CLIENT\s+C\.?\s*[UI1]\.?\s*[IF1]\.?\s*:',  # CLIENT CUI: or CLIENT CIF:
    r'CLIENT\s*:',
    # CUMPARATOR (buyer) variants
    r'CUMPARATOR\s+C\.?\s*[UI1]\.?\s*[IF1]\.?\s*:',  # CUMPARATOR CUI: or CIF:
    r'CUMPARATOR\s*:',
    r'BENEFICIAR\s*:',
    r'CUMP[AĂ]R[AĂ]TOR\s*:',  # with or without Romanian diacritics
    r'DATE\s+CLIENT',
    r'LIENT\s*:',  # OCR truncation
]
|
|
|
|
# Client CUI patterns (explicitly tied to a CLIENT/CUMPARATOR marker)
# OCR errors: R0 instead of RO, C1F instead of CIF, 1 instead of I
#
# Each entry is (regex, confidence); capture group 1 is the client CUI.
# Entries written with "(R[O0]?\d{6,10})" keep the prefix in the capture,
# while those written with "(?:R[O0])?(\d{6,10})" capture digits only.
CLIENT_CUI_PATTERNS = [
    # CUI on line BEFORE CLIENT marker (docTR/OCR may output value before label)
    # Pattern: "RO1879855\nCLIENT C.U.I./C.I.F.:" - CUI on line before CLIENT label
    (r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*U\.?\s*[I1]\.?', 0.99),
    (r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*[I1]\.?\s*F\.?', 0.99),
    # Same but with optional colon after RO number
    (r'(R[O0]\d{6,10})\s*:?\s*\n\s*CLIENT', 0.98),
    # "CIF I CLIENT:" or "CIF IDENTIFICARE CLIENT:" format (OCR may insert extra chars)
    # Common OCR artifact: "CIF I CLIENT: R01879855"
    (r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]\d{6,10})', 0.98),
    (r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.97),
    # CIF CLIENT: R01879856 (reversed format - CIF/CUI before CLIENT)
    (r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
    (r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
    (r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
    (r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
    # CLIENT C.U.I/C.I.F. or CLIENT C. U. I./ C. I.F. (slash variant - all spacing)
    # Most flexible pattern for slash variants
    (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.97),
    (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.97),
    # OCR artifact: doubled letters like "C.U U. I." or "C.I I.F." (docTR sometimes duplicates)
    (r'CLIENT\s+C\.?\s*U\.?\s*U?\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
    (r'CLIENT\s+C\.?\s*U\.?\s*U?\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
    # CLIENT C.U.I. or CLIENT CUI or CLIENT CIF (without slash)
    (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
    (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
    (r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
    (r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
    # Corrupted CLIENT after CIF: "CIF a IENT:", "CIF LIENT:", "CIF L IENT:", "CIF C IENT:"
    # OCR often corrupts "CLIENT" when it appears after "CIF"
    (r'CIF\s+[a-zA-Z\s]{2,8}IENT\s*:?\s*(R[O0]?\d{6,10})', 0.93),  # "CIF a IENT:", "CIF CL IENT:"
    (r'CIF\s+[a-zA-Z\s]{2,8}IENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
    (r'CIF\s+LIENT\s*:?\s*(R[O0]?\d{6,10})', 0.92),  # "CIF LIENT:" (missing C)
    (r'CIF\s+LIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.92),
    # CUMPARATOR (buyer) variants
    (r'CUMPARATOR\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
    (r'CUMPARATOR\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
    # CUMPARATOR with CUI/CIF on next line: "CUMPARATOR: NAME\nCIF: 12345678"
    (r'CUMPARATOR\s*:.*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
    (r'CUMPARATOR\s*:.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),  # F or T (OCR error)
    # CUMPARATOR with CUI/CIF two lines down: "CUMPARATOR: NAME\nADDRESS\nCIF: 12345678"
    (r'CUMPARATOR\s*:.*\n.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
    # CUI/CIF on line immediately after CLIENT marker
    (r'CLIENT\s*:\s*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
    (r'CLIENT\s*:\s*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),  # F or T (OCR error)
    # CUI/CIF after client name: "CLIENT: COMPANY SRL\nCUI: 12345678"
    (r'CLIENT\s*:.*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
    (r'CLIENT\s*:.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),  # CIF/CIT after name
]
|
|
|
|
# Vendor name indicators (lines containing these are likely vendor names)
# These should be company type suffixes, not generic words
# Patterns must handle OCR spaces: "S. R. L." as well as "S.R.L."
#
# Plain regex strings (no confidence value).
VENDOR_INDICATORS = [
    r'\bS\.?\s*R\.?\s*L\.?\b',  # S.R.L. or S. R. L.
    r'\bS\.?\s*A\.?\b',  # S.A. or S. A.
    r'\bS\.?\s*N\.?\s*C\.?\b',  # S.N.C. or S. N. C.
    r'\bS\.?\s*C\.?\s*S\.?\b',  # S.C.S. or S. C. S.
    r'\bI\.?\s*I\.?\b',  # I.I. or I. I.
    r'\bP\.?\s*F\.?\s*A\.?\b',  # P.F.A. or P. F. A.
    # S.C. alone is too short and generic - only match if followed by company name
    r'\bS\.?\s*C\.?\s+[A-Z]',  # S.C. followed by company name
    r'HOLDING',
    r'COMPANY',
    r'GROUP',
    # Removed: MAGAZIN, MARKET, SHOP - too generic, match store welcome messages
]
|
|
|
|
def extract(self, text: str) -> ExtractionResult:
    """Run the whole extraction pipeline over one receipt's OCR text.

    Vendor name and CUI are read first so a store profile can be looked up
    through ProfileRegistry. When a profile exists its extractors are used
    for total, date, number, TVA, payments and client data; otherwise the
    generic extractors of this class are used. Afterwards the payment
    methods are validated, a payment mode is suggested, and the total is
    cross-validated against payments and TVA, recording warnings on the
    result whenever the total had to be calculated or could not be found.

    Args:
        text: Raw OCR output; an upper-cased copy is used for most matching.

    Returns:
        The populated ExtractionResult.
    """
    upper_text = text.upper()
    result = ExtractionResult()
    result.raw_text = text

    def summarize_tva() -> str:
        # "percent%=amount" listing used only in the debug log line.
        if not result.tva_entries:
            return "none"
        return ", ".join([f"{e.get('percent', '?')}%={e.get('amount', '?')}" for e in result.tva_entries])

    def summarize_payments() -> str:
        # "method=amount" listing used only in the debug log line.
        if not result.payment_methods:
            return "none"
        return ", ".join([f"{p.get('method', '?')}={p.get('amount', '?')}" for p in result.payment_methods])

    def fill_client_from_generic() -> None:
        # Generic buyer extraction: name, normalized CUI, address, confidence.
        generic_name, generic_cui, generic_address, generic_confidence = self._extract_client_data(upper_text, text)
        result.client_name = generic_name
        result.client_cui = OCRValidationEngine.normalize_cui(generic_cui)
        result.client_address = generic_address
        result.confidence_client = generic_confidence

    # =========================================================================
    # STEP 1: Vendor name and CUI come first - the CUI selects the store profile
    # =========================================================================
    vendor_name, vendor_confidence = self._extract_vendor(text)
    result.partner_name = vendor_name
    result.confidence_vendor = vendor_confidence
    detected_cui, _ = self._extract_cui(upper_text, text)
    result.cui = OCRValidationEngine.normalize_cui(detected_cui)

    store_profile = None
    if result.cui:
        store_profile = ProfileRegistry.get_profile(result.cui)

    # =========================================================================
    # STEP 2: Field extraction - store profile when known, generic otherwise
    # =========================================================================
    if store_profile:
        print(f"[Profile] ✅ Using {store_profile.STORE_NAME} ({store_profile.__class__.__name__}) for CUI {result.cui}", flush=True)

        # Profile-specific extractors (higher accuracy for known stores)
        result.amount, result.confidence_amount = store_profile.extract_total(upper_text)
        result.receipt_date, result.confidence_date = store_profile.extract_date(upper_text)
        result.receipt_number, _ = store_profile.extract_receipt_number(upper_text)
        result.tva_entries, result.confidence_tva = store_profile.extract_tva_entries(upper_text)
        if result.tva_entries:
            result.tva_total = sum((e['amount'] for e in result.tva_entries), Decimal(0))
        else:
            result.tva_total = None
        result.payment_methods = store_profile.extract_payment_methods(upper_text)
        # Payment confidence = best confidence among the detected methods
        result.confidence_payment = max(
            (pm.get('confidence', 0.0) for pm in result.payment_methods or ()),
            default=0.0,
        )

        # Client data via the profile (CUI + name)
        profile_cui, profile_cui_confidence = store_profile.extract_client_cui(upper_text)
        profile_name, profile_name_confidence = store_profile.extract_client_name(text)

        if not (profile_cui or profile_name):
            # Profile found nothing: fall back to the generic client extraction
            fill_client_from_generic()
        else:
            result.client_cui = OCRValidationEngine.normalize_cui(profile_cui) if profile_cui else None
            result.client_name = profile_name
            result.confidence_client = max(profile_cui_confidence, profile_name_confidence)
            # Profiles have no address extractor, so the address stays generic
            _, _, generic_address, _ = self._extract_client_data(upper_text, text)
            result.client_address = generic_address

        # Log extraction results for debugging
        tva_summary = summarize_tva()
        payment_summary = summarize_payments()
        print(f"[Profile] Extracted: total={result.amount}, date={result.receipt_date}, "
              f"TVA=[{tva_summary}], payments=[{payment_summary}], client_cui={result.client_cui}", flush=True)
    else:
        print(f"[Profile] ⚠️ No profile found for CUI '{result.cui}' - using GENERIC extraction", flush=True)

        # Generic extractors for unknown stores
        result.amount, result.confidence_amount = self._extract_amount(upper_text)
        result.receipt_date, result.confidence_date = self._extract_date(upper_text)
        result.receipt_number, _ = self._extract_number(upper_text)
        result.tva_entries, result.tva_total, result.confidence_tva = self._extract_tva_entries(upper_text)
        result.payment_methods, result.confidence_payment = self._extract_payment_methods(upper_text)

        fill_client_from_generic()

        # Log generic extraction results for debugging
        tva_summary = summarize_tva()
        payment_summary = summarize_payments()
        print(f"[Generic] Extracted: total={result.amount}, date={result.receipt_date}, "
              f"TVA=[{tva_summary}], payments=[{payment_summary}], client_cui={result.client_cui}", flush=True)

    # Series has no profile-specific extractor; always generic
    result.receipt_series, _ = self._extract_series(upper_text)

    # =========================================================================
    # STEP 3: Debug logging and sanity checks
    # =========================================================================
    if not result.tva_entries:
        print(f"[TVA Debug] No TVA found. Checking patterns...", flush=True)
        # Re-join decimals split by OCR ("12, 34" -> "12.34") before probing
        joined_decimals = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', upper_text)
        taxe_match = re.search(r'T?OTAL\s+TAXE', joined_decimals, re.IGNORECASE)
        rev_match = re.search(r'([\d.,]+)\s*T?OTAL\s+TAXE', joined_decimals, re.IGNORECASE)
        print(f"[TVA Debug] 'OTAL TAXE' found: {bool(taxe_match)}, reversed: {rev_match.group(1) if rev_match else None}", flush=True)

    # Log implausible TVA vs TOTAL relationships
    if result.tva_total and result.amount:
        if result.tva_total > result.amount:
            print(f"[TVA Extraction] TVA ({result.tva_total}) > TOTAL ({result.amount}) - will be corrected in final validation", flush=True)
        elif result.tva_total > result.amount * Decimal('0.5'):
            print(f"[TVA Extraction] Warning: TVA ({result.tva_total}) is > 50% of TOTAL ({result.amount}) - suspicious", flush=True)

    # Additional generic extractions
    result.items_count = self._extract_items_count(upper_text)
    result.address = self._extract_address(upper_text)

    # =========================================================================
    # STEP 4: Validate and post-process
    # =========================================================================
    # Keep the unvalidated payment list: it still drives payment-mode
    # detection if validation leaves nothing behind.
    unvalidated_payments = result.payment_methods.copy() if result.payment_methods else []

    result.payment_methods = self._validate_payment_methods(result.payment_methods, result.amount)

    # Suggest payment_mode from whichever payment list is non-empty
    payments_for_mode = result.payment_methods or unvalidated_payments
    if payments_for_mode:
        card_amount = sum(
            pm.get('amount', Decimal('0'))
            for pm in payments_for_mode
            if pm.get('method') == 'CARD'
        )
        if card_amount > 0:
            result.suggested_payment_mode = 'banca'
            print(f"[Payment Mode] CARD detected ({card_amount}), suggesting 'banca'", flush=True)
        else:
            result.suggested_payment_mode = 'numerar'
            print(f"[Payment Mode] Cash only detected, suggesting 'numerar'", flush=True)

    result.receipt_type = self._detect_receipt_type(upper_text)

    # Reverse TVA validation (only logged, never applied here)
    if result.tva_entries and result.amount:
        tva_consistent, _expected_total, msg = self._validate_tva_reverse(result.tva_entries, result.amount)
        if not tva_consistent:
            print(f"[TVA Reverse Validation] {msg}", flush=True)

    # Cross-validate the total using payment methods and TVA
    original_amount = result.amount
    validated_amount, amount_confidence, source = self._cross_validate_and_calculate_amount(
        result.amount,
        result.confidence_amount,
        result.payment_methods,
        result.tva_entries,
        result.tva_total
    )

    # Record a warning whenever the TOTAL was calculated instead of extracted
    if 'calculated from TVA' in source:
        warning_msg = f"TOTAL ({validated_amount}) calculat din TVA (nu a fost extras direct din bon)"
        result.validation_warnings.append(warning_msg)
        print(f"[Cross-Validation] ⚠️ {warning_msg}", flush=True)

        # Also report how far the extracted value was from the calculated one
        if original_amount and original_amount != validated_amount:
            diff = abs(float(validated_amount) - float(original_amount))
            result.validation_warnings.append(
                f"TOTAL extras ({original_amount}) diferă de cel calculat ({validated_amount}) cu {diff:.2f} RON"
            )

    elif 'calculated from payment methods' in source:
        warning_msg = f"TOTAL ({validated_amount}) calculat din suma metodelor de plată (nu a fost extras direct)"
        result.validation_warnings.append(warning_msg)
        print(f"[Cross-Validation] ⚠️ {warning_msg}", flush=True)

        if original_amount and original_amount != validated_amount:
            diff = abs(float(validated_amount) - float(original_amount))
            result.validation_warnings.append(
                f"TOTAL extras ({original_amount}) diferă de suma plăților ({validated_amount}) cu {diff:.2f} RON"
            )

    elif source == 'not found':
        result.validation_warnings.append("TOTAL nu a fost detectat și nu a putut fi calculat")
        print("[Cross-Validation] ⚠️ TOTAL nu a fost detectat", flush=True)

    elif validated_amount != original_amount:
        print(f"[Cross-Validation] Amount updated: {original_amount} -> {validated_amount} (source: {source})", flush=True)

    result.amount = validated_amount
    result.confidence_amount = amount_confidence

    return result
|
|
|
|
def _extract_amount(self, text: str) -> Tuple[Optional[Decimal], float]:
    """Extract the receipt total from OCR text.

    Strategies are tried in order and the first one that yields a value wins:

    1. ``self.TOTAL_PATTERNS`` - (regex, confidence) pairs whose group 1 is
       the amount; the first positive amount is returned with that confidence.
    2. The largest number standing alone on a line *after* the last product
       line (``<digit> BUC|ROLA|ROLN|ROL X``): 0.85 if it occurs at least
       twice, otherwise 0.75.
    3. The largest amount (> 20) that occurs at least twice anywhere: 0.65.
    4. The first number standing alone on a line that is > 50: 0.50.

    Args:
        text: Raw OCR text of the receipt.

    Returns:
        ``(amount, confidence)``, or ``(None, 0.0)`` when nothing matched.
    """
    from collections import Counter

    def standalone_amounts(segment: str):
        """Yield the Decimal value of every number that sits alone on a line."""
        line_amount_pattern = r'^[\s]*(\d{2,4}[.,]\s*\d{2})[\s]*$'
        for hit in re.finditer(line_amount_pattern, segment, re.MULTILINE):
            try:
                compact = hit.group(1).replace(' ', '')
                yield Decimal(self._normalize_number(compact))
            except (InvalidOperation, ValueError):
                continue

    # PRE-FILTER: REST lines carry the change handed back, never the total.
    text = '\n'.join(
        line for line in text.split('\n')
        if not re.search(r'\bREST\b', line, re.IGNORECASE)
    )

    # Strategy 1: labelled totals (TOTAL, SUBTOTAL, ...).
    for pattern, confidence in self.TOTAL_PATTERNS:
        match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
        if not match:
            continue
        try:
            # Normalize FIRST so "190 60" becomes "190.60" before any
            # remaining non-numeric characters are stripped.
            amount_str = self._normalize_number(match.group(1).strip())
            amount_str = re.sub(r'[^\d.]', '', amount_str)
            amount = Decimal(amount_str)
        except (InvalidOperation, ValueError):
            continue
        if amount > 0:
            return amount, confidence

    # Strategy 2: standalone amounts printed after the last product line.
    product_pattern = r'\d\s+(?:BUC|ROLA|ROLN|ROL)\s+X'
    product_matches = list(re.finditer(product_pattern, text, re.IGNORECASE))
    if product_matches:
        after_products = text[product_matches[-1].end():]
        # Values <= 10 are filtered out as too small to be the total.
        trailing = [value for value in standalone_amounts(after_products) if value > 10]
        if trailing:
            max_amount = max(trailing)
            # Seeing the same maximum more than once raises the confidence.
            confidence = 0.85 if trailing.count(max_amount) >= 2 else 0.75
            return max_amount, confidence

    # Strategy 3: the largest amount that is repeated.
    # OCR may put a space after the separator ("186. 16"); close that gap.
    normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text)
    # Count by numeric value, not by raw string, so "186,16" and "186.16"
    # are recognised as the same amount.
    value_counts = Counter()
    for amt_str in re.findall(r'(\d{2,4}[.,]\d{2})\b', normalized_text):
        try:
            value_counts[Decimal(self._normalize_number(amt_str))] += 1
        except (InvalidOperation, ValueError):
            continue
    repeated = [amt for amt, count in value_counts.items() if count >= 2 and amt > 20]
    if repeated:
        return max(repeated), 0.65

    # Last resort: first standalone amount above a higher threshold.
    for amount in standalone_amounts(text):
        if amount > 50:
            return amount, 0.50

    return None, 0.0
|
|
|
|
def _normalize_number(self, num_str: str) -> str:
    """Normalize an OCR'd (Romanian-style) number into ``Decimal``-ready text.

    Handles, in order:

    * a space read in place of the decimal point: ``"190 60"`` -> ``"190.60"``,
      ``"1 234 56"`` -> ``"1234.56"``;
    * spaces used as thousands separators: ``"1 234"`` -> ``"1234"``;
    * mixed separators - whichever of ``,`` / ``.`` occurs last is the decimal
      mark: ``"1.234,56"`` and ``"1,234.56"`` both give ``"1234.56"``;
    * comma only: one comma followed by at most two digits is a decimal comma
      (``"1,50"``); otherwise commas are thousands separators (``"1,234"``),
      except that a final group of one or two digits is still the decimal
      part (``"1,234,56"`` -> ``"1234.56"``);
    * several dots: a final three-digit group means every dot is a thousands
      separator (``"1.234.567"`` -> ``"1234567"``); otherwise the last dot is
      the decimal point (``"1.234.56"`` -> ``"1234.56"``).

    Args:
        num_str: Number text as captured from the receipt.

    Returns:
        The normalized string; it is not validated as a number here.
    """
    # "<digits, possibly space-grouped> <exactly two digits>": the last space
    # stands for the decimal point, any earlier ones are thousands separators.
    space_decimal = re.match(r'^([\d\s]+?)\s+(\d{2})$', num_str.strip())
    if space_decimal:
        integer_part = space_decimal.group(1).replace(' ', '')
        num_str = f"{integer_part}.{space_decimal.group(2)}"
    else:
        # Any remaining spaces are thousands separators.
        num_str = num_str.replace(' ', '')

    has_comma = ',' in num_str
    has_dot = '.' in num_str

    if has_comma and has_dot:
        if num_str.rfind(',') > num_str.rfind('.'):
            # Romanian layout: 1.234,56
            num_str = num_str.replace('.', '').replace(',', '.')
        else:
            # English layout: 1,234.56
            num_str = num_str.replace(',', '')
    elif has_comma:
        parts = num_str.split(',')
        if len(parts) == 2:
            decimal_comma = len(parts[1]) <= 2              # 1,50 vs 1,234
        else:
            decimal_comma = 1 <= len(parts[-1]) <= 2        # 1,234,56
        if decimal_comma:
            num_str = ''.join(parts[:-1]) + '.' + parts[-1]
        else:
            num_str = ''.join(parts)
    elif has_dot:
        parts = num_str.split('.')
        if len(parts) > 2:
            if len(parts[-1]) == 3:
                # 1.234.567 -> 1234567 (all dots are thousands separators)
                num_str = ''.join(parts)
            else:
                # 1.234.56 -> 1234.56 (last dot is the decimal point)
                num_str = ''.join(parts[:-1]) + '.' + parts[-1]

    return num_str
|
|
|
|
def _calculate_multi_rate_tva_total(self, tva_entries: List[dict]) -> Optional[Decimal]:
    """Derive the gross receipt total implied by ALL TVA entries.

    For each entry: ``entry_total = tva * (100 + rate) / rate``; the result
    is the sum of the per-entry totals, rounded to two decimals.

    Example (TVA A 21% = 7.71, TVA B 11% = 2.13):
        Entry A: 7.71 * 121 / 21 = 44.42
        Entry B: 2.13 * 111 / 11 = 21.49
        Total:   65.92

    Entries with a missing/zero amount, a missing/non-positive rate, or values
    that cannot be converted to ``Decimal`` are skipped instead of raising.

    Args:
        tva_entries: Dicts read through the keys ``percent``, ``amount`` and
            (for logging only) ``code``.

    Returns:
        The implied total, or ``None`` if no entry contributed.
    """
    if not tva_entries:
        return None

    total = Decimal('0')
    for entry in tva_entries:
        rate = entry.get('percent', 0)
        tva_amount = entry.get('amount')
        if not tva_amount or not rate:
            continue
        try:
            # Going through str() accepts int, float, str and Decimal alike;
            # a NaN rate raises InvalidOperation on the comparison below.
            rate_dec = Decimal(str(rate))
            tva_dec = Decimal(str(tva_amount))
            if rate_dec <= 0:
                continue
            entry_total = tva_dec * (Decimal(100) + rate_dec) / rate_dec
        except (InvalidOperation, ValueError, TypeError):
            continue
        total += entry_total
        print(f"[Multi-rate TVA] Entry {entry.get('code', '?')}: tva={tva_amount}, rate={rate}% -> implied={entry_total:.2f}", flush=True)

    return total.quantize(Decimal('0.01')) if total > 0 else None
|
|
|
|
def _cross_validate_and_calculate_amount(
    self,
    amount: Optional[Decimal],
    confidence_amount: float,
    payment_methods: List[dict],
    tva_entries: List[dict],
    tva_total: Optional[Decimal]
) -> Tuple[Optional[Decimal], float, str]:
    """Reconcile the extracted total with the payment sum and the TVA-implied total.

    Decision order:

    1. Gather the available sources: extracted amount (its own confidence),
       sum of payment amounts (0.92), TVA-implied total (0.88).
    2. The first pair of sources within 3% of each other is a consensus; the
       value of the more confident member is returned, confidence +0.05
       (capped at 0.98).
    3. Otherwise, if the extracted amount is more than 10% away from every
       other source, the most confident other source replaces it.
    4. Otherwise the extracted amount is checked against TVA, then payments,
       with thresholds that depend on whether its confidence is >= 0.8.
    5. With no usable extracted amount, the payment sum, then the TVA-implied
       total, is used; failing that the inputs are returned as "not found".

    Args:
        amount: Total extracted from the text, if any.
        confidence_amount: Confidence attached to ``amount``.
        payment_methods: Dicts whose ``amount`` values are summed.
        tva_entries: Passed to ``_calculate_multi_rate_tva_total``.
        tva_total: Accepted for interface compatibility; not read here.

    Returns:
        ``(amount, confidence, source_description)``.
    """
    # Total paid across all payment methods; unparsable amounts are ignored.
    payment_sum = Decimal('0')
    for method in payment_methods or []:
        try:
            paid = method.get('amount')
            if paid:
                payment_sum += Decimal(str(paid))
        except (InvalidOperation, ValueError, TypeError):
            continue

    # Total implied by every TVA entry (multi-rate aware).
    tva_implied_total = self._calculate_multi_rate_tva_total(tva_entries)

    # 3% tolerance absorbs multi-rate TVA rounding.
    CONSENSUS_TOLERANCE = 3.0

    has_amount = bool(amount and amount > 0)
    has_payment = payment_sum > 0
    has_tva = bool(tva_implied_total and tva_implied_total > 0)

    # (name, value, confidence) for every source that produced a value.
    sources = []
    if has_amount:
        sources.append(('extracted', float(amount), confidence_amount))
    if has_payment:
        sources.append(('payment', float(payment_sum), 0.92))
    if has_tva:
        sources.append(('tva_calc', float(tva_implied_total), 0.88))

    print(f"[Cross-Validation] Sources: {[(s[0], f'{s[1]:.2f}', f'{s[2]:.2f}') for s in sources]}", flush=True)

    # Consensus: first pair (in source order) that agrees within tolerance.
    for position, first in enumerate(sources):
        for second in sources[position + 1:]:
            name1, val1, conf1 = first
            name2, val2, conf2 = second
            if val1 <= 0 or val2 <= 0:
                continue
            diff_pct = abs(val1 - val2) / max(val1, val2) * 100
            if diff_pct <= CONSENSUS_TOLERANCE:
                # Keep the value of the more trusted source; ties favour the first.
                consensus_val, consensus_conf = (val1, conf1) if conf1 >= conf2 else (val2, conf2)
                consensus_conf = min(0.98, consensus_conf + 0.05)
                print(f"[Cross-Validation] Consensus: {name1}={val1:.2f} ≈ {name2}={val2:.2f} (diff={diff_pct:.1f}%)", flush=True)
                return Decimal(str(round(consensus_val, 2))), consensus_conf, f"consensus ({name1}+{name2})"

    # No consensus: is the extracted amount >10% away from every other source?
    if has_amount and len(sources) >= 2:
        rivals = [s for s in sources if s[0] != 'extracted']
        if rivals:
            extracted_val = float(amount)
            is_outlier = all(
                abs(extracted_val - rival[1]) / max(extracted_val, rival[1]) * 100 > 10
                for rival in rivals if rival[1] > 0
            )
            if is_outlier:
                best_other = max(rivals, key=lambda s: s[2])
                print(f"[Cross-Validation] Extracted outlier: {extracted_val:.2f} differs >10% from all others, using {best_other[0]}={best_other[1]:.2f}", flush=True)
                return Decimal(str(round(best_other[1], 2))), best_other[2], f"corrected (extracted outlier, using {best_other[0]})"

    def percent_off(reference):
        """Difference between ``amount`` and *reference*, as a % of *reference*."""
        return abs(float(amount) - float(reference)) / float(reference) * 100

    if has_amount:
        # "Trusted" (>= 0.8) and low-confidence amounts share the same checks
        # but differ in thresholds, returned confidences and source labels.
        trusted = confidence_amount >= 0.8

        if has_tva:
            tva_diff_percent = percent_off(tva_implied_total)
            if tva_diff_percent <= 3:
                validated_conf = min(0.98, confidence_amount + 0.05) if trusted else 0.88
                return amount, validated_conf, "extracted (validated by TVA)"
            elif tva_diff_percent > 10:
                print(f"[Cross-Validation] Amount mismatch with TVA: extracted={amount}, tva_implied={tva_implied_total} (diff={tva_diff_percent:.1f}%)", flush=True)
                if trusted:
                    return tva_implied_total, 0.90, "calculated from TVA (extracted amount mismatch)"
                return tva_implied_total, 0.85, "calculated from TVA"

        if has_payment:
            if trusted:
                # Trusted amounts must match the payments almost to the cent.
                if abs(amount - payment_sum) <= Decimal('0.02'):
                    return amount, min(0.98, confidence_amount + 0.05), "extracted (validated by payment methods)"
                payment_diff_percent = percent_off(payment_sum)
                if payment_diff_percent > 10:
                    print(f"[Cross-Validation] Amount mismatch with payments: extracted={amount}, payments={payment_sum} (diff={payment_diff_percent:.1f}%)", flush=True)
                    return payment_sum, 0.88, "calculated from payment methods (extracted amount mismatch)"
            else:
                payment_diff_percent = percent_off(payment_sum)
                if payment_diff_percent <= 1:
                    return amount, 0.90, "extracted (validated by payment methods)"
                elif payment_diff_percent > 10:
                    print(f"[Cross-Validation] Amount mismatch: extracted={amount}, payments={payment_sum}", flush=True)
                    return payment_sum, 0.85, "calculated from payment methods"

        return amount, confidence_amount, ("extracted" if trusted else "extracted (unvalidated)")

    # No usable extracted amount: fall back to the payment sum ...
    if has_payment:
        print(f"[Cross-Validation] Amount not found, using payment sum: {payment_sum}", flush=True)
        return payment_sum, 0.85, "calculated from payment methods"

    # ... then to the TVA-implied total.
    if has_tva:
        print(f"[Cross-Validation] Amount not found, using TVA-implied total: {tva_implied_total}", flush=True)
        return tva_implied_total, 0.70, "calculated from TVA"

    # Nothing usable: hand the inputs back unchanged.
    return amount, confidence_amount, "not found"
|
|
|
|
def _extract_date(self, text: str) -> Tuple[Optional[date], float]:
    """Extract the receipt date from OCR text.

    Two pattern families are tried in order:

    * ``self.DATE_PATTERNS`` - (regex, confidence) pairs whose group 1 is a
      clean date using ``.``, ``/`` or ``-``; parsed as DD.MM.YYYY, then as
      YYYY.MM.DD.
    * ``self.DATE_PATTERNS_OCR_SPACES`` - (regex, confidence, fmt) triples for
      dates broken up by OCR ("27. 10, 2025"); groups 1-3 are day/month/year,
      or year/month/day when ``fmt == 'ymd'``.

    Every match of a pattern is examined, not only the first one, so an
    unparsable or out-of-range hit does not hide a valid date later in the
    text. A date is accepted when it is not in the future and its year is
    2020 or later.

    Returns:
        ``(date, confidence)`` of the first accepted date, else ``(None, 0.0)``.
    """
    today = date.today()

    def acceptable(candidate: date) -> bool:
        """True for dates that are not in the future and not before 2020."""
        return candidate <= today and candidate.year >= 2020

    # Clean dates first.
    for pattern, confidence in self.DATE_PATTERNS:
        for match in re.finditer(pattern, text):
            # Normalize separators to dots.
            date_str = match.group(1).replace('/', '.').replace('-', '.')
            parsed = None
            for layout in ('%d.%m.%Y', '%Y.%m.%d'):
                try:
                    parsed = datetime.strptime(date_str, layout).date()
                    break
                except ValueError:
                    continue
            if parsed is not None and acceptable(parsed):
                return parsed, confidence

    # Then OCR-corrupted dates (stray spaces / commas between the parts).
    for pattern, confidence, fmt in self.DATE_PATTERNS_OCR_SPACES:
        for match in re.finditer(pattern, text):
            if fmt == 'ymd':
                year, month, day = match.group(1, 2, 3)
            else:
                day, month, year = match.group(1, 2, 3)
            try:
                parsed = datetime.strptime(f"{day}.{month}.{year}", '%d.%m.%Y').date()
            except ValueError:
                continue
            if acceptable(parsed):
                return parsed, confidence

    return None, 0.0
|
|
|
|
def _extract_number(self, text: str) -> Tuple[Optional[str], float]:
    """Return the receipt number and its confidence.

    ``self.NUMBER_PATTERNS`` is walked in order as (regex, confidence) pairs,
    matched case-insensitively; group 1 of the first pattern that matches is
    the number. Yields ``(None, 0.0)`` when no pattern matches.
    """
    attempts = (
        (re.search(regex, text, re.IGNORECASE), score)
        for regex, score in self.NUMBER_PATTERNS
    )
    return next(
        ((found.group(1), score) for found, score in attempts if found),
        (None, 0.0),
    )
|
|
|
|
def _extract_series(self, text: str) -> Tuple[Optional[str], float]:
    """Return the receipt series (upper-cased) and its confidence.

    ``self.SERIES_PATTERNS`` is walked in order as (regex, confidence) pairs,
    matched case-insensitively; group 1 of the first pattern that matches is
    the series. Yields ``(None, 0.0)`` when no pattern matches.
    """
    attempts = (
        (re.search(regex, text, re.IGNORECASE), score)
        for regex, score in self.SERIES_PATTERNS
    )
    return next(
        ((found.group(1).upper(), score) for found, score in attempts if found),
        (None, 0.0),
    )
|
|
|
|
def _extract_vendor(self, text: str) -> Tuple[Optional[str], float]:
|
|
"""
|
|
Extract vendor/partner name from text.
|
|
Uses multiple strategies:
|
|
1. Look for lines with company type indicators (S.R.L., S.A., etc.)
|
|
2. Look for company name + SRL on separate lines
|
|
3. Look for lines near CIF
|
|
4. Use first valid line as fallback
|
|
"""
|
|
lines = text.split('\n')
|
|
skip_keywords = [
|
|
'BON', 'FISCAL', 'TOTAL', 'DATA', 'NR', 'ORA',
|
|
'SUBTOTAL', 'TVA', 'PLATA', 'CARD', 'NUMERAR',
|
|
'RON', 'LEI', 'CHITANTA', 'REST', 'CLIENT',
|
|
'OPERATOR', 'CASIER', 'POS', 'AMEF', 'BINE ATI VENIT',
|
|
'VA RUGAM', 'PASTRATI', 'VOCEA', 'TIPARIT',
|
|
'DETERGENT', 'PROSOP', 'HARTIE', 'SACI', 'SPRAY',
|
|
'BUC', 'ROLA', 'CUMPARATOR', 'MAGAZIN', 'BRICK',
|
|
'NIVS', 'BENZINA', 'PETROM', 'OMV'
|
|
]
|
|
|
|
# Strategy 0: Look for company name followed by SRL/SA on next line
|
|
# Pattern: "COMPANY NAME\nSRL" or "COMPANY NAME\nS.R.L."
|
|
for i, line in enumerate(lines[:15]):
|
|
line = line.strip()
|
|
if not line or len(line) < 3:
|
|
continue
|
|
|
|
line_upper = line.upper()
|
|
|
|
# Skip lines with skip keywords
|
|
if any(kw in line_upper for kw in skip_keywords):
|
|
continue
|
|
|
|
# Check if next line is standalone SRL, S.R.L., SA, S.A., etc.
|
|
if i + 1 < len(lines):
|
|
next_line = lines[i + 1].strip().upper()
|
|
# Match standalone company type suffix
|
|
if re.match(r'^S\.?\s*R\.?\s*L\.?$', next_line) or \
|
|
re.match(r'^S\.?\s*A\.?$', next_line) or \
|
|
re.match(r'^S\.?\s*N\.?\s*C\.?$', next_line) or \
|
|
re.match(r'^P\.?\s*F\.?\s*A\.?$', next_line) or \
|
|
re.match(r'^I\.?\s*I\.?$', next_line):
|
|
# Combine: "COMPANY NAME" + " " + "SRL"
|
|
vendor = self._clean_vendor_name(f"{line} {next_line}")
|
|
if vendor and len(vendor) >= 5:
|
|
return vendor, 0.95
|
|
|
|
# Strategy 1: Look for lines with vendor indicators (S.R.L., S.A., HOLDING, etc.)
|
|
for i, line in enumerate(lines[:15]): # Check first 15 lines
|
|
line = line.strip()
|
|
if not line or len(line) < 3:
|
|
continue
|
|
|
|
line_upper = line.upper()
|
|
|
|
# Skip lines with skip keywords (CUMPARATOR, CLIENT, etc.)
|
|
if any(kw in line_upper for kw in skip_keywords):
|
|
continue
|
|
|
|
# Check for vendor indicators
|
|
for indicator in self.VENDOR_INDICATORS:
|
|
if re.search(indicator, line_upper):
|
|
# Found a company name indicator
|
|
vendor = self._clean_vendor_name(line)
|
|
if vendor and len(vendor) >= 3:
|
|
# High confidence for lines with company indicators
|
|
return vendor, 0.95
|
|
|
|
# Strategy 2: Look for lines right before or after CIF
|
|
for i, line in enumerate(lines[:15]):
|
|
line_upper = line.upper()
|
|
if 'CIF' in line_upper and 'CLIENT' not in line_upper:
|
|
# Check line before
|
|
if i > 0:
|
|
prev_line = lines[i-1].strip()
|
|
if prev_line and len(prev_line) >= 3:
|
|
if not any(kw in prev_line.upper() for kw in skip_keywords):
|
|
vendor = self._clean_vendor_name(prev_line)
|
|
if vendor:
|
|
return vendor, 0.85
|
|
|
|
# Strategy 3: First valid line as fallback
|
|
for i, line in enumerate(lines[:10]):
|
|
line = line.strip()
|
|
|
|
# Skip empty lines
|
|
if not line or len(line) < 3:
|
|
continue
|
|
|
|
# Skip lines that are just numbers or codes
|
|
if re.match(r'^[\d.,\s:]+$', line):
|
|
continue
|
|
|
|
# Skip lines with barcodes/product codes
|
|
if re.match(r'^[A-Z]*\d{6,}', line):
|
|
continue
|
|
|
|
# Skip lines with keywords
|
|
if any(kw in line.upper() for kw in skip_keywords):
|
|
continue
|
|
|
|
# Clean the line
|
|
vendor = self._clean_vendor_name(line)
|
|
|
|
if vendor and len(vendor) >= 3:
|
|
# Confidence decreases for lines further down
|
|
confidence = max(0.3, 0.7 - (i * 0.05))
|
|
return vendor, confidence
|
|
|
|
return None, 0.0
|
|
|
|
def _clean_vendor_name(self, name: str) -> Optional[str]:
    """Clean a candidate vendor name, or reject it.

    Characters other than word characters, whitespace and ``. , & - ( )`` are
    replaced by spaces and runs of whitespace are collapsed.

    The candidate is rejected (``None``) when it

    * is empty,
    * starts with an address prefix - STR, JUD, MUN, NR, BL, ET or AP followed
      by a period or whitespace. OCR often drops the space after the period,
      so "STR.LUNGA" is rejected just like "STR. LUNGA";
    * starts with SC / S.C. followed by a digit (staircase number, e.g.
      "SC 2", "SC.5"). SC followed by a name is kept, because S.C. is also
      the abbreviation of "Societate Comercială" (company);
    * is shorter than 3 characters after cleaning.

    Returns:
        The cleaned name, or ``None`` when rejected.
    """
    if not name:
        return None

    # Remove common OCR artifacts, then normalize whitespace.
    name = re.sub(r'[^\w\s.,&\-()]', ' ', name)
    name = re.sub(r'\s+', ' ', name).strip()

    name_upper = name.upper()

    # Address-only line: prefix followed by "." or whitespace.
    if re.match(r'^(?:STR|JUD|MUN|NR|BL|ET|AP)(?:\.|\s)', name_upper):
        return None

    # SC + digit = staircase (address); SC + name = company, keep.
    if re.match(r'^S\.?\s*C\.?\s*\d', name_upper):
        return None

    # Too short to be a name.
    if len(name) < 3:
        return None

    return name
|
|
|
|
def _get_store_profile(self, cui: Optional[str]) -> Optional[dict]:
    """Look up the store profile for a CUI and return its validation hints.

    DEPRECATED: call ``ProfileRegistry.get_profile()`` directly when the
    profile object itself is needed. This wrapper is kept for backward
    compatibility.

    Args:
        cui: CUI extracted from the receipt (with or without the RO prefix).

    Returns:
        The result of the profile's ``get_validation_hints()`` with an extra
        ``'name'`` key set to the profile's ``STORE_NAME``, or ``None`` when
        the registry has no profile for ``cui``.
    """
    profile = ProfileRegistry.get_profile(cui)
    if not profile:
        return None

    # The 'name' key is written into the object the profile returned.
    # NOTE(review): confirm get_validation_hints() returns a fresh dict per call.
    hints = profile.get_validation_hints()
    hints['name'] = profile.STORE_NAME
    print(f"[Store Profile] Found profile for {cui}: {profile.STORE_NAME}", flush=True)
    return hints
|
|
|
|
def _extract_cui(self, text_upper: str, original_text: str) -> Tuple[Optional[str], float]:
    """Extract the vendor CUI (fiscal identification code).

    The buyer's CUI (printed next to CLIENT / CUMPARATOR) is excluded in
    every strategy. A candidate is plausible when, ignoring a leading RO/R0
    prefix, it is 6 to 10 characters long.

    Strategies, in order:

    0. Reversed layout via ``self.CUI_REVERSED_PATTERNS`` - the number is on
       the line BEFORE the "C.I.F." label. Every match is examined; one is
       rejected when a buyer marker occurs in the 50 characters before it.
    1. Line by line via ``self.CUI_PATTERNS``, skipping lines that contain a
       buyer marker.
    2. Whole text via ``self.CUI_PATTERNS``; a match is rejected when a buyer
       marker precedes it on the same line.

    Args:
        text_upper: Receipt text, upper-cased.
        original_text: Not read by this method; kept for interface
            compatibility.

    Returns:
        ``(cui, confidence)`` with group 1 of the winning pattern, or
        ``(None, 0.0)``.
    """
    # 'LIENT' also covers 'CLIENT' with a lost first letter.
    buyer_markers = ('LIENT', 'CUMPARATOR')
    flags = re.IGNORECASE | re.MULTILINE

    def mentions_buyer(fragment: str) -> bool:
        """True when *fragment* contains any buyer marker."""
        return any(marker in fragment for marker in buyer_markers)

    def get_cui_digit_count(cui: str) -> int:
        """Length of the CUI without a leading RO/R0 prefix."""
        cui_upper = cui.upper().strip()
        if cui_upper.startswith('RO') or cui_upper.startswith('R0'):
            return len(cui_upper) - 2
        return len(cui_upper)

    def plausible(cui: str) -> bool:
        """True when the CUI length (without prefix) is between 6 and 10."""
        return 6 <= get_cui_digit_count(cui) <= 10

    # Strategy 0: number printed on the line before the label,
    # e.g. "RO11201891\nC. I. F.".
    for pattern, confidence in self.CUI_REVERSED_PATTERNS:
        for match in re.finditer(pattern, text_upper, flags):
            cui = match.group(1)
            if not plausible(cui):
                continue
            start = match.start()
            context = text_upper[max(0, start - 50):start]
            if not mentions_buyer(context):
                return cui, confidence

    # Strategy 1: lines that do not mention the buyer.
    for line in text_upper.split('\n'):
        if mentions_buyer(line):
            continue
        for pattern, confidence in self.CUI_PATTERNS:
            match = re.search(pattern, line, flags)
            if match and plausible(match.group(1)):
                return match.group(1), confidence

    # Strategy 2: whole-text search, rejecting matches preceded by a buyer
    # marker on the same line.
    for pattern, confidence in self.CUI_PATTERNS:
        for match in re.finditer(pattern, text_upper, flags):
            cui = match.group(1)
            if not plausible(cui):
                continue
            start = match.start()
            line_start = text_upper.rfind('\n', 0, start) + 1
            if not mentions_buyer(text_upper[line_start:start]):
                return cui, confidence

    return None, 0.0
|
|
|
|
def _detect_receipt_type(self, text: str) -> str:
    """Classify the receipt as ``'bon_fiscal'`` or ``'chitanta'``.

    "BON FISCAL" (any amount of whitespace between the words) wins when
    present. Otherwise any spelling of CHITANȚĂ marks a ``'chitanta'``:
    T may be plain, comma-below (Ț) or cedilla (Ţ), and the final vowel may
    be A or Ă - OCR and fonts mix these freely. With neither marker the
    type defaults to ``'bon_fiscal'``.

    Matching is case-sensitive.
    NOTE(review): presumably the caller passes upper-cased text - confirm.
    """
    # Explicit BON FISCAL first (tolerates OCR spacing variations).
    if re.search(r'BON\s+FISCAL', text):
        return 'bon_fiscal'
    # \u021A = T with comma below, \u0162 = T with cedilla, \u0102 = A with breve.
    if re.search(r'CHITAN[T\u021A\u0162][A\u0102]', text):
        return 'chitanta'
    # Neither marker found.
    return 'bon_fiscal'
|
|
|
|
def _try_pattern_lidl(self, text: str) -> List[dict]:
    """Extract TVA entries printed Lidl-style: code, percent and amount on one
    line with no hyphen/colon separator.

    Example layout:
        TOTAL TVA        9,84
        TVA A 21,00%     7,71
        TVA B 11,00%     2,13

    Each distinct (code, percent) pair is reported once, in the order it is
    first found; entries whose amount is not positive or not parsable are
    dropped.

    Returns:
        List of ``{'code': str, 'percent': int, 'amount': Decimal}``.
    """
    entries = []
    seen = set()

    # TVA/TUA/IVA + code (A-D) + percent + amount, all on the same line.
    lidl_patterns = [
        # Same line: "TVA A 21,00% 7.71" (with various spacing)
        r'T[VU][AR]\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)',
        # Same line with backslash (OCR artifact): "TVA A \21,00% 7.71"
        r'T[VU][AR]\s+([A-D])\s+\\?(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)',
        # IVA variant
        r'IVA\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)',
    ]

    for pattern in lidl_patterns:
        for match in re.finditer(pattern, text, re.IGNORECASE):
            try:
                code = match.group(1).upper()
                percent = int(match.group(2))
                # [\d.,]+ also swallows punctuation that merely follows the
                # number ("7.71."); left in place it would be read as a
                # separator and inflate the amount, so strip it first.
                raw_amount = match.group(3).rstrip('.,')
                amount = Decimal(self._normalize_number(raw_amount))
            except (ValueError, InvalidOperation):
                continue

            if amount <= 0:
                continue
            entry_key = (code, percent)
            if entry_key in seen:
                continue
            entries.append({
                'code': code,
                'percent': percent,
                'amount': amount
            })
            seen.add(entry_key)
            print(f"[TVA Lidl] Found: TVA {code} {percent}% = {amount}", flush=True)

    return entries
|
|
|
|
def _select_best_tva_candidate(
    self,
    candidates: List[tuple],
    tva_bon_total: Optional[Decimal]
) -> Tuple[List[dict], Optional[Decimal], float]:
    """Pick the best TVA candidate by score.

    Scoring per candidate:

    * +100 when its sum is within 2% of ``tva_bon_total`` (at least 0.02);
    * +10 for every entry it contains (favours multi-rate results);
    * +5 x its pattern confidence (tiebreaker).

    The highest score wins; on equal scores the earlier candidate wins.

    Args:
        candidates: Tuples ``(pattern_name, confidence, entries, sum)``.
        tva_bon_total: Authoritative TOTAL TVA BON value, if extracted.

    Returns:
        ``(entries, sum, confidence)`` of the winner, or ``([], None, 0.0)``
        when there are no candidates.
    """
    if not candidates:
        return [], None, 0.0

    ranked = []
    for label, pattern_conf, found_entries, entries_sum in candidates:
        points = 0.0

        # Agreement with the authoritative TVA total outweighs everything else.
        if tva_bon_total and entries_sum:
            allowed_gap = max(Decimal('0.02'), tva_bon_total * Decimal('0.02'))
            if abs(entries_sum - tva_bon_total) <= allowed_gap:
                points += 100
                print(f"[TVA Select] {label}: sum {entries_sum} matches tva_bon_total {tva_bon_total}", flush=True)

        points += len(found_entries) * 10
        points += pattern_conf * 5

        ranked.append((points, label, pattern_conf, found_entries, entries_sum))
        print(f"[TVA Select] Candidate {label}: score={points:.1f}, entries={len(found_entries)}, sum={entries_sum}", flush=True)

    # max() keeps the first of several equally scored candidates.
    top_points, top_label, top_conf, top_entries, top_sum = max(ranked, key=lambda item: item[0])
    print(f"[TVA Select] Winner: {top_label} (score={top_points:.1f})", flush=True)

    return top_entries, top_sum, top_conf
|
|
|
|
def _extract_tva_entries(self, text: str) -> Tuple[List[dict], Optional[Decimal], float]:
|
|
"""
|
|
Extract multiple TVA (VAT) entries from text.
|
|
Romanian receipts can have multiple TVA rates (A=19%, B=9%, C=5%, D=0%).
|
|
|
|
Uses CANDIDATE COLLECTION approach:
|
|
- Try ALL patterns and collect candidates
|
|
- Select best candidate based on matching TOTAL TVA BON
|
|
|
|
Returns (tva_entries, tva_total, confidence) where tva_entries is a list of:
|
|
{'code': 'A', 'percent': 19, 'amount': Decimal('15.20')}
|
|
"""
|
|
tva_entries = []
|
|
seen_entries = set() # To avoid duplicates
|
|
confidence = 0.0 # Track extraction confidence
|
|
|
|
# Check for non-VAT payer (NEPLATITOR DE TVA) - TVA = 0
|
|
# OCR variants: NEPLATTOR, NEPLATITOR, NEPLATOR, NEPLATTOR, ANEPLATHTOR, MEPLATITOR, etc.
|
|
# Also handles: "TOTAL NEPLATITOR TVA", "(NEPLATITOR DE TVA)"
|
|
non_vat_patterns = [
|
|
# Main pattern - flexible for OCR errors: NEPLAT + any chars + OR/R
|
|
r'NEPLAT\w*OR', # NEPLATITOR, NEPLATTOR, NEPLATOR
|
|
r'[ANM]EPLAT\w*O?R', # OCR errors: ANEPLATHTOR, MEPLATITOR
|
|
r'TOTAL\s+NEPLAT', # TOTAL NEPLATITOR...
|
|
r'TOTAL\s+[ANM]EPLAT', # TOTAL ANEPLAT... (OCR error)
|
|
r'SCUTIT\s*(?:DE\s+)?T[VU]A', # SCUTIT DE TVA
|
|
r'NEPLAT\w*\s+T[VU]A', # NEPLATITOR TVA
|
|
r'NEPLAT\w*\s+DE\s+T', # NEPLATITOR DE T... (truncated)
|
|
]
|
|
for pattern in non_vat_patterns:
|
|
if re.search(pattern, text, re.IGNORECASE):
|
|
# Non-VAT payer - return TVA = 0, high confidence
|
|
return [{'code': 'D', 'percent': 0, 'amount': Decimal('0.00')}], Decimal('0.00'), 0.95
|
|
|
|
# Normalize spaces in numbers first (OCR may produce "32. 31" or "49, 58")
|
|
normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text)
|
|
# Also normalize comma followed by space to comma (for "21, 00%" -> "21,00%")
|
|
normalized_text = re.sub(r'(\d+),\s+(\d{2})\s*%', r'\1.\2%', normalized_text)
|
|
|
|
# Extract TOTAL TVA BON/TOTAL TVA first as the authoritative reference
|
|
tva_bon_total = self._extract_total_tva_bon(normalized_text)
|
|
print(f"[TVA Debug] TOTAL TVA BON: {tva_bon_total}", flush=True)
|
|
|
|
# CANDIDATE COLLECTION APPROACH: Try all patterns, collect candidates, select best
|
|
all_candidates = [] # List of (pattern_name, confidence, entries, sum)
|
|
|
|
# === LIDL-STYLE PATTERNS (NEW) ===
|
|
# Lidl format: "TVA A 21,00% 7.71" or "TVA B 11,00% 2.13" (no hyphen/colon)
|
|
# This pattern handles multi-rate TVA receipts
|
|
lidl_entries = self._try_pattern_lidl(normalized_text)
|
|
if lidl_entries:
|
|
lidl_sum = sum(e['amount'] for e in lidl_entries)
|
|
all_candidates.append(('lidl', 0.96, lidl_entries, lidl_sum))
|
|
print(f"[TVA Debug] Lidl pattern: {len(lidl_entries)} entries, sum={lidl_sum}", flush=True)
|
|
|
|
# Pattern 0a: First try to get TVA from "TOTAL TAXE:" which is most reliable
|
|
# Format: "TOTAL TAXE: 55,22" - this is always the TVA amount
|
|
# OCR may cut "T" producing "OTAL TAXE:" instead of "TOTAL TAXE:"
|
|
# OCR may also put amount BEFORE "OTAL TAXE": "55,22OTAL TAXE:"
|
|
total_taxe_pattern = r'T?OTAL\s+TAXE\s*:?\s*([\d\s.,]+)'
|
|
taxe_match = re.search(total_taxe_pattern, normalized_text, re.IGNORECASE)
|
|
|
|
# Also try pattern where amount comes BEFORE "OTAL TAXE" (OCR line break issue)
|
|
if not taxe_match:
|
|
reversed_taxe_pattern = r'([\d.,]+)\s*T?OTAL\s+TAXE'
|
|
taxe_match = re.search(reversed_taxe_pattern, normalized_text, re.IGNORECASE)
|
|
|
|
if taxe_match:
|
|
# Also need to find the TVA rate from the table
|
|
# Pattern handles: "A-21%", "-21,00%", "21%" etc.
|
|
rate_pattern = r'([A-D])?\s*[-:]?\s*(\d{1,2})[.,]?\s*\d{0,2}\s*%'
|
|
rate_match = re.search(rate_pattern, normalized_text, re.IGNORECASE)
|
|
if rate_match:
|
|
try:
|
|
code = rate_match.group(1).upper() if rate_match.group(1) else 'A' # Default to A if missing
|
|
percent = int(rate_match.group(2))
|
|
amount_str = taxe_match.group(1).replace(' ', '')
|
|
amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
|
|
amount = Decimal(amount_str)
|
|
if amount > 0:
|
|
entry_key = (code, percent)
|
|
if entry_key not in seen_entries:
|
|
tva_entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
seen_entries.add(entry_key)
|
|
except (ValueError, InvalidOperation):
|
|
pass
|
|
|
|
# Pattern 0b: Table format "A-21,00% 285,66 49,58" (code-percent base tva_amount)
|
|
# This format appears after a TVA header line like "TVA TOTAL VALDARE"
|
|
# The TVA amount position depends on header: VALDARE last = TVA last, VALOARE middle = TVA middle
|
|
if not tva_entries:
|
|
table_pattern = r'([A-D])\s*[-:]\s*(\d{1,2})[.,]\s*\d{2}\s*%\s*([\d\s.,]+)\s+([\d\s.,]+)'
|
|
for match in re.finditer(table_pattern, normalized_text, re.IGNORECASE):
|
|
try:
|
|
code = match.group(1).upper()
|
|
percent = int(match.group(2))
|
|
amount1_str = match.group(3).replace(' ', '')
|
|
amount2_str = match.group(4).replace(' ', '')
|
|
amount1 = Decimal(self._normalize_number(re.sub(r'[^\d.,]', '', amount1_str)))
|
|
amount2 = Decimal(self._normalize_number(re.sub(r'[^\d.,]', '', amount2_str)))
|
|
|
|
# Determine which is TVA: the smaller amount is usually TVA
|
|
# (TVA is a fraction of the total, so it's always smaller)
|
|
tva_amount = min(amount1, amount2)
|
|
|
|
if tva_amount > 0:
|
|
entry_key = (code, percent)
|
|
if entry_key not in seen_entries:
|
|
tva_entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': tva_amount
|
|
})
|
|
seen_entries.add(entry_key)
|
|
except (ValueError, InvalidOperation):
|
|
continue
|
|
|
|
# Pattern 0c: REVERSED FORMAT "5.00% TUA*B" followed by amount on next line
|
|
# This handles receipts where percentage comes BEFORE TVA code (e.g., books with 5% rate)
|
|
# Matches: "5.00% TUA*B", "5% TVA B", "5.00% TVA", "9% TUA", "5% IVA"
|
|
if not tva_entries:
|
|
# Pattern: PERCENT% + TVA/IVA + optional code, then amount on next line
|
|
reversed_tva_pattern = r'(\d{1,2})[.,]?\d{0,2}\s*%\s*(?:T[VU][AR]|IVA)\s*\*?([A-D])?'
|
|
for match in re.finditer(reversed_tva_pattern, normalized_text, re.IGNORECASE):
|
|
try:
|
|
percent = int(match.group(1))
|
|
code = (match.group(2) or self._get_tva_code_from_percent(percent)).upper()
|
|
|
|
# Look for amount on the next line(s) after the match
|
|
after_match = normalized_text[match.end():]
|
|
# Find standalone number (amount) - skip empty lines
|
|
amount_match = re.search(r'^[\s\n]*([\d]+[.,]\d{2})\b', after_match)
|
|
if amount_match:
|
|
amount_str = self._normalize_number(amount_match.group(1))
|
|
amount = Decimal(amount_str)
|
|
if amount > 0:
|
|
entry_key = (code, percent)
|
|
if entry_key not in seen_entries:
|
|
tva_entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
seen_entries.add(entry_key)
|
|
except (ValueError, InvalidOperation):
|
|
continue
|
|
|
|
# Pattern 0d: "TOTAL TUA:", "TOTAL TVA:", "TOTAL IVA:" with amount (OCR variants)
|
|
if not tva_entries:
|
|
total_tva_simple = r'TOTAL\s+(?:T[VU][AR]|IVA)\s*:?\s*([\d.,]+)'
|
|
match = re.search(total_tva_simple, normalized_text, re.IGNORECASE)
|
|
if match:
|
|
try:
|
|
amount_str = self._normalize_number(match.group(1))
|
|
amount = Decimal(amount_str)
|
|
if amount > 0:
|
|
# Try to find the rate in nearby text
|
|
percent = self._detect_tva_percent(text)
|
|
if percent:
|
|
code = self._get_tva_code_from_percent(percent)
|
|
entry_key = (code, percent)
|
|
if entry_key not in seen_entries:
|
|
tva_entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
seen_entries.add(entry_key)
|
|
except (ValueError, InvalidOperation):
|
|
pass
|
|
|
|
# Pattern 0e: Multiline "TOTAL TUA\n198\n30.43" where:
|
|
# - "TOTAL TUA" on one line
|
|
# - "198" or similar (corrupted "19%") on next line (optional)
|
|
# - "30.43" (TVA amount) on following line
|
|
# OCR often splits this across multiple lines
|
|
if not tva_entries:
|
|
multiline_tva = r'TOTAL\s+(?:T[VU][AR]L?|TU[AR]L|IVA)\s*\n\s*\d*\s*\n?\s*([\d]+[.,]\d{2})\b'
|
|
match = re.search(multiline_tva, normalized_text, re.IGNORECASE)
|
|
if match:
|
|
try:
|
|
amount_str = self._normalize_number(match.group(1))
|
|
amount = Decimal(amount_str)
|
|
if amount > 0:
|
|
percent = self._detect_tva_percent(text)
|
|
if percent:
|
|
code = self._get_tva_code_from_percent(percent)
|
|
entry_key = (code, percent)
|
|
if entry_key not in seen_entries:
|
|
tva_entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
seen_entries.add(entry_key)
|
|
except (ValueError, InvalidOperation):
|
|
pass
|
|
|
|
# Pattern 1: "TVA A - 19%: 15.20" or "TVAA - 21% 32.31" or "IVA A - 19%" (with code)
|
|
# OCR tolerant: TUA, TVR, IVA, etc.
|
|
pattern_with_code = r'(?:T[VU][AR]|IVA)\s*([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)'
|
|
for match in re.finditer(pattern_with_code, normalized_text, re.IGNORECASE):
|
|
try:
|
|
code = match.group(1).upper()
|
|
percent = int(match.group(2))
|
|
amount_str = match.group(3).replace(' ', '')
|
|
amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
|
|
amount = Decimal(amount_str)
|
|
if amount > 0:
|
|
entry_key = (code, percent)
|
|
if entry_key not in seen_entries:
|
|
tva_entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
seen_entries.add(entry_key)
|
|
except (ValueError, InvalidOperation):
|
|
continue
|
|
|
|
# Pattern 2: "TVA - 21%: 32.31" or "IVA - 21%: 32.31" (without explicit code, assume 'A')
|
|
if not tva_entries:
|
|
pattern_no_code = r'(?:T[VU][AR]|IVA)\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)'
|
|
for match in re.finditer(pattern_no_code, normalized_text, re.IGNORECASE):
|
|
try:
|
|
percent = int(match.group(1))
|
|
amount_str = match.group(2).replace(' ', '')
|
|
amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
|
|
amount = Decimal(amount_str)
|
|
if amount > 0:
|
|
# Determine code based on percent
|
|
code = self._get_tva_code_from_percent(percent)
|
|
entry_key = (code, percent)
|
|
if entry_key not in seen_entries:
|
|
tva_entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
seen_entries.add(entry_key)
|
|
except (ValueError, InvalidOperation):
|
|
continue
|
|
|
|
# Pattern 3: "TOTAL TVA A - 21%" or "TOTAL IVA" with amount on same line or "TOTAL TVA BON" with amount
|
|
if not tva_entries:
|
|
# First try: "TOTAL TVA A - 21% 32.31" or "TOTAL IVA A - 21% 32.31" (amount on same line)
|
|
tva_with_amount = r'TOTAL\s+(?:T[VU][AR]|IVA)\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*([\d.,]+)'
|
|
for match in re.finditer(tva_with_amount, normalized_text, re.IGNORECASE):
|
|
try:
|
|
code = match.group(1).upper()
|
|
percent = int(match.group(2))
|
|
amount_str = self._normalize_number(match.group(3))
|
|
amount = Decimal(amount_str)
|
|
if amount > 0:
|
|
entry_key = (code, percent)
|
|
if entry_key not in seen_entries:
|
|
tva_entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
seen_entries.add(entry_key)
|
|
except (ValueError, InvalidOperation):
|
|
continue
|
|
|
|
# Pattern 3b: "TOTAL TVA A - 21%" or "TOTAL IVA A - 21%" on one line, look for "TOTAL TVA BON" amount
|
|
if not tva_entries:
|
|
tva_total_pattern = r'TOTAL\s+(?:T[VU][AR]|IVA)\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%'
|
|
for match in re.finditer(tva_total_pattern, normalized_text, re.IGNORECASE):
|
|
try:
|
|
code = match.group(1).upper()
|
|
percent = int(match.group(2))
|
|
|
|
# Look for "TOTAL TVA BON" or "TOTAL IVA BON" followed by amount
|
|
tva_bon_pattern = r'TOTAL\s+(?:T[VU][AR]|IVA)\s+BON[:\s]*([\d.,]+)'
|
|
tva_bon_match = re.search(tva_bon_pattern, normalized_text, re.IGNORECASE)
|
|
if tva_bon_match:
|
|
amount_str = self._normalize_number(tva_bon_match.group(1))
|
|
amount = Decimal(amount_str)
|
|
if amount > 0:
|
|
entry_key = (code, percent)
|
|
if entry_key not in seen_entries:
|
|
tva_entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
seen_entries.add(entry_key)
|
|
continue
|
|
|
|
# Fallback: Amount after TOTAL TVA BON or TOTAL IVA BON on next line
|
|
tva_bon_pos = re.search(r'TOTAL\s+(?:T[VU][AR]|IVA)\s+BON', normalized_text, re.IGNORECASE)
|
|
if tva_bon_pos:
|
|
after_bon = normalized_text[tva_bon_pos.end():]
|
|
# Find first standalone number (likely TVA amount)
|
|
amount_match = re.search(r'[\s\n]*([\d]+[.,]\d{2})\s*\n', after_bon)
|
|
if amount_match:
|
|
amount_str = self._normalize_number(amount_match.group(1))
|
|
amount = Decimal(amount_str)
|
|
if amount > 0:
|
|
entry_key = (code, percent)
|
|
if entry_key not in seen_entries:
|
|
tva_entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
seen_entries.add(entry_key)
|
|
except (ValueError, InvalidOperation):
|
|
continue
|
|
|
|
# Pattern 3c: "TVAA - 21%" or "IVA A - 21%" on one line, amount on next line (simpler format)
|
|
if not tva_entries:
|
|
tva_line_pattern = r'(?:T[VU][AR]|IVA)\s*([A-D])?\s*[-:]\s*(\d{1,2})\s*%'
|
|
for match in re.finditer(tva_line_pattern, normalized_text, re.IGNORECASE):
|
|
try:
|
|
code = (match.group(1) or 'A').upper()
|
|
percent = int(match.group(2))
|
|
|
|
# Look for amount on the next line or immediately after
|
|
after_tva = normalized_text[match.end():]
|
|
amount_match = re.search(r'^[\s\n]*([\d.,]+)', after_tva)
|
|
if amount_match:
|
|
amount_str = self._normalize_number(amount_match.group(1))
|
|
amount = Decimal(amount_str)
|
|
if amount > 0:
|
|
entry_key = (code, percent)
|
|
if entry_key not in seen_entries:
|
|
tva_entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
seen_entries.add(entry_key)
|
|
except (ValueError, InvalidOperation):
|
|
continue
|
|
|
|
# Pattern 4: Use TVA_PATTERNS for fallback
|
|
if not tva_entries:
|
|
for pattern, _ in self.TVA_PATTERNS:
|
|
match = re.search(pattern, normalized_text, re.IGNORECASE)
|
|
if match:
|
|
try:
|
|
# Some patterns have 2 groups (percent, amount), others just amount
|
|
if match.lastindex >= 2:
|
|
percent = int(match.group(1))
|
|
amount_str = match.group(2)
|
|
else:
|
|
amount_str = match.group(1)
|
|
# Try to detect percent from text
|
|
percent = self._detect_tva_percent(text)
|
|
|
|
amount_str = amount_str.replace(' ', '')
|
|
amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
|
|
amount = Decimal(amount_str)
|
|
if amount > 0 and percent:
|
|
code = self._get_tva_code_from_percent(percent)
|
|
entry_key = (code, percent)
|
|
if entry_key not in seen_entries:
|
|
tva_entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
seen_entries.add(entry_key)
|
|
break # Only use first match from fallback
|
|
except (ValueError, InvalidOperation):
|
|
continue
|
|
|
|
# Add existing extraction results to candidates (if any)
|
|
if tva_entries:
|
|
entries_sum = sum(entry['amount'] for entry in tva_entries)
|
|
all_candidates.append(('standard', 0.90, tva_entries, entries_sum))
|
|
print(f"[TVA Debug] Standard patterns: {len(tva_entries)} entries, sum={entries_sum}", flush=True)
|
|
|
|
# === CANDIDATE SELECTION ===
|
|
# Select best candidate using TOTAL TVA BON as authoritative reference
|
|
if all_candidates:
|
|
best_entries, best_sum, confidence = self._select_best_tva_candidate(all_candidates, tva_bon_total)
|
|
if best_entries:
|
|
tva_entries = best_entries
|
|
entries_sum = best_sum
|
|
|
|
# Calculate sum from entries (if not set by candidate selection)
|
|
entries_sum = None
|
|
if tva_entries:
|
|
entries_sum = sum((entry['amount'] for entry in tva_entries), Decimal(0))
|
|
|
|
# Validate and correct TVA values
|
|
tva_entries, tva_total = self._validate_and_correct_tva(
|
|
tva_entries, entries_sum, tva_bon_total
|
|
)
|
|
|
|
# Sort by code (A, B, C, D)
|
|
tva_entries.sort(key=lambda x: x.get('code', 'Z'))
|
|
|
|
return tva_entries, tva_total, confidence if tva_entries else 0.0
|
|
|
|
def _get_tva_code_from_percent(self, percent: int) -> str:
|
|
"""Map TVA percentage to standard Romanian code.
|
|
|
|
Romanian TVA rates changed in August 2025:
|
|
- Standard rate: 19% → 21%
|
|
- Reduced rate: 9% → 11%
|
|
- Other rates (5%, 0%) remain unchanged
|
|
|
|
Old rates (before Aug 2025): New rates (from Aug 2025):
|
|
- A = 19% (standard) - A = 21% (standard)
|
|
- B = 9% (reduced) - B = 11% (reduced)
|
|
- C = 5% (reduced) - C = 5% (reduced)
|
|
- D = 0% (exempt) - D = 0% (exempt)
|
|
|
|
Both old and new rates are supported for historical receipts.
|
|
"""
|
|
if percent in (19, 21):
|
|
return 'A' # Standard rate (19% old, 21% new from Aug 2025)
|
|
elif percent in (9, 11):
|
|
return 'B' # Reduced rate (9% old, 11% new from Aug 2025)
|
|
elif percent == 5:
|
|
return 'C' # Reduced rate (unchanged)
|
|
elif percent == 0:
|
|
return 'D' # Exempt (unchanged)
|
|
else:
|
|
return 'A' # Default to standard rate
|
|
|
|
def _extract_total_tva_bon(self, text: str) -> Optional[Decimal]:
|
|
"""
|
|
Extract TOTAL TVA BON value separately as the reference.
|
|
This is the authoritative total TVA on the receipt.
|
|
|
|
Handles OCR variations: TOTAL TVA BON, OTAL TUA BON, TOTAL IVA BON, etc.
|
|
"""
|
|
# Pattern for TOTAL TVA BON or TOTAL IVA BON with amount after
|
|
# OCR corruptions: TUAL (TVA+L merged), TVAL, TUAI, etc.
|
|
patterns = [
|
|
# Standard: TOTAL TVA BON: 14.92 or TOTAL IVA BON: 14.92
|
|
# Handles: TUAL (TVA+L), TVAL, TUAI, etc. with optional trailing letters
|
|
r'T?OTAL\s+(?:T[VU][AR]L?|TU[AR]L|IVA)\s+BON\s*:?\s*([\d]+[.,]\d{2})\b',
|
|
# Amount before: 14.92 OTAL TUA BON (OCR line break)
|
|
r'([\d]+[.,]\d{2})\s*\n?\s*T?OTAL\s+(?:T[VU][AR]L?|TU[AR]L|IVA)\s+BON',
|
|
# Amount on next line after TOTAL TVA BON or TOTAL IVA BON
|
|
r'T?OTAL\s+(?:T[VU][AR]L?|TU[AR]L|IVA)\s+BON\s*\n\s*([\d]+[.,]\d{2})\b',
|
|
]
|
|
|
|
for pattern in patterns:
|
|
match = re.search(pattern, text, re.IGNORECASE)
|
|
if match:
|
|
try:
|
|
amount_str = self._normalize_number(match.group(1))
|
|
amount = Decimal(amount_str)
|
|
if amount > 0:
|
|
return amount
|
|
except (InvalidOperation, ValueError):
|
|
continue
|
|
|
|
return None
|
|
|
|
def _validate_and_correct_tva(
|
|
self,
|
|
tva_entries: List[dict],
|
|
entries_sum: Optional[Decimal],
|
|
tva_bon_total: Optional[Decimal]
|
|
) -> Tuple[List[dict], Optional[Decimal]]:
|
|
"""
|
|
Validate and correct TVA values.
|
|
|
|
Rules:
|
|
1. TVA cannot be greater than TOTAL amount (will be validated at higher level)
|
|
2. Sum of TVA A + TVA B + ... should equal TOTAL TVA BON
|
|
3. If single entry and sum != tva_bon_total, use tva_bon_total
|
|
4. Detect and fix OCR concatenation errors (e.g., 14.921492 from 14.92 + 14.92)
|
|
"""
|
|
if not tva_entries:
|
|
return tva_entries, tva_bon_total
|
|
|
|
# Check for OCR concatenation errors in individual entries
|
|
# Pattern: X.XX followed by another decimal (e.g., 14.921492 from 14.92 + 14.92)
|
|
corrected_entries = []
|
|
for entry in tva_entries:
|
|
amount = entry['amount']
|
|
amount_str = str(amount)
|
|
|
|
# Check if amount looks like concatenated decimals
|
|
# e.g., 14.921492 could be 14.92 + 14.92 incorrectly joined
|
|
# or 32.3132.31 from 32.31 + 32.31
|
|
if len(amount_str) > 6 and '.' in amount_str:
|
|
int_part, dec_part = amount_str.split('.')
|
|
|
|
# If decimal part > 2 digits, it's likely concatenation
|
|
if len(dec_part) > 2:
|
|
# Try to extract the first valid decimal amount
|
|
# e.g., from 14.921492, extract 14.92
|
|
try:
|
|
corrected_amount = Decimal(f"{int_part}.{dec_part[:2]}")
|
|
print(f"[TVA Validation] Corrected concatenation error: {amount} -> {corrected_amount}", flush=True)
|
|
entry['amount'] = corrected_amount
|
|
except InvalidOperation:
|
|
pass
|
|
|
|
corrected_entries.append(entry)
|
|
|
|
tva_entries = corrected_entries
|
|
|
|
# Recalculate sum after corrections
|
|
entries_sum = sum((entry['amount'] for entry in tva_entries), Decimal(0)) if tva_entries else None
|
|
|
|
# Validate sum against TOTAL TVA BON
|
|
if tva_bon_total and entries_sum:
|
|
# Allow small tolerance for rounding (0.02)
|
|
tolerance = Decimal('0.02')
|
|
difference = abs(entries_sum - tva_bon_total)
|
|
|
|
if difference > tolerance:
|
|
print(f"[TVA Validation] Sum mismatch: entries_sum={entries_sum}, tva_bon_total={tva_bon_total}", flush=True)
|
|
|
|
# If single entry and sum doesn't match, use TOTAL TVA BON as reference
|
|
if len(tva_entries) == 1:
|
|
print(f"[TVA Validation] Single entry - using TOTAL TVA BON as reference: {tva_bon_total}", flush=True)
|
|
tva_entries[0]['amount'] = tva_bon_total
|
|
entries_sum = tva_bon_total
|
|
# If multiple entries and sum > tva_bon_total, likely double counting
|
|
elif entries_sum > tva_bon_total:
|
|
# Check if one entry is the duplicate of another
|
|
amounts = [e['amount'] for e in tva_entries]
|
|
unique_amounts = set(amounts)
|
|
if len(unique_amounts) < len(amounts):
|
|
# Duplicate detected - likely TOTAL TVA BON counted as separate entry
|
|
print(f"[TVA Validation] Duplicate TVA detected, removing duplicates", flush=True)
|
|
# Keep only unique entries
|
|
seen = set()
|
|
unique_entries = []
|
|
for entry in tva_entries:
|
|
key = (entry.get('code'), entry['amount'])
|
|
if key not in seen:
|
|
seen.add(key)
|
|
unique_entries.append(entry)
|
|
tva_entries = unique_entries
|
|
entries_sum = sum((e['amount'] for e in tva_entries), Decimal(0))
|
|
|
|
# Final total
|
|
tva_total = entries_sum if entries_sum else tva_bon_total
|
|
|
|
return tva_entries, tva_total
|
|
|
|
def _detect_tva_percent(self, text: str) -> Optional[int]:
|
|
"""Detect TVA percentage from text content.
|
|
|
|
IMPORTANT: Prioritize rates found near TVA markers over rates found elsewhere.
|
|
E.g., "REDUCERE 5%" should not override "TVA A 19%".
|
|
Also handle OCR corruptions like "194" for "19%" in "TOTAL TA F 194".
|
|
"""
|
|
import re as regex
|
|
|
|
# First, look for percent NEAR TVA markers (most reliable)
|
|
# This handles "TVA A 19%", "TVA 19,00%", "TOTAL TVA 19%"
|
|
tva_context_patterns = [
|
|
r'T[VU][AR]\s*[A-D]?\s*[-:]?\s*(19|21|11|9|5)[.,]?\s*\d{0,2}\s*%',
|
|
r'IVA\s*[A-D]?\s*[-:]?\s*(19|21|11|9|5)[.,]?\s*\d{0,2}\s*%',
|
|
# OCR corruption: "TOTAL TA F 194" where 194 = 19% (4 is artifact)
|
|
r'TOTAL\s+T[VA][AR]?\s*[F\s]?\s*(19|21)\d\b',
|
|
]
|
|
for pattern in tva_context_patterns:
|
|
match = regex.search(pattern, text, regex.IGNORECASE)
|
|
if match:
|
|
rate = int(match.group(1))
|
|
if rate in (19, 21, 11, 9, 5):
|
|
return rate
|
|
|
|
# Fallback: Look for common Romanian TVA percentages anywhere
|
|
# But EXCLUDE patterns near "REDUCERE", "DISCOUNT", "RED." (these are discounts, not TVA)
|
|
# Clean text by removing discount context
|
|
# Handle OCR corruptions: RED.CERE (C instead of U), RED CERE, REDUC, etc.
|
|
text_no_discount = regex.sub(r'(?:REDUC|DISCOUNT|RED)[.\sA-Z]*\d+[.,]?\d*\s*%', '', text, flags=regex.IGNORECASE)
|
|
|
|
# Now search in cleaned text (priority order: 19% > 21% > 11% > 9% > 5%)
|
|
if regex.search(r'\b19[.,]?\s*\d{0,2}\s*%', text_no_discount):
|
|
return 19
|
|
elif regex.search(r'\b21[.,]?\s*\d{0,2}\s*%', text_no_discount):
|
|
return 21
|
|
elif regex.search(r'\b11[.,]?\s*\d{0,2}\s*%', text_no_discount):
|
|
return 11
|
|
elif regex.search(r'\b9[.,]?\s*\d{0,2}\s*%', text_no_discount):
|
|
return 9
|
|
elif regex.search(r'\b5[.,]?\s*\d{0,2}\s*%', text_no_discount):
|
|
return 5
|
|
|
|
# Default: If no percent found but we're in Romanian receipt context,
|
|
# assume 19% (standard rate)
|
|
if regex.search(r'T[VU][AR]|IVA', text, regex.IGNORECASE):
|
|
return 19
|
|
|
|
return None
|
|
|
|
def _validate_tva_reverse(
|
|
self,
|
|
tva_entries: List[dict],
|
|
total_amount: Optional[Decimal]
|
|
) -> Tuple[bool, Optional[Decimal], str]:
|
|
"""
|
|
Reverse TVA validation: from TVA amount and rate, calculate expected total.
|
|
|
|
Formula (CORRECT):
|
|
For TVA that is INCLUDED in total (standard Romanian receipts):
|
|
total = base + tva
|
|
tva = base * rate/100
|
|
Therefore: base = tva * 100 / rate
|
|
And: total = base + tva = tva * 100 / rate + tva = tva * (100 + rate) / rate
|
|
|
|
Returns (is_valid, expected_total, message)
|
|
"""
|
|
if not tva_entries or not total_amount:
|
|
return True, None, "Insufficient data for reverse validation"
|
|
|
|
expected_total = Decimal('0')
|
|
for entry in tva_entries:
|
|
tva_amount = entry['amount']
|
|
rate = Decimal(str(entry['percent']))
|
|
|
|
print(f"[TVA Debug] Entry: amount={tva_amount}, rate={rate}%", flush=True)
|
|
|
|
if rate > 0:
|
|
# CORRECT formula: total = tva * (100 + rate) / rate
|
|
# Example: tva=55.22, rate=21 → total = 55.22 * 121 / 21 = 318.16
|
|
gross_for_entry = tva_amount * (Decimal('100') + rate) / rate
|
|
expected_total += gross_for_entry
|
|
print(f"[TVA Debug] Calculated gross: {gross_for_entry}", flush=True)
|
|
else:
|
|
# 0% TVA - can't calculate base, skip
|
|
pass
|
|
|
|
if expected_total == 0:
|
|
return True, None, "Cannot calculate expected total (0% TVA only)"
|
|
|
|
# Tolerance: max(0.50 RON, 1% of total)
|
|
tolerance = max(Decimal('0.50'), total_amount * Decimal('0.01'))
|
|
difference = abs(expected_total - total_amount)
|
|
|
|
if difference <= tolerance:
|
|
return True, expected_total, f"TVA reverse validation passed (expected: {expected_total}, actual: {total_amount}, diff: {difference})"
|
|
else:
|
|
return False, expected_total, f"TVA reverse validation WARNING: expected {expected_total}, actual {total_amount}, diff {difference}"
|
|
|
|
def _extract_items_count(self, text: str) -> Optional[int]:
|
|
"""Extract number of items/articles from receipt."""
|
|
for pattern, _ in self.ITEMS_COUNT_PATTERNS:
|
|
match = re.search(pattern, text, re.IGNORECASE)
|
|
if match:
|
|
try:
|
|
count = int(match.group(1))
|
|
if 0 < count < 1000: # Reasonable range
|
|
return count
|
|
except ValueError:
|
|
continue
|
|
return None
|
|
|
|
def _extract_address(self, text: str) -> Optional[str]:
|
|
"""Extract vendor address from text."""
|
|
lines = text.split('\n')
|
|
address_parts = []
|
|
|
|
for line in lines[:15]: # Check first 15 lines
|
|
line = line.strip()
|
|
if not line:
|
|
continue
|
|
|
|
# Check for address patterns
|
|
line_upper = line.upper()
|
|
|
|
# JUD. (county) pattern
|
|
if re.search(r'\bJUD\.?\s+', line_upper):
|
|
address_parts.append(line)
|
|
continue
|
|
|
|
# STR. (street) pattern
|
|
if re.search(r'\bSTR\.?\s+', line_upper):
|
|
address_parts.append(line)
|
|
continue
|
|
|
|
# MUN./OR./COM. (city/town) pattern
|
|
if re.search(r'\b(MUN|OR|COM)\.?\s+', line_upper):
|
|
address_parts.append(line)
|
|
continue
|
|
|
|
if address_parts:
|
|
# Join and clean address parts
|
|
address = ', '.join(address_parts)
|
|
# Clean up
|
|
address = re.sub(r'\s+', ' ', address).strip()
|
|
address = re.sub(r',\s*,', ',', address)
|
|
return address if len(address) >= 5 else None
|
|
|
|
return None
|
|
|
|
def _extract_payment_methods(self, text: str) -> Tuple[List[dict], float]:
|
|
"""
|
|
Extract payment methods (CARD/NUMERAR) from receipt.
|
|
These appear after TOTAL LEI and before TOTAL TVA section.
|
|
|
|
Returns tuple of: (list of {'method': 'CARD'/'NUMERAR', 'amount': Decimal}, confidence)
|
|
"""
|
|
payment_methods = []
|
|
seen_methods = set()
|
|
max_confidence = 0.0
|
|
|
|
# Normalize spaces in numbers
|
|
normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text)
|
|
|
|
# Find the region between TOTAL LEI and TOTAL TVA
|
|
total_lei_match = re.search(r'TOTAL\s+LEI\s*([\d\s.,]+)', normalized_text, re.IGNORECASE)
|
|
total_tva_match = re.search(r'TOTAL\s+(?:T[VU][AR]|IVA)', normalized_text, re.IGNORECASE)
|
|
|
|
# Define search region (after TOTAL LEI, before TOTAL TVA if exists)
|
|
if total_lei_match:
|
|
start_pos = total_lei_match.end()
|
|
end_pos = total_tva_match.start() if total_tva_match else len(normalized_text)
|
|
search_region = normalized_text[start_pos:end_pos]
|
|
else:
|
|
search_region = normalized_text # Fallback to full text
|
|
|
|
for pattern, method, confidence in self.PAYMENT_METHOD_PATTERNS:
|
|
for match in re.finditer(pattern, search_region, re.IGNORECASE | re.MULTILINE):
|
|
try:
|
|
amount_str = match.group(1).replace(' ', '')
|
|
amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
|
|
amount = Decimal(amount_str)
|
|
# Validate: amount must be positive and reasonable (< MAX_REASONABLE_PAYMENT)
|
|
# This prevents OCR errors like CUI being parsed as payment
|
|
if amount > 0 and amount < self.MAX_REASONABLE_PAYMENT and method not in seen_methods:
|
|
payment_methods.append({
|
|
'method': method,
|
|
'amount': amount
|
|
})
|
|
seen_methods.add(method)
|
|
if confidence > max_confidence:
|
|
max_confidence = confidence
|
|
print(f"[Payment] Found {method}: {amount} (pattern matched, conf={confidence})", flush=True)
|
|
elif amount >= self.MAX_REASONABLE_PAYMENT:
|
|
print(f"[Payment] Rejected unreasonable amount {amount} for {method} (likely OCR error)", flush=True)
|
|
except (InvalidOperation, ValueError):
|
|
continue
|
|
|
|
return payment_methods, max_confidence if payment_methods else 0.0
|
|
|
|
def _validate_payment_methods(
|
|
self, payment_methods: List[dict], total: Optional[Decimal]
|
|
) -> List[dict]:
|
|
"""
|
|
Validate payment methods against extracted total.
|
|
|
|
If payment sum is way larger than total (>10x), it's likely an OCR error
|
|
(e.g., CUI number parsed as payment amount). Clear invalid payments.
|
|
|
|
Args:
|
|
payment_methods: List of {'method': str, 'amount': Decimal}
|
|
total: Extracted total amount
|
|
|
|
Returns:
|
|
Validated payment methods (may be empty if all were invalid)
|
|
"""
|
|
if not total or not payment_methods:
|
|
return payment_methods
|
|
|
|
payment_sum = sum(pm.get('amount', Decimal('0')) for pm in payment_methods)
|
|
|
|
# If payment sum > 10x total, it's definitely an error
|
|
if payment_sum > total * 10:
|
|
print(f"[Payment Validation] Payment sum {payment_sum} >> Total {total} (>10x), clearing invalid payments", flush=True)
|
|
return []
|
|
|
|
# If payment sum > 2x total, it's suspicious but might be valid in some edge cases
|
|
# Just log a warning
|
|
if payment_sum > total * 2:
|
|
print(f"[Payment Validation] Warning: Payment sum {payment_sum} > 2x Total {total}, possible OCR error", flush=True)
|
|
|
|
return payment_methods
|
|
|
|
def _extract_client_data(
|
|
self, text_upper: str, original_text: str
|
|
) -> Tuple[Optional[str], Optional[str], Optional[str], float]:
|
|
"""
|
|
Extract client/buyer data from B2B receipts.
|
|
|
|
Returns (client_name, client_cui, client_address, confidence)
|
|
"""
|
|
client_name = None
|
|
client_cui = None
|
|
client_address = None
|
|
confidence = 0.0
|
|
|
|
# Step 1: Find CLIENT section marker
|
|
client_section_start = None
|
|
for marker in self.CLIENT_SECTION_MARKERS:
|
|
match = re.search(marker, text_upper, re.IGNORECASE)
|
|
if match:
|
|
client_section_start = match.start()
|
|
break
|
|
|
|
if client_section_start is None:
|
|
# No client section found
|
|
return None, None, None, 0.0
|
|
|
|
# Step 2: Extract client CUI
|
|
for pattern, conf in self.CLIENT_CUI_PATTERNS:
|
|
match = re.search(pattern, text_upper, re.IGNORECASE | re.MULTILINE)
|
|
if match:
|
|
cui = match.group(1)
|
|
if 6 <= len(cui) <= 10:
|
|
client_cui = cui
|
|
confidence = max(confidence, conf)
|
|
break
|
|
|
|
# Step 3: Extract client name from CLIENT section
|
|
# Look for company name after CLIENT: marker
|
|
lines = original_text.split('\n')
|
|
for i, line in enumerate(lines):
|
|
line_upper = line.upper().strip()
|
|
|
|
# Check if this line contains CLIENT marker
|
|
if any(re.search(marker, line_upper) for marker in self.CLIENT_SECTION_MARKERS):
|
|
# Check if name is on same line after ":"
|
|
if ':' in line:
|
|
name_part = line.split(':', 1)[1].strip()
|
|
if name_part and len(name_part) >= 3:
|
|
# Skip if it looks like a CUI (R/RO followed by digits)
|
|
if re.match(r'^R[O0]?\d{6,10}$', name_part.upper()):
|
|
# This is a CUI, not a name - extract it if not already found
|
|
if not client_cui:
|
|
cui_digits = re.sub(r'[^0-9]', '', name_part)
|
|
if 6 <= len(cui_digits) <= 10:
|
|
client_cui = cui_digits
|
|
confidence = max(confidence, 0.90)
|
|
continue
|
|
# Check for company indicators
|
|
if any(re.search(ind, name_part.upper()) for ind in self.VENDOR_INDICATORS):
|
|
client_name = self._clean_vendor_name(name_part)
|
|
confidence = max(confidence, 0.95)
|
|
break
|
|
elif len(name_part) >= 5 and not name_part.isdigit():
|
|
client_name = self._clean_vendor_name(name_part)
|
|
confidence = max(confidence, 0.80)
|
|
break
|
|
|
|
# Check next line for company name
|
|
if i + 1 < len(lines):
|
|
next_line = lines[i + 1].strip()
|
|
next_upper = next_line.upper()
|
|
|
|
# Skip if it's a CUI/CIF line
|
|
if not re.search(r'C\.?\s*[UI]\.?\s*[IF]\.?', next_upper):
|
|
if any(re.search(ind, next_upper) for ind in self.VENDOR_INDICATORS):
|
|
client_name = self._clean_vendor_name(next_line)
|
|
confidence = max(confidence, 0.90)
|
|
break
|
|
elif len(next_line) >= 5 and not next_line.isdigit():
|
|
# Check if it looks like a company name
|
|
if not any(kw in next_upper for kw in ['CUI', 'CIF', 'COD', 'FISCAL']):
|
|
client_name = self._clean_vendor_name(next_line)
|
|
confidence = max(confidence, 0.75)
|
|
break
|
|
|
|
# Step 4: Extract client address (if present after client section)
|
|
if client_section_start:
|
|
# Look for address patterns after client section
|
|
client_region = text_upper[client_section_start:client_section_start + 500]
|
|
for pattern, _ in self.ADDRESS_PATTERNS:
|
|
match = re.search(pattern, client_region)
|
|
if match:
|
|
client_address = match.group(1).strip()
|
|
break
|
|
|
|
# Log extraction result
|
|
if client_cui or client_name:
|
|
print(f"[Client Extraction] Found: name={client_name}, cui={client_cui}, conf={confidence}", flush=True)
|
|
|
|
return client_name, client_cui, client_address, confidence
|