Files
roa2web-service-auto/backend/modules/data_entry/services/ocr_extractor.py
Marius Mutu c5e051ad80 feat: Migrate to ultrathin monolith architecture
Consolidate 3 separate applications (reports-app, data-entry-app, telegram-bot) into a unified
architecture with single backend and frontend:

Backend Changes:
- Unified FastAPI backend at backend/ with modular structure
- Modules: reports, data_entry, telegram in backend/modules/
- Centralized config.py and main.py with all routers registered
- Single worker mode (--workers 1) for Telegram bot compatibility
- Shared Oracle connection pool and JWT authentication
- Unified requirements.txt and environment configuration

Frontend Changes:
- Single Vue.js SPA with module-based routing
- Unified frontend at src/ with modules in src/modules/{reports,data-entry}/
- Shared components and stores in src/shared/
- Error boundaries for module isolation
- Dual API proxy in Vite for module communication

Infrastructure:
- New unified startup scripts: start-prod.sh, start-test.sh, start-backend.sh
- Environment templates: .env.dev.example, .env.test.example, .env.prod.example
- Updated deployment scripts for Windows IIS
- Simplified SSH tunnel management

Documentation:
- Comprehensive CLAUDE.md with architecture overview
- Module-specific docs in docs/{data-entry,telegram}/
- Architecture decision records in docs/ARCHITECTURE-DECISIONS.md
- Deployment guides consolidated in deployment/windows/docs/

This migration reduces complexity, improves maintainability, and enables easier
deployment while maintaining all existing functionality.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-29 23:48:14 +02:00

1502 lines
68 KiB
Python

"""Extract structured fields from OCR text (Romanian receipts)."""
import re
from datetime import date, datetime
from decimal import Decimal, InvalidOperation
from typing import Optional, Tuple, List
from dataclasses import dataclass, field
@dataclass
class ExtractionResult:
    """Container for every field pulled out of one OCR'd receipt."""

    # --- Core receipt fields ---
    receipt_type: str = 'bon_fiscal'
    receipt_number: Optional[str] = None
    receipt_series: Optional[str] = None
    receipt_date: Optional[date] = None
    amount: Optional[Decimal] = None
    partner_name: Optional[str] = None
    cui: Optional[str] = None
    description: Optional[str] = None

    # --- Additional fields; several TVA entries may be present ---
    tva_entries: List[dict] = field(default_factory=list)  # [{code, percent, amount}]
    tva_total: Optional[Decimal] = None
    address: Optional[str] = None
    items_count: Optional[int] = None
    payment_methods: List[dict] = field(default_factory=list)  # [{"method":"CARD","amount":Decimal}]

    # --- Buyer information (B2B receipts) ---
    client_name: Optional[str] = None
    client_cui: Optional[str] = None
    client_address: Optional[str] = None

    # --- Per-field confidence scores ---
    confidence_amount: float = 0.0
    confidence_date: float = 0.0
    confidence_vendor: float = 0.0
    confidence_client: float = 0.0

    # --- Provenance ---
    raw_text: str = ""
    ocr_engine: str = ""  # OCR engine used: paddleocr or tesseract
    processing_time_ms: int = 0  # Processing time in milliseconds

    @property
    def overall_confidence(self) -> float:
        """Weighted mean of the amount (40%), date (30%) and vendor (30%) scores.

        The client confidence is not part of the score. The result is rounded
        to two decimals.
        """
        score = self.confidence_amount * 0.4
        score = score + self.confidence_date * 0.3
        score = score + self.confidence_vendor * 0.3
        return round(score, 2)
class ReceiptExtractor:
"""Extract receipt fields using pattern matching for Romanian receipts."""
# Total amount patterns (most specific first)
# Romanian receipts use various formats: TOTAL LEI, TOTAL:, TOTAL RON, etc.
# OCR often produces errors, so patterns must be tolerant
TOTAL_PATTERNS = [
# Most common: TOTAL LEI followed by amount (with OCR-tolerant variations)
# Handles: TOTAL LEI, TOTAL. LE!, T0TAL LEI, TOTAL LE1, etc.
(r'T[O0]TAL[.\s]+L[E3][I1!]\s*:?\s*([\d\s.,]+)', 0.98), # OCR-tolerant: TOTAL. LE!, T0TAL LEI
(r'TOTAL\s+LEI\s*([\d\s.,]+)', 0.98), # Standard clean pattern
(r'[OT]?OTAL\s+LEI\s*([\d\s.,]+)', 0.95), # OCR may miss first letter
# Standard patterns
(r'TOTAL\s*:?\s*([\d\s.,]+)\s*(?:RON|LEI)?', 0.95),
(r'TOTAL\s+(?:RON|LEI)\s*([\d\s.,]+)', 0.95),
# SUBTOTAL when TOTAL not found
(r'SUBTOTAL\s*([\d\s.,]+)', 0.90),
(r'[SB]?UBTOTAL\s*([\d\s.,]+)', 0.88), # OCR variations
# Payment methods
(r'DE\s+PLATA\s*:?\s*([\d\s.,]+)', 0.90),
(r'SUMA\s*:?\s*([\d\s.,]+)', 0.85),
(r'PLATA\s+CARD\s*:?\s*([\d\s.,]+)', 0.85),
(r'NUMERAR\s*:?\s*([\d\s.,]+)', 0.80),
(r'REST\s*:?\s*([\d\s.,]+)', 0.70), # Sometimes total is near REST
]
# Fallback: Find the largest repeated amount (likely the total)
# This handles cases where OCR doesn't capture "TOTAL" keyword
# Date patterns - support dash, dot, and slash separators
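# OCR may produce DRTA instead of DATA; other corruptions of the label (e.g. DAIA) are not matched by the label patterns and fall through to the unlabeled date patterns below.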
# OCR may also add spaces/commas in dates: "27. 10, 2025" instead of "27.10.2025"
DATE_PATTERNS = [
# DATA/DRTA label followed by DD-MM-YYYY (separators: dash, dot or slash)
(r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
(r'DATA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
# Date followed by ORA (time) - OCR may produce 0RA
(r'(\d{2}[-./]\d{2}[-./]\d{4})\s+[O0]RA\s*:?\s*\d{2}:\d{2}', 0.95),
# Date followed by time without ORA keyword
(r'(\d{2}[-./]\d{2}[-./]\d{4})\s+\d{2}:\d{2}', 0.90),
# Standalone date
(r'(\d{2}[-./]\d{2}[-./]\d{4})', 0.80),
# YYYY-MM-DD format (less common)
(r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75),
]
# OCR-corrupted date patterns with spaces/commas
# Handles: "27. 10, 2025", "27, 10. 2025", "2025. 08. 14", etc.
DATE_PATTERNS_OCR_SPACES = [
# YYYY. MM. DD format with spaces (OMV/Petrom receipts) - with time
(r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.92, 'ymd'),
# YYYY. MM. DD format with spaces (standalone)
(r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.85, 'ymd'),
# DD. MM, YYYY or DD, MM. YYYY (with time following)
(r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
# DD. MM, YYYY or DD, MM. YYYY (standalone)
(r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
]
# Receipt number patterns - Romanian fiscal receipt formats
# OCR may produce N instead of : or other errors
NUMBER_PATTERNS = [
# NDS format (common in Romanian POS)
(r'NDS\s*:?\s*(\d+)', 0.98),
# C3POS terminal format - OCR may have N instead of : (C3POS-CT2N1360760)
(r'C3POS[-A-Z0-9]*[N:](\d{6,7})', 0.98), # CT2N1360760 format
(r'C3POS.*?(\d{6,7})\b', 0.95), # Any C3POS followed by 6-7 digit number
(r'CT2[N:]\s*(\d{6,})', 0.95), # CT2N prefix
# BF (Bon Fiscal) number
(r'BF\s*:?\s*(\d+)', 0.93),
# NIVS format
(r'NIVS\s*:?\s*(\d+)', 0.95),
# Standard NR BON formats
(r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95),
(r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95),
(r'CHITANTA\s+NR\.?\s*:?\s*(\d+)', 0.95),
# Document number
(r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90),
# ID BF format
(r'ID\s*BF\s*:?\s*(\d+)', 0.90),
# TD format (transaction ID)
(r'TD\s*:?\s*(\d+)', 0.85),
# 6-8 digit number (typical receipt number length)
(r'\b(\d{6,8})\b', 0.70),
# Generic long number at end (fallback)
(r'NR\.?\s*:?\s*(\d{4,})', 0.65),
]
# CUI (fiscal code) patterns - IMPORTANT: exclude CLIENT CUI
# CIF = Cod de Identificare Fiscală (vendor's tax ID)
# CLIENT C.U.I. = client's tax ID (should be ignored)
# OCR errors: R0 instead of RO, C1F instead of CIF
CUI_PATTERNS = [
# CIF at start of line (definitely vendor) - tolerant to OCR errors
(r'^CIF\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
(r'^C[I1]F\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95), # C1F OCR error
# CIF not preceded by CLIENT (negative lookbehind)
(r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
# Standalone CIF: format with OCR tolerance
(r'\bC[I1]F\s*:?\s*(?:R[O0])?(\d{6,10})\b', 0.90),
# COD FISCAL (vendor)
(r'COD\s+FISCAL\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
# C. I. F. format with SPACES (OCR artifact) - "C. I. F. : R011201891"
(r'C\.\s*I\.\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.92),
# C.I.F. format (with dots, no spaces)
(r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.88),
# CUI format (less specific, use with caution)
(r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.85),
]
# Pattern for CIF NUMBER appearing BEFORE "C.I.F." label (reversed format)
# Common in some receipts: "R011201891\nC. I. F." - number on line before label
CUI_REVERSED_PATTERNS = [
# RO + 8-10 digits on line immediately before C.I.F./CIF label
(r'(?:R[O0])(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.98),
# Just digits before C.I.F. label
(r'(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.95),
]
# Series patterns - be strict to avoid false matches
SERIES_PATTERNS = [
(r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90),
# Z: format from Romanian fiscal receipts (must be at start of line or after space)
(r'(?:^|\s)Z\s*:\s*(\d{4})', 0.85),
# BF series with explicit marker
(r'(?:^|\s)BF\s*:\s*(\d{4})', 0.85),
]
# TVA (VAT) patterns - OCR may produce TUA, TVR, etc.
TVA_PATTERNS = [
# TOTAL TVA BON format (OCR tolerant: TUA, TVR)
(r'TOTAL\s+T[VU][AR]\s+BON\s*:?\s*([\d\s.,]+)', 0.98),
(r'T[O0]TAL\s+T[VU][AR]\s*:?\s*([\d\s.,]+)', 0.95),
# TVA with percentage (OCR tolerant)
(r'T[VU][AR]\s+(?:A\s*[-:]?\s*)?(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.95),
(r'T[VU][AR]\s+[A-Z]\s*[-:]\s*(\d{1,2})\s*%\s*([\d\s.,]+)', 0.93),
# Simple TVA pattern
(r'T[VU][AR]\s*:?\s*([\d\s.,]+)', 0.85),
# Standalone percentage line near TVA
(r'(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.75),
]
# Payment method patterns - appears after TOTAL LEI, before TOTAL TVA
# Format: "CARD: 50.00" or "NUMERAR 100.00" or "PLATA CARD: 50.00"
PAYMENT_METHOD_PATTERNS = [
# CARD with amount (high confidence)
(r'(?:PLATA\s+)?CARD\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
# NUMERAR (cash) with amount
(r'NUMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.95),
# CASH alternative spelling
(r'CASH\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.90),
# Truncation recovery patterns (for OCR left-margin truncation issues)
# "RD" = truncated "CARD" (only 2 chars visible)
(r'\bRD\s*:?\s*([\d\s.,]+)', 'CARD', 0.70),
# "ARD" = truncated "CARD" (3 chars visible)
(r'\bARD\s*:?\s*([\d\s.,]+)', 'CARD', 0.75),
# "MERAR" = truncated "NUMERAR"
(r'\bMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.70),
]
# Items count patterns - OCR may produce OZ instead of POZ, etc.
# Number may be on separate line before or after the label
# IMPORTANT: Must be specific to avoid matching product quantities like "50BUC"
ITEMS_COUNT_PATTERNS = [
# NR. POZ. ART. IN BON: 17 (Romanian format with dots and spaces)
# OCR tolerant: OZ instead of POZ and 0 instead of O; ART itself must be read correctly (ARI is not matched)
(r'NR\.?\s*P?[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*(\d+)', 0.98),
# Number on line BEFORE "OZ. ART. IN BON:" - OCR sometimes reorders
(r'(\d{1,2})\s*\n\s*[O0]Z\.?\s*ART', 0.95),
# Number may be on next line after label
(r'[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.93),
(r'NR\.?\s*(?:P?[O0]Z\.?)?\s*ART(?:ICOLE)?\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.90),
# Simpler patterns - but more specific
(r'ARTIC[O0]LE\s*:?\s*(\d+)', 0.88),
# POZ at start of line or after colon (not in product descriptions)
(r'(?:^|\s|:)P?[O0]Z\.?\s*(?:ART)?\.?\s*(?:IN\s+BON)?\s*:?\s*(\d{1,3})(?:\s|$)', 0.85),
]
# Address patterns (Romanian format)
ADDRESS_PATTERNS = [
# Street patterns
(r'(STR\.?\s+[A-Z0-9\s.,]+(?:NR\.?\s*\d+)?)', 0.90),
# Full address with JUD (county)
(r'(JUD\.?\s+[A-Z]+,?\s*(?:MUN\.?|OR\.?|COM\.?)?\s*[A-Z]+)', 0.85),
]
# Client/Buyer patterns (for B2B receipts)
# CLIENT, CUMPARATOR, BENEFICIAR sections
CLIENT_SECTION_MARKERS = [
r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:', # CIF CLIENT: (reversed format)
r'C\.?\s*U\.?\s*I\.?\s+CLIENT\s*:', # CUI CLIENT: (reversed format)
r'CLIENT\s*:',
r'CUMPARATOR\s*:',
r'BENEFICIAR\s*:',
r'CUMP[AĂ]R[AĂ]TOR\s*:',
r'DATE\s+CLIENT',
r'LIENT\s*:', # OCR truncation
]
# Client CUI patterns (explicitly after CLIENT marker)
CLIENT_CUI_PATTERNS = [
# CIF CLIENT: R01879856 (reversed format - CIF before CLIENT)
(r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
(r'C\.?\s*U\.?\s*I\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
(r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
(r'C\.?\s*U\.?\s*I\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
# CLIENT C.U.I./ C.I.F. :R01879855 (slash variant with both labels)
(r'CLIENT\s+C\.\s*U\.\s*I\.?\s*/\s*C\.\s*[I1]\.\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.97),
(r'CLIENT\s+C\.?\s*U\.?\s*I\.?(?:\s*/\s*C\.?\s*[I1]\.?\s*F\.?)?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
# CLIENT C.U.I. or CLIENT CUI or CLIENT CIF
(r'CLIENT\s+C\.?\s*U\.?\s*I\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
(r'CLIENT\s+C\.?\s*I\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
(r'CUMPARATOR\s+C\.?\s*U\.?\s*I\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
(r'CUMPARATOR\s+C\.?\s*I\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
# CUI/CIF on line immediately after CLIENT marker
(r'CLIENT\s*:\s*\n\s*C\.?\s*U\.?\s*I\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
(r'CLIENT\s*:\s*\n\s*C\.?\s*I\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
# CUI after client name: "CLIENT: COMPANY SRL\nCUI: 12345678"
(r'CLIENT\s*:.*\n.*C\.?\s*U\.?\s*I\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
]
# Vendor name indicators (lines containing these are likely vendor names)
# These should be company type suffixes, not generic words
# Patterns must handle OCR spaces: "S. R. L." as well as "S.R.L."
VENDOR_INDICATORS = [
r'\bS\.?\s*R\.?\s*L\.?\b', # S.R.L. or S. R. L.
r'\bS\.?\s*A\.?\b', # S.A. or S. A.
r'\bS\.?\s*N\.?\s*C\.?\b', # S.N.C. or S. N. C.
r'\bS\.?\s*C\.?\s*S\.?\b', # S.C.S. or S. C. S.
r'\bI\.?\s*I\.?\b', # I.I. or I. I.
r'\bP\.?\s*F\.?\s*A\.?\b', # P.F.A. or P. F. A.
# S.C. alone is too short and generic - only match if followed by company name
r'\bS\.?\s*C\.?\s+[A-Z]', # S.C. followed by company name
r'HOLDING',
r'COMPANY',
r'GROUP',
# Removed: MAGAZIN, MARKET, SHOP - too generic, match store welcome messages
]
def extract(self, text: str) -> ExtractionResult:
    """Run every field extractor over the OCR text and return the combined result.

    The raw text is stored on the result. Most extractors receive an
    upper-cased copy; vendor, CUI and client extraction also receive the
    original casing. Diagnostics are printed to stdout. As a last step the
    amount is cross-checked against payment methods and TVA and may be
    replaced by a calculated value.
    """
    upper_text = text.upper()
    result = ExtractionResult()
    result.raw_text = text

    # Core fields. Every extractor hands back (value, confidence); the
    # confidence is only kept for amount, date and vendor.
    result.amount, result.confidence_amount = self._extract_amount(upper_text)
    result.receipt_date, result.confidence_date = self._extract_date(upper_text)
    result.receipt_number, _ = self._extract_number(upper_text)
    result.receipt_series, _ = self._extract_series(upper_text)
    result.partner_name, result.confidence_vendor = self._extract_vendor(text)
    result.cui, _ = self._extract_cui(upper_text, text)

    # TVA entries (a receipt may carry several rates).
    result.tva_entries, result.tva_total = self._extract_tva_entries(upper_text)
    if not result.tva_entries:
        # Debug aid: show whether a "TOTAL TAXE" line is visible at all.
        print(f"[TVA Debug] No TVA found. Checking patterns...", flush=True)
        normalized = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', upper_text)
        taxe_match = re.search(r'T?OTAL\s+TAXE', normalized, re.IGNORECASE)
        rev_match = re.search(r'([\d.,]+)\s*T?OTAL\s+TAXE', normalized, re.IGNORECASE)
        print(f"[TVA Debug] 'OTAL TAXE' found: {bool(taxe_match)}, reversed: {rev_match.group(1) if rev_match else None}", flush=True)

    # Only log suspicious TVA/TOTAL combinations here. TVA is deliberately
    # NOT cleared: per the original note, correction is left to
    # ocr_service._final_validation.
    if result.tva_total and result.amount:
        if result.tva_total > result.amount:
            print(f"[TVA Extraction] TVA ({result.tva_total}) > TOTAL ({result.amount}) - will be corrected in final validation", flush=True)
        elif result.tva_total > result.amount * Decimal('0.5'):
            print(f"[TVA Extraction] Warning: TVA ({result.tva_total}) is > 50% of TOTAL ({result.amount}) - suspicious", flush=True)

    result.items_count = self._extract_items_count(upper_text)
    result.address = self._extract_address(upper_text)
    result.payment_methods = self._extract_payment_methods(upper_text)

    # Buyer data (B2B receipts).
    (
        result.client_name,
        result.client_cui,
        result.client_address,
        result.confidence_client,
    ) = self._extract_client_data(upper_text, text)

    result.receipt_type = self._detect_receipt_type(upper_text)

    # Reverse TVA validation: report only, nothing is changed.
    if result.tva_entries and result.amount:
        is_valid, expected_total, msg = self._validate_tva_reverse(result.tva_entries, result.amount)
        if not is_valid:
            print(f"[TVA Reverse Validation] {msg}", flush=True)

    # Cross-validate the amount against payment methods and TVA.
    validated_amount, validated_confidence, source = self._cross_validate_and_calculate_amount(
        result.amount,
        result.confidence_amount,
        result.payment_methods,
        result.tva_entries,
        result.tva_total,
    )
    if validated_amount != result.amount:
        print(f"[Cross-Validation] Amount updated: {result.amount} -> {validated_amount} (source: {source})", flush=True)
        result.amount = validated_amount
    result.confidence_amount = validated_confidence
    return result
def _extract_amount(self, text: str) -> Tuple[Optional[Decimal], float]:
"""Extract total amount from text."""
# First try standard patterns (TOTAL, SUBTOTAL, etc.)
for pattern, confidence in self.TOTAL_PATTERNS:
match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
if match:
try:
amount_str = re.sub(r'[^\d.,]', '', match.group(1))
amount_str = self._normalize_number(amount_str)
amount = Decimal(amount_str)
if amount > 0:
return amount, confidence
except (InvalidOperation, ValueError):
continue
# Strategy 2: Find amounts AFTER product lines end
# Products have pattern: "X BUC/ROLA X price = price"
# Total appears after all products
product_pattern = r'\d\s+(?:BUC|ROLA|ROLN|ROL)\s+X'
product_matches = list(re.finditer(product_pattern, text, re.IGNORECASE))
if product_matches:
# Get text after the last product line
last_product_pos = product_matches[-1].end()
after_products = text[last_product_pos:]
# Find standalone amounts on their own line after products
line_amount_pattern = r'^[\s]*(\d{2,4}[.,]\s*\d{2})[\s]*$'
standalone_amounts = []
for match in re.finditer(line_amount_pattern, after_products, re.MULTILINE):
try:
amount_str = match.group(1).replace(' ', '')
amount_str = self._normalize_number(amount_str)
amount = Decimal(amount_str)
if amount > 10: # Filter out small values
standalone_amounts.append(amount)
except (InvalidOperation, ValueError):
continue
if standalone_amounts:
# The largest standalone amount after products is likely the total
max_amount = max(standalone_amounts)
# Higher confidence if it appears multiple times
count = standalone_amounts.count(max_amount)
confidence = 0.85 if count >= 2 else 0.75
return max_amount, confidence
# Strategy 3: Find the most repeated large amount
# Normalize spaces in numbers (OCR may produce "186. 16")
normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text)
amount_pattern = r'(\d{2,4}[.,]\d{2})\b'
amounts = re.findall(amount_pattern, normalized_text)
if amounts:
from collections import Counter
amount_counts = Counter(amounts)
# Filter amounts that appear 2+ times and are > 20
candidates = []
for amt_str, count in amount_counts.items():
try:
amt = Decimal(self._normalize_number(amt_str))
if count >= 2 and amt > 20:
candidates.append((amt, count))
except (InvalidOperation, ValueError):
continue
if candidates:
# Return the LARGEST amount that appears multiple times
candidates.sort(key=lambda x: x[0], reverse=True)
return candidates[0][0], 0.65
# Last resort: Find any standalone large amount
line_amount_pattern = r'^[\s]*(\d{2,4}[.,]\s*\d{2})[\s]*$'
for match in re.finditer(line_amount_pattern, text, re.MULTILINE):
try:
amount_str = match.group(1).replace(' ', '')
amount_str = self._normalize_number(amount_str)
amount = Decimal(amount_str)
if amount > 50: # Higher threshold for fallback
return amount, 0.50
except (InvalidOperation, ValueError):
continue
return None, 0.0
def _normalize_number(self, num_str: str) -> str:
"""Normalize Romanian number format to standard decimal."""
# Remove spaces
num_str = num_str.replace(' ', '')
# Handle comma as decimal separator
if ',' in num_str and '.' in num_str:
# Romanian format: 1.234,56
num_str = num_str.replace('.', '').replace(',', '.')
elif ',' in num_str:
# Could be 1,50 or 1,234
parts = num_str.split(',')
if len(parts) == 2 and len(parts[1]) <= 2:
# Decimal comma: 1,50
num_str = num_str.replace(',', '.')
else:
# Thousands comma: 1,234
num_str = num_str.replace(',', '')
elif '.' in num_str:
parts = num_str.split('.')
if len(parts) > 2:
# Multiple dots: 1.234.567 -> 1234567
num_str = ''.join(parts[:-1]) + '.' + parts[-1]
return num_str
def _cross_validate_and_calculate_amount(
self,
amount: Optional[Decimal],
confidence_amount: float,
payment_methods: List[dict],
tva_entries: List[dict],
tva_total: Optional[Decimal]
) -> Tuple[Optional[Decimal], float, str]:
"""
Cross-validate and potentially calculate total from payment methods and TVA.
Returns: (amount, confidence, source_description)
Logic:
1. If amount is valid (>0) with high confidence (>=0.8), use it directly
2. Calculate payment_sum = CARD + NUMERAR + other methods
3. Calculate tva_implied_total = tva_total * (100 + rate) / rate
4. Cross-validate: if payment_sum matches extracted amount, boost confidence
5. If amount is 0/None, use payment_sum as total
6. If payment_sum is 0, try to calculate from TVA
"""
# Calculate payment methods sum
payment_sum = Decimal('0')
if payment_methods:
for pm in payment_methods:
try:
pm_amount = pm.get('amount')
if pm_amount:
payment_sum += Decimal(str(pm_amount))
except (InvalidOperation, ValueError, TypeError):
continue
# Calculate TVA-implied total: total = tva * (100 + rate) / rate
tva_implied_total = None
if tva_entries:
# Use the main TVA entry (typically the largest or first one)
main_entry = tva_entries[0]
rate = main_entry.get('percent', 19)
tva_amount = main_entry.get('amount')
if tva_amount and rate > 0:
try:
tva_dec = Decimal(str(tva_amount))
# total = tva * (100 + rate) / rate
tva_implied_total = (tva_dec * Decimal(100 + rate) / Decimal(rate)).quantize(Decimal('0.01'))
except (InvalidOperation, ValueError, TypeError):
pass
# Case 1: Amount is valid with high confidence - just validate
if amount and amount > 0 and confidence_amount >= 0.8:
# Cross-validate: check if it matches payment methods
if payment_sum > 0 and abs(amount - payment_sum) <= Decimal('0.02'):
# Perfect match - boost confidence
return amount, min(0.98, confidence_amount + 0.05), "extracted (validated by payment methods)"
return amount, confidence_amount, "extracted"
# Case 2: Amount exists but low confidence - try to validate/correct
if amount and amount > 0:
# Check if payment methods sum matches
if payment_sum > 0:
if abs(amount - payment_sum) <= Decimal('0.02'):
# Match - boost confidence
return amount, 0.90, "extracted (validated by payment methods)"
else:
# Mismatch - prefer payment_sum as it's more reliable
print(f"[Cross-Validation] Amount mismatch: extracted={amount}, payments={payment_sum}", flush=True)
return payment_sum, 0.85, "calculated from payment methods"
# Check TVA-implied total
if tva_implied_total:
if abs(amount - tva_implied_total) <= Decimal('0.50'):
# Close match - use extracted amount
return amount, 0.80, "extracted (validated by TVA)"
else:
print(f"[Cross-Validation] TVA mismatch: extracted={amount}, tva_implied={tva_implied_total}", flush=True)
# No validation possible - return as-is
return amount, confidence_amount, "extracted (unvalidated)"
# Case 3: Amount is 0 or None - calculate from payment methods
if payment_sum > 0:
print(f"[Cross-Validation] Amount not found, using payment sum: {payment_sum}", flush=True)
return payment_sum, 0.85, "calculated from payment methods"
# Case 4: Try TVA-implied total as last resort
if tva_implied_total and tva_implied_total > 0:
print(f"[Cross-Validation] Amount not found, using TVA-implied total: {tva_implied_total}", flush=True)
return tva_implied_total, 0.70, "calculated from TVA"
# Nothing worked - return original
return amount, confidence_amount, "not found"
def _extract_date(self, text: str) -> Tuple[Optional[date], float]:
"""Extract receipt date from text."""
# First try standard patterns (clean dates)
for pattern, confidence in self.DATE_PATTERNS:
match = re.search(pattern, text)
if match:
try:
# Normalize separators to dots
date_str = match.group(1).replace('/', '.').replace('-', '.')
# Try DD.MM.YYYY format first
try:
parsed = datetime.strptime(date_str, '%d.%m.%Y').date()
except ValueError:
# Try YYYY.MM.DD format
parsed = datetime.strptime(date_str, '%Y.%m.%d').date()
# Validate date range
today = date.today()
if parsed <= today and parsed.year >= 2020:
return parsed, confidence
except ValueError:
continue
# Then try OCR-corrupted patterns (dates with spaces/commas)
# Handles: "27. 10, 2025", "27, 10. 2025", "2025. 08. 14", etc.
for pattern, confidence, fmt in self.DATE_PATTERNS_OCR_SPACES:
match = re.search(pattern, text)
if match:
try:
if fmt == 'ymd':
# YYYY. MM. DD format (OMV/Petrom)
year = match.group(1)
month = match.group(2)
day = match.group(3)
else:
# DD. MM. YYYY format (default)
day = match.group(1)
month = match.group(2)
year = match.group(3)
date_str = f"{day}.{month}.{year}"
parsed = datetime.strptime(date_str, '%d.%m.%Y').date()
# Validate date range
today = date.today()
if parsed <= today and parsed.year >= 2020:
return parsed, confidence
except ValueError:
continue
return None, 0.0
def _extract_number(self, text: str) -> Tuple[Optional[str], float]:
"""Extract receipt number from text."""
for pattern, confidence in self.NUMBER_PATTERNS:
match = re.search(pattern, text, re.IGNORECASE)
if match:
return match.group(1), confidence
return None, 0.0
def _extract_series(self, text: str) -> Tuple[Optional[str], float]:
"""Extract receipt series from text."""
for pattern, confidence in self.SERIES_PATTERNS:
match = re.search(pattern, text, re.IGNORECASE)
if match:
return match.group(1).upper(), confidence
return None, 0.0
def _extract_vendor(self, text: str) -> Tuple[Optional[str], float]:
"""
Extract vendor/partner name from text.
Uses multiple strategies:
1. Look for lines with company type indicators (S.R.L., S.A., etc.)
2. Look for company name + SRL on separate lines
3. Look for lines near CIF
4. Use first valid line as fallback
"""
lines = text.split('\n')
skip_keywords = [
'BON', 'FISCAL', 'TOTAL', 'DATA', 'NR', 'ORA',
'SUBTOTAL', 'TVA', 'PLATA', 'CARD', 'NUMERAR',
'RON', 'LEI', 'CHITANTA', 'REST', 'CLIENT',
'OPERATOR', 'CASIER', 'POS', 'AMEF', 'BINE ATI VENIT',
'VA RUGAM', 'PASTRATI', 'VOCEA', 'TIPARIT',
'DETERGENT', 'PROSOP', 'HARTIE', 'SACI', 'SPRAY',
'BUC', 'ROLA', 'CUMPARATOR', 'MAGAZIN', 'BRICK',
'NIVS', 'BENZINA', 'PETROM', 'OMV'
]
# Strategy 0: Look for company name followed by SRL/SA on next line
# Pattern: "COMPANY NAME\nSRL" or "COMPANY NAME\nS.R.L."
for i, line in enumerate(lines[:15]):
line = line.strip()
if not line or len(line) < 3:
continue
line_upper = line.upper()
# Skip lines with skip keywords
if any(kw in line_upper for kw in skip_keywords):
continue
# Check if next line is standalone SRL, S.R.L., SA, S.A., etc.
if i + 1 < len(lines):
next_line = lines[i + 1].strip().upper()
# Match standalone company type suffix
if re.match(r'^S\.?\s*R\.?\s*L\.?$', next_line) or \
re.match(r'^S\.?\s*A\.?$', next_line) or \
re.match(r'^S\.?\s*N\.?\s*C\.?$', next_line) or \
re.match(r'^P\.?\s*F\.?\s*A\.?$', next_line) or \
re.match(r'^I\.?\s*I\.?$', next_line):
# Combine: "COMPANY NAME" + " " + "SRL"
vendor = self._clean_vendor_name(f"{line} {next_line}")
if vendor and len(vendor) >= 5:
return vendor, 0.95
# Strategy 1: Look for lines with vendor indicators (S.R.L., S.A., HOLDING, etc.)
for i, line in enumerate(lines[:15]): # Check first 15 lines
line = line.strip()
if not line or len(line) < 3:
continue
line_upper = line.upper()
# Check for vendor indicators
for indicator in self.VENDOR_INDICATORS:
if re.search(indicator, line_upper):
# Found a company name indicator
vendor = self._clean_vendor_name(line)
if vendor and len(vendor) >= 3:
# High confidence for lines with company indicators
return vendor, 0.95
# Strategy 2: Look for lines right before or after CIF
for i, line in enumerate(lines[:15]):
line_upper = line.upper()
if 'CIF' in line_upper and 'CLIENT' not in line_upper:
# Check line before
if i > 0:
prev_line = lines[i-1].strip()
if prev_line and len(prev_line) >= 3:
if not any(kw in prev_line.upper() for kw in skip_keywords):
vendor = self._clean_vendor_name(prev_line)
if vendor:
return vendor, 0.85
# Strategy 3: First valid line as fallback
for i, line in enumerate(lines[:10]):
line = line.strip()
# Skip empty lines
if not line or len(line) < 3:
continue
# Skip lines that are just numbers or codes
if re.match(r'^[\d.,\s:]+$', line):
continue
# Skip lines with barcodes/product codes
if re.match(r'^[A-Z]*\d{6,}', line):
continue
# Skip lines with keywords
if any(kw in line.upper() for kw in skip_keywords):
continue
# Clean the line
vendor = self._clean_vendor_name(line)
if vendor and len(vendor) >= 3:
# Confidence decreases for lines further down
confidence = max(0.3, 0.7 - (i * 0.05))
return vendor, confidence
return None, 0.0
def _clean_vendor_name(self, name: str) -> Optional[str]:
"""Clean and normalize vendor name."""
if not name:
return None
# Remove common OCR artifacts
name = re.sub(r'[^\w\s.,&\-()]', ' ', name)
# Normalize whitespace
name = re.sub(r'\s+', ' ', name).strip()
# Skip if it looks like an address line only
if re.match(r'^(STR|JUD|MUN|NR|BL|SC|ET|AP)\.?\s', name.upper()):
return None
# Skip if too short after cleaning
if len(name) < 3:
return None
return name
def _extract_cui(self, text_upper: str, original_text: str) -> Tuple[Optional[str], float]:
"""
Extract vendor CUI (fiscal identification code) from text.
Excludes CLIENT CUI which appears as 'CLIENT C.U.I./C.I.F.:...'
"""
# Strategy 0: Check for reversed format (CIF NUMBER on line BEFORE "C.I.F." label)
# This is common in some receipts: "R011201891\nC. I. F."
for pattern, confidence in self.CUI_REVERSED_PATTERNS:
match = re.search(pattern, text_upper, re.IGNORECASE | re.MULTILINE)
if match:
cui = match.group(1)
if 6 <= len(cui) <= 10:
# Verify this is not the CLIENT CUI by checking context
start = match.start()
# Check 50 chars before the match for CLIENT keyword
context_start = max(0, start - 50)
context = text_upper[context_start:start]
if 'CLIENT' not in context and 'LIENT' not in context:
return cui, confidence
# Strategy 1: Try to find CIF on a line that doesn't contain CLIENT
lines = text_upper.split('\n')
for line in lines:
# Skip lines that contain CLIENT (these are buyer's CUI, not vendor's)
if 'CLIENT' in line or 'CUMPARATOR' in line or 'LIENT' in line:
continue
# Look for CIF in this line
for pattern, confidence in self.CUI_PATTERNS:
match = re.search(pattern, line, re.IGNORECASE | re.MULTILINE)
if match:
cui = match.group(1)
if 6 <= len(cui) <= 10:
return cui, confidence
# Strategy 2: Fallback - search entire text but exclude CLIENT patterns
for pattern, confidence in self.CUI_PATTERNS:
# Find all matches
for match in re.finditer(pattern, text_upper, re.IGNORECASE | re.MULTILINE):
cui = match.group(1)
if 6 <= len(cui) <= 10:
# Check if this match is preceded by CLIENT in the same line
start = match.start()
line_start = text_upper.rfind('\n', 0, start) + 1
line_text = text_upper[line_start:start]
if 'CLIENT' not in line_text and 'LIENT' not in line_text:
return cui, confidence
return None, 0.0
def _detect_receipt_type(self, text: str) -> str:
"""Detect receipt type from text content.
BON FISCAL variants: "BON FISCAL", "BON FISCAL.", "BON FISCAL"
CHITANTA variants: "CHITANTA", "CHITANȚĂ"
"""
# Check for explicit BON FISCAL first (handles OCR spacing variations)
if re.search(r'BON\s+FISCAL', text):
return 'bon_fiscal'
if 'CHITANTA' in text or 'CHITANȚĂ' in text:
return 'chitanta'
# Default to bon_fiscal if neither found
return 'bon_fiscal'
def _extract_tva_entries(self, text: str) -> Tuple[List[dict], Optional[Decimal]]:
    """
    Extract per-rate TVA (VAT) entries and the TVA total from OCR text.

    Romanian receipts can have multiple TVA rates (A=19%, B=9%, C=5%, D=0%).

    The text is checked in stages. The non-VAT-payer check returns
    immediately; Patterns 0a and 1 always run; every other stage runs only
    while no entry has been collected yet:

    - non-VAT payer wording ("NEPLATITOR DE TVA", "SCUTIT DE TVA", ...)
    - 0a: "TOTAL TAXE" amount combined with the first percentage in the text
    - 0b: table rows such as "A-21,00% 285,66 49,58"
    - 1:  "TVA A - 19%: 15.20" (explicit code)
    - 2:  "TVA - 21%: 32.31" (code derived from the percentage)
    - 3:  "TOTAL TVA A - 21% 32.31" (amount on the same line)
    - 3b: "TOTAL TVA A - 21%" with the amount taken from "TOTAL TVA BON"
    - 3c: "TVAA - 21%" with the amount right after / on the next line
    - 4:  fallback over ``self.TVA_PATTERNS``

    The collected entries, their sum and the value returned by
    ``_extract_total_tva_bon`` are then handed to
    ``_validate_and_correct_tva``; the resulting entries are sorted by code.

    Args:
        text: OCR text of the receipt.

    Returns:
        (tva_entries, tva_total) where tva_entries is a list of dicts like
        {'code': 'A', 'percent': 19, 'amount': Decimal('15.20')} and
        tva_total is whatever ``_validate_and_correct_tva`` reports as total.
    """
    tva_entries = []
    seen_entries = set()  # (code, percent) keys already added, to avoid duplicates
    # Check for non-VAT payer (NEPLATITOR DE TVA) - TVA = 0
    # OCR variants: NEPLATTOR, NEPLATITOR, NEPLATOR, NEPLATTOR, ANEPLATHTOR, MEPLATITOR, etc.
    # Also handles: "TOTAL NEPLATITOR TVA", "(NEPLATITOR DE TVA)"
    non_vat_patterns = [
        # Main pattern - flexible for OCR errors: NEPLAT + any chars + OR/R
        r'NEPLAT\w*OR',  # NEPLATITOR, NEPLATTOR, NEPLATOR
        r'[ANM]EPLAT\w*O?R',  # OCR errors: ANEPLATHTOR, MEPLATITOR
        r'TOTAL\s+NEPLAT',  # TOTAL NEPLATITOR...
        r'TOTAL\s+[ANM]EPLAT',  # TOTAL ANEPLAT... (OCR error)
        r'SCUTIT\s*(?:DE\s+)?T[VU]A',  # SCUTIT DE TVA
        r'NEPLAT\w*\s+T[VU]A',  # NEPLATITOR TVA
        r'NEPLAT\w*\s+DE\s+T',  # NEPLATITOR DE T... (truncated)
    ]
    for pattern in non_vat_patterns:
        if re.search(pattern, text, re.IGNORECASE):
            # Non-VAT payer - return a single 0% entry and a zero total
            return [{'code': 'D', 'percent': 0, 'amount': Decimal('0.00')}], Decimal('0.00')
    # Normalize spaces in numbers first (OCR may produce "32. 31" or "49, 58")
    normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text)
    # Also normalize comma followed by space to a dot before "%" (for "21, 00%" -> "21.00%")
    normalized_text = re.sub(r'(\d+),\s+(\d{2})\s*%', r'\1.\2%', normalized_text)
    # Pattern 0a: First try to get TVA from "TOTAL TAXE:" which is most reliable
    # Format: "TOTAL TAXE: 55,22" - this is always the TVA amount
    # OCR may cut "T" producing "OTAL TAXE:" instead of "TOTAL TAXE:"
    # OCR may also put amount BEFORE "OTAL TAXE": "55,22OTAL TAXE:"
    total_taxe_pattern = r'T?OTAL\s+TAXE\s*:?\s*([\d\s.,]+)'
    taxe_match = re.search(total_taxe_pattern, normalized_text, re.IGNORECASE)
    # Also try pattern where amount comes BEFORE "OTAL TAXE" (OCR line break issue)
    if not taxe_match:
        reversed_taxe_pattern = r'([\d.,]+)\s*T?OTAL\s+TAXE'
        taxe_match = re.search(reversed_taxe_pattern, normalized_text, re.IGNORECASE)
    if taxe_match:
        # Also need to find the TVA rate from the table; the first percentage
        # found anywhere in the normalized text is used.
        # Pattern handles: "A-21%", "-21,00%", "21%" etc.
        rate_pattern = r'([A-D])?\s*[-:]?\s*(\d{1,2})[.,]?\s*\d{0,2}\s*%'
        rate_match = re.search(rate_pattern, normalized_text, re.IGNORECASE)
        if rate_match:
            try:
                code = rate_match.group(1).upper() if rate_match.group(1) else 'A'  # Default to A if missing
                percent = int(rate_match.group(2))
                amount_str = taxe_match.group(1).replace(' ', '')
                amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
                amount = Decimal(amount_str)
                if amount > 0:
                    entry_key = (code, percent)
                    if entry_key not in seen_entries:
                        tva_entries.append({
                            'code': code,
                            'percent': percent,
                            'amount': amount
                        })
                        seen_entries.add(entry_key)
            except (ValueError, InvalidOperation):
                # Unparseable amount: fall through to the next patterns
                pass
    # Pattern 0b: Table format "A-21,00% 285,66 49,58" (code-percent base tva_amount)
    # This format appears after a TVA header line like "TVA TOTAL VALDARE"
    # The TVA amount position depends on header: VALDARE last = TVA last, VALOARE middle = TVA middle
    if not tva_entries:
        table_pattern = r'([A-D])\s*[-:]\s*(\d{1,2})[.,]\s*\d{2}\s*%\s*([\d\s.,]+)\s+([\d\s.,]+)'
        for match in re.finditer(table_pattern, normalized_text, re.IGNORECASE):
            try:
                code = match.group(1).upper()
                percent = int(match.group(2))
                amount1_str = match.group(3).replace(' ', '')
                amount2_str = match.group(4).replace(' ', '')
                amount1 = Decimal(self._normalize_number(re.sub(r'[^\d.,]', '', amount1_str)))
                amount2 = Decimal(self._normalize_number(re.sub(r'[^\d.,]', '', amount2_str)))
                # Determine which is TVA: the smaller amount is usually TVA
                # (TVA is a fraction of the total, so it's always smaller)
                tva_amount = min(amount1, amount2)
                if tva_amount > 0:
                    entry_key = (code, percent)
                    if entry_key not in seen_entries:
                        tva_entries.append({
                            'code': code,
                            'percent': percent,
                            'amount': tva_amount
                        })
                        seen_entries.add(entry_key)
            except (ValueError, InvalidOperation):
                continue
    # Pattern 1: "TVA A - 19%: 15.20" or "TVAA - 21% 32.31" (with code)
    # OCR tolerant: TUA, TVR, etc.
    # NOTE: this stage is not guarded by "if not tva_entries"; it always runs
    # and relies on seen_entries to skip (code, percent) pairs already found.
    pattern_with_code = r'T[VU][AR]\s*([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)'
    for match in re.finditer(pattern_with_code, normalized_text, re.IGNORECASE):
        try:
            code = match.group(1).upper()
            percent = int(match.group(2))
            amount_str = match.group(3).replace(' ', '')
            amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
            amount = Decimal(amount_str)
            if amount > 0:
                entry_key = (code, percent)
                if entry_key not in seen_entries:
                    tva_entries.append({
                        'code': code,
                        'percent': percent,
                        'amount': amount
                    })
                    seen_entries.add(entry_key)
        except (ValueError, InvalidOperation):
            continue
    # Pattern 2: "TVA - 21%: 32.31" (without explicit code; code derived from percent)
    if not tva_entries:
        pattern_no_code = r'T[VU][AR]\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)'
        for match in re.finditer(pattern_no_code, normalized_text, re.IGNORECASE):
            try:
                percent = int(match.group(1))
                amount_str = match.group(2).replace(' ', '')
                amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
                amount = Decimal(amount_str)
                if amount > 0:
                    # Determine code based on percent
                    code = self._get_tva_code_from_percent(percent)
                    entry_key = (code, percent)
                    if entry_key not in seen_entries:
                        tva_entries.append({
                            'code': code,
                            'percent': percent,
                            'amount': amount
                        })
                        seen_entries.add(entry_key)
            except (ValueError, InvalidOperation):
                continue
    # Pattern 3: "TOTAL TVA A - 21%" with amount on same line or "TOTAL TVA BON" with amount
    if not tva_entries:
        # First try: "TOTAL TVA A - 21% 32.31" (amount on same line)
        tva_with_amount = r'TOTAL\s+T[VU][AR]\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*([\d.,]+)'
        for match in re.finditer(tva_with_amount, normalized_text, re.IGNORECASE):
            try:
                code = match.group(1).upper()
                percent = int(match.group(2))
                amount_str = self._normalize_number(match.group(3))
                amount = Decimal(amount_str)
                if amount > 0:
                    entry_key = (code, percent)
                    if entry_key not in seen_entries:
                        tva_entries.append({
                            'code': code,
                            'percent': percent,
                            'amount': amount
                        })
                        seen_entries.add(entry_key)
            except (ValueError, InvalidOperation):
                continue
    # Pattern 3b: "TOTAL TVA A - 21%" on one line, look for "TOTAL TVA BON" amount
    if not tva_entries:
        tva_total_pattern = r'TOTAL\s+T[VU][AR]\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%'
        for match in re.finditer(tva_total_pattern, normalized_text, re.IGNORECASE):
            try:
                code = match.group(1).upper()
                percent = int(match.group(2))
                # Look for "TOTAL TVA BON" followed by amount
                tva_bon_pattern = r'TOTAL\s+T[VU][AR]\s+BON[:\s]*([\d.,]+)'
                tva_bon_match = re.search(tva_bon_pattern, normalized_text, re.IGNORECASE)
                if tva_bon_match:
                    amount_str = self._normalize_number(tva_bon_match.group(1))
                    amount = Decimal(amount_str)
                    if amount > 0:
                        entry_key = (code, percent)
                        if entry_key not in seen_entries:
                            tva_entries.append({
                                'code': code,
                                'percent': percent,
                                'amount': amount
                            })
                            seen_entries.add(entry_key)
                        # A positive amount was found next to the label: skip the fallback below
                        continue
                # Fallback: Amount after TOTAL TVA BON on next line
                tva_bon_pos = re.search(r'TOTAL\s+T[VU][AR]\s+BON', normalized_text, re.IGNORECASE)
                if tva_bon_pos:
                    after_bon = normalized_text[tva_bon_pos.end():]
                    # Find first standalone number (likely TVA amount)
                    amount_match = re.search(r'[\s\n]*([\d]+[.,]\d{2})\s*\n', after_bon)
                    if amount_match:
                        amount_str = self._normalize_number(amount_match.group(1))
                        amount = Decimal(amount_str)
                        if amount > 0:
                            entry_key = (code, percent)
                            if entry_key not in seen_entries:
                                tva_entries.append({
                                    'code': code,
                                    'percent': percent,
                                    'amount': amount
                                })
                                seen_entries.add(entry_key)
            except (ValueError, InvalidOperation):
                continue
    # Pattern 3c: "TVAA - 21%" on one line, amount on next line (simpler format)
    if not tva_entries:
        tva_line_pattern = r'T[VU][AR]\s*([A-D])?\s*[-:]\s*(\d{1,2})\s*%'
        for match in re.finditer(tva_line_pattern, normalized_text, re.IGNORECASE):
            try:
                code = (match.group(1) or 'A').upper()
                percent = int(match.group(2))
                # Look for amount on the next line or immediately after
                after_tva = normalized_text[match.end():]
                amount_match = re.search(r'^[\s\n]*([\d.,]+)', after_tva)
                if amount_match:
                    amount_str = self._normalize_number(amount_match.group(1))
                    amount = Decimal(amount_str)
                    if amount > 0:
                        entry_key = (code, percent)
                        if entry_key not in seen_entries:
                            tva_entries.append({
                                'code': code,
                                'percent': percent,
                                'amount': amount
                            })
                            seen_entries.add(entry_key)
            except (ValueError, InvalidOperation):
                continue
    # Pattern 4: Use TVA_PATTERNS for fallback
    if not tva_entries:
        for pattern, _ in self.TVA_PATTERNS:
            match = re.search(pattern, normalized_text, re.IGNORECASE)
            if match:
                try:
                    # Some patterns have 2 groups (percent, amount), others just amount
                    # NOTE(review): match.lastindex is None for a pattern without
                    # groups - assumes every TVA_PATTERNS entry captures at least one.
                    if match.lastindex >= 2:
                        percent = int(match.group(1))
                        amount_str = match.group(2)
                    else:
                        amount_str = match.group(1)
                        # Try to detect percent from the original (non-normalized) text
                        percent = self._detect_tva_percent(text)
                    amount_str = amount_str.replace(' ', '')
                    amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
                    amount = Decimal(amount_str)
                    if amount > 0 and percent:
                        code = self._get_tva_code_from_percent(percent)
                        entry_key = (code, percent)
                        if entry_key not in seen_entries:
                            tva_entries.append({
                                'code': code,
                                'percent': percent,
                                'amount': amount
                            })
                            seen_entries.add(entry_key)
                        break  # Only use first match from fallback
                except (ValueError, InvalidOperation):
                    continue
    # Extract TOTAL TVA BON as reference (separate from individual entries)
    tva_bon_total = self._extract_total_tva_bon(normalized_text)
    # Calculate sum from entries (None when nothing was extracted)
    entries_sum = None
    if tva_entries:
        entries_sum = sum(entry['amount'] for entry in tva_entries)
    # Validate and correct TVA values
    tva_entries, tva_total = self._validate_and_correct_tva(
        tva_entries, entries_sum, tva_bon_total
    )
    # Sort by code (A, B, C, D); entries without a code sort last
    tva_entries.sort(key=lambda x: x.get('code', 'Z'))
    return tva_entries, tva_total
def _get_tva_code_from_percent(self, percent: int) -> str:
"""Map TVA percentage to standard Romanian code.
Romanian TVA rates changed in August 2025:
- Standard rate: 19% → 21%
- Reduced rate: 9% → 11%
- Other rates (5%, 0%) remain unchanged
Old rates (before Aug 2025): New rates (from Aug 2025):
- A = 19% (standard) - A = 21% (standard)
- B = 9% (reduced) - B = 11% (reduced)
- C = 5% (reduced) - C = 5% (reduced)
- D = 0% (exempt) - D = 0% (exempt)
Both old and new rates are supported for historical receipts.
"""
if percent in (19, 21):
return 'A' # Standard rate (19% old, 21% new from Aug 2025)
elif percent in (9, 11):
return 'B' # Reduced rate (9% old, 11% new from Aug 2025)
elif percent == 5:
return 'C' # Reduced rate (unchanged)
elif percent == 0:
return 'D' # Exempt (unchanged)
else:
return 'A' # Default to standard rate
def _extract_total_tva_bon(self, text: str) -> Optional[Decimal]:
"""
Extract TOTAL TVA BON value separately as the reference.
This is the authoritative total TVA on the receipt.
Handles OCR variations: TOTAL TVA BON, OTAL TUA BON, etc.
"""
# Pattern for TOTAL TVA BON with amount after
patterns = [
# Standard: TOTAL TVA BON: 14.92
r'T?OTAL\s+T[VU][AR]\s+BON\s*:?\s*([\d]+[.,]\d{2})\b',
# Amount before: 14.92 OTAL TUA BON (OCR line break)
r'([\d]+[.,]\d{2})\s*\n?\s*T?OTAL\s+T[VU][AR]\s+BON',
# Amount on next line after TOTAL TVA BON
r'T?OTAL\s+T[VU][AR]\s+BON\s*\n\s*([\d]+[.,]\d{2})\b',
]
for pattern in patterns:
match = re.search(pattern, text, re.IGNORECASE)
if match:
try:
amount_str = self._normalize_number(match.group(1))
amount = Decimal(amount_str)
if amount > 0:
return amount
except (InvalidOperation, ValueError):
continue
return None
def _validate_and_correct_tva(
self,
tva_entries: List[dict],
entries_sum: Optional[Decimal],
tva_bon_total: Optional[Decimal]
) -> Tuple[List[dict], Optional[Decimal]]:
"""
Validate and correct TVA values.
Rules:
1. TVA cannot be greater than TOTAL amount (will be validated at higher level)
2. Sum of TVA A + TVA B + ... should equal TOTAL TVA BON
3. If single entry and sum != tva_bon_total, use tva_bon_total
4. Detect and fix OCR concatenation errors (e.g., 14.921492 from 14.92 + 14.92)
"""
if not tva_entries:
return tva_entries, tva_bon_total
# Check for OCR concatenation errors in individual entries
# Pattern: X.XX followed by another decimal (e.g., 14.921492 from 14.92 + 14.92)
corrected_entries = []
for entry in tva_entries:
amount = entry['amount']
amount_str = str(amount)
# Check if amount looks like concatenated decimals
# e.g., 14.921492 could be 14.92 + 14.92 incorrectly joined
# or 32.3132.31 from 32.31 + 32.31
if len(amount_str) > 6 and '.' in amount_str:
int_part, dec_part = amount_str.split('.')
# If decimal part > 2 digits, it's likely concatenation
if len(dec_part) > 2:
# Try to extract the first valid decimal amount
# e.g., from 14.921492, extract 14.92
try:
corrected_amount = Decimal(f"{int_part}.{dec_part[:2]}")
print(f"[TVA Validation] Corrected concatenation error: {amount}{corrected_amount}", flush=True)
entry['amount'] = corrected_amount
except InvalidOperation:
pass
corrected_entries.append(entry)
tva_entries = corrected_entries
# Recalculate sum after corrections
entries_sum = sum(entry['amount'] for entry in tva_entries) if tva_entries else None
# Validate sum against TOTAL TVA BON
if tva_bon_total and entries_sum:
# Allow small tolerance for rounding (0.02)
tolerance = Decimal('0.02')
difference = abs(entries_sum - tva_bon_total)
if difference > tolerance:
print(f"[TVA Validation] Sum mismatch: entries_sum={entries_sum}, tva_bon_total={tva_bon_total}", flush=True)
# If single entry and sum doesn't match, use TOTAL TVA BON as reference
if len(tva_entries) == 1:
print(f"[TVA Validation] Single entry - using TOTAL TVA BON as reference: {tva_bon_total}", flush=True)
tva_entries[0]['amount'] = tva_bon_total
entries_sum = tva_bon_total
# If multiple entries and sum > tva_bon_total, likely double counting
elif entries_sum > tva_bon_total:
# Check if one entry is the duplicate of another
amounts = [e['amount'] for e in tva_entries]
unique_amounts = set(amounts)
if len(unique_amounts) < len(amounts):
# Duplicate detected - likely TOTAL TVA BON counted as separate entry
print(f"[TVA Validation] Duplicate TVA detected, removing duplicates", flush=True)
# Keep only unique entries
seen = set()
unique_entries = []
for entry in tva_entries:
key = (entry.get('code'), entry['amount'])
if key not in seen:
seen.add(key)
unique_entries.append(entry)
tva_entries = unique_entries
entries_sum = sum(e['amount'] for e in tva_entries)
# Final total
tva_total = entries_sum if entries_sum else tva_bon_total
return tva_entries, tva_total
def _detect_tva_percent(self, text: str) -> Optional[int]:
"""Detect TVA percentage from text content."""
# Look for common Romanian TVA percentages
if '19%' in text or '19 %' in text:
return 19
elif '21%' in text or '21 %' in text:
return 21
elif '11%' in text or '11 %' in text:
return 11
elif '9%' in text or '9 %' in text:
return 9
elif '5%' in text or '5 %' in text:
return 5
return None
def _validate_tva_reverse(
self,
tva_entries: List[dict],
total_amount: Optional[Decimal]
) -> Tuple[bool, Optional[Decimal], str]:
"""
Reverse TVA validation: from TVA amount and rate, calculate expected total.
Formula:
base = tva_amount / (rate/100)
expected_total = sum(base + tva_amount) for all entries
Returns (is_valid, expected_total, message)
"""
if not tva_entries or not total_amount:
return True, None, "Insufficient data for reverse validation"
expected_total = Decimal('0')
for entry in tva_entries:
tva_amount = entry['amount']
rate = Decimal(str(entry['percent']))
if rate > 0:
# Calculate base from TVA: base = tva / (rate/100)
base = tva_amount / (rate / Decimal('100'))
expected_total += base + tva_amount
else:
# 0% TVA - can't calculate base, skip
pass
if expected_total == 0:
return True, None, "Cannot calculate expected total (0% TVA only)"
# Tolerance: max(0.50 RON, 1% of total)
tolerance = max(Decimal('0.50'), total_amount * Decimal('0.01'))
difference = abs(expected_total - total_amount)
if difference <= tolerance:
return True, expected_total, f"TVA reverse validation passed (expected: {expected_total}, actual: {total_amount}, diff: {difference})"
else:
return False, expected_total, f"TVA reverse validation WARNING: expected {expected_total}, actual {total_amount}, diff {difference}"
def _extract_items_count(self, text: str) -> Optional[int]:
"""Extract number of items/articles from receipt."""
for pattern, _ in self.ITEMS_COUNT_PATTERNS:
match = re.search(pattern, text, re.IGNORECASE)
if match:
try:
count = int(match.group(1))
if 0 < count < 1000: # Reasonable range
return count
except ValueError:
continue
return None
def _extract_address(self, text: str) -> Optional[str]:
"""Extract vendor address from text."""
lines = text.split('\n')
address_parts = []
for line in lines[:15]: # Check first 15 lines
line = line.strip()
if not line:
continue
# Check for address patterns
line_upper = line.upper()
# JUD. (county) pattern
if re.search(r'\bJUD\.?\s+', line_upper):
address_parts.append(line)
continue
# STR. (street) pattern
if re.search(r'\bSTR\.?\s+', line_upper):
address_parts.append(line)
continue
# MUN./OR./COM. (city/town) pattern
if re.search(r'\b(MUN|OR|COM)\.?\s+', line_upper):
address_parts.append(line)
continue
if address_parts:
# Join and clean address parts
address = ', '.join(address_parts)
# Clean up
address = re.sub(r'\s+', ' ', address).strip()
address = re.sub(r',\s*,', ',', address)
return address if len(address) >= 5 else None
return None
def _extract_payment_methods(self, text: str) -> List[dict]:
"""
Extract payment methods (CARD/NUMERAR) from receipt.
These appear after TOTAL LEI and before TOTAL TVA section.
Returns list of: {'method': 'CARD'/'NUMERAR', 'amount': Decimal}
"""
payment_methods = []
seen_methods = set()
# Normalize spaces in numbers
normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text)
# Find the region between TOTAL LEI and TOTAL TVA
total_lei_match = re.search(r'TOTAL\s+LEI\s*([\d\s.,]+)', normalized_text, re.IGNORECASE)
total_tva_match = re.search(r'TOTAL\s+T[VU][AR]', normalized_text, re.IGNORECASE)
# Define search region (after TOTAL LEI, before TOTAL TVA if exists)
if total_lei_match:
start_pos = total_lei_match.end()
end_pos = total_tva_match.start() if total_tva_match else len(normalized_text)
search_region = normalized_text[start_pos:end_pos]
else:
search_region = normalized_text # Fallback to full text
for pattern, method, confidence in self.PAYMENT_METHOD_PATTERNS:
for match in re.finditer(pattern, search_region, re.IGNORECASE):
try:
amount_str = match.group(1).replace(' ', '')
amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
amount = Decimal(amount_str)
if amount > 0 and method not in seen_methods:
payment_methods.append({
'method': method,
'amount': amount
})
seen_methods.add(method)
except (InvalidOperation, ValueError):
continue
return payment_methods
def _extract_client_data(
self, text_upper: str, original_text: str
) -> Tuple[Optional[str], Optional[str], Optional[str], float]:
"""
Extract client/buyer data from B2B receipts.
Returns (client_name, client_cui, client_address, confidence)
"""
client_name = None
client_cui = None
client_address = None
confidence = 0.0
# Step 1: Find CLIENT section marker
client_section_start = None
for marker in self.CLIENT_SECTION_MARKERS:
match = re.search(marker, text_upper, re.IGNORECASE)
if match:
client_section_start = match.start()
break
if client_section_start is None:
# No client section found
return None, None, None, 0.0
# Step 2: Extract client CUI
for pattern, conf in self.CLIENT_CUI_PATTERNS:
match = re.search(pattern, text_upper, re.IGNORECASE | re.MULTILINE)
if match:
cui = match.group(1)
if 6 <= len(cui) <= 10:
client_cui = cui
confidence = max(confidence, conf)
break
# Step 3: Extract client name from CLIENT section
# Look for company name after CLIENT: marker
lines = original_text.split('\n')
for i, line in enumerate(lines):
line_upper = line.upper().strip()
# Check if this line contains CLIENT marker
if any(re.search(marker, line_upper) for marker in self.CLIENT_SECTION_MARKERS):
# Check if name is on same line after ":"
if ':' in line:
name_part = line.split(':', 1)[1].strip()
if name_part and len(name_part) >= 3:
# Skip if it looks like a CUI (R/RO followed by digits)
if re.match(r'^R[O0]?\d{6,10}$', name_part.upper()):
# This is a CUI, not a name - extract it if not already found
if not client_cui:
cui_digits = re.sub(r'[^0-9]', '', name_part)
if 6 <= len(cui_digits) <= 10:
client_cui = cui_digits
confidence = max(confidence, 0.90)
continue
# Check for company indicators
if any(re.search(ind, name_part.upper()) for ind in self.VENDOR_INDICATORS):
client_name = self._clean_vendor_name(name_part)
confidence = max(confidence, 0.95)
break
elif len(name_part) >= 5 and not name_part.isdigit():
client_name = self._clean_vendor_name(name_part)
confidence = max(confidence, 0.80)
break
# Check next line for company name
if i + 1 < len(lines):
next_line = lines[i + 1].strip()
next_upper = next_line.upper()
# Skip if it's a CUI/CIF line
if not re.search(r'C\.?\s*[UI]\.?\s*[IF]\.?', next_upper):
if any(re.search(ind, next_upper) for ind in self.VENDOR_INDICATORS):
client_name = self._clean_vendor_name(next_line)
confidence = max(confidence, 0.90)
break
elif len(next_line) >= 5 and not next_line.isdigit():
# Check if it looks like a company name
if not any(kw in next_upper for kw in ['CUI', 'CIF', 'COD', 'FISCAL']):
client_name = self._clean_vendor_name(next_line)
confidence = max(confidence, 0.75)
break
# Step 4: Extract client address (if present after client section)
if client_section_start:
# Look for address patterns after client section
client_region = text_upper[client_section_start:client_section_start + 500]
for pattern, _ in self.ADDRESS_PATTERNS:
match = re.search(pattern, client_region)
if match:
client_address = match.group(1).strip()
break
# Log extraction result
if client_cui or client_name:
print(f"[Client Extraction] Found: name={client_name}, cui={client_cui}, conf={confidence}", flush=True)
return client_name, client_cui, client_address, confidence