- Add adaptive 3-step OCR pipeline with early exit when all 5 fields found - Add pattern for "C. I. F." with spaces (OCR artifact from PaddleOCR) - Add pattern for YYYY. MM. DD date format with spaces (OMV/Petrom receipts) - Add pattern for "OTAL TAXE" with T cut off and reversed amount position - Make TVA rate pattern more flexible (code letter optional, handle "-21%") - Replace logger.info with print(flush=True) for better debugging visibility - Improve OCRPreview.vue to show extraction progress and raw OCR text 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
997 lines
45 KiB
Python
997 lines
45 KiB
Python
"""Extract structured fields from OCR text (Romanian receipts)."""
|
|
|
|
import re
|
|
from datetime import date, datetime
|
|
from decimal import Decimal, InvalidOperation
|
|
from typing import Optional, Tuple, List
|
|
from dataclasses import dataclass, field
|
|
|
|
|
|
@dataclass
class ExtractionResult:
    """Everything the extractor recovered from one receipt's OCR text.

    All fields are optional and default to "nothing found"; the three
    ``confidence_*`` scores feed :attr:`overall_confidence`.
    """

    # --- Core document fields ---
    receipt_type: str = 'bon_fiscal'
    receipt_number: Optional[str] = None
    receipt_series: Optional[str] = None
    receipt_date: Optional[date] = None
    amount: Optional[Decimal] = None
    partner_name: Optional[str] = None
    cui: Optional[str] = None
    description: Optional[str] = None

    # --- Additional fields; several TVA (VAT) entries may coexist ---
    # Each entry is a dict shaped like {code, percent, amount}.
    tva_entries: List[dict] = field(default_factory=list)
    tva_total: Optional[Decimal] = None
    address: Optional[str] = None
    items_count: Optional[int] = None

    # --- Per-field confidence scores ---
    confidence_amount: float = 0.0
    confidence_date: float = 0.0
    confidence_vendor: float = 0.0

    # --- Provenance ---
    raw_text: str = ""
    ocr_engine: str = ""  # Name of the OCR engine used: paddleocr or tesseract
    processing_time_ms: int = 0  # Processing time in milliseconds

    @property
    def overall_confidence(self) -> float:
        """Return the weighted mean of the three scores, rounded to 2 places.

        Weights: amount 0.4, date 0.3, vendor 0.3.
        """
        amount_share = self.confidence_amount * 0.4
        date_share = self.confidence_date * 0.3
        vendor_share = self.confidence_vendor * 0.3
        return round(amount_share + date_share + vendor_share, 2)
|
|
|
|
|
|
class ReceiptExtractor:
    """Extract receipt fields using pattern matching for Romanian receipts.

    The class-level tables below hold the regular expressions used by the
    extraction methods. Most entries are ``(regex, confidence)`` tuples;
    the tables are ordered so the most specific pattern comes first.
    """

    # ------------------------------------------------------------------
    # Total amount (most specific first).
    # Receipts print the total as TOTAL LEI, TOTAL:, TOTAL RON, etc., and
    # OCR frequently damages the keyword, so the patterns are tolerant.
    # ------------------------------------------------------------------
    TOTAL_PATTERNS = [
        # Most common layout: TOTAL LEI followed by the amount
        (r'TOTAL\s+LEI\s*([\d\s.,]+)', 0.98),
        (r'[OT]?OTAL\s+LEI\s*([\d\s.,]+)', 0.95),  # first letter may be lost by OCR
        # Standard layouts
        (r'TOTAL\s*:?\s*([\d\s.,]+)\s*(?:RON|LEI)?', 0.95),
        (r'TOTAL\s+(?:RON|LEI)\s*([\d\s.,]+)', 0.95),
        # SUBTOTAL, used when no TOTAL line is found
        (r'SUBTOTAL\s*([\d\s.,]+)', 0.90),
        (r'[SB]?UBTOTAL\s*([\d\s.,]+)', 0.88),  # OCR variations
        # Payment lines
        (r'DE\s+PLATA\s*:?\s*([\d\s.,]+)', 0.90),
        (r'SUMA\s*:?\s*([\d\s.,]+)', 0.85),
        (r'PLATA\s+CARD\s*:?\s*([\d\s.,]+)', 0.85),
        (r'NUMERAR\s*:?\s*([\d\s.,]+)', 0.80),
        (r'REST\s*:?\s*([\d\s.,]+)', 0.70),  # the total sometimes sits next to REST
    ]

    # When the TOTAL keyword is not captured at all, the amount extractor
    # falls back to the largest repeated amount in the text.

    # ------------------------------------------------------------------
    # Dates with dash, dot or slash separators.
    # OCR may read DATA as DRTA; dates containing stray spaces/commas
    # ("27. 10, 2025") are covered by DATE_PATTERNS_OCR_SPACES.
    # ------------------------------------------------------------------
    DATE_PATTERNS = [
        # DATA / DRTA label followed by DD-MM-YYYY
        (r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
        (r'DATA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
        # Date followed by ORA (time); OCR may produce 0RA
        (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+[O0]RA\s*:?\s*\d{2}:\d{2}', 0.95),
        # Date followed by a time without the ORA keyword
        (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+\d{2}:\d{2}', 0.90),
        # Bare date
        (r'(\d{2}[-./]\d{2}[-./]\d{4})', 0.80),
        # YYYY-MM-DD (less common)
        (r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75),
    ]

    # Dates damaged by OCR spaces/commas, e.g. "27. 10, 2025",
    # "27, 10. 2025", "2025. 08. 14". The third element names the group
    # order: 'ymd' or 'dmy'.
    DATE_PATTERNS_OCR_SPACES = [
        # YYYY. MM. DD with spaces (OMV/Petrom receipts), followed by a time
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.92, 'ymd'),
        # YYYY. MM. DD with spaces, standalone
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.85, 'ymd'),
        # DD. MM, YYYY or DD, MM. YYYY, followed by a time
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
        # DD. MM, YYYY or DD, MM. YYYY, standalone
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
    ]

    # ------------------------------------------------------------------
    # Receipt number (Romanian fiscal receipt formats).
    # OCR may turn ':' into 'N' or introduce other errors.
    # ------------------------------------------------------------------
    NUMBER_PATTERNS = [
        # NDS (common on Romanian POS receipts)
        (r'NDS\s*:?\s*(\d+)', 0.98),
        # C3POS terminal id; OCR may print N for ':' (C3POS-CT2N1360760)
        (r'C3POS[-A-Z0-9]*[N:](\d{6,7})', 0.98),  # CT2N1360760 layout
        (r'C3POS.*?(\d{6,7})\b', 0.95),  # any C3POS followed by a 6-7 digit number
        (r'CT2[N:]\s*(\d{6,})', 0.95),  # CT2N prefix
        # BF (Bon Fiscal) number
        (r'BF\s*:?\s*(\d+)', 0.93),
        # NIVS
        (r'NIVS\s*:?\s*(\d+)', 0.95),
        # Standard NR BON layouts
        (r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95),
        (r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95),
        (r'CHITANTA\s+NR\.?\s*:?\s*(\d+)', 0.95),
        # Document number
        (r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90),
        # ID BF
        (r'ID\s*BF\s*:?\s*(\d+)', 0.90),
        # TD (transaction ID)
        (r'TD\s*:?\s*(\d+)', 0.85),
        # Any 6-8 digit number (typical receipt number length)
        (r'\b(\d{6,8})\b', 0.70),
        # Generic long number after NR (fallback)
        (r'NR\.?\s*:?\s*(\d{4,})', 0.65),
    ]

    # ------------------------------------------------------------------
    # CUI / CIF (fiscal code). IMPORTANT: the CLIENT's code must be ignored.
    #   CIF = Cod de Identificare Fiscala (the vendor's tax ID)
    #   CLIENT C.U.I. = the buyer's tax ID
    # OCR errors handled: R0 for RO, C1F for CIF.
    # ------------------------------------------------------------------
    CUI_PATTERNS = [
        # CIF at the start of a line (definitely the vendor)
        (r'^CIF\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
        (r'^C[I1]F\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),  # C1F OCR error
        # CIF not preceded by CLIENT (negative lookbehind)
        (r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
        # Standalone CIF with OCR tolerance
        (r'\bC[I1]F\s*:?\s*(?:R[O0])?(\d{6,10})\b', 0.90),
        # COD FISCAL (vendor)
        (r'COD\s+FISCAL\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
        # "C. I. F." with spaces (OCR artifact), e.g. "C. I. F. : R011201891"
        (r'C\.\s*I\.\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.92),
        # "C.I.F." with dots and no spaces
        (r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.88),
        # CUI (less specific, use with caution)
        (r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.85),
    ]

    # Reversed layout: the number is printed on the line BEFORE the
    # "C.I.F." label, e.g. "R011201891\nC. I. F.".
    CUI_REVERSED_PATTERNS = [
        # RO + digits on the line immediately before the C.I.F./CIF label
        (r'(?:R[O0])(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.98),
        # Digits only before the C.I.F. label
        (r'(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.95),
    ]

    # ------------------------------------------------------------------
    # Series - deliberately strict to avoid false matches.
    # ------------------------------------------------------------------
    SERIES_PATTERNS = [
        (r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90),
        # "Z:" marker (must be at line start or after whitespace)
        (r'(?:^|\s)Z\s*:\s*(\d{4})', 0.85),
        # BF series with an explicit marker
        (r'(?:^|\s)BF\s*:\s*(\d{4})', 0.85),
    ]

    # ------------------------------------------------------------------
    # TVA (VAT). OCR may produce TUA, TVR, etc.
    # ------------------------------------------------------------------
    TVA_PATTERNS = [
        # TOTAL TVA BON (OCR tolerant: TUA, TVR)
        (r'TOTAL\s+T[VU][AR]\s+BON\s*:?\s*([\d\s.,]+)', 0.98),
        (r'T[O0]TAL\s+T[VU][AR]\s*:?\s*([\d\s.,]+)', 0.95),
        # TVA with a percentage (OCR tolerant)
        (r'T[VU][AR]\s+(?:A\s*[-:]?\s*)?(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.95),
        (r'T[VU][AR]\s+[A-Z]\s*[-:]\s*(\d{1,2})\s*%\s*([\d\s.,]+)', 0.93),
        # Plain TVA followed by an amount
        (r'T[VU][AR]\s*:?\s*([\d\s.,]+)', 0.85),
        # Standalone percentage line near TVA
        (r'(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.75),
    ]

    # ------------------------------------------------------------------
    # Item count. OCR may produce OZ for POZ, and the number may sit on
    # the line before or after the label. The patterns must stay specific
    # so that product quantities such as "50BUC" are not picked up.
    # ------------------------------------------------------------------
    ITEMS_COUNT_PATTERNS = [
        # "NR. POZ. ART. IN BON: 17" (dots and spaces; OZ tolerated for POZ)
        (r'NR\.?\s*P?[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*(\d+)', 0.98),
        # Number on the line BEFORE "OZ. ART. IN BON:" (OCR reordering)
        (r'(\d{1,2})\s*\n\s*[O0]Z\.?\s*ART', 0.95),
        # Number possibly on the line after the label
        (r'[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.93),
        (r'NR\.?\s*(?:P?[O0]Z\.?)?\s*ART(?:ICOLE)?\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.90),
        # Simpler, but still specific
        (r'ARTIC[O0]LE\s*:?\s*(\d+)', 0.88),
        # POZ at line start or after a colon (not inside product descriptions)
        (r'(?:^|\s|:)P?[O0]Z\.?\s*(?:ART)?\.?\s*(?:IN\s+BON)?\s*:?\s*(\d{1,3})(?:\s|$)', 0.85),
    ]

    # ------------------------------------------------------------------
    # Address (Romanian format).
    # ------------------------------------------------------------------
    ADDRESS_PATTERNS = [
        # Street
        (r'(STR\.?\s+[A-Z0-9\s.,]+(?:NR\.?\s*\d+)?)', 0.90),
        # Address containing JUD (county)
        (r'(JUD\.?\s+[A-Z]+,?\s*(?:MUN\.?|OR\.?|COM\.?)?\s*[A-Z]+)', 0.85),
    ]

    # ------------------------------------------------------------------
    # Vendor-name indicators: a line containing one of these is likely the
    # vendor name. They are company-type suffixes rather than generic
    # words, and tolerate OCR spacing ("S. R. L." as well as "S.R.L.").
    # ------------------------------------------------------------------
    VENDOR_INDICATORS = [
        r'\bS\.?\s*R\.?\s*L\.?\b',  # S.R.L. or S. R. L.
        r'\bS\.?\s*A\.?\b',  # S.A. or S. A.
        r'\bS\.?\s*N\.?\s*C\.?\b',  # S.N.C. or S. N. C.
        r'\bS\.?\s*C\.?\s*S\.?\b',  # S.C.S. or S. C. S.
        r'\bI\.?\s*I\.?\b',  # I.I. or I. I.
        r'\bP\.?\s*F\.?\s*A\.?\b',  # P.F.A. or P. F. A.
        # S.C. alone is too short and generic - only when a name follows
        r'\bS\.?\s*C\.?\s+[A-Z]',  # S.C. followed by company name
        r'HOLDING',
        r'COMPANY',
        r'GROUP',
        # MAGAZIN, MARKET, SHOP are intentionally absent: too generic,
        # they match store welcome messages.
    ]
|
|
|
|
def extract(self, text: str) -> ExtractionResult:
    """Run every field extractor over one receipt's OCR text.

    Args:
        text: Raw OCR output. Most extractors receive an upper-cased
            copy; the vendor extractor gets the original casing.

    Returns:
        An ExtractionResult with every field that could be recovered,
        plus confidence scores for amount, date and vendor.
    """
    upper = text.upper()
    result = ExtractionResult()
    result.raw_text = text

    # Core fields. Only amount, date and vendor keep their confidence.
    amount, amount_confidence = self._extract_amount(upper)
    result.amount = amount
    result.confidence_amount = amount_confidence

    receipt_date, date_confidence = self._extract_date(upper)
    result.receipt_date = receipt_date
    result.confidence_date = date_confidence

    result.receipt_number = self._extract_number(upper)[0]
    result.receipt_series = self._extract_series(upper)[0]

    vendor, vendor_confidence = self._extract_vendor(text)
    result.partner_name = vendor
    result.confidence_vendor = vendor_confidence

    result.cui = self._extract_cui(upper, text)[0]

    # Additional fields; a receipt may carry several TVA entries.
    entries, total = self._extract_tva_entries(upper)
    result.tva_entries = entries
    result.tva_total = total

    if not entries:
        # Debug aid: report whether the "TOTAL TAXE" marker is present at
        # all, and which amount precedes it in the reversed layout.
        print("[TVA Debug] No TVA found. Checking patterns...", flush=True)
        normalized = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', upper)
        taxe_match = re.search(r'T?OTAL\s+TAXE', normalized, re.IGNORECASE)
        rev_match = re.search(r'([\d.,]+)\s*T?OTAL\s+TAXE', normalized, re.IGNORECASE)
        print(f"[TVA Debug] 'OTAL TAXE' found: {bool(taxe_match)}, reversed: {rev_match.group(1) if rev_match else None}", flush=True)

    result.items_count = self._extract_items_count(upper)
    result.address = self._extract_address(upper)

    # Receipt type detection
    result.receipt_type = self._detect_receipt_type(upper)

    return result
|
|
|
|
def _extract_amount(self, text: str) -> Tuple[Optional[Decimal], float]:
|
|
"""Extract total amount from text."""
|
|
# First try standard patterns (TOTAL, SUBTOTAL, etc.)
|
|
for pattern, confidence in self.TOTAL_PATTERNS:
|
|
match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
|
|
if match:
|
|
try:
|
|
amount_str = re.sub(r'[^\d.,]', '', match.group(1))
|
|
amount_str = self._normalize_number(amount_str)
|
|
amount = Decimal(amount_str)
|
|
if amount > 0:
|
|
return amount, confidence
|
|
except (InvalidOperation, ValueError):
|
|
continue
|
|
|
|
# Strategy 2: Find amounts AFTER product lines end
|
|
# Products have pattern: "X BUC/ROLA X price = price"
|
|
# Total appears after all products
|
|
product_pattern = r'\d\s+(?:BUC|ROLA|ROLN|ROL)\s+X'
|
|
product_matches = list(re.finditer(product_pattern, text, re.IGNORECASE))
|
|
if product_matches:
|
|
# Get text after the last product line
|
|
last_product_pos = product_matches[-1].end()
|
|
after_products = text[last_product_pos:]
|
|
|
|
# Find standalone amounts on their own line after products
|
|
line_amount_pattern = r'^[\s]*(\d{2,4}[.,]\s*\d{2})[\s]*$'
|
|
standalone_amounts = []
|
|
for match in re.finditer(line_amount_pattern, after_products, re.MULTILINE):
|
|
try:
|
|
amount_str = match.group(1).replace(' ', '')
|
|
amount_str = self._normalize_number(amount_str)
|
|
amount = Decimal(amount_str)
|
|
if amount > 10: # Filter out small values
|
|
standalone_amounts.append(amount)
|
|
except (InvalidOperation, ValueError):
|
|
continue
|
|
|
|
if standalone_amounts:
|
|
# The largest standalone amount after products is likely the total
|
|
max_amount = max(standalone_amounts)
|
|
# Higher confidence if it appears multiple times
|
|
count = standalone_amounts.count(max_amount)
|
|
confidence = 0.85 if count >= 2 else 0.75
|
|
return max_amount, confidence
|
|
|
|
# Strategy 3: Find the most repeated large amount
|
|
# Normalize spaces in numbers (OCR may produce "186. 16")
|
|
normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text)
|
|
amount_pattern = r'(\d{2,4}[.,]\d{2})\b'
|
|
amounts = re.findall(amount_pattern, normalized_text)
|
|
if amounts:
|
|
from collections import Counter
|
|
amount_counts = Counter(amounts)
|
|
# Filter amounts that appear 2+ times and are > 20
|
|
candidates = []
|
|
for amt_str, count in amount_counts.items():
|
|
try:
|
|
amt = Decimal(self._normalize_number(amt_str))
|
|
if count >= 2 and amt > 20:
|
|
candidates.append((amt, count))
|
|
except (InvalidOperation, ValueError):
|
|
continue
|
|
|
|
if candidates:
|
|
# Return the LARGEST amount that appears multiple times
|
|
candidates.sort(key=lambda x: x[0], reverse=True)
|
|
return candidates[0][0], 0.65
|
|
|
|
# Last resort: Find any standalone large amount
|
|
line_amount_pattern = r'^[\s]*(\d{2,4}[.,]\s*\d{2})[\s]*$'
|
|
for match in re.finditer(line_amount_pattern, text, re.MULTILINE):
|
|
try:
|
|
amount_str = match.group(1).replace(' ', '')
|
|
amount_str = self._normalize_number(amount_str)
|
|
amount = Decimal(amount_str)
|
|
if amount > 50: # Higher threshold for fallback
|
|
return amount, 0.50
|
|
except (InvalidOperation, ValueError):
|
|
continue
|
|
|
|
return None, 0.0
|
|
|
|
def _normalize_number(self, num_str: str) -> str:
|
|
"""Normalize Romanian number format to standard decimal."""
|
|
# Remove spaces
|
|
num_str = num_str.replace(' ', '')
|
|
|
|
# Handle comma as decimal separator
|
|
if ',' in num_str and '.' in num_str:
|
|
# Romanian format: 1.234,56
|
|
num_str = num_str.replace('.', '').replace(',', '.')
|
|
elif ',' in num_str:
|
|
# Could be 1,50 or 1,234
|
|
parts = num_str.split(',')
|
|
if len(parts) == 2 and len(parts[1]) <= 2:
|
|
# Decimal comma: 1,50
|
|
num_str = num_str.replace(',', '.')
|
|
else:
|
|
# Thousands comma: 1,234
|
|
num_str = num_str.replace(',', '')
|
|
elif '.' in num_str:
|
|
parts = num_str.split('.')
|
|
if len(parts) > 2:
|
|
# Multiple dots: 1.234.567 -> 1234567
|
|
num_str = ''.join(parts[:-1]) + '.' + parts[-1]
|
|
|
|
return num_str
|
|
|
|
def _extract_date(self, text: str) -> Tuple[Optional[date], float]:
|
|
"""Extract receipt date from text."""
|
|
# First try standard patterns (clean dates)
|
|
for pattern, confidence in self.DATE_PATTERNS:
|
|
match = re.search(pattern, text)
|
|
if match:
|
|
try:
|
|
# Normalize separators to dots
|
|
date_str = match.group(1).replace('/', '.').replace('-', '.')
|
|
|
|
# Try DD.MM.YYYY format first
|
|
try:
|
|
parsed = datetime.strptime(date_str, '%d.%m.%Y').date()
|
|
except ValueError:
|
|
# Try YYYY.MM.DD format
|
|
parsed = datetime.strptime(date_str, '%Y.%m.%d').date()
|
|
|
|
# Validate date range
|
|
today = date.today()
|
|
if parsed <= today and parsed.year >= 2020:
|
|
return parsed, confidence
|
|
except ValueError:
|
|
continue
|
|
|
|
# Then try OCR-corrupted patterns (dates with spaces/commas)
|
|
# Handles: "27. 10, 2025", "27, 10. 2025", "2025. 08. 14", etc.
|
|
for pattern, confidence, fmt in self.DATE_PATTERNS_OCR_SPACES:
|
|
match = re.search(pattern, text)
|
|
if match:
|
|
try:
|
|
if fmt == 'ymd':
|
|
# YYYY. MM. DD format (OMV/Petrom)
|
|
year = match.group(1)
|
|
month = match.group(2)
|
|
day = match.group(3)
|
|
else:
|
|
# DD. MM. YYYY format (default)
|
|
day = match.group(1)
|
|
month = match.group(2)
|
|
year = match.group(3)
|
|
|
|
date_str = f"{day}.{month}.{year}"
|
|
parsed = datetime.strptime(date_str, '%d.%m.%Y').date()
|
|
|
|
# Validate date range
|
|
today = date.today()
|
|
if parsed <= today and parsed.year >= 2020:
|
|
return parsed, confidence
|
|
except ValueError:
|
|
continue
|
|
|
|
return None, 0.0
|
|
|
|
def _extract_number(self, text: str) -> Tuple[Optional[str], float]:
|
|
"""Extract receipt number from text."""
|
|
for pattern, confidence in self.NUMBER_PATTERNS:
|
|
match = re.search(pattern, text, re.IGNORECASE)
|
|
if match:
|
|
return match.group(1), confidence
|
|
return None, 0.0
|
|
|
|
def _extract_series(self, text: str) -> Tuple[Optional[str], float]:
|
|
"""Extract receipt series from text."""
|
|
for pattern, confidence in self.SERIES_PATTERNS:
|
|
match = re.search(pattern, text, re.IGNORECASE)
|
|
if match:
|
|
return match.group(1).upper(), confidence
|
|
return None, 0.0
|
|
|
|
def _extract_vendor(self, text: str) -> Tuple[Optional[str], float]:
|
|
"""
|
|
Extract vendor/partner name from text.
|
|
Uses multiple strategies:
|
|
1. Look for lines with company type indicators (S.R.L., S.A., etc.)
|
|
2. Look for company name + SRL on separate lines
|
|
3. Look for lines near CIF
|
|
4. Use first valid line as fallback
|
|
"""
|
|
lines = text.split('\n')
|
|
skip_keywords = [
|
|
'BON', 'FISCAL', 'TOTAL', 'DATA', 'NR', 'ORA',
|
|
'SUBTOTAL', 'TVA', 'PLATA', 'CARD', 'NUMERAR',
|
|
'RON', 'LEI', 'CHITANTA', 'REST', 'CLIENT',
|
|
'OPERATOR', 'CASIER', 'POS', 'AMEF', 'BINE ATI VENIT',
|
|
'VA RUGAM', 'PASTRATI', 'VOCEA', 'TIPARIT',
|
|
'DETERGENT', 'PROSOP', 'HARTIE', 'SACI', 'SPRAY',
|
|
'BUC', 'ROLA', 'CUMPARATOR', 'MAGAZIN', 'BRICK',
|
|
'NIVS', 'BENZINA', 'PETROM', 'OMV'
|
|
]
|
|
|
|
# Strategy 0: Look for company name followed by SRL/SA on next line
|
|
# Pattern: "COMPANY NAME\nSRL" or "COMPANY NAME\nS.R.L."
|
|
for i, line in enumerate(lines[:15]):
|
|
line = line.strip()
|
|
if not line or len(line) < 3:
|
|
continue
|
|
|
|
line_upper = line.upper()
|
|
|
|
# Skip lines with skip keywords
|
|
if any(kw in line_upper for kw in skip_keywords):
|
|
continue
|
|
|
|
# Check if next line is standalone SRL, S.R.L., SA, S.A., etc.
|
|
if i + 1 < len(lines):
|
|
next_line = lines[i + 1].strip().upper()
|
|
# Match standalone company type suffix
|
|
if re.match(r'^S\.?\s*R\.?\s*L\.?$', next_line) or \
|
|
re.match(r'^S\.?\s*A\.?$', next_line) or \
|
|
re.match(r'^S\.?\s*N\.?\s*C\.?$', next_line) or \
|
|
re.match(r'^P\.?\s*F\.?\s*A\.?$', next_line) or \
|
|
re.match(r'^I\.?\s*I\.?$', next_line):
|
|
# Combine: "COMPANY NAME" + " " + "SRL"
|
|
vendor = self._clean_vendor_name(f"{line} {next_line}")
|
|
if vendor and len(vendor) >= 5:
|
|
return vendor, 0.95
|
|
|
|
# Strategy 1: Look for lines with vendor indicators (S.R.L., S.A., HOLDING, etc.)
|
|
for i, line in enumerate(lines[:15]): # Check first 15 lines
|
|
line = line.strip()
|
|
if not line or len(line) < 3:
|
|
continue
|
|
|
|
line_upper = line.upper()
|
|
|
|
# Check for vendor indicators
|
|
for indicator in self.VENDOR_INDICATORS:
|
|
if re.search(indicator, line_upper):
|
|
# Found a company name indicator
|
|
vendor = self._clean_vendor_name(line)
|
|
if vendor and len(vendor) >= 3:
|
|
# High confidence for lines with company indicators
|
|
return vendor, 0.95
|
|
|
|
# Strategy 2: Look for lines right before or after CIF
|
|
for i, line in enumerate(lines[:15]):
|
|
line_upper = line.upper()
|
|
if 'CIF' in line_upper and 'CLIENT' not in line_upper:
|
|
# Check line before
|
|
if i > 0:
|
|
prev_line = lines[i-1].strip()
|
|
if prev_line and len(prev_line) >= 3:
|
|
if not any(kw in prev_line.upper() for kw in skip_keywords):
|
|
vendor = self._clean_vendor_name(prev_line)
|
|
if vendor:
|
|
return vendor, 0.85
|
|
|
|
# Strategy 3: First valid line as fallback
|
|
for i, line in enumerate(lines[:10]):
|
|
line = line.strip()
|
|
|
|
# Skip empty lines
|
|
if not line or len(line) < 3:
|
|
continue
|
|
|
|
# Skip lines that are just numbers or codes
|
|
if re.match(r'^[\d.,\s:]+$', line):
|
|
continue
|
|
|
|
# Skip lines with barcodes/product codes
|
|
if re.match(r'^[A-Z]*\d{6,}', line):
|
|
continue
|
|
|
|
# Skip lines with keywords
|
|
if any(kw in line.upper() for kw in skip_keywords):
|
|
continue
|
|
|
|
# Clean the line
|
|
vendor = self._clean_vendor_name(line)
|
|
|
|
if vendor and len(vendor) >= 3:
|
|
# Confidence decreases for lines further down
|
|
confidence = max(0.3, 0.7 - (i * 0.05))
|
|
return vendor, confidence
|
|
|
|
return None, 0.0
|
|
|
|
def _clean_vendor_name(self, name: str) -> Optional[str]:
|
|
"""Clean and normalize vendor name."""
|
|
if not name:
|
|
return None
|
|
|
|
# Remove common OCR artifacts
|
|
name = re.sub(r'[^\w\s.,&\-()]', ' ', name)
|
|
# Normalize whitespace
|
|
name = re.sub(r'\s+', ' ', name).strip()
|
|
|
|
# Skip if it looks like an address line only
|
|
if re.match(r'^(STR|JUD|MUN|NR|BL|SC|ET|AP)\.?\s', name.upper()):
|
|
return None
|
|
|
|
# Skip if too short after cleaning
|
|
if len(name) < 3:
|
|
return None
|
|
|
|
return name
|
|
|
|
def _extract_cui(self, text_upper: str, original_text: str) -> Tuple[Optional[str], float]:
|
|
"""
|
|
Extract vendor CUI (fiscal identification code) from text.
|
|
Excludes CLIENT CUI which appears as 'CLIENT C.U.I./C.I.F.:...'
|
|
"""
|
|
# Strategy 0: Check for reversed format (CIF NUMBER on line BEFORE "C.I.F." label)
|
|
# This is common in some receipts: "R011201891\nC. I. F."
|
|
for pattern, confidence in self.CUI_REVERSED_PATTERNS:
|
|
match = re.search(pattern, text_upper, re.IGNORECASE | re.MULTILINE)
|
|
if match:
|
|
cui = match.group(1)
|
|
if 6 <= len(cui) <= 10:
|
|
# Verify this is not the CLIENT CUI by checking context
|
|
start = match.start()
|
|
# Check 50 chars before the match for CLIENT keyword
|
|
context_start = max(0, start - 50)
|
|
context = text_upper[context_start:start]
|
|
if 'CLIENT' not in context and 'LIENT' not in context:
|
|
return cui, confidence
|
|
|
|
# Strategy 1: Try to find CIF on a line that doesn't contain CLIENT
|
|
lines = text_upper.split('\n')
|
|
for line in lines:
|
|
# Skip lines that contain CLIENT (these are buyer's CUI, not vendor's)
|
|
if 'CLIENT' in line or 'CUMPARATOR' in line or 'LIENT' in line:
|
|
continue
|
|
|
|
# Look for CIF in this line
|
|
for pattern, confidence in self.CUI_PATTERNS:
|
|
match = re.search(pattern, line, re.IGNORECASE | re.MULTILINE)
|
|
if match:
|
|
cui = match.group(1)
|
|
if 6 <= len(cui) <= 10:
|
|
return cui, confidence
|
|
|
|
# Strategy 2: Fallback - search entire text but exclude CLIENT patterns
|
|
for pattern, confidence in self.CUI_PATTERNS:
|
|
# Find all matches
|
|
for match in re.finditer(pattern, text_upper, re.IGNORECASE | re.MULTILINE):
|
|
cui = match.group(1)
|
|
if 6 <= len(cui) <= 10:
|
|
# Check if this match is preceded by CLIENT in the same line
|
|
start = match.start()
|
|
line_start = text_upper.rfind('\n', 0, start) + 1
|
|
line_text = text_upper[line_start:start]
|
|
if 'CLIENT' not in line_text and 'LIENT' not in line_text:
|
|
return cui, confidence
|
|
|
|
return None, 0.0
|
|
|
|
def _detect_receipt_type(self, text: str) -> str:
|
|
"""Detect receipt type from text content."""
|
|
if 'CHITANTA' in text or 'CHITANȚĂ' in text:
|
|
return 'chitanta'
|
|
return 'bon_fiscal'
|
|
|
|
def _extract_tva_entries(self, text: str) -> Tuple[List[dict], Optional[Decimal]]:
|
|
"""
|
|
Extract multiple TVA (VAT) entries from text.
|
|
Romanian receipts can have multiple TVA rates (A=19%, B=9%, C=5%, D=0%).
|
|
|
|
Returns (tva_entries, tva_total) where tva_entries is a list of:
|
|
{'code': 'A', 'percent': 19, 'amount': Decimal('15.20')}
|
|
"""
|
|
tva_entries = []
|
|
seen_entries = set() # To avoid duplicates
|
|
|
|
# Check for non-VAT payer (NEPLATITOR DE TVA) - TVA = 0
|
|
# OCR variants: NEPLATTOR, NEPLATITOR, NEPLATOR, NEPLATTOR, ANEPLATHTOR, MEPLATITOR, etc.
|
|
# Also handles: "TOTAL NEPLATITOR TVA", "(NEPLATITOR DE TVA)"
|
|
non_vat_patterns = [
|
|
# Main pattern - flexible for OCR errors: NEPLAT + any chars + OR/R
|
|
r'NEPLAT\w*OR', # NEPLATITOR, NEPLATTOR, NEPLATOR
|
|
r'[ANM]EPLAT\w*O?R', # OCR errors: ANEPLATHTOR, MEPLATITOR
|
|
r'TOTAL\s+NEPLAT', # TOTAL NEPLATITOR...
|
|
r'TOTAL\s+[ANM]EPLAT', # TOTAL ANEPLAT... (OCR error)
|
|
r'SCUTIT\s*(?:DE\s+)?T[VU]A', # SCUTIT DE TVA
|
|
r'NEPLAT\w*\s+T[VU]A', # NEPLATITOR TVA
|
|
r'NEPLAT\w*\s+DE\s+T', # NEPLATITOR DE T... (truncated)
|
|
]
|
|
for pattern in non_vat_patterns:
|
|
if re.search(pattern, text, re.IGNORECASE):
|
|
# Non-VAT payer - return TVA = 0
|
|
return [{'code': 'D', 'percent': 0, 'amount': Decimal('0.00')}], Decimal('0.00')
|
|
|
|
# Normalize spaces in numbers first (OCR may produce "32. 31" or "49, 58")
|
|
normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text)
|
|
# Also normalize comma followed by space to comma (for "21, 00%" -> "21,00%")
|
|
normalized_text = re.sub(r'(\d+),\s+(\d{2})\s*%', r'\1.\2%', normalized_text)
|
|
|
|
# Pattern 0a: First try to get TVA from "TOTAL TAXE:" which is most reliable
|
|
# Format: "TOTAL TAXE: 55,22" - this is always the TVA amount
|
|
# OCR may cut "T" producing "OTAL TAXE:" instead of "TOTAL TAXE:"
|
|
# OCR may also put amount BEFORE "OTAL TAXE": "55,22OTAL TAXE:"
|
|
total_taxe_pattern = r'T?OTAL\s+TAXE\s*:?\s*([\d\s.,]+)'
|
|
taxe_match = re.search(total_taxe_pattern, normalized_text, re.IGNORECASE)
|
|
|
|
# Also try pattern where amount comes BEFORE "OTAL TAXE" (OCR line break issue)
|
|
if not taxe_match:
|
|
reversed_taxe_pattern = r'([\d.,]+)\s*T?OTAL\s+TAXE'
|
|
taxe_match = re.search(reversed_taxe_pattern, normalized_text, re.IGNORECASE)
|
|
|
|
if taxe_match:
|
|
# Also need to find the TVA rate from the table
|
|
# Pattern handles: "A-21%", "-21,00%", "21%" etc.
|
|
rate_pattern = r'([A-D])?\s*[-:]?\s*(\d{1,2})[.,]?\s*\d{0,2}\s*%'
|
|
rate_match = re.search(rate_pattern, normalized_text, re.IGNORECASE)
|
|
if rate_match:
|
|
try:
|
|
code = rate_match.group(1).upper() if rate_match.group(1) else 'A' # Default to A if missing
|
|
percent = int(rate_match.group(2))
|
|
amount_str = taxe_match.group(1).replace(' ', '')
|
|
amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
|
|
amount = Decimal(amount_str)
|
|
if amount > 0:
|
|
entry_key = (code, percent)
|
|
if entry_key not in seen_entries:
|
|
tva_entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
seen_entries.add(entry_key)
|
|
except (ValueError, InvalidOperation):
|
|
pass
|
|
|
|
# Pattern 0b: Table format "A-21,00% 285,66 49,58" (code-percent base tva_amount)
|
|
# This format appears after a TVA header line like "TVA TOTAL VALDARE"
|
|
# The TVA amount position depends on header: VALDARE last = TVA last, VALOARE middle = TVA middle
|
|
if not tva_entries:
|
|
table_pattern = r'([A-D])\s*[-:]\s*(\d{1,2})[.,]\s*\d{2}\s*%\s*([\d\s.,]+)\s+([\d\s.,]+)'
|
|
for match in re.finditer(table_pattern, normalized_text, re.IGNORECASE):
|
|
try:
|
|
code = match.group(1).upper()
|
|
percent = int(match.group(2))
|
|
amount1_str = match.group(3).replace(' ', '')
|
|
amount2_str = match.group(4).replace(' ', '')
|
|
amount1 = Decimal(self._normalize_number(re.sub(r'[^\d.,]', '', amount1_str)))
|
|
amount2 = Decimal(self._normalize_number(re.sub(r'[^\d.,]', '', amount2_str)))
|
|
|
|
# Determine which is TVA: the smaller amount is usually TVA
|
|
# (TVA is a fraction of the total, so it's always smaller)
|
|
tva_amount = min(amount1, amount2)
|
|
|
|
if tva_amount > 0:
|
|
entry_key = (code, percent)
|
|
if entry_key not in seen_entries:
|
|
tva_entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': tva_amount
|
|
})
|
|
seen_entries.add(entry_key)
|
|
except (ValueError, InvalidOperation):
|
|
continue
|
|
|
|
# Pattern 1: "TVA A - 19%: 15.20" or "TVAA - 21% 32.31" (with code)
|
|
# OCR tolerant: TUA, TVR, etc.
|
|
pattern_with_code = r'T[VU][AR]\s*([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)'
|
|
for match in re.finditer(pattern_with_code, normalized_text, re.IGNORECASE):
|
|
try:
|
|
code = match.group(1).upper()
|
|
percent = int(match.group(2))
|
|
amount_str = match.group(3).replace(' ', '')
|
|
amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
|
|
amount = Decimal(amount_str)
|
|
if amount > 0:
|
|
entry_key = (code, percent)
|
|
if entry_key not in seen_entries:
|
|
tva_entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
seen_entries.add(entry_key)
|
|
except (ValueError, InvalidOperation):
|
|
continue
|
|
|
|
# Pattern 2: "TVA - 21%: 32.31" (without explicit code, assume 'A')
|
|
if not tva_entries:
|
|
pattern_no_code = r'T[VU][AR]\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)'
|
|
for match in re.finditer(pattern_no_code, normalized_text, re.IGNORECASE):
|
|
try:
|
|
percent = int(match.group(1))
|
|
amount_str = match.group(2).replace(' ', '')
|
|
amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
|
|
amount = Decimal(amount_str)
|
|
if amount > 0:
|
|
# Determine code based on percent
|
|
code = self._get_tva_code_from_percent(percent)
|
|
entry_key = (code, percent)
|
|
if entry_key not in seen_entries:
|
|
tva_entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
seen_entries.add(entry_key)
|
|
except (ValueError, InvalidOperation):
|
|
continue
|
|
|
|
# Pattern 3: "TOTAL TVA A - 21%" with amount on same line or "TOTAL TVA BON" with amount
|
|
if not tva_entries:
|
|
# First try: "TOTAL TVA A - 21% 32.31" (amount on same line)
|
|
tva_with_amount = r'TOTAL\s+T[VU][AR]\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*([\d.,]+)'
|
|
for match in re.finditer(tva_with_amount, normalized_text, re.IGNORECASE):
|
|
try:
|
|
code = match.group(1).upper()
|
|
percent = int(match.group(2))
|
|
amount_str = self._normalize_number(match.group(3))
|
|
amount = Decimal(amount_str)
|
|
if amount > 0:
|
|
entry_key = (code, percent)
|
|
if entry_key not in seen_entries:
|
|
tva_entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
seen_entries.add(entry_key)
|
|
except (ValueError, InvalidOperation):
|
|
continue
|
|
|
|
# Pattern 3b: "TOTAL TVA A - 21%" on one line, look for "TOTAL TVA BON" amount
|
|
if not tva_entries:
|
|
tva_total_pattern = r'TOTAL\s+T[VU][AR]\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%'
|
|
for match in re.finditer(tva_total_pattern, normalized_text, re.IGNORECASE):
|
|
try:
|
|
code = match.group(1).upper()
|
|
percent = int(match.group(2))
|
|
|
|
# Look for "TOTAL TVA BON" followed by amount
|
|
tva_bon_pattern = r'TOTAL\s+T[VU][AR]\s+BON[:\s]*([\d.,]+)'
|
|
tva_bon_match = re.search(tva_bon_pattern, normalized_text, re.IGNORECASE)
|
|
if tva_bon_match:
|
|
amount_str = self._normalize_number(tva_bon_match.group(1))
|
|
amount = Decimal(amount_str)
|
|
if amount > 0:
|
|
entry_key = (code, percent)
|
|
if entry_key not in seen_entries:
|
|
tva_entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
seen_entries.add(entry_key)
|
|
continue
|
|
|
|
# Fallback: Amount after TOTAL TVA BON on next line
|
|
tva_bon_pos = re.search(r'TOTAL\s+T[VU][AR]\s+BON', normalized_text, re.IGNORECASE)
|
|
if tva_bon_pos:
|
|
after_bon = normalized_text[tva_bon_pos.end():]
|
|
# Find first standalone number (likely TVA amount)
|
|
amount_match = re.search(r'[\s\n]*([\d]+[.,]\d{2})\s*\n', after_bon)
|
|
if amount_match:
|
|
amount_str = self._normalize_number(amount_match.group(1))
|
|
amount = Decimal(amount_str)
|
|
if amount > 0:
|
|
entry_key = (code, percent)
|
|
if entry_key not in seen_entries:
|
|
tva_entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
seen_entries.add(entry_key)
|
|
except (ValueError, InvalidOperation):
|
|
continue
|
|
|
|
# Pattern 3b: "TVAA - 21%" on one line, amount on next line (simpler format)
|
|
if not tva_entries:
|
|
tva_line_pattern = r'T[VU][AR]\s*([A-D])?\s*[-:]\s*(\d{1,2})\s*%'
|
|
for match in re.finditer(tva_line_pattern, normalized_text, re.IGNORECASE):
|
|
try:
|
|
code = (match.group(1) or 'A').upper()
|
|
percent = int(match.group(2))
|
|
|
|
# Look for amount on the next line or immediately after
|
|
after_tva = normalized_text[match.end():]
|
|
amount_match = re.search(r'^[\s\n]*([\d.,]+)', after_tva)
|
|
if amount_match:
|
|
amount_str = self._normalize_number(amount_match.group(1))
|
|
amount = Decimal(amount_str)
|
|
if amount > 0:
|
|
entry_key = (code, percent)
|
|
if entry_key not in seen_entries:
|
|
tva_entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
seen_entries.add(entry_key)
|
|
except (ValueError, InvalidOperation):
|
|
continue
|
|
|
|
# Pattern 4: Use TVA_PATTERNS for fallback
|
|
if not tva_entries:
|
|
for pattern, _ in self.TVA_PATTERNS:
|
|
match = re.search(pattern, normalized_text, re.IGNORECASE)
|
|
if match:
|
|
try:
|
|
# Some patterns have 2 groups (percent, amount), others just amount
|
|
if match.lastindex >= 2:
|
|
percent = int(match.group(1))
|
|
amount_str = match.group(2)
|
|
else:
|
|
amount_str = match.group(1)
|
|
# Try to detect percent from text
|
|
percent = self._detect_tva_percent(text)
|
|
|
|
amount_str = amount_str.replace(' ', '')
|
|
amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
|
|
amount = Decimal(amount_str)
|
|
if amount > 0 and percent:
|
|
code = self._get_tva_code_from_percent(percent)
|
|
entry_key = (code, percent)
|
|
if entry_key not in seen_entries:
|
|
tva_entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
seen_entries.add(entry_key)
|
|
break # Only use first match from fallback
|
|
except (ValueError, InvalidOperation):
|
|
continue
|
|
|
|
# Calculate total
|
|
tva_total = None
|
|
if tva_entries:
|
|
tva_total = sum(entry['amount'] for entry in tva_entries)
|
|
|
|
# Sort by code (A, B, C, D)
|
|
tva_entries.sort(key=lambda x: x.get('code', 'Z'))
|
|
|
|
return tva_entries, tva_total
|
|
|
|
def _get_tva_code_from_percent(self, percent: int) -> str:
|
|
"""Map TVA percentage to standard Romanian code.
|
|
|
|
Romanian TVA rates changed in August 2025:
|
|
- Standard rate: 19% → 21%
|
|
- Reduced rate: 9% → 11%
|
|
- Other rates (5%, 0%) remain unchanged
|
|
|
|
Old rates (before Aug 2025): New rates (from Aug 2025):
|
|
- A = 19% (standard) - A = 21% (standard)
|
|
- B = 9% (reduced) - B = 11% (reduced)
|
|
- C = 5% (reduced) - C = 5% (reduced)
|
|
- D = 0% (exempt) - D = 0% (exempt)
|
|
|
|
Both old and new rates are supported for historical receipts.
|
|
"""
|
|
if percent in (19, 21):
|
|
return 'A' # Standard rate (19% old, 21% new from Aug 2025)
|
|
elif percent in (9, 11):
|
|
return 'B' # Reduced rate (9% old, 11% new from Aug 2025)
|
|
elif percent == 5:
|
|
return 'C' # Reduced rate (unchanged)
|
|
elif percent == 0:
|
|
return 'D' # Exempt (unchanged)
|
|
else:
|
|
return 'A' # Default to standard rate
|
|
|
|
def _detect_tva_percent(self, text: str) -> Optional[int]:
|
|
"""Detect TVA percentage from text content."""
|
|
# Look for common Romanian TVA percentages
|
|
if '19%' in text or '19 %' in text:
|
|
return 19
|
|
elif '21%' in text or '21 %' in text:
|
|
return 21
|
|
elif '11%' in text or '11 %' in text:
|
|
return 11
|
|
elif '9%' in text or '9 %' in text:
|
|
return 9
|
|
elif '5%' in text or '5 %' in text:
|
|
return 5
|
|
return None
|
|
|
|
def _extract_items_count(self, text: str) -> Optional[int]:
|
|
"""Extract number of items/articles from receipt."""
|
|
for pattern, _ in self.ITEMS_COUNT_PATTERNS:
|
|
match = re.search(pattern, text, re.IGNORECASE)
|
|
if match:
|
|
try:
|
|
count = int(match.group(1))
|
|
if 0 < count < 1000: # Reasonable range
|
|
return count
|
|
except ValueError:
|
|
continue
|
|
return None
|
|
|
|
def _extract_address(self, text: str) -> Optional[str]:
|
|
"""Extract vendor address from text."""
|
|
lines = text.split('\n')
|
|
address_parts = []
|
|
|
|
for line in lines[:15]: # Check first 15 lines
|
|
line = line.strip()
|
|
if not line:
|
|
continue
|
|
|
|
# Check for address patterns
|
|
line_upper = line.upper()
|
|
|
|
# JUD. (county) pattern
|
|
if re.search(r'\bJUD\.?\s+', line_upper):
|
|
address_parts.append(line)
|
|
continue
|
|
|
|
# STR. (street) pattern
|
|
if re.search(r'\bSTR\.?\s+', line_upper):
|
|
address_parts.append(line)
|
|
continue
|
|
|
|
# MUN./OR./COM. (city/town) pattern
|
|
if re.search(r'\b(MUN|OR|COM)\.?\s+', line_upper):
|
|
address_parts.append(line)
|
|
continue
|
|
|
|
if address_parts:
|
|
# Join and clean address parts
|
|
address = ', '.join(address_parts)
|
|
# Clean up
|
|
address = re.sub(r'\s+', ' ', address).strip()
|
|
address = re.sub(r',\s*,', ',', address)
|
|
return address if len(address) >= 5 else None
|
|
|
|
return None
|