"""Extract structured fields from OCR text (Romanian receipts).""" import re from datetime import date, datetime from decimal import Decimal, InvalidOperation from typing import Optional, Tuple, List from dataclasses import dataclass, field @dataclass class ExtractionResult: """Structured extraction result from receipt.""" receipt_type: str = 'bon_fiscal' receipt_number: Optional[str] = None receipt_series: Optional[str] = None receipt_date: Optional[date] = None amount: Optional[Decimal] = None partner_name: Optional[str] = None cui: Optional[str] = None description: Optional[str] = None # Additional extracted fields - Multiple TVA entries support tva_entries: List[dict] = field(default_factory=list) # [{code, percent, amount}] tva_total: Optional[Decimal] = None address: Optional[str] = None items_count: Optional[int] = None confidence_amount: float = 0.0 confidence_date: float = 0.0 confidence_vendor: float = 0.0 raw_text: str = "" @property def overall_confidence(self) -> float: """Calculate weighted overall confidence score.""" weights = {'amount': 0.4, 'date': 0.3, 'vendor': 0.3} return round( self.confidence_amount * weights['amount'] + self.confidence_date * weights['date'] + self.confidence_vendor * weights['vendor'], 2 ) class ReceiptExtractor: """Extract receipt fields using pattern matching for Romanian receipts.""" # Total amount patterns (most specific first) # Romanian receipts use various formats: TOTAL LEI, TOTAL:, TOTAL RON, etc. # OCR often produces errors, so patterns must be tolerant TOTAL_PATTERNS = [ # Most common: TOTAL LEI followed by amount (r'TOTAL\s+LEI\s*([\d\s.,]+)', 0.98), (r'[OT]?OTAL\s+LEI\s*([\d\s.,]+)', 0.95), # OCR may miss first letter # Standard patterns (r'TOTAL\s*:?\s*([\d\s.,]+)\s*(?:RON|LEI)?', 0.95), (r'TOTAL\s+(?:RON|LEI)\s*([\d\s.,]+)', 0.95), # SUBTOTAL when TOTAL not found (r'SUBTOTAL\s*([\d\s.,]+)', 0.90), (r'[SB]?UBTOTAL\s*([\d\s.,]+)', 0.88), # OCR variations # Payment methods (r'DE\s+PLATA\s*:?\s*([\d\s.,]+)', 0.90), (r'SUMA\s*:?\s*([\d\s.,]+)', 0.85), (r'PLATA\s+CARD\s*:?\s*([\d\s.,]+)', 0.85), (r'NUMERAR\s*:?\s*([\d\s.,]+)', 0.80), (r'REST\s*:?\s*([\d\s.,]+)', 0.70), # Sometimes total is near REST ] # Fallback: Find the largest repeated amount (likely the total) # This handles cases where OCR doesn't capture "TOTAL" keyword # Date patterns - support dash, dot, and slash separators # OCR may produce DRTA instead of DATA, DAIA, etc. DATE_PATTERNS = [ # DATA/DRTA/DAIA: DD-MM-YYYY (OCR tolerant) (r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98), (r'DATA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98), # Date followed by ORA (time) - OCR may produce 0RA (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+[O0]RA\s*:?\s*\d{2}:\d{2}', 0.95), # Date followed by time without ORA keyword (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+\d{2}:\d{2}', 0.90), # Standalone date (r'(\d{2}[-./]\d{2}[-./]\d{4})', 0.80), # YYYY-MM-DD format (less common) (r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75), ] # Receipt number patterns - Romanian fiscal receipt formats # OCR may produce N instead of : or other errors NUMBER_PATTERNS = [ # NDS format (common in Romanian POS) (r'NDS\s*:?\s*(\d+)', 0.98), # C3POS terminal format - OCR may have N instead of : (C3POS-CT2N1360760) (r'C3POS[-A-Z0-9]*[N:](\d{6,7})', 0.98), # CT2N1360760 format (r'C3POS.*?(\d{6,7})\b', 0.95), # Any C3POS followed by 6-7 digit number (r'CT2[N:]\s*(\d{6,})', 0.95), # CT2N prefix # BF (Bon Fiscal) number (r'BF\s*:?\s*(\d+)', 0.93), # NIVS format (r'NIVS\s*:?\s*(\d+)', 0.95), # Standard NR BON formats (r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95), (r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95), (r'CHITANTA\s+NR\.?\s*:?\s*(\d+)', 0.95), # Document number (r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90), # ID BF format (r'ID\s*BF\s*:?\s*(\d+)', 0.90), # TD format (transaction ID) (r'TD\s*:?\s*(\d+)', 0.85), # 6-8 digit number (typical receipt number length) (r'\b(\d{6,8})\b', 0.70), # Generic long number at end (fallback) (r'NR\.?\s*:?\s*(\d{4,})', 0.65), ] # CUI (fiscal code) patterns - IMPORTANT: exclude CLIENT CUI # CIF = Cod de Identificare Fiscală (vendor's tax ID) # CLIENT C.U.I. = client's tax ID (should be ignored) # OCR errors: R0 instead of RO, C1F instead of CIF CUI_PATTERNS = [ # CIF at start of line (definitely vendor) - tolerant to OCR errors (r'^CIF\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98), (r'^C[I1]F\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95), # C1F OCR error # CIF not preceded by CLIENT (negative lookbehind) (r'(? ExtractionResult: """Extract all fields from OCR text.""" result = ExtractionResult() result.raw_text = text text_upper = text.upper() # Extract core fields result.amount, result.confidence_amount = self._extract_amount(text_upper) result.receipt_date, result.confidence_date = self._extract_date(text_upper) result.receipt_number, _ = self._extract_number(text_upper) result.receipt_series, _ = self._extract_series(text_upper) result.partner_name, result.confidence_vendor = self._extract_vendor(text) result.cui, _ = self._extract_cui(text_upper, text) # Extract additional fields - Multiple TVA entries result.tva_entries, result.tva_total = self._extract_tva_entries(text_upper) result.items_count = self._extract_items_count(text_upper) result.address = self._extract_address(text_upper) # Detect receipt type result.receipt_type = self._detect_receipt_type(text_upper) return result def _extract_amount(self, text: str) -> Tuple[Optional[Decimal], float]: """Extract total amount from text.""" # First try standard patterns (TOTAL, SUBTOTAL, etc.) for pattern, confidence in self.TOTAL_PATTERNS: match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE) if match: try: amount_str = re.sub(r'[^\d.,]', '', match.group(1)) amount_str = self._normalize_number(amount_str) amount = Decimal(amount_str) if amount > 0: return amount, confidence except (InvalidOperation, ValueError): continue # Strategy 2: Find amounts AFTER product lines end # Products have pattern: "X BUC/ROLA X price = price" # Total appears after all products product_pattern = r'\d\s+(?:BUC|ROLA|ROLN|ROL)\s+X' product_matches = list(re.finditer(product_pattern, text, re.IGNORECASE)) if product_matches: # Get text after the last product line last_product_pos = product_matches[-1].end() after_products = text[last_product_pos:] # Find standalone amounts on their own line after products line_amount_pattern = r'^[\s]*(\d{2,4}[.,]\s*\d{2})[\s]*$' standalone_amounts = [] for match in re.finditer(line_amount_pattern, after_products, re.MULTILINE): try: amount_str = match.group(1).replace(' ', '') amount_str = self._normalize_number(amount_str) amount = Decimal(amount_str) if amount > 10: # Filter out small values standalone_amounts.append(amount) except (InvalidOperation, ValueError): continue if standalone_amounts: # The largest standalone amount after products is likely the total max_amount = max(standalone_amounts) # Higher confidence if it appears multiple times count = standalone_amounts.count(max_amount) confidence = 0.85 if count >= 2 else 0.75 return max_amount, confidence # Strategy 3: Find the most repeated large amount # Normalize spaces in numbers (OCR may produce "186. 16") normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text) amount_pattern = r'(\d{2,4}[.,]\d{2})\b' amounts = re.findall(amount_pattern, normalized_text) if amounts: from collections import Counter amount_counts = Counter(amounts) # Filter amounts that appear 2+ times and are > 20 candidates = [] for amt_str, count in amount_counts.items(): try: amt = Decimal(self._normalize_number(amt_str)) if count >= 2 and amt > 20: candidates.append((amt, count)) except (InvalidOperation, ValueError): continue if candidates: # Return the LARGEST amount that appears multiple times candidates.sort(key=lambda x: x[0], reverse=True) return candidates[0][0], 0.65 # Last resort: Find any standalone large amount line_amount_pattern = r'^[\s]*(\d{2,4}[.,]\s*\d{2})[\s]*$' for match in re.finditer(line_amount_pattern, text, re.MULTILINE): try: amount_str = match.group(1).replace(' ', '') amount_str = self._normalize_number(amount_str) amount = Decimal(amount_str) if amount > 50: # Higher threshold for fallback return amount, 0.50 except (InvalidOperation, ValueError): continue return None, 0.0 def _normalize_number(self, num_str: str) -> str: """Normalize Romanian number format to standard decimal.""" # Remove spaces num_str = num_str.replace(' ', '') # Handle comma as decimal separator if ',' in num_str and '.' in num_str: # Romanian format: 1.234,56 num_str = num_str.replace('.', '').replace(',', '.') elif ',' in num_str: # Could be 1,50 or 1,234 parts = num_str.split(',') if len(parts) == 2 and len(parts[1]) <= 2: # Decimal comma: 1,50 num_str = num_str.replace(',', '.') else: # Thousands comma: 1,234 num_str = num_str.replace(',', '') elif '.' in num_str: parts = num_str.split('.') if len(parts) > 2: # Multiple dots: 1.234.567 -> 1234567 num_str = ''.join(parts[:-1]) + '.' + parts[-1] return num_str def _extract_date(self, text: str) -> Tuple[Optional[date], float]: """Extract receipt date from text.""" for pattern, confidence in self.DATE_PATTERNS: match = re.search(pattern, text) if match: try: # Normalize separators to dots date_str = match.group(1).replace('/', '.').replace('-', '.') # Try DD.MM.YYYY format first try: parsed = datetime.strptime(date_str, '%d.%m.%Y').date() except ValueError: # Try YYYY.MM.DD format parsed = datetime.strptime(date_str, '%Y.%m.%d').date() # Validate date range today = date.today() if parsed <= today and parsed.year >= 2020: return parsed, confidence except ValueError: continue return None, 0.0 def _extract_number(self, text: str) -> Tuple[Optional[str], float]: """Extract receipt number from text.""" for pattern, confidence in self.NUMBER_PATTERNS: match = re.search(pattern, text, re.IGNORECASE) if match: return match.group(1), confidence return None, 0.0 def _extract_series(self, text: str) -> Tuple[Optional[str], float]: """Extract receipt series from text.""" for pattern, confidence in self.SERIES_PATTERNS: match = re.search(pattern, text, re.IGNORECASE) if match: return match.group(1).upper(), confidence return None, 0.0 def _extract_vendor(self, text: str) -> Tuple[Optional[str], float]: """ Extract vendor/partner name from text. Uses multiple strategies: 1. Look for lines with company type indicators (S.R.L., S.A., etc.) 2. Look for lines near CIF 3. Use first valid line as fallback """ lines = text.split('\n') skip_keywords = [ 'BON', 'FISCAL', 'TOTAL', 'DATA', 'NR', 'ORA', 'SUBTOTAL', 'TVA', 'PLATA', 'CARD', 'NUMERAR', 'RON', 'LEI', 'CHITANTA', 'REST', 'CLIENT', 'OPERATOR', 'CASIER', 'POS', 'AMEF', 'BINE ATI VENIT', 'VA RUGAM', 'PASTRATI', 'VOCEA', 'TIPARIT', 'DETERGENT', 'PROSOP', 'HARTIE', 'SACI', 'SPRAY', 'BUC', 'ROLA', 'CUMPARATOR' ] # Strategy 1: Look for lines with vendor indicators (S.R.L., S.A., HOLDING, etc.) for i, line in enumerate(lines[:15]): # Check first 15 lines line = line.strip() if not line or len(line) < 3: continue line_upper = line.upper() # Check for vendor indicators for indicator in self.VENDOR_INDICATORS: if re.search(indicator, line_upper): # Found a company name indicator vendor = self._clean_vendor_name(line) if vendor and len(vendor) >= 3: # High confidence for lines with company indicators return vendor, 0.95 # Strategy 2: Look for lines right before or after CIF for i, line in enumerate(lines[:15]): line_upper = line.upper() if 'CIF' in line_upper and 'CLIENT' not in line_upper: # Check line before if i > 0: prev_line = lines[i-1].strip() if prev_line and len(prev_line) >= 3: if not any(kw in prev_line.upper() for kw in skip_keywords): vendor = self._clean_vendor_name(prev_line) if vendor: return vendor, 0.85 # Strategy 3: First valid line as fallback for i, line in enumerate(lines[:10]): line = line.strip() # Skip empty lines if not line or len(line) < 3: continue # Skip lines that are just numbers or codes if re.match(r'^[\d.,\s:]+$', line): continue # Skip lines with barcodes/product codes if re.match(r'^[A-Z]*\d{6,}', line): continue # Skip lines with keywords if any(kw in line.upper() for kw in skip_keywords): continue # Clean the line vendor = self._clean_vendor_name(line) if vendor and len(vendor) >= 3: # Confidence decreases for lines further down confidence = max(0.3, 0.7 - (i * 0.05)) return vendor, confidence return None, 0.0 def _clean_vendor_name(self, name: str) -> Optional[str]: """Clean and normalize vendor name.""" if not name: return None # Remove common OCR artifacts name = re.sub(r'[^\w\s.,&\-()]', ' ', name) # Normalize whitespace name = re.sub(r'\s+', ' ', name).strip() # Skip if it looks like an address line only if re.match(r'^(STR|JUD|MUN|NR|BL|SC|ET|AP)\.?\s', name.upper()): return None # Skip if too short after cleaning if len(name) < 3: return None return name def _extract_cui(self, text_upper: str, original_text: str) -> Tuple[Optional[str], float]: """ Extract vendor CUI (fiscal identification code) from text. Excludes CLIENT CUI which appears as 'CLIENT C.U.I./C.I.F.:...' """ # First, try to find CIF on a line that doesn't contain CLIENT lines = text_upper.split('\n') for line in lines: # Skip lines that contain CLIENT (these are buyer's CUI, not vendor's) if 'CLIENT' in line or 'CUMPARATOR' in line or 'LIENT' in line: continue # Look for CIF in this line for pattern, confidence in self.CUI_PATTERNS: match = re.search(pattern, line, re.IGNORECASE | re.MULTILINE) if match: cui = match.group(1) if 6 <= len(cui) <= 10: return cui, confidence # Fallback: search entire text but exclude CLIENT patterns for pattern, confidence in self.CUI_PATTERNS: # Find all matches for match in re.finditer(pattern, text_upper, re.IGNORECASE | re.MULTILINE): cui = match.group(1) if 6 <= len(cui) <= 10: # Check if this match is preceded by CLIENT in the same line start = match.start() line_start = text_upper.rfind('\n', 0, start) + 1 line_text = text_upper[line_start:start] if 'CLIENT' not in line_text and 'LIENT' not in line_text: return cui, confidence return None, 0.0 def _detect_receipt_type(self, text: str) -> str: """Detect receipt type from text content.""" if 'CHITANTA' in text or 'CHITANȚĂ' in text: return 'chitanta' return 'bon_fiscal' def _extract_tva_entries(self, text: str) -> Tuple[List[dict], Optional[Decimal]]: """ Extract multiple TVA (VAT) entries from text. Romanian receipts can have multiple TVA rates (A=19%, B=9%, C=5%, D=0%). Returns (tva_entries, tva_total) where tva_entries is a list of: {'code': 'A', 'percent': 19, 'amount': Decimal('15.20')} """ tva_entries = [] seen_entries = set() # To avoid duplicates # Normalize spaces in numbers first (OCR may produce "32. 31") normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text) # Pattern 1: "TVA A - 19%: 15.20" or "TVAA - 21% 32.31" (with code) # OCR tolerant: TUA, TVR, etc. pattern_with_code = r'T[VU][AR]\s*([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)' for match in re.finditer(pattern_with_code, normalized_text, re.IGNORECASE): try: code = match.group(1).upper() percent = int(match.group(2)) amount_str = match.group(3).replace(' ', '') amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str)) amount = Decimal(amount_str) if amount > 0: entry_key = (code, percent) if entry_key not in seen_entries: tva_entries.append({ 'code': code, 'percent': percent, 'amount': amount }) seen_entries.add(entry_key) except (ValueError, InvalidOperation): continue # Pattern 2: "TVA - 21%: 32.31" (without explicit code, assume 'A') if not tva_entries: pattern_no_code = r'T[VU][AR]\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)' for match in re.finditer(pattern_no_code, normalized_text, re.IGNORECASE): try: percent = int(match.group(1)) amount_str = match.group(2).replace(' ', '') amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str)) amount = Decimal(amount_str) if amount > 0: # Determine code based on percent code = self._get_tva_code_from_percent(percent) entry_key = (code, percent) if entry_key not in seen_entries: tva_entries.append({ 'code': code, 'percent': percent, 'amount': amount }) seen_entries.add(entry_key) except (ValueError, InvalidOperation): continue # Pattern 3: "TVAA - 21%" on one line, amount on next line if not tva_entries: tva_line_pattern = r'T[VU][AR]\s*([A-D])?\s*[-:]\s*(\d{1,2})\s*%' for match in re.finditer(tva_line_pattern, normalized_text, re.IGNORECASE): try: code = (match.group(1) or 'A').upper() percent = int(match.group(2)) # Look for amount on the next line or immediately after after_tva = normalized_text[match.end():] amount_match = re.search(r'^[\s\n]*([\d.,]+)', after_tva) if amount_match: amount_str = self._normalize_number(amount_match.group(1)) amount = Decimal(amount_str) if amount > 0: entry_key = (code, percent) if entry_key not in seen_entries: tva_entries.append({ 'code': code, 'percent': percent, 'amount': amount }) seen_entries.add(entry_key) except (ValueError, InvalidOperation): continue # Pattern 4: Use TVA_PATTERNS for fallback if not tva_entries: for pattern, _ in self.TVA_PATTERNS: match = re.search(pattern, normalized_text, re.IGNORECASE) if match: try: # Some patterns have 2 groups (percent, amount), others just amount if match.lastindex >= 2: percent = int(match.group(1)) amount_str = match.group(2) else: amount_str = match.group(1) # Try to detect percent from text percent = self._detect_tva_percent(text) amount_str = amount_str.replace(' ', '') amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str)) amount = Decimal(amount_str) if amount > 0 and percent: code = self._get_tva_code_from_percent(percent) entry_key = (code, percent) if entry_key not in seen_entries: tva_entries.append({ 'code': code, 'percent': percent, 'amount': amount }) seen_entries.add(entry_key) break # Only use first match from fallback except (ValueError, InvalidOperation): continue # Calculate total tva_total = None if tva_entries: tva_total = sum(entry['amount'] for entry in tva_entries) # Sort by code (A, B, C, D) tva_entries.sort(key=lambda x: x.get('code', 'Z')) return tva_entries, tva_total def _get_tva_code_from_percent(self, percent: int) -> str: """Map TVA percentage to standard Romanian code. Romanian TVA rates changed in August 2025: - Standard rate: 19% → 21% - Reduced rate: 9% → 11% - Other rates (5%, 0%) remain unchanged Old rates (before Aug 2025): New rates (from Aug 2025): - A = 19% (standard) - A = 21% (standard) - B = 9% (reduced) - B = 11% (reduced) - C = 5% (reduced) - C = 5% (reduced) - D = 0% (exempt) - D = 0% (exempt) Both old and new rates are supported for historical receipts. """ if percent in (19, 21): return 'A' # Standard rate (19% old, 21% new from Aug 2025) elif percent in (9, 11): return 'B' # Reduced rate (9% old, 11% new from Aug 2025) elif percent == 5: return 'C' # Reduced rate (unchanged) elif percent == 0: return 'D' # Exempt (unchanged) else: return 'A' # Default to standard rate def _detect_tva_percent(self, text: str) -> Optional[int]: """Detect TVA percentage from text content.""" # Look for common Romanian TVA percentages if '19%' in text or '19 %' in text: return 19 elif '21%' in text or '21 %' in text: return 21 elif '11%' in text or '11 %' in text: return 11 elif '9%' in text or '9 %' in text: return 9 elif '5%' in text or '5 %' in text: return 5 return None def _extract_items_count(self, text: str) -> Optional[int]: """Extract number of items/articles from receipt.""" for pattern, _ in self.ITEMS_COUNT_PATTERNS: match = re.search(pattern, text, re.IGNORECASE) if match: try: count = int(match.group(1)) if 0 < count < 1000: # Reasonable range return count except ValueError: continue return None def _extract_address(self, text: str) -> Optional[str]: """Extract vendor address from text.""" lines = text.split('\n') address_parts = [] for line in lines[:15]: # Check first 15 lines line = line.strip() if not line: continue # Check for address patterns line_upper = line.upper() # JUD. (county) pattern if re.search(r'\bJUD\.?\s+', line_upper): address_parts.append(line) continue # STR. (street) pattern if re.search(r'\bSTR\.?\s+', line_upper): address_parts.append(line) continue # MUN./OR./COM. (city/town) pattern if re.search(r'\b(MUN|OR|COM)\.?\s+', line_upper): address_parts.append(line) continue if address_parts: # Join and clean address parts address = ', '.join(address_parts) # Clean up address = re.sub(r'\s+', ' ', address).strip() address = re.sub(r',\s*,', ',', address) return address if len(address) >= 5 else None return None