"""Extract structured fields from OCR text (Romanian receipts).""" import re from datetime import date, datetime from decimal import Decimal, InvalidOperation from typing import Optional, Tuple from dataclasses import dataclass, field @dataclass class ExtractionResult: """Structured extraction result from receipt.""" receipt_type: str = 'bon_fiscal' receipt_number: Optional[str] = None receipt_series: Optional[str] = None receipt_date: Optional[date] = None amount: Optional[Decimal] = None partner_name: Optional[str] = None cui: Optional[str] = None description: Optional[str] = None confidence_amount: float = 0.0 confidence_date: float = 0.0 confidence_vendor: float = 0.0 raw_text: str = "" @property def overall_confidence(self) -> float: """Calculate weighted overall confidence score.""" weights = {'amount': 0.4, 'date': 0.3, 'vendor': 0.3} return round( self.confidence_amount * weights['amount'] + self.confidence_date * weights['date'] + self.confidence_vendor * weights['vendor'], 2 ) class ReceiptExtractor: """Extract receipt fields using pattern matching for Romanian receipts.""" # Total amount patterns (most specific first) TOTAL_PATTERNS = [ (r'TOTAL\s*:?\s*([\d\s.,]+)\s*(?:RON|LEI)?', 0.95), (r'TOTAL\s+(?:RON|LEI)\s*([\d\s.,]+)', 0.95), (r'DE\s+PLATA\s*:?\s*([\d\s.,]+)', 0.90), (r'SUMA\s*:?\s*([\d\s.,]+)', 0.85), (r'PLATA\s+CARD\s*:?\s*([\d\s.,]+)', 0.85), (r'NUMERAR\s*:?\s*([\d\s.,]+)', 0.80), ] # Date patterns DATE_PATTERNS = [ (r'DATA\s*:?\s*(\d{2}[./]\d{2}[./]\d{4})', 0.95), (r'(\d{2}[./]\d{2}[./]\d{4})\s+\d{2}:\d{2}', 0.90), (r'(\d{2}[./]\d{2}[./]\d{4})', 0.80), (r'(\d{4}[./]\d{2}[./]\d{2})', 0.75), # YYYY.MM.DD format ] # Receipt number patterns NUMBER_PATTERNS = [ (r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95), (r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95), (r'CHITANTA\s+NR\.?\s*:?\s*(\d+)', 0.95), (r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90), (r'NR\.?\s*:?\s*(\d{4,})', 0.70), ] # CUI (fiscal code) patterns CUI_PATTERNS = [ (r'C\.?U\.?I\.?\s*:?\s*(?:RO)?(\d{6,10})', 0.95), (r'C\.?I\.?F\.?\s*:?\s*(?:RO)?(\d{6,10})', 0.95), (r'COD\s+FISCAL\s*:?\s*(?:RO)?(\d{6,10})', 0.90), (r'(?:RO)?(\d{6,10})\s*-?\s*(?:J|CUI)', 0.80), ] # Series patterns SERIES_PATTERNS = [ (r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90), (r'([A-Z]{2,4})\s+NR\.?\s*\d+', 0.80), ] def extract(self, text: str) -> ExtractionResult: """Extract all fields from OCR text.""" result = ExtractionResult() result.raw_text = text text_upper = text.upper() # Extract fields result.amount, result.confidence_amount = self._extract_amount(text_upper) result.receipt_date, result.confidence_date = self._extract_date(text_upper) result.receipt_number, _ = self._extract_number(text_upper) result.receipt_series, _ = self._extract_series(text_upper) result.partner_name, result.confidence_vendor = self._extract_vendor(text) result.cui, _ = self._extract_cui(text_upper) # Detect receipt type result.receipt_type = self._detect_receipt_type(text_upper) return result def _extract_amount(self, text: str) -> Tuple[Optional[Decimal], float]: """Extract total amount from text.""" for pattern, confidence in self.TOTAL_PATTERNS: match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE) if match: try: amount_str = re.sub(r'[^\d.,]', '', match.group(1)) # Handle Romanian number format (1.234,56) amount_str = self._normalize_number(amount_str) amount = Decimal(amount_str) if amount > 0: return amount, confidence except (InvalidOperation, ValueError): continue return None, 0.0 def _normalize_number(self, num_str: str) -> str: """Normalize Romanian number format to standard decimal.""" # Remove spaces num_str = num_str.replace(' ', '') # Handle comma as decimal separator if ',' in num_str and '.' in num_str: # Romanian format: 1.234,56 num_str = num_str.replace('.', '').replace(',', '.') elif ',' in num_str: # Could be 1,50 or 1,234 parts = num_str.split(',') if len(parts) == 2 and len(parts[1]) <= 2: # Decimal comma: 1,50 num_str = num_str.replace(',', '.') else: # Thousands comma: 1,234 num_str = num_str.replace(',', '') elif '.' in num_str: parts = num_str.split('.') if len(parts) > 2: # Multiple dots: 1.234.567 -> 1234567 num_str = ''.join(parts[:-1]) + '.' + parts[-1] return num_str def _extract_date(self, text: str) -> Tuple[Optional[date], float]: """Extract receipt date from text.""" for pattern, confidence in self.DATE_PATTERNS: match = re.search(pattern, text) if match: try: date_str = match.group(1).replace('/', '.') # Try DD.MM.YYYY format first try: parsed = datetime.strptime(date_str, '%d.%m.%Y').date() except ValueError: # Try YYYY.MM.DD format parsed = datetime.strptime(date_str, '%Y.%m.%d').date() # Validate date range today = date.today() if parsed <= today and parsed.year >= 2020: return parsed, confidence except ValueError: continue return None, 0.0 def _extract_number(self, text: str) -> Tuple[Optional[str], float]: """Extract receipt number from text.""" for pattern, confidence in self.NUMBER_PATTERNS: match = re.search(pattern, text, re.IGNORECASE) if match: return match.group(1), confidence return None, 0.0 def _extract_series(self, text: str) -> Tuple[Optional[str], float]: """Extract receipt series from text.""" for pattern, confidence in self.SERIES_PATTERNS: match = re.search(pattern, text, re.IGNORECASE) if match: return match.group(1).upper(), confidence return None, 0.0 def _extract_vendor(self, text: str) -> Tuple[Optional[str], float]: """Extract vendor/partner name from text.""" lines = text.split('\n') skip_keywords = [ 'BON', 'FISCAL', 'TOTAL', 'DATA', 'NR', 'ORA', 'SUBTOTAL', 'TVA', 'PLATA', 'CARD', 'NUMERAR', 'RON', 'LEI', 'CHITANTA', 'REST' ] for i, line in enumerate(lines[:7]): # Check first 7 lines line = line.strip() # Skip empty lines if not line: continue # Skip lines that are just numbers if re.match(r'^[\d.,\s]+$', line): continue # Skip lines with keywords if any(kw in line.upper() for kw in skip_keywords): continue # Clean the line vendor = re.sub(r'[^\w\s.,&-]', '', line).strip() if len(vendor) >= 3: # Confidence decreases for lines further down confidence = max(0.3, 0.8 - (i * 0.1)) return vendor, confidence return None, 0.0 def _extract_cui(self, text: str) -> Tuple[Optional[str], float]: """Extract CUI (fiscal identification code) from text.""" for pattern, confidence in self.CUI_PATTERNS: match = re.search(pattern, text, re.IGNORECASE) if match: cui = match.group(1) if 6 <= len(cui) <= 10: return cui, confidence return None, 0.0 def _detect_receipt_type(self, text: str) -> str: """Detect receipt type from text content.""" if 'CHITANTA' in text or 'CHITANȚĂ' in text: return 'chitanta' return 'bon_fiscal'