Implement Tesseract-based OCR to automatically extract vendor name, date, total amount, and VAT from uploaded receipt images/PDFs, reducing manual data entry and improving accuracy. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
232 lines
8.3 KiB
Python
232 lines
8.3 KiB
Python
"""Extract structured fields from OCR text (Romanian receipts)."""
|
|
|
|
import re
|
|
from datetime import date, datetime
|
|
from decimal import Decimal, InvalidOperation
|
|
from typing import Optional, Tuple
|
|
from dataclasses import dataclass, field
|
|
|
|
|
|
@dataclass
class ExtractionResult:
    """Fields recovered from the OCR text of a single receipt.

    Every field has a default, so an empty instance represents "nothing
    could be extracted". The ``confidence_*`` values are floats that feed
    :attr:`overall_confidence`.
    """

    # Document identification
    receipt_type: str = 'bon_fiscal'
    receipt_number: Optional[str] = None
    receipt_series: Optional[str] = None

    # Core accounting data
    receipt_date: Optional[date] = None
    amount: Optional[Decimal] = None

    # Issuer details
    partner_name: Optional[str] = None
    cui: Optional[str] = None
    description: Optional[str] = None

    # Per-field confidence scores
    confidence_amount: float = 0.0
    confidence_date: float = 0.0
    confidence_vendor: float = 0.0

    # OCR text the fields were extracted from
    raw_text: str = ""

    @property
    def overall_confidence(self) -> float:
        """Return the weighted mean of the per-field confidences.

        The amount contributes 40 %, the date 30 % and the vendor 30 %;
        the result is rounded to two decimal places.
        """
        score = self.confidence_amount * 0.4
        score += self.confidence_date * 0.3
        score += self.confidence_vendor * 0.3
        return round(score, 2)
|
|
|
|
|
|
class ReceiptExtractor:
    """Extract receipt fields using pattern matching for Romanian receipts.

    Each ``*_PATTERNS`` table is a list of ``(regex, confidence)`` pairs
    ordered from most to least specific. Patterns are tried in order and,
    within one pattern, every occurrence in the text is considered until
    one yields a usable value. ``extract`` upper-cases the OCR text before
    matching everything except the vendor name.
    """

    # Oldest year accepted as a plausible receipt date.
    MIN_RECEIPT_YEAR = 2020

    # Words that mark a line as receipt boilerplate rather than a vendor
    # name, and that can never be a document series.
    _RECEIPT_KEYWORDS = (
        'BON', 'FISCAL', 'TOTAL', 'DATA', 'NR', 'ORA',
        'SUBTOTAL', 'TVA', 'PLATA', 'CARD', 'NUMERAR',
        'RON', 'LEI', 'CHITANTA', 'CHITANȚĂ', 'CHITANŢĂ', 'REST',
    )

    # Matches a keyword only when it is not glued to other letters, so
    # "REST" does not fire inside "RESTAURANT" but "NR" still fires in
    # "NR.123" and "NR123".
    _KEYWORD_RE = re.compile(
        r'(?<![^\W\d_])(?:' + '|'.join(_RECEIPT_KEYWORDS) + r')(?![^\W\d_])'
    )

    # "CHITANTA" with or without diacritics (comma-below or cedilla T).
    _CHITANTA_RE = re.compile(r'CHITAN[TȚŢ][AĂ]', re.IGNORECASE)

    # Total amount patterns (most specific first). ``\b`` keeps "SUBTOTAL"
    # from matching; the amount must start with a digit and may only
    # continue on the same line, so digits from the next line are never
    # merged into it.
    TOTAL_PATTERNS = [
        (r'\bTOTAL\s*:?\s*(\d[\d \t.,]*)\s*(?:RON|LEI)?', 0.95),
        (r'\bTOTAL\s+(?:RON|LEI)\s*:?\s*(\d[\d \t.,]*)', 0.95),
        (r'\bDE\s+PLATA\s*:?\s*(\d[\d \t.,]*)', 0.90),
        (r'\bSUMA\s*:?\s*(\d[\d \t.,]*)', 0.85),
        (r'\bPLATA\s+CARD\s*:?\s*(\d[\d \t.,]*)', 0.85),
        (r'\bNUMERAR\s*:?\s*(\d[\d \t.,]*)', 0.80),
    ]

    # Date patterns; '.', '/' and '-' are accepted as separators.
    DATE_PATTERNS = [
        (r'DATA\s*:?\s*(\d{2}[./-]\d{2}[./-]\d{4})', 0.95),
        (r'(\d{2}[./-]\d{2}[./-]\d{4})\s+\d{2}:\d{2}', 0.90),
        (r'(\d{2}[./-]\d{2}[./-]\d{4})', 0.80),
        (r'(\d{4}[./-]\d{2}[./-]\d{2})', 0.75),  # YYYY.MM.DD format
    ]

    # Receipt number patterns
    NUMBER_PATTERNS = [
        (r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95),
        (r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95),
        (r'CHITAN[TȚŢ][AĂ]\s+NR\.?\s*:?\s*(\d+)', 0.95),
        (r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90),
        (r'NR\.?\s*:?\s*(\d{4,})', 0.70),
    ]

    # CUI (fiscal code) patterns; the "RO" prefix may be followed by a space.
    CUI_PATTERNS = [
        (r'C\.?U\.?I\.?\s*:?\s*(?:RO\s*)?(\d{6,10})', 0.95),
        (r'C\.?I\.?F\.?\s*:?\s*(?:RO\s*)?(\d{6,10})', 0.95),
        (r'COD\s+FISCAL\s*:?\s*(?:RO\s*)?(\d{6,10})', 0.90),
        (r'(?:RO)?(\d{6,10})\s*-?\s*(?:J|CUI)', 0.80),
    ]

    # Series patterns; word boundaries stop the series from being cut out
    # of the middle or the start of a longer word (e.g. "SCAL" from
    # "FISCAL NR").
    SERIES_PATTERNS = [
        (r'\bSERIE\b\s*:?\s*([A-Z]{1,4})\b', 0.90),
        (r'\b([A-Z]{2,4})\s+NR\.?\s*\d+', 0.80),
    ]

    def extract(self, text: str) -> "ExtractionResult":
        """Extract all fields from OCR text.

        Args:
            text: Raw OCR output. ``None`` or an empty string produces a
                result with every field left at its default.

        Returns:
            An ``ExtractionResult`` with the detected fields, the
            confidences for amount, date and vendor, and the input text
            stored in ``raw_text``.
        """
        text = text or ""
        text_upper = text.upper()

        result = ExtractionResult()
        result.raw_text = text

        result.amount, result.confidence_amount = self._extract_amount(text_upper)
        result.receipt_date, result.confidence_date = self._extract_date(text_upper)
        result.receipt_number, _ = self._extract_number(text_upper)
        result.receipt_series, _ = self._extract_series(text_upper)
        # The vendor keeps its original casing, so it gets the raw text.
        result.partner_name, result.confidence_vendor = self._extract_vendor(text)
        result.cui, _ = self._extract_cui(text_upper)
        result.receipt_type = self._detect_receipt_type(text_upper)

        return result

    def _extract_amount(self, text: str) -> Tuple[Optional[Decimal], float]:
        """Extract the total amount from text.

        Returns:
            ``(amount, confidence)`` for the first occurrence, in pattern
            priority order, that parses to a strictly positive ``Decimal``;
            ``(None, 0.0)`` when nothing qualifies.
        """
        for pattern, confidence in self.TOTAL_PATTERNS:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                # Keep digits and separators only, and drop separators left
                # dangling at the end (e.g. the full stop in "12,50.").
                digits = re.sub(r'[^\d.,]', '', match.group(1)).strip('.,')
                if not digits:
                    continue
                try:
                    amount = Decimal(self._normalize_number(digits))
                except (InvalidOperation, ValueError):
                    continue
                if amount > 0:
                    return amount, confidence
        return None, 0.0

    def _normalize_number(self, num_str: str) -> str:
        """Normalize a printed number to a plain decimal string.

        Handled forms:
            * ``1.234,56`` (Romanian) and ``1,234.56`` (English): whichever
              separator appears last is the decimal mark.
            * ``1,50``: a single comma followed by at most two digits is a
              decimal comma; any other comma usage is thousands grouping.
            * ``1.234.567``: several dots ending in a three-digit group are
              all thousands separators; otherwise the last dot is the
              decimal point (``1.234.56`` -> ``1234.56``).
            * A single dot is always treated as a decimal point.
        """
        num_str = re.sub(r'\s+', '', num_str)
        has_comma = ',' in num_str
        has_dot = '.' in num_str

        if has_comma and has_dot:
            if num_str.rfind(',') > num_str.rfind('.'):
                return num_str.replace('.', '').replace(',', '.')
            return num_str.replace(',', '')

        if has_comma:
            head, _, tail = num_str.rpartition(',')
            if num_str.count(',') == 1 and len(tail) <= 2:
                return head + '.' + tail
            return num_str.replace(',', '')

        if num_str.count('.') > 1:
            head, _, tail = num_str.rpartition('.')
            head = head.replace('.', '')
            if len(tail) == 3:
                return head + tail
            return head + '.' + tail

        return num_str

    @staticmethod
    def _parse_date(date_str: str) -> Optional[date]:
        """Parse ``DD.MM.YYYY`` or ``YYYY.MM.DD`` (any of ``. / -``); None if invalid."""
        normalized = date_str.replace('/', '.').replace('-', '.')
        for fmt in ('%d.%m.%Y', '%Y.%m.%d'):
            try:
                return datetime.strptime(normalized, fmt).date()
            except ValueError:
                continue
        return None

    def _extract_date(self, text: str) -> Tuple[Optional[date], float]:
        """Extract the receipt date from text.

        A candidate is accepted only if it is a real calendar date, is not
        in the future and is not older than ``MIN_RECEIPT_YEAR``. Rejected
        candidates do not stop the search: later occurrences of the same
        pattern are still examined.

        Returns:
            ``(date, confidence)`` or ``(None, 0.0)``.
        """
        today = date.today()
        for pattern, confidence in self.DATE_PATTERNS:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                parsed = self._parse_date(match.group(1))
                if parsed is None:
                    continue
                if parsed.year >= self.MIN_RECEIPT_YEAR and parsed <= today:
                    return parsed, confidence
        return None, 0.0

    def _extract_number(self, text: str) -> Tuple[Optional[str], float]:
        """Extract the receipt number; returns ``(digits, confidence)`` or ``(None, 0.0)``."""
        for pattern, confidence in self.NUMBER_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1), confidence
        return None, 0.0

    def _extract_series(self, text: str) -> Tuple[Optional[str], float]:
        """Extract the receipt series from text.

        Candidates that are ordinary receipt keywords (for example the
        "BON" of "BON NR 12" or the "CARD" of "CARD NR 1234") are rejected.

        Returns:
            ``(series_in_upper_case, confidence)`` or ``(None, 0.0)``.
        """
        for pattern, confidence in self.SERIES_PATTERNS:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                candidate = match.group(1).upper()
                if candidate in self._RECEIPT_KEYWORDS:
                    continue
                return candidate, confidence
        return None, 0.0

    def _extract_vendor(self, text: str) -> Tuple[Optional[str], float]:
        """Extract the vendor/partner name from text.

        Looks at the first seven lines and returns the first one that is
        not blank, not purely numeric, contains no receipt keyword as a
        standalone word, and still has at least three characters after
        stripping unusual punctuation.

        Returns:
            ``(vendor, confidence)`` where confidence starts at 0.8 for the
            first line and drops by 0.1 per line (never below 0.3), or
            ``(None, 0.0)``.
        """
        for index, raw_line in enumerate(text.split('\n')[:7]):
            line = raw_line.strip()
            if not line:
                continue
            # Lines made only of digits and separators are never a name.
            if re.match(r'^[\d.,\s]+$', line):
                continue
            if self._KEYWORD_RE.search(line.upper()):
                continue

            vendor = re.sub(r'[^\w\s.,&-]', '', line).strip()
            if len(vendor) >= 3:
                return vendor, max(0.3, 0.8 - (index * 0.1))

        return None, 0.0

    def _extract_cui(self, text: str) -> Tuple[Optional[str], float]:
        """Extract the CUI (fiscal identification code) digits, without the "RO" prefix.

        Returns:
            ``(cui, confidence)`` or ``(None, 0.0)``.
        """
        for pattern, confidence in self.CUI_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1), confidence
        return None, 0.0

    def _detect_receipt_type(self, text: str) -> str:
        """Return ``'chitanta'`` if the text mentions a chitanta, else ``'bon_fiscal'``."""
        if self._CHITANTA_RE.search(text):
            return 'chitanta'
        return 'bon_fiscal'
|