Files
roa2web-service-auto/data-entry-app/backend/app/services/ocr_extractor.py
Marius Mutu 41ae97180e feat: Add OCR integration for automatic receipt data extraction
Implement Tesseract-based OCR to automatically extract vendor name,
date, total amount, and VAT from uploaded receipt images/PDFs,
reducing manual data entry and improving accuracy.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-12 11:48:29 +02:00

232 lines
8.3 KiB
Python

"""Extract structured fields from OCR text (Romanian receipts)."""
import re
from datetime import date, datetime
from decimal import Decimal, InvalidOperation
from typing import Optional, Tuple
from dataclasses import dataclass, field
@dataclass
class ExtractionResult:
    """Structured fields extracted from the OCR text of a single receipt.

    Every field has a default, so an instance created with no arguments
    represents "nothing extracted yet". The three ``confidence_*`` values are
    the per-field scores that feed :attr:`overall_confidence`.
    """

    # Document kind; ReceiptExtractor._detect_receipt_type yields
    # 'chitanta' or 'bon_fiscal'.
    receipt_type: str = 'bon_fiscal'
    # Receipt number and series, kept as the matched text (leading zeros
    # are preserved because the number is a string).
    receipt_number: Optional[str] = None
    receipt_series: Optional[str] = None
    receipt_date: Optional[date] = None
    # Total amount of the receipt.
    amount: Optional[Decimal] = None
    # Vendor name.
    partner_name: Optional[str] = None
    # Fiscal identification code (digits only, without the "RO" prefix).
    cui: Optional[str] = None
    description: Optional[str] = None
    # Per-field confidence scores; 0.0 means the field was not found.
    confidence_amount: float = 0.0
    confidence_date: float = 0.0
    confidence_vendor: float = 0.0
    # The OCR text the fields were extracted from.
    raw_text: str = ""

    @property
    def overall_confidence(self) -> float:
        """Return the weighted overall confidence, rounded to 2 decimals.

        The amount contributes 40%, the date 30% and the vendor 30%. The
        weights sum to 1.0, so the result stays within the range of the
        individual scores.
        """
        weights = {'amount': 0.4, 'date': 0.3, 'vendor': 0.3}
        return round(
            self.confidence_amount * weights['amount'] +
            self.confidence_date * weights['date'] +
            self.confidence_vendor * weights['vendor'],
            2
        )
class ReceiptExtractor:
    """Extract receipt fields from OCR text of Romanian receipts.

    Each ``*_PATTERNS`` table is a list of ``(regex, confidence)`` pairs.
    Tables are walked top to bottom (most specific pattern first) and the
    confidence of the pattern that produced the value is reported with it.
    All ``_extract_*`` helpers return ``(value, confidence)`` and fall back
    to ``(None, 0.0)`` when nothing usable is found.
    """

    # Total amount patterns (most specific first). The leading \b keeps
    # "TOTAL" from matching inside "SUBTOTAL".
    TOTAL_PATTERNS = [
        (r'\bTOTAL\s*:?\s*([\d\s.,]+)\s*(?:RON|LEI)?', 0.95),
        (r'\bTOTAL\s+(?:RON|LEI)\s*([\d\s.,]+)', 0.95),
        (r'\bDE\s+PLATA\s*:?\s*([\d\s.,]+)', 0.90),
        (r'\bSUMA\s*:?\s*([\d\s.,]+)', 0.85),
        (r'\bPLATA\s+CARD\s*:?\s*([\d\s.,]+)', 0.85),
        (r'\bNUMERAR\s*:?\s*([\d\s.,]+)', 0.80),
    ]

    # Date patterns
    DATE_PATTERNS = [
        (r'DATA\s*:?\s*(\d{2}[./]\d{2}[./]\d{4})', 0.95),
        (r'(\d{2}[./]\d{2}[./]\d{4})\s+\d{2}:\d{2}', 0.90),
        (r'(\d{2}[./]\d{2}[./]\d{4})', 0.80),
        (r'(\d{4}[./]\d{2}[./]\d{2})', 0.75),  # YYYY.MM.DD format
    ]

    # Receipt number patterns
    NUMBER_PATTERNS = [
        (r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95),
        (r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95),
        (r'CHITANTA\s+NR\.?\s*:?\s*(\d+)', 0.95),
        (r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90),
        (r'NR\.?\s*:?\s*(\d{4,})', 0.70),
    ]

    # CUI (fiscal code) patterns
    CUI_PATTERNS = [
        (r'C\.?U\.?I\.?\s*:?\s*(?:RO)?(\d{6,10})', 0.95),
        (r'C\.?I\.?F\.?\s*:?\s*(?:RO)?(\d{6,10})', 0.95),
        (r'COD\s+FISCAL\s*:?\s*(?:RO)?(\d{6,10})', 0.90),
        (r'(?:RO)?(\d{6,10})\s*-?\s*(?:J|CUI)', 0.80),
    ]

    # Series patterns. The \b in the second pattern stops it from returning
    # the tail of a longer word (e.g. "SCAL" out of "FISCAL NR 123").
    SERIES_PATTERNS = [
        (r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90),
        (r'\b([A-Z]{2,4})\s+NR\.?\s*\d+', 0.80),
    ]

    # Dates from before this year are rejected as implausible receipt dates.
    MIN_RECEIPT_YEAR = 2020

    # A line containing one of these words is receipt boilerplate rather
    # than the vendor name.
    VENDOR_SKIP_KEYWORDS = (
        'BON', 'FISCAL', 'TOTAL', 'DATA', 'NR', 'ORA',
        'SUBTOTAL', 'TVA', 'PLATA', 'CARD', 'NUMERAR',
        'RON', 'LEI', 'CHITANTA', 'REST',
    )
    # Keywords must not be glued to other letters, otherwise names such as
    # "RESTAURANT" (REST) or "ORANGE" (ORA) would be discarded. Digits and
    # punctuation may still touch the keyword ("NR.123", "TOTAL:").
    _VENDOR_SKIP_RE = re.compile(
        r'(?<![^\W\d_])(?:' + '|'.join(VENDOR_SKIP_KEYWORDS) + r')(?![^\W\d_])'
    )

    # "CHITANTA" with or without diacritics; accepts both the comma-below
    # and the cedilla form of the accented T.
    _CHITANTA_RE = re.compile(r'CHITAN[TȚŢ][AĂ]')

    def extract(self, text: str) -> "ExtractionResult":
        """Extract all fields from OCR text.

        Args:
            text: Raw OCR output. ``None`` or an empty string yields a result
                with every field at its default.

        Returns:
            An ``ExtractionResult`` holding the extracted values, the
            per-field confidences and the raw text.
        """
        text = text or ""
        result = ExtractionResult()
        result.raw_text = text
        text_upper = text.upper()

        # Extract fields (the vendor keeps the original casing)
        result.amount, result.confidence_amount = self._extract_amount(text_upper)
        result.receipt_date, result.confidence_date = self._extract_date(text_upper)
        result.receipt_number, _ = self._extract_number(text_upper)
        result.receipt_series, _ = self._extract_series(text_upper)
        result.partner_name, result.confidence_vendor = self._extract_vendor(text)
        result.cui, _ = self._extract_cui(text_upper)

        # Detect receipt type
        result.receipt_type = self._detect_receipt_type(text_upper)
        return result

    def _extract_amount(self, text: str) -> Tuple[Optional[Decimal], float]:
        """Extract the total amount from text.

        Patterns are tried in priority order and, within one pattern, every
        occurrence is examined, so an unusable first hit (for example
        "TOTAL TVA ...") does not hide a valid total further down.

        Returns:
            ``(amount, confidence)`` for the first positive amount found,
            otherwise ``(None, 0.0)``.
        """
        for pattern, confidence in self.TOTAL_PATTERNS:
            for match in re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE):
                amount = self._parse_amount(match.group(1))
                if amount is not None and amount > 0:
                    return amount, confidence
        return None, 0.0

    def _parse_amount(self, captured: str) -> Optional[Decimal]:
        """Convert the text captured by a total pattern into a Decimal.

        Returns ``None`` when the capture holds no parsable number.
        """
        # The capture class accepts whitespace, so it can run on into the
        # following lines; only the first line that carries digits belongs
        # to the label that was matched.
        for line in captured.splitlines():
            amount_str = re.sub(r'[^\d.,]', '', line)
            if not any(ch.isdigit() for ch in amount_str):
                continue
            # Drop trailing punctuation ("10,00.") before normalizing.
            amount_str = amount_str.rstrip('.,')
            # Handle Romanian number format (1.234,56)
            amount_str = self._normalize_number(amount_str)
            try:
                return Decimal(amount_str)
            except (InvalidOperation, ValueError):
                return None
        return None

    def _normalize_number(self, num_str: str) -> str:
        """Normalize a Romanian-formatted number to a plain decimal string.

        Examples: ``1.234,56`` -> ``1234.56``, ``1,50`` -> ``1.50``,
        ``1.234.567`` -> ``1234567``, ``1,234.56`` -> ``1234.56``.
        A number with a single dot is returned unchanged.
        """
        # Remove spaces
        num_str = num_str.replace(' ', '')

        if ',' in num_str and '.' in num_str:
            # Whichever separator comes last is the decimal mark.
            if num_str.rfind(',') > num_str.rfind('.'):
                # Romanian format: 1.234,56
                num_str = num_str.replace('.', '').replace(',', '.')
            else:
                # English format: 1,234.56
                num_str = num_str.replace(',', '')
        elif ',' in num_str:
            # Could be 1,50 or 1,234
            parts = num_str.split(',')
            if len(parts) == 2 and len(parts[1]) <= 2:
                # Decimal comma: 1,50
                num_str = num_str.replace(',', '.')
            else:
                # Thousands comma: 1,234
                num_str = num_str.replace(',', '')
        elif '.' in num_str:
            parts = num_str.split('.')
            if len(parts) > 2:
                if len(parts[-1]) == 3:
                    # Only thousands groups: 1.234.567 -> 1234567
                    num_str = ''.join(parts)
                else:
                    # Last group is the fraction: 1.234.56 -> 1234.56
                    num_str = ''.join(parts[:-1]) + '.' + parts[-1]
        return num_str

    def _extract_date(self, text: str) -> Tuple[Optional[date], float]:
        """Extract the receipt date from text.

        A candidate is accepted only if it is not in the future and not
        earlier than ``MIN_RECEIPT_YEAR``. Every occurrence of a pattern is
        checked, so an out-of-range date that appears first does not hide a
        valid one.

        Returns:
            ``(date, confidence)`` or ``(None, 0.0)``.
        """
        today = date.today()
        for pattern, confidence in self.DATE_PATTERNS:
            for match in re.finditer(pattern, text):
                parsed = self._parse_date(match.group(1))
                if parsed is None:
                    continue
                # Validate date range
                if parsed <= today and parsed.year >= self.MIN_RECEIPT_YEAR:
                    return parsed, confidence
        return None, 0.0

    def _parse_date(self, date_str: str) -> Optional[date]:
        """Parse ``DD.MM.YYYY`` or ``YYYY.MM.DD`` (``/`` also accepted)."""
        date_str = date_str.replace('/', '.')
        # Try DD.MM.YYYY format first, then YYYY.MM.DD
        for fmt in ('%d.%m.%Y', '%Y.%m.%d'):
            try:
                return datetime.strptime(date_str, fmt).date()
            except ValueError:
                continue
        return None

    def _extract_number(self, text: str) -> Tuple[Optional[str], float]:
        """Extract the receipt number from text.

        Returns:
            ``(digits, confidence)`` for the first matching pattern, with the
            digits exactly as matched, otherwise ``(None, 0.0)``.
        """
        for pattern, confidence in self.NUMBER_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1), confidence
        return None, 0.0

    def _extract_series(self, text: str) -> Tuple[Optional[str], float]:
        """Extract the receipt series from text.

        Returns:
            ``(series, confidence)`` with the series upper-cased, otherwise
            ``(None, 0.0)``.
        """
        for pattern, confidence in self.SERIES_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1).upper(), confidence
        return None, 0.0

    def _extract_vendor(self, text: str) -> Tuple[Optional[str], float]:
        """Extract the vendor/partner name from text.

        The first of the top seven lines that is not empty, not purely
        numeric and free of boilerplate keywords is taken as the vendor.

        Returns:
            ``(vendor, confidence)`` or ``(None, 0.0)``. Confidence starts at
            0.8 for the first line and drops by 0.1 per line, never below 0.3.
        """
        lines = text.split('\n')
        for i, line in enumerate(lines[:7]):  # Check first 7 lines
            line = line.strip()
            # Skip empty lines
            if not line:
                continue
            # Skip lines that are just numbers
            if re.match(r'^[\d.,\s]+$', line):
                continue
            # Skip lines with keywords (whole words only)
            if self._VENDOR_SKIP_RE.search(line.upper()):
                continue
            # Clean the line: keep word characters, spaces and . , & -
            vendor = re.sub(r'[^\w\s.,&-]', '', line).strip()
            if len(vendor) >= 3:
                # Confidence decreases for lines further down; rounding
                # removes float noise such as 0.7000000000000001.
                confidence = round(max(0.3, 0.8 - (i * 0.1)), 2)
                return vendor, confidence
        return None, 0.0

    def _extract_cui(self, text: str) -> Tuple[Optional[str], float]:
        """Extract the CUI (fiscal identification code) from text.

        Returns:
            ``(digits, confidence)`` without the optional "RO" prefix,
            otherwise ``(None, 0.0)``.
        """
        for pattern, confidence in self.CUI_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                cui = match.group(1)
                # Same bounds as the \d{6,10} quantifier; kept as a guard in
                # case a pattern in the table is changed.
                if 6 <= len(cui) <= 10:
                    return cui, confidence
        return None, 0.0

    def _detect_receipt_type(self, text: str) -> str:
        """Detect the receipt type from (upper-cased) text content.

        Returns:
            ``'chitanta'`` when the word appears in the text, with or without
            diacritics, otherwise ``'bon_fiscal'``.
        """
        if self._CHITANTA_RE.search(text):
            return 'chitanta'
        return 'bon_fiscal'