Files
roa2web-service-auto/backend/modules/data_entry/services/ocr/profiles/base.py
Claude Agent 28f259cd05 fix(ocr): Fix store profile extraction patterns and module loading
Major fixes to OCR store profiles for Romanian receipt extraction:

- Fix ProfileRegistry module path resolution (was loading 0 profiles)
- Add multiline TVA extraction for Brick, Electrobering, Gama Ink
- Add "CARTE CREDIT" payment detection for OMV/SOCAR gas stations
- Handle OCR artifacts: TVA→TUA, "-"→"4", I→L in CUI markers
- Add client CUI patterns for Brick receipts
- Add profile selection logging to ocr_extractor.py
- Create test script for all 29 PDFs (test_all_profiles.py)

Test results: 13/29 passing (improved from 9/29)
Remaining failures are primarily OCR quality issues.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-07 09:40:58 +00:00

525 lines
20 KiB
Python

"""
Base class for store-specific OCR extraction profiles.

Each store can have different receipt formats (TVA layout, total position, etc.).
Store profiles allow customizing extraction logic per store for better accuracy.

Usage:
    from .base import BaseStoreProfile
    from . import ProfileRegistry

    @ProfileRegistry.register
    class LidlProfile(BaseStoreProfile):
        CUI_LIST = ["22891860"]
        NAME_PATTERNS = ["LIDL", "LDL"]

        def extract_tva_entries(self, text: str) -> List[dict]:
            # Custom Lidl TVA extraction logic
            ...
"""
import re
from abc import ABC
from decimal import Decimal, InvalidOperation
from typing import List, Optional, Tuple, Dict, Any
from datetime import date
class BaseStoreProfile(ABC):
    """
    Base class for store-specific extraction profiles.

    Each profile defines:
    - CUI_LIST: CUI codes that identify this store (without RO prefix)
    - NAME_PATTERNS: OCR-tolerant name patterns for fallback matching
    - Custom extraction methods for TVA, total, date, etc.

    The ProfileRegistry uses CUI_LIST to lookup profiles during extraction.

    All pattern tables below are matched against the upper-cased receipt text
    and are tried in list order; the first usable match wins, so entries must
    be ordered from most to least trusted.
    """

    # -------------------------------------------------------------------------
    # Class attributes - override in subclasses
    # -------------------------------------------------------------------------
    # List of CUI codes (without RO prefix) that identify this store
    CUI_LIST: List[str] = []
    # OCR-tolerant name patterns for fallback matching
    NAME_PATTERNS: List[str] = []
    # Store display name
    STORE_NAME: str = "Unknown Store"

    # -------------------------------------------------------------------------
    # Generic patterns - can be overridden in subclasses
    # -------------------------------------------------------------------------
    # Total amount patterns (pattern, confidence).
    # The (?<!SUB...) lookbehinds stop the TOTAL patterns from matching the
    # tail of "SUBTOTAL"; subtotals are handled by their own, lower-confidence
    # entry instead of being reported as the receipt total.
    TOTAL_PATTERNS = [
        (r'(?<!SUB)T[O0]TAL[.\s]+L[E3][I1!]\s*:?\s*([\d\s.,]+)', 0.98),
        (r'(?<!SUB)TOTAL\s+LEI\s*([\d\s.,]+)', 0.98),
        (r'(?<!SUB)(?<!SUBT)[OT]?OTAL\s+LEI\s*([\d\s.,]+)', 0.95),
        (r'(?<!SUB)TOTAL\s*:?\s*([\d\s.,]+)\s*(?:RON|LEI)?', 0.95),
        (r'(?<!SUB)TOTAL\s+(?:RON|LEI)\s*([\d\s.,]+)', 0.95),
        (r'SUBT[O0]TAL(?:\s+(?:RON|LEI))?\s*:?\s*([\d\s.,]+)', 0.90),
        (r'DE\s+PLATA\s*:?\s*([\d\s.,]+)', 0.90),
        (r'SUMA\s*:?\s*([\d\s.,]+)', 0.85),
    ]

    # Date patterns (pattern, confidence); group 1 is the whole date string
    DATE_PATTERNS = [
        (r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
        (r'DATA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
        (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+[O0]RA\s*:?\s*\d{2}:\d{2}', 0.95),
        (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+\d{2}:\d{2}', 0.90),
        (r'(\d{2}[-./]\d{2}[-./]\d{4})', 0.80),
        (r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75),
    ]

    # Date patterns with OCR-introduced spaces (separate because format is
    # different): (pattern, confidence, field order). Groups 1-3 are the three
    # numeric fields in the order named by the last element.
    DATE_PATTERNS_OCR_SPACES = [
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.92, 'ymd'),
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.85, 'ymd'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
    ]

    # Receipt number patterns (pattern, confidence)
    NUMBER_PATTERNS = [
        (r'NDS\s*:?\s*(\d+)', 0.98),
        (r'C3POS[-A-Z0-9]*[N:](\d{6,7})', 0.98),
        (r'C3POS.*?(\d{6,7})\b', 0.95),
        (r'BF\s*:\s*(\d{4,})', 0.96),
        (r'BF\s+(\d{4,})', 0.93),
        (r'NIVS\s*:?\s*(\d+)', 0.95),
        (r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95),
        (r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95),
        (r'CHITANTA\s+NR\.?\s*:?\s*(\d+)', 0.95),
        (r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90),
        (r'ID\s*BF\s*:?\s*(\d+)', 0.90),
    ]

    # Payment method patterns (pattern, method_type, confidence)
    PAYMENT_PATTERNS = [
        (r'CARTE\s+CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.98),
        (r'CARTE\s+CREDIT\s*:?\s*\n\s*([\d\s.,]+)', 'CARD', 0.97),
        (r'(?:PLATA\s+)?CARD\s*[:\sA-Z]?\s*([\d\s.,]+)', 'CARD', 0.95),
        (r'NUMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.95),
        (r'CASH\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.90),
        (r'(?:^|\n|\s)RD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.70),
        (r'(?:^|\n|\s)ARD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.75),
        (r'(?:^|\n|\s)MERAR\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'NUMERAR', 0.70),
    ]

    # Client section markers (for B2B receipts) - More flexible patterns
    CLIENT_MARKERS = [
        r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT',    # "CIF CLIENT" (with or without colon)
        r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT',    # "CUI CLIENT"
        r'CLIENT\s+C\.?\s*[UI1]\.?\s*[IF1]',  # "CLIENT CIF" / "CLIENT CUI"
        r'CLIENT\s*:',                        # "CLIENT:"
        r'CUMPARATOR\s*:',                    # "CUMPARATOR:"
        r'BENEFICIAR\s*:',                    # "BENEFICIAR:"
        r'CUMP[AĂ]R[AĂ]TOR',                  # "CUMPARATOR" without colon
        r'COD\s+FISCAL\s+CLIENT',             # "COD FISCAL CLIENT"
    ]

    # Client CUI patterns (pattern, confidence), ordered by descending
    # confidence. The generic "CLIENT: ..." entry must stay last: it also
    # matches inside every more specific "... CLIENT: ..." form and would
    # otherwise shadow it.
    CLIENT_CUI_PATTERNS = [
        # "CIF CLIENT:XXXXXXX" or "CIF CLIENT: ROXXXXXXX" - most common format
        (r'C\.?[I1]\.?F\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.99),
        # "CLIENT CIF: XXXXXXX"
        (r'CLIENT\s+C\.?[I1]\.?F\.?\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
        # "CUI CLIENT: XXXXXXX"
        (r'C\.?U\.?[I1]\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
        # "ROXXXXXXX" followed by CLIENT marker
        (r'(R[O0]\d{6,10})\s*\n\s*CLIENT', 0.97),
        # "C.I.F. CLIENT: XXXXXXX"
        (r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.96),
        # "COD FISCAL CLIENT: XXXXXXX"
        (r'COD\s+FISCAL\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.95),
        # "CLIENT: ROXXXXXXX" or "CLIENT: XXXXXXX"
        (r'CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.90),
    ]

    # Company type indicators (for identifying company names)
    COMPANY_INDICATORS = [
        r'\bS\.?\s*R\.?\s*L\.?\b',     # S.R.L. or S. R. L.
        r'\bS\.?\s*A\.?\b',            # S.A. or S. A.
        r'\bS\.?\s*N\.?\s*C\.?\b',     # S.N.C. or S. N. C.
        r'\bS\.?\s*C\.?\s*S\.?\b',     # S.C.S. or S. C. S.
        r'\bI\.?\s*I\.?\b',            # I.I. or I. I.
        r'\bP\.?\s*F\.?\s*A\.?\b',     # P.F.A. or P. F. A.
        r'\bS\.?\s*C\.?\s+[A-Z]',      # S.C. followed by company name
        r'HOLDING',
        r'COMPANY',
        r'GROUP',
    ]

    # Maximum reasonable payment amount (to filter OCR errors)
    MAX_PAYMENT = Decimal('100000')

    # A line that mentions a CUI/CIF label (used to skip non-name lines)
    _CUI_LABEL_PATTERN = r'C\.?\s*[UI]\.?\s*[IF]\.?'
    # A bare CUI value, same shape as the capture group of CLIENT_CUI_PATTERNS
    # (optional OCR-tolerant "RO" prefix, optional whitespace, 6-10 digits)
    _CUI_VALUE_PATTERN = r'^R?[O0]?\s*\d{6,10}$'

    # -------------------------------------------------------------------------
    # Extraction methods - override in subclasses as needed
    # -------------------------------------------------------------------------
    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Extract TVA entries from receipt text.

        Override this method in subclasses to handle store-specific TVA formats.
        The base implementation extracts nothing.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of dicts with keys: code, percent, amount
        """
        return []

    def extract_total(self, text: str) -> Tuple[Optional[Decimal], float]:
        """
        Extract total amount from receipt text.

        Patterns from TOTAL_PATTERNS are tried in order; the first one whose
        captured amount parses to a value in (0, MAX_PAYMENT) is returned.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (amount, confidence) or (None, 0.0)
        """
        text_upper = text.upper()
        for pattern, confidence in self.TOTAL_PATTERNS:
            match = re.search(pattern, text_upper)
            if not match:
                continue
            amount = self._parse_decimal(match.group(1))
            if amount and 0 < amount < self.MAX_PAYMENT:
                return (amount, confidence)
        return (None, 0.0)

    def extract_date(self, text: str) -> Tuple[Optional[date], float]:
        """
        Extract receipt date from text.

        Every occurrence of each pattern is examined, so a date-like token
        that fails validation (e.g. "99.99.2024") does not hide a valid date
        that appears later in the text.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (date, confidence) or (None, 0.0)
        """
        text_upper = text.upper()

        # Try standard patterns first
        for pattern, confidence in self.DATE_PATTERNS:
            for match in re.finditer(pattern, text_upper):
                parsed = self._parse_date(match.group(1))
                if parsed:
                    return (parsed, confidence)

        # Try OCR-corrupted patterns with spaces
        for pattern, confidence, fmt in self.DATE_PATTERNS_OCR_SPACES:
            for match in re.finditer(pattern, text_upper):
                try:
                    first, month, last = (int(match.group(i)) for i in (1, 2, 3))
                    year, day = (first, last) if fmt == 'ymd' else (last, first)
                    if 1 <= day <= 31 and 1 <= month <= 12 and 2000 <= year <= 2100:
                        # date() still rejects impossible days such as 31 February
                        return (date(year, month, day), confidence)
                except (ValueError, TypeError):
                    continue
        return (None, 0.0)

    def extract_receipt_number(self, text: str) -> Tuple[Optional[str], float]:
        """
        Extract receipt number from text.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (number, confidence) or (None, 0.0). Numbers shorter
            than 3 characters are ignored.
        """
        text_upper = text.upper()
        for pattern, confidence in self.NUMBER_PATTERNS:
            match = re.search(pattern, text_upper)
            if not match:
                continue
            number = match.group(1).strip()
            if number and len(number) >= 3:
                return (number, confidence)
        return (None, 0.0)

    def extract_payment_methods(self, text: str) -> List[dict]:
        """
        Extract payment methods (CARD/NUMERAR) from receipt.

        Supports multiple payments of the same type (e.g., 2x CARD for split
        payments). Each payment is returned as a separate entry with its amount.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of dicts: [{'method': 'CARD'/'NUMERAR', 'amount': Decimal, 'confidence': float}]
            Multiple entries of same method type are allowed for split payments.
        """
        text_upper = text.upper()
        methods = []
        # Track (method, amount) pairs to avoid exact duplicates from overlapping patterns
        seen_entries = set()
        for pattern, method, confidence in self.PAYMENT_PATTERNS:
            for match in re.finditer(pattern, text_upper):
                try:
                    amount = self._parse_decimal(match.group(1))
                    if not (amount and 0 < amount < self.MAX_PAYMENT):
                        continue
                    # Deduplicate by (method, amount) to avoid same entry from
                    # multiple patterns, but allow different amounts for the
                    # same method (split payments)
                    entry_key = (method, amount)
                    if entry_key in seen_entries:
                        continue
                    seen_entries.add(entry_key)
                    methods.append({
                        'method': method,
                        'amount': amount,
                        'confidence': confidence,
                    })
                except (ValueError, InvalidOperation):
                    continue
        return methods

    def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
        """
        Extract client CUI from B2B receipts.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (cui, confidence) or (None, 0.0). The CUI is returned as
            digits only, without the "RO" prefix.
        """
        text_upper = text.upper()

        # First check if there's a CLIENT section
        if not any(
            re.search(marker, text_upper, re.IGNORECASE)
            for marker in self.CLIENT_MARKERS
        ):
            return (None, 0.0)

        # Try to extract CUI
        for pattern, confidence in self.CLIENT_CUI_PATTERNS:
            match = re.search(pattern, text_upper, re.IGNORECASE | re.MULTILINE)
            if not match:
                continue
            # Drop the country prefix before collecting digits: OCR may read
            # "RO" as "R0", and that zero must not end up in the CUI itself.
            without_prefix = re.sub(r'^\s*R[O0]?', '', match.group(1))
            cui_digits = re.sub(r'[^0-9]', '', without_prefix)
            if 6 <= len(cui_digits) <= 10:
                return (cui_digits, confidence)
        return (None, 0.0)

    def extract_client_name(self, text: str) -> Tuple[Optional[str], float]:
        """
        Extract client/buyer company name from B2B receipts.

        The first line containing one of CLIENT_MARKERS anchors the search.
        Three strategies are tried in order: the text after ":" on the marker
        line, the line right after it, then any of the next four lines that
        carries a company indicator.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (client_name, confidence) or (None, 0.0)
        """
        lines = text.split('\n')

        # First check if there's a CLIENT section
        client_section_idx = None
        for i, raw_line in enumerate(lines):
            candidate = raw_line.upper().strip()
            if any(re.search(marker, candidate, re.IGNORECASE) for marker in self.CLIENT_MARKERS):
                client_section_idx = i
                break
        if client_section_idx is None:
            return (None, 0.0)

        marker_line = lines[client_section_idx].strip()

        # Strategy 1: Check if name is on same line after ":"
        if ':' in marker_line:
            name_part = marker_line.split(':', 1)[1].strip()
            # Skip values that are really the CUI ("RO12345678", "RO 12345678")
            if len(name_part) >= 3 and not self._looks_like_cui(name_part):
                if self._has_company_indicator(name_part.upper()):
                    return (self._clean_company_name(name_part), 0.95)
                if len(name_part) >= 5 and not name_part.isdigit():
                    return (self._clean_company_name(name_part), 0.80)

        # Lines inspected after the marker line (at most four)
        following = lines[client_section_idx + 1:client_section_idx + 5]

        # Strategy 2: Check next line for company name
        if following:
            next_line = following[0].strip()
            next_upper = next_line.upper()
            if not self._is_cui_line(next_upper):
                if self._has_company_indicator(next_upper):
                    return (self._clean_company_name(next_line), 0.90)
                if len(next_line) >= 5 and not next_line.isdigit():
                    # Check it's not CUI/CIF/COD keywords
                    if not any(kw in next_upper for kw in ['CUI', 'CIF', 'COD', 'FISCAL']):
                        return (self._clean_company_name(next_line), 0.75)

        # Strategy 3: Look for any line with company indicators in CLIENT section region
        for raw_line in following:
            candidate = raw_line.strip()
            candidate_upper = candidate.upper()
            if self._is_cui_line(candidate_upper):
                continue
            if self._has_company_indicator(candidate_upper):
                return (self._clean_company_name(candidate), 0.85)
        return (None, 0.0)

    def _has_company_indicator(self, text_upper: str) -> bool:
        """Return True if any COMPANY_INDICATORS pattern occurs in the text."""
        return any(re.search(ind, text_upper) for ind in self.COMPANY_INDICATORS)

    @classmethod
    def _looks_like_cui(cls, value: str) -> bool:
        """Return True if the value is a bare CUI, with or without RO prefix."""
        return re.match(cls._CUI_VALUE_PATTERN, value.strip().upper()) is not None

    @classmethod
    def _is_cui_line(cls, line_upper: str) -> bool:
        """Return True if the line carries a CUI/CIF label or is a bare CUI."""
        if re.search(cls._CUI_LABEL_PATTERN, line_upper):
            return True
        return cls._looks_like_cui(line_upper)

    @staticmethod
    def _clean_company_name(name: str) -> str:
        """Clean company name for storage."""
        if not name:
            return ""
        # Remove extra whitespace
        name = re.sub(r'\s+', ' ', name).strip()
        # Remove trailing punctuation except periods in S.R.L., S.A., etc.
        name = re.sub(r'[,;:]+$', '', name).strip()
        return name

    # -------------------------------------------------------------------------
    # Validation hints - override to customize validation behavior
    # -------------------------------------------------------------------------
    def get_validation_hints(self) -> Dict[str, Any]:
        """
        Return validation hints for this store.

        The base implementation returns no hints.

        Returns:
            Dict with validation hints. Common keys:
            - has_multi_rate_tva: bool - Store uses multiple TVA rates
            - card_equals_total: bool - CARD payment equals total
            - has_client_cui: bool - Receipt includes client CUI
            - has_efactura: bool - Store uses e-factura format
            - is_non_vat_payer: bool - Store is not a VAT payer
        """
        return {}

    # -------------------------------------------------------------------------
    # Helper methods - available to all subclasses
    # -------------------------------------------------------------------------
    @staticmethod
    def _normalize_number(text: str) -> str:
        """
        Normalize a number string for Decimal conversion.

        Handles Romanian formats: "1.234,56" -> "1234.56"

        The amount patterns capture with ``[\\d\\s.,]+``, which can run past
        the number itself, so the input is trimmed first:
        - only the first line is used (a capture spilling onto the next line
          would otherwise be unparseable);
        - whitespace inside that line is removed ("1 234,56");
        - trailing separators are dropped ("12.50," must not become 1250).

        Args:
            text: Captured amount text

        Returns:
            A string suitable for Decimal(); "0" for empty input. May be an
            empty string when the input holds no number at all.
        """
        if not text:
            return "0"
        captured_lines = text.strip().splitlines()
        if not captured_lines:
            return ""
        number = re.sub(r'\s+', '', captured_lines[0]).rstrip('.,')

        # The right-most separator is the decimal separator
        last_comma = number.rfind(",")
        last_dot = number.rfind(".")
        if last_comma > last_dot:
            return number.replace(".", "").replace(",", ".")
        if last_dot > last_comma:
            return number.replace(",", "")
        return number

    @staticmethod
    def _parse_decimal(text: str) -> Optional[Decimal]:
        """Parse a string to Decimal, handling various formats; None on failure."""
        try:
            return Decimal(BaseStoreProfile._normalize_number(text))
        except (InvalidOperation, ValueError, TypeError):
            return None

    @staticmethod
    def _parse_date(text: str) -> Optional[date]:
        """
        Parse date string in various formats.

        Supports: DD-MM-YYYY, DD/MM/YYYY, DD.MM.YYYY, YYYY-MM-DD

        Returns:
            The parsed date, or None if the text is not a valid date with a
            year between 2000 and 2100.
        """
        if not text:
            return None
        # Normalize separators
        parts = text.replace('/', '-').replace('.', '-').split('-')
        if len(parts) != 3:
            return None
        try:
            first, month, last = (int(part) for part in parts)
            # A four-character first field means YYYY-MM-DD, otherwise DD-MM-YYYY
            year, day = (first, last) if len(parts[0]) == 4 else (last, first)
            if 1 <= day <= 31 and 1 <= month <= 12 and 2000 <= year <= 2100:
                return date(year, month, day)
        except (ValueError, TypeError, IndexError):
            pass
        return None

    @staticmethod
    def _clean_text(text: str) -> str:
        """
        Clean OCR text for pattern matching.

        Removes control characters and collapses every whitespace run
        (including newlines) into a single space.
        """
        if not text:
            return ""
        # Drop non-whitespace control characters first so that removing one
        # between two spaces cannot leave a double space behind.
        text = re.sub(r'[\x00-\x08\x0e-\x1b\x7f]', '', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    # -------------------------------------------------------------------------
    # Magic methods
    # -------------------------------------------------------------------------
    def __repr__(self) -> str:
        return f"<{self.__class__.__name__} CUI={self.CUI_LIST}>"

    def __str__(self) -> str:
        return f"{self.STORE_NAME} ({', '.join(self.CUI_LIST)})"