Files
roa2web-service-auto/backend/modules/data_entry/services/ocr/profiles/unlimited_keys.py
Claude Agent 28f259cd05 fix(ocr): Fix store profile extraction patterns and module loading
Major fixes to OCR store profiles for Romanian receipt extraction:

- Fix ProfileRegistry module path resolution (was loading 0 profiles)
- Add multiline TVA extraction for Brick, Electrobering, Gama Ink
- Add "CARTE CREDIT" payment detection for OMV/SOCAR gas stations
- Handle OCR artifacts: TVA→TUA, "-"→"4", I→L in CUI markers
- Add client CUI patterns for Brick receipts
- Add profile selection logging to ocr_extractor.py
- Create test script for all 29 PDFs (test_all_profiles.py)

Test results: 13/29 passing (improved from 9/29)
Remaining failures are primarily OCR quality issues.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-07 09:40:58 +00:00

270 lines
9.3 KiB
Python

"""
UNLIMITED KEYS S.R.L. store profile for OCR extraction.
Key duplication service. Notable for CASH (NUMERAR) payments.
"""
import re
from decimal import Decimal, InvalidOperation
from typing import List, Dict, Any, Optional, Tuple
from .base import BaseStoreProfile
from . import ProfileRegistry
@ProfileRegistry.register
class UnlimitedKeysProfile(BaseStoreProfile):
    """
    UNLIMITED KEYS S.R.L. - standard TVA profile with NUMERAR payment.

    Key characteristics:
    - Standard TVA format (single rate, any percentage)
    - Key duplication service
    - NUMERAR (cash) payment common - different from most stores!
    - May also accept CARD
    - OCR often reads "TVA" as "TUA" - need OCR error variants
    """

    CUI_LIST = ["18993187"]
    NAME_PATTERNS = ["UNLIMITED KEYS", "UNLIMITED", "UNL1MITED", "UNLIMITED KEYS SRL"]
    STORE_NAME = "UNLIMITED KEYS S.R.L."

    # Rate reported for a bare "TOTAL TVA" line when it cannot be inferred
    # from the receipt total.
    # NOTE(review): 19 is carried over from the previous hard-coded value -
    # confirm it is still the desired default.
    DEFAULT_TVA_PERCENT = 19

    # Standard TVA patterns - including OCR error variants (TVA -> TUA).
    # extract_tva_entries() addresses these by index, so keep the order.
    TVA_PATTERNS = [
        # "TVA A: XX% = YY,YY" or "TVA-A XX% YY,YY" (including TUA OCR error)
        r'T[UV]A\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,\s]+)',
        # "A - XX,XX% = YY,YY"
        r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,\s]+)',
        # "TVA XX% YY,YY" (simple format, includes TUA)
        r'T[UV]A\s+(\d{1,2})\s*%\s*([\d.,\s]+)',
        # "XX.XX% TUA*A YY.YY" (OCR format with TUA*A or TUA)
        r'(\d{1,2})[.,]\d{0,2}\s*%\s*T[UV]A\*?[A-D]?\s*([\d.,\s]+)',
        # "TOTAL TUA: YY.YY" (total TVA amount only)
        r'TOTAL\s+T[UV]A\s*:?\s*([\d.,\s]+)',
    ]

    # TOTAL patterns for UNLIMITED KEYS (handles "80 .00" format),
    # as (regex, confidence) pairs tried in order.
    TOTAL_PATTERNS = [
        # "SUMA TOTALA: 80 .00" (with space before decimal)
        (r'SUMA\s+TOTALA\s*:?\s*([\d\s.,]+)', 0.98),
        # "TOTALA: 80,00"
        (r'TOTALA\s*:?\s*([\d.,]+)', 0.95),
        # Generic TOTAL fallbacks
        (r'TOTAL\s+(?:DE\s+PLATA|ACHITAT|LEI)\s*:?\s*([\d.,]+)', 0.95),
        (r'TOTAL\s*:?\s*([\d.,]+)', 0.90),
    ]

    # Payment patterns - NUMERAR is primary for this store.
    # (regex, method label, confidence) triples.
    PAYMENT_PATTERNS = [
        # "NUMERAR 80.00" or "NUMERAR: 80.00"
        (r'NUMERAR\s*:?\s*([\d.,\s]+)', 'NUMERAR', 0.98),
        # "CARD 80.00" or "CARD: 80.00"
        (r'CARD\s*:?\s*([\d.,\s]+)', 'CARD', 0.95),
    ]

    # Client CUI patterns - specific to this receipt format.
    # (regex, confidence) pairs tried in order.
    CLIENT_CUI_PATTERNS = [
        # "CIF CLIENT:1879855" (exact format from OCR)
        (r'CIF\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.99),
        # "CLIENT CIF: ROXXXXXXX"
        (r'CLIENT\s+CIF\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
        # "C.I.F. CLIENT: XXXXXXX"
        (r'C\.?I\.?F\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
    ]

    # Markers that signal a client section is present on the receipt
    # (presumably replacing a stricter base-class list - verify against base).
    CLIENT_MARKERS = [
        r'CIF\s+CLIENT',
        r'CLIENT\s+CIF',
        r'C\.?I\.?F\.?\s+CLIENT',
        r'CLIENT\s*:',
    ]

    @staticmethod
    def _first_line_amount(raw: str) -> str:
        """
        Normalise a captured amount so it can be parsed as a number.

        The amount character classes in the patterns contain ``\\s``, so a
        capture may run past the end of the line into the digits of the next
        one. Only the first line is kept; whitespace inside it (e.g. the
        "80 .00" format) is removed.

        Args:
            raw: Text captured by an amount group

        Returns:
            The compacted amount text, or an empty string if nothing is left
        """
        lines = raw.strip().splitlines()
        if not lines:
            return ''
        return re.sub(r'\s+', '', lines[0])

    def _infer_tva_percent(self, text: str, tva_amount: Decimal) -> int:
        """
        Derive the TVA rate from the TVA amount and the receipt total.

        The rate is ``tva / (total - tva) * 100`` rounded to a whole percent.
        It is only trusted when it lands close to a whole number in the
        1-99 range; otherwise DEFAULT_TVA_PERCENT is returned.

        Args:
            text: Raw OCR text from receipt
            tva_amount: TVA amount already extracted from the receipt

        Returns:
            Inferred percent, or DEFAULT_TVA_PERCENT when it cannot be inferred
        """
        default = self.DEFAULT_TVA_PERCENT
        total, _ = self.extract_total(text)
        if total is None:
            return default
        try:
            net = total - tva_amount
            if net <= 0:
                return default
            rate = tva_amount * 100 / net
            percent = int(round(rate))
            # A rate far from a whole number points to a misread total/TVA.
            if abs(rate - percent) > Decimal('0.25'):
                return default
        except (TypeError, ArithmeticError):
            return default
        return percent if 1 <= percent <= 99 else default

    def extract_total(self, text: str) -> Tuple[Optional[Decimal], float]:
        """
        Extract total amount from receipt text.

        Handles UNLIMITED KEYS format with space before decimal (e.g., "80 .00").
        TOTAL_PATTERNS are tried in order; the first one yielding a positive
        amount wins.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (total_amount, confidence) or (None, 0.0)
        """
        text_upper = text.upper()
        for pattern, confidence in self.TOTAL_PATTERNS:
            match = re.search(pattern, text_upper, re.IGNORECASE)
            if not match:
                continue
            try:
                amount = self._parse_decimal(self._first_line_amount(match.group(1)))
                if amount and amount > 0:
                    return (amount, confidence)
            except (ValueError, InvalidOperation):
                continue
        return (None, 0.0)

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Extract TVA entries from receipt text.

        Handles OCR errors where TVA is read as TUA. Formats are tried in this
        order and the first positive amount is returned as a single entry:
        1. "XX.XX% TUA*A YY.YY"
        2. "TOTAL TUA: YY.YY" (rate inferred from the receipt total)
        3. The coded / simple "TVA ..." formats

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of TVA entries with code, percent, and amount (empty if none)
        """
        text_upper = text.upper()

        # "XX.XX% TUA*A YY.YY" - common OCR format
        match = re.search(self.TVA_PATTERNS[3], text_upper)
        if match:
            try:
                percent = int(match.group(1))
                amount = self._parse_decimal(self._first_line_amount(match.group(2)))
                if amount and amount > 0:
                    return [{'code': 'A', 'percent': percent, 'amount': amount}]
            except (ValueError, InvalidOperation, IndexError):
                pass

        # "TOTAL TUA: YY.YY" - only the TVA amount is printed
        match = re.search(self.TVA_PATTERNS[4], text_upper)
        if match:
            try:
                amount = self._parse_decimal(self._first_line_amount(match.group(1)))
                if amount and amount > 0:
                    return [{
                        'code': 'A',
                        'percent': self._infer_tva_percent(text, amount),
                        'amount': amount,
                    }]
            except (ValueError, InvalidOperation, IndexError):
                pass

        # Coded patterns: three groups (code, percent, amount) or two (percent, amount)
        for pattern in self.TVA_PATTERNS[:3]:
            for match in re.finditer(pattern, text_upper, re.IGNORECASE):
                try:
                    groups = match.groups()
                    if len(groups) == 3:
                        code = groups[0].upper()
                        percent = int(groups[1])
                        raw_amount = groups[2]
                    else:
                        code = 'A'
                        percent = int(groups[0])
                        raw_amount = groups[1]
                    amount = self._parse_decimal(self._first_line_amount(raw_amount))
                    if amount and amount > 0:
                        return [{'code': code, 'percent': percent, 'amount': amount}]
                except (ValueError, InvalidOperation, IndexError):
                    continue
        return []

    def extract_payment_methods(self, text: str) -> List[dict]:
        """
        Extract payment methods from receipt text.

        Handles NUMERAR (cash) as primary payment for this store. Each entry
        in PAYMENT_PATTERNS contributes at most one payment (its first match).

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of payment methods with method, amount, and confidence
        """
        payments = []
        text_upper = text.upper()
        for pattern, method, confidence in self.PAYMENT_PATTERNS:
            match = re.search(pattern, text_upper, re.IGNORECASE)
            if not match:
                continue
            try:
                amount = self._parse_decimal(self._first_line_amount(match.group(1)))
                if amount and amount > 0:
                    payments.append({
                        'method': method,
                        'amount': amount,
                        'confidence': confidence,
                    })
            except (ValueError, InvalidOperation):
                continue
        return payments

    def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
        """
        Extract client CUI from receipt text.

        Handles "CIF CLIENT:1879855" format specific to this store. An "RO"
        prefix (also when OCR renders it as "R0") is not part of the result.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (cui, confidence) or (None, 0.0)
        """
        text_upper = text.upper()

        # Without any client marker there is no client section to read.
        has_client = any(
            re.search(marker, text_upper, re.IGNORECASE)
            for marker in self.CLIENT_MARKERS
        )
        if not has_client:
            return (None, 0.0)

        for pattern, confidence in self.CLIENT_CUI_PATTERNS:
            match = re.search(pattern, text_upper, re.IGNORECASE)
            if not match:
                continue
            # Strip the country prefix before collecting digits: in "R0..."
            # the zero is a misread "O" and must not become a CUI digit.
            without_prefix = re.sub(r'^R[O0]?\s*', '', match.group(1))
            cui_digits = re.sub(r'[^0-9]', '', without_prefix)
            if 6 <= len(cui_digits) <= 10:
                return (cui_digits, confidence)
        return (None, 0.0)

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return UNLIMITED KEYS-specific validation hints."""
        return {
            "has_multi_rate_tva": False,
            "card_equals_total": False,  # May be NUMERAR (cash)
            "has_client_cui": True,  # May have client CUI
            "has_efactura": False,
            "is_non_vat_payer": False,
            "common_payment": "NUMERAR",  # Cash payments common
        }