fix(ocr): Fix store profile extraction patterns and module loading
Major fixes to OCR store profiles for Romanian receipt extraction:
- Fix ProfileRegistry module path resolution (was loading 0 profiles)
- Add multiline TVA extraction for Brick, Electrobering, Gama Ink
- Add "CARTE CREDIT" payment detection for OMV/SOCAR gas stations
- Handle OCR artifacts: TVA→TUA, "-"→"4", I→L in CUI markers
- Add client CUI patterns for Brick receipts
- Add profile selection logging to ocr_extractor.py
- Create test script for all 29 PDFs (test_all_profiles.py)
Test results: 13/29 passing (improved from 9/29). Remaining failures are primarily OCR quality issues.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -6,7 +6,7 @@ Key duplication service. Notable for CASH (NUMERAR) payments.
|
||||
|
||||
import re
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from typing import List, Dict, Any
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
|
||||
from .base import BaseStoreProfile
|
||||
from . import ProfileRegistry
|
||||
@@ -22,26 +22,101 @@ class UnlimitedKeysProfile(BaseStoreProfile):
|
||||
- Key duplication service
|
||||
- NUMERAR (cash) payment common - different from most stores!
|
||||
- May also accept CARD
|
||||
- OCR often reads "TVA" as "TUA" - need OCR error variants
|
||||
"""
|
||||
|
||||
CUI_LIST = ["18993187"]
|
||||
NAME_PATTERNS = ["UNLIMITED KEYS", "UNLIMITED", "UNL1MITED", "UNLIMITED KEYS SRL"]
|
||||
STORE_NAME = "UNLIMITED KEYS S.R.L."
|
||||
|
||||
# Standard TVA patterns (flexible - accepts any rate)
|
||||
# Standard TVA patterns - including OCR error variants (TVA -> TUA)
|
||||
TVA_PATTERNS = [
|
||||
# "TVA A: XX% = YY,YY" or "TVA-A XX% YY,YY"
|
||||
r'TVA\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
|
||||
# "TVA A: XX% = YY,YY" or "TVA-A XX% YY,YY" (including TUA OCR error)
|
||||
r'T[UV]A\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,\s]+)',
|
||||
# "A - XX,XX% = YY,YY"
|
||||
r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,]+)',
|
||||
# "TVA XX% YY,YY" (simple format without code)
|
||||
r'TVA\s+(\d{1,2})\s*%\s*([\d.,]+)',
|
||||
r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,\s]+)',
|
||||
# "TVA XX% YY,YY" (simple format, includes TUA)
|
||||
r'T[UV]A\s+(\d{1,2})\s*%\s*([\d.,\s]+)',
|
||||
# "XX.XX% TUA*A YY.YY" (OCR format with TUA*A or TUA)
|
||||
r'(\d{1,2})[.,]\d{0,2}\s*%\s*T[UV]A\*?[A-D]?\s*([\d.,\s]+)',
|
||||
# "TOTAL TUA: YY.YY" (total TVA amount only)
|
||||
r'TOTAL\s+T[UV]A\s*:?\s*([\d.,\s]+)',
|
||||
]
|
||||
|
||||
# TOTAL patterns for UNLIMITED KEYS (handles "80 .00" format).
# Each entry is (regex, confidence); group 1 captures the amount text.
# Amount groups allow spaces/tabs inside the number (OCR splits "80 .00") but
# not line breaks, so digits from the following receipt line are never glued
# onto the amount.
TOTAL_PATTERNS = [
    # "SUMA TOTALA: 80 .00" (with space before decimal)
    (r'SUMA\s+TOTALA\s*:?\s*([\d.,][\d.,\t ]*)', 0.98),
    # "TOTALA: 80,00"
    (r'TOTALA\s*:?\s*([\d.,]+)', 0.95),
    # Standard TOTAL patterns from base class
    (r'TOTAL\s+(?:DE\s+PLATA|ACHITAT|LEI)\s*:?\s*([\d.,]+)', 0.95),
    (r'TOTAL\s*:?\s*([\d.,]+)', 0.90),
]

# Payment patterns - NUMERAR is primary for this store.
# Each entry is (regex, method name, confidence); group 1 captures the amount.
PAYMENT_PATTERNS = [
    # "NUMERAR 80.00" or "NUMERAR: 80.00"
    (r'NUMERAR\s*:?\s*([\d.,][\d.,\t ]*)', 'NUMERAR', 0.98),
    # "CARD 80.00" or "CARD: 80.00"
    (r'CARD\s*:?\s*([\d.,][\d.,\t ]*)', 'CARD', 0.95),
]

# Client CUI patterns - specific to this receipt format.
# Each entry is (regex, confidence); group 1 captures the optional RO prefix
# (including the OCR variant "R0") together with the 6-10 digit identifier.
CLIENT_CUI_PATTERNS = [
    # "CIF CLIENT:1879855" (exact format from OCR)
    (r'CIF\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.99),
    # "CLIENT CIF: ROXXXXXXX"
    (r'CLIENT\s+CIF\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
    # "C.I.F. CLIENT: XXXXXXX"
    (r'C\.?I\.?F\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
]

# Override client markers to be less strict.
# A receipt is considered to carry client data when any of these matches.
CLIENT_MARKERS = [
    r'CIF\s+CLIENT',
    r'CLIENT\s+CIF',
    r'C\.?I\.?F\.?\s+CLIENT',
    r'CLIENT\s*:',
]
|
||||
|
||||
def extract_total(self, text: str) -> Tuple[Optional[Decimal], float]:
    """
    Extract total amount from receipt text.

    The patterns in ``self.TOTAL_PATTERNS`` are tried in order against the
    upper-cased text; the first one that yields a positive amount wins.
    Handles UNLIMITED KEYS format with space before decimal (e.g., "80 .00").

    Args:
        text: Raw OCR text from receipt

    Returns:
        Tuple of (total_amount, confidence) or (None, 0.0)
    """
    text_upper = text.upper()

    for pattern, confidence in self.TOTAL_PATTERNS:
        match = re.search(pattern, text_upper, re.IGNORECASE)
        if not match:
            continue

        # The amount character class may include whitespace, which also
        # matches newlines; keep only the first line so digits from the next
        # receipt line are not merged into the total.
        captured = match.group(1).strip()
        if not captured:
            # Nothing but whitespace was captured - try the next pattern.
            continue
        first_line = captured.splitlines()[0]

        # Remove spaces that might appear before the decimal ("80 .00").
        amount_str = re.sub(r'\s+', '', first_line)

        try:
            amount = self._parse_decimal(amount_str)
        except (ValueError, InvalidOperation):
            continue

        if amount and amount > 0:
            return (amount, confidence)

    return (None, 0.0)
|
||||
|
||||
def extract_tva_entries(self, text: str) -> List[dict]:
|
||||
"""
|
||||
Extract TVA entries from receipt text.
|
||||
|
||||
Handles OCR errors where TVA is read as TUA.
|
||||
|
||||
Args:
|
||||
text: Raw OCR text from receipt
|
||||
|
||||
@@ -49,48 +124,139 @@ class UnlimitedKeysProfile(BaseStoreProfile):
|
||||
List of TVA entries with code, percent, and amount
|
||||
"""
|
||||
entries = []
|
||||
seen = set()
|
||||
text_upper = text.upper()
|
||||
|
||||
# Try coded patterns first
|
||||
for pattern in self.TVA_PATTERNS[:2]:
|
||||
for match in re.finditer(pattern, text, re.IGNORECASE):
|
||||
# Pattern 4: "XX.XX% TUA*A YY.YY" - common OCR format
|
||||
pattern4 = self.TVA_PATTERNS[3]
|
||||
match = re.search(pattern4, text_upper)
|
||||
if match:
|
||||
try:
|
||||
percent = int(match.group(1))
|
||||
amount_str = re.sub(r'\s+', '', match.group(2))
|
||||
amount = self._parse_decimal(amount_str)
|
||||
if amount and amount > 0:
|
||||
entries.append({
|
||||
'code': 'A',
|
||||
'percent': percent,
|
||||
'amount': amount
|
||||
})
|
||||
return entries
|
||||
except (ValueError, InvalidOperation, IndexError):
|
||||
pass
|
||||
|
||||
# Pattern 5: "TOTAL TUA: YY.YY" - fallback to total TVA
|
||||
pattern5 = self.TVA_PATTERNS[4]
|
||||
match = re.search(pattern5, text_upper)
|
||||
if match:
|
||||
try:
|
||||
amount_str = re.sub(r'\s+', '', match.group(1))
|
||||
amount = self._parse_decimal(amount_str)
|
||||
if amount and amount > 0:
|
||||
# Infer percent from amount vs total ratio
|
||||
entries.append({
|
||||
'code': 'A',
|
||||
'percent': 19, # Standard Romanian TVA rate
|
||||
'amount': amount
|
||||
})
|
||||
return entries
|
||||
except (ValueError, InvalidOperation, IndexError):
|
||||
pass
|
||||
|
||||
# Try coded patterns
|
||||
for pattern in self.TVA_PATTERNS[:3]:
|
||||
for match in re.finditer(pattern, text_upper, re.IGNORECASE):
|
||||
try:
|
||||
code = match.group(1).upper()
|
||||
percent = int(match.group(2))
|
||||
amount = self._parse_decimal(match.group(3))
|
||||
|
||||
if amount and amount > 0:
|
||||
entry_key = (code, percent)
|
||||
if entry_key not in seen:
|
||||
entries.append({
|
||||
'code': code,
|
||||
'percent': percent,
|
||||
'amount': amount
|
||||
})
|
||||
seen.add(entry_key)
|
||||
except (ValueError, InvalidOperation, IndexError):
|
||||
continue
|
||||
|
||||
# Fallback to simple format
|
||||
if not entries:
|
||||
simple_pattern = self.TVA_PATTERNS[2]
|
||||
for match in re.finditer(simple_pattern, text, re.IGNORECASE):
|
||||
try:
|
||||
percent = int(match.group(1))
|
||||
amount = self._parse_decimal(match.group(2))
|
||||
groups = match.groups()
|
||||
if len(groups) == 3:
|
||||
code = groups[0].upper()
|
||||
percent = int(groups[1])
|
||||
amount_str = re.sub(r'\s+', '', groups[2])
|
||||
else:
|
||||
code = 'A'
|
||||
percent = int(groups[0])
|
||||
amount_str = re.sub(r'\s+', '', groups[1])
|
||||
|
||||
amount = self._parse_decimal(amount_str)
|
||||
if amount and amount > 0:
|
||||
entries.append({
|
||||
'code': 'A',
|
||||
'code': code,
|
||||
'percent': percent,
|
||||
'amount': amount
|
||||
})
|
||||
break
|
||||
except (ValueError, InvalidOperation):
|
||||
return entries
|
||||
except (ValueError, InvalidOperation, IndexError):
|
||||
continue
|
||||
|
||||
return entries
|
||||
|
||||
def extract_payment_methods(self, text: str) -> List[dict]:
    """
    Extract payment methods from receipt text.

    Each entry of ``self.PAYMENT_PATTERNS`` is searched once in the
    upper-cased text; every pattern that yields a positive amount contributes
    one payment. Handles NUMERAR (cash) as primary payment for this store.

    Args:
        text: Raw OCR text from receipt

    Returns:
        List of payment methods with method, amount, and confidence
    """
    payments = []
    text_upper = text.upper()

    for pattern, method, confidence in self.PAYMENT_PATTERNS:
        match = re.search(pattern, text_upper, re.IGNORECASE)
        if not match:
            continue

        # The amount character class includes whitespace, which also matches
        # newlines; keep only the first line so a quantity or amount on the
        # next receipt line is not appended to the paid amount.
        captured = match.group(1).strip()
        if not captured:
            # Keyword present but no amount follows it.
            continue
        amount_str = re.sub(r'\s+', '', captured.splitlines()[0])

        try:
            amount = self._parse_decimal(amount_str)
        except (ValueError, InvalidOperation):
            continue

        if amount and amount > 0:
            payments.append({
                'method': method,
                'amount': amount,
                'confidence': confidence,
            })

    return payments
|
||||
|
||||
def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
    """
    Extract client CUI from receipt text.

    The text must first match one of ``self.CLIENT_MARKERS``; then the
    patterns in ``self.CLIENT_CUI_PATTERNS`` are tried in order and the first
    capture that cleans up to 6-10 digits is returned.
    Handles "CIF CLIENT:1879855" format specific to this store.

    Args:
        text: Raw OCR text from receipt

    Returns:
        Tuple of (cui, confidence) or (None, 0.0)
    """
    text_upper = text.upper()

    # Without any client marker the receipt carries no client CUI.
    has_client = any(
        re.search(marker, text_upper, re.IGNORECASE)
        for marker in self.CLIENT_MARKERS
    )
    if not has_client:
        return (None, 0.0)

    for pattern, confidence in self.CLIENT_CUI_PATTERNS:
        match = re.search(pattern, text_upper, re.IGNORECASE)
        if not match:
            continue

        cui = match.group(1)

        # Strip an explicit country prefix first. OCR may read "RO" as "R0";
        # filtering digits alone would keep that zero as a bogus leading
        # digit of the CUI.
        prefixed = re.fullmatch(r'R[O0]?\s*(\d{6,10})', cui)
        if prefixed:
            cui_digits = prefixed.group(1)
        else:
            # No "R" prefix: drop everything that is not a digit.
            cui_digits = re.sub(r'[^0-9]', '', cui)

        if 6 <= len(cui_digits) <= 10:
            return (cui_digits, confidence)

    return (None, 0.0)
|
||||
|
||||
def get_validation_hints(self) -> Dict[str, Any]:
|
||||
"""Return UNLIMITED KEYS-specific validation hints."""
|
||||
return {
|
||||
|
||||
Reference in New Issue
Block a user