fix(ocr): Fix store profile extraction patterns and module loading

Major fixes to OCR store profiles for Romanian receipt extraction:

- Fix ProfileRegistry module path resolution (was loading 0 profiles)
- Add multiline TVA extraction for Brick, Electrobering, Gama Ink
- Add "CARTE CREDIT" payment detection for OMV/SOCAR gas stations
- Handle OCR artifacts: TVA→TUA, "-"→"4", I→L in CUI markers
- Add client CUI patterns for Brick receipts
- Add profile selection logging to ocr_extractor.py
- Create test script for all 29 PDFs (test_all_profiles.py)

Test results: 13/29 passing (improved from 9/29)
Remaining failures are primarily OCR quality issues.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Claude Agent
2026-01-07 09:40:58 +00:00
parent 099556213d
commit 28f259cd05
13 changed files with 1531 additions and 257 deletions

View File

@@ -5,6 +5,7 @@ OMV receipts typically include client CUI and use standard TVA format.
Common at gas stations with fuel purchases.
Date format: YYYY. MM. DD with spaces (e.g., "2025. 08. 14")
OCR quirk: Numbers often have spaces before decimals (e.g., "55, 22" instead of "55,22")
"""
import re
@@ -24,17 +25,24 @@ class OMVProfile(BaseStoreProfile):
Key characteristics:
- Standard TVA format (usually single rate, any percentage)
- Includes client CUI on receipt (for business purchases)
- TVA table format: "A-XX,XX% base_amount tva_amount"
- TVA table format: "A-XX, XX% base_amount tva_amount" (with OCR spaces)
- Supports historical rates (19%) and current rates (21%)
- Date format: YYYY. MM. DD (with spaces)
- Client CUI format: "CLIENT C.U. I./C.I.F.: ROXXXXXXX"
"""
# Store identifiers: fiscal code (CUI), name variants as they appear in OCR
# text ("0MV" is the digit-zero misread of "OMV"), and the legal store name.
CUI_LIST = ["11201891"]
NAME_PATTERNS = ["OMV", "PETROM", "OMV PETROM", "0MV"]  # OCR variants
STORE_NAME = "OMV PETROM MARKETING S.R.L."

# OMV TVA table pattern: "A-19,00% 285,66 49,58" (code-percent base tva)
TVA_TABLE_PATTERN = r'([A-D])\s*[-:]\s*(\d{1,2})[.,]\d{2}\s*%\s+([\d.,]+)\s+([\d.,]+)'

# One monetary amount as OCR renders it: integer part (optionally with "."
# thousands groups), a decimal separator that may be surrounded by stray
# spaces, and exactly two decimals ("55, 22", "318,16", "1.234, 56").
# Whitespace is allowed only around the decimal separator, so a single
# capture can never run on into the next column or the next line.
_OCR_AMOUNT = r'\d+(?:\.\d{3})*[ \t]*[.,][ \t]*\d{2}(?!\d)'

# OMV TVA table patterns (handles OCR spaces in numbers)
# Format: "A-21, 00% 55, 22 318, 16" (rate, TVA amount, total)
TVA_TABLE_PATTERNS = [
    # "A-21, 00% 55, 22 318, 16" - with spaces in numbers.
    # Groups: 1 = TVA code, 2 = integer percent, 3 = first amount,
    # 4 = second amount.
    (
        r'([A-D])\s*[-:]\s*(\d{1,2})[.,\s]*\d{0,2}\s*%\s+('
        + _OCR_AMOUNT
        + r')\s+('
        + _OCR_AMOUNT
        + r')'
    ),
    # "TOTAL TAXE: 55, 22" - fallback to TOTAL TAXE (group 1 = amount)
    r'TOTAL\s+TAXE\s*:?\s*(' + _OCR_AMOUNT + r')',
]

# Standard TVA pattern fallback
TVA_STANDARD_PATTERN = r'TVA\s*:?\s*([\d.,]+)'
@@ -49,12 +57,38 @@ class OMVProfile(BaseStoreProfile):
(r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
]
# Client CUI patterns for OMV (unique format)
# Each entry is a (regex, confidence) pair. extract_client_cui tries them in
# list order and returns the confidence of the first pattern that yields a
# usable CUI, so the most specific pattern comes first.
# Group 1 captures an optional "RO" prefix, optional spaces, and 6-10 digits;
# extract_client_cui strips everything but the digits.
CLIENT_CUI_PATTERNS = [
# "CLIENT C.U. I./C.I.F.: RO1879855"
# Dots, the slash, and the colon are all optional, and spaces are tolerated
# between the letters.
(r'CLIENT\s+C\.?\s*U\.?\s*I\.?\s*/?\s*C\.?\s*I\.?\s*F\.?\s*:?\s*(R?O?\s*\d{6,10})', 0.99),
# "C.U.I./C.I.F. CLIENT: XXXXXXX"
# Same label with the word CLIENT after the C.U.I./C.I.F. part.
(r'C\.?\s*U\.?\s*I\.?\s*/?\s*C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(R?O?\s*\d{6,10})', 0.98),
# Fallback to simpler pattern
# Bare "CLIENT" followed by the number; lowest confidence of the three.
(r'CLIENT\s*:?\s*(R?O?\s*\d{6,10})', 0.90),
]
# Client markers for OMV
# extract_client_cui returns (None, 0.0) without trying any CUI pattern unless
# at least one of these regexes matches somewhere in the upper-cased text.
CLIENT_MARKERS = [
r'CLIENT\s+C\.?\s*U\.?\s*I',
r'CLIENT\s+C\.?\s*I\.?\s*F',
r'NUME\s+CLIENT',
r'CLIENT\s*:',
]
def _clean_ocr_number(self, value: str) -> str:
    """Remove every whitespace character from an OCR-extracted number.

    OCR frequently inserts gaps inside amounts ('55, 22' -> '55,22').
    The gap is not always an ASCII space: tabs, line breaks and
    non-breaking spaces also occur, and they can sit between digit
    groups rather than next to the separator. All of them are dropped
    so the result contains only digits and separators.

    Args:
        value: Raw numeric text as captured from the receipt.

    Returns:
        The same text with all whitespace removed; '' for empty input.
    """
    # str.split() with no argument splits on any Unicode whitespace and
    # discards empty pieces, so re-joining removes whitespace in one pass.
    return ''.join(value.split())
def extract_tva_entries(self, text: str) -> List[dict]:
"""
Extract OMV-specific TVA entries.
OMV receipts often show TVA in table format with base and TVA amounts.
Falls back to standard extraction if table format not found.
OMV receipts show TVA in table format with spaces in numbers.
Format: "A-21, 00% 55, 22 318, 16" (rate, TVA amount, total)
Args:
text: Raw OCR text from receipt
@@ -63,35 +97,138 @@ class OMVProfile(BaseStoreProfile):
List of TVA entries with code, percent, and amount
"""
entries = []
seen = set()
text_upper = text.upper()
# Try table format first (more accurate)
for match in re.finditer(self.TVA_TABLE_PATTERN, text, re.IGNORECASE):
# Try table format first: "A-21, 00% 55, 22 318, 16"
table_pattern = self.TVA_TABLE_PATTERNS[0]
for match in re.finditer(table_pattern, text_upper):
try:
code = match.group(1).upper()
percent = int(match.group(2))
# TVA amount is the second number (smaller one)
tva_amount = self._parse_decimal(match.group(4))
# Clean OCR spaces from amounts
tva_amount_str = self._clean_ocr_number(match.group(3))
tva_amount = self._parse_decimal(tva_amount_str)
if tva_amount and tva_amount > 0:
entry_key = (code, percent)
if entry_key not in seen:
entries.append({
'code': code,
'percent': percent,
'amount': tva_amount
})
seen.add(entry_key)
except (ValueError, InvalidOperation):
entries.append({
'code': code,
'percent': percent,
'amount': tva_amount
})
return entries # OMV usually has single TVA rate
except (ValueError, InvalidOperation, IndexError):
continue
# Fallback: "TOTAL TAXE: 55, 22"
fallback_pattern = self.TVA_TABLE_PATTERNS[1]
match = re.search(fallback_pattern, text_upper)
if match:
try:
tva_amount_str = self._clean_ocr_number(match.group(1))
tva_amount = self._parse_decimal(tva_amount_str)
if tva_amount and tva_amount > 0:
entries.append({
'code': 'A',
'percent': 19, # Standard rate, will be corrected by validation
'amount': tva_amount
})
except (ValueError, InvalidOperation):
pass
return entries
def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
    """
    Extract client CUI from OMV receipt.

    OMV uses format: "CLIENT C.U. I./C.I.F.: RO1879855"

    Extraction only runs when at least one ``CLIENT_MARKERS`` regex is
    present in the text. ``CLIENT_CUI_PATTERNS`` are then tried in list
    order and the first usable hit wins.

    Args:
        text: Raw OCR text from receipt

    Returns:
        Tuple of (cui, confidence) where cui holds digits only (any "RO"
        prefix and spaces removed), or (None, 0.0) when the text is
        empty, has no client marker, or yields no valid CUI.
    """
    if not text:
        return (None, 0.0)

    text_upper = text.upper()

    # Gate on the client markers so unrelated digit runs are never mistaken
    # for a client CUI.
    marker_found = any(
        re.search(marker, text_upper, re.IGNORECASE)
        for marker in self.CLIENT_MARKERS
    )
    if not marker_found:
        return (None, 0.0)

    for pattern, confidence in self.CLIENT_CUI_PATTERNS:
        # Walk every occurrence, not just the first, so a rejected hit does
        # not hide a valid one later in the text.
        for match in re.finditer(pattern, text_upper, re.IGNORECASE):
            end = match.end(1)
            # The patterns cap the capture at 10 digits. If another digit
            # follows, the capture is the prefix of a longer number (card,
            # phone, ...) and must not be reported as a CUI.
            if text_upper[end:end + 1].isdecimal():
                continue

            # Clean up: remove RO prefix, spaces
            cui_digits = re.sub(r'[^0-9]', '', match.group(1))
            if 6 <= len(cui_digits) <= 10:
                return (cui_digits, confidence)

    return (None, 0.0)
def extract_payment_methods(self, text: str) -> List[dict]:
    """
    Detect how an OMV receipt was paid.

    OMV prints "CARTE CREDIT" rather than "CARD". The reported payment
    amount is always the receipt total, so nothing is returned when no
    total can be extracted.

    Args:
        text: Raw OCR text from receipt

    Returns:
        A list holding at most one dict with keys 'method', 'amount'
        and 'confidence'; empty when no total or no payment hint exists.
    """
    haystack = text.upper()

    # The payment amount is taken from the receipt total.
    amount, _total_confidence = self.extract_total(text)
    if not amount:
        return []

    def single_payment(kind, score):
        # OMV usually has single payment method
        return [{
            'method': kind,
            'amount': amount,
            'confidence': score,
        }]

    # (keyword, reported method, confidence) - checked top to bottom,
    # first keyword found anywhere in the text decides the result.
    keyword_table = (
        ('CARTE CREDIT', 'CARD', 0.98),
        ('CARTE DE CREDIT', 'CARD', 0.98),
        ('CARD', 'CARD', 0.95),
        ('VISA', 'CARD', 0.95),
        ('MASTERCARD', 'CARD', 0.95),
        ('CONTACTLESS', 'CARD', 0.90),
        ('NUMERAR', 'NUMERAR', 0.95),
        ('CASH', 'NUMERAR', 0.90),
    )
    first_hit = next(
        (
            (kind, score)
            for keyword, kind, score in keyword_table
            if keyword in haystack
        ),
        None,
    )
    if first_hit is not None:
        return single_payment(*first_hit)

    # No explicit hint: a fiscal receipt is assumed to be a card payment,
    # reported with low confidence.
    if 'BON FISCAL' in haystack:
        return single_payment('CARD', 0.70)

    return []
def get_validation_hints(self) -> Dict[str, Any]:
"""Return OMV-specific validation hints."""
return {
"has_multi_rate_tva": False,
"card_equals_total": False,
"card_equals_total": True, # Gas station: card equals total
"has_client_cui": True,
"has_efactura": False,
"is_non_vat_payer": False,