Files
roa2web-service-auto/backend/modules/data_entry/services/ocr/profiles/omv.py
Claude Agent 28f259cd05 fix(ocr): Fix store profile extraction patterns and module loading
Major fixes to OCR store profiles for Romanian receipt extraction:

- Fix ProfileRegistry module path resolution (was loading 0 profiles)
- Add multiline TVA extraction for Brick, Electrobering, Gama Ink
- Add "CARTE CREDIT" payment detection for OMV/SOCAR gas stations
- Handle OCR artifacts: TVA→TUA, "-"→"4", I→L in CUI markers
- Add client CUI patterns for Brick receipts
- Add profile selection logging to ocr_extractor.py
- Create test script for all 29 PDFs (test_all_profiles.py)

Test results: 13/29 passing (improved from 9/29)
Remaining failures are primarily OCR quality issues.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-07 09:40:58 +00:00

237 lines
8.1 KiB
Python

"""
OMV Petrom store profile for OCR extraction.
OMV receipts typically include client CUI and use standard TVA format.
Common at gas stations with fuel purchases.
Date format: YYYY. MM. DD with spaces (e.g., "2025. 08. 14")
OCR quirk: Numbers often have spaces before decimals (e.g., "55, 22" instead of "55,22")
"""
import re
from datetime import date
from decimal import Decimal, InvalidOperation
from typing import List, Dict, Any, Tuple, Optional
from .base import BaseStoreProfile
from . import ProfileRegistry
@ProfileRegistry.register
class OMVProfile(BaseStoreProfile):
    """
    OMV PETROM MARKETING S.R.L. - standard TVA with client CUI.

    Key characteristics:
    - Standard TVA format (usually single rate, any percentage)
    - Includes client CUI on receipt (for business purchases)
    - TVA table format: "A-XX, XX% tva_amount total" (with OCR spaces)
    - Supports historical rates (19%) and current rates (21%)
    - Date format: YYYY. MM. DD (with spaces)
    - Client CUI format: "CLIENT C.U. I./C.I.F.: ROXXXXXXX"
    """

    CUI_LIST = ["11201891"]
    NAME_PATTERNS = ["OMV", "PETROM", "OMV PETROM", "0MV"]  # OCR variants
    STORE_NAME = "OMV PETROM MARKETING S.R.L."

    # One monetary amount as OCR renders it: digits, optionally followed by
    # separator groups with stray *horizontal* whitespace ("55, 22", "1.234,56").
    # Newlines are deliberately excluded so one amount never spans two lines,
    # and a space that is not next to a separator ends the amount, which keeps
    # adjacent table columns apart.
    _OMV_OCR_AMOUNT = r'\d+(?:[ \t]*[.,][ \t]*\d+)*'

    # OMV TVA table patterns (handles OCR spaces in numbers)
    # Sample line: "A-21, 00% 55, 22 318, 16" (rate, TVA amount, total)
    TVA_TABLE_PATTERNS = [
        # Groups: 1 = rate code, 2 = integer percent, 3 = TVA amount,
        # 4 = second amount column (matched only to confirm the table shape).
        (
            r'([A-D])\s*[-:]\s*(\d{1,2})[.,\s]*\d{0,2}\s*%\s+'
            r'(' + _OMV_OCR_AMOUNT + r')\s+(' + _OMV_OCR_AMOUNT + r')'
        ),
        # "TOTAL TAXE: 55, 22" - fallback to TOTAL TAXE (group 1 = TVA amount)
        r'TOTAL\s+TAXE\s*:?\s*(' + _OMV_OCR_AMOUNT + r')',
    ]

    # Standard TVA pattern fallback
    TVA_STANDARD_PATTERN = r'TVA\s*:?\s*([\d.,]+)'

    # OMV specific: prioritize YYYY. MM. DD format with spaces
    # Each entry is (regex, confidence, component order).
    DATE_PATTERNS_OCR_SPACES = [
        # YYYY. MM. DD with time (OMV format)
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.98, 'ymd'),
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.95, 'ymd'),
        # Fallback to DD. MM. YYYY
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
    ]

    # Client CUI patterns for OMV (unique format), as (regex, confidence),
    # ordered from most to least specific.
    CLIENT_CUI_PATTERNS = [
        # "CLIENT C.U. I./C.I.F.: RO1879855"
        (r'CLIENT\s+C\.?\s*U\.?\s*I\.?\s*/?\s*C\.?\s*I\.?\s*F\.?\s*:?\s*(R?O?\s*\d{6,10})', 0.99),
        # "C.U.I./C.I.F. CLIENT: XXXXXXX"
        (r'C\.?\s*U\.?\s*I\.?\s*/?\s*C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(R?O?\s*\d{6,10})', 0.98),
        # Fallback to simpler pattern
        (r'CLIENT\s*:?\s*(R?O?\s*\d{6,10})', 0.90),
    ]

    # Client markers for OMV: at least one must be present before any CUI
    # pattern is attempted.
    CLIENT_MARKERS = [
        r'CLIENT\s+C\.?\s*U\.?\s*I',
        r'CLIENT\s+C\.?\s*I\.?\s*F',
        r'NUME\s+CLIENT',
        r'CLIENT\s*:',
    ]

    def _clean_ocr_number(self, value: str) -> str:
        """Remove OCR whitespace from a number (e.g., '55, 22' -> '55,22').

        All whitespace is dropped (spaces, tabs, newlines), not only the
        spaces adjacent to a separator, so no stray character reaches the
        decimal parser.

        Args:
            value: Raw numeric text captured from the OCR output

        Returns:
            The same text with every whitespace character removed
        """
        return re.sub(r'\s+', '', value)

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Extract OMV-specific TVA entries.

        OMV receipts show TVA in table format with spaces in numbers.
        Format: "A-21, 00% 55, 22 318, 16" (rate, TVA amount, total).
        If no table row yields a positive amount, the "TOTAL TAXE" line is
        used instead.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of TVA entries with code, percent, and amount (at most one
            entry; empty when nothing could be extracted)
        """
        if not text:
            return []

        text_upper = text.upper()

        # Try table format first: "A-21, 00% 55, 22 318, 16"
        for match in re.finditer(self.TVA_TABLE_PATTERNS[0], text_upper):
            try:
                percent = int(match.group(2))
                # Clean OCR spaces from the amount before parsing
                tva_amount = self._parse_decimal(
                    self._clean_ocr_number(match.group(3))
                )
                # The comparison stays inside the try block: comparing a
                # Decimal NaN raises InvalidOperation.
                if tva_amount and tva_amount > 0:
                    # OMV usually has single TVA rate: stop at the first hit
                    return [{
                        'code': match.group(1),
                        'percent': percent,
                        'amount': tva_amount,
                    }]
            except (ValueError, InvalidOperation, IndexError):
                continue

        # Fallback: "TOTAL TAXE: 55, 22"
        entries = []
        match = re.search(self.TVA_TABLE_PATTERNS[1], text_upper)
        if match:
            try:
                tva_amount = self._parse_decimal(
                    self._clean_ocr_number(match.group(1))
                )
                if tva_amount and tva_amount > 0:
                    entries.append({
                        'code': 'A',
                        'percent': 19,  # Standard rate, will be corrected by validation
                        'amount': tva_amount,
                    })
            except (ValueError, InvalidOperation):
                # Best effort: an unparseable fallback amount means "no entry"
                pass
        return entries

    def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
        """
        Extract client CUI from OMV receipt.

        OMV uses format: "CLIENT C.U. I./C.I.F.: RO1879855"

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (cui, confidence) or (None, 0.0); the CUI is returned
            as digits only (RO prefix and spaces removed)
        """
        if not text:
            return (None, 0.0)

        # Patterns are written in upper case, so matching the upper-cased
        # text needs no case-insensitive flag.
        text_upper = text.upper()

        # Check for OMV client markers
        if not any(re.search(marker, text_upper) for marker in self.CLIENT_MARKERS):
            return (None, 0.0)

        # Try OMV-specific patterns, most specific first
        for pattern, confidence in self.CLIENT_CUI_PATTERNS:
            match = re.search(pattern, text_upper)
            if not match:
                continue
            # Clean up: remove RO prefix, spaces
            cui_digits = re.sub(r'[^0-9]', '', match.group(1))
            if 6 <= len(cui_digits) <= 10:
                return (cui_digits, confidence)
        return (None, 0.0)

    def extract_payment_methods(self, text: str) -> List[dict]:
        """
        Extract OMV-specific payment methods.

        OMV receipts use "CARTE CREDIT" instead of "CARD".
        The payment amount is taken to be the receipt TOTAL.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of payment methods with method, amount, and confidence
            (at most one entry; empty when no total could be extracted)
        """
        if not text:
            return []

        # Get total amount first; without it no payment can be reported
        total_amount, _ = self.extract_total(text)
        if not total_amount:
            return []

        text_upper = text.upper()

        # OMV payment patterns as (indicator, method, confidence); order
        # matters because the first indicator found wins.
        payment_indicators = [
            ('CARTE CREDIT', 'CARD', 0.98),
            ('CARTE DE CREDIT', 'CARD', 0.98),
            ('CARD', 'CARD', 0.95),
            ('VISA', 'CARD', 0.95),
            ('MASTERCARD', 'CARD', 0.95),
            ('CONTACTLESS', 'CARD', 0.90),
            ('NUMERAR', 'NUMERAR', 0.95),
            ('CASH', 'NUMERAR', 0.90),
        ]
        for indicator, method, confidence in payment_indicators:
            if indicator in text_upper:
                # OMV usually has single payment method
                return [{
                    'method': method,
                    'amount': total_amount,
                    'confidence': confidence,
                }]

        # Fallback: If no explicit payment but has BON FISCAL, assume CARD
        if 'BON FISCAL' in text_upper:
            return [{
                'method': 'CARD',
                'amount': total_amount,
                'confidence': 0.70,
            }]
        return []

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return OMV-specific validation hints."""
        return {
            "has_multi_rate_tva": False,
            "card_equals_total": True,  # Gas station: card equals total
            "has_client_cui": True,
            "has_efactura": False,
            "is_non_vat_payer": False,
            "tva_table_format": True,
        }