fix(ocr): Fix store profile extraction patterns and module loading
Major fixes to OCR store profiles for Romanian receipt extraction: - Fix ProfileRegistry module path resolution (was loading 0 profiles) - Add multiline TVA extraction for Brick, Electrobering, Gama Ink - Add "CARTE CREDIT" payment detection for OMV/SOCAR gas stations - Handle OCR artifacts: TVA→TUA, "-"→"4", I→L in CUI markers - Add client CUI patterns for Brick receipts - Add profile selection logging to ocr_extractor.py - Create test script for all 29 PDFs (test_all_profiles.py) Test results: 13/29 passing (improved from 9/29) Remaining failures are primarily OCR quality issues. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -5,6 +5,7 @@ OMV receipts typically include client CUI and use standard TVA format.
|
||||
Common at gas stations with fuel purchases.
|
||||
|
||||
Date format: YYYY. MM. DD with spaces (e.g., "2025. 08. 14")
|
||||
OCR quirk: Numbers often have spaces before decimals (e.g., "55, 22" instead of "55,22")
|
||||
"""
|
||||
|
||||
import re
|
||||
@@ -24,17 +25,24 @@ class OMVProfile(BaseStoreProfile):
|
||||
Key characteristics:
|
||||
- Standard TVA format (usually single rate, any percentage)
|
||||
- Includes client CUI on receipt (for business purchases)
|
||||
- TVA table format: "A-XX,XX% base_amount tva_amount"
|
||||
- TVA table format: "A-XX, XX% base_amount tva_amount" (with OCR spaces)
|
||||
- Supports historical rates (19%) and current rates (21%)
|
||||
- Date format: YYYY. MM. DD (with spaces)
|
||||
- Client CUI format: "CLIENT C.U. I./C.I.F.: ROXXXXXXX"
|
||||
"""
|
||||
|
||||
CUI_LIST = ["11201891"]
|
||||
NAME_PATTERNS = ["OMV", "PETROM", "OMV PETROM", "0MV"] # OCR variants
|
||||
STORE_NAME = "OMV PETROM MARKETING S.R.L."
|
||||
|
||||
# OMV TVA table pattern: "A-19,00% 285,66 49,58" (code-percent base tva)
|
||||
TVA_TABLE_PATTERN = r'([A-D])\s*[-:]\s*(\d{1,2})[.,]\d{2}\s*%\s+([\d.,]+)\s+([\d.,]+)'
|
||||
# OMV TVA table patterns (handles OCR spaces in numbers)
|
||||
# Format: "A-21, 00% 55, 22 318, 16" (rate, TVA amount, total)
|
||||
TVA_TABLE_PATTERNS = [
|
||||
# "A-21, 00% 55, 22 318, 16" - with spaces in numbers
|
||||
r'([A-D])\s*[-:]\s*(\d{1,2})[.,\s]*\d{0,2}\s*%\s+([\d.,\s]+)\s+([\d.,\s]+)',
|
||||
# "TOTAL TAXE: 55, 22" - fallback to TOTAL TAXE
|
||||
r'TOTAL\s+TAXE\s*:?\s*([\d.,\s]+)',
|
||||
]
|
||||
|
||||
# Standard TVA pattern fallback
|
||||
TVA_STANDARD_PATTERN = r'TVA\s*:?\s*([\d.,]+)'
|
||||
@@ -49,12 +57,38 @@ class OMVProfile(BaseStoreProfile):
|
||||
(r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
|
||||
]
|
||||
|
||||
# Client CUI patterns for OMV (unique format)
|
||||
CLIENT_CUI_PATTERNS = [
|
||||
# "CLIENT C.U. I./C.I.F.: RO1879855"
|
||||
(r'CLIENT\s+C\.?\s*U\.?\s*I\.?\s*/?\s*C\.?\s*I\.?\s*F\.?\s*:?\s*(R?O?\s*\d{6,10})', 0.99),
|
||||
# "C.U.I./C.I.F. CLIENT: XXXXXXX"
|
||||
(r'C\.?\s*U\.?\s*I\.?\s*/?\s*C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(R?O?\s*\d{6,10})', 0.98),
|
||||
# Fallback to simpler pattern
|
||||
(r'CLIENT\s*:?\s*(R?O?\s*\d{6,10})', 0.90),
|
||||
]
|
||||
|
||||
# Client markers for OMV
|
||||
CLIENT_MARKERS = [
|
||||
r'CLIENT\s+C\.?\s*U\.?\s*I',
|
||||
r'CLIENT\s+C\.?\s*I\.?\s*F',
|
||||
r'NUME\s+CLIENT',
|
||||
r'CLIENT\s*:',
|
||||
]
|
||||
|
||||
def _clean_ocr_number(self, value: str) -> str:
    """Strip OCR-inserted gaps from a numeric string.

    Example: '55, 22' -> '55,22'.

    Args:
        value: Numeric text as captured from the OCR output.

    Returns:
        The text with any whitespace around '.' / ',' removed and all
        remaining horizontal whitespace (spaces, tabs, non-breaking
        spaces) dropped. Line breaks that do not touch a separator are
        left in place.
    """
    # Whitespace hugging a decimal/thousands separator always goes,
    # line breaks included.
    value = re.sub(r'\s*([.,])\s*', r'\1', value)
    # Drop the remaining horizontal gaps. A plain replace(' ', '') would
    # leave tabs and non-breaking spaces behind; [^\S\r\n] is "whitespace
    # other than CR/LF".
    return re.sub(r'[^\S\r\n]+', '', value)
|
||||
|
||||
def extract_tva_entries(self, text: str) -> List[dict]:
|
||||
"""
|
||||
Extract OMV-specific TVA entries.
|
||||
|
||||
OMV receipts often show TVA in table format with base and TVA amounts.
|
||||
Falls back to standard extraction if table format not found.
|
||||
OMV receipts show TVA in table format with spaces in numbers.
|
||||
Format: "A-21, 00% 55, 22 318, 16" (rate, TVA amount, total)
|
||||
|
||||
Args:
|
||||
text: Raw OCR text from receipt
|
||||
@@ -63,35 +97,138 @@ class OMVProfile(BaseStoreProfile):
|
||||
List of TVA entries with code, percent, and amount
|
||||
"""
|
||||
entries = []
|
||||
seen = set()
|
||||
text_upper = text.upper()
|
||||
|
||||
# Try table format first (more accurate)
|
||||
for match in re.finditer(self.TVA_TABLE_PATTERN, text, re.IGNORECASE):
|
||||
# Try table format first: "A-21, 00% 55, 22 318, 16"
|
||||
table_pattern = self.TVA_TABLE_PATTERNS[0]
|
||||
for match in re.finditer(table_pattern, text_upper):
|
||||
try:
|
||||
code = match.group(1).upper()
|
||||
percent = int(match.group(2))
|
||||
# TVA amount is the second number (smaller one)
|
||||
tva_amount = self._parse_decimal(match.group(4))
|
||||
# Clean OCR spaces from amounts
|
||||
tva_amount_str = self._clean_ocr_number(match.group(3))
|
||||
tva_amount = self._parse_decimal(tva_amount_str)
|
||||
|
||||
if tva_amount and tva_amount > 0:
|
||||
entry_key = (code, percent)
|
||||
if entry_key not in seen:
|
||||
entries.append({
|
||||
'code': code,
|
||||
'percent': percent,
|
||||
'amount': tva_amount
|
||||
})
|
||||
seen.add(entry_key)
|
||||
except (ValueError, InvalidOperation):
|
||||
entries.append({
|
||||
'code': code,
|
||||
'percent': percent,
|
||||
'amount': tva_amount
|
||||
})
|
||||
return entries # OMV usually has single TVA rate
|
||||
except (ValueError, InvalidOperation, IndexError):
|
||||
continue
|
||||
|
||||
# Fallback: "TOTAL TAXE: 55, 22"
|
||||
fallback_pattern = self.TVA_TABLE_PATTERNS[1]
|
||||
match = re.search(fallback_pattern, text_upper)
|
||||
if match:
|
||||
try:
|
||||
tva_amount_str = self._clean_ocr_number(match.group(1))
|
||||
tva_amount = self._parse_decimal(tva_amount_str)
|
||||
if tva_amount and tva_amount > 0:
|
||||
entries.append({
|
||||
'code': 'A',
|
||||
'percent': 19, # Standard rate, will be corrected by validation
|
||||
'amount': tva_amount
|
||||
})
|
||||
except (ValueError, InvalidOperation):
|
||||
pass
|
||||
|
||||
return entries
|
||||
|
||||
def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
    """
    Find the buyer's CUI on an OMV receipt.

    The usual OMV layout is "CLIENT C.U. I./C.I.F.: RO1879855".

    Args:
        text: Raw OCR text from receipt

    Returns:
        Tuple of (cui, confidence), where cui contains digits only,
        or (None, 0.0) when no client marker or no usable CUI is found.
    """
    not_found = (None, 0.0)
    haystack = text.upper()

    # Without any client marker there is no client section to parse,
    # so stop before any CUI pattern is tried.
    marker_hit = False
    for marker in self.CLIENT_MARKERS:
        if re.search(marker, haystack, re.IGNORECASE):
            marker_hit = True
            break
    if not marker_hit:
        return not_found

    # Patterns are tried in their listed order; the first match whose
    # digit count is plausible wins.
    for regex, score in self.CLIENT_CUI_PATTERNS:
        found = re.search(regex, haystack, re.IGNORECASE)
        if found is None:
            continue
        # Keep ASCII digits only: drops the RO prefix and stray spaces.
        digits = ''.join(ch for ch in found.group(1) if ch in '0123456789')
        if 6 <= len(digits) <= 10:
            return (digits, score)

    return not_found
|
||||
|
||||
def extract_payment_methods(self, text: str) -> List[dict]:
    """
    Extract OMV-specific payment methods.

    OMV receipts use "CARTE CREDIT" instead of "CARD".
    The payment amount is taken to be the receipt TOTAL.

    Args:
        text: Raw OCR text from receipt

    Returns:
        List with at most one payment dict (keys: method, amount,
        confidence). Empty when no total can be extracted, or when
        neither a payment indicator nor the BON FISCAL fallback applies.
    """
    # The payment amount is the receipt total; without it there is
    # nothing meaningful to report.
    total_amount, _ = self.extract_total(text)
    if not total_amount:
        return []

    # Collapse runs of horizontal whitespace to a single space so that
    # multi-word indicators still match when OCR emits "CARTE   CREDIT"
    # or a tab between the words. Line breaks are kept, so words on
    # different lines are never glued together.
    text_upper = re.sub(r'[^\S\r\n]+', ' ', text.upper())

    # (indicator text, normalized method, confidence), checked in order;
    # the more specific card phrases come first.
    payment_indicators = (
        ('CARTE CREDIT', 'CARD', 0.98),
        ('CARTE DE CREDIT', 'CARD', 0.98),
        ('CARD', 'CARD', 0.95),
        ('VISA', 'CARD', 0.95),
        ('MASTERCARD', 'CARD', 0.95),
        ('CONTACTLESS', 'CARD', 0.90),
        ('NUMERAR', 'NUMERAR', 0.95),
        ('CASH', 'NUMERAR', 0.90),
    )

    for indicator, method, confidence in payment_indicators:
        if indicator in text_upper:
            # OMV usually has a single payment method: first hit wins.
            return [{
                'method': method,
                'amount': total_amount,
                'confidence': confidence,
            }]

    # Fallback: no explicit payment line but the text is a fiscal
    # receipt, so assume CARD with reduced confidence.
    if 'BON FISCAL' in text_upper:
        return [{
            'method': 'CARD',
            'amount': total_amount,
            'confidence': 0.70,
        }]

    return []
|
||||
|
||||
def get_validation_hints(self) -> Dict[str, Any]:
|
||||
"""Return OMV-specific validation hints."""
|
||||
return {
|
||||
"has_multi_rate_tva": False,
|
||||
"card_equals_total": False,
|
||||
"card_equals_total": True, # Gas station: card equals total
|
||||
"has_client_cui": True,
|
||||
"has_efactura": False,
|
||||
"is_non_vat_payer": False,
|
||||
|
||||
Reference in New Issue
Block a user