Major fixes to OCR store profiles for Romanian receipt extraction: - Fix ProfileRegistry module path resolution (was loading 0 profiles) - Add multiline TVA extraction for Brick, Electrobering, Gama Ink - Add "CARTE CREDIT" payment detection for OMV/SOCAR gas stations - Handle OCR artifacts: TVA→TUA, "-"→"4", I→L in CUI markers - Add client CUI patterns for Brick receipts - Add profile selection logging to ocr_extractor.py - Create test script for all 29 PDFs (test_all_profiles.py) Test results: 13/29 passing (improved from 9/29) Remaining failures are primarily OCR quality issues. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
163 lines
5.3 KiB
Python
163 lines
5.3 KiB
Python
"""
|
|
SOCAR Petroleum store profile for OCR extraction.
|
|
|
|
SOCAR receipts are similar to OMV - gas station with client CUI support.
|
|
Date format may use YYYY. MM. DD with spaces.
|
|
"""
|
|
|
|
import re
|
|
from datetime import date
|
|
from decimal import Decimal, InvalidOperation
|
|
from typing import List, Dict, Any, Tuple, Optional
|
|
|
|
from .base import BaseStoreProfile
|
|
from . import ProfileRegistry
|
|
|
|
|
|
@ProfileRegistry.register
class SocarProfile(BaseStoreProfile):
    """
    Store profile for SOCAR PETROLEUM S.A. fuel-station receipts.

    What this profile customises:
    - TVA extraction: table layout first, single-line layout as a fallback
    - Payment-method detection from keywords printed on the receipt
    - Validation hints: single TVA rate, client CUI present on the receipt

    The receipt layout is similar to OMV/Petrom, and the date may be
    printed as "YYYY. MM. DD" (with spaces after the separators).
    """

    # Identification data for this store.
    CUI_LIST = ["12546600"]
    # "S0CAR" (digit zero) covers a common OCR misread of the letter O.
    NAME_PATTERNS = ["SOCAR", "S0CAR", "SOCAR PETROLEUM"]
    STORE_NAME = "SOCAR PETROLEUM S.A."

    # TVA layouts seen on gas-station receipts. extract_tva_entries() relies
    # on the order: index 0 is the table layout, index 1 the simple layout.
    TVA_PATTERNS = [
        # Table layout, e.g. "A-19,00% 285,66 49,58"
        r'([A-D])\s*[-:]\s*(\d{1,2})[.,]\d{2}\s*%\s+([\d.,]+)\s+([\d.,]+)',
        # Simple layout, e.g. "TVA 19% 49,58"
        r'TVA\s+(\d{1,2})\s*%\s*([\d.,]+)',
    ]

    # Date regexes tolerant of spaces after the separators ("YYYY. MM. DD").
    # Each item is (regex, score, field order); the score is presumably a
    # confidence value and the list is presumably consumed by the base
    # class - neither is visible from this class, verify in BaseStoreProfile.
    DATE_PATTERNS_OCR_SPACES = [
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.98, 'ymd'),
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.95, 'ymd'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
    ]

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Collect the TVA lines found on a SOCAR receipt.

        The table layout (TVA_PATTERNS[0]) is scanned first and yields at
        most one entry per (code, percent) pair. Only when it yields nothing
        is the simple layout (TVA_PATTERNS[1]) tried, and then only its
        first usable match is kept, reported under code 'A'.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of dicts with the keys 'code', 'percent' and 'amount'
        """
        found = []
        known_keys = set()

        # Pass 1: table layout. The last captured column is used as the TVA
        # amount; the column before it (group 3) is matched but not used.
        for hit in re.finditer(self.TVA_PATTERNS[0], text, re.IGNORECASE):
            try:
                rate_code = hit.group(1).upper()
                rate_percent = int(hit.group(2))
                vat_value = self._parse_decimal(hit.group(4))

                # Skip rows whose amount is missing, zero or negative.
                if not (vat_value and vat_value > 0):
                    continue

                # Keep only the first row seen for each (code, percent).
                key = (rate_code, rate_percent)
                if key in known_keys:
                    continue

                known_keys.add(key)
                found.append({
                    'code': rate_code,
                    'percent': rate_percent,
                    'amount': vat_value
                })
            except (ValueError, InvalidOperation):
                # Unparseable number in this row: move on to the next match.
                continue

        if found:
            return found

        # Pass 2: simple layout, used only when the table produced nothing.
        for hit in re.finditer(self.TVA_PATTERNS[1], text, re.IGNORECASE):
            try:
                rate_percent = int(hit.group(1))
                vat_value = self._parse_decimal(hit.group(2))

                if vat_value and vat_value > 0:
                    # The simple layout carries no rate letter; use 'A'.
                    found.append({
                        'code': 'A',
                        'percent': rate_percent,
                        'amount': vat_value
                    })
                    # One entry is enough for the simple layout.
                    break
            except (ValueError, InvalidOperation):
                continue

        return found

    def extract_payment_methods(self, text: str) -> List[dict]:
        """
        Work out how a SOCAR receipt was paid.

        The upper-cased text is searched for known payment keywords such as
        "CARTE CREDIT" or "CARD"; the first keyword present (in table order)
        decides the method. The whole receipt total is attributed to that
        single method.

        Args:
            text: Raw OCR text from receipt

        Returns:
            A list holding at most one dict with the keys 'method',
            'amount' and 'confidence'. The list is empty when no total
            could be extracted, or when neither a payment keyword nor
            "BON FISCAL" appears in the text.
        """
        text_upper = text.upper()

        # Without a total there is no amount to attach to a payment.
        total_amount, _ = self.extract_total(text)
        if not total_amount:
            return []

        # (keyword, reported method, confidence). Matching is by plain
        # substring and the first hit wins, so the longer, more specific
        # phrases are listed before the generic ones.
        keyword_table = (
            ('CARTE CREDIT', 'CARD', 0.98),
            ('CARTE DE CREDIT', 'CARD', 0.98),
            ('CARD', 'CARD', 0.95),
            ('VISA', 'CARD', 0.95),
            ('MASTERCARD', 'CARD', 0.95),
            ('CONTACTLESS', 'CARD', 0.90),
            ('NUMERAR', 'NUMERAR', 0.95),
            ('CASH', 'NUMERAR', 0.90),
        )

        for keyword, method, confidence in keyword_table:
            if keyword not in text_upper:
                continue
            return [{
                'method': method,
                'amount': total_amount,
                'confidence': confidence
            }]

        # No explicit keyword: a fiscal receipt is assumed to be a card
        # payment, reported with a lower confidence.
        if 'BON FISCAL' in text_upper:
            return [{
                'method': 'CARD',
                'amount': total_amount,
                'confidence': 0.70
            }]

        return []

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return the fixed set of validation flags for SOCAR receipts."""
        return dict(
            has_multi_rate_tva=False,
            card_equals_total=True,  # gas station: card payment equals total
            has_client_cui=True,
            has_efactura=False,
            is_non_vat_payer=False,
        )
|