Files
roa2web-service-auto/backend/modules/data_entry/services/ocr/profiles/socar.py
Claude Agent 28f259cd05 fix(ocr): Fix store profile extraction patterns and module loading
Major fixes to OCR store profiles for Romanian receipt extraction:

- Fix ProfileRegistry module path resolution (was loading 0 profiles)
- Add multiline TVA extraction for Brick, Electrobering, Gama Ink
- Add "CARTE CREDIT" payment detection for OMV/SOCAR gas stations
- Handle OCR artifacts: TVA→TUA, "-"→"4", I→L in CUI markers
- Add client CUI patterns for Brick receipts
- Add profile selection logging to ocr_extractor.py
- Create test script for all 29 PDFs (test_all_profiles.py)

Test results: 13/29 passing (improved from 9/29)
Remaining failures are primarily OCR quality issues.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-07 09:40:58 +00:00

163 lines
5.3 KiB
Python

"""
SOCAR Petroleum store profile for OCR extraction.
SOCAR receipts are similar to OMV - gas station with client CUI support.
Date format may use YYYY. MM. DD with spaces.
"""
import re
from datetime import date
from decimal import Decimal, InvalidOperation
from typing import List, Dict, Any, Tuple, Optional
from .base import BaseStoreProfile
from . import ProfileRegistry
@ProfileRegistry.register
class SocarProfile(BaseStoreProfile):
    """
    Store profile for SOCAR PETROLEUM S.A. fuel-station receipts.

    What to expect on these receipts:
    - TVA printed in the standard layout, normally with a single rate
    - A client CUI may be present (business purchases)
    - Overall layout close to OMV/Petrom
    - Dates may appear as YYYY. MM. DD, with spaces after the separators
    """

    CUI_LIST = ["12546600"]
    # "S0CAR" covers the OCR misread of the letter O as a zero.
    NAME_PATTERNS = ["SOCAR", "S0CAR", "SOCAR PETROLEUM"]
    STORE_NAME = "SOCAR PETROLEUM S.A."

    # Index 0 is the table layout, index 1 the simple layout;
    # extract_tva_entries relies on this ordering.
    TVA_PATTERNS = [
        # Table layout, e.g. "A-19,00% 285,66 49,58"
        r'([A-D])\s*[-:]\s*(\d{1,2})[.,]\d{2}\s*%\s+([\d.,]+)\s+([\d.,]+)',
        # Simple layout, e.g. "TVA 19% 49,58"
        r'TVA\s+(\d{1,2})\s*%\s*([\d.,]+)',
    ]

    # Date regexes tolerating spaces after the separators. Each item is
    # (regex, number, component order); the number is presumably a
    # confidence score consumed outside this class - verify against the
    # base profile.
    DATE_PATTERNS_OCR_SPACES = [
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.98, 'ymd'),
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.95, 'ymd'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
    ]

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Pull the TVA (VAT) lines out of a SOCAR receipt.

        The table layout is tried first and may yield several entries, one
        per distinct (code, percent) pair. Only when it yields nothing is
        the simple "TVA nn% amount" layout tried, and then just the first
        usable match is kept, under code 'A'.

        Args:
            text: Raw OCR text from receipt
        Returns:
            List of dicts with 'code', 'percent' and 'amount' keys; empty
            when nothing usable is found
        """
        found = []
        known_keys = set()

        for hit in re.finditer(self.TVA_PATTERNS[0], text, re.IGNORECASE):
            try:
                rate_code = hit.group(1).upper()
                rate = int(hit.group(2))
                # Group 3 (the taxable base) is matched but not used;
                # group 4 is the TVA amount.
                vat = self._parse_decimal(hit.group(4))
                if not vat or not vat > 0:
                    continue
                key = (rate_code, rate)
                if key in known_keys:
                    # Keep only the first line seen for a given code/rate.
                    continue
                known_keys.add(key)
                found.append({
                    'code': rate_code,
                    'percent': rate,
                    'amount': vat
                })
            except (ValueError, InvalidOperation):
                continue

        if found:
            return found

        # Nothing in table layout: fall back to the simple layout.
        for hit in re.finditer(self.TVA_PATTERNS[1], text, re.IGNORECASE):
            try:
                rate = int(hit.group(1))
                vat = self._parse_decimal(hit.group(2))
                if vat and vat > 0:
                    # The simple layout carries no rate letter, so 'A' is
                    # used; the first positive amount wins.
                    return [{
                        'code': 'A',
                        'percent': rate,
                        'amount': vat
                    }]
            except (ValueError, InvalidOperation):
                continue

        return found

    def extract_payment_methods(self, text: str) -> List[dict]:
        """
        Work out how a SOCAR receipt was paid.

        The upper-cased text is searched for known keywords in priority
        order; the first keyword present decides the method, and the whole
        receipt total is attributed to it. If no keyword is present but the
        text contains "BON FISCAL", a CARD payment is assumed with lower
        confidence.

        Args:
            text: Raw OCR text from receipt
        Returns:
            A list holding at most one dict with 'method', 'amount' and
            'confidence' keys; empty when no total could be extracted or
            nothing matched
        """
        haystack = text.upper()

        # Without a total there is no amount to attach to a payment.
        receipt_total, _ = self.extract_total(text)
        if not receipt_total:
            return []

        # (keyword, reported method, confidence), most specific first.
        keyword_table = [
            ('CARTE CREDIT', 'CARD', 0.98),
            ('CARTE DE CREDIT', 'CARD', 0.98),
            ('CARD', 'CARD', 0.95),
            ('VISA', 'CARD', 0.95),
            ('MASTERCARD', 'CARD', 0.95),
            ('CONTACTLESS', 'CARD', 0.90),
            ('NUMERAR', 'NUMERAR', 0.95),
            ('CASH', 'NUMERAR', 0.90),
        ]

        first_hit = next(
            (
                (pay_method, score)
                for keyword, pay_method, score in keyword_table
                if keyword in haystack
            ),
            None,
        )
        if first_hit is not None:
            pay_method, score = first_hit
            return [{
                'method': pay_method,
                'amount': receipt_total,
                'confidence': score
            }]

        # No explicit keyword: a fiscal receipt is assumed to be a card payment.
        if 'BON FISCAL' in haystack:
            return [{
                'method': 'CARD',
                'amount': receipt_total,
                'confidence': 0.70
            }]

        return []

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return the validation flags that apply to SOCAR receipts."""
        return dict(
            has_multi_rate_tva=False,
            card_equals_total=True,  # gas station: card payment equals the total
            has_client_cui=True,
            has_efactura=False,
            is_non_vat_payer=False,
        )