Files
roa2web-service-auto/backend/modules/data_entry/services/ocr/profiles/socar.py
Claude Agent 099556213d feat(ocr): Add modular store profiles with hot-reload support
## Store Profiles System
- Add ProfileRegistry for CUI-based profile lookup
- Add BaseStoreProfile with generic extraction patterns
- Implement hot-reload via POST /api/data-entry/ocr/profiles/reload

## 12 Store Profiles
- LIDL: Multi-rate TVA (A, B, C, D codes)
- OMV, SOCAR: B2B with client CUI, YYYY.MM.DD dates
- BRICK, DEDEMAN: Standard TVA, e-factura support
- KINETERRA, BEST PRINT: Non-VAT payers (returns [])
- STEPOUT MARKET: TVA 5% (books/reduced rate)
- UNLIMITED KEYS: NUMERAR payment detection
- GAMA INK, ELECTROBERING, PICTUS VELUM: Standard TVA

## Flexible TVA Patterns
- All patterns use (\d{1,2})% to accept any rate
- Supports historical (19%, 9%, 5%) and current (21%, 11%) TVA rates

## Payment Methods Fix
- Fixed base.py to support multiple payments of same type
- Changed deduplication from method-only to (method, amount) tuple
- Returns separate entries for split payments

## Tools
- Add generate_store_profile.py for automatic profile generation
- Analyzes PDFs via OCR API and detects patterns

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-06 23:07:07 +00:00

112 lines
3.7 KiB
Python

"""
SOCAR Petroleum store profile for OCR extraction.
SOCAR receipts are similar to OMV - gas station with client CUI support.
Date format may use YYYY. MM. DD with spaces.
"""
import re
from datetime import date
from decimal import Decimal, InvalidOperation
from typing import List, Dict, Any, Tuple, Optional
from .base import BaseStoreProfile
from . import ProfileRegistry
@ProfileRegistry.register
class SocarProfile(BaseStoreProfile):
    """
    Store profile for SOCAR PETROLEUM S.A. receipts.

    Characteristics noted for these receipts:
    - Standard TVA layout, usually with a single rate
    - Client CUI printed on the receipt (business purchases)
    - Layout close to OMV/Petrom
    - Dates may appear as YYYY. MM. DD (spaces after the separators)
    """

    CUI_LIST = ["12546600"]
    NAME_PATTERNS = ["SOCAR", "S0CAR", "SOCAR PETROLEUM"]  # includes OCR misreads
    STORE_NAME = "SOCAR PETROLEUM S.A."

    # TVA regexes. extract_tva_entries depends on the order:
    # index 0 is the table row form, index 1 is the simple one-line form.
    TVA_PATTERNS = [
        # Table row, e.g. "A-19,00% 285,66 49,58"
        # (groups: code letter, integer part of the rate, two amounts;
        # the last amount is the one read as the TVA value)
        r'([A-D])\s*[-:]\s*(\d{1,2})[.,]\d{2}\s*%\s+([\d.,]+)\s+([\d.,]+)',
        # Simple line, e.g. "TVA 19% 49,58"
        r'TVA\s+(\d{1,2})\s*%\s*([\d.,]+)',
    ]

    # Date regexes that tolerate whitespace after the '.'/',' separators.
    # Each item is (regex, score, component order).
    # NOTE(review): the float is presumably a confidence weight consumed by
    # the base profile - confirm against BaseStoreProfile.
    DATE_PATTERNS_OCR_SPACES = [
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.98, 'ymd'),
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.95, 'ymd'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
    ]

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Pull TVA entries out of raw SOCAR receipt text.

        Table rows (TVA_PATTERNS[0]) are scanned first and yield one entry
        per distinct (code, percent) pair, the first occurrence winning.
        Only when no table row produced an entry is the simple
        "TVA <rate>% <amount>" form (TVA_PATTERNS[1]) consulted; the first
        match with a positive amount is returned under code 'A'.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of dicts with keys 'code', 'percent' and 'amount';
            empty when nothing usable was found.
        """
        # Insertion-ordered mapping doubles as the "already seen" set.
        by_key = {}
        for row in re.finditer(self.TVA_PATTERNS[0], text, re.IGNORECASE):
            try:
                letter = row.group(1).upper()
                rate = int(row.group(2))
                vat = self._parse_decimal(row.group(4))
                # Comparison stays inside the try: it can raise for
                # non-comparable decimals, which must skip the row.
                if not vat or not (vat > 0):
                    continue
            except (ValueError, InvalidOperation):
                continue
            key = (letter, rate)
            if key not in by_key:
                by_key[key] = {
                    'code': letter,
                    'percent': rate,
                    'amount': vat
                }

        if by_key:
            return list(by_key.values())

        # No table rows: fall back to the simple one-line form.
        for hit in re.finditer(self.TVA_PATTERNS[1], text, re.IGNORECASE):
            try:
                rate = int(hit.group(1))
                value = self._parse_decimal(hit.group(2))
                if value and value > 0:
                    # Simple form carries no code letter; 'A' is the default.
                    # Only the first usable match is taken.
                    return [{
                        'code': 'A',
                        'percent': rate,
                        'amount': value
                    }]
            except (ValueError, InvalidOperation):
                continue
        return []

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return the validation hint flags for SOCAR receipts."""
        hints: Dict[str, Any] = {
            "has_multi_rate_tva": False,
            "card_equals_total": False,
            "has_client_cui": True,
            "has_efactura": False,
            "is_non_vat_payer": False,
        }
        return hints