feat(ocr): Add modular store profiles with hot-reload support
## Store Profiles System
- Add ProfileRegistry for CUI-based profile lookup
- Add BaseStoreProfile with generic extraction patterns
- Implement hot-reload via POST /api/data-entry/ocr/profiles/reload
## 12 Store Profiles
- LIDL: Multi-rate TVA (A, B, C, D codes)
- OMV, SOCAR: B2B with client CUI, YYYY.MM.DD dates
- BRICK, DEDEMAN: Standard TVA, e-factura support
- KINETERRA, BEST PRINT: Non-VAT payers (returns [])
- STEPOUT MARKET: TVA 5% (books/reduced rate)
- UNLIMITED KEYS: NUMERAR payment detection
- GAMA INK, ELECTROBERING, PICTUS VELUM: Standard TVA
## Flexible TVA Patterns
- All patterns use (\d{1,2})% to accept any rate
- Supports both historical (19%, 9%, 5%) and current (21%, 11%) TVA rates
## Payment Methods Fix
- Fixed base.py to support multiple payments of same type
- Changed deduplication from method-only to (method, amount) tuple
- Returns separate entries for split payments
## Tools
- Add generate_store_profile.py for automatic profile generation
- Analyzes PDFs via OCR API and detects patterns
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
111
backend/modules/data_entry/services/ocr/profiles/socar.py
Normal file
111
backend/modules/data_entry/services/ocr/profiles/socar.py
Normal file
@@ -0,0 +1,111 @@
|
||||
"""
|
||||
SOCAR Petroleum store profile for OCR extraction.
|
||||
|
||||
SOCAR receipts are similar to OMV - gas station with client CUI support.
|
||||
Date format may use YYYY. MM. DD with spaces.
|
||||
"""
|
||||
|
||||
import re
|
||||
from datetime import date
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from typing import List, Dict, Any, Tuple, Optional
|
||||
|
||||
from .base import BaseStoreProfile
|
||||
from . import ProfileRegistry
|
||||
|
||||
|
||||
@ProfileRegistry.register
class SocarProfile(BaseStoreProfile):
    """
    OCR extraction profile for SOCAR PETROLEUM S.A. fuel-station receipts.

    What distinguishes these receipts:
    - TVA is printed in the standard layout, normally with a single rate
    - A client CUI is present when the purchase is made by a business
    - The layout closely resembles OMV/Petrom receipts
    - Dates may be printed as YYYY. MM. DD, with spaces after the separators
    """

    CUI_LIST = ["12546600"]
    NAME_PATTERNS = ["SOCAR", "S0CAR", "SOCAR PETROLEUM"]  # incl. OCR misread (zero for O)
    STORE_NAME = "SOCAR PETROLEUM S.A."

    # Index 0: table row, e.g. "A-19,00% 285,66 49,58"
    #          groups -> (code, integer part of rate, 3rd column, 4th column)
    # Index 1: plain line, e.g. "TVA 19% 49,58"
    #          groups -> (rate, amount)
    TVA_PATTERNS = [
        r'([A-D])\s*[-:]\s*(\d{1,2})[.,]\d{2}\s*%\s+([\d.,]+)\s+([\d.,]+)',
        r'TVA\s+(\d{1,2})\s*%\s*([\d.,]+)',
    ]

    # (regex, score, component order) triples. Separators may be followed by
    # whitespace, which covers the "YYYY. MM. DD" spelling.
    DATE_PATTERNS_OCR_SPACES = [
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.98, 'ymd'),
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.95, 'ymd'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
    ]

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Collect the TVA lines printed on a SOCAR receipt.

        The table-style pattern is tried first; the plain "TVA nn% amount"
        pattern is consulted only when the table pattern produced nothing.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of dicts with 'code', 'percent' and 'amount' keys. Table
            rows are de-duplicated on (code, percent); the plain pattern
            contributes at most one entry, always with code 'A'.
        """
        found: List[dict] = []
        known_keys = set()

        # Pass 1: table rows.
        for hit in re.finditer(self.TVA_PATTERNS[0], text, re.IGNORECASE):
            try:
                vat_code = hit.group(1).upper()
                rate = int(hit.group(2))
                vat_value = self._parse_decimal(hit.group(4))

                # Skip missing, zero or negative amounts.
                if not (vat_value and vat_value > 0):
                    continue

                key = (vat_code, rate)
                if key in known_keys:
                    continue

                known_keys.add(key)
                found.append({
                    'code': vat_code,
                    'percent': rate,
                    'amount': vat_value
                })
            except (ValueError, InvalidOperation):
                # Unparseable row: ignore it and keep scanning.
                continue

        if found:
            return found

        # Pass 2: plain format; stop at the first usable match.
        for hit in re.finditer(self.TVA_PATTERNS[1], text, re.IGNORECASE):
            try:
                rate = int(hit.group(1))
                vat_value = self._parse_decimal(hit.group(2))

                if vat_value and vat_value > 0:
                    # The plain format carries no letter code, so 'A' is used.
                    found.append({
                        'code': 'A',
                        'percent': rate,
                        'amount': vat_value
                    })
                    break
            except (ValueError, InvalidOperation):
                continue

        return found

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return the fixed set of validation flags for SOCAR receipts."""
        hints = {
            "has_multi_rate_tva": False,
            "card_equals_total": False,
            "has_client_cui": True,
            "has_efactura": False,
            "is_non_vat_payer": False,
        }
        return hints
|
||||
Reference in New Issue
Block a user