feat(ocr): Add modular store profiles with hot-reload support
## Store Profiles System
- Add ProfileRegistry for CUI-based profile lookup
- Add BaseStoreProfile with generic extraction patterns
- Implement hot-reload via POST /api/data-entry/ocr/profiles/reload
## 12 Store Profiles
- LIDL: Multi-rate TVA (A, B, C, D codes)
- OMV, SOCAR: B2B with client CUI, YYYY.MM.DD dates
- BRICK, DEDEMAN: Standard TVA, e-factura support
- KINETERRA, BEST PRINT: Non-VAT payers (returns [])
- STEPOUT MARKET: TVA 5% (books/reduced rate)
- UNLIMITED KEYS: NUMERAR payment detection
- GAMA INK, ELECTROBERING, PICTUS VELUM: Standard TVA
## Flexible TVA Patterns
- All patterns use (\d{1,2})% to accept any rate
- Supports both historical (19%, 9%, 5%) and current (21%, 11%) TVA rates
## Payment Methods Fix
- Fixed base.py to support multiple payments of same type
- Changed deduplication from method-only to (method, amount) tuple
- Returns separate entries for split payments
## Tools
- Add generate_store_profile.py for automatic profile generation
- Analyzes PDFs via OCR API and detects patterns
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
515
backend/modules/data_entry/services/ocr/profiles/base.py
Normal file
515
backend/modules/data_entry/services/ocr/profiles/base.py
Normal file
@@ -0,0 +1,515 @@
|
||||
"""
|
||||
Base class for store-specific OCR extraction profiles.
|
||||
|
||||
Each store can have different receipt formats (TVA layout, total position, etc.).
|
||||
Store profiles allow customizing extraction logic per-store for better accuracy.
|
||||
|
||||
Usage:
|
||||
from .base import BaseStoreProfile
|
||||
from . import ProfileRegistry
|
||||
|
||||
@ProfileRegistry.register
|
||||
class LidlProfile(BaseStoreProfile):
|
||||
CUI_LIST = ["22891860"]
|
||||
NAME_PATTERNS = ["LIDL", "LDL"]
|
||||
|
||||
def extract_tva_entries(self, text: str) -> List[dict]:
|
||||
# Custom Lidl TVA extraction logic
|
||||
...
|
||||
"""
|
||||
|
||||
import re
|
||||
from abc import ABC
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from typing import List, Optional, Tuple, Dict, Any
|
||||
from datetime import date
|
||||
|
||||
|
||||
class BaseStoreProfile(ABC):
    """
    Base class for store-specific extraction profiles.

    Each profile defines:
    - CUI_LIST: CUI codes that identify this store (without RO prefix)
    - NAME_PATTERNS: OCR-tolerant name patterns for fallback matching
    - Custom extraction methods for TVA, total, date, etc.

    The class declares no abstract methods, so it can be instantiated directly
    and used as a generic fallback profile. Subclasses override the pattern
    tables and/or the ``extract_*`` methods they need to customize.

    All ``extract_*`` methods upper-case the OCR text before matching, so the
    pattern tables below are written in upper case.
    """

    # -------------------------------------------------------------------------
    # Class attributes - override in subclasses
    # -------------------------------------------------------------------------

    # List of CUI codes (without RO prefix) that identify this store
    CUI_LIST: List[str] = []

    # OCR-tolerant name patterns for fallback matching
    NAME_PATTERNS: List[str] = []

    # Store display name
    STORE_NAME: str = "Unknown Store"

    # -------------------------------------------------------------------------
    # Generic patterns - can be overridden in subclasses
    # -------------------------------------------------------------------------

    # Total amount patterns: (regex, confidence), tried in order
    TOTAL_PATTERNS = [
        (r'T[O0]TAL[.\s]+L[E3][I1!]\s*:?\s*([\d\s.,]+)', 0.98),
        (r'TOTAL\s+LEI\s*([\d\s.,]+)', 0.98),
        (r'[OT]?OTAL\s+LEI\s*([\d\s.,]+)', 0.95),
        (r'TOTAL\s*:?\s*([\d\s.,]+)\s*(?:RON|LEI)?', 0.95),
        (r'TOTAL\s+(?:RON|LEI)\s*([\d\s.,]+)', 0.95),
        (r'SUBTOTAL\s*([\d\s.,]+)', 0.90),
        (r'DE\s+PLATA\s*:?\s*([\d\s.,]+)', 0.90),
        (r'SUMA\s*:?\s*([\d\s.,]+)', 0.85),
    ]

    # Date patterns: (regex, confidence), tried in order
    DATE_PATTERNS = [
        (r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
        (r'DATA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
        (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+[O0]RA\s*:?\s*\d{2}:\d{2}', 0.95),
        (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+\d{2}:\d{2}', 0.90),
        (r'(\d{2}[-./]\d{2}[-./]\d{4})', 0.80),
        (r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75),
    ]

    # Date patterns with OCR-introduced spaces (separate because format is different):
    # (regex with three groups, confidence, group order 'ymd' or 'dmy')
    DATE_PATTERNS_OCR_SPACES = [
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.92, 'ymd'),
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.85, 'ymd'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
    ]

    # Receipt number patterns: (regex, confidence), tried in order
    NUMBER_PATTERNS = [
        (r'NDS\s*:?\s*(\d+)', 0.98),
        (r'C3POS[-A-Z0-9]*[N:](\d{6,7})', 0.98),
        (r'C3POS.*?(\d{6,7})\b', 0.95),
        (r'BF\s*:\s*(\d{4,})', 0.96),
        (r'BF\s+(\d{4,})', 0.93),
        (r'NIVS\s*:?\s*(\d+)', 0.95),
        (r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95),
        (r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95),
        (r'CHITANTA\s+NR\.?\s*:?\s*(\d+)', 0.95),
        (r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90),
        (r'ID\s*BF\s*:?\s*(\d+)', 0.90),
    ]

    # Payment method patterns (pattern, method_type, confidence)
    PAYMENT_PATTERNS = [
        (r'CARTE\s+CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.98),
        (r'CARTE\s+CREDIT\s*:?\s*\n\s*([\d\s.,]+)', 'CARD', 0.97),
        (r'(?:PLATA\s+)?CARD\s*[:\sA-Z]?\s*([\d\s.,]+)', 'CARD', 0.95),
        (r'NUMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.95),
        (r'CASH\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.90),
        (r'(?:^|\n|\s)RD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.70),
        (r'(?:^|\n|\s)ARD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.75),
        (r'(?:^|\n|\s)MERAR\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'NUMERAR', 0.70),
    ]

    # Client section markers (for B2B receipts)
    CLIENT_MARKERS = [
        r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:',
        r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:',
        r'CLIENT\s+C\.?\s*[UI1]\.?\s*[IF1]\.?\s*:',
        r'CLIENT\s*:',
        r'CUMPARATOR\s*:',
        r'BENEFICIAR\s*:',
    ]

    # Client CUI patterns (pattern, confidence)
    CLIENT_CUI_PATTERNS = [
        (r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*U\.?\s*[I1]\.?', 0.99),
        (r'(R[O0]\d{6,10})\s*:?\s*\n\s*CLIENT', 0.98),
        (r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]\d{6,10})', 0.98),
        (r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
        (r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
        (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.95),
        (r'CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.90),
    ]

    # Company type indicators (for identifying company names)
    COMPANY_INDICATORS = [
        r'\bS\.?\s*R\.?\s*L\.?\b',  # S.R.L. or S. R. L.
        r'\bS\.?\s*A\.?\b',  # S.A. or S. A.
        r'\bS\.?\s*N\.?\s*C\.?\b',  # S.N.C. or S. N. C.
        r'\bS\.?\s*C\.?\s*S\.?\b',  # S.C.S. or S. C. S.
        r'\bI\.?\s*I\.?\b',  # I.I. or I. I.
        r'\bP\.?\s*F\.?\s*A\.?\b',  # P.F.A. or P. F. A.
        r'\bS\.?\s*C\.?\s+[A-Z]',  # S.C. followed by company name
        r'HOLDING',
        r'COMPANY',
        r'GROUP',
    ]

    # Maximum reasonable payment amount (to filter OCR errors); amounts must be
    # strictly below this value to be accepted.
    MAX_PAYMENT = Decimal('100000')

    # -------------------------------------------------------------------------
    # Extraction methods - override in subclasses as needed
    # -------------------------------------------------------------------------

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Extract TVA entries from receipt text.

        The base implementation extracts nothing and always returns an empty
        list. Override this method in subclasses to handle store-specific TVA
        formats.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of dicts with keys: code, percent, amount
        """
        return []

    def extract_total(self, text: str) -> Tuple[Optional[Decimal], float]:
        """
        Extract total amount from receipt text.

        Patterns from TOTAL_PATTERNS are tried in order. For each pattern every
        occurrence in the text is examined, so an unusable first hit (for
        example "TOTAL TVA", where the amount group captures only whitespace)
        does not hide a valid "TOTAL 12,00" further down the receipt.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (amount, confidence) or (None, 0.0)
        """
        text_upper = text.upper()

        for pattern, confidence in self.TOTAL_PATTERNS:
            for match in re.finditer(pattern, text_upper):
                amount = self._parse_decimal(match.group(1))
                if amount is not None and 0 < amount < self.MAX_PAYMENT:
                    return (amount, confidence)

        return (None, 0.0)

    def extract_date(self, text: str) -> Tuple[Optional[date], float]:
        """
        Extract receipt date from text.

        DATE_PATTERNS are tried first, then DATE_PATTERNS_OCR_SPACES. Within a
        pattern every occurrence is examined and the first one that forms a
        valid calendar date (year 2000-2100) wins.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (date, confidence) or (None, 0.0)
        """
        text_upper = text.upper()

        # Try standard patterns first
        for pattern, confidence in self.DATE_PATTERNS:
            for match in re.finditer(pattern, text_upper):
                parsed = self._parse_date(match.group(1))
                if parsed:
                    return (parsed, confidence)

        # Try OCR-corrupted patterns with spaces
        for pattern, confidence, fmt in self.DATE_PATTERNS_OCR_SPACES:
            for match in re.finditer(pattern, text_upper):
                try:
                    first = int(match.group(1))
                    month = int(match.group(2))
                    last = int(match.group(3))
                    # 'ymd' puts the year in group 1; anything else is day-first.
                    year, day = (first, last) if fmt == 'ymd' else (last, first)

                    if 1 <= day <= 31 and 1 <= month <= 12 and 2000 <= year <= 2100:
                        # date() still rejects impossible days such as 31 February.
                        return (date(year, month, day), confidence)
                except (ValueError, TypeError):
                    continue

        return (None, 0.0)

    def extract_receipt_number(self, text: str) -> Tuple[Optional[str], float]:
        """
        Extract receipt number from text.

        Patterns from NUMBER_PATTERNS are tried in order; within a pattern every
        occurrence is examined. Candidates shorter than 3 characters are
        rejected.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (number, confidence) or (None, 0.0)
        """
        text_upper = text.upper()

        for pattern, confidence in self.NUMBER_PATTERNS:
            for match in re.finditer(pattern, text_upper):
                number = match.group(1).strip()
                if len(number) >= 3:
                    return (number, confidence)

        return (None, 0.0)

    def extract_payment_methods(self, text: str) -> List[dict]:
        """
        Extract payment methods (CARD/NUMERAR) from receipt.

        Supports multiple payments of the same type (e.g., 2x CARD for split payments).
        Each payment is returned as a separate entry with its amount.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of dicts: [{'method': 'CARD'/'NUMERAR', 'amount': Decimal, 'confidence': float}]
            Multiple entries of same method type are allowed for split payments,
            as long as their amounts differ. Entries appear in PAYMENT_PATTERNS
            order, then in order of appearance in the text.
        """
        text_upper = text.upper()
        methods = []
        # Track (method, amount) pairs to avoid exact duplicates from overlapping patterns
        seen_entries = set()

        for pattern, method, confidence in self.PAYMENT_PATTERNS:
            for match in re.finditer(pattern, text_upper):
                # _parse_decimal returns None on malformed input instead of raising.
                amount = self._parse_decimal(match.group(1))
                if amount is None or not (0 < amount < self.MAX_PAYMENT):
                    continue

                # Deduplicate by (method, amount) to avoid same entry from multiple patterns
                # But allow different amounts for same method (split payments)
                entry_key = (method, amount)
                if entry_key in seen_entries:
                    continue

                seen_entries.add(entry_key)
                methods.append({
                    'method': method,
                    'amount': amount,
                    'confidence': confidence,
                })

        return methods

    def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
        """
        Extract client CUI from B2B receipts.

        A CUI is only looked for when one of CLIENT_MARKERS is present in the
        text. The returned CUI contains digits only: the "RO" prefix is removed,
        including its OCR-misread form "R0", so the misread zero does not become
        part of the stored number.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (cui, confidence) or (None, 0.0)
        """
        text_upper = text.upper()

        # First check if there's a CLIENT section
        has_client_section = any(
            re.search(marker, text_upper, re.IGNORECASE)
            for marker in self.CLIENT_MARKERS
        )
        if not has_client_section:
            return (None, 0.0)

        # Try to extract CUI
        for pattern, confidence in self.CLIENT_CUI_PATTERNS:
            match = re.search(pattern, text_upper, re.IGNORECASE | re.MULTILINE)
            if not match:
                continue

            raw_cui = match.group(1)
            # Normalize: remove RO prefix for storage. Re-parse the capture with
            # the same prefix grammar the patterns use so that the "0" of an
            # OCR-misread "R0" prefix is dropped rather than kept as a digit.
            prefixed = re.fullmatch(r'R[O0]?(\d{6,10})', raw_cui)
            if prefixed:
                cui_digits = prefixed.group(1)
            else:
                # Capture from an overridden pattern with another shape:
                # fall back to keeping every digit.
                cui_digits = re.sub(r'[^0-9]', '', raw_cui)

            if 6 <= len(cui_digits) <= 10:
                return (cui_digits, confidence)

        return (None, 0.0)

    def extract_client_name(self, text: str) -> Tuple[Optional[str], float]:
        """
        Extract client/buyer company name from B2B receipts.

        The first line matching one of CLIENT_MARKERS anchors the search. Three
        strategies are then tried in order:
        1. text after ":" on the marker line (0.95 with a company indicator,
           0.80 otherwise),
        2. the line right after the marker (0.90 / 0.75),
        3. any of the lines up to four below the marker that contains a
           company indicator (0.85).

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (client_name, confidence) or (None, 0.0)
        """
        lines = text.split('\n')
        cui_only = r'^R[O0]?\d{6,10}$'  # whole line is just a CUI
        cui_label = r'C\.?\s*[UI]\.?\s*[IF]\.?'  # CUI / CIF style label

        # First check if there's a CLIENT section
        client_section_idx = None
        for i, raw_line in enumerate(lines):
            candidate = raw_line.upper().strip()
            if any(re.search(marker, candidate, re.IGNORECASE) for marker in self.CLIENT_MARKERS):
                client_section_idx = i
                break

        if client_section_idx is None:
            return (None, 0.0)

        # Strategy 1: Check if name is on same line after ":"
        marker_line = lines[client_section_idx].strip()
        if ':' in marker_line:
            name_part = marker_line.split(':', 1)[1].strip()
            name_upper = name_part.upper()
            # Skip if it looks like a CUI (RO followed by digits) - that is
            # handled by the next strategies.
            if len(name_part) >= 3 and not re.match(cui_only, name_upper):
                if any(re.search(ind, name_upper) for ind in self.COMPANY_INDICATORS):
                    return (self._clean_company_name(name_part), 0.95)
                if len(name_part) >= 5 and not name_part.isdigit():
                    return (self._clean_company_name(name_part), 0.80)

        # Strategy 2: Check next line for company name
        if client_section_idx + 1 < len(lines):
            next_line = lines[client_section_idx + 1].strip()
            next_upper = next_line.upper()

            # Skip if it's a CUI/CIF line or looks like CUI
            is_cui_line = (
                re.search(cui_label, next_upper) or re.match(cui_only, next_upper)
            )
            if not is_cui_line:
                if any(re.search(ind, next_upper) for ind in self.COMPANY_INDICATORS):
                    return (self._clean_company_name(next_line), 0.90)
                if len(next_line) >= 5 and not next_line.isdigit():
                    # Check it's not CUI/CIF/COD keywords
                    if not any(kw in next_upper for kw in ['CUI', 'CIF', 'COD', 'FISCAL']):
                        return (self._clean_company_name(next_line), 0.75)

        # Strategy 3: Look for any line with company indicators in CLIENT section region
        search_end = min(client_section_idx + 5, len(lines))
        for i in range(client_section_idx + 1, search_end):
            line = lines[i].strip()
            line_upper = line.upper()

            # Skip CUI/CIF lines
            if re.search(cui_label, line_upper):
                continue
            if re.match(cui_only, line_upper):
                continue

            if any(re.search(ind, line_upper) for ind in self.COMPANY_INDICATORS):
                return (self._clean_company_name(line), 0.85)

        return (None, 0.0)

    @staticmethod
    def _clean_company_name(name: str) -> str:
        """Clean company name for storage.

        Collapses runs of whitespace to a single space and strips trailing
        ",", ";" and ":" characters. Returns "" for empty input.
        """
        if not name:
            return ""
        # Remove extra whitespace
        name = re.sub(r'\s+', ' ', name).strip()
        # Remove trailing punctuation except periods in S.R.L., S.A., etc.
        name = re.sub(r'[,;:]+$', '', name).strip()
        return name

    # -------------------------------------------------------------------------
    # Validation hints - override to customize validation behavior
    # -------------------------------------------------------------------------

    def get_validation_hints(self) -> Dict[str, Any]:
        """
        Return validation hints for this store.

        The base implementation returns an empty dict.

        Returns:
            Dict with validation hints. Common keys:
            - has_multi_rate_tva: bool - Store uses multiple TVA rates
            - card_equals_total: bool - CARD payment equals total
            - has_client_cui: bool - Receipt includes client CUI
            - has_efactura: bool - Store uses e-factura format
            - is_non_vat_payer: bool - Store is not a VAT payer
        """
        return {}

    # -------------------------------------------------------------------------
    # Helper methods - available to all subclasses
    # -------------------------------------------------------------------------

    @staticmethod
    def _normalize_number(text: str) -> str:
        """
        Normalize a number string for Decimal conversion.

        Handles Romanian formats: "1.234,56" -> "1234.56"

        The amount capture groups in the pattern tables use ``[\\d\\s.,]+``,
        which can run across line breaks and pick up trailing punctuation.
        To keep such captures parseable:
        - only the first non-blank line is used (digits on the following line
          belong to a different field),
        - all whitespace inside that line is removed,
        - trailing "." / "," are dropped so they are not mistaken for the
          decimal separator.

        Returns "0" for empty input. The result is not guaranteed to be a
        valid number; callers should go through ``_parse_decimal``.
        """
        if not text:
            return "0"

        # Keep only the first line that has content, then drop inner whitespace.
        first_line = next((ln for ln in text.splitlines() if ln.strip()), "")
        text = re.sub(r'\s+', '', first_line).rstrip('.,')

        # Determine decimal separator: whichever of "," / "." appears last.
        last_comma = text.rfind(",")
        last_dot = text.rfind(".")

        if last_comma > last_dot:
            # Comma is the decimal mark; dots are thousands separators.
            text = text.replace(".", "").replace(",", ".")
        elif last_dot > last_comma:
            # Dot is the decimal mark; commas are thousands separators.
            text = text.replace(",", "")
        else:
            # Neither separator present (both rfind results are -1).
            text = text.replace(",", ".")

        return text

    @staticmethod
    def _parse_decimal(text: str) -> Optional[Decimal]:
        """Parse a string to Decimal, handling various formats.

        Returns None instead of raising when the text is not a valid number.
        """
        try:
            normalized = BaseStoreProfile._normalize_number(text)
            return Decimal(normalized)
        except (InvalidOperation, ValueError, TypeError):
            return None

    @staticmethod
    def _parse_date(text: str) -> Optional[date]:
        """
        Parse date string in various formats.

        Supports: DD-MM-YYYY, DD/MM/YYYY, DD.MM.YYYY, YYYY-MM-DD

        Returns None when the text is empty, does not have three parts, falls
        outside the accepted ranges (year 2000-2100) or is not a real
        calendar date.
        """
        if not text:
            return None

        # Normalize separators
        text = text.replace('/', '-').replace('.', '-')

        try:
            parts = text.split('-')
            if len(parts) != 3:
                return None

            # Determine format based on first part length
            if len(parts[0]) == 4:
                # YYYY-MM-DD
                year, month, day = int(parts[0]), int(parts[1]), int(parts[2])
            else:
                # DD-MM-YYYY
                day, month, year = int(parts[0]), int(parts[1]), int(parts[2])

            # Validate ranges
            if 1 <= day <= 31 and 1 <= month <= 12 and 2000 <= year <= 2100:
                return date(year, month, day)
        except (ValueError, TypeError, IndexError):
            pass

        return None

    @staticmethod
    def _clean_text(text: str) -> str:
        """Clean OCR text for pattern matching.

        Removes non-whitespace control characters, then collapses every run of
        whitespace (including newlines) into a single space and trims the ends.
        Control characters are removed first so that deleting one that sits
        between two spaces cannot leave a double space behind.
        """
        if not text:
            return ""
        # Only control characters that are NOT whitespace are deleted here;
        # whitespace-class controls are turned into spaces by the next step.
        text = re.sub(r'[\x00-\x08\x0e-\x1b\x7f]', '', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    # -------------------------------------------------------------------------
    # Magic methods
    # -------------------------------------------------------------------------

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__} CUI={self.CUI_LIST}>"

    def __str__(self) -> str:
        return f"{self.STORE_NAME} ({', '.join(self.CUI_LIST)})"
|
||||
Reference in New Issue
Block a user