ocr extract

This commit is contained in:
Claude Agent
2026-01-07 14:34:07 +00:00
parent 22eca953ce
commit cc98d6f21f
24 changed files with 774 additions and 2346 deletions

View File

@@ -50,6 +50,9 @@ class BaseStoreProfile(ABC):
# Store display name
STORE_NAME: str = "Unknown Store"
# Flag for known non-VAT payer stores (skips TVA extraction)
IS_NON_VAT_PAYER: bool = False
# -------------------------------------------------------------------------
# Generic patterns - can be overridden in subclasses
# -------------------------------------------------------------------------
@@ -100,18 +103,33 @@ class BaseStoreProfile(ABC):
]
# Payment method patterns (pattern, method_type, confidence)
# Handles ALL payment types: CARD, NUMERAR, and card brand names
PAYMENT_PATTERNS = [
# CARTE CREDIT variants (OMV/Petrom/Socar receipts)
(r'CARTE\s+CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.98),
(r'CARTE\s+CREDIT\s*:?\s*\n\s*([\d\s.,]+)', 'CARD', 0.97),
(r'CARTE\s+DE\s+CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.98),
(r'CARTE\s+DE\s+CREDIT\s*:?\s*\n\s*([\d\s.,]+)', 'CARD', 0.97),
# CARD standard
(r'(?:PLATA\s+)?CARD\s*[:\sA-Z]?\s*([\d\s.,]+)', 'CARD', 0.95),
# Card brand names
(r'VISA\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
(r'MASTERCARD\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
(r'MAESTR[O0]\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
(r'CONTACTLESS\s*:?\s*([\d\s.,]+)', 'CARD', 0.90),
(r'DEBIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.90),
(r'CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.88),
# Cash variants
(r'NUMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.95),
(r'CASH\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.90),
# Truncation recovery patterns (for OCR left-margin issues)
(r'(?:^|\n|\s)RD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.70),
(r'(?:^|\n|\s)ARD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.75),
(r'(?:^|\n|\s)MERAR\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'NUMERAR', 0.70),
]
# Client section markers (for B2B receipts) - More flexible patterns
# Includes OCR corruption variants (LIENT, C IENT, L IENT)
CLIENT_MARKERS = [
r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT', # "CIF CLIENT" (with or without colon)
r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT', # "CUI CLIENT"
@@ -121,24 +139,62 @@ class BaseStoreProfile(ABC):
r'BENEFICIAR\s*:', # "BENEFICIAR:"
r'CUMP[AĂ]R[AĂ]TOR', # "CUMPARATOR" without colon
r'COD\s+FISCAL\s+CLIENT', # "COD FISCAL CLIENT"
# OCR corruption patterns
r'C[I1]F\s+[A-Z\s]{0,6}IENT\s*:', # "CIF a IENT:", "CIF CL IENT:", "CIF L IENT:"
r'C[I1]F\s+LIENT\s*:', # "CIF LIENT:" (missing C)
r'LIENT\s*:', # "LIENT:" (missing C and I/L)
# Brick-specific (I→L OCR error)
r'CLIENT\s+C\.?U\.?[LI1]\.?\s*/', # "CLIENT C.U.L./" (I read as L)
]
# Client CUI patterns (pattern, confidence) - More flexible
# Client CUI patterns (pattern, confidence) - Comprehensive
# Handles: docTR reordering, doubled letters, corruption, CUMPARATOR, Brick L/I swap
CLIENT_CUI_PATTERNS = [
# "CIF CLIENT:XXXXXXX" or "CIF CLIENT: ROXXXXXXX" - most common format
(r'C\.?[I1]\.?F\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.99),
# "CLIENT CIF: XXXXXXX"
(r'CLIENT\s+C\.?[I1]\.?F\.?\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
# "CUI CLIENT: XXXXXXX"
(r'C\.?U\.?[I1]\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
# "ROXXXXXXX" followed by CLIENT marker
(r'(R[O0]\d{6,10})\s*\n\s*CLIENT', 0.97),
# "C.I.F. CLIENT: XXXXXXX"
(r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.96),
# "CLIENT: ROXXXXXXX" or "CLIENT: XXXXXXX"
# === CUI on line BEFORE CLIENT marker (docTR/OCR reordering) ===
(r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*U\.?\s*[I1]\.?', 0.99),
(r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*[I1]\.?\s*F\.?', 0.99),
(r'(R[O0]\d{6,10})\s*:?\s*\n\s*CLIENT', 0.98),
# === "CIF I CLIENT:" format (OCR extra chars) ===
(r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]\d{6,10})', 0.98),
(r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.97),
# === CIF CLIENT: (reversed - CIF before CLIENT) ===
(r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
(r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
(r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
(r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
# === CLIENT C.U.I/C.I.F. (slash variants) ===
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.97),
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.97),
# === Doubled letters (docTR artifact: "C.U U.I") ===
(r'CLIENT\s+C\.?\s*U\.?\s*U?\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
(r'CLIENT\s+C\.?\s*U\.?\s*U?\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
# === CLIENT C.U.I. or CLIENT CUI (without slash) ===
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
(r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
(r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
# === Corrupted CLIENT after CIF (OCR errors) ===
(r'CIF\s+[a-zA-Z\s]{2,8}IENT\s*:?\s*(R[O0]?\d{6,10})', 0.93),
(r'CIF\s+[a-zA-Z\s]{2,8}IENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
(r'CIF\s+LIENT\s*:?\s*(R[O0]?\d{6,10})', 0.92),
(r'CIF\s+LIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.92),
# === CUMPARATOR variants ===
(r'CUMPARATOR\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
(r'CUMPARATOR\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
# CUMPARATOR with CUI/CIF on next line
(r'CUMPARATOR\s*:.*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
(r'CUMPARATOR\s*:.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
# CUMPARATOR with CUI/CIF two lines down
(r'CUMPARATOR\s*:.*\n.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
# === CLIENT on next line ===
(r'CLIENT\s*:\s*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
(r'CLIENT\s*:.*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
# === Standard fallback patterns ===
(r'CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.90),
# "COD FISCAL CLIENT: XXXXXXX"
(r'COD\s+FISCAL\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.95),
# === Brick-specific (I→L OCR error) ===
# Matches: "CLIENT C.U.L./C.IF. :R01879855"
(r'CLIENT\s+C\.?U\.?[LI1]\.?\s*/\s*C\.?[LI1]\.?F\.?\s*:?\s*(R?O?\d{6,10})', 0.99),
]
# Company type indicators (for identifying company names)
@@ -158,15 +214,133 @@ class BaseStoreProfile(ABC):
# Maximum reasonable payment amount (to filter OCR errors)
MAX_PAYMENT = Decimal('100000')
# -------------------------------------------------------------------------
# TVA (VAT) patterns - ALL FORMATS unified
# OCR tolerant: T[VU][AR] matches TVA, TUA, TVR
# -------------------------------------------------------------------------
TVA_PATTERNS = [
# === FORMAT 1: INLINE with code and percent (Lidl-style) ===
# "TVA A 21,00% 7.71" or "TVA B 11,00% 2.13"
(r'T[VU][AR]\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)', 0.98, 'inline'),
(r'T[VU][AR]\s+([A-D])\s+\\?(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)', 0.97, 'inline'),
(r'IVA\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)', 0.95, 'inline'),
# === FORMAT 2: REVERSED (Stepout-style) ===
# "5.00% TUA*B" - percent BEFORE the TVA marker
(r'(\d{1,2})[.,]\d{0,2}\s*%\s*T[UV]A\*?([A-D])', 0.97, 'reversed'),
# === FORMAT 3: TABLE (OMV-style) ===
# "A-21,00% 285,66 49,58" (code-percent, base amount, VAT amount)
(r'([A-D])\s*[-:]\s*(\d{1,2})[.,\s]*\d{0,2}\s*%\s+([\d.,\s]+)\s+([\d.,\s]+)', 0.96, 'table'),
(r'TOTAL\s+TAXE\s*:?\s*([\d.,\s]+)', 0.95, 'taxe'),
# === FORMAT 4: MULTILINE (Brick/Electrobering) ===
# "TOTAL TVA A - 19%" on one line, amount on the next line
(r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%', 0.96, 'multiline'),
(r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%', 0.95, 'multiline'),
(r'TOTAL\s+T[VU][AR]\s*([A-D])?\s*[-:]?\s*(\d{1,2})\s*%', 0.94, 'multiline'),
# === FORMAT 5: STANDARD (from the extractor) ===
(r'TOTAL\s+(?:T[VU][AR]|IVA)\s+BON\s*:?\s*([\d\s.,]+)', 0.98, 'bon'),
(r'T[O0]TAL\s+(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.95, 'standard'),
(r'TOTAL\s+IVA\s*:?\s*([\d\s.,]+)', 0.95, 'standard'),
(r'T[VU][AR]\s+(?:A\s*[-:]?\s*)?(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.95, 'percent'),
(r'T[VU][AR]\s+[A-Z]\s*[-:]\s*(\d{1,2})\s*%\s*([\d\s.,]+)', 0.93, 'percent'),
(r'T[VU][AR]\s*[C5]\s*[-:]\s*5\s*%\s*:?\s*([\d\s.,]+)', 0.93, 'books'),
(r'(?:T[VU][AR]|IVA)\s+5\s*%\s*:?\s*([\d\s.,]+)', 0.92, 'books'),
# === FORMAT 6: CODED inline (with rate code A-D) ===
(r'T[UV]A\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,\s]+)', 0.95, 'coded'),
(r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,\s]+)', 0.93, 'coded'),
# === FALLBACK patterns ===
(r'T[O0]T[AE]L\s+(?:T[VUAI]+[AR]?|IVA)\s*:?\s*([\d\s.,]+)', 0.88, 'fallback'),
(r'TOTAL\s+TA\s*[F\s]?\s*\d*\s*:?\s*([\d\s.,]+)', 0.85, 'fallback'),
(r'(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.85, 'fallback'),
(r'TOTAL\s+T[UV]A\s*:?\s*([\d.,\s]+)', 0.90, 'standard'),
]
# Non-VAT payer patterns - NEPLATITOR DE TVA
# OCR variants: NEPLATTOR, NEPLATITOR, NEPLATOR, ANEPLATHTOR, MEPLATITOR
NON_VAT_PATTERNS = [
r'NEPLAT\w*OR', # NEPLATITOR, NEPLATTOR, NEPLATOR
r'[ANM]EPLAT\w*O?R', # OCR errors: ANEPLATHTOR, MEPLATITOR
r'TOTAL\s+NEPLAT', # TOTAL NEPLATITOR...
r'TOTAL\s+[ANM]EPLAT', # TOTAL ANEPLAT... (OCR error)
r'SCUTIT\s*(?:DE\s+)?T[VU]A', # SCUTIT DE TVA
r'NEPLAT\w*\s+T[VU]A', # NEPLATITOR TVA
r'NEPLAT\w*\s+DE\s+T', # NEPLATITOR DE T... (truncated)
]
# CUI (fiscal code) patterns - VENDOR CUI (exclude CLIENT)
# OCR errors: R0 instead of RO, C1F instead of CIF
CUI_PATTERNS = [
# CIF at start of line (definitely vendor)
(r'^CIF\s*:?\s*(R[O0]?\d{6,10})', 0.98),
(r'^CIF\s*:?\s*(\d{6,10})', 0.97),
(r'^C[I1]F\s*:?\s*(R[O0]?\d{6,10})', 0.95),
(r'^C[I1]F\s*:?\s*(\d{6,10})', 0.94),
# CIF not preceded by CLIENT (negative lookbehind)
(r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(R[O0]?\d{6,10})', 0.95),
(r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(\d{6,10})', 0.94),
# Standalone CIF with word boundary
(r'\bC[I1]F\s*:?\s*(R[O0]?\d{6,10})\b', 0.90),
(r'\bC[I1]F\s*:?\s*(\d{6,10})\b', 0.89),
# COD FISCAL (vendor)
(r'COD\s+FISCAL\s*:?\s*(R[O0]?\d{6,10})', 0.90),
(r'COD\s+FISCAL\s*:?\s*(\d{6,10})', 0.89),
# C. I. F. format with SPACES (OCR artifact)
(r'C\.\s*I\.\s*F\.?\s*[:\s]+(R[O0]?\d{6,10})', 0.92),
(r'C\.\s*I\.\s*F\.?\s*[:\s]+(\d{6,10})', 0.91),
# C.I.F. format (with dots, no spaces)
(r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.88),
(r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(\d{6,10})', 0.87),
# CUI format
(r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.85),
(r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(\d{6,10})', 0.84),
# Lidl format: "Cod Identificare fiscala" (OCR corrupted)
(r'[IC](?:od|ed)\s*Identific[a-z/]*\s*(R[O0]\d{6,10})', 0.90),
# Generic: anything with "fiscal" followed by RO + digits
(r'fiscal[a-z]*\s*:?\s*(R[O0]\d{6,10})', 0.85),
]
# CUI REVERSED format (number BEFORE label)
CUI_REVERSED_PATTERNS = [
(r'(R[O0]\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.98),
(r'(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.95),
]
# Items count patterns - NR POZ ART IN BON
ITEMS_COUNT_PATTERNS = [
(r'NR\.?\s*P?[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*(\d+)', 0.98),
(r'(\d{1,2})\s*\n\s*[O0]Z\.?\s*ART', 0.95),
(r'[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.93),
(r'NR\.?\s*(?:P?[O0]Z\.?)?\s*ART(?:ICOLE)?\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.90),
(r'ARTIC[O0]LE\s*:?\s*(\d+)', 0.88),
(r'(?:^|\s|:)P?[O0]Z\.?\s*(?:ART)?\.?\s*(?:IN\s+BON)?\s*:?\s*(\d{1,3})(?:\s|$)', 0.85),
]
# Series patterns - Romanian fiscal receipt series
SERIES_PATTERNS = [
(r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90),
(r'(?:^|\s)Z\s*:\s*(\d{4})', 0.85),
(r'(?:^|\s)BF\s*:\s*(\d{4})', 0.85),
]
# -------------------------------------------------------------------------
# Extraction methods - override in subclasses as needed
# -------------------------------------------------------------------------
def extract_tva_entries(self, text: str) -> List[dict]:
"""
Extract TVA entries from receipt text.
Extract TVA entries from receipt text - GENERIC implementation.
Override this method in subclasses to handle store-specific TVA formats.
Handles ALL formats:
- Multi-rate inline (Lidl): "TVA A 21% 7.71"
- Reversed (Stepout): "5.00% TUA*B"
- Table (OMV): "A-21,00% 285,66 49,58"
- Multiline: "TOTAL TVA A - 19%" + amount on next line
- Non-VAT payers: Returns empty list
Args:
text: Raw OCR text from receipt
@@ -174,12 +348,252 @@ class BaseStoreProfile(ABC):
Returns:
List of dicts with keys: code, percent, amount
"""
return []
entries = []
text_upper = text.upper()
# Step 1: Check for known non-VAT payer (by class flag or text detection)
if self.IS_NON_VAT_PAYER or self._is_non_vat_payer(text_upper):
return [] # No TVA entries for non-VAT payers
# Step 2: Normalize OCR spaces in numbers
normalized = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text_upper)
lines = normalized.split('\n')
# Step 3: Try all formats, collect candidates
candidates = []
# Try inline multi-rate (Lidl-style)
candidates.extend(self._try_tva_inline(normalized))
# Try reversed format (Stepout-style)
candidates.extend(self._try_tva_reversed(normalized, lines))
# Try multiline format (Brick/Electrobering)
candidates.extend(self._try_tva_multiline(normalized, lines))
# Try table format (OMV-style)
candidates.extend(self._try_tva_table(normalized))
# Try standard/fallback patterns
if not candidates:
candidates.extend(self._try_tva_standard(normalized))
# Step 4: Deduplicate and return
seen = set()
for entry in candidates:
key = (entry.get('code', 'A'), entry.get('percent', 19))
if key not in seen and entry.get('amount') and entry['amount'] > 0:
entries.append(entry)
seen.add(key)
return entries
def _is_non_vat_payer(self, text: str) -> bool:
    """Report whether the text carries any non-VAT-payer marker.

    Args:
        text: Receipt text to scan; matching is case-insensitive.

    Returns:
        True as soon as one regex from ``NON_VAT_PATTERNS`` is found in
        ``text``, otherwise False.
    """
    return any(
        re.search(marker, text, re.IGNORECASE)
        for marker in self.NON_VAT_PATTERNS
    )
def _try_tva_inline(self, text: str) -> List[dict]:
    """Collect VAT entries written inline, e.g. 'TVA A 21,00% 7.71'.

    Only the ``TVA_PATTERNS`` rows tagged ``'inline'`` are applied. Each
    match exposing at least three groups (code, percent, amount) with a
    positive amount yields one entry; matches that fail to parse are skipped.

    Args:
        text: Receipt text to scan; matching is case-insensitive.

    Returns:
        List of dicts with keys ``code``, ``percent`` and ``amount``. The
        same receipt line may appear more than once when several inline
        patterns match it.
    """
    found = []
    inline_regexes = [rx for rx, _conf, kind in self.TVA_PATTERNS if kind == 'inline']
    for rx in inline_regexes:
        for hit in re.finditer(rx, text, re.IGNORECASE):
            captured = hit.groups()
            if len(captured) < 3:
                continue
            try:
                # A missing rate letter falls back to code 'A'.
                letter = captured[0].upper() if captured[0] else 'A'
                rate = int(captured[1])
                value = self._parse_decimal(self._clean_ocr_number(captured[2]))
                if value and value > 0:
                    found.append({
                        'code': letter,
                        'percent': rate,
                        'amount': value
                    })
            except (ValueError, InvalidOperation, IndexError):
                continue
    return found
def _try_tva_reversed(self, text: str, lines: List[str]) -> List[dict]:
    """Collect VAT entries where the rate precedes the TVA marker.

    Handles '5.00% TUA*B 2.00' (amount on the same line) and
    '5.00% TUA*B' followed by the amount on the next line.

    Args:
        text: Full receipt text; not referenced by this method.
        lines: Receipt text split into lines; each line is scanned on its own.

    Returns:
        List of dicts with keys ``code``, ``percent`` and ``amount``. A
        missing rate letter is reported as code 'A'.
    """
    same_line_rx = r'(\d{1,2})[.,]?\d{0,2}\s*%\s*T[UV][AR]\s*\*?\s*([A-D])?\s+([\d\s.,]+)'
    next_line_rx = r'(\d{1,2})[.,]?\d{0,2}\s*%\s*T[UV][AR]\s*\*?\s*([A-D])?$'
    collected = []
    line_count = len(lines)

    for idx, current in enumerate(lines):
        # First choice: rate, marker and amount all on this line.
        inline_hit = re.search(same_line_rx, current, re.IGNORECASE)
        if inline_hit:
            try:
                rate = int(inline_hit.group(1))
                letter = inline_hit.group(2).upper() if inline_hit.group(2) else 'A'
                value = self._parse_decimal(inline_hit.group(3).strip())
                if value and value > 0:
                    collected.append({
                        'code': letter,
                        'percent': rate,
                        'amount': value
                    })
                    continue  # line handled, move on to the next one
            except (ValueError, InvalidOperation, IndexError):
                pass

        # Second choice: the line ends at the marker, amount is one line below.
        trailing_hit = re.search(next_line_rx, current, re.IGNORECASE)
        if not trailing_hit:
            continue
        try:
            rate = int(trailing_hit.group(1))
            letter = trailing_hit.group(2).upper() if trailing_hit.group(2) else 'A'
            if idx + 1 < line_count:
                value = self._parse_decimal(lines[idx + 1].strip())
                if value and value > 0:
                    collected.append({
                        'code': letter,
                        'percent': rate,
                        'amount': value
                    })
        except (ValueError, InvalidOperation, IndexError):
            continue

    return collected
def _try_tva_multiline(self, text: str, lines: List[str]) -> List[dict]:
    """Collect VAT entries whose amount sits on the line after the header.

    Example::

        TOTAL TVA A - 19%
        32.31

    Every header line in the receipt is considered, so receipts listing
    several rates (A, B, ...) yield one entry per header instead of only
    the first one.

    Args:
        text: Full receipt text; not referenced by this method.
        lines: Receipt text split into lines.

    Returns:
        List of dicts with keys ``code``, ``percent`` and ``amount``, in
        receipt order. A missing rate letter is reported as code 'A'.
    """
    # Header variants: "TOTAL TVA A - 19%", "TOTAL TVA A 19%", OCR-tolerant
    # TVA spellings, and a truncated "OTAL IVA A 21%" form.
    multiline_patterns = [
        r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%',
        r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%',
        r'TOTAL\s+T[VU][AR]\s*([A-D])?\s*[-:]?\s*(\d{1,2})\s*%',
        r'O?TAL\s+[IT]VA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%',
    ]
    entries = []
    for i, line in enumerate(lines):
        if i + 1 >= len(lines):
            break  # the last line has no following amount line
        for pattern in multiline_patterns:
            match = re.search(pattern, line, re.IGNORECASE)
            if not match:
                continue
            try:
                code = match.group(1).upper() if match.group(1) else 'A'
                percent = int(match.group(2))
                amount = self._parse_decimal(lines[i + 1].strip())
            except (ValueError, InvalidOperation, IndexError):
                continue
            if amount and amount > 0:
                entries.append({
                    'code': code,
                    'percent': percent,
                    'amount': amount
                })
                # This header is consumed; the overlapping patterns would
                # only duplicate it. Keep scanning the remaining lines.
                break
    return entries
def _try_tva_table(self, text: str) -> List[dict]:
    """Collect VAT entries from table rows such as 'A-21,00% 285,66 49,58'.

    Each row is read as: rate code, percent, base amount, VAT amount; the
    last numeric column is taken as the VAT amount. When no row produces
    an entry, a 'TOTAL TAXE: 55,22' line is used instead and reported with
    code 'A' and a default rate of 19.

    Args:
        text: Receipt text to scan; matching is case-insensitive.

    Returns:
        List of dicts with keys ``code``, ``percent`` and ``amount``.
    """
    row_rx = r'([A-D])\s*[-:]\s*(\d{1,2})[.,\s]*\d{0,2}\s*%\s+([\d.,\s]+)\s+([\d.,\s]+)'
    rows = []
    for row in re.finditer(row_rx, text, re.IGNORECASE):
        try:
            letter = row.group(1).upper()
            rate = int(row.group(2))
            # Group 4 is the last column of the row: the VAT amount.
            vat = self._parse_decimal(self._clean_ocr_number(row.group(4)))
            if vat and vat > 0:
                rows.append({
                    'code': letter,
                    'percent': rate,
                    'amount': vat
                })
        except (ValueError, InvalidOperation, IndexError):
            continue
    if rows:
        return rows

    # No usable table row: fall back to the summary line "TOTAL TAXE: 55,22".
    summary = re.search(r'TOTAL\s+TAXE\s*:?\s*([\d.,\s]+)', text, re.IGNORECASE)
    if summary is None:
        return rows
    try:
        total_tax = self._parse_decimal(self._clean_ocr_number(summary.group(1)))
        if total_tax and total_tax > 0:
            rows.append({
                'code': 'A',
                'percent': 19,  # Default rate
                'amount': total_tax
            })
    except (ValueError, InvalidOperation):
        pass
    return rows
def _try_tva_standard(self, text: str) -> List[dict]:
    """Fallback VAT extraction using the non-structured ``TVA_PATTERNS`` rows.

    Rows tagged 'standard', 'bon', 'percent', 'coded', 'fallback' or
    'books' are tried in table order, using the first match of each. The
    first row that yields a positive amount wins, so at most one entry is
    returned.

    Group layouts understood:
        * (code, percent, amount) - first group is alphabetic
        * (percent, amount)       - code defaults to 'A'
        * (amount,)               - code 'A', rate defaults to 19

    Args:
        text: Receipt text to scan; matching is case-insensitive.

    Returns:
        A list with one dict (keys ``code``, ``percent``, ``amount``), or
        an empty list when nothing usable is found.
    """
    accepted_kinds = ('standard', 'bon', 'percent', 'coded', 'fallback', 'books')
    for regex, _confidence, kind in self.TVA_PATTERNS:
        if kind not in accepted_kinds:
            continue
        hit = re.search(regex, text, re.IGNORECASE)
        if hit is None:
            continue
        try:
            parts = hit.groups()
            if len(parts) >= 2:
                first = parts[0]
                if first and first.isalpha():
                    # (code, percent, amount); without a third group there
                    # is no amount to read.
                    code = first.upper()
                    percent = int(parts[1])
                    raw_amount = parts[2] if len(parts) > 2 else None
                else:
                    # (percent, amount)
                    code = 'A'
                    percent = int(first) if first and first.isdigit() else 19
                    raw_amount = parts[1]
                if not raw_amount:
                    continue
            elif len(parts) == 1:
                # Amount only.
                code, percent, raw_amount = 'A', 19, parts[0]
            else:
                continue
            amount = self._parse_decimal(self._clean_ocr_number(raw_amount))
            if amount and amount > 0:
                return [{
                    'code': code,
                    'percent': percent,
                    'amount': amount
                }]
        except (ValueError, InvalidOperation, IndexError):
            continue
    return []
def _clean_ocr_number(self, value: str) -> str:
    """Strip OCR-inserted spacing from a numeric string ('55, 22' -> '55,22').

    Whitespace of any kind touching a '.' or ',' separator is removed,
    then every remaining space character is dropped. Other whitespace
    (e.g. a tab between two digits) is left in place.

    Args:
        value: Raw captured number; may be empty or None.

    Returns:
        The compacted string, or "" for a falsy input.
    """
    if not value:
        return ""
    tightened = re.sub(r'\s*([.,])\s*', r'\1', value)
    return tightened.replace(' ', '')
def extract_total(self, text: str) -> Tuple[Optional[Decimal], float]:
"""
Extract total amount from receipt text.
Supports both single-line and multiline formats:
- Single line: "TOTAL: 78.00", "SUMA TOTALA: 78.00"
- Multiline: "SUMA\nTOTALA:\n78.00" (common in thermal receipts)
Args:
text: Raw OCR text from receipt
@@ -187,7 +601,54 @@ class BaseStoreProfile(ABC):
Tuple of (amount, confidence) or (None, 0.0)
"""
text_upper = text.upper()
lines = text_upper.split('\n')
# =====================================================================
# STRATEGY 1: Multiline "SUMA TOTALA" pattern (thermal receipts)
# Format: SUMA on one line, TOTALA: on next, amount on third
# =====================================================================
for i, line in enumerate(lines):
line_clean = line.strip()
# Check for "SUMA" keyword (with OCR variants: SUNA, SUHA, SUM A)
if re.search(r'S[UU]M[AĂ\s]', line_clean):
# Look at next 3 lines for "TOTALA" and amount
for j in range(i, min(i + 4, len(lines))):
check_line = lines[j].strip()
# Check for "TOTALA:" or "TOTALA -" followed by amount
match = re.search(r'T[O0]TALA\s*[:\-]\s*([\d\s.,]+)', check_line)
if match:
amount = self._parse_decimal(self._clean_ocr_number(match.group(1)))
if amount and amount > 0 and amount < self.MAX_PAYMENT:
return (amount, 0.98)
# Check for "TOTALA" without amount, amount on next line
if re.search(r'T[O0]TALA\s*[:\-]?\s*$', check_line):
if j + 1 < len(lines):
amount_line = lines[j + 1].strip()
amount = self._parse_decimal(amount_line)
if amount and amount > 0 and amount < self.MAX_PAYMENT:
return (amount, 0.97)
# Check for "SUMA TOTALA" on single line with amount
match = re.search(r'S[UU]M[AĂ]\s+T[O0]TALA\s*[:\-]\s*([\d\s.,]+)', line_clean)
if match:
amount = self._parse_decimal(self._clean_ocr_number(match.group(1)))
if amount and amount > 0 and amount < self.MAX_PAYMENT:
return (amount, 0.98)
# Check for "SUMA TOTALA" without amount, amount on next line
if re.search(r'S[UU]M[AĂ]\s+T[O0]TALA\s*[:\-]?\s*$', line_clean):
if i + 1 < len(lines):
next_line = lines[i + 1].strip()
amount = self._parse_decimal(next_line)
if amount and amount > 0 and amount < self.MAX_PAYMENT:
return (amount, 0.96)
# =====================================================================
# STRATEGY 2: Standard single-line patterns
# =====================================================================
for pattern, confidence in self.TOTAL_PATTERNS:
match = re.search(pattern, text_upper)
if match:
@@ -259,28 +720,76 @@ class BaseStoreProfile(ABC):
"""
Extract payment methods (CARD/NUMERAR) from receipt.
Supports multiple payments of the same type (e.g., 2x CARD for split payments).
Each payment is returned as a separate entry with its amount.
Supports:
- Multiline patterns: "CARD\n78.00" (common in thermal receipts)
- Multiple payments (split CARD + NUMERAR)
- REST (change) detection to calculate actual CARD amount
- Keyword-only CARD/NUMERAR that infers from total
- Fallback for fiscal receipts without explicit payment
Args:
text: Raw OCR text from receipt
Returns:
List of dicts: [{'method': 'CARD'/'NUMERAR', 'amount': Decimal, 'confidence': float}]
Multiple entries of same method type are allowed for split payments.
"""
text_upper = text.upper()
lines = text_upper.split('\n')
methods = []
# Track (method, amount) pairs to avoid exact duplicates from overlapping patterns
seen_entries = set()
# =====================================================================
# STEP 0: Try MULTILINE patterns first (thermal receipts)
# Format: "CARD" on one line, amount on next line
# =====================================================================
for i, line in enumerate(lines):
line_clean = line.strip()
# Standalone CARD keyword (not part of MASTERCARD, etc.)
if re.match(r'^CARD\s*$', line_clean):
if i + 1 < len(lines):
next_line = lines[i + 1].strip()
# Must be a valid amount (not another keyword)
if re.match(r'^[\d\s.,]+$', next_line):
amount = self._parse_decimal(next_line)
if amount and amount > 0 and amount < self.MAX_PAYMENT:
entry_key = ('CARD', amount)
if entry_key not in seen_entries:
methods.append({
'method': 'CARD',
'amount': amount,
'confidence': 0.95
})
seen_entries.add(entry_key)
# Standalone NUMERAR keyword
if re.match(r'^NUMERAR\s*$', line_clean):
if i + 1 < len(lines):
next_line = lines[i + 1].strip()
if re.match(r'^[\d\s.,]+$', next_line):
amount = self._parse_decimal(next_line)
if amount and amount > 0 and amount < self.MAX_PAYMENT:
entry_key = ('NUMERAR', amount)
if entry_key not in seen_entries:
methods.append({
'method': 'NUMERAR',
'amount': amount,
'confidence': 0.95
})
seen_entries.add(entry_key)
# If multiline extraction found methods, return them
if methods:
return methods
# =====================================================================
# STEP 1: Try pattern-based extraction with explicit amounts
# =====================================================================
for pattern, method, confidence in self.PAYMENT_PATTERNS:
for match in re.finditer(pattern, text_upper):
try:
amount = self._parse_decimal(match.group(1))
if amount and amount > 0 and amount < self.MAX_PAYMENT:
# Deduplicate by (method, amount) to avoid same entry from multiple patterns
# But allow different amounts for same method (split payments)
entry_key = (method, amount)
if entry_key not in seen_entries:
methods.append({
@@ -292,6 +801,70 @@ class BaseStoreProfile(ABC):
except (ValueError, InvalidOperation):
continue
# If we found explicit amounts, we're done
if methods:
return methods
# Step 2: Try keyword-only detection with REST logic
# Get total amount for inference
total_amount, _ = self.extract_total(text)
if not total_amount:
return []
# Check for payment keywords
has_card = any(kw in text_upper for kw in ['CARD', 'CARTE CREDIT', 'VISA', 'MASTERCARD', 'DEBIT', 'CREDIT', 'CONTACTLESS'])
has_numerar = any(kw in text_upper for kw in ['NUMERAR', 'CASH'])
# Find REST (change) amount
rest_amount = Decimal('0')
for i, line in enumerate(lines):
if 'REST' in line:
# REST on same line: "REST 0.00" or "REST: 0.00"
match = re.search(r'REST\s*:?\s*([\d.,]+)', line)
if match:
rest_amount = self._parse_decimal(match.group(1)) or Decimal('0')
elif i + 1 < len(lines):
# REST on separate line
rest_amount = self._parse_decimal(lines[i + 1].strip()) or Decimal('0')
break
# Calculate payment amounts
if has_card:
card_amount = total_amount - rest_amount
if card_amount > 0:
methods.append({
'method': 'CARD',
'amount': card_amount,
'confidence': 0.90
})
if has_numerar:
if has_card and rest_amount > 0:
# Mixed payment: NUMERAR is the change given back
methods.append({
'method': 'NUMERAR',
'amount': rest_amount,
'confidence': 0.85
})
elif not has_card:
# Cash only
methods.append({
'method': 'NUMERAR',
'amount': total_amount,
'confidence': 0.90
})
# Step 3: Fallback for fiscal receipts without explicit payment
if not methods and total_amount and total_amount > 0:
is_fiscal = 'BON FISCAL' in text_upper or 'FISCAL' in text_upper
if is_fiscal:
# Default to CARD for business purchases (most common)
methods.append({
'method': 'CARD',
'amount': total_amount,
'confidence': 0.70
})
return methods
def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:

View File

@@ -1,54 +0,0 @@
"""
BEST PRINT TRADE ACTIV SRL store profile for OCR extraction.
Stamp manufacturing service. Non-VAT payer (neplătitor de TVA).
"""
from typing import List, Dict, Any
from .base import BaseStoreProfile
from . import ProfileRegistry
@ProfileRegistry.register
class BestPrintProfile(BaseStoreProfile):
    """
    Store profile for BEST PRINT TRADE ACTIV SRL, a non-VAT payer.

    What distinguishes these receipts:
    - The vendor is a non-VAT payer, so receipts carry no TVA lines
    - Stamp manufacturing and printing services
    - The total has no TVA component
    - Payment is typically by CARD
    """

    STORE_NAME = "BEST PRINT TRADE ACTIV SRL"
    CUI_LIST = ["45417955"]
    NAME_PATTERNS = ["BEST PRINT", "BESTPRINT", "BEST PRINT TRADE", "BEST PR1NT"]

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Return no TVA entries: the vendor is a non-VAT payer.

        Args:
            text: Raw OCR text from receipt (ignored)

        Returns:
            Always an empty list
        """
        return []

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return the validation hints specific to BEST PRINT receipts."""
        hints = dict(
            has_multi_rate_tva=False,
            card_equals_total=True,
            has_client_cui=True,  # a client CUI may be present
            has_efactura=False,
            is_non_vat_payer=True,  # key flag: no TVA expected at all
            tva_pattern="none",
        )
        return hints

View File

@@ -1,282 +0,0 @@
"""
BRICK (Five-Holding) store profile for OCR extraction.
Five-Holding S.A. operates BRICK stores with standard receipt format.
Receipt structure:
- TVA format: "TOTAL TVA A - 21%" with amount on next line
- Payment: "CARD" on separate line (amount from TOTAL LEI)
- Client CUI: "CLIENT C.U.L./C.IF. :ROXXXXXXX" (OCR reads I as L)
"""
import re
from decimal import Decimal, InvalidOperation
from typing import List, Dict, Any, Tuple, Optional
from .base import BaseStoreProfile
from . import ProfileRegistry
@ProfileRegistry.register
class BrickProfile(BaseStoreProfile):
"""
FIVE-HOLDING S.A. (BRICK) - standard TVA format with client CUI.
Key characteristics:
- Standard TVA format with rate code (A, B, etc.)
- TVA amount on separate line after percentage
- CARD payment indicated by keyword (amount derived from total)
- Client CUI in format: CLIENT C.U.L./C.IF.
- OCR often reads "I" as "L" in CUI markers
"""
CUI_LIST = ["10562600"]
NAME_PATTERNS = ["BRICK", "FIVE-HOLDING", "FIVE HOLDING", "BR1CK", "F1VE"]
STORE_NAME = "FIVE-HOLDING S.A."
# BRICK TVA patterns (amount often on separate line)
TVA_PATTERNS = [
# "TOTAL TVA A - 21%" with amount on next line (captured as multiline)
r'TOTAL\s+TVA\s*([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*\n?\s*([\d.,]+)',
# "OTAL IVAA 21%" - OCR error variant
r'O?TAL\s+[IT]VA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%\s*\n?\s*([\d.,]+)',
# "TOTAL TVA A 21%" without separator
r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%\s*\n?\s*([\d.,]+)',
# "TVA A: XX% = YY,YY" - inline format
r'TVA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
]
# TOTAL TVA BON pattern (fallback)
TOTAL_TVA_BON_PATTERN = r'TOTAL\s+TVA\s*BON\s*\n?\s*([\d.,]+)'
# Client CUI patterns - specific to Brick (handles OCR L/I confusion)
CLIENT_CUI_PATTERNS = [
# "CLIENT C.U.L./C.IF. :R01879855" - exact OCR format (I->L)
(r'CLIENT\s+C\.?U\.?[LI1]\.?\s*/?\s*C\.?[LI1]\.?F\.?\s*:?\s*(R?O?\d{6,10})', 0.99),
# "CLIENT C.U.I./C.I.F.: RO1879855" - standard format
(r'CLIENT\s+C\.?U\.?I\.?\s*/?\s*C\.?I\.?F\.?\s*:?\s*(R?O?\s*\d{6,10})', 0.98),
# "CIF CLIENT: XXXXXXX" - alternative format
(r'CIF\s+CLIENT\s*:?\s*(R?O?\s*\d{6,10})', 0.95),
]
# Client markers for Brick
CLIENT_MARKERS = [
r'CLIENT\s+C\.?U\.?[LI1]',
r'CLIENT\s+C\.?I\.?F',
r'CIF\s+CLIENT',
]
def extract_tva_entries(self, text: str) -> List[dict]:
    """
    Extract the TVA entry from a BRICK receipt.

    BRICK prints the rate header and the amount on separate lines:
        "TOTAL TVA A - 21%"
        "32.31"

    The coded patterns in ``TVA_PATTERNS`` are tried in order and the
    first match with a positive amount is returned on its own. If none
    qualifies, the "TOTAL TVA BON" amount is reported with code 'A' and
    a default rate of 19.

    Args:
        text: Raw OCR text from receipt (upper-cased before matching)

    Returns:
        List with at most one dict (keys: code, percent, amount)
    """
    upper = text.upper()
    flags = re.IGNORECASE | re.MULTILINE

    # Coded patterns first; the optional newline in each pattern covers
    # the amount-on-next-line layout.
    for rx in self.TVA_PATTERNS:
        for hit in re.finditer(rx, upper, flags):
            try:
                letter = hit.group(1).upper()
                rate = int(hit.group(2))
                value = self._parse_decimal(hit.group(3))
                if value and value > 0:
                    # First valid entry wins.
                    return [{
                        'code': letter,
                        'percent': rate,
                        'amount': value
                    }]
            except (ValueError, InvalidOperation, IndexError):
                continue

    # Fallback: "TOTAL TVA BON" followed by the amount.
    results = []
    bon_hit = re.search(self.TOTAL_TVA_BON_PATTERN, upper, flags)
    if bon_hit:
        try:
            value = self._parse_decimal(bon_hit.group(1))
            if value and value > 0:
                results.append({
                    'code': 'A',
                    'percent': 19,  # Default rate
                    'amount': value
                })
        except (ValueError, InvalidOperation):
            pass
    return results
def extract_payment_methods(self, text: str) -> List[dict]:
    """
    Extract BRICK-specific payment methods.

    BRICK receipts usually print the payment block on separate lines:
        "TOTAL LEI"
        "21.18"
        "CARD"
        "0.00"      <- REST (change)
    but the total may also be printed inline ("TOTAL LEI 21.18"). The inline
    amount is checked first; the following line is only consulted when the
    label line carries no usable amount.

    When CARD appears with REST=0, the full amount was paid by card.

    Args:
        text: Raw OCR text from receipt
    Returns:
        List of payment dicts with 'method', 'amount' and 'confidence'.
        Empty when no total can be determined or no payment can be inferred.
    """
    payments = []
    text_upper = text.upper()
    lines = text_upper.split('\n')

    # Find the TOTAL LEI amount on the first line mentioning both words.
    total_amount = None
    for i, line in enumerate(lines):
        if 'TOTAL' not in line or 'LEI' not in line:
            continue
        # Inline form first: "TOTAL LEI 21.18". Checking the next line first
        # would pick up unrelated text (e.g. "CARD") and lose this amount.
        inline = re.search(r'TOTAL\s+LEI\s*([\d.,]+)', line)
        if inline:
            total_amount = self._parse_decimal(inline.group(1))
        # Multi-line form: amount on the line after the label.
        if not total_amount and i + 1 < len(lines):
            total_amount = self._parse_decimal(lines[i + 1].strip())
        break

    if not total_amount:
        # Fallback to generic total extraction
        total_amount, _ = self.extract_total(text)
    if not total_amount:
        return []

    # Check for CARD or NUMERAR keywords anywhere on the receipt
    has_card = any('CARD' in line for line in lines)
    has_numerar = any('NUMERAR' in line for line in lines)

    # Find REST (change) on the first line mentioning it; the amount is on
    # the same line or on the next one. Unparseable values count as zero.
    rest_amount = Decimal('0')
    for i, line in enumerate(lines):
        if 'REST' not in line:
            continue
        same_line = re.search(r'REST\s*([\d.,]+)', line)
        if same_line:
            rest_amount = self._parse_decimal(same_line.group(1)) or Decimal('0')
        elif i + 1 < len(lines):
            rest_amount = self._parse_decimal(lines[i + 1].strip()) or Decimal('0')
        break

    if has_card:
        # Card payment = total - rest
        card_amount = total_amount - rest_amount
        if card_amount > 0:
            payments.append({
                'method': 'CARD',
                'amount': card_amount,
                'confidence': 0.95
            })

    if has_numerar:
        # If both card and cash are present, a precise split is not
        # available; cash is assumed to be the REST amount.
        if not has_card:
            payments.append({
                'method': 'NUMERAR',
                'amount': total_amount,
                'confidence': 0.95
            })
        elif rest_amount > 0:
            payments.append({
                'method': 'NUMERAR',
                'amount': rest_amount,
                'confidence': 0.90
            })

    # Nothing recorded yet and REST=0: accept any card-like indicator.
    if not payments and rest_amount == 0:
        if any('CARD' in line or 'DEBIT' in line or 'CREDIT' in line
               for line in lines):
            payments.append({
                'method': 'CARD',
                'amount': total_amount,
                'confidence': 0.90
            })

    # FALLBACK: the OCR did not capture a payment method. For a fiscal
    # receipt with a positive total, assume CARD at reduced confidence.
    # ('BON FISCAL' always contains 'FISCAL', so one check suffices.)
    if not payments and total_amount > 0 and 'FISCAL' in text_upper:
        payments.append({
            'method': 'CARD',
            'amount': total_amount,
            'confidence': 0.70  # Lower confidence for inferred payment
        })

    return payments
def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
    """
    Extract client CUI from BRICK receipt.

    BRICK uses format: "CLIENT C.U.L./C.IF. :R01879855"
    Note: OCR often reads "I" as "L" in these markers, and the "RO" country
    prefix as "R0". The zero of an "R0" prefix is treated as the letter O so
    it does not end up as a leading digit of the returned CUI.

    Args:
        text: Raw OCR text from receipt
    Returns:
        Tuple of (cui, confidence) where cui is a digits-only string,
        or (None, 0.0) when no client marker or no valid CUI is found
    """
    text_upper = text.upper()

    # Cheap pre-filter: skip pattern matching when no client marker exists.
    has_client = any(
        re.search(marker, text_upper, re.IGNORECASE)
        for marker in self.CLIENT_MARKERS
    )
    if not has_client:
        return (None, 0.0)

    # Patterns are ordered by confidence; the first valid capture wins.
    for pattern, confidence in self.CLIENT_CUI_PATTERNS:
        match = re.search(pattern, text_upper, re.IGNORECASE)
        if not match:
            continue
        # "R0..." is "RO..." with the O misread as zero (a real CUI does not
        # start with 0); restore the letter before keeping digits only.
        cui = re.sub(r'^R\s*0', 'RO', match.group(1))
        cui_digits = re.sub(r'[^0-9]', '', cui)
        if 6 <= len(cui_digits) <= 10:
            return (cui_digits, confidence)

    return (None, 0.0)
def get_validation_hints(self) -> Dict[str, Any]:
    """Describe what validation may expect from a BRICK receipt."""
    hints: Dict[str, Any] = dict(
        has_multi_rate_tva=False,
        card_equals_total=True,     # card amount equals total when REST=0
        has_client_cui=True,        # BRICK receipts can carry a client CUI
        has_efactura=False,
        is_non_vat_payer=False,
        tva_on_separate_line=True,  # TVA amount is printed on the next line
    )
    return hints

View File

@@ -1,118 +0,0 @@
"""
DEDEMAN store profile for OCR extraction.
Dedeman receipts may include e-factura information and use standard TVA format.
Large DIY retailer in Romania.
"""
import re
from decimal import Decimal, InvalidOperation
from typing import List, Dict, Any
from .base import BaseStoreProfile
from . import ProfileRegistry
@ProfileRegistry.register
class DedemanProfile(BaseStoreProfile):
    """
    Store profile for DEDEMAN SRL (DIY retailer).

    Receipt traits handled here:
    - standard coded TVA lines
    - an optional e-factura reference number
    """

    CUI_LIST = ["2816464"]
    NAME_PATTERNS = ["DEDEMAN", "DEDEMAN SRL", "OEDEMAN", "D3DEMAN"]  # OCR variants
    STORE_NAME = "DEDEMAN SRL"

    # TVA regexes (any rate accepted). The first two capture
    # (code, percent, amount); the third has no code and captures
    # (percent, amount) only.
    TVA_PATTERNS = [
        # "TVA A: XX% = YY,YY" or "TVA-A XX% YY,YY"
        r'TVA\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
        # "A - XX,XX% = YY,YY"
        r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,]+)',
        # "TVA (XX%) YY,YY"
        r'TVA\s*\(?\s*(\d{1,2})\s*%\s*\)?\s*:?\s*([\d.,]+)',
    ]

    # Group 1 captures the e-factura reference
    EFACTURA_PATTERN = r'e-?factura\s*:?\s*([A-Z0-9]+)'

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Collect TVA entries from a Dedeman receipt.

        Coded patterns are applied first and de-duplicated on
        (code, percent), keeping the first amount seen. Only when they find
        nothing is the code-less pattern tried; its first positive hit is
        reported under code 'A'.

        Args:
            text: Raw OCR text from receipt
        Returns:
            List of dicts with 'code', 'percent' and 'amount'
        """
        by_key = {}  # (code, percent) -> entry, in discovery order
        for regex in self.TVA_PATTERNS[:2]:
            for hit in re.finditer(regex, text, re.IGNORECASE):
                try:
                    key = (hit.group(1).upper(), int(hit.group(2)))
                    value = self._parse_decimal(hit.group(3))
                    usable = value and value > 0
                except (ValueError, InvalidOperation, IndexError):
                    continue
                if usable and key not in by_key:
                    by_key[key] = {
                        'code': key[0],
                        'percent': key[1],
                        'amount': value
                    }
        if by_key:
            return list(by_key.values())

        # Nothing coded was found: fall back to the simple format.
        for hit in re.finditer(self.TVA_PATTERNS[2], text, re.IGNORECASE):
            try:
                rate = int(hit.group(1))
                value = self._parse_decimal(hit.group(2))
                if value and value > 0:
                    return [{
                        'code': 'A',
                        'percent': rate,
                        'amount': value
                    }]
            except (ValueError, InvalidOperation):
                continue
        return []

    def extract_efactura_reference(self, text: str) -> str | None:
        """
        Return the e-factura reference found in the text, if any.

        Args:
            text: Raw OCR text from receipt
        Returns:
            The reference captured by ``EFACTURA_PATTERN``, or None
        """
        found = re.search(self.EFACTURA_PATTERN, text, re.IGNORECASE)
        if found is None:
            return None
        return found.group(1)

    def get_validation_hints(self) -> Dict[str, Any]:
        """Describe what validation may expect from a Dedeman receipt."""
        return dict(
            has_multi_rate_tva=False,
            card_equals_total=False,
            has_client_cui=False,
            has_efactura=True,
            is_non_vat_payer=False,
        )

View File

@@ -1,133 +0,0 @@
"""
ELECTROBERING S.R.L. store profile for OCR extraction.
Electronics and home supplies store.
Receipt structure:
- TVA format: "TOTAL TVA A - - 19%" with amount on next line
- "TOTAL TVA BON" with total TVA amount
- Client CUI: "CIF CLIENT: XXXXXXX"
"""
import re
from decimal import Decimal, InvalidOperation
from typing import List, Dict, Any, Tuple, Optional
from .base import BaseStoreProfile
from . import ProfileRegistry
@ProfileRegistry.register
class ElectroberingProfile(BaseStoreProfile):
    """
    Store profile for ELECTROBERING S.R.L. (electronics and home supplies).

    Receipt traits handled here:
    - TVA rate printed on one line, the amount on the next
    - doubled dash separators (OCR artifact)
    - optional client CUI for B2B purchases
    - CARD payment is typical
    """

    CUI_LIST = ["2744937"]
    NAME_PATTERNS = ["ELECTROBERING", "ELECTR0BERING", "ELECTROBERING SRL"]
    STORE_NAME = "ELECTROBERING S.R.L."

    # ELECTROBERING TVA patterns (handles double-dash and multiline).
    # Only the third (inline) pattern captures the amount.
    TVA_PATTERNS = [
        # "TOTAL TVA A - - 19%" with amount on next line
        r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%',
        # "TOTAL TVA A 19%" without separator
        r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%',
        # Standard: "TVA A: XX% = YY,YY"
        r'TVA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
    ]

    # TOTAL TVA BON label (fallback); the amount is read from the next line
    TOTAL_TVA_BON_PATTERN = r'TOTAL\s+TVA\s+BON'

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Extract the TVA entry from an ELECTROBERING receipt.

        Expected layout:
            "TOTAL TVA A - - 19%"
            "5.59"
            "TOTAL TVA BON"
            "5.59"

        Three strategies are tried in order and the first positive amount
        wins: the rate line followed by its amount, the "TOTAL TVA BON"
        label followed by its amount (reported as code 'A', 19%), and
        finally the inline "TVA A: XX% = YY,YY" form.

        Args:
            text: Raw OCR text from receipt
        Returns:
            A list holding at most one dict with 'code', 'percent', 'amount'
        """
        text_upper = text.upper()
        rows = text_upper.split('\n')
        # Each row paired with the row after it; a label on the very last
        # row has no amount line and is therefore never considered.
        row_pairs = list(zip(rows, rows[1:]))

        # 1) "TOTAL TVA A - - 19%" or "TOTAL TVA A 19%", amount on next row
        for label_row, amount_row in row_pairs:
            hit = re.search(r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%', label_row)
            if hit is None:
                continue
            value = self._parse_decimal(amount_row.strip())
            if value and value > 0:
                return [{
                    'code': hit.group(1),
                    'percent': int(hit.group(2)),
                    'amount': value
                }]

        # 2) "TOTAL TVA BON", amount on next row
        for label_row, amount_row in row_pairs:
            if not re.search(self.TOTAL_TVA_BON_PATTERN, label_row):
                continue
            value = self._parse_decimal(amount_row.strip())
            if value and value > 0:
                return [{
                    'code': 'A',
                    'percent': 19,  # Default Romanian TVA rate
                    'amount': value
                }]

        # 3) Inline format "TVA A: XX% = YY,YY"
        inline = re.search(self.TVA_PATTERNS[2], text_upper, re.IGNORECASE)
        if inline and len(inline.groups()) >= 3:
            try:
                tva_code = inline.group(1)
                rate = int(inline.group(2))
                value = self._parse_decimal(inline.group(3))
                if value and value > 0:
                    return [{
                        'code': tva_code,
                        'percent': rate,
                        'amount': value
                    }]
            except (ValueError, InvalidOperation):
                pass
        return []

    def get_validation_hints(self) -> Dict[str, Any]:
        """Describe what validation may expect from an ELECTROBERING receipt."""
        return dict(
            has_multi_rate_tva=False,
            card_equals_total=True,
            has_client_cui=True,  # May have client CUI for B2B
            has_efactura=False,
            is_non_vat_payer=False,
            tva_on_separate_line=True,
        )

View File

@@ -1,108 +0,0 @@
"""
GAMA INK SERVICE SRL store profile for OCR extraction.
Toner refill and printer supplies store.
Receipt structure:
- TVA format: "TOTAL TVA A 4 19%" with amount on next line (4 is OCR for -)
- "TOTAL TVA BON" with total TVA amount
"""
import re
from decimal import Decimal, InvalidOperation
from typing import List, Dict, Any
from .base import BaseStoreProfile
from . import ProfileRegistry
@ProfileRegistry.register
class GamaInkProfile(BaseStoreProfile):
    """
    Store profile for GAMA INK SERVICE SRL (toner refill, printer supplies).

    Receipt traits handled here:
    - TVA rate printed on one line, the amount on the next
    - OCR often reads "-" as "4" ("A 4 19%" instead of "A - 19%")
    - CARD payment is typical
    """

    CUI_LIST = ["17741882"]
    NAME_PATTERNS = ["GAMA INK", "GAMA", "GAMAINK", "GAMA INK SERVICE"]
    STORE_NAME = "GAMA INK SERVICE SRL"

    # GAMA INK TVA patterns (handles OCR errors); neither captures the amount
    TVA_PATTERNS = [
        # "TOTAL TVA A 4 19%" (4 is OCR for -)
        r'TOTAL\s+TVA\s*([A-D])\s*[4\-\s]+(\d{1,2})\s*%',
        # "TOTAL TVA A - 19%"
        r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%',
    ]

    # TOTAL TVA BON label (fallback); the amount is read from the next line
    TOTAL_TVA_BON_PATTERN = r'TOTAL\s+TVA\s+BON'

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Extract the TVA entry from a GAMA INK receipt.

        The rate line ("TOTAL TVA A 4 19%", where "4" may be a misread "-")
        is followed by the amount on the next line. When no rate line yields
        a positive amount, the line after "TOTAL TVA BON" is used and
        reported as code 'A' at 19%.

        Args:
            text: Raw OCR text from receipt
        Returns:
            A list holding at most one dict with 'code', 'percent', 'amount'
        """
        rows = text.upper().split('\n')
        # Each row paired with the row after it; a label on the very last
        # row has no amount line and is therefore never considered.
        row_pairs = list(zip(rows, rows[1:]))

        # Rate line: "TOTAL TVA A 4 19%" or "TOTAL TVA A - 19%"
        for label_row, amount_row in row_pairs:
            hit = re.search(r'TOTAL\s+TVA\s*([A-D])\s*[4\-\s]+(\d{1,2})\s*%', label_row)
            if hit is None:
                continue
            value = self._parse_decimal(amount_row.strip())
            if value and value > 0:
                return [{
                    'code': hit.group(1),
                    'percent': int(hit.group(2)),
                    'amount': value
                }]

        # Fallback: "TOTAL TVA BON" label
        for label_row, amount_row in row_pairs:
            if not re.search(self.TOTAL_TVA_BON_PATTERN, label_row):
                continue
            value = self._parse_decimal(amount_row.strip())
            if value and value > 0:
                return [{
                    'code': 'A',
                    'percent': 19,  # Default Romanian TVA rate
                    'amount': value
                }]
        return []

    def get_validation_hints(self) -> Dict[str, Any]:
        """Describe what validation may expect from a GAMA INK receipt."""
        return dict(
            has_multi_rate_tva=False,
            card_equals_total=True,
            has_client_cui=True,  # May have client CUI for business
            has_efactura=False,
            is_non_vat_payer=False,
            tva_on_separate_line=True,
        )

View File

@@ -1,53 +0,0 @@
"""
KINETERRA store profile for OCR extraction.
Kineterra is a non-VAT payer (neplătitor de TVA).
Receipts don't include TVA breakdown.
"""
from typing import List, Dict, Any
from .base import BaseStoreProfile
from . import ProfileRegistry
@ProfileRegistry.register
class KineterraProfile(BaseStoreProfile):
    """
    Store profile for KINETERRA CONCEPT SRL, a non-VAT payer
    (neplătitor de TVA).

    Its receipts carry no TVA breakdown, so the total has no TVA component
    and TVA extraction always yields nothing.
    """

    CUI_LIST = ["31180432"]
    NAME_PATTERNS = ["KINETERRA", "KINETERRA CONCEPT", "K1NETERRA"]  # OCR variants
    STORE_NAME = "KINETERRA CONCEPT SRL"

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Return no TVA entries: the store is a non-VAT payer.

        Args:
            text: Raw OCR text from receipt (ignored)
        Returns:
            Always a new empty list
        """
        no_entries: List[dict] = []
        return no_entries

    def get_validation_hints(self) -> Dict[str, Any]:
        """Describe what validation may expect from a Kineterra receipt."""
        return dict(
            has_multi_rate_tva=False,
            card_equals_total=False,
            has_client_cui=False,
            has_efactura=False,
            is_non_vat_payer=True,
            tva_pattern="none",
        )

View File

@@ -1,93 +0,0 @@
"""
LIDL store profile for OCR extraction.
Lidl receipts have a specific TVA format without hyphen/colon separators:
TOTAL TVA 9,84
TVA A 21,00% 7,71
TVA B 11,00% 2,13
This profile handles multi-rate TVA extraction for Lidl receipts.
"""
import re
from decimal import Decimal, InvalidOperation
from typing import List, Dict, Any
from .base import BaseStoreProfile
from . import ProfileRegistry
@ProfileRegistry.register
class LidlProfile(BaseStoreProfile):
    """
    Store profile for LIDL DISCOUNT S.R.L. (multi-rate TVA).

    Receipt traits handled here:
    - several TVA codes (A-D) on one receipt, with any percentage
    - one line per rate: "TVA A XX,XX% YY,YY" (code, percent, amount)
    - CARD payment usually equals the total
    - no client CUI on receipts
    """

    CUI_LIST = ["22891860"]
    NAME_PATTERNS = ["LIDL", "LDL", "L1DL", "LIDL DISCOUNT"]  # OCR variants
    STORE_NAME = "LIDL DISCOUNT S.R.L."

    # Lidl-specific TVA patterns, all capturing (code, percent, amount)
    # from a single line such as "TVA A 21,00% 7,71".
    TVA_PATTERNS = [
        # Primary: "TVA A 21,00% 7.71" with various spacing
        r'T[VU][AR]\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)',
        # With backslash OCR artifact: "TVA A \21,00% 7.71"
        r'T[VU][AR]\s+([A-D])\s+\\?(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)',
        # IVA variant (rare OCR misread)
        r'IVA\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)',
    ]

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Collect every TVA rate line from a Lidl receipt.

        All patterns are applied to the whole text. Because they overlap,
        results are de-duplicated on (code, percent); the first amount found
        for a key is the one kept. Only the integer part of the percentage
        is recorded.

        Args:
            text: Raw OCR text from receipt
        Returns:
            List of dicts with 'code', 'percent' and 'amount', in the order
            the distinct (code, percent) pairs were first found
        """
        by_key = {}  # (code, percent) -> entry, in discovery order
        for regex in self.TVA_PATTERNS:
            for hit in re.finditer(regex, text, re.IGNORECASE):
                try:
                    key = (hit.group(1).upper(), int(hit.group(2)))
                    value = self._parse_decimal(hit.group(3))
                    usable = value and value > 0
                except (ValueError, InvalidOperation):
                    continue
                if usable and key not in by_key:
                    by_key[key] = {
                        'code': key[0],
                        'percent': key[1],
                        'amount': value
                    }
        return list(by_key.values())

    def get_validation_hints(self) -> Dict[str, Any]:
        """Describe what validation may expect from a Lidl receipt."""
        return dict(
            has_multi_rate_tva=True,
            card_equals_total=True,
            has_client_cui=False,
            has_efactura=False,
            is_non_vat_payer=False,
        )

View File

@@ -1,236 +0,0 @@
"""
OMV Petrom store profile for OCR extraction.
OMV receipts typically include client CUI and use standard TVA format.
Common at gas stations with fuel purchases.
Date format: YYYY. MM. DD with spaces (e.g., "2025. 08. 14")
OCR quirk: Numbers often have spaces before decimals (e.g., "55, 22" instead of "55,22")
"""
import re
from datetime import date
from decimal import Decimal, InvalidOperation
from typing import List, Dict, Any, Tuple, Optional
from .base import BaseStoreProfile
from . import ProfileRegistry
@ProfileRegistry.register
class OMVProfile(BaseStoreProfile):
    """
    OMV PETROM MARKETING S.R.L. - standard TVA with client CUI.

    Key characteristics:
    - Standard TVA format (usually single rate, any percentage)
    - Includes client CUI on receipt (for business purchases)
    - TVA table row: "A-XX, XX% <amount> <amount>", where OCR inserts
      spaces around the decimal separator ("55, 22")
    - Date format: YYYY. MM. DD (with spaces)
    - Client CUI format: "CLIENT C.U. I./C.I.F.: ROXXXXXXX"
    """

    CUI_LIST = ["11201891"]
    NAME_PATTERNS = ["OMV", "PETROM", "OMV PETROM", "0MV"]  # OCR variants
    STORE_NAME = "OMV PETROM MARKETING S.R.L."

    # A single monetary amount as OCR renders it: integer part (optionally
    # with "." thousands groups), a decimal separator that may be surrounded
    # by stray spaces, and exactly two decimals. Whitespace is only allowed
    # around the separator, so one amount cannot run into the next column
    # or onto the next line.
    _OCR_AMOUNT = r'\d+(?:\.\d{3})*[ \t]*[.,][ \t]*\d{2}(?!\d)'

    # OMV TVA table patterns (handles OCR spaces in numbers)
    TVA_TABLE_PATTERNS = [
        # "A-21, 00% 55, 22 318, 16": groups are (code, percent,
        # first amount, second amount); the first amount is the TVA value.
        r'([A-D])\s*[-:]\s*(\d{1,2})[.,\s]*\d{0,2}\s*%\s+('
        + _OCR_AMOUNT + r')\s+(' + _OCR_AMOUNT + r')',
        # "TOTAL TAXE: 55, 22" - fallback to TOTAL TAXE
        r'TOTAL\s+TAXE\s*:?\s*(' + _OCR_AMOUNT + r')',
    ]

    # Standard TVA pattern fallback
    TVA_STANDARD_PATTERN = r'TVA\s*:?\s*([\d.,]+)'

    # OMV specific: prioritize YYYY. MM. DD format with spaces.
    # Entries are (regex, confidence, field order).
    DATE_PATTERNS_OCR_SPACES = [
        # YYYY. MM. DD with time (OMV format)
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.98, 'ymd'),
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.95, 'ymd'),
        # Fallback to DD. MM. YYYY
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
    ]

    # Client CUI patterns for OMV (unique format), as (regex, confidence);
    # tried in order, first valid capture wins.
    CLIENT_CUI_PATTERNS = [
        # "CLIENT C.U. I./C.I.F.: RO1879855"
        (r'CLIENT\s+C\.?\s*U\.?\s*I\.?\s*/?\s*C\.?\s*I\.?\s*F\.?\s*:?\s*(R?O?\s*\d{6,10})', 0.99),
        # "C.U.I./C.I.F. CLIENT: XXXXXXX"
        (r'C\.?\s*U\.?\s*I\.?\s*/?\s*C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(R?O?\s*\d{6,10})', 0.98),
        # Fallback to simpler pattern
        (r'CLIENT\s*:?\s*(R?O?\s*\d{6,10})', 0.90),
    ]

    # Client markers for OMV: CUI extraction is skipped unless one matches
    CLIENT_MARKERS = [
        r'CLIENT\s+C\.?\s*U\.?\s*I',
        r'CLIENT\s+C\.?\s*I\.?\s*F',
        r'NUME\s+CLIENT',
        r'CLIENT\s*:',
    ]

    def _clean_ocr_number(self, value: str) -> str:
        """Remove OCR spaces from numbers (e.g., '55, 22' -> '55,22')."""
        # Remove spaces around commas and periods
        value = re.sub(r'\s*([.,])\s*', r'\1', value)
        # Remove any remaining spaces
        value = value.replace(' ', '')
        return value

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Extract OMV-specific TVA entries.

        OMV receipts show TVA as a table row with OCR spaces inside the
        numbers, e.g. "A-21, 00% 55, 22 318, 16". The first amount after
        the percentage is taken as the TVA value; the second amount is
        matched but not used. If no table row yields a positive amount,
        "TOTAL TAXE: <amount>" is used and reported as code 'A' at 19%.

        Args:
            text: Raw OCR text from receipt
        Returns:
            A list holding at most one dict with 'code', 'percent', 'amount'
        """
        entries = []
        text_upper = text.upper()

        # Table format first; OMV usually has a single TVA rate, so the
        # first row with a positive amount is returned on its own.
        table_pattern = self.TVA_TABLE_PATTERNS[0]
        for match in re.finditer(table_pattern, text_upper):
            try:
                code = match.group(1).upper()
                percent = int(match.group(2))
                # Clean OCR spaces from the amount before parsing
                tva_amount = self._parse_decimal(
                    self._clean_ocr_number(match.group(3))
                )
                if tva_amount and tva_amount > 0:
                    entries.append({
                        'code': code,
                        'percent': percent,
                        'amount': tva_amount
                    })
                    return entries
            except (ValueError, InvalidOperation, IndexError):
                continue

        # Fallback: "TOTAL TAXE: 55, 22"
        match = re.search(self.TVA_TABLE_PATTERNS[1], text_upper)
        if match:
            try:
                tva_amount = self._parse_decimal(
                    self._clean_ocr_number(match.group(1))
                )
                if tva_amount and tva_amount > 0:
                    entries.append({
                        'code': 'A',
                        'percent': 19,  # Standard rate, will be corrected by validation
                        'amount': tva_amount
                    })
            except (ValueError, InvalidOperation):
                pass

        return entries

    def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
        """
        Extract client CUI from OMV receipt.

        OMV uses format: "CLIENT C.U. I./C.I.F.: RO1879855". When OCR reads
        the "RO" prefix as "R0", that zero is treated as the letter O so it
        does not end up as a leading digit of the returned CUI.

        Args:
            text: Raw OCR text from receipt
        Returns:
            Tuple of (cui, confidence) where cui is a digits-only string,
            or (None, 0.0) when no client marker or no valid CUI is found
        """
        text_upper = text.upper()

        # Cheap pre-filter: skip pattern matching when no client marker exists.
        has_client = any(
            re.search(marker, text_upper, re.IGNORECASE)
            for marker in self.CLIENT_MARKERS
        )
        if not has_client:
            return (None, 0.0)

        for pattern, confidence in self.CLIENT_CUI_PATTERNS:
            match = re.search(pattern, text_upper, re.IGNORECASE)
            if not match:
                continue
            # "R0..." is "RO..." with the O misread as zero (a real CUI
            # does not start with 0); restore it before keeping digits only.
            cui = re.sub(r'^R\s*0', 'RO', match.group(1))
            cui_digits = re.sub(r'[^0-9]', '', cui)
            if 6 <= len(cui_digits) <= 10:
                return (cui_digits, confidence)

        return (None, 0.0)

    def extract_payment_methods(self, text: str) -> List[dict]:
        """
        Extract OMV-specific payment methods.

        OMV receipts use "CARTE CREDIT" instead of "CARD". The payment
        amount is taken to be the receipt total; the first indicator found
        (in the priority order below) decides method and confidence.

        Args:
            text: Raw OCR text from receipt
        Returns:
            A list holding at most one dict with 'method', 'amount' and
            'confidence'; empty when no total can be extracted
        """
        payments = []
        text_upper = text.upper()

        # Get total amount first; without it no payment can be reported
        total_amount, _ = self.extract_total(text)
        if not total_amount:
            return []

        # (substring, method, confidence), most specific first
        payment_indicators = [
            ('CARTE CREDIT', 'CARD', 0.98),
            ('CARTE DE CREDIT', 'CARD', 0.98),
            ('CARD', 'CARD', 0.95),
            ('VISA', 'CARD', 0.95),
            ('MASTERCARD', 'CARD', 0.95),
            ('CONTACTLESS', 'CARD', 0.90),
            ('NUMERAR', 'NUMERAR', 0.95),
            ('CASH', 'NUMERAR', 0.90),
        ]
        for indicator, method, confidence in payment_indicators:
            if indicator in text_upper:
                payments.append({
                    'method': method,
                    'amount': total_amount,
                    'confidence': confidence
                })
                return payments  # OMV usually has single payment method

        # Fallback: If no explicit payment but has BON FISCAL, assume CARD
        if 'BON FISCAL' in text_upper:
            payments.append({
                'method': 'CARD',
                'amount': total_amount,
                'confidence': 0.70
            })
        return payments

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return OMV-specific validation hints."""
        return {
            "has_multi_rate_tva": False,
            "card_equals_total": True,  # Gas station: card equals total
            "has_client_cui": True,
            "has_efactura": False,
            "is_non_vat_payer": False,
            "tva_table_format": True,
        }

View File

@@ -1,101 +0,0 @@
"""
PICTUS VELUM SRL store profile for OCR extraction.
Office supplies and stationery store.
"""
import re
from decimal import Decimal, InvalidOperation
from typing import List, Dict, Any
from .base import BaseStoreProfile
from . import ProfileRegistry
@ProfileRegistry.register
class PictusVelumProfile(BaseStoreProfile):
    """
    Store profile for PICTUS VELUM SRL (office supplies and stationery).

    Receipt traits handled here:
    - standard TVA layout (single rate, any percentage)
    - CARD payment is typical
    """

    CUI_LIST = ["39634534"]
    NAME_PATTERNS = ["PICTUS", "PICTUS VELUM", "P1CTUS", "PICTUS VELUM SRL"]
    STORE_NAME = "PICTUS VELUM SRL"

    # TVA regexes (any rate accepted). The first two capture
    # (code, percent, amount); the third has no code and captures
    # (percent, amount) only.
    TVA_PATTERNS = [
        # "TVA A: XX% = YY,YY" or "TVA-A XX% YY,YY"
        r'TVA\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
        # "A - XX,XX% = YY,YY"
        r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,]+)',
        # "TVA XX% YY,YY" (simple format without code)
        r'TVA\s+(\d{1,2})\s*%\s*([\d.,]+)',
    ]

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Collect TVA entries from a PICTUS VELUM receipt.

        Coded patterns are applied first and de-duplicated on
        (code, percent), keeping the first amount seen. Only when they find
        nothing is the code-less pattern tried; its first positive hit is
        reported under code 'A'.

        Args:
            text: Raw OCR text from receipt
        Returns:
            List of dicts with 'code', 'percent' and 'amount'
        """
        by_key = {}  # (code, percent) -> entry, in discovery order
        for regex in self.TVA_PATTERNS[:2]:
            for hit in re.finditer(regex, text, re.IGNORECASE):
                try:
                    key = (hit.group(1).upper(), int(hit.group(2)))
                    value = self._parse_decimal(hit.group(3))
                    usable = value and value > 0
                except (ValueError, InvalidOperation, IndexError):
                    continue
                if usable and key not in by_key:
                    by_key[key] = {
                        'code': key[0],
                        'percent': key[1],
                        'amount': value
                    }
        if by_key:
            return list(by_key.values())

        # Nothing coded was found: fall back to the simple format.
        for hit in re.finditer(self.TVA_PATTERNS[2], text, re.IGNORECASE):
            try:
                rate = int(hit.group(1))
                value = self._parse_decimal(hit.group(2))
                if value and value > 0:
                    return [{
                        'code': 'A',
                        'percent': rate,
                        'amount': value
                    }]
            except (ValueError, InvalidOperation):
                continue
        return []

    def get_validation_hints(self) -> Dict[str, Any]:
        """Describe what validation may expect from a PICTUS VELUM receipt."""
        return dict(
            has_multi_rate_tva=False,
            card_equals_total=True,
            has_client_cui=False,
            has_efactura=False,
            is_non_vat_payer=False,
        )

View File

@@ -1,162 +0,0 @@
"""
SOCAR Petroleum store profile for OCR extraction.
SOCAR receipts are similar to OMV - gas station with client CUI support.
Date format may use YYYY. MM. DD with spaces.
"""
import re
from datetime import date
from decimal import Decimal, InvalidOperation
from typing import List, Dict, Any, Tuple, Optional
from .base import BaseStoreProfile
from . import ProfileRegistry
@ProfileRegistry.register
class SocarProfile(BaseStoreProfile):
    """
    Store profile for SOCAR PETROLEUM S.A. (gas station).

    Receipt traits handled here:
    - standard TVA layout, usually a single rate
    - client CUI on the receipt for business purchases
    - layout similar to OMV/Petrom
    - dates may be printed as YYYY. MM. DD (with spaces)
    """

    CUI_LIST = ["12546600"]
    NAME_PATTERNS = ["SOCAR", "S0CAR", "SOCAR PETROLEUM"]  # OCR variants
    STORE_NAME = "SOCAR PETROLEUM S.A."

    # Standard TVA patterns for gas stations
    TVA_PATTERNS = [
        # Table format: "A-19,00% 285,66 49,58"
        # groups: (code, percent, first amount, second amount)
        r'([A-D])\s*[-:]\s*(\d{1,2})[.,]\d{2}\s*%\s+([\d.,]+)\s+([\d.,]+)',
        # Simple format: "TVA 19% 49,58" - groups: (percent, amount)
        r'TVA\s+(\d{1,2})\s*%\s*([\d.,]+)',
    ]

    # Gas stations may use YYYY. MM. DD format.
    # Entries are (regex, confidence, field order).
    DATE_PATTERNS_OCR_SPACES = [
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.98, 'ymd'),
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.95, 'ymd'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
    ]

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Collect TVA entries from a SOCAR receipt.

        Table rows are read first, taking the second amount of each row
        (regex group 4) as the TVA value and de-duplicating on
        (code, percent). If no table row is usable, the first positive
        "TVA XX% YY,YY" hit is reported under code 'A'.

        Args:
            text: Raw OCR text from receipt
        Returns:
            List of dicts with 'code', 'percent' and 'amount'
        """
        by_key = {}  # (code, percent) -> entry, in discovery order
        for row in re.finditer(self.TVA_PATTERNS[0], text, re.IGNORECASE):
            try:
                key = (row.group(1).upper(), int(row.group(2)))
                value = self._parse_decimal(row.group(4))
                usable = value and value > 0
            except (ValueError, InvalidOperation):
                continue
            if usable and key not in by_key:
                by_key[key] = {
                    'code': key[0],
                    'percent': key[1],
                    'amount': value
                }
        if by_key:
            return list(by_key.values())

        # No table rows: simple format, first positive hit only.
        for hit in re.finditer(self.TVA_PATTERNS[1], text, re.IGNORECASE):
            try:
                rate = int(hit.group(1))
                value = self._parse_decimal(hit.group(2))
                if value and value > 0:
                    # The simple format has no code; 'A' is assumed
                    return [{
                        'code': 'A',
                        'percent': rate,
                        'amount': value
                    }]
            except (ValueError, InvalidOperation):
                continue
        return []

    def extract_payment_methods(self, text: str) -> List[dict]:
        """
        Determine how a SOCAR receipt was paid.

        The receipt total is used as the payment amount. The first keyword
        found, in the priority order below, decides method and confidence;
        without any keyword a "BON FISCAL" receipt is assumed to be CARD at
        low confidence.

        Args:
            text: Raw OCR text from receipt
        Returns:
            A list holding at most one dict with 'method', 'amount' and
            'confidence'; empty when no total can be extracted
        """
        haystack = text.upper()

        paid, _ = self.extract_total(text)
        if not paid:
            return []

        # (substring, method, confidence), most specific first
        keywords = (
            ('CARTE CREDIT', 'CARD', 0.98),
            ('CARTE DE CREDIT', 'CARD', 0.98),
            ('CARD', 'CARD', 0.95),
            ('VISA', 'CARD', 0.95),
            ('MASTERCARD', 'CARD', 0.95),
            ('CONTACTLESS', 'CARD', 0.90),
            ('NUMERAR', 'NUMERAR', 0.95),
            ('CASH', 'NUMERAR', 0.90),
        )
        first_hit = next(
            ((kind, score) for word, kind, score in keywords if word in haystack),
            None,
        )
        if first_hit is not None:
            kind, score = first_hit
            return [{
                'method': kind,
                'amount': paid,
                'confidence': score
            }]

        # No explicit payment keyword: infer CARD for fiscal receipts
        if 'BON FISCAL' in haystack:
            return [{
                'method': 'CARD',
                'amount': paid,
                'confidence': 0.70
            }]
        return []

    def get_validation_hints(self) -> Dict[str, Any]:
        """Describe what validation may expect from a SOCAR receipt."""
        return dict(
            has_multi_rate_tva=False,
            card_equals_total=True,  # Gas station: card equals total
            has_client_cui=True,
            has_efactura=False,
            is_non_vat_payer=False,
        )

View File

@@ -1,204 +0,0 @@
"""
STEPOUT MARKET SRL store profile for OCR extraction.
Bookstore with reduced TVA rate (5% for books in Romania).
Receipt structure:
- TVA format: "5.00% TUA*B" with amount on next line
- Total format: "SUMA TOTALA:" with amount on next line
- Payment: "CARD" with amount on next line
- Client CUI: "CIF CLIENT:XXXXXXX"
"""
import re
from decimal import Decimal, InvalidOperation
from typing import List, Dict, Any, Tuple, Optional
from .base import BaseStoreProfile
from . import ProfileRegistry
@ProfileRegistry.register
class StepoutMarketProfile(BaseStoreProfile):
"""
STEPOUT MARKET SRL - reduced TVA rate profile (books).
Key characteristics:
- Reduced TVA rate: 5% for books (cărți qualification in Romania)
- TVA format: "X.XX% TUA*B" (OCR reads TVA as TUA)
- Multiline format for amounts
- CARD payment typical
"""
# Identification data for this store: fiscal code(s), name spellings
# (including OCR misreads such as "STEP0UT"/"STEPUUT") and display name.
CUI_LIST = ["35532655"]
NAME_PATTERNS = ["STEPOUT", "STEPOUT MARKET", "STEP0UT", "STEPUUT", "STEPOUT MARKET SRL"]
STORE_NAME = "STEPOUT MARKET SRL"
# TVA patterns for Stepout (handles TUA OCR error and multiline).
# Note the group order differs per pattern: the first captures
# (percent, code), the second (code, percent, amount), the third nothing.
# extract_tva_entries in this class spells out the first and third regex
# inline rather than reading them from this list.
TVA_PATTERNS = [
# "5.00% TUA*B" - OCR format with TUA
r'(\d{1,2})[.,]\d{0,2}\s*%\s*T[UV]A\*?([A-D])',
# "TVA A: 5% = YY,YY" or "TVA-A 5% YY,YY" (inline format)
r'TVA\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
# "TOTAL TUA:" with amount on next line
r'TOTAL\s+T[UV]A\s*:',
]
# Total patterns for Stepout, as (label regex, confidence).
# extract_total tries them in list order and reads the amount from the line
# that follows a matching label.
TOTAL_PATTERNS = [
# "SUMA TOTALA:" with amount on next line
(r'SUMA\s+TOTALA\s*:', 0.98),
# "TOTAL:" fallback
(r'TOTAL\s*:', 0.90),
]
def extract_total(self, text: str) -> Tuple[Optional[Decimal], float]:
    """
    Read the receipt total from a Stepout Market receipt.

    The label (e.g. "SUMA TOTALA:") sits on one line and the amount on the
    next. Labels from ``self.TOTAL_PATTERNS`` are tried in order; for each
    one, the first label line followed by a positive amount wins. When no
    label produces an amount, the base-class extraction is used.

    Args:
        text: Raw OCR text from receipt
    Returns:
        Tuple of (total_amount, confidence) or whatever the base class
        returns when no label matched
    """
    rows = text.upper().split('\n')
    # Each row paired with the row after it; a label on the very last row
    # has no amount line and is therefore never considered.
    row_pairs = list(zip(rows, rows[1:]))

    for label_regex, score in self.TOTAL_PATTERNS:
        for label_row, amount_row in row_pairs:
            if not re.search(label_regex, label_row, re.IGNORECASE):
                continue
            value = self._parse_decimal(amount_row.strip())
            if value and value > 0:
                return (value, score)

    # Fallback to base class
    return super().extract_total(text)
def extract_tva_entries(self, text: str) -> List[dict]:
"""
Extract TVA entries from Stepout Market receipt.
Format: "5.00% TUA*B" on one line, amount on next line.
Args:
text: Raw OCR text from receipt
Returns:
List of TVA entries with code, percent, and amount
"""
entries = []
text_upper = text.upper()
lines = text_upper.split('\n')
# Try "X.XX% TUA*B" format first
for i, line in enumerate(lines):
match = re.search(r'(\d{1,2})[.,]\d{0,2}\s*%\s*T[UV]A\*?([A-D])', line)
if match:
percent = int(match.group(1))
code = match.group(2)
# Amount should be on next line
if i + 1 < len(lines):
amount_str = lines[i + 1].strip()
amount = self._parse_decimal(amount_str)
if amount and amount > 0:
entries.append({
'code': code,
'percent': percent,
'amount': amount
})
return entries # Single rate store
# Try "TOTAL TUA:" format
for i, line in enumerate(lines):
if re.search(r'TOTAL\s+T[UV]A\s*:', line):
# Amount should be on next line
if i + 1 < len(lines):
amount_str = lines[i + 1].strip()
amount = self._parse_decimal(amount_str)
if amount and amount > 0:
entries.append({
'code': 'B', # Books are usually code B (5%)
'percent': 5,
'amount': amount
})
return entries
return entries
def extract_payment_methods(self, text: str) -> List[dict]:
"""
Extract payment methods from Stepout Market receipt.
Format: "CARD" on one line, amount on next line.
Args:
text: Raw OCR text from receipt
Returns:
List of payment methods with method, amount, and confidence
"""
payments = []
text_upper = text.upper()
lines = text_upper.split('\n')
# Find CARD or NUMERAR keyword
for i, line in enumerate(lines):
line_stripped = line.strip()
if line_stripped == 'CARD':
# Amount should be on next line
if i + 1 < len(lines):
amount_str = lines[i + 1].strip()
amount = self._parse_decimal(amount_str)
if amount and amount > 0:
payments.append({
'method': 'CARD',
'amount': amount,
'confidence': 0.95
})
return payments
elif line_stripped == 'NUMERAR' or 'CASH' in line_stripped:
if i + 1 < len(lines):
amount_str = lines[i + 1].strip()
amount = self._parse_decimal(amount_str)
if amount and amount > 0:
payments.append({
'method': 'NUMERAR',
'amount': amount,
'confidence': 0.95
})
return payments
# Fallback: check for inline CARD amount
for line in lines:
match = re.search(r'CARD\s*:?\s*([\d.,]+)', line)
if match:
amount = self._parse_decimal(match.group(1))
if amount and amount > 0:
payments.append({
'method': 'CARD',
'amount': amount,
'confidence': 0.90
})
return payments
return payments
def get_validation_hints(self) -> Dict[str, Any]:
"""Return STEPOUT MARKET-specific validation hints."""
return {
"has_multi_rate_tva": False,
"card_equals_total": True,
"has_client_cui": True,
"has_efactura": False,
"is_non_vat_payer": False,
"typical_tva_rate": 5, # Books have 5% TVA in Romania
"product_category": "books",
"tva_on_separate_line": True,
}

View File

@@ -1,269 +0,0 @@
"""
UNLIMITED KEYS S.R.L. store profile for OCR extraction.
Key duplication service. Notable for CASH (NUMERAR) payments.
"""
import re
from decimal import Decimal, InvalidOperation
from typing import List, Dict, Any, Optional, Tuple
from .base import BaseStoreProfile
from . import ProfileRegistry
@ProfileRegistry.register
class UnlimitedKeysProfile(BaseStoreProfile):
    """
    UNLIMITED KEYS S.R.L. - standard TVA profile with NUMERAR payment.

    Key characteristics:
    - Standard TVA format (single rate, any percentage)
    - Key duplication service
    - NUMERAR (cash) payment common - different from most stores!
    - May also accept CARD
    - OCR often reads "TVA" as "TUA" - need OCR error variants
    """

    CUI_LIST = ["18993187"]
    NAME_PATTERNS = ["UNLIMITED KEYS", "UNLIMITED", "UNL1MITED", "UNLIMITED KEYS SRL"]
    STORE_NAME = "UNLIMITED KEYS S.R.L."

    # The TVA regexes are named individually so extract_tva_entries() does
    # not depend on their position inside TVA_PATTERNS.
    _TVA_CODED_PATTERNS = [
        # "TVA A: XX% = YY,YY" or "TVA-A XX% YY,YY" (including TUA OCR error)
        r'T[UV]A\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,\s]+)',
        # "A - XX,XX% = YY,YY"
        r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,\s]+)',
        # "TVA XX% YY,YY" (simple format, includes TUA)
        r'T[UV]A\s+(\d{1,2})\s*%\s*([\d.,\s]+)',
    ]
    # "XX.XX% TUA*A YY.YY" (OCR format with TUA*A or TUA)
    _TVA_RATE_FIRST_PATTERN = r'(\d{1,2})[.,]\d{0,2}\s*%\s*T[UV]A\*?[A-D]?\s*([\d.,\s]+)'
    # "TOTAL TUA: YY.YY" (total TVA amount only)
    _TVA_TOTAL_ONLY_PATTERN = r'TOTAL\s+T[UV]A\s*:?\s*([\d.,\s]+)'

    # Standard TVA patterns - including OCR error variants (TVA -> TUA)
    TVA_PATTERNS = _TVA_CODED_PATTERNS + [_TVA_RATE_FIRST_PATTERN, _TVA_TOTAL_ONLY_PATTERN]

    # Rate reported when only "TOTAL TVA" is printed and the rate cannot be
    # derived from the receipt total.
    _DEFAULT_TVA_PERCENT = 19
    # NOTE(review): sanity bound for an inferred rate; chosen generously,
    # adjust if the project has an authoritative list of valid rates.
    _MAX_PLAUSIBLE_TVA_PERCENT = 30
    # Allowed rounding difference when checking an inferred rate.
    _TVA_TOLERANCE = Decimal('0.02')

    # TOTAL patterns for UNLIMITED KEYS (handles "80 .00" format)
    TOTAL_PATTERNS = [
        # "SUMA TOTALA: 80 .00" (with space before decimal)
        (r'SUMA\s+TOTALA\s*:?\s*([\d\s.,]+)', 0.98),
        # "TOTALA: 80,00"
        (r'TOTALA\s*:?\s*([\d.,]+)', 0.95),
        # Standard TOTAL patterns from base class
        (r'TOTAL\s+(?:DE\s+PLATA|ACHITAT|LEI)\s*:?\s*([\d.,]+)', 0.95),
        (r'TOTAL\s*:?\s*([\d.,]+)', 0.90),
    ]

    # Payment patterns - NUMERAR is primary for this store
    PAYMENT_PATTERNS = [
        # "NUMERAR 80.00" or "NUMERAR: 80.00"
        (r'NUMERAR\s*:?\s*([\d.,\s]+)', 'NUMERAR', 0.98),
        # "CARD 80.00" or "CARD: 80.00"
        (r'CARD\s*:?\s*([\d.,\s]+)', 'CARD', 0.95),
    ]

    # Client CUI patterns - specific to this receipt format
    CLIENT_CUI_PATTERNS = [
        # "CIF CLIENT:1879855" (exact format from OCR)
        (r'CIF\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.99),
        # "CLIENT CIF: ROXXXXXXX"
        (r'CLIENT\s+CIF\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
        # "C.I.F. CLIENT: XXXXXXX"
        (r'C\.?I\.?F\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
    ]

    # Override client markers to be less strict
    CLIENT_MARKERS = [
        r'CIF\s+CLIENT',
        r'CLIENT\s+CIF',
        r'C\.?I\.?F\.?\s+CLIENT',
        r'CLIENT\s*:',
    ]

    @staticmethod
    def _clean_amount(raw: str) -> str:
        """
        Normalise a captured amount such as "80 .00" to "80.00".

        The capture classes in the patterns above contain ``\\s``, which also
        matches line breaks, so a capture can run on into a number printed on
        the following line. Only the first captured line is kept; joining
        both would fabricate a value such as "80.0012.77".

        Args:
            raw: Text captured by an amount group

        Returns:
            The first captured line with all whitespace removed ('' if the
            capture held only whitespace)
        """
        stripped = raw.strip()
        if not stripped:
            return ''
        return re.sub(r'\s+', '', stripped.splitlines()[0])

    def _parse_amount(self, raw: str) -> Optional[Decimal]:
        """
        Clean and parse a captured amount.

        Args:
            raw: Text captured by an amount group

        Returns:
            The amount when it parses and is strictly positive, else None
        """
        try:
            amount = self._parse_decimal(self._clean_amount(raw))
            if amount and amount > 0:
                return amount
        except (ValueError, InvalidOperation):
            # Unparseable capture: treated as "no amount here".
            pass
        return None

    def _infer_tva_percent(self, text: str, tva_amount: Decimal) -> int:
        """
        Derive the TVA rate from the TVA amount and the receipt total.

        The total is TVA-inclusive, so rate = tva / (total - tva). The
        rounded rate is accepted only when it is plausible and reproduces
        the printed TVA amount within the rounding tolerance.

        Args:
            text: Raw OCR text from receipt (used to extract the total)
            tva_amount: TVA amount read from the "TOTAL TVA" line

        Returns:
            The inferred integer percent, or the default rate when the total
            is missing or inconsistent with the TVA amount
        """
        total, _ = self.extract_total(text)
        if not total or total <= tva_amount:
            return self._DEFAULT_TVA_PERCENT
        try:
            percent = round(tva_amount * 100 / (total - tva_amount))
            if not 1 <= percent <= self._MAX_PLAUSIBLE_TVA_PERCENT:
                return self._DEFAULT_TVA_PERCENT
            expected = total * percent / (100 + percent)
            if abs(expected - tva_amount) > self._TVA_TOLERANCE:
                return self._DEFAULT_TVA_PERCENT
        except ArithmeticError:
            return self._DEFAULT_TVA_PERCENT
        return int(percent)

    def extract_total(self, text: str) -> Tuple[Optional[Decimal], float]:
        """
        Extract total amount from receipt text.

        Handles UNLIMITED KEYS format with space before decimal (e.g., "80 .00").

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (total_amount, confidence) or (None, 0.0)
        """
        text_upper = text.upper()
        for pattern, confidence in self.TOTAL_PATTERNS:
            match = re.search(pattern, text_upper, re.IGNORECASE)
            if match is None:
                continue
            amount = self._parse_amount(match.group(1))
            if amount is not None:
                return (amount, confidence)
        return (None, 0.0)

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Extract TVA entries from receipt text.

        Handles OCR errors where TVA is read as TUA. Formats are tried in
        this order and the first usable amount wins:
        1. "XX.XX% TUA*A YY.YY" (rate first)
        2. "TOTAL TUA: YY.YY" (amount only; the rate is inferred from the
           receipt total, defaulting to 19)
        3. the coded / simple "TVA ..." formats

        Args:
            text: Raw OCR text from receipt

        Returns:
            List with at most one TVA entry (code, percent, amount)
        """
        text_upper = text.upper()

        # "XX.XX% TUA*A YY.YY" - common OCR format
        match = re.search(self._TVA_RATE_FIRST_PATTERN, text_upper)
        if match:
            amount = self._parse_amount(match.group(2))
            if amount is not None:
                return [{
                    'code': 'A',
                    'percent': int(match.group(1)),
                    'amount': amount
                }]

        # "TOTAL TUA: YY.YY" - the line carries no rate
        match = re.search(self._TVA_TOTAL_ONLY_PATTERN, text_upper)
        if match:
            amount = self._parse_amount(match.group(1))
            if amount is not None:
                return [{
                    'code': 'A',
                    'percent': self._infer_tva_percent(text, amount),
                    'amount': amount
                }]

        # Try coded patterns
        for pattern in self._TVA_CODED_PATTERNS:
            for match in re.finditer(pattern, text_upper, re.IGNORECASE):
                groups = match.groups()
                if len(groups) == 3:
                    code, percent_text, raw_amount = groups
                else:
                    # Simple "TVA XX% YY,YY" format has no rate code.
                    code = 'A'
                    percent_text, raw_amount = groups
                amount = self._parse_amount(raw_amount)
                if amount is not None:
                    return [{
                        'code': code.upper(),
                        'percent': int(percent_text),
                        'amount': amount
                    }]

        return []

    def extract_payment_methods(self, text: str) -> List[dict]:
        """
        Extract payment methods from receipt text.

        Handles NUMERAR (cash) as primary payment for this store. Each
        pattern contributes at most one entry (its first match in the text).

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of payment methods with method, amount, and confidence
        """
        payments = []
        text_upper = text.upper()
        for pattern, method, confidence in self.PAYMENT_PATTERNS:
            match = re.search(pattern, text_upper, re.IGNORECASE)
            if match is None:
                continue
            amount = self._parse_amount(match.group(1))
            if amount is not None:
                payments.append({
                    'method': method,
                    'amount': amount,
                    'confidence': confidence
                })
        return payments

    def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
        """
        Extract client CUI from receipt text.

        Handles "CIF CLIENT:1879855" format specific to this store.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (cui, confidence) or (None, 0.0); the CUI contains
            digits only (no "RO" prefix)
        """
        text_upper = text.upper()

        # Check for client markers
        has_client = any(
            re.search(marker, text_upper, re.IGNORECASE)
            for marker in self.CLIENT_MARKERS
        )
        if not has_client:
            return (None, 0.0)

        # Try client CUI patterns
        for pattern, confidence in self.CLIENT_CUI_PATTERNS:
            match = re.search(pattern, text_upper, re.IGNORECASE)
            if match is None:
                continue
            cui = match.group(1)
            # Clean up: remove RO prefix, spaces
            cui_digits = re.sub(r'[^0-9]', '', cui)
            # "R0..." is the "RO" prefix with the O misread as a zero; that
            # zero is not part of the CUI. It is dropped only when at least
            # six digits remain, so a short capture is left as it was.
            if cui.startswith('R0') and len(cui_digits) > 6:
                cui_digits = cui_digits[1:]
            if 6 <= len(cui_digits) <= 10:
                return (cui_digits, confidence)

        return (None, 0.0)

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return UNLIMITED KEYS-specific validation hints."""
        return {
            "has_multi_rate_tva": False,
            "card_equals_total": False,  # May be NUMERAR (cash)
            "has_client_cui": True,  # May have client CUI
            "has_efactura": False,
            "is_non_vat_payer": False,
            "common_payment": "NUMERAR",  # Cash payments common
        }

View File

@@ -576,6 +576,7 @@ class ReceiptExtractor:
print(f"[TVA Reverse Validation] {msg}", flush=True)
# Cross-validate amount using payment methods and TVA
original_amount = result.amount
validated_amount, validated_confidence, source = self._cross_validate_and_calculate_amount(
result.amount,
result.confidence_amount,
@@ -583,8 +584,38 @@ class ReceiptExtractor:
result.tva_entries,
result.tva_total
)
if validated_amount != result.amount:
print(f"[Cross-Validation] Amount updated: {result.amount} -> {validated_amount} (source: {source})", flush=True)
# Add validation warnings when TOTAL is calculated (not directly extracted)
if 'calculated from TVA' in source:
warning_msg = f"TOTAL ({validated_amount}) calculat din TVA (nu a fost extras direct din bon)"
result.validation_warnings.append(warning_msg)
print(f"[Cross-Validation] ⚠️ {warning_msg}", flush=True)
# Add comparison if original was different
if original_amount and original_amount != validated_amount:
diff = abs(float(validated_amount) - float(original_amount))
result.validation_warnings.append(
f"TOTAL extras ({original_amount}) diferă de cel calculat ({validated_amount}) cu {diff:.2f} RON"
)
elif 'calculated from payment methods' in source:
warning_msg = f"TOTAL ({validated_amount}) calculat din suma metodelor de plată (nu a fost extras direct)"
result.validation_warnings.append(warning_msg)
print(f"[Cross-Validation] ⚠️ {warning_msg}", flush=True)
if original_amount and original_amount != validated_amount:
diff = abs(float(validated_amount) - float(original_amount))
result.validation_warnings.append(
f"TOTAL extras ({original_amount}) diferă de suma plăților ({validated_amount}) cu {diff:.2f} RON"
)
elif source == 'not found':
result.validation_warnings.append("TOTAL nu a fost detectat și nu a putut fi calculat")
print("[Cross-Validation] ⚠️ TOTAL nu a fost detectat", flush=True)
elif validated_amount != original_amount:
print(f"[Cross-Validation] Amount updated: {original_amount} -> {validated_amount} (source: {source})", flush=True)
result.amount = validated_amount
result.confidence_amount = validated_confidence