Files
roa2web-service-auto/backend/modules/data_entry/services/ocr/profiles/base.py
Claude Agent cc98d6f21f ocr extract
2026-01-07 14:34:07 +00:00

1098 lines
47 KiB
Python

"""
Base class for store-specific OCR extraction profiles.
Each store can have different receipt formats (TVA layout, total position, etc.).
Store profiles allow customizing extraction logic per-store for better accuracy.
Usage:
    from .base import BaseStoreProfile
    from . import ProfileRegistry

    @ProfileRegistry.register
    class LidlProfile(BaseStoreProfile):
        CUI_LIST = ["22891860"]
        NAME_PATTERNS = ["LIDL", "LDL"]

        def extract_tva_entries(self, text: str) -> List[dict]:
            # Custom Lidl TVA extraction logic
            ...
"""
import re
from abc import ABC
from decimal import Decimal, InvalidOperation
from typing import List, Optional, Tuple, Dict, Any
from datetime import date
class BaseStoreProfile(ABC):
"""
Abstract base class for store-specific extraction profiles.
Each profile defines:
- CUI_LIST: CUI codes that identify this store (without RO prefix)
- NAME_PATTERNS: OCR-tolerant name patterns for fallback matching
- Custom extraction methods for TVA, total, date, etc.
The ProfileRegistry uses CUI_LIST to lookup profiles during extraction.
"""
# -------------------------------------------------------------------------
# Class attributes - override in subclasses
# -------------------------------------------------------------------------
# List of CUI codes (without RO prefix) that identify this store.
# Empty in the base class; concrete profiles are expected to fill it in.
CUI_LIST: List[str] = []
# OCR-tolerant name patterns for fallback matching
NAME_PATTERNS: List[str] = []
# Store display name
STORE_NAME: str = "Unknown Store"
# Flag for known non-VAT payer stores. When True, extract_tva_entries()
# returns an empty list without looking at the receipt text.
IS_NON_VAT_PAYER: bool = False
# -------------------------------------------------------------------------
# Generic patterns - can be overridden in subclasses
# -------------------------------------------------------------------------
# Total amount patterns: (regex, confidence) pairs.
# extract_total() tries them top to bottom against the upper-cased receipt
# text and returns the first one whose captured group parses to an amount
# greater than 0 and below MAX_PAYMENT. List order decides precedence;
# the confidence value is only reported back to the caller.
TOTAL_PATTERNS = [
(r'T[O0]TAL[.\s]+L[E3][I1!]\s*:?\s*([\d\s.,]+)', 0.98),
(r'TOTAL\s+LEI\s*([\d\s.,]+)', 0.98),
(r'[OT]?OTAL\s+LEI\s*([\d\s.,]+)', 0.95),
(r'TOTAL\s*:?\s*([\d\s.,]+)\s*(?:RON|LEI)?', 0.95),
(r'TOTAL\s+(?:RON|LEI)\s*([\d\s.,]+)', 0.95),
(r'SUBTOTAL\s*([\d\s.,]+)', 0.90),
(r'DE\s+PLATA\s*:?\s*([\d\s.,]+)', 0.90),
(r'SUMA\s*:?\s*([\d\s.,]+)', 0.85),
]
# Date patterns: (regex, confidence) pairs.
# extract_date() tries them in order and hands capture group 1 to
# _parse_date(); the first pattern that yields a date wins.
DATE_PATTERNS = [
(r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
(r'DATA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
(r'(\d{2}[-./]\d{2}[-./]\d{4})\s+[O0]RA\s*:?\s*\d{2}:\d{2}', 0.95),
(r'(\d{2}[-./]\d{2}[-./]\d{4})\s+\d{2}:\d{2}', 0.90),
(r'(\d{2}[-./]\d{2}[-./]\d{4})', 0.80),
(r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75),
]
# Date patterns with OCR-introduced spaces (separate because format is different).
# Entries are (regex, confidence, layout): the three capture groups are read
# as year/month/day when layout is 'ymd' and as day/month/year otherwise.
# extract_date() only falls back to these when DATE_PATTERNS found nothing.
DATE_PATTERNS_OCR_SPACES = [
(r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.92, 'ymd'),
(r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.85, 'ymd'),
(r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
(r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
]
# Receipt number patterns: (regex, confidence) pairs.
# extract_receipt_number() walks the list in order and returns the first
# captured number that is at least 3 characters long.
NUMBER_PATTERNS = [
(r'NDS\s*:?\s*(\d+)', 0.98),
(r'C3POS[-A-Z0-9]*[N:](\d{6,7})', 0.98),
(r'C3POS.*?(\d{6,7})\b', 0.95),
(r'BF\s*:\s*(\d{4,})', 0.96),
(r'BF\s+(\d{4,})', 0.93),
(r'NIVS\s*:?\s*(\d+)', 0.95),
(r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95),
(r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95),
(r'CHITANTA\s+NR\.?\s*:?\s*(\d+)', 0.95),
(r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90),
(r'ID\s*BF\s*:?\s*(\d+)', 0.90),
]
# Payment method patterns: (regex, method_type, confidence) triples.
# Handles ALL payment types: CARD, NUMERAR (cash), and card brand names.
# extract_payment_methods() runs every pattern with re.finditer over the
# upper-cased text; capture group 1 is the amount, and a (method, amount)
# pair that was already recorded is not added twice.
PAYMENT_PATTERNS = [
# CARTE CREDIT variants (OMV/Petrom/Socar receipts)
(r'CARTE\s+CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.98),
(r'CARTE\s+CREDIT\s*:?\s*\n\s*([\d\s.,]+)', 'CARD', 0.97),
(r'CARTE\s+DE\s+CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.98),
(r'CARTE\s+DE\s+CREDIT\s*:?\s*\n\s*([\d\s.,]+)', 'CARD', 0.97),
# CARD standard (the optional prefix also lets this hit inside "MASTERCARD")
(r'(?:PLATA\s+)?CARD\s*[:\sA-Z]?\s*([\d\s.,]+)', 'CARD', 0.95),
# Card brand names
(r'VISA\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
(r'MASTERCARD\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
(r'MAESTR[O0]\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
(r'CONTACTLESS\s*:?\s*([\d\s.,]+)', 'CARD', 0.90),
(r'DEBIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.90),
(r'CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.88),
# Cash variants
(r'NUMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.95),
(r'CASH\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.90),
# Truncation recovery patterns (for OCR left-margin issues):
# the leading letters of CARD / NUMERAR were cut off by the scanner.
(r'(?:^|\n|\s)RD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.70),
(r'(?:^|\n|\s)ARD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.75),
(r'(?:^|\n|\s)MERAR\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'NUMERAR', 0.70),
]
# Client section markers (for B2B receipts) - More flexible patterns
# Includes OCR corruption variants (LIENT, C IENT, L IENT).
# extract_client_cui() and extract_client_name() use these as a gate:
# if none of them is found, no client data is extracted at all.
CLIENT_MARKERS = [
r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT', # "CIF CLIENT" (with or without colon)
r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT', # "CUI CLIENT"
r'CLIENT\s+C\.?\s*[UI1]\.?\s*[IF1]', # "CLIENT CIF" / "CLIENT CUI"
r'CLIENT\s*:', # "CLIENT:"
r'CUMPARATOR\s*:', # "CUMPARATOR:"
r'BENEFICIAR\s*:', # "BENEFICIAR:"
r'CUMP[AĂ]R[AĂ]TOR', # "CUMPARATOR" without colon
r'COD\s+FISCAL\s+CLIENT', # "COD FISCAL CLIENT"
# OCR corruption patterns
r'C[I1]F\s+[A-Z\s]{0,6}IENT\s*:', # "CIF a IENT:", "CIF CL IENT:", "CIF L IENT:"
r'C[I1]F\s+LIENT\s*:', # "CIF LIENT:" (missing C)
r'LIENT\s*:', # "LIENT:" (missing C and I/L)
# Brick-specific (I→L OCR error)
r'CLIENT\s+C\.?U\.?[LI1]\.?\s*/', # "CLIENT C.U.L./" (I read as L)
]
# Client CUI patterns (pattern, confidence) - Comprehensive
# Handles: docTR reordering, doubled letters, corruption, CUMPARATOR, Brick L/I swap
# extract_client_cui() tries the entries in list order and returns the first
# one whose capture group 1 reduces to 6-10 digits. Precedence therefore
# follows position in this list, not the confidence value.
CLIENT_CUI_PATTERNS = [
# === CUI on line BEFORE CLIENT marker (docTR/OCR reordering) ===
(r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*U\.?\s*[I1]\.?', 0.99),
(r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*[I1]\.?\s*F\.?', 0.99),
(r'(R[O0]\d{6,10})\s*:?\s*\n\s*CLIENT', 0.98),
# === "CIF I CLIENT:" format (OCR extra chars) ===
(r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]\d{6,10})', 0.98),
(r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.97),
# === CIF CLIENT: (reversed - CIF before CLIENT) ===
(r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
(r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
(r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
(r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
# === CLIENT C.U.I/C.I.F. (slash variants) ===
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.97),
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.97),
# === Doubled letters (docTR artifact: "C.U U.I") ===
(r'CLIENT\s+C\.?\s*U\.?\s*U?\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
(r'CLIENT\s+C\.?\s*U\.?\s*U?\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
# === CLIENT C.U.I. or CLIENT CUI (without slash) ===
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
(r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
(r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
# === Corrupted CLIENT after CIF (OCR errors) ===
(r'CIF\s+[a-zA-Z\s]{2,8}IENT\s*:?\s*(R[O0]?\d{6,10})', 0.93),
(r'CIF\s+[a-zA-Z\s]{2,8}IENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
(r'CIF\s+LIENT\s*:?\s*(R[O0]?\d{6,10})', 0.92),
(r'CIF\s+LIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.92),
# === CUMPARATOR variants ===
(r'CUMPARATOR\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
(r'CUMPARATOR\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
# CUMPARATOR with CUI/CIF on next line
(r'CUMPARATOR\s*:.*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
(r'CUMPARATOR\s*:.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
# CUMPARATOR with CUI/CIF two lines down
(r'CUMPARATOR\s*:.*\n.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
# === CLIENT on next line ===
(r'CLIENT\s*:\s*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
(r'CLIENT\s*:.*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
# === Standard fallback patterns ===
(r'CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.90),
(r'COD\s+FISCAL\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.95),
# === Brick-specific (I→L OCR error) ===
# Matches: "CLIENT C.U.L./C.IF. :R01879855"
# NOTE(review): although rated 0.99 this entry is last in the list, so it is
# only reached when every pattern above failed to produce a CUI.
(r'CLIENT\s+C\.?U\.?[LI1]\.?\s*/\s*C\.?[LI1]\.?F\.?\s*:?\s*(R?O?\d{6,10})', 0.99),
]
# Company type indicators (for identifying company names).
# extract_client_name() searches upper-cased candidate lines for any of
# these to decide whether the line looks like a company name.
COMPANY_INDICATORS = [
r'\bS\.?\s*R\.?\s*L\.?\b', # S.R.L. or S. R. L.
r'\bS\.?\s*A\.?\b', # S.A. or S. A.
r'\bS\.?\s*N\.?\s*C\.?\b', # S.N.C. or S. N. C.
r'\bS\.?\s*C\.?\s*S\.?\b', # S.C.S. or S. C. S.
r'\bI\.?\s*I\.?\b', # I.I. or I. I.
r'\bP\.?\s*F\.?\s*A\.?\b', # P.F.A. or P. F. A.
r'\bS\.?\s*C\.?\s+[A-Z]', # S.C. followed by company name
r'HOLDING',
r'COMPANY',
r'GROUP',
]
# Maximum reasonable payment amount (to filter OCR errors).
# Used as an exclusive upper bound: amounts must be strictly below it.
MAX_PAYMENT = Decimal('100000')
# -------------------------------------------------------------------------
# TVA (VAT) patterns - ALL FORMATS unified
# OCR tolerant: T[VU][AR] matches TVA, TUA, TVR
# -------------------------------------------------------------------------
# Entries are (regex, confidence, format_tag).
# In the helpers visible in this file, _try_tva_inline() reads the 'inline'
# entries and _try_tva_standard() reads 'standard', 'bon', 'percent',
# 'coded', 'fallback' and 'books'. The reversed, multiline and table helpers
# use their own hard-coded regexes rather than the entries tagged here.
TVA_PATTERNS = [
# === FORMAT 1: INLINE with code and percent (Lidl-style) ===
# "TVA A 21,00% 7.71" or "TVA B 11,00% 2.13"
(r'T[VU][AR]\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)', 0.98, 'inline'),
(r'T[VU][AR]\s+([A-D])\s+\\?(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)', 0.97, 'inline'),
(r'IVA\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)', 0.95, 'inline'),
# === FORMAT 2: REVERSED (Stepout-style) ===
# "5.00% TUA*B" - percent BEFORE the TVA marker
(r'(\d{1,2})[.,]\d{0,2}\s*%\s*T[UV]A\*?([A-D])', 0.97, 'reversed'),
# === FORMAT 3: TABLE (OMV-style) ===
# "A-21,00% 285,66 49,58" (code-percent, base amount, VAT amount)
(r'([A-D])\s*[-:]\s*(\d{1,2})[.,\s]*\d{0,2}\s*%\s+([\d.,\s]+)\s+([\d.,\s]+)', 0.96, 'table'),
(r'TOTAL\s+TAXE\s*:?\s*([\d.,\s]+)', 0.95, 'taxe'),
# === FORMAT 4: MULTILINE (Brick/Electrobering) ===
# "TOTAL TVA A - 19%" on one line, amount on the next
(r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%', 0.96, 'multiline'),
(r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%', 0.95, 'multiline'),
(r'TOTAL\s+T[VU][AR]\s*([A-D])?\s*[-:]?\s*(\d{1,2})\s*%', 0.94, 'multiline'),
# === FORMAT 5: STANDARD (from the extractor) ===
(r'TOTAL\s+(?:T[VU][AR]|IVA)\s+BON\s*:?\s*([\d\s.,]+)', 0.98, 'bon'),
(r'T[O0]TAL\s+(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.95, 'standard'),
(r'TOTAL\s+IVA\s*:?\s*([\d\s.,]+)', 0.95, 'standard'),
(r'T[VU][AR]\s+(?:A\s*[-:]?\s*)?(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.95, 'percent'),
(r'T[VU][AR]\s+[A-Z]\s*[-:]\s*(\d{1,2})\s*%\s*([\d\s.,]+)', 0.93, 'percent'),
# 'books' entries match a literal 5% rate and capture only the amount
(r'T[VU][AR]\s*[C5]\s*[-:]\s*5\s*%\s*:?\s*([\d\s.,]+)', 0.93, 'books'),
(r'(?:T[VU][AR]|IVA)\s+5\s*%\s*:?\s*([\d\s.,]+)', 0.92, 'books'),
# === FORMAT 6: CODED inline (with code A-D) ===
(r'T[UV]A\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,\s]+)', 0.95, 'coded'),
(r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,\s]+)', 0.93, 'coded'),
# === FALLBACK patterns ===
(r'T[O0]T[AE]L\s+(?:T[VUAI]+[AR]?|IVA)\s*:?\s*([\d\s.,]+)', 0.88, 'fallback'),
(r'TOTAL\s+TA\s*[F\s]?\s*\d*\s*:?\s*([\d\s.,]+)', 0.85, 'fallback'),
(r'(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.85, 'fallback'),
(r'TOTAL\s+T[UV]A\s*:?\s*([\d.,\s]+)', 0.90, 'standard'),
]
# Non-VAT payer patterns - NEPLATITOR DE TVA ("not a VAT payer")
# OCR variants: NEPLATTOR, NEPLATITOR, NEPLATOR, ANEPLATHTOR, MEPLATITOR
# _is_non_vat_payer() reports True as soon as any one of these is found
# (searched case-insensitively).
NON_VAT_PATTERNS = [
r'NEPLAT\w*OR', # NEPLATITOR, NEPLATTOR, NEPLATOR
r'[ANM]EPLAT\w*O?R', # OCR errors: ANEPLATHTOR, MEPLATITOR
r'TOTAL\s+NEPLAT', # TOTAL NEPLATITOR...
r'TOTAL\s+[ANM]EPLAT', # TOTAL ANEPLAT... (OCR error)
r'SCUTIT\s*(?:DE\s+)?T[VU]A', # SCUTIT DE TVA ("exempt from VAT")
r'NEPLAT\w*\s+T[VU]A', # NEPLATITOR TVA
r'NEPLAT\w*\s+DE\s+T', # NEPLATITOR DE T... (truncated)
]
# CUI (fiscal code) patterns - VENDOR CUI (exclude CLIENT)
# OCR errors: R0 instead of RO, C1F instead of CIF
# Entries are (regex, confidence); capture group 1 is the code.
# NOTE(review): the method that consumes this table is not in the visible
# part of the file. The '^'-anchored entries only match at the start of
# later lines when the search is made with re.MULTILINE - confirm the caller.
CUI_PATTERNS = [
# CIF at start of line (definitely vendor)
(r'^CIF\s*:?\s*(R[O0]?\d{6,10})', 0.98),
(r'^CIF\s*:?\s*(\d{6,10})', 0.97),
(r'^C[I1]F\s*:?\s*(R[O0]?\d{6,10})', 0.95),
(r'^C[I1]F\s*:?\s*(\d{6,10})', 0.94),
# CIF not preceded by CLIENT (negative lookbehind)
(r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(R[O0]?\d{6,10})', 0.95),
(r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(\d{6,10})', 0.94),
# Standalone CIF with word boundary
(r'\bC[I1]F\s*:?\s*(R[O0]?\d{6,10})\b', 0.90),
(r'\bC[I1]F\s*:?\s*(\d{6,10})\b', 0.89),
# COD FISCAL (vendor)
(r'COD\s+FISCAL\s*:?\s*(R[O0]?\d{6,10})', 0.90),
(r'COD\s+FISCAL\s*:?\s*(\d{6,10})', 0.89),
# C. I. F. format with SPACES (OCR artifact)
(r'C\.\s*I\.\s*F\.?\s*[:\s]+(R[O0]?\d{6,10})', 0.92),
(r'C\.\s*I\.\s*F\.?\s*[:\s]+(\d{6,10})', 0.91),
# C.I.F. format (with dots, no spaces)
(r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.88),
(r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(\d{6,10})', 0.87),
# CUI format
(r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.85),
(r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(\d{6,10})', 0.84),
# Lidl format: "Cod Identificare fiscala" (OCR corrupted)
(r'[IC](?:od|ed)\s*Identific[a-z/]*\s*(R[O0]\d{6,10})', 0.90),
# Generic: anything with "fiscal" followed by RO + digits
(r'fiscal[a-z]*\s*:?\s*(R[O0]\d{6,10})', 0.85),
]
# CUI REVERSED format (number BEFORE label): the code sits on the line
# directly above the "C.I.F." label.
CUI_REVERSED_PATTERNS = [
(r'(R[O0]\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.98),
(r'(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.95),
]
# Items count patterns - NR POZ ART IN BON (number of item lines on the receipt)
# Entries are (regex, confidence); capture group 1 is the count.
# NOTE(review): the consumer of this table is not in the visible part of the file.
ITEMS_COUNT_PATTERNS = [
(r'NR\.?\s*P?[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*(\d+)', 0.98),
(r'(\d{1,2})\s*\n\s*[O0]Z\.?\s*ART', 0.95),
(r'[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.93),
(r'NR\.?\s*(?:P?[O0]Z\.?)?\s*ART(?:ICOLE)?\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.90),
(r'ARTIC[O0]LE\s*:?\s*(\d+)', 0.88),
(r'(?:^|\s|:)P?[O0]Z\.?\s*(?:ART)?\.?\s*(?:IN\s+BON)?\s*:?\s*(\d{1,3})(?:\s|$)', 0.85),
]
# Series patterns - Romanian fiscal receipt series
# Entries are (regex, confidence); capture group 1 is the series value.
# NOTE(review): the consumer of this table is not in the visible part of the file.
SERIES_PATTERNS = [
(r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90),
(r'(?:^|\s)Z\s*:\s*(\d{4})', 0.85),
(r'(?:^|\s)BF\s*:\s*(\d{4})', 0.85),
]
# -------------------------------------------------------------------------
# Extraction methods - override in subclasses as needed
# -------------------------------------------------------------------------
def extract_tva_entries(self, text: str) -> List[dict]:
    """
    Collect the TVA (VAT) lines of a receipt - generic implementation.

    The text is upper-cased, decimal separators followed by stray OCR
    whitespace are tightened ("55, 22" -> "55.22"), and the format-specific
    helpers are run in this order: inline (Lidl), reversed (Stepout),
    multiline (Brick/Electrobering), table (OMV). The standard/fallback
    helper is consulted only when none of those produced a candidate.

    Args:
        text: Raw OCR text from receipt

    Returns:
        List of dicts with keys: code, percent, amount. At most one entry
        per (code, percent) pair, first occurrence kept. Empty for non-VAT
        payers (class flag or marker found in the text).
    """
    upper_text = text.upper()

    # Non-VAT payers never get TVA entries.
    if self.IS_NON_VAT_PAYER or self._is_non_vat_payer(upper_text):
        return []

    # Close the gap OCR leaves after a decimal separator.
    cleaned = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', upper_text)
    cleaned_lines = cleaned.split('\n')

    found = [
        *self._try_tva_inline(cleaned),
        *self._try_tva_reversed(cleaned, cleaned_lines),
        *self._try_tva_multiline(cleaned, cleaned_lines),
        *self._try_tva_table(cleaned),
    ]
    if not found:
        found.extend(self._try_tva_standard(cleaned))

    # Keep the first usable entry for every (code, percent) pair.
    unique_entries = []
    taken_keys = set()
    for candidate in found:
        pair = (candidate.get('code', 'A'), candidate.get('percent', 19))
        if pair in taken_keys:
            continue
        if not (candidate.get('amount') and candidate['amount'] > 0):
            continue
        taken_keys.add(pair)
        unique_entries.append(candidate)
    return unique_entries
def _is_non_vat_payer(self, text: str) -> bool:
"""Check if receipt is from non-VAT payer."""
for pattern in self.NON_VAT_PATTERNS:
if re.search(pattern, text, re.IGNORECASE):
return True
return False
def _try_tva_inline(self, text: str) -> List[dict]:
"""Try Lidl-style inline format: 'TVA A 21,00% 7.71'"""
entries = []
# Pattern: "TVA A 21,00% 7.71" or "TVA B 11,00% 2.13"
for pattern, confidence, fmt in self.TVA_PATTERNS:
if fmt != 'inline':
continue
for match in re.finditer(pattern, text, re.IGNORECASE):
try:
groups = match.groups()
if len(groups) >= 3:
code = groups[0].upper() if groups[0] else 'A'
percent = int(groups[1])
amount = self._parse_decimal(self._clean_ocr_number(groups[2]))
if amount and amount > 0:
entries.append({
'code': code,
'percent': percent,
'amount': amount
})
except (ValueError, InvalidOperation, IndexError):
continue
return entries
def _try_tva_reversed(self, text: str, lines: List[str]) -> List[dict]:
"""Try Stepout-style reversed format: '5.00% TUA*B 2.00' (rate BEFORE TVA marker)"""
entries = []
# Pattern: "5.00% TUA*B 2.00" - procent BEFORE TVA, amount same line or next
for i, line in enumerate(lines):
# Try pattern with amount on SAME line: "5.00% TUA*B 2.00"
match = re.search(
r'(\d{1,2})[.,]?\d{0,2}\s*%\s*T[UV][AR]\s*\*?\s*([A-D])?\s+([\d\s.,]+)',
line, re.IGNORECASE
)
if match:
try:
percent = int(match.group(1))
code = match.group(2).upper() if match.group(2) else 'A'
amount_str = match.group(3).strip()
amount = self._parse_decimal(amount_str)
if amount and amount > 0:
entries.append({
'code': code,
'percent': percent,
'amount': amount
})
continue # Check for more entries
except (ValueError, InvalidOperation, IndexError):
pass
# Fallback: amount on NEXT line
match = re.search(r'(\d{1,2})[.,]?\d{0,2}\s*%\s*T[UV][AR]\s*\*?\s*([A-D])?$', line, re.IGNORECASE)
if match:
try:
percent = int(match.group(1))
code = match.group(2).upper() if match.group(2) else 'A'
if i + 1 < len(lines):
amount_str = lines[i + 1].strip()
amount = self._parse_decimal(amount_str)
if amount and amount > 0:
entries.append({
'code': code,
'percent': percent,
'amount': amount
})
except (ValueError, InvalidOperation, IndexError):
continue
return entries
def _try_tva_multiline(self, text: str, lines: List[str]) -> List[dict]:
"""Try multiline format: 'TOTAL TVA A - 19%' + amount on next line"""
entries = []
# Pattern: "TOTAL TVA A - 19%" or "TOTAL TVA A 19%" on one line, amount on next
multiline_patterns = [
r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%',
r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%',
r'TOTAL\s+T[VU][AR]\s*([A-D])?\s*[-:]?\s*(\d{1,2})\s*%',
r'O?TAL\s+[IT]VA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%',
]
for i, line in enumerate(lines):
for pattern in multiline_patterns:
match = re.search(pattern, line, re.IGNORECASE)
if match:
try:
code = match.group(1).upper() if match.group(1) else 'A'
percent = int(match.group(2))
# Amount is on next line
if i + 1 < len(lines):
amount_str = lines[i + 1].strip()
amount = self._parse_decimal(amount_str)
if amount and amount > 0:
entries.append({
'code': code,
'percent': percent,
'amount': amount
})
return entries
except (ValueError, InvalidOperation, IndexError):
continue
return entries
def _try_tva_table(self, text: str) -> List[dict]:
"""Try OMV-style table format: 'A-21,00% 285,66 49,58'"""
entries = []
# Pattern: "A-21,00% 285,66 49,58" (code-percent base_amount tva_amount)
table_pattern = r'([A-D])\s*[-:]\s*(\d{1,2})[.,\s]*\d{0,2}\s*%\s+([\d.,\s]+)\s+([\d.,\s]+)'
for match in re.finditer(table_pattern, text, re.IGNORECASE):
try:
code = match.group(1).upper()
percent = int(match.group(2))
# Group 4 is the TVA amount (last column in table)
tva_amount_str = self._clean_ocr_number(match.group(4))
tva_amount = self._parse_decimal(tva_amount_str)
if tva_amount and tva_amount > 0:
entries.append({
'code': code,
'percent': percent,
'amount': tva_amount
})
except (ValueError, InvalidOperation, IndexError):
continue
# Fallback: "TOTAL TAXE: 55,22"
if not entries:
taxe_match = re.search(r'TOTAL\s+TAXE\s*:?\s*([\d.,\s]+)', text, re.IGNORECASE)
if taxe_match:
try:
amount_str = self._clean_ocr_number(taxe_match.group(1))
amount = self._parse_decimal(amount_str)
if amount and amount > 0:
entries.append({
'code': 'A',
'percent': 19, # Default rate
'amount': amount
})
except (ValueError, InvalidOperation):
pass
return entries
def _try_tva_standard(self, text: str) -> List[dict]:
"""Try standard TVA patterns as fallback"""
entries = []
standard_fmts = ['standard', 'bon', 'percent', 'coded', 'fallback', 'books']
for pattern, confidence, fmt in self.TVA_PATTERNS:
if fmt not in standard_fmts:
continue
match = re.search(pattern, text, re.IGNORECASE)
if match:
try:
groups = match.groups()
if len(groups) >= 2:
# Could be (percent, amount) or (code, percent, amount)
if groups[0] and groups[0].isalpha():
code = groups[0].upper()
percent = int(groups[1]) if len(groups) > 1 else 19
amount_str = groups[2] if len(groups) > 2 else None
else:
code = 'A'
percent = int(groups[0]) if groups[0] and groups[0].isdigit() else 19
amount_str = groups[1] if len(groups) > 1 else groups[0]
if amount_str:
amount = self._parse_decimal(self._clean_ocr_number(amount_str))
if amount and amount > 0:
entries.append({
'code': code,
'percent': percent,
'amount': amount
})
return entries
elif len(groups) == 1:
# Just amount
amount = self._parse_decimal(self._clean_ocr_number(groups[0]))
if amount and amount > 0:
entries.append({
'code': 'A',
'percent': 19,
'amount': amount
})
return entries
except (ValueError, InvalidOperation, IndexError):
continue
return entries
def _clean_ocr_number(self, value: str) -> str:
"""Remove OCR spaces from numbers (e.g., '55, 22' -> '55,22')."""
if not value:
return ""
value = re.sub(r'\s*([.,])\s*', r'\1', value)
value = value.replace(' ', '')
return value
def extract_total(self, text: str) -> Tuple[Optional[Decimal], float]:
    """
    Extract total amount from receipt text.

    Supports both single-line and multiline formats:
    - Single line: "TOTAL: 78.00", "SUMA TOTALA: 78.00"
    - Multiline: "SUMA\\nTOTALA:\\n78.00" (common in thermal receipts)

    An amount is accepted only when it is greater than 0 and strictly below
    MAX_PAYMENT.

    Args:
        text: Raw OCR text from receipt

    Returns:
        Tuple of (amount, confidence) or (None, 0.0)
    """
    text_upper = text.upper()
    lines = text_upper.split('\n')

    def is_plausible(amount: Optional[Decimal]) -> bool:
        # None and zero are rejected together with out-of-range values.
        return bool(amount) and 0 < amount < self.MAX_PAYMENT

    # =====================================================================
    # STRATEGY 1: Multiline "SUMA TOTALA" pattern (thermal receipts)
    # Format: SUMA on one line, TOTALA: on next, amount on third
    # =====================================================================
    for i, line in enumerate(lines):
        line_clean = line.strip()
        # "SUMA" keyword. NOTE: this regex accepts SUMA, SUMĂ and "SUM "
        # only; other OCR misreads of the word are not matched.
        if re.search(r'S[UU]M[AĂ\s]', line_clean):
            # Look at this line and the next 3 for "TOTALA" and the amount
            for j in range(i, min(i + 4, len(lines))):
                check_line = lines[j].strip()
                # "TOTALA:" or "TOTALA -" followed by the amount
                match = re.search(r'T[O0]TALA\s*[:\-]\s*([\d\s.,]+)', check_line)
                if match:
                    amount = self._parse_decimal(self._clean_ocr_number(match.group(1)))
                    if is_plausible(amount):
                        return (amount, 0.98)
                # "TOTALA" closing its line, amount on the next line
                if re.search(r'T[O0]TALA\s*[:\-]?\s*$', check_line):
                    if j + 1 < len(lines):
                        amount = self._parse_decimal(lines[j + 1].strip())
                        if is_plausible(amount):
                            return (amount, 0.97)
        # "SUMA TOTALA" on a single line with the amount
        match = re.search(r'S[UU]M[AĂ]\s+T[O0]TALA\s*[:\-]\s*([\d\s.,]+)', line_clean)
        if match:
            amount = self._parse_decimal(self._clean_ocr_number(match.group(1)))
            if is_plausible(amount):
                return (amount, 0.98)
        # "SUMA TOTALA" without amount, amount on the next line
        if re.search(r'S[UU]M[AĂ]\s+T[O0]TALA\s*[:\-]?\s*$', line_clean):
            if i + 1 < len(lines):
                amount = self._parse_decimal(lines[i + 1].strip())
                if is_plausible(amount):
                    return (amount, 0.96)

    # =====================================================================
    # STRATEGY 2: Standard single-line patterns
    # =====================================================================
    for pattern, confidence in self.TOTAL_PATTERNS:
        match = re.search(pattern, text_upper)
        if not match:
            continue
        captured = match.group(1)
        amount = self._parse_decimal(captured)
        if amount is None:
            # The amount character class includes whitespace, so the capture
            # can run into digits at the start of the following line
            # ("78.00\n19% ..."). Such a capture is not a valid number;
            # retry with its first line only.
            first_line = captured.strip().split('\n', 1)[0]
            amount = self._parse_decimal(first_line)
        if is_plausible(amount):
            return (amount, confidence)
    return (None, 0.0)
def extract_date(self, text: str) -> Tuple[Optional[date], float]:
    """
    Find the receipt date in the text.

    DATE_PATTERNS are tried first, with the captured string handed to
    _parse_date(). If none of them yields a date, DATE_PATTERNS_OCR_SPACES
    are tried; their three numeric groups are assembled directly, and a
    combination outside day 1-31, month 1-12, year 2000-2100 - or one that
    is not a real calendar date - is skipped.

    Args:
        text: Raw OCR text from receipt

    Returns:
        Tuple of (date, confidence) or (None, 0.0)
    """
    haystack = text.upper()

    # Well-formed dates first
    for regex, score in self.DATE_PATTERNS:
        hit = re.search(regex, haystack)
        if hit is None:
            continue
        candidate = self._parse_date(hit.group(1))
        if candidate:
            return (candidate, score)

    # Dates broken up by OCR-inserted spaces
    for regex, score, layout in self.DATE_PATTERNS_OCR_SPACES:
        hit = re.search(regex, haystack)
        if hit is None:
            continue
        try:
            first, middle, last = (int(hit.group(n)) for n in (1, 2, 3))
            if layout == 'ymd':
                year, month, day = first, middle, last
            else:  # dmy
                day, month, year = first, middle, last
            if 1 <= day <= 31 and 1 <= month <= 12 and 2000 <= year <= 2100:
                return (date(year, month, day), score)
        except (ValueError, TypeError):
            continue
    return (None, 0.0)
def extract_receipt_number(self, text: str) -> Tuple[Optional[str], float]:
    """
    Find the receipt number in the text.

    NUMBER_PATTERNS are tried in order against the upper-cased text; a
    captured number shorter than 3 characters is ignored and the search
    moves on to the next pattern.

    Args:
        text: Raw OCR text from receipt

    Returns:
        Tuple of (number, confidence) or (None, 0.0)
    """
    haystack = text.upper()
    for regex, score in self.NUMBER_PATTERNS:
        hit = re.search(regex, haystack)
        if hit is None:
            continue
        digits = hit.group(1).strip()
        if len(digits) >= 3:
            return (digits, score)
    return (None, 0.0)
def extract_payment_methods(self, text: str) -> List[dict]:
    """
    Extract payment methods (CARD/NUMERAR) from receipt.

    Stages, each returning as soon as it has found something:
    0. Multiline keywords: a line that is exactly "CARD" or "NUMERAR"
       followed by a line holding only an amount (thermal receipts)
    1. PAYMENT_PATTERNS with explicit amounts (supports split payments)
    2. Keyword-only detection: the amount is inferred from the receipt
       total minus the REST (change) amount
    3. Fallback for fiscal receipts without explicit payment: CARD for the
       full total, with low confidence

    Amounts must be greater than 0 and strictly below MAX_PAYMENT.

    Args:
        text: Raw OCR text from receipt

    Returns:
        List of dicts: [{'method': 'CARD'/'NUMERAR', 'amount': Decimal, 'confidence': float}]
    """
    text_upper = text.upper()
    lines = text_upper.split('\n')
    methods: List[dict] = []
    seen_entries = set()

    def is_plausible(amount: Optional[Decimal]) -> bool:
        # None and zero are rejected together with out-of-range values.
        return bool(amount) and 0 < amount < self.MAX_PAYMENT

    def record(method: str, amount: Decimal, confidence: float) -> None:
        # Several patterns tend to hit the same payment; keep each
        # (method, amount) pair once.
        entry_key = (method, amount)
        if entry_key in seen_entries:
            return
        seen_entries.add(entry_key)
        methods.append({
            'method': method,
            'amount': amount,
            'confidence': confidence
        })

    # =====================================================================
    # STEP 0: Try MULTILINE patterns first (thermal receipts)
    # Format: keyword alone on one line, amount alone on the next line
    # =====================================================================
    for i, line in enumerate(lines):
        keyword = line.strip()
        # Exact match keeps MASTERCARD & co. out of this step.
        if keyword not in ('CARD', 'NUMERAR'):
            continue
        if i + 1 >= len(lines):
            continue
        next_line = lines[i + 1].strip()
        # Must be a bare amount (not another keyword)
        if not re.match(r'^[\d\s.,]+$', next_line):
            continue
        amount = self._parse_decimal(next_line)
        if is_plausible(amount):
            record(keyword, amount, 0.95)
    if methods:
        return methods

    # =====================================================================
    # STEP 1: Try pattern-based extraction with explicit amounts
    # =====================================================================
    for pattern, method, confidence in self.PAYMENT_PATTERNS:
        for match in re.finditer(pattern, text_upper):
            try:
                amount = self._parse_decimal(match.group(1))
            except (ValueError, InvalidOperation):
                continue
            if is_plausible(amount):
                record(method, amount, confidence)
    if methods:
        return methods

    # =====================================================================
    # STEP 2: Keyword-only detection with REST logic
    # =====================================================================
    total_amount, _ = self.extract_total(text)
    if not total_amount:
        return []

    card_keywords = ('CARD', 'CARTE CREDIT', 'VISA', 'MASTERCARD', 'DEBIT', 'CREDIT', 'CONTACTLESS')
    has_card = any(kw in text_upper for kw in card_keywords)
    has_numerar = any(kw in text_upper for kw in ('NUMERAR', 'CASH'))

    # Find REST (change) amount. REST must not be embedded in a longer word:
    # a plain substring test also fires on "RESTAURANT", which made the line
    # after the venue name get parsed as the change amount.
    rest_amount = Decimal('0')
    for i, line in enumerate(lines):
        if not re.search(r'(?<![A-Z])REST(?![A-Z])', line):
            continue
        # REST on same line: "REST 0.00" or "REST: 0.00"
        match = re.search(r'(?<![A-Z])REST\s*:?\s*([\d.,]+)', line)
        if match:
            rest_amount = self._parse_decimal(match.group(1)) or Decimal('0')
        elif i + 1 < len(lines):
            # REST on separate line
            rest_amount = self._parse_decimal(lines[i + 1].strip()) or Decimal('0')
        break  # only the first REST line counts

    # Calculate payment amounts
    if has_card:
        card_amount = total_amount - rest_amount
        if card_amount > 0:
            methods.append({
                'method': 'CARD',
                'amount': card_amount,
                'confidence': 0.90
            })
    if has_numerar:
        if has_card and rest_amount > 0:
            # Mixed payment: NUMERAR is the change given back
            methods.append({
                'method': 'NUMERAR',
                'amount': rest_amount,
                'confidence': 0.85
            })
        elif not has_card:
            # Cash only
            methods.append({
                'method': 'NUMERAR',
                'amount': total_amount,
                'confidence': 0.90
            })

    # =====================================================================
    # STEP 3: Fallback for fiscal receipts without explicit payment
    # =====================================================================
    if not methods and total_amount > 0 and 'FISCAL' in text_upper:
        # Default to CARD for business purchases (most common)
        methods.append({
            'method': 'CARD',
            'amount': total_amount,
            'confidence': 0.70
        })
    return methods
def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
    """
    Extract client CUI from B2B receipts.

    Nothing is extracted unless one of CLIENT_MARKERS is present. The
    CLIENT_CUI_PATTERNS are then tried in list order; the first capture
    that reduces to 6-10 digits is returned.

    The country prefix is dropped before the digits are collected, including
    the OCR misreading "R0" for "RO". Without that step the zero of the
    misread prefix would end up as a leading digit of the CUI
    ("R01879855" -> "01879855" instead of "1879855").

    Args:
        text: Raw OCR text from receipt

    Returns:
        Tuple of (cui, confidence) or (None, 0.0); cui holds digits only.
    """
    text_upper = text.upper()

    # First check if there's a CLIENT section
    has_client_section = any(
        re.search(marker, text_upper, re.IGNORECASE)
        for marker in self.CLIENT_MARKERS
    )
    if not has_client_section:
        return (None, 0.0)

    # Try to extract CUI
    for pattern, confidence in self.CLIENT_CUI_PATTERNS:
        match = re.search(pattern, text_upper, re.IGNORECASE | re.MULTILINE)
        if not match:
            continue
        captured = match.group(1).strip()
        # Normalize: remove RO prefix (or its OCR variant R0) for storage
        without_prefix = re.sub(r'^R\s*[O0]\s*', '', captured)
        cui_digits = re.sub(r'[^0-9]', '', without_prefix)
        if 6 <= len(cui_digits) <= 10:
            return (cui_digits, confidence)
    return (None, 0.0)
def extract_client_name(self, text: str) -> Tuple[Optional[str], float]:
    """
    Extract client/buyer company name from B2B receipts.

    The first line matching one of CLIENT_MARKERS anchors the search. The
    name is then looked for, in this order:
    1. after the first ':' on the marker line
    2. on the line directly below
    3. on any of the next four lines that carries a company indicator
    Lines that look like a CUI/CIF label or a bare RO code are not names.

    Args:
        text: Raw OCR text from receipt

    Returns:
        Tuple of (client_name, confidence) or (None, 0.0)
    """
    cui_label_rx = r'C\.?\s*[UI]\.?\s*[IF]\.?'
    bare_cui_rx = r'^R[O0]?\d{6,10}$'

    def has_company_indicator(upper_value: str) -> bool:
        return any(re.search(ind, upper_value) for ind in self.COMPANY_INDICATORS)

    rows = text.split('\n')

    # Locate the CLIENT section
    anchor = None
    for idx, row in enumerate(rows):
        probe = row.upper().strip()
        if any(re.search(marker, probe, re.IGNORECASE) for marker in self.CLIENT_MARKERS):
            anchor = idx
            break
    if anchor is None:
        return (None, 0.0)

    # Strategy 1: name on the marker line, after ":"
    marker_row = rows[anchor].strip()
    if ':' in marker_row:
        tail = marker_row.split(':', 1)[1].strip()
        # A tail that is just a CUI (RO followed by digits) is not a name
        if tail and len(tail) >= 3 and not re.match(bare_cui_rx, tail.upper()):
            if has_company_indicator(tail.upper()):
                return (self._clean_company_name(tail), 0.95)
            if len(tail) >= 5 and not tail.isdigit():
                return (self._clean_company_name(tail), 0.80)

    # Strategy 2: name on the line right after the marker
    if anchor + 1 < len(rows):
        below = rows[anchor + 1].strip()
        below_upper = below.upper()
        is_cui_line = bool(re.search(cui_label_rx, below_upper)) or bool(re.match(bare_cui_rx, below_upper))
        if not is_cui_line:
            if has_company_indicator(below_upper):
                return (self._clean_company_name(below), 0.90)
            if len(below) >= 5 and not below.isdigit():
                # Reject lines that mention CUI/CIF/COD keywords
                if not any(kw in below_upper for kw in ['CUI', 'CIF', 'COD', 'FISCAL']):
                    return (self._clean_company_name(below), 0.75)

    # Strategy 3: any nearby line with a company indicator
    for idx in range(anchor + 1, min(anchor + 5, len(rows))):
        candidate = rows[idx].strip()
        candidate_upper = candidate.upper()
        if re.search(cui_label_rx, candidate_upper):
            continue
        if re.match(bare_cui_rx, candidate_upper):
            continue
        if has_company_indicator(candidate_upper):
            return (self._clean_company_name(candidate), 0.85)
    return (None, 0.0)
@staticmethod
def _clean_company_name(name: str) -> str:
"""Clean company name for storage."""
if not name:
return ""
# Remove extra whitespace
name = re.sub(r'\s+', ' ', name).strip()
# Remove trailing punctuation except periods in S.R.L., S.A., etc.
name = re.sub(r'[,;:]+$', '', name).strip()
return name
# -------------------------------------------------------------------------
# Validation hints - override to customize validation behavior
# -------------------------------------------------------------------------
def get_validation_hints(self) -> Dict[str, Any]:
    """
    Provide store-specific hints for the validation step.

    The base implementation supplies no hints; subclasses override this
    to return the keys that apply to their receipts.

    Returns:
        Dict of validation hints. Commonly used keys:
        - has_multi_rate_tva: bool - Store uses multiple TVA rates
        - card_equals_total: bool - CARD payment equals total
        - has_client_cui: bool - Receipt includes client CUI
        - has_efactura: bool - Store uses e-factura format
        - is_non_vat_payer: bool - Store is not a VAT payer
    """
    # A fresh dict on every call, so callers may mutate the result safely.
    return dict()
# -------------------------------------------------------------------------
# Helper methods - available to all subclasses
# -------------------------------------------------------------------------
@staticmethod
def _normalize_number(text: str) -> str:
"""
Normalize a number string for Decimal conversion.
Handles Romanian formats: "1.234,56" -> "1234.56"
"""
if not text:
return "0"
# Remove spaces
text = text.replace(" ", "")
# Determine decimal separator
last_comma = text.rfind(",")
last_dot = text.rfind(".")
if last_comma > last_dot:
text = text.replace(".", "").replace(",", ".")
elif last_dot > last_comma:
text = text.replace(",", "")
else:
text = text.replace(",", ".")
return text
@staticmethod
def _parse_decimal(text: str) -> Optional[Decimal]:
"""Parse a string to Decimal, handling various formats."""
try:
normalized = BaseStoreProfile._normalize_number(text)
return Decimal(normalized)
except (InvalidOperation, ValueError, TypeError):
return None
@staticmethod
def _parse_date(text: str) -> Optional[date]:
"""
Parse date string in various formats.
Supports: DD-MM-YYYY, DD/MM/YYYY, DD.MM.YYYY, YYYY-MM-DD
"""
if not text:
return None
# Normalize separators
text = text.replace('/', '-').replace('.', '-')
try:
parts = text.split('-')
if len(parts) != 3:
return None
# Determine format based on first part length
if len(parts[0]) == 4:
# YYYY-MM-DD
year, month, day = int(parts[0]), int(parts[1]), int(parts[2])
else:
# DD-MM-YYYY
day, month, year = int(parts[0]), int(parts[1]), int(parts[2])
# Validate ranges
if 1 <= day <= 31 and 1 <= month <= 12 and 2000 <= year <= 2100:
return date(year, month, day)
except (ValueError, TypeError, IndexError):
pass
return None
@staticmethod
def _clean_text(text: str) -> str:
"""Clean OCR text for pattern matching."""
if not text:
return ""
text = re.sub(r'\s+', ' ', text)
text = re.sub(r'[\x00-\x09\x0b\x0c\x0e-\x1f\x7f]', '', text)
return text.strip()
# -------------------------------------------------------------------------
# Magic methods
# -------------------------------------------------------------------------
def __repr__(self) -> str:
return f"<{self.__class__.__name__} CUI={self.CUI_LIST}>"
def __str__(self) -> str:
return f"{self.STORE_NAME} ({', '.join(self.CUI_LIST)})"