ocr extract
This commit is contained in:
@@ -50,6 +50,9 @@ class BaseStoreProfile(ABC):
|
||||
# Store display name
|
||||
STORE_NAME: str = "Unknown Store"
|
||||
|
||||
# Flag for known non-VAT payer stores (skips TVA extraction)
|
||||
IS_NON_VAT_PAYER: bool = False
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Generic patterns - can be overridden in subclasses
|
||||
# -------------------------------------------------------------------------
|
||||
@@ -100,18 +103,33 @@ class BaseStoreProfile(ABC):
|
||||
]
|
||||
|
||||
# Payment method patterns (pattern, method_type, confidence)
|
||||
# Handles ALL payment types: CARD, NUMERAR, and card brand names
|
||||
PAYMENT_PATTERNS = [
|
||||
# CARTE CREDIT variants (OMV/Petrom/Socar receipts)
|
||||
(r'CARTE\s+CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.98),
|
||||
(r'CARTE\s+CREDIT\s*:?\s*\n\s*([\d\s.,]+)', 'CARD', 0.97),
|
||||
(r'CARTE\s+DE\s+CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.98),
|
||||
(r'CARTE\s+DE\s+CREDIT\s*:?\s*\n\s*([\d\s.,]+)', 'CARD', 0.97),
|
||||
# CARD standard
|
||||
(r'(?:PLATA\s+)?CARD\s*[:\sA-Z]?\s*([\d\s.,]+)', 'CARD', 0.95),
|
||||
# Card brand names
|
||||
(r'VISA\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
|
||||
(r'MASTERCARD\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
|
||||
(r'MAESTR[O0]\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
|
||||
(r'CONTACTLESS\s*:?\s*([\d\s.,]+)', 'CARD', 0.90),
|
||||
(r'DEBIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.90),
|
||||
(r'CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.88),
|
||||
# Cash variants
|
||||
(r'NUMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.95),
|
||||
(r'CASH\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.90),
|
||||
# Truncation recovery patterns (for OCR left-margin issues)
|
||||
(r'(?:^|\n|\s)RD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.70),
|
||||
(r'(?:^|\n|\s)ARD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.75),
|
||||
(r'(?:^|\n|\s)MERAR\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'NUMERAR', 0.70),
|
||||
]
|
||||
|
||||
# Client section markers (for B2B receipts) - More flexible patterns
|
||||
# Includes OCR corruption variants (LIENT, C IENT, L IENT)
|
||||
CLIENT_MARKERS = [
|
||||
r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT', # "CIF CLIENT" (with or without colon)
|
||||
r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT', # "CUI CLIENT"
|
||||
@@ -121,24 +139,62 @@ class BaseStoreProfile(ABC):
|
||||
r'BENEFICIAR\s*:', # "BENEFICIAR:"
|
||||
r'CUMP[AĂ]R[AĂ]TOR', # "CUMPARATOR" without colon
|
||||
r'COD\s+FISCAL\s+CLIENT', # "COD FISCAL CLIENT"
|
||||
# OCR corruption patterns
|
||||
r'C[I1]F\s+[A-Z\s]{0,6}IENT\s*:', # "CIF a IENT:", "CIF CL IENT:", "CIF L IENT:"
|
||||
r'C[I1]F\s+LIENT\s*:', # "CIF LIENT:" (missing C)
|
||||
r'LIENT\s*:', # "LIENT:" (missing C and I/L)
|
||||
# Brick-specific (I→L OCR error)
|
||||
r'CLIENT\s+C\.?U\.?[LI1]\.?\s*/', # "CLIENT C.U.L./" (I read as L)
|
||||
]
|
||||
|
||||
# Client CUI patterns (pattern, confidence) - More flexible
|
||||
# Client CUI patterns (pattern, confidence) - Comprehensive
|
||||
# Handles: docTR reordering, doubled letters, corruption, CUMPARATOR, Brick L/I swap
|
||||
CLIENT_CUI_PATTERNS = [
|
||||
# "CIF CLIENT:XXXXXXX" or "CIF CLIENT: ROXXXXXXX" - most common format
|
||||
(r'C\.?[I1]\.?F\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.99),
|
||||
# "CLIENT CIF: XXXXXXX"
|
||||
(r'CLIENT\s+C\.?[I1]\.?F\.?\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
|
||||
# "CUI CLIENT: XXXXXXX"
|
||||
(r'C\.?U\.?[I1]\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
|
||||
# "ROXXXXXXX" followed by CLIENT marker
|
||||
(r'(R[O0]\d{6,10})\s*\n\s*CLIENT', 0.97),
|
||||
# "C.I.F. CLIENT: XXXXXXX"
|
||||
(r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.96),
|
||||
# "CLIENT: ROXXXXXXX" or "CLIENT: XXXXXXX"
|
||||
# === CUI on line BEFORE CLIENT marker (docTR/OCR reordering) ===
|
||||
(r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*U\.?\s*[I1]\.?', 0.99),
|
||||
(r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*[I1]\.?\s*F\.?', 0.99),
|
||||
(r'(R[O0]\d{6,10})\s*:?\s*\n\s*CLIENT', 0.98),
|
||||
# === "CIF I CLIENT:" format (OCR extra chars) ===
|
||||
(r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]\d{6,10})', 0.98),
|
||||
(r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.97),
|
||||
# === CIF CLIENT: (reversed - CIF before CLIENT) ===
|
||||
(r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
|
||||
(r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
|
||||
(r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
|
||||
(r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
|
||||
# === CLIENT C.U.I/C.I.F. (slash variants) ===
|
||||
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.97),
|
||||
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.97),
|
||||
# === Doubled letters (docTR artifact: "C.U U.I") ===
|
||||
(r'CLIENT\s+C\.?\s*U\.?\s*U?\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
|
||||
(r'CLIENT\s+C\.?\s*U\.?\s*U?\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
|
||||
# === CLIENT C.U.I. or CLIENT CUI (without slash) ===
|
||||
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
|
||||
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
|
||||
(r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
|
||||
(r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
|
||||
# === Corrupted CLIENT after CIF (OCR errors) ===
|
||||
(r'CIF\s+[a-zA-Z\s]{2,8}IENT\s*:?\s*(R[O0]?\d{6,10})', 0.93),
|
||||
(r'CIF\s+[a-zA-Z\s]{2,8}IENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
|
||||
(r'CIF\s+LIENT\s*:?\s*(R[O0]?\d{6,10})', 0.92),
|
||||
(r'CIF\s+LIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.92),
|
||||
# === CUMPARATOR variants ===
|
||||
(r'CUMPARATOR\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
|
||||
(r'CUMPARATOR\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
|
||||
# CUMPARATOR with CUI/CIF on next line
|
||||
(r'CUMPARATOR\s*:.*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
|
||||
(r'CUMPARATOR\s*:.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
|
||||
# CUMPARATOR with CUI/CIF two lines down
|
||||
(r'CUMPARATOR\s*:.*\n.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
|
||||
# === CLIENT on next line ===
|
||||
(r'CLIENT\s*:\s*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
|
||||
(r'CLIENT\s*:.*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
|
||||
# === Standard fallback patterns ===
|
||||
(r'CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.90),
|
||||
# "COD FISCAL CLIENT: XXXXXXX"
|
||||
(r'COD\s+FISCAL\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.95),
|
||||
# === Brick-specific (I→L OCR error) ===
|
||||
# Matches: "CLIENT C.U.L./C.IF. :R01879855"
|
||||
(r'CLIENT\s+C\.?U\.?[LI1]\.?\s*/\s*C\.?[LI1]\.?F\.?\s*:?\s*(R?O?\d{6,10})', 0.99),
|
||||
]
|
||||
|
||||
# Company type indicators (for identifying company names)
|
||||
@@ -158,15 +214,133 @@ class BaseStoreProfile(ABC):
|
||||
# Maximum reasonable payment amount (to filter OCR errors)
|
||||
MAX_PAYMENT = Decimal('100000')
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# TVA (VAT) patterns - ALL FORMATS unified
|
||||
# OCR tolerant: T[VU][AR] matches TVA, TUA, TVR
|
||||
# -------------------------------------------------------------------------
|
||||
TVA_PATTERNS = [
|
||||
# === FORMAT 1: INLINE with code and percent (Lidl-style) ===
|
||||
# "TVA A 21,00% 7.71" or "TVA B 11,00% 2.13"
|
||||
(r'T[VU][AR]\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)', 0.98, 'inline'),
|
||||
(r'T[VU][AR]\s+([A-D])\s+\\?(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)', 0.97, 'inline'),
|
||||
(r'IVA\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)', 0.95, 'inline'),
|
||||
|
||||
# === FORMAT 2: REVERSED (Stepout-style) ===
|
||||
# "5.00% TUA*B" - percent BEFORE the TVA marker
|
||||
(r'(\d{1,2})[.,]\d{0,2}\s*%\s*T[UV]A\*?([A-D])', 0.97, 'reversed'),
|
||||
|
||||
# === FORMAT 3: TABLE (OMV-style) ===
|
||||
# "A-21,00% 285,66 49,58" (code-percent, base amount, VAT amount)
|
||||
(r'([A-D])\s*[-:]\s*(\d{1,2})[.,\s]*\d{0,2}\s*%\s+([\d.,\s]+)\s+([\d.,\s]+)', 0.96, 'table'),
|
||||
(r'TOTAL\s+TAXE\s*:?\s*([\d.,\s]+)', 0.95, 'taxe'),
|
||||
|
||||
# === FORMAT 4: MULTILINE (Brick/Electrobering) ===
|
||||
# "TOTAL TVA A - 19%" on one line, amount on the next
|
||||
(r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%', 0.96, 'multiline'),
|
||||
(r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%', 0.95, 'multiline'),
|
||||
(r'TOTAL\s+T[VU][AR]\s*([A-D])?\s*[-:]?\s*(\d{1,2})\s*%', 0.94, 'multiline'),
|
||||
|
||||
# === FORMAT 5: STANDARD (from the extractor) ===
|
||||
(r'TOTAL\s+(?:T[VU][AR]|IVA)\s+BON\s*:?\s*([\d\s.,]+)', 0.98, 'bon'),
|
||||
(r'T[O0]TAL\s+(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.95, 'standard'),
|
||||
(r'TOTAL\s+IVA\s*:?\s*([\d\s.,]+)', 0.95, 'standard'),
|
||||
(r'T[VU][AR]\s+(?:A\s*[-:]?\s*)?(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.95, 'percent'),
|
||||
(r'T[VU][AR]\s+[A-Z]\s*[-:]\s*(\d{1,2})\s*%\s*([\d\s.,]+)', 0.93, 'percent'),
|
||||
(r'T[VU][AR]\s*[C5]\s*[-:]\s*5\s*%\s*:?\s*([\d\s.,]+)', 0.93, 'books'),
|
||||
(r'(?:T[VU][AR]|IVA)\s+5\s*%\s*:?\s*([\d\s.,]+)', 0.92, 'books'),
|
||||
|
||||
# === FORMAT 6: CODED inline (with code A-D) ===
|
||||
(r'T[UV]A\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,\s]+)', 0.95, 'coded'),
|
||||
(r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,\s]+)', 0.93, 'coded'),
|
||||
|
||||
# === FALLBACK patterns ===
|
||||
(r'T[O0]T[AE]L\s+(?:T[VUAI]+[AR]?|IVA)\s*:?\s*([\d\s.,]+)', 0.88, 'fallback'),
|
||||
(r'TOTAL\s+TA\s*[F\s]?\s*\d*\s*:?\s*([\d\s.,]+)', 0.85, 'fallback'),
|
||||
(r'(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.85, 'fallback'),
|
||||
(r'TOTAL\s+T[UV]A\s*:?\s*([\d.,\s]+)', 0.90, 'standard'),
|
||||
]
|
||||
|
||||
# Non-VAT payer patterns - NEPLATITOR DE TVA
|
||||
# OCR variants: NEPLATTOR, NEPLATITOR, NEPLATOR, ANEPLATHTOR, MEPLATITOR
|
||||
NON_VAT_PATTERNS = [
|
||||
r'NEPLAT\w*OR', # NEPLATITOR, NEPLATTOR, NEPLATOR
|
||||
r'[ANM]EPLAT\w*O?R', # OCR errors: ANEPLATHTOR, MEPLATITOR
|
||||
r'TOTAL\s+NEPLAT', # TOTAL NEPLATITOR...
|
||||
r'TOTAL\s+[ANM]EPLAT', # TOTAL ANEPLAT... (OCR error)
|
||||
r'SCUTIT\s*(?:DE\s+)?T[VU]A', # SCUTIT DE TVA
|
||||
r'NEPLAT\w*\s+T[VU]A', # NEPLATITOR TVA
|
||||
r'NEPLAT\w*\s+DE\s+T', # NEPLATITOR DE T... (truncated)
|
||||
]
|
||||
|
||||
# CUI (fiscal code) patterns - VENDOR CUI (exclude CLIENT)
|
||||
# OCR errors: R0 instead of RO, C1F instead of CIF
|
||||
CUI_PATTERNS = [
|
||||
# CIF at start of line (definitely vendor)
|
||||
(r'^CIF\s*:?\s*(R[O0]?\d{6,10})', 0.98),
|
||||
(r'^CIF\s*:?\s*(\d{6,10})', 0.97),
|
||||
(r'^C[I1]F\s*:?\s*(R[O0]?\d{6,10})', 0.95),
|
||||
(r'^C[I1]F\s*:?\s*(\d{6,10})', 0.94),
|
||||
# CIF not preceded by CLIENT (negative lookbehind)
|
||||
(r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(R[O0]?\d{6,10})', 0.95),
|
||||
(r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(\d{6,10})', 0.94),
|
||||
# Standalone CIF with word boundary
|
||||
(r'\bC[I1]F\s*:?\s*(R[O0]?\d{6,10})\b', 0.90),
|
||||
(r'\bC[I1]F\s*:?\s*(\d{6,10})\b', 0.89),
|
||||
# COD FISCAL (vendor)
|
||||
(r'COD\s+FISCAL\s*:?\s*(R[O0]?\d{6,10})', 0.90),
|
||||
(r'COD\s+FISCAL\s*:?\s*(\d{6,10})', 0.89),
|
||||
# C. I. F. format with SPACES (OCR artifact)
|
||||
(r'C\.\s*I\.\s*F\.?\s*[:\s]+(R[O0]?\d{6,10})', 0.92),
|
||||
(r'C\.\s*I\.\s*F\.?\s*[:\s]+(\d{6,10})', 0.91),
|
||||
# C.I.F. format (with dots, no spaces)
|
||||
(r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.88),
|
||||
(r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(\d{6,10})', 0.87),
|
||||
# CUI format
|
||||
(r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.85),
|
||||
(r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(\d{6,10})', 0.84),
|
||||
# Lidl format: "Cod Identificare fiscala" (OCR corrupted)
|
||||
(r'[IC](?:od|ed)\s*Identific[a-z/]*\s*(R[O0]\d{6,10})', 0.90),
|
||||
# Generic: anything with "fiscal" followed by RO + digits
|
||||
(r'fiscal[a-z]*\s*:?\s*(R[O0]\d{6,10})', 0.85),
|
||||
]
|
||||
|
||||
# CUI REVERSED format (number BEFORE label)
|
||||
CUI_REVERSED_PATTERNS = [
|
||||
(r'(R[O0]\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.98),
|
||||
(r'(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.95),
|
||||
]
|
||||
|
||||
# Items count patterns - NR POZ ART IN BON
|
||||
ITEMS_COUNT_PATTERNS = [
|
||||
(r'NR\.?\s*P?[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*(\d+)', 0.98),
|
||||
(r'(\d{1,2})\s*\n\s*[O0]Z\.?\s*ART', 0.95),
|
||||
(r'[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.93),
|
||||
(r'NR\.?\s*(?:P?[O0]Z\.?)?\s*ART(?:ICOLE)?\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.90),
|
||||
(r'ARTIC[O0]LE\s*:?\s*(\d+)', 0.88),
|
||||
(r'(?:^|\s|:)P?[O0]Z\.?\s*(?:ART)?\.?\s*(?:IN\s+BON)?\s*:?\s*(\d{1,3})(?:\s|$)', 0.85),
|
||||
]
|
||||
|
||||
# Series patterns - Romanian fiscal receipt series
|
||||
SERIES_PATTERNS = [
|
||||
(r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90),
|
||||
(r'(?:^|\s)Z\s*:\s*(\d{4})', 0.85),
|
||||
(r'(?:^|\s)BF\s*:\s*(\d{4})', 0.85),
|
||||
]
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Extraction methods - override in subclasses as needed
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
def extract_tva_entries(self, text: str) -> List[dict]:
|
||||
"""
|
||||
Extract TVA entries from receipt text.
|
||||
Extract TVA entries from receipt text - GENERIC implementation.
|
||||
|
||||
Override this method in subclasses to handle store-specific TVA formats.
|
||||
Handles ALL formats:
|
||||
- Multi-rate inline (Lidl): "TVA A 21% 7.71"
|
||||
- Reversed (Stepout): "5.00% TUA*B"
|
||||
- Table (OMV): "A-21,00% 285,66 49,58"
|
||||
- Multiline: "TOTAL TVA A - 19%" + amount on next line
|
||||
- Non-VAT payers: Returns empty list
|
||||
|
||||
Args:
|
||||
text: Raw OCR text from receipt
|
||||
@@ -174,12 +348,252 @@ class BaseStoreProfile(ABC):
|
||||
Returns:
|
||||
List of dicts with keys: code, percent, amount
|
||||
"""
|
||||
return []
|
||||
entries = []
|
||||
text_upper = text.upper()
|
||||
|
||||
# Step 1: Check for known non-VAT payer (by class flag or text detection)
|
||||
if self.IS_NON_VAT_PAYER or self._is_non_vat_payer(text_upper):
|
||||
return [] # No TVA entries for non-VAT payers
|
||||
|
||||
# Step 2: Normalize OCR spaces in numbers
|
||||
normalized = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text_upper)
|
||||
lines = normalized.split('\n')
|
||||
|
||||
# Step 3: Try all formats, collect candidates
|
||||
candidates = []
|
||||
|
||||
# Try inline multi-rate (Lidl-style)
|
||||
candidates.extend(self._try_tva_inline(normalized))
|
||||
|
||||
# Try reversed format (Stepout-style)
|
||||
candidates.extend(self._try_tva_reversed(normalized, lines))
|
||||
|
||||
# Try multiline format (Brick/Electrobering)
|
||||
candidates.extend(self._try_tva_multiline(normalized, lines))
|
||||
|
||||
# Try table format (OMV-style)
|
||||
candidates.extend(self._try_tva_table(normalized))
|
||||
|
||||
# Try standard/fallback patterns
|
||||
if not candidates:
|
||||
candidates.extend(self._try_tva_standard(normalized))
|
||||
|
||||
# Step 4: Deduplicate and return
|
||||
seen = set()
|
||||
for entry in candidates:
|
||||
key = (entry.get('code', 'A'), entry.get('percent', 19))
|
||||
if key not in seen and entry.get('amount') and entry['amount'] > 0:
|
||||
entries.append(entry)
|
||||
seen.add(key)
|
||||
|
||||
return entries
|
||||
|
||||
def _is_non_vat_payer(self, text: str) -> bool:
|
||||
"""Check if receipt is from non-VAT payer."""
|
||||
for pattern in self.NON_VAT_PATTERNS:
|
||||
if re.search(pattern, text, re.IGNORECASE):
|
||||
return True
|
||||
return False
|
||||
|
||||
def _try_tva_inline(self, text: str) -> List[dict]:
    """Collect VAT entries written inline, e.g. 'TVA A 21,00% 7.71'.

    Every ``TVA_PATTERNS`` entry tagged ``'inline'`` is applied to the whole
    text. A match must expose at least three capture groups, read as
    (code, percent, amount); a missing code falls back to ``'A'``.

    Args:
        text: Receipt text to search; matching is case-insensitive.

    Returns:
        A list of ``{'code', 'percent', 'amount'}`` dicts, one per match whose
        amount parses to a positive value. No de-duplication is done here.
    """
    found = []
    inline_regexes = [
        regex
        for regex, _confidence, kind in self.TVA_PATTERNS
        if kind == 'inline'
    ]
    for regex in inline_regexes:
        for hit in re.finditer(regex, text, re.IGNORECASE):
            try:
                captured = hit.groups()
                if len(captured) < 3:
                    continue
                vat_code = captured[0].upper() if captured[0] else 'A'
                rate = int(captured[1])
                amount = self._parse_decimal(
                    self._clean_ocr_number(captured[2])
                )
                if amount and amount > 0:
                    found.append({
                        'code': vat_code,
                        'percent': rate,
                        'amount': amount
                    })
            except (ValueError, InvalidOperation, IndexError):
                # Unparseable capture: drop this match and keep scanning.
                continue
    return found
|
||||
|
||||
def _try_tva_reversed(self, text: str, lines: List[str]) -> List[dict]:
    """Collect VAT entries where the rate precedes the marker ('5.00% TUA*B 2.00').

    Each line is first checked for ``<percent>% TVA[*][code]`` followed by an
    amount on the same line. If that does not produce a positive amount, a
    line that ends with the marker takes its amount from the following line.

    Args:
        text: Full receipt text. Unused here; the scan works line by line.
        lines: The receipt text split into lines.

    Returns:
        A list of ``{'code', 'percent', 'amount'}`` dicts, one per positive
        amount found. A missing code letter defaults to ``'A'``.
    """
    same_line_re = re.compile(
        r'(\d{1,2})[.,]?\d{0,2}\s*%\s*T[UV][AR]\s*\*?\s*([A-D])?\s+([\d\s.,]+)',
        re.IGNORECASE,
    )
    # ``\s*$`` rather than a bare ``$``: lines are not stripped by the caller,
    # so trailing blanks or a stray '\r' from CRLF text would otherwise stop
    # the marker from being recognised as the end of the line and the amount
    # on the next line would be lost.
    next_line_re = re.compile(
        r'(\d{1,2})[.,]?\d{0,2}\s*%\s*T[UV][AR]\s*\*?\s*([A-D])?\s*$',
        re.IGNORECASE,
    )

    entries = []
    for index, line in enumerate(lines):
        # Preferred form: amount on the SAME line as the marker.
        hit = same_line_re.search(line)
        if hit:
            try:
                percent = int(hit.group(1))
                code = hit.group(2).upper() if hit.group(2) else 'A'
                amount = self._parse_decimal(hit.group(3).strip())
                if amount and amount > 0:
                    entries.append({
                        'code': code,
                        'percent': percent,
                        'amount': amount
                    })
                    continue  # done with this line; look for more entries
            except (ValueError, InvalidOperation, IndexError):
                pass  # fall through to the next-line fallback

        # Fallback: marker closes the line, amount sits on the NEXT line.
        hit = next_line_re.search(line)
        if not hit or index + 1 >= len(lines):
            continue
        try:
            percent = int(hit.group(1))
            code = hit.group(2).upper() if hit.group(2) else 'A'
            amount = self._parse_decimal(lines[index + 1].strip())
            if amount and amount > 0:
                entries.append({
                    'code': code,
                    'percent': percent,
                    'amount': amount
                })
        except (ValueError, InvalidOperation, IndexError):
            continue
    return entries
|
||||
|
||||
def _try_tva_multiline(self, text: str, lines: List[str]) -> List[dict]:
    """Find a 'TOTAL TVA A - 19%' header whose amount is on the next line.

    Lines are scanned top to bottom against a fixed set of header regexes.
    The scan stops at the first header whose following line parses to a
    positive amount.

    Args:
        text: Full receipt text. Unused here; the scan works line by line.
        lines: The receipt text split into lines.

    Returns:
        A list holding at most one ``{'code', 'percent', 'amount'}`` dict;
        empty when no header/amount pair is found. A missing code letter
        defaults to ``'A'``.
    """
    header_regexes = (
        r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%',
        r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%',
        r'TOTAL\s+T[VU][AR]\s*([A-D])?\s*[-:]?\s*(\d{1,2})\s*%',
        r'O?TAL\s+[IT]VA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%',
    )
    found = []
    last_index = len(lines) - 1
    for index, line in enumerate(lines):
        for regex in header_regexes:
            hit = re.search(regex, line, re.IGNORECASE)
            if hit is None:
                continue
            try:
                vat_code = (hit.group(1) or 'A').upper()
                rate = int(hit.group(2))
                if index >= last_index:
                    # Header is the final line: there is no amount line.
                    continue
                amount = self._parse_decimal(lines[index + 1].strip())
                if amount and amount > 0:
                    found.append({
                        'code': vat_code,
                        'percent': rate,
                        'amount': amount
                    })
                    return found
            except (ValueError, InvalidOperation, IndexError):
                continue
    return found
|
||||
|
||||
def _try_tva_table(self, text: str) -> List[dict]:
    """Collect VAT entries from table rows such as 'A-21,00% 285,66 49,58'.

    Each row match captures (code, percent, base amount, VAT amount); the
    last captured column is taken as the VAT amount. When no row produces an
    entry, a 'TOTAL TAXE: <amount>' line is used instead and reported with
    code ``'A'`` and a default rate of 19.

    Args:
        text: Receipt text to search; matching is case-insensitive.

    Returns:
        A list of ``{'code', 'percent', 'amount'}`` dicts with positive
        amounts; empty when neither the table nor the total line is usable.
    """
    row_regex = (
        r'([A-D])\s*[-:]\s*(\d{1,2})[.,\s]*\d{0,2}\s*%\s+([\d.,\s]+)\s+([\d.,\s]+)'
    )
    rows = []
    for row in re.finditer(row_regex, text, re.IGNORECASE):
        try:
            vat_code = row.group(1).upper()
            rate = int(row.group(2))
            # Group 4 is the last table column, i.e. the VAT amount.
            vat_amount = self._parse_decimal(
                self._clean_ocr_number(row.group(4))
            )
            if vat_amount and vat_amount > 0:
                rows.append({
                    'code': vat_code,
                    'percent': rate,
                    'amount': vat_amount
                })
        except (ValueError, InvalidOperation, IndexError):
            continue
    if rows:
        return rows

    # Fallback: "TOTAL TAXE: 55,22"
    total_taxes = re.search(
        r'TOTAL\s+TAXE\s*:?\s*([\d.,\s]+)', text, re.IGNORECASE
    )
    if total_taxes is None:
        return rows
    try:
        amount = self._parse_decimal(
            self._clean_ocr_number(total_taxes.group(1))
        )
        if amount and amount > 0:
            rows.append({
                'code': 'A',
                'percent': 19,  # Default rate
                'amount': amount
            })
    except (ValueError, InvalidOperation):
        pass
    return rows
|
||||
|
||||
def _try_tva_standard(self, text: str) -> List[dict]:
    """Fallback VAT lookup using the layout-independent ``TVA_PATTERNS``.

    Patterns tagged 'standard', 'bon', 'percent', 'coded', 'fallback' or
    'books' are tried in list order against the whole text; the first one
    that yields a positive amount wins.

    Capture groups are interpreted by count and content:
      * one group: the amount only (code 'A', rate 19);
      * two or more, first group alphabetic: (code, percent, amount);
      * two or more, otherwise: (percent, amount), with rate 19 when the
        first group is not all digits.

    Args:
        text: Receipt text to search; matching is case-insensitive.

    Returns:
        A single-element list with a ``{'code', 'percent', 'amount'}`` dict,
        or an empty list if nothing usable was found.
    """
    accepted_kinds = ('standard', 'bon', 'percent', 'coded', 'fallback', 'books')
    for regex, _confidence, kind in self.TVA_PATTERNS:
        if kind not in accepted_kinds:
            continue
        hit = re.search(regex, text, re.IGNORECASE)
        if hit is None:
            continue
        try:
            captured = hit.groups()
            if len(captured) >= 2:
                first = captured[0]
                if first and first.isalpha():
                    vat_code = first.upper()
                    rate = int(captured[1])
                    raw_amount = captured[2] if len(captured) > 2 else None
                else:
                    vat_code = 'A'
                    rate = int(first) if first and first.isdigit() else 19
                    raw_amount = captured[1]
                if not raw_amount:
                    # Nothing to parse for this pattern; try the next one.
                    continue
            elif len(captured) == 1:
                # Just amount
                vat_code, rate, raw_amount = 'A', 19, captured[0]
            else:
                continue
            amount = self._parse_decimal(self._clean_ocr_number(raw_amount))
            if amount and amount > 0:
                return [{
                    'code': vat_code,
                    'percent': rate,
                    'amount': amount
                }]
        except (ValueError, InvalidOperation, IndexError):
            continue
    return []
|
||||
|
||||
def _clean_ocr_number(self, value: str) -> str:
|
||||
"""Remove OCR spaces from numbers (e.g., '55, 22' -> '55,22')."""
|
||||
if not value:
|
||||
return ""
|
||||
value = re.sub(r'\s*([.,])\s*', r'\1', value)
|
||||
value = value.replace(' ', '')
|
||||
return value
|
||||
|
||||
def extract_total(self, text: str) -> Tuple[Optional[Decimal], float]:
|
||||
"""
|
||||
Extract total amount from receipt text.
|
||||
|
||||
Supports both single-line and multiline formats:
|
||||
- Single line: "TOTAL: 78.00", "SUMA TOTALA: 78.00"
|
||||
- Multiline: "SUMA\nTOTALA:\n78.00" (common in thermal receipts)
|
||||
|
||||
Args:
|
||||
text: Raw OCR text from receipt
|
||||
|
||||
@@ -187,7 +601,54 @@ class BaseStoreProfile(ABC):
|
||||
Tuple of (amount, confidence) or (None, 0.0)
|
||||
"""
|
||||
text_upper = text.upper()
|
||||
lines = text_upper.split('\n')
|
||||
|
||||
# =====================================================================
|
||||
# STRATEGY 1: Multiline "SUMA TOTALA" pattern (thermal receipts)
|
||||
# Format: SUMA on one line, TOTALA: on next, amount on third
|
||||
# =====================================================================
|
||||
for i, line in enumerate(lines):
|
||||
line_clean = line.strip()
|
||||
|
||||
# Check for "SUMA" keyword (with OCR variants: SUNA, SUHA, SUM A)
|
||||
if re.search(r'S[UU]M[AĂ\s]', line_clean):
|
||||
# Look at next 3 lines for "TOTALA" and amount
|
||||
for j in range(i, min(i + 4, len(lines))):
|
||||
check_line = lines[j].strip()
|
||||
|
||||
# Check for "TOTALA:" or "TOTALA -" followed by amount
|
||||
match = re.search(r'T[O0]TALA\s*[:\-]\s*([\d\s.,]+)', check_line)
|
||||
if match:
|
||||
amount = self._parse_decimal(self._clean_ocr_number(match.group(1)))
|
||||
if amount and amount > 0 and amount < self.MAX_PAYMENT:
|
||||
return (amount, 0.98)
|
||||
|
||||
# Check for "TOTALA" without amount, amount on next line
|
||||
if re.search(r'T[O0]TALA\s*[:\-]?\s*$', check_line):
|
||||
if j + 1 < len(lines):
|
||||
amount_line = lines[j + 1].strip()
|
||||
amount = self._parse_decimal(amount_line)
|
||||
if amount and amount > 0 and amount < self.MAX_PAYMENT:
|
||||
return (amount, 0.97)
|
||||
|
||||
# Check for "SUMA TOTALA" on single line with amount
|
||||
match = re.search(r'S[UU]M[AĂ]\s+T[O0]TALA\s*[:\-]\s*([\d\s.,]+)', line_clean)
|
||||
if match:
|
||||
amount = self._parse_decimal(self._clean_ocr_number(match.group(1)))
|
||||
if amount and amount > 0 and amount < self.MAX_PAYMENT:
|
||||
return (amount, 0.98)
|
||||
|
||||
# Check for "SUMA TOTALA" without amount, amount on next line
|
||||
if re.search(r'S[UU]M[AĂ]\s+T[O0]TALA\s*[:\-]?\s*$', line_clean):
|
||||
if i + 1 < len(lines):
|
||||
next_line = lines[i + 1].strip()
|
||||
amount = self._parse_decimal(next_line)
|
||||
if amount and amount > 0 and amount < self.MAX_PAYMENT:
|
||||
return (amount, 0.96)
|
||||
|
||||
# =====================================================================
|
||||
# STRATEGY 2: Standard single-line patterns
|
||||
# =====================================================================
|
||||
for pattern, confidence in self.TOTAL_PATTERNS:
|
||||
match = re.search(pattern, text_upper)
|
||||
if match:
|
||||
@@ -259,28 +720,76 @@ class BaseStoreProfile(ABC):
|
||||
"""
|
||||
Extract payment methods (CARD/NUMERAR) from receipt.
|
||||
|
||||
Supports multiple payments of the same type (e.g., 2x CARD for split payments).
|
||||
Each payment is returned as a separate entry with its amount.
|
||||
Supports:
|
||||
- Multiline patterns: "CARD\n78.00" (common in thermal receipts)
|
||||
- Multiple payments (split CARD + NUMERAR)
|
||||
- REST (change) detection to calculate actual CARD amount
|
||||
- Keyword-only CARD/NUMERAR that infers from total
|
||||
- Fallback for fiscal receipts without explicit payment
|
||||
|
||||
Args:
|
||||
text: Raw OCR text from receipt
|
||||
|
||||
Returns:
|
||||
List of dicts: [{'method': 'CARD'/'NUMERAR', 'amount': Decimal, 'confidence': float}]
|
||||
Multiple entries of same method type are allowed for split payments.
|
||||
"""
|
||||
text_upper = text.upper()
|
||||
lines = text_upper.split('\n')
|
||||
methods = []
|
||||
# Track (method, amount) pairs to avoid exact duplicates from overlapping patterns
|
||||
seen_entries = set()
|
||||
|
||||
# =====================================================================
|
||||
# STEP 0: Try MULTILINE patterns first (thermal receipts)
|
||||
# Format: "CARD" on one line, amount on next line
|
||||
# =====================================================================
|
||||
for i, line in enumerate(lines):
|
||||
line_clean = line.strip()
|
||||
|
||||
# Standalone CARD keyword (not part of MASTERCARD, etc.)
|
||||
if re.match(r'^CARD\s*$', line_clean):
|
||||
if i + 1 < len(lines):
|
||||
next_line = lines[i + 1].strip()
|
||||
# Must be a valid amount (not another keyword)
|
||||
if re.match(r'^[\d\s.,]+$', next_line):
|
||||
amount = self._parse_decimal(next_line)
|
||||
if amount and amount > 0 and amount < self.MAX_PAYMENT:
|
||||
entry_key = ('CARD', amount)
|
||||
if entry_key not in seen_entries:
|
||||
methods.append({
|
||||
'method': 'CARD',
|
||||
'amount': amount,
|
||||
'confidence': 0.95
|
||||
})
|
||||
seen_entries.add(entry_key)
|
||||
|
||||
# Standalone NUMERAR keyword
|
||||
if re.match(r'^NUMERAR\s*$', line_clean):
|
||||
if i + 1 < len(lines):
|
||||
next_line = lines[i + 1].strip()
|
||||
if re.match(r'^[\d\s.,]+$', next_line):
|
||||
amount = self._parse_decimal(next_line)
|
||||
if amount and amount > 0 and amount < self.MAX_PAYMENT:
|
||||
entry_key = ('NUMERAR', amount)
|
||||
if entry_key not in seen_entries:
|
||||
methods.append({
|
||||
'method': 'NUMERAR',
|
||||
'amount': amount,
|
||||
'confidence': 0.95
|
||||
})
|
||||
seen_entries.add(entry_key)
|
||||
|
||||
# If multiline extraction found methods, return them
|
||||
if methods:
|
||||
return methods
|
||||
|
||||
# =====================================================================
|
||||
# STEP 1: Try pattern-based extraction with explicit amounts
|
||||
# =====================================================================
|
||||
for pattern, method, confidence in self.PAYMENT_PATTERNS:
|
||||
for match in re.finditer(pattern, text_upper):
|
||||
try:
|
||||
amount = self._parse_decimal(match.group(1))
|
||||
if amount and amount > 0 and amount < self.MAX_PAYMENT:
|
||||
# Deduplicate by (method, amount) to avoid same entry from multiple patterns
|
||||
# But allow different amounts for same method (split payments)
|
||||
entry_key = (method, amount)
|
||||
if entry_key not in seen_entries:
|
||||
methods.append({
|
||||
@@ -292,6 +801,70 @@ class BaseStoreProfile(ABC):
|
||||
except (ValueError, InvalidOperation):
|
||||
continue
|
||||
|
||||
# If we found explicit amounts, we're done
|
||||
if methods:
|
||||
return methods
|
||||
|
||||
# Step 2: Try keyword-only detection with REST logic
|
||||
# Get total amount for inference
|
||||
total_amount, _ = self.extract_total(text)
|
||||
if not total_amount:
|
||||
return []
|
||||
|
||||
# Check for payment keywords
|
||||
has_card = any(kw in text_upper for kw in ['CARD', 'CARTE CREDIT', 'VISA', 'MASTERCARD', 'DEBIT', 'CREDIT', 'CONTACTLESS'])
|
||||
has_numerar = any(kw in text_upper for kw in ['NUMERAR', 'CASH'])
|
||||
|
||||
# Find REST (change) amount
|
||||
rest_amount = Decimal('0')
|
||||
for i, line in enumerate(lines):
|
||||
if 'REST' in line:
|
||||
# REST on same line: "REST 0.00" or "REST: 0.00"
|
||||
match = re.search(r'REST\s*:?\s*([\d.,]+)', line)
|
||||
if match:
|
||||
rest_amount = self._parse_decimal(match.group(1)) or Decimal('0')
|
||||
elif i + 1 < len(lines):
|
||||
# REST on separate line
|
||||
rest_amount = self._parse_decimal(lines[i + 1].strip()) or Decimal('0')
|
||||
break
|
||||
|
||||
# Calculate payment amounts
|
||||
if has_card:
|
||||
card_amount = total_amount - rest_amount
|
||||
if card_amount > 0:
|
||||
methods.append({
|
||||
'method': 'CARD',
|
||||
'amount': card_amount,
|
||||
'confidence': 0.90
|
||||
})
|
||||
|
||||
if has_numerar:
|
||||
if has_card and rest_amount > 0:
|
||||
# Mixed payment: NUMERAR is the change given back
|
||||
methods.append({
|
||||
'method': 'NUMERAR',
|
||||
'amount': rest_amount,
|
||||
'confidence': 0.85
|
||||
})
|
||||
elif not has_card:
|
||||
# Cash only
|
||||
methods.append({
|
||||
'method': 'NUMERAR',
|
||||
'amount': total_amount,
|
||||
'confidence': 0.90
|
||||
})
|
||||
|
||||
# Step 3: Fallback for fiscal receipts without explicit payment
|
||||
if not methods and total_amount and total_amount > 0:
|
||||
is_fiscal = 'BON FISCAL' in text_upper or 'FISCAL' in text_upper
|
||||
if is_fiscal:
|
||||
# Default to CARD for business purchases (most common)
|
||||
methods.append({
|
||||
'method': 'CARD',
|
||||
'amount': total_amount,
|
||||
'confidence': 0.70
|
||||
})
|
||||
|
||||
return methods
|
||||
|
||||
def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
"""
|
||||
BEST PRINT TRADE ACTIV SRL store profile for OCR extraction.
|
||||
|
||||
Stamp manufacturing service. Non-VAT payer (neplătitor de TVA).
|
||||
"""
|
||||
|
||||
from typing import List, Dict, Any
|
||||
|
||||
from .base import BaseStoreProfile
|
||||
from . import ProfileRegistry
|
||||
|
||||
|
||||
@ProfileRegistry.register
class BestPrintProfile(BaseStoreProfile):
    """Store profile for BEST PRINT TRADE ACTIV SRL (non-VAT payer).

    Characteristics of these receipts:
    - The vendor is a non-VAT payer, so receipts carry NO TVA lines
    - Stamp manufacturing and printing services
    - The total amount has no TVA component
    - Payment is typically by CARD
    """

    # Vendor fiscal code(s) and name spellings, including the OCR variant
    # "PR1NT".
    CUI_LIST = ["45417955"]
    NAME_PATTERNS = ["BEST PRINT", "BESTPRINT", "BEST PRINT TRADE", "BEST PR1NT"]
    STORE_NAME = "BEST PRINT TRADE ACTIV SRL"

    def extract_tva_entries(self, text: str) -> List[dict]:
        """Return no TVA entries: this vendor is a non-VAT payer.

        Args:
            text: Raw OCR text from the receipt (ignored).

        Returns:
            Always an empty list.
        """
        return []

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return the validation hints specific to BEST PRINT receipts."""
        hints: Dict[str, Any] = dict(
            has_multi_rate_tva=False,
            card_equals_total=True,
            has_client_cui=True,  # May have client CUI
            has_efactura=False,
            is_non_vat_payer=True,  # CRITICAL: Non-VAT payer
            tva_pattern="none",
        )
        return hints
|
||||
@@ -1,282 +0,0 @@
|
||||
"""
|
||||
BRICK (Five-Holding) store profile for OCR extraction.
|
||||
|
||||
Five-Holding S.A. operates BRICK stores with standard receipt format.
|
||||
|
||||
Receipt structure:
|
||||
- TVA format: "TOTAL TVA A - 21%" with amount on next line
|
||||
- Payment: "CARD" on separate line (amount from TOTAL LEI)
|
||||
- Client CUI: "CLIENT C.U.L./C.IF. :ROXXXXXXX" (OCR reads I as L)
|
||||
"""
|
||||
|
||||
import re
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from typing import List, Dict, Any, Tuple, Optional
|
||||
|
||||
from .base import BaseStoreProfile
|
||||
from . import ProfileRegistry
|
||||
|
||||
|
||||
@ProfileRegistry.register
class BrickProfile(BaseStoreProfile):
    """
    FIVE-HOLDING S.A. (BRICK) - standard TVA format with client CUI.

    Key characteristics:
    - Standard TVA format with rate code (A, B, etc.)
    - TVA amount on separate line after percentage
    - CARD payment indicated by keyword (amount derived from total)
    - Client CUI in format: CLIENT C.U.L./C.IF.
    - OCR often reads "I" as "L" in CUI markers
    """

    CUI_LIST = ["10562600"]
    NAME_PATTERNS = ["BRICK", "FIVE-HOLDING", "FIVE HOLDING", "BR1CK", "F1VE"]
    STORE_NAME = "FIVE-HOLDING S.A."

    # BRICK TVA patterns (amount often on separate line).
    # Every pattern captures (rate code, percent, amount).
    TVA_PATTERNS = [
        # "TOTAL TVA A - 21%" with amount on next line (captured as multiline)
        r'TOTAL\s+TVA\s*([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*\n?\s*([\d.,]+)',
        # "OTAL IVAA 21%" - OCR error variant
        r'O?TAL\s+[IT]VA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%\s*\n?\s*([\d.,]+)',
        # "TOTAL TVA A 21%" without separator
        r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%\s*\n?\s*([\d.,]+)',
        # "TVA A: XX% = YY,YY" - inline format
        r'TVA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
    ]

    # TOTAL TVA BON pattern (fallback)
    TOTAL_TVA_BON_PATTERN = r'TOTAL\s+TVA\s*BON\s*\n?\s*([\d.,]+)'

    # Client CUI patterns - specific to Brick (handles OCR L/I confusion).
    # Each item is (pattern, confidence).
    CLIENT_CUI_PATTERNS = [
        # "CLIENT C.U.L./C.IF. :R01879855" - exact OCR format (I->L)
        (r'CLIENT\s+C\.?U\.?[LI1]\.?\s*/?\s*C\.?[LI1]\.?F\.?\s*:?\s*(R?O?\d{6,10})', 0.99),
        # "CLIENT C.U.I./C.I.F.: RO1879855" - standard format
        (r'CLIENT\s+C\.?U\.?I\.?\s*/?\s*C\.?I\.?F\.?\s*:?\s*(R?O?\s*\d{6,10})', 0.98),
        # "CIF CLIENT: XXXXXXX" - alternative format
        (r'CIF\s+CLIENT\s*:?\s*(R?O?\s*\d{6,10})', 0.95),
    ]

    # Client markers for Brick
    CLIENT_MARKERS = [
        r'CLIENT\s+C\.?U\.?[LI1]',
        r'CLIENT\s+C\.?I\.?F',
        r'CIF\s+CLIENT',
    ]

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Extract BRICK-specific TVA entries.

        BRICK receipts show TVA in multi-line format:
            "TOTAL TVA A - 21%"
            "32.31"

        The coded patterns in ``TVA_PATTERNS`` are tried in order and the
        first match with a positive amount is returned on its own. If none
        produces an entry, the "TOTAL TVA BON" amount is reported under
        code 'A' with a default rate of 19.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of TVA entries with code, percent, and amount (at most one)
        """
        text_upper = text.upper()
        flags = re.IGNORECASE | re.MULTILINE

        for pattern in self.TVA_PATTERNS:
            for match in re.finditer(pattern, text_upper, flags):
                try:
                    code = match.group(1).upper()
                    percent = int(match.group(2))
                    amount = self._parse_decimal(match.group(3))
                    if amount and amount > 0:
                        # Brick usually has a single TVA rate: stop at the
                        # first usable entry.
                        return [{'code': code, 'percent': percent, 'amount': amount}]
                except (ValueError, InvalidOperation, IndexError):
                    continue

        # Fallback: "TOTAL TVA BON" with amount on next line
        entries = []
        match = re.search(self.TOTAL_TVA_BON_PATTERN, text_upper, flags)
        if match:
            try:
                amount = self._parse_decimal(match.group(1))
                if amount and amount > 0:
                    entries.append({
                        'code': 'A',
                        'percent': 19,  # Default rate
                        'amount': amount,
                    })
            except (ValueError, InvalidOperation):
                pass
        return entries

    def _brick_total_lei(self, lines: List[str]) -> Optional[Decimal]:
        """Return the amount attached to the first "TOTAL ... LEI" line, if any.

        The inline form ("TOTAL LEI 21.18") is checked first; only when the
        marker line carries no parsable amount is the following line used.
        Only the first marker line is considered.
        """
        for index, line in enumerate(lines):
            if 'TOTAL' not in line or 'LEI' not in line:
                continue
            inline = re.search(r'TOTAL\s+LEI\s*([\d.,]+)', line)
            if inline:
                amount = self._parse_decimal(inline.group(1))
                if amount:
                    return amount
            if index + 1 < len(lines):
                return self._parse_decimal(lines[index + 1].strip())
            return None
        return None

    def _brick_rest_amount(self, lines: List[str]) -> Decimal:
        """Return the REST (change) amount from the first line mentioning REST.

        Accepts "REST 0.00", "REST: 0.00", or the amount on the next line.
        Falls back to zero when nothing can be parsed.
        """
        zero = Decimal('0')
        for index, line in enumerate(lines):
            if 'REST' not in line:
                continue
            same_line = re.search(r'REST\s*:?\s*([\d.,]+)', line)
            if same_line:
                return self._parse_decimal(same_line.group(1)) or zero
            if index + 1 < len(lines):
                return self._parse_decimal(lines[index + 1].strip()) or zero
            return zero
        return zero

    def extract_payment_methods(self, text: str) -> List[dict]:
        """
        Extract BRICK-specific payment methods.

        BRICK receipts show payment method on separate line:
            "TOTAL LEI"
            "21.18"
            "CARD"
            "0.00"  <- REST (change)

        When CARD appears with REST=0, full amount was paid by card.
        The total is taken from the "TOTAL LEI" line (inline or next line),
        falling back to ``self.extract_total``; without a total no payment
        is reported.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of payment methods with method, amount, and confidence
        """
        text_upper = text.upper()
        lines = text_upper.split('\n')

        total_amount = self._brick_total_lei(lines)
        if not total_amount:
            # Fallback to generic total extraction
            total_amount, _ = self.extract_total(text)
        if not total_amount:
            return []

        has_card = any('CARD' in line for line in lines)
        has_numerar = any('NUMERAR' in line for line in lines)
        rest_amount = self._brick_rest_amount(lines)

        payments = []

        if has_card:
            # Card payment = total - rest
            card_amount = total_amount - rest_amount
            if card_amount > 0:
                payments.append({
                    'method': 'CARD',
                    'amount': card_amount,
                    'confidence': 0.95,
                })

        if has_numerar:
            if not has_card:
                payments.append({
                    'method': 'NUMERAR',
                    'amount': total_amount,
                    'confidence': 0.95,
                })
            elif rest_amount > 0:
                # Mixed receipt: the cash part is assumed to equal REST.
                payments.append({
                    'method': 'NUMERAR',
                    'amount': rest_amount,
                    'confidence': 0.90,
                })

        # No payment recorded yet and REST=0: accept any card-like keyword.
        if not payments and rest_amount == 0:
            card_like = ('CARD', 'DEBIT', 'CREDIT')
            if any(word in line for line in lines for word in card_like):
                payments.append({
                    'method': 'CARD',
                    'amount': total_amount,
                    'confidence': 0.90,
                })

        # FALLBACK: OCR may miss the payment line entirely. For a fiscal
        # receipt with a positive total, assume CARD at lower confidence.
        # ('BON FISCAL' necessarily contains 'FISCAL'.)
        if not payments and total_amount > 0 and 'FISCAL' in text_upper:
            payments.append({
                'method': 'CARD',
                'amount': total_amount,
                'confidence': 0.70,  # Lower confidence for inferred payment
            })

        return payments

    def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
        """
        Extract client CUI from BRICK receipt.

        BRICK uses format: "CLIENT C.U.L./C.IF. :R01879855"
        Note: OCR often reads "I" as "L" in these markers, and the "RO"
        prefix as "R0". That stray zero is not part of the CUI and is
        dropped before the digits are collected.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (cui, confidence) or (None, 0.0)
        """
        text_upper = text.upper()

        has_client = any(
            re.search(marker, text_upper, re.IGNORECASE)
            for marker in self.CLIENT_MARKERS
        )
        if not has_client:
            return (None, 0.0)

        for pattern, confidence in self.CLIENT_CUI_PATTERNS:
            match = re.search(pattern, text_upper, re.IGNORECASE)
            if not match:
                continue
            # Restore an OCR'd "R0" prefix to "RO" so its zero is not
            # mistaken for the first digit of the CUI.
            raw = re.sub(r'^R0', 'RO', match.group(1))
            cui_digits = re.sub(r'[^0-9]', '', raw)
            if 6 <= len(cui_digits) <= 10:
                return (cui_digits, confidence)

        return (None, 0.0)

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return BRICK-specific validation hints."""
        return {
            "has_multi_rate_tva": False,
            "card_equals_total": True,  # Card amount equals total when REST=0
            "has_client_cui": True,  # Brick receipts CAN have client CUI
            "has_efactura": False,
            "is_non_vat_payer": False,
            "tva_on_separate_line": True,  # TVA amount on next line
        }
|
||||
@@ -1,118 +0,0 @@
|
||||
"""
|
||||
DEDEMAN store profile for OCR extraction.
|
||||
|
||||
Dedeman receipts may include e-factura information and use standard TVA format.
|
||||
Large DIY retailer in Romania.
|
||||
"""
|
||||
|
||||
import re
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from typing import List, Dict, Any
|
||||
|
||||
from .base import BaseStoreProfile
|
||||
from . import ProfileRegistry
|
||||
|
||||
|
||||
@ProfileRegistry.register
class DedemanProfile(BaseStoreProfile):
    """
    Store profile for DEDEMAN SRL, a DIY / construction-materials retailer.

    Receipts use the standard TVA layout and may carry an e-factura
    reference number.
    """

    CUI_LIST = ["2816464"]
    NAME_PATTERNS = ["DEDEMAN", "DEDEMAN SRL", "OEDEMAN", "D3DEMAN"]  # OCR variants
    STORE_NAME = "DEDEMAN SRL"

    # Standard TVA patterns (any rate accepted). The first two capture
    # (code, percent, amount); the third captures (percent, amount) only.
    TVA_PATTERNS = [
        # "TVA A: XX% = YY,YY" or "TVA-A XX% YY,YY"
        r'TVA\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
        # "A - XX,XX% = YY,YY"
        r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,]+)',
        # "TVA (XX%) YY,YY"
        r'TVA\s*\(?\s*(\d{1,2})\s*%\s*\)?\s*:?\s*([\d.,]+)',
    ]

    # Captures the reference that follows an "e-factura" / "efactura" label.
    EFACTURA_PATTERN = r'e-?factura\s*:?\s*([A-Z0-9]+)'

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Collect the TVA entries printed on a Dedeman receipt.

        The two coded patterns are scanned first and every distinct
        (code, percent) pair with a positive amount is kept once, in order
        of first appearance. If they yield nothing, the code-less pattern
        is tried and its first positive amount is reported under code 'A'.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of dicts with 'code', 'percent' and 'amount'
        """
        by_rate: Dict[Tuple[str, int], dict] = {}

        for coded_pattern in self.TVA_PATTERNS[:2]:
            for hit in re.finditer(coded_pattern, text, re.IGNORECASE):
                try:
                    rate_key = (hit.group(1).upper(), int(hit.group(2)))
                    value = self._parse_decimal(hit.group(3))
                    usable = bool(value and value > 0)
                except (ValueError, InvalidOperation, IndexError):
                    continue
                if usable and rate_key not in by_rate:
                    by_rate[rate_key] = {
                        'code': rate_key[0],
                        'percent': rate_key[1],
                        'amount': value,
                    }

        if by_rate:
            return list(by_rate.values())

        # Nothing coded was found: fall back to the simple "TVA (XX%) YY,YY" form.
        for hit in re.finditer(self.TVA_PATTERNS[2], text, re.IGNORECASE):
            try:
                rate = int(hit.group(1))
                value = self._parse_decimal(hit.group(2))
                if value and value > 0:
                    return [{'code': 'A', 'percent': rate, 'amount': value}]
            except (ValueError, InvalidOperation):
                continue

        return []

    def extract_efactura_reference(self, text: str) -> str | None:
        """
        Return the e-factura reference found in the receipt text.

        Args:
            text: Raw OCR text from receipt

        Returns:
            The captured reference string, or None when the label is absent
        """
        found = re.search(self.EFACTURA_PATTERN, text, re.IGNORECASE)
        if found is None:
            return None
        return found.group(1)

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return the validation hints that describe Dedeman receipts."""
        hints: Dict[str, Any] = {}
        hints["has_multi_rate_tva"] = False
        hints["card_equals_total"] = False
        hints["has_client_cui"] = False
        hints["has_efactura"] = True
        hints["is_non_vat_payer"] = False
        return hints
|
||||
@@ -1,133 +0,0 @@
|
||||
"""
|
||||
ELECTROBERING S.R.L. store profile for OCR extraction.
|
||||
|
||||
Electronics and home supplies store.
|
||||
|
||||
Receipt structure:
|
||||
- TVA format: "TOTAL TVA A - - 19%" with amount on next line
|
||||
- "TOTAL TVA BON" with total TVA amount
|
||||
- Client CUI: "CIF CLIENT: XXXXXXX"
|
||||
"""
|
||||
|
||||
import re
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from typing import List, Dict, Any, Tuple, Optional
|
||||
|
||||
from .base import BaseStoreProfile
|
||||
from . import ProfileRegistry
|
||||
|
||||
|
||||
@ProfileRegistry.register
class ElectroberingProfile(BaseStoreProfile):
    """
    Store profile for ELECTROBERING S.R.L. (electronics and home supplies).

    Receipt traits:
    - the TVA rate is printed on one line and its amount on the next
    - double-dash separators are common (OCR artifact)
    - a client CUI may be present on B2B purchases
    - payment is typically by CARD
    """

    CUI_LIST = ["2744937"]
    NAME_PATTERNS = ["ELECTROBERING", "ELECTR0BERING", "ELECTROBERING SRL"]
    STORE_NAME = "ELECTROBERING S.R.L."

    # ELECTROBERING TVA patterns (handles double-dash and multiline)
    TVA_PATTERNS = [
        # "TOTAL TVA A - - 19%" with amount on next line
        r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%',
        # "TOTAL TVA A 19%" without separator
        r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%',
        # Standard: "TVA A: XX% = YY,YY"
        r'TVA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
    ]

    # TOTAL TVA BON pattern (fallback)
    TOTAL_TVA_BON_PATTERN = r'TOTAL\s+TVA\s+BON'

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Extract the TVA entry from an ELECTROBERING receipt.

        Expected layout:
            "TOTAL TVA A - - 19%"
            "5.59"
            "TOTAL TVA BON"
            "5.59"

        Three strategies are tried in order and the first one that yields a
        positive amount wins:
        1. a rate line followed by its amount on the next line;
        2. "TOTAL TVA BON" followed by the amount, reported as code 'A' at 19%;
        3. the inline form "TVA A: XX% = YY,YY".

        Args:
            text: Raw OCR text from receipt

        Returns:
            A list holding at most one entry with 'code', 'percent', 'amount'
        """
        text_upper = text.upper()
        lines = text_upper.split('\n')
        # Each line paired with the line that follows it.
        line_pairs = list(zip(lines, lines[1:]))

        # 1) "TOTAL TVA A - - 19%" or "TOTAL TVA A 19%", amount on the next line.
        rate_line = r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%'
        for current, following in line_pairs:
            hit = re.search(rate_line, current)
            if not hit:
                continue
            value = self._parse_decimal(following.strip())
            if value and value > 0:
                return [{
                    'code': hit.group(1),
                    'percent': int(hit.group(2)),
                    'amount': value,
                }]

        # 2) "TOTAL TVA BON", amount on the next line.
        for current, following in line_pairs:
            if not re.search(self.TOTAL_TVA_BON_PATTERN, current):
                continue
            value = self._parse_decimal(following.strip())
            if value and value > 0:
                return [{
                    'code': 'A',
                    'percent': 19,  # Default Romanian TVA rate
                    'amount': value,
                }]

        # 3) Inline "TVA A: XX% = YY,YY" anywhere in the text.
        inline = re.search(self.TVA_PATTERNS[2], text_upper, re.IGNORECASE)
        if inline and len(inline.groups()) >= 3:
            try:
                letter = inline.group(1)
                rate = int(inline.group(2))
                value = self._parse_decimal(inline.group(3))
                if value and value > 0:
                    return [{'code': letter, 'percent': rate, 'amount': value}]
            except (ValueError, InvalidOperation):
                pass

        return []

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return the validation hints that describe ELECTROBERING receipts."""
        hints: Dict[str, Any] = {}
        hints["has_multi_rate_tva"] = False
        hints["card_equals_total"] = True
        hints["has_client_cui"] = True  # May have client CUI for B2B
        hints["has_efactura"] = False
        hints["is_non_vat_payer"] = False
        hints["tva_on_separate_line"] = True
        return hints
|
||||
@@ -1,108 +0,0 @@
|
||||
"""
|
||||
GAMA INK SERVICE SRL store profile for OCR extraction.
|
||||
|
||||
Toner refill and printer supplies store.
|
||||
|
||||
Receipt structure:
|
||||
- TVA format: "TOTAL TVA A 4 19%" with amount on next line (4 is OCR for -)
|
||||
- "TOTAL TVA BON" with total TVA amount
|
||||
"""
|
||||
|
||||
import re
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from typing import List, Dict, Any
|
||||
|
||||
from .base import BaseStoreProfile
|
||||
from . import ProfileRegistry
|
||||
|
||||
|
||||
@ProfileRegistry.register
class GamaInkProfile(BaseStoreProfile):
    """
    Store profile for GAMA INK SERVICE SRL (toner refill, printer supplies).

    Receipt traits:
    - the TVA rate is printed on one line and its amount on the next
    - OCR often reads "-" as "4" (e.g., "A 4 19%" instead of "A - 19%")
    - payment is typically by CARD
    """

    CUI_LIST = ["17741882"]
    NAME_PATTERNS = ["GAMA INK", "GAMA", "GAMAINK", "GAMA INK SERVICE"]
    STORE_NAME = "GAMA INK SERVICE SRL"

    # GAMA INK TVA patterns (handles OCR errors)
    TVA_PATTERNS = [
        # "TOTAL TVA A 4 19%" (4 is OCR for -)
        r'TOTAL\s+TVA\s*([A-D])\s*[4\-\s]+(\d{1,2})\s*%',
        # "TOTAL TVA A - 19%"
        r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%',
    ]

    # TOTAL TVA BON pattern (fallback)
    TOTAL_TVA_BON_PATTERN = r'TOTAL\s+TVA\s+BON'

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Extract the TVA entry from a GAMA INK receipt.

        The rate line ("TOTAL TVA A 4 19%", where "4" is an OCR misread of
        "-") is followed by the amount on the next line. When no rate line
        yields a positive amount, the line after "TOTAL TVA BON" is used
        and reported as code 'A' at 19%.

        Args:
            text: Raw OCR text from receipt

        Returns:
            A list holding at most one entry with 'code', 'percent', 'amount'
        """
        lines = text.upper().split('\n')
        # Each line paired with the line that follows it.
        line_pairs = list(zip(lines, lines[1:]))

        # Primary: "TOTAL TVA A 4 19%" or "TOTAL TVA A - 19%".
        rate_line = r'TOTAL\s+TVA\s*([A-D])\s*[4\-\s]+(\d{1,2})\s*%'
        for current, following in line_pairs:
            hit = re.search(rate_line, current)
            if not hit:
                continue
            value = self._parse_decimal(following.strip())
            if value and value > 0:
                return [{
                    'code': hit.group(1),
                    'percent': int(hit.group(2)),
                    'amount': value,
                }]

        # Fallback: "TOTAL TVA BON" with the amount on the next line.
        for current, following in line_pairs:
            if not re.search(self.TOTAL_TVA_BON_PATTERN, current):
                continue
            value = self._parse_decimal(following.strip())
            if value and value > 0:
                return [{
                    'code': 'A',
                    'percent': 19,  # Default Romanian TVA rate
                    'amount': value,
                }]

        return []

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return the validation hints that describe GAMA INK receipts."""
        hints: Dict[str, Any] = {}
        hints["has_multi_rate_tva"] = False
        hints["card_equals_total"] = True
        hints["has_client_cui"] = True  # May have client CUI for business
        hints["has_efactura"] = False
        hints["is_non_vat_payer"] = False
        hints["tva_on_separate_line"] = True
        return hints
|
||||
@@ -1,53 +0,0 @@
|
||||
"""
|
||||
KINETERRA store profile for OCR extraction.
|
||||
|
||||
Kineterra is a non-VAT payer (neplătitor de TVA).
|
||||
Receipts don't include TVA breakdown.
|
||||
"""
|
||||
|
||||
from typing import List, Dict, Any
|
||||
|
||||
from .base import BaseStoreProfile
|
||||
from . import ProfileRegistry
|
||||
|
||||
|
||||
@ProfileRegistry.register
class KineterraProfile(BaseStoreProfile):
    """
    Store profile for KINETERRA CONCEPT SRL.

    The vendor is a non-VAT payer (neplătitor de TVA): receipts show no TVA
    breakdown and the total has no TVA component.
    """

    CUI_LIST = ["31180432"]
    NAME_PATTERNS = ["KINETERRA", "KINETERRA CONCEPT", "K1NETERRA"]  # OCR variants
    STORE_NAME = "KINETERRA CONCEPT SRL"

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Report the TVA entries of a receipt, which for this store is none.

        Args:
            text: Raw OCR text from the receipt. It is not inspected.

        Returns:
            Always an empty list, because a non-VAT payer prints no TVA.
        """
        no_entries: List[dict] = []
        return no_entries

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return the validation hints that describe Kineterra receipts."""
        hints: Dict[str, Any] = {}
        hints["has_multi_rate_tva"] = False
        hints["card_equals_total"] = False
        hints["has_client_cui"] = False
        hints["has_efactura"] = False
        hints["is_non_vat_payer"] = True
        hints["tva_pattern"] = "none"
        return hints
|
||||
@@ -1,93 +0,0 @@
|
||||
"""
|
||||
LIDL store profile for OCR extraction.
|
||||
|
||||
Lidl receipts have a specific TVA format without hyphen/colon separators:
|
||||
TOTAL TVA 9,84
|
||||
TVA A 21,00% 7,71
|
||||
TVA B 11,00% 2,13
|
||||
|
||||
This profile handles multi-rate TVA extraction for Lidl receipts.
|
||||
"""
|
||||
|
||||
import re
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from typing import List, Dict, Any
|
||||
|
||||
from .base import BaseStoreProfile
|
||||
from . import ProfileRegistry
|
||||
|
||||
|
||||
@ProfileRegistry.register
class LidlProfile(BaseStoreProfile):
    """
    Store profile for LIDL DISCOUNT S.R.L. with multi-rate TVA support.

    Receipt traits:
    - several TVA codes (A-D) may appear, each with any percentage
    - each TVA line reads "TVA A XX,XX% YY,YY" (code, percent, amount)
    - both historical (19%, 9%, 5%) and current (21%, 11%) rates fit the patterns
    - CARD payment usually equals the total
    - no client CUI on receipts
    """

    CUI_LIST = ["22891860"]
    NAME_PATTERNS = ["LIDL", "LDL", "L1DL", "LIDL DISCOUNT"]  # OCR variants
    STORE_NAME = "LIDL DISCOUNT S.R.L."

    # Lidl-specific TVA patterns; each captures (code, whole percent, amount)
    # from a single line such as "TVA A 21,00% 7,71".
    TVA_PATTERNS = [
        # Primary: "TVA A 21,00% 7.71" with various spacing
        r'T[VU][AR]\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)',
        # With backslash OCR artifact: "TVA A \21,00% 7.71"
        r'T[VU][AR]\s+([A-D])\s+\\?(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)',
        # IVA variant (rare OCR misread)
        r'IVA\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)',
    ]

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Collect every TVA rate line printed on a Lidl receipt.

        All patterns are applied to the whole text. An entry is kept only
        the first time its (code, percent) pair is seen, so a line matched
        by more than one pattern is not counted twice. Entries with a
        missing or non-positive amount are ignored.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of dicts with 'code', 'percent' and 'amount', in order of
            first appearance
        """
        by_rate: Dict[Any, dict] = {}

        for pattern in self.TVA_PATTERNS:
            for hit in re.finditer(pattern, text, re.IGNORECASE):
                try:
                    rate_key = (hit.group(1).upper(), int(hit.group(2)))
                    value = self._parse_decimal(hit.group(3))
                    usable = bool(value and value > 0)
                except (ValueError, InvalidOperation):
                    continue
                if usable and rate_key not in by_rate:
                    by_rate[rate_key] = {
                        'code': rate_key[0],
                        'percent': rate_key[1],
                        'amount': value,
                    }

        return list(by_rate.values())

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return the validation hints that describe Lidl receipts."""
        hints: Dict[str, Any] = {}
        hints["has_multi_rate_tva"] = True
        hints["card_equals_total"] = True
        hints["has_client_cui"] = False
        hints["has_efactura"] = False
        hints["is_non_vat_payer"] = False
        return hints
|
||||
@@ -1,236 +0,0 @@
|
||||
"""
|
||||
OMV Petrom store profile for OCR extraction.
|
||||
|
||||
OMV receipts typically include client CUI and use standard TVA format.
|
||||
Common at gas stations with fuel purchases.
|
||||
|
||||
Date format: YYYY. MM. DD with spaces (e.g., "2025. 08. 14")
|
||||
OCR quirk: Numbers often have spaces before decimals (e.g., "55, 22" instead of "55,22")
|
||||
"""
|
||||
|
||||
import re
|
||||
from datetime import date
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from typing import List, Dict, Any, Tuple, Optional
|
||||
|
||||
from .base import BaseStoreProfile
|
||||
from . import ProfileRegistry
|
||||
|
||||
|
||||
@ProfileRegistry.register
|
||||
class OMVProfile(BaseStoreProfile):
|
||||
"""
|
||||
OMV PETROM MARKETING S.R.L. - standard TVA with client CUI.
|
||||
|
||||
Key characteristics:
|
||||
- Standard TVA format (usually single rate, any percentage)
|
||||
- Includes client CUI on receipt (for business purchases)
|
||||
- TVA table format: "A-XX, XX% base_amount tva_amount" (with OCR spaces)
|
||||
- Supports historical rates (19%) and current rates (21%)
|
||||
- Date format: YYYY. MM. DD (with spaces)
|
||||
- Client CUI format: "CLIENT C.U. I./C.I.F.: ROXXXXXXX"
|
||||
"""
|
||||
|
||||
CUI_LIST = ["11201891"]
|
||||
NAME_PATTERNS = ["OMV", "PETROM", "OMV PETROM", "0MV"] # OCR variants
|
||||
STORE_NAME = "OMV PETROM MARKETING S.R.L."
|
||||
|
||||
# OMV TVA table patterns (handles OCR spaces in numbers)
|
||||
# Format: "A-21, 00% 55, 22 318, 16" (rate, TVA amount, total)
|
||||
TVA_TABLE_PATTERNS = [
|
||||
# "A-21, 00% 55, 22 318, 16" - with spaces in numbers
|
||||
r'([A-D])\s*[-:]\s*(\d{1,2})[.,\s]*\d{0,2}\s*%\s+([\d.,\s]+)\s+([\d.,\s]+)',
|
||||
# "TOTAL TAXE: 55, 22" - fallback to TOTAL TAXE
|
||||
r'TOTAL\s+TAXE\s*:?\s*([\d.,\s]+)',
|
||||
]
|
||||
|
||||
# Standard TVA pattern fallback
|
||||
TVA_STANDARD_PATTERN = r'TVA\s*:?\s*([\d.,]+)'
|
||||
|
||||
# OMV specific: prioritize YYYY. MM. DD format with spaces
|
||||
DATE_PATTERNS_OCR_SPACES = [
|
||||
# YYYY. MM. DD with time (OMV format)
|
||||
(r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.98, 'ymd'),
|
||||
(r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.95, 'ymd'),
|
||||
# Fallback to DD. MM. YYYY
|
||||
(r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
|
||||
(r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
|
||||
]
|
||||
|
||||
# Client CUI patterns for OMV (unique format)
|
||||
CLIENT_CUI_PATTERNS = [
|
||||
# "CLIENT C.U. I./C.I.F.: RO1879855"
|
||||
(r'CLIENT\s+C\.?\s*U\.?\s*I\.?\s*/?\s*C\.?\s*I\.?\s*F\.?\s*:?\s*(R?O?\s*\d{6,10})', 0.99),
|
||||
# "C.U.I./C.I.F. CLIENT: XXXXXXX"
|
||||
(r'C\.?\s*U\.?\s*I\.?\s*/?\s*C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(R?O?\s*\d{6,10})', 0.98),
|
||||
# Fallback to simpler pattern
|
||||
(r'CLIENT\s*:?\s*(R?O?\s*\d{6,10})', 0.90),
|
||||
]
|
||||
|
||||
# Client markers for OMV
|
||||
CLIENT_MARKERS = [
|
||||
r'CLIENT\s+C\.?\s*U\.?\s*I',
|
||||
r'CLIENT\s+C\.?\s*I\.?\s*F',
|
||||
r'NUME\s+CLIENT',
|
||||
r'CLIENT\s*:',
|
||||
]
|
||||
|
||||
def _clean_ocr_number(self, value: str) -> str:
|
||||
"""Remove OCR spaces from numbers (e.g., '55, 22' -> '55,22')."""
|
||||
# Remove spaces around commas and periods
|
||||
value = re.sub(r'\s*([.,])\s*', r'\1', value)
|
||||
# Remove any remaining spaces
|
||||
value = value.replace(' ', '')
|
||||
return value
|
||||
|
||||
def extract_tva_entries(self, text: str) -> List[dict]:
|
||||
"""
|
||||
Extract OMV-specific TVA entries.
|
||||
|
||||
OMV receipts show TVA in table format with spaces in numbers.
|
||||
Format: "A-21, 00% 55, 22 318, 16" (rate, TVA amount, base)
|
||||
|
||||
Args:
|
||||
text: Raw OCR text from receipt
|
||||
|
||||
Returns:
|
||||
List of TVA entries with code, percent, and amount
|
||||
"""
|
||||
entries = []
|
||||
text_upper = text.upper()
|
||||
|
||||
# Try table format first: "A-21, 00% 55, 22 318, 16"
|
||||
table_pattern = self.TVA_TABLE_PATTERNS[0]
|
||||
for match in re.finditer(table_pattern, text_upper):
|
||||
try:
|
||||
code = match.group(1).upper()
|
||||
percent = int(match.group(2))
|
||||
# Clean OCR spaces from amounts
|
||||
tva_amount_str = self._clean_ocr_number(match.group(3))
|
||||
tva_amount = self._parse_decimal(tva_amount_str)
|
||||
|
||||
if tva_amount and tva_amount > 0:
|
||||
entries.append({
|
||||
'code': code,
|
||||
'percent': percent,
|
||||
'amount': tva_amount
|
||||
})
|
||||
return entries # OMV usually has single TVA rate
|
||||
except (ValueError, InvalidOperation, IndexError):
|
||||
continue
|
||||
|
||||
# Fallback: "TOTAL TAXE: 55, 22"
|
||||
fallback_pattern = self.TVA_TABLE_PATTERNS[1]
|
||||
match = re.search(fallback_pattern, text_upper)
|
||||
if match:
|
||||
try:
|
||||
tva_amount_str = self._clean_ocr_number(match.group(1))
|
||||
tva_amount = self._parse_decimal(tva_amount_str)
|
||||
if tva_amount and tva_amount > 0:
|
||||
entries.append({
|
||||
'code': 'A',
|
||||
'percent': 19, # Standard rate, will be corrected by validation
|
||||
'amount': tva_amount
|
||||
})
|
||||
except (ValueError, InvalidOperation):
|
||||
pass
|
||||
|
||||
return entries
|
||||
|
||||
def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
    """
    Extract client CUI from OMV receipt.

    OMV uses format: "CLIENT C.U. I./C.I.F.: RO1879855"

    The receipt is searched only when at least one regex in
    ``self.CLIENT_MARKERS`` matches. Each ``(pattern, confidence)`` pair
    in ``self.CLIENT_CUI_PATTERNS`` is then tried in order, and group 1 of
    the first match that yields 6-10 digits is returned.

    Args:
        text: Raw OCR text from receipt

    Returns:
        Tuple of (cui, confidence) or (None, 0.0)
    """
    text_upper = text.upper()

    # Without a client marker this is not a B2B receipt: do not guess.
    if not any(
        re.search(marker, text_upper, re.IGNORECASE)
        for marker in self.CLIENT_MARKERS
    ):
        return (None, 0.0)

    for pattern, confidence in self.CLIENT_CUI_PATTERNS:
        match = re.search(pattern, text_upper, re.IGNORECASE)
        if not match:
            continue

        # Strip the country prefix BEFORE keeping digits only. OCR can read
        # "RO" as "R0"; keeping digits alone would then turn "R01879855"
        # into "01879855" (a spurious leading zero) instead of "1879855".
        candidate = re.sub(r'^\s*R[O0]?\s*', '', match.group(1))
        cui_digits = re.sub(r'[^0-9]', '', candidate)
        if 6 <= len(cui_digits) <= 10:
            return (cui_digits, confidence)

    return (None, 0.0)
|
||||
|
||||
def extract_payment_methods(self, text: str) -> List[dict]:
    """
    Extract OMV-specific payment methods.

    OMV receipts use "CARTE CREDIT" instead of "CARD".
    Payment amount equals TOTAL for gas station receipts, so the amount
    reported here is always the value returned by ``self.extract_total``.

    Args:
        text: Raw OCR text from receipt

    Returns:
        List of payment methods with method, amount, and confidence
        (at most one entry; empty when no total could be extracted)
    """
    upper_text = text.upper()

    # The payment amount is the receipt total; without it nothing is reported.
    total, _total_confidence = self.extract_total(text)
    if not total:
        return []

    # (keyword, method, confidence) - checked in this order, first hit wins
    keyword_table = (
        ('CARTE CREDIT', 'CARD', 0.98),
        ('CARTE DE CREDIT', 'CARD', 0.98),
        ('CARD', 'CARD', 0.95),
        ('VISA', 'CARD', 0.95),
        ('MASTERCARD', 'CARD', 0.95),
        ('CONTACTLESS', 'CARD', 0.90),
        ('NUMERAR', 'NUMERAR', 0.95),
        ('CASH', 'NUMERAR', 0.90),
    )

    hit = next(
        ((method, confidence)
         for keyword, method, confidence in keyword_table
         if keyword in upper_text),
        None,
    )
    if hit is not None:
        # OMV usually has single payment method
        method, confidence = hit
        return [{
            'method': method,
            'amount': total,
            'confidence': confidence
        }]

    # Fallback: If no explicit payment but has BON FISCAL, assume CARD
    if 'BON FISCAL' in upper_text:
        return [{
            'method': 'CARD',
            'amount': total,
            'confidence': 0.70
        }]

    return []
|
||||
|
||||
def get_validation_hints(self) -> Dict[str, Any]:
    """Return the static validation hints for OMV receipts.

    Returns:
        A new dict of flag name -> value on every call.
    """
    hints: Dict[str, Any] = dict(
        has_multi_rate_tva=False,
        card_equals_total=True,  # Gas station: card equals total
        has_client_cui=True,
        has_efactura=False,
        is_non_vat_payer=False,
        tva_table_format=True,
    )
    return hints
|
||||
@@ -1,101 +0,0 @@
|
||||
"""
|
||||
PICTUS VELUM SRL store profile for OCR extraction.
|
||||
|
||||
Office supplies and stationery store.
|
||||
"""
|
||||
|
||||
import re
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from typing import List, Dict, Any
|
||||
|
||||
from .base import BaseStoreProfile
|
||||
from . import ProfileRegistry
|
||||
|
||||
|
||||
@ProfileRegistry.register
class PictusVelumProfile(BaseStoreProfile):
    """
    Store profile for PICTUS VELUM SRL (standard TVA layout).

    Key characteristics:
    - Standard TVA format (single rate, any percentage)
    - Office supplies and stationery (rechizite)
    - CARD payment typical
    """

    CUI_LIST = ["39634534"]
    NAME_PATTERNS = ["PICTUS", "PICTUS VELUM", "P1CTUS", "PICTUS VELUM SRL"]
    STORE_NAME = "PICTUS VELUM SRL"

    # Standard TVA patterns (flexible - accepts any rate).
    # Indexes 0-1 carry a rate code (groups: code, percent, amount);
    # index 2 has no code (groups: percent, amount).
    TVA_PATTERNS = [
        # "TVA A: XX% = YY,YY" or "TVA-A XX% YY,YY"
        r'TVA\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
        # "A - XX,XX% = YY,YY"
        r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,]+)',
        # "TVA XX% YY,YY" (simple format without code)
        r'TVA\s+(\d{1,2})\s*%\s*([\d.,]+)',
    ]

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Extract TVA entries from receipt text.

        The two coded patterns are scanned over the whole text and the first
        entry seen for each (code, percent) pair is kept. Only when they
        produce nothing is the code-less pattern tried; its first valid
        match is reported under code 'A'.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of TVA entries with code, percent, and amount
        """
        # Insertion-ordered map: first occurrence of each (code, percent) wins
        by_rate = {}

        for coded_pattern in self.TVA_PATTERNS[:2]:
            for hit in re.finditer(coded_pattern, text, re.IGNORECASE):
                try:
                    letter = hit.group(1).upper()
                    rate = int(hit.group(2))
                    vat = self._parse_decimal(hit.group(3))

                    if vat and vat > 0:
                        by_rate.setdefault((letter, rate), {
                            'code': letter,
                            'percent': rate,
                            'amount': vat
                        })
                except (ValueError, InvalidOperation, IndexError):
                    continue

        if by_rate:
            return list(by_rate.values())

        # Fallback to simple format: only the first valid match is used
        for hit in re.finditer(self.TVA_PATTERNS[2], text, re.IGNORECASE):
            try:
                rate = int(hit.group(1))
                vat = self._parse_decimal(hit.group(2))

                if vat and vat > 0:
                    return [{
                        'code': 'A',
                        'percent': rate,
                        'amount': vat
                    }]
            except (ValueError, InvalidOperation):
                continue

        return []

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return the static validation hints for PICTUS VELUM receipts."""
        hints: Dict[str, Any] = dict(
            has_multi_rate_tva=False,
            card_equals_total=True,
            has_client_cui=False,
            has_efactura=False,
            is_non_vat_payer=False,
        )
        return hints
|
||||
@@ -1,162 +0,0 @@
|
||||
"""
|
||||
SOCAR Petroleum store profile for OCR extraction.
|
||||
|
||||
SOCAR receipts are similar to OMV - gas station with client CUI support.
|
||||
Date format may use YYYY. MM. DD with spaces.
|
||||
"""
|
||||
|
||||
import re
|
||||
from datetime import date
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from typing import List, Dict, Any, Tuple, Optional
|
||||
|
||||
from .base import BaseStoreProfile
|
||||
from . import ProfileRegistry
|
||||
|
||||
|
||||
@ProfileRegistry.register
class SocarProfile(BaseStoreProfile):
    """
    Store profile for SOCAR PETROLEUM S.A. (standard TVA with client CUI).

    Key characteristics:
    - Standard TVA format (usually single rate)
    - Includes client CUI on receipt (for business purchases)
    - Similar format to OMV/Petrom
    - Date format may use YYYY. MM. DD (with spaces)
    """

    CUI_LIST = ["12546600"]
    NAME_PATTERNS = ["SOCAR", "S0CAR", "SOCAR PETROLEUM"]  # OCR variants
    STORE_NAME = "SOCAR PETROLEUM S.A."

    # Standard TVA patterns for gas stations
    TVA_PATTERNS = [
        # Table format: "A-19,00% 285,66 49,58"
        # (groups: code, percent, first amount, second amount)
        r'([A-D])\s*[-:]\s*(\d{1,2})[.,]\d{2}\s*%\s+([\d.,]+)\s+([\d.,]+)',
        # Simple format: "TVA 19% 49,58" (groups: percent, amount)
        r'TVA\s+(\d{1,2})\s*%\s*([\d.,]+)',
    ]

    # Gas stations may use YYYY. MM. DD format.
    # Tuples are (regex, confidence, field order); not referenced by the
    # methods of this class - presumably read by the base class (TODO confirm).
    DATE_PATTERNS_OCR_SPACES = [
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.98, 'ymd'),
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.95, 'ymd'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
    ]

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Extract SOCAR-specific TVA entries.

        Table rows are scanned first; the TVA amount is taken from the
        fourth group (the second amount column) and the first entry seen for
        each (code, percent) pair is kept. When no table row yields an
        entry, the first valid "TVA XX% YY,YY" match is reported as code 'A'.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of TVA entries with code, percent, and amount
        """
        # Insertion-ordered map: first occurrence of each (code, percent) wins
        by_rate = {}

        for row in re.finditer(self.TVA_PATTERNS[0], text, re.IGNORECASE):
            try:
                letter = row.group(1).upper()
                rate = int(row.group(2))
                vat = self._parse_decimal(row.group(4))

                if vat and vat > 0:
                    by_rate.setdefault((letter, rate), {
                        'code': letter,
                        'percent': rate,
                        'amount': vat
                    })
            except (ValueError, InvalidOperation):
                continue

        if by_rate:
            return list(by_rate.values())

        # Fallback to simple format if no table entries found
        for hit in re.finditer(self.TVA_PATTERNS[1], text, re.IGNORECASE):
            try:
                rate = int(hit.group(1))
                vat = self._parse_decimal(hit.group(2))

                if vat and vat > 0:
                    # Default to code 'A'; only the first match is taken
                    return [{
                        'code': 'A',
                        'percent': rate,
                        'amount': vat
                    }]
            except (ValueError, InvalidOperation):
                continue

        return []

    def extract_payment_methods(self, text: str) -> List[dict]:
        """
        Extract SOCAR-specific payment methods.

        Gas stations use "CARTE CREDIT" or "CARD" for card payments.
        The reported amount is always the value of ``self.extract_total``.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of payment methods with method, amount, and confidence
            (at most one entry; empty when no total could be extracted)
        """
        upper_text = text.upper()

        # Get total amount first; without it no payment is reported
        total, _total_confidence = self.extract_total(text)
        if not total:
            return []

        # (keyword, method, confidence) - checked in this order, first hit wins
        keyword_table = (
            ('CARTE CREDIT', 'CARD', 0.98),
            ('CARTE DE CREDIT', 'CARD', 0.98),
            ('CARD', 'CARD', 0.95),
            ('VISA', 'CARD', 0.95),
            ('MASTERCARD', 'CARD', 0.95),
            ('CONTACTLESS', 'CARD', 0.90),
            ('NUMERAR', 'NUMERAR', 0.95),
            ('CASH', 'NUMERAR', 0.90),
        )

        for keyword, method, confidence in keyword_table:
            if keyword not in upper_text:
                continue
            return [{
                'method': method,
                'amount': total,
                'confidence': confidence
            }]

        # Fallback: If no explicit payment but has BON FISCAL, assume CARD
        if 'BON FISCAL' in upper_text:
            return [{
                'method': 'CARD',
                'amount': total,
                'confidence': 0.70
            }]

        return []

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return the static validation hints for SOCAR receipts."""
        hints: Dict[str, Any] = dict(
            has_multi_rate_tva=False,
            card_equals_total=True,  # Gas station: card equals total
            has_client_cui=True,
            has_efactura=False,
            is_non_vat_payer=False,
        )
        return hints
|
||||
@@ -1,204 +0,0 @@
|
||||
"""
|
||||
STEPOUT MARKET SRL store profile for OCR extraction.
|
||||
|
||||
Bookstore with reduced TVA rate (5% for books in Romania).
|
||||
|
||||
Receipt structure:
|
||||
- TVA format: "5.00% TUA*B" with amount on next line
|
||||
- Total format: "SUMA TOTALA:" with amount on next line
|
||||
- Payment: "CARD" with amount on next line
|
||||
- Client CUI: "CIF CLIENT:XXXXXXX"
|
||||
"""
|
||||
|
||||
import re
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from typing import List, Dict, Any, Tuple, Optional
|
||||
|
||||
from .base import BaseStoreProfile
|
||||
from . import ProfileRegistry
|
||||
|
||||
|
||||
@ProfileRegistry.register
class StepoutMarketProfile(BaseStoreProfile):
    """
    STEPOUT MARKET SRL - reduced TVA rate profile (books).

    Key characteristics:
    - Reduced TVA rate: 5% for books (cărți qualification in Romania)
    - TVA format: "X.XX% TUA*B" (OCR reads TVA as TUA)
    - Multiline format for amounts (label on one line, value on the next)
    - CARD payment typical
    """

    CUI_LIST = ["35532655"]
    NAME_PATTERNS = ["STEPOUT", "STEPOUT MARKET", "STEP0UT", "STEPUUT", "STEPOUT MARKET SRL"]
    STORE_NAME = "STEPOUT MARKET SRL"

    # TVA patterns for Stepout (handles TUA OCR error and multiline)
    TVA_PATTERNS = [
        # "5.00% TUA*B" - OCR format with TUA (groups: percent, code)
        r'(\d{1,2})[.,]\d{0,2}\s*%\s*T[UV]A\*?([A-D])',
        # "TVA A: 5% = YY,YY" or "TVA-A 5% YY,YY" (inline format)
        r'TVA\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
        # "TOTAL TUA:" with amount on next line
        r'TOTAL\s+T[UV]A\s*:',
    ]

    # Total patterns for Stepout: label-only regexes (no capture group),
    # because the amount is read from the following line.
    # NOTE(review): if the base-class extract_total reads group(1) from
    # self.TOTAL_PATTERNS, the super() fallback below cannot use these
    # patterns - confirm against BaseStoreProfile.
    TOTAL_PATTERNS = [
        # "SUMA TOTALA:" with amount on next line
        (r'SUMA\s+TOTALA\s*:', 0.98),
        # "TOTAL:" fallback
        (r'TOTAL\s*:', 0.90),
    ]

    def _positive_amount(self, raw: str) -> Optional[Decimal]:
        """Parse ``raw`` and return it only when it is a positive amount."""
        try:
            amount = self._parse_decimal(raw)
            if amount and amount > 0:
                return amount
        except (ValueError, InvalidOperation):
            # Same guard as the sibling store profiles: a line that is not
            # a number simply carries no amount.
            pass
        return None

    def _amount_on_next_line(self, lines: List[str], index: int) -> Optional[Decimal]:
        """Return the positive amount on the line after ``lines[index]``, if any."""
        if index + 1 >= len(lines):
            return None
        return self._positive_amount(lines[index + 1].strip())

    def extract_total(self, text: str) -> Tuple[Optional[Decimal], float]:
        """
        Extract total amount from Stepout Market receipt.

        Format: "SUMA TOTALA:" on one line, amount on next line.
        Patterns are tried in ``TOTAL_PATTERNS`` order; for each one, every
        matching line is checked until a positive amount follows it.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (total_amount, confidence); when no label is followed by
            a positive amount, whatever the base-class extract_total returns
        """
        lines = text.upper().split('\n')

        for pattern, confidence in self.TOTAL_PATTERNS:
            for i, line in enumerate(lines):
                if re.search(pattern, line, re.IGNORECASE):
                    amount = self._amount_on_next_line(lines, i)
                    if amount is not None:
                        return (amount, confidence)

        # Fallback to base class
        return super().extract_total(text)

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Extract TVA entries from Stepout Market receipt.

        Format: "5.00% TUA*B" on one line, amount on next line.
        When no such line is followed by a positive amount, a "TOTAL TUA:"
        label is tried the same way and reported as code 'B' at 5%.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of TVA entries with code, percent, and amount
            (at most one entry)
        """
        lines = text.upper().split('\n')

        # Try "X.XX% TUA*B" format first (pattern taken from TVA_PATTERNS so
        # the class constant stays the single source of truth)
        rate_pattern = self.TVA_PATTERNS[0]
        for i, line in enumerate(lines):
            match = re.search(rate_pattern, line)
            if not match:
                continue
            amount = self._amount_on_next_line(lines, i)
            if amount is not None:
                # Single rate store: the first valid entry is the answer
                return [{
                    'code': match.group(2),
                    'percent': int(match.group(1)),
                    'amount': amount
                }]

        # Try "TOTAL TUA:" format
        total_tva_pattern = self.TVA_PATTERNS[2]
        for i, line in enumerate(lines):
            if not re.search(total_tva_pattern, line):
                continue
            amount = self._amount_on_next_line(lines, i)
            if amount is not None:
                return [{
                    'code': 'B',  # Books are usually code B (5%)
                    'percent': 5,
                    'amount': amount
                }]

        return []

    def extract_payment_methods(self, text: str) -> List[dict]:
        """
        Extract payment methods from Stepout Market receipt.

        Format: "CARD" on one line, amount on next line. A line that is
        exactly "NUMERAR", or that contains "CASH", is handled the same way
        and reported as NUMERAR. If neither yields an amount, an inline
        "CARD: <amount>" is accepted with lower confidence.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of payment methods with method, amount, and confidence
            (at most one entry)
        """
        lines = text.upper().split('\n')

        # Find CARD or NUMERAR keyword with the amount on the next line
        for i, line in enumerate(lines):
            keyword = line.strip()
            if keyword == 'CARD':
                method = 'CARD'
            elif keyword == 'NUMERAR' or 'CASH' in keyword:
                method = 'NUMERAR'
            else:
                continue

            amount = self._amount_on_next_line(lines, i)
            if amount is not None:
                return [{
                    'method': method,
                    'amount': amount,
                    'confidence': 0.95
                }]

        # Fallback: check for inline CARD amount
        for line in lines:
            match = re.search(r'CARD\s*:?\s*([\d.,]+)', line)
            if not match:
                continue
            amount = self._positive_amount(match.group(1))
            if amount is not None:
                return [{
                    'method': 'CARD',
                    'amount': amount,
                    'confidence': 0.90
                }]

        return []

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return STEPOUT MARKET-specific validation hints."""
        return {
            "has_multi_rate_tva": False,
            "card_equals_total": True,
            "has_client_cui": True,
            "has_efactura": False,
            "is_non_vat_payer": False,
            "typical_tva_rate": 5,  # Books have 5% TVA in Romania
            "product_category": "books",
            "tva_on_separate_line": True,
        }
|
||||
@@ -1,269 +0,0 @@
|
||||
"""
|
||||
UNLIMITED KEYS S.R.L. store profile for OCR extraction.
|
||||
|
||||
Key duplication service. Notable for CASH (NUMERAR) payments.
|
||||
"""
|
||||
|
||||
import re
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
|
||||
from .base import BaseStoreProfile
|
||||
from . import ProfileRegistry
|
||||
|
||||
|
||||
@ProfileRegistry.register
class UnlimitedKeysProfile(BaseStoreProfile):
    """
    UNLIMITED KEYS S.R.L. - standard TVA profile with NUMERAR payment.

    Key characteristics:
    - Standard TVA format (single rate, any percentage)
    - Key duplication service
    - NUMERAR (cash) payment common - different from most stores!
    - May also accept CARD
    - OCR often reads "TVA" as "TUA" - need OCR error variants
    """

    CUI_LIST = ["18993187"]
    NAME_PATTERNS = ["UNLIMITED KEYS", "UNLIMITED", "UNL1MITED", "UNLIMITED KEYS SRL"]
    STORE_NAME = "UNLIMITED KEYS S.R.L."

    # Capture group for an amount that may contain OCR spaces ("80 .00").
    # Only spaces/tabs are allowed INSIDE the number: a bare \s would also
    # match newlines and glue digits from the next receipt line onto the
    # amount (e.g. "NUMERAR 80.00\n1 BUC" would be read as 80.001).
    _AMOUNT = r'([\d.,]+(?:[ \t]+[\d.,]+)*)'

    # Standard TVA patterns - including OCR error variants (TVA -> TUA)
    TVA_PATTERNS = [
        # "TVA A: XX% = YY,YY" or "TVA-A XX% YY,YY" (including TUA OCR error)
        r'T[UV]A\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*' + _AMOUNT,
        # "A - XX,XX% = YY,YY"
        r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*' + _AMOUNT,
        # "TVA XX% YY,YY" (simple format, includes TUA)
        r'T[UV]A\s+(\d{1,2})\s*%\s*' + _AMOUNT,
        # "XX.XX% TUA*A YY.YY" (OCR format with TUA*A or TUA)
        r'(\d{1,2})[.,]\d{0,2}\s*%\s*T[UV]A\*?[A-D]?\s*' + _AMOUNT,
        # "TOTAL TUA: YY.YY" (total TVA amount only)
        r'TOTAL\s+T[UV]A\s*:?\s*' + _AMOUNT,
    ]

    # TOTAL patterns for UNLIMITED KEYS (handles "80 .00" format)
    TOTAL_PATTERNS = [
        # "SUMA TOTALA: 80 .00" (with space before decimal)
        (r'SUMA\s+TOTALA\s*:?\s*' + _AMOUNT, 0.98),
        # "TOTALA: 80,00"
        (r'TOTALA\s*:?\s*([\d.,]+)', 0.95),
        # Standard TOTAL patterns from base class
        (r'TOTAL\s+(?:DE\s+PLATA|ACHITAT|LEI)\s*:?\s*([\d.,]+)', 0.95),
        (r'TOTAL\s*:?\s*([\d.,]+)', 0.90),
    ]

    # Payment patterns - NUMERAR is primary for this store
    PAYMENT_PATTERNS = [
        # "NUMERAR 80.00" or "NUMERAR: 80.00"
        (r'NUMERAR\s*:?\s*' + _AMOUNT, 'NUMERAR', 0.98),
        # "CARD 80.00" or "CARD: 80.00"
        (r'CARD\s*:?\s*' + _AMOUNT, 'CARD', 0.95),
    ]

    # Client CUI patterns - specific to this receipt format
    CLIENT_CUI_PATTERNS = [
        # "CIF CLIENT:1879855" (exact format from OCR)
        (r'CIF\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.99),
        # "CLIENT CIF: ROXXXXXXX"
        (r'CLIENT\s+CIF\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
        # "C.I.F. CLIENT: XXXXXXX"
        (r'C\.?I\.?F\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
    ]

    # Override client markers to be less strict
    CLIENT_MARKERS = [
        r'CIF\s+CLIENT',
        r'CLIENT\s+CIF',
        r'C\.?I\.?F\.?\s+CLIENT',
        r'CLIENT\s*:',
    ]

    def extract_total(self, text: str) -> Tuple[Optional[Decimal], float]:
        """
        Extract total amount from receipt text.

        Handles UNLIMITED KEYS format with space before decimal (e.g., "80 .00").
        ``TOTAL_PATTERNS`` are tried in order; the first one whose captured
        amount parses to a positive value wins.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (total_amount, confidence) or (None, 0.0)
        """
        text_upper = text.upper()

        for pattern, confidence in self.TOTAL_PATTERNS:
            match = re.search(pattern, text_upper, re.IGNORECASE)
            if not match:
                continue
            try:
                # Remove spaces that might appear before the decimal point
                amount_str = re.sub(r'\s+', '', match.group(1))
                amount = self._parse_decimal(amount_str)

                if amount and amount > 0:
                    return (amount, confidence)
            except (ValueError, InvalidOperation):
                continue

        return (None, 0.0)

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Extract TVA entries from receipt text.

        Handles OCR errors where TVA is read as TUA. Formats are tried in
        this order and the first positive amount is returned on its own:
        "XX.XX% TUA*A YY.YY", then "TOTAL TUA: YY.YY", then the coded /
        simple patterns (``TVA_PATTERNS[:3]``).

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of TVA entries with code, percent, and amount
            (at most one entry)
        """
        entries = []
        text_upper = text.upper()

        # Pattern 4: "XX.XX% TUA*A YY.YY" - common OCR format.
        # The pattern does not capture the rate letter, so 'A' is reported.
        match = re.search(self.TVA_PATTERNS[3], text_upper)
        if match:
            try:
                percent = int(match.group(1))
                amount_str = re.sub(r'\s+', '', match.group(2))
                amount = self._parse_decimal(amount_str)
                if amount and amount > 0:
                    entries.append({
                        'code': 'A',
                        'percent': percent,
                        'amount': amount
                    })
                    return entries
            except (ValueError, InvalidOperation, IndexError):
                pass

        # Pattern 5: "TOTAL TUA: YY.YY" - fallback to total TVA
        match = re.search(self.TVA_PATTERNS[4], text_upper)
        if match:
            try:
                amount_str = re.sub(r'\s+', '', match.group(1))
                amount = self._parse_decimal(amount_str)
                if amount and amount > 0:
                    # The rate is not printed on this line and is NOT
                    # inferred here: the standard rate is assumed.
                    entries.append({
                        'code': 'A',
                        'percent': 19,  # Standard Romanian TVA rate
                        'amount': amount
                    })
                    return entries
            except (ValueError, InvalidOperation, IndexError):
                pass

        # Try coded patterns (indexes 0-1 have 3 groups, index 2 has 2)
        for pattern in self.TVA_PATTERNS[:3]:
            for match in re.finditer(pattern, text_upper, re.IGNORECASE):
                try:
                    groups = match.groups()
                    if len(groups) == 3:
                        code = groups[0].upper()
                        percent = int(groups[1])
                        amount_str = re.sub(r'\s+', '', groups[2])
                    else:
                        code = 'A'
                        percent = int(groups[0])
                        amount_str = re.sub(r'\s+', '', groups[1])

                    amount = self._parse_decimal(amount_str)
                    if amount and amount > 0:
                        entries.append({
                            'code': code,
                            'percent': percent,
                            'amount': amount
                        })
                        return entries
                except (ValueError, InvalidOperation, IndexError):
                    continue

        return entries

    def extract_payment_methods(self, text: str) -> List[dict]:
        """
        Extract payment methods from receipt text.

        Handles NUMERAR (cash) as primary payment for this store. Every
        pattern in ``PAYMENT_PATTERNS`` contributes at most one entry (its
        first match), so a receipt may report both NUMERAR and CARD.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List of payment methods with method, amount, and confidence
        """
        payments = []
        text_upper = text.upper()

        for pattern, method, confidence in self.PAYMENT_PATTERNS:
            match = re.search(pattern, text_upper, re.IGNORECASE)
            if not match:
                continue
            try:
                amount_str = re.sub(r'\s+', '', match.group(1))
                amount = self._parse_decimal(amount_str)

                if amount and amount > 0:
                    payments.append({
                        'method': method,
                        'amount': amount,
                        'confidence': confidence
                    })
            except (ValueError, InvalidOperation):
                continue

        return payments

    def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
        """
        Extract client CUI from receipt text.

        Handles "CIF CLIENT:1879855" format specific to this store. The text
        is searched only when one of ``CLIENT_MARKERS`` matches; the first
        ``CLIENT_CUI_PATTERNS`` match yielding 6-10 digits is returned.

        Args:
            text: Raw OCR text from receipt

        Returns:
            Tuple of (cui, confidence) or (None, 0.0)
        """
        text_upper = text.upper()

        # Check for client markers
        has_client = any(
            re.search(marker, text_upper, re.IGNORECASE)
            for marker in self.CLIENT_MARKERS
        )
        if not has_client:
            return (None, 0.0)

        # Try client CUI patterns
        for pattern, confidence in self.CLIENT_CUI_PATTERNS:
            match = re.search(pattern, text_upper, re.IGNORECASE)
            if not match:
                continue
            # Strip the country prefix BEFORE keeping digits only: the
            # patterns accept "R0" (OCR of "RO"), and keeping digits alone
            # would turn "R01879855" into "01879855" instead of "1879855".
            candidate = re.sub(r'^\s*R[O0]?\s*', '', match.group(1))
            cui_digits = re.sub(r'[^0-9]', '', candidate)
            if 6 <= len(cui_digits) <= 10:
                return (cui_digits, confidence)

        return (None, 0.0)

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return UNLIMITED KEYS-specific validation hints."""
        return {
            "has_multi_rate_tva": False,
            "card_equals_total": False,  # May be NUMERAR (cash)
            "has_client_cui": True,  # May have client CUI
            "has_efactura": False,
            "is_non_vat_payer": False,
            "common_payment": "NUMERAR",  # Cash payments common
        }
|
||||
@@ -576,6 +576,7 @@ class ReceiptExtractor:
|
||||
print(f"[TVA Reverse Validation] {msg}", flush=True)
|
||||
|
||||
# Cross-validate amount using payment methods and TVA
|
||||
original_amount = result.amount
|
||||
validated_amount, validated_confidence, source = self._cross_validate_and_calculate_amount(
|
||||
result.amount,
|
||||
result.confidence_amount,
|
||||
@@ -583,8 +584,38 @@ class ReceiptExtractor:
|
||||
result.tva_entries,
|
||||
result.tva_total
|
||||
)
|
||||
if validated_amount != result.amount:
|
||||
print(f"[Cross-Validation] Amount updated: {result.amount} -> {validated_amount} (source: {source})", flush=True)
|
||||
|
||||
# Add validation warnings when TOTAL is calculated (not directly extracted)
|
||||
if 'calculated from TVA' in source:
|
||||
warning_msg = f"TOTAL ({validated_amount}) calculat din TVA (nu a fost extras direct din bon)"
|
||||
result.validation_warnings.append(warning_msg)
|
||||
print(f"[Cross-Validation] ⚠️ {warning_msg}", flush=True)
|
||||
|
||||
# Add comparison if original was different
|
||||
if original_amount and original_amount != validated_amount:
|
||||
diff = abs(float(validated_amount) - float(original_amount))
|
||||
result.validation_warnings.append(
|
||||
f"TOTAL extras ({original_amount}) diferă de cel calculat ({validated_amount}) cu {diff:.2f} RON"
|
||||
)
|
||||
|
||||
elif 'calculated from payment methods' in source:
|
||||
warning_msg = f"TOTAL ({validated_amount}) calculat din suma metodelor de plată (nu a fost extras direct)"
|
||||
result.validation_warnings.append(warning_msg)
|
||||
print(f"[Cross-Validation] ⚠️ {warning_msg}", flush=True)
|
||||
|
||||
if original_amount and original_amount != validated_amount:
|
||||
diff = abs(float(validated_amount) - float(original_amount))
|
||||
result.validation_warnings.append(
|
||||
f"TOTAL extras ({original_amount}) diferă de suma plăților ({validated_amount}) cu {diff:.2f} RON"
|
||||
)
|
||||
|
||||
elif source == 'not found':
|
||||
result.validation_warnings.append("TOTAL nu a fost detectat și nu a putut fi calculat")
|
||||
print("[Cross-Validation] ⚠️ TOTAL nu a fost detectat", flush=True)
|
||||
|
||||
elif validated_amount != original_amount:
|
||||
print(f"[Cross-Validation] Amount updated: {original_amount} -> {validated_amount} (source: {source})", flush=True)
|
||||
|
||||
result.amount = validated_amount
|
||||
result.confidence_amount = validated_confidence
|
||||
|
||||
|
||||
Reference in New Issue
Block a user