New unified receipt creation system with: - UnifiedReceiptForm component with inline OCR preview and confidence indicators - Compact upload zone with drag-drop and camera support - TVA and Payment fields with dynamic add/remove - Supplier dual-field with autocomplete and OCR hint - Receipt form sections with collapsible auxiliary data Backend OCR improvements: - Add confidence_tva and confidence_payment to extraction results - Update TVA extraction to return confidence scores - Include TVA (15%) and payment (10%) in overall_confidence calculation Also includes: - CSS design system rules documentation - Port check helper function for service scripts - Expanded design tokens documentation in CLAUDE.md Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1123 lines
49 KiB
Python
1123 lines
49 KiB
Python
"""
|
|
Base class for store-specific OCR extraction profiles.
|
|
|
|
Each store can have different receipt formats (TVA layout, total position, etc.).
|
|
Store profiles allow customizing extraction logic per-store for better accuracy.
|
|
|
|
Usage:
|
|
from .base import BaseStoreProfile
|
|
from . import ProfileRegistry
|
|
|
|
@ProfileRegistry.register
|
|
class LidlProfile(BaseStoreProfile):
|
|
CUI_LIST = ["22891860"]
|
|
NAME_PATTERNS = ["LIDL", "LDL"]
|
|
|
|
def extract_tva_entries(self, text: str) -> Tuple[List[dict], float]:
|
|
# Custom Lidl TVA extraction logic
|
|
# Returns (entries_list, confidence_score)
|
|
...
|
|
"""
|
|
|
|
import re
|
|
from abc import ABC
|
|
from decimal import Decimal, InvalidOperation
|
|
from typing import List, Optional, Tuple, Dict, Any
|
|
from datetime import date
|
|
|
|
|
|
class BaseStoreProfile(ABC):
|
|
"""
|
|
Abstract base class for store-specific extraction profiles.
|
|
|
|
Each profile defines:
|
|
- CUI_LIST: CUI codes that identify this store (without RO prefix)
|
|
- NAME_PATTERNS: OCR-tolerant name patterns for fallback matching
|
|
- Custom extraction methods for TVA, total, date, etc.
|
|
|
|
The ProfileRegistry uses CUI_LIST to lookup profiles during extraction.
|
|
"""
|
|
|
|
# -------------------------------------------------------------------------
|
|
# Class attributes - override in subclasses
|
|
# -------------------------------------------------------------------------
|
|
|
|
# List of CUI codes (without RO prefix) that identify this store
|
|
CUI_LIST: List[str] = []
|
|
|
|
# OCR-tolerant name patterns for fallback matching
|
|
NAME_PATTERNS: List[str] = []
|
|
|
|
# Store display name
|
|
STORE_NAME: str = "Unknown Store"
|
|
|
|
# Flag for known non-VAT payer stores (skips TVA extraction)
|
|
IS_NON_VAT_PAYER: bool = False
|
|
|
|
# -------------------------------------------------------------------------
|
|
# Generic patterns - can be overridden in subclasses
|
|
# -------------------------------------------------------------------------
|
|
|
|
# Total amount patterns (confidence-weighted)
|
|
TOTAL_PATTERNS = [
|
|
(r'T[O0]TAL[.\s]+L[E3][I1!]\s*:?\s*([\d\s.,]+)', 0.98),
|
|
(r'TOTAL\s+LEI\s*([\d\s.,]+)', 0.98),
|
|
(r'[OT]?OTAL\s+LEI\s*([\d\s.,]+)', 0.95),
|
|
(r'TOTAL\s*:?\s*([\d\s.,]+)\s*(?:RON|LEI)?', 0.95),
|
|
(r'TOTAL\s+(?:RON|LEI)\s*([\d\s.,]+)', 0.95),
|
|
(r'SUBTOTAL\s*([\d\s.,]+)', 0.90),
|
|
(r'DE\s+PLATA\s*:?\s*([\d\s.,]+)', 0.90),
|
|
(r'SUMA\s*:?\s*([\d\s.,]+)', 0.85),
|
|
]
|
|
|
|
# Date patterns (confidence-weighted)
|
|
DATE_PATTERNS = [
|
|
(r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
|
|
(r'DATA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
|
|
(r'(\d{2}[-./]\d{2}[-./]\d{4})\s+[O0]RA\s*:?\s*\d{2}:\d{2}', 0.95),
|
|
(r'(\d{2}[-./]\d{2}[-./]\d{4})\s+\d{2}:\d{2}', 0.90),
|
|
(r'(\d{2}[-./]\d{2}[-./]\d{4})', 0.80),
|
|
(r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75),
|
|
]
|
|
|
|
# Date patterns with OCR-introduced spaces (separate because format is different)
|
|
DATE_PATTERNS_OCR_SPACES = [
|
|
(r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.92, 'ymd'),
|
|
(r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.85, 'ymd'),
|
|
(r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
|
|
(r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
|
|
]
|
|
|
|
# Receipt number patterns (confidence-weighted)
|
|
NUMBER_PATTERNS = [
|
|
(r'NDS\s*:?\s*(\d+)', 0.98),
|
|
(r'C3POS[-A-Z0-9]*[N:](\d{6,7})', 0.98),
|
|
(r'C3POS.*?(\d{6,7})\b', 0.95),
|
|
(r'BF\s*:\s*(\d{4,})', 0.96),
|
|
(r'BF\s+(\d{4,})', 0.93),
|
|
(r'NIVS\s*:?\s*(\d+)', 0.95),
|
|
(r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95),
|
|
(r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95),
|
|
(r'CHITANTA\s+NR\.?\s*:?\s*(\d+)', 0.95),
|
|
(r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90),
|
|
(r'ID\s*BF\s*:?\s*(\d+)', 0.90),
|
|
]
|
|
|
|
# Payment method patterns (pattern, method_type, confidence)
|
|
# Handles ALL payment types: CARD, NUMERAR, and card brand names
|
|
PAYMENT_PATTERNS = [
|
|
# CARTE CREDIT variants (OMV/Petrom/Socar receipts)
|
|
(r'CARTE\s+CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.98),
|
|
(r'CARTE\s+CREDIT\s*:?\s*\n\s*([\d\s.,]+)', 'CARD', 0.97),
|
|
(r'CARTE\s+DE\s+CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.98),
|
|
(r'CARTE\s+DE\s+CREDIT\s*:?\s*\n\s*([\d\s.,]+)', 'CARD', 0.97),
|
|
# CARD standard
|
|
(r'(?:PLATA\s+)?CARD\s*[:\sA-Z]?\s*([\d\s.,]+)', 'CARD', 0.95),
|
|
# Card brand names
|
|
(r'VISA\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
|
|
(r'MASTERCARD\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
|
|
(r'MAESTR[O0]\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
|
|
(r'CONTACTLESS\s*:?\s*([\d\s.,]+)', 'CARD', 0.90),
|
|
(r'DEBIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.90),
|
|
(r'CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.88),
|
|
# Cash variants
|
|
(r'NUMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.95),
|
|
(r'CASH\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.90),
|
|
# Truncation recovery patterns (for OCR left-margin issues)
|
|
(r'(?:^|\n|\s)RD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.70),
|
|
(r'(?:^|\n|\s)ARD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.75),
|
|
(r'(?:^|\n|\s)MERAR\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'NUMERAR', 0.70),
|
|
]
|
|
|
|
# Client section markers (for B2B receipts) - More flexible patterns
|
|
# Includes OCR corruption variants (LIENT, C IENT, L IENT)
|
|
CLIENT_MARKERS = [
|
|
r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT', # "CIF CLIENT" (with or without colon)
|
|
r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT', # "CUI CLIENT"
|
|
r'CLIENT\s+C\.?\s*[UI1]\.?\s*[IF1]', # "CLIENT CIF" / "CLIENT CUI"
|
|
r'CLIENT\s*:', # "CLIENT:"
|
|
r'CUMPARATOR\s*:', # "CUMPARATOR:"
|
|
r'BENEFICIAR\s*:', # "BENEFICIAR:"
|
|
r'CUMP[AĂ]R[AĂ]TOR', # "CUMPARATOR" without colon
|
|
r'COD\s+FISCAL\s+CLIENT', # "COD FISCAL CLIENT"
|
|
# OCR corruption patterns
|
|
r'C[I1]F\s+[A-Z\s]{0,6}IENT\s*:', # "CIF a IENT:", "CIF CL IENT:", "CIF L IENT:"
|
|
r'C[I1]F\s+LIENT\s*:', # "CIF LIENT:" (missing C)
|
|
r'LIENT\s*:', # "LIENT:" (missing C and I/L)
|
|
# Brick-specific (I→L OCR error)
|
|
r'CLIENT\s+C\.?U\.?[LI1]\.?\s*/', # "CLIENT C.U.L./" (I read as L)
|
|
]
|
|
|
|
# Client CUI patterns (pattern, confidence) - Comprehensive
|
|
# Handles: docTR reordering, doubled letters, corruption, CUMPARATOR, Brick L/I swap
|
|
CLIENT_CUI_PATTERNS = [
|
|
# === CUI on line BEFORE CLIENT marker (docTR/OCR reordering) ===
|
|
(r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*U\.?\s*[I1]\.?', 0.99),
|
|
(r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*[I1]\.?\s*F\.?', 0.99),
|
|
(r'(R[O0]\d{6,10})\s*:?\s*\n\s*CLIENT', 0.98),
|
|
# === "CIF I CLIENT:" format (OCR extra chars) ===
|
|
(r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]\d{6,10})', 0.98),
|
|
(r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.97),
|
|
# === CIF CLIENT: (reversed - CIF before CLIENT) ===
|
|
(r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
|
|
(r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
|
|
(r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
|
|
(r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
|
|
# === CLIENT C.U.I/C.I.F. (slash variants) ===
|
|
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.97),
|
|
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.97),
|
|
# === Doubled letters (docTR artifact: "C.U U.I") ===
|
|
(r'CLIENT\s+C\.?\s*U\.?\s*U?\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
|
|
(r'CLIENT\s+C\.?\s*U\.?\s*U?\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
|
|
# === CLIENT C.U.I. or CLIENT CUI (without slash) ===
|
|
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
|
|
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
|
|
(r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
|
|
(r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
|
|
# === Corrupted CLIENT after CIF (OCR errors) ===
|
|
(r'CIF\s+[a-zA-Z\s]{2,8}IENT\s*:?\s*(R[O0]?\d{6,10})', 0.93),
|
|
(r'CIF\s+[a-zA-Z\s]{2,8}IENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
|
|
(r'CIF\s+LIENT\s*:?\s*(R[O0]?\d{6,10})', 0.92),
|
|
(r'CIF\s+LIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.92),
|
|
# === CUMPARATOR variants ===
|
|
(r'CUMPARATOR\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
|
|
(r'CUMPARATOR\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
|
|
# CUMPARATOR with CUI/CIF on next line
|
|
(r'CUMPARATOR\s*:.*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
|
|
(r'CUMPARATOR\s*:.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
|
|
# CUMPARATOR with CUI/CIF two lines down
|
|
(r'CUMPARATOR\s*:.*\n.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
|
|
# === CLIENT on next line ===
|
|
(r'CLIENT\s*:\s*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
|
|
(r'CLIENT\s*:.*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
|
|
# === Standard fallback patterns ===
|
|
(r'CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.90),
|
|
(r'COD\s+FISCAL\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.95),
|
|
# === Brick-specific (I→L OCR error) ===
|
|
# Matches: "CLIENT C.U.L./C.IF. :R01879855"
|
|
(r'CLIENT\s+C\.?U\.?[LI1]\.?\s*/\s*C\.?[LI1]\.?F\.?\s*:?\s*(R?O?\d{6,10})', 0.99),
|
|
]
|
|
|
|
# Company type indicators (for identifying company names)
|
|
COMPANY_INDICATORS = [
|
|
r'\bS\.?\s*R\.?\s*L\.?\b', # S.R.L. or S. R. L.
|
|
r'\bS\.?\s*A\.?\b', # S.A. or S. A.
|
|
r'\bS\.?\s*N\.?\s*C\.?\b', # S.N.C. or S. N. C.
|
|
r'\bS\.?\s*C\.?\s*S\.?\b', # S.C.S. or S. C. S.
|
|
r'\bI\.?\s*I\.?\b', # I.I. or I. I.
|
|
r'\bP\.?\s*F\.?\s*A\.?\b', # P.F.A. or P. F. A.
|
|
r'\bS\.?\s*C\.?\s+[A-Z]', # S.C. followed by company name
|
|
r'HOLDING',
|
|
r'COMPANY',
|
|
r'GROUP',
|
|
]
|
|
|
|
# Maximum reasonable payment amount (to filter OCR errors)
|
|
MAX_PAYMENT = Decimal('100000')
|
|
|
|
# -------------------------------------------------------------------------
|
|
# TVA (VAT) patterns - ALL FORMATS unified
|
|
# OCR tolerant: T[VU][AR] matches TVA, TUA, TVR
|
|
# -------------------------------------------------------------------------
|
|
TVA_PATTERNS = [
|
|
# === FORMAT 1: INLINE with code and percent (Lidl-style) ===
|
|
# "TVA A 21,00% 7.71" or "TVA B 11,00% 2.13"
|
|
(r'T[VU][AR]\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)', 0.98, 'inline'),
|
|
(r'T[VU][AR]\s+([A-D])\s+\\?(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)', 0.97, 'inline'),
|
|
(r'IVA\s+([A-D])\s+(\d{1,2})[.,]?\d{0,2}\s*%\s+([\d.,]+)', 0.95, 'inline'),
|
|
|
|
# === FORMAT 2: REVERSED (Stepout-style) ===
|
|
# "5.00% TUA*B" - percent BEFORE the TVA marker
|
|
(r'(\d{1,2})[.,]\d{0,2}\s*%\s*T[UV]A\*?([A-D])', 0.97, 'reversed'),
|
|
|
|
# === FORMAT 3: TABLE (OMV-style) ===
|
|
# "A-21,00% 285,66 49,58" (code-percent, base amount, TVA amount)
|
|
(r'([A-D])\s*[-:]\s*(\d{1,2})[.,\s]*\d{0,2}\s*%\s+([\d.,\s]+)\s+([\d.,\s]+)', 0.96, 'table'),
|
|
(r'TOTAL\s+TAXE\s*:?\s*([\d.,\s]+)', 0.95, 'taxe'),
|
|
|
|
# === FORMAT 4: MULTILINE (Brick/Electrobering) ===
|
|
# "TOTAL TVA A - 19%" on one line, amount on the next line
|
|
(r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%', 0.96, 'multiline'),
|
|
(r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%', 0.95, 'multiline'),
|
|
(r'TOTAL\s+T[VU][AR]\s*([A-D])?\s*[-:]?\s*(\d{1,2})\s*%', 0.94, 'multiline'),
|
|
|
|
# === FORMAT 5: STANDARD (from the extractor) ===
|
|
(r'TOTAL\s+(?:T[VU][AR]|IVA)\s+BON\s*:?\s*([\d\s.,]+)', 0.98, 'bon'),
|
|
(r'T[O0]TAL\s+(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.95, 'standard'),
|
|
(r'TOTAL\s+IVA\s*:?\s*([\d\s.,]+)', 0.95, 'standard'),
|
|
(r'T[VU][AR]\s+(?:A\s*[-:]?\s*)?(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.95, 'percent'),
|
|
(r'T[VU][AR]\s+[A-Z]\s*[-:]\s*(\d{1,2})\s*%\s*([\d\s.,]+)', 0.93, 'percent'),
|
|
(r'T[VU][AR]\s*[C5]\s*[-:]\s*5\s*%\s*:?\s*([\d\s.,]+)', 0.93, 'books'),
|
|
(r'(?:T[VU][AR]|IVA)\s+5\s*%\s*:?\s*([\d\s.,]+)', 0.92, 'books'),
|
|
|
|
# === FORMAT 6: CODED inline (cu code A-D) ===
|
|
(r'T[UV]A\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,\s]+)', 0.95, 'coded'),
|
|
(r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,\s]+)', 0.93, 'coded'),
|
|
|
|
# === FALLBACK patterns ===
|
|
(r'T[O0]T[AE]L\s+(?:T[VUAI]+[AR]?|IVA)\s*:?\s*([\d\s.,]+)', 0.88, 'fallback'),
|
|
(r'TOTAL\s+TA\s*[F\s]?\s*\d*\s*:?\s*([\d\s.,]+)', 0.85, 'fallback'),
|
|
(r'(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.85, 'fallback'),
|
|
(r'TOTAL\s+T[UV]A\s*:?\s*([\d.,\s]+)', 0.90, 'standard'),
|
|
]
|
|
|
|
# Non-VAT payer patterns - NEPLATITOR DE TVA
|
|
# OCR variants: NEPLATTOR, NEPLATITOR, NEPLATOR, ANEPLATHTOR, MEPLATITOR
|
|
NON_VAT_PATTERNS = [
|
|
r'NEPLAT\w*OR', # NEPLATITOR, NEPLATTOR, NEPLATOR
|
|
r'[ANM]EPLAT\w*O?R', # OCR errors: ANEPLATHTOR, MEPLATITOR
|
|
r'TOTAL\s+NEPLAT', # TOTAL NEPLATITOR...
|
|
r'TOTAL\s+[ANM]EPLAT', # TOTAL ANEPLAT... (OCR error)
|
|
r'SCUTIT\s*(?:DE\s+)?T[VU]A', # SCUTIT DE TVA
|
|
r'NEPLAT\w*\s+T[VU]A', # NEPLATITOR TVA
|
|
r'NEPLAT\w*\s+DE\s+T', # NEPLATITOR DE T... (truncated)
|
|
]
|
|
|
|
# CUI (fiscal code) patterns - VENDOR CUI (exclude CLIENT)
|
|
# OCR errors: R0 instead of RO, C1F instead of CIF
|
|
CUI_PATTERNS = [
|
|
# CIF at start of line (definitely vendor)
|
|
(r'^CIF\s*:?\s*(R[O0]?\d{6,10})', 0.98),
|
|
(r'^CIF\s*:?\s*(\d{6,10})', 0.97),
|
|
(r'^C[I1]F\s*:?\s*(R[O0]?\d{6,10})', 0.95),
|
|
(r'^C[I1]F\s*:?\s*(\d{6,10})', 0.94),
|
|
# CIF not preceded by CLIENT (negative lookbehind)
|
|
(r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(R[O0]?\d{6,10})', 0.95),
|
|
(r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(\d{6,10})', 0.94),
|
|
# Standalone CIF with word boundary
|
|
(r'\bC[I1]F\s*:?\s*(R[O0]?\d{6,10})\b', 0.90),
|
|
(r'\bC[I1]F\s*:?\s*(\d{6,10})\b', 0.89),
|
|
# COD FISCAL (vendor)
|
|
(r'COD\s+FISCAL\s*:?\s*(R[O0]?\d{6,10})', 0.90),
|
|
(r'COD\s+FISCAL\s*:?\s*(\d{6,10})', 0.89),
|
|
# C. I. F. format with SPACES (OCR artifact)
|
|
(r'C\.\s*I\.\s*F\.?\s*[:\s]+(R[O0]?\d{6,10})', 0.92),
|
|
(r'C\.\s*I\.\s*F\.?\s*[:\s]+(\d{6,10})', 0.91),
|
|
# C.I.F. format (with dots, no spaces)
|
|
(r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.88),
|
|
(r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(\d{6,10})', 0.87),
|
|
# CUI format
|
|
(r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.85),
|
|
(r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(\d{6,10})', 0.84),
|
|
# Lidl format: "Cod Identificare fiscala" (OCR corrupted)
|
|
(r'[IC](?:od|ed)\s*Identific[a-z/]*\s*(R[O0]\d{6,10})', 0.90),
|
|
# Generic: anything with "fiscal" followed by RO + digits
|
|
(r'fiscal[a-z]*\s*:?\s*(R[O0]\d{6,10})', 0.85),
|
|
]
|
|
|
|
# CUI REVERSED format (number BEFORE label)
|
|
CUI_REVERSED_PATTERNS = [
|
|
(r'(R[O0]\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.98),
|
|
(r'(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.95),
|
|
]
|
|
|
|
# Items count patterns - NR POZ ART IN BON
|
|
ITEMS_COUNT_PATTERNS = [
|
|
(r'NR\.?\s*P?[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*(\d+)', 0.98),
|
|
(r'(\d{1,2})\s*\n\s*[O0]Z\.?\s*ART', 0.95),
|
|
(r'[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.93),
|
|
(r'NR\.?\s*(?:P?[O0]Z\.?)?\s*ART(?:ICOLE)?\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.90),
|
|
(r'ARTIC[O0]LE\s*:?\s*(\d+)', 0.88),
|
|
(r'(?:^|\s|:)P?[O0]Z\.?\s*(?:ART)?\.?\s*(?:IN\s+BON)?\s*:?\s*(\d{1,3})(?:\s|$)', 0.85),
|
|
]
|
|
|
|
# Series patterns - Romanian fiscal receipt series
|
|
SERIES_PATTERNS = [
|
|
(r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90),
|
|
(r'(?:^|\s)Z\s*:\s*(\d{4})', 0.85),
|
|
(r'(?:^|\s)BF\s*:\s*(\d{4})', 0.85),
|
|
]
|
|
|
|
# -------------------------------------------------------------------------
|
|
# Extraction methods - override in subclasses as needed
|
|
# -------------------------------------------------------------------------
|
|
|
|
def extract_tva_entries(self, text: str) -> Tuple[List[dict], float]:
    """
    Extract TVA (VAT) entries from receipt text - GENERIC implementation.

    Strategies are tried in this order and their results pooled:
    - Inline multi-rate (Lidl): "TVA A 21% 7.71"
    - Reversed (Stepout): "5.00% TUA*B"
    - Multiline: "TOTAL TVA A - 19%" + amount on next line
    - Table (OMV): "A-21,00% 285,66 49,58"
    - Standard/fallback patterns, only when nothing above produced a candidate

    Args:
        text: Raw OCR text from receipt

    Returns:
        Tuple of (entries, confidence). ``entries`` holds the candidate dicts
        de-duplicated on (code, percent) - first occurrence wins - with
        missing or non-positive amounts dropped. ``confidence`` is the
        highest score reported by any strategy that ran, or 0.0 when no
        entry survives. Non-VAT payers always yield ([], 0.0).
    """
    upper_text = text.upper()

    # Non-VAT payers (class flag or text markers) have no TVA lines at all.
    if self.IS_NON_VAT_PAYER or self._is_non_vat_payer(upper_text):
        return ([], 0.0)

    # Undo OCR whitespace after a decimal separator: "55, 22" -> "55.22".
    normalized = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', upper_text)
    lines = normalized.split('\n')

    # Format-specific strategies, evaluated in a fixed order.
    attempts = [
        self._try_tva_inline(normalized),
        self._try_tva_reversed(normalized, lines),
        self._try_tva_multiline(normalized, lines),
        self._try_tva_table(normalized),
    ]

    candidates = []
    best_confidence = 0.0
    for found, score in attempts:
        candidates.extend(found)
        best_confidence = max(best_confidence, score)

    # Generic patterns are a last resort only.
    if not candidates:
        found, score = self._try_tva_standard(normalized)
        candidates.extend(found)
        best_confidence = max(best_confidence, score)

    # Keep the first candidate per (code, percent) that has a positive amount.
    unique_entries = []
    seen_keys = set()
    for candidate in candidates:
        key = (candidate.get('code', 'A'), candidate.get('percent', 19))
        if key in seen_keys:
            continue
        if not candidate.get('amount') or not candidate['amount'] > 0:
            continue
        unique_entries.append(candidate)
        seen_keys.add(key)

    return (unique_entries, best_confidence if unique_entries else 0.0)
|
|
|
|
def _is_non_vat_payer(self, text: str) -> bool:
|
|
"""Check if receipt is from non-VAT payer."""
|
|
for pattern in self.NON_VAT_PATTERNS:
|
|
if re.search(pattern, text, re.IGNORECASE):
|
|
return True
|
|
return False
|
|
|
|
def _try_tva_inline(self, text: str) -> Tuple[List[dict], float]:
|
|
"""Try Lidl-style inline format: 'TVA A 21,00% 7.71'"""
|
|
entries = []
|
|
max_confidence = 0.0
|
|
# Pattern: "TVA A 21,00% 7.71" or "TVA B 11,00% 2.13"
|
|
for pattern, confidence, fmt in self.TVA_PATTERNS:
|
|
if fmt != 'inline':
|
|
continue
|
|
for match in re.finditer(pattern, text, re.IGNORECASE):
|
|
try:
|
|
groups = match.groups()
|
|
if len(groups) >= 3:
|
|
code = groups[0].upper() if groups[0] else 'A'
|
|
percent = int(groups[1])
|
|
amount = self._parse_decimal(self._clean_ocr_number(groups[2]))
|
|
if amount and amount > 0:
|
|
entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
if confidence > max_confidence:
|
|
max_confidence = confidence
|
|
except (ValueError, InvalidOperation, IndexError):
|
|
continue
|
|
return (entries, max_confidence)
|
|
|
|
def _try_tva_reversed(self, text: str, lines: List[str]) -> Tuple[List[dict], float]:
|
|
"""Try Stepout-style reversed format: '5.00% TUA*B 2.00' (rate BEFORE TVA marker)"""
|
|
entries = []
|
|
confidence = 0.97 # Default confidence for reversed format
|
|
# Pattern: "5.00% TUA*B 2.00" - procent BEFORE TVA, amount same line or next
|
|
for i, line in enumerate(lines):
|
|
# Try pattern with amount on SAME line: "5.00% TUA*B 2.00"
|
|
match = re.search(
|
|
r'(\d{1,2})[.,]?\d{0,2}\s*%\s*T[UV][AR]\s*\*?\s*([A-D])?\s+([\d\s.,]+)',
|
|
line, re.IGNORECASE
|
|
)
|
|
if match:
|
|
try:
|
|
percent = int(match.group(1))
|
|
code = match.group(2).upper() if match.group(2) else 'A'
|
|
amount_str = match.group(3).strip()
|
|
amount = self._parse_decimal(amount_str)
|
|
if amount and amount > 0:
|
|
entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
continue # Check for more entries
|
|
except (ValueError, InvalidOperation, IndexError):
|
|
pass
|
|
|
|
# Fallback: amount on NEXT line
|
|
match = re.search(r'(\d{1,2})[.,]?\d{0,2}\s*%\s*T[UV][AR]\s*\*?\s*([A-D])?$', line, re.IGNORECASE)
|
|
if match:
|
|
try:
|
|
percent = int(match.group(1))
|
|
code = match.group(2).upper() if match.group(2) else 'A'
|
|
if i + 1 < len(lines):
|
|
amount_str = lines[i + 1].strip()
|
|
amount = self._parse_decimal(amount_str)
|
|
if amount and amount > 0:
|
|
entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
except (ValueError, InvalidOperation, IndexError):
|
|
continue
|
|
return (entries, confidence if entries else 0.0)
|
|
|
|
def _try_tva_multiline(self, text: str, lines: List[str]) -> Tuple[List[dict], float]:
|
|
"""Try multiline format: 'TOTAL TVA A - 19%' + amount on next line"""
|
|
entries = []
|
|
confidence = 0.95 # Default confidence for multiline format
|
|
# Pattern: "TOTAL TVA A - 19%" or "TOTAL TVA A 19%" on one line, amount on next
|
|
multiline_patterns = [
|
|
r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%',
|
|
r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%',
|
|
r'TOTAL\s+T[VU][AR]\s*([A-D])?\s*[-:]?\s*(\d{1,2})\s*%',
|
|
r'O?TAL\s+[IT]VA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%',
|
|
]
|
|
for i, line in enumerate(lines):
|
|
for pattern in multiline_patterns:
|
|
match = re.search(pattern, line, re.IGNORECASE)
|
|
if match:
|
|
try:
|
|
code = match.group(1).upper() if match.group(1) else 'A'
|
|
percent = int(match.group(2))
|
|
# Amount is on next line
|
|
if i + 1 < len(lines):
|
|
amount_str = lines[i + 1].strip()
|
|
amount = self._parse_decimal(amount_str)
|
|
if amount and amount > 0:
|
|
entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
return (entries, confidence)
|
|
except (ValueError, InvalidOperation, IndexError):
|
|
continue
|
|
return (entries, 0.0)
|
|
|
|
def _try_tva_table(self, text: str) -> Tuple[List[dict], float]:
|
|
"""Try OMV-style table format: 'A-21,00% 285,66 49,58'"""
|
|
entries = []
|
|
confidence = 0.96 # Default confidence for table format
|
|
# Pattern: "A-21,00% 285,66 49,58" (code-percent base_amount tva_amount)
|
|
table_pattern = r'([A-D])\s*[-:]\s*(\d{1,2})[.,\s]*\d{0,2}\s*%\s+([\d.,\s]+)\s+([\d.,\s]+)'
|
|
for match in re.finditer(table_pattern, text, re.IGNORECASE):
|
|
try:
|
|
code = match.group(1).upper()
|
|
percent = int(match.group(2))
|
|
# Group 4 is the TVA amount (last column in table)
|
|
tva_amount_str = self._clean_ocr_number(match.group(4))
|
|
tva_amount = self._parse_decimal(tva_amount_str)
|
|
if tva_amount and tva_amount > 0:
|
|
entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': tva_amount
|
|
})
|
|
except (ValueError, InvalidOperation, IndexError):
|
|
continue
|
|
|
|
# Fallback: "TOTAL TAXE: 55,22"
|
|
if not entries:
|
|
taxe_match = re.search(r'TOTAL\s+TAXE\s*:?\s*([\d.,\s]+)', text, re.IGNORECASE)
|
|
if taxe_match:
|
|
try:
|
|
amount_str = self._clean_ocr_number(taxe_match.group(1))
|
|
amount = self._parse_decimal(amount_str)
|
|
if amount and amount > 0:
|
|
entries.append({
|
|
'code': 'A',
|
|
'percent': 19, # Default rate
|
|
'amount': amount
|
|
})
|
|
confidence = 0.90 # Lower confidence for fallback
|
|
except (ValueError, InvalidOperation):
|
|
pass
|
|
return (entries, confidence if entries else 0.0)
|
|
|
|
def _try_tva_standard(self, text: str) -> Tuple[List[dict], float]:
|
|
"""Try standard TVA patterns as fallback"""
|
|
entries = []
|
|
matched_confidence = 0.0
|
|
standard_fmts = ['standard', 'bon', 'percent', 'coded', 'fallback', 'books']
|
|
for pattern, confidence, fmt in self.TVA_PATTERNS:
|
|
if fmt not in standard_fmts:
|
|
continue
|
|
match = re.search(pattern, text, re.IGNORECASE)
|
|
if match:
|
|
try:
|
|
groups = match.groups()
|
|
if len(groups) >= 2:
|
|
# Could be (percent, amount) or (code, percent, amount)
|
|
if groups[0] and groups[0].isalpha():
|
|
code = groups[0].upper()
|
|
percent = int(groups[1]) if len(groups) > 1 else 19
|
|
amount_str = groups[2] if len(groups) > 2 else None
|
|
else:
|
|
code = 'A'
|
|
percent = int(groups[0]) if groups[0] and groups[0].isdigit() else 19
|
|
amount_str = groups[1] if len(groups) > 1 else groups[0]
|
|
if amount_str:
|
|
amount = self._parse_decimal(self._clean_ocr_number(amount_str))
|
|
if amount and amount > 0:
|
|
entries.append({
|
|
'code': code,
|
|
'percent': percent,
|
|
'amount': amount
|
|
})
|
|
return (entries, confidence)
|
|
elif len(groups) == 1:
|
|
# Just amount
|
|
amount = self._parse_decimal(self._clean_ocr_number(groups[0]))
|
|
if amount and amount > 0:
|
|
entries.append({
|
|
'code': 'A',
|
|
'percent': 19,
|
|
'amount': amount
|
|
})
|
|
return (entries, confidence)
|
|
except (ValueError, InvalidOperation, IndexError):
|
|
continue
|
|
return (entries, matched_confidence)
|
|
|
|
def _clean_ocr_number(self, value: str) -> str:
|
|
"""Remove OCR spaces from numbers (e.g., '55, 22' -> '55,22')."""
|
|
if not value:
|
|
return ""
|
|
value = re.sub(r'\s*([.,])\s*', r'\1', value)
|
|
value = value.replace(' ', '')
|
|
return value
|
|
|
|
def extract_total(self, text: str) -> Tuple[Optional[Decimal], float]:
    """
    Extract total amount from receipt text.

    Two strategies, first hit wins:
    1. "SUMA TOTALA" layouts, single-line ("SUMA TOTALA: 78.00") or spread
       over several lines ("SUMA\\nTOTALA:\\n78.00", common on thermal receipts)
    2. The confidence-weighted single-line TOTAL_PATTERNS, in list order

    An amount is accepted only when it is positive and below MAX_PAYMENT.

    Args:
        text: Raw OCR text from receipt

    Returns:
        Tuple of (amount, confidence) or (None, 0.0)
    """
    upper_text = text.upper()
    rows = upper_text.split('\n')

    def plausible(value):
        # Positive and below the sanity cap used to filter OCR errors.
        return value and value > 0 and value < self.MAX_PAYMENT

    # ---------------------------------------------------------------------
    # STRATEGY 1: "SUMA TOTALA" (possibly split across lines)
    # ---------------------------------------------------------------------
    for idx, raw_row in enumerate(rows):
        current = raw_row.strip()

        # Row mentions SUMA: "SUM" followed by A, Ă or whitespace.
        if re.search(r'S[UU]M[AĂ\s]', current):
            # Scan this row and the three after it for "TOTALA".
            for pos in range(idx, min(idx + 4, len(rows))):
                probe = rows[pos].strip()

                # "TOTALA:" / "TOTALA -" with the amount on the same row
                inline = re.search(r'T[O0]TALA\s*[:\-]\s*([\d\s.,]+)', probe)
                if inline:
                    amount = self._parse_decimal(self._clean_ocr_number(inline.group(1)))
                    if plausible(amount):
                        return (amount, 0.98)

                # "TOTALA" closing its row, amount on the row below
                if re.search(r'T[O0]TALA\s*[:\-]?\s*$', probe) and pos + 1 < len(rows):
                    amount = self._parse_decimal(rows[pos + 1].strip())
                    if plausible(amount):
                        return (amount, 0.97)

        # "SUMA TOTALA: <amount>" on a single row
        single = re.search(r'S[UU]M[AĂ]\s+T[O0]TALA\s*[:\-]\s*([\d\s.,]+)', current)
        if single:
            amount = self._parse_decimal(self._clean_ocr_number(single.group(1)))
            if plausible(amount):
                return (amount, 0.98)

        # "SUMA TOTALA" closing its row, amount on the row below
        if re.search(r'S[UU]M[AĂ]\s+T[O0]TALA\s*[:\-]?\s*$', current) and idx + 1 < len(rows):
            amount = self._parse_decimal(rows[idx + 1].strip())
            if plausible(amount):
                return (amount, 0.96)

    # ---------------------------------------------------------------------
    # STRATEGY 2: Standard single-line patterns
    # ---------------------------------------------------------------------
    for regex, score in self.TOTAL_PATTERNS:
        hit = re.search(regex, upper_text)
        if not hit:
            continue
        amount = self._parse_decimal(hit.group(1))
        if plausible(amount):
            return (amount, score)

    return (None, 0.0)
|
|
|
|
def extract_date(self, text: str) -> Tuple[Optional[date], float]:
    """
    Extract receipt date from text.

    DATE_PATTERNS are tried first, each match being handed to _parse_date.
    If none produces a date, DATE_PATTERNS_OCR_SPACES (separators followed
    by OCR-introduced spaces) are tried; their three numeric groups are
    assembled directly, in 'ymd' or 'dmy' order, and range-checked.

    Args:
        text: Raw OCR text from receipt

    Returns:
        Tuple of (date, confidence) or (None, 0.0)
    """
    upper_text = text.upper()

    # Standard patterns
    for regex, score in self.DATE_PATTERNS:
        hit = re.search(regex, upper_text)
        if not hit:
            continue
        parsed = self._parse_date(hit.group(1))
        if parsed:
            return (parsed, score)

    # OCR-corrupted patterns with spaces
    for regex, score, order in self.DATE_PATTERNS_OCR_SPACES:
        hit = re.search(regex, upper_text)
        if not hit:
            continue
        try:
            first, second, third = int(hit.group(1)), int(hit.group(2)), int(hit.group(3))
            if order == 'ymd':
                year, month, day = first, second, third
            else:  # dmy
                day, month, year = first, second, third

            in_range = 1 <= day <= 31 and 1 <= month <= 12 and 2000 <= year <= 2100
            if in_range:
                # date() still rejects impossible days such as 31 February.
                return (date(year, month, day), score)
        except (ValueError, TypeError):
            continue

    return (None, 0.0)
|
|
|
|
def extract_receipt_number(self, text: str) -> Tuple[Optional[str], float]:
    """
    Extract receipt number from text.

    NUMBER_PATTERNS are tried in list order against the upper-cased text;
    the first captured number with at least 3 characters is returned.

    Args:
        text: Raw OCR text from receipt

    Returns:
        Tuple of (number, confidence) or (None, 0.0)
    """
    upper_text = text.upper()

    for regex, score in self.NUMBER_PATTERNS:
        hit = re.search(regex, upper_text)
        if hit is None:
            continue
        candidate = hit.group(1).strip()
        # Very short captures are rejected; the next pattern gets a chance.
        if len(candidate) >= 3:
            return (candidate, score)

    return (None, 0.0)
|
|
|
|
def extract_payment_methods(self, text: str) -> List[dict]:
    """
    Extract payment methods (CARD/NUMERAR) from receipt.

    Strategies run in order; the first one that produces entries wins:

    0. Multiline: a line that is exactly "CARD" or "NUMERAR", followed by
       a line holding only an amount (common in thermal receipts).
    1. ``self.PAYMENT_PATTERNS``: ``(pattern, method, confidence)`` triples
       whose capture group 1 is the amount.
    2. Keyword-only: payment keywords are present without an amount, so
       the amount is inferred from ``self.extract_total()`` and the REST
       (change) amount, if any.
    3. Fallback: the text mentions FISCAL and has a total but no payment
       keyword; CARD is assumed with low confidence.

    Args:
        text: Raw OCR text from receipt

    Returns:
        List of dicts: [{'method': 'CARD'/'NUMERAR', 'amount': Decimal, 'confidence': float}]
        Empty when nothing matched, or when steps 2/3 are needed but no
        total could be extracted.
    """
    text_upper = text.upper()
    lines = text_upper.split('\n')
    methods = []
    seen_entries = set()

    def amount_ok(amount):
        """True for a non-zero amount strictly between 0 and MAX_PAYMENT."""
        return bool(amount) and 0 < amount < self.MAX_PAYMENT

    def add_entry(method, amount, confidence):
        """Record a payment unless the same (method, amount) is already present."""
        entry_key = (method, amount)
        if entry_key not in seen_entries:
            seen_entries.add(entry_key)
            methods.append({
                'method': method,
                'amount': amount,
                'confidence': confidence
            })

    # =====================================================================
    # STEP 0: Try MULTILINE patterns first (thermal receipts)
    # Format: keyword alone on one line, amount on the next line
    # =====================================================================
    for i, line in enumerate(lines[:-1]):
        keyword = line.strip()
        # Only a standalone keyword counts (not MASTERCARD, "CARD 12.00", ...).
        if keyword not in ('CARD', 'NUMERAR'):
            continue
        next_line = lines[i + 1].strip()
        # The following line must be a bare amount, not another keyword.
        if not re.match(r'^[\d\s.,]+$', next_line):
            continue
        amount = self._parse_decimal(next_line)
        if amount_ok(amount):
            add_entry(keyword, amount, 0.95)

    if methods:
        return methods

    # =====================================================================
    # STEP 1: Try pattern-based extraction with explicit amounts
    # =====================================================================
    for pattern, method, confidence in self.PAYMENT_PATTERNS:
        for match in re.finditer(pattern, text_upper):
            try:
                amount = self._parse_decimal(match.group(1))
                if amount_ok(amount):
                    add_entry(method, amount, confidence)
            except (ValueError, InvalidOperation):
                continue

    if methods:
        return methods

    # =====================================================================
    # STEP 2: Keyword-only detection; amounts inferred from total and REST
    # =====================================================================
    total_amount, _ = self.extract_total(text)
    if not total_amount:
        return []

    has_card = any(kw in text_upper for kw in ['CARD', 'CARTE CREDIT', 'VISA', 'MASTERCARD', 'DEBIT', 'CREDIT', 'CONTACTLESS'])
    has_numerar = any(kw in text_upper for kw in ['NUMERAR', 'CASH'])

    # Find the REST (change) amount. REST must not be glued to other
    # letters: a plain substring test would stop at a header such as
    # "RESTAURANT ..." and never reach the real "REST 5.00" line.
    rest_word = r'(?<![^\W\d_])REST(?![^\W\d_])'
    rest_amount = Decimal('0')
    for i, line in enumerate(lines):
        if not re.search(rest_word, line):
            continue
        # REST on same line: "REST 0.00" or "REST: 0.00"
        match = re.search(rest_word + r'\s*:?\s*([\d.,]+)', line)
        if match:
            rest_amount = self._parse_decimal(match.group(1)) or Decimal('0')
        elif i + 1 < len(lines):
            # REST keyword alone; amount expected on the next line
            rest_amount = self._parse_decimal(lines[i + 1].strip()) or Decimal('0')
        break

    if has_card:
        card_amount = total_amount - rest_amount
        if card_amount > 0:
            methods.append({
                'method': 'CARD',
                'amount': card_amount,
                'confidence': 0.90
            })

    if has_numerar:
        if has_card and rest_amount > 0:
            # Mixed payment: NUMERAR is the change given back
            methods.append({
                'method': 'NUMERAR',
                'amount': rest_amount,
                'confidence': 0.85
            })
        elif not has_card:
            # Cash only
            methods.append({
                'method': 'NUMERAR',
                'amount': total_amount,
                'confidence': 0.90
            })

    # =====================================================================
    # STEP 3: Fallback for fiscal receipts without explicit payment
    # =====================================================================
    if not methods and total_amount > 0:
        # 'FISCAL' also covers 'BON FISCAL', so one substring test suffices.
        if 'FISCAL' in text_upper:
            # Default to CARD for business purchases (most common)
            methods.append({
                'method': 'CARD',
                'amount': total_amount,
                'confidence': 0.70
            })

    return methods
|
|
|
|
def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
    """
    Extract client CUI from B2B receipts.

    A CUI is only looked for when at least one ``self.CLIENT_MARKERS``
    regex matches somewhere in the text. Each ``(pattern, confidence)``
    pair in ``self.CLIENT_CUI_PATTERNS`` is then tried in order; capture
    group 1 is the raw CUI.

    Args:
        text: Raw OCR text from receipt

    Returns:
        Tuple of (cui, confidence) where cui is digits only (6-10 of them),
        or (None, 0.0) when there is no client section or no valid CUI.
    """
    text_upper = text.upper()

    # First check if there's a CLIENT section
    has_client_section = any(
        re.search(marker, text_upper, re.IGNORECASE)
        for marker in self.CLIENT_MARKERS
    )
    if not has_client_section:
        return (None, 0.0)

    for pattern, confidence in self.CLIENT_CUI_PATTERNS:
        match = re.search(pattern, text_upper, re.IGNORECASE | re.MULTILINE)
        if not match:
            continue
        cui = match.group(1)
        # Drop the country prefix before keeping digits. OCR may read "RO"
        # as "R0" (other extractors in this class use R[O0]? for the same
        # reason); without this the misread "0" would survive the
        # digits-only filter as a bogus leading zero.
        cui = re.sub(r'^\s*R[O0]', '', cui)
        cui_digits = re.sub(r'[^0-9]', '', cui)
        if 6 <= len(cui_digits) <= 10:
            return (cui_digits, confidence)

    return (None, 0.0)
|
|
|
|
def extract_client_name(self, text: str) -> Tuple[Optional[str], float]:
    """
    Extract client/buyer company name from B2B receipts.

    The first line matching any ``self.CLIENT_MARKERS`` regex anchors the
    search. Three strategies are then tried in order:

    1. Text after ":" on the marker line (0.95 with a company indicator,
       0.80 otherwise).
    2. The line right after the marker (0.90 with a company indicator,
       0.75 otherwise).
    3. Any of the next four lines that carries a company indicator (0.85).

    Values that look like a CUI are never returned as a name.

    Args:
        text: Raw OCR text from receipt

    Returns:
        Tuple of (client_name, confidence) or (None, 0.0)
    """
    lines = text.split('\n')
    cui_line_re = r'C\.?\s*[UI]\.?\s*[IF]\.?'

    def looks_like_cui(value):
        """True for an R/RO/R0-prefixed digit run, ignoring inner whitespace."""
        # Whitespace is removed first so "RO 12345678" is recognised too;
        # matching the raw value let such CUIs through as a client name.
        compact = re.sub(r'\s+', '', value.upper())
        return re.match(r'^R[O0]?\d{6,10}$', compact) is not None

    def has_company_indicator(value_upper):
        """True when any COMPANY_INDICATORS regex matches the upper-cased value."""
        return any(re.search(ind, value_upper) for ind in self.COMPANY_INDICATORS)

    # First check if there's a CLIENT section
    client_section_idx = None
    for i, line in enumerate(lines):
        line_upper = line.upper().strip()
        if any(re.search(marker, line_upper, re.IGNORECASE) for marker in self.CLIENT_MARKERS):
            client_section_idx = i
            break

    if client_section_idx is None:
        return (None, 0.0)

    # Strategy 1: Check if name is on same line after ":"
    marker_line = lines[client_section_idx].strip()
    if ':' in marker_line:
        name_part = marker_line.split(':', 1)[1].strip()
        # A CUI after the colon is not a name - fall through to Strategy 2.
        if len(name_part) >= 3 and not looks_like_cui(name_part):
            if has_company_indicator(name_part.upper()):
                return (self._clean_company_name(name_part), 0.95)
            if len(name_part) >= 5 and not name_part.isdigit():
                return (self._clean_company_name(name_part), 0.80)

    # Strategy 2: Check next line for company name
    if client_section_idx + 1 < len(lines):
        next_line = lines[client_section_idx + 1].strip()
        next_upper = next_line.upper()

        # Skip if it's a CUI/CIF line or looks like CUI
        if not re.search(cui_line_re, next_upper) and not looks_like_cui(next_upper):
            if has_company_indicator(next_upper):
                return (self._clean_company_name(next_line), 0.90)
            if len(next_line) >= 5 and not next_line.isdigit():
                # Check it's not CUI/CIF/COD keywords
                if not any(kw in next_upper for kw in ['CUI', 'CIF', 'COD', 'FISCAL']):
                    return (self._clean_company_name(next_line), 0.75)

    # Strategy 3: Look for any line with company indicators in CLIENT section region
    for raw_line in lines[client_section_idx + 1:client_section_idx + 5]:
        candidate = raw_line.strip()
        candidate_upper = candidate.upper()

        # Skip CUI/CIF lines
        if re.search(cui_line_re, candidate_upper):
            continue
        if looks_like_cui(candidate_upper):
            continue

        if has_company_indicator(candidate_upper):
            return (self._clean_company_name(candidate), 0.85)

    return (None, 0.0)
|
|
|
|
@staticmethod
def _clean_company_name(name: str) -> str:
    """
    Tidy a company name before it is stored.

    Whitespace runs become single spaces, surrounding whitespace is
    dropped, and trailing ``,``, ``;`` and ``:`` characters are removed.
    Periods are kept so forms such as "S.R.L." survive.

    Args:
        name: Raw company name; may be empty or None

    Returns:
        The cleaned name, or "" for empty input.
    """
    if not name:
        return ""
    # split()/join collapses every whitespace run and trims both ends.
    collapsed = " ".join(name.split())
    # Trailing separators only; periods are intentionally not in the set.
    return collapsed.rstrip(",;:").strip()
|
|
|
|
# -------------------------------------------------------------------------
|
|
# Validation hints - override to customize validation behavior
|
|
# -------------------------------------------------------------------------
|
|
|
|
def get_validation_hints(self) -> Dict[str, Any]:
    """
    Provide store-specific validation hints.

    The base implementation supplies none; subclasses override this to
    customise validation.

    Returns:
        Dict of hints (empty here). Keys commonly used:
        - has_multi_rate_tva: bool - Store uses multiple TVA rates
        - card_equals_total: bool - CARD payment equals total
        - has_client_cui: bool - Receipt includes client CUI
        - has_efactura: bool - Store uses e-factura format
        - is_non_vat_payer: bool - Store is not a VAT payer
    """
    hints: Dict[str, Any] = {}
    return hints
|
|
|
|
# -------------------------------------------------------------------------
|
|
# Helper methods - available to all subclasses
|
|
# -------------------------------------------------------------------------
|
|
|
|
@staticmethod
def _normalize_number(text: str) -> str:
    """
    Rewrite a number string into a form Decimal() can read.

    Spaces are removed and whichever of "," or "." appears last is taken
    as the decimal separator; the other one is treated as a thousands
    mark and dropped. Example (Romanian format): "1.234,56" -> "1234.56".

    Args:
        text: Number as printed on the receipt; may be empty or None

    Returns:
        The normalized string, or "0" for empty input.
    """
    if not text:
        return "0"

    compact = text.replace(" ", "")
    comma_at = compact.rfind(",")
    dot_at = compact.rfind(".")

    if comma_at > dot_at:
        # Comma is the decimal separator; dots are thousands marks.
        return compact.replace(".", "").replace(",", ".")

    # Dot comes last, or neither separator is present: any commas left
    # can only be thousands marks.
    return compact.replace(",", "")
|
|
|
|
@staticmethod
def _parse_decimal(text: str) -> Optional[Decimal]:
    """
    Parse a string to Decimal, handling various formats.

    The text is first passed through ``_normalize_number``.

    Args:
        text: Number as printed on the receipt

    Returns:
        The parsed finite Decimal, or None when the text cannot be read
        as a finite number.
    """
    try:
        normalized = BaseStoreProfile._normalize_number(text)
        value = Decimal(normalized)
    except (InvalidOperation, ValueError, TypeError, AttributeError):
        # AttributeError: non-string input without .replace()/.rfind().
        return None

    # Decimal() also accepts "NaN", "sNaN" and "Infinity". Such values are
    # truthy and make later comparisons raise InvalidOperation, so treat
    # them as unparseable.
    if not value.is_finite():
        return None
    return value
|
|
|
|
@staticmethod
def _parse_date(text: str) -> Optional[date]:
    """
    Turn a date string into a ``date``.

    Accepted layouts: DD-MM-YYYY, DD/MM/YYYY, DD.MM.YYYY and YYYY-MM-DD.
    A four-character first component selects year-first order. Years
    outside 2000-2100 are rejected.

    Args:
        text: Date string; may be empty or None

    Returns:
        The parsed date, or None when the text is not a valid date.
    """
    if not text:
        return None

    # Unify the three accepted separators, then split into components.
    pieces = text.replace('/', '-').replace('.', '-').split('-')
    if len(pieces) != 3:
        return None

    try:
        first, month, last = (int(piece) for piece in pieces)
        if len(pieces[0]) == 4:
            year, day = first, last  # YYYY-MM-DD
        else:
            day, year = first, last  # DD-MM-YYYY

        in_range = (
            1 <= day <= 31
            and 1 <= month <= 12
            and 2000 <= year <= 2100
        )
        if not in_range:
            return None
        # date() raises ValueError for impossible days (e.g. 31 February).
        return date(year, month, day)
    except (ValueError, TypeError, IndexError):
        return None
|
|
|
|
@staticmethod
def _clean_text(text: str) -> str:
    """
    Clean OCR text for pattern matching.

    Non-whitespace control characters are removed, every whitespace run
    (including tabs and line breaks) becomes a single space, and the
    result is trimmed.

    Args:
        text: Raw OCR text; may be empty or None

    Returns:
        The cleaned single-line text, or "" for empty input.
    """
    if not text:
        return ""
    # Remove control characters BEFORE collapsing whitespace: doing it
    # afterwards left a double space wherever a control character sat
    # between two spaces ("A \x00 B" -> "A  B"). Whitespace controls
    # (\t, \n, \x0b, \x0c, \r, \x1c-\x1f) are excluded here so they still
    # turn into a space below instead of gluing neighbours together.
    text = re.sub(r'[\x00-\x08\x0e-\x1b\x7f]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
|
|
|
|
# -------------------------------------------------------------------------
|
|
# Magic methods
|
|
# -------------------------------------------------------------------------
|
|
|
|
def __repr__(self) -> str:
    """Return a debug representation of the form ``<ClassName CUI=[...]>``."""
    class_name = self.__class__.__name__
    return "<{} CUI={}>".format(class_name, self.CUI_LIST)
|
|
|
|
def __str__(self) -> str:
    """Return the store name followed by its comma-separated CUI codes in parentheses."""
    store_name = self.STORE_NAME
    cui_codes = ", ".join(self.CUI_LIST)
    return "{} ({})".format(store_name, cui_codes)
|