feat(ocr): Add docTR OCR engine with metrics infrastructure
Add docTR as primary OCR engine with 2-tier sequential processing, OCR metrics tracking, and simplified engine selection.

Features:
- docTR OCR engine with light+medium preprocessing tiers
- doctr_plus mode with early exit optimization (~65% fast path)
- OCR metrics dashboard with per-engine statistics
- User OCR preference persistence
- Parallel worker pool for OCR processing
- Cross-validation for extraction quality

Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -6,6 +6,8 @@ from decimal import Decimal, InvalidOperation
|
||||
from typing import Optional, Tuple, List
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine
|
||||
|
||||
|
||||
@dataclass
|
||||
class ExtractionResult:
|
||||
@@ -24,6 +26,7 @@ class ExtractionResult:
|
||||
address: Optional[str] = None
|
||||
items_count: Optional[int] = None
|
||||
payment_methods: List[dict] = field(default_factory=list) # [{"method":"CARD","amount":Decimal}]
|
||||
suggested_payment_mode: Optional[str] = None # 'banca' if CARD detected, 'numerar' if cash only
|
||||
|
||||
# Client data (for B2B receipts - buyer information)
|
||||
client_name: Optional[str] = None
|
||||
@@ -125,8 +128,10 @@ class ReceiptExtractor:
|
||||
(r'C3POS[-A-Z0-9]*[N:](\d{6,7})', 0.98), # CT2N1360760 format
|
||||
(r'C3POS.*?(\d{6,7})\b', 0.95), # Any C3POS followed by 6-7 digit number
|
||||
(r'CT2[N:]\s*(\d{6,})', 0.95), # CT2N prefix
|
||||
# BF (Bon Fiscal) number
|
||||
(r'BF\s*:?\s*(\d+)', 0.93),
|
||||
# BF (Bon Fiscal) number - high priority
|
||||
# Format: "Z:0864 BF:0018" - extract only the number after BF:
|
||||
(r'BF\s*:\s*(\d{4,})', 0.96), # BF: with colon (most specific)
|
||||
(r'BF\s+(\d{4,})', 0.93), # BF followed by space and number
|
||||
# NIVS format
|
||||
(r'NIVS\s*:?\s*(\d+)', 0.95),
|
||||
# Standard NR BON formats
|
||||
@@ -151,28 +156,45 @@ class ReceiptExtractor:
|
||||
# OCR errors: R0 instead of RO, C1F instead of CIF
|
||||
CUI_PATTERNS = [
|
||||
# CIF at start of line (definitely vendor) - tolerant to OCR errors
|
||||
(r'^CIF\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
|
||||
(r'^C[I1]F\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95), # C1F OCR error
|
||||
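# NOTE: Capture the full CUI including the RO prefix. The two forms are NOT equivalent:
# (R[O0]?\d{6,10}) also accepts a lone "R" directly before the digits, whereas
# ((?:R[O0])?\d{6,10}) accepts only a complete RO/R0 prefix or no prefix at all.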
|
||||
(r'^CIF\s*:?\s*(R[O0]?\d{6,10})', 0.98),
|
||||
(r'^CIF\s*:?\s*(\d{6,10})', 0.97), # Without RO prefix
|
||||
(r'^C[I1]F\s*:?\s*(R[O0]?\d{6,10})', 0.95), # C1F OCR error
|
||||
(r'^C[I1]F\s*:?\s*(\d{6,10})', 0.94), # C1F without RO
|
||||
# CIF not preceded by CLIENT (negative lookbehind)
|
||||
(r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
|
||||
(r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(R[O0]?\d{6,10})', 0.95),
|
||||
(r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(\d{6,10})', 0.94),
|
||||
# Standalone CIF: format with OCR tolerance
|
||||
(r'\bC[I1]F\s*:?\s*(?:R[O0])?(\d{6,10})\b', 0.90),
|
||||
(r'\bC[I1]F\s*:?\s*(R[O0]?\d{6,10})\b', 0.90),
|
||||
(r'\bC[I1]F\s*:?\s*(\d{6,10})\b', 0.89),
|
||||
# COD FISCAL (vendor)
|
||||
(r'COD\s+FISCAL\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
|
||||
(r'COD\s+FISCAL\s*:?\s*(R[O0]?\d{6,10})', 0.90),
|
||||
(r'COD\s+FISCAL\s*:?\s*(\d{6,10})', 0.89),
|
||||
# C. I. F. format with SPACES (OCR artifact) - "C. I. F. : R011201891"
|
||||
(r'C\.\s*I\.\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.92),
|
||||
# Also handles double colon from OMV/Petrom: "C. I.F.: : RO11201891"
|
||||
(r'C\.\s*I\.\s*F\.?\s*[:\s]+(R[O0]?\d{6,10})', 0.92),
|
||||
(r'C\.\s*I\.\s*F\.?\s*[:\s]+(\d{6,10})', 0.91),
|
||||
# C.I.F. format (with dots, no spaces)
|
||||
(r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.88),
|
||||
(r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.88),
|
||||
(r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(\d{6,10})', 0.87),
|
||||
# CUI format (less specific, use with caution)
|
||||
(r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.85),
|
||||
(r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.85),
|
||||
(r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(\d{6,10})', 0.84),
|
||||
# Lidl format: "Cod Identificare fiscala: RO..." (OCR corrupts to "Ced Identificanfliscalar")
|
||||
# Matches: "Identificare fiscala", "Identificanfliscalar", "Identificoan/Fljscales"
|
||||
(r'[IC](?:od|ed)\s*Identific[a-z/]*\s*(R[O0]\d{6,10})', 0.90),
|
||||
# Generic: anything with "fiscal" followed by RO + digits
|
||||
(r'fiscal[a-z]*\s*:?\s*(R[O0]\d{6,10})', 0.85),
|
||||
]
|
||||
|
||||
# Pattern for CIF NUMBER appearing BEFORE "C.I.F." label (reversed format)
|
||||
# Common in some receipts: "R011201891\nC. I. F." - number on line before label
|
||||
# Common in some receipts: "RO11201891\nC. I. F." - number on line before label
|
||||
# IMPORTANT: Capture the full CUI including RO prefix
|
||||
CUI_REVERSED_PATTERNS = [
|
||||
# RO + 8-10 digits on line immediately before C.I.F./CIF label
|
||||
(r'(?:R[O0])(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.98),
|
||||
# Just digits before C.I.F. label
|
||||
# RO/R0 + 6-10 digits on line immediately before C.I.F./CIF label
|
||||
# Capture the FULL CUI including RO prefix
|
||||
(r'(R[O0]\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.98),
|
||||
# Just digits before C.I.F. label (neplatitor TVA - no RO prefix)
|
||||
(r'(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.95),
|
||||
]
|
||||
|
||||
@@ -185,38 +207,67 @@ class ReceiptExtractor:
|
||||
(r'(?:^|\s)BF\s*:\s*(\d{4})', 0.85),
|
||||
]
|
||||
|
||||
# TVA (VAT) patterns - OCR may produce TUA, TVR, etc.
|
||||
# TVA (VAT) patterns - OCR may produce TUA, TVR, IVA, etc.
|
||||
# All patterns are case-insensitive (re.IGNORECASE applied in extraction)
|
||||
TVA_PATTERNS = [
|
||||
# TOTAL TVA BON format (OCR tolerant: TUA, TVR)
|
||||
(r'TOTAL\s+T[VU][AR]\s+BON\s*:?\s*([\d\s.,]+)', 0.98),
|
||||
(r'T[O0]TAL\s+T[VU][AR]\s*:?\s*([\d\s.,]+)', 0.95),
|
||||
# TOTAL TVA BON format (OCR tolerant: TUA, TVR, IVA)
|
||||
(r'TOTAL\s+(?:T[VU][AR]|IVA)\s+BON\s*:?\s*([\d\s.,]+)', 0.98),
|
||||
(r'T[O0]TAL\s+(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.95),
|
||||
# IVA variant (Spanish/Portuguese influence, some receipts)
|
||||
(r'TOTAL\s+IVA\s*:?\s*([\d\s.,]+)', 0.95),
|
||||
(r'IVA\s+[A-D]?\s*[-:]?\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.93),
|
||||
# TVA with percentage (OCR tolerant)
|
||||
(r'T[VU][AR]\s+(?:A\s*[-:]?\s*)?(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.95),
|
||||
(r'T[VU][AR]\s+[A-Z]\s*[-:]\s*(\d{1,2})\s*%\s*([\d\s.,]+)', 0.93),
|
||||
# Simple TVA pattern
|
||||
(r'T[VU][AR]\s*:?\s*([\d\s.,]+)', 0.85),
|
||||
# 5% TVA rate (books, newspapers - TVA C)
|
||||
(r'T[VU][AR]\s*[C5]\s*[-:]\s*5\s*%\s*:?\s*([\d\s.,]+)', 0.93),
|
||||
(r'(?:T[VU][AR]|IVA)\s+5\s*%\s*:?\s*([\d\s.,]+)', 0.92),
|
||||
# Garbled OCR: T0TAL, TVAI, TUAI, etc.
|
||||
(r'T[O0]T[AE]L\s+(?:T[VUAI]+[AR]?|IVA)\s*:?\s*([\d\s.,]+)', 0.88),
|
||||
# OCR corruption: "TA F 194" (TVA with V→F or space), "T A 19%"
|
||||
# Handles: "TOTAL TA F 194" where TVA became "TA F"
|
||||
(r'TOTAL\s+TA\s*[F\s]?\s*\d*\s*:?\s*([\d\s.,]+)', 0.85),
|
||||
(r'TA\s+[FA-Z]?\s*\d{1,2}\s*%?\s*:?\s*([\d\s.,]+)', 0.82),
|
||||
# "TUA" with random letter after (OCR noise): "TUA F", "TUA I"
|
||||
(r'T[VU]A\s+[A-Z]?\s*\d*\s*:?\s*([\d\s.,]+)', 0.83),
|
||||
# Simple TVA/IVA pattern
|
||||
(r'(?:T[VU][AR]|IVA)\s*:?\s*([\d\s.,]+)', 0.85),
|
||||
# Standalone percentage line near TVA
|
||||
(r'(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.75),
|
||||
]
|
||||
|
||||
# Payment method patterns - appears after TOTAL LEI, before TOTAL TVA
|
||||
# Format: "CARD: 50.00" or "NUMERAR 100.00" or "PLATA CARD: 50.00"
|
||||
# OMV/Petrom uses "CARTE CREDIT" or "CARTE CREDIT 318, 16"
|
||||
PAYMENT_METHOD_PATTERNS = [
|
||||
# CARTE CREDIT with amount on same line (OMV/Petrom receipts)
|
||||
# Handles: "CARTE CREDIT 318, 16" with OCR spaces in number
|
||||
(r'CARTE\s+CREDIT\s*:?\s*([\d\s.,]+)', 'CARD', 0.98),
|
||||
# CARTE CREDIT with amount on next line (OCR may split lines)
|
||||
# Handles: "CARTE CREDIT\n318, 16"
|
||||
(r'CARTE\s+CREDIT\s*:?\s*\n\s*([\d\s.,]+)', 'CARD', 0.97),
|
||||
# CARD with amount (high confidence)
|
||||
(r'(?:PLATA\s+)?CARD\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
|
||||
# Also handles OCR artifacts like "CARD F 100.00" where F is noise
|
||||
(r'(?:PLATA\s+)?CARD\s*[:\sA-Z]?\s*([\d\s.,]+)', 'CARD', 0.95),
|
||||
# NUMERAR (cash) with amount
|
||||
(r'NUMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.95),
|
||||
# CASH alternative spelling
|
||||
(r'CASH\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.90),
|
||||
# Truncation recovery patterns (for OCR left-margin truncation issues)
|
||||
# IMPROVED: More restrictive - require max 6 digits before decimals
|
||||
# to avoid matching CUI numbers like RO10562600 → RD10562600
|
||||
# "RD" = truncated "CARD" (only 2 chars visible)
|
||||
(r'\bRD\s*:?\s*([\d\s.,]+)', 'CARD', 0.70),
|
||||
(r'(?:^|\n|\s)RD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.70),
|
||||
# "ARD" = truncated "CARD" (3 chars visible)
|
||||
(r'\bARD\s*:?\s*([\d\s.,]+)', 'CARD', 0.75),
|
||||
(r'(?:^|\n|\s)ARD\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'CARD', 0.75),
|
||||
# "MERAR" = truncated "NUMERAR"
|
||||
(r'\bMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.70),
|
||||
(r'(?:^|\n|\s)MERAR\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'NUMERAR', 0.70),
|
||||
]
|
||||
|
||||
# Maximum reasonable payment amount for a receipt (100,000 LEI)
|
||||
# Amounts larger than this are likely OCR errors (e.g., CUI parsed as amount)
|
||||
MAX_REASONABLE_PAYMENT = Decimal('100000')
|
||||
|
||||
# Items count patterns - OCR may produce OZ instead of POZ, etc.
|
||||
# Number may be on separate line before or after the label
|
||||
# IMPORTANT: Must be specific to avoid matching product quantities like "50BUC"
|
||||
@@ -250,6 +301,9 @@ class ReceiptExtractor:
|
||||
# Reversed format: CIF/CUI before CLIENT
|
||||
r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:', # CIF CLIENT:
|
||||
r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:', # CUI CLIENT:
|
||||
# Corrupted CLIENT after CIF: "CIF a IENT:", "CIF LIENT:", "CIF CL IENT:"
|
||||
r'C[I1]F\s+[A-Z\s]{0,6}IENT\s*:', # "CIF a IENT:", "CIF CL IENT:", "CIF LIENT:"
|
||||
r'C[I1]F\s+LIENT\s*:', # "CIF LIENT:" (missing C from CLIENT)
|
||||
# CLIENT followed by C.U.I./C.I.F. (all variations with/without spaces and dots)
|
||||
# Handles: CLIENT C.U.I/C.I.F., CLIENT C. U. I./ C. I.F., CLIENT CUI/CIF
|
||||
r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/?\s*C?\.?\s*[I1]?\.?\s*F?\.?\s*:',
|
||||
@@ -267,6 +321,16 @@ class ReceiptExtractor:
|
||||
# Client CUI patterns (explicitly after CLIENT marker)
|
||||
# OCR errors: R0 instead of RO, C1F instead of CIF, 1 instead of I
|
||||
CLIENT_CUI_PATTERNS = [
|
||||
# NEW: CUI on line BEFORE CLIENT marker (docTR/OCR may output value before label)
|
||||
# Pattern: "RO1879855\nCLIENT C.U.I./C.I.F.:" - CUI on line before CLIENT label
|
||||
(r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*U\.?\s*[I1]\.?', 0.99),
|
||||
(r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*[I1]\.?\s*F\.?', 0.99),
|
||||
# Same but with optional colon after RO number
|
||||
(r'(R[O0]\d{6,10})\s*:?\s*\n\s*CLIENT', 0.98),
|
||||
# "CIF I CLIENT:" or "CIF IDENTIFICARE CLIENT:" format (OCR may insert extra chars)
|
||||
# Common OCR artifact: "CIF I CLIENT: R01879855"
|
||||
(r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]\d{6,10})', 0.98),
|
||||
(r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.97),
|
||||
# CIF CLIENT: R01879856 (reversed format - CIF/CUI before CLIENT)
|
||||
(r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
|
||||
(r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
|
||||
@@ -276,19 +340,34 @@ class ReceiptExtractor:
|
||||
# Most flexible pattern for slash variants
|
||||
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.97),
|
||||
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.97),
|
||||
# OCR artifact: doubled letters like "C.U U. I." or "C.I I.F." (docTR sometimes duplicates)
|
||||
(r'CLIENT\s+C\.?\s*U\.?\s*U?\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
|
||||
(r'CLIENT\s+C\.?\s*U\.?\s*U?\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
|
||||
# CLIENT C.U.I. or CLIENT CUI or CLIENT CIF (without slash)
|
||||
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
|
||||
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
|
||||
(r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
|
||||
(r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
|
||||
# Corrupted CLIENT after CIF: "CIF a IENT:", "CIF LIENT:", "CIF L IENT:", "CIF C IENT:"
|
||||
# OCR often corrupts "CLIENT" when it appears after "CIF"
|
||||
(r'CIF\s+[a-zA-Z\s]{2,8}IENT\s*:?\s*(R[O0]?\d{6,10})', 0.93), # "CIF a IENT:", "CIF CL IENT:"
|
||||
(r'CIF\s+[a-zA-Z\s]{2,8}IENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
|
||||
(r'CIF\s+LIENT\s*:?\s*(R[O0]?\d{6,10})', 0.92), # "CIF LIENT:" (missing C)
|
||||
(r'CIF\s+LIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.92),
|
||||
# CUMPARATOR variants
|
||||
(r'CUMPARATOR\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
|
||||
(r'CUMPARATOR\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
|
||||
# CUMPARATOR with CUI/CIF on next line: "CUMPARATOR: NAME\nCIF: 12345678"
|
||||
(r'CUMPARATOR\s*:.*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93),
|
||||
(r'CUMPARATOR\s*:.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.93), # F or T (OCR error)
|
||||
# CUMPARATOR with CUI/CIF two lines down: "CUMPARATOR: NAME\nADDRESS\nCIF: 12345678"
|
||||
(r'CUMPARATOR\s*:.*\n.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
|
||||
# CUI/CIF on line immediately after CLIENT marker
|
||||
(r'CLIENT\s*:\s*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
|
||||
(r'CLIENT\s*:\s*\n\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
|
||||
# CUI after client name: "CLIENT: COMPANY SRL\nCUI: 12345678"
|
||||
(r'CLIENT\s*:.*\n.*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
|
||||
(r'CLIENT\s*:\s*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95), # F or T (OCR error)
|
||||
# CUI/CIF after client name: "CLIENT: COMPANY SRL\nCUI: 12345678"
|
||||
(r'CLIENT\s*:.*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
|
||||
(r'CLIENT\s*:.*\n\s*C\.?\s*[I1]\.?\s*[FT]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90), # CIF/CIT after name
|
||||
]
|
||||
|
||||
# Vendor name indicators (lines containing these are likely vendor names)
|
||||
@@ -322,6 +401,8 @@ class ReceiptExtractor:
|
||||
result.receipt_series, _ = self._extract_series(text_upper)
|
||||
result.partner_name, result.confidence_vendor = self._extract_vendor(text)
|
||||
result.cui, _ = self._extract_cui(text_upper, text)
|
||||
# Normalize CUI: fix R0 → RO OCR error and validate format
|
||||
result.cui = OCRValidationEngine.normalize_cui(result.cui)
|
||||
|
||||
# Extract additional fields - Multiple TVA entries
|
||||
result.tva_entries, result.tva_total = self._extract_tva_entries(text_upper)
|
||||
@@ -345,10 +426,35 @@ class ReceiptExtractor:
|
||||
result.address = self._extract_address(text_upper)
|
||||
result.payment_methods = self._extract_payment_methods(text_upper)
|
||||
|
||||
# Validate payment methods against extracted amount
|
||||
# If payment sum >> amount, clear invalid payments (likely OCR error)
|
||||
# Save original payment methods before validation (for payment mode detection)
|
||||
original_payment_methods = result.payment_methods.copy() if result.payment_methods else []
|
||||
|
||||
result.payment_methods = self._validate_payment_methods(result.payment_methods, result.amount)
|
||||
|
||||
# Auto-suggest payment_mode based on detected payment methods
|
||||
# Use ORIGINAL payment_methods to detect CARD even if validation cleared them
|
||||
# (e.g., CARD 318.16 is valid even if total validation failed)
|
||||
payment_methods_for_mode = result.payment_methods if result.payment_methods else original_payment_methods
|
||||
if payment_methods_for_mode:
|
||||
card_amount = sum(
|
||||
pm.get('amount', Decimal('0'))
|
||||
for pm in payment_methods_for_mode
|
||||
if pm.get('method') == 'CARD'
|
||||
)
|
||||
if card_amount > 0:
|
||||
result.suggested_payment_mode = 'banca'
|
||||
print(f"[Payment Mode] CARD detected ({card_amount}), suggesting 'banca'", flush=True)
|
||||
else:
|
||||
# Only cash payments detected
|
||||
result.suggested_payment_mode = 'numerar'
|
||||
print(f"[Payment Mode] Cash only detected, suggesting 'numerar'", flush=True)
|
||||
|
||||
# Extract client data (B2B receipts)
|
||||
client_name, client_cui, client_address, confidence_client = self._extract_client_data(text_upper, text)
|
||||
result.client_name = client_name
|
||||
result.client_cui = client_cui
|
||||
result.client_cui = OCRValidationEngine.normalize_cui(client_cui) # Fix R0 → RO OCR error
|
||||
result.client_address = client_address
|
||||
result.confidence_client = confidence_client
|
||||
|
||||
@@ -378,13 +484,28 @@ class ReceiptExtractor:
|
||||
|
||||
def _extract_amount(self, text: str) -> Tuple[Optional[Decimal], float]:
|
||||
"""Extract total amount from text."""
|
||||
# PRE-FILTER: Remove lines containing REST (rest = change, not total)
|
||||
# When paid by card, there's no change - exact amount is paid
|
||||
lines = text.split('\n')
|
||||
filtered_lines = []
|
||||
for line in lines:
|
||||
# Skip lines with REST pattern (change amount, not total)
|
||||
if re.search(r'\bREST\b', line, re.IGNORECASE):
|
||||
continue
|
||||
filtered_lines.append(line)
|
||||
text = '\n'.join(filtered_lines)
|
||||
|
||||
# First try standard patterns (TOTAL, SUBTOTAL, etc.)
|
||||
for pattern, confidence in self.TOTAL_PATTERNS:
|
||||
match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
|
||||
if match:
|
||||
try:
|
||||
amount_str = re.sub(r'[^\d.,]', '', match.group(1))
|
||||
# IMPORTANT: Call _normalize_number FIRST to handle "190 60" → "190.60"
|
||||
# before stripping other characters
|
||||
amount_str = match.group(1).strip()
|
||||
amount_str = self._normalize_number(amount_str)
|
||||
# Now remove any remaining non-numeric chars (except decimal point)
|
||||
amount_str = re.sub(r'[^\d.]', '', amount_str)
|
||||
amount = Decimal(amount_str)
|
||||
if amount > 0:
|
||||
return amount, confidence
|
||||
@@ -461,8 +582,22 @@ class ReceiptExtractor:
|
||||
|
||||
def _normalize_number(self, num_str: str) -> str:
|
||||
"""Normalize Romanian number format to standard decimal."""
|
||||
# Remove spaces
|
||||
num_str = num_str.replace(' ', '')
|
||||
# OCR often reads "." as " " (space) - handle "190 60" as "190.60"
|
||||
# Pattern: digits + space + exactly 2 digits at end
|
||||
space_decimal_match = re.match(r'^(\d+)\s+(\d{2})$', num_str.strip())
|
||||
if space_decimal_match:
|
||||
num_str = f"{space_decimal_match.group(1)}.{space_decimal_match.group(2)}"
|
||||
else:
|
||||
# Handle "1 234 56" pattern (thousands + decimal with spaces)
|
||||
# Match: digits + space(s) + digits + space + 2 digits
|
||||
multi_space_match = re.match(r'^([\d\s]+?)\s+(\d{2})$', num_str.strip())
|
||||
if multi_space_match:
|
||||
integer_part = multi_space_match.group(1).replace(' ', '')
|
||||
decimal_part = multi_space_match.group(2)
|
||||
num_str = f"{integer_part}.{decimal_part}"
|
||||
else:
|
||||
# Remove remaining spaces (thousands separators)
|
||||
num_str = num_str.replace(' ', '')
|
||||
|
||||
# Handle comma as decimal separator
|
||||
if ',' in num_str and '.' in num_str:
|
||||
@@ -532,34 +667,57 @@ class ReceiptExtractor:
|
||||
except (InvalidOperation, ValueError, TypeError):
|
||||
pass
|
||||
|
||||
# Case 1: Amount is valid with high confidence - just validate
|
||||
# Case 1: Amount is valid with high confidence - validate against TVA and payments
|
||||
if amount and amount > 0 and confidence_amount >= 0.8:
|
||||
# Cross-validate: check if it matches payment methods
|
||||
# First check TVA-implied total (most reliable when TVA is extracted correctly)
|
||||
if tva_implied_total and tva_implied_total > 0:
|
||||
tva_diff_percent = abs(float(amount) - float(tva_implied_total)) / float(tva_implied_total) * 100
|
||||
if tva_diff_percent <= 1:
|
||||
# Near-perfect TVA match - highest confidence
|
||||
return amount, min(0.98, confidence_amount + 0.05), "extracted (validated by TVA)"
|
||||
elif tva_diff_percent > 10:
|
||||
# Significant mismatch - TVA-implied total is more reliable
|
||||
# This catches cases where wrong TOTAL line was extracted (e.g., REST, SUBTOTAL)
|
||||
print(f"[Cross-Validation] Amount mismatch with TVA: extracted={amount}, tva_implied={tva_implied_total} (diff={tva_diff_percent:.1f}%)", flush=True)
|
||||
return tva_implied_total, 0.90, "calculated from TVA (extracted amount mismatch)"
|
||||
|
||||
# Cross-validate with payment methods
|
||||
if payment_sum > 0 and abs(amount - payment_sum) <= Decimal('0.02'):
|
||||
# Perfect match - boost confidence
|
||||
return amount, min(0.98, confidence_amount + 0.05), "extracted (validated by payment methods)"
|
||||
elif payment_sum > 0:
|
||||
payment_diff_percent = abs(float(amount) - float(payment_sum)) / float(payment_sum) * 100
|
||||
if payment_diff_percent > 10:
|
||||
# Significant mismatch - payment sum is more reliable
|
||||
print(f"[Cross-Validation] Amount mismatch with payments: extracted={amount}, payments={payment_sum} (diff={payment_diff_percent:.1f}%)", flush=True)
|
||||
return payment_sum, 0.88, "calculated from payment methods (extracted amount mismatch)"
|
||||
|
||||
return amount, confidence_amount, "extracted"
|
||||
|
||||
# Case 2: Amount exists but low confidence - try to validate/correct
|
||||
if amount and amount > 0:
|
||||
# First check TVA-implied total (most reliable)
|
||||
if tva_implied_total and tva_implied_total > 0:
|
||||
tva_diff_percent = abs(float(amount) - float(tva_implied_total)) / float(tva_implied_total) * 100
|
||||
if tva_diff_percent <= 2:
|
||||
# Close match - boost confidence
|
||||
return amount, 0.88, "extracted (validated by TVA)"
|
||||
elif tva_diff_percent > 10:
|
||||
# Significant mismatch - use TVA-implied total
|
||||
print(f"[Cross-Validation] Amount mismatch with TVA: extracted={amount}, tva_implied={tva_implied_total} (diff={tva_diff_percent:.1f}%)", flush=True)
|
||||
return tva_implied_total, 0.85, "calculated from TVA"
|
||||
|
||||
# Check if payment methods sum matches
|
||||
if payment_sum > 0:
|
||||
if abs(amount - payment_sum) <= Decimal('0.02'):
|
||||
# Match - boost confidence
|
||||
payment_diff_percent = abs(float(amount) - float(payment_sum)) / float(payment_sum) * 100
|
||||
if payment_diff_percent <= 0.5:
|
||||
# Close match - boost confidence
|
||||
return amount, 0.90, "extracted (validated by payment methods)"
|
||||
else:
|
||||
elif payment_diff_percent > 10:
|
||||
# Mismatch - prefer payment_sum as it's more reliable
|
||||
print(f"[Cross-Validation] Amount mismatch: extracted={amount}, payments={payment_sum}", flush=True)
|
||||
return payment_sum, 0.85, "calculated from payment methods"
|
||||
|
||||
# Check TVA-implied total
|
||||
if tva_implied_total:
|
||||
if abs(amount - tva_implied_total) <= Decimal('0.50'):
|
||||
# Close match - use extracted amount
|
||||
return amount, 0.80, "extracted (validated by TVA)"
|
||||
else:
|
||||
print(f"[Cross-Validation] TVA mismatch: extracted={amount}, tva_implied={tva_implied_total}", flush=True)
|
||||
|
||||
# No validation possible - return as-is
|
||||
return amount, confidence_amount, "extracted (unvalidated)"
|
||||
|
||||
@@ -701,6 +859,10 @@ class ReceiptExtractor:
|
||||
|
||||
line_upper = line.upper()
|
||||
|
||||
# Skip lines with skip keywords (CUMPARATOR, CLIENT, etc.)
|
||||
if any(kw in line_upper for kw in skip_keywords):
|
||||
continue
|
||||
|
||||
# Check for vendor indicators
|
||||
for indicator in self.VENDOR_INDICATORS:
|
||||
if re.search(indicator, line_upper):
|
||||
@@ -778,13 +940,21 @@ class ReceiptExtractor:
|
||||
Extract vendor CUI (fiscal identification code) from text.
|
||||
Excludes CLIENT CUI which appears as 'CLIENT C.U.I./C.I.F.:...'
|
||||
"""
|
||||
def get_cui_digit_count(cui: str) -> int:
    """Return the number of decimal digits in a captured CUI string.

    The value is upper-cased and stripped of surrounding whitespace, then a
    leading "RO" prefix (or its OCR variant "R0") is removed before counting.

    Only decimal digits are counted. The ``R[O0]?\\d{6,10}`` capture patterns
    can yield a lone "R" followed directly by digits; counting the string
    length would treat that "R" as a digit and skew the 6-10 digit range
    check performed by the callers.

    Args:
        cui: Captured CUI text, with or without an RO/R0 prefix.

    Returns:
        The count of decimal digits that follow the optional prefix.
    """
    cui_upper = cui.upper().strip()
    # "R0" is the usual OCR misread of "RO"; both are a prefix, not digits.
    if cui_upper.startswith(('RO', 'R0')):
        cui_upper = cui_upper[2:]
    # isdecimal() matches the same character class as regex \d on str.
    return sum(1 for ch in cui_upper if ch.isdecimal())
|
||||
|
||||
# Strategy 0: Check for reversed format (CIF NUMBER on line BEFORE "C.I.F." label)
|
||||
# This is common in some receipts: "R011201891\nC. I. F."
|
||||
# This is common in some receipts: "RO11201891\nC. I. F."
|
||||
for pattern, confidence in self.CUI_REVERSED_PATTERNS:
|
||||
match = re.search(pattern, text_upper, re.IGNORECASE | re.MULTILINE)
|
||||
if match:
|
||||
cui = match.group(1)
|
||||
if 6 <= len(cui) <= 10:
|
||||
digit_count = get_cui_digit_count(cui)
|
||||
if 6 <= digit_count <= 10:
|
||||
# Verify this is not the CLIENT CUI by checking context
|
||||
start = match.start()
|
||||
# Check 50 chars before the match for CLIENT keyword
|
||||
@@ -805,7 +975,8 @@ class ReceiptExtractor:
|
||||
match = re.search(pattern, line, re.IGNORECASE | re.MULTILINE)
|
||||
if match:
|
||||
cui = match.group(1)
|
||||
if 6 <= len(cui) <= 10:
|
||||
digit_count = get_cui_digit_count(cui)
|
||||
if 6 <= digit_count <= 10:
|
||||
return cui, confidence
|
||||
|
||||
# Strategy 2: Fallback - search entire text but exclude CLIENT patterns
|
||||
@@ -813,7 +984,8 @@ class ReceiptExtractor:
|
||||
# Find all matches
|
||||
for match in re.finditer(pattern, text_upper, re.IGNORECASE | re.MULTILINE):
|
||||
cui = match.group(1)
|
||||
if 6 <= len(cui) <= 10:
|
||||
digit_count = get_cui_digit_count(cui)
|
||||
if 6 <= digit_count <= 10:
|
||||
# Check if this match is preceded by CLIENT in the same line
|
||||
start = match.start()
|
||||
line_start = text_upper.rfind('\n', 0, start) + 1
|
||||
@@ -937,9 +1109,90 @@ class ReceiptExtractor:
|
||||
except (ValueError, InvalidOperation):
|
||||
continue
|
||||
|
||||
# Pattern 1: "TVA A - 19%: 15.20" or "TVAA - 21% 32.31" (with code)
|
||||
# OCR tolerant: TUA, TVR, etc.
|
||||
pattern_with_code = r'T[VU][AR]\s*([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)'
|
||||
# Pattern 0c: REVERSED FORMAT "5.00% TUA*B" followed by amount on next line
|
||||
# This handles receipts where percentage comes BEFORE TVA code (e.g., books with 5% rate)
|
||||
# Matches: "5.00% TUA*B", "5% TVA B", "5.00% TVA", "9% TUA", "5% IVA"
|
||||
if not tva_entries:
|
||||
# Pattern: PERCENT% + TVA/IVA + optional code, then amount on next line
|
||||
reversed_tva_pattern = r'(\d{1,2})[.,]?\d{0,2}\s*%\s*(?:T[VU][AR]|IVA)\s*\*?([A-D])?'
|
||||
for match in re.finditer(reversed_tva_pattern, normalized_text, re.IGNORECASE):
|
||||
try:
|
||||
percent = int(match.group(1))
|
||||
code = (match.group(2) or self._get_tva_code_from_percent(percent)).upper()
|
||||
|
||||
# Look for amount on the next line(s) after the match
|
||||
after_match = normalized_text[match.end():]
|
||||
# Find standalone number (amount) - skip empty lines
|
||||
amount_match = re.search(r'^[\s\n]*([\d]+[.,]\d{2})\b', after_match)
|
||||
if amount_match:
|
||||
amount_str = self._normalize_number(amount_match.group(1))
|
||||
amount = Decimal(amount_str)
|
||||
if amount > 0:
|
||||
entry_key = (code, percent)
|
||||
if entry_key not in seen_entries:
|
||||
tva_entries.append({
|
||||
'code': code,
|
||||
'percent': percent,
|
||||
'amount': amount
|
||||
})
|
||||
seen_entries.add(entry_key)
|
||||
except (ValueError, InvalidOperation):
|
||||
continue
|
||||
|
||||
# Pattern 0d: "TOTAL TUA:", "TOTAL TVA:", "TOTAL IVA:" with amount (OCR variants)
|
||||
if not tva_entries:
|
||||
total_tva_simple = r'TOTAL\s+(?:T[VU][AR]|IVA)\s*:?\s*([\d.,]+)'
|
||||
match = re.search(total_tva_simple, normalized_text, re.IGNORECASE)
|
||||
if match:
|
||||
try:
|
||||
amount_str = self._normalize_number(match.group(1))
|
||||
amount = Decimal(amount_str)
|
||||
if amount > 0:
|
||||
# Try to find the rate in nearby text
|
||||
percent = self._detect_tva_percent(text)
|
||||
if percent:
|
||||
code = self._get_tva_code_from_percent(percent)
|
||||
entry_key = (code, percent)
|
||||
if entry_key not in seen_entries:
|
||||
tva_entries.append({
|
||||
'code': code,
|
||||
'percent': percent,
|
||||
'amount': amount
|
||||
})
|
||||
seen_entries.add(entry_key)
|
||||
except (ValueError, InvalidOperation):
|
||||
pass
|
||||
|
||||
# Pattern 0e: Multiline "TOTAL TUA\n198\n30.43" where:
|
||||
# - "TOTAL TUA" on one line
|
||||
# - "198" or similar (corrupted "19%") on next line (optional)
|
||||
# - "30.43" (TVA amount) on following line
|
||||
# OCR often splits this across multiple lines
|
||||
if not tva_entries:
|
||||
multiline_tva = r'TOTAL\s+(?:T[VU][AR]L?|TU[AR]L|IVA)\s*\n\s*\d*\s*\n?\s*([\d]+[.,]\d{2})\b'
|
||||
match = re.search(multiline_tva, normalized_text, re.IGNORECASE)
|
||||
if match:
|
||||
try:
|
||||
amount_str = self._normalize_number(match.group(1))
|
||||
amount = Decimal(amount_str)
|
||||
if amount > 0:
|
||||
percent = self._detect_tva_percent(text)
|
||||
if percent:
|
||||
code = self._get_tva_code_from_percent(percent)
|
||||
entry_key = (code, percent)
|
||||
if entry_key not in seen_entries:
|
||||
tva_entries.append({
|
||||
'code': code,
|
||||
'percent': percent,
|
||||
'amount': amount
|
||||
})
|
||||
seen_entries.add(entry_key)
|
||||
except (ValueError, InvalidOperation):
|
||||
pass
|
||||
|
||||
# Pattern 1: "TVA A - 19%: 15.20" or "TVAA - 21% 32.31" or "IVA A - 19%" (with code)
|
||||
# OCR tolerant: TUA, TVR, IVA, etc.
|
||||
pattern_with_code = r'(?:T[VU][AR]|IVA)\s*([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)'
|
||||
for match in re.finditer(pattern_with_code, normalized_text, re.IGNORECASE):
|
||||
try:
|
||||
code = match.group(1).upper()
|
||||
@@ -959,9 +1212,9 @@ class ReceiptExtractor:
|
||||
except (ValueError, InvalidOperation):
|
||||
continue
|
||||
|
||||
# Pattern 2: "TVA - 21%: 32.31" (without explicit code, assume 'A')
|
||||
# Pattern 2: "TVA - 21%: 32.31" or "IVA - 21%: 32.31" (without explicit code, assume 'A')
|
||||
if not tva_entries:
|
||||
pattern_no_code = r'T[VU][AR]\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)'
|
||||
pattern_no_code = r'(?:T[VU][AR]|IVA)\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)'
|
||||
for match in re.finditer(pattern_no_code, normalized_text, re.IGNORECASE):
|
||||
try:
|
||||
percent = int(match.group(1))
|
||||
@@ -982,10 +1235,10 @@ class ReceiptExtractor:
|
||||
except (ValueError, InvalidOperation):
|
||||
continue
|
||||
|
||||
# Pattern 3: "TOTAL TVA A - 21%" with amount on same line or "TOTAL TVA BON" with amount
|
||||
# Pattern 3: "TOTAL TVA A - 21%" or "TOTAL IVA" with amount on same line or "TOTAL TVA BON" with amount
|
||||
if not tva_entries:
|
||||
# First try: "TOTAL TVA A - 21% 32.31" (amount on same line)
|
||||
tva_with_amount = r'TOTAL\s+T[VU][AR]\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*([\d.,]+)'
|
||||
# First try: "TOTAL TVA A - 21% 32.31" or "TOTAL IVA A - 21% 32.31" (amount on same line)
|
||||
tva_with_amount = r'TOTAL\s+(?:T[VU][AR]|IVA)\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*([\d.,]+)'
|
||||
for match in re.finditer(tva_with_amount, normalized_text, re.IGNORECASE):
|
||||
try:
|
||||
code = match.group(1).upper()
|
||||
@@ -1004,16 +1257,16 @@ class ReceiptExtractor:
|
||||
except (ValueError, InvalidOperation):
|
||||
continue
|
||||
|
||||
# Pattern 3b: "TOTAL TVA A - 21%" on one line, look for "TOTAL TVA BON" amount
|
||||
# Pattern 3b: "TOTAL TVA A - 21%" or "TOTAL IVA A - 21%" on one line, look for "TOTAL TVA BON" amount
|
||||
if not tva_entries:
|
||||
tva_total_pattern = r'TOTAL\s+T[VU][AR]\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%'
|
||||
tva_total_pattern = r'TOTAL\s+(?:T[VU][AR]|IVA)\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%'
|
||||
for match in re.finditer(tva_total_pattern, normalized_text, re.IGNORECASE):
|
||||
try:
|
||||
code = match.group(1).upper()
|
||||
percent = int(match.group(2))
|
||||
|
||||
# Look for "TOTAL TVA BON" followed by amount
|
||||
tva_bon_pattern = r'TOTAL\s+T[VU][AR]\s+BON[:\s]*([\d.,]+)'
|
||||
# Look for "TOTAL TVA BON" or "TOTAL IVA BON" followed by amount
|
||||
tva_bon_pattern = r'TOTAL\s+(?:T[VU][AR]|IVA)\s+BON[:\s]*([\d.,]+)'
|
||||
tva_bon_match = re.search(tva_bon_pattern, normalized_text, re.IGNORECASE)
|
||||
if tva_bon_match:
|
||||
amount_str = self._normalize_number(tva_bon_match.group(1))
|
||||
@@ -1029,8 +1282,8 @@ class ReceiptExtractor:
|
||||
seen_entries.add(entry_key)
|
||||
continue
|
||||
|
||||
# Fallback: Amount after TOTAL TVA BON on next line
|
||||
tva_bon_pos = re.search(r'TOTAL\s+T[VU][AR]\s+BON', normalized_text, re.IGNORECASE)
|
||||
# Fallback: Amount after TOTAL TVA BON or TOTAL IVA BON on next line
|
||||
tva_bon_pos = re.search(r'TOTAL\s+(?:T[VU][AR]|IVA)\s+BON', normalized_text, re.IGNORECASE)
|
||||
if tva_bon_pos:
|
||||
after_bon = normalized_text[tva_bon_pos.end():]
|
||||
# Find first standalone number (likely TVA amount)
|
||||
@@ -1050,9 +1303,9 @@ class ReceiptExtractor:
|
||||
except (ValueError, InvalidOperation):
|
||||
continue
|
||||
|
||||
# Pattern 3b: "TVAA - 21%" on one line, amount on next line (simpler format)
|
||||
# Pattern 3c: "TVAA - 21%" or "IVA A - 21%" on one line, amount on next line (simpler format)
|
||||
if not tva_entries:
|
||||
tva_line_pattern = r'T[VU][AR]\s*([A-D])?\s*[-:]\s*(\d{1,2})\s*%'
|
||||
tva_line_pattern = r'(?:T[VU][AR]|IVA)\s*([A-D])?\s*[-:]\s*(\d{1,2})\s*%'
|
||||
for match in re.finditer(tva_line_pattern, normalized_text, re.IGNORECASE):
|
||||
try:
|
||||
code = (match.group(1) or 'A').upper()
|
||||
@@ -1158,16 +1411,18 @@ class ReceiptExtractor:
|
||||
Extract TOTAL TVA BON value separately as the reference.
|
||||
This is the authoritative total TVA on the receipt.
|
||||
|
||||
Handles OCR variations: TOTAL TVA BON, OTAL TUA BON, etc.
|
||||
Handles OCR variations: TOTAL TVA BON, OTAL TUA BON, TOTAL IVA BON, etc.
|
||||
"""
|
||||
# Pattern for TOTAL TVA BON with amount after
|
||||
# Pattern for TOTAL TVA BON or TOTAL IVA BON with amount after
|
||||
# OCR corruptions: TUAL (TVA+L merged), TVAL, TUAI, etc.
|
||||
patterns = [
|
||||
# Standard: TOTAL TVA BON: 14.92
|
||||
r'T?OTAL\s+T[VU][AR]\s+BON\s*:?\s*([\d]+[.,]\d{2})\b',
|
||||
# Standard: TOTAL TVA BON: 14.92 or TOTAL IVA BON: 14.92
|
||||
# Handles: TUAL (TVA+L), TVAL, TUAI, etc. with optional trailing letters
|
||||
r'T?OTAL\s+(?:T[VU][AR]L?|TU[AR]L|IVA)\s+BON\s*:?\s*([\d]+[.,]\d{2})\b',
|
||||
# Amount before: 14.92 OTAL TUA BON (OCR line break)
|
||||
r'([\d]+[.,]\d{2})\s*\n?\s*T?OTAL\s+T[VU][AR]\s+BON',
|
||||
# Amount on next line after TOTAL TVA BON
|
||||
r'T?OTAL\s+T[VU][AR]\s+BON\s*\n\s*([\d]+[.,]\d{2})\b',
|
||||
r'([\d]+[.,]\d{2})\s*\n?\s*T?OTAL\s+(?:T[VU][AR]L?|TU[AR]L|IVA)\s+BON',
|
||||
# Amount on next line after TOTAL TVA BON or TOTAL IVA BON
|
||||
r'T?OTAL\s+(?:T[VU][AR]L?|TU[AR]L|IVA)\s+BON\s*\n\s*([\d]+[.,]\d{2})\b',
|
||||
]
|
||||
|
||||
for pattern in patterns:
|
||||
@@ -1271,18 +1526,52 @@ class ReceiptExtractor:
|
||||
return tva_entries, tva_total
|
||||
|
||||
def _detect_tva_percent(self, text: str) -> Optional[int]:
|
||||
"""Detect TVA percentage from text content."""
|
||||
# Look for common Romanian TVA percentages
|
||||
if '19%' in text or '19 %' in text:
|
||||
"""Detect TVA percentage from text content.
|
||||
|
||||
IMPORTANT: Prioritize rates found near TVA markers over rates found elsewhere.
|
||||
E.g., "REDUCERE 5%" should not override "TVA A 19%".
|
||||
Also handle OCR corruptions like "194" for "19%" in "TOTAL TA F 194".
|
||||
"""
|
||||
import re as regex
|
||||
|
||||
# First, look for percent NEAR TVA markers (most reliable)
|
||||
# This handles "TVA A 19%", "TVA 19,00%", "TOTAL TVA 19%"
|
||||
tva_context_patterns = [
|
||||
r'T[VU][AR]\s*[A-D]?\s*[-:]?\s*(19|21|11|9|5)[.,]?\s*\d{0,2}\s*%',
|
||||
r'IVA\s*[A-D]?\s*[-:]?\s*(19|21|11|9|5)[.,]?\s*\d{0,2}\s*%',
|
||||
# OCR corruption: "TOTAL TA F 194" where 194 = 19% (4 is artifact)
|
||||
r'TOTAL\s+T[VA][AR]?\s*[F\s]?\s*(19|21)\d\b',
|
||||
]
|
||||
for pattern in tva_context_patterns:
|
||||
match = regex.search(pattern, text, regex.IGNORECASE)
|
||||
if match:
|
||||
rate = int(match.group(1))
|
||||
if rate in (19, 21, 11, 9, 5):
|
||||
return rate
|
||||
|
||||
# Fallback: Look for common Romanian TVA percentages anywhere
|
||||
# But EXCLUDE patterns near "REDUCERE", "DISCOUNT", "RED." (these are discounts, not TVA)
|
||||
# Clean text by removing discount context
|
||||
# Handle OCR corruptions: RED.CERE (C instead of U), RED CERE, REDUC, etc.
|
||||
text_no_discount = regex.sub(r'(?:REDUC|DISCOUNT|RED)[.\sA-Z]*\d+[.,]?\d*\s*%', '', text, flags=regex.IGNORECASE)
|
||||
|
||||
# Now search in cleaned text (priority order: 19% > 21% > 11% > 9% > 5%)
|
||||
if regex.search(r'\b19[.,]?\s*\d{0,2}\s*%', text_no_discount):
|
||||
return 19
|
||||
elif '21%' in text or '21 %' in text:
|
||||
elif regex.search(r'\b21[.,]?\s*\d{0,2}\s*%', text_no_discount):
|
||||
return 21
|
||||
elif '11%' in text or '11 %' in text:
|
||||
elif regex.search(r'\b11[.,]?\s*\d{0,2}\s*%', text_no_discount):
|
||||
return 11
|
||||
elif '9%' in text or '9 %' in text:
|
||||
elif regex.search(r'\b9[.,]?\s*\d{0,2}\s*%', text_no_discount):
|
||||
return 9
|
||||
elif '5%' in text or '5 %' in text:
|
||||
elif regex.search(r'\b5[.,]?\s*\d{0,2}\s*%', text_no_discount):
|
||||
return 5
|
||||
|
||||
# Default: If no percent found but we're in Romanian receipt context,
|
||||
# assume 19% (standard rate)
|
||||
if regex.search(r'T[VU][AR]|IVA', text, regex.IGNORECASE):
|
||||
return 19
|
||||
|
||||
return None
|
||||
|
||||
def _validate_tva_reverse(
|
||||
@@ -1293,9 +1582,12 @@ class ReceiptExtractor:
|
||||
"""
|
||||
Reverse TVA validation: from TVA amount and rate, calculate expected total.
|
||||
|
||||
Formula:
|
||||
base = tva_amount / (rate/100)
|
||||
expected_total = sum(base + tva_amount) for all entries
|
||||
Formula (CORRECT):
|
||||
For TVA that is INCLUDED in total (standard Romanian receipts):
|
||||
total = base + tva
|
||||
tva = base * rate/100
|
||||
Therefore: base = tva * 100 / rate
|
||||
And: total = base + tva = tva * 100 / rate + tva = tva * (100 + rate) / rate
|
||||
|
||||
Returns (is_valid, expected_total, message)
|
||||
"""
|
||||
@@ -1307,10 +1599,14 @@ class ReceiptExtractor:
|
||||
tva_amount = entry['amount']
|
||||
rate = Decimal(str(entry['percent']))
|
||||
|
||||
print(f"[TVA Debug] Entry: amount={tva_amount}, rate={rate}%", flush=True)
|
||||
|
||||
if rate > 0:
|
||||
# Calculate base from TVA: base = tva / (rate/100)
|
||||
base = tva_amount / (rate / Decimal('100'))
|
||||
expected_total += base + tva_amount
|
||||
# CORRECT formula: total = tva * (100 + rate) / rate
|
||||
# Example: tva=55.22, rate=21 → total = 55.22 * 121 / 21 = 318.16
|
||||
gross_for_entry = tva_amount * (Decimal('100') + rate) / rate
|
||||
expected_total += gross_for_entry
|
||||
print(f"[TVA Debug] Calculated gross: {gross_for_entry}", flush=True)
|
||||
else:
|
||||
# 0% TVA - can't calculate base, skip
|
||||
pass
|
||||
@@ -1393,7 +1689,7 @@ class ReceiptExtractor:
|
||||
|
||||
# Find the region between TOTAL LEI and TOTAL TVA
|
||||
total_lei_match = re.search(r'TOTAL\s+LEI\s*([\d\s.,]+)', normalized_text, re.IGNORECASE)
|
||||
total_tva_match = re.search(r'TOTAL\s+T[VU][AR]', normalized_text, re.IGNORECASE)
|
||||
total_tva_match = re.search(r'TOTAL\s+(?:T[VU][AR]|IVA)', normalized_text, re.IGNORECASE)
|
||||
|
||||
# Define search region (after TOTAL LEI, before TOTAL TVA if exists)
|
||||
if total_lei_match:
|
||||
@@ -1404,22 +1700,60 @@ class ReceiptExtractor:
|
||||
search_region = normalized_text # Fallback to full text
|
||||
|
||||
for pattern, method, confidence in self.PAYMENT_METHOD_PATTERNS:
|
||||
for match in re.finditer(pattern, search_region, re.IGNORECASE):
|
||||
for match in re.finditer(pattern, search_region, re.IGNORECASE | re.MULTILINE):
|
||||
try:
|
||||
amount_str = match.group(1).replace(' ', '')
|
||||
amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
|
||||
amount = Decimal(amount_str)
|
||||
if amount > 0 and method not in seen_methods:
|
||||
# Validate: amount must be positive and reasonable (< MAX_REASONABLE_PAYMENT)
|
||||
# This prevents OCR errors like CUI being parsed as payment
|
||||
if amount > 0 and amount < self.MAX_REASONABLE_PAYMENT and method not in seen_methods:
|
||||
payment_methods.append({
|
||||
'method': method,
|
||||
'amount': amount
|
||||
})
|
||||
seen_methods.add(method)
|
||||
print(f"[Payment] Found {method}: {amount} (pattern matched)", flush=True)
|
||||
elif amount >= self.MAX_REASONABLE_PAYMENT:
|
||||
print(f"[Payment] Rejected unreasonable amount {amount} for {method} (likely OCR error)", flush=True)
|
||||
except (InvalidOperation, ValueError):
|
||||
continue
|
||||
|
||||
return payment_methods
|
||||
|
||||
def _validate_payment_methods(
|
||||
self, payment_methods: List[dict], total: Optional[Decimal]
|
||||
) -> List[dict]:
|
||||
"""
|
||||
Validate payment methods against extracted total.
|
||||
|
||||
If payment sum is way larger than total (>10x), it's likely an OCR error
|
||||
(e.g., CUI number parsed as payment amount). Clear invalid payments.
|
||||
|
||||
Args:
|
||||
payment_methods: List of {'method': str, 'amount': Decimal}
|
||||
total: Extracted total amount
|
||||
|
||||
Returns:
|
||||
Validated payment methods (may be empty if all were invalid)
|
||||
"""
|
||||
if not total or not payment_methods:
|
||||
return payment_methods
|
||||
|
||||
payment_sum = sum(pm.get('amount', Decimal('0')) for pm in payment_methods)
|
||||
|
||||
# If payment sum > 10x total, it's definitely an error
|
||||
if payment_sum > total * 10:
|
||||
print(f"[Payment Validation] Payment sum {payment_sum} >> Total {total} (>10x), clearing invalid payments", flush=True)
|
||||
return []
|
||||
|
||||
# If payment sum > 2x total, it's suspicious but might be valid in some edge cases
|
||||
# Just log a warning
|
||||
if payment_sum > total * 2:
|
||||
print(f"[Payment Validation] Warning: Payment sum {payment_sum} > 2x Total {total}, possible OCR error", flush=True)
|
||||
|
||||
return payment_methods
|
||||
|
||||
def _extract_client_data(
|
||||
self, text_upper: str, original_text: str
|
||||
) -> Tuple[Optional[str], Optional[str], Optional[str], float]:
|
||||
|
||||
Reference in New Issue
Block a user