feat(ocr): Add validation system and CLIENT CUI extraction
OCR Data Extraction Validation System:
- Add 7 validation rules (amount range, TVA ratio, payment sum, etc.)
- Add Medium preprocessing to replace Heavy (fixes digit concatenation)
- Add validation warnings to API responses
- Flag receipts needing manual review (needs_manual_review field)
- Add database migration for needs_manual_review column

CLIENT CUI Extraction Improvements:
- Support all format variations: CIF CLIENT:, CLIENT C.U.I/C.I.F., etc.
- Handle OCR errors (R0 vs RO, C1F vs CIF)
- Add client_name, client_cui, client_address to API response
- Add validation fields to API response (was missing)

QA Review: 12 issues found, 9 fixed (5 errors + 4 warnings)
- Fixed type safety in validation rules
- Fixed ZeroDivisionError risk
- Fixed schema mismatch (Optional[bool] for needs_manual_review)
- All 37 unit tests passing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -38,6 +38,13 @@ class ExtractionResult:
|
||||
ocr_engine: str = "" # OCR engine used: paddleocr or tesseract
|
||||
processing_time_ms: int = 0 # Processing time in milliseconds
|
||||
|
||||
# Validation tracking (added by bon-ocr-validation feature)
|
||||
needs_manual_review: Optional[bool] = None # None=not validated, False=ok, True=needs review
|
||||
validation_warnings: List[str] = field(default_factory=list)
|
||||
validation_errors: List[str] = field(default_factory=list)
|
||||
confidence_adjustments: dict[str, float] = field(default_factory=dict) # Field -> penalty
|
||||
inter_ocr_ratios: dict[str, float] = field(default_factory=dict) # Field -> ratio
|
||||
|
||||
@property
|
||||
def overall_confidence(self) -> float:
|
||||
"""Calculate weighted overall confidence score."""
|
||||
@@ -238,10 +245,18 @@ class ReceiptExtractor:
|
||||
|
||||
# Client/Buyer patterns (for B2B receipts)
|
||||
# CLIENT, CUMPARATOR, BENEFICIAR sections
|
||||
# Variations: "CIF CLIENT:", "CLIENT C.U.I/C.I.F.", "CLIENT C. U. I./ C. I.F."
|
||||
CLIENT_SECTION_MARKERS = [
|
||||
r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:', # CIF CLIENT: (reversed format)
|
||||
r'C\.?\s*U\.?\s*I\.?\s+CLIENT\s*:', # CUI CLIENT: (reversed format)
|
||||
# Reversed format: CIF/CUI before CLIENT
|
||||
r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:', # CIF CLIENT:
|
||||
r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:', # CUI CLIENT:
|
||||
# CLIENT followed by C.U.I./C.I.F. (all variations with/without spaces and dots)
|
||||
# Handles: CLIENT C.U.I/C.I.F., CLIENT C. U. I./ C. I.F., CLIENT CUI/CIF
|
||||
r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/?\s*C?\.?\s*[I1]?\.?\s*F?\.?\s*:',
|
||||
r'CLIENT\s+C\.?\s*[UI1]\.?\s*[IF1]\.?\s*:', # CLIENT CUI: or CLIENT CIF:
|
||||
r'CLIENT\s*:',
|
||||
# CUMPARATOR variants
|
||||
r'CUMPARATOR\s+C\.?\s*[UI1]\.?\s*[IF1]\.?\s*:', # CUMPARATOR CUI: or CIF:
|
||||
r'CUMPARATOR\s*:',
|
||||
r'BENEFICIAR\s*:',
|
||||
r'CUMP[AĂ]R[AĂ]TOR\s*:',
|
||||
@@ -250,25 +265,30 @@ class ReceiptExtractor:
|
||||
]
|
||||
|
||||
# Client CUI patterns (explicitly after CLIENT marker)
|
||||
# OCR errors: R0 instead of RO, C1F instead of CIF, 1 instead of I
|
||||
CLIENT_CUI_PATTERNS = [
|
||||
# CIF CLIENT: R01879856 (reversed format - CIF before CLIENT)
|
||||
(r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
|
||||
(r'C\.?\s*U\.?\s*I\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
|
||||
(r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
|
||||
(r'C\.?\s*U\.?\s*I\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
|
||||
# CLIENT C.U.I./ C.I.F. :R01879855 (slash variant with both labels)
|
||||
(r'CLIENT\s+C\.\s*U\.\s*I\.?\s*/\s*C\.\s*[I1]\.\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.97),
|
||||
(r'CLIENT\s+C\.?\s*U\.?\s*I\.?(?:\s*/\s*C\.?\s*[I1]\.?\s*F\.?)?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
|
||||
# CLIENT C.U.I. or CLIENT CUI or CLIENT CIF
|
||||
(r'CLIENT\s+C\.?\s*U\.?\s*I\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
|
||||
(r'CLIENT\s+C\.?\s*I\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
|
||||
(r'CUMPARATOR\s+C\.?\s*U\.?\s*I\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
|
||||
(r'CUMPARATOR\s+C\.?\s*I\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
|
||||
# CIF CLIENT: R01879856 (reversed format - CIF/CUI before CLIENT)
|
||||
(r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
|
||||
(r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
|
||||
(r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
|
||||
(r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
|
||||
# CLIENT C.U.I/C.I.F. or CLIENT C. U. I./ C. I.F. (slash variant - all spacing)
|
||||
# Most flexible pattern for slash variants
|
||||
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.97),
|
||||
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.97),
|
||||
# CLIENT C.U.I. or CLIENT CUI or CLIENT CIF (without slash)
|
||||
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
|
||||
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
|
||||
(r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
|
||||
(r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
|
||||
# CUMPARATOR variants
|
||||
(r'CUMPARATOR\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
|
||||
(r'CUMPARATOR\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
|
||||
# CUI/CIF on line immediately after CLIENT marker
|
||||
(r'CLIENT\s*:\s*\n\s*C\.?\s*U\.?\s*I\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
|
||||
(r'CLIENT\s*:\s*\n\s*C\.?\s*I\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
|
||||
(r'CLIENT\s*:\s*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
|
||||
(r'CLIENT\s*:\s*\n\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
|
||||
# CUI after client name: "CLIENT: COMPANY SRL\nCUI: 12345678"
|
||||
(r'CLIENT\s*:.*\n.*C\.?\s*U\.?\s*I\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
|
||||
(r'CLIENT\s*:.*\n.*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
|
||||
]
|
||||
|
||||
# Vendor name indicators (lines containing these are likely vendor names)
|
||||
|
||||
Reference in New Issue
Block a user