feat(ocr): Add validation system and CLIENT CUI extraction

OCR Data Extraction Validation System:
- Add 7 validation rules (amount range, TVA ratio, payment sum, etc.)
- Add Medium preprocessing to replace Heavy (fixes digit concatenation)
- Add validation warnings to API responses
- Flag receipts needing manual review (needs_manual_review field)
- Add database migration for needs_manual_review column

CLIENT CUI Extraction Improvements:
- Support all format variations: CIF CLIENT:, CLIENT C.U.I/C.I.F., etc.
- Handle OCR errors (R0 vs RO, C1F vs CIF)
- Add client_name, client_cui, client_address to API response
- Add validation fields to API response (was missing)

QA Review: 12 issues found, 9 fixed (5 errors + 4 warnings)
- Fixed type safety in validation rules
- Fixed ZeroDivisionError risk
- Fixed schema mismatch (Optional[bool] for needs_manual_review)
- All 37 unit tests passing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-30 19:12:52 +02:00
parent ce85e0643b
commit ab160b628d
14 changed files with 4161 additions and 33 deletions

View File

@@ -38,6 +38,13 @@ class ExtractionResult:
ocr_engine: str = "" # OCR engine used: paddleocr or tesseract
processing_time_ms: int = 0 # Processing time in milliseconds
# Validation tracking (added by bon-ocr-validation feature)
needs_manual_review: Optional[bool] = None # None=not validated, False=ok, True=needs review
validation_warnings: List[str] = field(default_factory=list)
validation_errors: List[str] = field(default_factory=list)
confidence_adjustments: dict[str, float] = field(default_factory=dict) # Field -> penalty
inter_ocr_ratios: dict[str, float] = field(default_factory=dict) # Field -> ratio
@property
def overall_confidence(self) -> float:
"""Calculate weighted overall confidence score."""
@@ -238,10 +245,18 @@ class ReceiptExtractor:
# Client/Buyer patterns (for B2B receipts)
# CLIENT, CUMPARATOR, BENEFICIAR sections
# Variations: "CIF CLIENT:", "CLIENT C.U.I/C.I.F.", "CLIENT C. U. I./ C. I.F."
CLIENT_SECTION_MARKERS = [
r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:', # CIF CLIENT: (reversed format)
r'C\.?\s*U\.?\s*I\.?\s+CLIENT\s*:', # CUI CLIENT: (reversed format)
# Reversed format: CIF/CUI before CLIENT
r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:', # CIF CLIENT:
r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:', # CUI CLIENT:
# CLIENT followed by C.U.I./C.I.F. (all variations with/without spaces and dots)
# Handles: CLIENT C.U.I/C.I.F., CLIENT C. U. I./ C. I.F., CLIENT CUI/CIF
r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/?\s*C?\.?\s*[I1]?\.?\s*F?\.?\s*:',
r'CLIENT\s+C\.?\s*[UI1]\.?\s*[IF1]\.?\s*:', # CLIENT CUI: or CLIENT CIF:
r'CLIENT\s*:',
# CUMPARATOR variants
r'CUMPARATOR\s+C\.?\s*[UI1]\.?\s*[IF1]\.?\s*:', # CUMPARATOR CUI: or CIF:
r'CUMPARATOR\s*:',
r'BENEFICIAR\s*:',
r'CUMP[AĂ]R[AĂ]TOR\s*:',
@@ -250,25 +265,30 @@ class ReceiptExtractor:
]
# Client CUI patterns (explicitly after CLIENT marker)
# OCR errors: R0 instead of RO, C1F instead of CIF, 1 instead of I
CLIENT_CUI_PATTERNS = [
# CIF CLIENT: R01879856 (reversed format - CIF before CLIENT)
(r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
(r'C\.?\s*U\.?\s*I\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
(r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
(r'C\.?\s*U\.?\s*I\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
# CLIENT C.U.I./ C.I.F. :R01879855 (slash variant with both labels)
(r'CLIENT\s+C\.\s*U\.\s*I\.?\s*/\s*C\.\s*[I1]\.\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.97),
(r'CLIENT\s+C\.?\s*U\.?\s*I\.?(?:\s*/\s*C\.?\s*[I1]\.?\s*F\.?)?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
# CLIENT C.U.I. or CLIENT CUI or CLIENT CIF
(r'CLIENT\s+C\.?\s*U\.?\s*I\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
(r'CLIENT\s+C\.?\s*I\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
(r'CUMPARATOR\s+C\.?\s*U\.?\s*I\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
(r'CUMPARATOR\s+C\.?\s*I\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
# CIF CLIENT: R01879856 (reversed format - CIF/CUI before CLIENT)
(r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
(r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
(r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
(r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
# CLIENT C.U.I/C.I.F. or CLIENT C. U. I./ C. I.F. (slash variant - all spacing)
# Most flexible pattern for slash variants
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.97),
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.97),
# CLIENT C.U.I. or CLIENT CUI or CLIENT CIF (without slash)
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
(r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
(r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
# CUMPARATOR variants
(r'CUMPARATOR\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
(r'CUMPARATOR\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
# CUI/CIF on line immediately after CLIENT marker
(r'CLIENT\s*:\s*\n\s*C\.?\s*U\.?\s*I\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
(r'CLIENT\s*:\s*\n\s*C\.?\s*I\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
(r'CLIENT\s*:\s*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
(r'CLIENT\s*:\s*\n\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
# CUI after client name: "CLIENT: COMPANY SRL\nCUI: 12345678"
(r'CLIENT\s*:.*\n.*C\.?\s*U\.?\s*I\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
(r'CLIENT\s*:.*\n.*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
]
# Vendor name indicators (lines containing these are likely vendor names)