feat(ocr): Add validation system and CLIENT CUI extraction

OCR Data Extraction Validation System:
- Add 7 validation rules (amount range, TVA ratio, payment sum, etc.)
- Add Medium preprocessing to replace Heavy (fixes digit concatenation)
- Add validation warnings to API responses
- Flag receipts needing manual review (needs_manual_review field)
- Add database migration for needs_manual_review column

CLIENT CUI Extraction Improvements:
- Support all format variations: CIF CLIENT:, CLIENT C.U.I/C.I.F., etc.
- Handle OCR errors (R0 vs RO, C1F vs CIF)
- Add client_name, client_cui, client_address to API response
- Add validation fields to API response (was missing)

QA Review: 12 issues found, 9 fixed (5 errors + 4 warnings)
- Fixed type safety in validation rules
- Fixed ZeroDivisionError risk
- Fixed schema mismatch (Optional[bool] for needs_manual_review)
- All 37 unit tests passing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-30 19:12:52 +02:00
parent ce85e0643b
commit ab160b628d
14 changed files with 4161 additions and 33 deletions

View File

@@ -17,6 +17,7 @@ from typing import Optional, Tuple
from backend.modules.data_entry.services.ocr_engine import OCREngine
from backend.modules.data_entry.services.ocr_extractor import ReceiptExtractor, ExtractionResult
from backend.modules.data_entry.services.image_preprocessor import ImagePreprocessor
from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine
# Setup logging
logger = logging.getLogger(__name__)
@@ -126,28 +127,28 @@ class OCRService:
extraction = ExtractionResult()
# ══════════════════════════════════════════════════════════════
# STEP 2: PaddleOCR + Heavy (for faded thermal receipts)
# STEP 2: PaddleOCR + Medium (balanced preprocessing)
# ══════════════════════════════════════════════════════════════
print("=" * 60, flush=True)
print("[OCR] STEP 2: PaddleOCR + Heavy preprocessing", flush=True)
print("[OCR] STEP 2: PaddleOCR + Medium preprocessing", flush=True)
print("=" * 60, flush=True)
heavy_img = self.preprocessor.preprocess_heavy(image)
medium_img = self.preprocessor.preprocess_medium(image)
try:
paddle_heavy = self.ocr_engine._paddle_recognize(heavy_img)
if paddle_heavy and paddle_heavy.text:
extraction_heavy = self.extractor.extract(paddle_heavy.text)
extraction_heavy.ocr_engine = "paddle-heavy"
raw_texts.append(f"═══ PaddleOCR (heavy, conf: {paddle_heavy.confidence:.0%}) ═══\n{paddle_heavy.text}")
paddle_medium = self.ocr_engine._paddle_recognize(medium_img)
if paddle_medium and paddle_medium.text:
extraction_medium = self.extractor.extract(paddle_medium.text)
extraction_medium.ocr_engine = "paddle-medium"
raw_texts.append(f"═══ PaddleOCR (medium, conf: {paddle_medium.confidence:.0%}) ═══\n{paddle_medium.text}")
print(f"[OCR] Step 2 (Heavy) Results:", flush=True)
print(f" - OCR Confidence: {paddle_heavy.confidence:.0%}", flush=True)
print(f" - Amount: {extraction_heavy.amount}", flush=True)
print(f" - Date: {extraction_heavy.receipt_date}", flush=True)
print(f" - CUI: {extraction_heavy.cui}", flush=True)
print(f"[OCR] Step 2 (Medium) Results:", flush=True)
print(f" - OCR Confidence: {paddle_medium.confidence:.0%}", flush=True)
print(f" - Amount: {extraction_medium.amount}", flush=True)
print(f" - Date: {extraction_medium.receipt_date}", flush=True)
print(f" - CUI: {extraction_medium.cui}", flush=True)
# Merge with previous
extraction = self._merge_extractions(extraction, extraction_heavy)
extraction = self._merge_extractions(extraction, extraction_medium)
print(f"[OCR] After merge:", flush=True)
print(f" - Amount: {extraction.amount}", flush=True)
@@ -167,7 +168,7 @@ class OCRService:
else:
print("[OCR] → Step 2 incomplete, continuing to Step 3 (Tesseract)...", flush=True)
except Exception as e:
print(f"[OCR] PaddleOCR heavy failed: {e}", flush=True)
print(f"[OCR] PaddleOCR medium failed: {e}", flush=True)
# ══════════════════════════════════════════════════════════════
# STEP 3: Tesseract - ONLY to complete missing fields
@@ -235,6 +236,70 @@ class OCRService:
print(f" - Processing Time: {elapsed_ms}ms", flush=True)
print(f" - Message: {message}", flush=True)
# ══════════════════════════════════════════════════════════════
# VALIDATION: Apply validation rules to final extraction
# ══════════════════════════════════════════════════════════════
print("\n" + "=" * 60, flush=True)
print("[Validation] Applying validation rules...", flush=True)
print("=" * 60, flush=True)
validator = OCRValidationEngine()
# Prepare data for validation with safe type conversions
def safe_float(value) -> Optional[float]:
"""Safely convert Decimal or number to float."""
if value is None:
return None
try:
return float(value)
except (TypeError, ValueError):
return None
def safe_payment_sum(methods: list, method_type: str) -> Optional[float]:
"""Safely sum payment amounts for a given method type."""
if not methods:
return None
try:
total = sum(
float(pm.get('amount', 0) or 0)
for pm in methods
if pm.get('method') == method_type
)
return total if total > 0 else None
except (TypeError, ValueError):
return None
validation_data = {
'amount': safe_float(extraction.amount),
'tva': safe_float(extraction.tva_total),
'cui': extraction.cui,
'card_amount': safe_payment_sum(extraction.payment_methods, 'CARD'),
'cash_amount': safe_payment_sum(extraction.payment_methods, 'NUMERAR'),
'tva_entries': {
entry.get('code', ''): safe_float(entry.get('amount'))
for entry in (extraction.tva_entries or [])
if entry.get('code') and safe_float(entry.get('amount')) is not None
}
}
# Run validation (no light/medium comparison for final result)
validated_result = validator.validate_extraction(validation_data)
# Apply validation results to extraction
extraction.needs_manual_review = validated_result.needs_manual_review
extraction.validation_warnings = validated_result.validation_warnings
extraction.validation_errors = validated_result.validation_errors
extraction.confidence_adjustments = validated_result.confidence_adjustments
extraction.inter_ocr_ratios = validated_result.inter_ocr_ratios
print(f"[Validation] Complete:", flush=True)
print(f" - Warnings: {len(extraction.validation_warnings)}", flush=True)
print(f" - Errors: {len(extraction.validation_errors)}", flush=True)
print(f" - Needs Manual Review: {extraction.needs_manual_review}", flush=True)
if extraction.validation_warnings:
for warning in extraction.validation_warnings:
print(f" ⚠️ {warning}", flush=True)
return True, message, extraction
def _merge_extractions(