feat(ocr): Add validation system and CLIENT CUI extraction

OCR Data Extraction Validation System:
- Add 7 validation rules (amount range, TVA ratio, payment sum, etc.)
- Add Medium preprocessing to replace Heavy (fixes digit concatenation)
- Add validation warnings to API responses
- Flag receipts needing manual review (needs_manual_review field)
- Add database migration for needs_manual_review column

CLIENT CUI Extraction Improvements:
- Support all format variations: CIF CLIENT:, CLIENT C.U.I/C.I.F., etc.
- Handle OCR errors (R0 vs RO, C1F vs CIF)
- Add client_name, client_cui, client_address to API response
- Add validation fields to API response (was missing)

QA Review: 12 issues found, 9 fixed (5 errors + 4 warnings)
- Fixed type safety in validation rules
- Fixed ZeroDivisionError risk
- Fixed schema mismatch (Optional[bool] for needs_manual_review)
- All 37 unit tests passing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-30 19:12:52 +02:00
parent ce85e0643b
commit ab160b628d
14 changed files with 4161 additions and 33 deletions
--- a/backend/modules/data_entry/services/ocr_service.py
+++ b/backend/modules/data_entry/services/ocr_service.py
@@ -17,6 +17,7 @@ from typing import Optional, Tuple
 from backend.modules.data_entry.services.ocr_engine import OCREngine
 from backend.modules.data_entry.services.ocr_extractor import ReceiptExtractor, ExtractionResult
 from backend.modules.data_entry.services.image_preprocessor import ImagePreprocessor
+from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine

 # Setup logging
 logger = logging.getLogger(__name__)
@@ -126,28 +127,28 @@ class OCRService:
            extraction = ExtractionResult()

        # ══════════════════════════════════════════════════════════════
-        # STEP 2: PaddleOCR + Heavy (for faded thermal receipts)
+        # STEP 2: PaddleOCR + Medium (balanced preprocessing)
        # ══════════════════════════════════════════════════════════════
        print("=" * 60, flush=True)
-        print("[OCR] STEP 2: PaddleOCR + Heavy preprocessing", flush=True)
+        print("[OCR] STEP 2: PaddleOCR + Medium preprocessing", flush=True)
        print("=" * 60, flush=True)
-        heavy_img = self.preprocessor.preprocess_heavy(image)
+        medium_img = self.preprocessor.preprocess_medium(image)

        try:
-            paddle_heavy = self.ocr_engine._paddle_recognize(heavy_img)
-            if paddle_heavy and paddle_heavy.text:
-                extraction_heavy = self.extractor.extract(paddle_heavy.text)
-                extraction_heavy.ocr_engine = "paddle-heavy"
-                raw_texts.append(f"═══ PaddleOCR (heavy, conf: {paddle_heavy.confidence:.0%}) ═══\n{paddle_heavy.text}")
+            paddle_medium = self.ocr_engine._paddle_recognize(medium_img)
+            if paddle_medium and paddle_medium.text:
+                extraction_medium = self.extractor.extract(paddle_medium.text)
+                extraction_medium.ocr_engine = "paddle-medium"
+                raw_texts.append(f"═══ PaddleOCR (medium, conf: {paddle_medium.confidence:.0%}) ═══\n{paddle_medium.text}")

-                print(f"[OCR] Step 2 (Heavy) Results:", flush=True)
-                print(f"  - OCR Confidence: {paddle_heavy.confidence:.0%}", flush=True)
-                print(f"  - Amount: {extraction_heavy.amount}", flush=True)
-                print(f"  - Date: {extraction_heavy.receipt_date}", flush=True)
-                print(f"  - CUI: {extraction_heavy.cui}", flush=True)
+                print(f"[OCR] Step 2 (Medium) Results:", flush=True)
+                print(f"  - OCR Confidence: {paddle_medium.confidence:.0%}", flush=True)
+                print(f"  - Amount: {extraction_medium.amount}", flush=True)
+                print(f"  - Date: {extraction_medium.receipt_date}", flush=True)
+                print(f"  - CUI: {extraction_medium.cui}", flush=True)

                # Merge with previous
-                extraction = self._merge_extractions(extraction, extraction_heavy)
+                extraction = self._merge_extractions(extraction, extraction_medium)

                print(f"[OCR] After merge:", flush=True)
                print(f"  - Amount: {extraction.amount}", flush=True)
@@ -167,7 +168,7 @@ class OCRService:
                else:
                    print("[OCR] → Step 2 incomplete, continuing to Step 3 (Tesseract)...", flush=True)
        except Exception as e:
-            print(f"[OCR] PaddleOCR heavy failed: {e}", flush=True)
+            print(f"[OCR] PaddleOCR medium failed: {e}", flush=True)

        # ══════════════════════════════════════════════════════════════
        # STEP 3: Tesseract - ONLY to complete missing fields
@@ -235,6 +236,70 @@ class OCRService:
        print(f"  - Processing Time: {elapsed_ms}ms", flush=True)
        print(f"  - Message: {message}", flush=True)

+        # ══════════════════════════════════════════════════════════════
+        # VALIDATION: Apply validation rules to final extraction
+        # ══════════════════════════════════════════════════════════════
+        print("\n" + "=" * 60, flush=True)
+        print("[Validation] Applying validation rules...", flush=True)
+        print("=" * 60, flush=True)
+
+        validator = OCRValidationEngine()
+
+        # Prepare data for validation with safe type conversions
+        def safe_float(value) -> Optional[float]:
+            """Convert a Decimal/number to a finite float; None if not possible."""
+            try:
+                number = float(value)
+            except (TypeError, ValueError, OverflowError):
+                return None  # covers None input: float(None) raises TypeError
+            # NaN compares False against any bound, so report non-finite as missing.
+            return number if abs(number) < float('inf') else None
+
+        def safe_payment_sum(methods: list, method_type: str) -> Optional[float]:
+            """Total the amounts of the entries whose 'method' equals method_type.
+
+            Returns None for no entries, a non-positive total, or a bad amount.
+            """
+            if not methods:
+                return None
+            try:
+                matching = (pm for pm in methods if pm.get('method') == method_type)
+                total = sum(float(pm.get('amount', 0) or 0) for pm in matching)
+            except (TypeError, ValueError):
+                return None
+            return total if total > 0 else None
+
+        validation_data = {
+            'amount': safe_float(extraction.amount),
+            'tva': safe_float(extraction.tva_total),
+            'cui': extraction.cui,
+            'card_amount': safe_payment_sum(extraction.payment_methods, 'CARD'),
+            'cash_amount': safe_payment_sum(extraction.payment_methods, 'NUMERAR'),
+            'tva_entries': {
+                entry.get('code', ''): safe_float(entry.get('amount'))
+                for entry in (extraction.tva_entries or [])
+                if entry.get('code') and safe_float(entry.get('amount')) is not None
+            }
+        }
+
+        # Run validation (no light/medium comparison for final result)
+        validated_result = validator.validate_extraction(validation_data)
+
+        # Apply validation results to extraction
+        extraction.needs_manual_review = validated_result.needs_manual_review
+        extraction.validation_warnings = validated_result.validation_warnings
+        extraction.validation_errors = validated_result.validation_errors
+        extraction.confidence_adjustments = validated_result.confidence_adjustments
+        extraction.inter_ocr_ratios = validated_result.inter_ocr_ratios
+
+        print(f"[Validation] Complete:", flush=True)
+        print(f"  - Warnings: {len(extraction.validation_warnings)}", flush=True)
+        print(f"  - Errors: {len(extraction.validation_errors)}", flush=True)
+        print(f"  - Needs Manual Review: {extraction.needs_manual_review}", flush=True)
+        if extraction.validation_warnings:
+            for warning in extraction.validation_warnings:
+                print(f"    ⚠️  {warning}", flush=True)
+
        return True, message, extraction

    def _merge_extractions(