feat: Add payment methods extraction, OCR improvements, and AutoComplete fix

Backend:
- Add payment_methods and payment_mode fields to Receipt model
- Add payment method extraction (CARD/NUMERAR) with auto-suggestion logic
- Improve OCR service with TVA validation and reverse calculation
- Fix nomenclature service supplier limit (was 50, now unlimited)
- Add OCR fields migrations (ocr_raw_text, ocr_confidence, payment_mode)

Frontend:
- Fix AutoComplete to properly display supplier name after OCR
- Add payment methods display in OCR preview with suggested payment mode
- Improve ReceiptCreateView form handling and OCR data application

Database migrations:
- 20251215_add_ocr_fields_to_receipt.py
- 20251215_remove_partner_id.py
- 20251216_add_payment_mode.py

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-16 13:43:15 +02:00
parent 46d9be0c08
commit c1220e86a6
15 changed files with 734 additions and 94 deletions
--- a/data-entry-app/backend/app/services/ocr_service.py
+++ b/data-entry-app/backend/app/services/ocr_service.py
@@ -170,14 +170,17 @@ class OCRService:
            print(f"[OCR] PaddleOCR heavy failed: {e}", flush=True)

        # ══════════════════════════════════════════════════════════════
-        # STEP 3: Tesseract fallback
+        # STEP 3: Tesseract - ONLY to complete missing fields
+        # Uses Tesseract-optimized preprocessing (binarized, high contrast)
        # ══════════════════════════════════════════════════════════════
        print("=" * 60, flush=True)
-        print("[OCR] STEP 3: Tesseract fallback", flush=True)
+        print("[OCR] STEP 3: Tesseract (complement only, not override)", flush=True)
        print("=" * 60, flush=True)

        try:
-            tesseract_result = self.ocr_engine._tesseract_recognize(light_img)
+            # Use Tesseract-specific preprocessing (Otsu binarization)
+            tesseract_img = self.preprocessor.preprocess_for_tesseract(image)
+            tesseract_result = self.ocr_engine._tesseract_recognize(tesseract_img)
            if tesseract_result and tesseract_result.text:
                extraction_tess = self.extractor.extract(tesseract_result.text)
                extraction_tess.ocr_engine = "tesseract"
@@ -189,10 +192,17 @@ class OCRService:
                print(f"  - Date: {extraction_tess.receipt_date}", flush=True)
                print(f"  - CUI: {extraction_tess.cui}", flush=True)

-                extraction = self._merge_extractions(extraction, extraction_tess)
+                # IMPORTANT: Tesseract only COMPLETES missing fields, never overrides!
+                extraction = self._complement_extraction(extraction, extraction_tess)
        except Exception as e:
            print(f"[OCR] Tesseract failed: {e}", flush=True)

+        # ══════════════════════════════════════════════════════════════
+        # FINAL VALIDATION: Fix impossible values
+        # ══════════════════════════════════════════════════════════════
+        if extraction:
+            extraction = self._final_validation(extraction)
+
        # Final result
        if extraction is None:
            return False, "No text detected", None
@@ -438,6 +448,122 @@ class OCRService:
        print(f"[OCR] ✓ All 5 fields found with {ext.overall_confidence:.0%} confidence", flush=True)
        return True

+    def _complement_extraction(
+        self,
+        primary: Optional[ExtractionResult],
+        secondary: Optional[ExtractionResult]
+    ) -> ExtractionResult:
+        """
+        Complement primary extraction with missing (falsy) fields from secondary.
+        NEVER overrides existing values - only fills in gaps, unlike
+        _merge_extractions. Updates and returns ``primary``; when an argument
+        is None the other one is returned (empty result if both are None).
+        """
+        if primary is None and secondary is None:
+            return ExtractionResult()
+        if primary is None:
+            return secondary
+        if secondary is None:
+            return primary
+
+        print("[Complement] Adding missing fields from Tesseract...", flush=True)
+
+        # Only fill missing amount
+        if not primary.amount and secondary.amount:
+            primary.amount = secondary.amount
+            primary.confidence_amount = secondary.confidence_amount
+            print(f"[Complement] Added amount: {secondary.amount}", flush=True)
+
+        # Only fill missing date
+        if not primary.receipt_date and secondary.receipt_date:
+            primary.receipt_date = secondary.receipt_date
+            primary.confidence_date = secondary.confidence_date
+            print(f"[Complement] Added date: {secondary.receipt_date}", flush=True)
+
+        # Only fill missing vendor
+        if not primary.partner_name and secondary.partner_name:
+            primary.partner_name = secondary.partner_name
+            primary.confidence_vendor = secondary.confidence_vendor
+            print(f"[Complement] Added vendor: {secondary.partner_name}", flush=True)
+
+        # Only fill missing CUI (the candidate must also pass _is_valid_cui)
+        if not primary.cui and secondary.cui and self._is_valid_cui(secondary.cui):
+            primary.cui = secondary.cui
+            print(f"[Complement] Added CUI: {secondary.cui}", flush=True)
+
+        # Only fill missing TVA (pair copied only if primary has no TVA data at all)
+        if not (primary.tva_entries or primary.tva_total) and secondary.tva_entries:
+            primary.tva_entries = list(secondary.tva_entries)  # own list, no aliasing
+            primary.tva_total = secondary.tva_total
+            print(f"[Complement] Added TVA: {secondary.tva_total}", flush=True)
+
+        # Only fill missing receipt number
+        if not primary.receipt_number and secondary.receipt_number:
+            primary.receipt_number = secondary.receipt_number
+            print(f"[Complement] Added number: {secondary.receipt_number}", flush=True)
+
+        # Only fill missing address
+        if not primary.address and secondary.address:
+            primary.address = secondary.address
+            print(f"[Complement] Added address: {secondary.address}", flush=True)
+
+        return primary
+
+    def _final_validation(self, extraction: ExtractionResult) -> ExtractionResult:
+        """
+        Final validation and correction of impossible TVA/TOTAL values.
+
+        Mutates and returns ``extraction``. Steps, in order:
+        1. If the TVA entries sum disagrees with tva_total, the sum wins.
+        2. If TVA > TOTAL, TOTAL is recalculated from TVA and the TVA rate.
+        3. If TVA is above 25% of TOTAL, a warning is printed (no change).
+        """
+        print("[Final Validation] Checking extracted values...", flush=True)
+
+        # Step 1: reconcile tva_total with its entries BEFORE it is used to
+        # derive TOTAL, so step 2 never works from a figure about to change.
+        if extraction.tva_entries and extraction.tva_total:
+            # A missing or None amount counts as zero instead of raising.
+            amounts = [e.get('amount') or Decimal('0') for e in extraction.tva_entries]
+            entries_sum = sum(amounts, Decimal('0'))
+            tolerance = Decimal('0.05')
+            # A zero sum means no entry carried an amount: keep tva_total.
+            if entries_sum and abs(entries_sum - extraction.tva_total) > tolerance:
+                print(f"[Final Validation] TVA entries sum ({entries_sum}) != tva_total ({extraction.tva_total})", flush=True)
+                extraction.tva_total = entries_sum
+
+        # Step 2: TVA is a fraction of TOTAL, so TVA > TOTAL is impossible.
+        if extraction.tva_total and extraction.amount:
+            if extraction.tva_total > extraction.amount:
+                print(f"[Final Validation] TVA ({extraction.tva_total}) > TOTAL ({extraction.amount}) - IMPOSSIBLE!", flush=True)
+
+                # Only the first entry's rate is consulted; 19 is the default
+                # when there is no entry, no 'percent', or it cannot be parsed.
+                raw_rate = 19
+                if extraction.tva_entries:
+                    raw_rate = extraction.tva_entries[0].get('percent', 19)
+                try:
+                    rate = Decimal(str(raw_rate))
+                except ArithmeticError:  # decimal.InvalidOperation is an ArithmeticError
+                    print(f"[Final Validation] Unusable TVA rate {raw_rate!r}, assuming 19%", flush=True)
+                    rate = Decimal('19')
+                if rate.is_finite() and rate > 0:
+                    # total = base + tva = tva * (100 + rate) / rate
+                    calculated_total = extraction.tva_total * (Decimal('100') + rate) / rate
+                    calculated_total = calculated_total.quantize(Decimal('0.01'))
+                    print(f"[Final Validation] Calculated TOTAL from TVA: {calculated_total} (using {rate}% rate)", flush=True)
+                    extraction.amount = calculated_total
+                    extraction.confidence_amount = 0.70  # Lower confidence for calculated value
+
+        # Step 3: sanity warning only; values are left untouched.
+        if extraction.tva_total and extraction.amount:
+            tva_percent = extraction.tva_total / extraction.amount * Decimal('100')
+            if tva_percent > Decimal('25'):
+                print(f"[Final Validation] Warning: TVA is {tva_percent:.1f}% of total - suspicious", flush=True)
+
+        print(f"[Final Validation] Done. Amount={extraction.amount}, TVA={extraction.tva_total}", flush=True)
+        return extraction
+

 # Singleton instance
 ocr_service = OCRService()