feat: Add payment methods extraction, OCR improvements, and AutoComplete fix

Backend:
- Add payment_methods and payment_mode fields to Receipt model
- Add payment method extraction (CARD/NUMERAR) with auto-suggestion logic
- Improve OCR service with TVA validation and reverse calculation
- Fix nomenclature service supplier limit (was 50, now unlimited)
- Add OCR fields migrations (ocr_raw_text, ocr_confidence, payment_mode)

Frontend:
- Fix AutoComplete to properly display supplier name after OCR
- Add payment methods display in OCR preview with suggested payment mode
- Improve ReceiptCreateView form handling and OCR data application

Database migrations:
- 20251215_add_ocr_fields_to_receipt.py
- 20251215_remove_partner_id.py
- 20251216_add_payment_mode.py

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-16 13:43:15 +02:00
parent 46d9be0c08
commit c1220e86a6
15 changed files with 734 additions and 94 deletions
--- a/data-entry-app/backend/app/services/nomenclature_service.py
+++ b/data-entry-app/backend/app/services/nomenclature_service.py
@@ -46,7 +46,7 @@ class NomenclatureService:
                    (SyncedSupplier.name.ilike(f"%{search}%")) |
                    (SyncedSupplier.fiscal_code.ilike(f"%{search}%"))
                )
-            stmt = stmt.limit(50)  # Limit results
+            stmt = stmt.order_by(SyncedSupplier.name)  # Order alphabetically, no limit for AutoComplete

            result = await session.execute(stmt)
            suppliers = result.scalars().all()
@@ -59,34 +59,44 @@ class NomenclatureService:
                        (LocalSupplier.name.ilike(f"%{search}%")) |
                        (LocalSupplier.fiscal_code.ilike(f"%{search}%"))
                    )
-                local_stmt = local_stmt.limit(50)
+                local_stmt = local_stmt.order_by(LocalSupplier.name)  # Order alphabetically

                local_result = await session.execute(local_stmt)
                local_suppliers = local_result.scalars().all()

-                # Combine both
+                # Combine both - no IDs needed, just text data for autocomplete
                partners = []
                for s in suppliers:
-                    partners.append(PartnerOption(id=s.id, name=s.name, code=s.fiscal_code))
+                    partners.append(PartnerOption(
+                        name=s.name,
+                        fiscal_code=s.fiscal_code,
+                        address=s.address,
+                        source="oracle"
+                    ))
                for l in local_suppliers:
-                    partners.append(PartnerOption(id=l.id, name=f"{l.name} (local)", code=l.fiscal_code))
+                    partners.append(PartnerOption(
+                        name=l.name,  # No suffix - must match search results
+                        fiscal_code=l.fiscal_code,
+                        address=l.address,
+                        source="local"
+                    ))

                return partners

-        # Fallback to mock data for Phase 1
+        # Fallback to mock data for Phase 1 (when no synced data)
        mock_partners = [
-            PartnerOption(id=1, name="OMV Petrom", code="RO123456"),
-            PartnerOption(id=2, name="Dedeman", code="RO789012"),
-            PartnerOption(id=3, name="Kaufland", code="RO345678"),
-            PartnerOption(id=4, name="Emag", code="RO901234"),
-            PartnerOption(id=5, name="Altex", code="RO567890"),
+            PartnerOption(name="OMV Petrom", fiscal_code="RO123456", source="mock"),
+            PartnerOption(name="Dedeman", fiscal_code="RO789012", source="mock"),
+            PartnerOption(name="Kaufland", fiscal_code="RO345678", source="mock"),
+            PartnerOption(name="Emag", fiscal_code="RO901234", source="mock"),
+            PartnerOption(name="Altex", fiscal_code="RO567890", source="mock"),
        ]

        if search:
            search_lower = search.lower()
            mock_partners = [
                p for p in mock_partners
-                if search_lower in p.name.lower() or (p.code and search_lower in p.code.lower())
+                if search_lower in p.name.lower() or (p.fiscal_code and search_lower in p.fiscal_code.lower())
            ]

        return mock_partners
--- a/data-entry-app/backend/app/services/ocr_engine.py
+++ b/data-entry-app/backend/app/services/ocr_engine.py
@@ -2,6 +2,8 @@

 import os
 import logging
+import threading
+import time
 from dataclasses import dataclass
 from typing import List, Optional, Tuple

@@ -53,23 +55,26 @@ class OCREngine:

    def __init__(self):
        self._paddle = None
-        self._paddle_initialized = False
+        self._paddle_init_started = False
+        self._paddle_ready = threading.Event()  # Signals when PaddleOCR is FULLY ready
+        self._paddle_init_lock = threading.Lock()

    def _init_paddle_lazy(self):
        """Lazy initialize PaddleOCR on first use (avoids slow startup)."""
        global PaddleOCR

-        if self._paddle_initialized:
-            return
+        with self._paddle_init_lock:
+            if self._paddle_init_started:
+                return  # Already initializing or done
+            self._paddle_init_started = True

-        self._paddle_initialized = True
        if PADDLE_AVAILABLE:
            try:
-                print("Importing PaddleOCR (first use, may take ~15-20 seconds)...")
+                print("Importing PaddleOCR (first use, may take ~15-20 seconds)...", flush=True)
                from paddleocr import PaddleOCR as _PaddleOCR
                PaddleOCR = _PaddleOCR

-                print("Initializing PaddleOCR engine...")
+                print("Initializing PaddleOCR engine...", flush=True)
                # PaddleOCR 3.x API - optimized for Romanian receipts
                # Note: 'latin' not available in PaddleOCR 3.x, 'en' works well for receipts
                self._paddle = PaddleOCR(
@@ -81,11 +86,51 @@ class OCREngine:
                    rec_batch_num=6,        # Batch size for recognition
                    use_angle_cls=True,     # Enable text angle classification
                )
-                print("PaddleOCR initialized successfully with high-quality settings")
+                print("PaddleOCR initialized successfully with high-quality settings", flush=True)
            except Exception as e:
-                print(f"Warning: Failed to initialize PaddleOCR: {e}")
+                print(f"Warning: Failed to initialize PaddleOCR: {e}", flush=True)
                self._paddle = None

+        # Signal that initialization is complete (success or failure)
+        self._paddle_ready.set()
+
+    def wait_for_paddle(self, timeout: float = 30.0) -> bool:
+        """
+        Wait for PaddleOCR to be fully initialized.
+
+        Triggers the lazy initialization when it has not been started yet,
+        then blocks on the ready event until it is set or the timeout expires.
+
+        Args:
+            timeout: Max seconds to wait (default 30s)
+
+        Returns:
+            True if PaddleOCR is ready, False if timeout, failed or unavailable
+        """
+        if not PADDLE_AVAILABLE:
+            return False
+
+        if self._paddle is not None:
+            return True  # Already ready
+
+        if not self._paddle_init_started:
+            # Start initialization if not already started
+            self._init_paddle_lazy()
+
+        if self._paddle_ready.is_set():
+            # Initialization has already finished, so there is nothing to wait
+            # for: skip the misleading "Waiting..." / "not ready after 0.0s" logs.
+            return self._paddle is not None
+
+        # Initialization was started elsewhere and has not finished - wait for it
+        print(f"[OCR] Waiting for PaddleOCR to be ready (max {timeout}s)...", flush=True)
+        start = time.monotonic()  # monotonic: elapsed time is immune to wall-clock jumps
+        ready = self._paddle_ready.wait(timeout=timeout)
+        elapsed = time.monotonic() - start
+
+        if ready and self._paddle is not None:
+            print(f"[OCR] PaddleOCR ready after {elapsed:.1f}s", flush=True)
+            return True
+
+        print(f"[OCR] PaddleOCR not ready after {elapsed:.1f}s (timeout or failed)", flush=True)
+        return False
+
+    def is_paddle_ready(self) -> bool:
+        """
+        Check if PaddleOCR is ready without waiting.
+
+        Returns:
+            True if an engine instance is stored in self._paddle, False otherwise.
+            Only reads the attribute: it does not wait and does not start
+            initialization.
+        """
+        return self._paddle is not None
+
    def recognize(self, image: np.ndarray) -> OCRResult:
        """Perform OCR on preprocessed image."""
        logger.info(f"[OCR] Starting recognition, image shape: {image.shape}, dtype: {image.dtype}")
@@ -107,6 +152,13 @@ class OCREngine:

    def _paddle_recognize(self, image: np.ndarray) -> OCRResult:
        """Recognize text using PaddleOCR 3.x API."""
+        # Wait for PaddleOCR to be fully ready (handles background init)
+        if not self.wait_for_paddle(timeout=30.0):
+            logger.warning("[PaddleOCR] Not ready, falling back to Tesseract")
+            if TESSERACT_AVAILABLE:
+                return self._tesseract_recognize(image)
+            raise RuntimeError("PaddleOCR not ready and Tesseract not available")
+
        try:
            logger.info(f"[PaddleOCR] Processing image, shape: {image.shape}")

--- a/data-entry-app/backend/app/services/ocr_service.py
+++ b/data-entry-app/backend/app/services/ocr_service.py
@@ -170,14 +170,17 @@ class OCRService:
            print(f"[OCR] PaddleOCR heavy failed: {e}", flush=True)

        # ══════════════════════════════════════════════════════════════
-        # STEP 3: Tesseract fallback
+        # STEP 3: Tesseract - ONLY to complete missing fields
+        # Uses Tesseract-optimized preprocessing (binarized, high contrast)
        # ══════════════════════════════════════════════════════════════
        print("=" * 60, flush=True)
-        print("[OCR] STEP 3: Tesseract fallback", flush=True)
+        print("[OCR] STEP 3: Tesseract (complement only, not override)", flush=True)
        print("=" * 60, flush=True)

        try:
-            tesseract_result = self.ocr_engine._tesseract_recognize(light_img)
+            # Use Tesseract-specific preprocessing (Otsu binarization)
+            tesseract_img = self.preprocessor.preprocess_for_tesseract(image)
+            tesseract_result = self.ocr_engine._tesseract_recognize(tesseract_img)
            if tesseract_result and tesseract_result.text:
                extraction_tess = self.extractor.extract(tesseract_result.text)
                extraction_tess.ocr_engine = "tesseract"
@@ -189,10 +192,17 @@ class OCRService:
                print(f"  - Date: {extraction_tess.receipt_date}", flush=True)
                print(f"  - CUI: {extraction_tess.cui}", flush=True)

-                extraction = self._merge_extractions(extraction, extraction_tess)
+                # IMPORTANT: Tesseract only COMPLETES missing fields, never overrides!
+                extraction = self._complement_extraction(extraction, extraction_tess)
        except Exception as e:
            print(f"[OCR] Tesseract failed: {e}", flush=True)

+        # ══════════════════════════════════════════════════════════════
+        # FINAL VALIDATION: Fix impossible values
+        # ══════════════════════════════════════════════════════════════
+        if extraction:
+            extraction = self._final_validation(extraction)
+
        # Final result
        if extraction is None:
            return False, "No text detected", None
@@ -438,6 +448,122 @@ class OCRService:
        print(f"[OCR] ✓ All 5 fields found with {ext.overall_confidence:.0%} confidence", flush=True)
        return True

+    def _complement_extraction(
+        self,
+        primary: Optional[ExtractionResult],
+        secondary: Optional[ExtractionResult]
+    ) -> ExtractionResult:
+        """
+        Fill the gaps of the primary extraction with values from the secondary.
+
+        A field is copied only when it is empty (falsy) in primary and present
+        in secondary, so values already found by the primary engine are kept.
+        Contrast with _merge_extractions, which can override values.
+
+        Args:
+            primary: Preferred extraction; updated in place when not None.
+            secondary: Extraction used only as a source for missing fields.
+
+        Returns:
+            primary (after filling), or secondary when primary is None, or an
+            empty ExtractionResult when both are None.
+        """
+        if primary is None:
+            return ExtractionResult() if secondary is None else secondary
+        if secondary is None:
+            return primary
+
+        print("[Complement] Adding missing fields from Tesseract...", flush=True)
+
+        def is_gap(field: str) -> bool:
+            """True when primary lacks the field and secondary provides it."""
+            return not getattr(primary, field) and bool(getattr(secondary, field))
+
+        # Fields that carry their own confidence score: (field, confidence field, log label)
+        scored_fields = (
+            ("amount", "confidence_amount", "amount"),
+            ("receipt_date", "confidence_date", "date"),
+            ("partner_name", "confidence_vendor", "vendor"),
+        )
+        for field, confidence_field, label in scored_fields:
+            if is_gap(field):
+                value = getattr(secondary, field)
+                setattr(primary, field, value)
+                setattr(primary, confidence_field, getattr(secondary, confidence_field))
+                print(f"[Complement] Added {label}: {value}", flush=True)
+
+        # CUI is taken over only when the secondary value passes validation
+        if is_gap("cui") and self._is_valid_cui(secondary.cui):
+            primary.cui = secondary.cui
+            print(f"[Complement] Added CUI: {secondary.cui}", flush=True)
+
+        # TVA entries and their total travel together
+        if is_gap("tva_entries"):
+            primary.tva_entries = secondary.tva_entries
+            primary.tva_total = secondary.tva_total
+            print(f"[Complement] Added TVA: {secondary.tva_total}", flush=True)
+
+        # Plain fields without a confidence score: (field, log label)
+        for field, label in (("receipt_number", "number"), ("address", "address")):
+            if is_gap(field):
+                value = getattr(secondary, field)
+                setattr(primary, field, value)
+                print(f"[Complement] Added {label}: {value}", flush=True)
+
+        return primary
+
+    def _final_validation(self, extraction: ExtractionResult) -> ExtractionResult:
+        """
+        Final validation and correction of impossible values.
+
+        Key rules, applied in this order:
+        1. TVA entries sum must equal TVA total; on mismatch the sum wins.
+           This runs first so the checks below never act on a misread TVA total.
+        2. TVA cannot be greater than TOTAL (it's always a fraction).
+           If TVA > TOTAL, recalculate TOTAL from TVA using the TVA rate.
+        3. TVA above ~25% of TOTAL is only logged as suspicious.
+
+        Args:
+            extraction: Extraction result to check; it is modified in place.
+
+        Returns:
+            The same extraction object, with corrected amount / tva_total.
+        """
+        print("[Final Validation] Checking extracted values...", flush=True)
+
+        def to_decimal(value, default):
+            """Coerce an extracted number to a finite Decimal, else return default."""
+            if value is None:
+                return default
+            try:
+                number = Decimal(str(value))
+            except ArithmeticError:  # decimal.InvalidOperation on malformed input
+                return default
+            return number if number.is_finite() else default
+
+        # Rule 1: Validate TVA entries sum (before tva_total is used for anything else)
+        if extraction.tva_entries and extraction.tva_total:
+            entries_sum = sum(
+                (to_decimal(e.get('amount'), Decimal('0')) for e in extraction.tva_entries),
+                Decimal('0'),
+            )
+            tolerance = Decimal('0.05')
+            # A zero sum means no entry carried an amount: nothing to compare
+            # against, so the extracted tva_total must not be wiped out.
+            if entries_sum and abs(entries_sum - extraction.tva_total) > tolerance:
+                print(f"[Final Validation] TVA entries sum ({entries_sum}) != tva_total ({extraction.tva_total})", flush=True)
+                # Use the sum as it's more reliable
+                extraction.tva_total = entries_sum
+
+        # Rule 2: TVA cannot be greater than TOTAL
+        if extraction.tva_total and extraction.amount:
+            if extraction.tva_total > extraction.amount:
+                print(f"[Final Validation] TVA ({extraction.tva_total}) > TOTAL ({extraction.amount}) - IMPOSSIBLE!", flush=True)
+
+                # Calculate TOTAL from TVA using reverse formula:
+                # total = base + tva = tva * (100/rate + 1) = tva * (100 + rate) / rate
+                # For 9% TVA: total = tva * 109 / 9 = tva * 12.11
+                # For 19% TVA: total = tva * 119 / 19 = tva * 6.26
+                # For 21% TVA: total = tva * 121 / 21 = tva * 5.76
+
+                default_rate = Decimal('19')  # Default rate assumption
+                rate = default_rate
+                if extraction.tva_entries:
+                    # Use the rate from the first entry; a missing, None or
+                    # unparsable 'percent' falls back to the default rate
+                    rate = to_decimal(extraction.tva_entries[0].get('percent'), default_rate)
+
+                if rate > 0:
+                    # Formula: total = tva * (100 + rate) / rate
+                    calculated_total = extraction.tva_total * (Decimal('100') + rate) / rate
+                    calculated_total = calculated_total.quantize(Decimal('0.01'))
+
+                    print(f"[Final Validation] Calculated TOTAL from TVA: {calculated_total} (using {rate}% rate)", flush=True)
+
+                    extraction.amount = calculated_total
+                    extraction.confidence_amount = 0.70  # Lower confidence for calculated value
+
+        # Rule 3: TVA cannot be more than ~25% of total (max Romanian rate is 21%)
+        if extraction.tva_total and extraction.amount:
+            tva_percent = extraction.tva_total / extraction.amount * Decimal('100')
+            if tva_percent > Decimal('25'):
+                print(f"[Final Validation] Warning: TVA is {tva_percent:.1f}% of total - suspicious", flush=True)
+
+        print(f"[Final Validation] Done. Amount={extraction.amount}, TVA={extraction.tva_total}", flush=True)
+        return extraction
+

 # Singleton instance
 ocr_service = OCRService()
--- a/data-entry-app/backend/app/services/receipt_service.py
+++ b/data-entry-app/backend/app/services/receipt_service.py
@@ -20,6 +20,14 @@ from app.schemas.receipt import (
 from app.services.expense_types import EXPENSE_TYPES, get_expense_type


+# Payment mode to accounting account mapping.
+# Key: a receipt's payment_mode value; value: (account_code, account_name)
+# used for the cash/bank side of the generated accounting entries.
+# NOTE(review): codes look like Romanian chart-of-accounts numbers - confirm.
+PAYMENT_MODE_ACCOUNTS = {
+    'casa': ('5311', 'Casa in lei'),  # cash register, in lei
+    'banca': ('5121', 'Conturi la banci in lei'),  # bank accounts, in lei
+    'avans_decontare': ('542', 'Avansuri de trezorerie'),  # treasury advances
+}
+
+
 class ReceiptService:
    """Service for receipt business logic and workflow."""

@@ -151,21 +159,36 @@ class ReceiptService:
                    partner_id=receipt.partner_id,
                ))

-            # Credit: Cash/Bank
-            cash_account = receipt.cash_register_account or "5311"
-            cash_name = receipt.cash_register_name or "Casa in lei"
+            # Credit entry - based on payment_mode (new) or cash_register (legacy)
+            if receipt.payment_mode and receipt.payment_mode in PAYMENT_MODE_ACCOUNTS:
+                credit_account, credit_name = PAYMENT_MODE_ACCOUNTS[receipt.payment_mode]
+            elif receipt.cash_register_account:
+                # Backwards compatibility for existing receipts
+                credit_account = receipt.cash_register_account
+                credit_name = receipt.cash_register_name or "Casa/Banca"
+            else:
+                # Default fallback
+                credit_account = "5311"
+                credit_name = "Casa in lei"
+
            entries.append(AccountingEntryCreate(
                entry_type=EntryType.CREDIT,
-                account_code=cash_account,
-                account_name=cash_name,
+                account_code=credit_account,
+                account_name=credit_name,
                amount=amount,
            ))

        else:
            # Income: Debit cash/bank, Credit income account
-            # For now, simple income posting
-            cash_account = receipt.cash_register_account or "5311"
-            cash_name = receipt.cash_register_name or "Casa in lei"
+            # Based on payment_mode (new) or cash_register (legacy)
+            if receipt.payment_mode and receipt.payment_mode in PAYMENT_MODE_ACCOUNTS:
+                cash_account, cash_name = PAYMENT_MODE_ACCOUNTS[receipt.payment_mode]
+            elif receipt.cash_register_account:
+                cash_account = receipt.cash_register_account
+                cash_name = receipt.cash_register_name or "Casa/Banca"
+            else:
+                cash_account = "5311"
+                cash_name = "Casa in lei"

            # Debit: Cash/Bank
            entries.append(AccountingEntryCreate(
@@ -211,8 +234,9 @@ class ReceiptService:
        if not receipt.expense_type_code:
            return False, "Expense type is required", None

-        if not receipt.cash_register_account:
-            return False, "Cash register is required", None
+        # Validate payment_mode or cash_register (backwards compatibility)
+        if not receipt.payment_mode and not receipt.cash_register_account:
+            return False, "Modul de plata este obligatoriu", None

        # Generate accounting entries
        entries = ReceiptService.generate_accounting_entries(receipt)
@@ -239,6 +263,7 @@ class ReceiptService:
    ) -> Tuple[bool, str, Optional[Receipt]]:
        """
        Approve receipt (PENDING_REVIEW → APPROVED).
+        Requires valid CUI (fiscal code) for approval.
        """
        receipt = await ReceiptCRUD.get_by_id(session, receipt_id)

@@ -248,6 +273,10 @@ class ReceiptService:
        if receipt.status != ReceiptStatus.PENDING_REVIEW:
            return False, "Receipt is not pending review", None

+        # Validate CUI is present (required for Oracle import)
+        if not receipt.cui:
+            return False, "Trebuie completat codul fiscal (CUI) pentru aprobare", None
+
        # Validate accounting entries
        if not receipt.entries:
            return False, "Receipt has no accounting entries", None
--- a/data-entry-app/backend/app/services/sync_service.py
+++ b/data-entry-app/backend/app/services/sync_service.py
@@ -267,9 +267,8 @@ class SyncService:
        supplier = result.scalar_one_or_none()

        if supplier:
+            # Return only text data - no IDs needed for autocomplete
            return True, {
-                "id": supplier.id,
-                "oracle_id": supplier.oracle_id,
                "name": supplier.name,
                "fiscal_code": supplier.fiscal_code,
                "address": supplier.address,
@@ -291,12 +290,11 @@ class SyncService:
        local = result.scalar_one_or_none()

        if local:
+            # Return only text data - no IDs needed for autocomplete
            return True, {
-                "id": local.id,
                "name": local.name,
                "fiscal_code": local.fiscal_code,
                "address": local.address,
-                "is_local": True,
            }, "local"

        # 3. Try live Oracle search (optional fallback for unsynced data)