feat: Migrate to ultrathin monolith architecture

Consolidate 3 separate applications (reports-app, data-entry-app, telegram-bot) into a unified architecture with single backend and frontend: Backend Changes: - Unified FastAPI backend at backend/ with modular structure - Modules: reports, data_entry, telegram in backend/modules/ - Centralized config.py and main.py with all routers registered - Single worker mode (--workers 1) for Telegram bot compatibility - Shared Oracle connection pool and JWT authentication - Unified requirements.txt and environment configuration Frontend Changes: - Single Vue.js SPA with module-based routing - Unified frontend at src/ with modules in src/modules/{reports,data-entry}/ - Shared components and stores in src/shared/ - Error boundaries for module isolation - Dual API proxy in Vite for module communication Infrastructure: - New unified startup scripts: start-prod.sh, start-test.sh, start-backend.sh - Environment templates: .env.dev.example, .env.test.example, .env.prod.example - Updated deployment scripts for Windows IIS - Simplified SSH tunnel management Documentation: - Comprehensive CLAUDE.md with architecture overview - Module-specific docs in docs/{data-entry,telegram}/ - Architecture decision records in docs/ARCHITECTURE-DECISIONS.md - Deployment guides consolidated in deployment/windows/docs/ This migration reduces complexity, improves maintainability, and enables easier deployment while maintaining all existing functionality. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-29 23:48:14 +02:00
parent 2a101f1ef5
commit c5e051ad80
378 changed files with 7566 additions and 73730 deletions
--- a/backend/modules/data_entry/services/ocr_service.py
+++ b/backend/modules/data_entry/services/ocr_service.py
@@ -0,0 +1,569 @@
+"""Main OCR service coordinating preprocessing, recognition, and extraction."""
+
+import os
+import re
+import logging
+
+# Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import
+os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
+
+import time
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+from decimal import Decimal
+from pathlib import Path
+from typing import Optional, Tuple
+
+from backend.modules.data_entry.services.ocr_engine import OCREngine
+from backend.modules.data_entry.services.ocr_extractor import ReceiptExtractor, ExtractionResult
+from backend.modules.data_entry.services.image_preprocessor import ImagePreprocessor
+
+# Setup logging
+logger = logging.getLogger(__name__)
+
+
+class OCRService:
+    """Service for OCR processing of receipt images.
+
+    Bundles an image preprocessor, an OCR engine and a receipt field
+    extractor, and runs the blocking recognition pipeline on a thread pool
+    so that ``process_image`` can be awaited.
+    """
+
+    # Class-level pool: shared by every OCRService instance; at most two
+    # pipeline runs execute at the same time.
+    _executor = ThreadPoolExecutor(max_workers=2)
+
+    def __init__(self):
+        """Create the three collaborators used by the pipeline."""
+        # Loads images / PDF pages and produces the preprocessing variants.
+        self.preprocessor = ImagePreprocessor()
+        # Provides the PaddleOCR and Tesseract recognition calls.
+        self.ocr_engine = OCREngine()
+        # Turns recognised text into an ExtractionResult.
+        self.extractor = ReceiptExtractor()
+
+    async def process_image(
+        self,
+        image_path: Path,
+        mime_type: str
+    ) -> Tuple[bool, str, Optional[ExtractionResult]]:
+        """
+        Process receipt image and extract structured data.
+
+        The blocking pipeline (``_process_sync``) is run on the class-level
+        thread pool so the event loop is not blocked while OCR is running.
+
+        Args:
+            image_path: Path to the image file
+            mime_type: MIME type of the file
+
+        Returns:
+            Tuple of (success, message, extraction_result). Unexpected errors
+            are not raised to the caller: they are logged with their
+            traceback and reported as
+            ``(False, "OCR processing failed: ...", None)``.
+        """
+        try:
+            # Inside a coroutine the running loop is the one to use;
+            # get_running_loop() is the supported accessor for it.
+            loop = asyncio.get_running_loop()
+            return await loop.run_in_executor(
+                self._executor,
+                self._process_sync,
+                image_path,
+                mime_type
+            )
+        except Exception as e:
+            # Keep the best-effort contract, but record the traceback instead
+            # of silently discarding it.
+            logger.exception("OCR processing failed for %s", image_path)
+            return False, f"OCR processing failed: {str(e)}", None
+
+    def _process_sync(
+        self,
+        image_path: Path,
+        mime_type: str
+    ) -> Tuple[bool, str, Optional[ExtractionResult]]:
+        """Synchronous processing with ADAPTIVE OCR pipeline.
+
+        Stages, each one only reached when the previous did not already
+        yield a complete extraction (see ``_is_extraction_complete``):
+
+        1. PaddleOCR on the lightly preprocessed image.
+        2. PaddleOCR on the heavily preprocessed image, merged with step 1
+           via ``_merge_extractions``.
+        3. Tesseract on a Tesseract-specific preprocessing, used only to fill
+           fields that are still missing (``_complement_extraction``).
+
+        After step 3 the result goes through ``_final_validation``.
+
+        Args:
+            image_path: Path to the image or PDF file.
+            mime_type: MIME type; ``'application/pdf'`` selects the PDF
+                loader, in which case only the first extracted page is used.
+
+        Returns:
+            Tuple of (success, message, extraction_result). Loading errors
+            (``RuntimeError`` for PDFs, ``ValueError`` for images) and the
+            "no text at all" case return ``(False, <message>, None)``.
+            Exceptions raised inside an OCR step are printed and that step is
+            skipped.
+        """
+
+        start_time = time.time()
+        print(f"[OCR Service] Starting processing: {image_path}, mime: {mime_type}", flush=True)
+
+        # Load image
+        if mime_type == 'application/pdf':
+            try:
+                images = self.preprocessor.pdf_to_images(image_path)
+                if not images:
+                    return False, "Failed to extract images from PDF", None
+                # Only the first page is processed.
+                image = images[0]
+            except RuntimeError as e:
+                return False, str(e), None
+        else:
+            try:
+                image = self.preprocessor.load_image(image_path)
+            except ValueError as e:
+                return False, str(e), None
+
+        # Raw text of every engine run, joined into extraction.raw_text at the end.
+        raw_texts = []
+        # Stays None until some step produces text (or step 1 raises, see below).
+        extraction = None
+
+        # ══════════════════════════════════════════════════════════════
+        # STEP 1: PaddleOCR + Light (fastest, best for clear PDFs)
+        # ══════════════════════════════════════════════════════════════
+        print("=" * 60, flush=True)
+        print("[OCR] STEP 1: PaddleOCR + Light preprocessing", flush=True)
+        print("=" * 60, flush=True)
+        light_img = self.preprocessor.preprocess_light(image)
+
+        try:
+            paddle_light = self.ocr_engine._paddle_recognize(light_img)
+            if paddle_light and paddle_light.text:
+                extraction = self.extractor.extract(paddle_light.text)
+                extraction.ocr_engine = "paddle-light"
+                raw_texts.append(f"═══ PaddleOCR (light, conf: {paddle_light.confidence:.0%}) ═══\n{paddle_light.text}")
+
+                # Log extraction results
+                print(f"[OCR] Step 1 Results:", flush=True)
+                print(f"  - OCR Confidence: {paddle_light.confidence:.0%}", flush=True)
+                print(f"  - Amount: {extraction.amount}", flush=True)
+                print(f"  - Date: {extraction.receipt_date}", flush=True)
+                print(f"  - Number: {extraction.receipt_number}", flush=True)
+                print(f"  - CUI: {extraction.cui}", flush=True)
+                print(f"  - TVA: {extraction.tva_total} (entries: {len(extraction.tva_entries) if extraction.tva_entries else 0})", flush=True)
+                print(f"  - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True)
+
+                # Early exit if complete
+                if self._is_extraction_complete(extraction):
+                    extraction.raw_text = "\n\n".join(raw_texts)
+                    elapsed_ms = int((time.time() - start_time) * 1000)
+                    extraction.processing_time_ms = elapsed_ms
+                    print(f"[OCR] ✓✓✓ EARLY EXIT at Step 1 - All fields found! ({elapsed_ms}ms) ✓✓✓", flush=True)
+                    return True, "OCR complete (fast mode)", extraction
+                else:
+                    print("[OCR] → Step 1 incomplete, continuing to Step 2...", flush=True)
+        except Exception as e:
+            print(f"[OCR] PaddleOCR light failed: {e}", flush=True)
+            # NOTE(review): this replaces whatever step 1 had extracted before
+            # the exception with an empty result, and it also means the
+            # "No text detected" branch below is not reached after a step-1
+            # failure - confirm this is intended.
+            extraction = ExtractionResult()
+
+        # ══════════════════════════════════════════════════════════════
+        # STEP 2: PaddleOCR + Heavy (for faded thermal receipts)
+        # ══════════════════════════════════════════════════════════════
+        print("=" * 60, flush=True)
+        print("[OCR] STEP 2: PaddleOCR + Heavy preprocessing", flush=True)
+        print("=" * 60, flush=True)
+        heavy_img = self.preprocessor.preprocess_heavy(image)
+
+        try:
+            paddle_heavy = self.ocr_engine._paddle_recognize(heavy_img)
+            if paddle_heavy and paddle_heavy.text:
+                extraction_heavy = self.extractor.extract(paddle_heavy.text)
+                extraction_heavy.ocr_engine = "paddle-heavy"
+                raw_texts.append(f"═══ PaddleOCR (heavy, conf: {paddle_heavy.confidence:.0%}) ═══\n{paddle_heavy.text}")
+
+                print(f"[OCR] Step 2 (Heavy) Results:", flush=True)
+                print(f"  - OCR Confidence: {paddle_heavy.confidence:.0%}", flush=True)
+                print(f"  - Amount: {extraction_heavy.amount}", flush=True)
+                print(f"  - Date: {extraction_heavy.receipt_date}", flush=True)
+                print(f"  - CUI: {extraction_heavy.cui}", flush=True)
+
+                # Merge with previous (step 1 result, possibly None or empty)
+                extraction = self._merge_extractions(extraction, extraction_heavy)
+
+                print(f"[OCR] After merge:", flush=True)
+                print(f"  - Amount: {extraction.amount}", flush=True)
+                print(f"  - Date: {extraction.receipt_date}", flush=True)
+                print(f"  - Number: {extraction.receipt_number}", flush=True)
+                print(f"  - CUI: {extraction.cui}", flush=True)
+                print(f"  - TVA: {extraction.tva_total}", flush=True)
+                print(f"  - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True)
+
+                if self._is_extraction_complete(extraction):
+                    extraction.raw_text = "\n\n".join(raw_texts)
+                    extraction.ocr_engine = "paddle-adaptive"
+                    elapsed_ms = int((time.time() - start_time) * 1000)
+                    extraction.processing_time_ms = elapsed_ms
+                    print(f"[OCR] ✓✓✓ EARLY EXIT at Step 2 - All fields found after merge! ({elapsed_ms}ms) ✓✓✓", flush=True)
+                    return True, "OCR complete (paddle dual)", extraction
+                else:
+                    print("[OCR] → Step 2 incomplete, continuing to Step 3 (Tesseract)...", flush=True)
+        except Exception as e:
+            # The extraction from step 1 (if any) is kept as it was.
+            print(f"[OCR] PaddleOCR heavy failed: {e}", flush=True)
+
+        # ══════════════════════════════════════════════════════════════
+        # STEP 3: Tesseract - ONLY to complete missing fields
+        # Uses Tesseract-optimized preprocessing (binarized, high contrast)
+        # ══════════════════════════════════════════════════════════════
+        print("=" * 60, flush=True)
+        print("[OCR] STEP 3: Tesseract (complement only, not override)", flush=True)
+        print("=" * 60, flush=True)
+
+        try:
+            # Use Tesseract-specific preprocessing (Otsu binarization)
+            tesseract_img = self.preprocessor.preprocess_for_tesseract(image)
+            tesseract_result = self.ocr_engine._tesseract_recognize(tesseract_img)
+            if tesseract_result and tesseract_result.text:
+                extraction_tess = self.extractor.extract(tesseract_result.text)
+                extraction_tess.ocr_engine = "tesseract"
+                raw_texts.append(f"═══ Tesseract (conf: {tesseract_result.confidence:.0%}) ═══\n{tesseract_result.text}")
+
+                print(f"[OCR] Step 3 (Tesseract) Results:", flush=True)
+                print(f"  - OCR Confidence: {tesseract_result.confidence:.0%}", flush=True)
+                print(f"  - Amount: {extraction_tess.amount}", flush=True)
+                print(f"  - Date: {extraction_tess.receipt_date}", flush=True)
+                print(f"  - CUI: {extraction_tess.cui}", flush=True)
+
+                # IMPORTANT: Tesseract only COMPLETES missing fields, never overrides!
+                extraction = self._complement_extraction(extraction, extraction_tess)
+        except Exception as e:
+            print(f"[OCR] Tesseract failed: {e}", flush=True)
+
+        # ══════════════════════════════════════════════════════════════
+        # FINAL VALIDATION: Fix impossible values
+        # ══════════════════════════════════════════════════════════════
+        # Not wrapped in try/except: an exception here propagates to the caller.
+        if extraction:
+            extraction = self._final_validation(extraction)
+
+        # Final result
+        if extraction is None:
+            return False, "No text detected", None
+
+        extraction.raw_text = "\n\n".join(raw_texts)
+        # Overwrites the per-step engine label set above.
+        extraction.ocr_engine = "adaptive-full"
+
+        # Build result message
+        fields_found = []
+        if extraction.amount: fields_found.append("amount")
+        if extraction.receipt_date: fields_found.append("date")
+        if extraction.receipt_number: fields_found.append("number")
+        if extraction.cui: fields_found.append("CUI")
+        if extraction.tva_total or extraction.tva_entries: fields_found.append("TVA")
+
+        message = f"OCR complete (full pipeline). Found: {', '.join(fields_found) or 'no fields'}"
+
+        elapsed_ms = int((time.time() - start_time) * 1000)
+        extraction.processing_time_ms = elapsed_ms
+
+        print("=" * 60, flush=True)
+        print(f"[OCR] FINAL RESULT (full pipeline) - {elapsed_ms}ms", flush=True)
+        print("=" * 60, flush=True)
+        print(f"  - Amount: {extraction.amount}", flush=True)
+        print(f"  - Date: {extraction.receipt_date}", flush=True)
+        print(f"  - Number: {extraction.receipt_number}", flush=True)
+        print(f"  - CUI: {extraction.cui}", flush=True)
+        print(f"  - TVA: {extraction.tva_total}", flush=True)
+        print(f"  - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True)
+        print(f"  - Processing Time: {elapsed_ms}ms", flush=True)
+        print(f"  - Message: {message}", flush=True)
+
+        # Success is reported even when no field was found ("no fields").
+        return True, message, extraction
+
+    def _merge_extractions(
+        self,
+        paddle: Optional[ExtractionResult],
+        tesseract: Optional[ExtractionResult]
+    ) -> ExtractionResult:
+        """
+        Combine two extraction results field by field into a new result.
+
+        Selection rules:
+        - amount / date: the value with the higher field confidence
+          (ties go to ``paddle``)
+        - vendor: a name carrying a company indicator wins, otherwise the
+          higher vendor confidence (ties go to ``paddle``)
+        - CUI: a value in valid format wins, otherwise ``paddle``
+        - TVA: the entry list with the larger summed amount (ties go to
+          ``paddle``)
+        - remaining fields: the ``paddle`` value, falling back to ``tesseract``
+
+        If one argument is None the other one is returned as is; if both are
+        None an empty ExtractionResult is returned.
+        """
+        merged = ExtractionResult()
+
+        # Nothing to compare when at least one side is missing
+        if paddle is None:
+            return merged if tesseract is None else tesseract
+        if tesseract is None:
+            return paddle
+
+        print("[Merge] Comparing PaddleOCR vs Tesseract extractions...", flush=True)
+
+        # --- Amount -------------------------------------------------------
+        if not (paddle.amount and tesseract.amount):
+            # At most one side has an amount: take it without logging
+            amount_src = paddle if paddle.amount else tesseract
+            if amount_src.amount:
+                merged.amount = amount_src.amount
+                merged.confidence_amount = amount_src.confidence_amount
+        elif paddle.confidence_amount >= tesseract.confidence_amount:
+            merged.amount = paddle.amount
+            merged.confidence_amount = paddle.confidence_amount
+            print(f"[Merge] Amount: PaddleOCR {paddle.amount} (conf: {paddle.confidence_amount:.0%})", flush=True)
+        else:
+            merged.amount = tesseract.amount
+            merged.confidence_amount = tesseract.confidence_amount
+            print(f"[Merge] Amount: Tesseract {tesseract.amount} (conf: {tesseract.confidence_amount:.0%})", flush=True)
+
+        # --- Date ---------------------------------------------------------
+        if not (paddle.receipt_date and tesseract.receipt_date):
+            date_src = paddle if paddle.receipt_date else tesseract
+            if date_src.receipt_date:
+                merged.receipt_date = date_src.receipt_date
+                merged.confidence_date = date_src.confidence_date
+        elif paddle.confidence_date >= tesseract.confidence_date:
+            merged.receipt_date = paddle.receipt_date
+            merged.confidence_date = paddle.confidence_date
+            print(f"[Merge] Date: PaddleOCR {paddle.receipt_date}", flush=True)
+        else:
+            merged.receipt_date = tesseract.receipt_date
+            merged.confidence_date = tesseract.confidence_date
+            print(f"[Merge] Date: Tesseract {tesseract.receipt_date}", flush=True)
+
+        # --- Vendor name --------------------------------------------------
+        # A name with a company indicator (S.R.L., S.A., ...) beats one without
+        paddle_flagged = self._has_company_indicator(paddle.partner_name)
+        tesseract_flagged = self._has_company_indicator(tesseract.partner_name)
+
+        if not (paddle.partner_name and tesseract.partner_name):
+            vendor_src = paddle if paddle.partner_name else tesseract
+            if vendor_src.partner_name:
+                merged.partner_name = vendor_src.partner_name
+                merged.confidence_vendor = vendor_src.confidence_vendor
+        elif paddle_flagged != tesseract_flagged:
+            # Exactly one of the two names carries an indicator
+            if paddle_flagged:
+                merged.partner_name = paddle.partner_name
+                merged.confidence_vendor = paddle.confidence_vendor
+                print(f"[Merge] Vendor: PaddleOCR '{paddle.partner_name}' (has company indicator)", flush=True)
+            else:
+                merged.partner_name = tesseract.partner_name
+                merged.confidence_vendor = tesseract.confidence_vendor
+                print(f"[Merge] Vendor: Tesseract '{tesseract.partner_name}' (has company indicator)", flush=True)
+        elif paddle.confidence_vendor >= tesseract.confidence_vendor:
+            merged.partner_name = paddle.partner_name
+            merged.confidence_vendor = paddle.confidence_vendor
+            print(f"[Merge] Vendor: PaddleOCR '{paddle.partner_name}' (higher conf)", flush=True)
+        else:
+            merged.partner_name = tesseract.partner_name
+            merged.confidence_vendor = tesseract.confidence_vendor
+            print(f"[Merge] Vendor: Tesseract '{tesseract.partner_name}' (higher conf)", flush=True)
+
+        # --- CUI (fiscal code) --------------------------------------------
+        paddle_cui_ok = self._is_valid_cui(paddle.cui)
+        tesseract_cui_ok = self._is_valid_cui(tesseract.cui)
+
+        if paddle.cui and tesseract.cui:
+            if tesseract_cui_ok and not paddle_cui_ok:
+                merged.cui = tesseract.cui
+                print(f"[Merge] CUI: Tesseract {tesseract.cui} (valid format)", flush=True)
+            elif paddle_cui_ok and not tesseract_cui_ok:
+                merged.cui = paddle.cui
+                print(f"[Merge] CUI: PaddleOCR {paddle.cui} (valid format)", flush=True)
+            else:
+                # Both valid or both invalid - prefer PaddleOCR
+                merged.cui = paddle.cui
+                print(f"[Merge] CUI: PaddleOCR {paddle.cui}", flush=True)
+        else:
+            # At most one side has a CUI; it is used whether or not it is valid
+            lone_cui = paddle.cui or tesseract.cui
+            if lone_cui:
+                merged.cui = lone_cui
+
+        # --- TVA entries --------------------------------------------------
+        if not (paddle.tva_entries and tesseract.tva_entries):
+            tva_src = paddle if paddle.tva_entries else tesseract
+            if tva_src.tva_entries:
+                merged.tva_entries = tva_src.tva_entries
+                merged.tva_total = tva_src.tva_total
+        else:
+            # Both have entries: keep the list with the larger summed amount
+            paddle_sum = sum(entry.get('amount', Decimal('0')) for entry in paddle.tva_entries)
+            tesseract_sum = sum(entry.get('amount', Decimal('0')) for entry in tesseract.tva_entries)
+
+            if paddle_sum >= tesseract_sum:
+                merged.tva_entries = paddle.tva_entries
+                merged.tva_total = paddle.tva_total
+                print(f"[Merge] TVA: PaddleOCR (total: {paddle_sum})", flush=True)
+            else:
+                merged.tva_entries = tesseract.tva_entries
+                merged.tva_total = tesseract.tva_total
+                print(f"[Merge] TVA: Tesseract (total: {tesseract_sum})", flush=True)
+
+        # --- Remaining fields: paddle first, tesseract as fallback ----------
+        for field_name in (
+            "receipt_number",
+            "receipt_series",
+            "receipt_type",
+            "items_count",
+            "address",
+            "description",
+        ):
+            setattr(
+                merged,
+                field_name,
+                getattr(paddle, field_name) or getattr(tesseract, field_name),
+            )
+
+        return merged
+
+    def _has_company_indicator(self, name: Optional[str]) -> bool:
+        """Tell whether a vendor name contains a company-type marker.
+
+        The name is upper-cased before matching, so the check is
+        case-insensitive. None and empty names never match.
+        """
+        if not name:
+            return False
+
+        candidate = name.upper()
+        # Legal forms (dots and spaces optional) plus a few generic keywords
+        marker_patterns = (
+            r'\bS\.?\s*R\.?\s*L\.?\b',
+            r'\bS\.?\s*A\.?\b',
+            r'\bS\.?\s*N\.?\s*C\.?\b',
+            r'\bP\.?\s*F\.?\s*A\.?\b',
+            r'\bI\.?\s*I\.?\b',
+            r'\bHOLDING\b',
+            r'\bGROUP\b',
+            r'\bCOMPANY\b',
+        )
+        return any(re.search(pattern, candidate) for pattern in marker_patterns)
+
+    def _is_valid_cui(self, cui: Optional[str]) -> bool:
+        """Validate CUI format: an optional ``RO`` prefix, then 6-10 digits.
+
+        Args:
+            cui: Candidate fiscal code; the prefix is matched
+                case-insensitively.
+
+        Returns:
+            True only when the whole string is ``RO`` (optional) followed by
+            6 to 10 ASCII digits. None and empty strings are invalid.
+        """
+        if not cui:
+            return False
+        # Remove any RO prefix
+        cui_clean = re.sub(r'^RO', '', cui.upper())
+        # fullmatch instead of '^...$': '$' also matches just before a
+        # trailing newline, which would let "123456\n" through. [0-9] instead
+        # of \d keeps non-ASCII Unicode digits out.
+        return re.fullmatch(r'[0-9]{6,10}', cui_clean) is not None
+
+    def _is_extraction_complete(self, ext: ExtractionResult, min_confidence: float = 0.85) -> bool:
+        """
+        Decide whether the pipeline may stop early with this extraction.
+
+        True only when both conditions hold:
+        - overall confidence is at least ``min_confidence`` (85% by default)
+        - all 5 critical fields are present: number, date, amount, TVA, CUI
+          (TVA counts as present when either the total or the entries exist)
+        """
+        # Confidence gate comes first; fields are not inspected if it fails
+        if ext.overall_confidence < min_confidence:
+            print(f"[OCR] Confidence {ext.overall_confidence:.0%} < {min_confidence:.0%} - continuing", flush=True)
+            return False
+
+        # (label used in the log, value that must be truthy)
+        required_fields = (
+            ("number", ext.receipt_number),
+            ("date", ext.receipt_date),
+            ("amount", ext.amount),
+            ("TVA", ext.tva_total or ext.tva_entries),
+            ("CUI", ext.cui),
+        )
+        missing = [label for label, value in required_fields if not value]
+
+        if not missing:
+            print(f"[OCR] ✓ All 5 fields found with {ext.overall_confidence:.0%} confidence", flush=True)
+            return True
+
+        print(f"[OCR] Missing: {', '.join(missing)} - continuing", flush=True)
+        return False
+
+    def _complement_extraction(
+        self,
+        primary: Optional[ExtractionResult],
+        secondary: Optional[ExtractionResult]
+    ) -> ExtractionResult:
+        """
+        Fill the gaps of ``primary`` with values taken from ``secondary``.
+
+        A field of ``primary`` that already has a value is never replaced,
+        unlike in _merge_extractions. ``primary`` is modified in place and
+        returned. If one argument is None the other one is returned as is;
+        if both are None an empty ExtractionResult is returned.
+        """
+        if primary is None:
+            return ExtractionResult() if secondary is None else secondary
+        if secondary is None:
+            return primary
+
+        print("[Complement] Adding missing fields from Tesseract...", flush=True)
+
+        # (attribute that decides, attributes copied together, log label,
+        #  attribute shown in the log line)
+        fill_plan = (
+            ("amount", ("amount", "confidence_amount"), "amount", "amount"),
+            ("receipt_date", ("receipt_date", "confidence_date"), "date", "receipt_date"),
+            ("partner_name", ("partner_name", "confidence_vendor"), "vendor", "partner_name"),
+            ("cui", ("cui",), "CUI", "cui"),
+            ("tva_entries", ("tva_entries", "tva_total"), "TVA", "tva_total"),
+            ("receipt_number", ("receipt_number",), "number", "receipt_number"),
+            ("address", ("address",), "address", "address"),
+        )
+
+        for gate, copied, label, shown in fill_plan:
+            # Skip when primary already has the field or secondary lacks it
+            if getattr(primary, gate) or not getattr(secondary, gate):
+                continue
+            # A CUI is only taken over when it has a valid format
+            if gate == "cui" and not self._is_valid_cui(secondary.cui):
+                continue
+
+            for attr in copied:
+                setattr(primary, attr, getattr(secondary, attr))
+            print(f"[Complement] Added {label}: {getattr(secondary, shown)}", flush=True)
+
+        return primary
+
+    def _final_validation(self, extraction: ExtractionResult) -> ExtractionResult:
+        """
+        Final validation and correction of impossible values.
+
+        Rules, applied in this order:
+        1. TVA cannot be greater than TOTAL (it is always a fraction of it).
+           If it is, TOTAL is recalculated from TVA with the rate of the
+           first TVA entry (19% when there is no entry or no rate) and its
+           confidence is lowered to 0.70.
+        2. TVA above 25% of TOTAL only produces a warning.
+        3. If the TVA entries do not add up to the TVA total (tolerance
+           0.05), the total is replaced by the sum of the entries.
+
+        A ``None`` rate or amount inside a TVA entry is treated like a
+        missing key. The caller runs this method outside any try/except, so
+        a TypeError here would discard the whole OCR result.
+
+        Args:
+            extraction: Result to check; it is modified in place.
+
+        Returns:
+            The same ``extraction`` object.
+        """
+        print("[Final Validation] Checking extracted values...", flush=True)
+
+        # Rule 1: TVA cannot be greater than TOTAL
+        if extraction.tva_total and extraction.amount:
+            if extraction.tva_total > extraction.amount:
+                print(f"[Final Validation] TVA ({extraction.tva_total}) > TOTAL ({extraction.amount}) - IMPOSSIBLE!", flush=True)
+
+                # Calculate TOTAL from TVA using reverse formula:
+                # total = base + tva = tva * (100/rate + 1) = tva * (100 + rate) / rate
+                # For 9% TVA: total = tva * 109 / 9 = tva * 12.11
+                # For 19% TVA: total = tva * 119 / 19 = tva * 6.26
+                # For 21% TVA: total = tva * 121 / 21 = tva * 5.76
+
+                rate = 19  # Default rate assumption
+                if extraction.tva_entries:
+                    # Use the rate from the first entry; a missing or None
+                    # rate keeps the default.
+                    entry_rate = extraction.tva_entries[0].get('percent')
+                    if entry_rate is not None:
+                        rate = entry_rate
+
+                # A zero (or negative) rate cannot be inverted - leave TOTAL alone
+                if rate > 0:
+                    # Formula: total = tva * (100 + rate) / rate
+                    rate_dec = Decimal(str(rate))
+                    calculated_total = extraction.tva_total * (Decimal('100') + rate_dec) / rate_dec
+                    calculated_total = calculated_total.quantize(Decimal('0.01'))
+
+                    print(f"[Final Validation] Calculated TOTAL from TVA: {calculated_total} (using {rate}% rate)", flush=True)
+
+                    extraction.amount = calculated_total
+                    extraction.confidence_amount = 0.70  # Lower confidence for calculated value
+
+        # Rule 2: TVA cannot be more than ~25% of total (max Romanian rate is 21%)
+        if extraction.tva_total and extraction.amount:
+            tva_percent = extraction.tva_total / extraction.amount * Decimal('100')
+            if tva_percent > Decimal('25'):
+                print(f"[Final Validation] Warning: TVA is {tva_percent:.1f}% of total - suspicious", flush=True)
+
+        # Rule 3: Validate TVA entries sum
+        # NOTE(review): this runs after Rule 1, so Rule 1 may have rebuilt
+        # TOTAL from a tva_total that is replaced here - confirm the order.
+        if extraction.tva_entries and extraction.tva_total:
+            # A None amount counts as zero, same as a missing 'amount' key
+            entries_sum = sum(
+                (e.get('amount') or Decimal('0')) for e in extraction.tva_entries
+            )
+            tolerance = Decimal('0.05')
+            if abs(entries_sum - extraction.tva_total) > tolerance:
+                print(f"[Final Validation] TVA entries sum ({entries_sum}) != tva_total ({extraction.tva_total})", flush=True)
+                # Use the sum as it's more reliable
+                extraction.tva_total = entries_sum
+
+        print(f"[Final Validation] Done. Amount={extraction.amount}, TVA={extraction.tva_total}", flush=True)
+        return extraction
+
+
+# Singleton instance, created when this module is imported: importing it runs
+# OCRService.__init__, which constructs the ImagePreprocessor, OCREngine and
+# ReceiptExtractor.
+ocr_service = OCRService()