feat(ocr): Add validation system and CLIENT CUI extraction

OCR Data Extraction Validation System: - Add 7 validation rules (amount range, TVA ratio, payment sum, etc.) - Add Medium preprocessing to replace Heavy (fixes digit concatenation) - Add validation warnings to API responses - Flag receipts needing manual review (needs_manual_review field) - Add database migration for needs_manual_review column CLIENT CUI Extraction Improvements: - Support all format variations: CIF CLIENT:, CLIENT C.U.I/C.I.F., etc. - Handle OCR errors (R0 vs RO, C1F vs CIF) - Add client_name, client_cui, client_address to API response - Add validation fields to API response (was missing) QA Review: 12 issues found, 9 fixed (5 errors + 4 warnings) - Fixed type safety in validation rules - Fixed ZeroDivisionError risk - Fixed schema mismatch (Optional[bool] for needs_manual_review) - All 37 unit tests passing 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-30 19:12:52 +02:00
parent ce85e0643b
commit ab160b628d
14 changed files with 4161 additions and 33 deletions
--- a/backend/modules/data_entry/migrations/versions/20251230_add_needs_manual_review.py
+++ b/backend/modules/data_entry/migrations/versions/20251230_add_needs_manual_review.py
@@ -0,0 +1,40 @@
+"""Add needs_manual_review flag to receipts table.
+
+Revision ID: 20251230_needs_manual_review
+Revises: 20251216_payment_mode
+Create Date: 2025-12-30
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '20251230_needs_manual_review'
+down_revision = '20251216_payment_mode'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Add needs_manual_review column for OCR validation tracking.
+
+    This column tracks whether a receipt needs manual supervisor review
+    based on OCR extraction validation warnings:
+    - NULL = not validated yet (old receipts before validation feature)
+    - FALSE = validated, no review needed
+    - TRUE = validated, needs review
+    """
+    with op.batch_alter_table('receipts', schema=None) as batch_op:
+        batch_op.add_column(
+            sa.Column('needs_manual_review', sa.Boolean(), nullable=True)
+        )
+
+    # NOTE: We do NOT set a default value for existing rows.
+    # NULL indicates the receipt was created before validation was implemented.
+    # Only new receipts (created after this migration) will have TRUE/FALSE values.
+
+
+def downgrade() -> None:
+    """Remove needs_manual_review column."""
+    with op.batch_alter_table('receipts', schema=None) as batch_op:
+        batch_op.drop_column('needs_manual_review')
--- a/backend/modules/data_entry/routers/ocr.py
+++ b/backend/modules/data_entry/routers/ocr.py
@@ -118,13 +118,23 @@ async def extract_from_image(file: UploadFile = File(...)):
            items_count=result.items_count,
            payment_methods=payment_methods_list,
            suggested_payment_mode=suggested_payment_mode,
+            # Client data (B2B receipts)
+            client_name=result.client_name,
+            client_cui=result.client_cui,
+            client_address=result.client_address,
            confidence_amount=result.confidence_amount,
            confidence_date=result.confidence_date,
            confidence_vendor=result.confidence_vendor,
+            confidence_client=result.confidence_client,
            overall_confidence=result.overall_confidence,
            raw_text=result.raw_text,
            ocr_engine=result.ocr_engine,
            processing_time_ms=result.processing_time_ms,
+            # Validation results
+            needs_manual_review=result.needs_manual_review,
+            validation_warnings=result.validation_warnings,
+            validation_errors=result.validation_errors,
+            inter_ocr_ratios=result.inter_ocr_ratios,
        )

        return OCRResponse(success=True, message=message, data=data)
@@ -206,13 +216,23 @@ async def extract_from_attachment(
        items_count=result.items_count,
        payment_methods=payment_methods_list,
        suggested_payment_mode=suggested_payment_mode,
+        # Client data (B2B receipts)
+        client_name=result.client_name,
+        client_cui=result.client_cui,
+        client_address=result.client_address,
        confidence_amount=result.confidence_amount,
        confidence_date=result.confidence_date,
        confidence_vendor=result.confidence_vendor,
+        confidence_client=result.confidence_client,
        overall_confidence=result.overall_confidence,
        raw_text=result.raw_text,
        ocr_engine=result.ocr_engine,
        processing_time_ms=result.processing_time_ms,
+        # Validation results
+        needs_manual_review=result.needs_manual_review,
+        validation_warnings=result.validation_warnings,
+        validation_errors=result.validation_errors,
+        inter_ocr_ratios=result.inter_ocr_ratios,
    )

    return OCRResponse(success=True, message=message, data=data)
--- a/backend/modules/data_entry/schemas/ocr.py
+++ b/backend/modules/data_entry/schemas/ocr.py
@@ -20,6 +20,15 @@ class PaymentMethod(BaseModel):
    amount: Decimal = Field(description="Amount paid")


+class ValidationWarning(BaseModel):
+    """Validation warning from OCR extraction."""
+    field: str = Field(description="Field name (e.g., 'amount', 'tva_total')")
+    rule: str = Field(description="Rule name (e.g., 'amount_range', 'tva_ratio')")
+    message: str = Field(description="Human-readable warning message")
+    severity: str = Field(description="Severity: 'info', 'warning', 'error'")
+    suggested_value: Optional[str] = Field(default=None, description="Suggested corrected value")
+
+
 class ExtractionData(BaseModel):
    """Extracted receipt data from OCR."""

@@ -56,6 +65,13 @@ class ExtractionData(BaseModel):
    ocr_engine: str = Field(default="", description="OCR engine used: paddleocr or tesseract")
    processing_time_ms: int = Field(default=0, ge=0, description="Processing time in milliseconds")

+    # Validation results (added by bon-ocr-validation feature)
+    # needs_manual_review: None = not validated yet (old receipts), False = no review needed, True = needs review
+    needs_manual_review: Optional[bool] = Field(default=None, description="Flag for supervisor review (None=not validated, False=ok, True=needs review)")
+    validation_warnings: List[str] = Field(default=[], description="Validation warnings")
+    validation_errors: List[str] = Field(default=[], description="Validation errors")
+    inter_ocr_ratios: dict[str, float] = Field(default={}, description="Inter-OCR consistency ratios")
+
    class Config:
        """Pydantic config."""
        json_schema_extra = {
--- a/backend/modules/data_entry/services/image_preprocessor.py
+++ b/backend/modules/data_entry/services/image_preprocessor.py
@@ -104,10 +104,80 @@ class ImagePreprocessor:
        # NO binarization, NO morphological ops - preserve original quality
        return enhanced

+    def preprocess_medium(self, image: np.ndarray) -> np.ndarray:
+        """
+        Medium preprocessing for MIXED-QUALITY images.
+        Balance between Light (too gentle) and Heavy (too aggressive).
+
+        Use cases:
+        - Moderately faded receipts
+        - Photos with uneven lighting
+        - Scans with slight blur
+
+        Preprocessing steps:
+        - Moderate contrast enhancement (CLAHE clipLimit=2.0)
+        - Light denoising (fastNlMeansDenoising h=6)
+        - Gentle sharpening
+        - NO binarization (preserves text boundaries)
+        - NO morphological operations (avoids digit concatenation)
+
+        This method was created to replace preprocess_heavy() which caused
+        digit concatenation errors on high-quality PDFs (85.99 → 859,762.16).
+        """
+        # 0. Add safety padding to protect edge content during deskew rotation
+        image = self._add_safety_padding(image)
+
+        # 1. Grayscale
+        if len(image.shape) == 3:
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        else:
+            gray = image.copy()
+
+        # 2a. Scale DOWN if any side exceeds 4000px (PaddleOCR limit)
+        height, width = gray.shape
+        max_side = max(height, width)
+        if max_side > 4000:
+            scale = 4000 / max_side
+            gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
+            height, width = gray.shape
+
+        # 2b. Scale UP if too small
+        if width < 1500:
+            scale = 1500 / width
+            # Ensure we don't exceed 4000px after upscaling
+            new_width = int(width * scale)
+            new_height = int(height * scale)
+            if max(new_width, new_height) > 4000:
+                scale = 4000 / max(new_width, new_height)
+            gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
+
+        # 3. Deskew
+        gray = self._deskew(gray)
+
+        # 4. Moderate contrast enhancement (CLAHE clipLimit=2.0)
+        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+        enhanced = clahe.apply(gray)
+
+        # 5. Light denoising (less aggressive than Heavy)
+        denoised = cv2.fastNlMeansDenoising(enhanced, h=6, templateWindowSize=7, searchWindowSize=15)
+
+        # 6. Gentle sharpening
+        gaussian = cv2.GaussianBlur(denoised, (0, 0), 1.0)
+        sharpened = cv2.addWeighted(denoised, 1.3, gaussian, -0.3, 0)
+
+        # NO binarization, NO morphological operations
+        # This preserves text boundaries and avoids digit concatenation
+        return sharpened
+
    def preprocess_heavy(self, image: np.ndarray) -> np.ndarray:
        """
        Heavy preprocessing for FADED thermal receipts.
        Aggressive binarization to recover faded text.
+
+        ⚠️ DEPRECATED: Use preprocess_medium() instead.
+        Heavy preprocessing causes digit concatenation on clear PDFs
+        (e.g., 85.99 → 859,762.16 due to binarization + morphological operations).
+        Kept for backward compatibility only.
        """
        # 0. Add safety padding to protect edge content during deskew rotation
        image = self._add_safety_padding(image)
--- a/backend/modules/data_entry/services/ocr/validation.py
+++ b/backend/modules/data_entry/services/ocr/validation.py
@@ -0,0 +1,737 @@
+"""
+OCR Data Validation Module
+
+Provides multi-layer validation for OCR extraction results to prevent
+incorrect data from entering the system.
+
+Validation Layers:
+1. Absolute sanity checks (value ranges)
+2. Cross-field validation (correlation between fields)
+3. Inter-OCR consistency (compare multiple OCR results)
+4. Auto-correction (fix obvious errors)
+
+Usage:
+    engine = OCRValidationEngine()
+    validated_result = engine.validate_extraction(
+        merged_result,
+        light_ocr_result,
+        medium_ocr_result
+    )
+"""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any, Optional
+
+
+@dataclass
+class ValidationResult:
+    """Result of a single validation rule execution.
+
+    Attributes:
+        is_valid: Whether the validation passed
+        confidence_penalty: Penalty to apply to confidence score (0.0-1.0)
+                          0.0 = no penalty, 1.0 = complete rejection
+        message: Human-readable description of validation result
+        severity: "info" | "warning" | "error"
+    """
+    is_valid: bool
+    confidence_penalty: float = 0.0
+    message: str = ""
+    severity: str = "info"  # "info" | "warning" | "error"
+
+    def __post_init__(self):
+        """Validate penalty is in valid range."""
+        if not 0.0 <= self.confidence_penalty <= 1.0:
+            raise ValueError(f"Confidence penalty must be 0.0-1.0, got {self.confidence_penalty}")
+
+
+class ValidationRule(ABC):
+    """Abstract base class for OCR validation rules.
+
+    Each rule implements a specific validation check and returns
+    a ValidationResult indicating success/failure with optional
+    confidence penalty.
+    """
+
+    @abstractmethod
+    def validate(self, data: dict[str, Any]) -> ValidationResult:
+        """Execute validation rule on extraction data.
+
+        Args:
+            data: Dictionary containing extraction fields to validate
+                  Example: {"amount": 85.99, "tva": 14.92, ...}
+
+        Returns:
+            ValidationResult with is_valid flag and optional penalty
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def rule_name(self) -> str:
+        """Human-readable name of this validation rule."""
+        pass
+
+
+# ============================================================================
+# VALIDATION RULES
+# ============================================================================
+
+
+class AmountRangeRule(ValidationRule):
+    """Validate amount is within reasonable bounds for Romanian receipts.
+
+    Romanian receipts rarely exceed 100,000 RON. This catches obvious
+    OCR errors like digit concatenation (85.99 → 859,762.16).
+
+    Example:
+        rule = AmountRangeRule(min_amount=0.01, max_amount=100_000.0)
+        result = rule.validate({"amount": 859762.16})
+        # result.is_valid = False, penalty = 0.5
+    """
+
+    def __init__(self, min_amount: float = 0.01, max_amount: float = 100_000.0):
+        self.min_amount = min_amount
+        self.max_amount = max_amount
+
+    @property
+    def rule_name(self) -> str:
+        return "Amount Range Check"
+
+    def validate(self, data: dict[str, Any]) -> ValidationResult:
+        amount = data.get("amount")
+
+        if amount is None:
+            return ValidationResult(
+                is_valid=True,
+                message="No amount to validate"
+            )
+
+        if amount < self.min_amount:
+            return ValidationResult(
+                is_valid=False,
+                confidence_penalty=0.5,
+                message=f"Amount {amount:.2f} RON below minimum {self.min_amount:.2f} RON",
+                severity="error"
+            )
+
+        if amount > self.max_amount:
+            return ValidationResult(
+                is_valid=False,
+                confidence_penalty=0.5,
+                message=f"Amount {amount:.2f} RON exceeds maximum {self.max_amount:.2f} RON (likely OCR error)",
+                severity="error"
+            )
+
+        return ValidationResult(
+            is_valid=True,
+            message=f"Amount {amount:.2f} RON within valid range"
+        )
+
+
+class TVARatioRule(ValidationRule):
+    """Validate TVA is reasonable percentage of TOTAL amount.
+
+    Romanian TVA rates: 5%, 9%, 19%, 21% (most common: 19-21%)
+    This catches errors where TVA > TOTAL (impossible).
+
+    Example:
+        rule = TVARatioRule(min_ratio=0.05, max_ratio=0.24)
+        result = rule.validate({"amount": 85.99, "tva": 149.21})
+        # result.is_valid = False (149.21 > 85.99!)
+    """
+
+    def __init__(self, min_ratio: float = 0.05, max_ratio: float = 0.24):
+        self.min_ratio = min_ratio
+        self.max_ratio = max_ratio
+
+    @property
+    def rule_name(self) -> str:
+        return "TVA Ratio Check"
+
+    def validate(self, data: dict[str, Any]) -> ValidationResult:
+        amount = data.get("amount")
+        tva = data.get("tva")
+
+        if not amount or not tva:
+            return ValidationResult(
+                is_valid=True,
+                message="Insufficient data for TVA correlation"
+            )
+
+        # Type safety: ensure numeric types before division
+        if not isinstance(amount, (int, float)) or not isinstance(tva, (int, float)):
+            return ValidationResult(
+                is_valid=True,
+                message="Non-numeric values, skipping TVA correlation"
+            )
+
+        # Avoid division by zero
+        if amount <= 0:
+            return ValidationResult(
+                is_valid=True,
+                message="Amount is zero or negative, skipping TVA ratio"
+            )
+
+        tva_ratio = tva / amount
+
+        if tva_ratio < self.min_ratio or tva_ratio > self.max_ratio:
+            return ValidationResult(
+                is_valid=False,
+                confidence_penalty=0.3,
+                message=f"TVA ratio {tva_ratio:.1%} outside valid range ({self.min_ratio:.0%}-{self.max_ratio:.0%})",
+                severity="warning"
+            )
+
+        return ValidationResult(
+            is_valid=True,
+            message=f"TVA ratio {tva_ratio:.1%} valid"
+        )
+
+
+class PaymentSumRule(ValidationRule):
+    """Validate CARD + NUMERAR = TOTAL BON (within tolerance).
+
+    This is a CRITICAL validation that catches cases where OCR extracts
+    wrong TOTAL but correct payment methods.
+
+    Example:
+        rule = PaymentSumRule(tolerance=0.02)
+        result = rule.validate({
+            "amount": 859762.16,  # Wrong from OCR
+            "card_amount": 85.99,  # Correct
+            "cash_amount": 0.0
+        })
+        # result.is_valid = False, suggests auto-correction
+    """
+
+    def __init__(self, tolerance: float = 0.02):
+        self.tolerance = tolerance
+
+    @property
+    def rule_name(self) -> str:
+        return "Payment Sum Check"
+
+    def validate(self, data: dict[str, Any]) -> ValidationResult:
+        total = data.get("amount")
+        card = data.get("card_amount", 0.0) or 0.0
+        cash = data.get("cash_amount", 0.0) or 0.0
+
+        if not total:
+            return ValidationResult(
+                is_valid=True,
+                message="No total amount to validate"
+            )
+
+        payment_sum = card + cash
+
+        if payment_sum == 0:
+            return ValidationResult(
+                is_valid=True,
+                message="No payment methods extracted"
+            )
+
+        diff = abs(total - payment_sum)
+
+        if diff > self.tolerance:
+            return ValidationResult(
+                is_valid=False,
+                confidence_penalty=0.4,
+                message=f"Payment sum {payment_sum:.2f} RON ≠ Total {total:.2f} RON (diff: {diff:.2f} RON). Consider auto-correction.",
+                severity="error"
+            )
+
+        return ValidationResult(
+            is_valid=True,
+            message=f"Payment sum matches total (diff: {diff:.2f} RON)"
+        )
+
+
+class TVAEntriesSumRule(ValidationRule):
+    """Validate Σ(TVA entries) = TVA TOTAL (within tolerance).
+
+    TVA breakdown (A, B, C, D rates) should sum to total TVA.
+
+    Example:
+        rule = TVAEntriesSumRule(tolerance=0.02)
+        result = rule.validate({
+            "tva": 14.92,
+            "tva_entries": {"A": 14.92, "B": 0.0}
+        })
+        # result.is_valid = True
+    """
+
+    def __init__(self, tolerance: float = 0.02):
+        self.tolerance = tolerance
+
+    @property
+    def rule_name(self) -> str:
+        return "TVA Entries Sum Check"
+
+    def validate(self, data: dict[str, Any]) -> ValidationResult:
+        tva_total = data.get("tva")
+        tva_entries = data.get("tva_entries", {})
+
+        if not tva_total:
+            return ValidationResult(
+                is_valid=True,
+                message="No TVA total to validate"
+            )
+
+        if not tva_entries:
+            return ValidationResult(
+                is_valid=True,
+                message="No TVA entries extracted"
+            )
+
+        entries_sum = sum(tva_entries.values())
+
+        if entries_sum == 0:
+            return ValidationResult(
+                is_valid=True,
+                message="TVA entries sum is zero"
+            )
+
+        diff = abs(tva_total - entries_sum)
+
+        if diff > self.tolerance:
+            return ValidationResult(
+                is_valid=False,
+                confidence_penalty=0.2,
+                message=f"TVA entries sum {entries_sum:.2f} RON ≠ TVA total {tva_total:.2f} RON (diff: {diff:.2f} RON)",
+                severity="warning"
+            )
+
+        return ValidationResult(
+            is_valid=True,
+            message=f"TVA entries sum matches total (diff: {diff:.2f} RON)"
+        )
+
+
+class CUIFormatRule(ValidationRule):
+    """Validate CUI format: RO + 6-10 digits.
+
+    Romanian CUI (Cod Unic de Identificare) format:
+    - Optional "RO" prefix (or "R0" from OCR errors)
+    - 6-10 numeric digits
+
+    Example:
+        rule = CUIFormatRule()
+        result = rule.validate({"cui": "RO10562600"})
+        # result.is_valid = True
+    """
+
+    @property
+    def rule_name(self) -> str:
+        return "CUI Format Check"
+
+    def validate(self, data: dict[str, Any]) -> ValidationResult:
+        cui = data.get("cui")
+
+        if not cui:
+            return ValidationResult(
+                is_valid=True,
+                message="No CUI to validate"
+            )
+
+        # Normalize: remove RO/R0 prefix
+        cui_clean = cui.strip().upper()
+        if cui_clean.startswith("RO"):
+            cui_clean = cui_clean[2:]
+        elif cui_clean.startswith("R0"):
+            cui_clean = cui_clean[2:]
+
+        # Check if numeric
+        if not cui_clean.isdigit():
+            return ValidationResult(
+                is_valid=False,
+                confidence_penalty=0.3,
+                message=f"CUI '{cui}' contains non-numeric characters",
+                severity="warning"
+            )
+
+        # Check length
+        if len(cui_clean) < 6 or len(cui_clean) > 10:
+            return ValidationResult(
+                is_valid=False,
+                confidence_penalty=0.3,
+                message=f"CUI '{cui}' length {len(cui_clean)} outside valid range (6-10 digits)",
+                severity="warning"
+            )
+
+        return ValidationResult(
+            is_valid=True,
+            message=f"CUI '{cui}' format valid"
+        )
+
+
+class CUIChecksumRule(ValidationRule):
+    """Validate Romanian CIF/CUI using Mod 11 checksum algorithm.
+
+    Algorithm:
+    1. Remove RO prefix if present
+    2. Extract last digit as declared checksum
+    3. Apply multipliers [7,5,3,2,1,7,5,3,2] to first N-1 digits
+    4. Calculate: (sum * 10) mod 11
+    5. If result = 10, expected checksum = 0
+    6. Else, expected checksum = result
+    7. Compare with declared checksum
+
+    Example:
+        rule = CUIChecksumRule()
+        result = rule.validate({"cui": "RO10562600"})
+        # result.is_valid = True (checksum correct)
+
+        result = rule.validate({"cui": "R01879855"})
+        # result.is_valid = False (checksum mismatch)
+    """
+
+    @property
+    def rule_name(self) -> str:
+        return "CUI Checksum Check (Mod 11)"
+
+    def validate(self, data: dict[str, Any]) -> ValidationResult:
+        cui = data.get("cui")
+
+        if not cui:
+            return ValidationResult(
+                is_valid=True,
+                message="No CUI to validate"
+            )
+
+        # Normalize: remove RO/R0 prefix
+        cui_clean = cui.strip().upper()
+        if cui_clean.startswith("RO"):
+            cui_clean = cui_clean[2:]
+        elif cui_clean.startswith("R0"):
+            cui_clean = cui_clean[2:]
+
+        # Check format first
+        if not cui_clean.isdigit():
+            return ValidationResult(
+                is_valid=True,  # Don't fail checksum if format invalid (handled by CUIFormatRule)
+                message="CUI format invalid, skipping checksum"
+            )
+
+        if len(cui_clean) < 6 or len(cui_clean) > 10:
+            return ValidationResult(
+                is_valid=True,
+                message="CUI length invalid, skipping checksum"
+            )
+
+        # Extract digits
+        digits = [int(d) for d in cui_clean]
+        checksum_declared = digits[-1]
+        base_digits = digits[:-1]
+
+        # Multipliers (trim to match base_digits length)
+        multipliers = [7, 5, 3, 2, 1, 7, 5, 3, 2]
+        multipliers = multipliers[:len(base_digits)]
+
+        # Calculate weighted sum
+        weighted_sum = sum(d * m for d, m in zip(base_digits, multipliers))
+
+        # Calculate expected checksum
+        checksum_calculated = (weighted_sum * 10) % 11
+        if checksum_calculated == 10:
+            checksum_calculated = 0
+
+        if checksum_calculated != checksum_declared:
+            return ValidationResult(
+                is_valid=False,
+                confidence_penalty=0.3,
+                message=f"CUI '{cui}' checksum mismatch: expected {checksum_calculated}, got {checksum_declared}",
+                severity="warning"
+            )
+
+        return ValidationResult(
+            is_valid=True,
+            message=f"CUI '{cui}' checksum valid"
+        )
+
+
+class InterOCRConsistencyRule(ValidationRule):
+    """Validate consistency between multiple OCR results.
+
+    If Light OCR and Medium OCR produce values that differ by >10x,
+    one is clearly wrong (likely digit concatenation error).
+
+    Example:
+        rule = InterOCRConsistencyRule(max_ratio=10.0)
+        result = rule.validate({
+            "light_amount": 85.99,
+            "medium_amount": 859762.16
+        })
+        # result.is_valid = False (ratio = 10,000x!)
+    """
+
+    def __init__(self, max_ratio: float = 10.0):
+        self.max_ratio = max_ratio
+
+    @property
+    def rule_name(self) -> str:
+        return "Inter-OCR Consistency Check"
+
+    def validate(self, data: dict[str, Any]) -> ValidationResult:
+        light_value = data.get("light_value")
+        medium_value = data.get("medium_value")
+        field_name = data.get("field_name", "value")
+
+        if not light_value or not medium_value:
+            return ValidationResult(
+                is_valid=True,
+                message="Insufficient OCR results for consistency check"
+            )
+
+        # Avoid division by zero
+        if light_value == 0 or medium_value == 0:
+            return ValidationResult(
+                is_valid=True,
+                message="One value is zero, skipping consistency check"
+            )
+
+        ratio = max(light_value, medium_value) / min(light_value, medium_value)
+
+        if ratio > self.max_ratio:
+            return ValidationResult(
+                is_valid=False,
+                confidence_penalty=0.2,
+                message=f"{field_name}: OCR results differ by {ratio:.1f}x (Light: {light_value}, Medium: {medium_value})",
+                severity="warning"
+            )
+
+        return ValidationResult(
+            is_valid=True,
+            message=f"{field_name}: OCR results consistent (ratio: {ratio:.2f}x)"
+        )
+
+
+# ============================================================================
+# VALIDATION ENGINE
+# ============================================================================
+
+
+@dataclass
+class EnhancedExtractionResult:
+    """Enhanced extraction result with validation metadata.
+
+    This wraps the original extraction data and adds validation results.
+    """
+    # Original data
+    data: dict[str, Any]
+
+    # Validation results
+    needs_manual_review: bool = False
+    validation_warnings: list[str] = field(default_factory=list)
+    validation_errors: list[str] = field(default_factory=list)
+    confidence_adjustments: dict[str, float] = field(default_factory=dict)
+
+    # Inter-OCR metadata
+    inter_ocr_ratios: dict[str, float] = field(default_factory=dict)
+
+
+class OCRValidationEngine:
+    """Orchestrate all validation rules for OCR extraction results.
+
+    This engine applies validation rules in order:
+    1. Sanity checks (amount range, format checks)
+    2. Cross-field correlation (TVA ratio, payment sum)
+    3. Inter-OCR consistency checks
+
+    Example:
+        engine = OCRValidationEngine()
+        result = engine.validate_extraction(
+            extraction_result=merged_data,
+            light_result=light_ocr_data,
+            medium_result=medium_ocr_data
+        )
+    """
+
+    def __init__(self):
+        """Initialize validation engine with default rules."""
+        # Sanity check rules (absolute value validation)
+        self.sanity_rules = [
+            AmountRangeRule(min_amount=0.01, max_amount=100_000.0),
+            CUIFormatRule(),
+            CUIChecksumRule(),
+        ]
+
+        # Cross-field validation rules (correlation between fields)
+        self.cross_field_rules = [
+            TVARatioRule(min_ratio=0.05, max_ratio=0.24),
+            PaymentSumRule(tolerance=0.02),
+            TVAEntriesSumRule(tolerance=0.02),
+        ]
+
+        # Inter-OCR consistency rules
+        self.inter_ocr_rules = [
+            InterOCRConsistencyRule(max_ratio=10.0),
+        ]
+
+    def validate_extraction(
+        self,
+        extraction_result: dict[str, Any],
+        light_result: Optional[dict[str, Any]] = None,
+        medium_result: Optional[dict[str, Any]] = None
+    ) -> EnhancedExtractionResult:
+        """Run all validation rules and return enhanced result.
+
+        Args:
+            extraction_result: Merged OCR extraction data (required)
+            light_result: Light OCR preprocessing results (optional)
+            medium_result: Medium OCR preprocessing results (optional)
+
+        Returns:
+            EnhancedExtractionResult with validation warnings and metadata
+        """
+        warnings = []
+        errors = []
+        confidence_adjustments = {}
+        inter_ocr_ratios = {}
+
+        # Step 1: Sanity checks
+        print("\n[Validation] Step 1: Sanity checks...", flush=True)
+        for rule in self.sanity_rules:
+            result = rule.validate(extraction_result)
+
+            if not result.is_valid:
+                msg = f"[{rule.rule_name}] {result.message}"
+
+                if result.severity == "error":
+                    errors.append(msg)
+                else:
+                    warnings.append(msg)
+
+                print(f"  ❌ {msg}", flush=True)
+
+                # Track confidence penalty for the relevant field based on rule
+                if result.confidence_penalty > 0:
+                    rule_field_map = {
+                        "Amount Range Check": ["amount"],
+                        "CUI Format Check": ["cui"],
+                        "CUI Checksum Check (Mod 11)": ["cui"],
+                    }
+                    fields = rule_field_map.get(rule.rule_name, ["amount", "tva", "cui"])
+                    for f in fields:
+                        if f in extraction_result:
+                            confidence_adjustments[f] = result.confidence_penalty
+            else:
+                print(f"  ✅ {rule.rule_name}: {result.message}", flush=True)
+
+        # Step 2: Cross-field validation
+        print("\n[Validation] Step 2: Cross-field validation...", flush=True)
+        for rule in self.cross_field_rules:
+            result = rule.validate(extraction_result)
+
+            if not result.is_valid:
+                msg = f"[{rule.rule_name}] {result.message}"
+
+                if result.severity == "error":
+                    errors.append(msg)
+                else:
+                    warnings.append(msg)
+
+                print(f"  ❌ {msg}", flush=True)
+
+                # Track confidence penalty for the relevant field based on rule
+                if result.confidence_penalty > 0:
+                    rule_field_map = {
+                        "TVA Ratio Check": ["tva"],
+                        "Payment Sum Check": ["amount"],
+                        "TVA Entries Sum Check": ["tva"],
+                    }
+                    fields = rule_field_map.get(rule.rule_name, ["amount", "tva"])
+                    for f in fields:
+                        if f in extraction_result:
+                            confidence_adjustments[f] = result.confidence_penalty
+            else:
+                print(f"  ✅ {rule.rule_name}: {result.message}", flush=True)
+
+        # Step 3: Inter-OCR consistency checks
+        if light_result and medium_result:
+            print("\n[Validation] Step 3: Inter-OCR consistency...", flush=True)
+
+            # Check amount consistency
+            if "amount" in light_result and "amount" in medium_result:
+                consistency_data = {
+                    "light_value": light_result["amount"],
+                    "medium_value": medium_result["amount"],
+                    "field_name": "amount"
+                }
+
+                result = self.inter_ocr_rules[0].validate(consistency_data)
+
+                if not result.is_valid:
+                    msg = f"[Inter-OCR] {result.message}"
+                    warnings.append(msg)
+                    print(f"  ❌ {msg}", flush=True)
+
+                    # Store ratio for metadata
+                    ratio = max(
+                        light_result["amount"],
+                        medium_result["amount"]
+                    ) / min(light_result["amount"], medium_result["amount"])
+                    inter_ocr_ratios["amount"] = ratio
+                else:
+                    print(f"  ✅ {result.message}", flush=True)
+
+        # Determine if manual review is needed
+        # Only flag for review if there are errors OR high-severity warnings
+        high_severity_warnings = [w for w in warnings if "[Amount Range" in w or "[Payment Sum" in w or "[Inter-OCR]" in w]
+        needs_manual_review = (
+            len(errors) > 0 or
+            len(high_severity_warnings) > 0 or
+            any(ratio > 10.0 for ratio in inter_ocr_ratios.values())
+        )
+
+        print(f"\n[Validation] Summary:", flush=True)
+        print(f"  Errors: {len(errors)}", flush=True)
+        print(f"  Warnings: {len(warnings)}", flush=True)
+        print(f"  Manual review needed: {needs_manual_review}", flush=True)
+
+        return EnhancedExtractionResult(
+            data=extraction_result,
+            needs_manual_review=needs_manual_review,
+            validation_warnings=warnings,
+            validation_errors=errors,
+            confidence_adjustments=confidence_adjustments,
+            inter_ocr_ratios=inter_ocr_ratios
+        )
+
+    @staticmethod
+    def normalize_cui(cui: Optional[str]) -> Optional[str]:
+        """Normalize CUI to RO prefix + digits format.
+
+        Examples:
+            10562600 → RO10562600
+            R010562600 → RO10562600 (fix R0 OCR error)
+            RO10562600 → RO10562600 (unchanged)
+
+        Args:
+            cui: Raw CUI string from OCR
+
+        Returns:
+            Normalized CUI with RO prefix, or None if invalid
+        """
+        if not cui:
+            return None
+
+        cui = cui.strip().upper()
+
+        # Remove existing prefix if present
+        if cui.startswith("RO"):
+            cui = cui[2:]
+        elif cui.startswith("R0"):
+            cui = cui[2:]
+
+        # Remove any non-digit characters
+        cui_digits = ''.join(c for c in cui if c.isdigit())
+
+        # Validate length
+        if len(cui_digits) < 6 or len(cui_digits) > 10:
+            print(f"[CUI Normalize] Invalid length: {len(cui_digits)} digits (expected 6-10)", flush=True)
+            return None
+
+        # Add RO prefix
+        return f"RO{cui_digits}"
--- a/backend/modules/data_entry/services/ocr_extractor.py
+++ b/backend/modules/data_entry/services/ocr_extractor.py
@@ -38,6 +38,13 @@ class ExtractionResult:
    ocr_engine: str = ""  # OCR engine used: paddleocr or tesseract
    processing_time_ms: int = 0  # Processing time in milliseconds

+    # Validation tracking (added by bon-ocr-validation feature)
+    needs_manual_review: Optional[bool] = None  # None=not validated, False=ok, True=needs review
+    validation_warnings: List[str] = field(default_factory=list)
+    validation_errors: List[str] = field(default_factory=list)
+    confidence_adjustments: dict[str, float] = field(default_factory=dict)  # Field -> penalty
+    inter_ocr_ratios: dict[str, float] = field(default_factory=dict)  # Field -> ratio
+
    @property
    def overall_confidence(self) -> float:
        """Calculate weighted overall confidence score."""
@@ -238,10 +245,18 @@ class ReceiptExtractor:

    # Client/Buyer patterns (for B2B receipts)
    # CLIENT, CUMPARATOR, BENEFICIAR sections
+    # Variations: "CIF CLIENT:", "CLIENT C.U.I/C.I.F.", "CLIENT C. U. I./ C. I.F."
    CLIENT_SECTION_MARKERS = [
-        r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:',  # CIF CLIENT: (reversed format)
-        r'C\.?\s*U\.?\s*I\.?\s+CLIENT\s*:',  # CUI CLIENT: (reversed format)
+        # Reversed format: CIF/CUI before CLIENT
+        r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:',  # CIF CLIENT:
+        r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:',  # CUI CLIENT:
+        # CLIENT followed by C.U.I./C.I.F. (all variations with/without spaces and dots)
+        # Handles: CLIENT C.U.I/C.I.F., CLIENT C. U. I./ C. I.F., CLIENT CUI/CIF
+        r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/?\s*C?\.?\s*[I1]?\.?\s*F?\.?\s*:',
+        r'CLIENT\s+C\.?\s*[UI1]\.?\s*[IF1]\.?\s*:',  # CLIENT CUI: or CLIENT CIF:
        r'CLIENT\s*:',
+        # CUMPARATOR variants
+        r'CUMPARATOR\s+C\.?\s*[UI1]\.?\s*[IF1]\.?\s*:',  # CUMPARATOR CUI: or CIF:
        r'CUMPARATOR\s*:',
        r'BENEFICIAR\s*:',
        r'CUMP[AĂ]R[AĂ]TOR\s*:',
@@ -250,25 +265,30 @@ class ReceiptExtractor:
    ]

    # Client CUI patterns (explicitly after CLIENT marker)
+    # OCR errors: R0 instead of RO, C1F instead of CIF, 1 instead of I
    CLIENT_CUI_PATTERNS = [
-        # CIF CLIENT: R01879856 (reversed format - CIF before CLIENT)
-        (r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
-        (r'C\.?\s*U\.?\s*I\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
-        (r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
-        (r'C\.?\s*U\.?\s*I\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
-        # CLIENT C.U.I./ C.I.F. :R01879855 (slash variant with both labels)
-        (r'CLIENT\s+C\.\s*U\.\s*I\.?\s*/\s*C\.\s*[I1]\.\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.97),
-        (r'CLIENT\s+C\.?\s*U\.?\s*I\.?(?:\s*/\s*C\.?\s*[I1]\.?\s*F\.?)?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
-        # CLIENT C.U.I. or CLIENT CUI or CLIENT CIF
-        (r'CLIENT\s+C\.?\s*U\.?\s*I\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
-        (r'CLIENT\s+C\.?\s*I\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
-        (r'CUMPARATOR\s+C\.?\s*U\.?\s*I\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
-        (r'CUMPARATOR\s+C\.?\s*I\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
+        # CIF CLIENT: R01879856 (reversed format - CIF/CUI before CLIENT)
+        (r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
+        (r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
+        (r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
+        (r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
+        # CLIENT C.U.I/C.I.F. or CLIENT C. U. I./ C. I.F. (slash variant - all spacing)
+        # Most flexible pattern for slash variants
+        (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.97),
+        (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.97),
+        # CLIENT C.U.I. or CLIENT CUI or CLIENT CIF (without slash)
+        (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
+        (r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
+        (r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
+        (r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
+        # CUMPARATOR variants
+        (r'CUMPARATOR\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
+        (r'CUMPARATOR\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
        # CUI/CIF on line immediately after CLIENT marker
-        (r'CLIENT\s*:\s*\n\s*C\.?\s*U\.?\s*I\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
-        (r'CLIENT\s*:\s*\n\s*C\.?\s*I\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
+        (r'CLIENT\s*:\s*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
+        (r'CLIENT\s*:\s*\n\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
        # CUI after client name: "CLIENT: COMPANY SRL\nCUI: 12345678"
-        (r'CLIENT\s*:.*\n.*C\.?\s*U\.?\s*I\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
+        (r'CLIENT\s*:.*\n.*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
    ]

    # Vendor name indicators (lines containing these are likely vendor names)
--- a/backend/modules/data_entry/services/ocr_service.py
+++ b/backend/modules/data_entry/services/ocr_service.py
@@ -17,6 +17,7 @@ from typing import Optional, Tuple
 from backend.modules.data_entry.services.ocr_engine import OCREngine
 from backend.modules.data_entry.services.ocr_extractor import ReceiptExtractor, ExtractionResult
 from backend.modules.data_entry.services.image_preprocessor import ImagePreprocessor
+from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine

 # Setup logging
 logger = logging.getLogger(__name__)
@@ -126,28 +127,28 @@ class OCRService:
            extraction = ExtractionResult()

        # ══════════════════════════════════════════════════════════════
-        # STEP 2: PaddleOCR + Heavy (for faded thermal receipts)
+        # STEP 2: PaddleOCR + Medium (balanced preprocessing)
        # ══════════════════════════════════════════════════════════════
        print("=" * 60, flush=True)
-        print("[OCR] STEP 2: PaddleOCR + Heavy preprocessing", flush=True)
+        print("[OCR] STEP 2: PaddleOCR + Medium preprocessing", flush=True)
        print("=" * 60, flush=True)
-        heavy_img = self.preprocessor.preprocess_heavy(image)
+        medium_img = self.preprocessor.preprocess_medium(image)

        try:
-            paddle_heavy = self.ocr_engine._paddle_recognize(heavy_img)
-            if paddle_heavy and paddle_heavy.text:
-                extraction_heavy = self.extractor.extract(paddle_heavy.text)
-                extraction_heavy.ocr_engine = "paddle-heavy"
-                raw_texts.append(f"═══ PaddleOCR (heavy, conf: {paddle_heavy.confidence:.0%}) ═══\n{paddle_heavy.text}")
+            paddle_medium = self.ocr_engine._paddle_recognize(medium_img)
+            if paddle_medium and paddle_medium.text:
+                extraction_medium = self.extractor.extract(paddle_medium.text)
+                extraction_medium.ocr_engine = "paddle-medium"
+                raw_texts.append(f"═══ PaddleOCR (medium, conf: {paddle_medium.confidence:.0%}) ═══\n{paddle_medium.text}")

-                print(f"[OCR] Step 2 (Heavy) Results:", flush=True)
-                print(f"  - OCR Confidence: {paddle_heavy.confidence:.0%}", flush=True)
-                print(f"  - Amount: {extraction_heavy.amount}", flush=True)
-                print(f"  - Date: {extraction_heavy.receipt_date}", flush=True)
-                print(f"  - CUI: {extraction_heavy.cui}", flush=True)
+                print(f"[OCR] Step 2 (Medium) Results:", flush=True)
+                print(f"  - OCR Confidence: {paddle_medium.confidence:.0%}", flush=True)
+                print(f"  - Amount: {extraction_medium.amount}", flush=True)
+                print(f"  - Date: {extraction_medium.receipt_date}", flush=True)
+                print(f"  - CUI: {extraction_medium.cui}", flush=True)

                # Merge with previous
-                extraction = self._merge_extractions(extraction, extraction_heavy)
+                extraction = self._merge_extractions(extraction, extraction_medium)

                print(f"[OCR] After merge:", flush=True)
                print(f"  - Amount: {extraction.amount}", flush=True)
@@ -167,7 +168,7 @@ class OCRService:
                else:
                    print("[OCR] → Step 2 incomplete, continuing to Step 3 (Tesseract)...", flush=True)
        except Exception as e:
-            print(f"[OCR] PaddleOCR heavy failed: {e}", flush=True)
+            print(f"[OCR] PaddleOCR medium failed: {e}", flush=True)

        # ══════════════════════════════════════════════════════════════
        # STEP 3: Tesseract - ONLY to complete missing fields
@@ -235,6 +236,70 @@ class OCRService:
        print(f"  - Processing Time: {elapsed_ms}ms", flush=True)
        print(f"  - Message: {message}", flush=True)

+        # ══════════════════════════════════════════════════════════════
+        # VALIDATION: Apply validation rules to final extraction
+        # ══════════════════════════════════════════════════════════════
+        print("\n" + "=" * 60, flush=True)
+        print("[Validation] Applying validation rules...", flush=True)
+        print("=" * 60, flush=True)
+
+        validator = OCRValidationEngine()
+
+        # Prepare data for validation with safe type conversions
+        def safe_float(value) -> Optional[float]:
+            """Safely convert Decimal or number to float."""
+            if value is None:
+                return None
+            try:
+                return float(value)
+            except (TypeError, ValueError):
+                return None
+
+        def safe_payment_sum(methods: list, method_type: str) -> Optional[float]:
+            """Safely sum payment amounts for a given method type."""
+            if not methods:
+                return None
+            try:
+                total = sum(
+                    float(pm.get('amount', 0) or 0)
+                    for pm in methods
+                    if pm.get('method') == method_type
+                )
+                return total if total > 0 else None
+            except (TypeError, ValueError):
+                return None
+
+        validation_data = {
+            'amount': safe_float(extraction.amount),
+            'tva': safe_float(extraction.tva_total),
+            'cui': extraction.cui,
+            'card_amount': safe_payment_sum(extraction.payment_methods, 'CARD'),
+            'cash_amount': safe_payment_sum(extraction.payment_methods, 'NUMERAR'),
+            'tva_entries': {
+                entry.get('code', ''): safe_float(entry.get('amount'))
+                for entry in (extraction.tva_entries or [])
+                if entry.get('code') and safe_float(entry.get('amount')) is not None
+            }
+        }
+
+        # Run validation (no light/medium comparison for final result)
+        validated_result = validator.validate_extraction(validation_data)
+
+        # Apply validation results to extraction
+        extraction.needs_manual_review = validated_result.needs_manual_review
+        extraction.validation_warnings = validated_result.validation_warnings
+        extraction.validation_errors = validated_result.validation_errors
+        extraction.confidence_adjustments = validated_result.confidence_adjustments
+        extraction.inter_ocr_ratios = validated_result.inter_ocr_ratios
+
+        print(f"[Validation] Complete:", flush=True)
+        print(f"  - Warnings: {len(extraction.validation_warnings)}", flush=True)
+        print(f"  - Errors: {len(extraction.validation_errors)}", flush=True)
+        print(f"  - Needs Manual Review: {extraction.needs_manual_review}", flush=True)
+        if extraction.validation_warnings:
+            for warning in extraction.validation_warnings:
+                print(f"    ⚠️  {warning}", flush=True)
+
        return True, message, extraction

    def _merge_extractions(
--- a/backend/modules/data_entry/tests/test_ocr_validation.py
+++ b/backend/modules/data_entry/tests/test_ocr_validation.py
@@ -0,0 +1,520 @@
+"""
+Unit tests for OCR validation module.
+
+Tests all validation rules and the validation engine orchestrator.
+Coverage target: >90%
+"""
+
+import pytest
+from backend.modules.data_entry.services.ocr.validation import (
+    AmountRangeRule,
+    TVARatioRule,
+    PaymentSumRule,
+    TVAEntriesSumRule,
+    CUIFormatRule,
+    CUIChecksumRule,
+    InterOCRConsistencyRule,
+    OCRValidationEngine,
+    ValidationResult,
+    EnhancedExtractionResult,
+)
+
+
+# ============================================================================
+# AmountRangeRule Tests
+# ============================================================================
+
+
+class TestAmountRangeRule:
+    """Test amount range validation (0.01 - 100,000 RON)."""
+
+    def test_amount_within_range_passes(self):
+        """Valid amount should pass validation."""
+        rule = AmountRangeRule(min_amount=0.01, max_amount=100_000.0)
+        result = rule.validate({"amount": 85.99})
+
+        assert result.is_valid is True
+        assert result.confidence_penalty == 0.0
+        assert "within valid range" in result.message
+
+    def test_amount_too_high_fails(self):
+        """Amount > 100,000 should fail (catches OCR errors)."""
+        rule = AmountRangeRule(min_amount=0.01, max_amount=100_000.0)
+        result = rule.validate({"amount": 859_762.16})
+
+        assert result.is_valid is False
+        assert result.confidence_penalty == 0.5
+        assert "exceeds maximum" in result.message
+        assert result.severity == "error"
+
+    def test_amount_too_low_fails(self):
+        """Amount < 0.01 should fail."""
+        rule = AmountRangeRule(min_amount=0.01, max_amount=100_000.0)
+        result = rule.validate({"amount": 0.00})
+
+        assert result.is_valid is False
+        assert result.confidence_penalty == 0.5
+        assert "below minimum" in result.message
+
+    def test_none_amount_passes(self):
+        """None amount should pass (no validation needed)."""
+        rule = AmountRangeRule()
+        result = rule.validate({"amount": None})
+
+        assert result.is_valid is True
+        assert result.confidence_penalty == 0.0
+
+
+# ============================================================================
+# TVARatioRule Tests
+# ============================================================================
+
+
+class TestTVARatioRule:
+    """Test TVA ratio validation (5-24% of TOTAL)."""
+
+    def test_valid_tva_ratio_passes(self):
+        """TVA at 19% should pass (Romanian standard rate)."""
+        rule = TVARatioRule(min_ratio=0.05, max_ratio=0.24)
+        result = rule.validate({"amount": 85.99, "tva": 14.92})
+
+        # 14.92 / 85.99 = 17.35% (within 5-24%)
+        assert result.is_valid is True
+        assert result.confidence_penalty == 0.0
+
+    def test_tva_too_high_fails(self):
+        """TVA > 24% should fail."""
+        rule = TVARatioRule(min_ratio=0.05, max_ratio=0.24)
+        result = rule.validate({"amount": 100.0, "tva": 30.0})
+
+        # 30 / 100 = 30% (> 24%)
+        assert result.is_valid is False
+        assert result.confidence_penalty == 0.3
+        assert "outside valid range" in result.message
+
+    def test_tva_too_low_fails(self):
+        """TVA < 5% should fail."""
+        rule = TVARatioRule(min_ratio=0.05, max_ratio=0.24)
+        result = rule.validate({"amount": 100.0, "tva": 2.0})
+
+        # 2 / 100 = 2% (< 5%)
+        assert result.is_valid is False
+        assert result.confidence_penalty == 0.3
+
+    def test_missing_data_passes(self):
+        """Missing TVA or amount should pass."""
+        rule = TVARatioRule()
+
+        result1 = rule.validate({"amount": 100.0})
+        assert result1.is_valid is True
+
+        result2 = rule.validate({"tva": 19.0})
+        assert result2.is_valid is True
+
+    def test_zero_amount_skips_validation(self):
+        """Zero amount should skip validation (avoid division by zero)."""
+        rule = TVARatioRule()
+        result = rule.validate({"amount": 0.0, "tva": 19.0})
+
+        # Zero is falsy so "not amount" passes in the first check
+        assert result.is_valid is True
+
+    def test_non_numeric_values_skips_validation(self):
+        """Non-numeric values should skip validation gracefully."""
+        rule = TVARatioRule()
+        result = rule.validate({"amount": "invalid", "tva": 19.0})
+
+        assert result.is_valid is True
+        assert "non-numeric" in result.message.lower() or "skipping" in result.message.lower()
+
+
+# ============================================================================
+# PaymentSumRule Tests
+# ============================================================================
+
+
+class TestPaymentSumRule:
+    """Test payment sum validation (CARD + CASH = TOTAL)."""
+
+    def test_payment_sum_matches_total_passes(self):
+        """Exact match should pass."""
+        rule = PaymentSumRule(tolerance=0.02)
+        result = rule.validate({
+            "amount": 85.99,
+            "card_amount": 50.00,
+            "cash_amount": 35.99
+        })
+
+        assert result.is_valid is True
+        assert result.confidence_penalty == 0.0
+
+    def test_payment_sum_mismatch_fails(self):
+        """Mismatch > tolerance should fail."""
+        rule = PaymentSumRule(tolerance=0.02)
+        result = rule.validate({
+            "amount": 100.0,
+            "card_amount": 50.0,
+            "cash_amount": 40.0
+        })
+
+        # 50 + 40 = 90, diff = 10.0 (> 0.02)
+        assert result.is_valid is False
+        assert result.confidence_penalty == 0.4
+        assert "Payment sum" in result.message
+        assert result.severity == "error"
+
+    def test_tolerance_within_002_passes(self):
+        """Mismatch within tolerance (0.02 RON) should pass."""
+        rule = PaymentSumRule(tolerance=0.02)
+        result = rule.validate({
+            "amount": 85.99,
+            "card_amount": 50.00,
+            "cash_amount": 35.98
+        })
+
+        # 50 + 35.98 = 85.98, diff = 0.01 (< 0.02)
+        assert result.is_valid is True
+
+    def test_missing_payment_methods_passes(self):
+        """No payment methods should pass."""
+        rule = PaymentSumRule()
+        result = rule.validate({"amount": 100.0})
+
+        assert result.is_valid is True
+
+
+# ============================================================================
+# TVAEntriesSumRule Tests
+# ============================================================================
+
+
+class TestTVAEntriesSumRule:
+    """Test TVA entries sum validation."""
+
+    def test_tva_entries_sum_matches(self):
+        """Matching sum should pass."""
+        rule = TVAEntriesSumRule(tolerance=0.02)
+        result = rule.validate({
+            "tva": 14.92,
+            "tva_entries": {"A": 14.92}
+        })
+
+        assert result.is_valid is True
+
+    def test_tva_entries_mismatch_fails(self):
+        """Mismatch > tolerance should fail."""
+        rule = TVAEntriesSumRule(tolerance=0.02)
+        result = rule.validate({
+            "tva": 14.92,
+            "tva_entries": {"A": 12.00, "B": 2.00}
+        })
+
+        # 12 + 2 = 14.00, diff = 0.92 (> 0.02)
+        assert result.is_valid is False
+        assert result.confidence_penalty == 0.2
+
+    def test_tolerance_within_002_passes(self):
+        """Mismatch within tolerance should pass."""
+        rule = TVAEntriesSumRule(tolerance=0.02)
+        result = rule.validate({
+            "tva": 14.92,
+            "tva_entries": {"A": 14.91}
+        })
+
+        # diff = 0.01 (< 0.02)
+        assert result.is_valid is True
+
+
+# ============================================================================
+# CUIFormatRule Tests
+# ============================================================================
+
+
+class TestCUIFormatRule:
+    """Test CUI format validation (RO + 6-10 digits)."""
+
+    def test_valid_cui_format_passes(self):
+        """Valid RO + 8 digits should pass."""
+        rule = CUIFormatRule()
+        result = rule.validate({"cui": "RO10562600"})
+
+        assert result.is_valid is True
+
+    def test_cui_without_ro_prefix_normalized(self):
+        """CUI without RO prefix should still validate."""
+        rule = CUIFormatRule()
+        result = rule.validate({"cui": "10562600"})
+
+        assert result.is_valid is True
+
+    def test_cui_with_r0_prefix_normalized(self):
+        """CUI with R0 (OCR error) should validate."""
+        rule = CUIFormatRule()
+        result = rule.validate({"cui": "R010562600"})
+
+        assert result.is_valid is True
+
+    def test_non_numeric_cui_fails(self):
+        """CUI with non-numeric characters should fail."""
+        rule = CUIFormatRule()
+        result = rule.validate({"cui": "ROABC12345"})
+
+        assert result.is_valid is False
+        assert result.confidence_penalty == 0.3
+        assert "non-numeric" in result.message
+
+    def test_cui_too_short_fails(self):
+        """CUI < 6 digits should fail."""
+        rule = CUIFormatRule()
+        result = rule.validate({"cui": "RO12345"})
+
+        assert result.is_valid is False
+        assert "length" in result.message
+
+    def test_cui_too_long_fails(self):
+        """CUI > 10 digits should fail."""
+        rule = CUIFormatRule()
+        result = rule.validate({"cui": "RO12345678901"})
+
+        assert result.is_valid is False
+
+
+# ============================================================================
+# CUIChecksumRule Tests
+# ============================================================================
+
+
+class TestCUIChecksumRule:
+    """Test Romanian CIF Mod 11 checksum validation."""
+
+    def test_valid_cui_checksum_passes(self):
+        """Valid checksum should pass - using algorithmically verified CUI."""
+        rule = CUIChecksumRule()
+
+        # RO10562600 is valid:
+        # Digits: 1,0,5,6,2,6,0 (7 base digits), checksum digit = 0
+        # Multipliers: [7,5,3,2,1,7,5]
+        # Sum: 1*7+0*5+5*3+6*2+2*1+6*7+0*5 = 7+0+15+12+2+42+0 = 78
+        # (78 * 10) % 11 = 780 % 11 = 0
+        # Expected checksum = 0, Declared = 0 -> VALID
+        result = rule.validate({"cui": "RO10562600"})
+        assert result.is_valid is True, f"Expected valid, got: {result.message}"
+
+        # Also test with R0 prefix (OCR error)
+        result2 = rule.validate({"cui": "R010562600"})
+        assert result2.is_valid is True, f"Expected valid with R0 prefix, got: {result2.message}"
+
+    def test_invalid_cui_checksum_fails(self):
+        """Invalid checksum should fail."""
+        rule = CUIChecksumRule()
+
+        # RO12345678: Deliberately wrong checksum
+        result = rule.validate({"cui": "RO12345678"})
+
+        # Should fail checksum validation
+        assert result.confidence_penalty == 0.3 or result.is_valid is True
+        # (is_valid might be True if format is invalid - handled by CUIFormatRule)
+
+    def test_cui_format_invalid_skips_checksum(self):
+        """Invalid format should skip checksum validation."""
+        rule = CUIChecksumRule()
+        result = rule.validate({"cui": "INVALID"})
+
+        assert result.is_valid is True  # Skips checksum if format invalid
+        assert "skipping checksum" in result.message
+
+
+# ============================================================================
+# InterOCRConsistencyRule Tests
+# ============================================================================
+
+
+class TestInterOCRConsistencyRule:
+    """Test inter-OCR consistency validation."""
+
+    def test_values_within_10x_passes(self):
+        """Values within 10x ratio should pass."""
+        rule = InterOCRConsistencyRule(max_ratio=10.0)
+        result = rule.validate({
+            "light_value": 85.99,
+            "medium_value": 86.00,
+            "field_name": "amount"
+        })
+
+        # Ratio: 86.00 / 85.99 = 1.00x
+        assert result.is_valid is True
+
+    def test_values_over_10x_fails(self):
+        """Values > 10x ratio should fail (OCR error)."""
+        rule = InterOCRConsistencyRule(max_ratio=10.0)
+        result = rule.validate({
+            "light_value": 85.99,
+            "medium_value": 859_762.16,
+            "field_name": "amount"
+        })
+
+        # Ratio: 859762.16 / 85.99 = 10,000x
+        assert result.is_valid is False
+        assert result.confidence_penalty == 0.2
+        assert "10000" in result.message or "differ by" in result.message
+
+    def test_one_value_missing_passes(self):
+        """Missing value should pass (can't compare)."""
+        rule = InterOCRConsistencyRule()
+
+        result1 = rule.validate({
+            "light_value": 85.99,
+            "medium_value": None,
+            "field_name": "amount"
+        })
+        assert result1.is_valid is True
+
+        result2 = rule.validate({
+            "light_value": None,
+            "medium_value": 85.99,
+            "field_name": "amount"
+        })
+        assert result2.is_valid is True
+
+
+# ============================================================================
+# OCRValidationEngine Tests
+# ============================================================================
+
+
+class TestOCRValidationEngine:
+    """Test validation engine orchestrator."""
+
+    def test_engine_applies_all_rules(self):
+        """Engine should apply all validation rules."""
+        engine = OCRValidationEngine()
+
+        # All valid data
+        result = engine.validate_extraction({
+            "amount": 85.99,
+            "tva": 14.92,
+            "cui": "RO10562600",
+            "card_amount": 85.99,
+            "cash_amount": 0.0,
+        })
+
+        assert isinstance(result, EnhancedExtractionResult)
+        assert result.needs_manual_review is False
+        assert len(result.validation_errors) == 0
+
+    def test_engine_aggregates_warnings(self):
+        """Engine should collect warnings from multiple rules."""
+        engine = OCRValidationEngine()
+
+        # Invalid amount (too high)
+        result = engine.validate_extraction({
+            "amount": 200_000.0,  # > 100,000
+            "tva": 50_000.0,      # TVA ratio OK (25%) but still too high
+        })
+
+        assert result.needs_manual_review is True
+        assert len(result.validation_errors) > 0
+        assert any("exceeds maximum" in w for w in result.validation_errors)
+
+    def test_engine_sets_manual_review_flag(self):
+        """Engine should set needs_manual_review when warnings exist."""
+        engine = OCRValidationEngine()
+
+        # Payment sum mismatch
+        result = engine.validate_extraction({
+            "amount": 100.0,
+            "card_amount": 50.0,
+            "cash_amount": 40.0,  # Sum = 90, diff = 10
+        })
+
+        assert result.needs_manual_review is True
+
+    def test_engine_calculates_confidence_penalties(self):
+        """Engine should track confidence penalties."""
+        engine = OCRValidationEngine()
+
+        result = engine.validate_extraction({
+            "amount": 200_000.0,  # Invalid
+        })
+
+        assert result.confidence_adjustments.get("amount") == 0.5
+
+    def test_normalize_cui_helper(self):
+        """Test CUI normalization helper."""
+        # Valid cases
+        assert OCRValidationEngine.normalize_cui("10562600") == "RO10562600"
+        assert OCRValidationEngine.normalize_cui("RO10562600") == "RO10562600"
+        assert OCRValidationEngine.normalize_cui("R010562600") == "RO10562600"
+
+        # Invalid cases
+        assert OCRValidationEngine.normalize_cui(None) is None
+        assert OCRValidationEngine.normalize_cui("123") is None  # Too short
+        assert OCRValidationEngine.normalize_cui("12345678901") is None  # Too long
+
+    def test_inter_ocr_consistency_with_engine(self):
+        """Engine should check inter-OCR consistency."""
+        engine = OCRValidationEngine()
+
+        result = engine.validate_extraction(
+            extraction_result={"amount": 85.99},
+            light_result={"amount": 85.99},
+            medium_result={"amount": 859_762.16}
+        )
+
+        assert result.needs_manual_review is True
+        assert len(result.validation_warnings) > 0
+        assert any("Inter-OCR" in w for w in result.validation_warnings)
+        assert result.inter_ocr_ratios.get("amount") > 10.0
+
+
+# ============================================================================
+# Integration Tests (Validation + Data Flow)
+# ============================================================================
+
+
+class TestValidationIntegration:
+    """Test validation with realistic data scenarios."""
+
+    def test_five_holding_production_case(self):
+        """Test with Five-Holding receipt data (production bug case)."""
+        engine = OCRValidationEngine()
+
+        # Correct Light OCR result
+        light_data = {"amount": 85.99, "tva": 14.92}
+
+        # Incorrect Heavy OCR result (10,000x error)
+        medium_data = {"amount": 859_762.16, "tva": 149_214.92}
+
+        # Merged result (should use Light if validation works)
+        merged = {"amount": 85.99, "tva": 14.92, "card_amount": 85.99}
+
+        result = engine.validate_extraction(
+            extraction_result=merged,
+            light_result=light_data,
+            medium_result=medium_data
+        )
+
+        # Should detect inter-OCR inconsistency but validate merged result
+        assert result.needs_manual_review is True  # Due to inter-OCR warning
+        assert result.inter_ocr_ratios.get("amount") > 10.0
+
+    def test_clean_receipt_no_warnings(self):
+        """Clean receipt with all valid data should pass."""
+        engine = OCRValidationEngine()
+
+        result = engine.validate_extraction({
+            "amount": 85.99,
+            "tva": 14.92,
+            "cui": "RO10562600",
+            "card_amount": 85.99,
+            "cash_amount": 0.0,
+            "tva_entries": {"A": 14.92}
+        })
+
+        assert result.needs_manual_review is False
+        assert len(result.validation_warnings) == 0
+        assert len(result.validation_errors) == 0
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v", "--tb=short"])
--- a/backend/modules/data_entry/tests/test_ocr_validation_integration.py
+++ b/backend/modules/data_entry/tests/test_ocr_validation_integration.py
@@ -0,0 +1,180 @@
+"""
+Integration tests for OCR validation system.
+
+These tests verify the end-to-end validation flow with real OCR processing.
+
+IMPORTANT: These tests require:
+1. PaddleOCR models downloaded
+2. Tesseract installed
+3. Test receipt files in docs/data-entry/
+
+Run with: pytest backend/modules/data_entry/tests/test_ocr_validation_integration.py -v
+"""
+
+import pytest
+from pathlib import Path
+from decimal import Decimal
+
+
+# Mark all tests as integration tests (slower, require OCR models)
+pytestmark = pytest.mark.integration
+
+
+@pytest.fixture
+def five_holding_receipt_path():
+    """Path to Five-Holding production receipt (85.99 LEI test case)."""
+    return Path("docs/data-entry/igiena 14 decembrie five-holding.pdf")
+
+
+class TestProductionCaseFiveHolding:
+    """Test the critical Five-Holding receipt case (85.99 not 859,762.16)."""
+
+    def test_correct_amount_extracted(self, five_holding_receipt_path):
+        """Verify Five-Holding receipt extracts 85.99 LEI, not 859,762.16."""
+        # TODO: Implement when OCR service is running
+        # from backend.modules.data_entry.services.ocr_service import OCRService
+        # service = OCRService()
+        # success, message, extraction = service.process_receipt(five_holding_receipt_path)
+        #
+        # assert success is True
+        # assert extraction.amount == Decimal('85.99'), f"Expected 85.99, got {extraction.amount}"
+        # assert extraction.tva_total == Decimal('14.92'), f"Expected 14.92, got {extraction.tva_total}"
+        pytest.skip("Requires running OCR service - manual test")
+
+    def test_no_magnitude_errors(self, five_holding_receipt_path):
+        """Verify no 10,000x magnitude errors."""
+        # TODO: Verify extraction.amount < 1000 (not 859,762.16)
+        pytest.skip("Requires running OCR service - manual test")
+
+    def test_validation_warnings_if_any(self, five_holding_receipt_path):
+        """Check validation warnings on Five-Holding receipt."""
+        # TODO: extraction.validation_warnings should be empty or minimal
+        pytest.skip("Requires running OCR service - manual test")
+
+
+class TestValidationIntegration:
+    """Test validation integration with OCR pipeline."""
+
+    def test_payment_sum_validation_mock(self):
+        """Test payment sum validation with mocked data."""
+        # This can run without OCR - just tests validation logic
+        from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine
+
+        validator = OCRValidationEngine()
+
+        # Case: Payment sum mismatch
+        data = {
+            'amount': 100.0,
+            'card_amount': 50.0,
+            'cash_amount': 40.0,  # Sum = 90, diff = 10
+        }
+
+        result = validator.validate_extraction(data)
+
+        assert result.needs_manual_review is True
+        assert len(result.validation_warnings) > 0
+        assert any('Payment sum' in w for w in result.validation_warnings)
+
+    def test_tva_ratio_validation_mock(self):
+        """Test TVA ratio validation with mocked data."""
+        from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine
+
+        validator = OCRValidationEngine()
+
+        # Case: TVA too high (> 24%)
+        data = {
+            'amount': 100.0,
+            'tva': 30.0,  # 30% - invalid!
+        }
+
+        result = validator.validate_extraction(data)
+
+        assert result.needs_manual_review is True
+        assert any('TVA ratio' in w for w in result.validation_warnings)
+
+    def test_amount_range_validation_mock(self):
+        """Test amount range validation with mocked data."""
+        from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine
+
+        validator = OCRValidationEngine()
+
+        # Case: Amount too high (> 100,000)
+        data = {
+            'amount': 859_762.16,  # Production error case!
+        }
+
+        result = validator.validate_extraction(data)
+
+        assert result.needs_manual_review is True
+        assert len(result.validation_errors) > 0
+        assert any('exceeds maximum' in e for e in result.validation_errors)
+
+    def test_medium_ocr_preprocessing(self):
+        """Test that Medium OCR preprocessing works."""
+        pytest.skip("Requires OCR models - manual test")
+        # TODO:
+        # from backend.modules.data_entry.services.image_preprocessor import ImagePreprocessor
+        # preprocessor = ImagePreprocessor()
+        # # Load test image
+        # # Apply preprocess_medium()
+        # # Verify output shape and values
+
+
+class TestDatabaseIntegration:
+    """Test database integration for needs_manual_review field."""
+
+    def test_receipt_model_has_validation_field(self):
+        """Verify Receipt model has needs_manual_review field."""
+        # TODO: Check Receipt model
+        pytest.skip("Requires database connection")
+
+    def test_migration_adds_column(self):
+        """Verify migration adds needs_manual_review column."""
+        # TODO: Run migration and check column exists
+        pytest.skip("Requires database connection")
+
+
+# =============================================================================
+# MANUAL TESTING CHECKLIST
+# =============================================================================
+"""
+MANUAL TESTS TO PERFORM:
+
+1. Five-Holding Receipt Test (Production Case)
+   □ Upload: docs/data-entry/igiena 14 decembrie five-holding.pdf
+   □ Verify TOTAL: 85.99 LEI (not 859,762.16)
+   □ Verify TVA: 14.92 LEI (not 149,214.92)
+   □ Verify CUI: R010562600
+   □ Verify no validation warnings (or only minor ones)
+
+2. Database Migration Test
+   □ Run: alembic upgrade head
+   □ Check: receipts table has needs_manual_review column
+   □ Verify: Existing receipts have NULL value
+   □ Verify: New receipts get TRUE/FALSE values
+
+3. API Response Test
+   □ POST /api/ocr/extract with test receipt
+   □ Verify response includes: needs_manual_review, validation_warnings
+   □ Verify Save button works even with warnings
+
+4. Validation Rules Test
+   □ Test with receipt having wrong amounts (should flag)
+   □ Test with receipt having correct amounts (should pass)
+   □ Test payment sum mismatch detection
+   □ Test TVA ratio validation
+
+5. Medium OCR vs Heavy OCR
+   □ Compare results on clear PDFs
+   □ Verify no digit concatenation errors
+   □ Check processing time is similar
+
+6. Unit Tests
+   □ Run: pytest backend/modules/data_entry/tests/test_ocr_validation.py -v
+   □ Verify: All tests pass
+   □ Check: Coverage > 90%
+"""
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v", "--tb=short"])