feat(ocr): Add validation system and CLIENT CUI extraction

OCR Data Extraction Validation System:
- Add 7 validation rules (amount range, TVA ratio, payment sum, etc.)
- Add Medium preprocessing to replace Heavy (fixes digit concatenation)
- Add validation warnings to API responses
- Flag receipts needing manual review (needs_manual_review field)
- Add database migration for needs_manual_review column

CLIENT CUI Extraction Improvements:
- Support all format variations: CIF CLIENT:, CLIENT C.U.I/C.I.F., etc.
- Handle OCR errors (R0 vs RO, C1F vs CIF)
- Add client_name, client_cui, client_address to API response
- Add validation fields to API response (was missing)

QA Review: 12 issues found, 9 fixed (5 errors + 4 warnings)
- Fixed type safety in validation rules
- Fixed ZeroDivisionError risk
- Fixed schema mismatch (Optional[bool] for needs_manual_review)
- All 37 unit tests passing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-30 19:12:52 +02:00
parent ce85e0643b
commit ab160b628d
14 changed files with 4161 additions and 33 deletions

View File

@@ -0,0 +1,40 @@
"""Add needs_manual_review flag to receipts table.
Revision ID: 20251230_needs_manual_review
Revises: 20251216_payment_mode
Create Date: 2025-12-30
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = '20251230_needs_manual_review'
down_revision = '20251216_payment_mode'
branch_labels = None
depends_on = None
def upgrade() -> None:
    """Add the nullable ``needs_manual_review`` boolean to ``receipts``.

    The flag records the outcome of OCR extraction validation:

    - NULL  = receipt has not been validated (rows that pre-date the feature)
    - FALSE = validated, no supervisor review required
    - TRUE  = validated, supervisor review required
    """
    # Deliberately no server default / backfill: existing rows must stay
    # NULL so they remain distinguishable from receipts that were validated.
    # Only receipts written after this migration carry TRUE/FALSE.
    review_flag = sa.Column('needs_manual_review', sa.Boolean(), nullable=True)

    with op.batch_alter_table('receipts', schema=None) as batch_op:
        batch_op.add_column(review_flag)
def downgrade() -> None:
    """Revert the migration by dropping ``needs_manual_review`` from ``receipts``."""
    receipts_ops = op.batch_alter_table('receipts', schema=None)
    with receipts_ops as batch_op:
        batch_op.drop_column('needs_manual_review')

View File

@@ -118,13 +118,23 @@ async def extract_from_image(file: UploadFile = File(...)):
items_count=result.items_count,
payment_methods=payment_methods_list,
suggested_payment_mode=suggested_payment_mode,
# Client data (B2B receipts)
client_name=result.client_name,
client_cui=result.client_cui,
client_address=result.client_address,
confidence_amount=result.confidence_amount,
confidence_date=result.confidence_date,
confidence_vendor=result.confidence_vendor,
confidence_client=result.confidence_client,
overall_confidence=result.overall_confidence,
raw_text=result.raw_text,
ocr_engine=result.ocr_engine,
processing_time_ms=result.processing_time_ms,
# Validation results
needs_manual_review=result.needs_manual_review,
validation_warnings=result.validation_warnings,
validation_errors=result.validation_errors,
inter_ocr_ratios=result.inter_ocr_ratios,
)
return OCRResponse(success=True, message=message, data=data)
@@ -206,13 +216,23 @@ async def extract_from_attachment(
items_count=result.items_count,
payment_methods=payment_methods_list,
suggested_payment_mode=suggested_payment_mode,
# Client data (B2B receipts)
client_name=result.client_name,
client_cui=result.client_cui,
client_address=result.client_address,
confidence_amount=result.confidence_amount,
confidence_date=result.confidence_date,
confidence_vendor=result.confidence_vendor,
confidence_client=result.confidence_client,
overall_confidence=result.overall_confidence,
raw_text=result.raw_text,
ocr_engine=result.ocr_engine,
processing_time_ms=result.processing_time_ms,
# Validation results
needs_manual_review=result.needs_manual_review,
validation_warnings=result.validation_warnings,
validation_errors=result.validation_errors,
inter_ocr_ratios=result.inter_ocr_ratios,
)
return OCRResponse(success=True, message=message, data=data)

View File

@@ -20,6 +20,15 @@ class PaymentMethod(BaseModel):
amount: Decimal = Field(description="Amount paid")
class ValidationWarning(BaseModel):
    """One structured finding reported by OCR extraction validation."""

    # Required fields: which value was checked, by which rule, and what was found.
    field: str = Field(..., description="Field name (e.g., 'amount', 'tva_total')")
    rule: str = Field(..., description="Rule name (e.g., 'amount_range', 'tva_ratio')")
    message: str = Field(..., description="Human-readable warning message")
    severity: str = Field(..., description="Severity: 'info', 'warning', 'error'")
    # Optional: omitted (None) when the rule has no correction to propose.
    suggested_value: Optional[str] = Field(None, description="Suggested corrected value")
class ExtractionData(BaseModel):
"""Extracted receipt data from OCR."""
@@ -56,6 +65,13 @@ class ExtractionData(BaseModel):
ocr_engine: str = Field(default="", description="OCR engine used: paddleocr or tesseract")
processing_time_ms: int = Field(default=0, ge=0, description="Processing time in milliseconds")
# Validation results (added by bon-ocr-validation feature)
# needs_manual_review: None = not validated yet (old receipts), False = no review needed, True = needs review
needs_manual_review: Optional[bool] = Field(default=None, description="Flag for supervisor review (None=not validated, False=ok, True=needs review)")
validation_warnings: List[str] = Field(default=[], description="Validation warnings")
validation_errors: List[str] = Field(default=[], description="Validation errors")
inter_ocr_ratios: dict[str, float] = Field(default={}, description="Inter-OCR consistency ratios")
class Config:
"""Pydantic config."""
json_schema_extra = {

View File

@@ -104,10 +104,80 @@ class ImagePreprocessor:
# NO binarization, NO morphological ops - preserve original quality
return enhanced
def preprocess_medium(self, image: np.ndarray) -> np.ndarray:
"""
Medium preprocessing for MIXED-QUALITY images.
Balance between Light (too gentle) and Heavy (too aggressive).
Use cases:
- Moderately faded receipts
- Photos with uneven lighting
- Scans with slight blur
Preprocessing steps:
- Moderate contrast enhancement (CLAHE clipLimit=2.0)
- Light denoising (fastNlMeansDenoising h=6)
- Gentle sharpening
- NO binarization (preserves text boundaries)
- NO morphological operations (avoids digit concatenation)
This method was created to replace preprocess_heavy() which caused
digit concatenation errors on high-quality PDFs (85.99 → 859,762.16).
"""
# 0. Add safety padding to protect edge content during deskew rotation
image = self._add_safety_padding(image)
# 1. Grayscale
if len(image.shape) == 3:
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
else:
gray = image.copy()
# 2a. Scale DOWN if any side exceeds 4000px (PaddleOCR limit)
height, width = gray.shape
max_side = max(height, width)
if max_side > 4000:
scale = 4000 / max_side
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
height, width = gray.shape
# 2b. Scale UP if too small
if width < 1500:
scale = 1500 / width
# Ensure we don't exceed 4000px after upscaling
new_width = int(width * scale)
new_height = int(height * scale)
if max(new_width, new_height) > 4000:
scale = 4000 / max(new_width, new_height)
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
# 3. Deskew
gray = self._deskew(gray)
# 4. Moderate contrast enhancement (CLAHE clipLimit=2.0)
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
enhanced = clahe.apply(gray)
# 5. Light denoising (less aggressive than Heavy)
denoised = cv2.fastNlMeansDenoising(enhanced, h=6, templateWindowSize=7, searchWindowSize=15)
# 6. Gentle sharpening
gaussian = cv2.GaussianBlur(denoised, (0, 0), 1.0)
sharpened = cv2.addWeighted(denoised, 1.3, gaussian, -0.3, 0)
# NO binarization, NO morphological operations
# This preserves text boundaries and avoids digit concatenation
return sharpened
def preprocess_heavy(self, image: np.ndarray) -> np.ndarray:
"""
Heavy preprocessing for FADED thermal receipts.
Aggressive binarization to recover faded text.
⚠️ DEPRECATED: Use preprocess_medium() instead.
Heavy preprocessing causes digit concatenation on clear PDFs
(e.g., 85.99 → 859,762.16 due to binarization + morphological operations).
Kept for backward compatibility only.
"""
# 0. Add safety padding to protect edge content during deskew rotation
image = self._add_safety_padding(image)

View File

@@ -0,0 +1,737 @@
"""
OCR Data Validation Module
Provides multi-layer validation for OCR extraction results to prevent
incorrect data from entering the system.
Validation Layers:
1. Absolute sanity checks (value ranges)
2. Cross-field validation (correlation between fields)
3. Inter-OCR consistency (compare multiple OCR results)
4. Auto-correction (fix obvious errors)
Usage:
engine = OCRValidationEngine()
validated_result = engine.validate_extraction(
merged_result,
light_ocr_result,
medium_ocr_result
)
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any, Optional
@dataclass
class ValidationResult:
    """Outcome of running one validation rule.

    Attributes:
        is_valid: True when the checked data passed the rule.
        confidence_penalty: Penalty for the confidence score, within
            0.0-1.0 inclusive (0.0 = no penalty, 1.0 = complete rejection).
        message: Human-readable explanation of the outcome.
        severity: One of "info", "warning" or "error".
    """

    is_valid: bool
    confidence_penalty: float = 0.0
    message: str = ""
    severity: str = "info"  # "info" | "warning" | "error"

    def __post_init__(self):
        """Reject a penalty that lies outside the 0.0-1.0 range."""
        if 0.0 <= self.confidence_penalty <= 1.0:
            return
        raise ValueError(f"Confidence penalty must be 0.0-1.0, got {self.confidence_penalty}")
class ValidationRule(ABC):
    """Interface every OCR validation rule implements.

    A concrete rule performs one check and reports the outcome as a
    ValidationResult (pass/fail plus an optional confidence penalty).
    """

    @property
    @abstractmethod
    def rule_name(self) -> str:
        """Human-readable name of this validation rule."""

    @abstractmethod
    def validate(self, data: dict[str, Any]) -> ValidationResult:
        """Run the rule against extraction data.

        Args:
            data: Extraction fields to check, keyed by field name.
                Example: {"amount": 85.99, "tva": 14.92, ...}

        Returns:
            ValidationResult carrying the is_valid flag and optional penalty.
        """
# ============================================================================
# VALIDATION RULES
# ============================================================================
class AmountRangeRule(ValidationRule):
    """Validate amount is within reasonable bounds for Romanian receipts.

    Romanian receipts rarely exceed 100,000 RON. This catches obvious
    OCR errors like digit concatenation (85.99 → 859,762.16).

    The amount may be any value ``float()`` accepts (int, float, Decimal,
    numeric string). Values that cannot be converted are skipped rather
    than raising, matching how TVARatioRule treats non-numeric input.

    Example:
        rule = AmountRangeRule(min_amount=0.01, max_amount=100_000.0)
        result = rule.validate({"amount": 859762.16})
        # result.is_valid = False, penalty = 0.5
    """

    def __init__(self, min_amount: float = 0.01, max_amount: float = 100_000.0):
        self.min_amount = min_amount
        self.max_amount = max_amount

    @property
    def rule_name(self) -> str:
        return "Amount Range Check"

    def validate(self, data: dict[str, Any]) -> ValidationResult:
        """Check ``data["amount"]`` against the configured bounds.

        Returns:
            A valid result when the amount is absent, not convertible to a
            number, or within [min_amount, max_amount]; otherwise an
            "error" result with a 0.5 confidence penalty.
        """
        amount = data.get("amount")
        if amount is None:
            return ValidationResult(
                is_valid=True,
                message="No amount to validate"
            )

        # Type safety: a non-numeric amount must not crash the pipeline.
        try:
            value = float(amount)
        except (TypeError, ValueError):
            return ValidationResult(
                is_valid=True,
                message="Non-numeric amount, skipping range check"
            )

        # NaN compares False against both bounds and would otherwise be
        # reported as "within valid range".
        if value != value:
            return ValidationResult(
                is_valid=False,
                confidence_penalty=0.5,
                message="Amount is not a number (NaN)",
                severity="error"
            )

        if value < self.min_amount:
            return ValidationResult(
                is_valid=False,
                confidence_penalty=0.5,
                message=f"Amount {value:.2f} RON below minimum {self.min_amount:.2f} RON",
                severity="error"
            )
        if value > self.max_amount:
            return ValidationResult(
                is_valid=False,
                confidence_penalty=0.5,
                message=f"Amount {value:.2f} RON exceeds maximum {self.max_amount:.2f} RON (likely OCR error)",
                severity="error"
            )
        return ValidationResult(
            is_valid=True,
            message=f"Amount {value:.2f} RON within valid range"
        )
class TVARatioRule(ValidationRule):
    """Validate TVA is reasonable percentage of TOTAL amount.

    Romanian TVA rates: 5%, 9%, 19%, 21% (most common: 19-21%)
    This catches errors where TVA > TOTAL (impossible).

    Values may be anything ``float()`` accepts (int, float, Decimal,
    numeric string); previously Decimal values were silently skipped.

    NOTE(review): the ratio computed here is tva / amount. If ``amount`` is
    the gross total, a 5% rate yields 0.05 / 1.05 ≈ 4.8%, which is below
    the default min_ratio of 0.05 — confirm the intended lower bound.

    Example:
        rule = TVARatioRule(min_ratio=0.05, max_ratio=0.24)
        result = rule.validate({"amount": 85.99, "tva": 149.21})
        # result.is_valid = False (149.21 > 85.99!)
    """

    def __init__(self, min_ratio: float = 0.05, max_ratio: float = 0.24):
        self.min_ratio = min_ratio
        self.max_ratio = max_ratio

    @property
    def rule_name(self) -> str:
        return "TVA Ratio Check"

    def validate(self, data: dict[str, Any]) -> ValidationResult:
        """Check that ``data["tva"] / data["amount"]`` lies in the allowed range.

        Returns:
            A valid result when either value is missing/zero, non-numeric,
            or the amount is not positive; a "warning" result with a 0.3
            penalty when the ratio is outside [min_ratio, max_ratio].
        """
        amount = data.get("amount")
        tva = data.get("tva")
        if not amount or not tva:
            return ValidationResult(
                is_valid=True,
                message="Insufficient data for TVA correlation"
            )

        # Type safety: convert once so Decimal/float mixes cannot raise.
        try:
            amount_value = float(amount)
            tva_value = float(tva)
        except (TypeError, ValueError):
            return ValidationResult(
                is_valid=True,
                message="Non-numeric values, skipping TVA correlation"
            )

        # Avoid division by zero
        if amount_value <= 0:
            return ValidationResult(
                is_valid=True,
                message="Amount is zero or negative, skipping TVA ratio"
            )

        tva_ratio = tva_value / amount_value
        # Written as a positive range test so a NaN ratio is rejected too.
        if not (self.min_ratio <= tva_ratio <= self.max_ratio):
            return ValidationResult(
                is_valid=False,
                confidence_penalty=0.3,
                message=f"TVA ratio {tva_ratio:.1%} outside valid range ({self.min_ratio:.0%}-{self.max_ratio:.0%})",
                severity="warning"
            )
        return ValidationResult(
            is_valid=True,
            message=f"TVA ratio {tva_ratio:.1%} valid"
        )
class PaymentSumRule(ValidationRule):
    """Validate CARD + NUMERAR = TOTAL BON (within tolerance).

    This is a CRITICAL validation that catches cases where OCR extracts
    wrong TOTAL but correct payment methods.

    Example:
        rule = PaymentSumRule(tolerance=0.02)
        result = rule.validate({
            "amount": 859762.16,  # Wrong from OCR
            "card_amount": 85.99,  # Correct
            "cash_amount": 0.0
        })
        # result.is_valid = False, suggests auto-correction
    """

    # Slack added to the tolerance so a difference that is exactly the
    # tolerance in decimal terms (e.g. 1.02 vs 1.00 with tolerance 0.02) is
    # not flagged because of binary floating-point rounding.
    _FLOAT_EPSILON = 1e-9

    def __init__(self, tolerance: float = 0.02):
        self.tolerance = tolerance

    @property
    def rule_name(self) -> str:
        return "Payment Sum Check"

    def validate(self, data: dict[str, Any]) -> ValidationResult:
        """Compare card + cash payments against the receipt total.

        Reads "amount", "card_amount" and "cash_amount" from ``data``;
        missing or None payment amounts count as 0.

        Returns:
            A valid result when the total is missing/zero, the values are
            non-numeric, no payments were extracted, or the sums agree
            within the tolerance; otherwise an "error" result with a 0.4
            penalty.
        """
        total = data.get("amount")
        if not total:
            return ValidationResult(
                is_valid=True,
                message="No total amount to validate"
            )

        # Convert everything to float up front: Decimal + float (e.g. a
        # Decimal card amount plus the 0.0 default) raises TypeError.
        try:
            total_value = float(total)
            card = float(data.get("card_amount") or 0.0)
            cash = float(data.get("cash_amount") or 0.0)
        except (TypeError, ValueError):
            return ValidationResult(
                is_valid=True,
                message="Non-numeric values, skipping payment sum check"
            )

        payment_sum = card + cash
        if payment_sum == 0:
            return ValidationResult(
                is_valid=True,
                message="No payment methods extracted"
            )

        diff = abs(total_value - payment_sum)
        if diff > self.tolerance + self._FLOAT_EPSILON:
            return ValidationResult(
                is_valid=False,
                confidence_penalty=0.4,
                message=f"Payment sum {payment_sum:.2f} RON ≠ Total {total_value:.2f} RON (diff: {diff:.2f} RON). Consider auto-correction.",
                severity="error"
            )
        return ValidationResult(
            is_valid=True,
            message=f"Payment sum matches total (diff: {diff:.2f} RON)"
        )
class TVAEntriesSumRule(ValidationRule):
    """Validate Σ(TVA entries) = TVA TOTAL (within tolerance).

    TVA breakdown (A, B, C, D rates) should sum to total TVA.

    Example:
        rule = TVAEntriesSumRule(tolerance=0.02)
        result = rule.validate({
            "tva": 14.92,
            "tva_entries": {"A": 14.92, "B": 0.0}
        })
        # result.is_valid = True
    """

    # Slack added to the tolerance so a difference that is exactly the
    # tolerance in decimal terms is not flagged due to float rounding.
    _FLOAT_EPSILON = 1e-9

    def __init__(self, tolerance: float = 0.02):
        self.tolerance = tolerance

    @property
    def rule_name(self) -> str:
        return "TVA Entries Sum Check"

    def validate(self, data: dict[str, Any]) -> ValidationResult:
        """Compare the sum of ``data["tva_entries"]`` values with ``data["tva"]``.

        Entries whose value is None are ignored.

        Returns:
            A valid result when the total or the entries are missing, the
            data is not numeric, the entries sum to zero, or the sums agree
            within the tolerance; otherwise a "warning" result with a 0.2
            penalty.
        """
        tva_total = data.get("tva")
        tva_entries = data.get("tva_entries", {})
        if not tva_total:
            return ValidationResult(
                is_valid=True,
                message="No TVA total to validate"
            )
        if not tva_entries:
            return ValidationResult(
                is_valid=True,
                message="No TVA entries extracted"
            )

        # Convert to float once so Decimal/float mixes, None entries or an
        # unexpected container shape cannot crash the validation pass.
        try:
            total_value = float(tva_total)
            entries_sum = sum(
                float(value) for value in tva_entries.values() if value is not None
            )
        except (AttributeError, TypeError, ValueError):
            return ValidationResult(
                is_valid=True,
                message="Non-numeric TVA data, skipping entries sum check"
            )

        if entries_sum == 0:
            return ValidationResult(
                is_valid=True,
                message="TVA entries sum is zero"
            )

        diff = abs(total_value - entries_sum)
        if diff > self.tolerance + self._FLOAT_EPSILON:
            return ValidationResult(
                is_valid=False,
                confidence_penalty=0.2,
                message=f"TVA entries sum {entries_sum:.2f} RON ≠ TVA total {total_value:.2f} RON (diff: {diff:.2f} RON)",
                severity="warning"
            )
        return ValidationResult(
            is_valid=True,
            message=f"TVA entries sum matches total (diff: {diff:.2f} RON)"
        )
class CUIFormatRule(ValidationRule):
    """Validate CUI format: RO + 6-10 digits.

    Romanian CUI (Cod Unic de Identificare) format as enforced here:
    - Optional "RO" prefix (or "R0" from OCR errors)
    - 6-10 ASCII digits

    Example:
        rule = CUIFormatRule()
        result = rule.validate({"cui": "RO10562600"})
        # result.is_valid = True
    """

    @property
    def rule_name(self) -> str:
        return "CUI Format Check"

    def validate(self, data: dict[str, Any]) -> ValidationResult:
        """Check the shape of ``data["cui"]``.

        Returns:
            A valid result when no CUI is present or the format is fine;
            otherwise a "warning" result with a 0.3 penalty.
        """
        cui = data.get("cui")
        if not cui:
            return ValidationResult(
                is_valid=True,
                message="No CUI to validate"
            )

        # str() so a CUI that arrives as an int does not raise on .strip().
        # Normalize: remove RO/R0 prefix
        cui_clean = str(cui).strip().upper()
        if cui_clean.startswith(("RO", "R0")):
            cui_clean = cui_clean[2:]

        # Check if numeric. isdigit() alone also accepts characters such as
        # superscript "²" or full-width digits, so require ASCII as well.
        if not (cui_clean.isascii() and cui_clean.isdigit()):
            return ValidationResult(
                is_valid=False,
                confidence_penalty=0.3,
                message=f"CUI '{cui}' contains non-numeric characters",
                severity="warning"
            )

        # Check length
        if len(cui_clean) < 6 or len(cui_clean) > 10:
            return ValidationResult(
                is_valid=False,
                confidence_penalty=0.3,
                message=f"CUI '{cui}' length {len(cui_clean)} outside valid range (6-10 digits)",
                severity="warning"
            )
        return ValidationResult(
            is_valid=True,
            message=f"CUI '{cui}' format valid"
        )
class CUIChecksumRule(ValidationRule):
    """Validate Romanian CIF/CUI using Mod 11 checksum algorithm.

    Algorithm:
    1. Remove RO prefix if present
    2. Extract last digit as declared checksum
    3. Apply the control key [7,5,3,2,1,7,5,3,2] to the remaining digits,
       aligned to the RIGHT (equivalent to left-padding the number with
       zeros to 9 digits)
    4. Calculate: (sum * 10) mod 11
    5. If result = 10, expected checksum = 0
    6. Else, expected checksum = result
    7. Compare with declared checksum

    Example:
        rule = CUIChecksumRule()
        result = rule.validate({"cui": "RO10562600"})
        # result.is_valid = True (checksum correct)
        result = rule.validate({"cui": "R01879855"})
        # result.is_valid = True (R0 is an OCR'd RO; 1879855 checks out)
        result = rule.validate({"cui": "RO10562601"})
        # result.is_valid = False (checksum mismatch)
    """

    # Control key for the digits preceding the check digit.
    _CONTROL_KEY = (7, 5, 3, 2, 1, 7, 5, 3, 2)

    @property
    def rule_name(self) -> str:
        return "CUI Checksum Check (Mod 11)"

    def validate(self, data: dict[str, Any]) -> ValidationResult:
        """Verify the check digit of ``data["cui"]``.

        Returns:
            A valid result when no CUI is present, when the format is
            wrong (that is CUIFormatRule's job), or when the check digit
            matches; otherwise a "warning" result with a 0.3 penalty.
        """
        cui = data.get("cui")
        if not cui:
            return ValidationResult(
                is_valid=True,
                message="No CUI to validate"
            )

        # Normalize: remove RO/R0 prefix
        cui_clean = str(cui).strip().upper()
        if cui_clean.startswith(("RO", "R0")):
            cui_clean = cui_clean[2:]

        # Check format first. ASCII is required because isdigit() accepts
        # characters (e.g. "²") that int() below cannot convert.
        if not (cui_clean.isascii() and cui_clean.isdigit()):
            return ValidationResult(
                is_valid=True,  # Don't fail checksum if format invalid (handled by CUIFormatRule)
                message="CUI format invalid, skipping checksum"
            )
        if len(cui_clean) < 6 or len(cui_clean) > 10:
            return ValidationResult(
                is_valid=True,
                message="CUI length invalid, skipping checksum"
            )

        # Extract digits
        base_digits = [int(ch) for ch in cui_clean[:-1]]
        checksum_declared = int(cui_clean[-1])

        # Right-align the key with the number: a short CUI uses the LAST
        # len(base_digits) weights. Taking the first ones mis-weights every
        # CUI shorter than 10 digits and reports valid numbers as mismatches.
        weights = self._CONTROL_KEY[-len(base_digits):]

        # Calculate weighted sum
        weighted_sum = sum(d * w for d, w in zip(base_digits, weights))

        # Calculate expected checksum (a remainder of 10 maps to 0)
        checksum_calculated = (weighted_sum * 10) % 11 % 10

        if checksum_calculated != checksum_declared:
            return ValidationResult(
                is_valid=False,
                confidence_penalty=0.3,
                message=f"CUI '{cui}' checksum mismatch: expected {checksum_calculated}, got {checksum_declared}",
                severity="warning"
            )
        return ValidationResult(
            is_valid=True,
            message=f"CUI '{cui}' checksum valid"
        )
class InterOCRConsistencyRule(ValidationRule):
    """Validate consistency between multiple OCR results.

    If Light OCR and Medium OCR produce values that differ by >10x,
    one is clearly wrong (likely digit concatenation error).

    The rule reads the keys "light_value", "medium_value" and the optional
    "field_name" (used only in messages, default "value").

    Example:
        rule = InterOCRConsistencyRule(max_ratio=10.0)
        result = rule.validate({
            "light_value": 85.99,
            "medium_value": 859762.16,
            "field_name": "amount"
        })
        # result.is_valid = False (ratio = 10,000x!)
    """

    def __init__(self, max_ratio: float = 10.0):
        self.max_ratio = max_ratio

    @property
    def rule_name(self) -> str:
        return "Inter-OCR Consistency Check"

    def validate(self, data: dict[str, Any]) -> ValidationResult:
        """Compare the magnitudes of two OCR readings of the same field.

        Returns:
            A valid result when either reading is missing, non-numeric or
            zero, or when the larger magnitude is at most ``max_ratio``
            times the smaller; otherwise a "warning" result with a 0.2
            penalty.
        """
        light_value = data.get("light_value")
        medium_value = data.get("medium_value")
        field_name = data.get("field_name", "value")

        if not light_value or not medium_value:
            return ValidationResult(
                is_valid=True,
                message="Insufficient OCR results for consistency check"
            )

        # Compare magnitudes as floats: mixed Decimal/float operands raise on
        # division, and opposite signs would give a negative ratio that can
        # never exceed max_ratio.
        try:
            light_mag = abs(float(light_value))
            medium_mag = abs(float(medium_value))
        except (TypeError, ValueError):
            return ValidationResult(
                is_valid=True,
                message="Non-numeric OCR values, skipping consistency check"
            )

        # Avoid division by zero
        if light_mag == 0 or medium_mag == 0:
            return ValidationResult(
                is_valid=True,
                message="One value is zero, skipping consistency check"
            )

        ratio = max(light_mag, medium_mag) / min(light_mag, medium_mag)
        # Positive form of the test so a NaN ratio is treated as inconsistent.
        if not (ratio <= self.max_ratio):
            return ValidationResult(
                is_valid=False,
                confidence_penalty=0.2,
                message=f"{field_name}: OCR results differ by {ratio:.1f}x (Light: {light_value}, Medium: {medium_value})",
                severity="warning"
            )
        return ValidationResult(
            is_valid=True,
            message=f"{field_name}: OCR results consistent (ratio: {ratio:.2f}x)"
        )
# ============================================================================
# VALIDATION ENGINE
# ============================================================================
@dataclass
class EnhancedExtractionResult:
    """Extraction data bundled with the findings of a validation pass."""

    # The extraction dictionary that was validated (stored as given).
    data: dict[str, Any]

    # Outcome of the validation rules.
    needs_manual_review: bool = False
    validation_warnings: list[str] = field(default_factory=list)
    validation_errors: list[str] = field(default_factory=list)
    # Field name -> confidence penalty.
    confidence_adjustments: dict[str, float] = field(default_factory=dict)

    # Field name -> ratio between the compared OCR readings.
    inter_ocr_ratios: dict[str, float] = field(default_factory=dict)
class OCRValidationEngine:
    """Orchestrate all validation rules for OCR extraction results.

    This engine applies validation rules in order:
    1. Sanity checks (amount range, format checks)
    2. Cross-field correlation (TVA ratio, payment sum)
    3. Inter-OCR consistency checks

    Example:
        engine = OCRValidationEngine()
        result = engine.validate_extraction(
            extraction_result=merged_data,
            light_result=light_ocr_data,
            medium_result=medium_ocr_data
        )
    """

    # Rule name -> extraction fields whose confidence the rule penalises.
    _SANITY_RULE_FIELDS = {
        "Amount Range Check": ["amount"],
        "CUI Format Check": ["cui"],
        "CUI Checksum Check (Mod 11)": ["cui"],
    }
    _CROSS_FIELD_RULE_FIELDS = {
        "TVA Ratio Check": ["tva"],
        "Payment Sum Check": ["amount"],
        "TVA Entries Sum Check": ["tva"],
    }
    # Message prefixes of warnings that alone justify a manual review.
    _HIGH_SEVERITY_MARKERS = ("[Amount Range", "[Payment Sum", "[Inter-OCR]")

    def __init__(self):
        """Initialize validation engine with default rules."""
        # Sanity check rules (absolute value validation)
        self.sanity_rules = [
            AmountRangeRule(min_amount=0.01, max_amount=100_000.0),
            CUIFormatRule(),
            CUIChecksumRule(),
        ]
        # Cross-field validation rules (correlation between fields)
        self.cross_field_rules = [
            TVARatioRule(min_ratio=0.05, max_ratio=0.24),
            PaymentSumRule(tolerance=0.02),
            TVAEntriesSumRule(tolerance=0.02),
        ]
        # Inter-OCR consistency rules
        self.inter_ocr_rules = [
            InterOCRConsistencyRule(max_ratio=10.0),
        ]

    def validate_extraction(
        self,
        extraction_result: dict[str, Any],
        light_result: Optional[dict[str, Any]] = None,
        medium_result: Optional[dict[str, Any]] = None
    ) -> "EnhancedExtractionResult":
        """Run all validation rules and return enhanced result.

        Progress and findings are printed to stdout as the rules run.

        Args:
            extraction_result: Merged OCR extraction data (required)
            light_result: Light OCR preprocessing results (optional)
            medium_result: Medium OCR preprocessing results (optional)

        Returns:
            EnhancedExtractionResult with validation warnings and metadata
        """
        warning_msgs: list[str] = []
        error_msgs: list[str] = []
        confidence_adjustments: dict[str, float] = {}
        inter_ocr_ratios: dict[str, float] = {}

        # Step 1: Sanity checks
        print("\n[Validation] Step 1: Sanity checks...", flush=True)
        self._run_rules(
            self.sanity_rules, extraction_result,
            self._SANITY_RULE_FIELDS, ["amount", "tva", "cui"],
            warning_msgs, error_msgs, confidence_adjustments,
        )

        # Step 2: Cross-field validation
        print("\n[Validation] Step 2: Cross-field validation...", flush=True)
        self._run_rules(
            self.cross_field_rules, extraction_result,
            self._CROSS_FIELD_RULE_FIELDS, ["amount", "tva"],
            warning_msgs, error_msgs, confidence_adjustments,
        )

        # Step 3: Inter-OCR consistency checks
        if light_result and medium_result:
            print("\n[Validation] Step 3: Inter-OCR consistency...", flush=True)
            # Check amount consistency
            if "amount" in light_result and "amount" in medium_result:
                self._check_amount_consistency(
                    light_result["amount"], medium_result["amount"],
                    warning_msgs, inter_ocr_ratios,
                )

        # Determine if manual review is needed
        # Only flag for review if there are errors OR high-severity warnings
        high_severity_warnings = [
            w for w in warning_msgs
            if any(marker in w for marker in self._HIGH_SEVERITY_MARKERS)
        ]
        needs_manual_review = (
            bool(error_msgs)
            or bool(high_severity_warnings)
            or any(ratio > 10.0 for ratio in inter_ocr_ratios.values())
        )

        print("\n[Validation] Summary:", flush=True)
        print(f" Errors: {len(error_msgs)}", flush=True)
        print(f" Warnings: {len(warning_msgs)}", flush=True)
        print(f" Manual review needed: {needs_manual_review}", flush=True)

        return EnhancedExtractionResult(
            data=extraction_result,
            needs_manual_review=needs_manual_review,
            validation_warnings=warning_msgs,
            validation_errors=error_msgs,
            confidence_adjustments=confidence_adjustments,
            inter_ocr_ratios=inter_ocr_ratios
        )

    def _run_rules(
        self,
        rules: list,
        extraction_result: dict[str, Any],
        rule_fields: dict[str, list[str]],
        fallback_fields: list[str],
        warning_msgs: list[str],
        error_msgs: list[str],
        confidence_adjustments: dict[str, float],
    ) -> None:
        """Run ``rules`` on the extraction and collect messages and penalties."""
        for rule in rules:
            result = rule.validate(extraction_result)
            if result.is_valid:
                print(f"{rule.rule_name}: {result.message}", flush=True)
                continue

            msg = f"[{rule.rule_name}] {result.message}"
            if result.severity == "error":
                error_msgs.append(msg)
            else:
                warning_msgs.append(msg)
            print(msg, flush=True)

            # Track confidence penalty for the relevant field based on rule
            if result.confidence_penalty > 0:
                fields = rule_fields.get(rule.rule_name, fallback_fields)
                self._record_penalty(
                    confidence_adjustments, extraction_result,
                    fields, result.confidence_penalty,
                )

    @staticmethod
    def _record_penalty(
        confidence_adjustments: dict[str, float],
        extraction_result: dict[str, Any],
        fields: list[str],
        penalty: float,
    ) -> None:
        """Store ``penalty`` for each listed field present in the extraction.

        The largest penalty seen for a field is kept, so a later, milder
        rule cannot overwrite a harsher finding on the same field.
        """
        for name in fields:
            if name in extraction_result:
                confidence_adjustments[name] = max(
                    confidence_adjustments.get(name, 0.0), penalty
                )

    def _check_amount_consistency(
        self,
        light_amount: Any,
        medium_amount: Any,
        warning_msgs: list[str],
        inter_ocr_ratios: dict[str, float],
    ) -> None:
        """Apply the inter-OCR rules to the two amount readings."""
        consistency_data = {
            "light_value": light_amount,
            "medium_value": medium_amount,
            "field_name": "amount"
        }
        for rule in self.inter_ocr_rules:
            result = rule.validate(consistency_data)
            if result.is_valid:
                print(f"{result.message}", flush=True)
                continue

            msg = f"[Inter-OCR] {result.message}"
            warning_msgs.append(msg)
            print(msg, flush=True)

            # Store ratio for metadata. Computed defensively so an unusual
            # value accepted by the rule cannot crash the engine here.
            try:
                light_mag = abs(float(light_amount))
                medium_mag = abs(float(medium_amount))
                inter_ocr_ratios["amount"] = (
                    max(light_mag, medium_mag) / min(light_mag, medium_mag)
                )
            except (TypeError, ValueError, ZeroDivisionError):
                pass  # ratio is metadata only; the warning is already recorded

    @staticmethod
    def normalize_cui(cui: Optional[str]) -> Optional[str]:
        """Normalize CUI to RO prefix + digits format.

        Examples:
            10562600 → RO10562600
            R010562600 → RO10562600 (fix R0 OCR error)
            RO10562600 → RO10562600 (unchanged)

        Args:
            cui: Raw CUI string from OCR

        Returns:
            Normalized CUI with RO prefix, or None if invalid
        """
        if not cui:
            return None

        cui = cui.strip().upper()

        # Remove existing prefix if present
        if cui.startswith(("RO", "R0")):
            cui = cui[2:]

        # Remove any non-digit characters. Only ASCII digits are kept:
        # str.isdigit() alone would also keep characters such as "²".
        cui_digits = ''.join(c for c in cui if c.isascii() and c.isdigit())

        # Validate length
        if len(cui_digits) < 6 or len(cui_digits) > 10:
            print(f"[CUI Normalize] Invalid length: {len(cui_digits)} digits (expected 6-10)", flush=True)
            return None

        # Add RO prefix
        return f"RO{cui_digits}"

View File

@@ -38,6 +38,13 @@ class ExtractionResult:
ocr_engine: str = "" # OCR engine used: paddleocr or tesseract
processing_time_ms: int = 0 # Processing time in milliseconds
# Validation tracking (added by bon-ocr-validation feature)
needs_manual_review: Optional[bool] = None # None=not validated, False=ok, True=needs review
validation_warnings: List[str] = field(default_factory=list)
validation_errors: List[str] = field(default_factory=list)
confidence_adjustments: dict[str, float] = field(default_factory=dict) # Field -> penalty
inter_ocr_ratios: dict[str, float] = field(default_factory=dict) # Field -> ratio
@property
def overall_confidence(self) -> float:
"""Calculate weighted overall confidence score."""
@@ -238,10 +245,18 @@ class ReceiptExtractor:
# Client/Buyer patterns (for B2B receipts)
# CLIENT, CUMPARATOR, BENEFICIAR sections
# Variations: "CIF CLIENT:", "CLIENT C.U.I/C.I.F.", "CLIENT C. U. I./ C. I.F."
CLIENT_SECTION_MARKERS = [
r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:', # CIF CLIENT: (reversed format)
r'C\.?\s*U\.?\s*I\.?\s+CLIENT\s*:', # CUI CLIENT: (reversed format)
# Reversed format: CIF/CUI before CLIENT
r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:', # CIF CLIENT:
r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:', # CUI CLIENT:
# CLIENT followed by C.U.I./C.I.F. (all variations with/without spaces and dots)
# Handles: CLIENT C.U.I/C.I.F., CLIENT C. U. I./ C. I.F., CLIENT CUI/CIF
r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/?\s*C?\.?\s*[I1]?\.?\s*F?\.?\s*:',
r'CLIENT\s+C\.?\s*[UI1]\.?\s*[IF1]\.?\s*:', # CLIENT CUI: or CLIENT CIF:
r'CLIENT\s*:',
# CUMPARATOR variants
r'CUMPARATOR\s+C\.?\s*[UI1]\.?\s*[IF1]\.?\s*:', # CUMPARATOR CUI: or CIF:
r'CUMPARATOR\s*:',
r'BENEFICIAR\s*:',
r'CUMP[AĂ]R[AĂ]TOR\s*:',
@@ -250,25 +265,30 @@ class ReceiptExtractor:
]
# Client CUI patterns (explicitly after CLIENT marker)
# OCR errors: R0 instead of RO, C1F instead of CIF, 1 instead of I
CLIENT_CUI_PATTERNS = [
# CIF CLIENT: R01879856 (reversed format - CIF before CLIENT)
(r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
(r'C\.?\s*U\.?\s*I\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
(r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
(r'C\.?\s*U\.?\s*I\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
# CLIENT C.U.I./ C.I.F. :R01879855 (slash variant with both labels)
(r'CLIENT\s+C\.\s*U\.\s*I\.?\s*/\s*C\.\s*[I1]\.\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.97),
(r'CLIENT\s+C\.?\s*U\.?\s*I\.?(?:\s*/\s*C\.?\s*[I1]\.?\s*F\.?)?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
# CLIENT C.U.I. or CLIENT CUI or CLIENT CIF
(r'CLIENT\s+C\.?\s*U\.?\s*I\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
(r'CLIENT\s+C\.?\s*I\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
(r'CUMPARATOR\s+C\.?\s*U\.?\s*I\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
(r'CUMPARATOR\s+C\.?\s*I\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
# CIF CLIENT: R01879856 (reversed format - CIF/CUI before CLIENT)
(r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
(r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
(r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
(r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
# CLIENT C.U.I/C.I.F. or CLIENT C. U. I./ C. I.F. (slash variant - all spacing)
# Most flexible pattern for slash variants
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.97),
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*/\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.97),
# CLIENT C.U.I. or CLIENT CUI or CLIENT CIF (without slash)
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
(r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(R[O0]?\d{6,10})', 0.96),
(r'CLIENT\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.96),
# CUMPARATOR variants
(r'CUMPARATOR\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
(r'CUMPARATOR\s+C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
# CUI/CIF on line immediately after CLIENT marker
(r'CLIENT\s*:\s*\n\s*C\.?\s*U\.?\s*I\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
(r'CLIENT\s*:\s*\n\s*C\.?\s*I\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
(r'CLIENT\s*:\s*\n\s*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
(r'CLIENT\s*:\s*\n\s*C\.?\s*[I1]\.?\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
# CUI after client name: "CLIENT: COMPANY SRL\nCUI: 12345678"
(r'CLIENT\s*:.*\n.*C\.?\s*U\.?\s*I\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
(r'CLIENT\s*:.*\n.*C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
]
# Vendor name indicators (lines containing these are likely vendor names)

View File

@@ -17,6 +17,7 @@ from typing import Optional, Tuple
from backend.modules.data_entry.services.ocr_engine import OCREngine
from backend.modules.data_entry.services.ocr_extractor import ReceiptExtractor, ExtractionResult
from backend.modules.data_entry.services.image_preprocessor import ImagePreprocessor
from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine
# Setup logging
logger = logging.getLogger(__name__)
@@ -126,28 +127,28 @@ class OCRService:
extraction = ExtractionResult()
# ══════════════════════════════════════════════════════════════
# STEP 2: PaddleOCR + Heavy (for faded thermal receipts)
# STEP 2: PaddleOCR + Medium (balanced preprocessing)
# ══════════════════════════════════════════════════════════════
print("=" * 60, flush=True)
print("[OCR] STEP 2: PaddleOCR + Heavy preprocessing", flush=True)
print("[OCR] STEP 2: PaddleOCR + Medium preprocessing", flush=True)
print("=" * 60, flush=True)
heavy_img = self.preprocessor.preprocess_heavy(image)
medium_img = self.preprocessor.preprocess_medium(image)
try:
paddle_heavy = self.ocr_engine._paddle_recognize(heavy_img)
if paddle_heavy and paddle_heavy.text:
extraction_heavy = self.extractor.extract(paddle_heavy.text)
extraction_heavy.ocr_engine = "paddle-heavy"
raw_texts.append(f"═══ PaddleOCR (heavy, conf: {paddle_heavy.confidence:.0%}) ═══\n{paddle_heavy.text}")
paddle_medium = self.ocr_engine._paddle_recognize(medium_img)
if paddle_medium and paddle_medium.text:
extraction_medium = self.extractor.extract(paddle_medium.text)
extraction_medium.ocr_engine = "paddle-medium"
raw_texts.append(f"═══ PaddleOCR (medium, conf: {paddle_medium.confidence:.0%}) ═══\n{paddle_medium.text}")
print(f"[OCR] Step 2 (Heavy) Results:", flush=True)
print(f" - OCR Confidence: {paddle_heavy.confidence:.0%}", flush=True)
print(f" - Amount: {extraction_heavy.amount}", flush=True)
print(f" - Date: {extraction_heavy.receipt_date}", flush=True)
print(f" - CUI: {extraction_heavy.cui}", flush=True)
print(f"[OCR] Step 2 (Medium) Results:", flush=True)
print(f" - OCR Confidence: {paddle_medium.confidence:.0%}", flush=True)
print(f" - Amount: {extraction_medium.amount}", flush=True)
print(f" - Date: {extraction_medium.receipt_date}", flush=True)
print(f" - CUI: {extraction_medium.cui}", flush=True)
# Merge with previous
extraction = self._merge_extractions(extraction, extraction_heavy)
extraction = self._merge_extractions(extraction, extraction_medium)
print(f"[OCR] After merge:", flush=True)
print(f" - Amount: {extraction.amount}", flush=True)
@@ -167,7 +168,7 @@ class OCRService:
else:
print("[OCR] → Step 2 incomplete, continuing to Step 3 (Tesseract)...", flush=True)
except Exception as e:
print(f"[OCR] PaddleOCR heavy failed: {e}", flush=True)
print(f"[OCR] PaddleOCR medium failed: {e}", flush=True)
# ══════════════════════════════════════════════════════════════
# STEP 3: Tesseract - ONLY to complete missing fields
@@ -235,6 +236,70 @@ class OCRService:
print(f" - Processing Time: {elapsed_ms}ms", flush=True)
print(f" - Message: {message}", flush=True)
# ══════════════════════════════════════════════════════════════
# VALIDATION: Apply validation rules to final extraction
# ══════════════════════════════════════════════════════════════
print("\n" + "=" * 60, flush=True)
print("[Validation] Applying validation rules...", flush=True)
print("=" * 60, flush=True)
validator = OCRValidationEngine()
# Prepare data for validation with safe type conversions
def safe_float(value) -> Optional[float]:
    """Convert a Decimal or other numeric-like value to float without raising.

    Args:
        value: The value to convert; may be None.

    Returns:
        The float value, or None when ``value`` is None or cannot be
        converted.
    """
    if value is None:
        return None
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: float() rejects integers too large for a double.
        # A conversion failure must not abort validation, so report "unknown".
        return None
def safe_payment_sum(methods: list, method_type: str) -> Optional[float]:
    """Sum the amounts of all payment entries whose 'method' equals ``method_type``.

    Args:
        methods: Payment entries; each is expected to be a mapping with
            'method' and 'amount' keys. May be None or empty.
        method_type: The payment method to total (the caller passes
            'CARD' or 'NUMERAR').

    Returns:
        The positive total as a float, or None when there are no entries,
        the total is not positive, or any entry is malformed.
    """
    if not methods:
        return None
    try:
        total = sum(
            # `or 0` turns a None/empty amount into zero before conversion.
            float(pm.get('amount', 0) or 0)
            for pm in methods
            if pm.get('method') == method_type
        )
    except (TypeError, ValueError, AttributeError, OverflowError):
        # AttributeError: an entry that is not a mapping (e.g. None) has no
        # .get(); treat the whole sum as unknown rather than crash validation
        # or report a misleading partial total.
        return None
    return total if total > 0 else None
validation_data = {
'amount': safe_float(extraction.amount),
'tva': safe_float(extraction.tva_total),
'cui': extraction.cui,
'card_amount': safe_payment_sum(extraction.payment_methods, 'CARD'),
'cash_amount': safe_payment_sum(extraction.payment_methods, 'NUMERAR'),
'tva_entries': {
entry.get('code', ''): safe_float(entry.get('amount'))
for entry in (extraction.tva_entries or [])
if entry.get('code') and safe_float(entry.get('amount')) is not None
}
}
# Run validation (no light/medium comparison for final result)
validated_result = validator.validate_extraction(validation_data)
# Apply validation results to extraction
extraction.needs_manual_review = validated_result.needs_manual_review
extraction.validation_warnings = validated_result.validation_warnings
extraction.validation_errors = validated_result.validation_errors
extraction.confidence_adjustments = validated_result.confidence_adjustments
extraction.inter_ocr_ratios = validated_result.inter_ocr_ratios
print(f"[Validation] Complete:", flush=True)
print(f" - Warnings: {len(extraction.validation_warnings)}", flush=True)
print(f" - Errors: {len(extraction.validation_errors)}", flush=True)
print(f" - Needs Manual Review: {extraction.needs_manual_review}", flush=True)
if extraction.validation_warnings:
for warning in extraction.validation_warnings:
print(f" ⚠️ {warning}", flush=True)
return True, message, extraction
def _merge_extractions(

View File

@@ -0,0 +1,520 @@
"""
Unit tests for OCR validation module.
Tests all validation rules and the validation engine orchestrator.
Coverage target: >90%
"""
import pytest
from backend.modules.data_entry.services.ocr.validation import (
AmountRangeRule,
TVARatioRule,
PaymentSumRule,
TVAEntriesSumRule,
CUIFormatRule,
CUIChecksumRule,
InterOCRConsistencyRule,
OCRValidationEngine,
ValidationResult,
EnhancedExtractionResult,
)
# ============================================================================
# AmountRangeRule Tests
# ============================================================================
class TestAmountRangeRule:
    """Checks for AmountRangeRule with bounds of 0.01 and 100,000 RON."""

    @staticmethod
    def _bounded_rule():
        """Build the rule with the explicit bounds used by most tests here."""
        return AmountRangeRule(min_amount=0.01, max_amount=100_000.0)

    def test_amount_within_range_passes(self):
        """An amount inside the bounds is accepted with no penalty."""
        outcome = self._bounded_rule().validate({"amount": 85.99})

        assert outcome.is_valid is True
        assert outcome.confidence_penalty == 0.0
        assert "within valid range" in outcome.message

    def test_amount_too_high_fails(self):
        """An amount above 100,000 is rejected as an error (typical OCR blow-up)."""
        outcome = self._bounded_rule().validate({"amount": 859_762.16})

        assert outcome.is_valid is False
        assert outcome.confidence_penalty == 0.5
        assert "exceeds maximum" in outcome.message
        assert outcome.severity == "error"

    def test_amount_too_low_fails(self):
        """An amount below 0.01 is rejected."""
        outcome = self._bounded_rule().validate({"amount": 0.00})

        assert outcome.is_valid is False
        assert outcome.confidence_penalty == 0.5
        assert "below minimum" in outcome.message

    def test_none_amount_passes(self):
        """A missing amount passes with no penalty (default-constructed rule)."""
        outcome = AmountRangeRule().validate({"amount": None})

        assert outcome.is_valid is True
        assert outcome.confidence_penalty == 0.0
# ============================================================================
# TVARatioRule Tests
# ============================================================================
class TestTVARatioRule:
    """Checks for TVARatioRule: TVA must be 5-24% of the receipt total."""

    @staticmethod
    def _bounded_rule():
        """Build the rule with the explicit 5%-24% band."""
        return TVARatioRule(min_ratio=0.05, max_ratio=0.24)

    def test_valid_tva_ratio_passes(self):
        """TVA of 14.92 on a total of 85.99 (about 17.35%) lies inside the band."""
        outcome = self._bounded_rule().validate({"amount": 85.99, "tva": 14.92})

        assert outcome.is_valid is True
        assert outcome.confidence_penalty == 0.0

    def test_tva_too_high_fails(self):
        """A 30% ratio (30 / 100) exceeds the 24% ceiling."""
        outcome = self._bounded_rule().validate({"amount": 100.0, "tva": 30.0})

        assert outcome.is_valid is False
        assert outcome.confidence_penalty == 0.3
        assert "outside valid range" in outcome.message

    def test_tva_too_low_fails(self):
        """A 2% ratio (2 / 100) is under the 5% floor."""
        outcome = self._bounded_rule().validate({"amount": 100.0, "tva": 2.0})

        assert outcome.is_valid is False
        assert outcome.confidence_penalty == 0.3

    def test_missing_data_passes(self):
        """The rule passes when either the TVA or the amount is absent."""
        rule = TVARatioRule()

        only_amount = rule.validate({"amount": 100.0})
        assert only_amount.is_valid is True

        only_tva = rule.validate({"tva": 19.0})
        assert only_tva.is_valid is True

    def test_zero_amount_skips_validation(self):
        """A zero total must not trigger a division by zero; the rule passes."""
        outcome = TVARatioRule().validate({"amount": 0.0, "tva": 19.0})

        # Presumably the rule treats a falsy amount as "missing" - confirm
        # against the TVARatioRule implementation.
        assert outcome.is_valid is True

    def test_non_numeric_values_skips_validation(self):
        """A non-numeric amount is skipped gracefully instead of raising."""
        outcome = TVARatioRule().validate({"amount": "invalid", "tva": 19.0})

        assert outcome.is_valid is True
        lowered = outcome.message.lower()
        assert "non-numeric" in lowered or "skipping" in lowered
# ============================================================================
# PaymentSumRule Tests
# ============================================================================
class TestPaymentSumRule:
    """Checks for PaymentSumRule: card + cash payments must equal the total."""

    def test_payment_sum_matches_total_passes(self):
        """Payments that add up exactly to the total pass with no penalty."""
        payload = {"amount": 85.99, "card_amount": 50.00, "cash_amount": 35.99}
        outcome = PaymentSumRule(tolerance=0.02).validate(payload)

        assert outcome.is_valid is True
        assert outcome.confidence_penalty == 0.0

    def test_payment_sum_mismatch_fails(self):
        """A 10.0 gap (50 + 40 vs 100) is far beyond the 0.02 tolerance."""
        payload = {"amount": 100.0, "card_amount": 50.0, "cash_amount": 40.0}
        outcome = PaymentSumRule(tolerance=0.02).validate(payload)

        assert outcome.is_valid is False
        assert outcome.confidence_penalty == 0.4
        assert "Payment sum" in outcome.message
        assert outcome.severity == "error"

    def test_tolerance_within_002_passes(self):
        """A 0.01 gap (50 + 35.98 vs 85.99) stays within the 0.02 tolerance."""
        payload = {"amount": 85.99, "card_amount": 50.00, "cash_amount": 35.98}
        outcome = PaymentSumRule(tolerance=0.02).validate(payload)

        assert outcome.is_valid is True

    def test_missing_payment_methods_passes(self):
        """With no payment amounts supplied the rule has nothing to compare."""
        outcome = PaymentSumRule().validate({"amount": 100.0})

        assert outcome.is_valid is True
# ============================================================================
# TVAEntriesSumRule Tests
# ============================================================================
class TestTVAEntriesSumRule:
    """Checks for TVAEntriesSumRule: per-code TVA entries must add up to the TVA total."""

    @staticmethod
    def _validate(entries):
        """Run the rule (tolerance 0.02) against a TVA total of 14.92."""
        return TVAEntriesSumRule(tolerance=0.02).validate(
            {"tva": 14.92, "tva_entries": entries}
        )

    def test_tva_entries_sum_matches(self):
        """A single entry equal to the total passes."""
        outcome = self._validate({"A": 14.92})

        assert outcome.is_valid is True

    def test_tva_entries_mismatch_fails(self):
        """Entries summing to 14.00 miss the total by 0.92 (> 0.02)."""
        outcome = self._validate({"A": 12.00, "B": 2.00})

        assert outcome.is_valid is False
        assert outcome.confidence_penalty == 0.2

    def test_tolerance_within_002_passes(self):
        """A 0.01 difference stays within the tolerance."""
        outcome = self._validate({"A": 14.91})

        assert outcome.is_valid is True
# ============================================================================
# CUIFormatRule Tests
# ============================================================================
class TestCUIFormatRule:
    """Checks for CUIFormatRule: optional RO prefix followed by 6-10 digits."""

    @staticmethod
    def _check(cui):
        """Validate one CUI string with a default-constructed rule."""
        return CUIFormatRule().validate({"cui": cui})

    def test_valid_cui_format_passes(self):
        """'RO' followed by eight digits is well-formed."""
        assert self._check("RO10562600").is_valid is True

    def test_cui_without_ro_prefix_normalized(self):
        """A bare digit string (no RO prefix) is still accepted."""
        assert self._check("10562600").is_valid is True

    def test_cui_with_r0_prefix_normalized(self):
        """An 'R0' prefix (OCR reading O as zero) is tolerated."""
        assert self._check("R010562600").is_valid is True

    def test_non_numeric_cui_fails(self):
        """Letters after the prefix make the CUI invalid."""
        outcome = self._check("ROABC12345")

        assert outcome.is_valid is False
        assert outcome.confidence_penalty == 0.3
        assert "non-numeric" in outcome.message

    def test_cui_too_short_fails(self):
        """Fewer than six digits is rejected with a length message."""
        outcome = self._check("RO12345")

        assert outcome.is_valid is False
        assert "length" in outcome.message

    def test_cui_too_long_fails(self):
        """More than ten digits is rejected."""
        assert self._check("RO12345678901").is_valid is False
# ============================================================================
# CUIChecksumRule Tests
# ============================================================================
class TestCUIChecksumRule:
    """Test Romanian CIF Mod 11 checksum validation."""

    def test_valid_cui_checksum_passes(self):
        """A CUI with a correct control digit passes, with RO or R0 prefix."""
        rule = CUIChecksumRule()

        # RO10562600: base digits 1,0,5,6,2,6,0 and declared control digit 0.
        # Multipliers: [7,5,3,2,1,7,5]
        # Sum: 1*7+0*5+5*3+6*2+2*1+6*7+0*5 = 7+0+15+12+2+42+0 = 78
        # (78 * 10) % 11 = 780 % 11 = 10, and a remainder of 10 maps to
        # control digit 0 in the CIF scheme -> matches the declared 0.
        result = rule.validate({"cui": "RO10562600"})
        assert result.is_valid is True, f"Expected valid, got: {result.message}"

        # Same number with the R0 prefix produced by an OCR misread.
        result2 = rule.validate({"cui": "R010562600"})
        assert result2.is_valid is True, f"Expected valid with R0 prefix, got: {result2.message}"

    def test_invalid_cui_checksum_fails(self):
        """A well-formed CUI with a wrong control digit must be rejected."""
        rule = CUIChecksumRule()

        # RO12345678 is well-formed (RO + 8 digits), so the checksum is
        # actually evaluated rather than skipped. With the multipliers above:
        # 1*7+2*5+3*3+4*2+5*1+6*7+7*5 = 116, (116 * 10) % 11 = 5, which does
        # not match the declared control digit 8.
        result = rule.validate({"cui": "RO12345678"})

        # The former `penalty == 0.3 or is_valid is True` check passed even
        # when the checksum was never rejected; assert the failure itself.
        assert result.is_valid is False, f"Expected checksum failure, got: {result.message}"
        assert result.confidence_penalty == 0.3

    def test_cui_format_invalid_skips_checksum(self):
        """A malformed CUI skips the checksum; format errors belong to CUIFormatRule."""
        rule = CUIChecksumRule()
        result = rule.validate({"cui": "INVALID"})

        assert result.is_valid is True  # Skips checksum if format invalid
        assert "skipping checksum" in result.message
# ============================================================================
# InterOCRConsistencyRule Tests
# ============================================================================
class TestInterOCRConsistencyRule:
    """Checks for InterOCRConsistencyRule: two OCR passes must roughly agree."""

    @staticmethod
    def _payload(light, medium):
        """Build the rule input for the 'amount' field from two pass values."""
        return {"light_value": light, "medium_value": medium, "field_name": "amount"}

    def test_values_within_10x_passes(self):
        """Near-identical values (ratio about 1.00x) pass."""
        outcome = InterOCRConsistencyRule(max_ratio=10.0).validate(
            self._payload(85.99, 86.00)
        )

        assert outcome.is_valid is True

    def test_values_over_10x_fails(self):
        """A roughly 10,000x gap between the passes is flagged as an OCR error."""
        outcome = InterOCRConsistencyRule(max_ratio=10.0).validate(
            self._payload(85.99, 859_762.16)
        )

        assert outcome.is_valid is False
        assert outcome.confidence_penalty == 0.2
        assert "10000" in outcome.message or "differ by" in outcome.message

    def test_one_value_missing_passes(self):
        """With only one pass value available there is nothing to compare."""
        rule = InterOCRConsistencyRule()

        medium_missing = rule.validate(self._payload(85.99, None))
        assert medium_missing.is_valid is True

        light_missing = rule.validate(self._payload(None, 85.99))
        assert light_missing.is_valid is True
# ============================================================================
# OCRValidationEngine Tests
# ============================================================================
class TestOCRValidationEngine:
    """Test validation engine orchestrator."""

    def test_engine_applies_all_rules(self):
        """Fully valid data yields no errors and no manual-review flag."""
        engine = OCRValidationEngine()

        result = engine.validate_extraction({
            "amount": 85.99,
            "tva": 14.92,
            "cui": "RO10562600",
            "card_amount": 85.99,
            "cash_amount": 0.0,
        })

        assert isinstance(result, EnhancedExtractionResult)
        assert result.needs_manual_review is False
        assert len(result.validation_errors) == 0

    def test_engine_aggregates_warnings(self):
        """An out-of-range amount is reported in validation_errors and flags review."""
        engine = OCRValidationEngine()

        result = engine.validate_extraction({
            "amount": 200_000.0,  # > 100,000 maximum
            # 50,000 / 200,000 = 25%, above the 24% bound exercised in
            # TestTVARatioRule, so this input is also suspect on TVA ratio.
            "tva": 50_000.0,
        })

        assert result.needs_manual_review is True
        assert len(result.validation_errors) > 0
        assert any("exceeds maximum" in w for w in result.validation_errors)

    def test_engine_sets_manual_review_flag(self):
        """A payment-sum mismatch sets needs_manual_review."""
        engine = OCRValidationEngine()

        result = engine.validate_extraction({
            "amount": 100.0,
            "card_amount": 50.0,
            "cash_amount": 40.0,  # Sum = 90, diff = 10
        })

        assert result.needs_manual_review is True

    def test_engine_calculates_confidence_penalties(self):
        """The engine records the penalty of a failed rule per field."""
        engine = OCRValidationEngine()

        result = engine.validate_extraction({
            "amount": 200_000.0,  # Invalid: above maximum
        })

        assert result.confidence_adjustments.get("amount") == 0.5

    def test_normalize_cui_helper(self):
        """normalize_cui adds/repairs the RO prefix and rejects bad lengths."""
        # Valid cases
        assert OCRValidationEngine.normalize_cui("10562600") == "RO10562600"
        assert OCRValidationEngine.normalize_cui("RO10562600") == "RO10562600"
        assert OCRValidationEngine.normalize_cui("R010562600") == "RO10562600"

        # Invalid cases
        assert OCRValidationEngine.normalize_cui(None) is None
        assert OCRValidationEngine.normalize_cui("123") is None  # Too short
        assert OCRValidationEngine.normalize_cui("12345678901") is None  # Too long

    def test_inter_ocr_consistency_with_engine(self):
        """Diverging light/medium amounts produce an Inter-OCR warning and ratio."""
        engine = OCRValidationEngine()

        result = engine.validate_extraction(
            extraction_result={"amount": 85.99},
            light_result={"amount": 85.99},
            medium_result={"amount": 859_762.16}
        )

        assert result.needs_manual_review is True
        assert len(result.validation_warnings) > 0
        assert any("Inter-OCR" in w for w in result.validation_warnings)

        # Check for presence first so a missing ratio fails as an assertion
        # instead of raising TypeError on `None > 10.0`.
        ratio = result.inter_ocr_ratios.get("amount")
        assert ratio is not None, "No inter-OCR ratio recorded for 'amount'"
        assert ratio > 10.0
# ============================================================================
# Integration Tests (Validation + Data Flow)
# ============================================================================
class TestValidationIntegration:
    """Test validation with realistic data scenarios."""

    def test_five_holding_production_case(self):
        """Replay the Five-Holding receipt values from the production bug."""
        engine = OCRValidationEngine()

        # Correct first-pass (light) result.
        light_data = {"amount": 85.99, "tva": 14.92}

        # Corrupted second-pass result with the ~10,000x magnitude error;
        # it is supplied to the engine as `medium_result`.
        medium_data = {"amount": 859_762.16, "tva": 149_214.92}

        # Merged result, carrying the correct light values.
        merged = {"amount": 85.99, "tva": 14.92, "card_amount": 85.99}

        result = engine.validate_extraction(
            extraction_result=merged,
            light_result=light_data,
            medium_result=medium_data
        )

        # The merged values are fine, but the disagreement between the two
        # passes must still flag the receipt for review.
        assert result.needs_manual_review is True

        # Check for presence first so a missing ratio fails as an assertion
        # instead of raising TypeError on `None > 10.0`.
        ratio = result.inter_ocr_ratios.get("amount")
        assert ratio is not None, "No inter-OCR ratio recorded for 'amount'"
        assert ratio > 10.0

    def test_clean_receipt_no_warnings(self):
        """A consistent receipt produces no warnings, errors, or review flag."""
        engine = OCRValidationEngine()

        result = engine.validate_extraction({
            "amount": 85.99,
            "tva": 14.92,
            "cui": "RO10562600",
            "card_amount": 85.99,
            "cash_amount": 0.0,
            "tva_entries": {"A": 14.92}
        })

        assert result.needs_manual_review is False
        assert len(result.validation_warnings) == 0
        assert len(result.validation_errors) == 0
if __name__ == "__main__":
    # Running the module directly hands this file to pytest with verbose
    # output and short tracebacks.
    cli_args = [__file__, "-v", "--tb=short"]
    pytest.main(cli_args)

View File

@@ -0,0 +1,180 @@
"""
Integration tests for OCR validation system.
These tests verify the end-to-end validation flow with real OCR processing.
IMPORTANT: These tests require:
1. PaddleOCR models downloaded
2. Tesseract installed
3. Test receipt files in docs/data-entry/
Run with: pytest backend/modules/data_entry/tests/test_ocr_validation_integration.py -v
"""
import pytest
from pathlib import Path
from decimal import Decimal
# Module-level pytest mark: applies the `integration` marker to every test in
# this file (these tests are slower and require OCR models).
pytestmark = pytest.mark.integration
@pytest.fixture
def five_holding_receipt_path():
    """Provide the relative path of the Five-Holding production receipt PDF (the 85.99 LEI case)."""
    receipt_pdf = Path("docs/data-entry/igiena 14 decembrie five-holding.pdf")
    return receipt_pdf
class TestProductionCaseFiveHolding:
    """Placeholders for the Five-Holding receipt case (85.99, not 859,762.16)."""

    # Shared reason: every test here needs a live OCR service.
    _SKIP_REASON = "Requires running OCR service - manual test"

    def test_correct_amount_extracted(self, five_holding_receipt_path):
        """The receipt should extract 85.99 LEI rather than 859,762.16."""
        # TODO: Implement when OCR service is running. Intended shape:
        #   from backend.modules.data_entry.services.ocr_service import OCRService
        #   service = OCRService()
        #   success, message, extraction = service.process_receipt(five_holding_receipt_path)
        #
        #   assert success is True
        #   assert extraction.amount == Decimal('85.99'), f"Expected 85.99, got {extraction.amount}"
        #   assert extraction.tva_total == Decimal('14.92'), f"Expected 14.92, got {extraction.tva_total}"
        pytest.skip(self._SKIP_REASON)

    def test_no_magnitude_errors(self, five_holding_receipt_path):
        """The extracted amount should show no 10,000x magnitude error."""
        # TODO: Verify extraction.amount < 1000 (not 859,762.16)
        pytest.skip(self._SKIP_REASON)

    def test_validation_warnings_if_any(self, five_holding_receipt_path):
        """Inspect the validation warnings raised for the Five-Holding receipt."""
        # TODO: extraction.validation_warnings should be empty or minimal
        pytest.skip(self._SKIP_REASON)
class TestValidationIntegration:
    """Test validation integration with OCR pipeline."""

    def test_payment_sum_validation_mock(self):
        """A payment-sum mismatch flags the receipt and reports the problem."""
        # Runs without OCR: only the validation logic is exercised.
        from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine

        validator = OCRValidationEngine()

        # Case: Payment sum mismatch
        data = {
            'amount': 100.0,
            'card_amount': 50.0,
            'cash_amount': 40.0,  # Sum = 90, diff = 10
        }

        result = validator.validate_extraction(data)

        assert result.needs_manual_review is True

        # The unit tests show PaymentSumRule reports severity "error" and that
        # error-severity messages are collected in validation_errors, so look
        # in both lists rather than in validation_warnings alone.
        findings = list(result.validation_warnings) + list(result.validation_errors)
        assert len(findings) > 0
        assert any('Payment sum' in msg for msg in findings)

    def test_tva_ratio_validation_mock(self):
        """A TVA ratio above 24% flags the receipt and reports the problem."""
        from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine

        validator = OCRValidationEngine()

        # Case: TVA too high (> 24%)
        data = {
            'amount': 100.0,
            'tva': 30.0,  # 30% - invalid!
        }

        result = validator.validate_extraction(data)

        assert result.needs_manual_review is True

        # The rule's severity is not visible from this module; accept the
        # message from either list.
        findings = list(result.validation_warnings) + list(result.validation_errors)
        assert any('TVA ratio' in msg for msg in findings)

    def test_amount_range_validation_mock(self):
        """An amount above 100,000 is reported as a validation error."""
        from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine

        validator = OCRValidationEngine()

        # Case: Amount too high (> 100,000)
        data = {
            'amount': 859_762.16,  # Production error case!
        }

        result = validator.validate_extraction(data)

        assert result.needs_manual_review is True
        assert len(result.validation_errors) > 0
        assert any('exceeds maximum' in e for e in result.validation_errors)

    def test_medium_ocr_preprocessing(self):
        """Placeholder: Medium preprocessing should produce a usable image."""
        # TODO:
        #   from backend.modules.data_entry.services.image_preprocessor import ImagePreprocessor
        #   preprocessor = ImagePreprocessor()
        #   # Load test image
        #   # Apply preprocess_medium()
        #   # Verify output shape and values
        pytest.skip("Requires OCR models - manual test")
class TestDatabaseIntegration:
    """Placeholders for database checks on the needs_manual_review field."""

    # Shared reason: both tests need a live database.
    _SKIP_REASON = "Requires database connection"

    def test_receipt_model_has_validation_field(self):
        """The Receipt model should expose a needs_manual_review field."""
        # TODO: Check Receipt model
        pytest.skip(self._SKIP_REASON)

    def test_migration_adds_column(self):
        """The migration should add the needs_manual_review column."""
        # TODO: Run migration and check column exists
        pytest.skip(self._SKIP_REASON)
# =============================================================================
# MANUAL TESTING CHECKLIST
# =============================================================================
"""
MANUAL TESTS TO PERFORM:
1. Five-Holding Receipt Test (Production Case)
□ Upload: docs/data-entry/igiena 14 decembrie five-holding.pdf
□ Verify TOTAL: 85.99 LEI (not 859,762.16)
□ Verify TVA: 14.92 LEI (not 149,214.92)
□ Verify CUI: RO10562600 (OCR may read the prefix as "R0"; it must be normalized to "RO")
□ Verify no validation warnings (or only minor ones)
2. Database Migration Test
□ Run: alembic upgrade head
□ Check: receipts table has needs_manual_review column
□ Verify: Existing receipts have NULL value
□ Verify: New receipts get TRUE/FALSE values
3. API Response Test
□ POST /api/ocr/extract with test receipt
□ Verify response includes: needs_manual_review, validation_warnings
□ Verify Save button works even with warnings
4. Validation Rules Test
□ Test with receipt having wrong amounts (should flag)
□ Test with receipt having correct amounts (should pass)
□ Test payment sum mismatch detection
□ Test TVA ratio validation
5. Medium OCR vs Heavy OCR
□ Compare results on clear PDFs
□ Verify no digit concatenation errors
□ Check processing time is similar
6. Unit Tests
□ Run: pytest backend/modules/data_entry/tests/test_ocr_validation.py -v
□ Verify: All tests pass
□ Check: Coverage > 90%
"""
if __name__ == "__main__":
    # Running the module directly hands this file to pytest with verbose
    # output and short tracebacks.
    cli_args = [__file__, "-v", "--tb=short"]
    pytest.main(cli_args)