OCR Data Extraction Validation System: - Add 7 validation rules (amount range, TVA ratio, payment sum, etc.) - Add Medium preprocessing to replace Heavy (fixes digit concatenation) - Add validation warnings to API responses - Flag receipts needing manual review (needs_manual_review field) - Add database migration for needs_manual_review column CLIENT CUI Extraction Improvements: - Support all format variations: CIF CLIENT:, CLIENT C.U.I/C.I.F., etc. - Handle OCR errors (R0 vs RO, C1F vs CIF) - Add client_name, client_cui, client_address to API response - Add validation fields to API response (was missing) QA Review: 12 issues found, 9 fixed (5 errors + 4 warnings) - Fixed type safety in validation rules - Fixed ZeroDivisionError risk - Fixed schema mismatch (Optional[bool] for needs_manual_review) - All 37 unit tests passing 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
181 lines
6.4 KiB
Python
181 lines
6.4 KiB
Python
"""
Integration tests for OCR validation system.

These tests verify the end-to-end validation flow with real OCR processing.

IMPORTANT: These tests require:
1. PaddleOCR models downloaded
2. Tesseract installed
3. Test receipt files in docs/data-entry/

Run with: pytest backend/modules/data_entry/tests/test_ocr_validation_integration.py -v
"""
|
|
|
|
import pytest
|
|
from pathlib import Path
|
|
from decimal import Decimal
|
|
|
|
|
|
# Module-wide marker: every test below is tagged ``integration``
# (the suite is slower and expects OCR models to be available).
pytestmark = pytest.mark.integration
|
|
|
|
|
|
@pytest.fixture
def five_holding_receipt_path():
    """Provide the location of the Five-Holding production receipt PDF.

    This is the 85.99 LEI test case. The returned ``Path`` is relative, so it
    is interpreted against whatever the current working directory is when the
    tests run.
    """
    receipt_pdf = Path("docs/data-entry/igiena 14 decembrie five-holding.pdf")
    return receipt_pdf
|
|
|
|
|
|
class TestProductionCaseFiveHolding:
    """Regression checks for the Five-Holding receipt (85.99, not 859,762.16).

    All tests here are placeholders: each one skips until it can be run
    against a live OCR service.
    """

    # Shared skip reason for every placeholder test in this class.
    _MANUAL_ONLY = "Requires running OCR service - manual test"

    def test_correct_amount_extracted(self, five_holding_receipt_path):
        """The receipt total should be read as 85.99 LEI, not 859,762.16."""
        # TODO: Implement when OCR service is running. Planned body:
        #
        #     from backend.modules.data_entry.services.ocr_service import OCRService
        #     service = OCRService()
        #     success, message, extraction = service.process_receipt(five_holding_receipt_path)
        #
        #     assert success is True
        #     assert extraction.amount == Decimal('85.99'), f"Expected 85.99, got {extraction.amount}"
        #     assert extraction.tva_total == Decimal('14.92'), f"Expected 14.92, got {extraction.tva_total}"
        pytest.skip(self._MANUAL_ONLY)

    def test_no_magnitude_errors(self, five_holding_receipt_path):
        """The extracted amount must not be off by a 10,000x factor."""
        # TODO: Verify extraction.amount < 1000 (not 859,762.16)
        pytest.skip(self._MANUAL_ONLY)

    def test_validation_warnings_if_any(self, five_holding_receipt_path):
        """Inspect the validation warnings produced for the Five-Holding receipt."""
        # TODO: extraction.validation_warnings should be empty or minimal
        pytest.skip(self._MANUAL_ONLY)
|
|
|
|
|
|
class TestValidationIntegration:
    """Checks of the validation engine as used by the OCR pipeline.

    The ``*_mock`` tests feed hand-built dictionaries straight into the
    validator, so they do not need OCR models.
    """

    @staticmethod
    def _validate(payload):
        """Run ``payload`` through a fresh ``OCRValidationEngine`` and return its result."""
        # Imported lazily so the module can be collected even when the
        # validation package is only needed by these tests.
        from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine

        engine = OCRValidationEngine()
        return engine.validate_extraction(payload)

    def test_payment_sum_validation_mock(self):
        """A card + cash total that differs from the amount must be flagged."""
        # card (50) + cash (40) = 90, which is 10 short of the stated amount.
        outcome = self._validate(
            dict(amount=100.0, card_amount=50.0, cash_amount=40.0)
        )

        assert outcome.needs_manual_review is True
        messages = outcome.validation_warnings
        assert len(messages) > 0
        assert any('Payment sum' in text for text in messages)

    def test_tva_ratio_validation_mock(self):
        """A TVA value that is too large relative to the amount must be flagged."""
        # TVA here is 30% of the amount, which the test expects to be rejected.
        outcome = self._validate(dict(amount=100.0, tva=30.0))

        assert outcome.needs_manual_review is True
        assert any('TVA ratio' in text for text in outcome.validation_warnings)

    def test_amount_range_validation_mock(self):
        """An implausibly large amount must be reported as a validation error."""
        # 859,762.16 is the value seen in the production misread.
        outcome = self._validate(dict(amount=859_762.16))

        assert outcome.needs_manual_review is True
        problems = outcome.validation_errors
        assert len(problems) > 0
        assert any('exceeds maximum' in text for text in problems)

    def test_medium_ocr_preprocessing(self):
        """Placeholder for checking the Medium OCR preprocessing step."""
        # TODO: planned outline once OCR models are available to the test run:
        #
        #     from backend.modules.data_entry.services.image_preprocessor import ImagePreprocessor
        #     preprocessor = ImagePreprocessor()
        #     # Load test image
        #     # Apply preprocess_medium()
        #     # Verify output shape and values
        pytest.skip("Requires OCR models - manual test")
|
|
|
|
|
|
class TestDatabaseIntegration:
    """Placeholder database checks for the ``needs_manual_review`` field.

    Both tests skip until they can be run against a database.
    """

    # Shared skip reason for every placeholder test in this class.
    _NEEDS_DB = "Requires database connection"

    def test_receipt_model_has_validation_field(self):
        """The Receipt model should expose a ``needs_manual_review`` field."""
        # TODO: Check Receipt model
        pytest.skip(self._NEEDS_DB)

    def test_migration_adds_column(self):
        """The migration should add the ``needs_manual_review`` column."""
        # TODO: Run migration and check column exists
        pytest.skip(self._NEEDS_DB)
|
|
|
|
|
|
# =============================================================================
# MANUAL TESTING CHECKLIST
# =============================================================================
"""
MANUAL TESTS TO PERFORM:

1. Five-Holding Receipt Test (Production Case)
   □ Upload: docs/data-entry/igiena 14 decembrie five-holding.pdf
   □ Verify TOTAL: 85.99 LEI (not 859,762.16)
   □ Verify TVA: 14.92 LEI (not 149,214.92)
   □ Verify CUI: R010562600
   □ Verify no validation warnings (or only minor ones)

2. Database Migration Test
   □ Run: alembic upgrade head
   □ Check: receipts table has needs_manual_review column
   □ Verify: Existing receipts have NULL value
   □ Verify: New receipts get TRUE/FALSE values

3. API Response Test
   □ POST /api/ocr/extract with test receipt
   □ Verify response includes: needs_manual_review, validation_warnings
   □ Verify Save button works even with warnings

4. Validation Rules Test
   □ Test with receipt having wrong amounts (should flag)
   □ Test with receipt having correct amounts (should pass)
   □ Test payment sum mismatch detection
   □ Test TVA ratio validation

5. Medium OCR vs Heavy OCR
   □ Compare results on clear PDFs
   □ Verify no digit concatenation errors
   □ Check processing time is similar

6. Unit Tests
   □ Run: pytest backend/modules/data_entry/tests/test_ocr_validation.py -v
   □ Verify: All tests pass
   □ Check: Coverage > 90%
"""
|
|
|
|
|
|
if __name__ == "__main__":
    # Allow running this file directly. pytest.main() returns the run's exit
    # code; raise it as the process exit status so that a failing run is
    # visible to shells and CI instead of always exiting with 0.
    raise SystemExit(pytest.main([__file__, "-v", "--tb=short"]))
|