Files
roa2web-service-auto/backend/modules/data_entry/tests/test_ocr_validation_integration.py
Marius Mutu ab160b628d feat(ocr): Add validation system and CLIENT CUI extraction
OCR Data Extraction Validation System:
- Add 7 validation rules (amount range, TVA ratio, payment sum, etc.)
- Add Medium preprocessing to replace Heavy (fixes digit concatenation)
- Add validation warnings to API responses
- Flag receipts needing manual review (needs_manual_review field)
- Add database migration for needs_manual_review column

CLIENT CUI Extraction Improvements:
- Support all format variations: CIF CLIENT:, CLIENT C.U.I/C.I.F., etc.
- Handle OCR errors (R0 vs RO, C1F vs CIF)
- Add client_name, client_cui, client_address to API response
- Add validation fields to API response (was missing)

QA Review: 12 issues found, 9 fixed (5 errors + 4 warnings)
- Fixed type safety in validation rules
- Fixed ZeroDivisionError risk
- Fixed schema mismatch (Optional[bool] for needs_manual_review)
- All 37 unit tests passing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-30 19:12:52 +02:00

181 lines
6.4 KiB
Python

"""
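Integration tests for the OCR validation system.

These tests are intended to verify the end-to-end validation flow with real
OCR processing. The OCR- and database-dependent tests in this file are
currently placeholders that skip themselves; only the mocked validation-engine
tests actually execute.

IMPORTANT: The full suite requires:
    1. PaddleOCR models downloaded
    2. Tesseract installed
    3. Test receipt files in docs/data-entry/

Run with: pytest backend/modules/data_entry/tests/test_ocr_validation_integration.py -v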
"""
import pytest
from pathlib import Path
from decimal import Decimal
# Mark all tests as integration tests (slower, require OCR models).
# pytest applies a module-level `pytestmark` to every test collected from this
# file, so the whole module can be selected or excluded via `-m integration`.
pytestmark = pytest.mark.integration
@pytest.fixture
def five_holding_receipt_path():
    """Provide the location of the Five-Holding production receipt.

    This is the PDF behind the 85.99 LEI test case. The path is relative,
    so it resolves against the directory pytest is run from.
    """
    receipt_pdf = Path("docs/data-entry/igiena 14 decembrie five-holding.pdf")
    return receipt_pdf
class TestProductionCaseFiveHolding:
    """Placeholders for the critical Five-Holding receipt (85.99, not 859,762.16).

    Every test in this class currently skips itself: the checks need a
    running OCR service and are performed manually for now.
    """

    # Single skip reason shared by all placeholder tests below.
    _SKIP_REASON = "Requires running OCR service - manual test"

    def test_correct_amount_extracted(self, five_holding_receipt_path):
        """The receipt total should be extracted as 85.99 LEI, not 859,762.16."""
        # TODO: Implement when OCR service is running. Planned body:
        #   from backend.modules.data_entry.services.ocr_service import OCRService
        #   service = OCRService()
        #   success, message, extraction = service.process_receipt(five_holding_receipt_path)
        #
        #   assert success is True
        #   assert extraction.amount == Decimal('85.99'), f"Expected 85.99, got {extraction.amount}"
        #   assert extraction.tva_total == Decimal('14.92'), f"Expected 14.92, got {extraction.tva_total}"
        pytest.skip(self._SKIP_REASON)

    def test_no_magnitude_errors(self, five_holding_receipt_path):
        """The extracted amount must not be off by a 10,000x magnitude."""
        # TODO: Verify extraction.amount < 1000 (not 859,762.16)
        pytest.skip(self._SKIP_REASON)

    def test_validation_warnings_if_any(self, five_holding_receipt_path):
        """Inspect the validation warnings produced for the Five-Holding receipt."""
        # TODO: extraction.validation_warnings should be empty or minimal
        pytest.skip(self._SKIP_REASON)
class TestValidationIntegration:
    """Check how the validation engine behaves as part of the OCR pipeline."""

    def test_payment_sum_validation_mock(self):
        """A card + cash total that disagrees with the amount gets flagged."""
        # Runs without OCR: only the validation logic is exercised.
        from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine

        # Payment sum mismatch: 50 (card) + 40 (cash) = 90, i.e. 10 short of 100.
        extracted = {
            'amount': 100.0,
            'card_amount': 50.0,
            'cash_amount': 40.0,
        }
        outcome = OCRValidationEngine().validate_extraction(extracted)

        assert outcome.needs_manual_review is True
        assert len(outcome.validation_warnings) > 0
        mentions_payment_sum = any(
            'Payment sum' in text for text in outcome.validation_warnings
        )
        assert mentions_payment_sum

    def test_tva_ratio_validation_mock(self):
        """A TVA value that is too large relative to the amount gets flagged."""
        from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine

        # TVA too high (> 24%): 30 out of 100 is 30%, which is invalid.
        extracted = {
            'amount': 100.0,
            'tva': 30.0,
        }
        outcome = OCRValidationEngine().validate_extraction(extracted)

        assert outcome.needs_manual_review is True
        mentions_tva_ratio = any(
            'TVA ratio' in text for text in outcome.validation_warnings
        )
        assert mentions_tva_ratio

    def test_amount_range_validation_mock(self):
        """An amount above the allowed maximum is reported as an error."""
        from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine

        # Amount too high (> 100,000): this is the value from the production error case.
        extracted = {'amount': 859_762.16}
        outcome = OCRValidationEngine().validate_extraction(extracted)

        assert outcome.needs_manual_review is True
        assert len(outcome.validation_errors) > 0
        mentions_maximum = any(
            'exceeds maximum' in text for text in outcome.validation_errors
        )
        assert mentions_maximum

    def test_medium_ocr_preprocessing(self):
        """Placeholder for checking that Medium OCR preprocessing works."""
        # TODO: planned steps once OCR models are available:
        #   from backend.modules.data_entry.services.image_preprocessor import ImagePreprocessor
        #   preprocessor = ImagePreprocessor()
        #   - Load test image
        #   - Apply preprocess_medium()
        #   - Verify output shape and values
        pytest.skip("Requires OCR models - manual test")
class TestDatabaseIntegration:
    """Placeholders for database checks around the needs_manual_review field.

    Both tests currently skip themselves because they need a database
    connection.
    """

    # Single skip reason shared by the placeholder tests below.
    _SKIP_REASON = "Requires database connection"

    def test_receipt_model_has_validation_field(self):
        """The Receipt model should expose a needs_manual_review field."""
        # TODO: Check Receipt model
        pytest.skip(self._SKIP_REASON)

    def test_migration_adds_column(self):
        """The migration should add the needs_manual_review column."""
        # TODO: Run migration and check column exists
        pytest.skip(self._SKIP_REASON)
# =============================================================================
# MANUAL TESTING CHECKLIST
# =============================================================================
"""
MANUAL TESTS TO PERFORM:
1. Five-Holding Receipt Test (Production Case)
□ Upload: docs/data-entry/igiena 14 decembrie five-holding.pdf
□ Verify TOTAL: 85.99 LEI (not 859,762.16)
□ Verify TVA: 14.92 LEI (not 149,214.92)
□ Verify CUI: RO10562600 (prefix is the letters 'RO', not 'R0' with a zero)
□ Verify no validation warnings (or only minor ones)
2. Database Migration Test
□ Run: alembic upgrade head
□ Check: receipts table has needs_manual_review column
□ Verify: Existing receipts have NULL value
□ Verify: New receipts get TRUE/FALSE values
3. API Response Test
□ POST /api/ocr/extract with test receipt
□ Verify response includes: needs_manual_review, validation_warnings
□ Verify Save button works even with warnings
4. Validation Rules Test
□ Test with receipt having wrong amounts (should flag)
□ Test with receipt having correct amounts (should pass)
□ Test payment sum mismatch detection
□ Test TVA ratio validation
5. Medium OCR vs Heavy OCR
□ Compare results on clear PDFs
□ Verify no digit concatenation errors
□ Check processing time is similar
6. Unit Tests
□ Run: pytest backend/modules/data_entry/tests/test_ocr_validation.py -v
□ Verify: All tests pass
□ Check: Coverage > 90%
"""
if __name__ == "__main__":
    # Allow running this file directly as a script. pytest.main() returns the
    # run's exit code; raise SystemExit with it so failures reach the shell/CI
    # instead of the process always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v", "--tb=short"]))