"""
Integration tests for OCR validation system.

These tests verify the end-to-end validation flow with real OCR processing.

IMPORTANT: These tests require:
1. PaddleOCR models downloaded
2. Tesseract installed
3. Test receipt files in docs/data-entry/

Run with: pytest backend/modules/data_entry/tests/test_ocr_validation_integration.py -v
"""

from decimal import Decimal  # noqa: F401 - used by the TODO sketch in test_correct_amount_extracted
from pathlib import Path

import pytest

# Mark all tests as integration tests (slower, require OCR models)
pytestmark = pytest.mark.integration


@pytest.fixture
def five_holding_receipt_path() -> Path:
    """Path to Five-Holding production receipt (85.99 LEI test case).

    The path is relative, so it resolves against the directory pytest is
    started from.
    """
    return Path("docs/data-entry/igiena 14 decembrie five-holding.pdf")


@pytest.fixture
def validation_engine():
    """Return a fresh ``OCRValidationEngine`` for each test.

    The import happens inside the fixture (the tests originally imported it
    inside each test body), so importing this module does not import the
    validation package.
    """
    from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine

    return OCRValidationEngine()


class TestProductionCaseFiveHolding:
    """Test the critical Five-Holding receipt case (85.99 not 859,762.16)."""

    def test_correct_amount_extracted(self, five_holding_receipt_path):
        """Verify Five-Holding receipt extracts 85.99 LEI, not 859,762.16."""
        # TODO: Implement when OCR service is running
        # from backend.modules.data_entry.services.ocr_service import OCRService
        # service = OCRService()
        # success, message, extraction = service.process_receipt(five_holding_receipt_path)
        #
        # assert success is True
        # assert extraction.amount == Decimal('85.99'), f"Expected 85.99, got {extraction.amount}"
        # assert extraction.tva_total == Decimal('14.92'), f"Expected 14.92, got {extraction.tva_total}"
        pytest.skip("Requires running OCR service - manual test")

    def test_no_magnitude_errors(self, five_holding_receipt_path):
        """Verify no 10,000x magnitude errors."""
        # TODO: Verify extraction.amount < 1000 (not 859,762.16)
        pytest.skip("Requires running OCR service - manual test")

    def test_validation_warnings_if_any(self, five_holding_receipt_path):
        """Check validation warnings on Five-Holding receipt."""
        # TODO: extraction.validation_warnings should be empty or minimal
        pytest.skip("Requires running OCR service - manual test")


class TestValidationIntegration:
    """Test validation integration with OCR pipeline."""

    def test_payment_sum_validation_mock(self, validation_engine):
        """Test payment sum validation with mocked data."""
        # This can run without OCR - just tests validation logic
        # Case: Payment sum mismatch
        data = {
            'amount': 100.0,
            'card_amount': 50.0,
            'cash_amount': 40.0,  # Sum = 90, diff = 10
        }

        result = validation_engine.validate_extraction(data)

        assert result.needs_manual_review is True, "payment mismatch should require manual review"
        assert result.validation_warnings, "expected at least one validation warning"
        assert any('Payment sum' in w for w in result.validation_warnings), (
            f"no 'Payment sum' warning in {result.validation_warnings!r}"
        )

    def test_tva_ratio_validation_mock(self, validation_engine):
        """Test TVA ratio validation with mocked data."""
        # Case: TVA too high (> 24%)
        data = {
            'amount': 100.0,
            'tva': 30.0,  # 30% - invalid!
        }

        result = validation_engine.validate_extraction(data)

        assert result.needs_manual_review is True, "excessive TVA should require manual review"
        assert any('TVA ratio' in w for w in result.validation_warnings), (
            f"no 'TVA ratio' warning in {result.validation_warnings!r}"
        )

    def test_amount_range_validation_mock(self, validation_engine):
        """Test amount range validation with mocked data."""
        # Case: Amount too high (> 100,000)
        data = {
            'amount': 859_762.16,  # Production error case!
        }

        result = validation_engine.validate_extraction(data)

        assert result.needs_manual_review is True, "out-of-range amount should require manual review"
        assert result.validation_errors, "expected at least one validation error"
        assert any('exceeds maximum' in e for e in result.validation_errors), (
            f"no 'exceeds maximum' error in {result.validation_errors!r}"
        )

    def test_medium_ocr_preprocessing(self):
        """Test that Medium OCR preprocessing works."""
        # TODO:
        # from backend.modules.data_entry.services.image_preprocessor import ImagePreprocessor
        # preprocessor = ImagePreprocessor()
        # # Load test image
        # # Apply preprocess_medium()
        # # Verify output shape and values
        pytest.skip("Requires OCR models - manual test")


class TestDatabaseIntegration:
    """Test database integration for needs_manual_review field."""

    def test_receipt_model_has_validation_field(self):
        """Verify Receipt model has needs_manual_review field."""
        # TODO: Check Receipt model
        pytest.skip("Requires database connection")

    def test_migration_adds_column(self):
        """Verify migration adds needs_manual_review column."""
        # TODO: Run migration and check column exists
        pytest.skip("Requires database connection")


# =============================================================================
# MANUAL TESTING CHECKLIST
# =============================================================================

"""
MANUAL TESTS TO PERFORM:

1. Five-Holding Receipt Test (Production Case)
   □ Upload: docs/data-entry/igiena 14 decembrie five-holding.pdf
   □ Verify TOTAL: 85.99 LEI (not 859,762.16)
   □ Verify TVA: 14.92 LEI (not 149,214.92)
   □ Verify CUI: R010562600
   □ Verify no validation warnings (or only minor ones)

2. Database Migration Test
   □ Run: alembic upgrade head
   □ Check: receipts table has needs_manual_review column
   □ Verify: Existing receipts have NULL value
   □ Verify: New receipts get TRUE/FALSE values

3. API Response Test
   □ POST /api/ocr/extract with test receipt
   □ Verify response includes: needs_manual_review, validation_warnings
   □ Verify Save button works even with warnings

4. Validation Rules Test
   □ Test with receipt having wrong amounts (should flag)
   □ Test with receipt having correct amounts (should pass)
   □ Test payment sum mismatch detection
   □ Test TVA ratio validation

5. Medium OCR vs Heavy OCR
   □ Compare results on clear PDFs
   □ Verify no digit concatenation errors
   □ Check processing time is similar

6. Unit Tests
   □ Run: pytest backend/modules/data_entry/tests/test_ocr_validation.py -v
   □ Verify: All tests pass
   □ Check: Coverage > 90%
"""


if __name__ == "__main__":
    # Propagate pytest's exit code so running this file as a script reports
    # failures to the calling shell / CI instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v", "--tb=short"]))