feat(ocr): Add validation system and CLIENT CUI extraction

OCR Data Extraction Validation System:
- Add 7 validation rules (amount range, TVA ratio, payment sum, etc.)
- Add Medium preprocessing to replace Heavy (fixes digit concatenation)
- Add validation warnings to API responses
- Flag receipts needing manual review (needs_manual_review field)
- Add database migration for needs_manual_review column

CLIENT CUI Extraction Improvements:
- Support all format variations: CIF CLIENT:, CLIENT C.U.I/C.I.F., etc.
- Handle OCR errors (R0 vs RO, C1F vs CIF)
- Add client_name, client_cui, client_address to API response
- Add validation fields to API response (was missing)

QA Review: 12 issues found, 9 fixed (5 errors + 4 warnings)
- Fixed type safety in validation rules
- Fixed ZeroDivisionError risk
- Fixed schema mismatch (Optional[bool] for needs_manual_review)
- All 37 unit tests passing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-30 19:12:52 +02:00
parent ce85e0643b
commit ab160b628d
14 changed files with 4161 additions and 33 deletions
--- a/backend/modules/data_entry/tests/test_ocr_validation_integration.py
+++ b/backend/modules/data_entry/tests/test_ocr_validation_integration.py
@@ -0,0 +1,180 @@
+"""
+Integration tests for OCR validation system.
+
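+These tests cover the end-to-end validation flow; the cases that need real OCR processing or a database are currently skipped placeholders, and only the mocked-data validation tests run.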
+
+IMPORTANT: These tests require:
+1. PaddleOCR models downloaded
+2. Tesseract installed
+3. Test receipt files in docs/data-entry/
+
+Run with: pytest backend/modules/data_entry/tests/test_ocr_validation_integration.py -v
+"""
+
+import pytest
+from pathlib import Path
+from decimal import Decimal
+
+
+# Mark all tests as integration tests (slower; most require OCR models or a database, the mocked-data ones do not)
+pytestmark = pytest.mark.integration
+
+
+@pytest.fixture
+def five_holding_receipt_path():
+    """Path to Five-Holding production receipt (85.99 LEI test case)."""
+    return Path("docs/data-entry/igiena 14 decembrie five-holding.pdf")
+
+
+class TestProductionCaseFiveHolding:
+    """Test the critical Five-Holding receipt case (85.99 not 859,762.16)."""
+
+    def test_correct_amount_extracted(self, five_holding_receipt_path):
+        """Verify Five-Holding receipt extracts 85.99 LEI, not 859,762.16."""
+        # TODO: Implement when OCR service is running
+        # from backend.modules.data_entry.services.ocr_service import OCRService
+        # service = OCRService()
+        # success, message, extraction = service.process_receipt(five_holding_receipt_path)
+        #
+        # assert success is True
+        # assert extraction.amount == Decimal('85.99'), f"Expected 85.99, got {extraction.amount}"
+        # assert extraction.tva_total == Decimal('14.92'), f"Expected 14.92, got {extraction.tva_total}"
+        pytest.skip("Requires running OCR service - manual test")
+
+    def test_no_magnitude_errors(self, five_holding_receipt_path):
+        """Verify no 10,000x magnitude errors."""
+        # TODO: Verify extraction.amount < 1000 (not 859,762.16)
+        pytest.skip("Requires running OCR service - manual test")
+
+    def test_validation_warnings_if_any(self, five_holding_receipt_path):
+        """Check validation warnings on Five-Holding receipt."""
+        # TODO: extraction.validation_warnings should be empty or minimal
+        pytest.skip("Requires running OCR service - manual test")
+
+
+class TestValidationIntegration:
+    """Test validation integration with OCR pipeline."""
+
+    def test_payment_sum_validation_mock(self):
+        """Test payment sum validation with mocked data."""
+        # This can run without OCR - just tests validation logic
+        from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine
+
+        validator = OCRValidationEngine()
+
+        # Case: Payment sum mismatch
+        data = {
+            'amount': 100.0,
+            'card_amount': 50.0,
+            'cash_amount': 40.0,  # Sum = 90, diff = 10
+        }
+
+        result = validator.validate_extraction(data)
+
+        assert result.needs_manual_review is True
+        assert len(result.validation_warnings) > 0
+        assert any('Payment sum' in w for w in result.validation_warnings)
+
+    def test_tva_ratio_validation_mock(self):
+        """Test TVA ratio validation with mocked data."""
+        from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine
+
+        validator = OCRValidationEngine()
+
+        # Case: TVA too high (> 24%)
+        data = {
+            'amount': 100.0,
+            'tva': 30.0,  # 30% - invalid!
+        }
+
+        result = validator.validate_extraction(data)
+
+        assert result.needs_manual_review is True
+        assert any('TVA ratio' in w for w in result.validation_warnings)
+
+    def test_amount_range_validation_mock(self):
+        """Test amount range validation with mocked data."""
+        from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine
+
+        validator = OCRValidationEngine()
+
+        # Case: Amount too high (> 100,000)
+        data = {
+            'amount': 859_762.16,  # Production error case!
+        }
+
+        result = validator.validate_extraction(data)
+
+        assert result.needs_manual_review is True
+        assert len(result.validation_errors) > 0
+        assert any('exceeds maximum' in e for e in result.validation_errors)
+
+    def test_medium_ocr_preprocessing(self):
+        """Test that Medium OCR preprocessing works."""
+        pytest.skip("Requires OCR models - manual test")
+        # TODO:
+        # from backend.modules.data_entry.services.image_preprocessor import ImagePreprocessor
+        # preprocessor = ImagePreprocessor()
+        # # Load test image
+        # # Apply preprocess_medium()
+        # # Verify output shape and values
+
+
+class TestDatabaseIntegration:
+    """Test database integration for needs_manual_review field."""
+
+    def test_receipt_model_has_validation_field(self):
+        """Verify Receipt model has needs_manual_review field."""
+        # TODO: Check Receipt model
+        pytest.skip("Requires database connection")
+
+    def test_migration_adds_column(self):
+        """Verify migration adds needs_manual_review column."""
+        # TODO: Run migration and check column exists
+        pytest.skip("Requires database connection")
+
+
+# =============================================================================
+# MANUAL TESTING CHECKLIST
+# =============================================================================
+"""
+MANUAL TESTS TO PERFORM:
+
+1. Five-Holding Receipt Test (Production Case)
+   □ Upload: docs/data-entry/igiena 14 decembrie five-holding.pdf
+   □ Verify TOTAL: 85.99 LEI (not 859,762.16)
+   □ Verify TVA: 14.92 LEI (not 149,214.92)
+   □ Verify CUI: RO10562600
+   □ Verify no validation warnings (or only minor ones)
+
+2. Database Migration Test
+   □ Run: alembic upgrade head
+   □ Check: receipts table has needs_manual_review column
+   □ Verify: Existing receipts have NULL value
+   □ Verify: New receipts get TRUE/FALSE values
+
+3. API Response Test
+   □ POST /api/ocr/extract with test receipt
+   □ Verify response includes: needs_manual_review, validation_warnings
+   □ Verify Save button works even with warnings
+
+4. Validation Rules Test
+   □ Test with receipt having wrong amounts (should flag)
+   □ Test with receipt having correct amounts (should pass)
+   □ Test payment sum mismatch detection
+   □ Test TVA ratio validation
+
+5. Medium OCR vs Heavy OCR
+   □ Compare results on clear PDFs
+   □ Verify no digit concatenation errors
+   □ Check processing time is similar
+
+6. Unit Tests
+   □ Run: pytest backend/modules/data_entry/tests/test_ocr_validation.py -v
+   □ Verify: All tests pass
+   □ Check: Coverage > 90%
+"""
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v", "--tb=short"])