Files
roa2web-service-auto/backend/modules/data_entry/routers/ocr.py
Marius Mutu ab160b628d feat(ocr): Add validation system and CLIENT CUI extraction
OCR Data Extraction Validation System:
- Add 7 validation rules (amount range, TVA ratio, payment sum, etc.)
- Add Medium preprocessing to replace Heavy (fixes digit concatenation)
- Add validation warnings to API responses
- Flag receipts needing manual review (needs_manual_review field)
- Add database migration for needs_manual_review column

CLIENT CUI Extraction Improvements:
- Support all format variations: CIF CLIENT:, CLIENT C.U.I/C.I.F., etc.
- Handle OCR errors (R0 vs RO, C1F vs CIF)
- Add client_name, client_cui, client_address to API response
- Add validation fields to API response (was missing)

QA Review: 12 issues found, 9 fixed (5 errors + 4 warnings)
- Fixed type safety in validation rules
- Fixed ZeroDivisionError risk
- Fixed schema mismatch (Optional[bool] for needs_manual_review)
- All 37 unit tests passing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-30 19:12:52 +02:00

239 lines
8.6 KiB
Python

"""OCR API endpoints."""
import os
import tempfile
from pathlib import Path
from fastapi import APIRouter, HTTPException, UploadFile, File, Depends
from sqlalchemy.ext.asyncio import AsyncSession
from backend.modules.data_entry.db.database import get_session
from backend.modules.data_entry.db.crud.attachment import AttachmentCRUD
from backend.modules.data_entry.services.ocr_service import ocr_service
from backend.modules.data_entry.services.ocr_engine import OCREngine
from backend.modules.data_entry.schemas.ocr import OCRResponse, OCRStatusResponse, ExtractionData, TvaEntry, PaymentMethod
# Auth integration (will be protected by middleware)
from shared.auth.dependencies import get_current_user
from shared.auth.models import CurrentUser
router = APIRouter()
@router.get("/status", response_model=OCRStatusResponse)
async def get_ocr_status():
    """Report whether OCR is usable and which engines were detected.

    Returns:
        OCRStatusResponse with ``available`` set when at least one engine is
        reported by ``OCREngine.get_available_engines()``, the engine list
        itself, and a human-readable status message.
    """
    detected_engines = OCREngine.get_available_engines()
    is_available = len(detected_engines) > 0

    # One message per state: list the engines when there are any, otherwise
    # tell the operator what to install.
    status_message = (
        f"OCR service ready with engines: {', '.join(detected_engines)}"
        if is_available
        else "No OCR engines available. Install PaddleOCR or Tesseract."
    )

    return OCRStatusResponse(
        available=is_available,
        engines=detected_engines,
        message=status_message,
    )
# MIME types accepted for OCR; shared by both extraction endpoints.
_SUPPORTED_MIME_TYPES = ('image/jpeg', 'image/png', 'application/pdf')
# Suffixes preserved on the temp file; anything else falls back to '.jpg'.
_SUPPORTED_SUFFIXES = ('.jpg', '.jpeg', '.png', '.pdf')
# Upload size cap (10MB).
_MAX_UPLOAD_BYTES = 10 * 1024 * 1024


def _build_extraction_data(result):
    """Convert an OCR service result into the ``ExtractionData`` response schema.

    Args:
        result: The third element returned by ``ocr_service.process_image``.
            Its ``tva_entries`` and ``payment_methods`` attributes are read as
            (possibly empty or None) sequences of dicts; all other attributes
            are copied across unchanged.

    Returns:
        ExtractionData populated from ``result``, with ``suggested_payment_mode``
        set to ``'banca'`` when a ``'CARD'`` payment method was detected and
        ``None`` otherwise.
    """
    # Kept as a function-scope import, as in the original endpoint bodies.
    from decimal import Decimal

    # Convert tva_entries from dict to TvaEntry objects ('code' is optional).
    tva_entries_schema = [
        TvaEntry(code=e.get('code'), percent=e['percent'], amount=e['amount'])
        for e in result.tva_entries
    ] if result.tva_entries else []

    # Convert payment_methods from dict to PaymentMethod objects. The amount
    # goes through str() before Decimal() — presumably to avoid carrying
    # binary-float artifacts into the Decimal value.
    payment_methods_list = [
        PaymentMethod(method=pm['method'], amount=Decimal(str(pm['amount'])))
        for pm in result.payment_methods
    ] if result.payment_methods else []

    # Auto-suggest payment_mode based on detected methods.
    # NUMERAR -> no auto-suggestion, user chooses between casa/avans.
    suggested_payment_mode = None
    if any(pm.method == 'CARD' for pm in payment_methods_list):
        suggested_payment_mode = 'banca'

    return ExtractionData(
        receipt_type=result.receipt_type,
        receipt_number=result.receipt_number,
        receipt_series=result.receipt_series,
        receipt_date=result.receipt_date,
        amount=result.amount,
        partner_name=result.partner_name,
        cui=result.cui,
        description=result.description,
        tva_entries=tva_entries_schema,
        tva_total=result.tva_total,
        address=result.address,
        items_count=result.items_count,
        payment_methods=payment_methods_list,
        suggested_payment_mode=suggested_payment_mode,
        # Client data (B2B receipts)
        client_name=result.client_name,
        client_cui=result.client_cui,
        client_address=result.client_address,
        confidence_amount=result.confidence_amount,
        confidence_date=result.confidence_date,
        confidence_vendor=result.confidence_vendor,
        confidence_client=result.confidence_client,
        overall_confidence=result.overall_confidence,
        raw_text=result.raw_text,
        ocr_engine=result.ocr_engine,
        processing_time_ms=result.processing_time_ms,
        # Validation results
        needs_manual_review=result.needs_manual_review,
        validation_warnings=result.validation_warnings,
        validation_errors=result.validation_errors,
        inter_ocr_ratios=result.inter_ocr_ratios,
    )


@router.post("/extract", response_model=OCRResponse)
async def extract_from_image(file: UploadFile = File(...)):
    """
    Extract receipt data from uploaded image.

    Accepts JPG, PNG, or PDF files (max 10MB).
    Returns extracted fields with confidence scores.

    Raises:
        HTTPException: 400 when the content type is not supported or the
            upload exceeds 10MB; 422 when the OCR service reports failure.
    """
    if file.content_type not in _SUPPORTED_MIME_TYPES:
        raise HTTPException(
            status_code=400,
            detail=f"File type not supported: {file.content_type}. Allowed: JPG, PNG, PDF"
        )

    # Get file extension, falling back to '.jpg' when it is missing or unknown.
    suffix = Path(file.filename).suffix.lower() if file.filename else '.jpg'
    if suffix not in _SUPPORTED_SUFFIXES:
        suffix = '.jpg'

    # Read and validate the upload BEFORE creating the temp file, so a
    # rejected upload never leaves an orphaned file on disk.
    content = await file.read()
    if len(content) > _MAX_UPLOAD_BYTES:
        raise HTTPException(
            status_code=400,
            detail="File too large. Maximum size is 10MB."
        )

    tmp_path = None
    try:
        # delete=False: the file must outlive the `with` block so it can be
        # handed to the OCR service by path; the `finally` below removes it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_path = Path(tmp.name)
            tmp.write(content)

        success, message, result = await ocr_service.process_image(
            tmp_path, file.content_type
        )
        if not success:
            raise HTTPException(status_code=422, detail=message)

        data = _build_extraction_data(result)
        return OCRResponse(success=True, message=message, data=data)
    finally:
        # Clean up temp file on every exit path (success, 422, write error).
        if tmp_path is not None and tmp_path.exists():
            os.unlink(tmp_path)


@router.post("/extract-attachment/{attachment_id}", response_model=OCRResponse)
async def extract_from_attachment(
    attachment_id: int,
    session: AsyncSession = Depends(get_session),
):
    """
    Extract receipt data from an existing attachment.

    Re-processes an already uploaded file with OCR.

    Raises:
        HTTPException: 404 when the attachment record or its file on disk is
            missing; 400 when the stored MIME type is not supported for OCR;
            422 when the OCR service reports failure.
    """
    attachment = await AttachmentCRUD.get_by_id(session, attachment_id)
    if not attachment:
        raise HTTPException(status_code=404, detail="Attachment not found")

    file_path = AttachmentCRUD.get_file_path(attachment)
    if not file_path.exists():
        raise HTTPException(status_code=404, detail="File not found on disk")

    # Check if file type is supported
    if attachment.mime_type not in _SUPPORTED_MIME_TYPES:
        raise HTTPException(
            status_code=400,
            detail=f"File type not supported for OCR: {attachment.mime_type}"
        )

    success, message, result = await ocr_service.process_image(
        file_path, attachment.mime_type
    )
    if not success:
        raise HTTPException(status_code=422, detail=message)

    data = _build_extraction_data(result)
    return OCRResponse(success=True, message=message, data=data)