Files
roa2web-service-auto/backend/modules/data_entry/routers/ocr.py
Claude Agent b4a226409c feat(data-entry): Add unified receipt form with OCR confidence tracking
New unified receipt creation system with:
- UnifiedReceiptForm component with inline OCR preview and confidence indicators
- Compact upload zone with drag-drop and camera support
- TVA and Payment fields with dynamic add/remove
- Supplier dual-field with autocomplete and OCR hint
- Receipt form sections with collapsible auxiliary data

Backend OCR improvements:
- Add confidence_tva and confidence_payment to extraction results
- Update TVA extraction to return confidence scores
- Include TVA (15%) and payment (10%) in overall_confidence calculation

Also includes:
- CSS design system rules documentation
- Port check helper function for service scripts
- Expanded design tokens documentation in CLAUDE.md

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-08 21:48:37 +00:00

716 lines
25 KiB
Python

"""
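OCR API endpoints with async job queue support.
Endpoints:
- POST /extract - Submit OCR job (returns job_id immediately)
- GET /jobs/{job_id} - Get job status and result
- GET /jobs/{job_id}/wait - Long-poll until the job status changes or a timeout expires
- GET /queue/status - Get queue statistics
- GET /status - Check OCR service availability
- GET /engines - List enabled OCR engines and the default mode
- POST /extract-attachment/{attachment_id} - Run OCR on an existing attachment (always synchronous)
- POST /profiles/reload - Reload store profiles
- GET /profiles - List registered store profiles
- GET /profiles/{cui} - Get details for one store profile
For backwards compatibility, we also support sync mode via query param:
- POST /extract?sync=true - Process synchronously (blocks until complete)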
"""
import os
import tempfile
from datetime import datetime
from decimal import Decimal
from pathlib import Path
from typing import Optional
from fastapi import APIRouter, HTTPException, UploadFile, File, Depends, Query, Response
from sqlalchemy.ext.asyncio import AsyncSession
from backend.modules.data_entry.db.database import get_session
from backend.modules.data_entry.db.crud.attachment import AttachmentCRUD
from backend.modules.data_entry.services.ocr_service import ocr_service
from backend.modules.data_entry.services.ocr_engine import OCREngine
from backend.modules.data_entry.services.ocr.job_queue import job_queue, OCRJobStatus as JobStatus
from backend.modules.data_entry.services.ocr.job_worker import estimate_wait_time
from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine
from backend.modules.data_entry.schemas.ocr import (
OCRResponse,
OCRStatusResponse,
ExtractionData,
TvaEntry,
PaymentMethod,
# New job queue schemas
OCREngineChoice,
OCRJobStatus,
OCRJobSubmitResponse,
OCRJobResponse,
OCRQueueStatusResponse,
)
# Auth integration
from shared.auth.dependencies import get_current_user
from shared.auth.models import CurrentUser
router = APIRouter()
# ============================================================================
# OCR Job Queue Endpoints (NEW)
# ============================================================================
@router.post("/extract", response_model=OCRJobSubmitResponse)
async def submit_ocr_job(
    file: UploadFile = File(...),
    engine: OCREngineChoice = Query(default=OCREngineChoice.doctr_plus, description="OCR engine to use"),
    sync: bool = Query(default=False, description="If true, process synchronously (blocks)"),
    current_user: CurrentUser = Depends(get_current_user)
):
    """
    Submit an OCR job for processing.

    By default the upload is queued and the call returns immediately with a
    job_id; poll GET /jobs/{job_id} for the result. With ?sync=true the file
    is processed inline via _process_sync (kept for backwards compatibility,
    not recommended for production).

    Args:
        file: Uploaded image or PDF (JPEG, PNG or PDF; max 10MB).
        engine: OCR engine choice; its value is stored on the queued job.
        sync: If true, process synchronously (legacy mode).
        current_user: Authenticated user; the username is recorded on the job.

    Returns:
        OCRJobSubmitResponse with job_id, queue_position and estimated wait.

    Raises:
        HTTPException: 400 for an unsupported content type or an oversized
            file, 500 if the job could not be created.
    """
    allowed_types = ('image/jpeg', 'image/png', 'application/pdf')
    if file.content_type not in allowed_types:
        raise HTTPException(
            status_code=400,
            detail=f"File type not supported: {file.content_type}. Allowed: JPG, PNG, PDF"
        )

    # The size limit can only be enforced after the upload has been read.
    content = await file.read()
    if len(content) > 10 * 1024 * 1024:
        raise HTTPException(
            status_code=400,
            detail="File too large. Maximum size is 10MB."
        )

    # Sync mode - legacy blocking path.
    if sync:
        return await _process_sync(content, file, engine, current_user)

    # Async mode - enqueue and return immediately.
    try:
        job = await job_queue.create_job(
            file_bytes=content,
            mime_type=file.content_type,
            engine=engine.value,
            username=current_user.username,
            original_filename=file.filename
        )
        # A missing/zero position is reported as 1 (front of the queue).
        queue_position = (await job_queue.get_queue_position(job.id)) or 1
        return OCRJobSubmitResponse(
            job_id=job.id,
            status=OCRJobStatus.pending,
            queue_position=queue_position,
            estimated_wait_seconds=estimate_wait_time(queue_position),
            created_at=job.created_at or datetime.utcnow()
        )
    except HTTPException:
        # Deliberate HTTP errors must keep their own status code instead of
        # being rewrapped as a generic 500 below.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Failed to create OCR job: {str(e)}"
        ) from e
@router.get("/jobs/{job_id}", response_model=OCRJobResponse)
async def get_job_status(
    job_id: str,
    session: AsyncSession = Depends(get_session),
    current_user: CurrentUser = Depends(get_current_user)
):
    """
    Return the current state of an OCR job without waiting.

    GET /jobs/{job_id}/wait offers a long-polling variant of this endpoint.

    Args:
        job_id: Job identifier returned by POST /extract.
        session: Database session, used for fuzzy CUI matching on the result.
        current_user: Authenticated user (required by the dependency).

    Returns:
        OCRJobResponse with status, queue position / wait estimate while the
        job is pending or processing, and the extraction result once
        completed.

    Raises:
        HTTPException: 404 if the job queue has no job with this id.
    """
    job = await job_queue.get_job(job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    # Position and wait estimate are only populated for unfinished jobs.
    position, wait_seconds = None, None
    if job.status == JobStatus.pending:
        position = await job_queue.get_queue_position(job_id)
        wait_seconds = estimate_wait_time(position or 1)
    elif job.status == JobStatus.processing:
        position = 0
        # Rough guess: assume half of the average processing time remains.
        average_time = await job_queue.get_average_processing_time()
        wait_seconds = int(average_time * 0.5)

    result_data = None
    if job.status == JobStatus.completed and job.result:
        # Stored result is a plain dict; rebuild the schema object and then
        # let fuzzy matching correct the CUI where applicable.
        result_data = await _apply_fuzzy_cui_matching(
            _dict_to_extraction_data(job.result), session
        )
        print(f"[OCR Router] Returning job {job_id} with suggested_payment_mode={result_data.suggested_payment_mode}", flush=True)

    return OCRJobResponse(
        job_id=job.id,
        status=OCRJobStatus(job.status.value),
        queue_position=position,
        estimated_wait_seconds=wait_seconds,
        created_at=job.created_at or datetime.utcnow(),
        started_at=job.started_at,
        completed_at=job.completed_at,
        queue_wait_ms=job.queue_wait_ms,
        ocr_time_ms=job.ocr_time_ms,
        processing_time_ms=job.processing_time_ms,
        result=result_data,
        error=job.error_message
    )
@router.get("/jobs/{job_id}/wait", response_model=OCRJobResponse)
async def wait_for_job_status(
    job_id: str,
    response: Response,
    timeout: int = Query(default=30, ge=1, le=60, description="Max wait time in seconds"),
    wait_for_terminal: bool = Query(default=False, description="If true, only return on completed/failed"),
    _t: Optional[int] = Query(default=None, description="Cache-busting timestamp (ignored)"),
    session: AsyncSession = Depends(get_session),
    current_user: CurrentUser = Depends(get_current_user)
):
    """
    Long-poll for OCR job status change.

    The job is re-checked every 500ms and the call returns when:
    - the job is completed or failed (always), or
    - the status differs from the one seen on the previous check
      (unless wait_for_terminal is true), or
    - the timeout expires (the current status is returned).

    Recommended client timeout: timeout + 5 seconds.

    Args:
        job_id: Job identifier returned by POST /extract.
        response: Outgoing response; cache-suppressing headers are set on it.
        timeout: Max wait time in seconds (1-60, default 30).
        wait_for_terminal: If true, wait until completed/failed only.
        _t: Cache-busting value sent by clients; never read.
        session: Database session forwarded to get_job_status.
        current_user: Authenticated user forwarded to get_job_status.

    Returns:
        OCRJobResponse built by get_job_status.

    Raises:
        HTTPException: 404 if the job cannot be found.
    """
    import asyncio
    import time

    # Prevent caching - critical for long-polling
    response.headers["Cache-Control"] = "no-store, no-cache, must-revalidate, max-age=0"
    response.headers["Pragma"] = "no-cache"
    response.headers["Expires"] = "0"

    # Monotonic clock: the deadline must not move when the system wall clock
    # is adjusted while the request is waiting.
    start_time = time.monotonic()
    end_time = start_time + timeout
    last_status = None
    iteration = 0
    print(f"[OCR Wait] Starting long-poll for job {job_id}, timeout={timeout}s, wait_for_terminal={wait_for_terminal}", flush=True)

    while time.monotonic() < end_time:
        iteration += 1
        job = await job_queue.get_job(job_id)
        if not job:
            print(f"[OCR Wait] Job {job_id} not found after {iteration} iterations", flush=True)
            raise HTTPException(status_code=404, detail="Job not found")

        # Terminal states end the wait immediately.
        if job.status in (JobStatus.completed, JobStatus.failed):
            elapsed = time.monotonic() - start_time
            print(f"[OCR Wait] Job {job_id} {job.status.value} after {elapsed:.1f}s ({iteration} iterations)", flush=True)
            return await get_job_status(job_id, session, current_user)

        # A change can only be detected from the second check onwards,
        # because last_status is still None on the first pass.
        if not wait_for_terminal and last_status is not None and job.status != last_status:
            elapsed = time.monotonic() - start_time
            print(f"[OCR Wait] Job {job_id} status changed {last_status.value}->{job.status.value} after {elapsed:.1f}s", flush=True)
            return await get_job_status(job_id, session, current_user)

        last_status = job.status
        # Wait 500ms before the next internal check.
        await asyncio.sleep(0.5)

    # Timeout - return current status
    elapsed = time.monotonic() - start_time
    print(f"[OCR Wait] Job {job_id} timeout after {elapsed:.1f}s ({iteration} iterations), status={last_status.value if last_status else 'unknown'}", flush=True)
    return await get_job_status(job_id, session, current_user)
@router.get("/queue/status", response_model=OCRQueueStatusResponse)
async def get_queue_status(
    current_user: CurrentUser = Depends(get_current_user)
):
    """
    Report OCR queue statistics.

    Args:
        current_user: Authenticated user (required by the dependency).

    Returns:
        OCRQueueStatusResponse carrying the pending count, processing count
        and average time taken from job_queue.get_queue_stats().
    """
    queue_stats = await job_queue.get_queue_stats()
    pending = queue_stats["pending"]
    processing = queue_stats["processing"]
    average_seconds = queue_stats["average_time_seconds"]
    return OCRQueueStatusResponse(
        pending_jobs=pending,
        processing_jobs=processing,
        average_time_seconds=average_seconds,
    )
# ============================================================================
# Legacy Endpoints (backwards compatibility)
# ============================================================================
@router.get("/status", response_model=OCRStatusResponse)
async def get_ocr_status():
    """
    Report whether any OCR engine is available and which ones.

    Returns:
        OCRStatusResponse whose `available` flag is true when
        OCREngine.get_available_engines() returns at least one engine, along
        with the engine list and a human-readable message.
    """
    engines = OCREngine.get_available_engines()

    # Guard clause for the "nothing installed" case.
    if not len(engines) > 0:
        return OCRStatusResponse(
            available=False,
            engines=engines,
            message="No OCR engines available. Install PaddleOCR or Tesseract.",
        )

    return OCRStatusResponse(
        available=True,
        engines=engines,
        message=f"OCR service ready with engines: {', '.join(engines)}",
    )
@router.get("/engines")
async def get_available_engines():
    """
    List which OCR engines are enabled, based on environment variables.

    OCR_ENABLE_PADDLEOCR and OCR_ENABLE_TESSERACT (both default "true")
    toggle the optional engines; doctr and doctr_plus are always reported as
    enabled. OCR_DEFAULT_ENGINE (default "doctr_plus") is returned as the
    default mode. Frontend should use this to filter the engine dropdown.

    Returns:
        Dict with `engines` (name -> enabled flag), `available_modes`
        (enabled names in tesseract, doctr, doctr_plus, paddleocr order),
        `default_mode` and hard-coded `memory_estimate_mb` figures.
    """
    def _flag_enabled(variable: str) -> bool:
        # Only the literal "true" (case-insensitive) enables an engine.
        return os.getenv(variable, "true").lower() == "true"

    paddle_enabled = _flag_enabled("OCR_ENABLE_PADDLEOCR")
    tesseract_enabled = _flag_enabled("OCR_ENABLE_TESSERACT")
    default_engine = os.getenv("OCR_DEFAULT_ENGINE", "doctr_plus")

    # Insertion order here also defines the order of available_modes.
    engines = {
        "tesseract": tesseract_enabled,
        "doctr": True,
        "doctr_plus": True,
        "paddleocr": paddle_enabled,
    }
    modes = [engine_name for engine_name, enabled in engines.items() if enabled]

    memory_estimate_mb = {
        "tesseract": 50,
        "doctr": 600,
        "doctr_plus": 600,
        "paddleocr": 800,
    }
    return {
        "engines": engines,
        "available_modes": modes,
        "default_mode": default_engine,
        "memory_estimate_mb": memory_estimate_mb,
    }
@router.post("/extract-attachment/{attachment_id}", response_model=OCRResponse)
async def extract_from_attachment(
    attachment_id: int,
    engine: OCREngineChoice = Query(default=OCREngineChoice.doctr_plus),
    session: AsyncSession = Depends(get_session),
    current_user: CurrentUser = Depends(get_current_user)
):
    """
    Run OCR on an attachment that has already been uploaded.

    Always processes synchronously (the job queue is not used here).

    Args:
        attachment_id: Id of the stored attachment to re-process.
        engine: Accepted as a query parameter.
            NOTE(review): it is not passed on to ocr_service.process_image.
        session: Database session for the attachment lookup and CUI matching.
        current_user: Authenticated user (required by the dependency).

    Returns:
        OCRResponse with success=True, the service message and extracted data.

    Raises:
        HTTPException: 404 if the attachment record or its file is missing,
            400 if the MIME type is not JPEG/PNG/PDF, 422 if OCR fails.
    """
    attachment = await AttachmentCRUD.get_by_id(session, attachment_id)
    if not attachment:
        raise HTTPException(status_code=404, detail="Attachment not found")

    file_path = AttachmentCRUD.get_file_path(attachment)
    if not file_path.exists():
        raise HTTPException(status_code=404, detail="File not found on disk")

    # Same MIME whitelist as the upload endpoint.
    supported_types = ('image/jpeg', 'image/png', 'application/pdf')
    if attachment.mime_type not in supported_types:
        raise HTTPException(
            status_code=400,
            detail=f"File type not supported for OCR: {attachment.mime_type}"
        )

    # TODO: Could use job queue here too, but keeping sync for now
    success, message, result = await ocr_service.process_image(
        file_path, attachment.mime_type
    )
    if not success:
        raise HTTPException(status_code=422, detail=message)

    # Convert to the response schema, then apply fuzzy CUI matching.
    data = await _apply_fuzzy_cui_matching(
        _result_to_extraction_data(result), session
    )
    return OCRResponse(success=True, message=message, data=data)
# ============================================================================
# Helper Functions
# ============================================================================
async def _apply_fuzzy_cui_matching(
    extraction_data: ExtractionData,
    session: AsyncSession
) -> ExtractionData:
    """
    Correct the extracted CUI via fuzzy supplier matching when needed.

    Matching is attempted only when the CUI is missing or fails the checksum
    test. A CUI with at least 6 digits and a valid checksum is trusted as-is.
    Any error raised during matching is printed and otherwise ignored, so
    this helper never fails the request.

    Args:
        extraction_data: Extraction result; may be modified in place.
        session: Database session passed to the supplier lookup.

    Returns:
        The same ExtractionData object, with `cui` (and `partner_name`, if it
        was empty) replaced when a differing match was found.
    """
    from backend.modules.data_entry.services.ocr.validation import CUIChecksumRule

    # Nothing to match on: neither a CUI nor a vendor name.
    if not (extraction_data.cui or extraction_data.partner_name):
        return extraction_data

    # Trust a CUI that passes the checksum and skip the lookup entirely.
    if extraction_data.cui:
        cui_digits = CUIChecksumRule.extract_digits(extraction_data.cui)
        checksum_ok = len(cui_digits) >= 6 and CUIChecksumRule.validate_checksum(cui_digits)
        if checksum_ok:
            print(f"[Fuzzy Match] CUI {extraction_data.cui} has valid checksum, skipping fuzzy match", flush=True)
            return extraction_data

    # CUI is missing or invalid: best-effort lookup.
    try:
        match = await OCRValidationEngine.fuzzy_match_supplier(
            cui=extraction_data.cui,
            vendor_name=extraction_data.partner_name,
            db_session=session
        )
        if match:
            corrected_cui, supplier_name = match
            cui_changed = corrected_cui != extraction_data.cui
            if cui_changed:
                print(f"[Fuzzy Match] Corrected: {extraction_data.cui} -> {corrected_cui} ({supplier_name})", flush=True)
                extraction_data.cui = corrected_cui
                # Fill in the supplier name only when OCR produced none.
                if not extraction_data.partner_name:
                    extraction_data.partner_name = supplier_name
    except Exception as e:
        print(f"[Fuzzy Match] Error: {e}", flush=True)

    return extraction_data
async def _process_sync(
    content: bytes,
    file: UploadFile,
    engine: OCREngineChoice,
    current_user: CurrentUser
) -> OCRJobSubmitResponse:
    """
    Process OCR synchronously (legacy mode).

    Writes the upload to a temporary file, runs ocr_service.process_image on
    it and returns a job-style response marked as completed. The temporary
    file is always removed afterwards.

    NOTE(review): the extraction result is not part of the returned
    OCRJobSubmitResponse, and no job is stored in the queue, so the returned
    job_id cannot be used with GET /jobs/{job_id}. `engine` and
    `current_user` are accepted but unused.

    Args:
        content: Raw bytes of the uploaded file.
        file: The upload; its filename picks the temp-file suffix and its
            content type is passed to the OCR service.
        engine: Requested OCR engine (currently unused).
        current_user: Authenticated user (currently unused).

    Returns:
        OCRJobSubmitResponse with status completed, queue position 0 and a
        synthetic "sync-<digest>" job id derived from the file content.

    Raises:
        HTTPException: 422 when the OCR service reports failure.
    """
    import hashlib

    # Keep a recognised extension on the temp file; fall back to .jpg.
    suffix = Path(file.filename).suffix.lower() if file.filename else '.jpg'
    if suffix not in ('.jpg', '.jpeg', '.png', '.pdf'):
        suffix = '.jpg'

    # delete=False because the OCR service reopens the file by path; cleanup
    # is handled in the finally block, which also covers a failed write.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    tmp_path = Path(tmp.name)
    try:
        with tmp:
            tmp.write(content)

        success, message, result = await ocr_service.process_image(
            tmp_path, file.content_type
        )
        if not success:
            raise HTTPException(status_code=422, detail=message)

        # Content digest instead of hash(): the built-in hash of bytes is
        # salted per process and may be negative, so it gave unstable ids.
        content_digest = hashlib.sha256(content).hexdigest()[:16]
        return OCRJobSubmitResponse(
            job_id="sync-" + content_digest,
            status=OCRJobStatus.completed,
            queue_position=0,
            estimated_wait_seconds=0,
            created_at=datetime.utcnow()
        )
    finally:
        # Clean up temp file
        tmp_path.unlink(missing_ok=True)
def _result_to_extraction_data(result) -> ExtractionData:
    """
    Convert an OCR service result object to the ExtractionData schema.

    Args:
        result: Result object returned by ocr_service.process_image; fields
            are read as attributes, with `tva_entries` and `payment_methods`
            expected to be lists of dicts.

    Returns:
        ExtractionData populated from the result. `suggested_payment_mode` is
        'banca' when any detected payment method is 'CARD', otherwise None.
        Confidence fields that may be absent on the result default to 0.0.
    """
    # Convert tva_entries from dict to TvaEntry objects
    tva_entries_schema = [
        TvaEntry(code=e.get('code'), percent=e['percent'], amount=e['amount'])
        for e in result.tva_entries
    ] if result.tva_entries else []

    # Convert payment_methods from dict to PaymentMethod objects
    payment_methods_list = [
        PaymentMethod(method=pm['method'], amount=Decimal(str(pm['amount'])))
        for pm in result.payment_methods
    ] if result.payment_methods else []

    # Auto-suggest payment_mode: a card payment implies the bank mode.
    has_card = any(pm.method == 'CARD' for pm in payment_methods_list)
    suggested_payment_mode = 'banca' if has_card else None

    return ExtractionData(
        receipt_type=result.receipt_type,
        receipt_number=result.receipt_number,
        receipt_series=result.receipt_series,
        receipt_date=result.receipt_date,
        amount=result.amount,
        partner_name=result.partner_name,
        cui=result.cui,
        description=result.description,
        tva_entries=tva_entries_schema,
        tva_total=result.tva_total,
        address=result.address,
        items_count=result.items_count,
        payment_methods=payment_methods_list,
        suggested_payment_mode=suggested_payment_mode,
        client_name=result.client_name,
        client_cui=result.client_cui,
        client_address=result.client_address,
        confidence_amount=result.confidence_amount,
        confidence_date=result.confidence_date,
        confidence_vendor=result.confidence_vendor,
        confidence_client=getattr(result, 'confidence_client', 0.0),
        # Kept in step with _dict_to_extraction_data, which forwards these
        # two scores; getattr guards result objects that lack them.
        confidence_tva=getattr(result, 'confidence_tva', 0.0),
        confidence_payment=getattr(result, 'confidence_payment', 0.0),
        overall_confidence=result.overall_confidence,
        raw_text=result.raw_text,
        raw_texts=getattr(result, 'raw_texts', []),
        ocr_engine=result.ocr_engine,
        processing_time_ms=result.processing_time_ms,
        needs_manual_review=result.needs_manual_review,
        validation_warnings=result.validation_warnings,
        validation_errors=result.validation_errors,
        inter_ocr_ratios=result.inter_ocr_ratios,
    )
def _dict_to_extraction_data(data: dict) -> ExtractionData:
    """
    Build an ExtractionData schema object from a job-queue result dict.

    Args:
        data: Result dict stored on a completed job. Missing keys fall back
            to the defaults used below.

    Returns:
        ExtractionData with an ISO date string parsed to a date (None if it
        cannot be parsed), monetary values converted to Decimal, and only
        dict-shaped TVA / payment entries kept.
    """
    from datetime import date

    # Parse the receipt date when it is a string; other values pass through.
    receipt_date = data.get('receipt_date')
    if isinstance(receipt_date, str):
        try:
            receipt_date = date.fromisoformat(receipt_date)
        except (ValueError, TypeError):
            receipt_date = None

    # `or []` also covers keys that are present but hold None.
    raw_tva_entries = data.get('tva_entries', []) or []
    tva_entries_schema = [
        TvaEntry(
            code=entry.get('code'),
            percent=entry.get('percent', 0),
            amount=Decimal(str(entry.get('amount', 0)))
        )
        for entry in raw_tva_entries
        if isinstance(entry, dict)
    ]

    raw_payment_methods = data.get('payment_methods', []) or []
    payment_methods_list = [
        PaymentMethod(
            method=payment.get('method', 'NUMERAR'),
            amount=Decimal(str(payment.get('amount', 0)))
        )
        for payment in raw_payment_methods
        if isinstance(payment, dict)
    ]

    # Go through str() so the Decimal reflects the value's printed form.
    amount = data.get('amount')
    amount = None if amount is None else Decimal(str(amount))
    tva_total = data.get('tva_total')
    tva_total = None if tva_total is None else Decimal(str(tva_total))

    return ExtractionData(
        receipt_type=data.get('receipt_type', 'bon_fiscal'),
        receipt_number=data.get('receipt_number'),
        receipt_series=data.get('receipt_series'),
        receipt_date=receipt_date,
        amount=amount,
        partner_name=data.get('partner_name'),
        cui=data.get('cui'),
        description=data.get('description'),
        tva_entries=tva_entries_schema,
        tva_total=tva_total,
        address=data.get('address'),
        items_count=data.get('items_count'),
        payment_methods=payment_methods_list,
        suggested_payment_mode=data.get('suggested_payment_mode'),
        client_name=data.get('client_name'),
        client_cui=data.get('client_cui'),
        client_address=data.get('client_address'),
        confidence_amount=data.get('confidence_amount', 0.0),
        confidence_date=data.get('confidence_date', 0.0),
        confidence_vendor=data.get('confidence_vendor', 0.0),
        confidence_client=data.get('confidence_client', 0.0),
        confidence_tva=data.get('confidence_tva', 0.0),
        confidence_payment=data.get('confidence_payment', 0.0),
        overall_confidence=data.get('overall_confidence', 0.0),
        raw_text=data.get('raw_text', ''),
        raw_texts=data.get('raw_texts', []),
        ocr_engine=data.get('ocr_engine', ''),
        processing_time_ms=data.get('processing_time_ms', 0),
        needs_manual_review=data.get('needs_manual_review'),
        validation_warnings=data.get('validation_warnings', []),
        validation_errors=data.get('validation_errors', []),
        inter_ocr_ratios=data.get('inter_ocr_ratios', {}),
    )
# ============================================================================
# Store Profiles Management Endpoints
# ============================================================================
@router.post("/profiles/reload")
async def reload_store_profiles(
    current_user: CurrentUser = Depends(get_current_user)
) -> dict:
    """
    Trigger a reload of all store profiles.

    Calls ProfileRegistry.reload_all() and then reports the registry's reload
    status. Intended for use after adding or modifying profile files.

    Args:
        current_user: Authenticated user (required by the dependency).

    Returns:
        Dict with `success`, `reloaded_modules` (value returned by
        reload_all), `profiles_count`, `registered_cuis` and `last_reload`.
    """
    from backend.modules.data_entry.services.ocr.profiles import ProfileRegistry

    reloaded = ProfileRegistry.reload_all()
    reload_status = ProfileRegistry.get_reload_status()

    payload = {"success": True, "reloaded_modules": reloaded}
    # Copy the status fields through in the documented order.
    payload["profiles_count"] = reload_status["profiles_count"]
    payload["registered_cuis"] = reload_status["registered_cuis"]
    payload["last_reload"] = reload_status["last_reload"]
    return payload
@router.get("/profiles")
async def list_store_profiles(
    current_user: CurrentUser = Depends(get_current_user)
) -> dict:
    """
    List all registered store profiles.

    Args:
        current_user: Authenticated user (required by the dependency).

    Returns:
        Dict with `profiles` (as returned by ProfileRegistry.list_profiles),
        `count` (its length) and `last_reload` from the reload status.
    """
    from backend.modules.data_entry.services.ocr.profiles import ProfileRegistry

    registered = ProfileRegistry.list_profiles()
    last_reload = ProfileRegistry.get_reload_status()["last_reload"]
    return {
        "profiles": registered,
        "count": len(registered),
        "last_reload": last_reload,
    }
@router.get("/profiles/{cui}")
async def get_store_profile(
    cui: str,
    current_user: CurrentUser = Depends(get_current_user)
) -> dict:
    """
    Return the details of a single store profile.

    Args:
        cui: Store CUI taken from the URL path (with or without RO prefix).
        current_user: Authenticated user (required by the dependency).

    Returns:
        Whatever ProfileRegistry.get_profile_info returns for this CUI.

    Raises:
        HTTPException: 404 when the registry has no info for this CUI.
    """
    from backend.modules.data_entry.services.ocr.profiles import ProfileRegistry

    profile_info = ProfileRegistry.get_profile_info(cui)
    if profile_info:
        return profile_info

    raise HTTPException(
        status_code=404,
        detail=f"No profile registered for CUI: {cui}"
    )