"""Pydantic schemas for OCR API.""" from datetime import date from decimal import Decimal from typing import Optional, List from pydantic import BaseModel, Field class TvaEntry(BaseModel): """Single TVA entry with code, percentage and amount.""" code: Optional[str] = Field(default=None, description="TVA code: A, B, C, D") percent: int = Field(description="TVA percentage: 0, 5, 9, 19, 21") amount: Decimal = Field(description="TVA amount for this rate") class PaymentMethod(BaseModel): """Payment method entry from OCR.""" method: str = Field(description="CARD or NUMERAR") amount: Decimal = Field(description="Amount paid") class ValidationWarning(BaseModel): """Validation warning from OCR extraction.""" field: str = Field(description="Field name (e.g., 'amount', 'tva_total')") rule: str = Field(description="Rule name (e.g., 'amount_range', 'tva_ratio')") message: str = Field(description="Human-readable warning message") severity: str = Field(description="Severity: 'info', 'warning', 'error'") suggested_value: Optional[str] = Field(default=None, description="Suggested corrected value") class ExtractionData(BaseModel): """Extracted receipt data from OCR.""" receipt_type: str = Field(default='bon_fiscal', description="Receipt type: bon_fiscal or chitanta") receipt_number: Optional[str] = Field(default=None, description="Receipt number") receipt_series: Optional[str] = Field(default=None, description="Receipt series") receipt_date: Optional[date] = Field(default=None, description="Receipt date") amount: Optional[Decimal] = Field(default=None, description="Total amount") partner_name: Optional[str] = Field(default=None, description="Vendor/partner name") cui: Optional[str] = Field(default=None, description="CUI (fiscal identification code)") description: Optional[str] = Field(default=None, description="Optional description") # Additional extracted fields - Multiple TVA entries support tva_entries: List[TvaEntry] = Field(default=[], description="List of TVA entries by rate (A, B, C, D)") tva_total: Optional[Decimal] = Field(default=None, description="Total TVA amount") address: Optional[str] = Field(default=None, description="Vendor address") items_count: Optional[int] = Field(default=None, description="Number of items/articles") # Payment methods extracted from receipt payment_methods: List[PaymentMethod] = Field(default=[], description="Payment methods from receipt (CARD, NUMERAR)") suggested_payment_mode: Optional[str] = Field(default=None, description="Auto-suggested payment mode based on OCR (casa/banca)") # Client data (for B2B receipts - buyer information) client_name: Optional[str] = Field(default=None, description="Client/customer company name") client_cui: Optional[str] = Field(default=None, description="Client CUI/CIF fiscal code") client_address: Optional[str] = Field(default=None, description="Client address") confidence_amount: float = Field(default=0.0, ge=0, le=1, description="Amount extraction confidence") confidence_date: float = Field(default=0.0, ge=0, le=1, description="Date extraction confidence") confidence_vendor: float = Field(default=0.0, ge=0, le=1, description="Vendor extraction confidence") confidence_client: float = Field(default=0.0, ge=0, le=1, description="Client extraction confidence") confidence_tva: float = Field(default=0.0, ge=0, le=1, description="TVA extraction confidence") confidence_payment: float = Field(default=0.0, ge=0, le=1, description="Payment extraction confidence") overall_confidence: float = Field(default=0.0, ge=0, le=1, description="Overall confidence score") raw_text: str = Field(default="", description="Raw OCR text (primary)") raw_texts: List[str] = Field(default=[], description="Raw OCR texts from all engine passes (for analysis)") ocr_engine: str = Field(default="", description="OCR engine used: paddleocr or tesseract") processing_time_ms: int = Field(default=0, ge=0, description="Processing time in milliseconds") # Validation results (added by bon-ocr-validation feature) # needs_manual_review: None = not validated yet (old receipts), False = no review needed, True = needs review needs_manual_review: Optional[bool] = Field(default=None, description="Flag for supervisor review (None=not validated, False=ok, True=needs review)") validation_warnings: List[str] = Field(default=[], description="Validation warnings") validation_errors: List[str] = Field(default=[], description="Validation errors") inter_ocr_ratios: dict[str, float] = Field(default={}, description="Inter-OCR consistency ratios") class Config: """Pydantic config.""" json_schema_extra = { "example": { "receipt_type": "bon_fiscal", "receipt_number": "1360760", "receipt_series": "0146", "receipt_date": "2025-10-11", "amount": 186.16, "partner_name": "FIVE-HOLDING S.A.", "cui": "10562600", "description": None, "tva_entries": [ {"code": "A", "percent": 19, "amount": 25.00}, {"code": "B", "percent": 9, "amount": 7.31} ], "tva_total": 32.31, "address": "JUD. CONSTANTA, MUN. CONSTANTA, STR. ION ROATA NR. 3", "items_count": 17, "confidence_amount": 0.98, "confidence_date": 0.98, "confidence_vendor": 0.95, "overall_confidence": 0.97, "raw_text": "FIVE-HOLDING S.A.\nCIF: RO10562600\n..." } } class OCRResponse(BaseModel): """OCR API response.""" success: bool = Field(description="Whether OCR processing was successful") message: str = Field(description="Status message") data: Optional[ExtractionData] = Field(default=None, description="Extracted data") class Config: """Pydantic config.""" json_schema_extra = { "example": { "success": True, "message": "OCR processing successful. Found: amount, date, vendor", "data": { "receipt_type": "bon_fiscal", "receipt_number": "12345", "receipt_date": "2024-01-15", "amount": 125.50, "partner_name": "MEGA IMAGE SRL", "cui": "12345678", "confidence_amount": 0.95, "confidence_date": 0.90, "confidence_vendor": 0.75, "overall_confidence": 0.87, "raw_text": "BON FISCAL\nMEGA IMAGE SRL\n..." } } } class OCRStatusResponse(BaseModel): """OCR service status response.""" available: bool = Field(description="Whether OCR service is available") engines: list[str] = Field(description="Available OCR engines") message: str = Field(description="Status message") # ============================================================================ # Job Queue Schemas (for async OCR processing) # ============================================================================ from datetime import datetime from enum import Enum class OCREngineChoice(str, Enum): """OCR engine selection options.""" tesseract = "tesseract" doctr = "doctr" # 3.3x faster than PaddleOCR with same accuracy (90/100) doctr_plus = "doctr_plus" # docTR with 2-tier sequential processing + early exit (optimized, recommended) paddleocr = "paddleocr" class OCRJobStatus(str, Enum): """OCR job status.""" pending = "pending" processing = "processing" completed = "completed" failed = "failed" class OCRJobSubmitResponse(BaseModel): """Response when submitting an OCR job.""" job_id: str = Field(description="Unique job identifier (UUID)") status: OCRJobStatus = Field(description="Initial job status (pending)") queue_position: int = Field(description="Position in queue (1 = next to process)") estimated_wait_seconds: int = Field(description="Estimated wait time in seconds") created_at: datetime = Field(description="Job creation timestamp") class Config: """Pydantic config.""" json_schema_extra = { "example": { "job_id": "abc123-def456-ghi789", "status": "pending", "queue_position": 3, "estimated_wait_seconds": 21, "created_at": "2024-01-15T12:00:00" } } class OCRJobResponse(BaseModel): """Full OCR job status response.""" job_id: str = Field(description="Unique job identifier") status: OCRJobStatus = Field(description="Current job status") queue_position: Optional[int] = Field(default=None, description="Queue position (None if processing/completed)") estimated_wait_seconds: Optional[int] = Field(default=None, description="Estimated wait time") created_at: datetime = Field(description="Job creation timestamp") started_at: Optional[datetime] = Field(default=None, description="Processing start timestamp") completed_at: Optional[datetime] = Field(default=None, description="Completion timestamp") # Detailed timing breakdown queue_wait_ms: Optional[int] = Field(default=None, description="Time waiting in queue (started_at - created_at)") ocr_time_ms: Optional[int] = Field(default=None, description="Actual OCR engine processing time") processing_time_ms: Optional[int] = Field(default=None, description="Total job processing time (completed_at - started_at)") result: Optional[ExtractionData] = Field(default=None, description="Extraction result (only if completed)") error: Optional[str] = Field(default=None, description="Error message (only if failed)") class Config: """Pydantic config.""" json_schema_extra = { "example": { "job_id": "abc123-def456-ghi789", "status": "completed", "queue_position": None, "estimated_wait_seconds": 0, "created_at": "2024-01-15T12:00:00", "started_at": "2024-01-15T12:00:21", "completed_at": "2024-01-15T12:00:28", "processing_time_ms": 6543, "result": { "receipt_number": "123", "amount": 85.99, "ocr_engine": "paddleocr-light" } } } class OCRQueueStatusResponse(BaseModel): """Queue statistics response.""" pending_jobs: int = Field(description="Number of jobs waiting in queue") processing_jobs: int = Field(description="Number of jobs currently processing") average_time_seconds: float = Field(description="Average processing time in seconds") class Config: """Pydantic config.""" json_schema_extra = { "example": { "pending_jobs": 5, "processing_jobs": 1, "average_time_seconds": 7.2 } }