Add docTR as primary OCR engine with 2-tier sequential processing, OCR metrics tracking, and simplified engine selection. Features: - docTR OCR engine with light+medium preprocessing tiers - doctr_plus mode with early exit optimization (~65% fast path) - OCR metrics dashboard with per-engine statistics - User OCR preference persistence - Parallel worker pool for OCR processing - Cross-validation for extraction quality Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
242 lines
11 KiB
Python
242 lines
11 KiB
Python
"""Pydantic schemas for OCR API."""
|
|
|
|
from datetime import date
|
|
from decimal import Decimal
|
|
from typing import Optional, List
|
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
|
|
class TvaEntry(BaseModel):
    """One TVA line of a receipt: rate code, percentage and amount."""

    # Rate code is optional and defaults to None.
    code: Optional[str] = Field(
        None,
        description="TVA code: A, B, C, D",
    )
    # Percentage and amount are required (no default).
    percent: int = Field(
        ...,
        description="TVA percentage: 0, 5, 9, 19, 21",
    )
    amount: Decimal = Field(
        ...,
        description="TVA amount for this rate",
    )
|
|
|
|
|
|
class PaymentMethod(BaseModel):
    """A single payment method line read from the receipt by OCR."""

    # Both fields are required (no default).
    method: str = Field(
        ...,
        description="CARD or NUMERAR",
    )
    amount: Decimal = Field(
        ...,
        description="Amount paid",
    )
|
|
|
|
|
|
class ValidationWarning(BaseModel):
    """A warning raised while validating OCR-extracted data."""

    # Which field and which rule produced the warning (both required).
    field: str = Field(
        ...,
        description="Field name (e.g., 'amount', 'tva_total')",
    )
    rule: str = Field(
        ...,
        description="Rule name (e.g., 'amount_range', 'tva_ratio')",
    )
    # Required text and severity level of the warning.
    message: str = Field(
        ...,
        description="Human-readable warning message",
    )
    severity: str = Field(
        ...,
        description="Severity: 'info', 'warning', 'error'",
    )
    # Optional correction hint; defaults to None.
    suggested_value: Optional[str] = Field(
        None,
        description="Suggested corrected value",
    )
|
|
|
|
|
|
class ExtractionData(BaseModel):
    """Receipt data extracted by OCR.

    Groups the receipt identity fields, the TVA breakdown, payment methods,
    optional client (buyer) details, per-field confidence scores, the raw
    OCR output and validation results. Every field declares a default, so
    the model can be instantiated without arguments.
    """

    # --- Receipt identity -------------------------------------------------
    receipt_type: str = Field(
        default='bon_fiscal',
        description="Receipt type: bon_fiscal or chitanta",
    )
    receipt_number: Optional[str] = Field(default=None, description="Receipt number")
    receipt_series: Optional[str] = Field(default=None, description="Receipt series")
    receipt_date: Optional[date] = Field(default=None, description="Receipt date")
    amount: Optional[Decimal] = Field(default=None, description="Total amount")
    partner_name: Optional[str] = Field(default=None, description="Vendor/partner name")
    cui: Optional[str] = Field(
        default=None,
        description="CUI (fiscal identification code)",
    )
    description: Optional[str] = Field(default=None, description="Optional description")

    # --- Additional extracted fields (multiple TVA entries supported) ------
    tva_entries: List[TvaEntry] = Field(
        default=[],
        description="List of TVA entries by rate (A, B, C, D)",
    )
    tva_total: Optional[Decimal] = Field(default=None, description="Total TVA amount")
    address: Optional[str] = Field(default=None, description="Vendor address")
    items_count: Optional[int] = Field(
        default=None,
        description="Number of items/articles",
    )

    # --- Payment methods extracted from the receipt -----------------------
    payment_methods: List[PaymentMethod] = Field(
        default=[],
        description="Payment methods from receipt (CARD, NUMERAR)",
    )
    suggested_payment_mode: Optional[str] = Field(
        default=None,
        description="Auto-suggested payment mode based on OCR (casa/banca)",
    )

    # --- Client data (B2B receipts: buyer information) --------------------
    client_name: Optional[str] = Field(
        default=None,
        description="Client/customer company name",
    )
    client_cui: Optional[str] = Field(
        default=None,
        description="Client CUI/CIF fiscal code",
    )
    client_address: Optional[str] = Field(default=None, description="Client address")

    # --- Confidence scores, each constrained to the range [0, 1] ----------
    confidence_amount: float = Field(
        default=0.0, ge=0, le=1,
        description="Amount extraction confidence",
    )
    confidence_date: float = Field(
        default=0.0, ge=0, le=1,
        description="Date extraction confidence",
    )
    confidence_vendor: float = Field(
        default=0.0, ge=0, le=1,
        description="Vendor extraction confidence",
    )
    confidence_client: float = Field(
        default=0.0, ge=0, le=1,
        description="Client extraction confidence",
    )
    overall_confidence: float = Field(
        default=0.0, ge=0, le=1,
        description="Overall confidence score",
    )

    # --- Raw OCR output and engine bookkeeping ----------------------------
    raw_text: str = Field(default="", description="Raw OCR text (primary)")
    raw_texts: List[str] = Field(
        default=[],
        description="Raw OCR texts from all engine passes (for analysis)",
    )
    # The description previously named only paddleocr and tesseract; it now
    # lists the engines this module's OCREngineChoice enum offers.
    ocr_engine: str = Field(
        default="",
        description=(
            "OCR engine used, e.g. tesseract, doctr, doctr_plus, paddleocr "
            "(may include a variant suffix such as paddleocr-light)"
        ),
    )
    processing_time_ms: int = Field(
        default=0, ge=0,
        description="Processing time in milliseconds",
    )

    # --- Validation results (added by bon-ocr-validation feature) ---------
    # needs_manual_review: None = not validated yet (old receipts),
    # False = no review needed, True = needs review.
    needs_manual_review: Optional[bool] = Field(
        default=None,
        description="Flag for supervisor review (None=not validated, False=ok, True=needs review)",
    )
    validation_warnings: List[str] = Field(default=[], description="Validation warnings")
    validation_errors: List[str] = Field(default=[], description="Validation errors")
    inter_ocr_ratios: dict[str, float] = Field(
        default={},
        description="Inter-OCR consistency ratios",
    )

    class Config:
        """Pydantic config: attaches an example payload to the JSON schema."""

        json_schema_extra = {
            "example": {
                "receipt_type": "bon_fiscal",
                "receipt_number": "1360760",
                "receipt_series": "0146",
                "receipt_date": "2025-10-11",
                "amount": 186.16,
                "partner_name": "FIVE-HOLDING S.A.",
                "cui": "10562600",
                "description": None,
                "tva_entries": [
                    {"code": "A", "percent": 19, "amount": 25.00},
                    {"code": "B", "percent": 9, "amount": 7.31}
                ],
                "tva_total": 32.31,
                "address": "JUD. CONSTANTA, MUN. CONSTANTA, STR. ION ROATA NR. 3",
                "items_count": 17,
                "confidence_amount": 0.98,
                "confidence_date": 0.98,
                "confidence_vendor": 0.95,
                "overall_confidence": 0.97,
                "raw_text": "FIVE-HOLDING S.A.\nCIF: RO10562600\n..."
            }
        }
|
|
|
|
|
|
class OCRResponse(BaseModel):
    """Envelope returned by the OCR API."""

    # Required outcome flag and status text.
    success: bool = Field(
        ...,
        description="Whether OCR processing was successful",
    )
    message: str = Field(
        ...,
        description="Status message",
    )
    # Extraction payload; defaults to None.
    data: Optional[ExtractionData] = Field(
        None,
        description="Extracted data",
    )

    class Config:
        """Attaches an example payload to the JSON schema."""

        json_schema_extra = dict(
            example=dict(
                success=True,
                message="OCR processing successful. Found: amount, date, vendor",
                data=dict(
                    receipt_type="bon_fiscal",
                    receipt_number="12345",
                    receipt_date="2024-01-15",
                    amount=125.50,
                    partner_name="MEGA IMAGE SRL",
                    cui="12345678",
                    confidence_amount=0.95,
                    confidence_date=0.90,
                    confidence_vendor=0.75,
                    overall_confidence=0.87,
                    raw_text="BON FISCAL\nMEGA IMAGE SRL\n...",
                ),
            ),
        )
|
|
|
|
|
|
class OCRStatusResponse(BaseModel):
    """Status report of the OCR service."""

    # All three fields are required (no default).
    available: bool = Field(
        ...,
        description="Whether OCR service is available",
    )
    engines: list[str] = Field(
        ...,
        description="Available OCR engines",
    )
    message: str = Field(
        ...,
        description="Status message",
    )
|
|
|
|
|
|
# ============================================================================
|
|
# Job Queue Schemas (for async OCR processing)
|
|
# ============================================================================
|
|
|
|
from datetime import datetime
|
|
from enum import Enum
|
|
|
|
|
|
class OCREngineChoice(str, Enum):
    """Selectable OCR engines; each member's value is its own name."""

    tesseract = "tesseract"

    # Author's note: 3.3x faster than PaddleOCR with same accuracy (90/100).
    doctr = "doctr"

    # docTR with 2-tier sequential processing + early exit
    # (optimized, recommended).
    doctr_plus = "doctr_plus"

    paddleocr = "paddleocr"
|
|
|
|
|
|
class OCRJobStatus(str, Enum):
    """States an OCR job can be in; each member's value is its own name."""

    # Declared in this order: pending, processing, completed, failed.
    pending = "pending"
    processing = "processing"
    completed = "completed"
    failed = "failed"
|
|
|
|
|
|
class OCRJobSubmitResponse(BaseModel):
    """Acknowledgement returned when an OCR job is submitted."""

    # Every field is required (no default).
    job_id: str = Field(
        ...,
        description="Unique job identifier (UUID)",
    )
    status: OCRJobStatus = Field(
        ...,
        description="Initial job status (pending)",
    )
    queue_position: int = Field(
        ...,
        description="Position in queue (1 = next to process)",
    )
    estimated_wait_seconds: int = Field(
        ...,
        description="Estimated wait time in seconds",
    )
    created_at: datetime = Field(
        ...,
        description="Job creation timestamp",
    )

    class Config:
        """Attaches an example payload to the JSON schema."""

        json_schema_extra = dict(
            example=dict(
                job_id="abc123-def456-ghi789",
                status="pending",
                queue_position=3,
                estimated_wait_seconds=21,
                created_at="2024-01-15T12:00:00",
            ),
        )
|
|
|
|
|
|
class OCRJobResponse(BaseModel):
    """Complete status record of an OCR job."""

    # Identity and state (required).
    job_id: str = Field(
        ...,
        description="Unique job identifier",
    )
    status: OCRJobStatus = Field(
        ...,
        description="Current job status",
    )

    # Queue information; both default to None.
    queue_position: Optional[int] = Field(
        None,
        description="Queue position (None if processing/completed)",
    )
    estimated_wait_seconds: Optional[int] = Field(
        None,
        description="Estimated wait time",
    )

    # Lifecycle timestamps; only the creation time is required.
    created_at: datetime = Field(
        ...,
        description="Job creation timestamp",
    )
    started_at: Optional[datetime] = Field(
        None,
        description="Processing start timestamp",
    )
    completed_at: Optional[datetime] = Field(
        None,
        description="Completion timestamp",
    )

    # Detailed timing breakdown, in milliseconds; all default to None.
    queue_wait_ms: Optional[int] = Field(
        None,
        description="Time waiting in queue (started_at - created_at)",
    )
    ocr_time_ms: Optional[int] = Field(
        None,
        description="Actual OCR engine processing time",
    )
    processing_time_ms: Optional[int] = Field(
        None,
        description="Total job processing time (completed_at - started_at)",
    )

    # Outcome: extraction result or error message; both default to None.
    result: Optional[ExtractionData] = Field(
        None,
        description="Extraction result (only if completed)",
    )
    error: Optional[str] = Field(
        None,
        description="Error message (only if failed)",
    )

    class Config:
        """Attaches an example payload to the JSON schema."""

        json_schema_extra = dict(
            example=dict(
                job_id="abc123-def456-ghi789",
                status="completed",
                queue_position=None,
                estimated_wait_seconds=0,
                created_at="2024-01-15T12:00:00",
                started_at="2024-01-15T12:00:21",
                completed_at="2024-01-15T12:00:28",
                processing_time_ms=6543,
                result=dict(
                    receipt_number="123",
                    amount=85.99,
                    ocr_engine="paddleocr-light",
                ),
            ),
        )
|
|
|
|
|
|
class OCRQueueStatusResponse(BaseModel):
    """Snapshot of OCR queue statistics."""

    # All three statistics are required (no default).
    pending_jobs: int = Field(
        ...,
        description="Number of jobs waiting in queue",
    )
    processing_jobs: int = Field(
        ...,
        description="Number of jobs currently processing",
    )
    average_time_seconds: float = Field(
        ...,
        description="Average processing time in seconds",
    )

    class Config:
        """Attaches an example payload to the JSON schema."""

        json_schema_extra = dict(
            example=dict(
                pending_jobs=5,
                processing_jobs=1,
                average_time_seconds=7.2,
            ),
        )
|