Files
roa2web-service-auto/backend/modules/data_entry/db/models/ocr_settings.py
Marius Mutu 495790411f feat(ocr): Add docTR OCR engine with metrics infrastructure
Add docTR as primary OCR engine with 2-tier sequential processing,
OCR metrics tracking, and simplified engine selection.

Features:
- docTR OCR engine with light+medium preprocessing tiers
- doctr_plus mode with early exit optimization (~65% fast path)
- OCR metrics dashboard with per-engine statistics
- User OCR preference persistence
- Parallel worker pool for OCR processing
- Cross-validation for extraction quality

Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-02 05:37:16 +02:00

103 lines
3.1 KiB
Python

"""OCR settings and metrics SQLModel models."""
from datetime import datetime
from decimal import Decimal
from enum import Enum
from typing import Optional
from sqlmodel import SQLModel, Field
class OCREngine(str, Enum):
    """Identifiers of the OCR engines that can be selected.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase string value.
    """

    TESSERACT = "tesseract"
    DOCTR = "doctr"
    # docTR with 2-tier sequential processing and early exit
    # (the optimized, recommended option).
    DOCTR_PLUS = "doctr_plus"
    PADDLEOCR = "paddleocr"
class UserOCRPreference(SQLModel, table=True):
    """
    User's preferred OCR engine setting.

    Each user can have one preferred OCR engine (``username`` is unique)
    that will be auto-selected when they upload new receipts for processing.
    """

    __tablename__ = "user_ocr_preferences"

    id: Optional[int] = Field(default=None, primary_key=True)

    # User identification
    username: str = Field(max_length=100, unique=True, index=True)

    # Preference settings
    preferred_engine: OCREngine = Field(default=OCREngine.DOCTR_PLUS)

    # Timestamps (naive values produced by datetime.utcnow)
    created_at: datetime = Field(default_factory=datetime.utcnow)
    # default_factory only runs when the object is created. The onupdate
    # hook makes SQLAlchemy refresh the column on every UPDATE that does not
    # set it explicitly, so the value cannot silently stay at creation time.
    updated_at: datetime = Field(
        default_factory=datetime.utcnow,
        sa_column_kwargs={"onupdate": datetime.utcnow},
    )
class OCRJobMetrics(SQLModel, table=True):
    """
    Per-job OCR processing record used for analytics.

    One row is kept per OCR job (``job_id`` is unique) so that the data can
    support:
    - performance tracking by engine
    - success rate analysis
    - processing time trends
    - user-specific analytics
    """

    __tablename__ = "ocr_job_metrics"

    id: Optional[int] = Field(primary_key=True, default=None)

    # --- Job identification ---
    job_id: str = Field(index=True, unique=True, max_length=50)

    # --- User and company context ---
    username: str = Field(index=True, max_length=100)
    company_id: Optional[int] = Field(index=True, default=None)

    # --- Engine ---
    # Engine that the user (or auto-selection) asked for.
    engine_requested: str = Field(max_length=20)
    # Engine that actually ran, e.g. "doctr-light".
    engine_used: str = Field(max_length=50)

    # --- Processing metrics ---
    processing_time_ms: int = Field(default=0)
    file_size_bytes: int = Field(default=0)
    # MIME type of the processed file.
    file_type: str = Field(default="image/jpeg", max_length=50)
    # File name as originally uploaded.
    original_filename: Optional[str] = Field(max_length=255, default=None)

    # --- Outcome ---
    success: bool = Field(default=True)
    error_message: Optional[str] = Field(max_length=500, default=None)

    # --- Extraction quality ---
    overall_confidence: float = Field(default=0.0)
    # Count of fields that were successfully extracted.
    fields_extracted: int = Field(default=0)
    needs_manual_review: Optional[bool] = Field(default=None)
    validation_warnings_count: int = Field(default=0)
    validation_errors_count: int = Field(default=0)

    # --- Timestamps ---
    created_at: datetime = Field(default_factory=datetime.utcnow)
class OCRMetricsSummary(SQLModel):
    """
    Aggregated OCR metrics for a single engine.

    This is a plain schema, not a database table (no ``table=True``);
    it is used for API responses.
    """

    engine: str

    # Job counts
    total_jobs: int
    successful_jobs: int
    failed_jobs: int

    # Computed as successful_jobs / total_jobs.
    success_rate: float

    # Averages
    avg_processing_time_ms: float
    avg_confidence: float
    avg_fields_extracted: float