Add docTR as primary OCR engine with 2-tier sequential processing, OCR metrics tracking, and simplified engine selection.

Features:
- docTR OCR engine with light+medium preprocessing tiers
- doctr_plus mode with early exit optimization (~65% fast path)
- OCR metrics dashboard with per-engine statistics
- User OCR preference persistence
- Parallel worker pool for OCR processing
- Cross-validation for extraction quality

Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
103 lines
3.1 KiB
Python
103 lines
3.1 KiB
Python
"""OCR settings and metrics SQLModel models."""
|
|
|
|
from datetime import datetime
|
|
from decimal import Decimal
|
|
from enum import Enum
|
|
from typing import Optional
|
|
|
|
from sqlmodel import SQLModel, Field
|
|
|
|
|
|
class OCREngine(str, Enum):
    """Available OCR engines."""

    # Members also subclass ``str``, so each one compares equal to its
    # plain string value (e.g. ``OCREngine.DOCTR == "doctr"``).

    TESSERACT = "tesseract"

    DOCTR = "doctr"

    # docTR with 2-tier sequential processing + early exit
    # (optimized, recommended).
    DOCTR_PLUS = "doctr_plus"

    PADDLEOCR = "paddleocr"
|
|
|
|
|
|
class UserOCRPreference(SQLModel, table=True):
    """
    User's preferred OCR engine setting.

    Each user can have one preferred OCR engine that will be
    auto-selected when they upload new receipts for processing.
    """

    __tablename__ = "user_ocr_preferences"

    # Surrogate primary key; defaults to None on a freshly constructed object.
    id: Optional[int] = Field(default=None, primary_key=True)

    # User identification
    # The unique constraint is what enforces "one preference row per user".
    username: str = Field(max_length=100, unique=True, index=True)

    # Preference settings
    # New rows fall back to doctr_plus when no engine is given.
    preferred_engine: OCREngine = Field(default=OCREngine.DOCTR_PLUS)

    # Timestamps (datetime.utcnow yields naive datetimes expressed in UTC)
    created_at: datetime = Field(default_factory=datetime.utcnow)
    # default_factory only fires when the object is created, so without the
    # onupdate hook this column would keep its insert-time value forever.
    # SQLAlchemy applies onupdate on UPDATE statements that do not set the
    # column explicitly, so callers that assign updated_at by hand still win.
    updated_at: datetime = Field(
        default_factory=datetime.utcnow,
        sa_column_kwargs={"onupdate": datetime.utcnow},
    )
|
|
|
|
|
|
class OCRJobMetrics(SQLModel, table=True):
    """
    OCR job processing metrics for analytics.

    Stores metrics for each OCR job to enable:
    - Performance tracking by engine
    - Success rate analysis
    - Processing time trends
    - User-specific analytics
    """

    __tablename__ = "ocr_job_metrics"

    # ---- Row identity ----
    id: Optional[int] = Field(primary_key=True, default=None)
    # Unique + indexed: at most one metrics row per job identifier.
    job_id: str = Field(unique=True, index=True, max_length=50)

    # ---- User and company context (both indexed) ----
    username: str = Field(index=True, max_length=100)
    company_id: Optional[int] = Field(index=True, default=None)

    # ---- Engine selection ----
    # What the user / auto-selection requested.
    engine_requested: str = Field(max_length=20)
    # What was actually used (e.g., "doctr-light").
    engine_used: str = Field(max_length=50)

    # ---- Processing metrics ----
    processing_time_ms: int = Field(default=0)
    file_size_bytes: int = Field(default=0)
    # MIME type of the processed file.
    file_type: str = Field(default="image/jpeg", max_length=50)
    # Original uploaded filename, when one was recorded.
    original_filename: Optional[str] = Field(max_length=255, default=None)

    # ---- Outcome ----
    # Note the default: a row counts as successful unless told otherwise.
    success: bool = Field(default=True)
    error_message: Optional[str] = Field(max_length=500, default=None)

    # ---- Extraction quality ----
    overall_confidence: float = Field(default=0.0)
    # Number of fields successfully extracted.
    fields_extracted: int = Field(default=0)
    # Tri-state: None is the default, distinct from an explicit True/False.
    needs_manual_review: Optional[bool] = Field(default=None)
    validation_warnings_count: int = Field(default=0)
    validation_errors_count: int = Field(default=0)

    # ---- Timestamps ----
    # Set once at object creation (naive datetime from datetime.utcnow).
    created_at: datetime = Field(default_factory=datetime.utcnow)
|
|
|
|
|
|
class OCRMetricsSummary(SQLModel):
    """
    Summary metrics for OCR analytics.

    Not a database table - used for API responses.
    """

    # No ``table=True`` here, so this is a plain data model; every field is
    # required because none declares a default.

    # Engine label for this summary (presumably one summary per engine -
    # confirm against the code that builds these).
    engine: str

    # Job counts
    total_jobs: int
    successful_jobs: int
    failed_jobs: int

    # Computed: successful_jobs / total_jobs
    success_rate: float

    # Averages over the summarized jobs
    avg_processing_time_ms: float
    avg_confidence: float
    avg_fields_extracted: float
|