feat(ocr): Add docTR OCR engine with metrics infrastructure

Add docTR as primary OCR engine with 2-tier sequential processing, OCR metrics tracking, and simplified engine selection. Features: - docTR OCR engine with light+medium preprocessing tiers - doctr_plus mode with early exit optimization (~65% fast path) - OCR metrics dashboard with per-engine statistics - User OCR preference persistence - Parallel worker pool for OCR processing - Cross-validation for extraction quality Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-02 05:37:16 +02:00
parent 74f7aefc26
commit 495790411f
75 changed files with 23349 additions and 1311 deletions
--- a/backend/modules/data_entry/db/models/ocr_settings.py
+++ b/backend/modules/data_entry/db/models/ocr_settings.py
@@ -0,0 +1,102 @@
+"""OCR settings and metrics SQLModel models."""
+
+from datetime import datetime
+from decimal import Decimal
+from enum import Enum
+from typing import Optional
+
+from sqlmodel import SQLModel, Field
+
+
+class OCREngine(str, Enum):
+    """Available OCR engines."""
+    TESSERACT = "tesseract"
+    DOCTR = "doctr"
+    DOCTR_PLUS = "doctr_plus"  # docTR with 2-tier sequential processing + early exit (optimized, recommended)
+    PADDLEOCR = "paddleocr"
+
+
+class UserOCRPreference(SQLModel, table=True):
+    """
+    User's preferred OCR engine setting.
+
+    Each user can have one preferred OCR engine that will be
+    auto-selected when they upload new receipts for processing.
+    """
+
+    __tablename__ = "user_ocr_preferences"
+
+    id: Optional[int] = Field(default=None, primary_key=True)
+
+    # User identification
+    username: str = Field(max_length=100, unique=True, index=True)
+
+    # Preference settings
+    preferred_engine: OCREngine = Field(default=OCREngine.DOCTR_PLUS)
+
+    # Timestamps
+    created_at: datetime = Field(default_factory=datetime.utcnow)
+    updated_at: datetime = Field(default_factory=datetime.utcnow)
+
+
+class OCRJobMetrics(SQLModel, table=True):
+    """
+    OCR job processing metrics for analytics.
+
+    Stores metrics for each OCR job to enable:
+    - Performance tracking by engine
+    - Success rate analysis
+    - Processing time trends
+    - User-specific analytics
+    """
+
+    __tablename__ = "ocr_job_metrics"
+
+    id: Optional[int] = Field(default=None, primary_key=True)
+
+    # Job identification
+    job_id: str = Field(max_length=50, unique=True, index=True)
+
+    # User and company context
+    username: str = Field(max_length=100, index=True)
+    company_id: Optional[int] = Field(default=None, index=True)
+
+    # Engine used
+    engine_requested: str = Field(max_length=20)  # What user/auto requested
+    engine_used: str = Field(max_length=50)  # What was actually used (e.g., "doctr-light")
+
+    # Processing metrics
+    processing_time_ms: int = Field(default=0)
+    file_size_bytes: int = Field(default=0)
+    file_type: str = Field(max_length=50, default="image/jpeg")  # MIME type
+    original_filename: Optional[str] = Field(default=None, max_length=255)  # Original uploaded filename
+
+    # Success metrics
+    success: bool = Field(default=True)
+    error_message: Optional[str] = Field(default=None, max_length=500)
+
+    # Extraction quality metrics
+    overall_confidence: float = Field(default=0.0)
+    fields_extracted: int = Field(default=0)  # Number of fields successfully extracted
+    needs_manual_review: Optional[bool] = Field(default=None)
+    validation_warnings_count: int = Field(default=0)
+    validation_errors_count: int = Field(default=0)
+
+    # Timestamps
+    created_at: datetime = Field(default_factory=datetime.utcnow)
+
+
+class OCRMetricsSummary(SQLModel):
+    """
+    Summary metrics for OCR analytics.
+
+    Not a database table - used for API responses.
+    """
+    engine: str
+    total_jobs: int
+    successful_jobs: int
+    failed_jobs: int
+    success_rate: float  # Computed: successful_jobs / total_jobs
+    avg_processing_time_ms: float
+    avg_confidence: float
+    avg_fields_extracted: float