feat(ocr): Add docTR OCR engine with metrics infrastructure

Add docTR as primary OCR engine with 2-tier sequential processing,
OCR metrics tracking, and simplified engine selection.

Features:
- docTR OCR engine with light+medium preprocessing tiers
- doctr_plus mode with early-exit optimization (fast path taken ~65% of the time)
- OCR metrics dashboard with per-engine statistics
- User OCR preference persistence
- Parallel worker pool for OCR processing
- Cross-validation for extraction quality

Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-02 05:37:16 +02:00
parent 74f7aefc26
commit 495790411f
75 changed files with 23349 additions and 1311 deletions

View File

@@ -61,7 +61,8 @@ class ExtractionData(BaseModel):
confidence_vendor: float = Field(default=0.0, ge=0, le=1, description="Vendor extraction confidence")
confidence_client: float = Field(default=0.0, ge=0, le=1, description="Client extraction confidence")
overall_confidence: float = Field(default=0.0, ge=0, le=1, description="Overall confidence score")
raw_text: str = Field(default="", description="Raw OCR text")
raw_text: str = Field(default="", description="Raw OCR text (primary)")
raw_texts: List[str] = Field(default=[], description="Raw OCR texts from all engine passes (for analysis)")
ocr_engine: str = Field(default="", description="OCR engine used: paddleocr or tesseract")
processing_time_ms: int = Field(default=0, ge=0, description="Processing time in milliseconds")
@@ -148,9 +149,10 @@ from enum import Enum
class OCREngineChoice(str, Enum):
"""OCR engine selection options."""
auto = "auto"
paddleocr = "paddleocr"
tesseract = "tesseract"
doctr = "doctr" # 3.3x faster than PaddleOCR with same accuracy (90/100)
doctr_plus = "doctr_plus" # docTR with 2-tier sequential processing + early exit (optimized, recommended)
paddleocr = "paddleocr"
class OCRJobStatus(str, Enum):
@@ -193,7 +195,10 @@ class OCRJobResponse(BaseModel):
created_at: datetime = Field(description="Job creation timestamp")
started_at: Optional[datetime] = Field(default=None, description="Processing start timestamp")
completed_at: Optional[datetime] = Field(default=None, description="Completion timestamp")
processing_time_ms: Optional[int] = Field(default=None, description="Actual processing time in ms")
# Detailed timing breakdown
queue_wait_ms: Optional[int] = Field(default=None, description="Time waiting in queue (started_at - created_at)")
ocr_time_ms: Optional[int] = Field(default=None, description="Actual OCR engine processing time")
processing_time_ms: Optional[int] = Field(default=None, description="Total job processing time (completed_at - started_at)")
result: Optional[ExtractionData] = Field(default=None, description="Extraction result (only if completed)")
error: Optional[str] = Field(default=None, description="Error message (only if failed)")