feat(ocr): Add docTR OCR engine with metrics infrastructure

Add docTR as primary OCR engine with 2-tier sequential processing,
OCR metrics tracking, and simplified engine selection.

Features:
- docTR OCR engine with light+medium preprocessing tiers
- doctr_plus mode with early exit optimization (~65% fast path)
- OCR metrics dashboard with per-engine statistics
- User OCR preference persistence
- Parallel worker pool for OCR processing
- Cross-validation for extraction quality

Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-02 05:37:16 +02:00
parent 74f7aefc26
commit 495790411f
75 changed files with 23349 additions and 1311 deletions

View File

@@ -11,6 +11,8 @@ def create_data_entry_router() -> APIRouter:
- /receipts - Receipt CRUD and workflow
- /ocr - OCR processing for receipts
- /nomenclature - Nomenclature syncing from Oracle
- /settings - User settings (OCR preferences)
- /metrics - OCR analytics and metrics
Returns:
APIRouter: Configured router for data entry module
@@ -21,10 +23,13 @@ def create_data_entry_router() -> APIRouter:
from .receipts import router as receipts_router
from .ocr import router as ocr_router
from .nomenclature import router as nomenclature_router
from .ocr_settings import router as ocr_settings_router
# Include all sub-routers (no prefix - already prefixed in main.py with /api/data-entry)
router.include_router(receipts_router, prefix="/receipts", tags=["data-entry-receipts"])
router.include_router(ocr_router, prefix="/ocr", tags=["data-entry-ocr"])
router.include_router(nomenclature_router, prefix="/nomenclature", tags=["data-entry-nomenclature"])
# OCR settings and metrics (endpoints at /settings/* and /metrics/*)
router.include_router(ocr_settings_router, tags=["data-entry-settings"])
return router

View File

@@ -27,6 +27,7 @@ from backend.modules.data_entry.services.ocr_service import ocr_service
from backend.modules.data_entry.services.ocr_engine import OCREngine
from backend.modules.data_entry.services.ocr.job_queue import job_queue, OCRJobStatus as JobStatus
from backend.modules.data_entry.services.ocr.job_worker import estimate_wait_time
from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine
from backend.modules.data_entry.schemas.ocr import (
OCRResponse,
OCRStatusResponse,
@@ -55,7 +56,7 @@ router = APIRouter()
@router.post("/extract", response_model=OCRJobSubmitResponse)
async def submit_ocr_job(
file: UploadFile = File(...),
engine: OCREngineChoice = Query(default=OCREngineChoice.auto, description="OCR engine to use"),
engine: OCREngineChoice = Query(default=OCREngineChoice.doctr_plus, description="OCR engine to use"),
sync: bool = Query(default=False, description="If true, process synchronously (blocks)"),
current_user: CurrentUser = Depends(get_current_user)
):
@@ -69,7 +70,7 @@ async def submit_ocr_job(
Args:
file: Image or PDF file (max 10MB)
engine: OCR engine choice (auto, paddleocr, tesseract)
engine: OCR engine choice (tesseract, doctr, doctr_plus, paddleocr)
sync: If true, process synchronously (legacy mode)
Returns:
@@ -129,13 +130,13 @@ async def submit_ocr_job(
@router.get("/jobs/{job_id}", response_model=OCRJobResponse)
async def get_job_status(
job_id: str,
session: AsyncSession = Depends(get_session),
current_user: CurrentUser = Depends(get_current_user)
):
"""
Get OCR job status and result.
Get OCR job status and result (instant response).
Poll this endpoint to check job progress.
Recommended polling interval: 2 seconds.
For efficient polling, use GET /jobs/{job_id}/wait instead (long-polling).
Args:
job_id: Job UUID from POST /extract response
@@ -165,6 +166,10 @@ async def get_job_status(
result_data = None
if job.status == JobStatus.completed and job.result:
result_data = _dict_to_extraction_data(job.result)
# Apply fuzzy CUI matching
result_data = await _apply_fuzzy_cui_matching(result_data, session)
# Debug: log suggested_payment_mode being returned
print(f"[OCR Router] Returning job {job_id} with suggested_payment_mode={result_data.suggested_payment_mode}", flush=True)
return OCRJobResponse(
job_id=job.id,
@@ -174,12 +179,66 @@ async def get_job_status(
created_at=job.created_at or datetime.utcnow(),
started_at=job.started_at,
completed_at=job.completed_at,
queue_wait_ms=job.queue_wait_ms,
ocr_time_ms=job.ocr_time_ms,
processing_time_ms=job.processing_time_ms,
result=result_data,
error=job.error_message
)
@router.get("/jobs/{job_id}/wait", response_model=OCRJobResponse)
async def wait_for_job_status(
job_id: str,
timeout: int = Query(default=30, ge=1, le=60, description="Max wait time in seconds"),
session: AsyncSession = Depends(get_session),
current_user: CurrentUser = Depends(get_current_user)
):
"""
Long-poll for OCR job status change.
Waits until:
- Job status changes to completed/failed
- Timeout expires (returns current status)
Recommended client timeout: timeout + 5 seconds
Args:
job_id: Job UUID from POST /extract response
timeout: Max wait time in seconds (1-60, default 30)
Returns:
OCRJobResponse with status, queue_position, and result (if completed)
"""
import asyncio
import time
end_time = time.time() + timeout
last_status = None
while time.time() < end_time:
job = await job_queue.get_job(job_id)
if not job:
raise HTTPException(status_code=404, detail="Job not found")
# Return immediately if job completed or failed
if job.status in [JobStatus.completed, JobStatus.failed]:
return await get_job_status(job_id, session, current_user)
# Return if status changed from last check
if last_status is not None and job.status != last_status:
return await get_job_status(job_id, session, current_user)
last_status = job.status
# Wait 1 second before next internal check
await asyncio.sleep(1)
# Timeout - return current status
return await get_job_status(job_id, session, current_user)
@router.get("/queue/status", response_model=OCRQueueStatusResponse)
async def get_queue_status(
current_user: CurrentUser = Depends(get_current_user)
@@ -221,10 +280,58 @@ async def get_ocr_status():
)
@router.get("/engines")
async def get_available_engines():
"""
Get list of enabled OCR engines based on .env configuration.
Returns engines availability and available processing modes.
Frontend should use this to filter engine selection dropdown.
Available engines: tesseract, doctr, doctr_plus, paddleocr
"""
# Check which engines are enabled via .env
paddle_enabled = os.getenv("OCR_ENABLE_PADDLEOCR", "true").lower() == "true"
tesseract_enabled = os.getenv("OCR_ENABLE_TESSERACT", "true").lower() == "true"
default_engine = os.getenv("OCR_DEFAULT_ENGINE", "doctr_plus")
# Build engines dict
engines = {
"tesseract": tesseract_enabled,
"doctr": True, # Always available (primary engine)
"doctr_plus": True, # Always available (recommended)
"paddleocr": paddle_enabled,
}
# Build available modes based on enabled engines
modes = []
if tesseract_enabled:
modes.append("tesseract")
modes.append("doctr")
modes.append("doctr_plus")
if paddle_enabled:
modes.append("paddleocr")
return {
"engines": engines,
"available_modes": modes,
"default_mode": default_engine,
"memory_estimate_mb": {
"tesseract": 50,
"doctr": 600,
"doctr_plus": 600,
"paddleocr": 800,
}
}
@router.post("/extract-attachment/{attachment_id}", response_model=OCRResponse)
async def extract_from_attachment(
attachment_id: int,
engine: OCREngineChoice = Query(default=OCREngineChoice.auto),
engine: OCREngineChoice = Query(default=OCREngineChoice.doctr_plus),
session: AsyncSession = Depends(get_session),
current_user: CurrentUser = Depends(get_current_user)
):
@@ -260,6 +367,8 @@ async def extract_from_attachment(
raise HTTPException(status_code=422, detail=message)
data = _result_to_extraction_data(result)
# Apply fuzzy CUI matching
data = await _apply_fuzzy_cui_matching(data, session)
return OCRResponse(success=True, message=message, data=data)
@@ -267,6 +376,58 @@ async def extract_from_attachment(
# Helper Functions
# ============================================================================
async def _apply_fuzzy_cui_matching(
extraction_data: ExtractionData,
session: AsyncSession
) -> ExtractionData:
"""
Apply fuzzy CUI matching to extraction data.
ONLY applies fuzzy matching if CUI is missing OR has invalid checksum.
If CUI has valid checksum, we trust the OCR and skip fuzzy matching.
Args:
extraction_data: ExtractionData with CUI to potentially correct
session: AsyncSession for database lookups
Returns:
ExtractionData with CUI corrected if a match was found
"""
from backend.modules.data_entry.services.ocr.validation import CUIChecksumRule
# Skip if no CUI and no vendor name (nothing to match)
if not extraction_data.cui and not extraction_data.partner_name:
return extraction_data
# Check if CUI has valid checksum - if valid, skip fuzzy matching
if extraction_data.cui:
cui_digits = CUIChecksumRule.extract_digits(extraction_data.cui)
if len(cui_digits) >= 6 and CUIChecksumRule.validate_checksum(cui_digits):
print(f"[Fuzzy Match] CUI {extraction_data.cui} has valid checksum, skipping fuzzy match", flush=True)
return extraction_data
# CUI missing or invalid checksum - try fuzzy matching
try:
match = await OCRValidationEngine.fuzzy_match_supplier(
cui=extraction_data.cui,
vendor_name=extraction_data.partner_name,
db_session=session
)
if match:
corrected_cui, supplier_name = match
if corrected_cui != extraction_data.cui:
print(f"[Fuzzy Match] Corrected: {extraction_data.cui}{corrected_cui} ({supplier_name})", flush=True)
extraction_data.cui = corrected_cui
# Also set partner_name if not already set
if not extraction_data.partner_name:
extraction_data.partner_name = supplier_name
except Exception as e:
print(f"[Fuzzy Match] Error: {e}", flush=True)
return extraction_data
async def _process_sync(
content: bytes,
file: UploadFile,
@@ -362,6 +523,7 @@ def _result_to_extraction_data(result) -> ExtractionData:
confidence_client=getattr(result, 'confidence_client', 0.0),
overall_confidence=result.overall_confidence,
raw_text=result.raw_text,
raw_texts=getattr(result, 'raw_texts', []),
ocr_engine=result.ocr_engine,
processing_time_ms=result.processing_time_ms,
needs_manual_review=result.needs_manual_review,
@@ -437,6 +599,7 @@ def _dict_to_extraction_data(data: dict) -> ExtractionData:
confidence_client=data.get('confidence_client', 0.0),
overall_confidence=data.get('overall_confidence', 0.0),
raw_text=data.get('raw_text', ''),
raw_texts=data.get('raw_texts', []),
ocr_engine=data.get('ocr_engine', ''),
processing_time_ms=data.get('processing_time_ms', 0),
needs_manual_review=data.get('needs_manual_review'),

View File

@@ -0,0 +1,268 @@
"""
OCR Settings and Metrics API endpoints.
Endpoints:
- GET /settings/ocr-preference - Get user's preferred OCR engine
- POST /settings/ocr-preference - Set user's preferred OCR engine
- GET /metrics/ocr/summary - Get OCR metrics summary by engine
- GET /metrics/ocr/history - Get user's OCR job history
- GET /metrics/ocr/stats - Get overall OCR statistics
"""
from typing import List, Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from pydantic import BaseModel, Field
from sqlalchemy.ext.asyncio import AsyncSession
from backend.modules.data_entry.db.database import get_session
from backend.modules.data_entry.db.crud.ocr_settings import OCRPreferenceCRUD, OCRMetricsCRUD
from backend.modules.data_entry.db.models.ocr_settings import OCREngine, OCRMetricsSummary
# Auth integration
from shared.auth.dependencies import get_current_user
from shared.auth.models import CurrentUser
router = APIRouter()
# ============================================================================
# Schemas
# ============================================================================
class OCRPreferenceResponse(BaseModel):
"""Response for OCR preference endpoint."""
username: str
preferred_engine: str
available_engines: List[str] = Field(
default=["tesseract", "doctr", "doctr_plus", "paddleocr"],
description="Available OCR engines"
)
class OCRPreferenceRequest(BaseModel):
"""Request to set OCR preference."""
preferred_engine: str = Field(
default="doctr_plus",
description="Preferred OCR engine: tesseract, doctr, doctr_plus, paddleocr"
)
class OCRMetricsHistoryItem(BaseModel):
"""Single OCR job metrics item."""
job_id: str
engine_requested: str
engine_used: str
processing_time_ms: int
success: bool
overall_confidence: float
fields_extracted: int
created_at: str
original_filename: Optional[str] = None
class OCRMetricsHistoryResponse(BaseModel):
"""Response for OCR history endpoint."""
items: List[OCRMetricsHistoryItem]
total: int
class OCRStatsResponse(BaseModel):
"""Response for OCR stats endpoint."""
total_jobs: int
successful_jobs: int
failed_jobs: int
success_rate: float
avg_processing_time_ms: float
avg_confidence: float
period_days: int
class OCRActiveEnginesResponse(BaseModel):
"""Response for active OCR engines endpoint."""
engines: List[str] = Field(description="List of active OCR engines from .env config")
recommended: str = Field(default="doctr_plus", description="Recommended engine")
# ============================================================================
# OCR Engines Configuration Endpoint
# ============================================================================
@router.get("/settings/ocr-engines", response_model=OCRActiveEnginesResponse)
async def get_active_ocr_engines():
"""
Get list of active OCR engines configured in .env.
Returns the engines that should be shown in the frontend dropdown.
Configured via OCR_ACTIVE_ENGINES environment variable.
Default: doctr,doctr_plus
Available: tesseract, paddleocr, doctr, doctr_plus
"""
from backend.modules.data_entry.config import settings
return OCRActiveEnginesResponse(
engines=settings.ocr_active_engines_list,
recommended="doctr_plus"
)
# ============================================================================
# OCR Preference Endpoints
# ============================================================================
@router.get("/settings/ocr-preference", response_model=OCRPreferenceResponse)
async def get_ocr_preference(
session: AsyncSession = Depends(get_session),
current_user: CurrentUser = Depends(get_current_user)
):
"""
Get user's preferred OCR engine.
Returns the user's saved preference or 'doctr_plus' if not set.
Also returns list of available engines.
"""
from backend.modules.data_entry.services.ocr_engine import OCREngine as OCREngineClass
preference = await OCRPreferenceCRUD.get_by_username(session, current_user.username)
# Get available engines from OCR service
available = OCREngineClass.get_available_engines()
return OCRPreferenceResponse(
username=current_user.username,
preferred_engine=preference.preferred_engine.value if preference else "doctr_plus",
available_engines=available
)
@router.post("/settings/ocr-preference", response_model=OCRPreferenceResponse)
async def set_ocr_preference(
request: OCRPreferenceRequest,
session: AsyncSession = Depends(get_session),
current_user: CurrentUser = Depends(get_current_user)
):
"""
Set user's preferred OCR engine.
Valid engines: tesseract, doctr, doctr_plus, paddleocr
Note: Available engines depend on .env configuration (OCR_ENABLE_PADDLEOCR, OCR_ENABLE_TESSERACT)
"""
from backend.modules.data_entry.services.ocr_engine import OCREngine as OCREngineClass
# Get dynamically available engines
available = OCREngineClass.get_available_engines()
if request.preferred_engine not in available:
raise HTTPException(
status_code=400,
detail=f"Invalid engine. Must be one of: {', '.join(available)}"
)
# Map string to enum
engine_map = {
"tesseract": OCREngine.TESSERACT,
"doctr": OCREngine.DOCTR,
"doctr_plus": OCREngine.DOCTR_PLUS,
"paddleocr": OCREngine.PADDLEOCR,
}
engine_enum = engine_map.get(request.preferred_engine, OCREngine.DOCTR_PLUS)
# Save preference
preference = await OCRPreferenceCRUD.create_or_update(
session,
current_user.username,
engine_enum
)
# Get available engines
available = OCREngineClass.get_available_engines()
return OCRPreferenceResponse(
username=current_user.username,
preferred_engine=preference.preferred_engine.value,
available_engines=available
)
# ============================================================================
# OCR Metrics Endpoints
# ============================================================================
@router.get("/metrics/ocr/summary", response_model=List[OCRMetricsSummary])
async def get_ocr_metrics_summary(
days: int = Query(default=30, ge=1, le=365, description="Number of days to include"),
session: AsyncSession = Depends(get_session),
current_user: CurrentUser = Depends(get_current_user)
):
"""
Get OCR metrics summary grouped by engine.
Returns aggregated metrics for each engine used in the specified period.
"""
summaries = await OCRMetricsCRUD.get_summary_by_engine(
session,
days=days,
username=current_user.username
)
return summaries
@router.get("/metrics/ocr/history", response_model=OCRMetricsHistoryResponse)
async def get_ocr_metrics_history(
limit: int = Query(default=50, ge=1, le=200, description="Max items to return"),
offset: int = Query(default=0, ge=0, description="Items to skip"),
session: AsyncSession = Depends(get_session),
current_user: CurrentUser = Depends(get_current_user)
):
"""
Get user's OCR job history.
Returns list of OCR jobs with their metrics, ordered by most recent first.
"""
items = await OCRMetricsCRUD.get_user_history(
session,
username=current_user.username,
limit=limit,
offset=offset
)
history_items = [
OCRMetricsHistoryItem(
job_id=item.job_id,
engine_requested=item.engine_requested,
engine_used=item.engine_used,
processing_time_ms=item.processing_time_ms,
success=item.success,
overall_confidence=item.overall_confidence,
fields_extracted=item.fields_extracted,
created_at=item.created_at.isoformat(),
original_filename=item.original_filename
)
for item in items
]
return OCRMetricsHistoryResponse(
items=history_items,
total=len(history_items)
)
@router.get("/metrics/ocr/stats", response_model=OCRStatsResponse)
async def get_ocr_stats(
days: int = Query(default=30, ge=1, le=365, description="Number of days to include"),
session: AsyncSession = Depends(get_session),
current_user: CurrentUser = Depends(get_current_user)
):
"""
Get overall OCR statistics for the user.
Returns aggregated stats including success rate, average processing time, etc.
"""
stats = await OCRMetricsCRUD.get_overall_stats(
session,
days=days,
username=current_user.username
)
return OCRStatsResponse(**stats)