feat(ocr): Add docTR OCR engine with metrics infrastructure

Add docTR as primary OCR engine with 2-tier sequential processing,
OCR metrics tracking, and simplified engine selection.

Features:
- docTR OCR engine with light+medium preprocessing tiers
- doctr_plus mode with early exit optimization (~65% fast path)
- OCR metrics dashboard with per-engine statistics
- User OCR preference persistence
- Parallel worker pool for OCR processing
- Cross-validation for extraction quality

Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-02 05:37:16 +02:00
parent 74f7aefc26
commit 495790411f
75 changed files with 23349 additions and 1311 deletions

View File

@@ -0,0 +1,74 @@
"""Add OCR settings and metrics tables.
Revision ID: add_ocr_settings_metrics
Revises: 20251230_add_needs_manual_review
Create Date: 2025-12-31
This migration adds:
- user_ocr_preferences: Store user's preferred OCR engine
- ocr_job_metrics: Store OCR job processing metrics for analytics
"""
from alembic import op
import sqlalchemy as sa
# Revision identifiers
revision = 'add_ocr_settings_metrics'
down_revision = '20251230_add_needs_manual_review'
branch_labels = None
depends_on = None
def upgrade() -> None:
"""Create OCR settings and metrics tables."""
# Create user_ocr_preferences table
op.create_table(
'user_ocr_preferences',
sa.Column('id', sa.Integer(), nullable=False),
sa.Column('username', sa.String(length=100), nullable=False),
sa.Column('preferred_engine', sa.String(length=20), nullable=False, server_default='doctr_plus'),
sa.Column('created_at', sa.DateTime(), nullable=False, server_default=sa.func.now()),
sa.Column('updated_at', sa.DateTime(), nullable=False, server_default=sa.func.now()),
sa.PrimaryKeyConstraint('id')
)
op.create_index('ix_user_ocr_preferences_username', 'user_ocr_preferences', ['username'], unique=True)
# Create ocr_job_metrics table
op.create_table(
'ocr_job_metrics',
sa.Column('id', sa.Integer(), nullable=False),
sa.Column('job_id', sa.String(length=50), nullable=False),
sa.Column('username', sa.String(length=100), nullable=False),
sa.Column('company_id', sa.Integer(), nullable=True),
sa.Column('engine_requested', sa.String(length=20), nullable=False),
sa.Column('engine_used', sa.String(length=50), nullable=False),
sa.Column('processing_time_ms', sa.Integer(), nullable=False, server_default='0'),
sa.Column('file_size_bytes', sa.Integer(), nullable=False, server_default='0'),
sa.Column('file_type', sa.String(length=50), nullable=False, server_default='image/jpeg'),
sa.Column('success', sa.Boolean(), nullable=False, server_default='1'),
sa.Column('error_message', sa.String(length=500), nullable=True),
sa.Column('overall_confidence', sa.Float(), nullable=False, server_default='0.0'),
sa.Column('fields_extracted', sa.Integer(), nullable=False, server_default='0'),
sa.Column('needs_manual_review', sa.Boolean(), nullable=True),
sa.Column('validation_warnings_count', sa.Integer(), nullable=False, server_default='0'),
sa.Column('validation_errors_count', sa.Integer(), nullable=False, server_default='0'),
sa.Column('created_at', sa.DateTime(), nullable=False, server_default=sa.func.now()),
sa.PrimaryKeyConstraint('id')
)
op.create_index('ix_ocr_job_metrics_job_id', 'ocr_job_metrics', ['job_id'], unique=True)
op.create_index('ix_ocr_job_metrics_username', 'ocr_job_metrics', ['username'], unique=False)
op.create_index('ix_ocr_job_metrics_company_id', 'ocr_job_metrics', ['company_id'], unique=False)
op.create_index('ix_ocr_job_metrics_created_at', 'ocr_job_metrics', ['created_at'], unique=False)
def downgrade() -> None:
"""Drop OCR settings and metrics tables."""
op.drop_index('ix_ocr_job_metrics_created_at', table_name='ocr_job_metrics')
op.drop_index('ix_ocr_job_metrics_company_id', table_name='ocr_job_metrics')
op.drop_index('ix_ocr_job_metrics_username', table_name='ocr_job_metrics')
op.drop_index('ix_ocr_job_metrics_job_id', table_name='ocr_job_metrics')
op.drop_table('ocr_job_metrics')
op.drop_index('ix_user_ocr_preferences_username', table_name='user_ocr_preferences')
op.drop_table('user_ocr_preferences')

View File

@@ -0,0 +1,30 @@
"""Add original_filename to ocr_job_metrics.
Revision ID: add_original_filename_to_metrics
Revises: add_ocr_settings_metrics
Create Date: 2025-12-31
Adds original_filename column to track the uploaded filename.
"""
from alembic import op
import sqlalchemy as sa
# Revision identifiers
revision = 'add_original_filename_to_metrics'
down_revision = 'add_ocr_settings_metrics'
branch_labels = None
depends_on = None
def upgrade() -> None:
"""Add original_filename column to ocr_job_metrics."""
op.add_column(
'ocr_job_metrics',
sa.Column('original_filename', sa.String(length=255), nullable=True)
)
def downgrade() -> None:
"""Remove original_filename column."""
op.drop_column('ocr_job_metrics', 'original_filename')