feat(ocr): Add docTR OCR engine with metrics infrastructure
Add docTR as primary OCR engine with 2-tier sequential processing, OCR metrics tracking, and simplified engine selection.

Features:
- docTR OCR engine with light+medium preprocessing tiers
- doctr_plus mode with early exit optimization (~65% fast path)
- OCR metrics dashboard with per-engine statistics
- User OCR preference persistence
- Parallel worker pool for OCR processing
- Cross-validation for extraction quality

Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -17,6 +17,7 @@ load_dotenv()
|
||||
from backend.modules.data_entry.db.models.receipt import Receipt, ReceiptAttachment
|
||||
from backend.modules.data_entry.db.models.accounting_entry import AccountingEntry
|
||||
from backend.modules.data_entry.db.models.nomenclature import SyncedSupplier, LocalSupplier, SyncedCashRegister
|
||||
from backend.modules.data_entry.db.models.ocr_settings import UserOCRPreference, OCRJobMetrics
|
||||
|
||||
# this is the Alembic Config object, which provides
|
||||
# access to the values within the .ini file in use.
|
||||
|
||||
@@ -0,0 +1,74 @@
|
||||
"""Add OCR settings and metrics tables.
|
||||
|
||||
Revision ID: add_ocr_settings_metrics
|
||||
Revises: 20251230_add_needs_manual_review
|
||||
Create Date: 2025-12-31
|
||||
|
||||
This migration adds:
|
||||
- user_ocr_preferences: Store user's preferred OCR engine
|
||||
- ocr_job_metrics: Store OCR job processing metrics for analytics
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# Alembic revision identifiers: this revision's ID and the revision it revises.
revision = 'add_ocr_settings_metrics'
down_revision = '20251230_add_needs_manual_review'
# No branch labels and no dependencies are declared for this revision.
branch_labels = depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create the ``user_ocr_preferences`` and ``ocr_job_metrics`` tables.

    Each table is created first and its indexes are added immediately
    afterwards, in the same order as the statements below.
    """
    # --- user_ocr_preferences -------------------------------------------
    preference_columns = [
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('username', sa.String(length=100), nullable=False),
        sa.Column(
            'preferred_engine',
            sa.String(length=20),
            nullable=False,
            server_default='doctr_plus',
        ),
        sa.Column('created_at', sa.DateTime(), nullable=False, server_default=sa.func.now()),
        sa.Column('updated_at', sa.DateTime(), nullable=False, server_default=sa.func.now()),
        sa.PrimaryKeyConstraint('id'),
    ]
    op.create_table('user_ocr_preferences', *preference_columns)
    # The unique index allows at most one preference row per username.
    op.create_index(
        'ix_user_ocr_preferences_username',
        'user_ocr_preferences',
        ['username'],
        unique=True,
    )

    # --- ocr_job_metrics ------------------------------------------------
    metrics_columns = [
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('job_id', sa.String(length=50), nullable=False),
        sa.Column('username', sa.String(length=100), nullable=False),
        sa.Column('company_id', sa.Integer(), nullable=True),
        sa.Column('engine_requested', sa.String(length=20), nullable=False),
        sa.Column('engine_used', sa.String(length=50), nullable=False),
        sa.Column('processing_time_ms', sa.Integer(), nullable=False, server_default='0'),
        sa.Column('file_size_bytes', sa.Integer(), nullable=False, server_default='0'),
        sa.Column('file_type', sa.String(length=50), nullable=False, server_default='image/jpeg'),
        sa.Column('success', sa.Boolean(), nullable=False, server_default='1'),
        sa.Column('error_message', sa.String(length=500), nullable=True),
        sa.Column('overall_confidence', sa.Float(), nullable=False, server_default='0.0'),
        sa.Column('fields_extracted', sa.Integer(), nullable=False, server_default='0'),
        sa.Column('needs_manual_review', sa.Boolean(), nullable=True),
        sa.Column('validation_warnings_count', sa.Integer(), nullable=False, server_default='0'),
        sa.Column('validation_errors_count', sa.Integer(), nullable=False, server_default='0'),
        sa.Column('created_at', sa.DateTime(), nullable=False, server_default=sa.func.now()),
        sa.PrimaryKeyConstraint('id'),
    ]
    op.create_table('ocr_job_metrics', *metrics_columns)

    # (index name, indexed column, unique?) -- only job_id is unique.
    metrics_indexes = (
        ('ix_ocr_job_metrics_job_id', 'job_id', True),
        ('ix_ocr_job_metrics_username', 'username', False),
        ('ix_ocr_job_metrics_company_id', 'company_id', False),
        ('ix_ocr_job_metrics_created_at', 'created_at', False),
    )
    for index_name, column_name, is_unique in metrics_indexes:
        op.create_index(index_name, 'ocr_job_metrics', [column_name], unique=is_unique)
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop the ``ocr_job_metrics`` and ``user_ocr_preferences`` tables.

    For each table the indexes are dropped first, then the table itself.
    """
    # Metrics indexes, dropped in the reverse of their creation order.
    metrics_index_names = (
        'ix_ocr_job_metrics_created_at',
        'ix_ocr_job_metrics_company_id',
        'ix_ocr_job_metrics_username',
        'ix_ocr_job_metrics_job_id',
    )
    for index_name in metrics_index_names:
        op.drop_index(index_name, table_name='ocr_job_metrics')
    op.drop_table('ocr_job_metrics')

    op.drop_index(
        'ix_user_ocr_preferences_username',
        table_name='user_ocr_preferences',
    )
    op.drop_table('user_ocr_preferences')
|
||||
@@ -0,0 +1,30 @@
|
||||
"""Add original_filename to ocr_job_metrics.
|
||||
|
||||
Revision ID: add_original_filename_to_metrics
|
||||
Revises: add_ocr_settings_metrics
|
||||
Create Date: 2025-12-31
|
||||
|
||||
Adds original_filename column to track the uploaded filename.
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# Alembic revision identifiers: this revision's ID and the revision it revises.
revision = 'add_original_filename_to_metrics'
down_revision = 'add_ocr_settings_metrics'
# No branch labels and no dependencies are declared for this revision.
branch_labels = depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add a nullable ``original_filename`` column to ``ocr_job_metrics``."""
    # Nullable and without a default, so no value is required for the new column.
    filename_column = sa.Column(
        'original_filename',
        sa.String(length=255),
        nullable=True,
    )
    op.add_column('ocr_job_metrics', filename_column)
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop the ``original_filename`` column from ``ocr_job_metrics``."""
    op.drop_column(
        table_name='ocr_job_metrics',
        column_name='original_filename',
    )
|
||||
Reference in New Issue
Block a user