Files
roa2web-service-auto/backend/modules/data_entry/services/ocr/job_queue.py
Marius Mutu 495790411f feat(ocr): Add docTR OCR engine with metrics infrastructure
Add docTR as primary OCR engine with 2-tier sequential processing,
OCR metrics tracking, and simplified engine selection.

Features:
- docTR OCR engine with light+medium preprocessing tiers
- doctr_plus mode with early exit optimization (~65% fast path)
- OCR metrics dashboard with per-engine statistics
- User OCR preference persistence
- Parallel worker pool for OCR processing
- Cross-validation for extraction quality

Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-02 05:37:16 +02:00

609 lines
20 KiB
Python

"""
SQLite Job Queue Manager for OCR Processing
Provides async job queue for OCR requests:
- Jobs are stored in SQLite for persistence
- Queue position and time estimation
- Automatic expiration after 24 hours
- Statistics for monitoring
Schema:
ocr_jobs (
id TEXT PRIMARY KEY, -- UUID
status TEXT NOT NULL, -- pending, processing, completed, failed
file_path TEXT NOT NULL, -- Path to uploaded file
mime_type TEXT NOT NULL,
engine TEXT DEFAULT 'doctr_plus',
created_at TIMESTAMP,
started_at TIMESTAMP,
completed_at TIMESTAMP,
result_json TEXT, -- JSON extraction result
error_message TEXT,
processing_time_ms INTEGER, -- Total job time (started_at to completed_at)
ocr_time_ms INTEGER, -- Actual OCR engine processing time
created_by TEXT, -- Username
original_filename TEXT,
expires_at TIMESTAMP
)
"""
import asyncio
import json
from decimal import Decimal
class DecimalEncoder(json.JSONEncoder):
"""JSON encoder that handles Decimal types."""
def default(self, obj):
if isinstance(obj, Decimal):
return float(obj)
return super().default(obj)
import logging
import os
import uuid
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional
import aiosqlite
logger = logging.getLogger(__name__)
# Default paths
DEFAULT_QUEUE_DIR = Path(__file__).parent.parent.parent.parent.parent / "data" / "ocr_queue"
DEFAULT_DB_PATH = DEFAULT_QUEUE_DIR / "ocr_jobs.db"
DEFAULT_FILES_DIR = DEFAULT_QUEUE_DIR / "files"
# Job expiration
JOB_EXPIRY_HOURS = 24
class OCRJobStatus(str, Enum):
"""Job status enum."""
pending = "pending"
processing = "processing"
completed = "completed"
failed = "failed"
@dataclass
class OCRJob:
"""OCR Job data class."""
id: str
status: OCRJobStatus
file_path: str
mime_type: str
engine: str = "doctr_plus"
created_at: Optional[datetime] = None
started_at: Optional[datetime] = None
completed_at: Optional[datetime] = None
result_json: Optional[str] = None
error_message: Optional[str] = None
processing_time_ms: Optional[int] = None # Total job time (started_at to completed_at)
ocr_time_ms: Optional[int] = None # Actual OCR engine processing time
created_by: Optional[str] = None
original_filename: Optional[str] = None
expires_at: Optional[datetime] = None
@property
def queue_wait_ms(self) -> Optional[int]:
"""Calculate queue wait time (created_at to started_at)."""
if self.created_at and self.started_at:
delta = self.started_at - self.created_at
return int(delta.total_seconds() * 1000)
return None
@property
def result(self) -> Optional[Dict]:
"""Parse result_json to dict."""
if self.result_json:
try:
return json.loads(self.result_json)
except json.JSONDecodeError:
return None
return None
class OCRJobQueue:
"""
SQLite-based job queue for OCR processing.
Provides async methods for job management with position
tracking and time estimation.
"""
def __init__(
self,
db_path: Optional[Path] = None,
files_dir: Optional[Path] = None
):
"""
Initialize job queue.
Args:
db_path: Path to SQLite database (default: data/ocr_queue/ocr_jobs.db)
files_dir: Path to files directory (default: data/ocr_queue/files/)
"""
self.db_path = Path(db_path) if db_path else DEFAULT_DB_PATH
self.files_dir = Path(files_dir) if files_dir else DEFAULT_FILES_DIR
self._lock = asyncio.Lock()
self._initialized = False
async def initialize(self) -> None:
"""
Initialize database and directories.
Creates SQLite database and tables if they don't exist.
Creates files directory for uploaded files.
"""
if self._initialized:
return
# Create directories
self.db_path.parent.mkdir(parents=True, exist_ok=True)
self.files_dir.mkdir(parents=True, exist_ok=True)
# Create database and tables
async with aiosqlite.connect(str(self.db_path)) as db:
await db.execute('''
CREATE TABLE IF NOT EXISTS ocr_jobs (
id TEXT PRIMARY KEY,
status TEXT NOT NULL DEFAULT 'pending',
file_path TEXT NOT NULL,
mime_type TEXT NOT NULL,
engine TEXT DEFAULT 'doctr_plus',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
started_at TIMESTAMP,
completed_at TIMESTAMP,
result_json TEXT,
error_message TEXT,
processing_time_ms INTEGER,
ocr_time_ms INTEGER,
created_by TEXT,
original_filename TEXT,
expires_at TIMESTAMP
)
''')
# Migration: add ocr_time_ms column if it doesn't exist
try:
await db.execute('ALTER TABLE ocr_jobs ADD COLUMN ocr_time_ms INTEGER')
logger.info("[OCRJobQueue] Added ocr_time_ms column to existing table")
except Exception:
pass # Column already exists
# Index for efficient queue queries
await db.execute('''
CREATE INDEX IF NOT EXISTS idx_ocr_jobs_status
ON ocr_jobs(status, created_at)
''')
# Index for expiration cleanup
await db.execute('''
CREATE INDEX IF NOT EXISTS idx_ocr_jobs_expires
ON ocr_jobs(expires_at)
''')
await db.commit()
self._initialized = True
logger.info(f"[OCRJobQueue] Initialized: db={self.db_path}, files={self.files_dir}")
async def create_job(
self,
file_bytes: bytes,
mime_type: str,
engine: str = "doctr_plus",
username: Optional[str] = None,
original_filename: Optional[str] = None
) -> OCRJob:
"""
Create a new OCR job.
Saves file to disk and creates database record.
Args:
file_bytes: Raw file bytes
mime_type: MIME type of file
engine: OCR engine ('tesseract', 'doctr', 'doctr_plus', 'paddleocr')
username: Username of requester
original_filename: Original filename from upload
Returns:
Created OCRJob instance
"""
await self.initialize()
# Generate job ID
job_id = str(uuid.uuid4())
# Determine file extension
ext_map = {
'image/jpeg': '.jpg',
'image/png': '.png',
'application/pdf': '.pdf',
}
ext = ext_map.get(mime_type, '.bin')
# Save file
file_path = self.files_dir / f"{job_id}{ext}"
with open(file_path, 'wb') as f:
f.write(file_bytes)
# Calculate expiration
now = datetime.utcnow()
expires_at = now + timedelta(hours=JOB_EXPIRY_HOURS)
# Insert job record
async with aiosqlite.connect(str(self.db_path)) as db:
await db.execute('''
INSERT INTO ocr_jobs (
id, status, file_path, mime_type, engine,
created_at, created_by, original_filename, expires_at
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
''', (
job_id, OCRJobStatus.pending.value, str(file_path), mime_type, engine,
now.isoformat(), username, original_filename, expires_at.isoformat()
))
await db.commit()
logger.info(f"[OCRJobQueue] Created job {job_id}: engine={engine}, file={file_path.name}")
return OCRJob(
id=job_id,
status=OCRJobStatus.pending,
file_path=str(file_path),
mime_type=mime_type,
engine=engine,
created_at=now,
created_by=username,
original_filename=original_filename,
expires_at=expires_at
)
async def get_job(self, job_id: str) -> Optional[OCRJob]:
"""
Get job by ID.
Args:
job_id: Job UUID
Returns:
OCRJob or None if not found
"""
await self.initialize()
async with aiosqlite.connect(str(self.db_path)) as db:
db.row_factory = aiosqlite.Row
async with db.execute(
'SELECT * FROM ocr_jobs WHERE id = ?',
(job_id,)
) as cursor:
row = await cursor.fetchone()
if row:
return self._row_to_job(row)
return None
async def get_queue_position(self, job_id: str) -> Optional[int]:
"""
Get position in queue for a pending job.
Args:
job_id: Job UUID
Returns:
Queue position (1 = next to process) or None if not pending
"""
await self.initialize()
async with aiosqlite.connect(str(self.db_path)) as db:
# Check if job is pending
async with db.execute(
'SELECT status, created_at FROM ocr_jobs WHERE id = ?',
(job_id,)
) as cursor:
row = await cursor.fetchone()
if not row or row[0] != OCRJobStatus.pending.value:
return None
job_created_at = row[1]
# Count jobs ahead in queue (created before this job)
async with db.execute('''
SELECT COUNT(*) FROM ocr_jobs
WHERE status = 'pending' AND created_at < ?
''', (job_created_at,)) as cursor:
count = await cursor.fetchone()
return (count[0] + 1) if count else 1
async def get_next_pending(self) -> Optional[OCRJob]:
"""
Get the next pending job (oldest first) and atomically mark it as processing.
This prevents race conditions in parallel processing - only one worker
can claim each job.
Returns:
Next OCRJob to process or None if queue empty
"""
await self.initialize()
now = datetime.utcnow()
async with self._lock: # Serialize access to prevent race conditions
async with aiosqlite.connect(str(self.db_path)) as db:
db.row_factory = aiosqlite.Row
# Get the next pending job
async with db.execute('''
SELECT * FROM ocr_jobs
WHERE status = 'pending'
ORDER BY created_at ASC
LIMIT 1
''') as cursor:
row = await cursor.fetchone()
if not row:
return None
job_id = row['id']
# Atomically mark as processing
await db.execute('''
UPDATE ocr_jobs
SET status = 'processing', started_at = ?
WHERE id = ? AND status = 'pending'
''', (now.isoformat(), job_id))
await db.commit()
# Fetch the updated job
async with db.execute(
'SELECT * FROM ocr_jobs WHERE id = ?',
(job_id,)
) as cursor:
updated_row = await cursor.fetchone()
if updated_row:
return self._row_to_job(updated_row)
return None
async def update_status(
self,
job_id: str,
status: OCRJobStatus,
result: Optional[Dict] = None,
error: Optional[str] = None,
processing_time_ms: Optional[int] = None,
ocr_time_ms: Optional[int] = None
) -> bool:
"""
Update job status.
Args:
job_id: Job UUID
status: New status
result: Extraction result dict (for completed)
error: Error message (for failed)
processing_time_ms: Total job processing time (started_at to completed_at)
ocr_time_ms: Actual OCR engine processing time
Returns:
True if update successful
"""
await self.initialize()
now = datetime.utcnow()
result_json = json.dumps(result, cls=DecimalEncoder) if result else None
# Build update query based on status
if status == OCRJobStatus.processing:
query = '''
UPDATE ocr_jobs
SET status = ?, started_at = ?
WHERE id = ?
'''
params = (status.value, now.isoformat(), job_id)
elif status == OCRJobStatus.completed:
query = '''
UPDATE ocr_jobs
SET status = ?, completed_at = ?, result_json = ?, processing_time_ms = ?, ocr_time_ms = ?
WHERE id = ?
'''
params = (status.value, now.isoformat(), result_json, processing_time_ms, ocr_time_ms, job_id)
elif status == OCRJobStatus.failed:
query = '''
UPDATE ocr_jobs
SET status = ?, completed_at = ?, error_message = ?, processing_time_ms = ?, ocr_time_ms = ?
WHERE id = ?
'''
params = (status.value, now.isoformat(), error, processing_time_ms, ocr_time_ms, job_id)
else:
query = 'UPDATE ocr_jobs SET status = ? WHERE id = ?'
params = (status.value, job_id)
async with aiosqlite.connect(str(self.db_path)) as db:
cursor = await db.execute(query, params)
await db.commit()
return cursor.rowcount > 0
async def get_average_processing_time(self) -> float:
"""
Calculate average processing time from recent completed jobs.
Uses last 50 completed jobs for accuracy.
Returns:
Average time in seconds (default 7.0 if no data)
"""
await self.initialize()
async with aiosqlite.connect(str(self.db_path)) as db:
async with db.execute('''
SELECT AVG(processing_time_ms)
FROM (
SELECT processing_time_ms FROM ocr_jobs
WHERE status = 'completed' AND processing_time_ms IS NOT NULL
ORDER BY completed_at DESC
LIMIT 50
)
''') as cursor:
row = await cursor.fetchone()
if row and row[0]:
return row[0] / 1000.0 # Convert ms to seconds
return 7.0 # Default estimate
async def count_pending(self) -> int:
"""Count pending jobs in queue."""
await self.initialize()
async with aiosqlite.connect(str(self.db_path)) as db:
async with db.execute(
'SELECT COUNT(*) FROM ocr_jobs WHERE status = ?',
(OCRJobStatus.pending.value,)
) as cursor:
row = await cursor.fetchone()
return row[0] if row else 0
async def count_processing(self) -> int:
"""Count currently processing jobs."""
await self.initialize()
async with aiosqlite.connect(str(self.db_path)) as db:
async with db.execute(
'SELECT COUNT(*) FROM ocr_jobs WHERE status = ?',
(OCRJobStatus.processing.value,)
) as cursor:
row = await cursor.fetchone()
return row[0] if row else 0
async def cleanup_expired(self) -> int:
"""
Delete expired jobs and their files.
Returns:
Number of jobs deleted
"""
await self.initialize()
now = datetime.utcnow()
deleted = 0
async with aiosqlite.connect(str(self.db_path)) as db:
db.row_factory = aiosqlite.Row
# Get expired jobs
async with db.execute('''
SELECT id, file_path FROM ocr_jobs
WHERE expires_at < ?
''', (now.isoformat(),)) as cursor:
rows = await cursor.fetchall()
for row in rows:
# Delete file
file_path = Path(row['file_path'])
if file_path.exists():
try:
file_path.unlink()
except Exception as e:
logger.warning(f"[OCRJobQueue] Failed to delete file {file_path}: {e}")
# Delete job record
await db.execute('DELETE FROM ocr_jobs WHERE id = ?', (row['id'],))
deleted += 1
await db.commit()
if deleted > 0:
logger.info(f"[OCRJobQueue] Cleaned up {deleted} expired job(s)")
return deleted
async def cleanup_job_file(self, job_id: str) -> bool:
"""
Delete the file associated with a job.
Called after processing to free disk space.
Args:
job_id: Job UUID
Returns:
True if file deleted
"""
job = await self.get_job(job_id)
if job:
file_path = Path(job.file_path)
if file_path.exists():
try:
file_path.unlink()
return True
except Exception as e:
logger.warning(f"[OCRJobQueue] Failed to delete file {file_path}: {e}")
return False
async def get_queue_stats(self) -> Dict[str, Any]:
"""
Get queue statistics.
Returns:
Dict with pending, processing, completed, failed counts
"""
await self.initialize()
stats = {
"pending": 0,
"processing": 0,
"completed": 0,
"failed": 0,
"average_time_seconds": 0.0,
}
async with aiosqlite.connect(str(self.db_path)) as db:
async with db.execute('''
SELECT status, COUNT(*) as count
FROM ocr_jobs
GROUP BY status
''') as cursor:
rows = await cursor.fetchall()
for row in rows:
if row[0] in stats:
stats[row[0]] = row[1]
stats["average_time_seconds"] = await self.get_average_processing_time()
return stats
def _row_to_job(self, row: aiosqlite.Row) -> OCRJob:
"""Convert database row to OCRJob."""
def parse_datetime(val):
if val:
try:
return datetime.fromisoformat(val)
except (ValueError, TypeError):
return None
return None
return OCRJob(
id=row['id'],
status=OCRJobStatus(row['status']),
file_path=row['file_path'],
mime_type=row['mime_type'],
engine=row['engine'] or 'doctr_plus',
created_at=parse_datetime(row['created_at']),
started_at=parse_datetime(row['started_at']),
completed_at=parse_datetime(row['completed_at']),
result_json=row['result_json'],
error_message=row['error_message'],
processing_time_ms=row['processing_time_ms'],
ocr_time_ms=row['ocr_time_ms'] if 'ocr_time_ms' in row.keys() else None,
created_by=row['created_by'],
original_filename=row['original_filename'],
expires_at=parse_datetime(row['expires_at']),
)
# Singleton instance
job_queue = OCRJobQueue()