Add docTR as primary OCR engine with 2-tier sequential processing, OCR metrics tracking, and simplified engine selection. Features: - docTR OCR engine with light+medium preprocessing tiers - doctr_plus mode with early exit optimization (~65% fast path) - OCR metrics dashboard with per-engine statistics - User OCR preference persistence - Parallel worker pool for OCR processing - Cross-validation for extraction quality Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
609 lines
20 KiB
Python
609 lines
20 KiB
Python
"""
|
|
SQLite Job Queue Manager for OCR Processing
|
|
|
|
Provides async job queue for OCR requests:
|
|
- Jobs are stored in SQLite for persistence
|
|
- Queue position and time estimation
|
|
- Automatic expiration after 24 hours
|
|
- Statistics for monitoring
|
|
|
|
Schema:
|
|
ocr_jobs (
|
|
id TEXT PRIMARY KEY, -- UUID
|
|
status TEXT NOT NULL, -- pending, processing, completed, failed
|
|
file_path TEXT NOT NULL, -- Path to uploaded file
|
|
mime_type TEXT NOT NULL,
|
|
engine TEXT DEFAULT 'doctr_plus',
|
|
created_at TIMESTAMP,
|
|
started_at TIMESTAMP,
|
|
completed_at TIMESTAMP,
|
|
result_json TEXT, -- JSON extraction result
|
|
error_message TEXT,
|
|
processing_time_ms INTEGER, -- Total job time (started_at to completed_at)
|
|
ocr_time_ms INTEGER, -- Actual OCR engine processing time
|
|
created_by TEXT, -- Username
|
|
original_filename TEXT,
|
|
expires_at TIMESTAMP
|
|
)
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
from decimal import Decimal
|
|
|
|
|
|
class DecimalEncoder(json.JSONEncoder):
    """JSON encoder that handles Decimal types.

    ``Decimal`` values are emitted as JSON numbers by converting them to
    ``float``; every other unsupported type is delegated to the base class,
    which raises ``TypeError``.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        Args:
            obj: Object the stock encoder could not serialize.

        Returns:
            ``float(obj)`` when ``obj`` is a ``Decimal``.

        Raises:
            TypeError: For any other unsupported type (raised by the base class).
        """
        if isinstance(obj, Decimal):
            # JSON has no decimal type; the float conversion can lose precision
            # for values that are not exactly representable in binary.
            return float(obj)
        return super().default(obj)
|
|
import logging
|
|
import os
|
|
import uuid
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timedelta
|
|
from enum import Enum
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
import aiosqlite
|
|
|
|
logger = logging.getLogger(__name__)

# Default paths.
# The queue directory is resolved five directory levels above this module and
# then into data/ocr_queue. The chained ``.parent`` calls are kept on purpose:
# they stop at the filesystem root instead of raising on a shallow path.
DEFAULT_QUEUE_DIR = Path(__file__).parent.parent.parent.parent.parent / "data" / "ocr_queue"
DEFAULT_DB_PATH = DEFAULT_QUEUE_DIR / "ocr_jobs.db"
DEFAULT_FILES_DIR = DEFAULT_QUEUE_DIR / "files"

# Job expiration: hours added to the creation time to compute ``expires_at``.
JOB_EXPIRY_HOURS = 24
|
|
|
|
|
|
class OCRJobStatus(str, Enum):
    """Job status enum.

    Members also subclass ``str``, so they compare equal to the plain status
    strings stored in the ``status`` column of ``ocr_jobs``.
    """

    pending = "pending"
    processing = "processing"
    completed = "completed"
    failed = "failed"
|
|
|
|
|
|
@dataclass
class OCRJob:
    """OCR Job data class.

    In-memory representation of one row of the ``ocr_jobs`` table. Only the
    first four fields are required; the rest default to ``None`` (or
    ``"doctr_plus"`` for ``engine``).
    """

    id: str
    status: OCRJobStatus
    file_path: str
    mime_type: str
    engine: str = "doctr_plus"
    created_at: Optional[datetime] = None
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    result_json: Optional[str] = None
    error_message: Optional[str] = None
    processing_time_ms: Optional[int] = None  # Total job time (started_at to completed_at)
    ocr_time_ms: Optional[int] = None  # Actual OCR engine processing time
    created_by: Optional[str] = None
    original_filename: Optional[str] = None
    expires_at: Optional[datetime] = None

    @property
    def queue_wait_ms(self) -> Optional[int]:
        """Calculate queue wait time (created_at to started_at).

        Returns:
            Whole milliseconds between the two timestamps (fraction truncated),
            or ``None`` when either timestamp is missing.
        """
        if not (self.created_at and self.started_at):
            return None
        waited = self.started_at - self.created_at
        return int(waited.total_seconds() * 1000)

    @property
    def result(self) -> Optional[Dict]:
        """Parse result_json to dict.

        Returns:
            The decoded JSON value, or ``None`` when ``result_json`` is empty
            or is not valid JSON.
        """
        if not self.result_json:
            return None
        try:
            return json.loads(self.result_json)
        except json.JSONDecodeError:
            # A corrupt payload is reported as "no result" rather than raising.
            return None
|
|
|
|
|
|
class OCRJobQueue:
    """
    SQLite-based job queue for OCR processing.

    Provides async methods for job management with position
    tracking and time estimation.

    Every operation opens its own short-lived aiosqlite connection and the
    schema is created lazily by ``initialize``. Timestamps are written as
    naive-UTC ISO-8601 strings, so the SQL comparisons on ``created_at`` and
    ``expires_at`` order chronologically.
    """

    # File extension used for the stored upload, keyed by MIME type.
    _EXTENSIONS = {
        'image/jpeg': '.jpg',
        'image/png': '.png',
        'application/pdf': '.pdf',
    }

    def __init__(
        self,
        db_path: Optional[Path] = None,
        files_dir: Optional[Path] = None
    ):
        """
        Initialize job queue.

        Only records configuration; nothing is created on disk until
        ``initialize`` runs.

        Args:
            db_path: Path to SQLite database (default: data/ocr_queue/ocr_jobs.db)
            files_dir: Path to files directory (default: data/ocr_queue/files/)
        """
        self.db_path = Path(db_path) if db_path else DEFAULT_DB_PATH
        self.files_dir = Path(files_dir) if files_dir else DEFAULT_FILES_DIR
        # Serializes job claiming between coroutines that share this instance.
        self._lock = asyncio.Lock()
        self._initialized = False

    async def initialize(self) -> None:
        """
        Initialize database and directories.

        Creates SQLite database and tables if they don't exist.
        Creates files directory for uploaded files.
        Safe to call repeatedly: it returns immediately once it has succeeded.

        Raises:
            aiosqlite.OperationalError: If the schema migration fails for any
                reason other than the column already being present.
        """
        if self._initialized:
            return

        # Create directories
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self.files_dir.mkdir(parents=True, exist_ok=True)

        # Create database and tables
        async with aiosqlite.connect(str(self.db_path)) as db:
            await db.execute('''
                CREATE TABLE IF NOT EXISTS ocr_jobs (
                    id TEXT PRIMARY KEY,
                    status TEXT NOT NULL DEFAULT 'pending',
                    file_path TEXT NOT NULL,
                    mime_type TEXT NOT NULL,
                    engine TEXT DEFAULT 'doctr_plus',
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    started_at TIMESTAMP,
                    completed_at TIMESTAMP,
                    result_json TEXT,
                    error_message TEXT,
                    processing_time_ms INTEGER,
                    ocr_time_ms INTEGER,
                    created_by TEXT,
                    original_filename TEXT,
                    expires_at TIMESTAMP
                )
            ''')

            # Migration: add ocr_time_ms column if it doesn't exist.
            # Only the "column already exists" failure is expected; anything
            # else is a real problem and must not be hidden.
            try:
                await db.execute('ALTER TABLE ocr_jobs ADD COLUMN ocr_time_ms INTEGER')
            except aiosqlite.OperationalError as exc:
                if 'duplicate column' not in str(exc).lower():
                    raise
            else:
                logger.info("[OCRJobQueue] Added ocr_time_ms column to existing table")

            # Index for efficient queue queries
            await db.execute('''
                CREATE INDEX IF NOT EXISTS idx_ocr_jobs_status
                ON ocr_jobs(status, created_at)
            ''')

            # Index for expiration cleanup
            await db.execute('''
                CREATE INDEX IF NOT EXISTS idx_ocr_jobs_expires
                ON ocr_jobs(expires_at)
            ''')

            await db.commit()

        self._initialized = True
        logger.info(f"[OCRJobQueue] Initialized: db={self.db_path}, files={self.files_dir}")

    async def create_job(
        self,
        file_bytes: bytes,
        mime_type: str,
        engine: str = "doctr_plus",
        username: Optional[str] = None,
        original_filename: Optional[str] = None
    ) -> OCRJob:
        """
        Create a new OCR job.

        Saves file to disk and creates database record. If the record cannot
        be written, the file saved for it is removed again before the error
        propagates.

        Args:
            file_bytes: Raw file bytes
            mime_type: MIME type of file
            engine: OCR engine ('tesseract', 'doctr', 'doctr_plus', 'paddleocr')
            username: Username of requester
            original_filename: Original filename from upload

        Returns:
            Created OCRJob instance
        """
        await self.initialize()

        # Generate job ID
        job_id = str(uuid.uuid4())

        # Determine file extension; unknown MIME types are stored as .bin
        ext = self._EXTENSIONS.get(mime_type, '.bin')

        # Save file
        file_path = self.files_dir / f"{job_id}{ext}"
        with open(file_path, 'wb') as f:
            f.write(file_bytes)

        # Calculate expiration
        now = datetime.utcnow()
        expires_at = now + timedelta(hours=JOB_EXPIRY_HOURS)

        # Insert job record
        try:
            async with aiosqlite.connect(str(self.db_path)) as db:
                await db.execute('''
                    INSERT INTO ocr_jobs (
                        id, status, file_path, mime_type, engine,
                        created_at, created_by, original_filename, expires_at
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    job_id, OCRJobStatus.pending.value, str(file_path), mime_type, engine,
                    now.isoformat(), username, original_filename, expires_at.isoformat()
                ))
                await db.commit()
        except Exception:
            # No row references this upload, so nothing would ever clean it up.
            self._remove_file(file_path)
            raise

        logger.info(f"[OCRJobQueue] Created job {job_id}: engine={engine}, file={file_path.name}")

        return OCRJob(
            id=job_id,
            status=OCRJobStatus.pending,
            file_path=str(file_path),
            mime_type=mime_type,
            engine=engine,
            created_at=now,
            created_by=username,
            original_filename=original_filename,
            expires_at=expires_at
        )

    async def get_job(self, job_id: str) -> Optional[OCRJob]:
        """
        Get job by ID.

        Args:
            job_id: Job UUID

        Returns:
            OCRJob or None if not found
        """
        await self.initialize()

        async with aiosqlite.connect(str(self.db_path)) as db:
            db.row_factory = aiosqlite.Row
            async with db.execute(
                'SELECT * FROM ocr_jobs WHERE id = ?',
                (job_id,)
            ) as cursor:
                row = await cursor.fetchone()

        return self._row_to_job(row) if row else None

    async def get_queue_position(self, job_id: str) -> Optional[int]:
        """
        Get position in queue for a pending job.

        Args:
            job_id: Job UUID

        Returns:
            Queue position (1 = next to process) or None if not pending
        """
        await self.initialize()

        async with aiosqlite.connect(str(self.db_path)) as db:
            # Check if job is pending
            async with db.execute(
                'SELECT status, created_at FROM ocr_jobs WHERE id = ?',
                (job_id,)
            ) as cursor:
                row = await cursor.fetchone()

            if not row or row[0] != OCRJobStatus.pending.value:
                return None
            job_created_at = row[1]

            # Count jobs ahead in queue (created before this job)
            async with db.execute('''
                SELECT COUNT(*) FROM ocr_jobs
                WHERE status = 'pending' AND created_at < ?
            ''', (job_created_at,)) as cursor:
                count = await cursor.fetchone()

        return (count[0] + 1) if count else 1

    async def get_next_pending(self) -> Optional[OCRJob]:
        """
        Get the next pending job (oldest first) and atomically mark it as processing.

        This prevents race conditions in parallel processing - only one worker
        can claim each job. The asyncio lock serializes coroutines that share
        this instance; the conditional UPDATE plus the row-count check covers
        other connections, because only the claimant whose UPDATE changed the
        row gets the job. When a claim is lost, the next pending job is tried.

        Returns:
            Next OCRJob to process or None if queue empty
        """
        await self.initialize()

        async with self._lock:  # Serialize access to prevent race conditions
            async with aiosqlite.connect(str(self.db_path)) as db:
                db.row_factory = aiosqlite.Row

                while True:
                    # Get the next pending job
                    async with db.execute('''
                        SELECT * FROM ocr_jobs
                        WHERE status = 'pending'
                        ORDER BY created_at ASC
                        LIMIT 1
                    ''') as cursor:
                        row = await cursor.fetchone()

                    if not row:
                        return None

                    job_id = row['id']
                    started_at = datetime.utcnow().isoformat()

                    # Atomically mark as processing
                    claim = await db.execute('''
                        UPDATE ocr_jobs
                        SET status = 'processing', started_at = ?
                        WHERE id = ? AND status = 'pending'
                    ''', (started_at, job_id))
                    await db.commit()

                    if claim.rowcount == 0:
                        # Someone else claimed this job between the SELECT and
                        # the UPDATE; handing it out again would process it twice.
                        continue

                    # Fetch the updated job
                    async with db.execute(
                        'SELECT * FROM ocr_jobs WHERE id = ?',
                        (job_id,)
                    ) as cursor:
                        updated_row = await cursor.fetchone()

                    return self._row_to_job(updated_row) if updated_row else None

    async def update_status(
        self,
        job_id: str,
        status: OCRJobStatus,
        result: Optional[Dict] = None,
        error: Optional[str] = None,
        processing_time_ms: Optional[int] = None,
        ocr_time_ms: Optional[int] = None
    ) -> bool:
        """
        Update job status.

        Args:
            job_id: Job UUID
            status: New status
            result: Extraction result dict (for completed)
            error: Error message (for failed)
            processing_time_ms: Total job processing time (started_at to completed_at)
            ocr_time_ms: Actual OCR engine processing time

        Returns:
            True if update successful
        """
        await self.initialize()

        now = datetime.utcnow()
        # ``is not None`` so that an empty result dict is stored as "{}"
        # instead of being silently dropped.
        result_json = json.dumps(result, cls=DecimalEncoder) if result is not None else None

        # Build update query based on status
        if status == OCRJobStatus.processing:
            query = '''
                UPDATE ocr_jobs
                SET status = ?, started_at = ?
                WHERE id = ?
            '''
            params = (status.value, now.isoformat(), job_id)

        elif status == OCRJobStatus.completed:
            query = '''
                UPDATE ocr_jobs
                SET status = ?, completed_at = ?, result_json = ?, processing_time_ms = ?, ocr_time_ms = ?
                WHERE id = ?
            '''
            params = (status.value, now.isoformat(), result_json, processing_time_ms, ocr_time_ms, job_id)

        elif status == OCRJobStatus.failed:
            query = '''
                UPDATE ocr_jobs
                SET status = ?, completed_at = ?, error_message = ?, processing_time_ms = ?, ocr_time_ms = ?
                WHERE id = ?
            '''
            params = (status.value, now.isoformat(), error, processing_time_ms, ocr_time_ms, job_id)

        else:
            # Any remaining status only changes the status column.
            query = 'UPDATE ocr_jobs SET status = ? WHERE id = ?'
            params = (status.value, job_id)

        async with aiosqlite.connect(str(self.db_path)) as db:
            cursor = await db.execute(query, params)
            await db.commit()
            return cursor.rowcount > 0

    async def get_average_processing_time(self) -> float:
        """
        Calculate average processing time from recent completed jobs.

        Uses last 50 completed jobs for accuracy.

        Returns:
            Average time in seconds (default 7.0 if no data, or if the
            average is zero)
        """
        await self.initialize()

        async with aiosqlite.connect(str(self.db_path)) as db:
            async with db.execute('''
                SELECT AVG(processing_time_ms)
                FROM (
                    SELECT processing_time_ms FROM ocr_jobs
                    WHERE status = 'completed' AND processing_time_ms IS NOT NULL
                    ORDER BY completed_at DESC
                    LIMIT 50
                )
            ''') as cursor:
                row = await cursor.fetchone()

        if row and row[0]:
            return row[0] / 1000.0  # Convert ms to seconds
        return 7.0  # Default estimate

    async def _count_by_status(self, status: OCRJobStatus) -> int:
        """Return the number of jobs currently stored with ``status``."""
        await self.initialize()

        async with aiosqlite.connect(str(self.db_path)) as db:
            async with db.execute(
                'SELECT COUNT(*) FROM ocr_jobs WHERE status = ?',
                (status.value,)
            ) as cursor:
                row = await cursor.fetchone()

        return row[0] if row else 0

    async def count_pending(self) -> int:
        """Count pending jobs in queue."""
        return await self._count_by_status(OCRJobStatus.pending)

    async def count_processing(self) -> int:
        """Count currently processing jobs."""
        return await self._count_by_status(OCRJobStatus.processing)

    @staticmethod
    def _remove_file(file_path: Path) -> bool:
        """Delete ``file_path``; return True only if a file was actually removed.

        A missing file is not an error. Any other OS-level failure is logged
        as a warning and reported as ``False``.
        """
        try:
            file_path.unlink()
        except FileNotFoundError:
            return False
        except OSError as e:
            logger.warning(f"[OCRJobQueue] Failed to delete file {file_path}: {e}")
            return False
        return True

    async def cleanup_expired(self) -> int:
        """
        Delete expired jobs and their files.

        A job row is deleted even when its file could not be removed; that
        failure is only logged.

        Returns:
            Number of jobs deleted
        """
        await self.initialize()

        now = datetime.utcnow()
        deleted = 0

        async with aiosqlite.connect(str(self.db_path)) as db:
            db.row_factory = aiosqlite.Row

            # Get expired jobs
            async with db.execute('''
                SELECT id, file_path FROM ocr_jobs
                WHERE expires_at < ?
            ''', (now.isoformat(),)) as cursor:
                rows = await cursor.fetchall()

            for row in rows:
                # Delete file
                self._remove_file(Path(row['file_path']))

                # Delete job record
                await db.execute('DELETE FROM ocr_jobs WHERE id = ?', (row['id'],))
                deleted += 1

            await db.commit()

        if deleted > 0:
            logger.info(f"[OCRJobQueue] Cleaned up {deleted} expired job(s)")

        return deleted

    async def cleanup_job_file(self, job_id: str) -> bool:
        """
        Delete the file associated with a job.

        Called after processing to free disk space. The job record itself is
        left in place.

        Args:
            job_id: Job UUID

        Returns:
            True if file deleted
        """
        job = await self.get_job(job_id)
        if not job:
            return False
        return self._remove_file(Path(job.file_path))

    async def get_queue_stats(self) -> Dict[str, Any]:
        """
        Get queue statistics.

        Returns:
            Dict with pending, processing, completed, failed counts
            plus ``average_time_seconds``
        """
        await self.initialize()

        stats: Dict[str, Any] = {
            "pending": 0,
            "processing": 0,
            "completed": 0,
            "failed": 0,
            "average_time_seconds": 0.0,
        }

        async with aiosqlite.connect(str(self.db_path)) as db:
            async with db.execute('''
                SELECT status, COUNT(*) as count
                FROM ocr_jobs
                GROUP BY status
            ''') as cursor:
                rows = await cursor.fetchall()

        for status_name, job_count in rows:
            # Ignore any status value that is not one of the reported keys.
            if status_name in stats:
                stats[status_name] = job_count

        stats["average_time_seconds"] = await self.get_average_processing_time()
        return stats

    @staticmethod
    def _parse_datetime(val: Any) -> Optional[datetime]:
        """Parse an ISO-8601 string from the database; return None if empty or invalid."""
        if not val:
            return None
        try:
            return datetime.fromisoformat(val)
        except (ValueError, TypeError):
            return None

    def _row_to_job(self, row: aiosqlite.Row) -> OCRJob:
        """Convert database row to OCRJob."""
        parse_datetime = self._parse_datetime

        return OCRJob(
            id=row['id'],
            status=OCRJobStatus(row['status']),
            file_path=row['file_path'],
            mime_type=row['mime_type'],
            engine=row['engine'] or 'doctr_plus',
            created_at=parse_datetime(row['created_at']),
            started_at=parse_datetime(row['started_at']),
            completed_at=parse_datetime(row['completed_at']),
            result_json=row['result_json'],
            error_message=row['error_message'],
            processing_time_ms=row['processing_time_ms'],
            # Tolerate rows selected from a table without this column.
            ocr_time_ms=row['ocr_time_ms'] if 'ocr_time_ms' in row.keys() else None,
            created_by=row['created_by'],
            original_filename=row['original_filename'],
            expires_at=parse_datetime(row['expires_at']),
        )
|
|
|
|
|
|
# Singleton instance
# Constructed with the default paths. The constructor only records
# configuration, so importing this module does not create the database or the
# files directory; that happens on the first call that runs initialize().
job_queue = OCRJobQueue()
|