"""
SQLite Job Queue Manager for OCR Processing

Provides async job queue for OCR requests:
- Jobs are stored in SQLite for persistence
- Queue position and time estimation
- Automatic expiration after 24 hours
- Statistics for monitoring

Schema:
    ocr_jobs (
        id TEXT PRIMARY KEY,           -- UUID
        status TEXT NOT NULL,          -- pending, processing, completed, failed, cancelled
        file_path TEXT NOT NULL,       -- Path to uploaded file
        mime_type TEXT NOT NULL,
        engine TEXT DEFAULT 'doctr_plus',
        created_at TIMESTAMP,
        started_at TIMESTAMP,
        completed_at TIMESTAMP,
        result_json TEXT,              -- JSON extraction result
        error_message TEXT,
        processing_time_ms INTEGER,    -- Total job time (started_at to completed_at)
        ocr_time_ms INTEGER,           -- Actual OCR engine processing time
        created_by TEXT,               -- Username
        original_filename TEXT,
        expires_at TIMESTAMP,
        batch_id INTEGER,              -- Foreign key to batch_uploads (for bulk processing)
        file_hash TEXT                 -- SHA-256 hash for duplicate detection (US-007)
    )

All timestamps are written as naive-UTC ISO-8601 strings, so they compare
correctly as plain text inside SQL.
"""

import asyncio
import json
import logging
import os
import sqlite3
import uuid
from contextlib import asynccontextmanager
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from decimal import Decimal
from enum import Enum
from pathlib import Path
from typing import Any, AsyncIterator, Dict, List, Optional

import aiosqlite

logger = logging.getLogger(__name__)

# Default paths
DEFAULT_QUEUE_DIR = Path(__file__).parent.parent.parent.parent.parent / "data" / "ocr_queue"
DEFAULT_DB_PATH = DEFAULT_QUEUE_DIR / "ocr_jobs.db"
DEFAULT_FILES_DIR = DEFAULT_QUEUE_DIR / "files"

# Job expiration
JOB_EXPIRY_HOURS = 24

# SQLite busy timeout (milliseconds) - prevents "database is locked" errors
SQLITE_BUSY_TIMEOUT_MS = 5000

# Engine used when the caller (or a legacy row) does not specify one
_DEFAULT_ENGINE = "doctr_plus"

# Extension used for the stored upload, keyed by MIME type (fallback: .bin)
_MIME_EXTENSIONS = {
    "image/jpeg": ".jpg",
    "image/png": ".png",
    "application/pdf": ".pdf",
}

# Columns added after the first release; applied as migrations on startup
_MIGRATION_COLUMNS = (
    ("ocr_time_ms", "INTEGER"),
    ("batch_id", "INTEGER"),
    ("file_hash", "TEXT"),  # US-007
)


class DecimalEncoder(json.JSONEncoder):
    """JSON encoder that handles Decimal types."""

    def default(self, obj):
        """Serialize ``Decimal`` as ``float``; defer everything else to the base class."""
        if isinstance(obj, Decimal):
            return float(obj)
        return super().default(obj)


class OCRJobStatus(str, Enum):
    """Job status enum."""

    pending = "pending"
    processing = "processing"
    completed = "completed"
    failed = "failed"
    cancelled = "cancelled"


@dataclass
class OCRJob:
    """OCR Job data class (one row of the ``ocr_jobs`` table)."""

    id: str
    status: OCRJobStatus
    file_path: str
    mime_type: str
    engine: str = "doctr_plus"
    created_at: Optional[datetime] = None
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    result_json: Optional[str] = None
    error_message: Optional[str] = None
    processing_time_ms: Optional[int] = None  # Total job time (started_at to completed_at)
    ocr_time_ms: Optional[int] = None  # Actual OCR engine processing time
    created_by: Optional[str] = None
    original_filename: Optional[str] = None
    expires_at: Optional[datetime] = None
    batch_id: Optional[int] = None  # Links to batch_uploads table for bulk processing
    file_hash: Optional[str] = None  # SHA-256 hash for duplicate detection (US-007)

    @property
    def queue_wait_ms(self) -> Optional[int]:
        """Calculate queue wait time (created_at to started_at) in milliseconds."""
        if self.created_at and self.started_at:
            delta = self.started_at - self.created_at
            return int(delta.total_seconds() * 1000)
        return None

    @property
    def result(self) -> Optional[Dict]:
        """Parse result_json to dict; ``None`` when absent or not valid JSON."""
        if not self.result_json:
            return None
        try:
            return json.loads(self.result_json)
        except json.JSONDecodeError:
            return None


def _parse_datetime(value: Any) -> Optional[datetime]:
    """Parse an ISO-8601 string from the database; ``None`` if empty or malformed."""
    if not value:
        return None
    try:
        return datetime.fromisoformat(value)
    except (ValueError, TypeError):
        return None


class OCRJobQueue:
    """
    SQLite-based job queue for OCR processing.

    Provides async methods for job management with position tracking
    and time estimation.
    """

    def __init__(
        self,
        db_path: Optional[Path] = None,
        files_dir: Optional[Path] = None
    ):
        """
        Initialize job queue.

        Args:
            db_path: Path to SQLite database (default: data/ocr_queue/ocr_jobs.db)
            files_dir: Path to files directory (default: data/ocr_queue/files/)
        """
        self.db_path = Path(db_path) if db_path else DEFAULT_DB_PATH
        self.files_dir = Path(files_dir) if files_dir else DEFAULT_FILES_DIR
        self._lock = asyncio.Lock()
        self._initialized = False

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @asynccontextmanager
    async def _connect(self, *, rows: bool = False) -> AsyncIterator[aiosqlite.Connection]:
        """Open a connection with the busy timeout applied.

        Args:
            rows: When True, rows are returned as ``aiosqlite.Row`` so columns
                can be read by name.
        """
        async with aiosqlite.connect(str(self.db_path)) as db:
            await db.execute(f"PRAGMA busy_timeout={SQLITE_BUSY_TIMEOUT_MS}")
            if rows:
                db.row_factory = aiosqlite.Row
            yield db

    @staticmethod
    async def _add_column_if_missing(
        db: aiosqlite.Connection, column: str, sql_type: str
    ) -> None:
        """Add ``column`` to ``ocr_jobs`` unless it already exists.

        Only SQLite's "duplicate column name" error is treated as benign; any
        other failure (locked database, I/O error, ...) propagates instead of
        being silently swallowed.
        """
        try:
            # column/sql_type come from the module-level constant, never from user input
            await db.execute(f"ALTER TABLE ocr_jobs ADD COLUMN {column} {sql_type}")
        except sqlite3.OperationalError as exc:
            if "duplicate column name" not in str(exc).lower():
                raise
        else:
            logger.info("[OCRJobQueue] Added %s column to existing table", column)

    @staticmethod
    def _remove_file(file_path: Path) -> bool:
        """Delete ``file_path``; return True only if a file was actually removed."""
        try:
            file_path.unlink()
        except FileNotFoundError:
            return False
        except OSError as exc:
            logger.warning("[OCRJobQueue] Failed to delete file %s: %s", file_path, exc)
            return False
        return True

    async def _count_by_status(self, status: OCRJobStatus) -> int:
        """Return the number of jobs currently in ``status``."""
        await self.initialize()

        async with self._connect() as db:
            async with db.execute(
                'SELECT COUNT(*) FROM ocr_jobs WHERE status = ?',
                (status.value,)
            ) as cursor:
                row = await cursor.fetchone()
                return row[0] if row else 0

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def initialize(self) -> None:
        """
        Initialize database and directories.

        Creates SQLite database and tables if they don't exist.
        Creates files directory for uploaded files.
        Safe to call repeatedly; work is done only on the first call.
        """
        if self._initialized:
            return

        # Create directories
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self.files_dir.mkdir(parents=True, exist_ok=True)

        # Create database and tables
        async with self._connect() as db:
            # Enable WAL mode for better concurrency
            await db.execute("PRAGMA journal_mode=WAL")

            await db.execute('''
                CREATE TABLE IF NOT EXISTS ocr_jobs (
                    id TEXT PRIMARY KEY,
                    status TEXT NOT NULL DEFAULT 'pending',
                    file_path TEXT NOT NULL,
                    mime_type TEXT NOT NULL,
                    engine TEXT DEFAULT 'doctr_plus',
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    started_at TIMESTAMP,
                    completed_at TIMESTAMP,
                    result_json TEXT,
                    error_message TEXT,
                    processing_time_ms INTEGER,
                    ocr_time_ms INTEGER,
                    created_by TEXT,
                    original_filename TEXT,
                    expires_at TIMESTAMP,
                    batch_id INTEGER,
                    file_hash TEXT
                )
            ''')

            # Migrations: bring tables created by older versions up to date
            for column, sql_type in _MIGRATION_COLUMNS:
                await self._add_column_if_missing(db, column, sql_type)

            # Index for efficient queue queries
            await db.execute('''
                CREATE INDEX IF NOT EXISTS idx_ocr_jobs_status
                ON ocr_jobs(status, created_at)
            ''')

            # Index for expiration cleanup
            await db.execute('''
                CREATE INDEX IF NOT EXISTS idx_ocr_jobs_expires
                ON ocr_jobs(expires_at)
            ''')

            await db.commit()

        self._initialized = True
        logger.info(
            "[OCRJobQueue] Initialized: db=%s, files=%s", self.db_path, self.files_dir
        )

    async def create_job(
        self,
        file_bytes: bytes,
        mime_type: str,
        engine: str = "doctr_plus",
        username: Optional[str] = None,
        original_filename: Optional[str] = None,
        batch_id: Optional[int] = None,
        file_hash: Optional[str] = None
    ) -> OCRJob:
        """
        Create a new OCR job.

        Saves file to disk and creates database record. If either step fails,
        the stored file is removed again so no orphan is left behind (an
        orphan has no database row and would never be found by
        ``cleanup_expired``).

        Args:
            file_bytes: Raw file bytes
            mime_type: MIME type of file
            engine: OCR engine ('tesseract', 'doctr', 'doctr_plus', 'paddleocr')
            username: Username of requester
            original_filename: Original filename from upload
            batch_id: Optional batch ID for bulk upload processing
            file_hash: Optional SHA-256 hash for duplicate detection (US-007)

        Returns:
            Created OCRJob instance
        """
        await self.initialize()

        job_id = str(uuid.uuid4())
        ext = _MIME_EXTENSIONS.get(mime_type, '.bin')
        file_path = self.files_dir / f"{job_id}{ext}"

        # Naive UTC on purpose: matches the ISO strings already stored in the table
        now = datetime.utcnow()
        expires_at = now + timedelta(hours=JOB_EXPIRY_HOURS)

        persisted = False
        try:
            # Write in the default executor so a large upload does not block the event loop
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, file_path.write_bytes, file_bytes)

            async with self._connect() as db:
                await db.execute('''
                    INSERT INTO ocr_jobs (
                        id, status, file_path, mime_type, engine,
                        created_at, created_by, original_filename, expires_at,
                        batch_id, file_hash
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    job_id,
                    OCRJobStatus.pending.value,
                    str(file_path),
                    mime_type,
                    engine,
                    now.isoformat(),
                    username,
                    original_filename,
                    expires_at.isoformat(),
                    batch_id,
                    file_hash
                ))
                await db.commit()
            persisted = True
        finally:
            # Covers ordinary errors and task cancellation alike
            if not persisted:
                self._remove_file(file_path)

        logger.info(
            "[OCRJobQueue] Created job %s: engine=%s, file=%s, batch_id=%s",
            job_id, engine, file_path.name, batch_id
        )

        return OCRJob(
            id=job_id,
            status=OCRJobStatus.pending,
            file_path=str(file_path),
            mime_type=mime_type,
            engine=engine,
            created_at=now,
            created_by=username,
            original_filename=original_filename,
            expires_at=expires_at,
            batch_id=batch_id,
            file_hash=file_hash
        )

    async def get_job(self, job_id: str) -> Optional[OCRJob]:
        """
        Get job by ID.

        Args:
            job_id: Job UUID

        Returns:
            OCRJob or None if not found
        """
        await self.initialize()

        async with self._connect(rows=True) as db:
            async with db.execute(
                'SELECT * FROM ocr_jobs WHERE id = ?',
                (job_id,)
            ) as cursor:
                row = await cursor.fetchone()

        return self._row_to_job(row) if row else None

    async def get_queue_position(self, job_id: str) -> Optional[int]:
        """
        Get position in queue for a pending job.

        Args:
            job_id: Job UUID

        Returns:
            Queue position (1 = next to process) or None if not pending
        """
        await self.initialize()

        async with self._connect() as db:
            # Check if job is pending
            async with db.execute(
                'SELECT status, created_at FROM ocr_jobs WHERE id = ?',
                (job_id,)
            ) as cursor:
                row = await cursor.fetchone()

            if not row or row[0] != OCRJobStatus.pending.value:
                return None

            job_created_at = row[1]

            # Count jobs ahead in queue (created before this job)
            async with db.execute('''
                SELECT COUNT(*) FROM ocr_jobs
                WHERE status = 'pending' AND created_at < ?
            ''', (job_created_at,)) as cursor:
                count = await cursor.fetchone()

        return (count[0] + 1) if count else 1

    async def get_next_pending(self) -> Optional[OCRJob]:
        """
        Get the next pending job (oldest first) and atomically mark it as processing.

        The in-process lock serializes coroutines of this instance. The
        ``status = 'pending'`` guard on the UPDATE, together with the rowcount
        check, protects against other processes: if another worker claimed the
        row between our SELECT and UPDATE, the UPDATE touches zero rows and we
        move on to the next candidate instead of returning a job that someone
        else is already processing.

        Returns:
            Next OCRJob to process or None if queue empty
        """
        await self.initialize()

        async with self._lock:
            async with self._connect(rows=True) as db:
                while True:
                    # Oldest pending job is the next candidate
                    async with db.execute('''
                        SELECT id FROM ocr_jobs
                        WHERE status = 'pending'
                        ORDER BY created_at ASC
                        LIMIT 1
                    ''') as cursor:
                        candidate = await cursor.fetchone()

                    if not candidate:
                        return None

                    job_id = candidate['id']

                    # Claim it; the status guard makes the claim conditional
                    claim = await db.execute('''
                        UPDATE ocr_jobs
                        SET status = 'processing', started_at = ?
                        WHERE id = ? AND status = 'pending'
                    ''', (datetime.utcnow().isoformat(), job_id))
                    await db.commit()

                    if claim.rowcount == 0:
                        # Lost the race to another worker; try the next pending job
                        continue

                    # Fetch the updated job
                    async with db.execute(
                        'SELECT * FROM ocr_jobs WHERE id = ?',
                        (job_id,)
                    ) as cursor:
                        claimed = await cursor.fetchone()

                    return self._row_to_job(claimed) if claimed else None

    async def update_status(
        self,
        job_id: str,
        status: OCRJobStatus,
        result: Optional[Dict] = None,
        error: Optional[str] = None,
        processing_time_ms: Optional[int] = None,
        ocr_time_ms: Optional[int] = None
    ) -> bool:
        """
        Update job status.

        Args:
            job_id: Job UUID
            status: New status
            result: Extraction result dict (for completed)
            error: Error message (for failed)
            processing_time_ms: Total job processing time (started_at to completed_at)
            ocr_time_ms: Actual OCR engine processing time

        Returns:
            True if update successful (a row with this ID existed)
        """
        await self.initialize()

        timestamp = datetime.utcnow().isoformat()
        result_json = json.dumps(result, cls=DecimalEncoder) if result else None

        # Build update query based on status
        if status == OCRJobStatus.processing:
            query = '''
                UPDATE ocr_jobs
                SET status = ?, started_at = ?
                WHERE id = ?
            '''
            params = (status.value, timestamp, job_id)
        elif status == OCRJobStatus.completed:
            query = '''
                UPDATE ocr_jobs
                SET status = ?, completed_at = ?, result_json = ?,
                    processing_time_ms = ?, ocr_time_ms = ?
                WHERE id = ?
            '''
            params = (
                status.value, timestamp, result_json,
                processing_time_ms, ocr_time_ms, job_id
            )
        elif status == OCRJobStatus.failed:
            query = '''
                UPDATE ocr_jobs
                SET status = ?, completed_at = ?, error_message = ?,
                    processing_time_ms = ?, ocr_time_ms = ?
                WHERE id = ?
            '''
            params = (
                status.value, timestamp, error,
                processing_time_ms, ocr_time_ms, job_id
            )
        else:
            # pending / cancelled: only the status column changes
            query = 'UPDATE ocr_jobs SET status = ? WHERE id = ?'
            params = (status.value, job_id)

        async with self._connect() as db:
            cursor = await db.execute(query, params)
            await db.commit()
            return cursor.rowcount > 0

    async def get_average_processing_time(self) -> float:
        """
        Calculate average processing time from recent completed jobs.

        Uses last 50 completed jobs for accuracy.

        Returns:
            Average time in seconds (default 7.0 if no data)
        """
        await self.initialize()

        async with self._connect() as db:
            async with db.execute('''
                SELECT AVG(processing_time_ms) FROM (
                    SELECT processing_time_ms FROM ocr_jobs
                    WHERE status = 'completed' AND processing_time_ms IS NOT NULL
                    ORDER BY completed_at DESC
                    LIMIT 50
                )
            ''') as cursor:
                row = await cursor.fetchone()

        if row and row[0]:
            return row[0] / 1000.0  # Convert ms to seconds
        return 7.0  # Default estimate

    async def count_pending(self) -> int:
        """Count pending jobs in queue."""
        return await self._count_by_status(OCRJobStatus.pending)

    async def count_processing(self) -> int:
        """Count currently processing jobs."""
        return await self._count_by_status(OCRJobStatus.processing)

    async def cleanup_expired(self) -> int:
        """
        Delete expired jobs and their files.

        A job's record is removed even when its file could not be deleted
        (the failure is logged).

        Returns:
            Number of jobs deleted
        """
        await self.initialize()

        cutoff = datetime.utcnow().isoformat()
        deleted = 0

        async with self._connect(rows=True) as db:
            # Get expired jobs
            async with db.execute('''
                SELECT id, file_path FROM ocr_jobs
                WHERE expires_at < ?
            ''', (cutoff,)) as cursor:
                expired = await cursor.fetchall()

            for row in expired:
                self._remove_file(Path(row['file_path']))
                await db.execute('DELETE FROM ocr_jobs WHERE id = ?', (row['id'],))
                deleted += 1

            await db.commit()

        if deleted > 0:
            logger.info("[OCRJobQueue] Cleaned up %s expired job(s)", deleted)

        return deleted

    async def cleanup_job_file(self, job_id: str) -> bool:
        """
        Delete the file associated with a job.

        Called after processing to free disk space.

        Args:
            job_id: Job UUID

        Returns:
            True if file deleted
        """
        job = await self.get_job(job_id)
        if not job:
            return False
        return self._remove_file(Path(job.file_path))

    async def get_queue_stats(self) -> Dict[str, Any]:
        """
        Get queue statistics.

        Returns:
            Dict with pending, processing, completed, failed counts and
            ``average_time_seconds``
        """
        await self.initialize()

        stats: Dict[str, Any] = {
            "pending": 0,
            "processing": 0,
            "completed": 0,
            "failed": 0,
            "average_time_seconds": 0.0,
        }

        async with self._connect() as db:
            async with db.execute('''
                SELECT status, COUNT(*) as count
                FROM ocr_jobs
                GROUP BY status
            ''') as cursor:
                rows = await cursor.fetchall()

        for status, count in rows:
            # Statuses without a slot in ``stats`` (e.g. cancelled) are not reported
            if status in stats:
                stats[status] = count

        stats["average_time_seconds"] = await self.get_average_processing_time()

        return stats

    def _row_to_job(self, row: aiosqlite.Row) -> OCRJob:
        """Convert database row to OCRJob.

        Columns introduced by migrations are read defensively so rows selected
        with an older column set still convert.
        """
        columns = row.keys()

        def optional(name: str) -> Any:
            return row[name] if name in columns else None

        return OCRJob(
            id=row['id'],
            status=OCRJobStatus(row['status']),
            file_path=row['file_path'],
            mime_type=row['mime_type'],
            engine=row['engine'] or _DEFAULT_ENGINE,
            created_at=_parse_datetime(row['created_at']),
            started_at=_parse_datetime(row['started_at']),
            completed_at=_parse_datetime(row['completed_at']),
            result_json=row['result_json'],
            error_message=row['error_message'],
            processing_time_ms=row['processing_time_ms'],
            ocr_time_ms=optional('ocr_time_ms'),
            created_by=row['created_by'],
            original_filename=row['original_filename'],
            expires_at=_parse_datetime(row['expires_at']),
            batch_id=optional('batch_id'),
            file_hash=optional('file_hash'),
        )


# Singleton instance
job_queue = OCRJobQueue()