Files
roa2web-service-auto/backend/modules/data_entry/services/ocr/job_queue.py
Claude Agent b137e80b71 feat: multi-Oracle server support with runtime switching
Complete implementation of multi-server Oracle database support:

Backend:
- Multi-pool Oracle with lazy loading per server
- Email-to-server cache for automatic server discovery
- JWT tokens include server_id claim
- /auth/check-identity and /auth/check-email endpoints
- /auth/my-servers endpoint for listing user's accessible servers
- Server switch with password re-authentication

Frontend:
- New ServerSelector component for header dropdown
- Multi-step login flow (identity → server → password)
- Server switching from header with password modal
- Mobile drawer menu with server selection
- Dark mode support for all new components
- URL bookmark support with ?server= query param

Scripts:
- Unified start.sh replacing start-prod.sh/start-test.sh
- Unified ssh-tunnel.sh with multi-server support
- Updated status.sh for new architecture

Tests:
- E2E tests for multi-server and single-server login flows
- Backend unit tests for all new endpoints
- Oracle multi-pool integration tests

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-26 22:39:06 +00:00

654 lines
22 KiB
Python

"""
SQLite Job Queue Manager for OCR Processing
Provides async job queue for OCR requests:
- Jobs are stored in SQLite for persistence
- Queue position and time estimation
- Automatic expiration after 24 hours
- Statistics for monitoring
Schema:
ocr_jobs (
id TEXT PRIMARY KEY, -- UUID
status TEXT NOT NULL, -- pending, processing, completed, failed, cancelled
file_path TEXT NOT NULL, -- Path to uploaded file
mime_type TEXT NOT NULL,
engine TEXT DEFAULT 'doctr_plus',
created_at TIMESTAMP,
started_at TIMESTAMP,
completed_at TIMESTAMP,
result_json TEXT, -- JSON extraction result
error_message TEXT,
processing_time_ms INTEGER, -- Total job time (started_at to completed_at)
ocr_time_ms INTEGER, -- Actual OCR engine processing time
created_by TEXT, -- Username
original_filename TEXT,
expires_at TIMESTAMP,
batch_id INTEGER, -- Foreign key to batch_uploads (for bulk processing)
file_hash TEXT -- SHA-256 hash for duplicate detection (US-007)
)
"""
import asyncio
import json
from decimal import Decimal
class DecimalEncoder(json.JSONEncoder):
    """``json.JSONEncoder`` subclass that can serialize ``Decimal`` values.

    Decimals are emitted as JSON numbers by converting them with ``float()``;
    all other unsupported types fall through to the base encoder.
    """

    def default(self, obj):
        """Return a JSON-serializable replacement for *obj*.

        Args:
            obj: Object the standard encoder could not serialize.

        Returns:
            ``float(obj)`` when *obj* is a ``Decimal``.

        Raises:
            TypeError: Raised by the base class for any other type.
        """
        if not isinstance(obj, Decimal):
            # Not ours to handle: let the base class raise TypeError.
            return super().default(obj)
        return float(obj)
import logging
import os
import uuid
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional
import aiosqlite
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Default paths
# The queue directory lives five directory levels above this file, under data/ocr_queue.
DEFAULT_QUEUE_DIR = Path(__file__).parent.parent.parent.parent.parent / "data" / "ocr_queue"
# SQLite database file used when OCRJobQueue is built without an explicit db_path.
DEFAULT_DB_PATH = DEFAULT_QUEUE_DIR / "ocr_jobs.db"
# Directory for uploaded files used when OCRJobQueue is built without an explicit files_dir.
DEFAULT_FILES_DIR = DEFAULT_QUEUE_DIR / "files"
# Job expiration
# Number of hours added to the creation time to compute a job's expires_at (see create_job).
JOB_EXPIRY_HOURS = 24
# SQLite busy timeout (milliseconds) - prevents "database is locked" errors
# Applied via "PRAGMA busy_timeout" on every connection this module opens.
SQLITE_BUSY_TIMEOUT_MS = 5000
class OCRJobStatus(str, Enum):
    """Lifecycle states of an OCR job.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value (the form stored in the ``status`` column).
    """

    # Waiting in the queue; eligible to be picked up by get_next_pending().
    pending = "pending"
    # Claimed by a worker; started_at has been recorded.
    processing = "processing"
    # Finished successfully; result_json holds the extraction result.
    completed = "completed"
    # Finished with an error; error_message holds the reason.
    failed = "failed"
    # Cancelled state; nothing in this module sets it directly
    # (update_status() stores it without touching any timestamps).
    cancelled = "cancelled"
@dataclass
class OCRJob:
    """In-memory representation of one row of the ``ocr_jobs`` table."""

    # Job UUID (primary key).
    id: str
    # Current lifecycle state.
    status: OCRJobStatus
    # Location of the uploaded file on disk.
    file_path: str
    # MIME type supplied when the job was created.
    mime_type: str
    # OCR engine name requested for this job.
    engine: str = "doctr_plus"
    created_at: Optional[datetime] = None
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    # Raw JSON text of the extraction result; use the ``result`` property for a dict.
    result_json: Optional[str] = None
    error_message: Optional[str] = None
    processing_time_ms: Optional[int] = None  # Total job time (started_at to completed_at)
    ocr_time_ms: Optional[int] = None  # Actual OCR engine processing time
    # Username of the requester.
    created_by: Optional[str] = None
    original_filename: Optional[str] = None
    expires_at: Optional[datetime] = None
    batch_id: Optional[int] = None  # Links to batch_uploads table for bulk processing
    file_hash: Optional[str] = None  # SHA-256 hash for duplicate detection (US-007)

    @property
    def queue_wait_ms(self) -> Optional[int]:
        """Milliseconds spent waiting in the queue (created_at to started_at).

        Returns:
            The wait truncated to whole milliseconds, or ``None`` when either
            timestamp is missing.
        """
        if not (self.created_at and self.started_at):
            return None
        waited = self.started_at - self.created_at
        return int(waited.total_seconds() * 1000)

    @property
    def result(self) -> Optional[Dict]:
        """Decoded form of ``result_json``.

        Returns:
            The parsed JSON value, or ``None`` when ``result_json`` is empty
            or is not valid JSON.
        """
        if not self.result_json:
            return None
        try:
            return json.loads(self.result_json)
        except json.JSONDecodeError:
            # Malformed payloads are reported as "no result" rather than raised.
            return None
class OCRJobQueue:
    """
    SQLite-based job queue for OCR processing.

    Provides async methods for job management with position
    tracking and time estimation.

    Every operation opens its own short-lived ``aiosqlite`` connection and
    sets ``PRAGMA busy_timeout`` on it. All timestamps are naive UTC values
    (``datetime.utcnow()``) stored as ISO-8601 strings; they are kept naive on
    purpose so that string comparisons against already-stored rows stay valid.
    """

    def __init__(
        self,
        db_path: Optional[Path] = None,
        files_dir: Optional[Path] = None
    ):
        """
        Initialize job queue.

        No I/O happens here; directories and the database are created lazily
        by :meth:`initialize`.

        Args:
            db_path: Path to SQLite database (default: data/ocr_queue/ocr_jobs.db)
            files_dir: Path to files directory (default: data/ocr_queue/files/)
        """
        self.db_path = Path(db_path) if db_path else DEFAULT_DB_PATH
        self.files_dir = Path(files_dir) if files_dir else DEFAULT_FILES_DIR
        # Serializes job claiming between coroutines sharing this instance.
        self._lock = asyncio.Lock()
        self._initialized = False

    async def initialize(self) -> None:
        """
        Initialize database and directories.

        Creates SQLite database and tables if they don't exist.
        Creates files directory for uploaded files.
        Subsequent calls return immediately once initialization succeeded.

        Raises:
            aiosqlite.OperationalError: If a schema statement fails for any
                reason other than a migration column already existing.
        """
        if self._initialized:
            return

        # Create directories
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self.files_dir.mkdir(parents=True, exist_ok=True)

        # Create database and tables
        async with aiosqlite.connect(str(self.db_path)) as db:
            # Enable WAL mode for better concurrency and set busy timeout
            await db.execute("PRAGMA journal_mode=WAL")
            await db.execute(f"PRAGMA busy_timeout={SQLITE_BUSY_TIMEOUT_MS}")
            await db.execute('''
                CREATE TABLE IF NOT EXISTS ocr_jobs (
                    id TEXT PRIMARY KEY,
                    status TEXT NOT NULL DEFAULT 'pending',
                    file_path TEXT NOT NULL,
                    mime_type TEXT NOT NULL,
                    engine TEXT DEFAULT 'doctr_plus',
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    started_at TIMESTAMP,
                    completed_at TIMESTAMP,
                    result_json TEXT,
                    error_message TEXT,
                    processing_time_ms INTEGER,
                    ocr_time_ms INTEGER,
                    created_by TEXT,
                    original_filename TEXT,
                    expires_at TIMESTAMP,
                    batch_id INTEGER,
                    file_hash TEXT
                )
            ''')

            # Migrations: databases created by older versions may lack these
            # columns (file_hash was introduced for US-007).
            for column, declaration in (
                ("ocr_time_ms", "INTEGER"),
                ("batch_id", "INTEGER"),
                ("file_hash", "TEXT"),
            ):
                await self._add_column_if_missing(db, column, declaration)

            # Index for efficient queue queries
            await db.execute('''
                CREATE INDEX IF NOT EXISTS idx_ocr_jobs_status
                ON ocr_jobs(status, created_at)
            ''')
            # Index for expiration cleanup
            await db.execute('''
                CREATE INDEX IF NOT EXISTS idx_ocr_jobs_expires
                ON ocr_jobs(expires_at)
            ''')
            await db.commit()

        self._initialized = True
        logger.info(f"[OCRJobQueue] Initialized: db={self.db_path}, files={self.files_dir}")

    async def _add_column_if_missing(self, db, column: str, declaration: str) -> None:
        """Add *column* to ``ocr_jobs`` unless the table already has it.

        Only SQLite's "duplicate column name" error is treated as "already
        migrated"; any other failure (locked database, I/O error, ...) is
        re-raised instead of being silently ignored.

        Args:
            db: Open aiosqlite connection.
            column: Column name (trusted, hard-coded by the caller).
            declaration: SQL type declaration for the column.
        """
        try:
            await db.execute(f'ALTER TABLE ocr_jobs ADD COLUMN {column} {declaration}')
        except aiosqlite.OperationalError as exc:
            if "duplicate column name" not in str(exc).lower():
                raise
        else:
            logger.info(f"[OCRJobQueue] Added {column} column to existing table")

    async def create_job(
        self,
        file_bytes: bytes,
        mime_type: str,
        engine: str = "doctr_plus",
        username: Optional[str] = None,
        original_filename: Optional[str] = None,
        batch_id: Optional[int] = None,
        file_hash: Optional[str] = None
    ) -> OCRJob:
        """
        Create a new OCR job.

        Saves file to disk and creates database record. If either step fails
        the partially written file is removed again so no orphan is left
        behind, and the original exception is re-raised.

        Args:
            file_bytes: Raw file bytes
            mime_type: MIME type of file
            engine: OCR engine ('tesseract', 'doctr', 'doctr_plus', 'paddleocr')
            username: Username of requester
            original_filename: Original filename from upload
            batch_id: Optional batch ID for bulk upload processing
            file_hash: Optional SHA-256 hash for duplicate detection (US-007)

        Returns:
            Created OCRJob instance
        """
        await self.initialize()

        # Generate job ID
        job_id = str(uuid.uuid4())

        # Determine file extension; unknown MIME types are stored as .bin
        ext_map = {
            'image/jpeg': '.jpg',
            'image/png': '.png',
            'application/pdf': '.pdf',
        }
        ext = ext_map.get(mime_type, '.bin')
        file_path = self.files_dir / f"{job_id}{ext}"

        # Calculate expiration
        now = datetime.utcnow()
        expires_at = now + timedelta(hours=JOB_EXPIRY_HOURS)

        try:
            # Save file
            with open(file_path, 'wb') as f:
                f.write(file_bytes)

            # Insert job record
            async with aiosqlite.connect(str(self.db_path)) as db:
                await db.execute(f"PRAGMA busy_timeout={SQLITE_BUSY_TIMEOUT_MS}")
                await db.execute('''
                    INSERT INTO ocr_jobs (
                        id, status, file_path, mime_type, engine,
                        created_at, created_by, original_filename, expires_at, batch_id, file_hash
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    job_id, OCRJobStatus.pending.value, str(file_path), mime_type, engine,
                    now.isoformat(), username, original_filename, expires_at.isoformat(), batch_id, file_hash
                ))
                await db.commit()
        except Exception:
            # cleanup_expired() only removes files that have a row, so a file
            # without a record would never be cleaned up.
            try:
                file_path.unlink()
            except FileNotFoundError:
                pass
            except OSError as e:
                logger.warning(f"[OCRJobQueue] Failed to delete file {file_path}: {e}")
            raise

        logger.info(f"[OCRJobQueue] Created job {job_id}: engine={engine}, file={file_path.name}, batch_id={batch_id}")
        return OCRJob(
            id=job_id,
            status=OCRJobStatus.pending,
            file_path=str(file_path),
            mime_type=mime_type,
            engine=engine,
            created_at=now,
            created_by=username,
            original_filename=original_filename,
            expires_at=expires_at,
            batch_id=batch_id,
            file_hash=file_hash
        )

    async def get_job(self, job_id: str) -> Optional[OCRJob]:
        """
        Get job by ID.

        Args:
            job_id: Job UUID

        Returns:
            OCRJob or None if not found
        """
        await self.initialize()
        async with aiosqlite.connect(str(self.db_path)) as db:
            await db.execute(f"PRAGMA busy_timeout={SQLITE_BUSY_TIMEOUT_MS}")
            db.row_factory = aiosqlite.Row
            async with db.execute(
                'SELECT * FROM ocr_jobs WHERE id = ?',
                (job_id,)
            ) as cursor:
                row = await cursor.fetchone()
                if row:
                    return self._row_to_job(row)
                return None

    async def get_queue_position(self, job_id: str) -> Optional[int]:
        """
        Get position in queue for a pending job.

        The position is one plus the number of pending jobs whose
        ``created_at`` is strictly earlier than this job's.

        Args:
            job_id: Job UUID

        Returns:
            Queue position (1 = next to process) or None if not pending
        """
        await self.initialize()
        async with aiosqlite.connect(str(self.db_path)) as db:
            await db.execute(f"PRAGMA busy_timeout={SQLITE_BUSY_TIMEOUT_MS}")

            # Check if job is pending
            async with db.execute(
                'SELECT status, created_at FROM ocr_jobs WHERE id = ?',
                (job_id,)
            ) as cursor:
                row = await cursor.fetchone()
                if not row or row[0] != OCRJobStatus.pending.value:
                    return None
                job_created_at = row[1]

            # Count jobs ahead in queue (created before this job)
            async with db.execute('''
                SELECT COUNT(*) FROM ocr_jobs
                WHERE status = 'pending' AND created_at < ?
            ''', (job_created_at,)) as cursor:
                count = await cursor.fetchone()
                return (count[0] + 1) if count else 1

    async def get_next_pending(self) -> Optional[OCRJob]:
        """
        Get the next pending job (oldest first) and mark it as processing.

        Callers sharing this instance are serialized by an in-process lock.
        The claim itself is a conditional UPDATE (``... AND status =
        'pending'``) whose row count is checked, so a job that another
        connection claimed in the meantime is skipped instead of being
        handed out twice.

        Returns:
            Next OCRJob to process or None if queue empty
        """
        await self.initialize()
        async with self._lock:  # Serialize access to prevent race conditions
            async with aiosqlite.connect(str(self.db_path)) as db:
                await db.execute(f"PRAGMA busy_timeout={SQLITE_BUSY_TIMEOUT_MS}")
                db.row_factory = aiosqlite.Row

                while True:
                    # Get the next pending job
                    async with db.execute('''
                        SELECT * FROM ocr_jobs
                        WHERE status = 'pending'
                        ORDER BY created_at ASC
                        LIMIT 1
                    ''') as cursor:
                        row = await cursor.fetchone()
                    if not row:
                        return None
                    job_id = row['id']

                    # Timestamp taken at claim time, not before waiting on the lock.
                    started_at = datetime.utcnow().isoformat()
                    claim = await db.execute('''
                        UPDATE ocr_jobs
                        SET status = 'processing', started_at = ?
                        WHERE id = ? AND status = 'pending'
                    ''', (started_at, job_id))
                    await db.commit()

                    if claim.rowcount == 0:
                        # Someone else claimed this row between our SELECT and
                        # UPDATE; it is no longer pending, so try the next one.
                        continue

                    # Fetch the updated job
                    async with db.execute(
                        'SELECT * FROM ocr_jobs WHERE id = ?',
                        (job_id,)
                    ) as cursor:
                        updated_row = await cursor.fetchone()
                    if updated_row:
                        return self._row_to_job(updated_row)
                    return None

    async def update_status(
        self,
        job_id: str,
        status: OCRJobStatus,
        result: Optional[Dict] = None,
        error: Optional[str] = None,
        processing_time_ms: Optional[int] = None,
        ocr_time_ms: Optional[int] = None
    ) -> bool:
        """
        Update job status.

        Which columns are written depends on *status*: ``processing`` sets
        ``started_at``; ``completed`` sets ``completed_at``, the result and
        timings; ``failed`` sets ``completed_at``, the error and timings; any
        other status only changes the ``status`` column.

        Args:
            job_id: Job UUID
            status: New status
            result: Extraction result dict (for completed). An empty dict is
                stored as ``{}``; only ``None`` is stored as NULL.
            error: Error message (for failed)
            processing_time_ms: Total job processing time (started_at to completed_at)
            ocr_time_ms: Actual OCR engine processing time

        Returns:
            True if update successful (a row with this ID was updated)
        """
        await self.initialize()
        now = datetime.utcnow()
        # "is not None" so that a legitimately empty result is not lost.
        result_json = json.dumps(result, cls=DecimalEncoder) if result is not None else None

        # Build update query based on status
        if status == OCRJobStatus.processing:
            query = '''
                UPDATE ocr_jobs
                SET status = ?, started_at = ?
                WHERE id = ?
            '''
            params = (status.value, now.isoformat(), job_id)
        elif status == OCRJobStatus.completed:
            query = '''
                UPDATE ocr_jobs
                SET status = ?, completed_at = ?, result_json = ?, processing_time_ms = ?, ocr_time_ms = ?
                WHERE id = ?
            '''
            params = (status.value, now.isoformat(), result_json, processing_time_ms, ocr_time_ms, job_id)
        elif status == OCRJobStatus.failed:
            query = '''
                UPDATE ocr_jobs
                SET status = ?, completed_at = ?, error_message = ?, processing_time_ms = ?, ocr_time_ms = ?
                WHERE id = ?
            '''
            params = (status.value, now.isoformat(), error, processing_time_ms, ocr_time_ms, job_id)
        else:
            query = 'UPDATE ocr_jobs SET status = ? WHERE id = ?'
            params = (status.value, job_id)

        async with aiosqlite.connect(str(self.db_path)) as db:
            await db.execute(f"PRAGMA busy_timeout={SQLITE_BUSY_TIMEOUT_MS}")
            cursor = await db.execute(query, params)
            await db.commit()
            return cursor.rowcount > 0

    async def get_average_processing_time(self) -> float:
        """
        Calculate average processing time from recent completed jobs.

        Uses last 50 completed jobs for accuracy.

        Returns:
            Average time in seconds (default 7.0 if no data)
        """
        await self.initialize()
        async with aiosqlite.connect(str(self.db_path)) as db:
            await db.execute(f"PRAGMA busy_timeout={SQLITE_BUSY_TIMEOUT_MS}")
            async with db.execute('''
                SELECT AVG(processing_time_ms)
                FROM (
                    SELECT processing_time_ms FROM ocr_jobs
                    WHERE status = 'completed' AND processing_time_ms IS NOT NULL
                    ORDER BY completed_at DESC
                    LIMIT 50
                )
            ''') as cursor:
                row = await cursor.fetchone()
                if row and row[0]:
                    return row[0] / 1000.0  # Convert ms to seconds
                return 7.0  # Default estimate

    async def _count_by_status(self, status: OCRJobStatus) -> int:
        """Return the number of jobs currently stored with *status*."""
        await self.initialize()
        async with aiosqlite.connect(str(self.db_path)) as db:
            await db.execute(f"PRAGMA busy_timeout={SQLITE_BUSY_TIMEOUT_MS}")
            async with db.execute(
                'SELECT COUNT(*) FROM ocr_jobs WHERE status = ?',
                (status.value,)
            ) as cursor:
                row = await cursor.fetchone()
                return row[0] if row else 0

    async def count_pending(self) -> int:
        """Count pending jobs in queue."""
        return await self._count_by_status(OCRJobStatus.pending)

    async def count_processing(self) -> int:
        """Count currently processing jobs."""
        return await self._count_by_status(OCRJobStatus.processing)

    async def cleanup_expired(self) -> int:
        """
        Delete expired jobs and their files.

        A job is expired when its ``expires_at`` is earlier than the current
        UTC time, regardless of its status. A file that cannot be deleted is
        logged and does not prevent the job record from being removed.

        Returns:
            Number of jobs deleted
        """
        await self.initialize()
        now = datetime.utcnow()
        deleted = 0
        async with aiosqlite.connect(str(self.db_path)) as db:
            await db.execute(f"PRAGMA busy_timeout={SQLITE_BUSY_TIMEOUT_MS}")
            db.row_factory = aiosqlite.Row

            # Get expired jobs
            async with db.execute('''
                SELECT id, file_path FROM ocr_jobs
                WHERE expires_at < ?
            ''', (now.isoformat(),)) as cursor:
                rows = await cursor.fetchall()

            for row in rows:
                # Delete file
                file_path = Path(row['file_path'])
                if file_path.exists():
                    try:
                        file_path.unlink()
                    except Exception as e:
                        logger.warning(f"[OCRJobQueue] Failed to delete file {file_path}: {e}")
                # Delete job record
                await db.execute('DELETE FROM ocr_jobs WHERE id = ?', (row['id'],))
                deleted += 1
            await db.commit()

        if deleted > 0:
            logger.info(f"[OCRJobQueue] Cleaned up {deleted} expired job(s)")
        return deleted

    async def cleanup_job_file(self, job_id: str) -> bool:
        """
        Delete the file associated with a job.

        Called after processing to free disk space. The job record itself is
        left untouched.

        Args:
            job_id: Job UUID

        Returns:
            True if file deleted; False if the job or file does not exist or
            the deletion failed
        """
        job = await self.get_job(job_id)
        if job:
            file_path = Path(job.file_path)
            if file_path.exists():
                try:
                    file_path.unlink()
                    return True
                except Exception as e:
                    logger.warning(f"[OCRJobQueue] Failed to delete file {file_path}: {e}")
        return False

    async def get_queue_stats(self) -> Dict[str, Any]:
        """
        Get queue statistics.

        Statuses other than the four listed below (e.g. cancelled) are not
        reported.

        Returns:
            Dict with pending, processing, completed, failed counts and
            ``average_time_seconds``
        """
        await self.initialize()
        stats = {
            "pending": 0,
            "processing": 0,
            "completed": 0,
            "failed": 0,
            "average_time_seconds": 0.0,
        }
        async with aiosqlite.connect(str(self.db_path)) as db:
            await db.execute(f"PRAGMA busy_timeout={SQLITE_BUSY_TIMEOUT_MS}")
            async with db.execute('''
                SELECT status, COUNT(*) as count
                FROM ocr_jobs
                GROUP BY status
            ''') as cursor:
                rows = await cursor.fetchall()
                for row in rows:
                    if row[0] in stats:
                        stats[row[0]] = row[1]
        stats["average_time_seconds"] = await self.get_average_processing_time()
        return stats

    def _row_to_job(self, row: aiosqlite.Row) -> OCRJob:
        """Convert database row to OCRJob.

        Timestamp columns are parsed from ISO-8601 text; values that cannot
        be parsed become ``None``. Columns added by later migrations
        (``ocr_time_ms``, ``batch_id``, ``file_hash``) are optional in *row*.

        Raises:
            ValueError: If the stored status is not a valid OCRJobStatus.
        """
        def parse_datetime(val):
            if val:
                try:
                    return datetime.fromisoformat(val)
                except (ValueError, TypeError):
                    return None
            return None

        columns = row.keys()
        return OCRJob(
            id=row['id'],
            status=OCRJobStatus(row['status']),
            file_path=row['file_path'],
            mime_type=row['mime_type'],
            engine=row['engine'] or 'doctr_plus',
            created_at=parse_datetime(row['created_at']),
            started_at=parse_datetime(row['started_at']),
            completed_at=parse_datetime(row['completed_at']),
            result_json=row['result_json'],
            error_message=row['error_message'],
            processing_time_ms=row['processing_time_ms'],
            ocr_time_ms=row['ocr_time_ms'] if 'ocr_time_ms' in columns else None,
            created_by=row['created_by'],
            original_filename=row['original_filename'],
            expires_at=parse_datetime(row['expires_at']),
            batch_id=row['batch_id'] if 'batch_id' in columns else None,
            file_hash=row['file_hash'] if 'file_hash' in columns else None,
        )
# Singleton instance
# Shared module-level queue built with the default database and files paths.
# Construction performs no I/O; the database and directories are created on
# the first call that awaits initialize().
job_queue = OCRJobQueue()