feat(ocr): Implement persistent worker pool with SQLite job queue
Major OCR infrastructure improvements: - Add persistent SQLite-based job queue for OCR tasks - Implement worker pool with process isolation and auto-restart - Add OCR engine selector dropdown (Tesseract/PaddleOCR) in upload zone - Optimize Tesseract preprocessing based on benchmark results (8x faster) - Add recognize_cif_optimized() with multi-strategy CIF extraction - Add Romanian CIF checksum validation - Increase Telegram long polling timeout from 10s to 30s Squashed commits: - feat(ocr): Implement persistent worker pool with SQLite job queue - feat(ocr): Add OCR engine selector dropdown to upload zone - perf(telegram): Increase long polling timeout from 10s to 30s - perf(ocr): Optimize Tesseract preprocessing based on benchmark results 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
342
backend/modules/data_entry/services/ocr/job_worker.py
Normal file
342
backend/modules/data_entry/services/ocr/job_worker.py
Normal file
@@ -0,0 +1,342 @@
|
||||
"""
|
||||
OCR Job Worker - Background Task for Queue Processing
|
||||
|
||||
Runs as an asyncio background task in FastAPI.
|
||||
Continuously polls the job queue and processes OCR requests.
|
||||
|
||||
Architecture:
|
||||
FastAPI startup
|
||||
↓
|
||||
start_job_worker()
|
||||
↓
|
||||
asyncio.create_task(_job_worker_loop())
|
||||
↓
|
||||
while True:
|
||||
job = job_queue.get_next_pending()
|
||||
if job:
|
||||
result = ocr_worker_pool.submit_task(...)
|
||||
job_queue.update_status(...)
|
||||
await asyncio.sleep(0.5)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from .job_queue import job_queue, OCRJobStatus, OCRJob
|
||||
from .ocr_worker_pool import ocr_worker_pool
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Global task reference
|
||||
_job_worker_task: Optional[asyncio.Task] = None
|
||||
_cleanup_task: Optional[asyncio.Task] = None
|
||||
_shutdown_event: Optional[asyncio.Event] = None
|
||||
|
||||
# Configuration
|
||||
POLL_INTERVAL_SECONDS = 0.5 # How often to check for new jobs
|
||||
CLEANUP_INTERVAL_SECONDS = 3600 # Clean expired jobs every hour
|
||||
OCR_TIMEOUT_SECONDS = 120 # Max time for OCR processing
|
||||
|
||||
|
||||
async def _job_worker_loop() -> None:
|
||||
"""
|
||||
Main worker loop - processes jobs from queue.
|
||||
|
||||
Runs continuously until shutdown. Polls queue every 0.5s
|
||||
and submits jobs to worker pool for processing.
|
||||
"""
|
||||
global _shutdown_event
|
||||
|
||||
logger.info("[JobWorker] Starting worker loop...")
|
||||
_shutdown_event = asyncio.Event()
|
||||
|
||||
consecutive_errors = 0
|
||||
max_consecutive_errors = 5
|
||||
|
||||
while not _shutdown_event.is_set():
|
||||
try:
|
||||
# Get next pending job
|
||||
job = await job_queue.get_next_pending()
|
||||
|
||||
if job:
|
||||
consecutive_errors = 0 # Reset error counter on success
|
||||
await _process_job(job)
|
||||
else:
|
||||
# No jobs - wait before polling again
|
||||
try:
|
||||
await asyncio.wait_for(
|
||||
_shutdown_event.wait(),
|
||||
timeout=POLL_INTERVAL_SECONDS
|
||||
)
|
||||
if _shutdown_event.is_set():
|
||||
break
|
||||
except asyncio.TimeoutError:
|
||||
pass # Normal timeout, continue loop
|
||||
|
||||
except asyncio.CancelledError:
|
||||
logger.info("[JobWorker] Worker loop cancelled")
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
consecutive_errors += 1
|
||||
logger.error(f"[JobWorker] Error in worker loop ({consecutive_errors}/{max_consecutive_errors}): {e}")
|
||||
|
||||
if consecutive_errors >= max_consecutive_errors:
|
||||
logger.error("[JobWorker] Too many consecutive errors, stopping worker")
|
||||
break
|
||||
|
||||
# Backoff on errors
|
||||
await asyncio.sleep(min(consecutive_errors * 2, 30))
|
||||
|
||||
logger.info("[JobWorker] Worker loop stopped")
|
||||
|
||||
|
||||
async def _process_job(job: OCRJob) -> None:
|
||||
"""
|
||||
Process a single OCR job.
|
||||
|
||||
Reads file, submits to worker pool, updates job status.
|
||||
|
||||
Args:
|
||||
job: OCRJob to process
|
||||
"""
|
||||
logger.info(f"[JobWorker] Processing job {job.id}: engine={job.engine}, file={Path(job.file_path).name}")
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
# Mark as processing
|
||||
await job_queue.update_status(job.id, OCRJobStatus.processing)
|
||||
|
||||
# Read file bytes
|
||||
file_path = Path(job.file_path)
|
||||
if not file_path.exists():
|
||||
raise FileNotFoundError(f"File not found: {file_path}")
|
||||
|
||||
with open(file_path, 'rb') as f:
|
||||
file_bytes = f.read()
|
||||
|
||||
# Submit to worker pool
|
||||
result = await ocr_worker_pool.submit_task(
|
||||
image_bytes=file_bytes,
|
||||
engine=job.engine,
|
||||
preprocessing="auto",
|
||||
timeout=OCR_TIMEOUT_SECONDS
|
||||
)
|
||||
|
||||
elapsed_ms = int((time.time() - start_time) * 1000)
|
||||
|
||||
if result.get("success"):
|
||||
# Job completed successfully
|
||||
extraction = result.get("extraction", {})
|
||||
|
||||
await job_queue.update_status(
|
||||
job_id=job.id,
|
||||
status=OCRJobStatus.completed,
|
||||
result=extraction,
|
||||
processing_time_ms=elapsed_ms
|
||||
)
|
||||
|
||||
logger.info(f"[JobWorker] Job {job.id} completed in {elapsed_ms}ms")
|
||||
|
||||
else:
|
||||
# Job failed
|
||||
error_msg = result.get("error", "Unknown error")
|
||||
|
||||
await job_queue.update_status(
|
||||
job_id=job.id,
|
||||
status=OCRJobStatus.failed,
|
||||
error=error_msg,
|
||||
processing_time_ms=elapsed_ms
|
||||
)
|
||||
|
||||
logger.warning(f"[JobWorker] Job {job.id} failed after {elapsed_ms}ms: {error_msg}")
|
||||
|
||||
except Exception as e:
|
||||
elapsed_ms = int((time.time() - start_time) * 1000)
|
||||
|
||||
logger.error(f"[JobWorker] Job {job.id} error after {elapsed_ms}ms: {e}")
|
||||
|
||||
await job_queue.update_status(
|
||||
job_id=job.id,
|
||||
status=OCRJobStatus.failed,
|
||||
error=str(e),
|
||||
processing_time_ms=elapsed_ms
|
||||
)
|
||||
|
||||
finally:
|
||||
# Cleanup file after processing
|
||||
try:
|
||||
await job_queue.cleanup_job_file(job.id)
|
||||
except Exception as e:
|
||||
logger.warning(f"[JobWorker] Failed to cleanup file for job {job.id}: {e}")
|
||||
|
||||
|
||||
async def _cleanup_loop() -> None:
|
||||
"""
|
||||
Periodic cleanup of expired jobs.
|
||||
|
||||
Runs every hour to delete jobs older than 24 hours.
|
||||
"""
|
||||
global _shutdown_event
|
||||
|
||||
logger.info("[JobWorker] Starting cleanup loop...")
|
||||
|
||||
while not _shutdown_event.is_set():
|
||||
try:
|
||||
# Wait for interval or shutdown
|
||||
try:
|
||||
await asyncio.wait_for(
|
||||
_shutdown_event.wait(),
|
||||
timeout=CLEANUP_INTERVAL_SECONDS
|
||||
)
|
||||
if _shutdown_event.is_set():
|
||||
break
|
||||
except asyncio.TimeoutError:
|
||||
pass # Normal timeout, do cleanup
|
||||
|
||||
# Run cleanup
|
||||
deleted = await job_queue.cleanup_expired()
|
||||
if deleted > 0:
|
||||
logger.info(f"[JobWorker] Cleanup: deleted {deleted} expired jobs")
|
||||
|
||||
except asyncio.CancelledError:
|
||||
logger.info("[JobWorker] Cleanup loop cancelled")
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[JobWorker] Cleanup error: {e}")
|
||||
await asyncio.sleep(60) # Retry after 1 minute
|
||||
|
||||
logger.info("[JobWorker] Cleanup loop stopped")
|
||||
|
||||
|
||||
async def start_job_worker() -> bool:
|
||||
"""
|
||||
Start the job worker background task.
|
||||
|
||||
Called at FastAPI startup to begin processing queue.
|
||||
|
||||
Returns:
|
||||
True if started successfully
|
||||
"""
|
||||
global _job_worker_task, _cleanup_task, _shutdown_event
|
||||
|
||||
if _job_worker_task is not None and not _job_worker_task.done():
|
||||
logger.warning("[JobWorker] Already running")
|
||||
return True
|
||||
|
||||
try:
|
||||
# Initialize job queue
|
||||
await job_queue.initialize()
|
||||
|
||||
# Initialize worker pool
|
||||
if not ocr_worker_pool.initialize():
|
||||
logger.error("[JobWorker] Failed to initialize worker pool")
|
||||
return False
|
||||
|
||||
# Pre-warm worker pool in BACKGROUND (don't block startup)
|
||||
# First OCR request may be slower if prewarm isn't done yet
|
||||
async def _background_prewarm():
|
||||
logger.info("[JobWorker] Pre-warming OCR worker pool (background)...")
|
||||
warmup_success = await ocr_worker_pool.prewarm(timeout=90.0)
|
||||
if warmup_success:
|
||||
logger.info("[JobWorker] ✅ OCR worker pool pre-warmed successfully")
|
||||
else:
|
||||
logger.warning("[JobWorker] ⚠️ Worker pool pre-warm failed, first request will be slower")
|
||||
|
||||
asyncio.create_task(_background_prewarm())
|
||||
|
||||
# Start worker loop
|
||||
_shutdown_event = asyncio.Event()
|
||||
_job_worker_task = asyncio.create_task(_job_worker_loop())
|
||||
|
||||
# Start cleanup loop
|
||||
_cleanup_task = asyncio.create_task(_cleanup_loop())
|
||||
|
||||
logger.info("[JobWorker] Started successfully")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[JobWorker] Failed to start: {e}")
|
||||
return False
|
||||
|
||||
|
||||
async def stop_job_worker() -> None:
|
||||
"""
|
||||
Stop the job worker background task.
|
||||
|
||||
Called at FastAPI shutdown to gracefully stop processing.
|
||||
"""
|
||||
global _job_worker_task, _cleanup_task, _shutdown_event
|
||||
|
||||
logger.info("[JobWorker] Stopping...")
|
||||
|
||||
# Signal shutdown
|
||||
if _shutdown_event:
|
||||
_shutdown_event.set()
|
||||
|
||||
# Cancel worker task
|
||||
if _job_worker_task and not _job_worker_task.done():
|
||||
_job_worker_task.cancel()
|
||||
try:
|
||||
await _job_worker_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
# Cancel cleanup task
|
||||
if _cleanup_task and not _cleanup_task.done():
|
||||
_cleanup_task.cancel()
|
||||
try:
|
||||
await _cleanup_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
# Shutdown worker pool
|
||||
ocr_worker_pool.shutdown(wait=True)
|
||||
|
||||
_job_worker_task = None
|
||||
_cleanup_task = None
|
||||
_shutdown_event = None
|
||||
|
||||
logger.info("[JobWorker] Stopped")
|
||||
|
||||
|
||||
def is_running() -> bool:
|
||||
"""Check if job worker is running."""
|
||||
return _job_worker_task is not None and not _job_worker_task.done()
|
||||
|
||||
|
||||
def estimate_wait_time(queue_position: int) -> int:
|
||||
"""
|
||||
Estimate wait time for a job in queue.
|
||||
|
||||
Args:
|
||||
queue_position: Position in queue (1 = next)
|
||||
|
||||
Returns:
|
||||
Estimated wait time in seconds
|
||||
"""
|
||||
if queue_position <= 0:
|
||||
return 0
|
||||
|
||||
# Get average processing time (synchronous fallback)
|
||||
# Default ~7 seconds per job if no data
|
||||
avg_time = 7.0
|
||||
|
||||
try:
|
||||
# Try to get from queue stats
|
||||
import asyncio
|
||||
loop = asyncio.get_event_loop()
|
||||
if loop.is_running():
|
||||
# Can't use sync call in async context, use default
|
||||
pass
|
||||
else:
|
||||
avg_time = loop.run_until_complete(job_queue.get_average_processing_time())
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Estimate: position * average_time
|
||||
return int(queue_position * avg_time)
|
||||
Reference in New Issue
Block a user