Files
roa2web-service-auto/backend/modules/data_entry/services/ocr/job_worker.py
Marius Mutu 74f7aefc26 feat(ocr): Implement persistent worker pool with SQLite job queue
Major OCR infrastructure improvements:
- Add persistent SQLite-based job queue for OCR tasks
- Implement worker pool with process isolation and auto-restart
- Add OCR engine selector dropdown (Tesseract/PaddleOCR) in upload zone
- Optimize Tesseract preprocessing based on benchmark results (8x faster)
- Add recognize_cif_optimized() with multi-strategy CIF extraction
- Add Romanian CIF checksum validation
- Increase Telegram long polling timeout from 10s to 30s

Squashed commits:
- feat(ocr): Implement persistent worker pool with SQLite job queue
- feat(ocr): Add OCR engine selector dropdown to upload zone
- perf(telegram): Increase long polling timeout from 10s to 30s
- perf(ocr): Optimize Tesseract preprocessing based on benchmark results

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 12:32:12 +02:00

343 lines
9.9 KiB
Python

"""
OCR Job Worker - Background Task for Queue Processing
Runs as an asyncio background task in FastAPI.
Continuously polls the job queue and processes OCR requests.
Architecture:
FastAPI startup
start_job_worker()
asyncio.create_task(_job_worker_loop())
while True:
job = job_queue.get_next_pending()
if job:
result = ocr_worker_pool.submit_task(...)
job_queue.update_status(...)
await asyncio.sleep(0.5)
"""
import asyncio
import logging
import time
from pathlib import Path
from typing import Optional
from .job_queue import job_queue, OCRJobStatus, OCRJob
from .ocr_worker_pool import ocr_worker_pool
logger = logging.getLogger(__name__)
# Global task reference
_job_worker_task: Optional[asyncio.Task] = None
_cleanup_task: Optional[asyncio.Task] = None
_shutdown_event: Optional[asyncio.Event] = None
# Configuration
POLL_INTERVAL_SECONDS = 0.5 # How often to check for new jobs
CLEANUP_INTERVAL_SECONDS = 3600 # Clean expired jobs every hour
OCR_TIMEOUT_SECONDS = 120 # Max time for OCR processing
async def _job_worker_loop() -> None:
"""
Main worker loop - processes jobs from queue.
Runs continuously until shutdown. Polls queue every 0.5s
and submits jobs to worker pool for processing.
"""
global _shutdown_event
logger.info("[JobWorker] Starting worker loop...")
_shutdown_event = asyncio.Event()
consecutive_errors = 0
max_consecutive_errors = 5
while not _shutdown_event.is_set():
try:
# Get next pending job
job = await job_queue.get_next_pending()
if job:
consecutive_errors = 0 # Reset error counter on success
await _process_job(job)
else:
# No jobs - wait before polling again
try:
await asyncio.wait_for(
_shutdown_event.wait(),
timeout=POLL_INTERVAL_SECONDS
)
if _shutdown_event.is_set():
break
except asyncio.TimeoutError:
pass # Normal timeout, continue loop
except asyncio.CancelledError:
logger.info("[JobWorker] Worker loop cancelled")
break
except Exception as e:
consecutive_errors += 1
logger.error(f"[JobWorker] Error in worker loop ({consecutive_errors}/{max_consecutive_errors}): {e}")
if consecutive_errors >= max_consecutive_errors:
logger.error("[JobWorker] Too many consecutive errors, stopping worker")
break
# Backoff on errors
await asyncio.sleep(min(consecutive_errors * 2, 30))
logger.info("[JobWorker] Worker loop stopped")
async def _process_job(job: OCRJob) -> None:
"""
Process a single OCR job.
Reads file, submits to worker pool, updates job status.
Args:
job: OCRJob to process
"""
logger.info(f"[JobWorker] Processing job {job.id}: engine={job.engine}, file={Path(job.file_path).name}")
start_time = time.time()
try:
# Mark as processing
await job_queue.update_status(job.id, OCRJobStatus.processing)
# Read file bytes
file_path = Path(job.file_path)
if not file_path.exists():
raise FileNotFoundError(f"File not found: {file_path}")
with open(file_path, 'rb') as f:
file_bytes = f.read()
# Submit to worker pool
result = await ocr_worker_pool.submit_task(
image_bytes=file_bytes,
engine=job.engine,
preprocessing="auto",
timeout=OCR_TIMEOUT_SECONDS
)
elapsed_ms = int((time.time() - start_time) * 1000)
if result.get("success"):
# Job completed successfully
extraction = result.get("extraction", {})
await job_queue.update_status(
job_id=job.id,
status=OCRJobStatus.completed,
result=extraction,
processing_time_ms=elapsed_ms
)
logger.info(f"[JobWorker] Job {job.id} completed in {elapsed_ms}ms")
else:
# Job failed
error_msg = result.get("error", "Unknown error")
await job_queue.update_status(
job_id=job.id,
status=OCRJobStatus.failed,
error=error_msg,
processing_time_ms=elapsed_ms
)
logger.warning(f"[JobWorker] Job {job.id} failed after {elapsed_ms}ms: {error_msg}")
except Exception as e:
elapsed_ms = int((time.time() - start_time) * 1000)
logger.error(f"[JobWorker] Job {job.id} error after {elapsed_ms}ms: {e}")
await job_queue.update_status(
job_id=job.id,
status=OCRJobStatus.failed,
error=str(e),
processing_time_ms=elapsed_ms
)
finally:
# Cleanup file after processing
try:
await job_queue.cleanup_job_file(job.id)
except Exception as e:
logger.warning(f"[JobWorker] Failed to cleanup file for job {job.id}: {e}")
async def _cleanup_loop() -> None:
"""
Periodic cleanup of expired jobs.
Runs every hour to delete jobs older than 24 hours.
"""
global _shutdown_event
logger.info("[JobWorker] Starting cleanup loop...")
while not _shutdown_event.is_set():
try:
# Wait for interval or shutdown
try:
await asyncio.wait_for(
_shutdown_event.wait(),
timeout=CLEANUP_INTERVAL_SECONDS
)
if _shutdown_event.is_set():
break
except asyncio.TimeoutError:
pass # Normal timeout, do cleanup
# Run cleanup
deleted = await job_queue.cleanup_expired()
if deleted > 0:
logger.info(f"[JobWorker] Cleanup: deleted {deleted} expired jobs")
except asyncio.CancelledError:
logger.info("[JobWorker] Cleanup loop cancelled")
break
except Exception as e:
logger.error(f"[JobWorker] Cleanup error: {e}")
await asyncio.sleep(60) # Retry after 1 minute
logger.info("[JobWorker] Cleanup loop stopped")
async def start_job_worker() -> bool:
"""
Start the job worker background task.
Called at FastAPI startup to begin processing queue.
Returns:
True if started successfully
"""
global _job_worker_task, _cleanup_task, _shutdown_event
if _job_worker_task is not None and not _job_worker_task.done():
logger.warning("[JobWorker] Already running")
return True
try:
# Initialize job queue
await job_queue.initialize()
# Initialize worker pool
if not ocr_worker_pool.initialize():
logger.error("[JobWorker] Failed to initialize worker pool")
return False
# Pre-warm worker pool in BACKGROUND (don't block startup)
# First OCR request may be slower if prewarm isn't done yet
async def _background_prewarm():
logger.info("[JobWorker] Pre-warming OCR worker pool (background)...")
warmup_success = await ocr_worker_pool.prewarm(timeout=90.0)
if warmup_success:
logger.info("[JobWorker] ✅ OCR worker pool pre-warmed successfully")
else:
logger.warning("[JobWorker] ⚠️ Worker pool pre-warm failed, first request will be slower")
asyncio.create_task(_background_prewarm())
# Start worker loop
_shutdown_event = asyncio.Event()
_job_worker_task = asyncio.create_task(_job_worker_loop())
# Start cleanup loop
_cleanup_task = asyncio.create_task(_cleanup_loop())
logger.info("[JobWorker] Started successfully")
return True
except Exception as e:
logger.error(f"[JobWorker] Failed to start: {e}")
return False
async def stop_job_worker() -> None:
"""
Stop the job worker background task.
Called at FastAPI shutdown to gracefully stop processing.
"""
global _job_worker_task, _cleanup_task, _shutdown_event
logger.info("[JobWorker] Stopping...")
# Signal shutdown
if _shutdown_event:
_shutdown_event.set()
# Cancel worker task
if _job_worker_task and not _job_worker_task.done():
_job_worker_task.cancel()
try:
await _job_worker_task
except asyncio.CancelledError:
pass
# Cancel cleanup task
if _cleanup_task and not _cleanup_task.done():
_cleanup_task.cancel()
try:
await _cleanup_task
except asyncio.CancelledError:
pass
# Shutdown worker pool
ocr_worker_pool.shutdown(wait=True)
_job_worker_task = None
_cleanup_task = None
_shutdown_event = None
logger.info("[JobWorker] Stopped")
def is_running() -> bool:
"""Check if job worker is running."""
return _job_worker_task is not None and not _job_worker_task.done()
def estimate_wait_time(queue_position: int) -> int:
"""
Estimate wait time for a job in queue.
Args:
queue_position: Position in queue (1 = next)
Returns:
Estimated wait time in seconds
"""
if queue_position <= 0:
return 0
# Get average processing time (synchronous fallback)
# Default ~7 seconds per job if no data
avg_time = 7.0
try:
# Try to get from queue stats
import asyncio
loop = asyncio.get_event_loop()
if loop.is_running():
# Can't use sync call in async context, use default
pass
else:
avg_time = loop.run_until_complete(job_queue.get_average_processing_time())
except Exception:
pass
# Estimate: position * average_time
return int(queue_position * avg_time)