Files
roa2web-service-auto/backend/modules/data_entry/services/ocr/job_worker.py
Claude Agent 1bb3a382de feat: Add PWA support and consolidate CSS design system
- Add PWA manifest, icons (192x192, 512x512), and service worker
- Register service worker in index.html with Apple mobile web app support
- Consolidate CSS variables and design tokens documentation
- Update PrimeVue overrides for consistent theming
- Refactor data-entry components to use shared CSS patterns
- Add frontend-style-auditor agent for style consistency checks
- Minor OCR validation and job worker improvements
- Update start-prod.sh configuration

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-06 14:00:21 +00:00

546 lines
18 KiB
Python

"""
OCR Job Worker - Background Task for Queue Processing
Runs as an asyncio background task in FastAPI.
Continuously polls the job queue and processes OCR requests IN PARALLEL.
Architecture:
FastAPI startup
start_job_worker()
asyncio.create_task(_job_worker_loop())
while True:
# Process up to OCR_WORKERS jobs concurrently
jobs = get_pending_jobs(limit=available_slots)
for job in jobs:
asyncio.create_task(_process_job(job))
await asyncio.sleep(0.1)
"""
import asyncio
import logging
import os
import time
from pathlib import Path
from typing import Optional, Set
from .job_queue import job_queue, OCRJobStatus, OCRJob
from .ocr_worker_pool import ocr_worker_pool
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Module state (assigned by the functions in this module)
# ---------------------------------------------------------------------------
# Task running _job_worker_loop(); None until started and again after stop.
_job_worker_task: Optional[asyncio.Task] = None
# Task running _cleanup_loop(); None until started and again after stop.
_cleanup_task: Optional[asyncio.Task] = None
# Event that stop_job_worker() sets to ask the background loops to exit.
_shutdown_event: Optional[asyncio.Event] = None
# Job tasks launched by _job_worker_loop() that it has not yet reaped.
_active_tasks: Set[asyncio.Task] = set()
# Created in _job_worker_loop(); acquired around each job in
# _process_job_with_semaphore() to limit concurrent jobs.
_concurrency_semaphore: Optional[asyncio.Semaphore] = None

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Idle wait between queue polls, and the wait used while all slots are busy.
POLL_INTERVAL_SECONDS = 0.1
# How long _cleanup_loop() waits between cleanup_expired() runs (one hour).
CLEANUP_INTERVAL_SECONDS = 3600
# Passed as `timeout` to ocr_worker_pool.submit_task() for every job.
OCR_TIMEOUT_SECONDS = 120
async def _job_worker_loop() -> None:
    """
    Main worker loop - processes jobs from queue IN PARALLEL.

    Polls ``job_queue.get_next_pending()`` and launches every returned job as
    a background task, keeping at most ``OCR_WORKERS`` (environment variable,
    default 2) tasks in flight. The loop ends when the shutdown event is set,
    when this task is cancelled, or after 10 consecutive loop errors. In all
    three cases it waits for the in-flight job tasks before returning.
    """
    global _shutdown_event, _active_tasks, _concurrency_semaphore

    # Max concurrent jobs from env (presumably the same variable the OCR
    # worker pool uses for its size - confirm in ocr_worker_pool).
    max_concurrent = int(os.getenv('OCR_WORKERS', '2'))
    _concurrency_semaphore = asyncio.Semaphore(max_concurrent)
    _active_tasks = set()
    logger.info(f"[JobWorker] Starting PARALLEL worker loop (max_concurrent={max_concurrent})...")

    # Reuse the event the caller already installed (start_job_worker creates
    # one before launching this loop) so every loop and stop_job_worker()
    # share a single object; only create one when running standalone.
    if _shutdown_event is None:
        _shutdown_event = asyncio.Event()
    shutdown_event = _shutdown_event

    consecutive_errors = 0
    max_consecutive_errors = 10

    try:
        while not shutdown_event.is_set():
            try:
                # Reap finished job tasks and surface their failures.
                done_tasks = {t for t in _active_tasks if t.done()}
                for task in done_tasks:
                    _active_tasks.discard(task)
                    if task.cancelled():
                        # result() would raise CancelledError here, which must
                        # not be mistaken for cancellation of this loop.
                        logger.warning("[JobWorker] Task was cancelled")
                        continue
                    try:
                        task.result()
                    except Exception as e:
                        logger.error(f"[JobWorker] Task failed: {e}")

                # At capacity - wait for a slot to free up
                if max_concurrent - len(_active_tasks) <= 0:
                    await asyncio.sleep(POLL_INTERVAL_SECONDS)
                    continue

                job = await job_queue.get_next_pending()
                # Any successful poll (even an empty one) ends an error streak.
                consecutive_errors = 0

                if job:
                    # Launch job processing as background task
                    task = asyncio.create_task(_process_job_with_semaphore(job))
                    _active_tasks.add(task)
                    logger.debug(f"[JobWorker] Launched job {job.id} (active={len(_active_tasks)}/{max_concurrent})")
                    continue

                # No pending jobs - wait briefly, waking early on shutdown.
                # The while-condition re-checks the event afterwards.
                try:
                    await asyncio.wait_for(
                        shutdown_event.wait(),
                        timeout=POLL_INTERVAL_SECONDS
                    )
                except asyncio.TimeoutError:
                    pass

            except asyncio.CancelledError:
                # Handled once, by the outer handler below.
                raise
            except Exception as e:
                consecutive_errors += 1
                logger.error(f"[JobWorker] Error in worker loop ({consecutive_errors}/{max_consecutive_errors}): {e}")
                if consecutive_errors >= max_consecutive_errors:
                    logger.error("[JobWorker] Too many consecutive errors, stopping worker")
                    break
                # Linear back-off capped at 30s. A cancel arriving during this
                # sleep is caught by the outer handler, so the drain below
                # still runs.
                await asyncio.sleep(min(consecutive_errors * 2, 30))
    except asyncio.CancelledError:
        logger.info("[JobWorker] Worker loop cancelled")

    # Wait for active tasks to complete on shutdown
    if _active_tasks:
        logger.info(f"[JobWorker] Waiting for {len(_active_tasks)} active tasks to complete...")
        await asyncio.gather(*_active_tasks, return_exceptions=True)

    logger.info("[JobWorker] Worker loop stopped")
async def _process_job_with_semaphore(job: OCRJob) -> None:
    """
    Run ``_process_job`` for ``job`` while holding the concurrency semaphore.

    The semaphore is taken before processing starts and released when it
    ends, which bounds the number of jobs being processed at once by the
    semaphore's size.

    Args:
        job: OCRJob to process
    """
    slot = _concurrency_semaphore
    async with slot:
        await _process_job(job)
async def _process_job(job: OCRJob) -> None:
    """
    Process a single OCR job.

    Reads the job's file, submits it to the OCR worker pool, records the
    outcome on the job queue and saves metrics for analytics. Any exception
    raised while processing marks the job as failed. The job's file is
    cleaned up afterwards whatever the outcome.

    Args:
        job: OCRJob to process
    """
    logger.info(f"[JobWorker] Processing job {job.id}: engine={job.engine}, file={Path(job.file_path).name}")

    # Monotonic clock: the measured duration must not jump if the wall clock
    # is adjusted while the job runs.
    start_time = time.monotonic()
    file_size = 0
    file_type = "image/jpeg"

    try:
        # Note (from the original author): the job is already marked as
        # 'processing' by get_next_pending(), so no status update is needed.
        file_path = Path(job.file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Read in the default executor so a slow or large read does not stall
        # the event loop, and with it every other job running in parallel.
        loop = asyncio.get_running_loop()
        file_bytes = await loop.run_in_executor(None, file_path.read_bytes)
        file_size = len(file_bytes)

        # Use the job's mime_type when it has one, otherwise assume JPEG
        file_type = getattr(job, 'mime_type', 'image/jpeg') or 'image/jpeg'

        # Submit to worker pool
        result = await ocr_worker_pool.submit_task(
            image_bytes=file_bytes,
            engine=job.engine,
            preprocessing="auto",
            timeout=OCR_TIMEOUT_SECONDS
        )
        elapsed_ms = int((time.monotonic() - start_time) * 1000)

        if result.get("success"):
            # Job completed successfully
            extraction = result.get("extraction", {})
            # Include raw_texts for analysis (from all OCR engine passes)
            extraction['raw_texts'] = result.get("raw_texts", [])
            # OCR processing time as reported inside the extraction result
            ocr_time_ms = extraction.get('processing_time_ms', 0)

            # Debug: log suggested_payment_mode
            spm = extraction.get('suggested_payment_mode')
            logger.info(f"[JobWorker] Job {job.id} extraction has suggested_payment_mode={spm}")

            # Derive every metrics value BEFORE the job is marked completed.
            # If one of these raised after the status update, the handler
            # below would flip an already-completed job to failed.
            engine_used = extraction.get('ocr_engine', job.engine)
            overall_confidence = extraction.get('overall_confidence', 0.0)
            fields_extracted = _count_extracted_fields(extraction)
            needs_manual_review = extraction.get('needs_manual_review')
            # `or []` also covers keys that are present but explicitly None
            warnings_count = len(extraction.get('validation_warnings') or [])
            errors_count = len(extraction.get('validation_errors') or [])

            await job_queue.update_status(
                job_id=job.id,
                status=OCRJobStatus.completed,
                result=extraction,
                processing_time_ms=elapsed_ms,
                ocr_time_ms=ocr_time_ms
            )
            logger.info(f"[JobWorker] Job {job.id} completed in {elapsed_ms}ms (ocr: {ocr_time_ms}ms)")

            # Save metrics for successful job
            await _save_job_metrics(
                job_id=job.id,
                username=job.created_by or 'unknown',
                engine_requested=job.engine,
                engine_used=engine_used,
                processing_time_ms=elapsed_ms,
                file_size_bytes=file_size,
                file_type=file_type,
                original_filename=job.original_filename,
                success=True,
                overall_confidence=overall_confidence,
                fields_extracted=fields_extracted,
                needs_manual_review=needs_manual_review,
                validation_warnings_count=warnings_count,
                validation_errors_count=errors_count,
            )
        else:
            # Job failed
            error_msg = result.get("error", "Unknown error")
            await job_queue.update_status(
                job_id=job.id,
                status=OCRJobStatus.failed,
                error=error_msg,
                processing_time_ms=elapsed_ms
            )
            logger.warning(f"[JobWorker] Job {job.id} failed after {elapsed_ms}ms: {error_msg}")

            # Save metrics for failed job
            await _save_job_metrics(
                job_id=job.id,
                username=job.created_by or 'unknown',
                engine_requested=job.engine,
                engine_used=job.engine,
                processing_time_ms=elapsed_ms,
                file_size_bytes=file_size,
                file_type=file_type,
                original_filename=job.original_filename,
                success=False,
                error_message=error_msg,
            )

    except Exception as e:
        elapsed_ms = int((time.monotonic() - start_time) * 1000)
        logger.error(f"[JobWorker] Job {job.id} error after {elapsed_ms}ms: {e}")
        await job_queue.update_status(
            job_id=job.id,
            status=OCRJobStatus.failed,
            error=str(e),
            processing_time_ms=elapsed_ms
        )

        # Save metrics for error job
        await _save_job_metrics(
            job_id=job.id,
            username=job.created_by or 'unknown',
            engine_requested=job.engine,
            engine_used=job.engine,
            processing_time_ms=elapsed_ms,
            file_size_bytes=file_size,
            file_type=file_type,
            original_filename=job.original_filename,
            success=False,
            error_message=str(e),
        )

    finally:
        # Cleanup file after processing; a cleanup failure is only logged
        try:
            await job_queue.cleanup_job_file(job.id)
        except Exception as e:
            logger.warning(f"[JobWorker] Failed to cleanup file for job {job.id}: {e}")
async def _cleanup_loop() -> None:
    """
    Periodically purge expired jobs from the queue.

    Each cycle waits up to CLEANUP_INTERVAL_SECONDS for the shutdown event.
    If the wait times out, ``job_queue.cleanup_expired()`` is run; if the
    event fires, the loop ends. A failing cleanup is logged and retried
    after one minute.
    """
    logger.info("[JobWorker] Starting cleanup loop...")

    while not _shutdown_event.is_set():
        try:
            # Sleep for one interval, waking early if shutdown is requested
            try:
                await asyncio.wait_for(
                    _shutdown_event.wait(),
                    timeout=CLEANUP_INTERVAL_SECONDS
                )
            except asyncio.TimeoutError:
                # Interval elapsed without a shutdown request: run the cleanup
                pass
            else:
                if _shutdown_event.is_set():
                    break

            removed = await job_queue.cleanup_expired()
            if removed > 0:
                logger.info(f"[JobWorker] Cleanup: deleted {removed} expired jobs")

        except asyncio.CancelledError:
            logger.info("[JobWorker] Cleanup loop cancelled")
            break
        except Exception as exc:
            logger.error(f"[JobWorker] Cleanup error: {exc}")
            # Back off for a minute before the next attempt
            await asyncio.sleep(60)

    logger.info("[JobWorker] Cleanup loop stopped")
async def start_job_worker() -> bool:
    """
    Start the job worker background tasks.

    Initializes the job queue and the OCR worker pool, schedules a
    background pre-warm of the pool, then launches the worker loop and the
    cleanup loop as asyncio tasks. Calling it while the worker loop is
    already running is a no-op.

    Returns:
        True if the worker is running (newly started or already running),
        False if initialization failed.
    """
    # _prewarm_task is created here on first use; it exists only to hold a
    # strong reference to the pre-warm task (see below).
    global _job_worker_task, _cleanup_task, _shutdown_event, _prewarm_task

    if _job_worker_task is not None and not _job_worker_task.done():
        logger.warning("[JobWorker] Already running")
        return True

    try:
        # Initialize job queue
        await job_queue.initialize()

        # Initialize worker pool
        if not ocr_worker_pool.initialize():
            logger.error("[JobWorker] Failed to initialize worker pool")
            return False

        # Pre-warm worker pool in BACKGROUND (don't block startup)
        # First OCR request may be slower if prewarm isn't done yet
        async def _background_prewarm() -> None:
            logger.info("[JobWorker] Pre-warming OCR worker pool (background)...")
            try:
                warmup_success = await ocr_worker_pool.prewarm(timeout=90.0)
            except Exception as e:
                # Nobody awaits this task, so report the failure here instead
                # of leaving an unretrieved task exception behind.
                logger.warning(f"[JobWorker] Worker pool pre-warm raised an error: {e}")
                return
            if warmup_success:
                logger.info("[JobWorker] OCR worker pool pre-warmed successfully")
            else:
                logger.warning("[JobWorker] Worker pool pre-warm failed, first request will be slower")

        # The event loop keeps only weak references to tasks; keep a strong
        # one so the pre-warm cannot be garbage-collected mid-run.
        _prewarm_task = asyncio.create_task(_background_prewarm())

        # Start worker loop
        _shutdown_event = asyncio.Event()
        _job_worker_task = asyncio.create_task(_job_worker_loop())

        # Start cleanup loop
        _cleanup_task = asyncio.create_task(_cleanup_loop())

        logger.info("[JobWorker] Started successfully")
        return True

    except Exception as e:
        logger.error(f"[JobWorker] Failed to start: {e}")
        return False
async def stop_job_worker() -> None:
    """
    Shut the job worker down.

    Sets the shutdown event, cancels and awaits the worker task and then the
    cleanup task, shuts the OCR worker pool down (waiting for it), and
    finally clears the module-level task and event references.
    """
    global _job_worker_task, _cleanup_task, _shutdown_event

    async def _cancel_and_wait(task: Optional[asyncio.Task]) -> None:
        # Cancel a still-running task and wait until it has actually ended
        if task and not task.done():
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass

    logger.info("[JobWorker] Stopping...")

    # Ask the loops to stop
    if _shutdown_event:
        _shutdown_event.set()

    # Worker first, then cleanup
    await _cancel_and_wait(_job_worker_task)
    await _cancel_and_wait(_cleanup_task)

    # Shutdown worker pool
    ocr_worker_pool.shutdown(wait=True)

    _job_worker_task = None
    _cleanup_task = None
    _shutdown_event = None
    logger.info("[JobWorker] Stopped")
def is_running() -> bool:
    """Return True while a worker task exists and has not finished."""
    task = _job_worker_task
    if task is None:
        return False
    return not task.done()
def estimate_wait_time(queue_position: int) -> int:
    """
    Estimate wait time for a job in queue.

    The estimate is ``queue_position`` multiplied by the average processing
    time. The average is read from the job queue only when no event loop is
    running in the current thread; otherwise, or when the lookup fails or
    yields no usable number, a default of 7 seconds per job is used.

    Args:
        queue_position: Position in queue (1 = next)

    Returns:
        Estimated wait time in seconds (0 for positions <= 0)
    """
    if queue_position <= 0:
        return 0

    # Default ~7 seconds per job if no data
    avg_time = 7.0

    try:
        loop = asyncio.get_event_loop()
        # A running loop cannot be driven synchronously from inside itself,
        # so in async context the default is kept.
        if not loop.is_running():
            fetched = loop.run_until_complete(job_queue.get_average_processing_time())
            # Accept only a finite, non-negative number: None or any other
            # value would otherwise blow up in the multiplication below.
            if isinstance(fetched, (int, float)) and 0 <= fetched < float('inf'):
                avg_time = fetched
    except Exception as e:
        # Best effort: any failure simply keeps the default estimate
        logger.debug(f"[JobWorker] Using default average processing time: {e}")

    # Estimate: position * average_time
    return int(queue_position * avg_time)
# ============================================================================
# Metrics Helper Functions
# ============================================================================
async def _save_job_metrics(
    job_id: str,
    username: str,
    engine_requested: str,
    engine_used: str,
    processing_time_ms: int = 0,
    file_size_bytes: int = 0,
    file_type: str = "image/jpeg",
    original_filename: Optional[str] = None,
    success: bool = True,
    error_message: Optional[str] = None,
    overall_confidence: float = 0.0,
    fields_extracted: int = 0,
    needs_manual_review: Optional[bool] = None,
    validation_warnings_count: int = 0,
    validation_errors_count: int = 0,
) -> None:
    """
    Persist one OCR job's metrics through ``OCRMetricsCRUD.create``.

    All arguments are forwarded unchanged as keyword arguments. This is
    best effort: any exception (including a failed import of the database
    modules) is logged as a warning and not propagated to the caller.
    """
    record = {
        'job_id': job_id,
        'username': username,
        'engine_requested': engine_requested,
        'engine_used': engine_used,
        'processing_time_ms': processing_time_ms,
        'file_size_bytes': file_size_bytes,
        'file_type': file_type,
        'original_filename': original_filename,
        'success': success,
        'error_message': error_message,
        'overall_confidence': overall_confidence,
        'fields_extracted': fields_extracted,
        'needs_manual_review': needs_manual_review,
        'validation_warnings_count': validation_warnings_count,
        'validation_errors_count': validation_errors_count,
    }

    try:
        # Imported here rather than at module import time
        from backend.modules.data_entry.db.database import get_db_session
        from backend.modules.data_entry.db.crud.ocr_settings import OCRMetricsCRUD

        async with await get_db_session() as db:
            await OCRMetricsCRUD.create(session=db, **record)

        logger.debug(f"[JobWorker] Saved metrics for job {job_id}")
    except Exception as exc:
        # Metrics are nice-to-have: report the problem and carry on
        logger.warning(f"[JobWorker] Failed to save metrics for job {job_id}: {exc}")
def _count_extracted_fields(extraction: dict) -> int:
    """
    Return how many of the tracked receipt fields the OCR result filled in.

    A scalar field counts when its value is not None, not an empty string
    and not an empty list. ``tva_entries`` and ``payment_methods`` each add
    one when they hold at least one entry.
    """
    scalar_keys = (
        'receipt_number',
        'receipt_date',
        'amount',
        'partner_name',
        'cui',
        'tva_total',
        'address',
        'items_count',
    )
    collection_keys = ('tva_entries', 'payment_methods')

    def _is_filled(candidate) -> bool:
        # Zero and False are real values; only None, '' and [] mean "missing"
        return candidate is not None and candidate != '' and candidate != []

    total = sum(1 for key in scalar_keys if _is_filled(extraction.get(key)))

    for key in collection_keys:
        entries = extraction.get(key, [])
        if entries and len(entries) > 0:
            total += 1

    return total