feat(ocr): Implement persistent worker pool with SQLite job queue
Major OCR infrastructure improvements:
- Add persistent SQLite-based job queue for OCR tasks
- Implement worker pool with process isolation and auto-restart
- Add OCR engine selector dropdown (Tesseract/PaddleOCR) in upload zone
- Optimize Tesseract preprocessing based on benchmark results (8x faster)
- Add recognize_cif_optimized() with multi-strategy CIF extraction
- Add Romanian CIF checksum validation
- Increase Telegram long polling timeout from 10s to 30s

Squashed commits:
- feat(ocr): Implement persistent worker pool with SQLite job queue
- feat(ocr): Add OCR engine selector dropdown to upload zone
- perf(telegram): Increase long polling timeout from 10s to 30s
- perf(ocr): Optimize Tesseract preprocessing based on benchmark results

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -48,6 +48,7 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
# Global variables for background tasks
|
||||
telegram_bot_task = None
|
||||
ocr_job_worker_running = False
|
||||
|
||||
|
||||
# ============================================================================
|
||||
@@ -122,15 +123,32 @@ async def init_telegram_db():
|
||||
raise
|
||||
|
||||
|
||||
def init_paddle_ocr_background():
|
||||
"""Initialize PaddleOCR in background thread (takes 15-20s)."""
|
||||
async def init_ocr_job_worker():
|
||||
"""Initialize OCR job worker with persistent PaddleOCR.
|
||||
|
||||
This replaces the old background thread approach:
|
||||
- Starts ProcessPoolExecutor with persistent worker
|
||||
- Pre-warms PaddleOCR (loads once, reuses for all requests)
|
||||
- Starts job queue background task
|
||||
"""
|
||||
global ocr_job_worker_running
|
||||
|
||||
logger.info("[OCR] Initializing OCR job worker...")
|
||||
try:
|
||||
logger.info("[DATA-ENTRY] Pre-loading OCR engine (background)...")
|
||||
from backend.modules.data_entry.services.ocr_service import ocr_service
|
||||
ocr_service.ocr_engine._init_paddle_lazy()
|
||||
logger.info("[DATA-ENTRY] ✅ OCR engine ready")
|
||||
from backend.modules.data_entry.services.ocr.job_worker import start_job_worker, is_running
|
||||
|
||||
success = await start_job_worker()
|
||||
ocr_job_worker_running = is_running()
|
||||
|
||||
if success:
|
||||
logger.info("[OCR] ✅ Job worker started (PaddleOCR persistent)")
|
||||
else:
|
||||
logger.warning("[OCR] ⚠️ Job worker failed to start, falling back to sync mode")
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"[DATA-ENTRY] ⚠️ OCR engine pre-load failed: {e}")
|
||||
logger.warning(f"[OCR] ⚠️ OCR job worker init failed: {e}")
|
||||
logger.warning("[OCR] Continuing with sync OCR mode")
|
||||
ocr_job_worker_running = False
|
||||
|
||||
|
||||
async def run_telegram_bot():
|
||||
@@ -178,7 +196,11 @@ async def run_telegram_bot():
|
||||
# Initialize and start
|
||||
await application.initialize()
|
||||
await application.start()
|
||||
await application.updater.start_polling(drop_pending_updates=True)
|
||||
await application.updater.start_polling(
|
||||
drop_pending_updates=True,
|
||||
poll_interval=0, # No delay between polls
|
||||
timeout=30 # Long poll timeout 30 seconds (reduces requests from ~6/min to ~2/min)
|
||||
)
|
||||
|
||||
bot_info = await application.bot.get_me()
|
||||
logger.info(f"[TELEGRAM] ✅ Bot running: @{bot_info.username}")
|
||||
@@ -236,9 +258,8 @@ async def startup_event():
|
||||
init_telegram_db(),
|
||||
)
|
||||
|
||||
# Step 3: Start PaddleOCR initialization in background thread
|
||||
import threading
|
||||
threading.Thread(target=init_paddle_ocr_background, daemon=True).start()
|
||||
# Step 3: Initialize OCR job worker (with persistent PaddleOCR)
|
||||
await init_ocr_job_worker()
|
||||
|
||||
# Step 4: Start Telegram bot as background task
|
||||
if settings.telegram_bot_token:
|
||||
@@ -260,13 +281,24 @@ async def startup_event():
|
||||
@app.on_event("shutdown")
|
||||
async def shutdown_event():
|
||||
"""Application shutdown - Cleanup resources."""
|
||||
global telegram_bot_task
|
||||
global telegram_bot_task, ocr_job_worker_running
|
||||
|
||||
logger.info("=" * 80)
|
||||
logger.info("[SHUTDOWN] Stopping ROA2WEB Unified Backend...")
|
||||
logger.info("=" * 80)
|
||||
|
||||
try:
|
||||
# Stop OCR job worker
|
||||
if ocr_job_worker_running:
|
||||
logger.info("[SHUTDOWN] Stopping OCR job worker...")
|
||||
try:
|
||||
from backend.modules.data_entry.services.ocr.job_worker import stop_job_worker
|
||||
await stop_job_worker()
|
||||
ocr_job_worker_running = False
|
||||
logger.info("[SHUTDOWN] OCR job worker stopped")
|
||||
except Exception as e:
|
||||
logger.error(f"[SHUTDOWN] OCR worker error: {e}")
|
||||
|
||||
# Stop Telegram bot
|
||||
if telegram_bot_task and not telegram_bot_task.done():
|
||||
logger.info("[SHUTDOWN] Stopping Telegram bot...")
|
||||
@@ -409,6 +441,26 @@ async def health_check():
|
||||
else:
|
||||
health_status["modules"]["telegram_bot"] = "disabled"
|
||||
|
||||
# Check OCR job worker
|
||||
global ocr_job_worker_running
|
||||
try:
|
||||
from backend.modules.data_entry.services.ocr.job_worker import is_running
|
||||
from backend.modules.data_entry.services.ocr.job_queue import job_queue
|
||||
|
||||
if is_running():
|
||||
# Get queue stats
|
||||
stats = await job_queue.get_queue_stats()
|
||||
health_status["modules"]["ocr_worker"] = {
|
||||
"status": "running",
|
||||
"pending_jobs": stats.get("pending", 0),
|
||||
"processing_jobs": stats.get("processing", 0),
|
||||
"avg_time_seconds": stats.get("average_time_seconds", 0)
|
||||
}
|
||||
else:
|
||||
health_status["modules"]["ocr_worker"] = "stopped"
|
||||
except Exception as e:
|
||||
health_status["modules"]["ocr_worker"] = f"error: {str(e)}"
|
||||
|
||||
return health_status
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user