feat(ocr): Implement persistent worker pool with SQLite job queue

Major OCR infrastructure improvements:
- Add persistent SQLite-based job queue for OCR tasks
- Implement worker pool with process isolation and auto-restart
- Add OCR engine selector dropdown (Tesseract/PaddleOCR) in upload zone
- Optimize Tesseract preprocessing based on benchmark results (8x faster)
- Add recognize_cif_optimized() with multi-strategy CIF extraction
- Add Romanian CIF checksum validation
- Increase Telegram long polling timeout from 10s to 30s

Squashed commits:
- feat(ocr): Implement persistent worker pool with SQLite job queue
- feat(ocr): Add OCR engine selector dropdown to upload zone
- perf(telegram): Increase long polling timeout from 10s to 30s
- perf(ocr): Optimize Tesseract preprocessing based on benchmark results

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-31 12:32:12 +02:00
parent 00f826f7ed
commit 74f7aefc26
23 changed files with 3616 additions and 209 deletions

View File

@@ -48,6 +48,7 @@ logger = logging.getLogger(__name__)
# Global variables for background tasks
telegram_bot_task = None
ocr_job_worker_running = False
# ============================================================================
@@ -122,15 +123,32 @@ async def init_telegram_db():
raise
def init_paddle_ocr_background():
"""Initialize PaddleOCR in background thread (takes 15-20s)."""
async def init_ocr_job_worker():
"""Initialize OCR job worker with persistent PaddleOCR.
This replaces the old background-thread approach. The new initialization:
- Starts ProcessPoolExecutor with persistent worker
- Pre-warms PaddleOCR (loads once, reuses for all requests)
- Starts job queue background task
"""
global ocr_job_worker_running
logger.info("[OCR] Initializing OCR job worker...")
try:
logger.info("[DATA-ENTRY] Pre-loading OCR engine (background)...")
from backend.modules.data_entry.services.ocr_service import ocr_service
ocr_service.ocr_engine._init_paddle_lazy()
logger.info("[DATA-ENTRY] ✅ OCR engine ready")
from backend.modules.data_entry.services.ocr.job_worker import start_job_worker, is_running
success = await start_job_worker()
ocr_job_worker_running = is_running()
if success:
logger.info("[OCR] ✅ Job worker started (PaddleOCR persistent)")
else:
logger.warning("[OCR] ⚠️ Job worker failed to start, falling back to sync mode")
except Exception as e:
logger.warning(f"[DATA-ENTRY] ⚠️ OCR engine pre-load failed: {e}")
logger.warning(f"[OCR] ⚠️ OCR job worker init failed: {e}")
logger.warning("[OCR] Continuing with sync OCR mode")
ocr_job_worker_running = False
async def run_telegram_bot():
@@ -178,7 +196,11 @@ async def run_telegram_bot():
# Initialize and start
await application.initialize()
await application.start()
await application.updater.start_polling(drop_pending_updates=True)
await application.updater.start_polling(
drop_pending_updates=True,
poll_interval=0, # No delay between polls
timeout=30 # Long poll timeout 30 seconds (reduces requests from ~6/min to ~2/min)
)
bot_info = await application.bot.get_me()
logger.info(f"[TELEGRAM] ✅ Bot running: @{bot_info.username}")
@@ -236,9 +258,8 @@ async def startup_event():
init_telegram_db(),
)
# Step 3: Start PaddleOCR initialization in background thread
import threading
threading.Thread(target=init_paddle_ocr_background, daemon=True).start()
# Step 3: Initialize OCR job worker (with persistent PaddleOCR)
await init_ocr_job_worker()
# Step 4: Start Telegram bot as background task
if settings.telegram_bot_token:
@@ -260,13 +281,24 @@ async def startup_event():
@app.on_event("shutdown")
async def shutdown_event():
"""Application shutdown - Cleanup resources."""
global telegram_bot_task
global telegram_bot_task, ocr_job_worker_running
logger.info("=" * 80)
logger.info("[SHUTDOWN] Stopping ROA2WEB Unified Backend...")
logger.info("=" * 80)
try:
# Stop OCR job worker
if ocr_job_worker_running:
logger.info("[SHUTDOWN] Stopping OCR job worker...")
try:
from backend.modules.data_entry.services.ocr.job_worker import stop_job_worker
await stop_job_worker()
ocr_job_worker_running = False
logger.info("[SHUTDOWN] OCR job worker stopped")
except Exception as e:
logger.error(f"[SHUTDOWN] OCR worker error: {e}")
# Stop Telegram bot
if telegram_bot_task and not telegram_bot_task.done():
logger.info("[SHUTDOWN] Stopping Telegram bot...")
@@ -409,6 +441,26 @@ async def health_check():
else:
health_status["modules"]["telegram_bot"] = "disabled"
# Check OCR job worker
global ocr_job_worker_running
try:
from backend.modules.data_entry.services.ocr.job_worker import is_running
from backend.modules.data_entry.services.ocr.job_queue import job_queue
if is_running():
# Get queue stats
stats = await job_queue.get_queue_stats()
health_status["modules"]["ocr_worker"] = {
"status": "running",
"pending_jobs": stats.get("pending", 0),
"processing_jobs": stats.get("processing", 0),
"avg_time_seconds": stats.get("average_time_seconds", 0)
}
else:
health_status["modules"]["ocr_worker"] = "stopped"
except Exception as e:
health_status["modules"]["ocr_worker"] = f"error: {str(e)}"
return health_status