Files
roa2web-service-auto/backend/main.py
Marius Mutu 74f7aefc26 feat(ocr): Implement persistent worker pool with SQLite job queue
Major OCR infrastructure improvements:
- Add persistent SQLite-based job queue for OCR tasks
- Implement worker pool with process isolation and auto-restart
- Add OCR engine selector dropdown (Tesseract/PaddleOCR) in upload zone
- Optimize Tesseract preprocessing based on benchmark results (8x faster)
- Add recognize_cif_optimized() with multi-strategy CIF extraction
- Add Romanian CIF checksum validation
- Increase Telegram long polling timeout from 10s to 30s

Squashed commits:
- feat(ocr): Implement persistent worker pool with SQLite job queue
- feat(ocr): Add OCR engine selector dropdown to upload zone
- perf(telegram): Increase long polling timeout from 10s to 30s
- perf(ocr): Optimize Tesseract preprocessing based on benchmark results

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 12:32:12 +02:00

481 lines
18 KiB
Python

"""
ROA2WEB Unified Backend - Single FastAPI Application
Consolidates Reports, Data Entry, and Telegram modules into one process
"""
import asyncio
import logging
import os
import sys
from datetime import datetime
from pathlib import Path
from contextlib import asynccontextmanager
from dotenv import load_dotenv
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
# Load environment variables
load_dotenv()
# Add project root and shared modules to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root)) # Enable 'from backend.xxx import yyy'
sys.path.insert(0, str(project_root / "shared")) # Enable 'from shared.xxx import yyy'
# Import configuration
from backend.config import settings
# Import shared infrastructure
from shared.database.oracle_pool import oracle_pool
from shared.auth.middleware import AuthenticationMiddleware
from shared.auth.routes import create_auth_router
from shared.routes.companies import create_companies_router
from shared.routes.calendar import create_calendar_router
# Import module router factories
from backend.modules.reports.routers import create_reports_router
from backend.modules.data_entry.routers import create_data_entry_router
from backend.modules.telegram.routers import create_telegram_router
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%H:%M:%S'
)
logger = logging.getLogger(__name__)
# Global variables for background tasks
telegram_bot_task = None
ocr_job_worker_running = False
# ============================================================================
# INITIALIZATION FUNCTIONS
# ============================================================================
async def init_oracle_pool():
"""Initialize Oracle connection pool (shared by all modules)."""
logger.info("[ORACLE] Initializing connection pool...")
await oracle_pool.initialize()
logger.info("[ORACLE] ✅ Pool initialized successfully")
async def init_reports_cache():
"""Initialize Reports cache system."""
logger.info("[REPORTS] Initializing cache system...")
try:
from backend.modules.reports.cache import init_cache, init_event_monitor, get_cache
from backend.modules.reports.cache.config import CacheConfig
cache_config = CacheConfig.from_env()
await init_cache(cache_config)
logger.info(f"[REPORTS] ✅ Cache initialized: type={cache_config.cache_type}, enabled={cache_config.enabled}")
# Initialize event monitor
cache = get_cache()
await init_event_monitor(cache, cache_config)
if cache_config.auto_invalidate_enabled:
logger.info("[REPORTS] Event-based auto-invalidation ENABLED")
else:
logger.info("[REPORTS] Event-based auto-invalidation DISABLED")
except Exception as e:
logger.error(f"[REPORTS] ⚠️ Cache initialization error: {e}", exc_info=True)
logger.warning("[REPORTS] Continuing without cache")
async def init_data_entry_db():
"""Initialize Data Entry SQLite database."""
logger.info("[DATA-ENTRY] Initializing SQLite database...")
try:
from backend.modules.data_entry.db.database import init_db
await init_db()
logger.info(f"[DATA-ENTRY] ✅ Database initialized: {settings.data_entry_sqlite_database_path}")
# Ensure upload directory exists
settings.data_entry_upload_path_resolved
logger.info(f"[DATA-ENTRY] Upload path: {settings.data_entry_upload_path_resolved}")
except Exception as e:
logger.error(f"[DATA-ENTRY] ❌ Database initialization error: {e}", exc_info=True)
raise
async def init_telegram_db():
"""Initialize Telegram SQLite database."""
logger.info("[TELEGRAM] Initializing SQLite database...")
try:
from backend.modules.telegram.db import init_database, cleanup_expired_codes, cleanup_expired_sessions, cleanup_expired_email_codes
await init_database()
logger.info(f"[TELEGRAM] ✅ Database initialized: {settings.telegram_sqlite_database_path}")
# Cleanup expired data
expired_codes = await cleanup_expired_codes()
expired_sessions = await cleanup_expired_sessions()
expired_email_codes = await cleanup_expired_email_codes()
logger.info(f"[TELEGRAM] Cleanup: {expired_codes} codes, {expired_sessions} sessions, {expired_email_codes} email codes removed")
except Exception as e:
logger.error(f"[TELEGRAM] ❌ Database initialization error: {e}", exc_info=True)
raise
async def init_ocr_job_worker():
"""Initialize OCR job worker with persistent PaddleOCR.
This replaces the old background thread approach:
- Starts ProcessPoolExecutor with persistent worker
- Pre-warms PaddleOCR (loads once, reuses for all requests)
- Starts job queue background task
"""
global ocr_job_worker_running
logger.info("[OCR] Initializing OCR job worker...")
try:
from backend.modules.data_entry.services.ocr.job_worker import start_job_worker, is_running
success = await start_job_worker()
ocr_job_worker_running = is_running()
if success:
logger.info("[OCR] ✅ Job worker started (PaddleOCR persistent)")
else:
logger.warning("[OCR] ⚠️ Job worker failed to start, falling back to sync mode")
except Exception as e:
logger.warning(f"[OCR] ⚠️ OCR job worker init failed: {e}")
logger.warning("[OCR] Continuing with sync OCR mode")
ocr_job_worker_running = False
async def run_telegram_bot():
"""Run Telegram bot as background task."""
logger.info("[TELEGRAM] Starting bot...")
try:
from telegram.ext import Application, CommandHandler, CallbackQueryHandler, MessageHandler, filters
from backend.modules.telegram.bot.handlers import (
start_command, help_command, clear_command, companies_command,
unlink_command, selectcompany_command, dashboard_command, sold_command,
facturi_command, trezorerie_command, menu_command, trezorerie_casa_command,
trezorerie_banca_command, clienti_command, furnizori_command, evolutie_command,
clearcache_command, togglecache_command, handle_text_message, button_callback,
error_handler
)
from backend.modules.telegram.bot.email_handlers import email_login_handler
# Create Telegram application
application = Application.builder().token(settings.telegram_bot_token).build()
# Register handlers
application.add_handler(email_login_handler)
application.add_handler(CommandHandler("start", start_command))
application.add_handler(CommandHandler("menu", menu_command))
application.add_handler(CommandHandler("help", help_command))
application.add_handler(CommandHandler("unlink", unlink_command))
application.add_handler(CommandHandler("clear", clear_command))
application.add_handler(CommandHandler("companies", companies_command))
application.add_handler(CommandHandler("selectcompany", selectcompany_command))
application.add_handler(CommandHandler("dashboard", dashboard_command))
application.add_handler(CommandHandler("sold", sold_command))
application.add_handler(CommandHandler("facturi", facturi_command))
application.add_handler(CommandHandler("trezorerie", trezorerie_command))
application.add_handler(CommandHandler("trezorerie_casa", trezorerie_casa_command))
application.add_handler(CommandHandler("trezorerie_banca", trezorerie_banca_command))
application.add_handler(CommandHandler("clienti", clienti_command))
application.add_handler(CommandHandler("furnizori", furnizori_command))
application.add_handler(CommandHandler("evolutie", evolutie_command))
application.add_handler(CommandHandler("clearcache", clearcache_command))
application.add_handler(CommandHandler("togglecache", togglecache_command))
application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_text_message))
application.add_handler(CallbackQueryHandler(button_callback))
application.add_error_handler(error_handler)
# Initialize and start
await application.initialize()
await application.start()
await application.updater.start_polling(
drop_pending_updates=True,
poll_interval=0, # No delay between polls
timeout=30 # Long poll timeout 30 seconds (reduces requests from ~6/min to ~2/min)
)
bot_info = await application.bot.get_me()
logger.info(f"[TELEGRAM] ✅ Bot running: @{bot_info.username}")
# Keep bot running
while True:
await asyncio.sleep(1)
except asyncio.CancelledError:
logger.info("[TELEGRAM] Bot task cancelled, stopping...")
if 'application' in locals():
await application.updater.stop()
await application.stop()
await application.shutdown()
logger.info("[TELEGRAM] ✅ Bot stopped")
raise
except Exception as e:
logger.error(f"[TELEGRAM] ❌ Bot error: {e}", exc_info=True)
raise
# ============================================================================
# FASTAPI APPLICATION
# ============================================================================
app = FastAPI(
title="ROA2WEB Unified Backend",
description="Unified FastAPI backend for Reports, Data Entry, and Telegram modules",
version="1.0.0"
)
# ============================================================================
# STARTUP/SHUTDOWN EVENT HANDLERS
# ============================================================================
@app.on_event("startup")
async def startup_event():
"""Application startup - Initialize all resources."""
global telegram_bot_task
logger.info("=" * 80)
logger.info("[STARTUP] ROA2WEB Unified Backend")
logger.info("=" * 80)
try:
# Step 1: Initialize Oracle pool (shared by all modules)
await init_oracle_pool()
# Step 2: Parallel initialization of module-specific resources
logger.info("[STARTUP] Initializing module resources in parallel...")
await asyncio.gather(
init_reports_cache(),
init_data_entry_db(),
init_telegram_db(),
)
# Step 3: Initialize OCR job worker (with persistent PaddleOCR)
await init_ocr_job_worker()
# Step 4: Start Telegram bot as background task
if settings.telegram_bot_token:
telegram_bot_task = asyncio.create_task(run_telegram_bot())
logger.info("[STARTUP] ✅ Telegram bot task created")
else:
logger.warning("[STARTUP] ⚠️ TELEGRAM_BOT_TOKEN not set, bot disabled")
logger.info("=" * 80)
logger.info("[STARTUP] ✅ All modules initialized successfully")
logger.info(f"[STARTUP] ✅ Server running on http://{settings.api_host}:{settings.api_port}")
logger.info("=" * 80)
except Exception as e:
logger.error(f"[STARTUP] ❌ Initialization failed: {e}", exc_info=True)
raise
@app.on_event("shutdown")
async def shutdown_event():
"""Application shutdown - Cleanup resources."""
global telegram_bot_task, ocr_job_worker_running
logger.info("=" * 80)
logger.info("[SHUTDOWN] Stopping ROA2WEB Unified Backend...")
logger.info("=" * 80)
try:
# Stop OCR job worker
if ocr_job_worker_running:
logger.info("[SHUTDOWN] Stopping OCR job worker...")
try:
from backend.modules.data_entry.services.ocr.job_worker import stop_job_worker
await stop_job_worker()
ocr_job_worker_running = False
logger.info("[SHUTDOWN] OCR job worker stopped")
except Exception as e:
logger.error(f"[SHUTDOWN] OCR worker error: {e}")
# Stop Telegram bot
if telegram_bot_task and not telegram_bot_task.done():
logger.info("[SHUTDOWN] Stopping Telegram bot...")
telegram_bot_task.cancel()
try:
await telegram_bot_task
except asyncio.CancelledError:
pass
# Stop Reports cache event monitor
try:
from backend.modules.reports.cache import close_cache, get_event_monitor
monitor = get_event_monitor()
if monitor:
await monitor.stop()
logger.info("[SHUTDOWN] Reports cache monitor stopped")
await close_cache()
logger.info("[SHUTDOWN] Reports cache closed")
except Exception as e:
logger.error(f"[SHUTDOWN] Cache error: {e}")
# Close Oracle pool
await oracle_pool.close_pool()
logger.info("[SHUTDOWN] Oracle pool closed")
logger.info("=" * 80)
logger.info("[SHUTDOWN] ✅ Shutdown complete")
logger.info("=" * 80)
except Exception as e:
logger.error(f"[SHUTDOWN] Error during shutdown: {e}", exc_info=True)
# ============================================================================
# MIDDLEWARE
# ============================================================================
# CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=["*"], # Allow all origins for production deployment
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Authentication middleware
app.add_middleware(
AuthenticationMiddleware,
excluded_paths=[
"/", "/docs", "/health", "/redoc", "/openapi.json",
"/api/auth/login", "/api/auth/refresh",
"/api/telegram/auth/verify-user",
"/api/telegram/auth/verify-email",
"/api/telegram/auth/login-with-email",
"/api/telegram/auth/refresh-token",
"/api/telegram/health"
]
)
# ============================================================================
# ROUTER REGISTRATION
# ============================================================================
# Module routers with prefixes
app.include_router(create_reports_router(), prefix="/api/reports", tags=["reports"])
app.include_router(create_data_entry_router(), prefix="/api/data-entry", tags=["data-entry"])
app.include_router(create_telegram_router(), prefix="/api/telegram", tags=["telegram"])
# Shared routers
auth_router = create_auth_router(prefix="", tags=["authentication"])
app.include_router(auth_router, prefix="/api/auth")
companies_router = create_companies_router(oracle_pool, tags=["companies"])
app.include_router(companies_router, prefix="/api/companies")
calendar_router = create_calendar_router(oracle_pool, tags=["calendar"])
app.include_router(calendar_router, prefix="/api/calendar")
# ============================================================================
# ROOT & HEALTH ENDPOINTS
# ============================================================================
@app.get("/")
async def root():
"""Root endpoint - API information."""
return {
"name": settings.app_name,
"version": settings.app_version,
"status": "running",
"modules": ["reports", "data-entry", "telegram"],
"docs": "/docs",
"health": "/health"
}
@app.get("/health")
async def health_check():
"""Health check endpoint with module status."""
health_status = {
"api": "healthy",
"timestamp": datetime.utcnow().isoformat(),
"modules": {}
}
# Check Oracle connection
try:
async with oracle_pool.get_connection() as conn:
with conn.cursor() as cursor:
cursor.execute("SELECT 1 FROM DUAL")
health_status["modules"]["oracle"] = "connected"
except Exception as e:
health_status["modules"]["oracle"] = f"error: {str(e)}"
# Check Reports cache
try:
from backend.modules.reports.cache import get_cache
cache = get_cache()
health_status["modules"]["reports_cache"] = "initialized" if cache else "disabled"
except Exception as e:
health_status["modules"]["reports_cache"] = f"error: {str(e)}"
# Check Data Entry DB
try:
db_path = Path(settings.data_entry_sqlite_database_path)
health_status["modules"]["data_entry_db"] = "exists" if db_path.exists() else "missing"
except Exception as e:
health_status["modules"]["data_entry_db"] = f"error: {str(e)}"
# Check Telegram bot
global telegram_bot_task
if telegram_bot_task:
if telegram_bot_task.done():
health_status["modules"]["telegram_bot"] = "stopped"
else:
health_status["modules"]["telegram_bot"] = "running"
else:
health_status["modules"]["telegram_bot"] = "disabled"
# Check OCR job worker
global ocr_job_worker_running
try:
from backend.modules.data_entry.services.ocr.job_worker import is_running
from backend.modules.data_entry.services.ocr.job_queue import job_queue
if is_running():
# Get queue stats
stats = await job_queue.get_queue_stats()
health_status["modules"]["ocr_worker"] = {
"status": "running",
"pending_jobs": stats.get("pending", 0),
"processing_jobs": stats.get("processing", 0),
"avg_time_seconds": stats.get("average_time_seconds", 0)
}
else:
health_status["modules"]["ocr_worker"] = "stopped"
except Exception as e:
health_status["modules"]["ocr_worker"] = f"error: {str(e)}"
return health_status
# ============================================================================
# MAIN ENTRY POINT
# ============================================================================
if __name__ == "__main__":
import uvicorn
uvicorn.run(
"backend.main:app",
host=settings.api_host,
port=settings.api_port,
reload=False,
log_level="info"
)