From cd7eb628dd794f944a292462e83d05e31b17fa7c Mon Sep 17 00:00:00 2001 From: Claude Agent Date: Fri, 26 Jun 2026 07:33:01 +0000 Subject: [PATCH] feat(oracle): auto-recover Oracle pool + surface status, stop silent import failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After a power loss the app started before Oracle was ready; init_oracle() failed once, the pool stayed None forever (no retry), and every sync silently failed ("Oracle pool not initialized") while still hammering the GoMag API each minute, and the order-detail endpoint returned HTTP 500. - database.ensure_oracle_pool(force): thread-safe (re)create of the pool, called at the start of every sync cycle → self-heals within one cycle once Oracle is back (incl. after an Oracle service restart). The init_oracle_client() call is now guarded to run once per process, so a pool re-init can't fall back to thin mode. - database.oracle_status() exposed; main.py startup is non-fatal: it calls ensure_oracle_pool() and only logs an error if the pool is not ready. - run_sync ensures the pool before the GoMag download; on failure it records a clear "failed" run status instead of crashing and skips the wasted API calls. - /api/sync/health reports oracle_ready, oracle_last_error and oracle_last_attempt_at; the dashboard health pill shows "Oracle indisponibil" (top priority). Manual recovery uses the existing Start Sync button. - order_detail degrades gracefully (HTTP 200 without CODMAT enrichment, plus a notice) instead of returning 500. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- api/app/database.py | 78 +++++++++++++++++++++++--- api/app/main.py | 12 ++-- api/app/routers/sync.py | 18 +++++- api/app/services/sync_service.py | 20 +++++++ api/app/static/js/dashboard.js | 9 ++- api/app/static/js/shared.js | 10 +++- api/app/templates/base.html | 2 +- api/app/templates/dashboard.html | 2 +- api/tests/test_sync_health_endpoint.py | 13 +++++ 9 files changed, 140 insertions(+), 24 deletions(-) diff --git a/api/app/database.py b/api/app/database.py index abee56f..655fb3b 100644 --- a/api/app/database.py +++ b/api/app/database.py @@ -3,39 +3,59 @@ import aiosqlite import sqlite3 import logging import os +import threading +from datetime import datetime from .config import settings logger = logging.getLogger(__name__) # ---- Oracle Pool ---- pool = None +_pool_lock = threading.Lock() +_pool_last_error = None # str — reason the last (re)init failed, or None +_pool_last_attempt = None # ISO str — when we last tried to (re)init +_client_initialized = False # init_oracle_client may only be called once/process -def init_oracle(): - """Initialize Oracle client mode and create connection pool.""" - global pool + +def _init_oracle_client_once(): + """Load the Oracle client library exactly once. + + init_oracle_client() loads the thick-mode driver (it does NOT connect to the + DB), so it succeeds even when Oracle is down. Calling it a second time raises, + which on a pool re-init would wrongly fall back to thin mode — so we guard it. 
+ """ + global _client_initialized + if _client_initialized: + return force_thin = settings.FORCE_THIN_MODE instantclient_path = settings.INSTANTCLIENTPATH - dsn = settings.ORACLE_DSN # Ensure TNS_ADMIN is set as OS env var so oracledb can find tnsnames.ora if settings.TNS_ADMIN: os.environ['TNS_ADMIN'] = settings.TNS_ADMIN - logger.info(f"Oracle config: DSN={dsn}, TNS_ADMIN={settings.TNS_ADMIN or os.environ.get('TNS_ADMIN', '(not set)')}, INSTANTCLIENTPATH={instantclient_path or '(not set)'}") + logger.info(f"Oracle config: DSN={settings.ORACLE_DSN}, TNS_ADMIN={settings.TNS_ADMIN or os.environ.get('TNS_ADMIN', '(not set)')}, INSTANTCLIENTPATH={instantclient_path or '(not set)'}") if force_thin: - logger.info(f"FORCE_THIN_MODE=true: thin mode for {dsn}") + logger.info(f"FORCE_THIN_MODE=true: thin mode for {settings.ORACLE_DSN}") elif instantclient_path: try: oracledb.init_oracle_client(lib_dir=instantclient_path) - logger.info(f"Thick mode activated for {dsn}") + logger.info(f"Thick mode activated for {settings.ORACLE_DSN}") except Exception as e: logger.error(f"Thick mode error: {e}") logger.info("Fallback to thin mode") else: - logger.info(f"Thin mode (default) for {dsn}") + logger.info(f"Thin mode (default) for {settings.ORACLE_DSN}") + _client_initialized = True + + +def init_oracle(): + """Initialize Oracle client mode and create the connection pool. Raises on failure.""" + global pool + _init_oracle_client_once() pool = oracledb.create_pool( user=settings.ORACLE_USER, password=settings.ORACLE_PASSWORD, @@ -44,9 +64,49 @@ def init_oracle(): max=4, increment=1 ) - logger.info(f"Oracle pool created for {dsn}") + logger.info(f"Oracle pool created for {settings.ORACLE_DSN}") return pool + +def ensure_oracle_pool(force: bool = False) -> bool: + """Ensure the Oracle pool exists, (re)creating it if needed. Returns True if ready. 
+ + Thread-safe and idempotent — safe to call at the start of every sync cycle so + the app self-heals after Oracle becomes reachable again (e.g. the DB service + was restarted after a power loss). On failure it records the reason and leaves + pool=None so callers can surface a clear status instead of crashing. + """ + global pool, _pool_last_error, _pool_last_attempt + with _pool_lock: + if pool is not None and not force: + return True + if force and pool is not None: + try: + pool.close() + except Exception: + pass + pool = None + _pool_last_attempt = datetime.now().isoformat() + try: + init_oracle() + _pool_last_error = None + return True + except Exception as e: + pool = None + _pool_last_error = str(e) + logger.error(f"Oracle pool init failed: {e}") + return False + + +def oracle_status() -> dict: + """Snapshot of Oracle pool readiness for health endpoints.""" + return { + "ready": pool is not None, + "last_error": _pool_last_error, + "last_attempt_at": _pool_last_attempt, + } + + def get_oracle_connection(): """Get a connection from the Oracle pool.""" if pool is None: diff --git a/api/app/main.py b/api/app/main.py index 683bcc1..d4ae30e 100644 --- a/api/app/main.py +++ b/api/app/main.py @@ -7,7 +7,7 @@ import logging import os from .config import settings -from .database import init_oracle, close_oracle, init_sqlite +from .database import ensure_oracle_pool, close_oracle, init_sqlite # Configure logging with both stream and file handlers _log_level = getattr(logging, settings.LOG_LEVEL.upper(), logging.INFO) @@ -35,12 +35,10 @@ async def lifespan(app: FastAPI): """Startup and shutdown events.""" logger.info("Starting GoMag Import Manager...") - # Initialize Oracle pool - try: - init_oracle() - except Exception as e: - logger.error(f"Oracle init failed: {e}") - # Allow app to start even without Oracle for development + # Initialize Oracle pool (non-fatal: app still starts if Oracle is down; + # each sync cycle calls ensure_oracle_pool() and self-heals when it 
returns) + if not ensure_oracle_pool(): + logger.error("Oracle pool not ready at startup — will retry on each sync cycle") # Initialize SQLite init_sqlite() diff --git a/api/app/routers/sync.py b/api/app/routers/sync.py index 1b8ccab..9d2b3d0 100644 --- a/api/app/routers/sync.py +++ b/api/app/routers/sync.py @@ -190,8 +190,11 @@ async def sync_health(): counts = await sqlite_service.get_recent_phase_failures(limit=3) escalation_phase = next((p for p, c in counts.items() if c >= 3), None) + ora = database.oracle_status() + is_healthy = ( - last_status in (None, "completed") + ora["ready"] + and last_status in (None, "completed") and escalation_phase is None and sum(counts.values()) <= 1 ) @@ -203,6 +206,9 @@ async def sync_health(): "recent_phase_failures": counts, "escalation_phase": escalation_phase, "is_healthy": is_healthy, + "oracle_ready": ora["ready"], + "oracle_last_error": ora["last_error"], + "oracle_last_attempt_at": ora["last_attempt_at"], } @@ -422,10 +428,18 @@ async def order_detail(order_number: str): return {"error": "Order not found"} items = detail.get("items", []) - await _enrich_items_with_codmat(items) + oracle_available = True + try: + await _enrich_items_with_codmat(items) + except Exception as e: + # Oracle down (pool not initialized): still return the order with its + # items so the detail panel renders, just without CODMAT enrichment. 
+ oracle_available = False + logger.warning(f"order_detail CODMAT enrich skipped (Oracle unavailable?): {e}") # Enrich with invoice data order = detail.get("order", {}) + order["oracle_available"] = oracle_available if order.get("factura_numar") and order.get("factura_data"): order["invoice"] = { "facturat": True, diff --git a/api/app/services/sync_service.py b/api/app/services/sync_service.py index 59e2348..2dedf8a 100644 --- a/api/app/services/sync_service.py +++ b/api/app/services/sync_service.py @@ -338,6 +338,26 @@ async def run_sync(id_pol: int = None, id_sectie: int = None, run_id: str = None return {"run_id": run_id, "status": "halted_escalation", "error": halt_msg} try: + # Phase -1: Ensure Oracle pool (auto-recovery after a DB restart). + # Done before the GoMag download so we don't waste API calls every + # cycle while Oracle is down, and so users get a clear status. + if not await asyncio.to_thread(database.ensure_oracle_pool): + last_err = database.oracle_status().get("last_error") or "fara detalii" + msg = ("Oracle indisponibil — pool neinitializat. Import oprit; " + "se reincearca automat la urmatorul ciclu de sync. 
" + f"Detalii: {last_err}") + _log_line(run_id, f"EROARE: {msg}") + await sqlite_service.create_sync_run(run_id, 0) + await sqlite_service.update_sync_run( + run_id, "failed", 0, 0, 0, 0, error_message=msg + ) + if _current_sync: + _current_sync["status"] = "failed" + _current_sync["finished_at"] = _now().isoformat() + _current_sync["error"] = msg + _update_progress("failed", "Oracle indisponibil — import oprit") + return {"run_id": run_id, "status": "failed", "error": msg} + # Phase 0: Download orders from GoMag API _update_progress("downloading", "Descărcare comenzi din GoMag API...") _log_line(run_id, "Descărcare comenzi din GoMag API...") diff --git a/api/app/static/js/dashboard.js b/api/app/static/js/dashboard.js index e3c2cec..c4239a6 100644 --- a/api/app/static/js/dashboard.js +++ b/api/app/static/js/dashboard.js @@ -184,7 +184,14 @@ function renderHealthPill(h) { const recent = h.recent_phase_failures || {}; const recentCount = Object.values(recent).reduce((a, b) => a + (b || 0), 0); - if (h.escalation_phase || h.last_sync_status === 'halted_escalation') { + if (h.oracle_ready === false) { + state = 'escalated'; + iconCls = 'bi-database-x'; + text = 'Oracle indisponibil'; + tooltip = `Oracle indisponibil — importurile sunt oprite.\n` + + `${h.oracle_last_error || ''}\n` + + `Se reincearca automat la urmatorul sync. 
Apasa Start Sync pentru a reincerca acum.`; + } else if (h.escalation_phase || h.last_sync_status === 'halted_escalation') { state = 'escalated'; iconCls = 'bi-x-octagon-fill'; text = 'Blocat'; diff --git a/api/app/static/js/shared.js b/api/app/static/js/shared.js index fb7f866..4ec2f36 100644 --- a/api/app/static/js/shared.js +++ b/api/app/static/js/shared.js @@ -865,9 +865,13 @@ async function renderOrderDetailModal(orderNumber, opts) { // Render compact header info (partner + addresses) _renderHeaderInfo(order); - if (order.error_message) { - document.getElementById('detailError').textContent = order.error_message; - document.getElementById('detailError').style.display = ''; + const detailErrEl = document.getElementById('detailError'); + if (order.oracle_available === false) { + detailErrEl.textContent = '⚠ Oracle indisponibil — CODMAT-urile nu pot fi incarcate momentan. Reincearca dupa restabilirea conexiunii.'; + detailErrEl.style.display = ''; + } else if (order.error_message) { + detailErrEl.textContent = order.error_message; + detailErrEl.style.display = ''; } // Configure footer action buttons BEFORE any early-return on items — diff --git a/api/app/templates/base.html b/api/app/templates/base.html index 76e272e..0695d2e 100644 --- a/api/app/templates/base.html +++ b/api/app/templates/base.html @@ -169,7 +169,7 @@ - + + {% endblock %} diff --git a/api/tests/test_sync_health_endpoint.py b/api/tests/test_sync_health_endpoint.py index f9feebe..1376093 100644 --- a/api/tests/test_sync_health_endpoint.py +++ b/api/tests/test_sync_health_endpoint.py @@ -32,6 +32,9 @@ client = TestClient(app) @pytest.fixture(autouse=True) async def _reset(): database.init_sqlite() + # Simulate Oracle up for health tests (no real pool in unit env). 
+ _orig_pool = database.pool + database.pool = object() db = await sqlite_service.get_sqlite() try: await db.execute("DELETE FROM sync_phase_failures") @@ -40,6 +43,7 @@ async def _reset(): finally: await db.close() yield + database.pool = _orig_pool async def _make_run(run_id: str, status: str = "completed", offset: int = 0, @@ -108,3 +112,12 @@ async def test_health_one_phase_failure_still_warning_not_healthy(): # 1 recent phase failure → is_healthy stays True (<=1 tolerance); healthy assert data["is_healthy"] is True assert data["recent_phase_failures"]["invoice_check"] == 1 + + +async def test_health_oracle_down_not_healthy(): + await _make_run("ok-oracle", status="completed") + database.pool = None # simulate Oracle pool not initialized + r = client.get("/api/sync/health") + data = r.json() + assert data["oracle_ready"] is False + assert data["is_healthy"] is False