feat(oracle): auto-recover Oracle pool + surface status, stop silent import failures
After a power loss the app started before Oracle was ready; init_oracle() failed
once, the pool stayed None forever (no retry), and every sync silently failed
("Oracle pool not initialized") while still hammering the GoMag API each minute,
and the order-detail endpoint returned HTTP 500.
- database.ensure_oracle_pool(force): thread-safe (re)create of the pool, called
at the start of every sync cycle → self-heals within one cycle once Oracle is
back (incl. after an Oracle service restart). init_oracle_client made idempotent
so re-init can't fall back to thin mode.
- database.oracle_status() exposed; main.py startup is non-fatal via ensure_oracle_pool().
- run_sync ensures the pool before the GoMag download; on failure it records a
clear run status instead of crashing and skips the wasted API calls.
- /api/sync/health reports oracle_ready/last_error; dashboard health pill shows
"Oracle indisponibil" (top priority). Recovery via the existing Start Sync button.
- order_detail degrades gracefully (200 without CODMAT + notice) instead of 500.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -3,39 +3,59 @@ import aiosqlite
|
|||||||
import sqlite3
|
import sqlite3
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import threading
|
||||||
|
from datetime import datetime
|
||||||
from .config import settings
|
from .config import settings
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# ---- Oracle Pool ----
|
# ---- Oracle Pool ----
|
||||||
pool = None
|
pool = None
|
||||||
|
_pool_lock = threading.Lock()
|
||||||
|
_pool_last_error = None # str — reason the last (re)init failed, or None
|
||||||
|
_pool_last_attempt = None # ISO str — when we last tried to (re)init
|
||||||
|
_client_initialized = False # init_oracle_client may only be called once/process
|
||||||
|
|
||||||
def init_oracle():
|
|
||||||
"""Initialize Oracle client mode and create connection pool."""
|
def _init_oracle_client_once():
|
||||||
global pool
|
"""Load the Oracle client library exactly once.
|
||||||
|
|
||||||
|
init_oracle_client() loads the thick-mode driver (it does NOT connect to the
|
||||||
|
DB), so it succeeds even when Oracle is down. Calling it a second time raises,
|
||||||
|
which on a pool re-init would wrongly fall back to thin mode — so we guard it.
|
||||||
|
"""
|
||||||
|
global _client_initialized
|
||||||
|
if _client_initialized:
|
||||||
|
return
|
||||||
|
|
||||||
force_thin = settings.FORCE_THIN_MODE
|
force_thin = settings.FORCE_THIN_MODE
|
||||||
instantclient_path = settings.INSTANTCLIENTPATH
|
instantclient_path = settings.INSTANTCLIENTPATH
|
||||||
dsn = settings.ORACLE_DSN
|
|
||||||
|
|
||||||
# Ensure TNS_ADMIN is set as OS env var so oracledb can find tnsnames.ora
|
# Ensure TNS_ADMIN is set as OS env var so oracledb can find tnsnames.ora
|
||||||
if settings.TNS_ADMIN:
|
if settings.TNS_ADMIN:
|
||||||
os.environ['TNS_ADMIN'] = settings.TNS_ADMIN
|
os.environ['TNS_ADMIN'] = settings.TNS_ADMIN
|
||||||
|
|
||||||
logger.info(f"Oracle config: DSN={dsn}, TNS_ADMIN={settings.TNS_ADMIN or os.environ.get('TNS_ADMIN', '(not set)')}, INSTANTCLIENTPATH={instantclient_path or '(not set)'}")
|
logger.info(f"Oracle config: DSN={settings.ORACLE_DSN}, TNS_ADMIN={settings.TNS_ADMIN or os.environ.get('TNS_ADMIN', '(not set)')}, INSTANTCLIENTPATH={instantclient_path or '(not set)'}")
|
||||||
|
|
||||||
if force_thin:
|
if force_thin:
|
||||||
logger.info(f"FORCE_THIN_MODE=true: thin mode for {dsn}")
|
logger.info(f"FORCE_THIN_MODE=true: thin mode for {settings.ORACLE_DSN}")
|
||||||
elif instantclient_path:
|
elif instantclient_path:
|
||||||
try:
|
try:
|
||||||
oracledb.init_oracle_client(lib_dir=instantclient_path)
|
oracledb.init_oracle_client(lib_dir=instantclient_path)
|
||||||
logger.info(f"Thick mode activated for {dsn}")
|
logger.info(f"Thick mode activated for {settings.ORACLE_DSN}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Thick mode error: {e}")
|
logger.error(f"Thick mode error: {e}")
|
||||||
logger.info("Fallback to thin mode")
|
logger.info("Fallback to thin mode")
|
||||||
else:
|
else:
|
||||||
logger.info(f"Thin mode (default) for {dsn}")
|
logger.info(f"Thin mode (default) for {settings.ORACLE_DSN}")
|
||||||
|
|
||||||
|
_client_initialized = True
|
||||||
|
|
||||||
|
|
||||||
|
def init_oracle():
|
||||||
|
"""Initialize Oracle client mode and create the connection pool. Raises on failure."""
|
||||||
|
global pool
|
||||||
|
_init_oracle_client_once()
|
||||||
pool = oracledb.create_pool(
|
pool = oracledb.create_pool(
|
||||||
user=settings.ORACLE_USER,
|
user=settings.ORACLE_USER,
|
||||||
password=settings.ORACLE_PASSWORD,
|
password=settings.ORACLE_PASSWORD,
|
||||||
@@ -44,9 +64,49 @@ def init_oracle():
|
|||||||
max=4,
|
max=4,
|
||||||
increment=1
|
increment=1
|
||||||
)
|
)
|
||||||
logger.info(f"Oracle pool created for {dsn}")
|
logger.info(f"Oracle pool created for {settings.ORACLE_DSN}")
|
||||||
return pool
|
return pool
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_oracle_pool(force: bool = False) -> bool:
|
||||||
|
"""Ensure the Oracle pool exists, (re)creating it if needed. Returns True if ready.
|
||||||
|
|
||||||
|
Thread-safe and idempotent — safe to call at the start of every sync cycle so
|
||||||
|
the app self-heals after Oracle becomes reachable again (e.g. the DB service
|
||||||
|
was restarted after a power loss). On failure it records the reason and leaves
|
||||||
|
pool=None so callers can surface a clear status instead of crashing.
|
||||||
|
"""
|
||||||
|
global pool, _pool_last_error, _pool_last_attempt
|
||||||
|
with _pool_lock:
|
||||||
|
if pool is not None and not force:
|
||||||
|
return True
|
||||||
|
if force and pool is not None:
|
||||||
|
try:
|
||||||
|
pool.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
pool = None
|
||||||
|
_pool_last_attempt = datetime.now().isoformat()
|
||||||
|
try:
|
||||||
|
init_oracle()
|
||||||
|
_pool_last_error = None
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
pool = None
|
||||||
|
_pool_last_error = str(e)
|
||||||
|
logger.error(f"Oracle pool init failed: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def oracle_status() -> dict:
|
||||||
|
"""Snapshot of Oracle pool readiness for health endpoints."""
|
||||||
|
return {
|
||||||
|
"ready": pool is not None,
|
||||||
|
"last_error": _pool_last_error,
|
||||||
|
"last_attempt_at": _pool_last_attempt,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def get_oracle_connection():
|
def get_oracle_connection():
|
||||||
"""Get a connection from the Oracle pool."""
|
"""Get a connection from the Oracle pool."""
|
||||||
if pool is None:
|
if pool is None:
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ import logging
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
from .config import settings
|
from .config import settings
|
||||||
from .database import init_oracle, close_oracle, init_sqlite
|
from .database import ensure_oracle_pool, close_oracle, init_sqlite
|
||||||
|
|
||||||
# Configure logging with both stream and file handlers
|
# Configure logging with both stream and file handlers
|
||||||
_log_level = getattr(logging, settings.LOG_LEVEL.upper(), logging.INFO)
|
_log_level = getattr(logging, settings.LOG_LEVEL.upper(), logging.INFO)
|
||||||
@@ -35,12 +35,10 @@ async def lifespan(app: FastAPI):
|
|||||||
"""Startup and shutdown events."""
|
"""Startup and shutdown events."""
|
||||||
logger.info("Starting GoMag Import Manager...")
|
logger.info("Starting GoMag Import Manager...")
|
||||||
|
|
||||||
# Initialize Oracle pool
|
# Initialize Oracle pool (non-fatal: app still starts if Oracle is down;
|
||||||
try:
|
# each sync cycle calls ensure_oracle_pool() and self-heals when it returns)
|
||||||
init_oracle()
|
if not ensure_oracle_pool():
|
||||||
except Exception as e:
|
logger.error("Oracle pool not ready at startup — will retry on each sync cycle")
|
||||||
logger.error(f"Oracle init failed: {e}")
|
|
||||||
# Allow app to start even without Oracle for development
|
|
||||||
|
|
||||||
# Initialize SQLite
|
# Initialize SQLite
|
||||||
init_sqlite()
|
init_sqlite()
|
||||||
|
|||||||
@@ -190,8 +190,11 @@ async def sync_health():
|
|||||||
counts = await sqlite_service.get_recent_phase_failures(limit=3)
|
counts = await sqlite_service.get_recent_phase_failures(limit=3)
|
||||||
escalation_phase = next((p for p, c in counts.items() if c >= 3), None)
|
escalation_phase = next((p for p, c in counts.items() if c >= 3), None)
|
||||||
|
|
||||||
|
ora = database.oracle_status()
|
||||||
|
|
||||||
is_healthy = (
|
is_healthy = (
|
||||||
last_status in (None, "completed")
|
ora["ready"]
|
||||||
|
and last_status in (None, "completed")
|
||||||
and escalation_phase is None
|
and escalation_phase is None
|
||||||
and sum(counts.values()) <= 1
|
and sum(counts.values()) <= 1
|
||||||
)
|
)
|
||||||
@@ -203,6 +206,9 @@ async def sync_health():
|
|||||||
"recent_phase_failures": counts,
|
"recent_phase_failures": counts,
|
||||||
"escalation_phase": escalation_phase,
|
"escalation_phase": escalation_phase,
|
||||||
"is_healthy": is_healthy,
|
"is_healthy": is_healthy,
|
||||||
|
"oracle_ready": ora["ready"],
|
||||||
|
"oracle_last_error": ora["last_error"],
|
||||||
|
"oracle_last_attempt_at": ora["last_attempt_at"],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -422,10 +428,18 @@ async def order_detail(order_number: str):
|
|||||||
return {"error": "Order not found"}
|
return {"error": "Order not found"}
|
||||||
|
|
||||||
items = detail.get("items", [])
|
items = detail.get("items", [])
|
||||||
await _enrich_items_with_codmat(items)
|
oracle_available = True
|
||||||
|
try:
|
||||||
|
await _enrich_items_with_codmat(items)
|
||||||
|
except Exception as e:
|
||||||
|
# Oracle down (pool not initialized): still return the order with its
|
||||||
|
# items so the detail panel renders, just without CODMAT enrichment.
|
||||||
|
oracle_available = False
|
||||||
|
logger.warning(f"order_detail CODMAT enrich skipped (Oracle unavailable?): {e}")
|
||||||
|
|
||||||
# Enrich with invoice data
|
# Enrich with invoice data
|
||||||
order = detail.get("order", {})
|
order = detail.get("order", {})
|
||||||
|
order["oracle_available"] = oracle_available
|
||||||
if order.get("factura_numar") and order.get("factura_data"):
|
if order.get("factura_numar") and order.get("factura_data"):
|
||||||
order["invoice"] = {
|
order["invoice"] = {
|
||||||
"facturat": True,
|
"facturat": True,
|
||||||
|
|||||||
@@ -338,6 +338,26 @@ async def run_sync(id_pol: int = None, id_sectie: int = None, run_id: str = None
|
|||||||
return {"run_id": run_id, "status": "halted_escalation", "error": halt_msg}
|
return {"run_id": run_id, "status": "halted_escalation", "error": halt_msg}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Phase -1: Ensure Oracle pool (auto-recovery after a DB restart).
|
||||||
|
# Done before the GoMag download so we don't waste API calls every
|
||||||
|
# cycle while Oracle is down, and so users get a clear status.
|
||||||
|
if not await asyncio.to_thread(database.ensure_oracle_pool):
|
||||||
|
last_err = database.oracle_status().get("last_error") or "fara detalii"
|
||||||
|
msg = ("Oracle indisponibil — pool neinitializat. Import oprit; "
|
||||||
|
"se reincearca automat la urmatorul ciclu de sync. "
|
||||||
|
f"Detalii: {last_err}")
|
||||||
|
_log_line(run_id, f"EROARE: {msg}")
|
||||||
|
await sqlite_service.create_sync_run(run_id, 0)
|
||||||
|
await sqlite_service.update_sync_run(
|
||||||
|
run_id, "failed", 0, 0, 0, 0, error_message=msg
|
||||||
|
)
|
||||||
|
if _current_sync:
|
||||||
|
_current_sync["status"] = "failed"
|
||||||
|
_current_sync["finished_at"] = _now().isoformat()
|
||||||
|
_current_sync["error"] = msg
|
||||||
|
_update_progress("failed", "Oracle indisponibil — import oprit")
|
||||||
|
return {"run_id": run_id, "status": "failed", "error": msg}
|
||||||
|
|
||||||
# Phase 0: Download orders from GoMag API
|
# Phase 0: Download orders from GoMag API
|
||||||
_update_progress("downloading", "Descărcare comenzi din GoMag API...")
|
_update_progress("downloading", "Descărcare comenzi din GoMag API...")
|
||||||
_log_line(run_id, "Descărcare comenzi din GoMag API...")
|
_log_line(run_id, "Descărcare comenzi din GoMag API...")
|
||||||
|
|||||||
@@ -184,7 +184,14 @@ function renderHealthPill(h) {
|
|||||||
const recent = h.recent_phase_failures || {};
|
const recent = h.recent_phase_failures || {};
|
||||||
const recentCount = Object.values(recent).reduce((a, b) => a + (b || 0), 0);
|
const recentCount = Object.values(recent).reduce((a, b) => a + (b || 0), 0);
|
||||||
|
|
||||||
if (h.escalation_phase || h.last_sync_status === 'halted_escalation') {
|
if (h.oracle_ready === false) {
|
||||||
|
state = 'escalated';
|
||||||
|
iconCls = 'bi-database-x';
|
||||||
|
text = 'Oracle indisponibil';
|
||||||
|
tooltip = `Oracle indisponibil — importurile sunt oprite.\n`
|
||||||
|
+ `${h.oracle_last_error || ''}\n`
|
||||||
|
+ `Se reincearca automat la urmatorul sync. Apasa Start Sync pentru a reincerca acum.`;
|
||||||
|
} else if (h.escalation_phase || h.last_sync_status === 'halted_escalation') {
|
||||||
state = 'escalated';
|
state = 'escalated';
|
||||||
iconCls = 'bi-x-octagon-fill';
|
iconCls = 'bi-x-octagon-fill';
|
||||||
text = 'Blocat';
|
text = 'Blocat';
|
||||||
|
|||||||
@@ -865,9 +865,13 @@ async function renderOrderDetailModal(orderNumber, opts) {
|
|||||||
// Render compact header info (partner + addresses)
|
// Render compact header info (partner + addresses)
|
||||||
_renderHeaderInfo(order);
|
_renderHeaderInfo(order);
|
||||||
|
|
||||||
if (order.error_message) {
|
const detailErrEl = document.getElementById('detailError');
|
||||||
document.getElementById('detailError').textContent = order.error_message;
|
if (order.oracle_available === false) {
|
||||||
document.getElementById('detailError').style.display = '';
|
detailErrEl.textContent = '⚠ Oracle indisponibil — CODMAT-urile nu pot fi incarcate momentan. Reincearca dupa restabilirea conexiunii.';
|
||||||
|
detailErrEl.style.display = '';
|
||||||
|
} else if (order.error_message) {
|
||||||
|
detailErrEl.textContent = order.error_message;
|
||||||
|
detailErrEl.style.display = '';
|
||||||
}
|
}
|
||||||
|
|
||||||
// Configure footer action buttons BEFORE any early-return on items —
|
// Configure footer action buttons BEFORE any early-return on items —
|
||||||
|
|||||||
@@ -169,7 +169,7 @@
|
|||||||
|
|
||||||
<script>window.ROOT_PATH = "{{ rp }}";</script>
|
<script>window.ROOT_PATH = "{{ rp }}";</script>
|
||||||
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/js/bootstrap.bundle.min.js"></script>
|
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/js/bootstrap.bundle.min.js"></script>
|
||||||
<script src="{{ rp }}/static/js/shared.js?v=50"></script>
|
<script src="{{ rp }}/static/js/shared.js?v=51"></script>
|
||||||
<script>
|
<script>
|
||||||
// Dark mode toggle
|
// Dark mode toggle
|
||||||
function toggleDarkMode() {
|
function toggleDarkMode() {
|
||||||
|
|||||||
@@ -121,5 +121,5 @@
|
|||||||
{% endblock %}
|
{% endblock %}
|
||||||
|
|
||||||
{% block scripts %}
|
{% block scripts %}
|
||||||
<script src="{{ request.scope.get('root_path', '') }}/static/js/dashboard.js?v=52"></script>
|
<script src="{{ request.scope.get('root_path', '') }}/static/js/dashboard.js?v=53"></script>
|
||||||
{% endblock %}
|
{% endblock %}
|
||||||
|
|||||||
@@ -32,6 +32,9 @@ client = TestClient(app)
|
|||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture(autouse=True)
|
||||||
async def _reset():
|
async def _reset():
|
||||||
database.init_sqlite()
|
database.init_sqlite()
|
||||||
|
# Simulate Oracle up for health tests (no real pool in unit env).
|
||||||
|
_orig_pool = database.pool
|
||||||
|
database.pool = object()
|
||||||
db = await sqlite_service.get_sqlite()
|
db = await sqlite_service.get_sqlite()
|
||||||
try:
|
try:
|
||||||
await db.execute("DELETE FROM sync_phase_failures")
|
await db.execute("DELETE FROM sync_phase_failures")
|
||||||
@@ -40,6 +43,7 @@ async def _reset():
|
|||||||
finally:
|
finally:
|
||||||
await db.close()
|
await db.close()
|
||||||
yield
|
yield
|
||||||
|
database.pool = _orig_pool
|
||||||
|
|
||||||
|
|
||||||
async def _make_run(run_id: str, status: str = "completed", offset: int = 0,
|
async def _make_run(run_id: str, status: str = "completed", offset: int = 0,
|
||||||
@@ -108,3 +112,12 @@ async def test_health_one_phase_failure_still_warning_not_healthy():
|
|||||||
# 1 recent phase failure → is_healthy stays True (<=1 tolerance); healthy
|
# 1 recent phase failure → is_healthy stays True (<=1 tolerance); healthy
|
||||||
assert data["is_healthy"] is True
|
assert data["is_healthy"] is True
|
||||||
assert data["recent_phase_failures"]["invoice_check"] == 1
|
assert data["recent_phase_failures"]["invoice_check"] == 1
|
||||||
|
|
||||||
|
|
||||||
|
async def test_health_oracle_down_not_healthy():
|
||||||
|
await _make_run("ok-oracle", status="completed")
|
||||||
|
database.pool = None # simulate Oracle pool not initialized
|
||||||
|
r = client.get("/api/sync/health")
|
||||||
|
data = r.json()
|
||||||
|
assert data["oracle_ready"] is False
|
||||||
|
assert data["is_healthy"] is False
|
||||||
|
|||||||
Reference in New Issue
Block a user