Merge fix/roa-mass-deletion-guard: ROA mass-deletion guard + Oracle pool auto-recovery

- Guard against falsely mass-marking DELETED_IN_ROA when ROA is recovering
- Oracle pool self-heals per sync cycle; status surfaced; order_detail no longer 500s

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Claude Agent
2026-06-26 07:41:21 +00:00
10 changed files with 194 additions and 32 deletions

View File

@@ -3,39 +3,59 @@ import aiosqlite
import sqlite3
import logging
import os
import threading
from datetime import datetime
from .config import settings
logger = logging.getLogger(__name__)
# ---- Oracle Pool ----
pool = None
_pool_lock = threading.Lock()
_pool_last_error = None # str — reason the last (re)init failed, or None
_pool_last_attempt = None # ISO str — when we last tried to (re)init
_client_initialized = False # init_oracle_client may only be called once/process
def init_oracle():
"""Initialize Oracle client mode and create connection pool."""
global pool
def _init_oracle_client_once():
"""Load the Oracle client library exactly once.
init_oracle_client() loads the thick-mode driver (it does NOT connect to the
DB), so it succeeds even when Oracle is down. Calling it a second time raises,
which on a pool re-init would wrongly fall back to thin mode — so we guard it.
"""
global _client_initialized
if _client_initialized:
return
force_thin = settings.FORCE_THIN_MODE
instantclient_path = settings.INSTANTCLIENTPATH
dsn = settings.ORACLE_DSN
# Ensure TNS_ADMIN is set as OS env var so oracledb can find tnsnames.ora
if settings.TNS_ADMIN:
os.environ['TNS_ADMIN'] = settings.TNS_ADMIN
logger.info(f"Oracle config: DSN={dsn}, TNS_ADMIN={settings.TNS_ADMIN or os.environ.get('TNS_ADMIN', '(not set)')}, INSTANTCLIENTPATH={instantclient_path or '(not set)'}")
logger.info(f"Oracle config: DSN={settings.ORACLE_DSN}, TNS_ADMIN={settings.TNS_ADMIN or os.environ.get('TNS_ADMIN', '(not set)')}, INSTANTCLIENTPATH={instantclient_path or '(not set)'}")
if force_thin:
logger.info(f"FORCE_THIN_MODE=true: thin mode for {dsn}")
logger.info(f"FORCE_THIN_MODE=true: thin mode for {settings.ORACLE_DSN}")
elif instantclient_path:
try:
oracledb.init_oracle_client(lib_dir=instantclient_path)
logger.info(f"Thick mode activated for {dsn}")
logger.info(f"Thick mode activated for {settings.ORACLE_DSN}")
except Exception as e:
logger.error(f"Thick mode error: {e}")
logger.info("Fallback to thin mode")
else:
logger.info(f"Thin mode (default) for {dsn}")
logger.info(f"Thin mode (default) for {settings.ORACLE_DSN}")
_client_initialized = True
def init_oracle():
"""Initialize Oracle client mode and create the connection pool. Raises on failure."""
global pool
_init_oracle_client_once()
pool = oracledb.create_pool(
user=settings.ORACLE_USER,
password=settings.ORACLE_PASSWORD,
@@ -44,9 +64,49 @@ def init_oracle():
max=4,
increment=1
)
logger.info(f"Oracle pool created for {dsn}")
logger.info(f"Oracle pool created for {settings.ORACLE_DSN}")
return pool
def ensure_oracle_pool(force: bool = False) -> bool:
    """Ensure the Oracle pool exists, (re)creating it if needed.

    Intended to be called at the start of every sync cycle so the app recovers
    on its own once Oracle is reachable again. The whole check-and-create
    sequence runs under ``_pool_lock``, so concurrent callers are serialized
    and at most one of them builds the pool.

    Args:
        force: When True, close and discard an existing pool and build a new
            one even if a pool is already present.

    Returns:
        True if a pool is available when the call returns, False if
        (re)creation failed. This function never raises for an init failure;
        the reason is stored in ``_pool_last_error`` and ``pool`` is left as
        None so callers can report a clear status instead of crashing.
    """
    global pool, _pool_last_error, _pool_last_attempt
    with _pool_lock:
        if pool is not None and not force:
            return True

        if pool is not None:
            # Forced re-init: drop the old pool. A close failure must not block
            # recovery, but it is logged so a leaked pool is not invisible.
            try:
                pool.close()
            except Exception as close_err:
                logger.warning(
                    f"Closing previous Oracle pool failed (discarding it anyway): {close_err}"
                )
            pool = None

        _pool_last_attempt = datetime.now().isoformat()
        try:
            init_oracle()
        except Exception as e:
            pool = None
            # Some exceptions stringify to ""; fall back to repr so the status
            # never looks like "no error" after a failed attempt.
            _pool_last_error = str(e) or repr(e)
            logger.error(f"Oracle pool init failed: {e}")
            return False

        _pool_last_error = None
        return True
def oracle_status() -> dict:
    """Return a point-in-time view of the Oracle pool state for health endpoints.

    Keys:
        ready: True when the module-level ``pool`` is set.
        last_error: Reason recorded for the last failed (re)init, or None.
        last_attempt_at: ISO timestamp string of the last (re)init attempt, or None.
    """
    is_ready = pool is not None
    return dict(
        ready=is_ready,
        last_error=_pool_last_error,
        last_attempt_at=_pool_last_attempt,
    )
def get_oracle_connection():
"""Get a connection from the Oracle pool."""
if pool is None:

View File

@@ -7,7 +7,7 @@ import logging
import os
from .config import settings
from .database import init_oracle, close_oracle, init_sqlite
from .database import ensure_oracle_pool, close_oracle, init_sqlite
# Configure logging with both stream and file handlers
_log_level = getattr(logging, settings.LOG_LEVEL.upper(), logging.INFO)
@@ -35,12 +35,10 @@ async def lifespan(app: FastAPI):
"""Startup and shutdown events."""
logger.info("Starting GoMag Import Manager...")
# Initialize Oracle pool
try:
init_oracle()
except Exception as e:
logger.error(f"Oracle init failed: {e}")
# Allow app to start even without Oracle for development
# Initialize Oracle pool (non-fatal: app still starts if Oracle is down;
# each sync cycle calls ensure_oracle_pool() and self-heals when it returns)
if not ensure_oracle_pool():
logger.error("Oracle pool not ready at startup — will retry on each sync cycle")
# Initialize SQLite
init_sqlite()

View File

@@ -190,8 +190,11 @@ async def sync_health():
counts = await sqlite_service.get_recent_phase_failures(limit=3)
escalation_phase = next((p for p, c in counts.items() if c >= 3), None)
ora = database.oracle_status()
is_healthy = (
last_status in (None, "completed")
ora["ready"]
and last_status in (None, "completed")
and escalation_phase is None
and sum(counts.values()) <= 1
)
@@ -203,6 +206,9 @@ async def sync_health():
"recent_phase_failures": counts,
"escalation_phase": escalation_phase,
"is_healthy": is_healthy,
"oracle_ready": ora["ready"],
"oracle_last_error": ora["last_error"],
"oracle_last_attempt_at": ora["last_attempt_at"],
}
@@ -422,10 +428,18 @@ async def order_detail(order_number: str):
return {"error": "Order not found"}
items = detail.get("items", [])
await _enrich_items_with_codmat(items)
oracle_available = True
try:
await _enrich_items_with_codmat(items)
except Exception as e:
# Oracle down (pool not initialized): still return the order with its
# items so the detail panel renders, just without CODMAT enrichment.
oracle_available = False
logger.warning(f"order_detail CODMAT enrich skipped (Oracle unavailable?): {e}")
# Enrich with invoice data
order = detail.get("order", {})
order["oracle_available"] = oracle_available
if order.get("factura_numar") and order.get("factura_data"):
order["invoice"] = {
"facturat": True,
@@ -846,10 +860,14 @@ async def refresh_invoices():
existing_ids = await asyncio.to_thread(
invoice_service.check_orders_exist, id_comanda_list
)
for o in all_imported:
if o["id_comanda"] not in existing_ids:
await sqlite_service.mark_order_deleted_in_roa(o["order_number"])
orders_deleted += 1
try:
to_delete = invoice_service.deletions_or_guard(all_imported, existing_ids)
except invoice_service.MassDeletionGuard as g:
logger.warning(f"Mass-deletion guard tripped during refresh: {g}")
to_delete = []
for o in to_delete:
await sqlite_service.mark_order_deleted_in_roa(o["order_number"])
orders_deleted += 1
# Cherry-pick A: Batch refresh Oracle addresses for all orders with stored address IDs
addr_rows = await sqlite_service.get_orders_with_address_ids()

View File

@@ -3,6 +3,39 @@ from .. import database
logger = logging.getLogger(__name__)
# ── Mass-deletion safety guard ──────────────────────────────────────────────
# If ROA appears to have lost a large fraction of its orders, it is almost
# certainly a transient/recovery state (e.g. the DB just restarted after a power
# loss and COMENZI hasn't finished recovering), NOT real deletions. In that case
# we refuse to mass-mark orders as DELETED_IN_ROA — a sticky, hard-to-reverse
# operation that nulls id_comanda. See incident 2026-06-26 (3794 false deletes).
# Share of imported orders that may be missing from ROA before we refuse to
# mark anything as deleted.
MASS_DELETION_ABORT_FRACTION = 0.30
# The guard only applies once at least this many imported orders are checked;
# below it, every missing order is returned regardless of the fraction.
MASS_DELETION_ABORT_MIN = 25


class MassDeletionGuard(Exception):
    """Raised when the number of orders that would be marked deleted is
    suspiciously high, indicating ROA is unavailable rather than truly purged."""


def deletions_or_guard(all_imported: list, existing_ids: set, *,
                       abort_fraction: float = None, abort_min: int = None) -> list:
    """Return the imported orders missing from ROA, or refuse if there are too many.

    Args:
        all_imported: Order dicts; each must have an ``id_comanda`` key.
        existing_ids: The ``id_comanda`` values confirmed to exist in ROA. Any
            iterable is accepted; non-set inputs are converted once so the
            membership test stays O(1) per order.
        abort_fraction: Optional override for MASS_DELETION_ABORT_FRACTION.
        abort_min: Optional override for MASS_DELETION_ABORT_MIN.

    Returns:
        The subset of ``all_imported`` (original order preserved) whose
        ``id_comanda`` is not in ``existing_ids``.

    Raises:
        MassDeletionGuard: If at least ``abort_min`` orders were checked and
            strictly more than ``abort_fraction`` of them are missing. A result
            that large is treated as a symptom of ROA being unavailable or
            recovering, not as genuine deletions, so nothing should be marked.
    """
    # Resolve defaults at call time so the module constants remain the single
    # source of truth (and can still be adjusted at runtime).
    if abort_fraction is None:
        abort_fraction = MASS_DELETION_ABORT_FRACTION
    if abort_min is None:
        abort_min = MASS_DELETION_ABORT_MIN

    if not isinstance(existing_ids, (set, frozenset)):
        existing_ids = set(existing_ids)

    missing = [o for o in all_imported if o["id_comanda"] not in existing_ids]
    total = len(all_imported)
    if total >= abort_min and len(missing) > total * abort_fraction:
        raise MassDeletionGuard(
            f"{len(missing)}/{total} comenzi par sterse din ROA "
            f"(>{round(abort_fraction * 100)}%) — posibil ROA "
            f"indisponibil/in recuperare; marcarea DELETED_IN_ROA a fost ANULATA"
        )
    return missing
def check_invoices_for_orders(id_comanda_list: list) -> dict:
"""Check which orders have been invoiced in Oracle (vanzari table).
@@ -68,7 +101,11 @@ def check_orders_exist(id_comanda_list: list) -> set:
for row in cur:
existing.add(row[0])
except Exception as e:
# Do NOT swallow: a partial/empty result on error would be misread by
# callers as "these orders were deleted in ROA" and trigger sticky
# DELETED_IN_ROA marking. Propagate so the caller skips deletion.
logger.warning(f"Order existence check failed: {e}")
raise
finally:
database.pool.release(conn)

View File

@@ -338,6 +338,26 @@ async def run_sync(id_pol: int = None, id_sectie: int = None, run_id: str = None
return {"run_id": run_id, "status": "halted_escalation", "error": halt_msg}
try:
# Phase -1: Ensure Oracle pool (auto-recovery after a DB restart).
# Done before the GoMag download so we don't waste API calls every
# cycle while Oracle is down, and so users get a clear status.
if not await asyncio.to_thread(database.ensure_oracle_pool):
last_err = database.oracle_status().get("last_error") or "fara detalii"
msg = ("Oracle indisponibil — pool neinitializat. Import oprit; "
"se reincearca automat la urmatorul ciclu de sync. "
f"Detalii: {last_err}")
_log_line(run_id, f"EROARE: {msg}")
await sqlite_service.create_sync_run(run_id, 0)
await sqlite_service.update_sync_run(
run_id, "failed", 0, 0, 0, 0, error_message=msg
)
if _current_sync:
_current_sync["status"] = "failed"
_current_sync["finished_at"] = _now().isoformat()
_current_sync["error"] = msg
_update_progress("failed", "Oracle indisponibil — import oprit")
return {"run_id": run_id, "status": "failed", "error": msg}
# Phase 0: Download orders from GoMag API
_update_progress("downloading", "Descărcare comenzi din GoMag API...")
_log_line(run_id, "Descărcare comenzi din GoMag API...")
@@ -1081,10 +1101,15 @@ async def run_sync(id_pol: int = None, id_sectie: int = None, run_id: str = None
existing_ids = await asyncio.to_thread(
invoice_service.check_orders_exist, id_comanda_list
)
for o in all_imported:
if o["id_comanda"] not in existing_ids:
await sqlite_service.mark_order_deleted_in_roa(o["order_number"])
orders_deleted += 1
try:
to_delete = invoice_service.deletions_or_guard(all_imported, existing_ids)
except invoice_service.MassDeletionGuard as g:
_log_line(run_id, f"⚠ Protectie stergeri: {g}")
await _record_phase_err(run_id, "mass_deletion_guard", g)
to_delete = []
for o in to_delete:
await sqlite_service.mark_order_deleted_in_roa(o["order_number"])
orders_deleted += 1
if invoices_updated:
_log_line(run_id, f"Facturi noi: {invoices_updated} comenzi facturate")

View File

@@ -184,7 +184,14 @@ function renderHealthPill(h) {
const recent = h.recent_phase_failures || {};
const recentCount = Object.values(recent).reduce((a, b) => a + (b || 0), 0);
if (h.escalation_phase || h.last_sync_status === 'halted_escalation') {
if (h.oracle_ready === false) {
state = 'escalated';
iconCls = 'bi-database-x';
text = 'Oracle indisponibil';
tooltip = `Oracle indisponibil — importurile sunt oprite.\n`
+ `${h.oracle_last_error || ''}\n`
+ `Se reincearca automat la urmatorul sync. Apasa Start Sync pentru a reincerca acum.`;
} else if (h.escalation_phase || h.last_sync_status === 'halted_escalation') {
state = 'escalated';
iconCls = 'bi-x-octagon-fill';
text = 'Blocat';

View File

@@ -865,9 +865,13 @@ async function renderOrderDetailModal(orderNumber, opts) {
// Render compact header info (partner + addresses)
_renderHeaderInfo(order);
if (order.error_message) {
document.getElementById('detailError').textContent = order.error_message;
document.getElementById('detailError').style.display = '';
const detailErrEl = document.getElementById('detailError');
if (order.oracle_available === false) {
detailErrEl.textContent = '⚠ Oracle indisponibil — CODMAT-urile nu pot fi incarcate momentan. Reincearca dupa restabilirea conexiunii.';
detailErrEl.style.display = '';
} else if (order.error_message) {
detailErrEl.textContent = order.error_message;
detailErrEl.style.display = '';
}
// Configure footer action buttons BEFORE any early-return on items —

View File

@@ -169,7 +169,7 @@
<script>window.ROOT_PATH = "{{ rp }}";</script>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/js/bootstrap.bundle.min.js"></script>
<script src="{{ rp }}/static/js/shared.js?v=50"></script>
<script src="{{ rp }}/static/js/shared.js?v=51"></script>
<script>
// Dark mode toggle
function toggleDarkMode() {

View File

@@ -121,5 +121,5 @@
{% endblock %}
{% block scripts %}
<script src="{{ request.scope.get('root_path', '') }}/static/js/dashboard.js?v=52"></script>
<script src="{{ request.scope.get('root_path', '') }}/static/js/dashboard.js?v=53"></script>
{% endblock %}

View File

@@ -32,6 +32,9 @@ client = TestClient(app)
@pytest.fixture(autouse=True)
async def _reset():
database.init_sqlite()
# Simulate Oracle up for health tests (no real pool in unit env).
_orig_pool = database.pool
database.pool = object()
db = await sqlite_service.get_sqlite()
try:
await db.execute("DELETE FROM sync_phase_failures")
@@ -40,6 +43,7 @@ async def _reset():
finally:
await db.close()
yield
database.pool = _orig_pool
async def _make_run(run_id: str, status: str = "completed", offset: int = 0,
@@ -108,3 +112,12 @@ async def test_health_one_phase_failure_still_warning_not_healthy():
# 1 recent phase failure → is_healthy stays True (<=1 tolerance); healthy
assert data["is_healthy"] is True
assert data["recent_phase_failures"]["invoice_check"] == 1
async def test_health_oracle_down_not_healthy():
    """A missing Oracle pool makes the health endpoint report unhealthy,
    even when the latest sync run completed."""
    await _make_run("ok-oracle", status="completed")
    # Simulate an uninitialized Oracle pool; the autouse fixture restores the
    # original value after the test.
    database.pool = None
    payload = client.get("/api/sync/health").json()
    assert payload["oracle_ready"] is False
    assert payload["is_healthy"] is False