feat(maintenance): guard DB + log growth (Option B + daily prune + rotation)

Root cause of the 2GB prod import.db: the sync_run_orders audit junction recorded every order on every run; under the 1-minute scheduler ~98% of 21.7M rows were no-op ALREADY_IMPORTED re-observations. NSSM stdout/stderr also grew unbounded (rotation never applied to the live service). Changes: - sqlite_service: skip ALREADY_IMPORTED rows in sync_run_orders (write-side guard, _SKIP_JUNCTION_STATUSES); add prune_sync_history(retention_days) with incremental_vacuum. - maintenance_service (new): cleanup_old_logs + run_daily_maintenance. - scheduler_service: start_maintenance_job (daily CronTrigger). - main.py: RotatingFileHandler (sync_comenzi_current.log, 10MB x5) instead of a new timestamped file per start; schedule daily maintenance + one-shot catch-up at startup. - scripts/db_maintenance.py (new): one-shot prune + VACUUM + log cleanup, plain sqlite3, invoked by deploy.ps1 while the service is stopped. - deploy.ps1: stop -> run db_maintenance.py -> (re)apply NSSM AppRotate* idempotently -> start, so rotation reaches pre-existing services. Retention defaults: 7 days history, 7 days logs. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-26 09:59:41 +00:00
parent ccc6a933fa
commit dcc5042586
6 changed files with 331 additions and 29 deletions
--- a/api/app/main.py
+++ b/api/app/main.py
@@ -1,9 +1,10 @@
 import asyncio
 from contextlib import asynccontextmanager
 from datetime import datetime
 from fastapi import FastAPI
 from fastapi.staticfiles import StaticFiles
 from pathlib import Path
 import logging
 import logging.handlers
 import os
 from .config import settings
@@ -19,8 +20,12 @@ _stream_handler.setFormatter(_formatter)
 _log_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'logs')
 os.makedirs(_log_dir, exist_ok=True)
-_log_filename = f"sync_comenzi_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
+# Rotating handler (10MB x 5 backups) instead of a new timestamped file per
-_file_handler = logging.FileHandler(os.path.join(_log_dir, _log_filename), encoding='utf-8')
+# start — caps log growth and stops file proliferation across restarts. Fixed
 # name still matches the QA glob `sync_comenzi_*.log`.
 _file_handler = logging.handlers.RotatingFileHandler(
    os.path.join(_log_dir, "sync_comenzi_current.log"),
    maxBytes=10 * 1024 * 1024, backupCount=5, encoding='utf-8')
 _file_handler.setFormatter(_formatter)
 _root_logger = logging.getLogger()
@@ -54,6 +59,15 @@ async def lifespan(app: FastAPI):
    except Exception:
        pass
    # Daily DB/log maintenance (prune audit history + cleanup old logs) + a
    # one-shot catch-up so a long-down service reclaims immediately on start.
    try:
        from .services import maintenance_service
        scheduler_service.start_maintenance_job()
        asyncio.create_task(maintenance_service.run_daily_maintenance())
    except Exception as e:
        logger.warning(f"Maintenance scheduling failed: {e}")
    logger.info("GoMag Import Manager started")
    yield
--- a/api/app/services/maintenance_service.py
+++ b/api/app/services/maintenance_service.py
@@ -0,0 +1,75 @@
 """Periodic maintenance: prune audit history + clean up old log files.
 Keeps the SQLite DB and the logs/ directory from growing unbounded. The audit
 tables (sync_runs, sync_run_orders) were the only DB growth source under the
 1-minute scheduler; business tables (orders, order_items) are never touched.
 The one-shot heavy reclaim (full VACUUM, run while the service is stopped) lives
 in scripts/db_maintenance.py and is invoked by deploy.ps1.
 """
 import logging
 import os
 import time
 logger = logging.getLogger(__name__)
 DEFAULT_HISTORY_RETENTION_DAYS = 7
 DEFAULT_LOG_RETENTION_DAYS = 7
 def _logs_dir() -> str:
    """Absolute path to the repo-root logs/ directory (matches main.py)."""
    here = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(os.path.abspath(os.path.join(here, "..", "..", "..")), "logs")
 def cleanup_old_logs(retention_days: int = DEFAULT_LOG_RETENTION_DAYS,
                     log_dir: str | None = None) -> int:
    """Delete log files older than `retention_days`. Returns count removed.
    Targets any file with `.log` in its name (covers `sync_comenzi_current.log`,
    NSSM `service_stdout.log`, and rotated backups like `*.log.3`). The live
    rotating files stay fresh (recent mtime) so they fall inside the window.
    """
    log_dir = log_dir or _logs_dir()
    if not os.path.isdir(log_dir):
        return 0
    cutoff = time.time() - retention_days * 86400
    removed = 0
    for name in os.listdir(log_dir):
        if ".log" not in name:
            continue
        path = os.path.join(log_dir, name)
        try:
            if os.path.isfile(path) and os.path.getmtime(path) < cutoff:
                os.remove(path)
                removed += 1
        except OSError as e:
            logger.warning(f"cleanup_old_logs: could not remove {name}: {e}")
    if removed:
        logger.info(f"cleanup_old_logs: removed {removed} file(s) older than "
                    f"{retention_days}d from {log_dir}")
    return removed
 async def run_daily_maintenance(
        history_days: int = DEFAULT_HISTORY_RETENTION_DAYS,
        log_days: int = DEFAULT_LOG_RETENTION_DAYS) -> dict:
    """Daily job: prune audit history (+reclaim pages) and clean old log files.
    Each step is isolated — a failure in one does not skip the other.
    """
    from . import sqlite_service
    result: dict = {}
    try:
        result["db"] = await sqlite_service.prune_sync_history(history_days)
    except Exception as e:
        logger.warning(f"run_daily_maintenance: prune_sync_history failed: {e}")
        result["db_error"] = str(e)
    try:
        result["logs_removed"] = cleanup_old_logs(log_days)
    except Exception as e:
        logger.warning(f"run_daily_maintenance: cleanup_old_logs failed: {e}")
        result["logs_error"] = str(e)
    return result
--- a/api/app/services/scheduler_service.py
+++ b/api/app/services/scheduler_service.py
@@ -1,6 +1,7 @@
 import logging
 from apscheduler.schedulers.asyncio import AsyncIOScheduler
 from apscheduler.triggers.interval import IntervalTrigger
 from apscheduler.triggers.cron import CronTrigger
 logger = logging.getLogger(__name__)
@@ -42,6 +43,31 @@ def start_scheduler(interval_minutes: int = 10):
    logger.info(f"Scheduler started with interval {interval_minutes}min")
 def start_maintenance_job(hour: int = 3):
    """Schedule the daily DB/log maintenance job (prune history + cleanup logs).
    Runs independently of the sync job — starts the scheduler if it isn't already
    running so maintenance happens even when auto-sync is disabled.
    """
    if _scheduler is None:
        init_scheduler()
    from . import maintenance_service
    _scheduler.add_job(
        maintenance_service.run_daily_maintenance,
        trigger=CronTrigger(hour=hour, minute=0),
        id="maintenance_job",
        name="Daily DB/Log Maintenance",
        replace_existing=True
    )
    if not _scheduler.running:
        _scheduler.start()
    logger.info(f"Maintenance job scheduled daily at {hour:02d}:00")
 def stop_scheduler():
    """Stop the scheduler."""
    global _is_running
--- a/api/app/services/sqlite_service.py
+++ b/api/app/services/sqlite_service.py
@@ -2,7 +2,7 @@ import json
 import logging
 import logging.handlers
 import os
-from datetime import datetime
+from datetime import datetime, timedelta
 from zoneinfo import ZoneInfo
 from ..database import get_sqlite, get_sqlite_sync
 from ..constants import OrderStatus
@@ -114,6 +114,45 @@ async def update_sync_run(run_id: str, status: str, total_orders: int = 0,
        await db.close()
 async def prune_sync_history(retention_days: int = 7) -> dict:
    """Delete sync_runs + sync_run_orders older than `retention_days`.
    Audit-only tables — `orders`/`order_items` (business data) are never touched.
    Frees pages via incremental_vacuum (prod DB is auto_vacuum=INCREMENTAL after
    the initial reclaim). Returns counts for logging. See _SKIP_JUNCTION_STATUSES
    for the complementary write-side guard.
    """
    cutoff = (datetime.now(_tz_bucharest).replace(tzinfo=None)
              - timedelta(days=retention_days)).strftime("%Y-%m-%d")
    db = await get_sqlite()
    try:
        cur = await db.execute(
            "DELETE FROM sync_run_orders WHERE sync_run_id IN "
            "(SELECT run_id FROM sync_runs WHERE substr(started_at,1,10) < ?)",
            (cutoff,))
        junction_deleted = cur.rowcount
        cur = await db.execute(
            "DELETE FROM sync_runs WHERE substr(started_at,1,10) < ?", (cutoff,))
        runs_deleted = cur.rowcount
        # Drop phase-failure rows orphaned by the run deletion.
        await db.execute(
            "DELETE FROM sync_phase_failures "
            "WHERE run_id NOT IN (SELECT run_id FROM sync_runs)")
        await db.commit()
        try:
            await db.execute("PRAGMA incremental_vacuum")
            await db.commit()
        except Exception as e:  # auto_vacuum may be OFF on a fresh dev DB
            logger.debug(f"prune_sync_history: incremental_vacuum skipped: {e}")
        logger.info(
            f"prune_sync_history: cutoff<{cutoff} runs_deleted={runs_deleted} "
            f"junction_deleted={junction_deleted}")
        return {"cutoff": cutoff, "runs_deleted": runs_deleted,
                "junction_deleted": junction_deleted}
    finally:
        await db.close()
 async def upsert_order(sync_run_id: str, order_number: str, order_date: str,
                       customer_name: str, status: str, id_comanda: int = None,
                       id_partener: int = None, error_message: str = None,
@@ -171,8 +210,28 @@ async def upsert_order(sync_run_id: str, order_number: str, order_date: str,
        await db.close()
 # Audit junction policy (DB-size guard):
 # The sync_run_orders junction recorded EVERY order seen on EVERY run. Under the
 # 1-minute scheduler, ~98% of rows were no-op ALREADY_IMPORTED re-observations,
 # which grew the table to 21M+ rows / 2GB. We no longer record those: the order's
 # current state still lives in `orders`; the junction now only lists orders a run
 # actually touched (new / changed / skipped / errored / cancelled). Run-detail
 # views therefore show only meaningful orders per run.
 _SKIP_JUNCTION_STATUSES = {OrderStatus.ALREADY_IMPORTED.value}
 def _record_in_junction(status_at_run: str) -> bool:
    """Whether this per-run status is worth persisting in sync_run_orders."""
    return status_at_run not in _SKIP_JUNCTION_STATUSES
 async def add_sync_run_order(sync_run_id: str, order_number: str, status_at_run: str):
-    """Record that this run processed this order (junction table)."""
+    """Record that this run processed this order (junction table).
    No-op ALREADY_IMPORTED observations are skipped — see _SKIP_JUNCTION_STATUSES.
    """
    if not _record_in_junction(status_at_run):
        return
    db = await get_sqlite()
    try:
        await db.execute("""
@@ -258,9 +317,15 @@ async def _insert_orders_only(db, orders: list[dict]):
    if not orders:
        return
    await db.executemany(_ORDERS_UPSERT_SQL, [_orders_row(d) for d in orders])
    junction_rows = [
        (d["sync_run_id"], d["order_number"], d.get("status_at_run", d["status"]))
        for d in orders
        if _record_in_junction(d.get("status_at_run", d["status"]))
    ]
    if junction_rows:
        await db.executemany(
            "INSERT OR IGNORE INTO sync_run_orders (sync_run_id, order_number, status_at_run) VALUES (?, ?, ?)",
-        [(d["sync_run_id"], d["order_number"], d.get("status_at_run", d["status"])) for d in orders],
+            junction_rows,
        )
@@ -273,9 +338,15 @@ async def _insert_valid_batch(db, orders: list[dict]):
    if not orders:
        return
    await db.executemany(_ORDERS_UPSERT_SQL, [_orders_row(d) for d in orders])
    junction_rows = [
        (d["sync_run_id"], d["order_number"], d["status_at_run"])
        for d in orders
        if _record_in_junction(d["status_at_run"])
    ]
    if junction_rows:
        await db.executemany(
            "INSERT OR IGNORE INTO sync_run_orders (sync_run_id, order_number, status_at_run) VALUES (?, ?, ?)",
-        [(d["sync_run_id"], d["order_number"], d["status_at_run"]) for d in orders],
+            junction_rows,
        )
    all_items: list[tuple] = []
@@ -314,6 +385,7 @@ async def _insert_single_order(db, d: dict):
    Caller wraps in SAVEPOINT so a per-row failure doesn't poison the batch.
    """
    await db.execute(_ORDERS_UPSERT_SQL, _orders_row(d))
    if _record_in_junction(d["status_at_run"]):
        await db.execute(
            "INSERT OR IGNORE INTO sync_run_orders (sync_run_id, order_number, status_at_run) VALUES (?, ?, ?)",
            (d["sync_run_id"], d["order_number"], d["status_at_run"]),
--- a/deploy.ps1
+++ b/deploy.ps1
@@ -431,25 +431,38 @@ if ($NssmExe) {
    $existingService = Get-Service -Name $ServiceName -ErrorAction SilentlyContinue
-    if ($existingService) {
+    if (-not $existingService) {
        Write-Info "Serviciu existent, restarteaza..."
        & $NssmExe restart $ServiceName
        Write-OK "Serviciu $ServiceName restartat"
    } else {
        Write-Info "Instalez serviciu $ServiceName cu NSSM..."
        & $NssmExe install $ServiceName (Join-Path $RepoPath "start.bat")
        & $NssmExe set $ServiceName AppDirectory $RepoPath
        & $NssmExe set $ServiceName DisplayName "GoMag Vending Import Manager"
        & $NssmExe set $ServiceName Description "Import comenzi web GoMag -> ROA Oracle"
        & $NssmExe set $ServiceName Start SERVICE_AUTO_START
    } else {
        Write-Info "Serviciu existent, il opresc pentru mentenanta..."
        & $NssmExe stop $ServiceName 2>$null
        Start-Sleep -Seconds 3
    }
    # Mentenanta DB + log-uri cu serviciul oprit: prune istoric (7z) + VACUUM
    # (reclaim disc) + cleanup log-uri (7z). Non-fatal daca esueaza.
    Write-Info "Mentenanta DB/log-uri (prune + VACUUM + cleanup)..."
    try {
        & $VenvPy (Join-Path $RepoPath "scripts\db_maintenance.py") --history-days 7 --log-days 7
    } catch {
        Write-Warn "Mentenanta DB a esuat (continui): $_"
    }
    # (Re)aplica config log/rotatie de fiecare data — idempotent, astfel rotatia
    # ajunge si pe serviciile instalate inainte ca aceste setari sa existe.
    & $NssmExe set $ServiceName AppStdout (Join-Path $RepoPath "logs\service_stdout.log")
    & $NssmExe set $ServiceName AppStderr (Join-Path $RepoPath "logs\service_stderr.log")
    & $NssmExe set $ServiceName AppRotateFiles 1
    & $NssmExe set $ServiceName AppRotateOnline 1
    & $NssmExe set $ServiceName AppRotateBytes 10485760
    & $NssmExe start $ServiceName
-        Write-OK "Serviciu $ServiceName instalat si pornit"
+    Write-OK "Serviciu $ServiceName pornit (mentenanta + rotatie aplicate)"
    }
 } else {
    # Fallback: Task Scheduler
--- a/scripts/db_maintenance.py
+++ b/scripts/db_maintenance.py
@@ -0,0 +1,102 @@
 #!/usr/bin/env python3
 """
 One-shot SQLite + log maintenance, invoked by deploy.ps1 while the GoMagVending
 service is stopped.
 What it does:
    1. Prune audit history older than --history-days (sync_runs, sync_run_orders,
       orphaned sync_phase_failures). Business tables (orders, order_items) are
       NEVER touched.
    2. Enable PRAGMA auto_vacuum=INCREMENTAL and run a full VACUUM to reclaim disk.
    3. Delete log files older than --log-days from logs/.
 Plain sqlite3 only — no app imports, no Oracle, no event loop — so it runs even
 if the app/Oracle env isn't set up.
 Usage:
    python scripts/db_maintenance.py                       # defaults: 7/7 days
    python scripts/db_maintenance.py --history-days 7 --log-days 7
    python scripts/db_maintenance.py --db C:\\path\\import.db
 """
 import argparse
 import os
 import sqlite3
 import sys
 import time
 from datetime import datetime, timedelta
 REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 DEFAULT_DB = os.path.join(REPO_ROOT, "api", "data", "import.db")
 DEFAULT_LOGS = os.path.join(REPO_ROOT, "logs")
 def prune_and_vacuum(db_path: str, history_days: int) -> None:
    cutoff = (datetime.now() - timedelta(days=history_days)).strftime("%Y-%m-%d")
    before = os.path.getsize(db_path) / 1048576.0
    conn = sqlite3.connect(db_path, timeout=120)
    try:
        cur = conn.cursor()
        cur.execute(
            "DELETE FROM sync_run_orders WHERE sync_run_id IN "
            "(SELECT run_id FROM sync_runs WHERE substr(started_at,1,10) < ?)",
            (cutoff,))
        junction = cur.rowcount
        cur.execute(
            "DELETE FROM sync_runs WHERE substr(started_at,1,10) < ?", (cutoff,))
        runs = cur.rowcount
        cur.execute(
            "DELETE FROM sync_phase_failures "
            "WHERE run_id NOT IN (SELECT run_id FROM sync_runs)")
        conn.commit()
        # auto_vacuum mode change only takes effect on the next VACUUM.
        conn.isolation_level = None
        conn.execute("PRAGMA auto_vacuum=INCREMENTAL")
        t0 = time.time()
        conn.execute("VACUUM")
        vac = time.time() - t0
    finally:
        conn.close()
    after = os.path.getsize(db_path) / 1048576.0
    print(f"[db_maintenance] cutoff<{cutoff} runs_deleted={runs} "
          f"junction_deleted={junction} size {before:.1f}MB -> {after:.1f}MB "
          f"(VACUUM {vac:.1f}s)")
 def cleanup_logs(log_dir: str, log_days: int) -> None:
    if not os.path.isdir(log_dir):
        print(f"[db_maintenance] logs dir not found: {log_dir}")
        return
    cutoff = time.time() - log_days * 86400
    removed = 0
    for name in os.listdir(log_dir):
        if ".log" not in name:
            continue
        path = os.path.join(log_dir, name)
        try:
            if os.path.isfile(path) and os.path.getmtime(path) < cutoff:
                os.remove(path)
                removed += 1
        except OSError as e:
            print(f"[db_maintenance] could not remove {name}: {e}")
    print(f"[db_maintenance] removed {removed} log file(s) older than {log_days}d")
 def main() -> int:
    ap = argparse.ArgumentParser(description="SQLite + log maintenance")
    ap.add_argument("--db", default=DEFAULT_DB)
    ap.add_argument("--logs-dir", default=DEFAULT_LOGS)
    ap.add_argument("--history-days", type=int, default=7)
    ap.add_argument("--log-days", type=int, default=7)
    args = ap.parse_args()
    if not os.path.exists(args.db):
        # Non-fatal: a fresh install may not have a DB yet.
        print(f"[db_maintenance] DB not found, skipping: {args.db}", file=sys.stderr)
    else:
        prune_and_vacuum(args.db, args.history_days)
    cleanup_logs(args.logs_dir, args.log_days)
    return 0
 if __name__ == "__main__":
    sys.exit(main())