feat(maintenance): guard DB + log growth (Option B + daily prune + rotation)
Root cause of the 2GB prod import.db: the sync_run_orders audit junction recorded every order on every run; under the 1-minute scheduler ~98% of 21.7M rows were no-op ALREADY_IMPORTED re-observations. NSSM stdout/stderr also grew unbounded (rotation never applied to the live service).

Changes:
- sqlite_service: skip ALREADY_IMPORTED rows in sync_run_orders (write-side guard, _SKIP_JUNCTION_STATUSES); add prune_sync_history(retention_days) with incremental_vacuum.
- maintenance_service (new): cleanup_old_logs + run_daily_maintenance.
- scheduler_service: start_maintenance_job (daily CronTrigger).
- main.py: RotatingFileHandler (sync_comenzi_current.log, 10MB x5) instead of a new timestamped file per start; schedule daily maintenance + one-shot catch-up at startup.
- scripts/db_maintenance.py (new): one-shot prune + VACUUM + log cleanup, plain sqlite3, invoked by deploy.ps1 while the service is stopped.
- deploy.ps1: stop -> run db_maintenance.py -> (re)apply NSSM AppRotate* idempotently -> start, so rotation reaches pre-existing services.

Retention defaults: 7 days history, 7 days logs.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -2,7 +2,7 @@ import json
|
||||
import logging
|
||||
import logging.handlers
|
||||
import os
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timedelta
|
||||
from zoneinfo import ZoneInfo
|
||||
from ..database import get_sqlite, get_sqlite_sync
|
||||
from ..constants import OrderStatus
|
||||
@@ -114,6 +114,45 @@ async def update_sync_run(run_id: str, status: str, total_orders: int = 0,
|
||||
await db.close()
|
||||
|
||||
|
||||
async def prune_sync_history(retention_days: int = 7) -> dict:
    """Delete sync_runs + sync_run_orders older than `retention_days`.

    Audit-only tables — `orders`/`order_items` (business data) are never touched.
    Frees pages via incremental_vacuum (prod DB is auto_vacuum=INCREMENTAL after
    the initial reclaim). See _SKIP_JUNCTION_STATUSES for the complementary
    write-side guard.

    Args:
        retention_days: Runs whose `started_at` date (first 10 characters,
            compared as a `YYYY-MM-DD` string) is strictly before
            "now in `_tz_bucharest` minus this many days" are removed.

    Returns:
        A dict with the keys ``cutoff`` (the `YYYY-MM-DD` boundary used),
        ``runs_deleted`` and ``junction_deleted`` (row counts reported by
        the two DELETE statements), intended for logging.
    """
    # NOTE(review): assumes sync_runs.started_at holds naive local timestamps
    # in the `_tz_bucharest` zone, starting with YYYY-MM-DD — confirm.
    now_local = datetime.now(_tz_bucharest).replace(tzinfo=None)
    cutoff = (now_local - timedelta(days=retention_days)).strftime("%Y-%m-%d")
    db = await get_sqlite()
    try:
        # Junction rows first: the subquery needs the parent runs to still exist.
        cur = await db.execute(
            "DELETE FROM sync_run_orders WHERE sync_run_id IN "
            "(SELECT run_id FROM sync_runs WHERE substr(started_at,1,10) < ?)",
            (cutoff,))
        junction_deleted = cur.rowcount
        cur = await db.execute(
            "DELETE FROM sync_runs WHERE substr(started_at,1,10) < ?", (cutoff,))
        runs_deleted = cur.rowcount
        # Drop phase-failure rows orphaned by the run deletion.
        await db.execute(
            "DELETE FROM sync_phase_failures "
            "WHERE run_id NOT IN (SELECT run_id FROM sync_runs)")
        await db.commit()
        try:
            # SQLite releases free pages only while this pragma's statement is
            # being stepped; execute() alone steps it once. Drain the cursor so
            # the whole freelist is actually returned to the filesystem.
            vacuum_cur = await db.execute("PRAGMA incremental_vacuum")
            await vacuum_cur.fetchall()
            await db.commit()
        except Exception as e:  # auto_vacuum may be OFF on a fresh dev DB
            # Best-effort: the rows are already deleted and committed above.
            logger.debug(f"prune_sync_history: incremental_vacuum skipped: {e}")
        logger.info(
            f"prune_sync_history: cutoff<{cutoff} runs_deleted={runs_deleted} "
            f"junction_deleted={junction_deleted}")
        return {"cutoff": cutoff, "runs_deleted": runs_deleted,
                "junction_deleted": junction_deleted}
    finally:
        await db.close()
|
||||
|
||||
|
||||
async def upsert_order(sync_run_id: str, order_number: str, order_date: str,
|
||||
customer_name: str, status: str, id_comanda: int = None,
|
||||
id_partener: int = None, error_message: str = None,
|
||||
@@ -171,8 +210,28 @@ async def upsert_order(sync_run_id: str, order_number: str, order_date: str,
|
||||
await db.close()
|
||||
|
||||
|
||||
# Audit junction policy (DB-size guard):
|
||||
# The sync_run_orders junction recorded EVERY order seen on EVERY run. Under the
|
||||
# 1-minute scheduler, ~98% of rows were no-op ALREADY_IMPORTED re-observations,
|
||||
# which grew the table to 21M+ rows / 2GB. We no longer record those: the order's
|
||||
# current state still lives in `orders`; the junction now only lists orders a run
|
||||
# actually touched (new / changed / skipped / errored / cancelled). Run-detail
|
||||
# views therefore show only meaningful orders per run.
|
||||
# Holds raw `.value`s (not enum members) because _record_in_junction tests the
# per-run status string against this set.
_SKIP_JUNCTION_STATUSES = {OrderStatus.ALREADY_IMPORTED.value}
|
||||
|
||||
|
||||
def _record_in_junction(status_at_run: str) -> bool:
    """Decide if a per-run status should be written to sync_run_orders.

    Returns False when `status_at_run` is one of _SKIP_JUNCTION_STATUSES,
    True for every other status.
    """
    is_skipped = status_at_run in _SKIP_JUNCTION_STATUSES
    return not is_skipped
|
||||
|
||||
|
||||
async def add_sync_run_order(sync_run_id: str, order_number: str, status_at_run: str):
|
||||
"""Record that this run processed this order (junction table)."""
|
||||
"""Record that this run processed this order (junction table).
|
||||
|
||||
No-op ALREADY_IMPORTED observations are skipped — see _SKIP_JUNCTION_STATUSES.
|
||||
"""
|
||||
if not _record_in_junction(status_at_run):
|
||||
return
|
||||
db = await get_sqlite()
|
||||
try:
|
||||
await db.execute("""
|
||||
@@ -258,10 +317,16 @@ async def _insert_orders_only(db, orders: list[dict]):
|
||||
if not orders:
|
||||
return
|
||||
await db.executemany(_ORDERS_UPSERT_SQL, [_orders_row(d) for d in orders])
|
||||
await db.executemany(
|
||||
"INSERT OR IGNORE INTO sync_run_orders (sync_run_id, order_number, status_at_run) VALUES (?, ?, ?)",
|
||||
[(d["sync_run_id"], d["order_number"], d.get("status_at_run", d["status"])) for d in orders],
|
||||
)
|
||||
junction_rows = [
|
||||
(d["sync_run_id"], d["order_number"], d.get("status_at_run", d["status"]))
|
||||
for d in orders
|
||||
if _record_in_junction(d.get("status_at_run", d["status"]))
|
||||
]
|
||||
if junction_rows:
|
||||
await db.executemany(
|
||||
"INSERT OR IGNORE INTO sync_run_orders (sync_run_id, order_number, status_at_run) VALUES (?, ?, ?)",
|
||||
junction_rows,
|
||||
)
|
||||
|
||||
|
||||
async def _insert_valid_batch(db, orders: list[dict]):
|
||||
@@ -273,10 +338,16 @@ async def _insert_valid_batch(db, orders: list[dict]):
|
||||
if not orders:
|
||||
return
|
||||
await db.executemany(_ORDERS_UPSERT_SQL, [_orders_row(d) for d in orders])
|
||||
await db.executemany(
|
||||
"INSERT OR IGNORE INTO sync_run_orders (sync_run_id, order_number, status_at_run) VALUES (?, ?, ?)",
|
||||
[(d["sync_run_id"], d["order_number"], d["status_at_run"]) for d in orders],
|
||||
)
|
||||
junction_rows = [
|
||||
(d["sync_run_id"], d["order_number"], d["status_at_run"])
|
||||
for d in orders
|
||||
if _record_in_junction(d["status_at_run"])
|
||||
]
|
||||
if junction_rows:
|
||||
await db.executemany(
|
||||
"INSERT OR IGNORE INTO sync_run_orders (sync_run_id, order_number, status_at_run) VALUES (?, ?, ?)",
|
||||
junction_rows,
|
||||
)
|
||||
|
||||
all_items: list[tuple] = []
|
||||
order_numbers_with_items: set = set()
|
||||
@@ -314,10 +385,11 @@ async def _insert_single_order(db, d: dict):
|
||||
Caller wraps in SAVEPOINT so a per-row failure doesn't poison the batch.
|
||||
"""
|
||||
await db.execute(_ORDERS_UPSERT_SQL, _orders_row(d))
|
||||
await db.execute(
|
||||
"INSERT OR IGNORE INTO sync_run_orders (sync_run_id, order_number, status_at_run) VALUES (?, ?, ?)",
|
||||
(d["sync_run_id"], d["order_number"], d["status_at_run"]),
|
||||
)
|
||||
if _record_in_junction(d["status_at_run"]):
|
||||
await db.execute(
|
||||
"INSERT OR IGNORE INTO sync_run_orders (sync_run_id, order_number, status_at_run) VALUES (?, ?, ?)",
|
||||
(d["sync_run_id"], d["order_number"], d["status_at_run"]),
|
||||
)
|
||||
raw_items = d.get("items", [])
|
||||
if raw_items:
|
||||
await db.execute("DELETE FROM order_items WHERE order_number = ?", (d["order_number"],))
|
||||
|
||||
Reference in New Issue
Block a user