From 62e9926bd472b4b1e110553a1ef0d43570ba348e Mon Sep 17 00:00:00 2001 From: Claude Agent Date: Sat, 25 Apr 2026 18:48:12 +0000 Subject: [PATCH] feat(dr): add cluster + memory pre-flight, deploy VM 109 watchdog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DR test script now refuses to start VM 109 if: * cluster is not quorate (e.g. mid-failover into a degraded state), * available memory on the host is below VM 109 config + 1 GB margin. Both checks scale automatically — memory threshold is computed from qm config so resizing VM 109 does not require touching the script. Adds vm109-watchdog.sh, scheduled cluster-wide every minute. The watchdog is the second line of defence behind the cleanup trap from 8a0c557: it force-stops VM 109 if the trap was bypassed (script killed, host crash mid-test, manual run forgotten). It honours /var/run/vm109-debug.flag for legitimate manual sessions and is node-aware via /etc/pve/qemu-server/109.conf so it can be deployed on every node without coordinating with VM 109's current location. Both safeguards target the 04-18 → 04-20 chain: VM 109 left running 2.5 days then sandwiched against an HA failover that pushed CT 108 Oracle (8 GB) onto pveelite (16 GB) → OOM cascade. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../scripts/vm109-watchdog.sh | 99 +++++++++++++++++++ .../scripts/weekly-dr-test-proxmox.sh | 39 +++++++- 2 files changed, 134 insertions(+), 4 deletions(-) create mode 100644 proxmox/vm109-windows-dr/scripts/vm109-watchdog.sh diff --git a/proxmox/vm109-windows-dr/scripts/vm109-watchdog.sh b/proxmox/vm109-windows-dr/scripts/vm109-watchdog.sh new file mode 100644 index 0000000..ae66c30 --- /dev/null +++ b/proxmox/vm109-windows-dr/scripts/vm109-watchdog.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# +# VM 109 watchdog: stops VM 109 if running outside the DR test window. +# +# Why: incident 2026-04-18 — DR script crashed after starting VM 109 but +# before stopping it. 
Trap was added (commit 8a0c557) but only fires on +# script exit, not on system crash, kernel panic, or oomkill of the test +# script itself. This watchdog is the second line of defense. +# +# Behavior: +# * If VM 109 is not running: exit silently. +# * If VM 109 is running and uptime <= 60 min: exit silently (test running). +# * If VM 109 is running, uptime > 60 min, debug flag absent, and we are +# OUTSIDE Saturday 05:55-07:30 EEST: alert + stop VM 109. +# +# Debug exemption: +# touch /var/run/vm109-debug.flag # before manual debug +# rm /var/run/vm109-debug.flag # after debug +# +# Schedule (cron on the node hosting VM 109): +# * * * * * /opt/scripts/vm109-watchdog.sh + +set -euo pipefail +export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" + +DR_VM_ID="109" +DEBUG_FLAG="/var/run/vm109-debug.flag" +LOG="/var/log/oracle-dr/watchdog.log" +MAX_RUNTIME_S=3600 # 60 minutes outside test window +TEST_WINDOW_START_MIN=$((5 * 60 + 55)) # Saturday 05:55 +TEST_WINDOW_END_MIN=$((7 * 60 + 30)) # Saturday 07:30 + +mkdir -p "$(dirname "$LOG")" + +log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >>"$LOG"; } + +# Skip silently if VM 109 config not on this node (cluster-aware). +[ -f /etc/pve/qemu-server/${DR_VM_ID}.conf ] || exit 0 + +# Skip if not running +qm status "$DR_VM_ID" 2>/dev/null | grep -q running || exit 0 + +# Skip if debug flag set +[ -f "$DEBUG_FLAG" ] && exit 0 + +# Get VM 109 uptime in seconds (process etime) +PID_FILE="/var/run/qemu-server/${DR_VM_ID}.pid" +[ -f "$PID_FILE" ] || exit 0 +VM_PID=$(cat "$PID_FILE") +UPTIME_S=$(ps -p "$VM_PID" -o etimes= 2>/dev/null | tr -d ' ' || echo 0) + +# Within first hour: assume normal test run, no action +[ "$UPTIME_S" -le "$MAX_RUNTIME_S" ] && exit 0 + +# Inside Saturday test window: assume manual extended test, alert but do not stop +DOW=$(date +%u) # 1=Mon ... 
7=Sun, Saturday=6
+NOW_MIN=$(( $(date +%H) * 60 + $(date +%M) ))
+if [ "$DOW" -eq 6 ] \
+    && [ "$NOW_MIN" -ge "$TEST_WINDOW_START_MIN" ] \
+    && [ "$NOW_MIN" -le "$TEST_WINDOW_END_MIN" ]; then
+    log "VM ${DR_VM_ID} running ${UPTIME_S}s in test window — no action (alert sent)"
+    echo "VM ${DR_VM_ID} running ${UPTIME_S}s during DR test window. Investigate." \
+        | mail -s "[WARN] VM 109 long-running in test window" root 2>/dev/null || true
+    exit 0
+fi
+
+# Outside test window + uptime exceeded: alert and stop
+log "VM ${DR_VM_ID} running ${UPTIME_S}s outside test window — stopping forcefully"
+
+ZFS_REPLICA=$(zfs list -t snapshot 2>/dev/null \
+    | awk '/vm-109-disk-1@/ {print $1}' | tail -1 || echo "unknown")
+
+cat <<EOF | mail -s "[CRITICAL] VM ${DR_VM_ID} force-stopped by watchdog on $(hostname)" root 2>/dev/null || true
+VM ${DR_VM_ID} (oracle-dr-windows) was running for ${UPTIME_S}s outside the
+weekly DR test window (Saturday 05:55-07:30) on $(hostname).
+
+This indicates the DR test script either crashed without invoking its
+cleanup trap, or someone started VM ${DR_VM_ID} manually without setting
+${DEBUG_FLAG}.
+
+The watchdog is force-stopping VM ${DR_VM_ID} now to prevent another
+04-20-style memory exhaustion if HA failover were to fire while VM 109
+is consuming its full configured memory.
+
+Latest VM 109 ZFS replica: ${ZFS_REPLICA}
+Watchdog log: ${LOG}
+
+To run a manual test without watchdog interference:
+    touch ${DEBUG_FLAG}
+    qm start ${DR_VM_ID}
+    # ... your work ...
+ qm stop ${DR_VM_ID} + rm ${DEBUG_FLAG} +EOF + +qm stop "$DR_VM_ID" --skiplock --timeout 60 2>>"$LOG" || \ + log "qm stop failed for VM ${DR_VM_ID}" + +log "Force stop completed" diff --git a/proxmox/vm109-windows-dr/scripts/weekly-dr-test-proxmox.sh b/proxmox/vm109-windows-dr/scripts/weekly-dr-test-proxmox.sh index 9d4af80..35fed7d 100644 --- a/proxmox/vm109-windows-dr/scripts/weekly-dr-test-proxmox.sh +++ b/proxmox/vm109-windows-dr/scripts/weekly-dr-test-proxmox.sh @@ -358,14 +358,45 @@ run_dr_test() { local step_start=$(date +%s) log "STEP 1: Pre-flight checks" - # Check backups exist - backup_count=$(find "$BACKUP_PATH" -maxdepth 1 -type f -name '*.BKP' 2>/dev/null | wc -l) + # Check 1a: Cluster quorate and not degraded. + # Refusing to test during a node outage prevents stacking VM 109 (6 GB) + # on top of a host already absorbing failover load — the 04-20 trigger. + local cluster_quorate + cluster_quorate=$(pvecm status 2>/dev/null | awk '/Quorate:/ {print $2}') + if [ "$cluster_quorate" != "Yes" ]; then + track_step "Pre-flight checks" false "Cluster not quorate (degraded?)" "$step_start" + test_result="FAILED - Cluster degraded" + backup_count=0 + fi - if [ "$backup_count" -lt 2 ]; then + # Check 1b: Memory headroom on this host. Calculated from VM 109 config + # so it scales automatically if VM 109 memory is later resized. 
+    local dr_vm_mem_mb avail_mb min_free_mb
+    dr_vm_mem_mb=$(qm config "$DR_VM_ID" 2>/dev/null | awk '/^memory:/ {print $2}')
+    avail_mb=$(awk '/^MemAvailable:/ {print int($2/1024)}' /proc/meminfo)
+    min_free_mb=$(( ${dr_vm_mem_mb:-6144} + 1024 ))  # fallback 6 GB (VM 109 size) if qm config yields nothing
+
+    if [ "$test_result" != "FAILED - Cluster degraded" ] \
+        && [ "$avail_mb" -lt "$min_free_mb" ]; then
+        track_step "Pre-flight checks" false \
+            "Insufficient memory: ${avail_mb}MB available, need ${min_free_mb}MB" "$step_start"
+        test_result="FAILED - Insufficient memory"
+        backup_count=0
+    fi
+
+    # Check 1c: Backups exist (only if 1a/1b passed — i.e. no "FAILED*" result yet)
+    if [ "${test_result#FAILED}" = "$test_result" ]; then
+        backup_count=$(find "$BACKUP_PATH" -maxdepth 1 -type f -name '*.BKP' 2>/dev/null | wc -l)
+    fi
+
+    if [ "${test_result#FAILED}" != "$test_result" ]; then
+        : # already failed in cluster/memory check, skip
+    elif [ "$backup_count" -lt 2 ]; then
         track_step "Pre-flight checks" false "Insufficient backups (found: $backup_count)" "$step_start"
         test_result="FAILED - No backups"
     else
-        track_step "Pre-flight checks" true "Found $backup_count backups" "$step_start"
+        track_step "Pre-flight checks" true \
+            "Found $backup_count backups, ${avail_mb}MB available" "$step_start"
 
         # Step 2: Start VM
         step_start=$(date +%s)