#!/bin/bash
#
# VM 109 watchdog: stops VM 109 if running outside the DR test window.
#
# Why: incident 2026-04-18 — DR script crashed after starting VM 109 but
# before stopping it. Trap was added (commit 8a0c557) but only fires on
# script exit, not on system crash, kernel panic, or oomkill of the test
# script itself. This watchdog is the second line of defense.
#
# Behavior:
#   * If VM 109 is not running: exit silently.
#   * If VM 109 is running and uptime <= 60 min: exit silently (test running).
#   * If VM 109 is running, uptime > 60 min, debug flag absent, and we are
#     OUTSIDE Saturday 05:55-07:30 EEST: alert + stop VM 109.
#
# Debug exemption:
#   touch /var/run/vm109-debug.flag   # before manual debug
#   rm /var/run/vm109-debug.flag      # after debug
#
# Schedule (cron on the node hosting VM 109):
#   * * * * * /opt/scripts/vm109-watchdog.sh

set -euo pipefail
export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"

readonly DR_VM_ID="109"
readonly DEBUG_FLAG="/var/run/vm109-debug.flag"
readonly LOG="/var/log/oracle-dr/watchdog.log"
readonly MAX_RUNTIME_S=3600                     # 60 minutes outside test window
readonly TEST_WINDOW_START_MIN=$((5 * 60 + 55)) # Saturday 05:55
readonly TEST_WINDOW_END_MIN=$((7 * 60 + 30))   # Saturday 07:30

log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >>"$LOG"; }

# hm_to_minutes HH MM — minutes since midnight.
# The 10# base prefix is essential: date emits zero-padded "08"/"09",
# which bash arithmetic would otherwise reject as invalid octal constants
# and abort the script under set -e every day between 08:00 and 09:59.
hm_to_minutes() { echo $(( 10#$1 * 60 + 10#$2 )); }

# in_test_window DOW NOW_MIN — succeeds iff DOW is Saturday (date +%u: 6)
# and NOW_MIN lies inside the 05:55-07:30 window, inclusive.
in_test_window() {
  [ "$1" -eq 6 ] \
    && [ "$2" -ge "$TEST_WINDOW_START_MIN" ] \
    && [ "$2" -le "$TEST_WINDOW_END_MIN" ]
}

main() {
  # Skip silently if VM 109 config not on this node (cluster-aware).
  [ -f "/etc/pve/qemu-server/${DR_VM_ID}.conf" ] || return 0

  # Only create the log dir on the node that actually hosts the VM.
  mkdir -p "$(dirname "$LOG")"

  # Skip if not running. Match the full "status: running" line rather than
  # the bare word "running" to avoid accidental substring matches.
  qm status "$DR_VM_ID" 2>/dev/null | grep -q 'status: running' || return 0

  # Skip if a manual debug session is flagged.
  [ -f "$DEBUG_FLAG" ] && return 0

  # Get VM 109 uptime in seconds from the QEMU process etime.
  local pid_file="/var/run/qemu-server/${DR_VM_ID}.pid"
  [ -f "$pid_file" ] || return 0
  local vm_pid uptime_s
  vm_pid=$(cat "$pid_file")
  uptime_s=$(ps -p "$vm_pid" -o etimes= 2>/dev/null | tr -d ' ' || echo 0)
  # Guard against empty/garbage ps output so the -le test cannot blow up.
  [[ "$uptime_s" =~ ^[0-9]+$ ]] || uptime_s=0

  # Within first hour: assume normal test run, no action.
  [ "$uptime_s" -le "$MAX_RUNTIME_S" ] && return 0

  # One date call so day/hour/minute are mutually consistent (no race at
  # midnight between separate invocations).
  local dow hh mm now_min
  read -r dow hh mm <<<"$(date '+%u %H %M')"   # %u: 1=Mon ... 7=Sun, Sat=6
  now_min=$(hm_to_minutes "$hh" "$mm")

  # Inside Saturday test window: assume manual extended test — alert only.
  if in_test_window "$dow" "$now_min"; then
    log "VM ${DR_VM_ID} running ${uptime_s}s in test window — no action (alert sent)"
    echo "VM ${DR_VM_ID} running ${uptime_s}s during DR test window. Investigate." \
      | mail -s "[WARN] VM 109 long-running in test window" root 2>/dev/null || true
    return 0
  fi

  # Outside test window + uptime exceeded: alert and stop.
  log "VM ${DR_VM_ID} running ${uptime_s}s outside test window — stopping forcefully"

  # Latest replica snapshot for the alert; use DR_VM_ID instead of a
  # hardcoded "109" so the constant stays the single source of truth.
  local zfs_replica
  zfs_replica=$(zfs list -t snapshot 2>/dev/null \
    | awk -v pat="vm-${DR_VM_ID}-disk-1@" 'index($0, pat) {print $1}' \
    | tail -1 || true)
  # "unknown" must cover both "zfs failed" and "no matching snapshot";
  # the old `|| echo unknown` only fired on pipeline failure.
  : "${zfs_replica:=unknown}"

  # NOTE(review): the heredoc redirection was garbled in the previous
  # revision ("cat </dev/null || true"); reconstructed as mail-with-heredoc.
  # The phrase "consuming ${DR_VM_ID} memory" also looked like a mangled
  # variable reference and was reworded.
  mail -s "[ALERT] VM ${DR_VM_ID} stopped outside DR test window" root \
    2>/dev/null <<EOF || true
VM ${DR_VM_ID} (oracle-dr-windows) was running for ${uptime_s}s outside the
weekly DR test window (Saturday 05:55-07:30) on $(hostname).

This indicates the DR test script either crashed without invoking its
cleanup trap, or someone started VM ${DR_VM_ID} manually without setting
${DEBUG_FLAG}.

The watchdog is force-stopping VM ${DR_VM_ID} now to prevent another
04-20-style memory exhaustion if HA failover were to fire while VM
${DR_VM_ID} is consuming memory on this node.

Latest VM 109 ZFS replica: ${zfs_replica}
Watchdog log: ${LOG}

To run a manual test without watchdog interference:
  touch ${DEBUG_FLAG}
  qm start ${DR_VM_ID}
  # ... your work ...
  qm stop ${DR_VM_ID}
  rm ${DEBUG_FLAG}
EOF

  # Only log success when the stop actually succeeded (the old version
  # logged "Force stop completed" unconditionally).
  if qm stop "$DR_VM_ID" --skiplock --timeout 60 2>>"$LOG"; then
    log "Force stop completed"
  else
    log "qm stop failed for VM ${DR_VM_ID}"
  fi
}

main "$@"