From 2e8cd9ca59d35bcef723075ddae0a1f65d7e4a47 Mon Sep 17 00:00:00 2001
From: Claude Agent
Date: Sat, 25 Apr 2026 08:47:54 +0000
Subject: [PATCH] fix(dr-test): guard cleanup trap + surface qm start errors

The cleanup trap added in 8a0c557 stopped VM 109 unconditionally on EXIT,
which kills the VM during --install/--help or when an operator launched
it manually for debugging. Gate the trap with DR_VM_STARTED_BY_US so it
only fires when the script itself started the VM.

Also remove the 2>/dev/null swallow on qm start so cross-node failures
(e.g. running on a node where the VM is not configured) appear in the
log instead of producing a silent "Failed to start VM 109" in 0 seconds.

Root cause for the 2026-04-25 silent failure: cron lived on pveelite
while VM 109 had been migrated to pvemini; qm start returned an error
that was hidden by the redirect.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../scripts/weekly-dr-test-proxmox.sh | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/proxmox/vm109-windows-dr/scripts/weekly-dr-test-proxmox.sh b/proxmox/vm109-windows-dr/scripts/weekly-dr-test-proxmox.sh
index f52b187..9d4af80 100644
--- a/proxmox/vm109-windows-dr/scripts/weekly-dr-test-proxmox.sh
+++ b/proxmox/vm109-windows-dr/scripts/weekly-dr-test-proxmox.sh
@@ -22,12 +22,16 @@
 
 set -euo pipefail
 
-# Cleanup trap: ensure VM 109 is always stopped on script exit
+# Cleanup trap: stop VM 109 on script exit ONLY if this script started it.
 # Fixes incident 2026-04-20: script crashed at SSH step and left VM 109 running
 # for 2.5 days, causing OOM cascade on pveelite after pvemini HA failover.
+# Guard prevents the trap from killing an externally-running VM during
+# --install / --help or when an operator launched it manually for debugging.
+DR_VM_STARTED_BY_US=false
 cleanup_vm() {
     local rc=$?
-    if qm status "${DR_VM_ID:-109}" 2>/dev/null | grep -q running; then
+    if [ "$DR_VM_STARTED_BY_US" = "true" ] \
+        && qm status "${DR_VM_ID:-109}" 2>/dev/null | grep -q running; then
         echo "[trap] VM ${DR_VM_ID:-109} still running at exit (rc=$rc), forcing stop"
         qm stop "${DR_VM_ID:-109}" --skiplock 2>/dev/null || true
     fi
@@ -367,7 +371,9 @@ run_dr_test() {
         step_start=$(date +%s)
         log "STEP 2: Starting DR VM"
 
-        if qm start "$DR_VM_ID" 2>/dev/null; then
+        local qm_start_output
+        if qm_start_output=$(qm start "$DR_VM_ID" 2>&1); then
+            DR_VM_STARTED_BY_US=true
             vm_status_label="Running"
 
             # Intelligent VM boot wait with polling (max 180s)
@@ -526,7 +532,8 @@ run_dr_test() {
 
             vm_status_label="Stopped"
         else
-            track_step "VM Startup" false "Failed to start VM $DR_VM_ID" "$step_start"
+            log_error "qm start $DR_VM_ID failed: $qm_start_output"
+            track_step "VM Startup" false "Failed to start VM $DR_VM_ID: $qm_start_output" "$step_start"
             vm_status_label="Failed to start"
         fi
     fi