diff --git a/proxmox/vm109-windows-dr/scripts/weekly-dr-test-proxmox.sh b/proxmox/vm109-windows-dr/scripts/weekly-dr-test-proxmox.sh index f52b187..9d4af80 100644 --- a/proxmox/vm109-windows-dr/scripts/weekly-dr-test-proxmox.sh +++ b/proxmox/vm109-windows-dr/scripts/weekly-dr-test-proxmox.sh @@ -22,12 +22,16 @@ set -euo pipefail -# Cleanup trap: ensure VM 109 is always stopped on script exit +# Cleanup trap: stop VM 109 on script exit ONLY if this script started it. # Fixes incident 2026-04-20: script crashed at SSH step and left VM 109 running # for 2.5 days, causing OOM cascade on pveelite after pvemini HA failover. +# Guard prevents the trap from killing an externally-running VM during +# --install / --help or when an operator launched it manually for debugging. +DR_VM_STARTED_BY_US=false cleanup_vm() { local rc=$? - if qm status "${DR_VM_ID:-109}" 2>/dev/null | grep -q running; then + if [ "$DR_VM_STARTED_BY_US" = "true" ] \ + && qm status "${DR_VM_ID:-109}" 2>/dev/null | grep -q running; then echo "[trap] VM ${DR_VM_ID:-109} still running at exit (rc=$rc), forcing stop" qm stop "${DR_VM_ID:-109}" --skiplock 2>/dev/null || true fi @@ -367,7 +371,9 @@ run_dr_test() { step_start=$(date +%s) log "STEP 2: Starting DR VM" - if qm start "$DR_VM_ID" 2>/dev/null; then + local qm_start_output + if qm_start_output=$(qm start "$DR_VM_ID" 2>&1); then + DR_VM_STARTED_BY_US=true vm_status_label="Running" # Intelligent VM boot wait with polling (max 180s) @@ -526,7 +532,8 @@ run_dr_test() { vm_status_label="Stopped" else - track_step "VM Startup" false "Failed to start VM $DR_VM_ID" "$step_start" + log_error "qm start $DR_VM_ID failed: $qm_start_output" + track_step "VM Startup" false "Failed to start VM $DR_VM_ID: $qm_start_output" "$step_start" vm_status_label="Failed to start" fi fi