fix(dr-test): guard cleanup trap + surface qm start errors

The cleanup trap added in 8a0c557 stopped VM 109 unconditionally on EXIT,
which killed the VM during --install/--help runs or when an operator had
launched it manually for debugging. Guard the stop with DR_VM_STARTED_BY_US
so the trap only stops the VM when the script itself started it.

Also stop discarding qm start's stderr (2>/dev/null) and capture its output
instead, so cross-node failures (e.g. running on a node where the VM is not
configured) appear in the log and the step result instead of producing a
silent "Failed to start VM 109" in 0 seconds.

Root cause for the 2026-04-25 silent failure: cron lived on pveelite
while VM 109 had been migrated to pvemini; qm start returned an error
that was hidden by the redirect.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Claude Agent
2026-04-25 08:47:54 +00:00
parent 8a0c557981
commit 2e8cd9ca59

View File

@@ -22,12 +22,16 @@
set -euo pipefail
# Cleanup trap: ensure VM 109 is always stopped on script exit
# Cleanup trap: stop VM 109 on script exit ONLY if this script started it.
# Fixes incident 2026-04-20: script crashed at SSH step and left VM 109 running
# for 2.5 days, causing OOM cascade on pveelite after pvemini HA failover.
# Guard prevents the trap from killing an externally-running VM during
# --install / --help or when an operator launched it manually for debugging.
DR_VM_STARTED_BY_US=false
cleanup_vm() {
local rc=$?
if qm status "${DR_VM_ID:-109}" 2>/dev/null | grep -q running; then
if [ "$DR_VM_STARTED_BY_US" = "true" ] \
&& qm status "${DR_VM_ID:-109}" 2>/dev/null | grep -q running; then
echo "[trap] VM ${DR_VM_ID:-109} still running at exit (rc=$rc), forcing stop"
qm stop "${DR_VM_ID:-109}" --skiplock 2>/dev/null || true
fi
@@ -367,7 +371,9 @@ run_dr_test() {
step_start=$(date +%s)
log "STEP 2: Starting DR VM"
if qm start "$DR_VM_ID" 2>/dev/null; then
local qm_start_output
if qm_start_output=$(qm start "$DR_VM_ID" 2>&1); then
DR_VM_STARTED_BY_US=true
vm_status_label="Running"
# Intelligent VM boot wait with polling (max 180s)
@@ -526,7 +532,8 @@ run_dr_test() {
vm_status_label="Stopped"
else
track_step "VM Startup" false "Failed to start VM $DR_VM_ID" "$step_start"
log_error "qm start $DR_VM_ID failed: $qm_start_output"
track_step "VM Startup" false "Failed to start VM $DR_VM_ID: $qm_start_output" "$step_start"
vm_status_label="Failed to start"
fi
fi