fix(dr-test): guard cleanup trap + surface qm start errors
The cleanup trap added in 8a0c557 stops VM 109 unconditionally on EXIT,
which kills the VM when the script is run with --install/--help or when an
operator has launched it manually for debugging. Gate the trap with
DR_VM_STARTED_BY_US so it only fires when the script itself started the VM.
Also remove the 2>/dev/null swallow on qm start so cross-node failures
(e.g. running on a node where the VM is not configured) appear in the
log instead of producing a silent "Failed to start VM 109" in 0 seconds.
Root cause of the 2026-04-25 silent failure: the cron job was on pveelite
while VM 109 had been migrated to pvemini, so qm start returned an error
that was hidden by the redirect.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -22,12 +22,16 @@
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Cleanup trap: ensure VM 109 is always stopped on script exit
|
||||
# Cleanup trap: stop VM 109 on script exit ONLY if this script started it.
|
||||
# Fixes incident 2026-04-20: script crashed at SSH step and left VM 109 running
|
||||
# for 2.5 days, causing OOM cascade on pveelite after pvemini HA failover.
|
||||
# Guard prevents the trap from killing an externally-running VM during
|
||||
# --install / --help or when an operator launched it manually for debugging.
|
||||
DR_VM_STARTED_BY_US=false
|
||||
cleanup_vm() {
|
||||
local rc=$?
|
||||
if qm status "${DR_VM_ID:-109}" 2>/dev/null | grep -q running; then
|
||||
if [ "$DR_VM_STARTED_BY_US" = "true" ] \
|
||||
&& qm status "${DR_VM_ID:-109}" 2>/dev/null | grep -q running; then
|
||||
echo "[trap] VM ${DR_VM_ID:-109} still running at exit (rc=$rc), forcing stop"
|
||||
qm stop "${DR_VM_ID:-109}" --skiplock 2>/dev/null || true
|
||||
fi
|
||||
@@ -367,7 +371,9 @@ run_dr_test() {
|
||||
step_start=$(date +%s)
|
||||
log "STEP 2: Starting DR VM"
|
||||
|
||||
if qm start "$DR_VM_ID" 2>/dev/null; then
|
||||
local qm_start_output
|
||||
if qm_start_output=$(qm start "$DR_VM_ID" 2>&1); then
|
||||
DR_VM_STARTED_BY_US=true
|
||||
vm_status_label="Running"
|
||||
|
||||
# Intelligent VM boot wait with polling (max 180s)
|
||||
@@ -526,7 +532,8 @@ run_dr_test() {
|
||||
vm_status_label="Stopped"
|
||||
|
||||
else
|
||||
track_step "VM Startup" false "Failed to start VM $DR_VM_ID" "$step_start"
|
||||
log_error "qm start $DR_VM_ID failed: $qm_start_output"
|
||||
track_step "VM Startup" false "Failed to start VM $DR_VM_ID: $qm_start_output" "$step_start"
|
||||
vm_status_label="Failed to start"
|
||||
fi
|
||||
fi
|
||||
|
||||
Reference in New Issue
Block a user