fix(dr-test): guard cleanup trap + surface qm start errors
The cleanup trap added in 8a0c557 stops VM 109 unconditionally on EXIT,
which kills an already-running VM during --install/--help runs or when an
operator has launched it manually for debugging. Gate the stop inside the
trap with DR_VM_STARTED_BY_US so the VM is only stopped when the script
itself started it.
Also replace the 2>/dev/null swallow on qm start with a capture of its
combined output (2>&1) so cross-node failures (e.g. running on a node where
the VM is not configured) appear in the log and in the step message instead
of producing a silent "Failed to start VM 109" in 0 seconds.
Root cause for the 2026-04-25 silent failure: cron lived on pveelite
while VM 109 had been migrated to pvemini; qm start returned an error
that was hidden by the redirect.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -22,12 +22,16 @@
|
|||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
# Cleanup trap: ensure VM 109 is always stopped on script exit
|
# Cleanup trap: stop VM 109 on script exit ONLY if this script started it.
|
||||||
# Fixes incident 2026-04-20: script crashed at SSH step and left VM 109 running
|
# Fixes incident 2026-04-20: script crashed at SSH step and left VM 109 running
|
||||||
# for 2.5 days, causing OOM cascade on pveelite after pvemini HA failover.
|
# for 2.5 days, causing OOM cascade on pveelite after pvemini HA failover.
|
||||||
|
# Guard prevents the trap from killing an externally-running VM during
|
||||||
|
# --install / --help or when an operator launched it manually for debugging.
|
||||||
|
DR_VM_STARTED_BY_US=false
|
||||||
cleanup_vm() {
|
cleanup_vm() {
|
||||||
local rc=$?
|
local rc=$?
|
||||||
if qm status "${DR_VM_ID:-109}" 2>/dev/null | grep -q running; then
|
if [ "$DR_VM_STARTED_BY_US" = "true" ] \
|
||||||
|
&& qm status "${DR_VM_ID:-109}" 2>/dev/null | grep -q running; then
|
||||||
echo "[trap] VM ${DR_VM_ID:-109} still running at exit (rc=$rc), forcing stop"
|
echo "[trap] VM ${DR_VM_ID:-109} still running at exit (rc=$rc), forcing stop"
|
||||||
qm stop "${DR_VM_ID:-109}" --skiplock 2>/dev/null || true
|
qm stop "${DR_VM_ID:-109}" --skiplock 2>/dev/null || true
|
||||||
fi
|
fi
|
||||||
@@ -367,7 +371,9 @@ run_dr_test() {
|
|||||||
step_start=$(date +%s)
|
step_start=$(date +%s)
|
||||||
log "STEP 2: Starting DR VM"
|
log "STEP 2: Starting DR VM"
|
||||||
|
|
||||||
if qm start "$DR_VM_ID" 2>/dev/null; then
|
local qm_start_output
|
||||||
|
if qm_start_output=$(qm start "$DR_VM_ID" 2>&1); then
|
||||||
|
DR_VM_STARTED_BY_US=true
|
||||||
vm_status_label="Running"
|
vm_status_label="Running"
|
||||||
|
|
||||||
# Intelligent VM boot wait with polling (max 180s)
|
# Intelligent VM boot wait with polling (max 180s)
|
||||||
@@ -526,7 +532,8 @@ run_dr_test() {
|
|||||||
vm_status_label="Stopped"
|
vm_status_label="Stopped"
|
||||||
|
|
||||||
else
|
else
|
||||||
track_step "VM Startup" false "Failed to start VM $DR_VM_ID" "$step_start"
|
log_error "qm start $DR_VM_ID failed: $qm_start_output"
|
||||||
|
track_step "VM Startup" false "Failed to start VM $DR_VM_ID: $qm_start_output" "$step_start"
|
||||||
vm_status_label="Failed to start"
|
vm_status_label="Failed to start"
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|||||||
Reference in New Issue
Block a user