fix(vm109-dr): trap cleanup to stop VM 109 on script exit
The DR test script used set -euo pipefail, so a failing SSH shutdown command caused the script to exit before qm stop. On 2026-04-20 this left VM 109 running for 2.5 days and triggered an OOM cascade when pvemini HA-failed over to pveelite.

Adds EXIT trap that force-stops VM 109 regardless of exit path, and makes the Step 7 SSH shutdown tolerant of failure.

Incident details: proxmox/cluster/incidents/2026-04-20-cluster-outage.md

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -22,6 +22,19 @@
|
||||
|
||||
# Strict mode: abort on an unhandled command failure (-e), on use of an unset
# variable (-u), and treat a pipeline as failed if any stage fails (pipefail).
set -euo pipefail
|
||||
|
||||
# Cleanup trap: ensure the DR VM (DR_VM_ID, default 109) is stopped on script exit.
# Fixes incident 2026-04-20: script crashed at SSH step and left VM 109 running
# for 2.5 days, causing OOM cascade on pveelite after pvemini HA failover.
#
# Globals:   DR_VM_ID (read; falls back to 109 when unset or empty)
# Outputs:   a "[trap]" notice on stdout when a stop is forced; a "[trap]"
#            error on stderr when the forced stop itself fails
# Returns:   does not return; exits with the status the script was exiting with
cleanup_vm() {
  # Must be the first statement: captures the exit status that fired the trap.
  local rc=$?
  local vm_id="${DR_VM_ID:-109}"
  local vm_state
  local stop_rc=0

  # Capture the status text instead of piping into `grep -q`: under
  # `set -o pipefail` an early grep exit can make the whole pipeline report
  # failure, which would wrongly skip the stop. A failing `qm status` is
  # treated as "not running", as before.
  vm_state=$(qm status "$vm_id" 2>/dev/null) || vm_state=""

  if [[ "$vm_state" == *running* ]]; then
    echo "[trap] VM ${vm_id} still running at exit (rc=$rc), forcing stop"
    # Best effort: never let a failed stop change the script's exit status,
    # but do not hide it either — a silent failure here is how the VM was
    # left running during the incident.
    qm stop "$vm_id" --skiplock 2>/dev/null || stop_rc=$?
    if (( stop_rc != 0 )); then
      echo "[trap] ERROR: qm stop ${vm_id} failed (rc=${stop_rc}); VM may still be running" >&2
    fi
  fi

  # Preserve the original exit status for cron / callers.
  exit "$rc"
}
|
||||
# Register the handler on EXIT so it runs on every exit path of the script,
# including aborts triggered by `set -e`.
trap cleanup_vm EXIT
|
||||
|
||||
# Set an explicit PATH for cron execution, so commands resolve the same way
# regardless of the environment the scheduler provides.
# NOTE(review): presumably needed for the sbin-located tools this script calls — confirm.
|
||||
export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
|
||||
|
||||
@@ -504,7 +517,8 @@ run_dr_test() {
|
||||
step_start=$(date +%s)
|
||||
log "STEP 7: Shutting down VM"
|
||||
|
||||
ssh -p "$DR_VM_PORT" "$DR_VM_USER@$DR_VM_IP" "shutdown /s /t 30" 2>/dev/null
|
||||
ssh -p "$DR_VM_PORT" -o ConnectTimeout=10 "$DR_VM_USER@$DR_VM_IP" "shutdown /s /t 30" 2>/dev/null \
|
||||
|| log_warning "SSH shutdown failed, will force qm stop"
|
||||
sleep 60
|
||||
qm stop "$DR_VM_ID" 2>/dev/null
|
||||
|
||||
|
||||
Reference in New Issue
Block a user