feat(dr): add cluster + memory pre-flight, deploy VM 109 watchdog
DR test script now refuses to start VM 109 if:
* cluster is not quorate (e.g. mid-failover into a degraded state),
* available memory on the host is below VM 109's configured memory plus a 1 GB margin.
Neither check relies on hard-coded values: quorum is read from pvecm status,
and the memory threshold is computed from qm config, so resizing VM 109 does
not require touching the script.
Adds vm109-watchdog.sh, scheduled cluster-wide every minute. The
watchdog is the second line of defence behind the cleanup trap from
8a0c557: it force-stops VM 109 if the trap was bypassed (script
killed, host crash mid-test, manual run forgotten). It honours
/var/run/vm109-debug.flag for legitimate manual sessions and is
node-aware via /etc/pve/qemu-server/109.conf so it can be deployed
on every node without coordinating with VM 109's current location.
Both safeguards target the 04-18 → 04-20 incident chain: VM 109 was left
running for 2.5 days, then collided with an HA failover that pushed CT 108
Oracle (8 GB) onto pveelite (16 GB), which triggered an OOM cascade.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -358,14 +358,45 @@ run_dr_test() {
|
||||
local step_start=$(date +%s)
|
||||
log "STEP 1: Pre-flight checks"
|
||||
|
||||
# Check backups exist
|
||||
backup_count=$(find "$BACKUP_PATH" -maxdepth 1 -type f -name '*.BKP' 2>/dev/null | wc -l)
|
||||
# Check 1a: Cluster quorate and not degraded.
|
||||
# Refusing to test during a node outage prevents stacking VM 109 (6 GB)
|
||||
# on top of a host already absorbing failover load — the 04-20 trigger.
|
||||
local cluster_quorate
|
||||
cluster_quorate=$(pvecm status 2>/dev/null | awk '/Quorate:/ {print $2}')
|
||||
if [ "$cluster_quorate" != "Yes" ]; then
|
||||
track_step "Pre-flight checks" false "Cluster not quorate (degraded?)" "$step_start"
|
||||
test_result="FAILED - Cluster degraded"
|
||||
backup_count=0
|
||||
fi
|
||||
|
||||
if [ "$backup_count" -lt 2 ]; then
|
||||
# Check 1b: Memory headroom on this host. Calculated from VM 109 config
|
||||
# so it scales automatically if VM 109 memory is later resized.
|
||||
local dr_vm_mem_mb avail_mb min_free_mb
|
||||
dr_vm_mem_mb=$(qm config "$DR_VM_ID" 2>/dev/null | awk '/^memory:/ {print $2}')
|
||||
avail_mb=$(awk '/^MemAvailable:/ {print int($2/1024)}' /proc/meminfo)
|
||||
min_free_mb=$((dr_vm_mem_mb + 1024))
|
||||
|
||||
if [ "$test_result" != "FAILED - Cluster degraded" ] \
|
||||
&& [ "$avail_mb" -lt "$min_free_mb" ]; then
|
||||
track_step "Pre-flight checks" false \
|
||||
"Insufficient memory: ${avail_mb}MB available, need ${min_free_mb}MB" "$step_start"
|
||||
test_result="FAILED - Insufficient memory"
|
||||
backup_count=0
|
||||
fi
|
||||
|
||||
# Check 1c: Backups exist (only if previous checks passed)
|
||||
if [ "$test_result" = "FAILED" ]; then
|
||||
backup_count=$(find "$BACKUP_PATH" -maxdepth 1 -type f -name '*.BKP' 2>/dev/null | wc -l)
|
||||
fi
|
||||
|
||||
if [ "$test_result" != "FAILED" ]; then
|
||||
: # already failed in cluster/memory check, skip
|
||||
elif [ "$backup_count" -lt 2 ]; then
|
||||
track_step "Pre-flight checks" false "Insufficient backups (found: $backup_count)" "$step_start"
|
||||
test_result="FAILED - No backups"
|
||||
else
|
||||
track_step "Pre-flight checks" true "Found $backup_count backups" "$step_start"
|
||||
track_step "Pre-flight checks" true \
|
||||
"Found $backup_count backups, ${avail_mb}MB available" "$step_start"
|
||||
|
||||
# Step 2: Start VM
|
||||
step_start=$(date +%s)
|
||||
|
||||
Reference in New Issue
Block a user