feat(failover): add VM 201 manual failover + recovery scripts, watchdog alert
VM 201 (Windows critical) stays out of HA by design.

Added:
- failover-vm201.sh: interactive failover pvemini -> pveelite with ZFS replication state
- recover-vm201-to-pvemini.sh: interactive reverse migration with uptime + split-brain checks
- pvemini-down-alert.sh: cron watchdog on pveelite, emails full runbook after 2min DOWN

Replication RPO tightened: CT 108 + VM 201 to 5min, CT 171 to 15min.
CT 171 added to HA (ha-group-main) for continuous Claude Code access.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
79
proxmox/cluster/failover/failover-vm201.sh
Executable file
79
proxmox/cluster/failover/failover-vm201.sh
Executable file
@@ -0,0 +1,79 @@
|
||||
#!/bin/bash
# Manual failover of VM 201 (roacentral, Windows) to pveelite.
# Deployed at /opt/scripts/failover-vm201.sh on pveelite.
# Uses the most recent ZFS replica that is already present locally.
#
# Usage: failover-vm201.sh [--yes]
#   --yes   skip the interactive confirmation prompt
set -euo pipefail

# Fixed configuration; readonly so nothing later in the script can clobber it.
readonly VMID=201
readonly PRIMARY=pvemini
readonly PRIMARY_IP=10.0.20.201
readonly LOG=/var/log/failover-vm201.log
readonly MAIL_TO=mmarius28@gmail.com

# Node this script runs on. Assigned separately from `readonly` so that a
# failing `hostname` is not masked by the builtin's own exit status.
SECONDARY=$(hostname)
readonly SECONDARY
|
||||
|
||||
# Print a timestamped message to stdout and append the same line to $LOG.
# Arguments: message words (joined with single spaces).
# Returns:   always 0.
log() {
  local line
  line="[$(date '+%F %T')] $*"
  # Logging is best-effort: with `set -e` + `pipefail` a tee failure (log file
  # unwritable, disk full) would otherwise abort the failover half-way.
  # tee still copies the line to stdout when it cannot open $LOG.
  printf '%s\n' "$line" | tee -a "$LOG" || true
}
|
||||
# Record an abort reason through log (prefixed with "ABORT: ") and terminate
# the script with exit status 1.
# Arguments: free-form reason text.
die() {
  local reason="$*"
  log "ABORT: ${reason}"
  exit 1
}
|
||||
|
||||
# Preflight: refuse to run on any node other than pveelite, and only as root.
[[ "$SECONDARY" == "pveelite" ]] || die "Rulează pe pveelite, nu pe $SECONDARY"
[[ $EUID -eq 0 ]] || die "Trebuie root"

# Optional first argument; the literal "--yes" skips the confirmation prompt.
FORCE=${1:-}

log "=== Failover VM $VMID pornit ==="
|
||||
|
||||
# Check 1: the primary must really be down before a second copy of the VM is
# started here.
log "Verific $PRIMARY..."
if ping -c 2 -W 2 "$PRIMARY_IP" &>/dev/null; then
  # Host answers ICMP. If the cluster stack also answers over ssh the node is
  # alive and the VM must be stopped there by hand first.
  if ssh -o ConnectTimeout=5 -o BatchMode=yes "root@$PRIMARY_IP" 'pvecm status' &>/dev/null; then
    die "$PRIMARY răspunde la ping + ssh. NU fac failover. Oprește manual VM 201 pe $PRIMARY înainte."
  fi
  # Pingable but ssh/pvecm failed: carry on, but do not claim the node is DOWN.
  log "$PRIMARY ping OK dar ssh fail — posibil degradat"
else
  log "$PRIMARY confirmat DOWN"
fi
|
||||
|
||||
# Check 2: the VM must not already be running on this node.
# qm errors are discarded on purpose: a failing `qm status` is treated the same
# as "not running" and the script continues.
if qm status "$VMID" 2>/dev/null | grep -q running; then
  die "VM $VMID deja running pe $SECONDARY"
fi
|
||||
|
||||
# Check 3: the replicated disk must exist locally (last replication).
DISK="rpool/data/vm-$VMID-disk-3"
if ! zfs list "$DISK" &>/dev/null; then
  die "Disc vm-$VMID-disk-3 lipsește local. Replicare nefuncțională?"
fi

# Newest snapshot of the disk (name + creation time), sorted by creation.
# stderr is discarded, so a failure must be reported explicitly; otherwise
# `set -e` + `pipefail` would end the script without any message.
if ! LAST_SNAP=$(zfs list -t snapshot -o name,creation -s creation "$DISK" 2>/dev/null | tail -1); then
  die "Nu pot lista snapshot-urile pentru vm-$VMID-disk-3"
fi
# Keep later messages readable when the dataset has no snapshot at all.
[[ -n "$LAST_SNAP" ]] || LAST_SNAP="(niciun snapshot găsit)"
log "Ultima replicare: $LAST_SNAP"
|
||||
|
||||
# Interactive confirmation, skipped when the first argument is "--yes".
if [[ "$FORCE" != "--yes" ]]; then
  echo
  echo "========================================"
  echo "ATENȚIE: VM $VMID va porni pe $SECONDARY"
  echo "folosind ultima replicare ZFS."
  echo "Date pierdute = ce s-a scris după $LAST_SNAP"
  echo "========================================"
  # -r keeps backslashes literal. On EOF (no TTY, closed stdin) read returns
  # non-zero; treat that as "not confirmed" so the abort goes through die and
  # gets logged, instead of `set -e` ending the script silently.
  CONFIRM=""
  read -r -p "Continui? (tastează 'DA' pentru confirmare): " CONFIRM || CONFIRM=""
  [[ "$CONFIRM" == "DA" ]] || die "Anulat de utilizator"
fi
|
||||
|
||||
# Start the VM on this node.
#
# A node can only start guests whose config file lives under its own
# /etc/pve/nodes/<node>/qemu-server/ directory. Storage replication copies the
# disks, not the config, so after the primary dies the config still sits under
# the primary's directory and has to be moved first (documented Proxmox
# recovery procedure). The move is skipped when the config is already local.
SRC_CONF="/etc/pve/nodes/$PRIMARY/qemu-server/$VMID.conf"
DST_CONF="/etc/pve/nodes/$SECONDARY/qemu-server/$VMID.conf"
if [[ ! -e "$DST_CONF" && -e "$SRC_CONF" ]]; then
  log "Mut configurația VM $VMID de pe $PRIMARY pe $SECONDARY..."
  # /etc/pve is read-only without quorum, so the most likely failure is a
  # non-quorate cluster.
  mv -- "$SRC_CONF" "$DST_CONF" || die "Nu pot muta $SRC_CONF (cluster fără quorum?)"
fi

log "Pornesc VM $VMID pe $SECONDARY..."
# Report a failed start through die so it is logged instead of `set -e`
# terminating the script silently.
qm start "$VMID" || die "qm start $VMID a eșuat pe $SECONDARY"

# Give the guest a moment to come up, then record the state qm reports
# (second field of the `qm status` output).
sleep 5
STATUS=$(qm status "$VMID" | awk '{print $2}') || STATUS="necunoscut"
log "Status: $STATUS"
[[ "$STATUS" == "running" ]] || log "ATENȚIE: VM $VMID nu este running după pornire (status: $STATUS)"
|
||||
|
||||
# Mail: send the failover report. Best-effort: the VM has already been started
# at this point, so a failing `mail` is logged rather than allowed to abort
# the script (via `set -e` + `pipefail`) before the final log line.
# NOTE(review): step 1 below points at 10.0.20.201, which is also PRIMARY_IP
# in this script — confirm that this is really the address of the Windows guest.
if ! {
  echo "Failover VM $VMID executat pe $SECONDARY la $(date)"
  echo
  echo "Primary $PRIMARY era DOWN."
  echo "Ultima replicare folosită: $LAST_SNAP"
  echo "Status VM: $STATUS"
  echo
  echo "Pași următori:"
  echo "1. Verifică aplicația Windows pe 10.0.20.201 (IIS, etc)"
  echo "2. Când $PRIMARY revine, NU porni VM $VMID pe $PRIMARY"
  echo "3. După stabilizare, migrare înapoi: qm migrate $VMID $PRIMARY --online 0"
} | mail -r 'ups@romfast.ro' -s "[CRITIC] Failover VM $VMID pornit pe $SECONDARY" "$MAIL_TO"; then
  log "ATENȚIE: trimiterea mailului către $MAIL_TO a eșuat"
fi

log "=== Failover complet ==="
|
||||
Reference in New Issue
Block a user