VM 201 (Windows critical) stays out of HA by design. Added: - failover-vm201.sh: interactive failover pvemini -> pveelite with ZFS replication state - recover-vm201-to-pvemini.sh: interactive reverse migration with uptime + split-brain checks - pvemini-down-alert.sh: cron watchdog on pveelite, emails full runbook after 2min DOWN Replication RPO tightened: CT 108 + VM 201 to 5min, CT 171 to 15min. CT 171 added to HA (ha-group-main) for continuous Claude Code access. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
80 lines
2.6 KiB
Bash
Executable File
80 lines
2.6 KiB
Bash
Executable File
#!/bin/bash
# Manual failover of VM 201 (roacentral, Windows) onto pveelite.
# Deployed at /opt/scripts/failover-vm201.sh on pveelite.
# Starts the VM from the most recent locally replicated ZFS snapshot, so any
# data written on the primary after that snapshot is lost (RPO = replication
# interval, 5 min per the replication schedule).
#
# Usage: failover-vm201.sh [--yes]
#   --yes  skip the interactive confirmation prompt.
set -euo pipefail

# --- Constants (readonly: the rest of the script must not mutate them) ---
readonly VMID=201                          # Proxmox VMID of the critical Windows guest
readonly PRIMARY=pvemini                   # node that normally runs the VM
# NOTE(review): this matches the address the alert mail calls the Windows
# application IP — confirm it is really the pvemini node address and not the
# guest's, otherwise check 1 probes the wrong machine.
readonly PRIMARY_IP=10.0.20.201
SECONDARY=$(hostname)                      # must be pveelite (verified below)
readonly SECONDARY
readonly LOG=/var/log/failover-vm201.log   # append-only action log
readonly MAIL_TO=mmarius28@gmail.com       # operator alerted after failover
# log MESSAGE...: write a timestamped line to stdout and append it to $LOG.
log() {
  local stamp
  stamp=$(date '+%F %T')
  printf '[%s] %s\n' "$stamp" "$*" | tee -a "$LOG"
}

# die MESSAGE...: log an abort message (prefixed "ABORT:") and exit 1.
die() {
  log "ABORT: $*"
  exit 1
}
# Safety guards: must run as root, and only on the secondary node (pveelite).
if [[ "$SECONDARY" != "pveelite" ]]; then
  die "Rulează pe pveelite, nu pe $SECONDARY"
fi
if (( EUID != 0 )); then
  die "Trebuie root"
fi

# Optional first argument; "--yes" skips the interactive confirmation below.
FORCE=${1:-}
log "=== Failover VM $VMID pornit ==="

# Check 1: verify the primary node is actually unreachable before failing over.
# If the primary is healthy, VM 201 may still be running there, and starting it
# here as well would split-brain the replicated disk.
#
# Fix vs. original: the old code logged "$PRIMARY confirmat DOWN" even on the
# "ping OK but ssh fail" path, which is false — the node answered ping. The
# degraded case now logs only the degraded message and proceeds.
log "Verific $PRIMARY..."
if ping -c 2 -W 2 "$PRIMARY_IP" &>/dev/null; then
  # Node answers ping; probe the Proxmox cluster stack over ssh.
  if ssh -o ConnectTimeout=5 -o BatchMode=yes "root@$PRIMARY_IP" 'pvecm status' &>/dev/null; then
    die "$PRIMARY răspunde la ping + ssh. NU fac failover. Oprește manual VM 201 pe $PRIMARY înainte."
  fi
  log "$PRIMARY ping OK dar ssh fail — posibil degradat"
else
  log "$PRIMARY confirmat DOWN"
fi
# Check 2: refuse if the VM is already running on this node.
if qm status "$VMID" 2>/dev/null | grep -q running; then
  die "VM $VMID deja running pe $SECONDARY"
fi

# Check 3: the replicated disk must exist locally (last ZFS replication).
# NOTE(review): the disk name vm-201-disk-3 is hard-coded — confirm it still
# matches the VM's disk layout if the configuration ever changes.
if ! zfs list "rpool/data/vm-$VMID-disk-3" &>/dev/null; then
  die "Disc vm-$VMID-disk-3 lipsește local. Replicare nefuncțională?"
fi

# Most recent replication snapshot, sorted by creation time.
# Fix vs. original: '-H' suppresses the header line; without it, 'tail -1' on a
# dataset with zero snapshots returned the header text as a bogus "last
# replication". Also abort explicitly when no snapshot exists at all.
LAST_SNAP=$(zfs list -H -t snapshot -o name,creation -s creation "rpool/data/vm-$VMID-disk-3" 2>/dev/null | tail -n 1)
[[ -n "$LAST_SNAP" ]] || die "Niciun snapshot de replicare pentru vm-$VMID-disk-3"
log "Ultima replicare: $LAST_SNAP"
# Interactive confirmation (skipped with --yes). Makes the data-loss window
# explicit to the operator before starting the VM from a replica.
if [[ "$FORCE" != "--yes" ]]; then
  echo
  echo "========================================"
  echo "ATENȚIE: VM $VMID va porni pe $SECONDARY"
  echo "folosind ultima replicare ZFS."
  echo "Date pierdute = ce s-a scris după $LAST_SNAP"
  echo "========================================"
  # Fix vs. original: -r keeps backslashes in the typed answer literal (SC2162).
  read -rp "Continui? (tastează 'DA' pentru confirmare): " CONFIRM
  [[ "$CONFIRM" == "DA" ]] || die "Anulat de utilizator"
fi
# Start the VM from the last replicated disk state and capture its status.
log "Pornesc VM $VMID pe $SECONDARY..."
qm start "$VMID"
sleep 5  # give QEMU a moment to settle before sampling the state

qm_out=$(qm status "$VMID")
STATUS=$(awk '{print $2}' <<<"$qm_out")
log "Status: $STATUS"
# Alert mail: send the operator a full runbook for the post-failover steps.
# Here-doc body is expanded exactly like the original echo group.
mail -r 'ups@romfast.ro' -s "[CRITIC] Failover VM $VMID pornit pe $SECONDARY" "$MAIL_TO" <<EOF
Failover VM $VMID executat pe $SECONDARY la $(date)

Primary $PRIMARY era DOWN.
Ultima replicare folosită: $LAST_SNAP
Status VM: $STATUS

Pași următori:
1. Verifică aplicația Windows pe 10.0.20.201 (IIS, etc)
2. Când $PRIMARY revine, NU porni VM $VMID pe $PRIMARY
3. După stabilizare, migrare înapoi: qm migrate $VMID $PRIMARY --online 0
EOF

log "=== Failover complet ==="