Adds complete procedure for replacing UPS battery when entire cluster is powered by the same UPS, requiring full cluster shutdown. New files: - scripts/ups-maintenance-shutdown.sh: Automated orchestrated shutdown for maintenance operations with confirmation prompts and progress display - docs/UPS-BATTERY-REPLACEMENT.md: Complete step-by-step guide for battery replacement including pre-shutdown, physical replacement, and post-startup verification procedures Features: - Orchestrated shutdown: VMs → LXC containers → secondary nodes → primary - Interactive confirmation before shutdown - Color-coded progress indicators - Countdown timers for each phase - Post-replacement verification checklist - Troubleshooting guide for common issues - Recovery procedures for cluster/quorum problems The procedure accounts for all 3 cluster nodes (pve1, pvemini, pveelite) being on the same UPS, requiring complete infrastructure shutdown. Documentation includes: - When to replace battery (based on monthly test results) - Pre-planning and user notification templates - Physical battery replacement safety procedures - Cluster recovery and VM restart procedures - Post-replacement testing and verification - 24-hour and 1-week monitoring checklists Estimated maintenance window: 30-60 minutes 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
157 lines
5.2 KiB
Bash
157 lines
5.2 KiB
Bash
#!/bin/bash
#
# Orchestrated shutdown script for UPS maintenance.
# Used when the whole cluster has to be powered off to replace the battery.
#
# Author: Claude Code
# Date: 2025-10-06

# File that receives a copy of everything the script logs.
LOGFILE="/var/log/ups-maintenance.log"
# Secondary nodes, contacted over SSH as root: pve1, pveelite
# (adjust if the IP addresses are different).
NODES=("10.0.20.200" "10.0.20.202")

# Output colours. The values are stored as literal backslash sequences and
# only become terminal escapes when printed with `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
#######################################
# Print a timestamped message and append the same line to the log file.
# Globals:   LOGFILE (read) - path of the file that is appended to
# Arguments: the message; several arguments are joined with single spaces
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout and in LOGFILE
#######################################
log() {
  # LOGFILE is quoted so that a path containing spaces or glob characters
  # is passed to tee as exactly one file name.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOGFILE"
}
# Banner: say what the script is about to do and which nodes are affected.
echo -e "${BLUE}========================================${NC}"
echo -e "${BLUE}UPS MAINTENANCE - CLUSTER SHUTDOWN${NC}"
echo -e "${BLUE}========================================${NC}"
echo ""
echo -e "${YELLOW}⚠️ ATENȚIE: Acest script va opri TOATE nodurile cluster!${NC}"
echo -e "${YELLOW}⚠️ Folosit pentru mentenanță UPS (înlocuire baterie)${NC}"
echo ""
echo -e "${RED}Cluster nodes care vor fi oprite:${NC}"
echo " - pve1 (10.0.20.200)"
echo " - pveelite (10.0.20.202)"
echo " - pvemini (10.0.20.201) - ULTIMUL"
echo ""

# Only the exact answer 'DA' continues; any other input (including an empty
# line or end-of-file) aborts before anything is stopped.
# -r keeps backslashes in the typed answer literal.
read -r -p "Continui cu shutdown? (scrie 'DA' pentru confirmare): " confirm

if [[ "$confirm" != "DA" ]]; then
  echo -e "${RED}Anulat de utilizator.${NC}"
  exit 0
fi
# Mark the start of the run in the log, including who launched it.
log "========================================"
log "MAINTENANCE SHUTDOWN - START"
log "Initiated by: $(whoami)"
log "========================================"

# 1. Record the cluster status before anything is stopped.
log "Step 1: Verificare status cluster..."
# stderr is merged into the pipe so that a pvecm error message ends up in
# the log file too, the same way the script's ssh calls are captured.
pvecm status 2>&1 | tee -a "$LOGFILE"
echo ""
# 2. Ask every VM on every node to shut down gracefully.
log "Step 2: Oprire VM-uri pe TOATE nodurile..."

echo -e "${BLUE}Oprire VM-uri pe pvemini (local)...${NC}"
# First column of `qm list`, header line skipped, is the VM ID.
mapfile -t local_vmids < <(qm list | awk 'NR>1 {print $1}')
for vmid in "${local_vmids[@]}"; do
  vm_name=$(qm config "$vmid" | grep '^name:' | cut -d' ' -f2)
  # A pipeline reports the status of its last command (cut), so an
  # `|| echo` fallback never runs; default the empty value explicitly.
  vm_name=${vm_name:-VM-$vmid}
  vm_status=$(qm status "$vmid" | awk '{print $2}')

  if [[ "$vm_status" == "running" ]]; then
    log " Oprire VM $vmid ($vm_name) pe pvemini..."
    # Backgrounded so that all VMs shut down in parallel.
    qm shutdown "$vmid" --timeout 180 &
  else
    log " VM $vmid ($vm_name) deja oprit"
  fi
done

# Same procedure on the secondary nodes. The remote script is sent on
# stdin to `bash -s`; the quoted delimiter keeps every $ literal locally,
# and the node address is handed over as $1 for the progress message.
for node_ip in "${NODES[@]}"; do
  echo -e "${BLUE}Oprire VM-uri pe nod $node_ip...${NC}"
  ssh -o ConnectTimeout=5 "root@$node_ip" bash -s -- "$node_ip" <<'REMOTE_SCRIPT' 2>&1 | tee -a "$LOGFILE"
    for vmid in $(qm list | awk 'NR>1 {print $1}'); do
      vm_name=$(qm config "$vmid" | grep '^name:' | cut -d' ' -f2)
      vm_name=${vm_name:-VM-$vmid}
      vm_status=$(qm status "$vmid" | awk '{print $2}')

      if [[ "$vm_status" == "running" ]]; then
        echo " Oprire VM $vmid ($vm_name) pe $1..."
        qm shutdown "$vmid" --timeout 180 &
      fi
    done
REMOTE_SCRIPT
  # PIPESTATUS[0] is the status of ssh itself (tee always succeeds here);
  # without this check an unreachable node would be skipped silently.
  ssh_rc=${PIPESTATUS[0]}
  if (( ssh_rc != 0 )); then
    log "WARNING: SSH către $node_ip a eșuat (cod $ssh_rc)"
  fi
done

# Fixed grace period matching the --timeout given to `qm shutdown`.
log "Așteptare 3 minute pentru shutdown VM-uri..."
echo -e "${YELLOW}Aștept 180 secunde pentru oprirea graceful a VM-urilor...${NC}"
for i in {180..1}; do
  # \r rewrites the same terminal line to show a live countdown.
  echo -ne "\r${YELLOW}Rămas: $i secunde...${NC} "
  sleep 1
done
echo ""
# 3. Check that the local VMs are down; force-stop the ones that are not.
#    Only VMs on this node are inspected here.
log "Step 3: Verificare VM-uri oprite..."
# Third column of `qm list` is the VM status.
mapfile -t running_vms < <(qm list | awk 'NR>1 && $3=="running" {print $1}')
if (( ${#running_vms[@]} > 0 )); then
  # [*] joins the IDs with spaces so the warning stays on one log line.
  log "WARNING: VM-uri încă pornite pe pvemini: ${running_vms[*]}"
  echo -e "${YELLOW}WARNING: Unele VM-uri încă rulează. Oprire forțată în 30 secunde...${NC}"
  sleep 30
  for vmid in "${running_vms[@]}"; do
    # The list was taken before the 30 s pause; re-check so a VM that has
    # finished shutting down in the meantime is not reported as forced.
    if [[ "$(qm status "$vmid" | awk '{print $2}')" == "running" ]]; then
      log " Force stop VM $vmid"
      qm stop "$vmid"
    fi
  done
fi
# 4. Shut down the running LXC containers on this node (if there are any).
log "Step 4: Oprire containere LXC (dacă există)..."
# Second column of `pct list` is the container status. Errors from pct are
# discarded, so a failing `pct list` simply produces an empty list.
mapfile -t running_cts < <(pct list 2>/dev/null | awk 'NR>1 && $2=="running" {print $1}')
for ctid in "${running_cts[@]}"; do
  log " Oprire container $ctid"
  # Backgrounded so that all containers shut down in parallel.
  pct shutdown "$ctid" --timeout 60 &
done
# Pause only when a shutdown was actually requested: 70 s covers the 60 s
# timeout given to pct (the extra 10 s is presumably slack).
if (( ${#running_cts[@]} > 0 )); then
  sleep 70
fi
# 5. Schedule the shutdown of the secondary nodes.
log "Step 5: Oprire noduri secundare (pve1, pveelite)..."
for node_ip in "${NODES[@]}"; do
  log " Shutdown nod $node_ip în 2 minute..."
  echo -e "${RED}Shutdown nod $node_ip...${NC}"
  # `shutdown -h +2` schedules the halt two minutes from now on the remote
  # node. The ssh call runs in the background so that one unreachable node
  # does not delay the request to the next one.
  ssh -o ConnectTimeout=5 "root@$node_ip" \
    "shutdown -h +2 'UPS Maintenance - Battery Replacement'" 2>&1 \
    | tee -a "$LOGFILE" &
done

# 150 s = the 2-minute shutdown delay plus 30 s before moving on.
log "Așteptare 150 secunde pentru shutdown noduri secundare..."
echo -e "${YELLOW}Aștept 150 secunde pentru oprirea nodurilor secundare...${NC}"
for i in {150..1}; do
  # \r rewrites the same terminal line to show a live countdown.
  echo -ne "\r${YELLOW}Rămas: $i secunde până la shutdown pvemini...${NC} "
  sleep 1
done
echo ""
# 6. Shut down the local node (pvemini) - always LAST.
log "Step 6: Oprire pvemini (nod PRIMARY - ULTIMUL)..."
log "========================================"
log "MAINTENANCE SHUTDOWN - COMPLETE"
log "Nodurile secundare ar trebui să fie oprite."
log "pvemini se va opri în 2 minute."
log "========================================"

# Checklist for the operator: what to do once everything is powered off.
echo ""
echo -e "${GREEN}========================================${NC}"
echo -e "${GREEN}SHUTDOWN ORCHESTRAT FINALIZAT${NC}"
echo -e "${GREEN}========================================${NC}"
echo ""
echo -e "${YELLOW}URMĂTORII PAȘI:${NC}"
echo "1. Așteaptă 2 minute pentru oprirea completă a pvemini"
echo "2. Verifică că toate LED-urile nodurilor sunt stinse"
echo "3. Deconectează UPS de la priză"
echo "4. Înlocuiește bateria UPS"
echo "5. Reconectează UPS la priză"
echo "6. Pornește nodurile (apasă buton power sau WOL)"
echo "7. Verifică cluster cu: pvecm status"
echo ""
echo -e "${RED}pvemini se va opri în 2 minute!${NC}"
echo ""

# Schedule the halt two minutes from now. If scheduling fails the node
# stays up, so record the failure and exit non-zero instead of reporting
# success.
if ! shutdown -h +2 "UPS Maintenance - Battery Replacement - Primary Node"; then
  log "ERROR: programarea shutdown-ului pentru pvemini a eșuat"
  exit 1
fi

exit 0