Add UPS battery replacement procedure and maintenance shutdown script

Adds a complete procedure for replacing the UPS battery when the entire cluster
is powered by the same UPS, which requires a full cluster shutdown.

New files:
- scripts/ups-maintenance-shutdown.sh: Automated orchestrated shutdown
  for maintenance operations with confirmation prompts and progress display
- docs/UPS-BATTERY-REPLACEMENT.md: Complete step-by-step guide for battery
  replacement including pre-shutdown, physical replacement, and post-startup
  verification procedures

Features:
- Orchestrated shutdown: VMs → LXC containers → secondary nodes → primary
- Interactive confirmation before shutdown
- Color-coded progress indicators
- Countdown timers for each phase
- Post-replacement verification checklist
- Troubleshooting guide for common issues
- Recovery procedures for cluster/quorum problems

The procedure accounts for all 3 cluster nodes (pve1, pvemini, pveelite)
being on the same UPS, requiring complete infrastructure shutdown.

Documentation includes:
- When to replace battery (based on monthly test results)
- Pre-planning and user notification templates
- Physical battery replacement safety procedures
- Cluster recovery and VM restart procedures
- Post-replacement testing and verification
- 24-hour and 1-week monitoring checklists

Estimated maintenance window: 30-60 minutes

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Marius
2025-10-06 21:46:28 +03:00
parent 87b9709a0d
commit cc72a5f96e
3 changed files with 681 additions and 2 deletions

View File

@@ -0,0 +1,156 @@
#!/bin/bash
#
# Orchestrated shutdown script for UPS maintenance.
# Used when the whole cluster has to be powered off to replace the battery.
#
# Author: Claude Code
# Date: 2025-10-06

# File that receives a copy of every logged line.
LOGFILE='/var/log/ups-maintenance.log'

# Secondary nodes reached over SSH, in order: pve1, pveelite
# (adjust if the IP addresses are different).
NODES=(
  '10.0.20.200'
  '10.0.20.202'
)

# ANSI color escape sequences for terminal output. They are stored as
# literal backslash sequences and only interpreted when printed.
NC='\033[0m' # No Color (reset)
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
RED='\033[0;31m'
#######################################
# Print a timestamped message and append the same line to the log file.
# Globals:   LOGFILE (read) - path of the file the line is appended to
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout and in LOGFILE
#######################################
log() {
  # LOGFILE is quoted so a path containing spaces or glob characters reaches
  # tee as exactly one file name instead of being split into several.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOGFILE"
}
# Show the warning banner, list the nodes that are about to be powered off,
# and require the operator to type the literal word DA before continuing.
echo -e "${BLUE}========================================${NC}"
echo -e "${BLUE}UPS MAINTENANCE - CLUSTER SHUTDOWN${NC}"
echo -e "${BLUE}========================================${NC}"
echo ""
echo -e "${YELLOW}⚠️ ATENȚIE: Acest script va opri TOATE nodurile cluster!${NC}"
echo -e "${YELLOW}⚠️ Folosit pentru mentenanță UPS (înlocuire baterie)${NC}"
echo ""
echo -e "${RED}Cluster nodes care vor fi oprite:${NC}"
echo " - pve1 (10.0.20.200)"
echo " - pveelite (10.0.20.202)"
echo " - pvemini (10.0.20.201) - ULTIMUL"
echo ""

# -r keeps backslashes literal, so only the exact answer DA confirms; without
# it an answer such as D\A would be collapsed to DA and accepted.
read -r -p "Continui cu shutdown? (scrie 'DA' pentru confirmare): " confirm
if [[ "$confirm" != "DA" ]]; then
  # Anything else (including empty input or EOF) cancels without side effects.
  echo -e "${RED}Anulat de utilizator.${NC}"
  exit 0
fi
# Record the start of the maintenance run and the user who launched it.
log "========================================"
log "MAINTENANCE SHUTDOWN - START"
log "Initiated by: $(whoami)"
log "========================================"

# 1. Capture the cluster status before anything is stopped
log "Step 1: Verificare status cluster..."
# LOGFILE is quoted so the path is passed to tee as a single argument.
pvecm status | tee -a "$LOGFILE"
echo ""
# 2. Shut down the VMs on every node
log "Step 2: Oprire VM-uri pe TOATE nodurile..."
echo -e "${BLUE}Oprire VM-uri pe pvemini (local)...${NC}"

# The first column of `qm list` (header line skipped) holds the VM IDs.
mapfile -t local_vmids < <(qm list | awk 'NR>1 && NF {print $1}')
for vmid in "${local_vmids[@]}"; do
  vm_name=$(qm config "$vmid" | awk '/^name:/ {print $2}')
  # Fall back to a synthetic name when the config has no "name:" entry. A
  # trailing `|| echo` on the pipeline cannot do this, because the pipeline's
  # status is that of its last stage, which succeeds even with no match.
  vm_name=${vm_name:-VM-$vmid}
  vm_status=$(qm status "$vmid" | awk '{print $2}')
  if [[ "$vm_status" == "running" ]]; then
    log " Oprire VM $vmid ($vm_name) pe pvemini..."
    # Run in the background so all guests shut down in parallel.
    qm shutdown "$vmid" --timeout 180 &
  else
    log " VM $vmid ($vm_name) deja oprit"
  fi
done
# Shut down the VMs on the secondary nodes
for node_ip in "${NODES[@]}"; do
  echo -e "${BLUE}Oprire VM-uri pe nod $node_ip...${NC}"
  # The command string is expanded locally before it is sent: every \$ is
  # deferred to the remote shell, while $node_ip is substituted here.
  # The remote loop mirrors the local one, including the VM-<id> fallback
  # for guests whose config has no "name:" entry.
  ssh -o ConnectTimeout=5 "root@$node_ip" "
    for vmid in \$(qm list | awk 'NR>1 {print \$1}'); do
      vm_name=\$(qm config \"\$vmid\" | awk '/^name:/ {print \$2}')
      [ -n \"\$vm_name\" ] || vm_name=\"VM-\$vmid\"
      vm_status=\$(qm status \"\$vmid\" | awk '{print \$2}')
      if [ \"\$vm_status\" = \"running\" ]; then
        echo \" Oprire VM \$vmid (\$vm_name) pe $node_ip...\"
        qm shutdown \"\$vmid\" --timeout 180 &
      fi
    done
  " 2>&1 | tee -a "$LOGFILE"
  # The pipe to tee hides ssh's exit status; read it from PIPESTATUS right
  # away so an unreachable node is reported instead of silently skipped.
  ssh_rc=${PIPESTATUS[0]}
  if (( ssh_rc != 0 )); then
    log "WARNING: Comanda ssh către $node_ip a eșuat (cod $ssh_rc); verifică manual VM-urile de pe acest nod"
  fi
done
# Fixed 180-second grace period for the guests to power off, shown as a
# single-line countdown that is redrawn once per second.
log "Așteptare 3 minute pentru shutdown VM-uri..."
echo -e "${YELLOW}Aștept 180 secunde pentru oprirea graceful a VM-urilor...${NC}"
remaining=180
while (( remaining >= 1 )); do
  # \r returns the cursor to the start of the line so the counter overwrites
  # the previous value instead of scrolling.
  echo -ne "\r${YELLOW}Rămas: $remaining secunde...${NC} "
  sleep 1
  remaining=$(( remaining - 1 ))
done
echo ""
# 3. Verify that the local VMs are stopped
log "Step 3: Verificare VM-uri oprite..."
# IDs of local guests whose third `qm list` column still reads "running".
running_vms=$(qm list | awk 'NR>1 && $3=="running" {print $1}')
if [[ -n "$running_vms" ]]; then
  log "WARNING: VM-uri încă pornite pe pvemini: $running_vms"
  echo -e "${YELLOW}WARNING: Unele VM-uri încă rulează. Oprire forțată în 30 secunde...${NC}"
  sleep 30
  # Word splitting is intentional here: running_vms holds one ID per line.
  for vmid in $running_vms; do
    # The list was taken before the 30-second grace period, so look again:
    # a guest that finished shutting down meanwhile needs no forced stop.
    # Only an explicit "stopped" skips it; any other answer (including a
    # failed status query) still leads to the forced stop.
    if [[ "$(qm status "$vmid" | awk '{print $2}')" == "stopped" ]]; then
      continue
    fi
    log " Force stop VM $vmid"
    qm stop "$vmid"
  done
fi
# 4. Shut down the local LXC containers (if any)
log "Step 4: Oprire containere LXC (dacă există)..."

# Collect the running container IDs in the current shell (not in a pipeline
# subshell) so the background jobs started below can be tracked afterwards.
mapfile -t running_cts < <(pct list 2>/dev/null | awk 'NR>1 && $2=="running" {print $1}')
ct_pids=()
for ctid in "${running_cts[@]}"; do
  log " Oprire container $ctid"
  pct shutdown "$ctid" --timeout 60 &
  ct_pids+=("$!")
done

# Wait until every shutdown job has exited, but never longer than 70 seconds
# (the 60-second pct timeout plus margin). With no running containers the
# loop ends immediately instead of sleeping for the full period.
ct_wait=70
while (( ct_wait > 0 )); do
  ct_pending=0
  for ct_pid in "${ct_pids[@]}"; do
    if kill -0 "$ct_pid" 2>/dev/null; then
      ct_pending=1
      break
    fi
  done
  (( ct_pending )) || break
  sleep 1
  ct_wait=$(( ct_wait - 1 ))
done
# 5. Shut down the secondary nodes
log "Step 5: Oprire noduri secundare (pve1, pveelite)..."
for node_ip in "${NODES[@]}"; do
  log " Shutdown nod $node_ip în 2 minute..."
  echo -e "${RED}Shutdown nod $node_ip...${NC}"
  # Runs in the background so a slow or hung SSH session cannot stall the
  # sequence. The group also inspects ssh's own exit status (hidden by the
  # pipe to tee) so a node that did not accept the shutdown request is
  # reported rather than assumed to be going down.
  {
    ssh -o ConnectTimeout=5 "root@$node_ip" "shutdown -h +2 'UPS Maintenance - Battery Replacement'" 2>&1 | tee -a "$LOGFILE"
    ssh_rc=${PIPESTATUS[0]}
    if (( ssh_rc != 0 )); then
      log "WARNING: Comanda de shutdown pentru nodul $node_ip a eșuat (cod ssh $ssh_rc) - oprește nodul manual"
    fi
  } &
done

# The remote shutdowns are scheduled two minutes ahead; wait 150 seconds so
# they have fired before the local node schedules its own shutdown.
log "Așteptare 150 secunde pentru shutdown noduri secundare..."
echo -e "${YELLOW}Aștept 150 secunde pentru oprirea nodurilor secundare...${NC}"
for i in {150..1}; do
  echo -ne "\r${YELLOW}Rămas: $i secunde până la shutdown pvemini...${NC} "
  sleep 1
done
echo ""
# 6. Shut down the local node (pvemini) - LAST
log "Step 6: Oprire pvemini (nod PRIMARY - ULTIMUL)..."
log "========================================"
log "MAINTENANCE SHUTDOWN - COMPLETE"
log "Nodurile secundare ar trebui să fie oprite."
log "pvemini se va opri în 2 minute."
log "========================================"
echo ""
echo -e "${GREEN}========================================${NC}"
echo -e "${GREEN}SHUTDOWN ORCHESTRAT FINALIZAT${NC}"
echo -e "${GREEN}========================================${NC}"
echo ""
# Checklist of the manual steps the operator performs after power-off.
echo -e "${YELLOW}URMĂTORII PAȘI:${NC}"
echo "1. Așteaptă 2 minute pentru oprirea completă a pvemini"
echo "2. Verifică că toate LED-urile nodurilor sunt stinse"
echo "3. Deconectează UPS de la priză"
echo "4. Înlocuiește bateria UPS"
echo "5. Reconectează UPS la priză"
echo "6. Pornește nodurile (apasă buton power sau WOL)"
echo "7. Verifică cluster cu: pvecm status"
echo ""
echo -e "${RED}pvemini se va opri în 2 minute!${NC}"
echo ""

# Schedule the local power-off two minutes ahead. If the request is rejected
# (for example because the script is not running as root), report it and
# exit non-zero instead of claiming success while the node stays up.
if ! shutdown -h +2 "UPS Maintenance - Battery Replacement - Primary Node"; then
  log "ERROR: Programarea opririi pvemini a eșuat - oprește nodul manual"
  exit 1
fi
exit 0