Files
ROMFASTSQL/proxmox/ups/scripts/ups-shutdown-cluster.sh
Marius 87b9709a0d Add complete UPS monitoring system with monthly battery testing
This commit adds a comprehensive UPS monitoring and management system for
the Proxmox cluster with automated shutdown orchestration and monthly
battery health testing.

Features:
- NUT (Network UPS Tools) configuration for INNO TECH USB UPS
- Automated cluster shutdown on power failure (3-minute grace period)
- Monthly automated battery testing with health evaluation
- Email notifications via PVE::Notify system
- WinNUT monitoring client for Windows VM 201

Components added:
- config/: NUT configuration files (ups.conf, upsd.conf, upsmon.conf, etc.)
- scripts/ups-shutdown-cluster.sh: Orchestrated cluster shutdown
- scripts/ups-monthly-test.sh: Monthly battery test with email reports
- scripts/upssched-cmd: Event handler for UPS state changes
- docs/: Complete installation and usage documentation

Key findings:
- UPS battery.charge reporting has 10-40 second delay after test start
- Test must monitor voltage drop (1.5-2V) and charge drop (9-27%)
- Battery health evaluation: EXCELLENT/GOOD/FAIR/POOR based on discharge rate
- Email notifications use Handlebars templates without Unicode emojis for compatibility

Configuration:
- UPS: INNO TECH (Voltronic protocol, vendor 0665:5161)
- Primary node: pvemini (10.0.20.201) with USB connection
- Monthly test: cron 0 0 1 * * /opt/scripts/ups-monthly-test.sh
- Shutdown timer: 180 seconds on battery before cluster shutdown

Documentation includes complete installation guides for NUT server,
WinNUT client, and troubleshooting procedures.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-06 21:39:46 +03:00

84 lines
2.9 KiB
Bash

#!/bin/bash
#
# Script de shutdown orchestrat pentru cluster Proxmox când UPS este pe baterie critică
# Autor: Generat automat
# Data: 2025-10-06
# Log file that receives every message written by this script.
LOGFILE="/var/log/ups-shutdown.log"
# Secondary nodes, halted before the local one: pve1, pve2
# (pvemini, the local node, is shut down last).
NODES=(
  10.0.20.200
  10.0.20.202
)
#######################################
# Print a timestamped message and append the same line to the log file.
# Globals:   LOGFILE (read) - path of the file the line is appended to
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout and in LOGFILE
#######################################
log_message() {
  # date is evaluated on every call so each entry carries its own time.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a -- "$LOGFILE"
}
# Opening banner: record the UPS status and battery charge as reported by
# upsc for device nutdev1. When upsc fails, the literal UNKNOWN is logged
# in place of the value.
log_message "========================================"
log_message "UPS SHUTDOWN ORCHESTRATION STARTED"
log_message "UPS Status: $(upsc nutdev1 ups.status 2>/dev/null || echo 'UNKNOWN')"
log_message "Battery Charge: $(upsc nutdev1 battery.charge 2>/dev/null || echo 'UNKNOWN')%"
log_message "========================================"

# Safety gate: go on only when the reported status contains OB or LB.
# Any other value (including an empty one when upsc fails) aborts the run
# with exit status 0.
UPS_STATUS=$(upsc nutdev1 ups.status 2>/dev/null)
case "$UPS_STATUS" in
  *OB* | *LB*)
    # Status carries OB and/or LB: fall through to the shutdown steps.
    ;;
  *)
    log_message "WARNING: UPS status is $UPS_STATUS - not critical. Aborting shutdown."
    exit 0
    ;;
esac
log_message "Step 1: Oprire VM-uri și containere pe toate nodurile..."
# Request a shutdown of every running VM: first on each secondary node
# (over SSH), then on the local node.
# NOTE(review): the step message mentions containers, but only `qm` (VMs) is
# invoked below; no container command is issued.
for node in "${NODES[@]}" localhost; do
  if [[ "$node" == "localhost" ]]; then
    NODE_NAME="pvemini (local)"
  else
    NODE_NAME=$node
  fi
  log_message " - Oprire VM-uri pe $NODE_NAME..."

  if [[ "$node" == "localhost" ]]; then
    # Local node: call qm directly. Word splitting of the ID list is
    # intentional (one VMID per word).
    for vmid in $(qm list | awk 'NR>1 {print $1}'); do
      vm_status=$(qm status "$vmid" | awk '{print $2}')
      if [[ "$vm_status" == "running" ]]; then
        log_message " * Oprire VM $vmid pe pvemini..."
        # Backgrounded so the shutdown requests are issued without waiting
        # for each guest in turn.
        qm shutdown "$vmid" --timeout 60 &
      fi
    done
  else
    # Remote node: run the same loop through SSH. Everything escaped with a
    # backslash is expanded on the remote side; $node is expanded locally.
    # BatchMode=yes makes ssh fail instead of waiting on a password or
    # host-key prompt, which nobody could answer in an unattended run.
    ssh -o BatchMode=yes -o ConnectTimeout=5 "root@$node" "
      for vmid in \$(qm list | awk 'NR>1 {print \$1}'); do
        vm_status=\$(qm status \"\$vmid\" | awk '{print \$2}')
        if [ \"\$vm_status\" = \"running\" ]; then
          echo \" * Oprire VM \$vmid pe $node...\"
          qm shutdown \"\$vmid\" --timeout 60 &
        fi
      done
    " 2>&1 | tee -a "$LOGFILE"
  fi
done
log_message "Step 2: Așteptare 90 secunde pentru oprirea VM-urilor..."
# Fixed pause that gives the guests time to act on the shutdown requests.
sleep 90

log_message "Step 3: Oprire noduri secundare (pve1, pve2)..."
# Schedule a halt (one minute from now) on every secondary node. Each SSH
# call runs in the background so an unreachable node cannot delay the rest.
# BatchMode=yes makes ssh fail instead of waiting on an interactive prompt.
for node in "${NODES[@]}"; do
  log_message " - Shutdown nod $node..."
  ssh -o BatchMode=yes -o ConnectTimeout=5 "root@$node" \
    "shutdown -h +1 'UPS on battery critical - shutting down'" 2>&1 \
    | tee -a "$LOGFILE" &
done

log_message "Step 4: Așteptare 30 secunde pentru shutdown noduri secundare..."
# Fixed pause before the local node schedules its own halt.
sleep 30
log_message "Step 5: Oprire nod local (pvemini - primary)..."
# Closing banner, one log entry per line.
for banner_line in \
  "========================================" \
  "UPS SHUTDOWN ORCHESTRATION COMPLETED" \
  "Local node will shutdown in 1 minute" \
  "========================================"; do
  log_message "$banner_line"
done

# The local node is halted last, with a one-minute delay.
shutdown -h +1 "UPS on battery critical - primary node shutting down"
exit 0