#!/bin/bash # # Detects pveelite outage and emails the operator with copy-paste # failover instructions. Runs on pvemini every minute. # # Threshold: 5 consecutive minute failures before alerting (avoids # false positives from short network blips). State is held in # /var/run/pveelite-down-counter so a flap drops back to 0. # # Schedule (cron on pvemini): * * * * * /opt/scripts/pveelite-down-alert.sh set -euo pipefail export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" PVEELITE_IP="10.0.20.202" PVEMINI_IP="10.0.20.201" DATASET="rpool/oracle-backups" COUNTER_FILE="/var/run/pveelite-down-counter" ALERT_SENT_FILE="/var/run/pveelite-down-alerted" ALERT_THRESHOLD=5 ALERT_RECIPIENT="${ALERT_RECIPIENT:-root}" if ping -c 1 -W 2 "$PVEELITE_IP" >/dev/null 2>&1; then # Reset counter on success and clear "alerted" flag so a future outage re-fires. rm -f "$COUNTER_FILE" "$ALERT_SENT_FILE" exit 0 fi # Failure tick COUNT=$(( $(cat "$COUNTER_FILE" 2>/dev/null || echo 0) + 1 )) echo "$COUNT" >"$COUNTER_FILE" [ "$COUNT" -lt "$ALERT_THRESHOLD" ] && exit 0 [ -f "$ALERT_SENT_FILE" ] && exit 0 # already alerted this outage # Gather diagnostics for the email body LAST_REPL=$(zfs list -t snapshot -o name,creation -s creation 2>/dev/null \ | awk -v p="$DATASET@repl_" '$1 ~ p {snap=$1; ts=$2 " " $3 " " $4 " " $5 " " $6} END {print snap " (" ts ")"}') LAST_VM109_REPL=$(zfs list -t snapshot -o name,creation -s creation 2>/dev/null \ | awk '/vm-109-disk-1@__replicate_109/ {snap=$1; ts=$2 " " $3 " " $4 " " $5 " " $6} END {print snap " (" ts ")"}') cat <