docs(cluster): incident pvemini backup SSD hang + thermal monitoring

Documenteaza incidentul Kingston SNV3S2000G hang la 2026-04-30 (Sensor 2 74°C → emergency mode + restart loop) si masurile aplicate: distantare temporala backup-uri par/impar, mutare CT 101+110 pe pve1 backup-ssd, nofail in fstab, hardware watchdog iTCO_wdt, monitoring CSV la 30 min. Adauga scripturile /opt/scripts/kingston-thermal-{monitor,report}.sh pentru tracking trend si alertare la depasirea pragurilor termale. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 01:04:02 +03:00
parent 2109bc7f5e
commit 3ded5d3f2f
3 changed files with 756 additions and 0 deletions
--- a/proxmox/cluster/scripts/kingston-thermal-monitor.sh
+++ b/proxmox/cluster/scripts/kingston-thermal-monitor.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+#
+# kingston-thermal-monitor.sh: thermal and health monitor for the
+# Kingston SNV3S2000G backup SSD (nvme2).
+#
+# Intended to run from cron every 30 minutes; the entry on pvemini is:
+#   */30 * * * * /opt/scripts/kingston-thermal-monitor.sh
+# Each run appends one row of SMART values to a CSV trend log and alerts
+# on threshold transitions.
+#
+# Background: on 2026-04-30 this backup SSD hung under thermal stress
+# (Sensor 2 = 74°C while idle). The script tracks recovery after the backup
+# jobs were spaced out in time and switched to odd/even alternation
+# (deployed 2026-05-01).
+#
+# Stop on command failure, on use of an unset variable, and on a failure
+# anywhere inside a pipeline.
+set -o errexit -o nounset -o pipefail
+# Fixed search path, including the sbin directories.
+PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
+export PATH
+
+DEVICE="/dev/nvme2"
+DEVICE_LABEL="Kingston SNV3S2000G (backup SSD)"
+LOG_FILE="/var/log/kingston-thermal.csv"
+STATE_FILE="/var/run/kingston-thermal-state"
+ALERT_RECIPIENT="${ALERT_RECIPIENT:-mmarius28@gmail.com}"
+ALERT_FROM="ups@romfast.ro"
+
+# Thresholds
+WARN_TEMP=70   # Sensor 2 warning
+CRIT_TEMP=75   # Sensor 2 critical (kernel warning at 75 per SMART spec)
+WARN_SPARE=50  # below = warning
+WARN_USED=80   # above = warning
+
+# Initialize log with header if missing
+if [ ! -f "$LOG_FILE" ]; then
+    echo "timestamp,critical_warning,temp_composite,sensor1,sensor2,sensor3,available_spare,percentage_used,media_errors,unsafe_shutdowns,warning_temp_time,critical_comp_time,data_units_written,power_on_hours" > "$LOG_FILE"
+fi
+
+# Read SMART JSON
+SMART=$(smartctl -j -A "$DEVICE" 2>/dev/null) || {
+    logger -t kingston-thermal "ERROR: smartctl failed for $DEVICE"
+    exit 1
+}
+
+# Pull the individual counters out of the NVMe health log.
+# smart_field prints one value from the captured JSON:
+#   $1      jq expression relative to .nvme_smart_health_information_log
+#   stdout  the raw (jq -r, unquoted) result
+smart_field() {
+    echo "$SMART" | jq -r ".nvme_smart_health_information_log.$1"
+}
+
+CRIT_WARN=$(smart_field 'critical_warning')
+TEMP_COMP=$(smart_field 'temperature')
+# A sensor slot that is absent from the array is recorded as "0".
+TEMP_S1=$(smart_field 'temperature_sensors[0] // "0"')
+TEMP_S2=$(smart_field 'temperature_sensors[1] // "0"')
+TEMP_S3=$(smart_field 'temperature_sensors[2] // "0"')
+SPARE=$(smart_field 'available_spare')
+USED=$(smart_field 'percentage_used')
+MEDIA_ERR=$(smart_field 'media_errors')
+UNSAFE=$(smart_field 'unsafe_shutdowns')
+WARN_TIME=$(smart_field 'warning_temp_time')
+CRIT_TIME=$(smart_field 'critical_comp_time')
+DUW=$(smart_field 'data_units_written')
+POH=$(smart_field 'power_on_hours')
+
+TS=$(date -Iseconds)
+
+# Append to CSV log
+echo "$TS,$CRIT_WARN,$TEMP_COMP,$TEMP_S1,$TEMP_S2,$TEMP_S3,$SPARE,$USED,$MEDIA_ERR,$UNSAFE,$WARN_TIME,$CRIT_TIME,$DUW,$POH" >> "$LOG_FILE"
+
+# Classify the current sample. The checks run in a fixed order, critical
+# conditions first, and the first one that matches sets both STATE and
+# REASON; when none matches the state stays "ok" with an empty reason.
+STATE="ok"
+REASON=""
+classify_health() {
+    if [ "$CRIT_WARN" != "0" ]; then
+        STATE="critical"
+        REASON="Critical Warning flag = $CRIT_WARN (NVMe firmware reports issue)"
+        return
+    fi
+    if [ "$MEDIA_ERR" -gt 0 ]; then
+        STATE="critical"
+        REASON="Media errors = $MEDIA_ERR (data integrity loss)"
+        return
+    fi
+    if [ "$TEMP_S2" -ge "$CRIT_TEMP" ]; then
+        STATE="critical"
+        REASON="Sensor 2 = ${TEMP_S2}°C >= ${CRIT_TEMP}°C critical threshold"
+        return
+    fi
+    if [ "$TEMP_S2" -ge "$WARN_TEMP" ]; then
+        STATE="warning"
+        REASON="Sensor 2 = ${TEMP_S2}°C >= ${WARN_TEMP}°C warning threshold"
+        return
+    fi
+    if [ "$SPARE" -lt "$WARN_SPARE" ]; then
+        STATE="warning"
+        REASON="Available Spare = ${SPARE}% (below ${WARN_SPARE}%)"
+        return
+    fi
+    if [ "$USED" -ge "$WARN_USED" ]; then
+        STATE="warning"
+        REASON="Percentage Used = ${USED}% (above ${WARN_USED}%)"
+    fi
+}
+classify_health
+
+# State management — alert only on transitions to a worse state
+PREV_STATE=$(cat "$STATE_FILE" 2>/dev/null || echo "ok")
+
+# Severity ranking for transition detection
+state_rank() {
+    case "$1" in
+        ok)       echo 0 ;;
+        warning)  echo 1 ;;
+        critical) echo 2 ;;
+    esac
+}
+PREV_RANK=$(state_rank "$PREV_STATE")
+CURR_RANK=$(state_rank "$STATE")
+
+if [ "$CURR_RANK" -gt "$PREV_RANK" ]; then
+    # Worsening transition — send alert
+    SUBJECT="[ALERT-$STATE] $DEVICE_LABEL on $(hostname): $REASON"
+    {
+        echo "Kingston SSD ($DEVICE_LABEL) on $(hostname)"
+        echo "State: $PREV_STATE -> $STATE"
+        echo "Reason: $REASON"
+        echo "Time: $TS"
+        echo
+        echo "=== SMART snapshot ==="
+        echo "Critical Warning:   $CRIT_WARN"
+        echo "Temp composite:     ${TEMP_COMP}°C"
+        echo "Temp Sensor 1:      ${TEMP_S1}°C"
+        echo "Temp Sensor 2:      ${TEMP_S2}°C  <-- monitored"
+        echo "Temp Sensor 3:      ${TEMP_S3}°C"
+        echo "Available Spare:    ${SPARE}%"
+        echo "Percentage Used:    ${USED}%"
+        echo "Media Errors:       $MEDIA_ERR"
+        echo "Unsafe Shutdowns:   $UNSAFE"
+        echo "Warning Temp Time:  ${WARN_TIME} min"
+        echo "Critical Comp Time: ${CRIT_TIME} min"
+        echo "Power On Hours:     $POH"
+        echo "Data Units Written: $DUW (= $((DUW / 2)) MB written total)"
+        echo
+        echo "Last 5 CSV log entries:"
+        tail -5 "$LOG_FILE"
+        echo
+        echo "Investigate: ssh root@$(hostname -I | awk '{print $1}') 'smartctl -a $DEVICE'"
+    } | mail -r "$ALERT_FROM" -s "$SUBJECT" "$ALERT_RECIPIENT" 2>/dev/null || true
+    logger -t kingston-thermal "ALERT $PREV_STATE -> $STATE: $REASON"
+elif [ "$CURR_RANK" -lt "$PREV_RANK" ]; then
+    # Recovery transition — log only, no alert spam
+    logger -t kingston-thermal "RECOVERY $PREV_STATE -> $STATE (Sensor2=${TEMP_S2}°C, used=${USED}%)"
+fi
+
+# Persist the current state; the next run reads it back to detect a transition
+echo "$STATE" > "$STATE_FILE"
+
+# Routine status to journal (visible in journalctl -t kingston-thermal).
+# The line starts with the state in upper case (OK / WARNING / CRITICAL)
+# rather than a fixed "OK", so filtering the journal on that prefix never
+# shows a degraded drive as healthy. For the ok state the line is unchanged.
+STATE_TAG=$(printf '%s' "$STATE" | tr '[:lower:]' '[:upper:]')
+logger -t kingston-thermal "$STATE_TAG device=$DEVICE state=$STATE temp_comp=${TEMP_COMP}C s1=${TEMP_S1}C s2=${TEMP_S2}C s3=${TEMP_S3}C used=${USED}% spare=${SPARE}% media_err=$MEDIA_ERR"