#!/bin/bash
#
# Kingston SNV3S2000G (nvme2) thermal + health monitor
# Runs every 30 min from cron, logs CSV trend, alerts on threshold transitions
# Schedule (cron on pvemini): */30 * * * * /opt/scripts/kingston-thermal-monitor.sh
#
# Background: incident 2026-04-30 — Kingston backup SSD hung due to thermal stress
# (Sensor 2 = 74°C idle). This script tracks recovery after the temporal spacing +
# even/odd alternation mitigations deployed 2026-05-01.
#
# Environment:
#   ALERT_RECIPIENT  alert e-mail address (optional, has a built-in default)
#
# Files:
#   $LOG_FILE    one CSV row appended per run (header written when empty/missing)
#   $STATE_FILE  state of the previous run: ok | warning | critical
#
# Exits 1 (after logging to syslog) when smartctl cannot read the device or its
# JSON output cannot be parsed.
#

set -euo pipefail
export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"

readonly DEVICE="/dev/nvme2"
readonly DEVICE_LABEL="Kingston SNV3S2000G (backup SSD)"
readonly LOG_FILE="/var/log/kingston-thermal.csv"
readonly STATE_FILE="/var/run/kingston-thermal-state"
readonly ALERT_RECIPIENT="${ALERT_RECIPIENT:-mmarius28@gmail.com}"
readonly ALERT_FROM="ups@romfast.ro"
readonly LOG_TAG="kingston-thermal"

# Thresholds
readonly WARN_TEMP=70   # Sensor 2 warning
readonly CRIT_TEMP=75   # Sensor 2 critical (kernel warning at 75 per SMART spec)
readonly WARN_SPARE=50  # below = warning
readonly WARN_USED=80   # above = warning

readonly CSV_HEADER="timestamp,critical_warning,temp_composite,sensor1,sensor2,sensor3,available_spare,percentage_used,media_errors,unsafe_shutdowns,warning_temp_time,critical_comp_time,data_units_written,power_on_hours"

#######################################
# Write one message to syslog under the script's tag.
# Arguments: message words
#######################################
log_msg() {
  logger -t "$LOG_TAG" "$*"
}

#######################################
# Succeed when $1 is a plain decimal integer (optional leading minus, no
# leading zeros) and is therefore safe to use inside (( )).
#######################################
is_int() {
  [[ "$1" =~ ^-?(0|[1-9][0-9]*)$ ]]
}

#######################################
# Map a state name to its severity rank on stdout.
# Anything that is not "warning" or "critical" (including an empty or
# corrupted state file value) ranks as 0, so a worsening transition is
# never missed because of an unparseable previous state.
#######################################
state_rank() {
  case "$1" in
    warning) echo 1 ;;
    critical) echo 2 ;;
    *) echo 0 ;;
  esac
}

#######################################
# Print the alert e-mail body to stdout.
# Globals (read): DEVICE, DEVICE_LABEL, LOG_FILE, PREV_STATE, STATE, REASON,
#                 TS and the SMART values extracted below.
# Arguments: $1 - host name, $2 - address for the ssh hint,
#            $3 - megabytes written (or "n/a")
#######################################
alert_body() {
  local host=$1 addr=$2 written_mb=$3
  printf '%s\n' \
    "Kingston SSD ($DEVICE_LABEL) on $host" \
    "State: $PREV_STATE -> $STATE" \
    "Reason: $REASON" \
    "Time: $TS" \
    "" \
    "=== SMART snapshot ===" \
    "Critical Warning: $CRIT_WARN" \
    "Temp composite: ${TEMP_COMP}°C" \
    "Temp Sensor 1: ${TEMP_S1}°C" \
    "Temp Sensor 2: ${TEMP_S2}°C <-- monitored" \
    "Temp Sensor 3: ${TEMP_S3}°C" \
    "Available Spare: ${SPARE}%" \
    "Percentage Used: ${USED}%" \
    "Media Errors: $MEDIA_ERR" \
    "Unsafe Shutdowns: $UNSAFE" \
    "Warning Temp Time: ${WARN_TIME} min" \
    "Critical Comp Time: ${CRIT_TIME} min" \
    "Power On Hours: $POH" \
    "Data Units Written: $DUW (= ${written_mb} MB written total)" \
    "" \
    "Last 5 CSV log entries:"
  tail -n 5 "$LOG_FILE"
  printf '%s\n' \
    "" \
    "Investigate: ssh root@${addr} 'smartctl -a $DEVICE'"
}

# Initialize log with header if missing (or present but empty)
if [[ ! -s "$LOG_FILE" ]]; then
  printf '%s\n' "$CSV_HEADER" > "$LOG_FILE"
fi

# Read SMART JSON.
# smartctl's exit status is a bitmask: bits 0-1 mean the command line did not
# parse or the device could not be opened, i.e. there is no data to work with.
# Higher bits report problems smartctl found on the disk; those must not stop
# the run, because that is exactly when an alert is needed.
smart_rc=0
SMART=$(smartctl -j -A "$DEVICE" 2>/dev/null) || smart_rc=$?
if (( smart_rc & 3 )) || [[ -z "$SMART" ]]; then
  log_msg "ERROR: smartctl failed for $DEVICE"
  exit 1
fi

# Extract all values with a single jq call, one value per output line.
# Missing temperature sensors fall back to "0"; other missing fields print
# as "null" and are caught by the validation further down.
if ! parsed=$(jq -rc '
      .nvme_smart_health_information_log
      | .critical_warning,
        .temperature,
        (.temperature_sensors[0] // "0"),
        (.temperature_sensors[1] // "0"),
        (.temperature_sensors[2] // "0"),
        .available_spare,
        .percentage_used,
        .media_errors,
        .unsafe_shutdowns,
        .warning_temp_time,
        .critical_comp_time,
        .data_units_written,
        .power_on_hours
    ' <<<"$SMART" 2>/dev/null); then
  log_msg "ERROR: cannot parse smartctl JSON for $DEVICE"
  exit 1
fi

mapfile -t fields <<<"$parsed"
if (( ${#fields[@]} != 13 )); then
  log_msg "ERROR: unexpected SMART field count (${#fields[@]}) for $DEVICE"
  exit 1
fi

CRIT_WARN=${fields[0]}
TEMP_COMP=${fields[1]}
TEMP_S1=${fields[2]}
TEMP_S2=${fields[3]}
TEMP_S3=${fields[4]}
SPARE=${fields[5]}
USED=${fields[6]}
MEDIA_ERR=${fields[7]}
UNSAFE=${fields[8]}
WARN_TIME=${fields[9]}
CRIT_TIME=${fields[10]}
DUW=${fields[11]}
POH=${fields[12]}
TS=$(date -Iseconds)

# Append to CSV log
printf '%s\n' \
  "$TS,$CRIT_WARN,$TEMP_COMP,$TEMP_S1,$TEMP_S2,$TEMP_S3,$SPARE,$USED,$MEDIA_ERR,$UNSAFE,$WARN_TIME,$CRIT_TIME,$DUW,$POH" \
  >> "$LOG_FILE"

# The threshold checks below are integer comparisons; make sure every field
# they use really is an integer instead of letting a failed comparison pass
# silently as "threshold not exceeded".
bad_field=""
for name in CRIT_WARN MEDIA_ERR TEMP_S2 SPARE USED; do
  if ! is_int "${!name}"; then
    bad_field="${name}=${!name}"
    break
  fi
done

# Determine current state (first matching rule wins)
STATE="ok"
REASON=""
if [[ -n "$bad_field" ]]; then
  STATE="critical"
  REASON="SMART health data missing or non-numeric ($bad_field)"
elif (( CRIT_WARN != 0 )); then
  STATE="critical"
  REASON="Critical Warning flag = $CRIT_WARN (NVMe firmware reports issue)"
elif (( MEDIA_ERR > 0 )); then
  STATE="critical"
  REASON="Media errors = $MEDIA_ERR (data integrity loss)"
elif (( TEMP_S2 >= CRIT_TEMP )); then
  STATE="critical"
  REASON="Sensor 2 = ${TEMP_S2}°C >= ${CRIT_TEMP}°C critical threshold"
elif (( TEMP_S2 >= WARN_TEMP )); then
  STATE="warning"
  REASON="Sensor 2 = ${TEMP_S2}°C >= ${WARN_TEMP}°C warning threshold"
elif (( SPARE < WARN_SPARE )); then
  STATE="warning"
  REASON="Available Spare = ${SPARE}% (below ${WARN_SPARE}%)"
elif (( USED >= WARN_USED )); then
  STATE="warning"
  REASON="Percentage Used = ${USED}% (above ${WARN_USED}%)"
fi

# State management — alert only on transitions to a worse state.
# A missing, unreadable or unrecognised state file counts as "ok".
PREV_STATE="ok"
if [[ -r "$STATE_FILE" ]]; then
  PREV_STATE=$(<"$STATE_FILE") || PREV_STATE="ok"
fi
case "$PREV_STATE" in
  ok | warning | critical) ;;
  *) PREV_STATE="ok" ;;
esac

PREV_RANK=$(state_rank "$PREV_STATE")
CURR_RANK=$(state_rank "$STATE")

if (( CURR_RANK > PREV_RANK )); then
  # Worsening transition — send alert
  host=$(hostname)
  addr=$(hostname -I 2>/dev/null | awk '{print $1}') || addr=""
  addr=${addr:-$host}

  # One NVMe data unit is 1000 * 512 bytes, i.e. 0.512 MB.
  if is_int "$DUW"; then
    written_mb=$(( DUW * 512 / 1000 ))
  else
    written_mb="n/a"
  fi

  SUBJECT="[ALERT-$STATE] $DEVICE_LABEL on $host: $REASON"
  # Mail delivery stays best-effort, but a failure is recorded in syslog
  # instead of being dropped silently.
  if ! alert_body "$host" "$addr" "$written_mb" \
    | mail -r "$ALERT_FROM" -s "$SUBJECT" "$ALERT_RECIPIENT" 2>/dev/null; then
    log_msg "ERROR: failed to send alert mail to $ALERT_RECIPIENT"
  fi
  log_msg "ALERT $PREV_STATE -> $STATE: $REASON"
elif (( CURR_RANK < PREV_RANK )); then
  # Recovery transition — log only, no alert spam
  log_msg "RECOVERY $PREV_STATE -> $STATE (Sensor2=${TEMP_S2}°C, used=${USED}%)"
fi

# Persist current state
printf '%s\n' "$STATE" > "$STATE_FILE"

# Routine status to journal (visible in journalctl -t kingston-thermal).
# The leading "OK" is a fixed message prefix; the evaluated state is carried
# in the state= field.
log_msg "OK device=$DEVICE state=$STATE temp_comp=${TEMP_COMP}C s1=${TEMP_S1}C s2=${TEMP_S2}C s3=${TEMP_S3}C used=${USED}% spare=${SPARE}% media_err=$MEDIA_ERR"