docs(cluster): incident pvemini backup SSD hang + thermal monitoring
Documenteaza incidentul Kingston SNV3S2000G hang la 2026-04-30 (Sensor 2
74°C → emergency mode + restart loop) si masurile aplicate: distantare
temporala backup-uri par/impar, mutare CT 101+110 pe pve1 backup-ssd,
nofail in fstab, hardware watchdog iTCO_wdt, monitoring CSV la 30 min.
Adauga scripturile /opt/scripts/kingston-thermal-{monitor,report}.sh
pentru tracking trend si alertare la depasirea pragurilor termale.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
128
proxmox/cluster/scripts/kingston-thermal-monitor.sh
Normal file
128
proxmox/cluster/scripts/kingston-thermal-monitor.sh
Normal file
@@ -0,0 +1,128 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Kingston SNV3S2000G (nvme2) thermal + health monitor
|
||||
# Runs every 30 min from cron, logs CSV trend, alerts on threshold transitions
|
||||
# Schedule (cron on pvemini): */30 * * * * /opt/scripts/kingston-thermal-monitor.sh
|
||||
#
|
||||
# Background: incident 2026-04-30 — Kingston backup SSD hung due to thermal stress
|
||||
# (Sensor 2 = 74°C idle). This script tracks recovery after the backup jobs were
# spaced out in time and alternated on an even/odd schedule (deployed 2026-05-01).
|
||||
#
|
||||
# Strict mode: stop on command failure, on unset variables and on a failing
# pipeline stage.
set -euo pipefail

# Fixed search path so the external tools used below resolve the same way
# regardless of the caller's environment.
PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
export PATH

# --- Monitored device and output locations ---
DEVICE="/dev/nvme2"
DEVICE_LABEL="Kingston SNV3S2000G (backup SSD)"
LOG_FILE="/var/log/kingston-thermal.csv"
STATE_FILE="/var/run/kingston-thermal-state"

# --- Alert mail settings (recipient may be overridden from the environment) ---
ALERT_RECIPIENT="${ALERT_RECIPIENT:-mmarius28@gmail.com}"
ALERT_FROM="ups@romfast.ro"

# --- Thresholds ---
WARN_TEMP=70    # Sensor 2, degrees C: at or above -> warning
CRIT_TEMP=75    # Sensor 2, degrees C: at or above -> critical
WARN_SPARE=50   # available spare, percent: below -> warning
WARN_USED=80    # percentage used: at or above -> warning

# Create the CSV log with its header row on first run; an existing file is
# left untouched.
[[ -f "$LOG_FILE" ]] || printf '%s\n' "timestamp,critical_warning,temp_composite,sensor1,sensor2,sensor3,available_spare,percentage_used,media_errors,unsafe_shutdowns,warning_temp_time,critical_comp_time,data_units_written,power_on_hours" > "$LOG_FILE"
|
||||
|
||||
# Read the SMART/health attributes of the device as JSON.
#
# smartctl's exit status is a bitmask (smartctl(8), EXIT STATUS): bits 0-2
# mean the invocation itself failed (bad command line, device could not be
# opened, a command to the device failed), while bits 3-7 report health
# findings. Only the former abort the run; a drive that reports trouble must
# still be logged and evaluated below, so those bits are ignored here.
SMART_RC=0
SMART=$(smartctl -j -A "$DEVICE" 2>/dev/null) || SMART_RC=$?
if (( SMART_RC & 7 )); then
  logger -t kingston-thermal "ERROR: smartctl failed for $DEVICE"
  exit 1
fi

# Print one value from .nvme_smart_health_information_log.
# $1 is a jq path suffix (e.g. '.temperature'). jq -r prints the raw value
# without quotes; a field that is absent from the JSON prints "null".
smart_field() {
  jq -r ".nvme_smart_health_information_log$1" <<<"$SMART"
}

CRIT_WARN=$(smart_field '.critical_warning')
TEMP_COMP=$(smart_field '.temperature')
# Individual sensors fall back to "0" when the array entry is missing.
TEMP_S1=$(smart_field '.temperature_sensors[0] // "0"')
TEMP_S2=$(smart_field '.temperature_sensors[1] // "0"')
TEMP_S3=$(smart_field '.temperature_sensors[2] // "0"')
SPARE=$(smart_field '.available_spare')
USED=$(smart_field '.percentage_used')
MEDIA_ERR=$(smart_field '.media_errors')
UNSAFE=$(smart_field '.unsafe_shutdowns')
WARN_TIME=$(smart_field '.warning_temp_time')
CRIT_TIME=$(smart_field '.critical_comp_time')
DUW=$(smart_field '.data_units_written')
POH=$(smart_field '.power_on_hours')

TS=$(date -Iseconds)

# Append one row to the CSV trend log; column order matches the header row.
printf '%s\n' "$TS,$CRIT_WARN,$TEMP_COMP,$TEMP_S1,$TEMP_S2,$TEMP_S3,$SPARE,$USED,$MEDIA_ERR,$UNSAFE,$WARN_TIME,$CRIT_TIME,$DUW,$POH" >> "$LOG_FILE"
|
||||
|
||||
# Determine the current state from the readings taken above.
# Checks run from most to least severe and the first match wins, so REASON
# names only the most severe finding.
STATE="ok"
REASON=""

# The numeric tests below need plain non-negative integers. A value such as
# "null" (field missing from the SMART JSON) would make `[ ... -gt ... ]`
# fail with an error that counts as "false", silently skipping that check.
# Find the first such field so it can be reported instead of ignored.
BAD_FIELD=""
for field in MEDIA_ERR TEMP_S2 SPARE USED; do
  if ! [[ "${!field}" =~ ^[0-9]+$ ]]; then
    BAD_FIELD="$field"
    break
  fi
done

if [ "$CRIT_WARN" != "0" ]; then
  STATE="critical"; REASON="Critical Warning flag = $CRIT_WARN (NVMe firmware reports issue)"
elif [ -n "$BAD_FIELD" ]; then
  # Unreadable health data is treated like an unreadable critical-warning
  # flag: escalate rather than assume the drive is healthy.
  STATE="critical"; REASON="SMART field $BAD_FIELD is not numeric (value '${!BAD_FIELD}')"
elif [ "$MEDIA_ERR" -gt 0 ]; then
  STATE="critical"; REASON="Media errors = $MEDIA_ERR (data integrity loss)"
elif [ "$TEMP_S2" -ge "$CRIT_TEMP" ]; then
  STATE="critical"; REASON="Sensor 2 = ${TEMP_S2}°C >= ${CRIT_TEMP}°C critical threshold"
elif [ "$TEMP_S2" -ge "$WARN_TEMP" ]; then
  STATE="warning"; REASON="Sensor 2 = ${TEMP_S2}°C >= ${WARN_TEMP}°C warning threshold"
elif [ "$SPARE" -lt "$WARN_SPARE" ]; then
  STATE="warning"; REASON="Available Spare = ${SPARE}% (below ${WARN_SPARE}%)"
elif [ "$USED" -ge "$WARN_USED" ]; then
  STATE="warning"; REASON="Percentage Used = ${USED}% (above ${WARN_USED}%)"
fi

# State management — alert only on transitions to a worse state.
# A missing state file means "ok"; so does an empty one, which would
# otherwise yield an empty previous state.
PREV_STATE=$(cat "$STATE_FILE" 2>/dev/null || echo "ok")
PREV_STATE="${PREV_STATE:-ok}"
|
||||
|
||||
#######################################
# Map a state name to a numeric severity so states can be compared.
# Arguments: $1 - state name ("ok", "warning" or "critical")
# Outputs:   0, 1 or 2 on stdout. Any other value (including an empty
#            string, e.g. from a truncated state file) ranks as 0, so the
#            caller always receives an integer and a bad previous state
#            cannot suppress an alert.
#######################################
state_rank() {
  case "${1:-}" in
    ok)       echo 0 ;;
    warning)  echo 1 ;;
    critical) echo 2 ;;
    *)        echo 0 ;;
  esac
}
|
||||
PREV_RANK=$(state_rank "$PREV_STATE")
CURR_RANK=$(state_rank "$STATE")
# If the ranking printed nothing for an unrecognised state, fall back to 0 so
# the integer comparisons below cannot fail with "integer expression expected".
PREV_RANK="${PREV_RANK:-0}"
CURR_RANK="${CURR_RANK:-0}"

if [ "$CURR_RANK" -gt "$PREV_RANK" ]; then
  # Worsening transition — send alert
  SUBJECT="[ALERT-$STATE] $DEVICE_LABEL on $(hostname): $REASON"

  # Halve the data-unit counter only when it is a plain integer; a value such
  # as "null" inside $(( )) would abort the mail body part-way under `set -u`.
  if [[ "$DUW" =~ ^[0-9]+$ ]]; then
    DUW_MB=$(( 10#$DUW / 2 ))
  else
    DUW_MB="?"
  fi

  # Mail delivery stays best-effort (a failure must not abort the run), but a
  # failure is recorded in the journal instead of being dropped silently.
  if ! {
    echo "Kingston SSD ($DEVICE_LABEL) on $(hostname)"
    echo "State: $PREV_STATE -> $STATE"
    echo "Reason: $REASON"
    echo "Time: $TS"
    echo
    echo "=== SMART snapshot ==="
    echo "Critical Warning: $CRIT_WARN"
    echo "Temp composite: ${TEMP_COMP}°C"
    echo "Temp Sensor 1: ${TEMP_S1}°C"
    echo "Temp Sensor 2: ${TEMP_S2}°C <-- monitored"
    echo "Temp Sensor 3: ${TEMP_S3}°C"
    echo "Available Spare: ${SPARE}%"
    echo "Percentage Used: ${USED}%"
    echo "Media Errors: $MEDIA_ERR"
    echo "Unsafe Shutdowns: $UNSAFE"
    echo "Warning Temp Time: ${WARN_TIME} min"
    echo "Critical Comp Time: ${CRIT_TIME} min"
    echo "Power On Hours: $POH"
    echo "Data Units Written: $DUW (= ${DUW_MB} MB written total)"
    echo
    echo "Last 5 CSV log entries:"
    tail -5 "$LOG_FILE"
    echo
    echo "Investigate: ssh root@$(hostname -I | awk '{print $1}') 'smartctl -a $DEVICE'"
  } | mail -r "$ALERT_FROM" -s "$SUBJECT" "$ALERT_RECIPIENT" 2>/dev/null; then
    logger -t kingston-thermal "ERROR: could not send alert mail to $ALERT_RECIPIENT"
  fi
  logger -t kingston-thermal "ALERT $PREV_STATE -> $STATE: $REASON"
elif [ "$CURR_RANK" -lt "$PREV_RANK" ]; then
  # Recovery transition — log only, no alert spam
  logger -t kingston-thermal "RECOVERY $PREV_STATE -> $STATE (Sensor2=${TEMP_S2}°C, used=${USED}%)"
fi

# Persist current state for the next run's transition check
echo "$STATE" > "$STATE_FILE"

# Routine status to journal (visible in journalctl -t kingston-thermal)
logger -t kingston-thermal "OK device=$DEVICE state=$STATE temp_comp=${TEMP_COMP}C s1=${TEMP_S1}C s2=${TEMP_S2}C s3=${TEMP_S3}C used=${USED}% spare=${SPARE}% media_err=$MEDIA_ERR"
|
||||
222
proxmox/cluster/scripts/kingston-thermal-report.sh
Normal file
222
proxmox/cluster/scripts/kingston-thermal-report.sh
Normal file
@@ -0,0 +1,222 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Kingston SNV3S2000G — daily thermal trend report
|
||||
# Reads /var/log/kingston-thermal.csv (populated every 30 min by kingston-thermal-monitor.sh)
|
||||
# Produces human-readable summary: current state, 24h min/max/avg, day-over-day delta, alert events.
|
||||
#
|
||||
# Usage:
|
||||
# /opt/scripts/kingston-thermal-report.sh # print the report to stdout
# /opt/scripts/kingston-thermal-report.sh --mail # send the report by mail to ALERT_RECIPIENT (default: mmarius28@gmail.com)
|
||||
#
|
||||
# Schedule (suggested cron): 0 8 * * * /opt/scripts/kingston-thermal-report.sh --mail
|
||||
#
|
||||
# Strict mode: stop on command failure, on unset variables and on a failing
# pipeline stage.
set -euo pipefail

# Fixed search path so external tools resolve the same way regardless of the
# caller's environment.
PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
export PATH

# Files shared with the monitor script, and mail settings.
LOG_FILE="/var/log/kingston-thermal.csv"
STATE_FILE="/var/run/kingston-thermal-state"
ALERT_RECIPIENT="${ALERT_RECIPIENT:-mmarius28@gmail.com}"
ALERT_FROM="ups@romfast.ro"

# Guard: the CSV log must exist.
[[ -f "$LOG_FILE" ]] || {
  echo "Eroare: $LOG_FILE nu exista. Scriptul de monitoring nu a rulat inca?"
  exit 1
}

# Guard: the log must hold at least one data row after the header line.
ROW_COUNT=$(awk 'NR > 1 { rows++ } END { print rows + 0 }' "$LOG_FILE")
if [[ "$ROW_COUNT" -eq 0 ]]; then
  echo "Eroare: Fara date in $LOG_FILE (doar header)."
  exit 1
fi
|
||||
|
||||
#######################################
# Print the CSV data rows whose timestamp is within the last N hours.
# Globals:   LOG_FILE (read)
# Arguments: $1 - window length in hours
# Outputs:   matching rows on stdout; the header line is never printed
# Notes:     column 1 is compared against the cutoff as text, both being in
#            `date -Iseconds` format.
#######################################
filter_last_hours() {
  local window_hours="$1"
  local since
  since=$(date -d "$window_hours hours ago" -Iseconds)
  awk -F, -v cutoff="$since" '
    NR == 1 { next }
    $1 >= cutoff { print }
  ' "$LOG_FILE"
}
|
||||
|
||||
#######################################
# Compute min, max, average and sample count for one CSV column.
# Arguments: $1 - 1-based column number
# Inputs:    comma-separated rows on stdin (no header line expected)
# Outputs:   "min,max,avg,count" on stdout, with min/max printed as integers
#            and avg with one decimal; "N/A,N/A,N/A,0" when no numeric
#            sample was found
# Notes:     cells that are not plain numbers (empty, "null", ...) are
#            skipped. min/max start from the first sample rather than from
#            fixed sentinels, so values outside -999..999 are handled too.
#######################################
stats_col() {
  local col="$1"
  awk -F, -v col="$col" '
    $col ~ /^-?[0-9]+(\.[0-9]+)?$/ {
      v = $col + 0
      if (n == 0 || v < min) min = v
      if (n == 0 || v > max) max = v
      sum += v
      n++
    }
    END {
      if (n == 0) print "N/A,N/A,N/A,0"
      else printf "%d,%d,%.1f,%d\n", min, max, sum / n, n
    }
  '
}
|
||||
|
||||
#######################################
# Estimate the hours a CSV column spent at or above a threshold, as
# (number of matching rows) * (sampling interval).
# Globals:   LOG_FILE (read; its first line is skipped as the header)
# Arguments: $1 - 1-based column number
#            $2 - threshold; rows with value >= threshold are counted
#            $3 - sampling interval in minutes (optional, default 30)
# Outputs:   hours with one decimal on stdout
# Notes:     cells that are not plain numbers are ignored; otherwise a cell
#            such as "null" would be compared as text and could count as
#            being above the threshold.
#######################################
time_above_threshold() {
  local col="$1"
  local threshold="$2"
  local interval_min="${3:-30}"
  awk -F, -v col="$col" -v t="$threshold" -v step="$interval_min" '
    NR > 1 && $col ~ /^-?[0-9]+(\.[0-9]+)?$/ && ($col + 0) >= (t + 0) { n++ }
    END { printf "%.1f\n", (n * step / 60) }
  ' "$LOG_FILE"
}
|
||||
|
||||
# Latest reading: split the last CSV row into its columns in one pass.
# Column order, as read by position: 1 timestamp, 2 critical_warning,
# 3 temp_composite, 5 sensor2, 7 available_spare, 8 percentage_used,
# 9 media_errors, 11 warning_temp_time, 12 critical_comp_time.
# Columns that the report does not use are read into "_".
LATEST=$(tail -1 "$LOG_FILE")
IFS=, read -r LATEST_TS LATEST_CRITWARN LATEST_COMP _ LATEST_S2 _ \
  LATEST_SPARE LATEST_USED LATEST_MEDIA _ LATEST_WARNTIME LATEST_CRITTIME _ \
  <<<"$LATEST"

# State last persisted by the monitor script; "unknown" if it cannot be read.
CURRENT_STATE=$(cat "$STATE_FILE" 2>/dev/null || echo "unknown")
|
||||
|
||||
# --- Sensor 2 statistics for the last 24 hours ---
LAST24=$(filter_last_hours 24)
if [[ -z "$LAST24" ]]; then
  S24_COUNT=0
else
  # Count rows; `|| true` keeps a zero count from aborting under `set -e`.
  S24_COUNT=$(grep -c '^' <<<"$LAST24" || true)
fi

S24_S2_MIN="N/A"; S24_S2_MAX="N/A"; S24_S2_AVG="N/A"
if [[ "$S24_COUNT" -gt 0 ]]; then
  # stats_col prints "min,max,avg,count" for the given column (5 = sensor 2).
  S24_S2_STATS=$(stats_col 5 <<<"$LAST24")
  IFS=, read -r S24_S2_MIN S24_S2_MAX S24_S2_AVG _ <<<"$S24_S2_STATS"
fi

# --- Sensor 2 average for the window between 48 and 24 hours ago ---
# Take the last 48 hours, then keep only rows older than the 24-hour cutoff.
LAST48_RAW=$(filter_last_hours 48)
CUTOFF24=$(date -d '24 hours ago' -Iseconds)
LAST48=$(awk -F, -v cutoff="$CUTOFF24" '$1 < cutoff' <<<"$LAST48_RAW")
if [[ -z "$LAST48" ]]; then
  S48_COUNT=0
else
  S48_COUNT=$(grep -c '^' <<<"$LAST48" || true)
fi

S48_S2_AVG="N/A"
if [[ "$S48_COUNT" -gt 0 ]]; then
  S48_S2_STATS=$(stats_col 5 <<<"$LAST48")
  IFS=, read -r _ _ S48_S2_AVG _ <<<"$S48_S2_STATS"
fi
|
||||
|
||||
# State transitions logged under the kingston-thermal tag in the last 24 hours.
# With pipefail, a grep that matches nothing fails the pipeline, which
# triggers the placeholder text.
ALERT_EVENTS=$(
  journalctl -t kingston-thermal --since '24 hours ago' --no-pager 2>/dev/null \
    | grep -E 'ALERT|RECOVERY' \
    || echo "(niciun eveniment de tranzitie in 24h)"
)

# Up to the last 10 backup-related lines from the pvescheduler unit (24 hours).
BACKUP_EVENTS=$(
  journalctl -u pvescheduler --since '24 hours ago' --no-pager 2>/dev/null \
    | grep -iE 'backup|vzdump' \
    | tail -10 \
    || echo "(niciun job inregistrat)"
)
if [[ -z "$BACKUP_EVENTS" ]]; then
  BACKUP_EVENTS="(niciun job inregistrat)"
fi
|
||||
|
||||
# Label shown in the report for the persisted monitor state.
if [[ "$CURRENT_STATE" == "ok" ]]; then
  STATUS_LINE="OK"
elif [[ "$CURRENT_STATE" == "warning" ]]; then
  STATUS_LINE="WARNING"
elif [[ "$CURRENT_STATE" == "critical" ]]; then
  STATUS_LINE="CRITICAL"
else
  STATUS_LINE="UNKNOWN"
fi

# Day-over-day trend: the 24h average of Sensor 2 minus the average of the
# preceding 24h window. A change within +/-1 degree is reported as stable.
TREND_LINE="(prea putine date pentru trend)"
if [[ "$S48_S2_AVG" != "N/A" && "$S24_S2_AVG" != "N/A" ]]; then
  DELTA=$(awk -v a="$S24_S2_AVG" -v b="$S48_S2_AVG" 'BEGIN { printf "%.1f", a-b }')
  if awk -v d="$DELTA" 'BEGIN { exit !(d < -1) }'; then
    TREND_LINE="Sensor 2 a SCAZUT cu ${DELTA}C fata de ieri (${S48_S2_AVG} -> ${S24_S2_AVG}) - imbunatatire"
  elif awk -v d="$DELTA" 'BEGIN { exit !(d > 1) }'; then
    TREND_LINE="Sensor 2 a CRESCUT cu +${DELTA}C fata de ieri (${S48_S2_AVG} -> ${S24_S2_AVG}) - atentie!"
  else
    TREND_LINE="Sensor 2 stabil fata de ieri (${S48_S2_AVG} vs ${S24_S2_AVG})"
  fi
fi

# Hours spent at or above each threshold over the whole log (column 5 = Sensor 2).
TIME_ABOVE_70=$(time_above_threshold 5 70)
TIME_ABOVE_75=$(time_above_threshold 5 75)
TIME_ABOVE_80=$(time_above_threshold 5 80)
|
||||
|
||||
# Assemble the report body. Values computed earlier are expanded in place;
# the date/hostname command substitutions run when this assignment executes.
REPORT="========================================================
KINGSTON SNV3S2000G - RAPORT ZILNIC THERMAL
Generat: $(date '+%Y-%m-%d %H:%M:%S')
Host: $(hostname) ($(hostname -I | awk '{print $1}'))
Device: /dev/nvme2 (mounted /mnt/backup)
========================================================

STATUS CURENT: $STATUS_LINE
Ultima masuratoare: $LATEST_TS
- Sensor 2: ${LATEST_S2}C (cel suspect, pragul critic la 75C)
- Composite: ${LATEST_COMP}C
- Available Spare: ${LATEST_SPARE}%
- Percentage Used: ${LATEST_USED}%
- Media Errors: $LATEST_MEDIA
- Critical Warning Flag: $LATEST_CRITWARN

ULTIMELE 24h (Sensor 2)
Min: ${S24_S2_MIN}C
Max: ${S24_S2_MAX}C
Medie: ${S24_S2_AVG}C
Probe: ${S24_COUNT} masuratori

TREND vs IERI
$TREND_LINE

TIMP TOTAL PETRECUT PESTE PRAG (cumulativ din log)
>=70C (warning): ${TIME_ABOVE_70} ore
>=75C (critical): ${TIME_ABOVE_75} ore
>=80C (limit): ${TIME_ABOVE_80} ore

DIN FIRMWARE-UL SSD-ULUI (cumulativ since power-on)
Warning Comp Temperature Time: ${LATEST_WARNTIME} min total
Critical Comp Temperature Time: ${LATEST_CRITTIME} min total

EVENIMENTE ALERTA ULTIMELE 24h
$ALERT_EVENTS

JOBURI BACKUP ULTIMELE 24h
$BACKUP_EVENTS

========================================================
INTERPRETARE & ACTIUNE RECOMANDATA
========================================================
"

# Context-dependent recommendation, selected by the persisted state and by
# how much 24h data is available (fewer than 5 samples = insufficient data).
RECOM=""
case "$CURRENT_STATE" in
  critical)
    RECOM="CRITIC: Actioneaza AZI. Probabilitate hang iminent. Fa backup la backup-uri pe alt disc si schimba SSD."
    ;;
  warning)
    if [ "$S24_COUNT" -lt 5 ]; then
      RECOM="WARNING (date insuficiente): Mai putin de 24h de date. Asteapta 24-48h pentru evaluare."
    elif [ "$S24_S2_MAX" != "N/A" ] && awk "BEGIN { exit !($S24_S2_MAX < 75) }"; then
      # The 24h maximum stayed below 75.
      RECOM="WARNING marginal: Sensor 2 osciland intre ${S24_S2_MIN}-${S24_S2_MAX}C, sub pragul critic 75C. Distantarea load-ului ajuta, dar SSD-ul e la limita termica. Continua monitorizarea."
    else
      RECOM="WARNING serios: Sensor 2 a atins ${S24_S2_MAX}C in 24h, peste 75C. Daca se repeta in 7 zile -> inlocuire SSD necesara."
    fi
    ;;
  ok)
    if [ "$S24_COUNT" -lt 5 ]; then
      RECOM="OK: Date insuficiente pentru raport complet, dar status curent este normal."
    else
      RECOM="OK: Sensor 2 stabilizat sub 70C. Masurile de distantare temporala au functionat."
    fi
    ;;
  *)
    RECOM="STATE necunoscut. Verifica manual: smartctl -a /dev/nvme2"
    ;;
esac

# Append the recommendation and the footer to the report body.
REPORT+=" ${RECOM}

Pentru detalii: tail -50 /var/log/kingston-thermal.csv
journalctl -t kingston-thermal --since today
========================================================
"
|
||||
|
||||
# Deliver the report: to stdout by default, by mail when the first argument
# is --mail.
if [[ "${1:-}" != "--mail" ]]; then
  echo "$REPORT"
else
  SUBJECT="[KINGSTON $STATUS_LINE] Sensor 2 avg ${S24_S2_AVG}C ($(hostname))"
  # mail's own error output is discarded; only its exit status decides which
  # confirmation line is printed.
  if mail -r "$ALERT_FROM" -s "$SUBJECT" "$ALERT_RECIPIENT" <<<"$REPORT" 2>/dev/null; then
    echo "Raport trimis prin mail la $ALERT_RECIPIENT"
  else
    echo "Eroare la trimitere mail (Gmail poate respinge fara DKIM); raportul ramane logat local."
  fi
fi
|
||||
Reference in New Issue
Block a user