#!/bin/bash
#
# Kingston SNV3S2000G — daily thermal trend report
#
# Reads /var/log/kingston-thermal.csv (populated every 30 min by
# kingston-thermal-monitor.sh) and produces a human-readable summary:
# current state, 24h min/max/avg, day-over-day delta, alert events.
#
# CSV columns used by this script (1-based, first row is a header):
#   1  timestamp (compared as a string against `date -Iseconds` output)
#   2  critical warning flag      3  composite temperature
#   5  sensor 2 temperature       7  available spare (%)
#   8  percentage used (%)        9  media errors
#   11 warning temperature time   12 critical temperature time
#
# Usage:
#   /opt/scripts/kingston-thermal-report.sh          # report on stdout
#   /opt/scripts/kingston-thermal-report.sh --mail   # report mailed to $ALERT_RECIPIENT
#
# Environment:
#   ALERT_RECIPIENT  mail recipient (default: mmarius28@gmail.com)
#
# Schedule (suggested cron): 0 8 * * * /opt/scripts/kingston-thermal-report.sh --mail
#
set -euo pipefail

export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"

readonly LOG_FILE="/var/log/kingston-thermal.csv"
readonly STATE_FILE="/var/run/kingston-thermal-state"
readonly ALERT_RECIPIENT="${ALERT_RECIPIENT:-mmarius28@gmail.com}"
readonly ALERT_FROM="ups@romfast.ro"
# Minutes between two consecutive rows of the log (see header comment).
readonly SAMPLE_INTERVAL_MIN=30

# Print a message on stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

[[ -f "$LOG_FILE" ]] \
  || die "Eroare: $LOG_FILE nu exista. Scriptul de monitoring nu a rulat inca?"

# Count data rows in awk so a final line without trailing newline is counted too.
ROW_COUNT=$(awk 'NR > 1 { n++ } END { print n + 0 }' "$LOG_FILE")
(( ROW_COUNT > 0 )) || die "Eroare: Fara date in $LOG_FILE (doar header)."

#######################################
# Print the data rows of the log whose timestamp is within the last N hours.
# Arguments: $1 - number of hours to look back
# Outputs:   matching CSV rows on stdout (header excluded)
# NOTE(review): timestamps are compared as strings, so this assumes column 1
# uses the same ISO-8601 format/offset as `date -Iseconds` — confirm against
# the monitor script.
#######################################
filter_last_hours() {
  local hours="$1"
  local cutoff
  cutoff=$(date -d "$hours hours ago" -Iseconds)
  awk -F, -v cutoff="$cutoff" 'NR > 1 && $1 >= cutoff' "$LOG_FILE"
}

#######################################
# Compute min, max, average and sample count for one CSV column.
# Arguments: $1 - 1-based column number
# Inputs:    CSV rows on stdin (no header)
# Outputs:   "min,max,avg,count" on stdout, or "N/A,N/A,N/A,0" when no
#            usable value was read
#######################################
stats_col() {
  local col="$1"
  awk -F, -v col="$col" '
    # Blank lines or missing values must not be counted as a 0 reading.
    $col == "" { next }
    {
      v = $col + 0
      # Seed min/max from the first sample instead of magic sentinels.
      if (n == 0 || v < min) min = v
      if (n == 0 || v > max) max = v
      sum += v
      n++
    }
    END {
      if (n == 0) print "N/A,N/A,N/A,0"
      else printf "%d,%d,%.1f,%d\n", min, max, sum / n, n
    }
  '
}

#######################################
# Total time spent at or above a threshold, over the whole log.
# Each matching row is counted as one sampling interval.
# Arguments: $1 - 1-based column number, $2 - threshold value
# Outputs:   hours with one decimal on stdout
#######################################
time_above_threshold() {
  local col="$1"
  local threshold="$2"
  awk -F, -v col="$col" -v t="$threshold" -v step="$SAMPLE_INTERVAL_MIN" '
    NR > 1 && $col >= t { n++ }
    END { printf "%.1f\n", (n * step / 60) }
  ' "$LOG_FILE"
}

# Number of lines in a string; 0 for the empty string.
count_lines() {
  local text="$1"
  if [[ -z "$text" ]]; then
    printf '0\n'
  else
    printf '%s\n' "$text" | grep -c '^'
  fi
}

# Succeeds when $1 < $2 numerically. Values are handed to awk with -v so
# they are never parsed as awk program text.
num_lt() {
  awk -v a="$1" -v b="$2" 'BEGIN { exit !(a < b) }'
}

# --- Latest reading ---------------------------------------------------------
LATEST=$(tail -n 1 "$LOG_FILE")
# Split the row once instead of forking awk per field; the ":-" defaults keep
# `set -u` happy when the row has fewer columns than expected.
IFS=, read -r -a LATEST_FIELDS <<<"$LATEST"
LATEST_TS="${LATEST_FIELDS[0]:-}"
LATEST_CRITWARN="${LATEST_FIELDS[1]:-}"
LATEST_COMP="${LATEST_FIELDS[2]:-}"
LATEST_S2="${LATEST_FIELDS[4]:-}"
LATEST_SPARE="${LATEST_FIELDS[6]:-}"
LATEST_USED="${LATEST_FIELDS[7]:-}"
LATEST_MEDIA="${LATEST_FIELDS[8]:-}"
LATEST_WARNTIME="${LATEST_FIELDS[10]:-}"
LATEST_CRITTIME="${LATEST_FIELDS[11]:-}"

# A missing or unreadable state file is reported as "unknown".
CURRENT_STATE=$(cat -- "$STATE_FILE" 2>/dev/null || echo "unknown")

# --- Stats for the last 24h -------------------------------------------------
LAST24=$(filter_last_hours 24)
S24_COUNT=$(count_lines "$LAST24")
S24_S2_MIN="N/A"
S24_S2_MAX="N/A"
S24_S2_AVG="N/A"
if (( S24_COUNT > 0 )); then
  S24_S2_STATS=$(printf '%s\n' "$LAST24" | stats_col 5)
  IFS=, read -r S24_S2_MIN S24_S2_MAX S24_S2_AVG _ <<<"$S24_S2_STATS"
fi

# --- Stats for the 24-48h window -------------------------------------------
LAST48_RAW=$(filter_last_hours 48)
CUTOFF24=$(date -d '24 hours ago' -Iseconds)
LAST48=$(printf '%s\n' "$LAST48_RAW" \
  | awk -F, -v cutoff="$CUTOFF24" '$1 != "" && $1 < cutoff')
S48_COUNT=$(count_lines "$LAST48")
S48_S2_AVG="N/A"
if (( S48_COUNT > 0 )); then
  S48_S2_STATS=$(printf '%s\n' "$LAST48" | stats_col 5)
  IFS=, read -r _ _ S48_S2_AVG _ <<<"$S48_S2_STATS"
fi

# --- Journal excerpts (24h) -------------------------------------------------
# Best effort: journalctl may be missing or have nothing to show, and grep
# exits 1 without matches; neither must abort the report under set -e /
# pipefail, hence "|| true" followed by an explicit placeholder.
ALERT_EVENTS=$(journalctl -t kingston-thermal --since '24 hours ago' --no-pager 2>/dev/null \
  | grep -E 'ALERT|RECOVERY' || true)
[[ -n "$ALERT_EVENTS" ]] || ALERT_EVENTS="(niciun eveniment de tranzitie in 24h)"

BACKUP_EVENTS=$(journalctl -u pvescheduler --since '24 hours ago' --no-pager 2>/dev/null \
  | grep -iE 'backup|vzdump' \
  | tail -n 10 || true)
[[ -n "$BACKUP_EVENTS" ]] || BACKUP_EVENTS="(niciun job inregistrat)"

# --- Status indicator -------------------------------------------------------
case "$CURRENT_STATE" in
  ok) STATUS_LINE="OK" ;;
  warning) STATUS_LINE="WARNING" ;;
  critical) STATUS_LINE="CRITICAL" ;;
  *) STATUS_LINE="UNKNOWN" ;;
esac

# --- Trend: last 24h average vs the 24h before that -------------------------
TREND_LINE="(prea putine date pentru trend)"
if [[ "$S48_S2_AVG" != "N/A" && "$S24_S2_AVG" != "N/A" ]]; then
  DELTA=$(awk -v a="$S24_S2_AVG" -v b="$S48_S2_AVG" 'BEGIN { printf "%.1f", a - b }')
  # Changes within +/-1 degree are treated as stable.
  if num_lt "$DELTA" -1; then
    TREND_LINE="Sensor 2 a SCAZUT cu ${DELTA}C fata de ieri (${S48_S2_AVG} -> ${S24_S2_AVG}) - imbunatatire"
  elif num_lt 1 "$DELTA"; then
    TREND_LINE="Sensor 2 a CRESCUT cu +${DELTA}C fata de ieri (${S48_S2_AVG} -> ${S24_S2_AVG}) - atentie!"
  else
    TREND_LINE="Sensor 2 stabil fata de ieri (${S48_S2_AVG} vs ${S24_S2_AVG})"
  fi
fi

# --- Time above thresholds (cumulative over the whole log) ------------------
TIME_ABOVE_70=$(time_above_threshold 5 70)
TIME_ABOVE_75=$(time_above_threshold 5 75)
TIME_ABOVE_80=$(time_above_threshold 5 80)

# --- Host identification ----------------------------------------------------
# Resolved outside the report string so a failing `hostname -I` (option not
# supported, no address) cannot abort the whole report.
GENERATED_AT=$(date '+%Y-%m-%d %H:%M:%S')
HOST_NAME=$(hostname)
HOST_IP=$(hostname -I 2>/dev/null | awk '{print $1}') || HOST_IP=""

# --- Build report -----------------------------------------------------------
REPORT="========================================================
  KINGSTON SNV3S2000G - RAPORT ZILNIC THERMAL
  Generat: ${GENERATED_AT}
  Host: ${HOST_NAME} (${HOST_IP})
  Device: /dev/nvme2 (mounted /mnt/backup)
========================================================

STATUS CURENT: $STATUS_LINE

Ultima masuratoare: $LATEST_TS
  - Sensor 2: ${LATEST_S2}C (cel suspect, pragul critic la 75C)
  - Composite: ${LATEST_COMP}C
  - Available Spare: ${LATEST_SPARE}%
  - Percentage Used: ${LATEST_USED}%
  - Media Errors: $LATEST_MEDIA
  - Critical Warning Flag: $LATEST_CRITWARN

ULTIMELE 24h (Sensor 2)
  Min: ${S24_S2_MIN}C
  Max: ${S24_S2_MAX}C
  Medie: ${S24_S2_AVG}C
  Probe: ${S24_COUNT} masuratori

TREND vs IERI
  $TREND_LINE

TIMP TOTAL PETRECUT PESTE PRAG (cumulativ din log)
  >=70C (warning): ${TIME_ABOVE_70} ore
  >=75C (critical): ${TIME_ABOVE_75} ore
  >=80C (limit): ${TIME_ABOVE_80} ore

DIN FIRMWARE-UL SSD-ULUI (cumulativ since power-on)
  Warning Comp Temperature Time: ${LATEST_WARNTIME} min total
  Critical Comp Temperature Time: ${LATEST_CRITTIME} min total

EVENIMENTE ALERTA ULTIMELE 24h
$ALERT_EVENTS

JOBURI BACKUP ULTIMELE 24h
$BACKUP_EVENTS

========================================================
INTERPRETARE & ACTIUNE RECOMANDATA
========================================================
"

# --- Context-dependent recommendation ---------------------------------------
RECOM=""
if [[ "$CURRENT_STATE" == "critical" ]]; then
  RECOM="CRITIC: Actioneaza AZI. Probabilitate hang iminent. Fa backup la backup-uri pe alt disc si schimba SSD."
elif [[ "$CURRENT_STATE" == "warning" ]] && (( S24_COUNT < 5 )); then
  RECOM="WARNING (date insuficiente): Mai putin de 24h de date. Asteapta 24-48h pentru evaluare."
elif [[ "$CURRENT_STATE" == "warning" ]]; then
  if [[ "$S24_S2_MAX" != "N/A" ]] && num_lt "$S24_S2_MAX" 75; then
    RECOM="WARNING marginal: Sensor 2 osciland intre ${S24_S2_MIN}-${S24_S2_MAX}C, sub pragul critic 75C. Distantarea load-ului ajuta, dar SSD-ul e la limita termica. Continua monitorizarea."
  else
    RECOM="WARNING serios: Sensor 2 a atins ${S24_S2_MAX}C in 24h, peste 75C. Daca se repeta in 7 zile -> inlocuire SSD necesara."
  fi
elif [[ "$CURRENT_STATE" == "ok" ]]; then
  if (( S24_COUNT < 5 )); then
    RECOM="OK: Date insuficiente pentru raport complet, dar status curent este normal."
  else
    RECOM="OK: Sensor 2 stabilizat sub 70C. Masurile de distantare temporala au functionat."
  fi
else
  RECOM="STATE necunoscut. Verifica manual: smartctl -a /dev/nvme2"
fi

REPORT="${REPORT}
${RECOM}

Pentru detalii:
  tail -50 /var/log/kingston-thermal.csv
  journalctl -t kingston-thermal --since today
========================================================
"

# --- Output -----------------------------------------------------------------
if [[ "${1:-}" == "--mail" ]]; then
  SUBJECT="[KINGSTON $STATUS_LINE] Sensor 2 avg ${S24_S2_AVG}C ($(hostname))"
  # Mail delivery is best effort: a failure is reported but does not change
  # the exit status of the script.
  if printf '%s\n' "$REPORT" \
    | mail -r "$ALERT_FROM" -s "$SUBJECT" "$ALERT_RECIPIENT" 2>/dev/null; then
    echo "Raport trimis prin mail la $ALERT_RECIPIENT"
  else
    echo "Eroare la trimitere mail (Gmail poate respinge fara DKIM); raportul ramane logat local." >&2
  fi
else
  printf '%s\n' "$REPORT"
fi