#!/bin/bash
#
# HA Monitor using PVE::Notify - version without qdevice.
# Uses the native Proxmox notification system with custom templates.
#
# TEMPLATE SYSTEM
# ===============
# This script uses Handlebars templates to format its emails. The templates
# live in:
#
#   /etc/pve/notification-templates/default/
#   |-- ha-status-subject.txt.hbs   (email subject)
#   |-- ha-status-body.txt.hbs      (plain-text email body)
#   `-- ha-status-body.html.hbs     (optional, HTML version)
#
# Variables available to the templates:
#   - {{ hostname }} : FQDN of the server
#   - {{ status }}   : "SUCCESSFUL" or "FAILED"
#   - {{ runtime }}  : execution time in seconds
#   - {{ details }}  : details of the HA check
#
# For other scripts that want to use the Proxmox notification system:
#   1. Create the templates in /etc/pve/notification-templates/default/
#   2. Call PVE::Notify::notify($severity, $template_name, $template_data, $fields)
#   3. $template_name must match the base name of the template files
#
# PREREQUISITE
# ============
# A normal run does NOT create the templates. Run this script once with
# --create-templates to (re)create them.
#
# USAGE
# =====
#   ha-monitor.sh                     run the check, send a notification
#   ha-monitor.sh -v | --verbose      same, plus a report on the console
#   ha-monitor.sh --create-templates  (re)write the notification templates
#   ha-monitor.sh -h | --help         show help

# Set a proper PATH for cron execution (includes /usr/sbin for
# corosync-quorumtool).
PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

HOSTNAME=$(hostname)
FQDN=$(hostname -f)
DATE=$(date '+%Y-%m-%d %H:%M:%S')   # kept from the original; currently unused
START_TIME=$(date +%s)

readonly TEMPLATE_DIR="/etc/pve/notification-templates/default"
readonly LOG_FILE="/var/log/pve-ha-monitor.log"

#######################################
# Print the usage/help text.
# Outputs: help text on stdout
#######################################
show_help() {
  cat <<'HELP'
HA Monitor Script - Proxmox High Availability Monitoring

USAGE:
    /opt/scripts/ha-monitor.sh [OPTION]

OPTIONS:
    (no option)         Run HA check and send notification via Proxmox notification system
    -v, --verbose       Run HA check with detailed console output
    --create-templates  Recreate notification templates in /etc/pve/notification-templates/default/
    -h, --help          Display this help message

DESCRIPTION:
    This script monitors the Proxmox HA cluster status and sends notifications
    using the native Proxmox notification system (PVE::Notify).

    It checks:
    - HA Services (pve-ha-lrm, pve-ha-crm)
    - Cluster Quorum status
    - Number of online cluster nodes

NOTIFICATION TEMPLATES:
    Templates are stored in: /etc/pve/notification-templates/default/
    - ha-status-subject.txt.hbs  (email subject)
    - ha-status-body.txt.hbs     (email body text)
    - ha-status-body.html.hbs    (email body HTML)

LOG FILE:
    /var/log/pve-ha-monitor.log

EXAMPLES:
    # Run normal check (silent, sends notification)
    /opt/scripts/ha-monitor.sh

    # Run with verbose output
    /opt/scripts/ha-monitor.sh -v

    # Recreate email templates
    /opt/scripts/ha-monitor.sh --create-templates

CRON SETUP:
    To run every 5 minutes:
    */5 * * * * /opt/scripts/ha-monitor.sh
HELP
}

#######################################
# Write stdin to a template file, reporting failures.
# Arguments: $1 - destination path
# Returns:   0 on success, 1 if the file could not be written
#######################################
write_template() {
  local target=$1
  if ! cat > "$target"; then
    printf 'ERROR: failed to write %s\n' "$target" >&2
    return 1
  fi
}

#######################################
# Create (or overwrite) the three notification templates.
# Globals:   TEMPLATE_DIR (read)
# Outputs:   progress messages on stdout, errors on stderr
# Returns:   0 if every template was written, 1 otherwise
#######################################
create_templates() {
  local template_dir=$TEMPLATE_DIR

  # Create the directory if it does not exist.
  if ! mkdir -p "$template_dir"; then
    printf 'ERROR: cannot create %s\n' "$template_dir" >&2
    return 1
  fi

  echo "Creating notification templates in $template_dir..."

  # Subject template.
  write_template "$template_dir/ha-status-subject.txt.hbs" <<'EOF' || return 1
{{#if (eq status "SUCCESSFUL")}}✅ HA CLUSTER OK - {{ hostname }}{{else}}🚨 HA CLUSTER ISSUES - {{ hostname }}{{/if}}
EOF

  # Plain-text body template.
  write_template "$template_dir/ha-status-body.txt.hbs" <<'EOF' || return 1
{{#if (eq status "SUCCESSFUL")}}✅ HIGH AVAILABILITY STATUS: ALL SYSTEMS OK{{else}}🚨 HIGH AVAILABILITY CLUSTER HAS ISSUES{{/if}}

Host: {{ hostname }}
Check duration: {{ runtime }}s

CLUSTER STATUS:
{{ details }}

{{#if (eq status "FAILED")}}
=== IMMEDIATE ACTIONS REQUIRED ===
1. SSH to cluster: ssh root@{{ hostname }}
2. Check overall status: pvecm status
3. Review HA logs: journalctl -u pve-ha-lrm -u pve-ha-crm -n 20
4. Check network connectivity between nodes
5. Verify all cluster nodes are online
{{else}}
All HA components are functioning normally.
- Cluster has proper quorum
- Automatic VM migration is available
- System is fully redundant
{{/if}}

=== MANUAL SCRIPT EXECUTION ===
To run this HA status check manually:

Basic check:
  /opt/scripts/ha-monitor.sh

Verbose output (shows details on console):
  /opt/scripts/ha-monitor.sh -v

Recreate email templates:
  /opt/scripts/ha-monitor.sh --create-templates

Script location: /opt/scripts/ha-monitor.sh
Log file: /var/log/pve-ha-monitor.log

Total check time: {{ runtime }}s
EOF

  # HTML body template.
  # NOTE(review): the body below contains no HTML markup; it is reproduced
  # exactly as found. Confirm whether tags are expected in this file.
  write_template "$template_dir/ha-status-body.html.hbs" <<'EOF' || return 1
{{#if (eq status "SUCCESSFUL")}}

✅ HIGH AVAILABILITY STATUS: ALL SYSTEMS OK

{{else}}

🚨 HIGH AVAILABILITY CLUSTER HAS ISSUES

{{/if}}

Host: {{ hostname }}
Check duration: {{ runtime }}s

CLUSTER STATUS

{{ details }}
{{#if (eq status "FAILED")}}

IMMEDIATE ACTIONS REQUIRED

  1. SSH to cluster: ssh root@{{ hostname }}
  2. Check overall status: pvecm status
  3. Review HA logs: journalctl -u pve-ha-lrm -u pve-ha-crm -n 20
  4. Check network connectivity between nodes
  5. Verify all cluster nodes are online

Warning: Issues detected in the cluster. Immediate attention required to ensure high availability.

{{else}}

All HA components are functioning normally:

{{/if}}

MANUAL SCRIPT EXECUTION

To run this HA status check manually:

Basic check:

/opt/scripts/ha-monitor.sh

Verbose output:

/opt/scripts/ha-monitor.sh -v

Recreate templates:

/opt/scripts/ha-monitor.sh --create-templates

Script location: /opt/scripts/ha-monitor.sh
Log file: /var/log/pve-ha-monitor.log

Total check time: {{ runtime }}s

EOF

  echo "Templates created successfully."
}

#######################################
# Count membership lines (lines starting with a 0x... node id) in the
# output of `pvecm status`.
# Arguments: $1 - text of `pvecm status`
# Outputs:   the number of matching lines on stdout
#######################################
count_member_nodes() {
  # grep -c still prints "0" when nothing matches (it only exits non-zero).
  grep -Ec '^[[:space:]]*0x[0-9a-fA-F]+' <<<"$1"
}

#######################################
# Check HA services, quorum and the number of cluster members.
# Outputs:   first line "SUCCESSFUL" or "FAILED", followed by the
#            human-readable detail lines
# Returns:   0 (the result is communicated via stdout)
#######################################
check_ha_status() {
  local status_ok=true
  local -a report=()
  local quorum_info pvecm_info expected_votes total_votes nodes_online

  # HA services: both the local resource manager and the cluster resource
  # manager must be active.
  if systemctl is-active --quiet pve-ha-lrm \
    && systemctl is-active --quiet pve-ha-crm; then
    report+=("HA Services: OK")
  else
    report+=("HA Services: ERROR - Services not running")
    report+=(" Recovery: systemctl restart pve-ha-lrm pve-ha-crm")
    status_ok=false
  fi

  # Quorum. Errors from the tools are suppressed on purpose: empty output
  # simply leads to the "not quorate" / "0 nodes" branches below.
  quorum_info=$(corosync-quorumtool -s 2>/dev/null)
  pvecm_info=$(pvecm status 2>/dev/null)

  if grep -q "Quorate:.*Yes" <<<"$quorum_info"; then
    expected_votes=$(awk '/Expected votes:/ {print $3}' <<<"$quorum_info")
    total_votes=$(awk '/Total votes:/ {print $3}' <<<"$quorum_info")

    if [[ "$total_votes" == "$expected_votes" ]]; then
      report+=("Quorum: OK ($total_votes/$expected_votes votes)")
    else
      # Quorate, but not every expected vote is present.
      report+=("Quorum: WARNING ($total_votes/$expected_votes votes)")
      report+=(" Check: pvecm status")
      status_ok=false
    fi
  else
    report+=("Quorum: ERROR - Cluster not quorate")
    report+=(" Check: pvecm status && corosync-quorumtool -s")
    status_ok=false
  fi

  # Cluster nodes: count the membership lines reported by pvecm status.
  nodes_online=$(count_member_nodes "$pvecm_info")

  if (( nodes_online >= 2 )); then
    report+=("Cluster Nodes: OK ($nodes_online nodes online)")
  else
    report+=("Cluster Nodes: ERROR - Only $nodes_online nodes online")
    report+=(" Check: pvecm nodes && ping [offline-node-ip]")
    status_ok=false
  fi

  # Add a recovery section only when something is wrong.
  if [[ "$status_ok" != true ]]; then
    report+=("")
    report+=("=== IMMEDIATE ACTIONS REQUIRED ===")
    report+=("1. SSH to cluster: ssh root@$(hostname -f)")
    report+=("2. Check overall status: pvecm status")
    report+=("3. Review HA logs: journalctl -u pve-ha-lrm -u pve-ha-crm -n 20")
    report+=("4. Check network connectivity between nodes")
  fi

  if [[ "$status_ok" == true ]]; then
    echo "SUCCESSFUL"
  else
    echo "FAILED"
  fi
  printf '%s\n' "${report[@]}"
}

#######################################
# Run the Perl program that calls PVE::Notify::notify.
# All data is read from HA_NOTIFY_* environment variables, so no shell
# value is ever spliced into Perl source code (a quote or backslash in
# the data can neither break nor alter the program).
# Returns: exit status of perl (1 if notify() died)
#######################################
run_notify_perl() {
  perl -I/usr/share/perl5 <<'PERL'
use strict;
use warnings;
use PVE::Notify;

my $template_data = {
    'hostname' => $ENV{HA_NOTIFY_FQDN},
    'status'   => $ENV{HA_NOTIFY_STATUS},
    'runtime'  => $ENV{HA_NOTIFY_RUNTIME},
    'details'  => $ENV{HA_NOTIFY_DETAILS},
};

my $fields = {
    'hostname' => $ENV{HA_NOTIFY_HOSTNAME},
    'type'     => 'ha-status',
    'status'   => $ENV{HA_NOTIFY_STATUS},
};

eval {
    PVE::Notify::notify($ENV{HA_NOTIFY_SEVERITY}, 'ha-status', $template_data, $fields);
    print "Notification sent successfully\n" if $ENV{HA_NOTIFY_VERBOSE};
};
if ($@) {
    print STDERR "Failed to send notification: $@\n";
    exit 1;
}
PERL
}

#######################################
# Send the "ha-status" notification through PVE::Notify.
# Arguments:
#   $1 - severity ("info" or "error")
#   $2 - FQDN (template variable "hostname")
#   $3 - short hostname (metadata field "hostname")
#   $4 - status ("SUCCESSFUL" or "FAILED")
#   $5 - runtime in seconds
#   $6 - detail text
#   $7 - "true" for verbose mode, anything else for quiet mode
# Returns: exit status of the Perl program
#######################################
send_notification() {
  local verbose=$7
  (
    # Subshell: the exports do not leak into the rest of the script.
    export HA_NOTIFY_SEVERITY=$1 HA_NOTIFY_FQDN=$2 HA_NOTIFY_HOSTNAME=$3
    export HA_NOTIFY_STATUS=$4 HA_NOTIFY_RUNTIME=$5 HA_NOTIFY_DETAILS=$6

    if [[ "$verbose" == true ]]; then
      # Verbose mode: show every message.
      export HA_NOTIFY_VERBOSE=1
      run_notify_perl
    else
      # Quiet mode: discard STDERR so that cron does not send mail.
      export HA_NOTIFY_VERBOSE=
      run_notify_perl 2>/dev/null
    fi
  )
}

#######################################
# Entry point: dispatch on the first argument, run the check, notify,
# log, and (in verbose mode) print a report.
# Globals:   HOSTNAME, FQDN, START_TIME, LOG_FILE (read)
# Arguments: $1 - optional mode switch (see USAGE in the file header)
#######################################
main() {
  local mode=${1:-}
  local verbose=false
  local result status details end_time runtime severity perl_exit_code

  case "$mode" in
    --help | -h)
      show_help
      exit 0
      ;;
    --create-templates | --templates)
      if create_templates; then
        echo "Templates recreated successfully."
        echo "Run './ha-monitor.sh -v' to test with new templates."
        exit 0
      fi
      exit 1
      ;;
    --verbose | -v)
      verbose=true
      ;;
  esac

  # Run the check: first output line is the status, the rest are details.
  result=$(check_ha_status)
  status=$(head -n 1 <<<"$result")
  details=$(tail -n +2 <<<"$result")

  # Compute the runtime.
  end_time=$(date +%s)
  runtime=$((end_time - START_TIME))

  # Determine the severity.
  if [[ "$status" == "SUCCESSFUL" ]]; then
    severity="info"
  else
    severity="error"
  fi

  # Send the notification through PVE::Notify with type "ha-status".
  send_notification "$severity" "$FQDN" "$HOSTNAME" "$status" \
    "$runtime" "$details" "$verbose"
  perl_exit_code=$?

  # Local log.
  printf '%s: HA status check completed - %s, notification exit code: %s\n' \
    "$(date)" "$status" "$perl_exit_code" >> "$LOG_FILE"

  # Console output for testing.
  if [[ "$verbose" == true ]]; then
    echo "=== HA MONITOR REPORT ==="
    echo "Status: $status"
    echo "Runtime: ${runtime}s"
    echo "Severity: $severity"
    echo "Perl exit code: $perl_exit_code"
    echo
    echo "Details:"
    echo "$details"
    echo
    echo "Using template: ha-status"
    echo "Template data: hostname=$FQDN, status=$status, runtime=${runtime}s"
  fi
}

main "$@"