#!/bin/bash
# HA Monitor using PVE::Notify
# Sends the HA status report through the native Proxmox notification system,
# rendered with custom templates.
#
# TEMPLATE SYSTEM
# ===============
# E-mails are formatted with Handlebars templates, the same mechanism the
# Proxmox backup notifications use. The templates live in:
#
#   /etc/pve/notification-templates/default/
#   ├── ha-status-subject.txt.hbs   (e-mail subject)
#   ├── ha-status-body.txt.hbs      (plain-text body)
#   └── ha-status-body.html.hbs     (optional, HTML body)
#
# Variables available to the templates:
#   {{ hostname }} : FQDN of this server
#   {{ status }}   : "SUCCESSFUL" or "FAILED"
#   {{ runtime }}  : execution time in seconds
#   {{ details }}  : details of the HA check
#
# For other scripts that want to use the Proxmox notification system:
#   1. Create the templates in /etc/pve/notification-templates/default/
#   2. Call PVE::Notify::notify($severity, $template_name, $template_data, $fields)
#   3. $template_name must match the template file name prefix
#
# PREREQUISITE
# ============
# The templates are created automatically by this script on first run.
#
# USAGE
# =====
#   ha-monitor.sh                      run the check and send the notification
#   ha-monitor.sh -v | --verbose       same, and print a report on the console
#   ha-monitor.sh --create-templates   (re)write the templates and exit
#   ha-monitor.sh --templates          alias of --create-templates

set -u

readonly TEMPLATE_DIR="/etc/pve/notification-templates/default"
readonly LOG_FILE="/var/log/pve-ha-monitor.log"

HOST_SHORT=$(hostname)
# Fall back to the short name when no FQDN can be resolved.
FQDN=$(hostname -f 2>/dev/null)
[[ -n "$FQDN" ]] || FQDN=$HOST_SHORT

#######################################
# Write the three notification templates into TEMPLATE_DIR, overwriting
# any existing files of the same name.
# Globals:   TEMPLATE_DIR (read)
# Arguments: none
# Outputs:   progress messages on stdout, errors on stderr
# Returns:   0 on success, 1 if the directory or any file cannot be written
#######################################
create_templates() {
  local template_dir=$TEMPLATE_DIR

  # Create the directory if it does not exist yet.
  if ! mkdir -p "$template_dir"; then
    echo "ERROR: cannot create template directory $template_dir" >&2
    return 1
  fi

  echo "Creating notification templates in $template_dir..."

  # Subject template.
  cat > "$template_dir/ha-status-subject.txt.hbs" << 'EOF' || { echo "ERROR: cannot write ha-status-subject.txt.hbs" >&2; return 1; }
{{#if (eq status "SUCCESSFUL")}}✅ HA CLUSTER OK - {{ hostname }}{{else}}🚨 HA CLUSTER ISSUES - {{ hostname }}{{/if}}
EOF

  # Plain-text body template.
  cat > "$template_dir/ha-status-body.txt.hbs" << 'EOF' || { echo "ERROR: cannot write ha-status-body.txt.hbs" >&2; return 1; }
{{#if (eq status "SUCCESSFUL")}}✅ HIGH AVAILABILITY STATUS: ALL SYSTEMS OK{{else}}🚨 HIGH AVAILABILITY CLUSTER HAS ISSUES{{/if}}

Host: {{ hostname }}
Check duration: {{ runtime }}s

CLUSTER STATUS:
{{ details }}

{{#if (eq status "FAILED")}}
=== HOW TO READ pvecm status OUTPUT ===

Your current problematic output shows:
- Total votes: 2 (WRONG - should be 3)
- Qdevice (votes 0) (WRONG - should be votes 1)

After fix should show:
- Total votes: 3 (CORRECT)
- Qdevice (votes 1) (CORRECT)

=== STEP-BY-STEP FIX ===

Step 1 - Fix Qdevice (PRIORITY):
  systemctl restart corosync-qdevice
  sleep 5
  corosync-qdevice-tool -s

Step 2 - Verify cluster status:
  pvecm status
  LOOK FOR: Total votes: 3 (not 2!) and Qdevice (votes 1)

Step 3 - Test HA functionality:
  ha-manager status

=== WHAT THIS MEANS ===

QDEVICE DISCONNECTED: No tie-breaker vote
- If one node fails, cluster may lose quorum
- VMs won't automatically migrate

The cluster works now but has no tie-breaker vote.
One node failure = no quorum = VMs can't migrate.
{{else}}
All HA components are functioning normally.
- Cluster has proper quorum with qdevice participation
- Automatic VM migration is available
- System is fully redundant
{{/if}}

=== MANUAL SCRIPT EXECUTION ===

To run this HA status check manually:

Basic check:
  /opt/scripts/ha-monitor.sh

Verbose output (shows details on console):
  /opt/scripts/ha-monitor.sh -v

Recreate email templates:
  /opt/scripts/ha-monitor.sh --create-templates

Script location: /opt/scripts/ha-monitor.sh
Log file: /var/log/pve-ha-monitor.log

Total check time: {{ runtime }}s
EOF

  # HTML body template (intended to use a larger, consistent font).
  # NOTE(review): as written here the template contains no HTML markup, so
  # mail clients will collapse its line breaks - confirm against the
  # deployed template before relying on the HTML part.
  cat > "$template_dir/ha-status-body.html.hbs" << 'EOF' || { echo "ERROR: cannot write ha-status-body.html.hbs" >&2; return 1; }
{{#if (eq status "SUCCESSFUL")}}

✅ HIGH AVAILABILITY STATUS: ALL SYSTEMS OK

{{else}}

🚨 HIGH AVAILABILITY CLUSTER HAS ISSUES

{{/if}}

Host: {{ hostname }}
Check duration: {{ runtime }}s

CLUSTER STATUS

{{ details }}
{{#if (eq status "FAILED")}}

HOW TO READ pvecm status OUTPUT

Your current problematic output shows:

After fix should show:

STEP-BY-STEP FIX

Step 1 - Fix Qdevice:

systemctl restart corosync-qdevice
sleep 5
corosync-qdevice-tool -s

Step 2 - Verify status:

pvecm status

LOOK FOR: Total votes: 3 (not 2!) and Qdevice (votes 1)

Bottom line: The cluster works now but has no tie-breaker vote.
One node failure = no quorum = VMs can't migrate.

{{else}}

All HA components are functioning normally:

{{/if}}

MANUAL SCRIPT EXECUTION

To run this HA status check manually:

Basic check:

/opt/scripts/ha-monitor.sh

Verbose output:

/opt/scripts/ha-monitor.sh -v

Recreate templates:

/opt/scripts/ha-monitor.sh --create-templates

Script location: /opt/scripts/ha-monitor.sh
Log file: /var/log/pve-ha-monitor.log

Total check time: {{ runtime }}s

EOF

  echo "Templates created successfully."
}

#######################################
# Inspect HA services, quorum, qdevice connectivity and node count.
# Globals:   FQDN (read)
# Arguments: none
# Outputs:   first line "SUCCESSFUL" or "FAILED", followed by the
#            human-readable detail lines, all on stdout
# Returns:   0 (the verdict is carried in the output, not the exit status)
#######################################
check_ha_status() {
  local status_ok=true
  local details=""
  local nl=$'\n'
  local quorum_info pvecm_info qdevice_status
  local expected_votes total_votes qdevice_votes qnetd_host nodes_online

  # HA services: both the local resource manager and the cluster resource
  # manager must be active.
  if systemctl is-active --quiet pve-ha-lrm \
    && systemctl is-active --quiet pve-ha-crm; then
    details+="HA Services: OK${nl}"
  else
    details+="HA Services: ERROR - Services not running${nl}"
    details+=" Recovery: systemctl restart pve-ha-lrm pve-ha-crm${nl}"
    status_ok=false
  fi

  # Quorum and qdevice vote. Tool errors are silenced on purpose: empty
  # output simply falls through to the "not quorate" / "disconnected" paths.
  quorum_info=$(corosync-quorumtool -s 2>/dev/null)
  pvecm_info=$(pvecm status 2>/dev/null)

  if grep -q "Quorate:.*Yes" <<< "$quorum_info"; then
    expected_votes=$(awk '/Expected votes:/ {print $3}' <<< "$quorum_info")
    total_votes=$(awk '/Total votes:/ {print $3}' <<< "$quorum_info")

    # The qdevice shows up in "pvecm status" as node id 0x00000000; the
    # second column is its vote count.
    qdevice_votes=$(grep -E "^[[:space:]]*0x00000000[[:space:]]+1[[:space:]]+Qdevice" \
      <<< "$pvecm_info" | awk '{print $2}')

    if [[ "$total_votes" == "$expected_votes" && "$qdevice_votes" == "1" ]]; then
      details+="Quorum: OK ($total_votes/$expected_votes votes, Qdevice participating)${nl}"
    elif [[ "$total_votes" == "$expected_votes" ]]; then
      details+="Quorum: OK ($total_votes/$expected_votes votes)${nl}"
    else
      details+="Quorum: WARNING ($total_votes/$expected_votes votes)${nl}"
      details+=" Check: pvecm status for qdevice participation${nl}"
      status_ok=false
    fi
  else
    details+="Quorum: ERROR - Cluster not quorate${nl}"
    details+=" Check: pvecm status && corosync-quorumtool -s${nl}"
    status_ok=false
  fi

  # Qdevice connectivity.
  qdevice_status=$(corosync-qdevice-tool -s 2>/dev/null)
  if grep -q "State:.*Connected" <<< "$qdevice_status"; then
    qnetd_host=$(awk '/QNetd host:/ {print $3}' <<< "$qdevice_status")
    details+="Qdevice Connection: OK ($qnetd_host)${nl}"
  else
    details+="Qdevice Connection: WARNING - Disconnected${nl}"
    details+=" Recovery: systemctl restart corosync-qdevice${nl}"
    status_ok=false
  fi

  # Node count: lines of "pvecm status" carrying the A,V,NMW flags.
  nodes_online=$(grep -c "A,V,NMW" <<< "$pvecm_info")
  if [[ "$nodes_online" -ge 2 ]]; then
    details+="Cluster Nodes: OK ($nodes_online nodes online)${nl}"
  else
    details+="Cluster Nodes: ERROR - Only $nodes_online nodes online${nl}"
    details+=" Check: pvecm nodes && ping [offline-node-ip]${nl}"
    status_ok=false
  fi

  # Recovery section is appended only when something failed.
  if [[ "$status_ok" != true ]]; then
    details+="${nl}=== IMMEDIATE ACTIONS REQUIRED ===${nl}"
    details+="1. SSH to cluster: ssh root@${FQDN}${nl}"
    details+="2. Check overall status: pvecm status${nl}"
    details+="3. Review HA logs: journalctl -u pve-ha-lrm -u pve-ha-crm -n 20${nl}"
    details+="4. Check network connectivity between nodes${nl}"
  fi

  if [[ "$status_ok" == true ]]; then
    echo "SUCCESSFUL"
  else
    echo "FAILED"
  fi
  # printf '%s' keeps any backslashes from tool output literal.
  printf '%s\n' "$details"
}

#######################################
# Send the report through PVE::Notify using the "ha-status" templates.
# Values are handed to Perl through the environment (and the heredoc is
# quoted) so that quotes or backslashes in them can never alter the Perl
# program.
# Globals:   FQDN, HOST_SHORT (read)
# Arguments: $1 - severity ("info" or "error")
#            $2 - status ("SUCCESSFUL" or "FAILED")
#            $3 - runtime in seconds
#            $4 - detail text
# Outputs:   confirmation on stdout, failure reason on stderr
# Returns:   exit status of perl (1 when notify() died)
#######################################
send_notification() {
  local severity=$1
  local status=$2
  local runtime=$3
  local details=$4

  HA_NOTIFY_SEVERITY="$severity" \
  HA_NOTIFY_STATUS="$status" \
  HA_NOTIFY_RUNTIME="$runtime" \
  HA_NOTIFY_DETAILS="$details" \
  HA_NOTIFY_FQDN="$FQDN" \
  HA_NOTIFY_HOSTNAME="$HOST_SHORT" \
    perl -I/usr/share/perl5 << 'PERL_EOF'
use strict;
use warnings;
use PVE::Notify;

# Data made available to the templates.
my $template_data = {
    'hostname' => $ENV{HA_NOTIFY_FQDN},
    'status'   => $ENV{HA_NOTIFY_STATUS},
    'runtime'  => $ENV{HA_NOTIFY_RUNTIME},
    'details'  => $ENV{HA_NOTIFY_DETAILS},
};

# Metadata for notification matchers.
my $fields = {
    'hostname' => $ENV{HA_NOTIFY_HOSTNAME},
    'type'     => 'ha-status',
    'status'   => $ENV{HA_NOTIFY_STATUS},
};

# Uses the templates /etc/pve/notification-templates/default/ha-status-*
eval {
    PVE::Notify::notify($ENV{HA_NOTIFY_SEVERITY}, 'ha-status', $template_data, $fields);
    print "Notification sent successfully\n";
};
if ($@) {
    print STDERR "Failed to send notification: $@\n";
    exit 1;
}
PERL_EOF
}

#######################################
# Entry point: parse the single optional argument, make sure templates
# exist, run the check, notify, log, and optionally print a report.
# Arguments: $1 - optional flag (see USAGE in the file header)
#######################################
main() {
  local verbose=false
  local start_time end_time runtime
  local result status details severity notify_rc

  start_time=$(date +%s)

  # Template regeneration is handled before anything else so that it does
  # not trigger a cluster check and a notification as a side effect.
  case "${1:-}" in
    --create-templates | --templates)
      create_templates || exit 1
      echo "Templates recreated successfully."
      echo "Run './ha-monitor.sh -v' to test with new templates."
      exit 0
      ;;
    --verbose | -v)
      verbose=true
      ;;
  esac

  # Create the templates on first run (detected by the missing subject file).
  if [[ ! -f "$TEMPLATE_DIR/ha-status-subject.txt.hbs" ]]; then
    create_templates \
      || echo "WARNING: notification templates could not be created" >&2
  fi

  # Run the check: first output line is the verdict, the rest is detail text.
  result=$(check_ha_status)
  IFS= read -r status <<< "$result"
  details=$(tail -n +2 <<< "$result")

  end_time=$(date +%s)
  runtime=$((end_time - start_time))

  if [[ "$status" == "SUCCESSFUL" ]]; then
    severity="info"
  else
    severity="error"
  fi

  send_notification "$severity" "$status" "$runtime" "$details"
  notify_rc=$?

  # Local log.
  echo "$(date): HA status check completed - $status, notification exit code: $notify_rc" >> "$LOG_FILE"

  # Console report for manual testing.
  if [[ "$verbose" == true ]]; then
    echo "=== HA MONITOR REPORT ==="
    echo "Status: $status"
    echo "Runtime: ${runtime}s"
    echo "Severity: $severity"
    echo "Perl exit code: $notify_rc"
    echo
    echo "Details:"
    echo "$details"
    echo
    echo "Using template: ha-status"
    echo "Template data: hostname=$FQDN, status=$status, runtime=${runtime}s"
  fi
}

main "$@"