#!/bin/bash
# HA Monitor with PVE::Notify - final version
# Uses the native Proxmox notification system with custom templates
#
# TEMPLATE SYSTEM:
# ================
# This script formats its emails with Handlebars templates, just like the
# Proxmox backup system. The templates live in:
#
# /etc/pve/notification-templates/default/
# ├── ha-status-subject.txt.hbs (email subject)
# ├── ha-status-body.txt.hbs (plain-text email body)
# └── ha-status-body.html.hbs (optional, HTML version)
#
# The templates use Handlebars syntax with these variables:
# - {{ hostname }} : FQDN of the server
# - {{ status }} : "SUCCESSFUL" or "FAILED"
# - {{ runtime }} : execution time in seconds
# - {{ details }} : details of the HA check
#
# For other scripts that want to use the Proxmox notification system:
# 1. Create the templates in /etc/pve/notification-templates/default/
# 2. Call PVE::Notify::notify($severity, $template_name, $template_data, $fields)
# 3. $template_name must match the name of the template files
#
# PREREQUISITE:
# =============
# The templates are created automatically by the script on its first run.
# Node identity and timing, captured once at start-up.
#   HOSTNAME   - short host name (sent as the "hostname" matcher field)
#   FQDN       - fully-qualified name (sent as the "hostname" template value)
#   DATE       - human-readable start timestamp
#   START_TIME - start time in epoch seconds, used to compute the runtime
HOSTNAME=$(hostname)
# `hostname -f` exits non-zero and prints nothing when the name cannot be
# resolved; stderr is silenced on purpose and the short name is used instead so
# the notification never carries an empty host.
FQDN=$(hostname -f 2>/dev/null)
FQDN=${FQDN:-$HOSTNAME}
DATE=$(date '+%Y-%m-%d %H:%M:%S')
START_TIME=$(date +%s)
#######################################
# Write the three "ha-status" notification templates (subject, plain-text
# body, HTML body), overwriting any existing files of the same name.
# Arguments:
#   $1 - (optional) target directory; defaults to
#        /etc/pve/notification-templates/default
# Outputs:
#   Progress messages on stdout, error messages on stderr.
# Returns:
#   0 when the directory exists and every template was written,
#   1 when the directory could not be created or any write failed.
#######################################
create_templates() {
  local template_dir="${1:-/etc/pve/notification-templates/default}"
  local rc=0

  # Create the directory if it does not exist yet.
  if ! mkdir -p -- "$template_dir"; then
    echo "ERROR: cannot create template directory $template_dir" >&2
    return 1
  fi
  echo "Creating notification templates in $template_dir..."

  # The heredoc delimiters are quoted so the Handlebars {{ ... }} expressions
  # and any '$' reach the files verbatim. A failed redirection or write only
  # sets rc, so the remaining templates are still attempted.

  # Subject template (one line, switches on status).
  cat > "$template_dir/ha-status-subject.txt.hbs" << 'EOF' || rc=1
{{#if (eq status "SUCCESSFUL")}}✅ HA CLUSTER OK - {{ hostname }}{{else}}🚨 HA CLUSTER ISSUES - {{ hostname }}{{/if}}
EOF

  # Plain-text body template.
  cat > "$template_dir/ha-status-body.txt.hbs" << 'EOF' || rc=1
{{#if (eq status "SUCCESSFUL")}}✅ HIGH AVAILABILITY STATUS: ALL SYSTEMS OK{{else}}🚨 HIGH AVAILABILITY CLUSTER HAS ISSUES{{/if}}
Host: {{ hostname }}
Check duration: {{ runtime }}s
CLUSTER STATUS:
{{ details }}
{{#if (eq status "FAILED")}}
=== HOW TO READ pvecm status OUTPUT ===
Your current problematic output shows:
- Total votes: 2 (WRONG - should be 3)
- Qdevice (votes 0) (WRONG - should be votes 1)
After fix should show:
- Total votes: 3 (CORRECT)
- Qdevice (votes 1) (CORRECT)
=== STEP-BY-STEP FIX ===
Step 1 - Fix Qdevice (PRIORITY):
systemctl restart corosync-qdevice
sleep 5
corosync-qdevice-tool -s
Step 2 - Verify cluster status:
pvecm status
LOOK FOR: Total votes: 3 (not 2!) and Qdevice (votes 1)
Step 3 - Test HA functionality:
ha-manager status
=== WHAT THIS MEANS ===
QDEVICE DISCONNECTED: No tie-breaker vote
- If one node fails, cluster may lose quorum
- VMs won't automatically migrate
The cluster works now but has no tie-breaker vote.
One node failure = no quorum = VMs can't migrate.
{{else}}
All HA components are functioning normally.
- Cluster has proper quorum with qdevice participation
- Automatic VM migration is available
- System is fully redundant
{{/if}}
=== MANUAL SCRIPT EXECUTION ===
To run this HA status check manually:
Basic check:
/opt/scripts/ha-monitor.sh
Verbose output (shows details on console):
/opt/scripts/ha-monitor.sh -v
Recreate email templates:
/opt/scripts/ha-monitor.sh --create-templates
Script location: /opt/scripts/ha-monitor.sh
Log file: /var/log/pve-ha-monitor.log
Total check time: {{ runtime }}s
EOF

  # HTML body template.
  cat > "$template_dir/ha-status-body.html.hbs" << 'EOF' || rc=1
{{#if (eq status "SUCCESSFUL")}}
✅ HIGH AVAILABILITY STATUS: ALL SYSTEMS OK
{{else}}
🚨 HIGH AVAILABILITY CLUSTER HAS ISSUES
{{/if}}
Host: {{ hostname }}
Check duration: {{ runtime }}s
CLUSTER STATUS
{{ details }}
{{#if (eq status "FAILED")}}
HOW TO READ pvecm status OUTPUT
Your current problematic output shows:
- Total votes: 2 (WRONG - should be 3)
- Qdevice (votes 0) (WRONG - should be votes 1)
After fix should show:
- Total votes: 3 (CORRECT)
- Qdevice (votes 1) (CORRECT)
STEP-BY-STEP FIX
Step 1 - Fix Qdevice:
systemctl restart corosync-qdevice
sleep 5
corosync-qdevice-tool -s
Step 2 - Verify status:
LOOK FOR: Total votes: 3 (not 2!) and Qdevice (votes 1)
Bottom line: The cluster works now but has no tie-breaker vote.
One node failure = no quorum = VMs can't migrate.
{{else}}
All HA components are functioning normally:
- Cluster has proper quorum with qdevice participation
- Automatic VM migration is available
- System is fully redundant
{{/if}}
MANUAL SCRIPT EXECUTION
To run this HA status check manually:
Basic check:
/opt/scripts/ha-monitor.sh
Verbose output:
/opt/scripts/ha-monitor.sh -v
Recreate templates:
/opt/scripts/ha-monitor.sh --create-templates
Script location: /opt/scripts/ha-monitor.sh
Log file: /var/log/pve-ha-monitor.log
Total check time: {{ runtime }}s
EOF

  # Only claim success when every template was actually written.
  if (( rc != 0 )); then
    echo "ERROR: failed to write one or more templates in $template_dir" >&2
    return 1
  fi
  echo "Templates created successfully."
}
# Create the templates on the first run, or whenever a required one is missing.
# Both the subject and the plain-text body are checked, so a partially
# populated directory is repaired too. Note that create_templates rewrites all
# of its template files.
if [[ ! -f "/etc/pve/notification-templates/default/ha-status-subject.txt.hbs" ||
      ! -f "/etc/pve/notification-templates/default/ha-status-body.txt.hbs" ]]; then
  # Keep going on failure: the HA check itself is still worth running and logging.
  create_templates || echo "WARNING: could not create notification templates" >&2
fi
#######################################
# Check the HA stack and print a report.
# Runs four checks: the pve-ha-lrm / pve-ha-crm services, corosync quorum and
# vote counts, the qdevice connection state, and the number of member lines
# flagged "A,V,NMW" in `pvecm status`.
# Arguments:
#   None
# Outputs:
#   Line 1 on stdout is "SUCCESSFUL" or "FAILED"; the following lines are the
#   per-check details, plus a recovery section when any check failed.
# Returns:
#   Exit status of the final printf; health is reported via stdout only.
#######################################
check_ha_status() {
  local status_ok=true
  local details=""
  # Real newline used as line terminator, so the report can be printed with
  # printf '%s' and backslashes coming from command output stay literal.
  local -r nl=$'\n'
  local quorum_info pvecm_info qdevice_status
  local expected_votes total_votes qdevice_votes qnetd_host nodes_online

  # HA services: both the local and the cluster resource manager must be active.
  if systemctl is-active --quiet pve-ha-lrm && systemctl is-active --quiet pve-ha-crm; then
    details+="HA Services: OK${nl}"
  else
    details+="HA Services: ERROR - Services not running${nl}"
    details+=" Recovery: systemctl restart pve-ha-lrm pve-ha-crm${nl}"
    status_ok=false
  fi

  # Quorum and qdevice votes. Tool errors are silenced on purpose: empty
  # output simply fails the pattern checks below.
  quorum_info=$(corosync-quorumtool -s 2>/dev/null)
  pvecm_info=$(pvecm status 2>/dev/null)
  if grep -q "Quorate:.*Yes" <<<"$quorum_info"; then
    expected_votes=$(awk '/Expected votes:/ {print $3; exit}' <<<"$quorum_info")
    total_votes=$(awk '/Total votes:/ {print $3; exit}' <<<"$quorum_info")
    # Membership line of the qdevice (node id 0x00000000) carrying one vote.
    qdevice_votes=$(grep -E "^[[:space:]]*0x00000000[[:space:]]+1[[:space:]]+Qdevice" <<<"$pvecm_info" \
      | awk '{print $2}')

    # An unparsed (empty) vote count must not pass as "equal".
    if [[ -z "$total_votes" || "$total_votes" != "$expected_votes" ]]; then
      details+="Quorum: WARNING ($total_votes/$expected_votes votes)${nl}"
      details+=" Check: pvecm status for qdevice participation${nl}"
      status_ok=false
    elif [[ "$qdevice_votes" == "1" ]]; then
      details+="Quorum: OK ($total_votes/$expected_votes votes, Qdevice participating)${nl}"
    else
      details+="Quorum: OK ($total_votes/$expected_votes votes)${nl}"
    fi
  else
    details+="Quorum: ERROR - Cluster not quorate${nl}"
    details+=" Check: pvecm status && corosync-quorumtool -s${nl}"
    status_ok=false
  fi

  # Qdevice connectivity.
  qdevice_status=$(corosync-qdevice-tool -s 2>/dev/null)
  if grep -q "State:.*Connected" <<<"$qdevice_status"; then
    qnetd_host=$(awk '/QNetd host:/ {print $3; exit}' <<<"$qdevice_status")
    details+="Qdevice Connection: OK ($qnetd_host)${nl}"
  else
    details+="Qdevice Connection: WARNING - Disconnected${nl}"
    details+=" Recovery: systemctl restart corosync-qdevice${nl}"
    status_ok=false
  fi

  # Cluster nodes: count membership lines carrying the "A,V,NMW" flags.
  # NOTE(review): nodes showing other flag combinations are not counted -
  # confirm this matches the flags this cluster reports when healthy.
  nodes_online=$(grep -c "A,V,NMW" <<<"$pvecm_info")
  if (( nodes_online >= 2 )); then
    details+="Cluster Nodes: OK ($nodes_online nodes online)${nl}"
  else
    details+="Cluster Nodes: ERROR - Only $nodes_online nodes online${nl}"
    details+=" Check: pvecm nodes && ping [offline-node-ip]${nl}"
    status_ok=false
  fi

  # Recovery section, only when something failed.
  if ! $status_ok; then
    details+="${nl}=== IMMEDIATE ACTIONS REQUIRED ===${nl}"
    details+="1. SSH to cluster: ssh root@$(hostname -f)${nl}"
    details+="2. Check overall status: pvecm status${nl}"
    details+="3. Review HA logs: journalctl -u pve-ha-lrm -u pve-ha-crm -n 20${nl}"
    details+="4. Check network connectivity between nodes${nl}"
  fi

  # The first line is the machine-readable verdict the caller splits off.
  if $status_ok; then
    printf '%s\n' "SUCCESSFUL"
  else
    printf '%s\n' "FAILED"
  fi
  printf '%s\n' "$details"
}
# Main flow.
#
# Template maintenance mode is handled first, so that recreating the templates
# neither runs the HA check nor sends a notification rendered with the old
# templates.
case "${1:-}" in
  --create-templates | --templates)
    if ! create_templates; then
      echo "Failed to recreate templates." >&2
      exit 1
    fi
    echo "Templates recreated successfully."
    echo "Run './ha-monitor.sh -v' to test with new templates."
    exit 0
    ;;
esac

# Run the check: the first output line is the verdict, the rest is the report.
RESULT=$(check_ha_status)
STATUS=$(head -n 1 <<<"$RESULT")
DETAILS=$(tail -n +2 <<<"$RESULT")

# Elapsed wall-clock time in whole seconds.
END_TIME=$(date +%s)
RUNTIME=$((END_TIME - START_TIME))

# Map the verdict to a notification severity.
if [[ "$STATUS" == "SUCCESSFUL" ]]; then
  SEVERITY="info"
else
  SEVERITY="error"
fi

# Send the notification through PVE::Notify using the "ha-status" templates.
# Values travel via the environment and the heredoc delimiter is quoted, so
# the Perl source is constant: quotes or backslashes inside DETAILS or the
# host names can neither break the program nor inject code into it.
HA_FQDN="$FQDN" \
HA_HOSTNAME="$HOSTNAME" \
HA_STATUS="$STATUS" \
HA_RUNTIME="$RUNTIME" \
HA_DETAILS="$DETAILS" \
HA_SEVERITY="$SEVERITY" \
perl -I/usr/share/perl5 << 'EOF'
use strict;
use warnings;
use PVE::Notify;

# Data made available to the templates.
my $template_data = {
    'hostname' => $ENV{HA_FQDN},
    'status'   => $ENV{HA_STATUS},
    'runtime'  => $ENV{HA_RUNTIME},
    'details'  => $ENV{HA_DETAILS},
};

# Metadata fields for notification matchers.
my $fields = {
    'hostname' => $ENV{HA_HOSTNAME},
    'type'     => 'ha-status',
    'status'   => $ENV{HA_STATUS},
};

# Uses the templates /etc/pve/notification-templates/default/ha-status-*
eval {
    PVE::Notify::notify($ENV{HA_SEVERITY}, 'ha-status', $template_data, $fields);
    print "Notification sent successfully\n";
};
if ($@) {
    print STDERR "Failed to send notification: $@\n";
    exit 1;
}
EOF
PERL_EXIT_CODE=$?

# Local log entry.
echo "$(date): HA status check completed - $STATUS, notification exit code: $PERL_EXIT_CODE" >> /var/log/pve-ha-monitor.log

# Optional console report for manual testing.
case "${1:-}" in
  --verbose | -v)
    echo "=== HA MONITOR REPORT ==="
    echo "Status: $STATUS"
    echo "Runtime: ${RUNTIME}s"
    echo "Severity: $SEVERITY"
    echo "Perl exit code: $PERL_EXIT_CODE"
    echo
    echo "Details:"
    printf '%s\n' "$DETAILS"
    echo
    echo "Using template: ha-status"
    echo "Template data: hostname=$FQDN, status=$STATUS, runtime=${RUNTIME}s"
    ;;
esac