Fixed two critical issues with HA monitoring:
1. False-positive quorum errors: corosync-quorumtool was not in cron's PATH.
2. Unwanted cron emails caused by PVE::Notify INFO messages written to STDERR.

Changes:
- Set a proper PATH that includes /usr/sbin so corosync-quorumtool is found.
- Split the notification code: verbose mode shows all output, non-verbose mode redirects STDERR to /dev/null.
- Prevents cron from sending duplicate notification emails.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
380 lines
12 KiB
Bash
380 lines
12 KiB
Bash
#!/bin/bash
#
# HA monitor using PVE::Notify - variant without a qdevice.
# Uses the native Proxmox notification system with custom templates.
#
# TEMPLATE SYSTEM
# ===============
# This script formats its e-mails with Handlebars templates, the same
# mechanism the Proxmox backup notifications use. The templates live in:
#
#   /etc/pve/notification-templates/default/
#   ├── ha-status-subject.txt.hbs   (e-mail subject)
#   ├── ha-status-body.txt.hbs      (plain-text e-mail body)
#   └── ha-status-body.html.hbs     (optional, HTML e-mail body)
#
# Variables available inside the templates:
#   - {{ hostname }} : FQDN of the server
#   - {{ status }}   : "SUCCESSFUL" or "FAILED"
#   - {{ runtime }}  : duration of the check in seconds
#   - {{ details }}  : detail text produced by the HA check
#
# For other scripts that want to use the Proxmox notification system:
#   1. Create the templates in /etc/pve/notification-templates/default/
#   2. Call PVE::Notify::notify($severity, $template_name, $template_data, $fields)
#   3. $template_name must match the base name of the template files
#
# PREREQUISITE
# ============
# The templates are NOT written during a normal run. Create them once (and
# again after changing them in this script) with:
#
#   /opt/scripts/ha-monitor.sh --create-templates

# Set proper PATH for cron execution (includes /usr/sbin for corosync-quorumtool)
PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

# Short host name; used as the "hostname" metadata field of the notification.
HOSTNAME=$(hostname)
# Fully qualified name; exposed to the templates as {{ hostname }}.
FQDN=$(hostname -f)
# NOTE(review): DATE is not referenced anywhere in the visible script; kept in
# case something outside this view relies on it.
DATE=$(date '+%Y-%m-%d %H:%M:%S')
# Epoch seconds at start, used later to compute the check runtime.
START_TIME=$(date +%s)
|
|
|
|
# Handle the informational option first: print the usage text and stop
# before any cluster check is performed.
case "$1" in
    -h|--help)
        cat << 'HELP'
HA Monitor Script - Proxmox High Availability Monitoring

USAGE:
/opt/scripts/ha-monitor.sh [OPTION]

OPTIONS:
(no option) Run HA check and send notification via Proxmox notification system
-v, --verbose Run HA check with detailed console output
--create-templates Recreate notification templates in /etc/pve/notification-templates/default/
-h, --help Display this help message

DESCRIPTION:
This script monitors the Proxmox HA cluster status and sends notifications
using the native Proxmox notification system (PVE::Notify).

It checks:
- HA Services (pve-ha-lrm, pve-ha-crm)
- Cluster Quorum status
- Number of online cluster nodes

NOTIFICATION TEMPLATES:
Templates are stored in: /etc/pve/notification-templates/default/
- ha-status-subject.txt.hbs (email subject)
- ha-status-body.txt.hbs (email body text)
- ha-status-body.html.hbs (email body HTML)

LOG FILE:
/var/log/pve-ha-monitor.log

EXAMPLES:
# Run normal check (silent, sends notification)
/opt/scripts/ha-monitor.sh

# Run with verbose output
/opt/scripts/ha-monitor.sh -v

# Recreate email templates
/opt/scripts/ha-monitor.sh --create-templates

CRON SETUP:
To run every 5 minutes:
*/5 * * * * /opt/scripts/ha-monitor.sh

HELP
        exit 0
        ;;
esac
|
|
|
|
if [ "$1" == "--create-templates" ] || [ "$1" == "--templates" ]; then
    # Writes the three Handlebars notification templates (subject, text body,
    # HTML body) for the "ha-status" notification into the Proxmox template
    # directory, overwriting any existing files.
    #
    # Returns 0 when everything was written, 1 as soon as the directory
    # cannot be created or a template file cannot be written, so the caller
    # never reports success for a partial or failed write.
    create_templates() {
        local template_dir="/etc/pve/notification-templates/default"

        # Create the directory if it does not exist yet.
        if ! mkdir -p "$template_dir"; then
            echo "ERROR: cannot create $template_dir" >&2
            return 1
        fi

        echo "Creating notification templates in $template_dir..."

        # Subject template: one line, chosen by the status value.
        cat > "$template_dir/ha-status-subject.txt.hbs" << 'EOF' || return 1
{{#if (eq status "SUCCESSFUL")}}✅ HA CLUSTER OK - {{ hostname }}{{else}}🚨 HA CLUSTER ISSUES - {{ hostname }}{{/if}}
EOF

        # Plain-text body template.
        cat > "$template_dir/ha-status-body.txt.hbs" << 'EOF' || return 1
{{#if (eq status "SUCCESSFUL")}}✅ HIGH AVAILABILITY STATUS: ALL SYSTEMS OK{{else}}🚨 HIGH AVAILABILITY CLUSTER HAS ISSUES{{/if}}

Host: {{ hostname }}
Check duration: {{ runtime }}s

CLUSTER STATUS:
{{ details }}

{{#if (eq status "FAILED")}}
=== IMMEDIATE ACTIONS REQUIRED ===

1. SSH to cluster: ssh root@{{ hostname }}
2. Check overall status: pvecm status
3. Review HA logs: journalctl -u pve-ha-lrm -u pve-ha-crm -n 20
4. Check network connectivity between nodes
5. Verify all cluster nodes are online

{{else}}
All HA components are functioning normally.
- Cluster has proper quorum
- Automatic VM migration is available
- System is fully redundant
{{/if}}

=== MANUAL SCRIPT EXECUTION ===

To run this HA status check manually:

Basic check:
/opt/scripts/ha-monitor.sh

Verbose output (shows details on console):
/opt/scripts/ha-monitor.sh -v

Recreate email templates:
/opt/scripts/ha-monitor.sh --create-templates

Script location: /opt/scripts/ha-monitor.sh
Log file: /var/log/pve-ha-monitor.log

Total check time: {{ runtime }}s
EOF

        # HTML body template with a larger, consistent font.
        cat > "$template_dir/ha-status-body.html.hbs" << 'EOF' || return 1
<div style="font-family: Arial, sans-serif; font-size: 16px; line-height: 1.5; max-width: 800px;">

{{#if (eq status "SUCCESSFUL")}}
<h2 style="font-size: 22px; color: green; margin-bottom: 15px;">✅ HIGH AVAILABILITY STATUS: ALL SYSTEMS OK</h2>
{{else}}
<h2 style="font-size: 22px; color: red; margin-bottom: 15px;">🚨 HIGH AVAILABILITY CLUSTER HAS ISSUES</h2>
{{/if}}

<p style="font-size: 16px; margin-bottom: 15px;"><strong>Host:</strong> {{ hostname }}<br>
<strong>Check duration:</strong> {{ runtime }}s</p>

<h3 style="font-size: 18px; margin-top: 20px; margin-bottom: 10px;">CLUSTER STATUS</h3>
<pre style="font-size: 15px; background: #f8f9fa; padding: 12px; border: 1px solid #ddd; border-radius: 4px; margin-bottom: 20px;">{{ details }}</pre>

{{#if (eq status "FAILED")}}
<h3 style="font-size: 18px; margin-top: 20px; margin-bottom: 10px;">IMMEDIATE ACTIONS REQUIRED</h3>

<ol style="font-size: 16px; margin-bottom: 15px;">
<li>SSH to cluster: <code>ssh root@{{ hostname }}</code></li>
<li>Check overall status: <code>pvecm status</code></li>
<li>Review HA logs: <code>journalctl -u pve-ha-lrm -u pve-ha-crm -n 20</code></li>
<li>Check network connectivity between nodes</li>
<li>Verify all cluster nodes are online</li>
</ol>

<p style="font-size: 16px; background: #f8d7da; padding: 12px; border-radius: 4px; margin-top: 15px;"><strong>Warning:</strong> Issues detected in the cluster. Immediate attention required to ensure high availability.</p>

{{else}}
<p style="font-size: 16px; background: #d4edda; padding: 12px; border-radius: 4px; margin-top: 15px;"><strong>All HA components are functioning normally:</strong></p>
<ul style="font-size: 16px; margin-top: 10px;">
<li>Cluster has proper quorum</li>
<li>Automatic VM migration is available</li>
<li>System is fully redundant</li>
</ul>
{{/if}}

<h3 style="font-size: 18px; margin-top: 30px; margin-bottom: 10px;">MANUAL SCRIPT EXECUTION</h3>
<p style="font-size: 16px; margin-bottom: 10px;">To run this HA status check manually:</p>

<p style="font-size: 16px; margin-bottom: 8px;"><strong>Basic check:</strong></p>
<div style="font-size: 15px; background: #f8f9fa; padding: 8px; border: 1px solid #ddd; border-radius: 4px; margin-bottom: 10px;">
/opt/scripts/ha-monitor.sh
</div>

<p style="font-size: 16px; margin-bottom: 8px;"><strong>Verbose output:</strong></p>
<div style="font-size: 15px; background: #f8f9fa; padding: 8px; border: 1px solid #ddd; border-radius: 4px; margin-bottom: 10px;">
/opt/scripts/ha-monitor.sh -v
</div>

<p style="font-size: 16px; margin-bottom: 8px;"><strong>Recreate templates:</strong></p>
<div style="font-size: 15px; background: #f8f9fa; padding: 8px; border: 1px solid #ddd; border-radius: 4px; margin-bottom: 15px;">
/opt/scripts/ha-monitor.sh --create-templates
</div>

<p style="font-size: 16px; margin-bottom: 20px;"><strong>Script location:</strong> /opt/scripts/ha-monitor.sh<br>
<strong>Log file:</strong> /var/log/pve-ha-monitor.log</p>

<p style="font-size: 16px; margin-top: 20px;"><strong>Total check time:</strong> {{ runtime }}s</p>

</div>
EOF

        echo "Templates created successfully."
    }

    # Stop with a non-zero exit code instead of announcing success when the
    # template directory is not writable.
    if ! create_templates; then
        echo "ERROR: failed to write notification templates." >&2
        exit 1
    fi
    echo "Templates recreated successfully."
    echo "Run './ha-monitor.sh -v' to test with new templates."
    exit 0
fi
|
|
|
|
# Checks the HA services, the quorum state and the cluster membership.
#
# Takes no arguments. Writes to stdout:
#   line 1    "SUCCESSFUL" when every check passed, otherwise "FAILED"
#   line 2..  one human-readable finding per check, plus a recovery section
#             when anything failed
# The verdict is carried by the first output line, not by the return code.
check_ha_status() {
    local status_ok=true
    local details=""
    # Declared local so the helper does not leak working variables when it
    # is called outside a command substitution.
    local quorum_info pvecm_info expected_votes total_votes nodes_online

    # HA services: both the local and the cluster resource manager must run.
    if systemctl is-active --quiet pve-ha-lrm && systemctl is-active --quiet pve-ha-crm; then
        details+="HA Services: OK\n"
    else
        details+="HA Services: ERROR - Services not running\n"
        details+=" Recovery: systemctl restart pve-ha-lrm pve-ha-crm\n"
        status_ok=false
    fi

    # Quorum and membership data; stderr is discarded on purpose.
    quorum_info=$(corosync-quorumtool -s 2>/dev/null)
    pvecm_info=$(pvecm status 2>/dev/null)

    if [ -z "$quorum_info" ]; then
        # No output at all means the tool could not be run or could not reach
        # corosync. Report that explicitly instead of claiming the cluster
        # lost quorum.
        details+="Quorum: ERROR - Unable to query quorum status (corosync-quorumtool produced no output)\n"
        details+=" Check: pvecm status && corosync-quorumtool -s\n"
        status_ok=false
    elif grep -q "Quorate:.*Yes" <<< "$quorum_info"; then
        expected_votes=$(awk '/Expected votes:/ {print $3; exit}' <<< "$quorum_info")
        total_votes=$(awk '/Total votes:/ {print $3; exit}' <<< "$quorum_info")

        if [ "$total_votes" = "$expected_votes" ]; then
            details+="Quorum: OK ($total_votes/$expected_votes votes)\n"
        else
            # Quorate, but at least one expected vote is missing.
            details+="Quorum: WARNING ($total_votes/$expected_votes votes)\n"
            details+=" Check: pvecm status\n"
            status_ok=false
        fi
    else
        details+="Quorum: ERROR - Cluster not quorate\n"
        details+=" Check: pvecm status && corosync-quorumtool -s\n"
        status_ok=false
    fi

    # Nodes: count the lines of "pvecm status" that start with a hex node id
    # (the rows of the membership table).
    nodes_online=$(grep -cE "^[[:space:]]*0x[0-9a-fA-F]+" <<< "$pvecm_info")

    if [ "$nodes_online" -ge 2 ]; then
        details+="Cluster Nodes: OK ($nodes_online nodes online)\n"
    else
        details+="Cluster Nodes: ERROR - Only $nodes_online nodes online\n"
        details+=" Check: pvecm nodes && ping [offline-node-ip]\n"
        status_ok=false
    fi

    # Add the recovery section only when something failed.
    if ! $status_ok; then
        details+="\n=== IMMEDIATE ACTIONS REQUIRED ===\n"
        details+="1. SSH to cluster: ssh root@$(hostname -f)\n"
        details+="2. Check overall status: pvecm status\n"
        details+="3. Review HA logs: journalctl -u pve-ha-lrm -u pve-ha-crm -n 20\n"
        details+="4. Check network connectivity between nodes\n"
    fi

    if $status_ok; then
        echo "SUCCESSFUL"
    else
        echo "FAILED"
    fi
    # %b expands the literal "\n" sequences accumulated in $details.
    printf '%b\n' "$details"
}
|
|
|
|
# Run the check once, then split its output: the first line is the verdict,
# everything after it is the detail text.
RESULT=$(check_ha_status)
{
    IFS= read -r STATUS
    DETAILS=$(cat)
} <<< "$RESULT"

# Elapsed wall-clock time of the check, in whole seconds.
END_TIME=$(date +%s)
RUNTIME=$(( END_TIME - START_TIME ))

# Map the verdict onto the notification severity.
case "$STATUS" in
    SUCCESSFUL) SEVERITY="info" ;;
    *)          SEVERITY="error" ;;
esac
|
|
|
|
# Sends the "ha-status" notification through PVE::Notify.
#
# Argument:
#   $1  "1" to print a confirmation line on success, "0" to stay quiet
# Reads the globals FQDN, HOSTNAME, STATUS, RUNTIME, DETAILS and SEVERITY.
# Returns the exit code of the Perl helper (1 when notify() died).
#
# The values reach Perl through environment variables and the heredoc is
# quoted, so nothing from the shell is spliced into Perl source. A quote or
# backslash in any value can no longer break or alter the Perl program.
send_notification() {
    HA_NOTIFY_VERBOSE="$1" \
    HA_NOTIFY_FQDN="$FQDN" \
    HA_NOTIFY_HOSTNAME="$HOSTNAME" \
    HA_NOTIFY_STATUS="$STATUS" \
    HA_NOTIFY_RUNTIME="$RUNTIME" \
    HA_NOTIFY_DETAILS="$DETAILS" \
    HA_NOTIFY_SEVERITY="$SEVERITY" \
    perl -I/usr/share/perl5 << 'EOF'
use strict;
use warnings;
use PVE::Notify;

# Values rendered by the Handlebars templates.
my $template_data = {
    'hostname' => $ENV{HA_NOTIFY_FQDN},
    'status'   => $ENV{HA_NOTIFY_STATUS},
    'runtime'  => $ENV{HA_NOTIFY_RUNTIME},
    'details'  => $ENV{HA_NOTIFY_DETAILS},
};

# Metadata fields attached to the notification.
my $fields = {
    'hostname' => $ENV{HA_NOTIFY_HOSTNAME},
    'type'     => 'ha-status',
    'status'   => $ENV{HA_NOTIFY_STATUS},
};

eval {
    PVE::Notify::notify($ENV{HA_NOTIFY_SEVERITY}, 'ha-status', $template_data, $fields);
    print "Notification sent successfully\n" if $ENV{HA_NOTIFY_VERBOSE};
};
if ($@) {
    print STDERR "Failed to send notification: $@\n";
    exit 1;
}
EOF
}

if [ "$1" == "--verbose" ] || [ "$1" == "-v" ]; then
    # Verbose mode: show every message.
    send_notification 1
    PERL_EXIT_CODE=$?
else
    # Non-verbose mode: discard STDERR so the INFO messages do not make cron
    # send mail. The exit code is still kept for the log.
    send_notification 0 2>/dev/null
    PERL_EXIT_CODE=$?
fi
|
|
|
|
# Local log: append one line per run.
printf '%s: HA status check completed - %s, notification exit code: %s\n' \
    "$(date)" "$STATUS" "$PERL_EXIT_CODE" >> /var/log/pve-ha-monitor.log

# Console report for manual testing, printed only in verbose mode.
case "$1" in
    -v|--verbose)
        cat << EOF
=== HA MONITOR REPORT ===
Status: $STATUS
Runtime: ${RUNTIME}s
Severity: $SEVERITY
Perl exit code: $PERL_EXIT_CODE

Details:
$DETAILS

Using template: ha-status
Template data: hostname=$FQDN, status=$STATUS, runtime=${RUNTIME}s
EOF
        ;;
esac