Files
ROMFASTSQL/oracle/standby-server-scripts/weekly-dr-test-proxmox.sh
Marius 8da1208ca7 Oracle DR: Fix false FAILED notification - parse database status from log
- Replace complex SSH+PowerShell query with simple log file parsing
- rman_restore_from_zero.ps1 already verifies and outputs database status
- Parse 'OPEN_MODE: READ WRITE' and 'TABLES: <count>' from LOG_FILE
- Fixes issue where successful restore was reported as FAILED
- More reliable: avoids SSH escaping issues with Select-String -Quiet

Root cause: SSH+PowerShell+sqlplus+Select-String chain was too fragile and
returned empty/false even when database was successfully opened (42625 tables).

Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com>
2025-10-11 18:55:05 +03:00

646 lines
22 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
#
# Oracle DR Weekly Test with Proxmox PVE::Notify
# Automated DR test with notifications via Proxmox notification system
#
# Location: /opt/scripts/weekly-dr-test-proxmox.sh (on Proxmox host)
# Schedule: Add to cron for weekly execution (Saturdays)
#
# This script is SELF-SUFFICIENT:
# - Automatically creates notification templates if they don't exist
# - Uses Proxmox native notification system
# - No email configuration needed - uses existing Proxmox setup
#
# Installation:
# cp weekly-dr-test-proxmox.sh /opt/scripts/
# chmod +x /opt/scripts/weekly-dr-test-proxmox.sh
# /opt/scripts/weekly-dr-test-proxmox.sh --install # Creates templates
# crontab -e # Add: 0 6 * * 6 /opt/scripts/weekly-dr-test-proxmox.sh
#
# Author: Claude (based on ha-monitor.sh pattern)
# Version: 1.0
# Strict mode: stop on unhandled command failures, on use of unset variables,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# --- DR virtual machine: Proxmox guest ID and SSH endpoint ---
DR_VM_ID=109
DR_VM_IP=10.0.20.37
DR_VM_PORT=22122
DR_VM_USER=romfast

# --- Backup directory scanned for *.BKP files during pre-flight ---
BACKUP_PATH=/mnt/pve/oracle-backups/ROA/autobackup
# Minutes. NOTE(review): not referenced by any code visible in this file.
MAX_RESTORE_TIME_MIN=30

# --- Directory that receives the oracle-dr-test notification templates ---
TEMPLATE_DIR=/usr/share/pve-manager/templates/default

# --- Per-run log file; the name carries the start timestamp ---
LOG_DIR=/var/log/oracle-dr
LOG_FILE="${LOG_DIR}/dr_test_$(date '+%Y%m%d_%H%M%S').log"

# --- ANSI colour sequences, stored unexpanded and rendered by `echo -e` ---
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
# Make sure the log directory is present before anything appends to LOG_FILE.
[[ -d "${LOG_DIR}" ]] || mkdir -p "${LOG_DIR}"
# Write the three Handlebars templates used by the 'oracle-dr-test'
# notification (subject, plain-text body, HTML body) into TEMPLATE_DIR.
# Existing files with the same names are overwritten.
#
# Globals:  TEMPLATE_DIR, GREEN, NC (read)
# Outputs:  one progress line before and one after writing, on stdout
create_templates() {
  printf '%b\n' "${GREEN}Creating Oracle DR test notification templates...${NC}"

  # The target directory may not exist yet.
  mkdir -p "$TEMPLATE_DIR"

  local tpl_prefix="$TEMPLATE_DIR/oracle-dr-test"

  # Subject line
  cat <<'EOF' > "${tpl_prefix}-subject.txt.hbs"
Oracle DR Test {{test_result}} | {{date}}
EOF

  # Plain-text body
  cat <<'EOF' > "${tpl_prefix}-body.txt.hbs"
Oracle DR Test {{test_result}} | {{date}}
Severity: {{severity}}
SUMMARY
- Outcome: {{test_result}}
- Duration: {{total_duration}} min (restore {{restore_duration}} min)
- Backups used: {{backup_count}}
- Tables restored: {{tables_restored}}
COMPONENTS
- VM {{vm_id}} ({{vm_ip}}): {{vm_status}}
- NFS: {{nfs_status}}
- Database: {{database_status}}
- Cleanup: {{disk_freed}} GB freed
STEPS
{{#each test_steps}}
- {{#if this.passed}}✓{{else}}✗{{/if}} {{this.name}} ({{this.duration}}s){{#if this.status}} - {{this.status}}{{/if}}
{{/each}}
{{#if has_errors}}
ISSUES
{{#each errors}}
- {{this}}
{{/each}}
{{/if}}
{{#if has_warnings}}
WARNINGS
{{#each warnings}}
- {{this}}
{{/each}}
{{/if}}
RMAN RESTORE LOG (complete)
---
{{restore_log}}
---
BASH SCRIPT LOG (last 100 lines)
---
{{bash_log}}
---
Full log: {{log_file}}
Next test: Saturday 06:00
EOF

  # HTML body (compact, inline-styled table layout)
  cat <<'EOF' > "${tpl_prefix}-body.html.hbs"
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Oracle DR Test {{test_result}} | {{date}}</title>
</head>
<body style="margin:0;padding:16px;font-family:Arial,Helvetica,sans-serif;background:#ffffff;color:#2c3e50;">
<table style="width:100%;max-width:640px;margin:0 auto;border-collapse:collapse;">
<tr>
<td style="padding:0 0 12px 0;font-size:18px;font-weight:600;">
Oracle DR Test {{test_result}}
</td>
</tr>
<tr>
<td style="padding:0 0 8px 0;font-size:13px;color:#6c757d;">{{date}} · Severity: {{severity}}</td>
</tr>
<tr>
<td style="padding:12px;border:1px solid #e1e4e8;border-radius:4px;">
<table style="width:100%;border-collapse:collapse;font-size:14px;">
<tr><td style="padding:4px 0;">Outcome</td><td style="padding:4px 0;text-align:right;">{{test_result}}</td></tr>
<tr><td style="padding:4px 0;">Duration</td><td style="padding:4px 0;text-align:right;">{{total_duration}} min (restore {{restore_duration}} min)</td></tr>
<tr><td style="padding:4px 0;">Backups used</td><td style="padding:4px 0;text-align:right;">{{backup_count}}</td></tr>
<tr><td style="padding:4px 0;">Tables restored</td><td style="padding:4px 0;text-align:right;">{{tables_restored}}</td></tr>
</table>
</td>
</tr>
<tr>
<td style="padding:16px 0 0 0;">
<table style="width:100%;border-collapse:collapse;font-size:14px;border:1px solid #e1e4e8;border-radius:4px;background:#f9fafb;">
<tr><td style="padding:8px 12px;font-weight:600;">Components</td></tr>
<tr><td style="padding:6px 12px;border-top:1px solid #e1e4e8;">VM {{vm_id}} ({{vm_ip}}): {{vm_status}}</td></tr>
<tr><td style="padding:6px 12px;border-top:1px solid #e1e4e8;">NFS: {{nfs_status}}</td></tr>
<tr><td style="padding:6px 12px;border-top:1px solid #e1e4e8;">Database: {{database_status}}</td></tr>
<tr><td style="padding:6px 12px;border-top:1px solid #e1e4e8;">Cleanup: {{disk_freed}} GB freed</td></tr>
</table>
</td>
</tr>
<tr>
<td style="padding:16px 0 0 0;">
<table style="width:100%;border-collapse:collapse;font-size:14px;">
<tr><td style="padding:0 0 6px 0;font-weight:600;">Steps</td></tr>
{{#each test_steps}}
<tr>
<td style="padding:4px 0;border-bottom:1px solid #f1f1f1;">{{#if this.passed}}✓{{else}}✗{{/if}} {{this.name}} ({{this.duration}}s){{#if this.status}} {{this.status}}{{/if}}</td>
</tr>
{{/each}}
</table>
</td>
</tr>
{{#if has_errors}}
<tr>
<td style="padding:16px 0 0 0;">
<table style="width:100%;border-collapse:collapse;font-size:14px;background:#fff5f5;border:1px solid #f1b0b7;border-radius:4px;">
<tr><td style="padding:8px 12px;font-weight:600;color:#c82333;">Issues</td></tr>
{{#each errors}}
<tr><td style="padding:6px 12px;border-top:1px solid #f8d7da;">• {{this}}</td></tr>
{{/each}}
</table>
</td>
</tr>
{{/if}}
{{#if has_warnings}}
<tr>
<td style="padding:16px 0 0 0;">
<table style="width:100%;border-collapse:collapse;font-size:14px;background:#fff8e5;border:1px solid #ffe8a1;border-radius:4px;">
<tr><td style="padding:8px 12px;font-weight:600;color:#856404;">Warnings</td></tr>
{{#each warnings}}
<tr><td style="padding:6px 12px;border-top:1px solid #ffe8a1;">• {{this}}</td></tr>
{{/each}}
</table>
</td>
</tr>
{{/if}}
<tr>
<td style="padding:16px 0 0 0;">
<table style="width:100%;border-collapse:collapse;font-size:12px;border:1px solid #e1e4e8;border-radius:4px;background:#f9fafb;">
<tr><td style="padding:8px 12px;font-weight:600;font-size:13px;">RMAN Restore Log (complete)</td></tr>
<tr><td style="padding:8px 12px;font-family:monospace;white-space:pre-wrap;word-wrap:break-word;border-top:1px solid #e1e4e8;">{{restore_log}}</td></tr>
</table>
</td>
</tr>
<tr>
<td style="padding:16px 0 0 0;">
<table style="width:100%;border-collapse:collapse;font-size:12px;border:1px solid #e1e4e8;border-radius:4px;background:#f9fafb;">
<tr><td style="padding:8px 12px;font-weight:600;font-size:13px;">Bash Script Log (last 100 lines)</td></tr>
<tr><td style="padding:8px 12px;font-family:monospace;white-space:pre-wrap;word-wrap:break-word;border-top:1px solid #e1e4e8;">{{bash_log}}</td></tr>
</table>
</td>
</tr>
<tr>
<td style="padding:16px 0 0 0;font-size:12px;color:#6c757d;">
Full log: {{log_file}} · Next test: Saturday 06:00
</td>
</tr>
</table>
</body>
</html>
EOF

  printf '%b\n' "${GREEN}Templates created successfully in $TEMPLATE_DIR${NC}"
}
# Send one notification through PVE::Notify.
#
# A small Perl helper is written to a private temporary file and receives the
# JSON payload on stdin. It decodes the payload and calls
# PVE::Notify::notify() with the 'oracle-dr-test' template name, the decoded
# payload as template data, and a few matching fields (type, severity,
# test_result).
#
# Arguments:
#   $1 - severity. Accepted for interface compatibility only: the helper takes
#        the effective severity from the "severity" key of the payload
#        (defaulting to 'info' when the key is absent).
#   $2 - JSON object with the notification data.
# Outputs:  whatever the helper prints (success or error line) on stdout.
# Returns:  exit status of the Perl helper; 1 if no temp file could be created.
send_pve_notification() {
  local data="$2"
  local helper
  local rc=0

  # Use mktemp rather than a fixed name under /tmp: a predictable path can be
  # pre-created or symlinked by another local user before this script writes
  # to it.
  helper=$(mktemp "${TMPDIR:-/tmp}/oracle-dr-notify.XXXXXX") || {
    echo "Error: could not create temporary file for notification helper" >&2
    return 1
  }

  cat > "$helper" <<'PERL_SCRIPT'
#!/usr/bin/perl
use strict;
use warnings;
use PVE::Notify;
use JSON;
my $json_data = do { local $/; <STDIN> };
my $data = decode_json($json_data);
my $severity = $data->{severity} // 'info';
my $template_name = 'oracle-dr-test';
# Add fields for matching rules
my $fields = {
type => 'oracle-dr-test',
severity => $severity,
test_result => $data->{test_result},
};
# Send notification
eval {
PVE::Notify::notify(
$severity,
$template_name,
$data,
$fields
);
};
if ($@) {
print "Error sending notification: $@\n";
exit 1;
}
print "Notification sent successfully\n";
PERL_SCRIPT

  # Capture the status instead of letting `set -e` abort here, so the helper
  # file is removed on the failure path as well.
  printf '%s\n' "$data" | perl "$helper" || rc=$?

  rm -f -- "$helper"
  return "$rc"
}
# Logging functions
# log MESSAGE: print "[YYYY-MM-DD HH:MM:SS] MESSAGE" on stdout and append the
# same line to LOG_FILE.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG_FILE"
}
# log_error MESSAGE: print MESSAGE with a red "[ERROR]" tag on stdout and
# append the same line to LOG_FILE. Backslash escapes in the colour variables
# and in MESSAGE are expanded.
log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1" | tee -a "$LOG_FILE"
}
# log_warning MESSAGE: print MESSAGE with a yellow "[WARNING]" tag on stdout
# and append the same line to LOG_FILE. Backslash escapes in the colour
# variables and in MESSAGE are expanded.
log_warning() {
  printf '%b\n' "${YELLOW}[WARNING]${NC} $1" | tee -a "$LOG_FILE"
}
# log_success MESSAGE: print MESSAGE with a green "[SUCCESS]" tag on stdout
# and append the same line to LOG_FILE. Backslash escapes in the colour
# variables and in MESSAGE are expanded.
log_success() {
  printf '%b\n' "${GREEN}[SUCCESS]${NC} $1" | tee -a "$LOG_FILE"
}
# Test tracking: per-run accumulators shared by track_step and run_dr_test.
declare -a TEST_STEPS=()   # one JSON object per recorded step
declare -a ERRORS=()       # "<step name>: <status>" for every failed step
declare -a WARNINGS=()     # free-text warnings
# Epoch seconds at script start; used to compute the total duration.
TEST_START_TIME="$(date '+%s')"
# Record the outcome of one test step.
#
# Appends a JSON object {name, status, duration, passed} to TEST_STEPS, where
# duration is the number of seconds between $4 and now. When $2 is the literal
# string "false", "<name>: <status>" is also appended to ERRORS.
#
# Arguments:
#   $1 - step name
#   $2 - "true" or "false"
#   $3 - human-readable status text
#   $4 - step start time in epoch seconds
track_step() {
  local step_name="$1" step_passed="$2" step_status="$3" started_at="$4"
  local finished_at elapsed record

  finished_at=$(date +%s)
  elapsed=$(( finished_at - started_at ))

  record=$(
    jq -n \
      --arg name "$step_name" \
      --arg status "$step_status" \
      --arg duration "$elapsed" \
      --arg passed "$step_passed" \
      '{name:$name, status:$status, duration:($duration|tonumber), passed:($passed == "true")}'
  )
  TEST_STEPS+=("$record")

  # Only an explicit "false" counts as a failure worth listing.
  [[ "$step_passed" == "false" ]] || return 0
  ERRORS+=("$step_name: $step_status")
}
# ---------------------------------------------------------------------------
# Helpers private to run_dr_test
# ---------------------------------------------------------------------------

# Run a command on the DR VM over SSH, passing output and exit status through.
# BatchMode=yes makes ssh fail instead of prompting (the script is meant to
# run unattended); ConnectTimeout bounds connection setup only, not the
# remote command itself.
# Globals:   DR_VM_PORT, DR_VM_USER, DR_VM_IP (read)
# Arguments: the remote command line
_dr_ssh() {
  ssh -p "$DR_VM_PORT" -o BatchMode=yes -o ConnectTimeout=10 \
    "$DR_VM_USER@$DR_VM_IP" "$@"
}

# Print the restore log fetched from the DR VM, trying the two known
# locations in order. Prints nothing when neither yields content.
# Always returns 0 so callers under `set -e` are not aborted.
_dr_fetch_restore_log() {
  local remote_path content
  for remote_path in \
    'D:\oracle\logs\restore_from_zero.log' \
    'D:\oracle\temp\restore_from_zero.log'; do
    content=$(_dr_ssh "powershell -Command \"Get-Content '${remote_path}' -ErrorAction SilentlyContinue\"" 2>/dev/null || true)
    if [[ -n "$content" ]]; then
      printf '%s\n' "$content"
      return 0
    fi
  done
  return 0
}

# Main test workflow.
#
# Steps:
#   1. pre-flight: at least two *.BKP files directly inside BACKUP_PATH
#   2. start VM DR_VM_ID and wait (max 180s) until SSH + PowerShell respond
#   3. check that F:\ROA\autobackup is visible inside the guest
#   4. run rman_restore_from_zero.ps1 -TestMode, teeing its output to LOG_FILE
#   5. verify "OPEN_MODE: READ WRITE" and a non-zero "TABLES: <n>" in LOG_FILE
#   6. run cleanup_database.ps1
#   7. shut the VM down
# Afterwards a JSON payload describing the run is built and handed to
# send_pve_notification.
#
# Every VM/remote command is guarded so that its failure is recorded in the
# notification instead of terminating the script through `set -e` (which
# would leave the VM running and send nothing).
#
# Globals read:    DR_VM_ID DR_VM_IP DR_VM_PORT DR_VM_USER BACKUP_PATH
#                  LOG_FILE TEST_START_TIME TEST_STEPS ERRORS
# Globals written: WARNINGS (appended); TEST_STEPS and ERRORS via track_step
# Calls:           log, log_warning, track_step, send_pve_notification
run_dr_test() {
  local test_result="FAILED"
  local severity="error"
  local is_success=false
  local restore_duration=0
  local tables_restored=0
  local db_status="UNKNOWN"
  local nfs_status="Not checked"
  local vm_status_label="Not started"
  local cleanup_freed=0
  local backup_count=0
  local restore_log="Not collected"
  local step_start restore_start restore_end
  local vm_state=""

  log "=========================================="
  log "Oracle DR Weekly Test - Starting"
  log "=========================================="

  # Step 1: Pre-flight checks
  step_start=$(date +%s)
  log "STEP 1: Pre-flight checks"

  # `|| true`: when BACKUP_PATH is missing (e.g. storage not mounted) find
  # fails, and with pipefail + errexit that would end the script before any
  # notification is sent. wc still reports 0 in that case.
  backup_count=$(find "$BACKUP_PATH" -maxdepth 1 -type f -name '*.BKP' 2>/dev/null | wc -l) || true
  backup_count=$(( ${backup_count:-0} ))

  if (( backup_count < 2 )); then
    track_step "Pre-flight checks" false "Insufficient backups (found: $backup_count)" "$step_start"
    test_result="FAILED - No backups"
  else
    track_step "Pre-flight checks" true "Found $backup_count backups" "$step_start"

    # Step 2: Start VM
    step_start=$(date +%s)
    log "STEP 2: Starting DR VM"

    if qm start "$DR_VM_ID" 2>/dev/null; then
      vm_status_label="Running"

      # Poll until the guest answers over SSH and PowerShell starts (max 180s)
      local max_boot_wait=180
      local poll_interval=5
      local boot_elapsed=0
      local vm_ready=false

      log "Waiting for VM to become ready (SSH + PowerShell, max ${max_boot_wait}s)..."
      while (( boot_elapsed < max_boot_wait )); do
        # Check 1: Proxmox reports the VM as running.
        # Check 2: SSH connects and PowerShell runs (what later steps need).
        vm_state=$(qm status "$DR_VM_ID" 2>/dev/null || true)
        if [[ "$vm_state" == *running* ]] &&
          ssh -p "$DR_VM_PORT" -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o BatchMode=yes \
            "$DR_VM_USER@$DR_VM_IP" "powershell -Command 'Write-Output ready'" >/dev/null 2>&1; then
          log "VM ready after ${boot_elapsed}s (SSH and PowerShell responding)"
          vm_ready=true
          break
        fi

        sleep "$poll_interval"
        boot_elapsed=$(( boot_elapsed + poll_interval ))

        # Progress line every 30 seconds
        if (( boot_elapsed % 30 == 0 && boot_elapsed < max_boot_wait )); then
          log "Still waiting for VM... (${boot_elapsed}s/${max_boot_wait}s elapsed)"
        fi
      done

      if [[ "$vm_ready" == true ]]; then
        track_step "VM Startup" true "VM $DR_VM_ID started and ready (${boot_elapsed}s)" "$step_start"
      else
        # Keep going as before, but do not claim the VM is ready.
        log_warning "VM did not respond within ${max_boot_wait}s, continuing anyway (may cause subsequent failures)"
        WARNINGS+=("VM $DR_VM_ID did not answer over SSH within ${max_boot_wait}s")
        track_step "VM Startup" true "VM $DR_VM_ID started but not confirmed ready (${boot_elapsed}s)" "$step_start"
      fi

      # Step 3: Verify NFS mount
      step_start=$(date +%s)
      log "STEP 3: Verifying NFS mount"
      nfs_status="Not Mounted"

      # Test-Path reports its result as printed True/False, so the output has
      # to be inspected; the ssh exit status only says the command ran.
      local nfs_probe
      nfs_probe=$(
        _dr_ssh "powershell -Command 'Test-Path F:\\ROA\\autobackup'" 2>/dev/null \
          | tr -d '\r' \
          | awk 'NF { last = $1 } END { print last }'
      ) || nfs_probe=""

      if [[ "$nfs_probe" == [Tt]rue ]]; then
        nfs_status="Mounted"
        track_step "NFS Mount Check" true "F:\\ drive accessible" "$step_start"
      else
        track_step "NFS Mount Check" false "F:\\ drive not accessible" "$step_start"
        WARNINGS+=("NFS mount may need manual intervention")
      fi

      # Step 4: Run restore
      step_start=$(date +%s)
      restore_start=$step_start
      log "STEP 4: Running database restore"

      # The branch taken depends on ssh's status, which is visible through the
      # pipe because the script enables pipefail.
      if _dr_ssh "powershell -ExecutionPolicy Bypass -File D:\\oracle\\scripts\\rman_restore_from_zero.ps1 -TestMode" 2>&1 | tee -a "$LOG_FILE"; then
        restore_end=$(date +%s)
        restore_duration=$(( (restore_end - restore_start) / 60 ))
        track_step "Database Restore" true "Restored in $restore_duration minutes" "$step_start"

        # Step 5: Verify database
        step_start=$(date +%s)
        log "STEP 5: Verifying database"

        # The restore script prints the database status; its output was teed
        # into LOG_FILE in STEP 4, so it is parsed from there.
        if grep -q "OPEN_MODE: READ WRITE" "$LOG_FILE" 2>/dev/null; then
          db_status="READ WRITE"
        else
          db_status=""
        fi

        # Last "TABLES: <number>" in the STEP 4 output. POSIX sed is used so
        # the parse does not depend on grep being built with PCRE support.
        tables_restored=$(
          sed -n 's/.*TABLES:[[:space:]]*\([0-9][0-9]*\).*/\1/p' "$LOG_FILE" 2>/dev/null \
            | tail -n 1
        ) || true
        tables_restored=${tables_restored//[!0-9]/}
        # 10# forces base 10 so a leading zero is not read as octal.
        tables_restored=$(( 10#${tables_restored:-0} ))

        if [[ "$db_status" == "READ WRITE" ]] && (( tables_restored > 0 )); then
          track_step "Database Verification" true "Database OPEN, $tables_restored tables" "$step_start"
          test_result="PASSED"
          severity="info"
          is_success=true
        else
          track_step "Database Verification" false \
            "Database not OPEN or empty (open_mode: ${db_status:-not found}, tables: $tables_restored)" \
            "$step_start"
        fi

        # Collect the full restore log from the VM
        log "Collecting restore log from DR VM..."
        restore_log=$(_dr_fetch_restore_log)
        if [[ -z "$restore_log" ]]; then
          restore_log="Restore log not available (file may not exist or was not generated)"
        fi

        # Step 6: Cleanup (AFTER restore - stop service to release file locks)
        step_start=$(date +%s)
        log "STEP 6: Running cleanup"
        if _dr_ssh "powershell -ExecutionPolicy Bypass -File D:\\oracle\\scripts\\cleanup_database.ps1 /SILENT /AFTER" 2>/dev/null; then
          # Fixed estimate; the freed space is not measured.
          cleanup_freed=8
          track_step "Cleanup" true "Database cleaned, ~${cleanup_freed}GB freed" "$step_start"
        else
          WARNINGS+=("Cleanup script failed; DR VM disk may need manual cleanup")
          track_step "Cleanup" false "cleanup_database.ps1 returned an error" "$step_start"
        fi
      else
        # Collect the full restore log even when the restore failed
        log "Collecting restore log after failure..."
        restore_log=$(_dr_fetch_restore_log)

        # No log at all: fall back to the tail of the RMAN script(s)
        if [[ -z "$restore_log" ]]; then
          local rman_tail
          rman_tail=$(_dr_ssh "powershell -Command \"Get-Content 'D:\\oracle\\temp\\*.rman' -Tail 20 -ErrorAction SilentlyContinue\"" 2>/dev/null || true)
          if [[ -n "$rman_tail" ]]; then
            # $'...' yields a real newline between heading and content.
            restore_log=$'RMAN script content (last 20 lines):\n'"$rman_tail"
          else
            restore_log="No restore logs or RMAN scripts found"
          fi
        fi
        track_step "Database Restore" false "Restore failed" "$step_start"
      fi

      # Step 7: Shutdown VM
      step_start=$(date +%s)
      log "STEP 7: Shutting down VM"
      # Ask the guest to shut down, give it time, then stop it from the host.
      # Both calls are best-effort; the resulting state is checked below.
      _dr_ssh "shutdown /s /t 30" 2>/dev/null ||
        log_warning "Guest shutdown request failed, relying on qm stop"
      sleep 60
      qm stop "$DR_VM_ID" 2>/dev/null || true

      vm_state=$(qm status "$DR_VM_ID" 2>/dev/null || true)
      if [[ "$vm_state" == *running* ]]; then
        vm_status_label="Still running"
        WARNINGS+=("VM $DR_VM_ID is still running after the test; stop it manually")
        track_step "VM Shutdown" false "VM $DR_VM_ID still running after qm stop" "$step_start"
      else
        vm_status_label="Stopped"
        track_step "VM Shutdown" true "VM stopped" "$step_start"
      fi
    else
      track_step "VM Startup" false "Failed to start VM $DR_VM_ID" "$step_start"
      vm_status_label="Failed to start"
    fi
  fi

  # Total duration in whole minutes
  local test_end_time total_duration
  test_end_time=$(date +%s)
  total_duration=$(( (test_end_time - TEST_START_TIME) / 60 ))

  # Turn the accumulators into JSON arrays
  local steps_json='[]'
  local errors_json='[]'
  local warnings_json='[]'
  if (( ${#TEST_STEPS[@]} > 0 )); then
    steps_json=$(printf '%s\n' "${TEST_STEPS[@]}" | jq -s '.')
  fi
  if (( ${#ERRORS[@]} > 0 )); then
    errors_json=$(printf '%s\n' "${ERRORS[@]}" | jq -R . | jq -s .)
  fi
  if (( ${#WARNINGS[@]} > 0 )); then
    warnings_json=$(printf '%s\n' "${WARNINGS[@]}" | jq -R . | jq -s .)
  fi

  local has_errors=false
  local has_warnings=false
  if (( ${#ERRORS[@]} > 0 )); then has_errors=true; fi
  if (( ${#WARNINGS[@]} > 0 )); then has_warnings=true; fi

  # A passing run with warnings is reported one level above "info".
  if [[ "$is_success" == true && "$has_warnings" == true ]]; then
    severity="warning"
  fi

  # Last 100 lines of this script's own log
  local bash_log
  bash_log=$(tail -n 100 "$LOG_FILE" 2>/dev/null || echo "Bash log not available")

  # Build the payload with jq so every string is escaped correctly. The
  # arrays and the two logs travel over stdin (slurped into one array and
  # addressed by position) because the restore log can be larger than what a
  # single command-line argument may hold.
  local json_data
  json_data=$(
    {
      printf '%s\n' "$steps_json" "$errors_json" "$warnings_json"
      printf '%s\n' "$restore_log" | jq -Rs .
      printf '%s\n' "$bash_log" | jq -Rs .
    } | jq -s \
      --arg severity "$severity" \
      --arg test_result "$test_result" \
      --arg date "$(date '+%Y-%m-%d %H:%M:%S')" \
      --argjson total_duration "$total_duration" \
      --argjson is_success "$is_success" \
      --argjson has_errors "$has_errors" \
      --argjson has_warnings "$has_warnings" \
      --argjson backup_count "$backup_count" \
      --argjson restore_duration "$restore_duration" \
      --argjson tables_restored "$tables_restored" \
      --arg database_status "${db_status:-UNKNOWN}" \
      --argjson disk_freed "$cleanup_freed" \
      --arg vm_id "$DR_VM_ID" \
      --arg vm_ip "$DR_VM_IP" \
      --arg vm_status "$vm_status_label" \
      --arg nfs_status "${nfs_status:-Unknown}" \
      --arg log_file "$LOG_FILE" \
      '{
        "severity": $severity,
        "test_result": $test_result,
        "date": $date,
        "total_duration": $total_duration,
        "is_success": $is_success,
        "has_errors": $has_errors,
        "has_warnings": $has_warnings,
        "test_steps": .[0],
        "errors": .[1],
        "warnings": .[2],
        "backup_count": $backup_count,
        "restore_duration": $restore_duration,
        "tables_restored": $tables_restored,
        "database_status": $database_status,
        "disk_freed": $disk_freed,
        "vm_id": $vm_id,
        "vm_ip": $vm_ip,
        "vm_status": $vm_status,
        "nfs_status": $nfs_status,
        "log_file": $log_file,
        "restore_log": .[3],
        "bash_log": .[4]
      }'
  )

  # Send notification
  log "Sending notification..."
  send_pve_notification "$severity" "$json_data"

  # Final summary
  log "=========================================="
  log "Oracle DR Test Complete: $test_result"
  log "Duration: $total_duration minutes"
  log "Log: $LOG_FILE"
  log "=========================================="
}
# Main execution: dispatch on the first command-line argument.
#   --install  write the notification templates and print follow-up steps
#   --help     print usage
#   otherwise  create the templates if the subject template is missing,
#              then run the DR test
main() {
  local mode="${1:-}"

  if [[ "$mode" == "--install" ]]; then
    create_templates
    printf '\n'
    printf '%b\n' "${GREEN}Installation complete!${NC}"
    printf '%s\n' \
      "Next steps:" \
      "1. Test the script: /opt/scripts/weekly-dr-test-proxmox.sh" \
      "2. Add to cron: crontab -e" \
      " Add line: 0 6 * * 6 /opt/scripts/weekly-dr-test-proxmox.sh" \
      "3. Configure notifications in Proxmox GUI if needed:" \
      " Datacenter > Notifications > Add matching rules for 'oracle-dr-test'"
    return
  fi

  if [[ "$mode" == "--help" ]]; then
    printf '%s\n' \
      "Oracle DR Weekly Test for Proxmox" \
      "Usage:" \
      " $0 - Run DR test" \
      " $0 --install - Create notification templates" \
      " $0 --help - Show this help"
    return
  fi

  # Any other argument (or none): make sure templates exist, then run the test.
  if [[ ! -f "$TEMPLATE_DIR/oracle-dr-test-subject.txt.hbs" ]]; then
    printf '%b\n' "${YELLOW}Templates not found, creating...${NC}"
    create_templates
    printf '\n'
  fi
  run_dr_test
}
# Check dependencies: jq is used to build the JSON handed to the notifier,
# so stop with an install hint when it is not on PATH.
command -v jq >/dev/null 2>&1 || {
  printf '%b\n' "${RED}Error: jq is not installed${NC}"
  printf '%s\n' "Install with: apt-get install jq"
  exit 1
}

# Hand the command-line arguments to the dispatcher.
main "$@"