diff --git a/proxmox/vm109-windows-dr/scripts/failback-dr-to-pveelite.sh b/proxmox/vm109-windows-dr/scripts/failback-dr-to-pveelite.sh
new file mode 100644
index 0000000..0e59361
--- /dev/null
+++ b/proxmox/vm109-windows-dr/scripts/failback-dr-to-pveelite.sh
@@ -0,0 +1,142 @@
+#!/bin/bash
+#
+# Failback Oracle DR storage from pvemini back to pveelite.
+#
+# When to run: pveelite has been brought back online and you want to
+# return to the normal topology (pveelite = active, pvemini = readonly
+# replica). Inverse of failover-dr-to-pvemini.sh.
+#
+# Sequence:
+#   1. Confirm pveelite reachable.
+#   2. Snapshot current writable state on pvemini.
+#   3. Send the snapshot to pveelite (overwrites stale state there).
+#   4. Stop NFS on pvemini, remove its export entry.
+#   5. Set pvemini readonly=on (back to replica role).
+#   6. On pveelite: zfs recv finalisation, set readonly=off, restart NFS.
+#   7. Patch transfer_backups.ps1 on Oracle Windows back to pveelite IP.
+#   8. Re-arm replication cron (which already lives on pveelite).
+#
+# This script orchestrates from pvemini so it can SSH outward to pveelite.
+
+set -euo pipefail
+export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
+
+PVEELITE_IP="10.0.20.202"
+PVEMINI_IP="10.0.20.201"
+DATASET="rpool/oracle-backups"
+MOUNTPOINT="/mnt/pve/oracle-backups"
+NFS_CLIENT="10.0.20.37"    # VM 109 NFS client
+NFS_OPTS="rw,sync,no_subtree_check,no_root_squash"
+PRIMARY_HOST="10.0.20.36"  # Oracle Windows production host
+PRIMARY_USER="dr-failover"
+PRIMARY_SSH_PORT="22122"
+TRANSFER_SCRIPT_WIN_PATH='D:\rman_backup\transfer_backups.ps1'
+# NOTE(review): shared with failover-dr-to-pvemini.sh so both directions land
+# in one DR timeline — confirm that is intentional.
+LOG="/var/log/oracle-dr/failover.log"
+# Deliberately unquoted at call sites so the options word-split into argv.
+SSH_OPTS_PVE="-o UserKnownHostsFile=/etc/pve/priv/known_hosts -o StrictHostKeyChecking=no -o BatchMode=yes"
+
+mkdir -p "$(dirname "$LOG")"
+log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "$LOG"; }
+
+# Refuse to run anywhere but the DR node.
+if [ "$(hostname)" != "pvemini" ]; then
+    log "FATAL: this script must run on pvemini (current: $(hostname))"
+    exit 1
+fi
+
+log "============================================================"
+log "Oracle DR failback: pvemini -> pveelite"
+log "============================================================"
+
+# Step 1: verify pveelite reachable — ICMP first, then the SSH control path
+# actually used by the later steps.
+log "Step 1: verifying pveelite reachable"
+if ! ping -c 3 -W 2 "$PVEELITE_IP" >/dev/null 2>&1; then
+    log "ABORT: pveelite is still unreachable."
+    exit 2
+fi
+if ! ssh $SSH_OPTS_PVE "root@$PVEELITE_IP" "true" 2>/dev/null; then
+    log "ABORT: pveelite SSH not responding."
+    exit 2
+fi
+log "  pveelite reachable."
+
+# Step 2: take a final snapshot on pvemini before handing back
+SNAP="${DATASET}@failback_$(date +%Y%m%d_%H%M%S)"
+log "Step 2: snapshot $SNAP"
+zfs snapshot "$SNAP"
+
+# Step 3: send to pveelite
+log "Step 3: sending snapshot to pveelite (incremental from latest common)"
+# Newest snapshot name present on BOTH nodes. The remote command string is
+# double-quoted locally, so $DATASET expands here before ssh runs it — the
+# '|' sed delimiter keeps the '/' in the dataset name safe.
+COMMON_BASE=$(comm -12 \
+    <(zfs list -H -t snapshot -o name "$DATASET" | sed "s|^$DATASET@||" | sort) \
+    <(ssh $SSH_OPTS_PVE "root@$PVEELITE_IP" "zfs list -H -t snapshot -o name $DATASET 2>/dev/null | sed 's|^$DATASET@||' | sort") \
+    | tail -1)
+
+if [ -z "$COMMON_BASE" ]; then
+    log "  no common snapshot — refusing to do full send (would destroy pveelite state)."
+    log "  Manual recovery required. Inspect: zfs list -t snapshot $DATASET on both nodes."
+    exit 3
+fi
+log "  common base: $DATASET@$COMMON_BASE"
+log "  sending ${DATASET}@${COMMON_BASE} -> $SNAP to pveelite"
+# pipefail makes a send/recv failure abort the script even though tee is last.
+zfs send -i "${DATASET}@${COMMON_BASE}" "$SNAP" \
+    | ssh $SSH_OPTS_PVE "root@$PVEELITE_IP" "zfs recv -F $DATASET" 2>&1 | tee -a "$LOG"
+
+# Step 4: stop NFS on pvemini, remove export
+log "Step 4: stopping NFS on pvemini"
+EXPORT_LINE="$MOUNTPOINT $NFS_CLIENT($NFS_OPTS)"
+if grep -qF "$EXPORT_LINE" /etc/exports; then
+    # NOTE(review): EXPORT_LINE is used as a BRE here; the '.'s in the IP are
+    # wildcards, but the grep -qF guard keeps this from misfiring in practice.
+    sed -i "\#$EXPORT_LINE#d" /etc/exports
+    log "  export removed from /etc/exports"
+fi
+exportfs -ra
+# Only stop NFS server if no other exports remain
+if [ -z "$(exportfs -v 2>/dev/null)" ]; then
+    systemctl stop nfs-server
+    log "  nfs-server stopped (no other exports)"
+fi
+
+# Step 5: pvemini back to readonly replica
+log "Step 5: setting pvemini dataset readonly=on"
+zfs set readonly=on "$DATASET"
+
+# Step 6: pveelite take over as primary (its /etc/exports is assumed intact)
+log "Step 6: activating pveelite as primary"
+ssh $SSH_OPTS_PVE "root@$PVEELITE_IP" "
+    set -e
+    zfs set readonly=off $DATASET
+    systemctl is-enabled --quiet nfs-server || systemctl enable nfs-server
+    systemctl is-active --quiet nfs-server || systemctl start nfs-server
+    exportfs -ra
+    exportfs -v
+" 2>&1 | tee -a "$LOG"
+
+# Step 7: patch primary Oracle script back (literal Replace via PS EncodedCommand)
+log "Step 7: patching $TRANSFER_SCRIPT_WIN_PATH back to $PVEELITE_IP"
+PS_SCRIPT="\$path = '$TRANSFER_SCRIPT_WIN_PATH'
+\$old = '\"$PVEMINI_IP\"'
+\$new = '\"$PVEELITE_IP\"'
+\$content = Get-Content \$path -Raw
+if (\$content.Contains(\$old)) {
+    Set-Content -Path \$path -Value \$content.Replace(\$old, \$new) -NoNewline
+    Write-Output 'PATCHED_BACK'
+} elseif (\$content.Contains(\$new)) {
+    Write-Output 'ALREADY_AT_PVEELITE'
+} else {
+    Write-Output 'UNKNOWN_DRHost_VALUE'
+}"
+PS_B64=$(printf '%s' "$PS_SCRIPT" | iconv -t UTF-16LE | base64 -w0)
+# FIX: filter was '^#< CLIXML|$' — the bare '$' alternative matches every
+# line, so grep -v discarded ALL output and PATCH_RESULT was always empty
+# (success reported as the WARNING below). '^$' drops only blank lines.
+# FIX: '|| true' — under set -e/pipefail a failed ssh (or grep with nothing
+# left to select) would abort the script here, making the WARNING branch dead.
+PATCH_RESULT=$(ssh -p "$PRIMARY_SSH_PORT" -o ConnectTimeout=10 -o BatchMode=yes \
+    "$PRIMARY_USER@$PRIMARY_HOST" \
+    "powershell -NoProfile -EncodedCommand $PS_B64" 2>&1 \
+    | grep -vE '^#< CLIXML|^$' | tr -d '\r' | head -1) || true
+if [ -n "$PATCH_RESULT" ]; then
+    log "  result: $PATCH_RESULT"
+else
+    log "  WARNING: SSH to primary failed — edit \$DRHost = \"$PVEELITE_IP\" manually"
+fi
+
+# Step 8: replication cron on pveelite is unchanged, will resume on schedule
+log "Step 8: replication cron on pveelite resumes automatically (*/15)"
+
+log "============================================================"
+log "Failback complete. pveelite is again the active NFS source."
+log "============================================================"
diff --git a/proxmox/vm109-windows-dr/scripts/failover-dr-to-pvemini.sh b/proxmox/vm109-windows-dr/scripts/failover-dr-to-pvemini.sh
new file mode 100644
index 0000000..8e06b2a
--- /dev/null
+++ b/proxmox/vm109-windows-dr/scripts/failover-dr-to-pvemini.sh
@@ -0,0 +1,132 @@
+#!/bin/bash
+#
+# Failover the Oracle DR storage from pveelite to pvemini.
+#
+# When to run: pveelite is dead long enough that the user has chosen to
+# take over backup ingestion on pvemini rather than wait. The
+# pveelite-down email alert points the operator at this script.
+#
+# What it does:
+#   1. Confirms pveelite is actually unreachable (refuses to split-brain).
+#   2.
+#      Flips rpool/oracle-backups on pvemini from readonly replica to
+#      writable primary.
+#   3. Configures and starts the NFS export on pvemini so VM 109 can
+#      still mount /mnt/pve/oracle-backups when it boots there.
+#   4. Patches transfer_backups.ps1 on the Oracle Windows production
+#      host (10.0.20.36) to ship to pvemini's IP instead of pveelite's.
+#   5. Disables the original ZFS replication cron (which would now fail
+#      since the source pveelite is down).
+#   6. Prints next steps for the operator.
+#
+# Idempotent: rerunning is safe — each step checks before acting.
+#
+# Reverse: /opt/scripts/failback-dr-to-pveelite.sh once pveelite is back.
+
+set -euo pipefail
+export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
+
+PVEELITE_IP="10.0.20.202"
+PVEMINI_IP="10.0.20.201"
+DATASET="rpool/oracle-backups"
+MOUNTPOINT="/mnt/pve/oracle-backups"
+NFS_CLIENT="10.0.20.37"    # VM 109 NFS client
+NFS_OPTS="rw,sync,no_subtree_check,no_root_squash"
+PRIMARY_HOST="10.0.20.36"
+PRIMARY_USER="dr-failover"
+PRIMARY_SSH_PORT="22122"
+TRANSFER_SCRIPT_WIN_PATH='D:\rman_backup\transfer_backups.ps1'
+LOG="/var/log/oracle-dr/failover.log"
+# Deliberately unquoted at call sites so the options word-split into argv.
+SSH_OPTS_PVE="-o UserKnownHostsFile=/etc/pve/priv/known_hosts -o StrictHostKeyChecking=no -o BatchMode=yes"
+
+mkdir -p "$(dirname "$LOG")"
+
+log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "$LOG"; }
+
+# Refuse to run anywhere but the DR node.
+if [ "$(hostname)" != "pvemini" ]; then
+    log "FATAL: this script must run on pvemini (current: $(hostname))"
+    exit 1
+fi
+
+log "============================================================"
+log "Oracle DR failover: pveelite -> pvemini"
+log "============================================================"
+
+# Step 1: confirm pveelite is unreachable — split-brain guard.
+log "Step 1: verifying pveelite ($PVEELITE_IP) is unreachable..."
+if ping -c 3 -W 2 "$PVEELITE_IP" >/dev/null 2>&1; then
+    log "ABORT: pveelite responds to ping. Refusing to split-brain."
+    log "  If you really want to force failover anyway:"
+    log "  1. Confirm pveelite NFS service is dead (systemctl status nfs-server)"
+    log "  2. Stop pveelite NFS first: ssh pveelite 'systemctl stop nfs-server'"
+    log "  3. Then re-run this script."
+    exit 2
+fi
+log "  pveelite unreachable, proceeding."
+
+# Step 2: flip dataset to writable (skip if a rerun already did it)
+CURRENT_RO=$(zfs get -H -o value readonly "$DATASET")
+log "Step 2: dataset readonly status = $CURRENT_RO"
+if [ "$CURRENT_RO" = "on" ]; then
+    log "  setting readonly=off on $DATASET"
+    zfs set readonly=off "$DATASET"
+else
+    log "  already writable, no change"
+fi
+
+# Step 3: NFS export (append-if-missing keeps this idempotent)
+log "Step 3: configuring NFS export on pvemini"
+EXPORT_LINE="$MOUNTPOINT $NFS_CLIENT($NFS_OPTS)"
+if grep -qF "$EXPORT_LINE" /etc/exports; then
+    log "  export already present in /etc/exports"
+else
+    log "  appending export line"
+    echo "$EXPORT_LINE" >> /etc/exports
+fi
+systemctl is-enabled --quiet nfs-server || systemctl enable nfs-server
+systemctl is-active --quiet nfs-server || systemctl start nfs-server
+exportfs -ra
+log "  active exports:"
+exportfs -v 2>&1 | sed 's/^/    /' | tee -a "$LOG"
+
+# Step 4: patch primary Oracle transfer script.
+# Use literal String.Replace (no regex). Send via PowerShell -EncodedCommand
+# (UTF-16LE base64) to bypass all bash <-> SSH <-> PowerShell quoting issues.
+log "Step 4: patching $TRANSFER_SCRIPT_WIN_PATH on $PRIMARY_HOST"
+PS_SCRIPT="\$path = '$TRANSFER_SCRIPT_WIN_PATH'
+\$old = '\"$PVEELITE_IP\"'
+\$new = '\"$PVEMINI_IP\"'
+\$content = Get-Content \$path -Raw
+if (\$content.Contains(\$old)) {
+    Set-Content -Path \$path -Value \$content.Replace(\$old, \$new) -NoNewline
+    Write-Output 'PATCHED'
+} elseif (\$content.Contains(\$new)) {
+    Write-Output 'ALREADY_FAILED_OVER'
+} else {
+    Write-Output 'UNKNOWN_DRHost_VALUE'
+}"
+PS_B64=$(printf '%s' "$PS_SCRIPT" | iconv -t UTF-16LE | base64 -w0)
+# FIX: filter was '^#< CLIXML|$' — the bare '$' alternative matches every
+# line, so grep -v discarded ALL output and PATCH_RESULT was always empty
+# (success reported as the WARNING below). '^$' drops only blank lines.
+# FIX: '|| true' — under set -e/pipefail a failed ssh (or grep with nothing
+# left to select) would abort the script here, making the WARNING branch dead.
+PATCH_RESULT=$(ssh -p "$PRIMARY_SSH_PORT" -o ConnectTimeout=10 -o BatchMode=yes \
+    "$PRIMARY_USER@$PRIMARY_HOST" \
+    "powershell -NoProfile -EncodedCommand $PS_B64" 2>&1 \
+    | grep -vE '^#< CLIXML|^$' | tr -d '\r' | head -1) || true
+if [ -n "$PATCH_RESULT" ]; then
+    log "  result: $PATCH_RESULT"
+else
+    log "  WARNING: SSH to primary failed — operator must edit $TRANSFER_SCRIPT_WIN_PATH manually"
+    log "  Set: \$DRHost = \"$PVEMINI_IP\""
+fi
+
+# Step 5: disable original replication cron entry locally too
+# (it lives on pveelite; nothing to do here, but document)
+log "Step 5: ZFS replication cron is on pveelite which is down — no action needed"
+
+# Step 6: print next steps
+log "============================================================"
+log "Failover complete on pvemini."
+log "Next steps for the operator:"
+log "  1. Verify VM 109 starts here if a DR test is needed:"
+log "     qm start 109 (once HA migrates VM 109 to pvemini, or manually)"
+log "  2. Watch the next scheduled Oracle backup land on pvemini:"
+log "     tail -f /var/log/syslog | grep nfsd"
+log "  3. When pveelite returns, run /opt/scripts/failback-dr-to-pveelite.sh"
+log "============================================================"
diff --git a/proxmox/vm109-windows-dr/scripts/pveelite-down-alert.sh b/proxmox/vm109-windows-dr/scripts/pveelite-down-alert.sh
new file mode 100644
index 0000000..55717a8
--- /dev/null
+++ b/proxmox/vm109-windows-dr/scripts/pveelite-down-alert.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+#
+# Detects pveelite outage and emails the operator with copy-paste
+# failover instructions. Runs on pvemini every minute.
+#
+# Threshold: 5 consecutive minute failures before alerting (avoids
+# false positives from short network blips). State is held in
+# /var/run/pveelite-down-counter so a flap drops back to 0.
+#
+# Schedule (cron on pvemini): * * * * * /opt/scripts/pveelite-down-alert.sh
+
+set -euo pipefail
+export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
+
+PVEELITE_IP="10.0.20.202"
+PVEMINI_IP="10.0.20.201"
+DATASET="rpool/oracle-backups"
+COUNTER_FILE="/var/run/pveelite-down-counter"
+ALERT_SENT_FILE="/var/run/pveelite-down-alerted"
+ALERT_THRESHOLD=5
+ALERT_RECIPIENT="${ALERT_RECIPIENT:-root}"
+
+if ping -c 1 -W 2 "$PVEELITE_IP" >/dev/null 2>&1; then
+    # Reset counter on success and clear "alerted" flag so a future outage re-fires.
+    rm -f "$COUNTER_FILE" "$ALERT_SENT_FILE"
+    exit 0
+fi
+
+# Failure tick: increment persisted consecutive-failure counter.
+COUNT=$(( $(cat "$COUNTER_FILE" 2>/dev/null || echo 0) + 1 ))
+echo "$COUNT" >"$COUNTER_FILE"
+
+# Left side of '&&' is exempt from set -e, so these guards are safe.
+[ "$COUNT" -lt "$ALERT_THRESHOLD" ] && exit 0
+[ -f "$ALERT_SENT_FILE" ] && exit 0   # already alerted this outage
+
+# Gather diagnostics for the email body.
+# NOTE(review): the END block prints even when no snapshot matched, yielding
+# a bare " ()" in the email — consider guarding with 'if (snap)'.
+LAST_REPL=$(zfs list -t snapshot -o name,creation -s creation 2>/dev/null \
+    | awk -v p="$DATASET@repl_" '$1 ~ p {snap=$1; ts=$2 " " $3 " " $4 " " $5 " " $6} END {print snap " (" ts ")"}')
+LAST_VM109_REPL=$(zfs list -t snapshot -o name,creation -s creation 2>/dev/null \
+    | awk '/vm-109-disk-1@__replicate_109/ {snap=$1; ts=$2 " " $3 " " $4 " " $5 " " $6} END {print snap " (" ts ")"}')
+
+# NOTE(review): SOURCE is truncated here — the heredoc that builds the alert
+# email body (and presumably pipes it to mail for $ALERT_RECIPIENT, then
+# touches $ALERT_SENT_FILE) is missing from this view; restore from the
+# original patch rather than reconstructing it.
+cat <