#!/bin/bash
#
# Failover the Oracle DR storage from pveelite to pvemini.
#
# When to run: pveelite has been down long enough that the operator has
#              chosen to take over backup ingestion on pvemini rather than
#              wait. The pveelite-down email alert points the operator at
#              this script.
#
# What it does:
#   1. Confirms pveelite is actually unreachable (refuses to split-brain).
#   2. Flips rpool/oracle-backups on pvemini from readonly replica to
#      writable primary.
#   3. Configures and starts the NFS export on pvemini so VM 109 can
#      still mount /mnt/pve/oracle-backups when it boots there.
#   4. Patches transfer_backups.ps1 on the Oracle Windows production
#      host (10.0.20.36) to ship to pvemini's IP instead of pveelite's.
#   5. Takes no action on the original ZFS replication cron: that entry
#      lives on pveelite, which is down, so the script only logs this.
#   6. Prints next steps for the operator.
#
# Idempotent: rerunning is safe — each step checks before acting.
#
# Reverse: /opt/scripts/failback-dr-to-pveelite.sh once pveelite is back.
set -euo pipefail
export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"

# --- Proxmox nodes -----------------------------------------------------------
PVEELITE_IP="10.0.20.202"
PVEMINI_IP="10.0.20.201"

# --- ZFS dataset and its NFS export -----------------------------------------
DATASET="rpool/oracle-backups"
MOUNTPOINT="/mnt/pve/oracle-backups"
NFS_CLIENT="10.0.20.37"  # VM 109 NFS client
NFS_OPTS="rw,sync,no_subtree_check,no_root_squash"

# --- Oracle Windows production host (transfer script gets patched there) -----
PRIMARY_HOST="10.0.20.36"
PRIMARY_USER="dr-failover"
PRIMARY_SSH_PORT="22122"
TRANSFER_SCRIPT_WIN_PATH='D:\rman_backup\transfer_backups.ps1'

# --- Logging / misc ----------------------------------------------------------
LOG="/var/log/oracle-dr/failover.log"
SSH_OPTS_PVE="-o UserKnownHostsFile=/etc/pve/priv/known_hosts -o StrictHostKeyChecking=no -o BatchMode=yes"

# The log directory must exist before the first log() call appends to $LOG.
mkdir -p "${LOG%/*}"

# log MESSAGE...
# Write a timestamped line to stdout and append the same line to $LOG.
log() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG"
}

# Guard: only pvemini may run the failover.
case "$(hostname)" in
    pvemini)
        ;;
    *)
        log "FATAL: this script must run on pvemini (current: $(hostname))"
        exit 1
        ;;
esac

for banner_line in \
    "============================================================" \
    "Oracle DR failover: pveelite -> pvemini" \
    "============================================================"
do
    log "$banner_line"
done

# Step 1: confirm pveelite is unreachable
# A ping reply means pveelite may still be serving the dataset, so bail out
# with exit code 2 instead of creating a second writable primary.
log "Step 1: verifying pveelite ($PVEELITE_IP) is unreachable..."
if ! ping -c 3 -W 2 "$PVEELITE_IP" >/dev/null 2>&1; then
    log " pveelite unreachable, proceeding."
else
    log "ABORT: pveelite responds to ping. Refusing to split-brain."
    log " If you really want to force failover anyway:"
    log " 1. Confirm pveelite NFS service is dead (systemctl status nfs-server)"
    log " 2. Stop pveelite NFS first: ssh pveelite 'systemctl stop nfs-server'"
    log " 3. Then re-run this script."
    exit 2
fi
# Step 2: flip dataset to writable
# The property is only changed when it currently reads "on", so a rerun
# after a successful failover leaves the dataset untouched.
CURRENT_RO=$(zfs get -H -o value readonly "$DATASET")
log "Step 2: dataset readonly status = $CURRENT_RO"
if [ "$CURRENT_RO" = "on" ]; then
    log " setting readonly=off on $DATASET"
    zfs set readonly=off "$DATASET"
else
    log " already writable, no change"
fi

# Step 3: NFS export
log "Step 3: configuring NFS export on pvemini"
EXPORT_LINE="$MOUNTPOINT $NFS_CLIENT($NFS_OPTS)"
# -x demands a whole-line match: a commented-out or otherwise decorated copy
# of the line must not be mistaken for an active export.
if grep -qxF -- "$EXPORT_LINE" /etc/exports; then
    log " export already present in /etc/exports"
else
    log " appending export line"
    echo "$EXPORT_LINE" >> /etc/exports
fi
systemctl is-enabled --quiet nfs-server || systemctl enable nfs-server
systemctl is-active --quiet nfs-server || systemctl start nfs-server
exportfs -ra
log " active exports:"
exportfs -v 2>&1 | sed 's/^/ /' | tee -a "$LOG"

# Step 4: patch primary Oracle transfer script.
# Use literal String.Replace (no regex). Send via PowerShell -EncodedCommand
# (UTF-16LE base64) to bypass all bash <-> SSH <-> PowerShell quoting issues.
log "Step 4: patching $TRANSFER_SCRIPT_WIN_PATH on $PRIMARY_HOST"
PS_SCRIPT="\$path = '$TRANSFER_SCRIPT_WIN_PATH'
\$old = '\"$PVEELITE_IP\"'
\$new = '\"$PVEMINI_IP\"'
\$content = Get-Content \$path -Raw
if (\$content.Contains(\$old)) {
    Set-Content -Path \$path -Value \$content.Replace(\$old, \$new) -NoNewline
    Write-Output 'PATCHED'
} elseif (\$content.Contains(\$new)) {
    Write-Output 'ALREADY_FAILED_OVER'
} else {
    Write-Output 'UNKNOWN_DRHost_VALUE'
}"
PS_B64=$(printf '%s' "$PS_SCRIPT" | iconv -t UTF-16LE | base64 -w0)

# Run ssh on its own and record its exit status. Under `set -e` + pipefail a
# failing ssh inside a pipeline would abort the whole script before the
# manual-edit warning below could ever be printed.
ssh_rc=0
PATCH_RAW=$(ssh -p "$PRIMARY_SSH_PORT" -o ConnectTimeout=10 -o BatchMode=yes \
    "$PRIMARY_USER@$PRIMARY_HOST" \
    "powershell -NoProfile -EncodedCommand $PS_B64" 2>&1) || ssh_rc=$?

# First meaningful output line: strip CRs first so blank Windows lines really
# are empty, then drop the CLIXML header and blank lines. The second
# alternative must be anchored as '^...$' — a bare '$' matches every line and
# would make `grep -v` discard all output. `|| true` because grep exits 1
# when nothing is left, which is a legitimate outcome here.
PATCH_RESULT=$(printf '%s\n' "$PATCH_RAW" \
    | tr -d '\r' \
    | grep -vE '^#< CLIXML|^[[:space:]]*$' \
    | head -n 1) || true

if [ "$ssh_rc" -ne 0 ] || [ -z "$PATCH_RESULT" ]; then
    log " WARNING: SSH to primary failed — operator must edit $TRANSFER_SCRIPT_WIN_PATH manually"
    log " Set: \$DRHost = \"$PVEMINI_IP\""
    if [ -n "$PATCH_RESULT" ]; then
        # stderr was merged into the capture, so this is usually the ssh error.
        log " ssh exit status $ssh_rc, first output line: $PATCH_RESULT"
    fi
else
    log " result: $PATCH_RESULT"
    case "$PATCH_RESULT" in
        PATCHED|ALREADY_FAILED_OVER)
            ;;
        *)
            # UNKNOWN_DRHost_VALUE or unexpected output: the remote file was
            # not switched, so backups would still target the dead host.
            log " WARNING: transfer script was NOT switched to $PVEMINI_IP — operator must edit $TRANSFER_SCRIPT_WIN_PATH manually"
            log " Set: \$DRHost = \"$PVEMINI_IP\""
            ;;
    esac
fi

# Step 5: the original ZFS replication cron entry lives on pveelite, not on
# this host, so there is nothing to disable locally; just record that fact.
log "Step 5: ZFS replication cron is on pveelite which is down — no action needed"

# Step 6: print next steps
log "============================================================"
log "Failover complete on pvemini."
log "Next steps for the operator:"
log " 1. Verify VM 109 starts here if a DR test is needed:"
log " qm start 109 (once HA migrates VM 109 to pvemini, or manually)"
log " 2. Watch the next scheduled Oracle backup land on pvemini:"
log " tail -f /var/log/syslog | grep nfsd"
log " 3. When pveelite returns, run /opt/scripts/failback-dr-to-pveelite.sh"
log "============================================================"