ROMFASTSQL/proxmox/vm109-windows-dr/scripts/failover-dr-to-pvemini.sh

#!/bin/bash
#
# Failover the Oracle DR storage from pveelite to pvemini.
#
# When to run: pveelite is dead long enough that the user has chosen to
# take over backup ingestion on pvemini rather than wait. The
# pveelite-down email alert points the operator at this script.
#
# What it does:
#   1. Confirms pveelite is actually unreachable (refuses to split-brain).
#   2. Flips rpool/oracle-backups on pvemini from readonly replica to
#      writable primary.
#   3. Configures and starts the NFS export on pvemini so VM 109 can
#      still mount /mnt/pve/oracle-backups when it boots there.
#   4. Patches transfer_backups.ps1 on the Oracle Windows production
#      host (10.0.20.36) to ship to pvemini's IP instead of pveelite's.
#   5. Logs that the original ZFS replication cron needs no action here
#      (it lives on pveelite, which is down; nothing is disabled locally).
#   6. Prints next steps for the operator.
#
# Idempotent: rerunning is safe — each step checks before acting.
#
# Reverse: /opt/scripts/failback-dr-to-pveelite.sh once pveelite is back.

# Strict mode: abort on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Pin PATH to the standard system directories so tool lookup does not depend
# on the caller's environment.
export PATH='/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin'

# --- Proxmox nodes ---------------------------------------------------------
PVEELITE_IP='10.0.20.202'
PVEMINI_IP='10.0.20.201'

# --- Storage and NFS export ------------------------------------------------
DATASET='rpool/oracle-backups'
MOUNTPOINT='/mnt/pve/oracle-backups'
NFS_CLIENT='10.0.20.37'     # VM 109 NFS client
NFS_OPTS='rw,sync,no_subtree_check,no_root_squash'

# --- Oracle Windows production host (target of the Step 4 patch) -----------
PRIMARY_HOST='10.0.20.36'
PRIMARY_USER='dr-failover'
PRIMARY_SSH_PORT='22122'
TRANSFER_SCRIPT_WIN_PATH='D:\rman_backup\transfer_backups.ps1'

# --- Logging ---------------------------------------------------------------
LOG='/var/log/oracle-dr/failover.log'

# SSH options for node-to-node connections.
# NOTE(review): not referenced anywhere in the visible part of this script.
SSH_OPTS_PVE='-o UserKnownHostsFile=/etc/pve/priv/known_hosts -o StrictHostKeyChecking=no -o BatchMode=yes'

# Make sure the directory that holds the log file exists before first use.
mkdir -p "$(dirname "$LOG")"

# log MESSAGE...
# Prefixes the arguments (joined into one string) with a
# "[YYYY-MM-DD HH:MM:SS]" timestamp, prints the line to stdout and appends
# the same line to $LOG.
log() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" \
        | tee -a "$LOG"
}

# Host guard: everything below acts on local state, so refuse to run on any
# machine whose hostname is not exactly "pvemini".
case "$(hostname)" in
    pvemini)
        ;;
    *)
        log "FATAL: this script must run on pvemini (current: $(hostname))"
        exit 1
        ;;
esac

# Opening banner for this run.
BANNER_RULE="============================================================"
log "$BANNER_RULE"
log "Oracle DR failover: pveelite -> pvemini"
log "$BANNER_RULE"

# Step 1: split-brain guard.
# Send three ICMP probes (2 s timeout each) to pveelite. Any reply means the
# node may still be up, so the script stops with exit status 2 and tells the
# operator how to force a failover by hand.
log "Step 1: verifying pveelite ($PVEELITE_IP) is unreachable..."
if ! ping -c 3 -W 2 "$PVEELITE_IP" &>/dev/null; then
    log "  pveelite unreachable, proceeding."
else
    log "ABORT: pveelite responds to ping. Refusing to split-brain."
    log "  If you really want to force failover anyway:"
    log "    1. Confirm pveelite NFS service is dead (systemctl status nfs-server)"
    log "    2. Stop pveelite NFS first: ssh pveelite 'systemctl stop nfs-server'"
    log "    3. Then re-run this script."
    exit 2
fi

# Step 2: make the replica dataset writable.
# Read the current value of the ZFS "readonly" property and clear it only
# when it is "on"; any other value is treated as already writable, which
# keeps re-runs harmless.
CURRENT_RO=$(zfs get -H -o value readonly "$DATASET")
log "Step 2: dataset readonly status = $CURRENT_RO"
case "$CURRENT_RO" in
    on)
        log "  setting readonly=off on $DATASET"
        zfs set readonly=off "$DATASET"
        ;;
    *)
        log "  already writable, no change"
        ;;
esac

# Step 3: NFS export
# Ensure /etc/exports carries an active entry exporting $MOUNTPOINT to the
# VM 109 NFS client, make sure nfs-server is enabled and running, then reload
# the export table and record the resulting exports in the log.
log "Step 3: configuring NFS export on pvemini"
EXPORT_LINE="$MOUNTPOINT $NFS_CLIENT($NFS_OPTS)"
EXPORTS_FILE="/etc/exports"

# export_is_active LINE FILE
# Succeeds only when FILE has a non-comment line that contains LINE as a
# literal substring. A plain `grep -F` would also match a commented-out copy
# of the entry and wrongly report the export as present. A single awk process
# is used (no pipeline) so `pipefail` cannot turn a match into a failure.
export_is_active() {
    [ -r "$2" ] || return 1
    WANT="$1" awk '
        /^[[:space:]]*#/            { next }
        index($0, ENVIRON["WANT"])  { found = 1 }
        END                         { exit !found }
    ' "$2"
}

if export_is_active "$EXPORT_LINE" "$EXPORTS_FILE"; then
    log "  export already present in /etc/exports"
else
    log "  appending export line"
    # If the file does not end in a newline, terminate its last line first;
    # otherwise the new entry would be glued onto the previous one.
    # ($(...) strips a trailing newline, so non-empty means "no newline".)
    if [ -s "$EXPORTS_FILE" ] && [ -n "$(tail -c 1 "$EXPORTS_FILE")" ]; then
        echo >> "$EXPORTS_FILE"
    fi
    echo "$EXPORT_LINE" >> "$EXPORTS_FILE"
fi

# Enable/start only when needed so re-runs do not touch a healthy service.
systemctl is-enabled --quiet nfs-server || systemctl enable nfs-server
systemctl is-active  --quiet nfs-server || systemctl start  nfs-server

# Re-read /etc/exports so the entry takes effect even if nfs-server was
# already running.
exportfs -ra
log "  active exports:"
exportfs -v 2>&1 | sed 's/^/    /' | tee -a "$LOG"

# Step 4: patch primary Oracle transfer script.
# Use literal String.Replace (no regex). Send via PowerShell -EncodedCommand
# (UTF-16LE base64) to bypass all bash <-> SSH <-> PowerShell quoting issues.
#
# This step is best-effort: whatever happens, the script continues, and any
# outcome other than a confirmed patch tells the operator to edit the file
# by hand.
log "Step 4: patching $TRANSFER_SCRIPT_WIN_PATH on $PRIMARY_HOST"

# $ErrorActionPreference = 'Stop' makes a failing Get-Content/Set-Content
# terminate the remote script with a non-zero exit status instead of
# continuing and printing 'PATCHED' after a write that did not happen.
PS_SCRIPT="\$ErrorActionPreference = 'Stop'
\$path = '$TRANSFER_SCRIPT_WIN_PATH'
\$old  = '\"$PVEELITE_IP\"'
\$new  = '\"$PVEMINI_IP\"'
\$content = Get-Content \$path -Raw
if (\$content.Contains(\$old)) {
    Set-Content -Path \$path -Value \$content.Replace(\$old, \$new) -NoNewline
    Write-Output 'PATCHED'
} elseif (\$content.Contains(\$new)) {
    Write-Output 'ALREADY_FAILED_OVER'
} else {
    Write-Output 'UNKNOWN_DRHost_VALUE'
}"
PS_B64=$(printf '%s' "$PS_SCRIPT" | iconv -f UTF-8 -t UTF-16LE | base64 -w0)

# Capture ssh's exit status explicitly. Inside a pipeline, `set -e` together
# with `pipefail` would abort the whole script on an ssh failure, so the
# manual-edit warning below could never be printed.
SSH_RC=0
SSH_OUTPUT=$(ssh -p "$PRIMARY_SSH_PORT" -o ConnectTimeout=10 -o BatchMode=yes \
        "$PRIMARY_USER@$PRIMARY_HOST" \
        "powershell -NoProfile -EncodedCommand $PS_B64" 2>&1) || SSH_RC=$?

# First non-blank line that is not PowerShell CLIXML framing. awk reads all
# of its input and always exits 0, so neither an empty result nor an early
# reader exit can fail the pipeline.
PATCH_RESULT=$(printf '%s\n' "$SSH_OUTPUT" | tr -d '\r' \
        | awk '!seen && NF && !/^#< CLIXML|<Objs |<\/Objs>$/ { print; seen = 1 }')

if [ "$SSH_RC" -ne 0 ]; then
    log "  WARNING: SSH to primary failed — operator must edit $TRANSFER_SCRIPT_WIN_PATH manually"
    log "  Set: \$DRHost = \"$PVEMINI_IP\""
    log "  (ssh exit status $SSH_RC${PATCH_RESULT:+: $PATCH_RESULT})"
else
    case "$PATCH_RESULT" in
        PATCHED|ALREADY_FAILED_OVER)
            log "  result: $PATCH_RESULT"
            ;;
        *)
            # UNKNOWN_DRHost_VALUE, empty output, or anything unexpected:
            # the file was not confirmed as pointing at pvemini.
            log "  result: ${PATCH_RESULT:-<no output>}"
            log "  WARNING: transfer script was NOT patched — operator must edit $TRANSFER_SCRIPT_WIN_PATH manually"
            log "  Set: \$DRHost = \"$PVEMINI_IP\""
            ;;
    esac
fi

# Step 5: replication cron.
# Nothing is changed on this host; the step only records in the log that the
# ZFS replication cron lives on pveelite and therefore needs no action here.
log "Step 5: ZFS replication cron is on pveelite which is down — no action needed"

# Step 6: operator hand-off.
# Emit the closing banner and the follow-up checklist, one log entry per line.
NEXT_STEPS=(
    "============================================================"
    "Failover complete on pvemini."
    "Next steps for the operator:"
    "  1. Verify VM 109 starts here if a DR test is needed:"
    "       qm start 109   (once HA migrates VM 109 to pvemini, or manually)"
    "  2. Watch the next scheduled Oracle backup land on pvemini:"
    "       tail -f /var/log/syslog | grep nfsd"
    "  3. When pveelite returns, run /opt/scripts/failback-dr-to-pveelite.sh"
    "============================================================"
)
for step_line in "${NEXT_STEPS[@]}"; do
    log "$step_line"
done