#!/bin/bash
#
# Failback Oracle DR storage from pvemini back to pveelite.
#
# When to run: pveelite has been brought back online and you want to
# return to the normal topology (pveelite = active, pvemini = readonly
# replica). Inverse of failover-dr-to-pvemini.sh.
#
# Sequence:
#   1. Confirm pveelite reachable (ping + SSH).
#   2. Snapshot current writable state on pvemini.
#   3. Send the snapshot incrementally to pveelite (zfs recv -F there).
#   4. Remove the NFS export on pvemini; stop nfs-server if nothing else
#      is exported.
#   5. Set pvemini readonly=on (back to replica role).
#   6. On pveelite: set readonly=off, make sure NFS is enabled/running,
#      re-export.
#   7. Patch transfer_backups.ps1 on Oracle Windows back to pveelite IP.
#   8. Log that the replication cron (which lives on pveelite) resumes.
#
# This script orchestrates from pvemini so it can SSH outward to pveelite.
#
# Exit codes:
#   1  not running on pvemini
#   2  pveelite unreachable (ping or SSH)
#   3  no common snapshot between the two nodes
#   4  zfs send/recv to pveelite failed
# Any other unhandled command failure aborts via `set -e`.

set -euo pipefail
export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"

PVEELITE_IP="10.0.20.202"
PVEMINI_IP="10.0.20.201"
DATASET="rpool/oracle-backups"
MOUNTPOINT="/mnt/pve/oracle-backups"
NFS_CLIENT="10.0.20.37"
NFS_OPTS="rw,sync,no_subtree_check,no_root_squash"
PRIMARY_HOST="10.0.20.36"
PRIMARY_USER="dr-failover"
PRIMARY_SSH_PORT="22122"
TRANSFER_SCRIPT_WIN_PATH='D:\rman_backup\transfer_backups.ps1'
LOG="/var/log/oracle-dr/failover.log"

# Kept as an array so every option reaches ssh as its own argument without
# relying on unquoted word splitting.
SSH_OPTS_PVE=(
  -o UserKnownHostsFile=/etc/pve/priv/known_hosts
  -o StrictHostKeyChecking=no
  -o BatchMode=yes
)

mkdir -p "$(dirname "$LOG")"

# Print a timestamped message to stdout and append it to $LOG.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG"
}

# Run a command on pveelite as root using the shared SSH options.
pve_ssh() {
  ssh "${SSH_OPTS_PVE[@]}" "root@$PVEELITE_IP" "$@"
}

if [[ "$(hostname)" != "pvemini" ]]; then
  log "FATAL: this script must run on pvemini (current: $(hostname))"
  exit 1
fi

log "============================================================"
log "Oracle DR failback: pvemini -> pveelite"
log "============================================================"

# Step 1: verify pveelite reachable
log "Step 1: verifying pveelite reachable"
if ! ping -c 3 -W 2 "$PVEELITE_IP" >/dev/null 2>&1; then
  log "ABORT: pveelite is still unreachable."
  exit 2
fi
if ! pve_ssh "true" 2>/dev/null; then
  log "ABORT: pveelite SSH not responding."
  exit 2
fi
log " pveelite reachable."

# Step 2: take a final snapshot on pvemini before handing back
# NOTE(review): the dataset is still exported read-write at this point, so
# anything written between this snapshot and Step 5 is not sent to
# pveelite — confirm the Oracle transfer job is idle before running.
SNAP="${DATASET}@failback_$(date +%Y%m%d_%H%M%S)"
log "Step 2: snapshot $SNAP"
zfs snapshot "$SNAP"

# Step 3: send to pveelite
log "Step 3: sending snapshot to pveelite (incremental from latest common)"

# Both listings are ordered oldest -> newest by creation time. The base
# must be the NEWEST snapshot present on both sides; sorting by name would
# pick the wrong one when snapshot name prefixes differ. Matching is by
# snapshot name only.
local_snaps=$(zfs list -H -t snapshot -o name -s creation "$DATASET")
if ! remote_snaps=$(pve_ssh \
  "zfs list -H -t snapshot -o name -s creation $DATASET 2>/dev/null"); then
  # Missing dataset or failed listing on pveelite: treat as "nothing in
  # common" so the refusal below fires instead of attempting a full send.
  remote_snaps=""
fi

# Keep the last local snapshot (newest) that also exists remotely.
common_full=$(awk '
  NR == FNR { remote[$0] = 1; next }
  ($0 in remote) { newest = $0 }
  END { print newest }
' <(printf '%s\n' "$remote_snaps") <(printf '%s\n' "$local_snaps"))
COMMON_BASE="${common_full#*@}"

if [[ -z "$COMMON_BASE" ]]; then
  log " no common snapshot — refusing to do full send (would destroy pveelite state)."
  log " Manual recovery required. Inspect: zfs list -t snapshot $DATASET on both nodes."
  exit 3
fi
log " common base: $DATASET@$COMMON_BASE"
log " sending ${DATASET}@${COMMON_BASE} -> $SNAP to pveelite"

# pipefail makes the condition fail if either zfs send or the remote recv
# fails; abort here, before the NFS export on pvemini is touched.
if ! zfs send -i "${DATASET}@${COMMON_BASE}" "$SNAP" \
  | pve_ssh "zfs recv -F $DATASET" 2>&1 \
  | tee -a "$LOG"; then
  log "ABORT: zfs send/recv to pveelite failed — pvemini left active and writable."
  exit 4
fi

# Step 4: stop NFS on pvemini, remove export
log "Step 4: stopping NFS on pvemini"
EXPORT_LINE="$MOUNTPOINT $NFS_CLIENT($NFS_OPTS)"
if grep -qF -- "$EXPORT_LINE" /etc/exports; then
  # Escape BRE metacharacters (and the '#' delimiter) so sed deletes only
  # lines containing the literal export entry.
  export_line_re=$(printf '%s' "$EXPORT_LINE" | sed 's/[][\.*^$#]/\\&/g')
  sed -i "\#${export_line_re}#d" /etc/exports
  log " export removed from /etc/exports"
fi
exportfs -ra

# Only stop NFS server if no other exports remain
if [[ -z "$(exportfs -v 2>/dev/null)" ]]; then
  systemctl stop nfs-server
  log " nfs-server stopped (no other exports)"
fi

# Step 5: pvemini back to readonly replica
log "Step 5: setting pvemini dataset readonly=on"
zfs set readonly=on "$DATASET"

# Step 6: pveelite take over as primary
log "Step 6: activating pveelite as primary"
# $DATASET is expanded locally; the rest runs in the remote shell.
pve_ssh "
set -e
zfs set readonly=off $DATASET
systemctl is-enabled --quiet nfs-server || systemctl enable nfs-server
systemctl is-active --quiet nfs-server || systemctl start nfs-server
exportfs -ra
exportfs -v
" 2>&1 | tee -a "$LOG"

# Step 7: patch primary Oracle script back (literal Replace via PS EncodedCommand)
log "Step 7: patching $TRANSFER_SCRIPT_WIN_PATH back to $PVEELITE_IP"
PS_SCRIPT="\$path = '$TRANSFER_SCRIPT_WIN_PATH'
\$old = '\"$PVEMINI_IP\"'
\$new = '\"$PVEELITE_IP\"'
\$content = Get-Content \$path -Raw
if (\$content.Contains(\$old)) {
  Set-Content -Path \$path -Value \$content.Replace(\$old, \$new) -NoNewline
  Write-Output 'PATCHED_BACK'
} elseif (\$content.Contains(\$new)) {
  Write-Output 'ALREADY_AT_PVEELITE'
} else {
  Write-Output 'UNKNOWN_DRHost_VALUE'
}"
# -EncodedCommand expects base64 of the UTF-16LE script text.
PS_B64=$(printf '%s' "$PS_SCRIPT" | iconv -t UTF-16LE | base64 -w0)

# Capture the ssh status explicitly: this step is best-effort and must not
# abort the script via set -e / pipefail.
patch_rc=0
patch_raw=$(ssh -p "$PRIMARY_SSH_PORT" -o ConnectTimeout=10 -o BatchMode=yes \
  "$PRIMARY_USER@$PRIMARY_HOST" \
  "powershell -NoProfile -EncodedCommand $PS_B64" 2>&1) || patch_rc=$?

# First line that is neither blank nor a CLIXML header. Carriage returns
# are stripped first so CR-only lines count as blank. grep exits 1 when
# nothing is left, hence the trailing `|| true`.
PATCH_RESULT=$(printf '%s\n' "$patch_raw" \
  | tr -d '\r' \
  | grep -vE '^#< CLIXML|^$' \
  | head -n 1) || true

if ((patch_rc != 0)); then
  log " WARNING: SSH to primary failed — edit \$DRHost = \"$PVEELITE_IP\" manually"
  [[ -z "$PATCH_RESULT" ]] || log " ssh output: $PATCH_RESULT"
elif [[ -n "$PATCH_RESULT" ]]; then
  log " result: $PATCH_RESULT"
else
  log " WARNING: no result from primary — verify \$DRHost = \"$PVEELITE_IP\" manually"
fi

# Step 8: replication cron on pveelite is unchanged, will resume on schedule
log "Step 8: replication cron on pveelite resumes automatically (*/15)"

log "============================================================"
log "Failback complete. pveelite is again the active NFS source."
log "============================================================"