From a62bcb4331fac55ec22968caa7fdc6532b308a2a Mon Sep 17 00:00:00 2001
From: Claude Agent
Date: Sat, 25 Apr 2026 19:00:04 +0000
Subject: [PATCH] feat(dr): replicate oracle-backups dataset, mirror to pve1
 nightly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Convert /mnt/pve/oracle-backups from a directory on the pveelite rootfs
into a dedicated ZFS dataset rpool/oracle-backups so it can be
incrementally replicated to pvemini. zfs-replicate-oracle-backups.sh
runs every 15 minutes from cron on pveelite and uses zfs send/recv over
the cluster's internal SSH (direct IP, /etc/pve/priv/known_hosts) to
avoid Tailscale magicDNS detours that broke the first attempt. The
destination dataset is set readonly=on so accidental writes on pvemini
cannot diverge it. Snapshot pruning keeps 5 rolling copies.

nightly-backup-mirror.sh ships a third copy nightly to pve1's
backup-ssd (ext4 SATA) — different physical disk, different filesystem,
different node — guarding against the failure mode where both pveelite
and pvemini are simultaneously unavailable. The same script tars
/etc/pve and rotates 14 days of cluster config archives, since pmxcfs
is in-RAM and a multi-node quorum loss would otherwise take cluster
config with it.

The old directory is kept as oracle-backups.old-DELETE-AFTER-2026-05-02
on pveelite for one week as a safety net.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../scripts/nightly-backup-mirror.sh        | 63 ++++++++++++++++
 .../scripts/zfs-replicate-oracle-backups.sh | 73 +++++++++++++++++++
 2 files changed, 136 insertions(+)
 create mode 100644 proxmox/vm109-windows-dr/scripts/nightly-backup-mirror.sh
 create mode 100644 proxmox/vm109-windows-dr/scripts/zfs-replicate-oracle-backups.sh

diff --git a/proxmox/vm109-windows-dr/scripts/nightly-backup-mirror.sh b/proxmox/vm109-windows-dr/scripts/nightly-backup-mirror.sh
new file mode 100644
index 0000000..cdfdb63
--- /dev/null
+++ b/proxmox/vm109-windows-dr/scripts/nightly-backup-mirror.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+#
+# Nightly mirror of Oracle backups + cluster config to pve1's backup-ssd.
+#
+# Why two redundant copies are not enough:
+#  * ZFS replica pveelite -> pvemini covers pveelite hardware failure.
+#  * If both pveelite AND pvemini are down (rare but possible — common
+#    storage controller, network rack, electrical fault), pve1 is the
+#    last copy. Keeping it on a different physical disk type (SATA
+#    ext4) further insulates against ZFS-on-NVMe-specific failures.
+#  * /etc/pve is in pmxcfs (in-RAM, replicated cluster-wide). If
+#    quorum is lost on multiple nodes simultaneously the config is
+#    unrecoverable without a backup.
+#
+# Schedule (cron on pveelite): 0 4 * * *
+
+set -euo pipefail
+export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
+
+PVE1_HOST="10.0.20.200"
+PVE1_BACKUP_DIR="/mnt/pve/backup-ssd"
+ORACLE_SRC="/mnt/pve/oracle-backups/"
+ORACLE_DST="${PVE1_BACKUP_DIR}/oracle-backups-mirror/"
+PVE_CFG_DST="${PVE1_BACKUP_DIR}/pve-config-backups"
+LOG="/var/log/oracle-dr/nightly-mirror.log"
+SSH_OPTS="-o UserKnownHostsFile=/etc/pve/priv/known_hosts -o StrictHostKeyChecking=no -o BatchMode=yes"
+KEEP_PVE_CONFIGS=14  # 2 weeks of nightly /etc/pve archives
+
+mkdir -p "$(dirname "$LOG")"
+
+log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >>"$LOG"; }
+
+log "=== Starting nightly mirror ==="
+
+# 1. Rsync Oracle backups to pve1
+log "Rsync ${ORACLE_SRC} -> ${PVE1_HOST}:${ORACLE_DST}"
+if rsync -aHX --delete -e "ssh ${SSH_OPTS}" \
+    "${ORACLE_SRC}" "root@${PVE1_HOST}:${ORACLE_DST}" 2>>"$LOG"; then
+    log "Oracle backups rsync OK"
+else
+    log "ERROR: Oracle backups rsync failed"
+fi
+
+# 2. Tar /etc/pve and ship to pve1
+TS=$(date +%Y%m%d_%H%M%S)
+ARCHIVE="pve-config-${TS}.tar.gz"
+log "Tar /etc/pve -> ${PVE1_HOST}:${PVE_CFG_DST}/${ARCHIVE}"
+if tar czf - -C / etc/pve 2>/dev/null | \
+    ssh ${SSH_OPTS} "root@${PVE1_HOST}" \
+        "cat > '${PVE_CFG_DST}/${ARCHIVE}'" 2>>"$LOG"; then
+    log "pve-config tar OK ($(ssh ${SSH_OPTS} root@${PVE1_HOST} \
+        "stat -c %s '${PVE_CFG_DST}/${ARCHIVE}'") bytes)"
+else
+    log "ERROR: pve-config tar failed"
+fi
+
+# 3. Prune old pve-config archives on pve1 (keep last KEEP_PVE_CONFIGS)
+ssh ${SSH_OPTS} "root@${PVE1_HOST}" "
+    cd '${PVE_CFG_DST}' && \
+    ls -1t pve-config-*.tar.gz 2>/dev/null | tail -n +$((KEEP_PVE_CONFIGS + 1)) | xargs -r rm -v
+" >>"$LOG" 2>&1 || true
+
+log "=== Nightly mirror completed ==="
diff --git a/proxmox/vm109-windows-dr/scripts/zfs-replicate-oracle-backups.sh b/proxmox/vm109-windows-dr/scripts/zfs-replicate-oracle-backups.sh
new file mode 100644
index 0000000..680a149
--- /dev/null
+++ b/proxmox/vm109-windows-dr/scripts/zfs-replicate-oracle-backups.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+#
+# Replicate rpool/oracle-backups from pveelite (active NFS server) to
+# pvemini (standby) every 15 minutes via incremental zfs send/recv.
+#
+# Why: NFS storage on pveelite is the single point that the DR test and
+# the daily SCP transfers from primary Oracle Windows depend on. With
+# 15-min ZFS replicas, pvemini can take over within minutes if pveelite
+# becomes unreachable (run /opt/scripts/failover-dr-to-pvemini.sh).
+#
+# Why not pvesr or pve-zsync:
+#  * pvesr only replicates VM/CT disks, not arbitrary datasets.
+#  * pve-zsync would add a package dependency for one job. zfs send
+#    over SSH is the simplest mechanism that fits the rest of the
+#    cluster's replication patterns.
+#
+# Schedule: */15 * * * * via cron on pveelite.
+# Initial sync:
+#   zfs send rpool/oracle-backups@init_<timestamp> | ssh root@<pvemini> \
+#     'zfs recv -F rpool/oracle-backups && zfs set readonly=on rpool/oracle-backups'
+#   ssh root@<pvemini> 'zfs set mountpoint=/mnt/pve/oracle-backups rpool/oracle-backups'
+
+set -euo pipefail
+export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
+
+DATASET="rpool/oracle-backups"
+TARGET_HOST="10.0.20.201"  # pvemini direct IP (avoids tailscale magicDNS detour)
+SNAP_PREFIX="repl"
+KEEP_SNAPS=5               # rolling history on source side
+LOCK="/var/run/zfs-replicate-oracle-backups.lock"
+LOG="/var/log/oracle-dr/replication.log"
+SSH_OPTS="-o UserKnownHostsFile=/etc/pve/priv/known_hosts -o StrictHostKeyChecking=no -o BatchMode=yes"
+
+mkdir -p "$(dirname "$LOG")"
+exec 9>"$LOCK"
+flock -n 9 || { echo "[$(date)] previous run still active, skipping" >>"$LOG"; exit 0; }
+
+log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >>"$LOG"; }
+
+NEW_SNAP="${DATASET}@${SNAP_PREFIX}_$(date +%Y%m%d_%H%M%S)"
+zfs snapshot "$NEW_SNAP"
+
+# Find previous replication snapshot (excluding the one we just made)
+PREV_SNAP=$(zfs list -t snapshot -o name -s creation "$DATASET" 2>/dev/null \
+    | awk -v p="${DATASET}@${SNAP_PREFIX}_" '$0 ~ p' \
+    | grep -v "$NEW_SNAP" \
+    | tail -1 || true)
+
+if [ -n "$PREV_SNAP" ]; then
+    log "Incremental send: $PREV_SNAP -> $NEW_SNAP"
+    if ! zfs send -i "$PREV_SNAP" "$NEW_SNAP" | \
+        ssh $SSH_OPTS root@${TARGET_HOST} "zfs recv -F $DATASET" 2>>"$LOG"; then
+        log "ERROR: incremental send failed"
+        zfs destroy "$NEW_SNAP" 2>/dev/null || true
+        exit 1
+    fi
+else
+    log "Full send (no previous snapshot found): $NEW_SNAP"
+    if ! zfs send "$NEW_SNAP" | \
+        ssh $SSH_OPTS root@${TARGET_HOST} "zfs recv -F $DATASET" 2>>"$LOG"; then
+        log "ERROR: full send failed"
+        zfs destroy "$NEW_SNAP" 2>/dev/null || true
+        exit 1
+    fi
+fi
+
+# Prune old snapshots on source (keep last KEEP_SNAPS)
+zfs list -t snapshot -o name -s creation "$DATASET" \
+    | awk -v p="${DATASET}@${SNAP_PREFIX}_" '$0 ~ p' \
+    | head -n -${KEEP_SNAPS} \
+    | xargs -r -n1 zfs destroy 2>>"$LOG" || true
+
+log "Replication completed successfully"
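
Deployment notes (not part of the patch). The commit message assumes a
one-time, by-hand conversion of /mnt/pve/oracle-backups from a plain
directory into the rpool/oracle-backups dataset before the cron jobs take
over; the initial seed of pvemini is documented in the header of
zfs-replicate-oracle-backups.sh. A minimal sketch of the conversion and the
cron installation, assuming the scripts are deployed to /opt/scripts/ (only
/opt/scripts/failover-dr-to-pvemini.sh is confirmed above):

    # on pveelite: move the old directory aside (kept one week per the commit
    # message), recreate the path as a ZFS dataset, copy the data back in
    mv /mnt/pve/oracle-backups /mnt/pve/oracle-backups.old-DELETE-AFTER-2026-05-02
    zfs create -o mountpoint=/mnt/pve/oracle-backups rpool/oracle-backups
    rsync -aHX /mnt/pve/oracle-backups.old-DELETE-AFTER-2026-05-02/ /mnt/pve/oracle-backups/

    # root crontab on pveelite, schedules taken from the script headers
    # (install paths are an assumption; adjust to wherever the repo scripts land)
    */15 * * * * /opt/scripts/zfs-replicate-oracle-backups.sh
    0 4 * * *    /opt/scripts/nightly-backup-mirror.sh

On its first run the replication script finds no previous @repl_ snapshot on
the source and takes its full-send branch; every later run sends an
incremental stream from the newest @repl_ snapshot.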