From a62bcb4331fac55ec22968caa7fdc6532b308a2a Mon Sep 17 00:00:00 2001
From: Claude Agent
Date: Sat, 25 Apr 2026 19:00:04 +0000
Subject: [PATCH] feat(dr): replicate oracle-backups dataset, mirror to pve1
 nightly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Convert /mnt/pve/oracle-backups from a directory on the pveelite rootfs
into a dedicated ZFS dataset rpool/oracle-backups so it can be
incrementally replicated to pvemini. zfs-replicate-oracle-backups.sh
runs every 15 minutes from cron on pveelite and uses zfs send/recv over
the cluster's internal SSH (direct IP, /etc/pve/priv/known_hosts) to
avoid Tailscale magicDNS detours that broke the first attempt. The
destination dataset is set readonly=on so accidental writes on pvemini
cannot diverge it. Snapshot pruning keeps 5 rolling copies.

nightly-backup-mirror.sh ships a third copy nightly to pve1's
backup-ssd (ext4 SATA) — different physical disk, different filesystem,
different node — guarding against the failure mode where both pveelite
and pvemini are simultaneously unavailable. The same script tars
/etc/pve and rotates 14 days of cluster config archives, since pmxcfs
is in-RAM and a multi-node quorum loss would otherwise take cluster
config with it.

The old directory is kept as oracle-backups.old-DELETE-AFTER-2026-05-02
on pveelite for one week as a safety net.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../scripts/nightly-backup-mirror.sh        | 63 ++++++++++++++++
 .../scripts/zfs-replicate-oracle-backups.sh | 73 +++++++++++++++++++
 2 files changed, 136 insertions(+)
 create mode 100644 proxmox/vm109-windows-dr/scripts/nightly-backup-mirror.sh
 create mode 100644 proxmox/vm109-windows-dr/scripts/zfs-replicate-oracle-backups.sh

diff --git a/proxmox/vm109-windows-dr/scripts/nightly-backup-mirror.sh b/proxmox/vm109-windows-dr/scripts/nightly-backup-mirror.sh
new file mode 100644
index 0000000..cdfdb63
--- /dev/null
+++ b/proxmox/vm109-windows-dr/scripts/nightly-backup-mirror.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+#
+# Nightly mirror of Oracle backups + cluster config to pve1's backup-ssd.
+#
+# Why two redundant copies are not enough:
+#  * ZFS replica pveelite -> pvemini covers pveelite hardware failure.
+#  * If both pveelite AND pvemini are down (rare but possible — common
+#    storage controller, network rack, electrical fault), pve1 is the
+#    last copy. Keeping it on a different physical disk type (SATA
+#    ext4) further insulates against ZFS-on-NVMe-specific failures.
+#  * /etc/pve is in pmxcfs (in-RAM, replicated cluster-wide). If
+#    quorum is lost on multiple nodes simultaneously the config is
+#    unrecoverable without a backup.
+#
+# Schedule (cron on pveelite): 0 4 * * *
+
+set -euo pipefail
+export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
+
+PVE1_HOST="10.0.20.200"
+PVE1_BACKUP_DIR="/mnt/pve/backup-ssd"
+ORACLE_SRC="/mnt/pve/oracle-backups/"
+ORACLE_DST="${PVE1_BACKUP_DIR}/oracle-backups-mirror/"
+PVE_CFG_DST="${PVE1_BACKUP_DIR}/pve-config-backups"
+LOG="/var/log/oracle-dr/nightly-mirror.log"
+SSH_OPTS="-o UserKnownHostsFile=/etc/pve/priv/known_hosts -o StrictHostKeyChecking=no -o BatchMode=yes"
+KEEP_PVE_CONFIGS=14  # 2 weeks of nightly /etc/pve archives
+
+mkdir -p "$(dirname "$LOG")"
+
+log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >>"$LOG"; }
+
+log "=== Starting nightly mirror ==="
+
+# 1. Rsync Oracle backups to pve1
+log "Rsync ${ORACLE_SRC} -> ${PVE1_HOST}:${ORACLE_DST}"
+if rsync -aHX --delete -e "ssh ${SSH_OPTS}" \
+    "${ORACLE_SRC}" "root@${PVE1_HOST}:${ORACLE_DST}" 2>>"$LOG"; then
+    log "Oracle backups rsync OK"
+else
+    log "ERROR: Oracle backups rsync failed"
+fi
+
+# 2. Tar /etc/pve and ship to pve1
+TS=$(date +%Y%m%d_%H%M%S)
+ARCHIVE="pve-config-${TS}.tar.gz"
+log "Tar /etc/pve -> ${PVE1_HOST}:${PVE_CFG_DST}/${ARCHIVE}"
+if tar czf - -C / etc/pve 2>/dev/null | \
+    ssh ${SSH_OPTS} "root@${PVE1_HOST}" \
+        "cat > '${PVE_CFG_DST}/${ARCHIVE}'" 2>>"$LOG"; then
+    log "pve-config tar OK ($(ssh ${SSH_OPTS} root@${PVE1_HOST} \
+        "stat -c %s '${PVE_CFG_DST}/${ARCHIVE}'") bytes)"
+else
+    log "ERROR: pve-config tar failed"
+fi
+
+# 3. Prune old pve-config archives on pve1 (keep last KEEP_PVE_CONFIGS)
+ssh ${SSH_OPTS} "root@${PVE1_HOST}" "
+    cd '${PVE_CFG_DST}' && \
+    ls -1t pve-config-*.tar.gz 2>/dev/null | tail -n +$((KEEP_PVE_CONFIGS + 1)) | xargs -r rm -v
+" >>"$LOG" 2>&1 || true
+
+log "=== Nightly mirror completed ==="
diff --git a/proxmox/vm109-windows-dr/scripts/zfs-replicate-oracle-backups.sh b/proxmox/vm109-windows-dr/scripts/zfs-replicate-oracle-backups.sh
new file mode 100644
index 0000000..680a149
--- /dev/null
+++ b/proxmox/vm109-windows-dr/scripts/zfs-replicate-oracle-backups.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+#
+# Replicate rpool/oracle-backups from pveelite (active NFS server) to
+# pvemini (standby) every 15 minutes via incremental zfs send/recv.
+#
+# Why: NFS storage on pveelite is the single point that the DR test and
+# the daily SCP transfers from primary Oracle Windows depend on. With
+# 15-min ZFS replicas, pvemini can take over within minutes if pveelite
+# becomes unreachable (run /opt/scripts/failover-dr-to-pvemini.sh).
+#
+# Why not pvesr or pve-zsync:
+#  * pvesr only replicates VM/CT disks, not arbitrary datasets.
+#  * pve-zsync would add a package dependency for one job. zfs send
+#    over SSH is the simplest mechanism that fits the rest of the
+#    cluster's replication patterns.
+#
+# Schedule: */15 * * * * via cron on pveelite.
+# Initial sync:
+#   zfs send rpool/oracle-backups@init_<timestamp> | ssh root@<pvemini> \
+#     'zfs recv -F rpool/oracle-backups && zfs set readonly=on rpool/oracle-backups'
+#   ssh root@<pvemini> 'zfs set mountpoint=/mnt/pve/oracle-backups rpool/oracle-backups'
+
+set -euo pipefail
+export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
+
+DATASET="rpool/oracle-backups"
+TARGET_HOST="10.0.20.201"  # pvemini direct IP (avoids tailscale magicDNS detour)
+SNAP_PREFIX="repl"
+KEEP_SNAPS=5               # rolling history on source side
+LOCK="/var/run/zfs-replicate-oracle-backups.lock"
+LOG="/var/log/oracle-dr/replication.log"
+SSH_OPTS="-o UserKnownHostsFile=/etc/pve/priv/known_hosts -o StrictHostKeyChecking=no -o BatchMode=yes"
+
+mkdir -p "$(dirname "$LOG")"
+exec 9>"$LOCK"
+flock -n 9 || { echo "[$(date)] previous run still active, skipping" >>"$LOG"; exit 0; }
+
+log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >>"$LOG"; }
+
+NEW_SNAP="${DATASET}@${SNAP_PREFIX}_$(date +%Y%m%d_%H%M%S)"
+zfs snapshot "$NEW_SNAP"
+
+# Find previous replication snapshot (excluding the one we just made)
+PREV_SNAP=$(zfs list -t snapshot -o name -s creation "$DATASET" 2>/dev/null \
+    | awk -v p="${DATASET}@${SNAP_PREFIX}_" '$0 ~ p' \
+    | grep -v "$NEW_SNAP" \
+    | tail -1 || true)
+
+if [ -n "$PREV_SNAP" ]; then
+    log "Incremental send: $PREV_SNAP -> $NEW_SNAP"
+    if ! zfs send -i "$PREV_SNAP" "$NEW_SNAP" | \
+        ssh $SSH_OPTS root@${TARGET_HOST} "zfs recv -F $DATASET" 2>>"$LOG"; then
+        log "ERROR: incremental send failed"
+        zfs destroy "$NEW_SNAP" 2>/dev/null || true
+        exit 1
+    fi
+else
+    log "Full send (no previous snapshot found): $NEW_SNAP"
+    if ! zfs send "$NEW_SNAP" | \
+        ssh $SSH_OPTS root@${TARGET_HOST} "zfs recv -F $DATASET" 2>>"$LOG"; then
+        log "ERROR: full send failed"
+        zfs destroy "$NEW_SNAP" 2>/dev/null || true
+        exit 1
+    fi
+fi
+
+# Prune old snapshots on source (keep last KEEP_SNAPS)
+zfs list -t snapshot -o name -s creation "$DATASET" \
+    | awk -v p="${DATASET}@${SNAP_PREFIX}_" '$0 ~ p' \
+    | head -n -${KEEP_SNAPS} \
+    | xargs -r -n1 zfs destroy 2>>"$LOG" || true
+
+log "Replication completed successfully"
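
Deployment notes (not part of the patch). The commit message assumes a
one-time, by-hand conversion of /mnt/pve/oracle-backups from a plain
directory into the rpool/oracle-backups dataset before the cron jobs take
over; the initial seed of pvemini is documented in the header of
zfs-replicate-oracle-backups.sh. A minimal sketch of the conversion and the
cron installation, assuming the scripts are deployed to /opt/scripts/ (only
/opt/scripts/failover-dr-to-pvemini.sh is confirmed above):

    # on pveelite: move the old directory aside (kept one week per the commit
    # message), recreate the path as a ZFS dataset, copy the data back in
    mv /mnt/pve/oracle-backups /mnt/pve/oracle-backups.old-DELETE-AFTER-2026-05-02
    zfs create -o mountpoint=/mnt/pve/oracle-backups rpool/oracle-backups
    rsync -aHX /mnt/pve/oracle-backups.old-DELETE-AFTER-2026-05-02/ /mnt/pve/oracle-backups/

    # root crontab on pveelite, schedules taken from the script headers
    # (install paths are an assumption; adjust to wherever the repo scripts land)
    */15 * * * * /opt/scripts/zfs-replicate-oracle-backups.sh
    0 4 * * *    /opt/scripts/nightly-backup-mirror.sh

On its first run the replication script finds no previous @repl_ snapshot on
the source and takes its full-send branch; every later run sends an
incremental stream from the newest @repl_ snapshot.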