From 5750b42836d4f8de27a6acfb38b371e4e8ed6387 Mon Sep 17 00:00:00 2001 From: Marius Date: Sat, 11 Oct 2025 14:30:32 +0300 Subject: [PATCH] Oracle DR: Replace fixed VM boot wait with intelligent polling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Performance optimization for VM startup: Before: Fixed 180s wait regardless of actual boot time After: Intelligent polling with early exit when VM is ready Implementation: - Poll every 5 seconds until 180s of poll intervals have accumulated (time spent inside a failed SSH attempt, up to its 5s ConnectTimeout, is not counted) - Check 1: VM running status in Proxmox (qm status) - Check 2: SSH connectivity and PowerShell availability, tested together by one remote command (what we actually need) - Exit immediately when both checks pass - Progress logging every 30 seconds - Fallback: Continue after the 180s limit with warning Benefits: - Fast VM boot (30s) → saves 150s (2min 30s) - Normal VM boot (60s) → saves 120s (2min) - Slow VM boot → at least 180s (failed SSH attempts add wall-clock time on top of the counted intervals) - More robust: verifies SSH+PowerShell actually work Average expected improvement: 60-120 seconds per test Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- .../weekly-dr-test-proxmox.sh | 40 ++++++++++++++++++- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/oracle/standby-server-scripts/weekly-dr-test-proxmox.sh b/oracle/standby-server-scripts/weekly-dr-test-proxmox.sh index 946dc44..fbdc616 100644 --- a/oracle/standby-server-scripts/weekly-dr-test-proxmox.sh +++ b/oracle/standby-server-scripts/weekly-dr-test-proxmox.sh @@ -353,8 +353,44 @@ run_dr_test() { if qm start "$DR_VM_ID" 2>/dev/null; then vm_status_label="Running" - sleep 180 # Wait for boot - track_step "VM Startup" true "VM $DR_VM_ID started" "$step_start" + + # Intelligent VM boot wait with polling (max 180s) + local MAX_BOOT_WAIT=180 + local POLL_INTERVAL=5 + local boot_elapsed=0 + local vm_ready=false + + log "Waiting for VM to become ready (SSH + PowerShell, max ${MAX_BOOT_WAIT}s)..." 
+ + while [ $boot_elapsed -lt $MAX_BOOT_WAIT ]; do + # Check 1: VM running status in Proxmox + local vm_qm_status + vm_qm_status=$(qm status "$DR_VM_ID" 2>/dev/null | grep -o "running" || echo "") + + if [ "$vm_qm_status" = "running" ]; then + # Check 2: SSH connectivity and PowerShell availability (what we actually need) + if ssh -p "$DR_VM_PORT" -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o BatchMode=yes "$DR_VM_USER@$DR_VM_IP" \ + "powershell -Command 'Write-Output ready'" >/dev/null 2>&1; then + log "VM ready after ${boot_elapsed}s (SSH and PowerShell responding)" + vm_ready=true + break + fi + fi + + sleep $POLL_INTERVAL + boot_elapsed=$((boot_elapsed + POLL_INTERVAL)) + + # Progress logging every 30 seconds + if [ $((boot_elapsed % 30)) -eq 0 ] && [ $boot_elapsed -lt $MAX_BOOT_WAIT ]; then + log "Still waiting for VM... (${boot_elapsed}s/${MAX_BOOT_WAIT}s elapsed)" + fi + done + + if [ "$vm_ready" = false ]; then + log_warning "VM did not respond within ${MAX_BOOT_WAIT}s, continuing anyway (may cause subsequent failures)" + fi + + track_step "VM Startup" true "VM $DR_VM_ID started and ready (${boot_elapsed}s)" "$step_start" # Step 3: Verify NFS mount step_start=$(date +%s)