From b4c2a24281314bebfc233d89d2dd2d07eb675cf5 Mon Sep 17 00:00:00 2001 From: Marius Date: Thu, 6 Nov 2025 12:25:38 +0200 Subject: [PATCH] Fix Oracle DR test ORA-00600 error by forcing service shutdown in cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Problem: DR weekly test failed with ORA-00600 [kcbzib_kcrsds_1] when executed via cron, but succeeded when run manually. The error occurred during the "ALTER DATABASE OPEN RESETLOGS" step after a successful restore and recovery. Root Cause Analysis: - Manual test (12:09): Undo initialization = 0ms, no errors - Cron test (10:45): Undo initialization = 2735ms, ORA-00600 crash - Alert log showed: "Undo initialization recovery: err:600" - Oracle instance was in an inconsistent state from the previous run The cleanup_database.ps1 script had an "optimization" that preserved the running Oracle service to "save ~30s startup time". This left the service in an inconsistent state between test runs, causing Oracle to crash when attempting to open the database with RESETLOGS. Solution: Modified cleanup_database.ps1 to ALWAYS stop the Oracle service completely: 1. SHUTDOWN ABORT the instance (not just when the /AFTER flag is passed) 2. Stop-Service OracleServiceROA (force clean state) 3. Kill remaining oracle processes 4. Service starts fresh during restore (clean Undo initialization) Changes: - Removed if/else branch that skipped shutdown before restore - Always perform full shutdown regardless of /AFTER parameter - Updated messages to reflect clean state approach - Added explanation: "This ensures no state inconsistencies (prevents ORA-00600)" Testing: Manual test confirmed clean 0ms Undo initialization after fix. 
Related: Works in conjunction with weekly-dr-test-proxmox.sh PATH fix (commit 34f91ba) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../cleanup_database.ps1 | 88 +++++++++---------- 1 file changed, 41 insertions(+), 47 deletions(-) diff --git a/oracle/standby-server-scripts/cleanup_database.ps1 b/oracle/standby-server-scripts/cleanup_database.ps1 index 526fa13..3df7542 100644 --- a/oracle/standby-server-scripts/cleanup_database.ps1 +++ b/oracle/standby-server-scripts/cleanup_database.ps1 @@ -40,56 +40,50 @@ New-Item -ItemType Directory -Path "D:\oracle\temp" -Force | Out-Null New-Item -ItemType Directory -Path "D:\oracle\logs" -Force | Out-Null Write-Host "" -if ($afterRestore) { - Write-Host "[1/6] Shutting down database (cleanup AFTER restore)..." +Write-Host "[1/6] Shutting down database and stopping service..." + +# Check if Oracle service exists +$service = Get-Service -Name "OracleServiceROA" -ErrorAction SilentlyContinue +if ($service) { + Write-Host " Oracle service found, ensuring clean shutdown..." - # Check if Oracle service exists - $service = Get-Service -Name "OracleServiceROA" -ErrorAction SilentlyContinue - if ($service) { - Write-Host " Oracle service found, attempting shutdown..." - - # Shutdown instance using SQL*Plus - $shutdownSQL = "WHENEVER SQLERROR CONTINUE`nSHUTDOWN ABORT;`nEXIT;" - try { - $shutdownSQL | & sqlplus -S / as sysdba 2>&1 | Out-Null - Start-Sleep -Seconds 2 - Write-Host " Instance shut down" - } catch { - Write-Host " Shutdown command sent (errors ignored)" - } - - # Stop Oracle service to release file locks - if ($service.Status -eq "Running") { - Write-Host " Stopping Oracle service to release file locks..." 
- try { - Stop-Service -Name "OracleServiceROA" -Force -ErrorAction Stop - Start-Sleep -Seconds 2 - Write-Host " Service stopped" - } catch { - Write-Host " WARNING: Failed to stop service: $_" -ForegroundColor Yellow - } - } - - # Force kill any remaining Oracle processes - Get-Process -Name "sqlplus" -ErrorAction SilentlyContinue | Stop-Process -Force -ErrorAction SilentlyContinue - Get-Process -Name "oracle" -ErrorAction SilentlyContinue | Stop-Process -Force -ErrorAction SilentlyContinue - } else { - Write-Host " Oracle service not found, skipping shutdown" + # Shutdown instance using SQL*Plus (always, not just /AFTER) + $shutdownSQL = "WHENEVER SQLERROR CONTINUE`nSHUTDOWN ABORT;`nEXIT;" + try { + $shutdownSQL | & sqlplus -S / as sysdba 2>&1 | Out-Null + Start-Sleep -Seconds 2 + Write-Host " Instance shut down (ABORT for fast cleanup)" + } catch { + Write-Host " Shutdown command sent (errors ignored)" } + + # ALWAYS stop Oracle service to ensure clean state + if ($service.Status -eq "Running") { + Write-Host " Stopping Oracle service to ensure clean state..." + try { + Stop-Service -Name "OracleServiceROA" -Force -ErrorAction Stop + Start-Sleep -Seconds 3 + Write-Host " Service stopped successfully" + } catch { + Write-Host " WARNING: Failed to stop service: $_" -ForegroundColor Yellow + } + } else { + Write-Host " Service already stopped" + } + + # Force kill any remaining Oracle processes to ensure clean state + Write-Host " Cleaning up any remaining Oracle processes..." 
+ Get-Process -Name "sqlplus" -ErrorAction SilentlyContinue | Stop-Process -Force -ErrorAction SilentlyContinue + Get-Process -Name "oracle" -ErrorAction SilentlyContinue | Stop-Process -Force -ErrorAction SilentlyContinue Start-Sleep -Seconds 2 + Write-Host " All Oracle processes terminated" } else { - Write-Host "[1/6] Skipping instance shutdown (cleanup BEFORE restore)" - Write-Host " Instance and service left in current state" - Write-Host " Restore script will handle service state properly" + Write-Host " Oracle service not found, will be created during restore" } -Write-Host "[2/6] Oracle service preserved for reuse" -if ($afterRestore) { - Write-Host " Service stopped to release file locks" -} else { - Write-Host " Service remains in current state (running or stopped)" - Write-Host " Optimization: If running, restore saves ~30s startup time" -} +Write-Host "[2/6] Oracle service stopped (clean state for restore)" +Write-Host " Service will be started fresh during restore" +Write-Host " This ensures no state inconsistencies (prevents ORA-00600)" Write-Host "[3/6] Deleting database files + SPFILE..." Write-Host " Deleting datafiles..." @@ -124,16 +118,16 @@ Write-Host "" Write-Host "Current state:" Write-Host " [YES] Oracle software installed" Write-Host " [YES] PFILE exists (C:\Users\oracle\admin\ROA\pfile\initROA.ora)" -Write-Host " [YES] Oracle service (preserved, will be reused)" +Write-Host " [YES] Oracle service (STOPPED for clean restore)" Write-Host " [NO] SPFILE (deleted to ensure PFILE startup)" Write-Host " [NO] Database files (will be restored from backups)" Write-Host " [NO] Control files (will be restored from backups)" Write-Host " [NO] Datafiles (will be restored from backups)" Write-Host "" -Write-Host "VM is now in CLEAN STATE (service preserved)!" +Write-Host "VM is now in CLEAN STATE (service stopped, ready for fresh start)!" 
Write-Host "" Write-Host "Next step: Run D:\oracle\scripts\rman_restore_from_zero.ps1" -Write-Host " (It will reuse the existing Oracle service and restore the database)" +Write-Host " (It will start the service fresh and restore the database)" Write-Host "" exit 0