Oracle DR: Fix RMAN crosscheck sequence and improve error handling

- Fix CROSSCHECK BACKUP command to execute after database is mounted
- Correct CATALOG command to use recovery_area instead of F:\ path
- Add robust backup file validation with detailed error reporting
- Improve file-by-file backup copying with individual error tracking
- Enhance restore log collection for both success and failure scenarios
- Fix database verification to check OPEN_MODE instead of STATUS
- Add comprehensive directory and permissions error handling

Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com>
This commit is contained in:
Marius
2025-10-11 10:32:49 +03:00
parent 9ed0ee9e0e
commit 3a51880c9e
2 changed files with 115 additions and 11 deletions

View File

@@ -45,9 +45,15 @@ if (-not (Test-Path "F:\ROA\autobackup")) {
Write-Host "[OK] F:\ROA\autobackup is accessible"
Write-Host ""
# Create directories
New-Item -ItemType Directory -Path "D:\oracle\temp" -Force | Out-Null
New-Item -ItemType Directory -Path "D:\oracle\logs" -Force | Out-Null
# Make sure the working directories exist; abort the script if either cannot be created
try {
    foreach ($requiredDir in "D:\oracle\temp", "D:\oracle\logs") {
        # -ErrorAction Stop turns a creation failure into a terminating error so catch runs
        New-Item -ItemType Directory -Path $requiredDir -Force -ErrorAction Stop | Out-Null
    }
    Write-Host "[OK] Created required directories"
} catch {
    Write-Host "ERROR: Failed to create directories: $_" -ForegroundColor Red
    exit 1
}
Write-Host "============================================"
Write-Host "STEP 1: CLEANUP - Delete existing database"
@@ -115,12 +121,70 @@ $logFile = "D:\oracle\logs\restore_from_zero.log"
New-Item -ItemType Directory -Path "C:\Users\oracle\recovery_area\ROA\autobackup" -Force | Out-Null
Write-Host "[INFO] Copying all backups from F:\ROA\autobackup to recovery area..."
Write-Host " This may take 1-2 minutes for ~10 GB of backups..."
Copy-Item "F:\ROA\autobackup\*.BKP" "C:\Users\oracle\recovery_area\ROA\autobackup\" -Force -ErrorAction Stop
if ($LASTEXITCODE -ne 0) {
Write-Host "ERROR: Failed to copy backups from F:\" -ForegroundColor Red
# Check backup files exist on F: drive before copying.
# -ErrorAction Stop makes enumeration errors terminating so the catch block can run;
# with -ErrorAction Continue the error stays non-terminating and catch is never reached.
# @() forces an array so .Count is well-defined for zero or one matching file.
try {
    $backupFiles = @(Get-ChildItem "F:\ROA\autobackup\*.BKP" -ErrorAction Stop)
} catch {
    Write-Host "WARNING: Cannot enumerate backup files on F: drive - $_" -ForegroundColor Yellow
    $backupFiles = @()
}
# Abort when fewer than two backup files were found, after printing what the
# source directory actually contains to help diagnose the problem.
if ($backupFiles.Count -lt 2) {
    Write-Host "ERROR: Insufficient backup files found on F: drive (found: $($backupFiles.Count))" -ForegroundColor Red
    Write-Host "       At least 2 backup files required for successful restore"
    Write-Host "       Checking F:\ROA\autobackup directory..."
    try {
        # -ErrorAction Stop is required for the catch below to be reachable;
        # @() keeps .Count correct when the directory holds zero or one entry.
        $dirCheck = @(Get-ChildItem "F:\ROA\autobackup" -ErrorAction Stop)
        Write-Host "       Directory contents: $($dirCheck.Count) files"
        foreach ($file in $dirCheck) {
            # Round to two decimals, matching the total-size message printed later
            Write-Host "         $($file.Name) - $([math]::Round($file.Length / 1GB, 2)) GB" -ForegroundColor Gray
        }
    } catch {
        Write-Host "       Cannot access directory: $_" -ForegroundColor Red
    }
    exit 1
}
Write-Host "[OK] All backups copied to recovery area"
# Report how many backup files were found and their combined size in GB
$totalSizeGB = [math]::Round(($backupFiles | Measure-Object -Property Length -Sum).Sum / 1GB, 2)
Write-Host "[INFO] Found $($backupFiles.Count) backup files, total size: $totalSizeGB GB"
# Copy the files one at a time so each failure is attributed to a specific file
Write-Host "[INFO] Starting backup copy operation..."
$copyErrors = @()
foreach ($backupFile in $backupFiles) {
    $backupName = $backupFile.Name
    try {
        Write-Host "[INFO] Copying $backupName..."
        Copy-Item $backupFile.FullName "C:\Users\oracle\recovery_area\ROA\autobackup\" -Force -ErrorAction Stop
        Write-Host "[OK] Copied $backupName" -ForegroundColor Green
    } catch {
        # Record the failure and keep going; the collected errors are reported after the loop
        Write-Host "ERROR: Failed to copy $backupName - $_" -ForegroundColor Red
        $copyErrors += "${backupName}: $_"
    }
}
# Abort if any file failed to copy, listing every recorded failure first.
if ($copyErrors.Count -gt 0) {
    Write-Host "ERROR: Backup copy failed for $($copyErrors.Count) files" -ForegroundColor Red
    # The loop variable must not be named $error: $Error is a PowerShell automatic
    # variable and cannot be used as a foreach iteration variable.
    foreach ($copyError in $copyErrors) {
        Write-Host "  $copyError" -ForegroundColor Red
    }
    exit 1
}
# Verify copied backups: the number of .BKP files in the recovery area must
# equal the number enumerated on the source drive.
# NOTE(review): .BKP files already present in the destination before the copy
# would also be counted here - confirm the earlier cleanup step removes them.
try {
    # -ErrorAction Stop makes a listing failure terminating so the catch block runs;
    # @() keeps .Count well-defined for zero or one file.
    $copiedFiles = @(Get-ChildItem "C:\Users\oracle\recovery_area\ROA\autobackup\*.BKP" -ErrorAction Stop)
} catch {
    Write-Host "ERROR: Cannot verify copied backups - $_" -ForegroundColor Red
    exit 1
}
if ($copiedFiles.Count -ne $backupFiles.Count) {
    Write-Host "ERROR: Backup copy verification failed - file count mismatch" -ForegroundColor Red
    Write-Host "       Expected: $($backupFiles.Count), Copied: $($copiedFiles.Count)"
    exit 1
}
Write-Host "[OK] All $($copiedFiles.Count) backups copied and verified to recovery area"
# Create RMAN script
$rmanContent = @"
@@ -134,7 +198,10 @@ RUN {
ALTER DATABASE MOUNT;
CATALOG START WITH 'F:/ROA/autobackup' NOPROMPT;
CATALOG START WITH 'C:/USERS/ORACLE/RECOVERY_AREA/ROA/AUTOBACKUP' NOPROMPT;
CROSSCHECK BACKUP;
DELETE NOPROMPT EXPIRED BACKUP;
RUN {
ALLOCATE CHANNEL ch1 DEVICE TYPE DISK;
@@ -152,6 +219,8 @@ RUN {
ALTER DATABASE OPEN RESETLOGS;
DELETE NOPROMPT OBSOLETE;
EXIT;
"@

View File

@@ -375,7 +375,7 @@ run_dr_test() {
# Use PowerShell to query database status
db_status=$(ssh -p "$DR_VM_PORT" "$DR_VM_USER@$DR_VM_IP" \
"powershell -Command \"'SELECT STATUS FROM V\`\$INSTANCE;' | sqlplus -s / as sysdba | Select-String 'OPEN'\"" || echo "")
"powershell -Command \"'SELECT OPEN_MODE FROM V\\\$DATABASE;' | sqlplus -s / as sysdba | findstr 'READ WRITE'\"" || echo "")
# Use PowerShell to count tables
tables_restored=$(ssh -p "$DR_VM_PORT" "$DR_VM_USER@$DR_VM_IP" \
@@ -383,7 +383,7 @@ run_dr_test() {
tables_restored=$(echo "$tables_restored" | tr -cd '0-9')
[ -z "$tables_restored" ] && tables_restored=0
if [[ "$db_status" =~ "OPEN" ]]; then
if [[ "$db_status" =~ "READ WRITE" ]]; then
track_step "Database Verification" true "Database OPEN, $tables_restored tables" "$step_start"
test_result="PASSED"
severity="info"
@@ -392,11 +392,27 @@ run_dr_test() {
track_step "Database Verification" false "Database not OPEN" "$step_start"
fi
# Collect restore log from VM
# Collect restore log from VM (always attempt collection).
# The exit status of "ssh ... | head" is head's status, so an "|| echo" fallback
# placed after the pipeline does not fire when the remote "type" fails (unless
# pipefail is enabled). A missing log is therefore detected by empty output.
log "Collecting restore log from DR VM..."
restore_log=$(ssh -p "$DR_VM_PORT" "$DR_VM_USER@$DR_VM_IP" \
  "type D:\\oracle\\logs\\restore_from_zero.log 2>nul" | head -200) || true
[[ -n "$restore_log" ]] || restore_log="Log not available"

# If not found, try the alternate location
if [[ "$restore_log" == "Log not available" ]]; then
  restore_log=$(ssh -p "$DR_VM_PORT" "$DR_VM_USER@$DR_VM_IP" \
    "type D:\\oracle\\temp\\restore_from_zero.log 2>nul" | head -200) || true
  [[ -n "$restore_log" ]] || restore_log="Log not available"
fi

# Still not found: list whatever log files exist so the report says what is there
if [[ "$restore_log" == "Log not available" ]]; then
  log "Checking for any restore logs in DR VM..."
  log_check=$(ssh -p "$DR_VM_PORT" "$DR_VM_USER@$DR_VM_IP" \
    "dir D:\\oracle\\logs\\*.log 2>nul || dir D:\\oracle\\temp\\*.log 2>nul || echo 'No logs found'" 2>/dev/null) \
    || log_check="Connection error"
  # Only claim that files were found when the listing actually succeeded;
  # an ssh failure or an empty reply must not be reported as "Log files found".
  if [[ -n "$log_check" \
    && "$log_check" != *"No logs found"* \
    && "$log_check" != "Connection error" ]]; then
    restore_log="Log files found but could not be read. Available files: $log_check"
  fi
fi
# Step 6: Cleanup
step_start=$(date +%s)
log "STEP 6: Running cleanup"
@@ -408,6 +424,25 @@ run_dr_test() {
track_step "Cleanup" true "Database cleaned, ~${cleanup_freed}GB freed" "$step_start"
else
# Collect restore log even when restore fails.
# The exit status of "ssh ... | head" is head's status, so a trailing "|| echo"
# fallback does not fire when the remote "type" fails (unless pipefail is
# enabled). A missing log is therefore detected by empty output.
log "Collecting restore log after failure..."
restore_log=$(ssh -p "$DR_VM_PORT" "$DR_VM_USER@$DR_VM_IP" \
  "type D:\\oracle\\logs\\restore_from_zero.log 2>nul" | head -200) || true
[[ -n "$restore_log" ]] || restore_log="Log not available"

# Try the alternate location
if [[ "$restore_log" == "Log not available" ]]; then
  restore_log=$(ssh -p "$DR_VM_PORT" "$DR_VM_USER@$DR_VM_IP" \
    "type D:\\oracle\\temp\\restore_from_zero.log 2>nul" | head -200) || true
  [[ -n "$restore_log" ]] || restore_log="Log not available"
fi

# Always try to get some error output: fall back to the tail of the RMAN script.
# The remote command avoids "||" (pipeline chain operators exist only in
# PowerShell 7+) and uses double quotes, matching the other powershell -Command
# calls in this script. Success is judged locally by non-empty output, so an
# ssh failure is no longer reported as RMAN script content.
if [[ "$restore_log" == "Log not available" ]]; then
  last_error=$(ssh -p "$DR_VM_PORT" "$DR_VM_USER@$DR_VM_IP" \
    "powershell -Command \"Get-Content D:\\oracle\\temp\\*.rman -Tail 20 -ErrorAction SilentlyContinue\"" 2>/dev/null) \
    || true
  if [[ -n "$last_error" ]]; then
    restore_log="RMAN script content (last 20 lines):$last_error"
  fi
fi
track_step "Database Restore" false "Restore failed" "$step_start"
fi