#!/bin/bash
#
# ROA2WEB Comprehensive Health Check Script
# Monitors all services and provides detailed health information.
#
# Usage: health-check.sh {full|quick|services|network|database|watch}
#
# Exit status (full/quick): 0 when no ERROR-level issue was recorded,
# 1 otherwise. WARNING-level issues are reported but do not fail the run.

# NOTE: errexit is kept, but every individual health probe is invoked in a
# way that tolerates a non-zero status (see run_check); otherwise the first
# unhealthy service would abort the script before the summary is printed.
set -e

# Configuration
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
LOG_FILE="$PROJECT_DIR/health-check.log"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# Health check results (mutated by add_issue)
OVERALL_HEALTH=true
ISSUES=()

#######################################
# Write a timestamped message to stdout and append it to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - level label, remaining args - message text
#######################################
log() {
  local level=$1
  shift
  local message="$*"
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "[$timestamp] [$level] $message" | tee -a "$LOG_FILE"
}

#######################################
# Print the icon for a status keyword.
# Arguments: $1 - one of healthy|warning|error|info (anything else -> ❓)
#######################################
status_icon() {
  local status=$1
  case "$status" in
    "healthy") echo "✅" ;;
    "warning") echo "⚠️" ;;
    "error") echo "❌" ;;
    "info") echo "ℹ️" ;;
    *) echo "❓" ;;
  esac
}

#######################################
# Print a section header.
# Arguments: $1 - title
#######################################
section_header() {
  local title=$1
  echo ""
  echo -e "${BLUE}=================================${NC}"
  echo -e "${BLUE}$title${NC}"
  echo -e "${BLUE}=================================${NC}"
}

#######################################
# Record an issue for the final report.
# Globals:   ISSUES (appended), OVERALL_HEALTH (set to false on ERROR)
# Arguments: $1 - severity (ERROR|WARNING), $2 - component, $3 - message
#######################################
add_issue() {
  local severity=$1
  local component=$2
  local message=$3

  ISSUES+=("[$severity] $component: $message")
  if [[ "$severity" == "ERROR" ]]; then
    OVERALL_HEALTH=false
  fi
}

#######################################
# Numeric "greater than" that accepts decimals, implemented with awk so the
# script does not depend on bc. Non-numeric input is treated as 0 by awk.
# Arguments: $1 - left value, $2 - right value
# Returns:   0 if $1 > $2, 1 otherwise
#######################################
float_gt() {
  awk -v a="$1" -v b="$2" 'BEGIN { exit !((a + 0) > (b + 0)) }'
}

#######################################
# Run a check and discard its exit status so that errexit does not abort
# the script; the check itself records problems through add_issue.
# Arguments: command and its arguments
#######################################
run_check() {
  "$@" || true
}

#######################################
# Test whether a container with exactly this name is listed by `docker ps`.
# Arguments: $1 - container name
# Returns:   0 if listed, non-zero otherwise (including docker failures)
#######################################
container_is_running() {
  local container_name=$1
  # -F/-x: literal, whole-line match so "roa-backend" does not match
  # "roa-backend-2" and regex metacharacters in names are harmless.
  docker ps --format '{{.Names}}' 2>/dev/null | grep -Fxq -- "$container_name"
}

#######################################
# Report whether a service's container is running.
# Arguments: $1 - display name, $2 - container name
# Returns:   0 if running; 1 otherwise (an ERROR issue is recorded)
#######################################
check_service_running() {
  local service_name=$1
  local container_name=$2

  if container_is_running "$container_name"; then
    echo -e "$(status_icon "healthy") ${GREEN}$service_name is running${NC}"
    return 0
  else
    echo -e "$(status_icon "error") ${RED}$service_name is not running${NC}"
    add_issue "ERROR" "$service_name" "Container not running"
    return 1
  fi
}

#######################################
# HTTP health check: request a URL and compare the status code.
# Arguments: $1 - display name, $2 - URL,
#            $3 - expected status (default 200),
#            $4 - timeout in seconds (default 10)
# Returns:   0 on match; 1 otherwise (an ERROR issue is recorded)
#######################################
http_health_check() {
  local service_name=$1
  local url=$2
  local expected_status=${3:-200}
  local timeout=${4:-10}
  local status_code

  # -o /dev/null keeps the response body out of the captured value, so the
  # variable holds only the code written by -w. curl reports 000 when no
  # HTTP response was received; the fallback covers curl being unavailable.
  status_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --max-time "$timeout" "$url" 2>/dev/null) || true
  if [[ -z "$status_code" ]]; then
    status_code="000"
  fi

  if [[ "$status_code" == "$expected_status" ]]; then
    echo -e "$(status_icon "healthy") ${GREEN}$service_name HTTP health check passed ($status_code)${NC}"
    return 0
  else
    echo -e "$(status_icon "error") ${RED}$service_name HTTP health check failed ($status_code)${NC}"
    add_issue "ERROR" "$service_name" "HTTP health check failed with status $status_code"
    return 1
  fi
}

#######################################
# Report the Docker-level health status of a container.
# Arguments: $1 - display name, $2 - container name
# Returns:   0 for healthy or no healthcheck; 1 otherwise
#######################################
docker_health_check() {
  local service_name=$1
  local container_name=$2
  local health_status

  # The template yields "no-healthcheck" explicitly when the container has
  # no Health section instead of relying on a template error. A failing
  # `docker inspect` (e.g. unknown container) maps to the same value.
  health_status=$(docker inspect \
    --format='{{if .State.Health}}{{.State.Health.Status}}{{else}}no-healthcheck{{end}}' \
    "$container_name" 2>/dev/null) || health_status="no-healthcheck"

  case "$health_status" in
    "healthy")
      echo -e "$(status_icon "healthy") ${GREEN}$service_name Docker health check: healthy${NC}"
      return 0
      ;;
    "unhealthy")
      echo -e "$(status_icon "error") ${RED}$service_name Docker health check: unhealthy${NC}"
      add_issue "ERROR" "$service_name" "Docker health check reports unhealthy"
      return 1
      ;;
    "starting")
      echo -e "$(status_icon "warning") ${YELLOW}$service_name Docker health check: starting${NC}"
      add_issue "WARNING" "$service_name" "Docker health check still starting"
      return 1
      ;;
    "no-healthcheck")
      echo -e "$(status_icon "info") ${CYAN}$service_name: No Docker health check configured${NC}"
      return 0
      ;;
    *)
      echo -e "$(status_icon "error") ${RED}$service_name Docker health check: unknown status ($health_status)${NC}"
      add_issue "ERROR" "$service_name" "Unknown Docker health check status: $health_status"
      return 1
      ;;
  esac
}

#######################################
# Print CPU/memory usage of a container and warn above 80%.
# Arguments: $1 - display name, $2 - container name
# Returns:   1 if the container is not running, 0 otherwise
#######################################
check_container_resources() {
  local service_name=$1
  local container_name=$2

  if ! container_is_running "$container_name"; then
    return 1
  fi

  local stats
  # Fields are '|'-delimited because MemUsage itself contains spaces
  # ("12.3MiB / 1.9GiB"), which makes whitespace splitting unreliable.
  stats=$(docker stats --no-stream \
    --format '{{.CPUPerc}}|{{.MemUsage}}|{{.MemPerc}}' \
    "$container_name" 2>/dev/null | tail -n1) || true

  if [[ -n "$stats" ]]; then
    local cpu_percent mem_usage mem_percent
    IFS='|' read -r cpu_percent mem_usage mem_percent <<<"$stats" || true
    cpu_percent=${cpu_percent%\%}
    mem_percent=${mem_percent%\%}

    echo -e "$(status_icon "info") ${CYAN}$service_name Resources: CPU ${cpu_percent}%, Memory ${mem_usage} (${mem_percent}%)${NC}"

    # Check for resource warnings
    if float_gt "$cpu_percent" 80; then
      add_issue "WARNING" "$service_name" "High CPU usage: ${cpu_percent}%"
    fi
    if float_gt "$mem_percent" 80; then
      add_issue "WARNING" "$service_name" "High memory usage: ${mem_percent}%"
    fi
  fi
  return 0
}

#######################################
# Scan the last 5 minutes of container logs for error-like lines.
# Arguments: $1 - display name, $2 - container name
# Returns:   1 if the container is not running, 0 otherwise
#######################################
check_container_logs() {
  local service_name=$1
  local container_name=$2
  local error_pattern='error|exception|failed|fatal'

  if ! container_is_running "$container_name"; then
    return 1
  fi

  # Fetch the logs once so the count and the displayed lines come from the
  # same snapshot.
  local recent_logs
  recent_logs=$(docker logs --since="5m" "$container_name" 2>&1) || true

  local error_count
  # grep -c exits 1 when the count is 0; that must not trip errexit.
  error_count=$(grep -ciE -- "$error_pattern" <<<"$recent_logs" || true)

  if [[ "$error_count" -gt 0 ]]; then
    echo -e "$(status_icon "warning") ${YELLOW}$service_name: $error_count errors in last 5 minutes${NC}"
    add_issue "WARNING" "$service_name" "$error_count errors found in recent logs"

    # Show recent errors
    echo -e "${YELLOW}Recent errors:${NC}"
    grep -iE -- "$error_pattern" <<<"$recent_logs" | tail -3 | sed 's/^/ /'
  else
    echo -e "$(status_icon "healthy") ${GREEN}$service_name: No recent errors in logs${NC}"
  fi
  return 0
}

#######################################
# Report root filesystem usage (>90% error, >80% warning) and Docker's
# own space usage when available.
#######################################
check_disk_space() {
  section_header "DISK SPACE CHECK"

  local disk_usage
  # -P forces the POSIX single-line-per-filesystem layout, so field 5 of
  # line 2 is the capacity column even for long device names.
  disk_usage=$(df -P / 2>/dev/null | awk 'NR==2 { sub(/%/, "", $5); print $5 }') || true

  if [[ ! "$disk_usage" =~ ^[0-9]+$ ]]; then
    echo -e "$(status_icon "warning") ${YELLOW}Unable to determine root filesystem usage${NC}"
    add_issue "WARNING" "System" "Unable to determine root filesystem usage"
  else
    echo -e "$(status_icon "info") ${CYAN}Root filesystem usage: ${disk_usage}%${NC}"

    if [[ "$disk_usage" -gt 90 ]]; then
      echo -e "$(status_icon "error") ${RED}Critical: Disk space usage is ${disk_usage}%${NC}"
      add_issue "ERROR" "System" "Critical disk space usage: ${disk_usage}%"
    elif [[ "$disk_usage" -gt 80 ]]; then
      echo -e "$(status_icon "warning") ${YELLOW}Warning: Disk space usage is ${disk_usage}%${NC}"
      add_issue "WARNING" "System" "High disk space usage: ${disk_usage}%"
    else
      echo -e "$(status_icon "healthy") ${GREEN}Disk space usage is acceptable${NC}"
    fi
  fi

  # Check Docker space
  local docker_space
  docker_space=$(docker system df --format "table {{.Type}}\t{{.Total}}\t{{.Active}}\t{{.Size}}\t{{.Reclaimable}}" 2>/dev/null || echo "Docker space info unavailable")

  if [[ "$docker_space" != "Docker space info unavailable" ]]; then
    echo ""
    echo -e "${CYAN}Docker space usage:${NC}"
    echo "$docker_space"
  fi
}

#######################################
# Check the Docker network, external reachability and DNS resolution.
#######################################
check_network() {
  section_header "NETWORK CONNECTIVITY CHECK"

  # Check if Docker network exists (substring match on the listing)
  if docker network ls | grep -q "roa-network"; then
    echo -e "$(status_icon "healthy") ${GREEN}Docker network 'roa-network' exists${NC}"
  else
    echo -e "$(status_icon "error") ${RED}Docker network 'roa-network' not found${NC}"
    add_issue "ERROR" "Network" "Docker network 'roa-network' not found"
  fi

  # Check external connectivity
  if ping -c 1 8.8.8.8 &> /dev/null; then
    echo -e "$(status_icon "healthy") ${GREEN}External network connectivity: OK${NC}"
  else
    echo -e "$(status_icon "warning") ${YELLOW}External network connectivity: Limited${NC}"
    add_issue "WARNING" "Network" "Limited external network connectivity"
  fi

  # Check DNS resolution
  if nslookup google.com &> /dev/null; then
    echo -e "$(status_icon "healthy") ${GREEN}DNS resolution: OK${NC}"
  else
    echo -e "$(status_icon "warning") ${YELLOW}DNS resolution: Issues detected${NC}"
    add_issue "WARNING" "Network" "DNS resolution issues detected"
  fi
}

#######################################
# Check the SSH tunnel (when ORACLE_HOST is localhost) and the Oracle
# connection (when sqlplus and credentials are available).
# Globals:   PROJECT_DIR (read); ORACLE_HOST, ORACLE_PORT, ORACLE_SID,
#            ORACLE_USER, ORACLE_PASSWORD (read, loaded from .env or
#            .env.production when present)
#######################################
check_database() {
  section_header "DATABASE CONNECTIVITY CHECK"

  # Load environment variables
  if [[ -f "$PROJECT_DIR/.env" ]]; then
    set -a
    # shellcheck source=/dev/null
    source "$PROJECT_DIR/.env"
    set +a
  elif [[ -f "$PROJECT_DIR/.env.production" ]]; then
    set -a
    # shellcheck source=/dev/null
    source "$PROJECT_DIR/.env.production"
    set +a
  fi

  # Check SSH tunnel if needed
  if [[ "$ORACLE_HOST" == "localhost" && -f "$PROJECT_DIR/ssh-tunnel.sh" ]]; then
    local tunnel_status
    tunnel_status=$("$PROJECT_DIR/ssh-tunnel.sh" status 2>/dev/null || echo "not running")

    # "not running" also contains "running", so it has to be excluded
    # explicitly or the tunnel would always be reported as up.
    if [[ "$tunnel_status" == *"running"* && "$tunnel_status" != *"not running"* ]]; then
      echo -e "$(status_icon "healthy") ${GREEN}SSH tunnel is running${NC}"
    else
      echo -e "$(status_icon "warning") ${YELLOW}SSH tunnel is not running${NC}"
      add_issue "WARNING" "Database" "SSH tunnel is not running"
    fi
  fi

  # Test Oracle connection (if we can)
  if command -v sqlplus &> /dev/null && [[ -n "$ORACLE_USER" && -n "$ORACLE_PASSWORD" ]]; then
    local connect_string connection_output
    connect_string="${ORACLE_USER}/${ORACLE_PASSWORD}@${ORACLE_HOST}:${ORACLE_PORT:-1521}/${ORACLE_SID}"

    # Start sqlplus with /nolog and send CONNECT on stdin so the password
    # never appears in the process argument list. printf is a shell builtin,
    # so it does not expose the credentials either. Each statement goes on
    # its own line.
    connection_output=$(
      printf '%s\n' \
        "CONNECT ${connect_string}" \
        "SELECT 'OK' FROM DUAL;" \
        "EXIT;" \
        | timeout 10 sqlplus -s /nolog 2>/dev/null
    ) || true

    if grep -Eq '^[[:space:]]*OK[[:space:]]*$' <<<"$connection_output"; then
      echo -e "$(status_icon "healthy") ${GREEN}Oracle database connection: OK${NC}"
    else
      echo -e "$(status_icon "error") ${RED}Oracle database connection: Failed${NC}"
      add_issue "ERROR" "Database" "Cannot connect to Oracle database"
    fi
  else
    echo -e "$(status_icon "info") ${CYAN}Oracle connection test skipped (sqlplus not available or credentials not set)${NC}"
  fi
}

#######################################
# Run every per-service check for backend, frontend, gateway and redis.
# Individual failures are recorded through add_issue and never abort the
# run.
#######################################
check_services() {
  section_header "SERVICES HEALTH CHECK"

  # Backend service
  echo -e "${PURPLE}ROA Backend Service:${NC}"
  run_check check_service_running "Backend" "roa-backend"
  run_check docker_health_check "Backend" "roa-backend"
  run_check http_health_check "Backend API" "http://localhost/api/health"
  run_check check_container_resources "Backend" "roa-backend"
  run_check check_container_logs "Backend" "roa-backend"
  echo ""

  # Frontend service
  echo -e "${PURPLE}ROA Frontend Service:${NC}"
  run_check check_service_running "Frontend" "roa-frontend"
  run_check docker_health_check "Frontend" "roa-frontend"
  run_check http_health_check "Frontend" "http://localhost:3000/health"
  run_check check_container_resources "Frontend" "roa-frontend"
  run_check check_container_logs "Frontend" "roa-frontend"
  echo ""

  # Gateway service
  echo -e "${PURPLE}ROA Gateway Service:${NC}"
  run_check check_service_running "Gateway" "roa-gateway"
  run_check docker_health_check "Gateway" "roa-gateway"
  run_check http_health_check "Gateway" "http://localhost/health"
  run_check check_container_resources "Gateway" "roa-gateway"
  run_check check_container_logs "Gateway" "roa-gateway"
  echo ""

  # Redis service
  echo -e "${PURPLE}ROA Redis Service:${NC}"
  run_check check_service_running "Redis" "roa-redis"
  run_check docker_health_check "Redis" "roa-redis"
  run_check check_container_resources "Redis" "roa-redis"
  run_check check_container_logs "Redis" "roa-redis"
}

#######################################
# Print the summary report and exit.
# Globals:   OVERALL_HEALTH, ISSUES (read)
# Exits:     0 when OVERALL_HEALTH is true, 1 otherwise
#######################################
generate_summary() {
  section_header "HEALTH CHECK SUMMARY"

  if [[ "$OVERALL_HEALTH" == "true" ]]; then
    echo -e "$(status_icon "healthy") ${GREEN}Overall System Health: HEALTHY${NC}"
  else
    echo -e "$(status_icon "error") ${RED}Overall System Health: ISSUES DETECTED${NC}"
  fi

  echo ""
  echo -e "${CYAN}Timestamp: $(date)${NC}"

  local issue
  if [[ ${#ISSUES[@]} -gt 0 ]]; then
    echo ""
    echo -e "${YELLOW}Issues found:${NC}"
    for issue in "${ISSUES[@]}"; do
      echo " $issue"
    done
  else
    echo ""
    echo -e "${GREEN}No issues detected${NC}"
  fi

  # Exit with appropriate code
  if [[ "$OVERALL_HEALTH" == "true" ]]; then
    exit 0
  else
    exit 1
  fi
}

#######################################
# Print one "<label>: <icon>" line for watch mode, based on whether the
# given check command succeeds. The check's own output is suppressed.
# Arguments: $1 - label, remaining args - check command and its arguments
#######################################
watch_status_line() {
  local label=$1
  shift

  # A real if/else: with `cmd && ok || fail`, `fail` would also run
  # whenever `ok` itself returned non-zero.
  if "$@" > /dev/null 2>&1; then
    echo -e " ${label}: $(status_icon "healthy")"
  else
    echo -e " ${label}: $(status_icon "error")"
  fi
}

#######################################
# Watch mode - continuous monitoring. Refreshes every 30 seconds until
# interrupted.
# Globals:   OVERALL_HEALTH, ISSUES (reset on every iteration)
#######################################
watch_mode() {
  echo -e "${BLUE}Starting continuous health monitoring...${NC}"
  echo -e "${CYAN}Press Ctrl+C to stop${NC}"
  echo ""

  local issue
  while true; do
    # clear can fail (e.g. TERM unset); that must not end the monitor.
    clear || true
    echo -e "${BLUE}ROA2WEB Health Monitor - $(date)${NC}"

    # Reset status
    OVERALL_HEALTH=true
    ISSUES=()

    # Quick service check
    echo ""
    echo -e "${PURPLE}Service Status:${NC}"
    watch_status_line "Backend" check_service_running "Backend" "roa-backend"
    watch_status_line "Frontend" check_service_running "Frontend" "roa-frontend"
    watch_status_line "Gateway" check_service_running "Gateway" "roa-gateway"
    watch_status_line "Redis" check_service_running "Redis" "roa-redis"

    # Quick HTTP checks
    echo ""
    echo -e "${PURPLE}API Status:${NC}"
    watch_status_line "API" http_health_check "Backend API" "http://localhost/api/health" 200 5
    # NOTE(review): this line is labelled "Frontend" but probes the URL that
    # check_services uses for the Gateway (http://localhost/health), not
    # http://localhost:3000/health. Kept as-is; confirm which is intended.
    watch_status_line "Frontend" http_health_check "Frontend" "http://localhost/health" 200 5

    if [[ ${#ISSUES[@]} -gt 0 ]]; then
      echo ""
      echo -e "${YELLOW}Current Issues:${NC}"
      for issue in "${ISSUES[@]}"; do
        echo " $issue"
      done
    fi

    sleep 30
  done
}

#######################################
# Entry point: dispatch on the requested action.
# Arguments: $1 - full|quick|services|network|database|watch (default full)
#######################################
main() {
  local action=${1:-full}

  case "$action" in
    "full")
      echo -e "${BLUE}ROA2WEB Comprehensive Health Check${NC}"
      echo -e "${CYAN}$(date)${NC}"
      check_services
      check_disk_space
      check_network
      check_database
      generate_summary
      ;;
    "quick")
      echo -e "${BLUE}ROA2WEB Quick Health Check${NC}"
      # Reset status
      OVERALL_HEALTH=true
      ISSUES=()
      check_services
      generate_summary
      ;;
    "services")
      check_services
      ;;
    "network")
      check_network
      ;;
    "database")
      check_database
      ;;
    "watch")
      watch_mode
      ;;
    *)
      echo "Usage: $0 {full|quick|services|network|database|watch}"
      echo ""
      echo "Commands:"
      echo " full - Comprehensive health check (default)"
      echo " quick - Quick services health check"
      echo " services - Check only ROA2WEB services"
      echo " network - Check network connectivity"
      echo " database - Check database connectivity"
      echo " watch - Continuous monitoring mode"
      exit 1
      ;;
  esac
}

# Numeric comparisons use awk (see float_gt), so no bc/python3 fallback is
# needed here. The previous `alias bc=...` fallback could not work: aliases
# are not expanded in non-interactive scripts, and never inside functions
# that were already defined.

# Run main function
main "$@"