# roa2web-service-auto/backend/shared/ssh_tunnel_manager.py

"""
SSH Tunnel Manager - Cross-Platform Monitoring and Auto-Reconnect

This module provides MONITORING and AUTO-RECONNECT for SSH tunnels.
It does NOT start tunnels - that's the responsibility of:
- Linux: start.sh → ssh-tunnel.sh
- Windows: Start-ROA2WEB.ps1 → SSH-Tunnels.ps1
- Windows Service: Start-Backend-Service.ps1 → SSH-Tunnels.ps1

Responsibilities:
✅ Monitor tunnel health via port checks (background asyncio task)
✅ Auto-restart tunnels if they go down (calls platform-specific scripts)
✅ Expose status for /health endpoint

NOT responsible for:
❌ Initial tunnel startup (done by wrapper scripts before backend starts)

Usage in main.py:
    from backend.shared.ssh_tunnel_manager import ssh_tunnel_manager

    @app.on_event("startup")
    async def startup():
        await ssh_tunnel_manager.start_monitoring()

    @app.on_event("shutdown")
    async def shutdown():
        await ssh_tunnel_manager.stop_monitoring()

    @app.get("/health")
    async def health():
        return {
            "ssh_tunnels": ssh_tunnel_manager.get_status()
        }
"""

import asyncio
import json
import logging
import platform
import subprocess
from pathlib import Path
from typing import Dict, List, Optional, Any

# Module-level logger named after this module; every message emitted below is
# prefixed with "[SSH-MONITOR]" so tunnel events are easy to filter in logs.
logger = logging.getLogger(__name__)


class SSHTunnelManager:
    """
    Cross-platform SSH tunnel MONITOR (not starter).

    The tunnels are expected to be up before the backend starts. This class
    probes each configured tunnel's local port on a fixed interval and, once
    a tunnel has failed enough consecutive checks, runs the platform-specific
    script with the ``restart`` argument.

    Timeline:
    T=0     start.sh / Wrapper starts
    T=1s    ssh-tunnel.sh / SSH-Tunnels.ps1 START
    T=3s    Tunnels active ✅
    T=5s    uvicorn backend starts
    T=7s    Backend startup_event()
    T=8s    ssh_tunnel_manager.start_monitoring()
            └─ Detects tunnels already active (just monitors, doesn't start)
    T=38s   Monitor check #1 - OK ✅
    ...
    T=XXs   [Tunnel drops]
    T=XX+30 Monitor detects FAIL (1/2)
    T=XX+60 Monitor detects FAIL (2/2) → RESTART via script
    """

    def __init__(self):
        """Initialize in-memory state only; no I/O happens here."""
        # Configuration
        self.check_interval: int = 30  # seconds between health checks
        self.max_failures_before_restart: int = 2  # restart after N consecutive failures
        self.restart_cooldown: int = 60  # minimum seconds between restarts

        # State
        self.tunnel_configs: List[Dict] = []  # entries loaded from ssh-tunnels.json
        self.tunnel_status: Dict[str, bool] = {}  # tunnel id -> last port-check result
        self.consecutive_failures: Dict[str, int] = {}  # tunnel id -> failed checks in a row
        self.last_restart_time: float = 0  # time.time() of the last restart attempt
        self.monitor_task: Optional[asyncio.Task] = None
        self._is_monitoring: bool = False

        # Paths (detected at runtime)
        self._project_root: Optional[Path] = None
        self._config_file: Optional[Path] = None

    @staticmethod
    def _tunnel_id(config: Dict) -> str:
        """Return the tunnel's ``id``, or ``"default"`` when the entry has none."""
        return config.get("id", "default")

    @staticmethod
    def _tunnel_port(config: Dict) -> int:
        """Return the tunnel's ``local_port``, or 1521 when the entry has none."""
        return config.get("local_port", 1521)

    def _detect_paths(self) -> bool:
        """
        Detect project paths based on this file's location.

        Returns:
            True if ``<project root>/backend/ssh-tunnels.json`` exists.
        """
        # This file is at: backend/shared/ssh_tunnel_manager.py, so three
        # .parent hops (file -> shared/ -> backend/ -> root) reach the project
        # root. resolve() makes the root absolute: the restart subprocess is
        # launched with cwd set to the root, and a relative script path would
        # otherwise be looked up relative to that new working directory.
        current_file = Path(__file__).resolve()
        self._project_root = current_file.parent.parent.parent

        # Config file location
        self._config_file = self._project_root / "backend" / "ssh-tunnels.json"

        return self._config_file.exists()

    def _load_config(self) -> List[Dict]:
        """
        Load tunnel configuration from ssh-tunnels.json.

        Returns:
            The entries that define a truthy ``ssh_host``. An empty list is
            returned when the file is missing, unreadable, not valid JSON, or
            not a JSON array of objects (the problem is logged, not raised).
        """
        if not self._config_file or not self._config_file.exists():
            return []

        try:
            # JSON is UTF-8; "utf-8-sig" additionally drops a leading BOM,
            # which json.load() would otherwise reject. The platform default
            # encoding must not be used here (it is not UTF-8 on many Windows
            # installations).
            with open(self._config_file, "r", encoding="utf-8-sig") as f:
                tunnels = json.load(f)
        except Exception as e:
            # Deliberately broad: a broken config file must never prevent the
            # backend from starting; it only disables monitoring.
            logger.error(f"[SSH-MONITOR] Failed to load config: {e}")
            return []

        if not isinstance(tunnels, list) or not all(isinstance(t, dict) for t in tunnels):
            logger.error(
                "[SSH-MONITOR] Failed to load config: expected a JSON array of tunnel objects"
            )
            return []

        # Filter to only tunnels with ssh_host (excludes direct connections)
        return [t for t in tunnels if t.get("ssh_host")]

    async def start_monitoring(self) -> bool:
        """
        Start monitoring EXISTING tunnels.

        Does NOT start tunnels - assumes they're already running
        (started by start.sh / Start-ROA2WEB.ps1 / Start-Backend-Service.ps1).

        Returns:
            Always True; a missing or empty configuration is not an error,
            it simply means no background task is started.
        """
        if self._is_monitoring:
            logger.warning("[SSH-MONITOR] Already monitoring")
            return True

        # Detect paths and load config
        if not self._detect_paths():
            logger.info("[SSH-MONITOR] No ssh-tunnels.json found, skipping")
            return True

        self.tunnel_configs = self._load_config()

        if not self.tunnel_configs:
            logger.info("[SSH-MONITOR] No SSH tunnels configured (or all are direct connections)")
            return True

        # Drop results from a previous monitoring session so tunnels that are
        # no longer configured do not linger in get_status().
        self.tunnel_status.clear()
        self.consecutive_failures.clear()

        # Check initial status (tunnels should already be running)
        logger.info(f"[SSH-MONITOR] Checking {len(self.tunnel_configs)} tunnel(s)...")

        for config in self.tunnel_configs:
            tunnel_id = self._tunnel_id(config)
            port = self._tunnel_port(config)
            name = config.get("name", tunnel_id)

            is_active = await self._check_port("127.0.0.1", port)
            self.tunnel_status[tunnel_id] = is_active
            self.consecutive_failures[tunnel_id] = 0

            status = "✅ active" if is_active else "❌ NOT active"
            logger.info(f"[SSH-MONITOR]   [{tunnel_id}] {name} - localhost:{port} - {status}")

        # Start background monitor loop
        self._is_monitoring = True
        self.monitor_task = asyncio.create_task(self._monitor_loop())
        logger.info(f"[SSH-MONITOR] ✅ Monitoring started (check every {self.check_interval}s)")

        return True

    async def stop_monitoring(self) -> None:
        """Stop the monitoring background task (no-op when not monitoring)."""
        if not self._is_monitoring:
            return

        # Clear the flag first so the loop exits even if cancellation lands
        # between two awaits.
        self._is_monitoring = False

        if self.monitor_task and not self.monitor_task.done():
            self.monitor_task.cancel()
            try:
                await self.monitor_task
            except asyncio.CancelledError:
                pass

        logger.info("[SSH-MONITOR] ✅ Monitoring stopped")

    async def _run_health_checks(self) -> bool:
        """
        Probe every configured tunnel once and update the failure counters.

        Returns:
            True if at least one tunnel has reached
            ``max_failures_before_restart`` consecutive failures.
        """
        needs_restart = False

        for config in self.tunnel_configs:
            tunnel_id = self._tunnel_id(config)

            is_healthy = await self._check_port("127.0.0.1", self._tunnel_port(config))
            self.tunnel_status[tunnel_id] = is_healthy

            if is_healthy:
                # Reset failure count on success
                if self.consecutive_failures.get(tunnel_id, 0) > 0:
                    logger.info(f"[SSH-MONITOR] [{tunnel_id}] Recovered ✅")
                self.consecutive_failures[tunnel_id] = 0
                continue

            # Increment failure count
            failures = self.consecutive_failures.get(tunnel_id, 0) + 1
            self.consecutive_failures[tunnel_id] = failures
            logger.warning(
                f"[SSH-MONITOR] [{tunnel_id}] FAIL "
                f"({failures}/{self.max_failures_before_restart})"
            )

            if failures >= self.max_failures_before_restart:
                needs_restart = True

        return needs_restart

    async def _monitor_loop(self) -> None:
        """Background loop: check tunnel health every N seconds, restart if needed."""
        while self._is_monitoring:
            try:
                await asyncio.sleep(self.check_interval)

                # stop_monitoring() may have been called while sleeping.
                if not self._is_monitoring:
                    break

                # Restart all tunnels if any failed enough times
                if await self._run_health_checks():
                    await self._restart_tunnels()

            except asyncio.CancelledError:
                break
            except Exception as e:
                # Keep the monitor alive on unexpected errors; the traceback
                # is logged so the cause is not lost.
                logger.exception(f"[SSH-MONITOR] Monitor loop error: {e}")
                await asyncio.sleep(5)  # Brief pause before retrying

    async def _check_port(self, host: str, port: int, timeout: float = 3.0) -> bool:
        """
        Check if a port is accessible (tunnel is working).

        Args:
            host: Host to connect to.
            port: TCP port to connect to.
            timeout: Maximum seconds to wait for the connection to open.

        Returns:
            True if a TCP connection could be opened and closed cleanly,
            False on timeout, refusal, or any other error.
        """
        try:
            # Use asyncio.open_connection for non-blocking port check
            _reader, writer = await asyncio.wait_for(
                asyncio.open_connection(host, port),
                timeout=timeout
            )
            writer.close()
            await writer.wait_closed()
            return True
        except (asyncio.TimeoutError, ConnectionRefusedError, OSError):
            return False
        except Exception as e:
            logger.debug(f"[SSH-MONITOR] Port check error {host}:{port}: {e}")
            return False

    def _build_restart_command(self) -> Optional[List[str]]:
        """
        Build the platform-specific restart command.

        Returns:
            The argv list to execute, or None (after logging the reason) when
            the project root is unknown or the script cannot be found.
        """
        root = self._project_root
        if root is None:
            logger.error("[SSH-MONITOR] Project root not detected, cannot locate restart script")
            return None

        if platform.system() == "Windows":
            # NOTE(review): the module docstring names the Windows script
            # "SSH-Tunnels.ps1" while this lookup uses "ssh-tunnel.ps1" -
            # confirm which file name is actually deployed.
            candidates = [
                # On Windows, scripts are deployed to scripts/ folder
                root / "scripts" / "ssh-tunnel.ps1",
                # Fallback to development path if not found
                root / "deployment" / "windows" / "scripts" / "ssh-tunnel.ps1",
            ]
            script_path = next((p for p in candidates if p.exists()), None)
            if script_path is None:
                logger.error("[SSH-MONITOR] Script not found in scripts/ or deployment/windows/scripts/")
                return None
            return [
                "powershell.exe",
                "-ExecutionPolicy", "Bypass",
                "-File", str(script_path),
                "restart"
            ]

        script_path = root / "ssh-tunnel.sh"
        if not script_path.exists():
            logger.error(f"[SSH-MONITOR] Script not found: {script_path}")
            return None
        return [str(script_path), "restart"]

    async def _restart_tunnels(self) -> bool:
        """
        Restart tunnels via platform-specific script.

        Returns:
            True if the script ran and exited with code 0; False when the
            cooldown is still active, the script is missing, it timed out,
            or it failed.
        """
        import time

        # Check cooldown
        now = time.time()
        elapsed = now - self.last_restart_time
        if elapsed < self.restart_cooldown:
            remaining = int(self.restart_cooldown - elapsed)
            logger.warning(f"[SSH-MONITOR] Restart cooldown active ({remaining}s remaining)")
            return False

        # Recorded before the attempt so that failed attempts are also
        # rate-limited by the cooldown.
        self.last_restart_time = now
        logger.warning("[SSH-MONITOR] 🔄 Restarting tunnels...")

        cmd = self._build_restart_command()
        if cmd is None:
            return False

        try:
            # subprocess.run blocks, so it runs in the default executor to
            # keep the event loop responsive.
            loop = asyncio.get_running_loop()
            result = await loop.run_in_executor(
                None,
                lambda: subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    # Undecodable bytes in the script output must not turn a
                    # successful restart into a reported failure.
                    errors="replace",
                    timeout=60,
                    cwd=str(self._project_root)
                )
            )
        except subprocess.TimeoutExpired:
            logger.error("[SSH-MONITOR] Restart command timed out")
            return False
        except Exception as e:
            logger.error(f"[SSH-MONITOR] Restart error: {e}")
            return False

        if result.returncode != 0:
            logger.error(f"[SSH-MONITOR] Restart failed (code {result.returncode})")
            if result.stderr:
                logger.error(f"[SSH-MONITOR] stderr: {result.stderr[:500]}")
            return False

        logger.info("[SSH-MONITOR] ✅ Tunnels restarted successfully")
        # Reset failure counts
        for tunnel_id in self.consecutive_failures:
            self.consecutive_failures[tunnel_id] = 0
        return True

    def get_status(self) -> Dict[str, Any]:
        """
        Get current tunnel status for /health endpoint.

        Returns:
            {
                "status": "connected" | "degraded" | "disconnected" | "not_configured",
                "tunnels": {
                    "tunnel_id": true/false,
                    ...
                },
                "monitoring": true/false
            }
        """
        if not self.tunnel_configs:
            return {
                "status": "not_configured",
                "tunnels": {},
                "monitoring": False
            }

        # Determine overall status. With tunnels configured but no check
        # results yet, the status is "disconnected".
        states = list(self.tunnel_status.values())
        if states and all(states):
            status = "connected"
        elif any(states):
            status = "degraded"
        else:
            status = "disconnected"

        return {
            "status": status,
            "tunnels": dict(self.tunnel_status),  # copy: callers must not mutate our state
            "monitoring": self._is_monitoring
        }

    def is_healthy(self) -> bool:
        """Quick check if all tunnels are healthy."""
        if not self.tunnel_configs:
            return True  # No tunnels configured = healthy (direct connection)
        # No recorded results yet counts as unhealthy.
        return bool(self.tunnel_status) and all(self.tunnel_status.values())

# Global singleton instance, created at import time. The constructor only
# initializes in-memory state; monitoring begins when start_monitoring() is
# awaited (see the usage example in the module docstring).
ssh_tunnel_manager = SSHTunnelManager()