feat: [US-004] Add SSH tunnel auto-start for Windows services

- Add ssh-tunnel.ps1: Windows SSH tunnel manager (equivalent to ssh-tunnel.sh) - Supports password auth via plink.exe (PuTTY) - Supports ssh_hostkey for non-interactive batch mode - Commands: start, stop, restart, status - Add start-backend-service.ps1: NSSM service wrapper - Starts SSH tunnels before uvicorn - Waits for tunnel ports to be accessible (30s timeout) - Configured by Install-ROA2WEB.ps1 - Add start.ps1: Windows equivalent of start.sh - Orchestrates SSH tunnel + backend + frontend startup - Add backend/shared/ssh_tunnel_manager.py: Python monitoring - Background asyncio task monitors tunnel health every 30s - Auto-restarts tunnels after 2 consecutive failures - Exposes status to /health endpoint - Update ROA2WEB-Console.ps1: - Add Deploy-Scripts function - Update Update-ServiceToUseVenv to use wrapper script - Fix PowerShell reserved variable ($PID -> $tunnelPid) - Fix script path detection (scripts/ vs deployment/windows/scripts/) - Update README.md with ssh_hostkey documentation Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-28 19:04:26 +00:00
parent dc1711acd0
commit 6718c956f7
9 changed files with 1766 additions and 26 deletions
--- a/backend/main.py
+++ b/backend/main.py
@@ -73,6 +73,7 @@ telegram_bot_task = None
 ocr_job_worker_running = False
 cleanup_task_running = False
 email_cache_running = False
+ssh_tunnel_monitoring = False


 # ============================================================================
@@ -265,6 +266,40 @@ async def init_email_server_cache():
        email_cache_running = False


+async def init_ssh_tunnel_monitoring():
+    """Initialize SSH tunnel monitoring with auto-reconnect.
+
+    This does NOT start tunnels - they should already be running
+    (started by start.sh / start.ps1 / start-backend-service.ps1).
+
+    Responsibilities:
+    - Monitor tunnel health via port checks (every 30s)
+    - Auto-restart tunnels if they go down
+    - Expose status for /health endpoint
+    """
+    global ssh_tunnel_monitoring
+
+    logger.info("[SSH-MONITOR] Initializing tunnel monitoring...")
+    try:
+        from backend.shared.ssh_tunnel_manager import ssh_tunnel_manager
+
+        success = await ssh_tunnel_manager.start_monitoring()
+        ssh_tunnel_monitoring = success
+
+        if success:
+            status = ssh_tunnel_manager.get_status()
+            if status["status"] == "not_configured":
+                logger.info("[SSH-MONITOR] No tunnels configured (direct connection mode)")
+            else:
+                logger.info(f"[SSH-MONITOR] ✅ Monitoring active: {status['status']}")
+        else:
+            logger.warning("[SSH-MONITOR] ⚠️ Failed to start monitoring")
+
+    except Exception as e:
+        logger.warning(f"[SSH-MONITOR] ⚠️ Init failed: {e}")
+        ssh_tunnel_monitoring = False
+
+
 async def run_telegram_bot():
    """Run Telegram bot as background task."""
    logger.info("[TELEGRAM] Starting bot...")
@@ -381,7 +416,10 @@ async def startup_event():
        # Step 5: Initialize email-server cache for multi-Oracle (US-003)
        await init_email_server_cache()

-        # Step 6: Start Telegram bot as background task
+        # Step 6: Initialize SSH tunnel monitoring (auto-reconnect)
+        await init_ssh_tunnel_monitoring()
+
+        # Step 7: Start Telegram bot as background task
        if settings.telegram_bot_token:
            telegram_bot_task = asyncio.create_task(run_telegram_bot())
            logger.info("[STARTUP] ✅ Telegram bot task created")
@@ -401,13 +439,24 @@ async def startup_event():
@app.on_event("shutdown")
 async def shutdown_event():
    """Application shutdown - Cleanup resources."""
-    global telegram_bot_task, ocr_job_worker_running, cleanup_task_running, email_cache_running
+    global telegram_bot_task, ocr_job_worker_running, cleanup_task_running, email_cache_running, ssh_tunnel_monitoring

    logger.info("=" * 80)
    logger.info("[SHUTDOWN] Stopping ROA2WEB Unified Backend...")
    logger.info("=" * 80)

    try:
+        # Stop SSH tunnel monitoring
+        if ssh_tunnel_monitoring:
+            logger.info("[SHUTDOWN] Stopping SSH tunnel monitoring...")
+            try:
+                from backend.shared.ssh_tunnel_manager import ssh_tunnel_manager
+                await ssh_tunnel_manager.stop_monitoring()
+                ssh_tunnel_monitoring = False
+                logger.info("[SHUTDOWN] SSH tunnel monitoring stopped")
+            except Exception as e:
+                logger.error(f"[SHUTDOWN] SSH tunnel monitoring error: {e}")
+
        # Stop email cache auto-refresh (US-003)
        if email_cache_running:
            logger.info("[SHUTDOWN] Stopping email cache auto-refresh...")
@@ -609,6 +658,14 @@ async def health_check():
    except Exception as e:
        health_status["modules"]["ocr_worker"] = f"error: {str(e)}"

+    # Check SSH tunnels
+    global ssh_tunnel_monitoring
+    try:
+        from backend.shared.ssh_tunnel_manager import ssh_tunnel_manager
+        health_status["modules"]["ssh_tunnels"] = ssh_tunnel_manager.get_status()
+    except Exception as e:
+        health_status["modules"]["ssh_tunnels"] = f"error: {str(e)}"
+
    return health_status


--- a/backend/shared/ssh_tunnel_manager.py
+++ b/backend/shared/ssh_tunnel_manager.py
@@ -0,0 +1,350 @@
+"""
+SSH Tunnel Manager - Cross-Platform Monitoring and Auto-Reconnect
+
+This module provides MONITORING and AUTO-RECONNECT for SSH tunnels.
+It does NOT start tunnels - that's the responsibility of:
+- Linux: start.sh → ssh-tunnel.sh
+- Windows: Start-ROA2WEB.ps1 → SSH-Tunnels.ps1
+- Windows Service: Start-Backend-Service.ps1 → SSH-Tunnels.ps1
+
+Responsibilities:
+✅ Monitor tunnel health via port checks (background asyncio task)
+✅ Auto-restart tunnels if they go down (calls platform-specific scripts)
+✅ Expose status for /health endpoint
+
+NOT responsible for:
+❌ Initial tunnel startup (done by wrapper scripts before backend starts)
+
+Usage in main.py:
+    from backend.shared.ssh_tunnel_manager import ssh_tunnel_manager
+
+    @app.on_event("startup")
+    async def startup():
+        await ssh_tunnel_manager.start_monitoring()
+
+    @app.on_event("shutdown")
+    async def shutdown():
+        await ssh_tunnel_manager.stop_monitoring()
+
+    @app.get("/health")
+    async def health():
+        return {
+            "ssh_tunnels": ssh_tunnel_manager.get_status()
+        }
+"""
+
+import asyncio
+import json
+import logging
+import platform
+import subprocess
+from pathlib import Path
+from typing import Dict, List, Optional, Any
+
+logger = logging.getLogger(__name__)
+
+
+class SSHTunnelManager:
+    """
+    Cross-platform SSH tunnel MONITOR (not starter).
+
+    Timeline:
+    T=0     start.sh / Wrapper starts
+    T=1s    ssh-tunnel.sh / SSH-Tunnels.ps1 START
+    T=3s    Tunnels active ✅
+    T=5s    uvicorn backend starts
+    T=7s    Backend startup_event()
+    T=8s    ssh_tunnel_manager.start_monitoring()
+            └─ Detects tunnels already active (just monitors, doesn't start)
+    T=38s   Monitor check #1 - OK ✅
+    ...
+    T=XXs   [Tunnel drops]
+    T=XX+30 Monitor detects FAIL (1/2)
+    T=XX+60 Monitor detects FAIL (2/2) → RESTART via script
+    """
+
+    def __init__(self):
+        # Configuration
+        self.check_interval: int = 30  # seconds between health checks
+        self.max_failures_before_restart: int = 2  # restart after N consecutive failures
+        self.restart_cooldown: int = 60  # minimum seconds between restarts
+
+        # State
+        self.tunnel_configs: List[Dict] = []
+        self.tunnel_status: Dict[str, bool] = {}
+        self.consecutive_failures: Dict[str, int] = {}
+        self.last_restart_time: float = 0
+        self.monitor_task: Optional[asyncio.Task] = None
+        self._is_monitoring: bool = False
+
+        # Paths (detected at runtime)
+        self._project_root: Optional[Path] = None
+        self._config_file: Optional[Path] = None
+
+    def _detect_paths(self) -> bool:
+        """Detect project paths based on current file location."""
+        # This file is at: backend/shared/ssh_tunnel_manager.py
+        # Project root is 2 levels up
+        current_file = Path(__file__)
+        self._project_root = current_file.parent.parent.parent
+
+        # Config file location
+        self._config_file = self._project_root / "backend" / "ssh-tunnels.json"
+
+        return self._config_file.exists()
+
+    def _load_config(self) -> List[Dict]:
+        """Load tunnel configuration from ssh-tunnels.json."""
+        if not self._config_file or not self._config_file.exists():
+            return []
+
+        try:
+            with open(self._config_file, 'r') as f:
+                tunnels = json.load(f)
+
+            # Filter to only tunnels with ssh_host (excludes direct connections)
+            return [t for t in tunnels if t.get("ssh_host")]
+        except Exception as e:
+            logger.error(f"[SSH-MONITOR] Failed to load config: {e}")
+            return []
+
+    async def start_monitoring(self) -> bool:
+        """
+        Start monitoring EXISTING tunnels.
+
+        Does NOT start tunnels - assumes they're already running
+        (started by start.sh / Start-ROA2WEB.ps1 / Start-Backend-Service.ps1).
+        """
+        if self._is_monitoring:
+            logger.warning("[SSH-MONITOR] Already monitoring")
+            return True
+
+        # Detect paths and load config
+        if not self._detect_paths():
+            logger.info("[SSH-MONITOR] No ssh-tunnels.json found, skipping")
+            return True
+
+        self.tunnel_configs = self._load_config()
+
+        if not self.tunnel_configs:
+            logger.info("[SSH-MONITOR] No SSH tunnels configured (or all are direct connections)")
+            return True
+
+        # Check initial status (tunnels should already be running)
+        logger.info(f"[SSH-MONITOR] Checking {len(self.tunnel_configs)} tunnel(s)...")
+
+        for config in self.tunnel_configs:
+            tunnel_id = config.get("id", "default")
+            port = config.get("local_port", 1521)
+            name = config.get("name", tunnel_id)
+
+            is_active = await self._check_port("127.0.0.1", port)
+            self.tunnel_status[tunnel_id] = is_active
+            self.consecutive_failures[tunnel_id] = 0
+
+            status = "✅ active" if is_active else "❌ NOT active"
+            logger.info(f"[SSH-MONITOR]   [{tunnel_id}] {name} - localhost:{port} - {status}")
+
+        # Start background monitor loop
+        self._is_monitoring = True
+        self.monitor_task = asyncio.create_task(self._monitor_loop())
+        logger.info(f"[SSH-MONITOR] ✅ Monitoring started (check every {self.check_interval}s)")
+
+        return True
+
+    async def stop_monitoring(self) -> None:
+        """Stop the monitoring background task."""
+        if not self._is_monitoring:
+            return
+
+        self._is_monitoring = False
+
+        if self.monitor_task and not self.monitor_task.done():
+            self.monitor_task.cancel()
+            try:
+                await self.monitor_task
+            except asyncio.CancelledError:
+                pass
+
+        logger.info("[SSH-MONITOR] ✅ Monitoring stopped")
+
+    async def _monitor_loop(self) -> None:
+        """Background loop: check tunnel health every N seconds, restart if needed."""
+        while self._is_monitoring:
+            try:
+                await asyncio.sleep(self.check_interval)
+
+                if not self._is_monitoring:
+                    break
+
+                needs_restart = False
+
+                for config in self.tunnel_configs:
+                    tunnel_id = config.get("id", "default")
+                    port = config.get("local_port", 1521)
+
+                    is_healthy = await self._check_port("127.0.0.1", port)
+                    self.tunnel_status[tunnel_id] = is_healthy
+
+                    if is_healthy:
+                        # Reset failure count on success
+                        if self.consecutive_failures.get(tunnel_id, 0) > 0:
+                            logger.info(f"[SSH-MONITOR] [{tunnel_id}] Recovered ✅")
+                        self.consecutive_failures[tunnel_id] = 0
+                    else:
+                        # Increment failure count
+                        self.consecutive_failures[tunnel_id] = \
+                            self.consecutive_failures.get(tunnel_id, 0) + 1
+
+                        failures = self.consecutive_failures[tunnel_id]
+                        logger.warning(
+                            f"[SSH-MONITOR] [{tunnel_id}] FAIL "
+                            f"({failures}/{self.max_failures_before_restart})"
+                        )
+
+                        if failures >= self.max_failures_before_restart:
+                            needs_restart = True
+
+                # Restart all tunnels if any failed enough times
+                if needs_restart:
+                    await self._restart_tunnels()
+
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                logger.error(f"[SSH-MONITOR] Monitor loop error: {e}")
+                await asyncio.sleep(5)  # Brief pause before retrying
+
+    async def _check_port(self, host: str, port: int, timeout: float = 3.0) -> bool:
+        """Check if a port is accessible (tunnel is working)."""
+        try:
+            # Use asyncio.open_connection for non-blocking port check
+            reader, writer = await asyncio.wait_for(
+                asyncio.open_connection(host, port),
+                timeout=timeout
+            )
+            writer.close()
+            await writer.wait_closed()
+            return True
+        except (asyncio.TimeoutError, ConnectionRefusedError, OSError):
+            return False
+        except Exception as e:
+            logger.debug(f"[SSH-MONITOR] Port check error {host}:{port}: {e}")
+            return False
+
+    async def _restart_tunnels(self) -> bool:
+        """Restart tunnels via platform-specific script."""
+        import time
+
+        # Check cooldown
+        now = time.time()
+        if now - self.last_restart_time < self.restart_cooldown:
+            remaining = int(self.restart_cooldown - (now - self.last_restart_time))
+            logger.warning(f"[SSH-MONITOR] Restart cooldown active ({remaining}s remaining)")
+            return False
+
+        self.last_restart_time = now
+        logger.warning("[SSH-MONITOR] 🔄 Restarting tunnels...")
+
+        # Build platform-specific command
+        if platform.system() == "Windows":
+            # On Windows, scripts are deployed to scripts/ folder
+            script_path = self._project_root / "scripts" / "ssh-tunnel.ps1"
+            # Fallback to development path if not found
+            if not script_path.exists():
+                script_path = self._project_root / "deployment" / "windows" / "scripts" / "ssh-tunnel.ps1"
+            if not script_path.exists():
+                logger.error(f"[SSH-MONITOR] Script not found in scripts/ or deployment/windows/scripts/")
+                return False
+            cmd = [
+                "powershell.exe",
+                "-ExecutionPolicy", "Bypass",
+                "-File", str(script_path),
+                "restart"
+            ]
+        else:
+            script_path = self._project_root / "ssh-tunnel.sh"
+            if not script_path.exists():
+                logger.error(f"[SSH-MONITOR] Script not found: {script_path}")
+                return False
+            cmd = [str(script_path), "restart"]
+
+        try:
+            # Run restart command in subprocess
+            result = await asyncio.get_event_loop().run_in_executor(
+                None,
+                lambda: subprocess.run(
+                    cmd,
+                    capture_output=True,
+                    text=True,
+                    timeout=60,
+                    cwd=str(self._project_root)
+                )
+            )
+
+            if result.returncode == 0:
+                logger.info("[SSH-MONITOR] ✅ Tunnels restarted successfully")
+                # Reset failure counts
+                for tunnel_id in self.consecutive_failures:
+                    self.consecutive_failures[tunnel_id] = 0
+                return True
+            else:
+                logger.error(f"[SSH-MONITOR] Restart failed (code {result.returncode})")
+                if result.stderr:
+                    logger.error(f"[SSH-MONITOR] stderr: {result.stderr[:500]}")
+                return False
+
+        except subprocess.TimeoutExpired:
+            logger.error("[SSH-MONITOR] Restart command timed out")
+            return False
+        except Exception as e:
+            logger.error(f"[SSH-MONITOR] Restart error: {e}")
+            return False
+
+    def get_status(self) -> Dict[str, Any]:
+        """
+        Get current tunnel status for /health endpoint.
+
+        Returns:
+            {
+                "status": "connected" | "degraded" | "disconnected" | "not_configured",
+                "tunnels": {
+                    "tunnel_id": true/false,
+                    ...
+                },
+                "monitoring": true/false
+            }
+        """
+        if not self.tunnel_configs:
+            return {
+                "status": "not_configured",
+                "tunnels": {},
+                "monitoring": False
+            }
+
+        # Determine overall status
+        all_connected = all(self.tunnel_status.values()) if self.tunnel_status else False
+        any_connected = any(self.tunnel_status.values()) if self.tunnel_status else False
+
+        if all_connected:
+            status = "connected"
+        elif any_connected:
+            status = "degraded"
+        else:
+            status = "disconnected"
+
+        return {
+            "status": status,
+            "tunnels": dict(self.tunnel_status),
+            "monitoring": self._is_monitoring
+        }
+
+    def is_healthy(self) -> bool:
+        """Quick check if all tunnels are healthy."""
+        if not self.tunnel_configs:
+            return True  # No tunnels configured = healthy (direct connection)
+        return all(self.tunnel_status.values()) if self.tunnel_status else False
+
+
+# Global singleton instance
+ssh_tunnel_manager = SSHTunnelManager()