feat: [US-004] Add SSH tunnel auto-start for Windows services
- Add ssh-tunnel.ps1: Windows SSH tunnel manager (equivalent to ssh-tunnel.sh) - Supports password auth via plink.exe (PuTTY) - Supports ssh_hostkey for non-interactive batch mode - Commands: start, stop, restart, status - Add start-backend-service.ps1: NSSM service wrapper - Starts SSH tunnels before uvicorn - Waits for tunnel ports to be accessible (30s timeout) - Configured by Install-ROA2WEB.ps1 - Add start.ps1: Windows equivalent of start.sh - Orchestrates SSH tunnel + backend + frontend startup - Add backend/shared/ssh_tunnel_manager.py: Python monitoring - Background asyncio task monitors tunnel health every 30s - Auto-restarts tunnels after 2 consecutive failures - Exposes status to /health endpoint - Update ROA2WEB-Console.ps1: - Add Deploy-Scripts function - Update Update-ServiceToUseVenv to use wrapper script - Fix PowerShell reserved variable ($PID -> $tunnelPid) - Fix script path detection (scripts/ vs deployment/windows/scripts/) - Update README.md with ssh_hostkey documentation Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -73,6 +73,7 @@ telegram_bot_task = None
|
||||
ocr_job_worker_running = False
|
||||
cleanup_task_running = False
|
||||
email_cache_running = False
|
||||
ssh_tunnel_monitoring = False
|
||||
|
||||
|
||||
# ============================================================================
|
||||
@@ -265,6 +266,40 @@ async def init_email_server_cache():
|
||||
email_cache_running = False
|
||||
|
||||
|
||||
async def init_ssh_tunnel_monitoring():
    """Attach the SSH tunnel health monitor to the running backend.

    The tunnels themselves are expected to be up already (launched by
    start.sh / start.ps1 / start-backend-service.ps1); this coroutine only
    asks the shared ``ssh_tunnel_manager`` to start monitoring them.

    Side effects:
        Sets the module-level ``ssh_tunnel_monitoring`` flag to the value
        returned by ``start_monitoring()``, or to ``False`` when anything
        raises. Failures are logged as warnings and never propagated, so a
        monitoring problem cannot abort application startup.
    """
    global ssh_tunnel_monitoring

    logger.info("[SSH-MONITOR] Initializing tunnel monitoring...")

    try:
        # Imported lazily so an import problem is handled like any other
        # initialization failure below.
        from backend.shared.ssh_tunnel_manager import ssh_tunnel_manager

        started = await ssh_tunnel_manager.start_monitoring()
        ssh_tunnel_monitoring = started

        if not started:
            logger.warning("[SSH-MONITOR] ⚠️ Failed to start monitoring")
            return

        report = ssh_tunnel_manager.get_status()
        if report["status"] != "not_configured":
            logger.info(f"[SSH-MONITOR] ✅ Monitoring active: {report['status']}")
        else:
            logger.info("[SSH-MONITOR] No tunnels configured (direct connection mode)")

    except Exception as e:
        logger.warning(f"[SSH-MONITOR] ⚠️ Init failed: {e}")
        ssh_tunnel_monitoring = False
|
||||
|
||||
|
||||
async def run_telegram_bot():
|
||||
"""Run Telegram bot as background task."""
|
||||
logger.info("[TELEGRAM] Starting bot...")
|
||||
@@ -381,7 +416,10 @@ async def startup_event():
|
||||
# Step 5: Initialize email-server cache for multi-Oracle (US-003)
|
||||
await init_email_server_cache()
|
||||
|
||||
# Step 6: Start Telegram bot as background task
|
||||
# Step 6: Initialize SSH tunnel monitoring (auto-reconnect)
|
||||
await init_ssh_tunnel_monitoring()
|
||||
|
||||
# Step 7: Start Telegram bot as background task
|
||||
if settings.telegram_bot_token:
|
||||
telegram_bot_task = asyncio.create_task(run_telegram_bot())
|
||||
logger.info("[STARTUP] ✅ Telegram bot task created")
|
||||
@@ -401,13 +439,24 @@ async def startup_event():
|
||||
@app.on_event("shutdown")
|
||||
async def shutdown_event():
|
||||
"""Application shutdown - Cleanup resources."""
|
||||
global telegram_bot_task, ocr_job_worker_running, cleanup_task_running, email_cache_running
|
||||
global telegram_bot_task, ocr_job_worker_running, cleanup_task_running, email_cache_running, ssh_tunnel_monitoring
|
||||
|
||||
logger.info("=" * 80)
|
||||
logger.info("[SHUTDOWN] Stopping ROA2WEB Unified Backend...")
|
||||
logger.info("=" * 80)
|
||||
|
||||
try:
|
||||
# Stop SSH tunnel monitoring
|
||||
if ssh_tunnel_monitoring:
|
||||
logger.info("[SHUTDOWN] Stopping SSH tunnel monitoring...")
|
||||
try:
|
||||
from backend.shared.ssh_tunnel_manager import ssh_tunnel_manager
|
||||
await ssh_tunnel_manager.stop_monitoring()
|
||||
ssh_tunnel_monitoring = False
|
||||
logger.info("[SHUTDOWN] SSH tunnel monitoring stopped")
|
||||
except Exception as e:
|
||||
logger.error(f"[SHUTDOWN] SSH tunnel monitoring error: {e}")
|
||||
|
||||
# Stop email cache auto-refresh (US-003)
|
||||
if email_cache_running:
|
||||
logger.info("[SHUTDOWN] Stopping email cache auto-refresh...")
|
||||
@@ -609,6 +658,14 @@ async def health_check():
|
||||
except Exception as e:
|
||||
health_status["modules"]["ocr_worker"] = f"error: {str(e)}"
|
||||
|
||||
# Check SSH tunnels
|
||||
global ssh_tunnel_monitoring
|
||||
try:
|
||||
from backend.shared.ssh_tunnel_manager import ssh_tunnel_manager
|
||||
health_status["modules"]["ssh_tunnels"] = ssh_tunnel_manager.get_status()
|
||||
except Exception as e:
|
||||
health_status["modules"]["ssh_tunnels"] = f"error: {str(e)}"
|
||||
|
||||
return health_status
|
||||
|
||||
|
||||
|
||||
350
backend/shared/ssh_tunnel_manager.py
Normal file
350
backend/shared/ssh_tunnel_manager.py
Normal file
@@ -0,0 +1,350 @@
|
||||
"""
|
||||
SSH Tunnel Manager - Cross-Platform Monitoring and Auto-Reconnect
|
||||
|
||||
This module provides MONITORING and AUTO-RECONNECT for SSH tunnels.
|
||||
It does NOT start tunnels - that's the responsibility of:
|
||||
- Linux: start.sh → ssh-tunnel.sh
|
||||
- Windows: start.ps1 → ssh-tunnel.ps1
|
||||
- Windows Service: start-backend-service.ps1 → ssh-tunnel.ps1
|
||||
|
||||
Responsibilities:
|
||||
✅ Monitor tunnel health via port checks (background asyncio task)
|
||||
✅ Auto-restart tunnels if they go down (calls platform-specific scripts)
|
||||
✅ Expose status for /health endpoint
|
||||
|
||||
NOT responsible for:
|
||||
❌ Initial tunnel startup (done by wrapper scripts before backend starts)
|
||||
|
||||
Usage in main.py:
|
||||
from backend.shared.ssh_tunnel_manager import ssh_tunnel_manager
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup():
|
||||
await ssh_tunnel_manager.start_monitoring()
|
||||
|
||||
@app.on_event("shutdown")
|
||||
async def shutdown():
|
||||
await ssh_tunnel_manager.stop_monitoring()
|
||||
|
||||
@app.get("/health")
|
||||
async def health():
|
||||
return {
|
||||
"ssh_tunnels": ssh_tunnel_manager.get_status()
|
||||
}
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import platform
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SSHTunnelManager:
    """
    Cross-platform SSH tunnel MONITOR (not starter).

    A background asyncio task probes the local port of every configured
    tunnel. When any tunnel fails ``max_failures_before_restart`` checks in
    a row, the platform-specific tunnel script is invoked with ``restart``
    (at most once per ``restart_cooldown`` seconds).

    Timeline:
        T=0     start.sh / wrapper starts
        T=1s    ssh-tunnel.sh / ssh-tunnel.ps1 START
        T=3s    Tunnels active
        T=5s    uvicorn backend starts
        T=7s    Backend startup_event()
        T=8s    ssh_tunnel_manager.start_monitoring()
                └─ Detects tunnels already active (just monitors, doesn't start)
        T=38s   Monitor check #1 - OK
        ...
        T=XXs   [Tunnel drops]
        T=XX+30 Monitor detects FAIL (1/2)
        T=XX+60 Monitor detects FAIL (2/2) → RESTART via script
    """

    def __init__(self):
        # Configuration
        self.check_interval: int = 30  # seconds between health checks
        self.max_failures_before_restart: int = 2  # restart after N consecutive failures
        self.restart_cooldown: int = 60  # minimum seconds between restarts

        # State
        self.tunnel_configs: List[Dict] = []  # entries from ssh-tunnels.json that have ssh_host
        self.tunnel_status: Dict[str, bool] = {}  # tunnel id -> result of the latest port check
        self.consecutive_failures: Dict[str, int] = {}  # tunnel id -> failed checks in a row
        self.last_restart_time: float = 0  # time.time() of the last restart attempt
        self.monitor_task: Optional[asyncio.Task] = None
        self._is_monitoring: bool = False

        # Paths (detected at runtime)
        self._project_root: Optional[Path] = None
        self._config_file: Optional[Path] = None

    def _detect_paths(self) -> bool:
        """Derive project paths from this file's location.

        Returns:
            True when ``<project_root>/backend/ssh-tunnels.json`` exists.
        """
        # This file is at: backend/shared/ssh_tunnel_manager.py, so the
        # project root is three parents up (shared -> backend -> root).
        current_file = Path(__file__)
        self._project_root = current_file.parent.parent.parent

        # Config file location
        self._config_file = self._project_root / "backend" / "ssh-tunnels.json"

        return self._config_file.exists()

    def _load_config(self) -> List[Dict]:
        """Load tunnel configuration from ssh-tunnels.json.

        Returns:
            The list entries that define a truthy ``ssh_host``; entries
            without one are treated as direct connections and dropped.
            An empty list is returned when the file is missing, unreadable,
            not valid JSON, or not a JSON list.
        """
        if not self._config_file or not self._config_file.exists():
            return []

        try:
            # "utf-8-sig" decodes plain UTF-8 and also strips a leading BOM,
            # which json.load() would otherwise reject. Being explicit also
            # avoids the locale-dependent default encoding on Windows.
            with open(self._config_file, "r", encoding="utf-8-sig") as f:
                tunnels = json.load(f)
        except Exception as e:
            logger.error(f"[SSH-MONITOR] Failed to load config: {e}")
            return []

        if not isinstance(tunnels, list):
            logger.error(
                "[SSH-MONITOR] Failed to load config: "
                f"expected a JSON list, got {type(tunnels).__name__}"
            )
            return []

        # Filter to only tunnels with ssh_host (excludes direct connections).
        # Non-dict entries are skipped rather than invalidating the whole file.
        return [t for t in tunnels if isinstance(t, dict) and t.get("ssh_host")]

    async def start_monitoring(self) -> bool:
        """
        Start monitoring EXISTING tunnels.

        Does NOT start tunnels - assumes they're already running
        (started by start.sh / start.ps1 / start-backend-service.ps1).

        Returns:
            True in every non-raising path: monitoring already active,
            nothing to monitor (no config file or no SSH tunnels in it), or
            the background monitor task was created.
        """
        if self._is_monitoring:
            logger.warning("[SSH-MONITOR] Already monitoring")
            return True

        # Detect paths and load config
        if not self._detect_paths():
            logger.info("[SSH-MONITOR] No ssh-tunnels.json found, skipping")
            return True

        self.tunnel_configs = self._load_config()

        if not self.tunnel_configs:
            logger.info("[SSH-MONITOR] No SSH tunnels configured (or all are direct connections)")
            return True

        # Check initial status (tunnels should already be running)
        logger.info(f"[SSH-MONITOR] Checking {len(self.tunnel_configs)} tunnel(s)...")

        for config in self.tunnel_configs:
            tunnel_id = config.get("id", "default")
            port = config.get("local_port", 1521)
            name = config.get("name", tunnel_id)

            is_active = await self._check_port("127.0.0.1", port)
            self.tunnel_status[tunnel_id] = is_active
            # An inactive tunnel at startup is only reported here; failure
            # counting begins with the first periodic check.
            self.consecutive_failures[tunnel_id] = 0

            status = "✅ active" if is_active else "❌ NOT active"
            logger.info(f"[SSH-MONITOR] [{tunnel_id}] {name} - localhost:{port} - {status}")

        # Start background monitor loop
        self._is_monitoring = True
        self.monitor_task = asyncio.create_task(self._monitor_loop())
        logger.info(f"[SSH-MONITOR] ✅ Monitoring started (check every {self.check_interval}s)")

        return True

    async def stop_monitoring(self) -> None:
        """Stop the monitoring background task (no-op when not monitoring)."""
        if not self._is_monitoring:
            return

        # Clear the flag first so the loop also exits on its own if it is
        # between awaits when the cancellation is requested.
        self._is_monitoring = False

        if self.monitor_task and not self.monitor_task.done():
            self.monitor_task.cancel()
            try:
                await self.monitor_task
            except asyncio.CancelledError:
                pass

        logger.info("[SSH-MONITOR] ✅ Monitoring stopped")

    async def _monitor_loop(self) -> None:
        """Background loop: check tunnel health every N seconds, restart if needed."""
        while self._is_monitoring:
            try:
                await asyncio.sleep(self.check_interval)

                # stop_monitoring() may have been called while sleeping.
                if not self._is_monitoring:
                    break

                needs_restart = False

                for config in self.tunnel_configs:
                    tunnel_id = config.get("id", "default")
                    port = config.get("local_port", 1521)

                    is_healthy = await self._check_port("127.0.0.1", port)
                    self.tunnel_status[tunnel_id] = is_healthy

                    if is_healthy:
                        # Reset failure count on success
                        if self.consecutive_failures.get(tunnel_id, 0) > 0:
                            logger.info(f"[SSH-MONITOR] [{tunnel_id}] Recovered ✅")
                        self.consecutive_failures[tunnel_id] = 0
                    else:
                        # Increment failure count
                        failures = self.consecutive_failures.get(tunnel_id, 0) + 1
                        self.consecutive_failures[tunnel_id] = failures

                        logger.warning(
                            f"[SSH-MONITOR] [{tunnel_id}] FAIL "
                            f"({failures}/{self.max_failures_before_restart})"
                        )

                        if failures >= self.max_failures_before_restart:
                            needs_restart = True

                # Restart all tunnels if any failed enough times
                if needs_restart:
                    await self._restart_tunnels()

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"[SSH-MONITOR] Monitor loop error: {e}")
                await asyncio.sleep(5)  # Brief pause before retrying

    async def _check_port(self, host: str, port: int, timeout: float = 3.0) -> bool:
        """Check if a port is accessible (tunnel is working).

        Args:
            host: Address to connect to.
            port: TCP port to connect to.
            timeout: Seconds allowed for the connection attempt (and,
                separately, for the subsequent close).

        Returns:
            True when a TCP connection could be opened, False otherwise.
        """
        try:
            # Use asyncio.open_connection for non-blocking port check
            _reader, writer = await asyncio.wait_for(
                asyncio.open_connection(host, port),
                timeout=timeout,
            )
        except (asyncio.TimeoutError, OSError):
            # ConnectionRefusedError is an OSError subclass.
            return False
        except Exception as e:
            logger.debug(f"[SSH-MONITOR] Port check error {host}:{port}: {e}")
            return False

        # The connect succeeded, so the port is reachable. Closing is
        # best-effort: an error or stall while tearing the probe connection
        # down must not turn a successful check into a reported failure.
        writer.close()
        try:
            await asyncio.wait_for(writer.wait_closed(), timeout=timeout)
        except Exception as e:
            logger.debug(f"[SSH-MONITOR] Port close error {host}:{port}: {e}")
        return True

    async def _restart_tunnels(self) -> bool:
        """Restart tunnels via platform-specific script.

        Returns:
            True when the script exited with code 0; False when the cooldown
            is still active, the script is missing, it exited non-zero, it
            timed out, or launching it raised.
        """
        import time

        # Check cooldown
        now = time.time()
        if now - self.last_restart_time < self.restart_cooldown:
            remaining = int(self.restart_cooldown - (now - self.last_restart_time))
            logger.warning(f"[SSH-MONITOR] Restart cooldown active ({remaining}s remaining)")
            return False

        # Recorded before the attempt so failed attempts are rate-limited too.
        self.last_restart_time = now
        logger.warning("[SSH-MONITOR] 🔄 Restarting tunnels...")

        # Build platform-specific command
        if platform.system() == "Windows":
            # On Windows, scripts are deployed to scripts/ folder
            script_path = self._project_root / "scripts" / "ssh-tunnel.ps1"
            # Fallback to development path if not found
            if not script_path.exists():
                script_path = self._project_root / "deployment" / "windows" / "scripts" / "ssh-tunnel.ps1"
            if not script_path.exists():
                logger.error("[SSH-MONITOR] Script not found in scripts/ or deployment/windows/scripts/")
                return False
            cmd = [
                "powershell.exe",
                "-ExecutionPolicy", "Bypass",
                "-File", str(script_path),
                "restart",
            ]
        else:
            script_path = self._project_root / "ssh-tunnel.sh"
            if not script_path.exists():
                logger.error(f"[SSH-MONITOR] Script not found: {script_path}")
                return False
            cmd = [str(script_path), "restart"]

        try:
            # subprocess.run blocks, so run it in the default executor to
            # keep the event loop responsive. get_running_loop() is the
            # supported way to obtain the loop from inside a coroutine.
            loop = asyncio.get_running_loop()
            result = await loop.run_in_executor(
                None,
                lambda: subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    timeout=60,
                    cwd=str(self._project_root),
                ),
            )

            if result.returncode == 0:
                logger.info("[SSH-MONITOR] ✅ Tunnels restarted successfully")
                # Reset failure counts
                for tunnel_id in self.consecutive_failures:
                    self.consecutive_failures[tunnel_id] = 0
                return True

            logger.error(f"[SSH-MONITOR] Restart failed (code {result.returncode})")
            if result.stderr:
                logger.error(f"[SSH-MONITOR] stderr: {result.stderr[:500]}")
            return False

        except subprocess.TimeoutExpired:
            logger.error("[SSH-MONITOR] Restart command timed out")
            return False
        except Exception as e:
            logger.error(f"[SSH-MONITOR] Restart error: {e}")
            return False

    def get_status(self) -> Dict[str, Any]:
        """
        Get current tunnel status for /health endpoint.

        Returns:
            {
                "status": "connected" | "degraded" | "disconnected" | "not_configured",
                "tunnels": {
                    "tunnel_id": true/false,
                    ...
                },
                "monitoring": true/false
            }

            "connected" means every recorded tunnel passed its latest check,
            "degraded" means only some did, and "disconnected" means none
            did (or no check has been recorded yet).
        """
        if not self.tunnel_configs:
            return {
                "status": "not_configured",
                "tunnels": {},
                "monitoring": False,
            }

        # Determine overall status. all() of an empty dict would be True,
        # hence the explicit emptiness guard.
        all_connected = all(self.tunnel_status.values()) if self.tunnel_status else False
        any_connected = any(self.tunnel_status.values()) if self.tunnel_status else False

        if all_connected:
            status = "connected"
        elif any_connected:
            status = "degraded"
        else:
            status = "disconnected"

        return {
            "status": status,
            # Copy so callers cannot mutate the manager's internal state.
            "tunnels": dict(self.tunnel_status),
            "monitoring": self._is_monitoring,
        }

    def is_healthy(self) -> bool:
        """Quick check if all tunnels are healthy."""
        if not self.tunnel_configs:
            return True  # No tunnels configured = healthy (direct connection)
        return all(self.tunnel_status.values()) if self.tunnel_status else False
|
||||
|
||||
|
||||
# Global singleton instance
|
||||
ssh_tunnel_manager = SSHTunnelManager()
|
||||
Reference in New Issue
Block a user