feat: [US-004] Add SSH tunnel auto-start for Windows services

- Add ssh-tunnel.ps1: Windows SSH tunnel manager (equivalent to ssh-tunnel.sh)
  - Supports password auth via plink.exe (PuTTY)
  - Supports ssh_hostkey for non-interactive batch mode
  - Commands: start, stop, restart, status

- Add start-backend-service.ps1: NSSM service wrapper
  - Starts SSH tunnels before uvicorn
  - Waits for tunnel ports to be accessible (30s timeout)
  - Configured by Install-ROA2WEB.ps1

- Add start.ps1: Windows equivalent of start.sh
  - Orchestrates SSH tunnel + backend + frontend startup

- Add backend/shared/ssh_tunnel_manager.py: Python monitoring
  - Background asyncio task monitors tunnel health every 30s
  - Auto-restarts tunnels after 2 consecutive failures
  - Exposes status to /health endpoint

- Update ROA2WEB-Console.ps1:
  - Add Deploy-Scripts function
  - Update Update-ServiceToUseVenv to use wrapper script

- Fix PowerShell reserved variable ($PID -> $tunnelPid)
- Fix script path detection (scripts/ vs deployment/windows/scripts/)
- Update README.md with ssh_hostkey documentation

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Claude Agent
2026-01-28 19:04:26 +00:00
parent dc1711acd0
commit 6718c956f7
9 changed files with 1766 additions and 26 deletions

View File

@@ -73,6 +73,7 @@ telegram_bot_task = None
ocr_job_worker_running = False
cleanup_task_running = False
email_cache_running = False
ssh_tunnel_monitoring = False
# ============================================================================
@@ -265,6 +266,40 @@ async def init_email_server_cache():
email_cache_running = False
async def init_ssh_tunnel_monitoring():
    """Attach the SSH tunnel monitor (with auto-reconnect) to this process.

    Tunnels are NOT started here; they are expected to be up already,
    launched by start.sh / start.ps1 / start-backend-service.ps1.

    What this step does:
    - asks the shared tunnel manager to begin its periodic port checks
    - records in the module-level ``ssh_tunnel_monitoring`` flag whether
      that succeeded, so shutdown knows if there is anything to stop
    - logs the resulting state

    Any exception (including a failed import of the manager) is logged as a
    warning and leaves the flag False; startup is never aborted by this step.
    """
    global ssh_tunnel_monitoring

    logger.info("[SSH-MONITOR] Initializing tunnel monitoring...")

    try:
        from backend.shared.ssh_tunnel_manager import ssh_tunnel_manager

        started = await ssh_tunnel_manager.start_monitoring()
        ssh_tunnel_monitoring = started

        if not started:
            logger.warning("[SSH-MONITOR] ⚠️ Failed to start monitoring")
            return

        overall_state = ssh_tunnel_manager.get_status()["status"]
        if overall_state == "not_configured":
            logger.info("[SSH-MONITOR] No tunnels configured (direct connection mode)")
        else:
            logger.info(f"[SSH-MONITOR] ✅ Monitoring active: {overall_state}")

    except Exception as e:
        logger.warning(f"[SSH-MONITOR] ⚠️ Init failed: {e}")
        ssh_tunnel_monitoring = False
async def run_telegram_bot():
"""Run Telegram bot as background task."""
logger.info("[TELEGRAM] Starting bot...")
@@ -381,7 +416,10 @@ async def startup_event():
# Step 5: Initialize email-server cache for multi-Oracle (US-003)
await init_email_server_cache()
# Step 6: Start Telegram bot as background task
# Step 6: Initialize SSH tunnel monitoring (auto-reconnect)
await init_ssh_tunnel_monitoring()
# Step 7: Start Telegram bot as background task
if settings.telegram_bot_token:
telegram_bot_task = asyncio.create_task(run_telegram_bot())
logger.info("[STARTUP] ✅ Telegram bot task created")
@@ -401,13 +439,24 @@ async def startup_event():
@app.on_event("shutdown")
async def shutdown_event():
"""Application shutdown - Cleanup resources."""
global telegram_bot_task, ocr_job_worker_running, cleanup_task_running, email_cache_running
global telegram_bot_task, ocr_job_worker_running, cleanup_task_running, email_cache_running, ssh_tunnel_monitoring
logger.info("=" * 80)
logger.info("[SHUTDOWN] Stopping ROA2WEB Unified Backend...")
logger.info("=" * 80)
try:
# Stop SSH tunnel monitoring
if ssh_tunnel_monitoring:
logger.info("[SHUTDOWN] Stopping SSH tunnel monitoring...")
try:
from backend.shared.ssh_tunnel_manager import ssh_tunnel_manager
await ssh_tunnel_manager.stop_monitoring()
ssh_tunnel_monitoring = False
logger.info("[SHUTDOWN] SSH tunnel monitoring stopped")
except Exception as e:
logger.error(f"[SHUTDOWN] SSH tunnel monitoring error: {e}")
# Stop email cache auto-refresh (US-003)
if email_cache_running:
logger.info("[SHUTDOWN] Stopping email cache auto-refresh...")
@@ -609,6 +658,14 @@ async def health_check():
except Exception as e:
health_status["modules"]["ocr_worker"] = f"error: {str(e)}"
# Check SSH tunnels
global ssh_tunnel_monitoring
try:
from backend.shared.ssh_tunnel_manager import ssh_tunnel_manager
health_status["modules"]["ssh_tunnels"] = ssh_tunnel_manager.get_status()
except Exception as e:
health_status["modules"]["ssh_tunnels"] = f"error: {str(e)}"
return health_status

View File

@@ -0,0 +1,350 @@
"""
SSH Tunnel Manager - Cross-Platform Monitoring and Auto-Reconnect
This module provides MONITORING and AUTO-RECONNECT for SSH tunnels.
It does NOT start tunnels - that's the responsibility of:
- Linux: start.sh → ssh-tunnel.sh
- Windows: start.ps1 → ssh-tunnel.ps1
- Windows Service: start-backend-service.ps1 → ssh-tunnel.ps1
Responsibilities:
✅ Monitor tunnel health via port checks (background asyncio task)
✅ Auto-restart tunnels if they go down (calls platform-specific scripts)
✅ Expose status for /health endpoint
NOT responsible for:
❌ Initial tunnel startup (done by wrapper scripts before backend starts)
Usage in main.py:
from backend.shared.ssh_tunnel_manager import ssh_tunnel_manager
@app.on_event("startup")
async def startup():
await ssh_tunnel_manager.start_monitoring()
@app.on_event("shutdown")
async def shutdown():
await ssh_tunnel_manager.stop_monitoring()
@app.get("/health")
async def health():
return {
"ssh_tunnels": ssh_tunnel_manager.get_status()
}
"""
import asyncio
import json
import logging
import platform
import subprocess
from pathlib import Path
from typing import Dict, List, Optional, Any
logger = logging.getLogger(__name__)
class SSHTunnelManager:
    """
    Cross-platform SSH tunnel MONITOR (not starter).

    The manager reads ``backend/ssh-tunnels.json``, periodically probes each
    tunnel's local port, and re-runs the platform restart script when a
    tunnel has failed several checks in a row.

    Timeline:
        T=0     start.sh / wrapper starts
        T=1s    ssh-tunnel.sh / ssh-tunnel.ps1 START
        T=3s    Tunnels active ✅
        T=5s    uvicorn backend starts
        T=7s    Backend startup_event()
        T=8s    ssh_tunnel_manager.start_monitoring()
                └─ Detects tunnels already active (just monitors, doesn't start)
        T=38s   Monitor check #1 - OK ✅
        ...
        T=XXs   [Tunnel drops]
        T=XX+30 Monitor detects FAIL (1/2)
        T=XX+60 Monitor detects FAIL (2/2) → RESTART via script
    """

    def __init__(self):
        # Configuration
        self.check_interval: int = 30  # seconds between health checks
        self.max_failures_before_restart: int = 2  # restart after N consecutive failures
        self.restart_cooldown: int = 60  # minimum seconds between restarts

        # State
        self.tunnel_configs: List[Dict] = []  # entries from ssh-tunnels.json that have ssh_host
        self.tunnel_status: Dict[str, bool] = {}  # tunnel id -> result of last port check
        self.consecutive_failures: Dict[str, int] = {}  # tunnel id -> failed checks in a row
        self.last_restart_time: float = 0  # time.time() of the last restart attempt
        self.monitor_task: Optional[asyncio.Task] = None
        self._is_monitoring: bool = False

        # Paths (detected at runtime)
        self._project_root: Optional[Path] = None
        self._config_file: Optional[Path] = None

    def _detect_paths(self) -> bool:
        """Derive project paths from this file's location.

        Returns:
            True if ``<project root>/backend/ssh-tunnels.json`` exists.
        """
        # This file is at: backend/shared/ssh_tunnel_manager.py, so the
        # project root is the directory two levels above its folder
        # (file -> shared/ -> backend/ -> root).
        current_file = Path(__file__)
        self._project_root = current_file.parent.parent.parent

        # Config file location
        self._config_file = self._project_root / "backend" / "ssh-tunnels.json"

        return self._config_file.exists()

    def _load_config(self) -> List[Dict]:
        """Load tunnel configuration from ssh-tunnels.json.

        Returns:
            The entries that define ``ssh_host``; entries without it are
            treated as direct connections and dropped. An empty list is
            returned when the file is missing, unreadable, or not a JSON list.
        """
        if not self._config_file or not self._config_file.exists():
            return []

        try:
            # utf-8-sig reads plain UTF-8 and also tolerates a leading BOM,
            # which editors/PowerShell on Windows commonly write.
            with open(self._config_file, "r", encoding="utf-8-sig") as f:
                tunnels = json.load(f)
        except Exception as e:
            logger.error(f"[SSH-MONITOR] Failed to load config: {e}")
            return []

        if not isinstance(tunnels, list):
            logger.error(
                "[SSH-MONITOR] Failed to load config: expected a JSON list, "
                f"got {type(tunnels).__name__}"
            )
            return []

        configured: List[Dict] = []
        for entry in tunnels:
            if not isinstance(entry, dict):
                # One malformed entry must not disable monitoring of the rest.
                logger.warning(f"[SSH-MONITOR] Ignoring malformed tunnel entry: {entry!r}")
                continue
            # Filter to only tunnels with ssh_host (excludes direct connections)
            if entry.get("ssh_host"):
                configured.append(entry)
        return configured

    async def start_monitoring(self) -> bool:
        """
        Start monitoring EXISTING tunnels.

        Does NOT start tunnels - assumes they're already running
        (started by start.sh / start.ps1 / start-backend-service.ps1).

        Returns:
            True in every non-raising path: already monitoring, nothing to
            monitor (no config file / no SSH tunnels), or monitor task started.
        """
        if self._is_monitoring:
            logger.warning("[SSH-MONITOR] Already monitoring")
            return True

        # Detect paths and load config
        if not self._detect_paths():
            logger.info("[SSH-MONITOR] No ssh-tunnels.json found, skipping")
            return True

        self.tunnel_configs = self._load_config()

        if not self.tunnel_configs:
            logger.info("[SSH-MONITOR] No SSH tunnels configured (or all are direct connections)")
            return True

        # Check initial status (tunnels should already be running)
        logger.info(f"[SSH-MONITOR] Checking {len(self.tunnel_configs)} tunnel(s)...")

        for config in self.tunnel_configs:
            tunnel_id = config.get("id", "default")
            port = config.get("local_port", 1521)
            name = config.get("name", tunnel_id)

            is_active = await self._check_port("127.0.0.1", port)
            self.tunnel_status[tunnel_id] = is_active
            self.consecutive_failures[tunnel_id] = 0

            status = "✅ active" if is_active else "❌ NOT active"
            logger.info(f"[SSH-MONITOR] [{tunnel_id}] {name} - localhost:{port} - {status}")

        # Start background monitor loop
        self._is_monitoring = True
        self.monitor_task = asyncio.create_task(self._monitor_loop())
        logger.info(f"[SSH-MONITOR] ✅ Monitoring started (check every {self.check_interval}s)")

        return True

    async def stop_monitoring(self) -> None:
        """Stop the monitoring background task (no-op if not monitoring)."""
        if not self._is_monitoring:
            return

        # Clear the flag first so the loop exits even if cancel() lands
        # between two iterations.
        self._is_monitoring = False

        if self.monitor_task and not self.monitor_task.done():
            self.monitor_task.cancel()
            try:
                await self.monitor_task
            except asyncio.CancelledError:
                pass

        logger.info("[SSH-MONITOR] ✅ Monitoring stopped")

    async def _monitor_loop(self) -> None:
        """Background loop: check tunnel health every N seconds, restart if needed."""
        while self._is_monitoring:
            try:
                await asyncio.sleep(self.check_interval)

                if not self._is_monitoring:
                    break

                needs_restart = False

                for config in self.tunnel_configs:
                    tunnel_id = config.get("id", "default")
                    port = config.get("local_port", 1521)

                    is_healthy = await self._check_port("127.0.0.1", port)
                    self.tunnel_status[tunnel_id] = is_healthy

                    if is_healthy:
                        # Reset failure count on success
                        if self.consecutive_failures.get(tunnel_id, 0) > 0:
                            logger.info(f"[SSH-MONITOR] [{tunnel_id}] Recovered ✅")
                        self.consecutive_failures[tunnel_id] = 0
                        continue

                    # Increment failure count
                    failures = self.consecutive_failures.get(tunnel_id, 0) + 1
                    self.consecutive_failures[tunnel_id] = failures

                    logger.warning(
                        f"[SSH-MONITOR] [{tunnel_id}] FAIL "
                        f"({failures}/{self.max_failures_before_restart})"
                    )

                    if failures >= self.max_failures_before_restart:
                        needs_restart = True

                # Restart all tunnels if any failed enough times
                if needs_restart:
                    await self._restart_tunnels()

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"[SSH-MONITOR] Monitor loop error: {e}")
                await asyncio.sleep(5)  # Brief pause before retrying

    async def _check_port(self, host: str, port: int, timeout: float = 3.0) -> bool:
        """Check if a port is accessible (tunnel is working).

        Args:
            host: Address to connect to.
            port: TCP port to probe.
            timeout: Seconds to wait for the connection to be established.

        Returns:
            True if a TCP connection could be opened, False otherwise.
        """
        try:
            # Use asyncio.open_connection for non-blocking port check
            _reader, writer = await asyncio.wait_for(
                asyncio.open_connection(host, port),
                timeout=timeout
            )
        except (asyncio.TimeoutError, ConnectionRefusedError, OSError):
            return False
        except Exception as e:
            logger.debug(f"[SSH-MONITOR] Port check error {host}:{port}: {e}")
            return False

        # The connect succeeded, so the port is reachable. A reset or stall
        # while closing the probe socket must not be reported as "tunnel down".
        writer.close()
        try:
            await asyncio.wait_for(writer.wait_closed(), timeout=timeout)
        except Exception as e:
            logger.debug(f"[SSH-MONITOR] Port check close error {host}:{port}: {e}")
        return True

    async def _restart_tunnels(self) -> bool:
        """Restart tunnels via platform-specific script.

        Honors ``restart_cooldown``: a call made too soon after the previous
        attempt is skipped.

        Returns:
            True if the restart script exited with code 0, False if the
            restart was skipped, the script was not found, or it failed.
        """
        import time

        if self._project_root is None:
            logger.error("[SSH-MONITOR] Project root not detected; cannot locate restart script")
            return False

        # Check cooldown
        now = time.time()
        if now - self.last_restart_time < self.restart_cooldown:
            remaining = int(self.restart_cooldown - (now - self.last_restart_time))
            logger.warning(f"[SSH-MONITOR] Restart cooldown active ({remaining}s remaining)")
            return False

        # Recorded before running the script so failed attempts are
        # rate-limited too.
        self.last_restart_time = now
        logger.warning("[SSH-MONITOR] 🔄 Restarting tunnels...")

        # Build platform-specific command
        if platform.system() == "Windows":
            # On Windows, scripts are deployed to scripts/ folder
            script_path = self._project_root / "scripts" / "ssh-tunnel.ps1"
            # Fallback to development path if not found
            if not script_path.exists():
                script_path = self._project_root / "deployment" / "windows" / "scripts" / "ssh-tunnel.ps1"
            if not script_path.exists():
                logger.error("[SSH-MONITOR] Script not found in scripts/ or deployment/windows/scripts/")
                return False
            cmd = [
                "powershell.exe",
                "-ExecutionPolicy", "Bypass",
                "-File", str(script_path),
                "restart"
            ]
        else:
            script_path = self._project_root / "ssh-tunnel.sh"
            if not script_path.exists():
                logger.error(f"[SSH-MONITOR] Script not found: {script_path}")
                return False
            cmd = [str(script_path), "restart"]

        try:
            # subprocess.run blocks, so run it in the default executor to
            # keep the event loop responsive while the script works.
            loop = asyncio.get_running_loop()
            result = await loop.run_in_executor(
                None,
                lambda: subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    timeout=60,
                    cwd=str(self._project_root)
                )
            )
        except subprocess.TimeoutExpired:
            logger.error("[SSH-MONITOR] Restart command timed out")
            return False
        except Exception as e:
            logger.error(f"[SSH-MONITOR] Restart error: {e}")
            return False

        if result.returncode != 0:
            logger.error(f"[SSH-MONITOR] Restart failed (code {result.returncode})")
            if result.stderr:
                logger.error(f"[SSH-MONITOR] stderr: {result.stderr[:500]}")
            return False

        logger.info("[SSH-MONITOR] ✅ Tunnels restarted successfully")
        # Reset failure counts
        for tunnel_id in self.consecutive_failures:
            self.consecutive_failures[tunnel_id] = 0
        return True

    def get_status(self) -> Dict[str, Any]:
        """
        Get current tunnel status for /health endpoint.

        Returns:
            {
                "status": "connected" | "degraded" | "disconnected" | "not_configured",
                "tunnels": {
                    "tunnel_id": true/false,
                    ...
                },
                "monitoring": true/false
            }
        """
        if not self.tunnel_configs:
            return {
                "status": "not_configured",
                "tunnels": {},
                "monitoring": False
            }

        # Determine overall status; with no recorded checks yet, nothing
        # counts as connected.
        checks = list(self.tunnel_status.values())
        if checks and all(checks):
            status = "connected"
        elif any(checks):
            status = "degraded"
        else:
            status = "disconnected"

        return {
            "status": status,
            "tunnels": dict(self.tunnel_status),
            "monitoring": self._is_monitoring
        }

    def is_healthy(self) -> bool:
        """Quick check if all tunnels are healthy."""
        if not self.tunnel_configs:
            return True  # No tunnels configured = healthy (direct connection)
        return bool(self.tunnel_status) and all(self.tunnel_status.values())
# Global singleton instance: created once at import time and shared by every
# importer (the backend's startup/shutdown hooks and /health endpoint use it).
ssh_tunnel_manager = SSHTunnelManager()