# File: echo-core/src/scheduler.py (535 lines, 18 KiB, Python)
"""
Cron-like job scheduler for Echo-Core.
Wraps APScheduler AsyncIOScheduler to run Claude CLI prompts on a schedule,
sending output to designated Discord channels.
"""
import asyncio
import json
import logging
import os
import re
import subprocess
import tempfile
from datetime import datetime, timezone
from pathlib import Path
from typing import Awaitable, Callable
from zoneinfo import ZoneInfo
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger
from src.claude_session import (
CLAUDE_BIN,
PROJECT_ROOT,
VALID_MODELS,
_safe_env,
build_system_prompt,
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
JOBS_DIR = PROJECT_ROOT / "cron"
JOBS_FILE = JOBS_DIR / "jobs.json"
JOB_TIMEOUT = 300 # 5-minute default per job execution
_NAME_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,62}$")
_MAX_PROMPT_LEN = 10_000
_MAX_SHELL_OUTPUT = 1500 # chars of stdout to forward to channel
_MAX_STDERR_REPORT = 500 # chars of stderr on non-zero exit
_VALID_REPORT_ON = {"always", "changes", "never"}
_VALID_KINDS = {"claude", "shell"}
_SCHEDULER_TZ = ZoneInfo("Europe/Bucharest")
_MARKER_RE = re.compile(r"^GSTACK-CRON:\s+changes=(\d+)\s*$", re.MULTILINE)
# ---------------------------------------------------------------------------
# Scheduler class
# ---------------------------------------------------------------------------
class Scheduler:
    """Wraps APScheduler AsyncIOScheduler for Echo Core cron jobs."""

    def __init__(
        self,
        send_callback: Callable[[str, str], Awaitable[None]] | None = None,
        config=None,
    ) -> None:
        # In-memory job list; mirrored to cron/jobs.json on every mutation.
        self._jobs: list[dict] = []
        # All cron triggers are evaluated in the fixed scheduler timezone.
        self._scheduler = AsyncIOScheduler(timezone=_SCHEDULER_TZ)
        self._config = config
        # Async callback (channel, text) used to forward job output to Discord.
        self._send_callback = send_callback
# ------------------------------------------------------------------
# Public methods
# ------------------------------------------------------------------
async def start(self) -> None:
"""Load jobs from jobs.json, schedule enabled ones, start scheduler."""
self._jobs = self._load_jobs()
for job in self._jobs:
if job.get("enabled", False):
self._schedule_job(job)
self._scheduler.start()
logger.info("Scheduler started with %d jobs (%d enabled)",
len(self._jobs),
sum(1 for j in self._jobs if j.get("enabled")))
async def stop(self) -> None:
"""Shut down APScheduler gracefully."""
self._scheduler.shutdown(wait=False)
logger.info("Scheduler stopped")
def add_job(
self,
name: str,
cron: str,
channel: str,
prompt: str,
model: str = "sonnet",
allowed_tools: list[str] | None = None,
) -> dict:
"""Validate, add job to list, save, and schedule. Returns new job dict."""
# Validate name
if not _NAME_RE.match(name):
raise ValueError(
f"Invalid job name '{name}'. Must match: lowercase alphanumeric "
"and hyphens, 1-63 chars, starting with alphanumeric."
)
# Duplicate check
if any(j["name"] == name for j in self._jobs):
raise ValueError(f"Job '{name}' already exists")
# Validate cron expression
try:
CronTrigger.from_crontab(cron)
except (ValueError, KeyError) as exc:
raise ValueError(f"Invalid cron expression '{cron}': {exc}")
# Validate model
if model not in VALID_MODELS:
raise ValueError(
f"Invalid model '{model}'. Must be one of: {', '.join(sorted(VALID_MODELS))}"
)
# Validate prompt
if not prompt or not prompt.strip():
raise ValueError("Prompt must be non-empty")
if len(prompt) > _MAX_PROMPT_LEN:
raise ValueError(f"Prompt too long ({len(prompt)} chars, max {_MAX_PROMPT_LEN})")
job = {
"name": name,
"cron": cron,
"channel": channel,
"model": model,
"prompt": prompt,
"allowed_tools": allowed_tools or [],
"enabled": True,
"last_run": None,
"last_status": None,
"next_run": None,
}
self._jobs.append(job)
self._schedule_job(job)
self._update_next_run(job)
self._save_jobs()
logger.info("Added job '%s' (cron: %s, channel: %s)", name, cron, channel)
return job
def add_shell_job(
self,
name: str,
cron: str,
channel: str,
command: list[str],
report_on: str = "changes",
timeout: int | None = None,
) -> dict:
"""Validate, add a shell job to list, save, and schedule.
Shell jobs execute an arbitrary command via subprocess. Reporting policy:
- report_on="always": always forward stdout (truncated) on exit 0
- report_on="never": never forward stdout on exit 0
- report_on="changes": parse GSTACK-CRON marker ('changes=N') and
forward stdout only if N>0
Non-zero exit status ALWAYS reports stderr regardless of report_on.
"""
# Validate name
if not _NAME_RE.match(name):
raise ValueError(
f"Invalid job name '{name}'. Must match: lowercase alphanumeric "
"and hyphens, 1-63 chars, starting with alphanumeric."
)
# Duplicate check (across claude+shell)
if any(j["name"] == name for j in self._jobs):
raise ValueError(f"Job '{name}' already exists")
# Validate cron expression
try:
CronTrigger.from_crontab(cron, timezone=_SCHEDULER_TZ)
except (ValueError, KeyError) as exc:
raise ValueError(f"Invalid cron expression '{cron}': {exc}")
# Validate channel
if not isinstance(channel, str) or not channel.strip():
raise ValueError("Channel must be a non-empty string")
# Validate command
if (
not isinstance(command, list)
or not command
or not all(isinstance(c, str) and c.strip() for c in command)
):
raise ValueError(
"Command must be a non-empty list of non-empty strings"
)
# Validate report_on
if report_on not in _VALID_REPORT_ON:
raise ValueError(
f"Invalid report_on '{report_on}'. Must be one of: "
f"{', '.join(sorted(_VALID_REPORT_ON))}"
)
# Validate timeout
if timeout is not None:
if not isinstance(timeout, int) or isinstance(timeout, bool):
raise ValueError("Timeout must be an int (seconds)")
if timeout < 1 or timeout > 3600:
raise ValueError("Timeout must be between 1 and 3600 seconds")
job = {
"name": name,
"kind": "shell",
"cron": cron,
"channel": channel,
"command": list(command),
"report_on": report_on,
"timeout": timeout,
"enabled": True,
"last_run": None,
"last_status": None,
"next_run": None,
}
self._jobs.append(job)
self._schedule_job(job)
self._update_next_run(job)
self._save_jobs()
logger.info(
"Added shell job '%s' (cron: %s, channel: %s, report_on: %s)",
name, cron, channel, report_on,
)
return job
def remove_job(self, name: str) -> bool:
"""Remove job from list and APScheduler. Returns True if found."""
for i, job in enumerate(self._jobs):
if job["name"] == name:
self._jobs.pop(i)
try:
self._scheduler.remove_job(name)
except Exception:
pass
self._save_jobs()
logger.info("Removed job '%s'", name)
return True
return False
def enable_job(self, name: str) -> bool:
"""Enable job and schedule in APScheduler. Returns True if found."""
for job in self._jobs:
if job["name"] == name:
job["enabled"] = True
self._schedule_job(job)
self._update_next_run(job)
self._save_jobs()
logger.info("Enabled job '%s'", name)
return True
return False
def disable_job(self, name: str) -> bool:
"""Disable job and remove from APScheduler. Returns True if found."""
for job in self._jobs:
if job["name"] == name:
job["enabled"] = False
job["next_run"] = None
try:
self._scheduler.remove_job(name)
except Exception:
pass
self._save_jobs()
logger.info("Disabled job '%s'", name)
return True
return False
async def run_job(self, name: str) -> str:
"""Force-execute a job immediately. Returns Claude response text."""
job = self._find_job(name)
if job is None:
raise KeyError(f"Job '{name}' not found")
return await self._execute_job(job)
def list_jobs(self) -> list[dict]:
"""Return a copy of all jobs with current state."""
return [dict(j) for j in self._jobs]
# ------------------------------------------------------------------
# Internal methods
# ------------------------------------------------------------------
def _find_job(self, name: str) -> dict | None:
"""Find a job by name."""
for job in self._jobs:
if job["name"] == name:
return job
return None
def _load_jobs(self) -> list[dict]:
"""Read and parse jobs.json. Returns [] if missing or corrupt."""
try:
text = JOBS_FILE.read_text(encoding="utf-8")
if not text.strip():
return []
data = json.loads(text)
if not isinstance(data, list):
logger.error("jobs.json is not a list, treating as empty")
return []
return data
except FileNotFoundError:
return []
except json.JSONDecodeError as exc:
logger.error("jobs.json corrupt (%s), treating as empty", exc)
return []
def _save_jobs(self) -> None:
"""Atomically write current jobs list to jobs.json."""
JOBS_DIR.mkdir(parents=True, exist_ok=True)
fd, tmp_path = tempfile.mkstemp(
dir=JOBS_DIR, prefix=".jobs_", suffix=".json"
)
try:
with os.fdopen(fd, "w", encoding="utf-8") as f:
json.dump(self._jobs, f, indent=2, ensure_ascii=False)
f.write("\n")
os.replace(tmp_path, JOBS_FILE)
except BaseException:
try:
os.unlink(tmp_path)
except OSError:
pass
raise
def _schedule_job(self, job: dict) -> None:
"""Add a single job to APScheduler."""
# Remove existing schedule if any
try:
self._scheduler.remove_job(job["name"])
except Exception:
pass
trigger = CronTrigger.from_crontab(job["cron"])
self._scheduler.add_job(
self._job_callback,
trigger=trigger,
id=job["name"],
args=[job["name"]],
max_instances=1,
)
async def _job_callback(self, job_name: str) -> None:
"""APScheduler callback — finds job and executes."""
job = self._find_job(job_name)
if job is None:
logger.error("Scheduled callback for unknown job '%s'", job_name)
return
await self._execute_job(job)
async def _execute_job(self, job: dict) -> str:
"""Execute a job: dispatch by kind, update state, forward output."""
name = job["name"]
job["last_run"] = datetime.now(timezone.utc).isoformat()
kind = job.get("kind", "claude")
if kind == "shell":
result_text = await self._execute_shell_job(job)
else:
result_text = await self._execute_claude_job(job)
# Update next_run from APScheduler
self._update_next_run(job)
# Save state
self._save_jobs()
# Send output via callback if we have something to send
if result_text and "HEARTBEAT_OK" not in result_text and self._send_callback:
try:
await self._send_callback(job["channel"], result_text)
except Exception as exc:
logger.error("Job '%s' send_callback failed: %s", name, exc)
elif not result_text:
logger.debug("Job '%s' produced no output, skipping send", name)
return result_text or ""
async def _execute_claude_job(self, job: dict) -> str:
"""Run a Claude CLI job and return the response text (or error marker)."""
name = job["name"]
# Build CLI command
cmd = [
CLAUDE_BIN, "-p", job["prompt"],
"--model", job["model"],
"--output-format", "json",
"--dangerously-skip-permissions",
]
try:
system_prompt = build_system_prompt()
cmd += ["--system-prompt", system_prompt]
except FileNotFoundError:
pass
if job.get("allowed_tools"):
cmd += ["--allowedTools"] + job["allowed_tools"]
result_text = ""
try:
proc = await asyncio.to_thread(
subprocess.run,
cmd,
capture_output=True,
text=True,
timeout=JOB_TIMEOUT,
env=_safe_env(),
cwd=PROJECT_ROOT,
)
if proc.returncode != 0:
error_msg = proc.stderr[:500] if proc.stderr else "unknown error"
raise RuntimeError(
f"Claude CLI error (exit {proc.returncode}): {error_msg}"
)
data = json.loads(proc.stdout)
result_text = data.get("result", "")
job["last_status"] = "ok"
logger.info("Job '%s' completed successfully", name)
except subprocess.TimeoutExpired:
job["last_status"] = "error"
result_text = f"[cron:{name}] Error: timed out after {JOB_TIMEOUT}s"
logger.error("Job '%s' timed out", name)
except (RuntimeError, json.JSONDecodeError) as exc:
job["last_status"] = "error"
result_text = f"[cron:{name}] Error: {exc}"
logger.error("Job '%s' failed: %s", name, exc)
except Exception as exc:
job["last_status"] = "error"
result_text = f"[cron:{name}] Error: {exc}"
logger.error("Job '%s' unexpected error: %s", name, exc)
return result_text
    async def _execute_shell_job(self, job: dict) -> str:
        """Run a shell command job, honour report_on policy, return text to forward.
        Exit != 0 always reports stderr (trimmed). Exit == 0 obeys report_on:
        'always' forwards stdout, 'never' stays silent, 'changes' parses the
        GSTACK-CRON marker ('changes=N') and forwards stdout only if N>0.
        Missing/malformed marker is logged and treated as 'no changes'.
        """
        name = job["name"]
        # Defensive copy so the stored job dict's argv is never mutated.
        cmd = list(job["command"])
        timeout = job.get("timeout")
        # Persisted jobs may carry null/invalid timeouts; fall back to default.
        if not isinstance(timeout, int) or timeout <= 0:
            timeout = JOB_TIMEOUT
        report_on = job.get("report_on", "changes")
        try:
            # Run in a worker thread so the event loop is not blocked.
            proc = await asyncio.to_thread(
                subprocess.run,
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout,
                cwd=PROJECT_ROOT,
                env=_safe_env(),
            )
        except subprocess.TimeoutExpired:
            job["last_status"] = "error"
            logger.error("Shell job '%s' timed out after %ss", name, timeout)
            return f"[cron:{name}] Error: timed out after {timeout}s"
        except Exception as exc:
            # Launch failures (missing binary, permissions, ...) are reported.
            job["last_status"] = "error"
            logger.error("Shell job '%s' failed to launch: %s", name, exc)
            return f"[cron:{name}] Error: {exc}"
        if proc.returncode != 0:
            # Non-zero exit ALWAYS reports stderr, regardless of report_on.
            job["last_status"] = "error"
            stderr_trim = (proc.stderr or "").strip()[:_MAX_STDERR_REPORT]
            logger.error(
                "Shell job '%s' exit %d: %s", name, proc.returncode, stderr_trim,
            )
            return f"[cron:{name}] exit {proc.returncode}: {stderr_trim}"
        job["last_status"] = "ok"
        stdout = proc.stdout or ""
        if report_on == "never":
            logger.info("Shell job '%s' ok (report_on=never, silent)", name)
            return ""
        if report_on == "always":
            logger.info("Shell job '%s' ok (report_on=always)", name)
            return stdout[:_MAX_SHELL_OUTPUT]
        # report_on == "changes"
        match = _MARKER_RE.search(stdout)
        if not match:
            # Missing marker is treated as "no changes": stay silent.
            logger.warning(
                "Shell job '%s' missing GSTACK-CRON marker "
                "(report_on=changes, staying silent)",
                name,
            )
            return ""
        try:
            n = int(match.group(1))
        except ValueError:
            # Defensive: the (\d+) group should always parse as int.
            logger.warning(
                "Shell job '%s' GSTACK-CRON marker has non-int payload", name,
            )
            return ""
        if n <= 0:
            logger.info("Shell job '%s' ok (0 changes, silent)", name)
            return ""
        logger.info("Shell job '%s' ok (%d changes, forwarding)", name, n)
        return stdout[:_MAX_SHELL_OUTPUT]
def _update_next_run(self, job: dict) -> None:
"""Update job's next_run from APScheduler."""
try:
aps_job = self._scheduler.get_job(job["name"])
if aps_job and aps_job.next_run_time:
job["next_run"] = aps_job.next_run_time.isoformat()
else:
job["next_run"] = None
except Exception:
job["next_run"] = None