feat(run): extract detection loop helpers + unconditional cmd drain

Refactor _detection_loop by moving _run_tick, _handle_fsm_result,
_dispatch_command, and _drain_cmd_queue to module scope, passing
dependencies via a RunContext dataclass. This unblocks direct unit
testing of the drain path.

CRITICAL bug fix: the previous loop issued `continue` when the tick
returned res=None (canary paused or similar), which skipped the
drain block. Commands piled up in cmd_queue while detection was
paused — the hang observed on 2026-04-17 after canary drift-pause.
The refactored loop now runs _drain_cmd_queue UNCONDITIONALLY on
every iteration, after _handle_fsm_result, so pause-state never
starves the command channel.

Tests: test_drain_works_when_canary_paused,
test_drain_works_when_out_of_window,
test_drain_isolates_dispatch_exceptions (exception isolation +
audit/warn wiring).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 11:52:28 +03:00
parent 153196f762
commit c5024ce600
2 changed files with 314 additions and 119 deletions

View File

@@ -401,3 +401,139 @@ async def test_lifecycle_idle_armed_primed_autopoll_fire_stop(monkeypatch, tmp_p
start_idx = scheduler_events.index("start:180")
stop_idx = scheduler_events.index("stop")
assert start_idx < stop_idx, "scheduler started after it stopped"
# ---------------------------------------------------------------------------
# Commit 1 regression tests: _drain_cmd_queue MUST run unconditionally,
# even when canary is paused or when detection is otherwise skipped.
# Prior bug: `continue` past the drain loop caused commands to pile up.
# ---------------------------------------------------------------------------
def _make_ctx_for_drain(cmd_queue, dispatched: list):
"""Build a minimal RunContext where _dispatch_command just records calls."""
import atm.main as _main
class _FakeAudit:
def __init__(self): self.events = []
def log(self, e): self.events.append(e)
class _FakeNotifier:
def __init__(self): self.alerts = []
def send(self, a): self.alerts.append(a)
class _FakeCanary:
def __init__(self, paused=True):
self.is_paused = paused
class _FakeScheduler:
is_running = False
interval_s = None
def start(self, s): pass
def stop(self): pass
state = _main._LoopState(start=0.0)
ctx = _main.RunContext(
cfg=MagicMock(),
capture=lambda: None,
canary=_FakeCanary(paused=True),
detector=MagicMock(),
fsm=MagicMock(),
notifier=_FakeNotifier(),
audit=_FakeAudit(),
detection_log=_FakeAudit(),
scheduler=_FakeScheduler(),
samples_dir=Path("."),
fires_dir=Path("."),
cmd_queue=cmd_queue,
state=state,
levels_extractor_factory=lambda *a, **kw: None,
)
return ctx
@pytest.mark.asyncio
async def test_drain_works_when_canary_paused(monkeypatch):
"""Regression: when canary.is_paused, _drain_cmd_queue still dispatches.
Prior bug: detection loop `continue`'d past the drain block whenever the
tick returned res=None (canary paused). Commands accumulated forever.
"""
import atm.main as _main
from atm.commands import Command
q: asyncio.Queue = asyncio.Queue()
await q.put(Command(action="status"))
await q.put(Command(action="ss"))
dispatched: list = []
async def _fake_dispatch(ctx, cmd):
dispatched.append(cmd.action)
monkeypatch.setattr(_main, "_dispatch_command", _fake_dispatch)
ctx = _make_ctx_for_drain(q, dispatched)
await _main._drain_cmd_queue(ctx)
assert dispatched == ["status", "ss"]
assert q.empty()
@pytest.mark.asyncio
async def test_drain_works_when_out_of_window(monkeypatch):
"""Drain must still fire when the tick skipped (e.g. out of operating hours).
The refactored loop runs _drain_cmd_queue unconditionally after every tick,
regardless of `_TickSyncResult` content.
"""
import atm.main as _main
from atm.commands import Command
q: asyncio.Queue = asyncio.Queue()
await q.put(Command(action="stop"))
dispatched: list = []
async def _fake_dispatch(ctx, cmd):
dispatched.append(cmd.action)
monkeypatch.setattr(_main, "_dispatch_command", _fake_dispatch)
ctx = _make_ctx_for_drain(q, dispatched)
# Simulate out-of-window tick (empty _TickSyncResult, no res)
await _main._handle_fsm_result(ctx, _main._TickSyncResult())
await _main._drain_cmd_queue(ctx)
assert dispatched == ["stop"]
@pytest.mark.asyncio
async def test_drain_isolates_dispatch_exceptions(monkeypatch):
"""If one command raises, remaining commands still drain + warn alert sent."""
import atm.main as _main
from atm.commands import Command
q: asyncio.Queue = asyncio.Queue()
await q.put(Command(action="status"))
await q.put(Command(action="ss"))
attempts: list = []
async def _fake_dispatch(ctx, cmd):
attempts.append(cmd.action)
if cmd.action == "status":
raise RuntimeError("boom")
monkeypatch.setattr(_main, "_dispatch_command", _fake_dispatch)
ctx = _make_ctx_for_drain(q, attempts)
await _main._drain_cmd_queue(ctx)
assert attempts == ["status", "ss"]
# warn alert for the failed command
warn_titles = [a.title for a in ctx.notifier.alerts if a.kind == "warn"]
assert any("status" in t for t in warn_titles)
# command_error audit event
errs = [e for e in ctx.audit.events if e.get("event") == "command_error"]
assert len(errs) == 1 and errs[0]["action"] == "status"