feat(voice): Pas 5 — voice/pipeline.py VoiceSession + EchoVoiceSink + cleanup

Central voice pipeline (~250 LOC + docstrings = ~430 lines): VoiceSession (context manager + idempotent cleanup pe 5 căi): - __enter__: acquire _lock, open JSONL (record=on) - __exit__: calls cleanup("exit"), nu suprimă exceptions - cleanup(reason): IDEMPOTENT, side effects o singură dată — JSONL flush+close (record=on) sau delete (record=off), bot presence cleared, voice_client.cleanup(), ttsq.stop(), cancel filler task, lock release, structured log la logs/voice_metrics.jsonl - on_segment_done(speaker_id, text, no_speech_prob): mirror text channel, append JSONL, arm 3s filler timer, route_message cu on_text callback + cancel filler la first block - last_activity_ts: time.monotonic() — caller-driven 5min auto-leave EchoVoiceSink(session, bot_user_id): - wants_opus() False (PCM) - write() runs în voice_recv reader thread (threading primitives only): - GUARD 1: user None/id==0/id==bot_user_id → return (load-bearing echo prevention) - GUARD 2: whitelist filter (empty = allow all) - Buffer 20ms packets per-user → batch 100ms (5×20ms = 19200 bytes) → silero-vad threshold 0.5 → 800ms cumulative silence flush - _flush_to_stt: faster-whisper small int8 cpu_threads=4 lang=ro beam_size=1, no_speech_prob > 0.6 drop, schedule on_segment_done via run_coroutine_threadsafe pe session.loop Module helpers (lazy thread-safe singletons): _get_whisper_model, _get_silero_vad. Constants: FILLER_DELAY_S=3.0, SILENCE_FLUSH_MS=800, VAD_THRESHOLD=0.5, VAD_WINDOW_MS=100, NO_SPEECH_DROP_THRESHOLD=0.6. Decisions: - STT runs in audio thread — acceptable la 2.25s p50 (user just stopped talking, no batching contention). Wrap în ThreadPoolExecutor.submit if perf bites later. - Downsample 48k→16k via 3-sample averaging (no scipy dep). Whisper robust la mild aliasing. - Energy-RMS VAD fallback dacă torch import fail — graceful degrade. - router_route_message injection seam ca kwarg pentru testabilitate. - bot.change_presence handling cross-thread via run_coroutine_threadsafe. tests/test_voice_session_cleanup.py — 6 tests: - voice_leave / disconnect / crash via __exit__ / auto_leave / user_left_channel (5 cleanup paths each verified for: JSONL state, presence cleared, voice_client.cleanup, ttsq.stop, lock release, idempotency) - 1 robustness cross-cut (double-cleanup safety) 6/6 PASS. Regression suite 63/63 PASS (normalize + adapter + mutex). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-27 14:55:57 +00:00
parent 217da65417
commit 23666f7910
2 changed files with 870 additions and 0 deletions
--- a/src/voice/pipeline.py
+++ b/src/voice/pipeline.py
@@ -0,0 +1,551 @@
 """Central voice pipeline: VAD -> STT -> Claude -> TTS for Discord voice.
 ``VoiceSession`` binds per-call state — voice_client, TTS queue, transcript
 JSONL buffer, whitelist, presence — and exposes a single idempotent
 ``cleanup()`` invoked from every exit path (user /voice leave, network
 disconnect, crash via ``__exit__``, auto-leave timer, user leaves channel).
 ``EchoVoiceSink`` is the discord-ext-voice-recv ``AudioSink`` subclass that
 runs in the voice_recv reader thread. It batches 20ms PCM packets into
 100ms windows for silero-vad inference, marks per-user speech timestamps,
 and on 800ms cumulative silence flushes the accumulated audio through
 faster-whisper. Hallucinated segments (``no_speech_prob > 0.6``) are
 dropped. Valid transcripts are scheduled onto the session's event loop
 via ``asyncio.run_coroutine_threadsafe``.
 The bot's own ``user.id`` is filtered FIRST inside ``write()`` — load-bearing
 echo prevention so a future whitelist expansion (Bianca, etc.) never lets
 the bot transcribe itself.
 See plan: ``src/voice/pipeline.py`` (Pas 5), Engineering decisions #4
 (VAD 100ms batched), #5 (cleanup centralizat), #7 (bot.user.id explicit
 guard), #8 (filler audio ``thinking.wav`` at 3s pre-first-block).
 """
 from __future__ import annotations
 import asyncio
 import json
 import logging
 import threading
 import time
 from pathlib import Path
 from typing import Any, Callable, Optional
 import numpy as np
 from src.voice._discord_voice_adapter import AudioSink, VoiceData
 log = logging.getLogger(__name__)
 # Discord delivers 48kHz s16le stereo PCM, 20ms per packet (3840 bytes).
 SAMPLE_RATE_DISCORD = 48000
 SAMPLE_RATE_WHISPER = 16000
 PACKET_MS = 20
 PACKET_BYTES = 3840  # 48000 Hz * 0.020 s * 2 channels * 2 bytes
 VAD_WINDOW_MS = 100  # batch 5 * 20ms packets per VAD inference (Decision #4)
 VAD_WINDOW_BYTES = PACKET_BYTES * (VAD_WINDOW_MS // PACKET_MS)
 VAD_THRESHOLD = 0.5
 SILENCE_FLUSH_MS = 800
 NO_SPEECH_DROP_THRESHOLD = 0.6
 FILLER_DELAY_S = 3.0
 PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
 LOGS_DIR = PROJECT_ROOT / "logs"
 VOICE_METRICS_PATH = LOGS_DIR / "voice_metrics.jsonl"
 # ---------- Lazy model singletons ----------
 _whisper_model: Any = None
 _whisper_lock = threading.Lock()
 _silero_model: Any = None
 _silero_get_timestamps: Any = None
 _silero_lock = threading.Lock()
 def _get_whisper_model() -> Any:
    """Lazy-load faster-whisper ``small`` int8 with the spike-validated
    ``cpu_threads=4`` (see ``tasks/voice-bench-results.md``)."""
    global _whisper_model
    if _whisper_model is not None:
        return _whisper_model
    with _whisper_lock:
        if _whisper_model is not None:
            return _whisper_model
        from faster_whisper import WhisperModel
        _whisper_model = WhisperModel(
            "small", device="cpu", compute_type="int8", cpu_threads=4,
        )
        return _whisper_model
 def _get_silero_vad():
    """Lazy-load silero-vad. Returns ``(model, get_speech_timestamps)``."""
    global _silero_model, _silero_get_timestamps
    if _silero_model is not None:
        return _silero_model, _silero_get_timestamps
    with _silero_lock:
        if _silero_model is not None:
            return _silero_model, _silero_get_timestamps
        from silero_vad import get_speech_timestamps, load_silero_vad
        _silero_model = load_silero_vad()
        _silero_get_timestamps = get_speech_timestamps
        return _silero_model, _silero_get_timestamps
 # ---------- Audio helpers ----------
 def _pcm48_stereo_to_16_mono(pcm: bytes) -> np.ndarray:
    """Discord 48kHz s16le stereo bytes -> 16kHz mono float32 in [-1, 1].
    Cheap downsample: average the two channels, then average every 3
    samples (48k / 3 = 16k). faster-whisper + silero-vad accept the
    resulting ``np.float32`` array directly.
    """
    if not pcm:
        return np.zeros(0, dtype=np.float32)
    samples = np.frombuffer(pcm, dtype=np.int16)
    if samples.size % 2 != 0:
        samples = samples[:-1]
    stereo = samples.reshape(-1, 2)
    mono = stereo.mean(axis=1).astype(np.float32) / 32768.0
    if mono.size == 0:
        return mono
    trim = (mono.size // 3) * 3
    if trim == 0:
        return np.zeros(0, dtype=np.float32)
    mono = mono[:trim].reshape(-1, 3).mean(axis=1)
    return mono.astype(np.float32)
 # ---------- VoiceSession ----------
 class VoiceSession:
    """Per-voice-call state with a single idempotent ``cleanup()``."""
    def __init__(
        self,
        *,
        channel_id: int,
        guild_id: int,
        text_channel: Any,
        voice_client: Any,
        bot: Any,
        ttsq: Any,
        whitelist: Optional[set] = None,
        record_enabled: bool = False,
        mirror_enabled: bool = True,
        transcripts_jsonl_path: Optional[Path] = None,
        loop: Optional[asyncio.AbstractEventLoop] = None,
        router_route_message: Optional[Callable] = None,
    ):
        self.channel_id = int(channel_id)
        self.guild_id = int(guild_id)
        self.text_channel = text_channel
        self.voice_client = voice_client
        self.bot = bot
        self.ttsq = ttsq
        self.whitelist: set = set(whitelist or set())
        self.record_enabled = bool(record_enabled)
        self.mirror_enabled = bool(mirror_enabled)
        self.transcripts_jsonl_path = transcripts_jsonl_path
        self.loop = loop
        # Injection seam so tests can replace router.route_message without
        # mocking the whole module.
        if router_route_message is None:
            from src.router import route_message as _rm
            self._route_message = _rm
        else:
            self._route_message = router_route_message
        self.last_activity_ts = time.monotonic()
        self._jsonl_fh = None
        self._lock = threading.Lock()
        self._cleaned_up = False
        self._lock_owner_thread: Optional[int] = None
        self._filler_task: Optional[asyncio.Task] = None
        self._first_block_seen = False
    # ----- context manager -----
    def __enter__(self) -> "VoiceSession":
        self._lock.acquire()
        self._lock_owner_thread = threading.get_ident()
        if self.record_enabled and self.transcripts_jsonl_path is not None:
            try:
                self.transcripts_jsonl_path.parent.mkdir(
                    parents=True, exist_ok=True,
                )
                self._jsonl_fh = open(
                    self.transcripts_jsonl_path, "a",
                    buffering=1, encoding="utf-8",
                )
            except OSError as e:
                log.warning("voice transcript open failed: %s", e)
                self._jsonl_fh = None
        return self
    def __exit__(self, exc_type, exc_val, exc_tb) -> bool:
        self.cleanup("exit")
        return False  # never suppress exceptions
    # ----- cleanup (centralized, idempotent) -----
    def cleanup(self, reason: str) -> None:
        """Single drain path for ALL 5 exit scenarios. Safe to call twice."""
        if self._cleaned_up:
            return
        self._cleaned_up = True
        # 1. Flush or discard JSONL transcript.
        if self._jsonl_fh is not None:
            try:
                self._jsonl_fh.flush()
                self._jsonl_fh.close()
            except Exception as e:  # noqa: BLE001
                log.warning("voice transcript flush failed: %s", e)
            self._jsonl_fh = None
        if (not self.record_enabled
                and self.transcripts_jsonl_path is not None
                and self.transcripts_jsonl_path.exists()):
            try:
                self.transcripts_jsonl_path.unlink()
            except OSError:
                pass
        # 2. Restore bot presence (clear Listening activity).
        if self.bot is not None:
            try:
                change = getattr(self.bot, "change_presence", None)
                if callable(change):
                    coro = change(activity=None)
                    if asyncio.iscoroutine(coro):
                        if self.loop is not None and self.loop.is_running():
                            asyncio.run_coroutine_threadsafe(coro, self.loop)
                        else:
                            # Best-effort: close the coroutine so Python
                            # doesn't emit "coroutine was never awaited".
                            coro.close()
            except Exception as e:  # noqa: BLE001
                log.warning("voice presence restore failed: %s", e)
        # 3. Tear down the voice client.
        if self.voice_client is not None:
            try:
                self.voice_client.cleanup()
            except Exception as e:  # noqa: BLE001
                log.warning("voice_client.cleanup failed: %s", e)
        # 4. Stop the TTS queue worker.
        if self.ttsq is not None:
            try:
                self.ttsq.stop()
            except Exception as e:  # noqa: BLE001
                log.warning("ttsq.stop failed: %s", e)
        # 5. Cancel pending filler task.
        if self._filler_task is not None and not self._filler_task.done():
            try:
                self._filler_task.cancel()
            except Exception:  # noqa: BLE001
                pass
        # 6. Release the session lock (held since __enter__).
        try:
            if self._lock.locked():
                self._lock.release()
        except RuntimeError:
            # Released from a different thread than acquired it — already
            # free for the next caller; nothing to do.
            pass
        self._log_metric({"event": "cleanup", "reason": reason})
    # ----- segment completion (scheduled from sink) -----
    async def on_segment_done(
        self,
        speaker_id: int,
        text: str,
        no_speech_prob: float,
    ) -> None:
        """Mirror, persist, route to Claude, drive TTS via streaming callback."""
        if self._cleaned_up:
            return
        self.last_activity_ts = time.monotonic()
        speaker_name = self._resolve_speaker_name(speaker_id)
        # 1. Mirror to text channel (one Unicode 🎤 — exception per plan).
        if self.mirror_enabled and self.text_channel is not None:
            try:
                send = getattr(self.text_channel, "send", None)
                if callable(send):
                    coro = send(f"\U0001f3a4 {speaker_name}: \"{text}\"")
                    if asyncio.iscoroutine(coro):
                        await coro
            except Exception as e:  # noqa: BLE001
                log.warning("voice mirror send failed: %s", e)
        # 2. Append to JSONL transcript buffer if recording.
        if self._jsonl_fh is not None:
            try:
                self._jsonl_fh.write(
                    json.dumps({
                        "ts": time.time(),
                        "speaker_id": speaker_id,
                        "speaker": speaker_name,
                        "text": text,
                        "no_speech_prob": no_speech_prob,
                    }, ensure_ascii=False) + "\n"
                )
            except Exception as e:  # noqa: BLE001
                log.warning("voice transcript write failed: %s", e)
        # 3. Arm the 3s filler timer — fires only if no Claude block arrives.
        self._first_block_seen = False
        if self._filler_task is not None and not self._filler_task.done():
            self._filler_task.cancel()
        try:
            self._filler_task = asyncio.create_task(self._filler_after_delay())
        except RuntimeError:
            # No running loop (test path). Skip the timer.
            self._filler_task = None
        def voice_stream_callback(block: str) -> None:
            """Called once per Claude streamed text block — pushes to TTS
            and cancels the filler on first arrival."""
            if not self._first_block_seen:
                self._first_block_seen = True
                ft = self._filler_task
                if ft is not None and not ft.done():
                    try:
                        ft.cancel()
                    except Exception:  # noqa: BLE001
                        pass
            try:
                self.ttsq.push_text(block)
            except Exception as e:  # noqa: BLE001
                log.warning("ttsq.push_text failed: %s", e)
        # 4. Dispatch to Claude. send_message is sync subprocess, run on
        #    a worker thread so the loop stays responsive for mirror/TTS.
        try:
            await asyncio.to_thread(
                self._route_message,
                str(self.channel_id),
                str(speaker_id),
                text,
                None,                       # model
                voice_stream_callback,      # on_text
                "discord-voice",            # adapter_name
            )
        except Exception as e:  # noqa: BLE001
            log.error("route_message voice path failed: %s", e)
    async def _filler_after_delay(self) -> None:
        """Push ``assets/voice/thinking.wav`` after FILLER_DELAY_S if Claude
        hasn't produced a first block yet."""
        try:
            await asyncio.sleep(FILLER_DELAY_S)
        except asyncio.CancelledError:
            return
        if self._first_block_seen or self._cleaned_up:
            return
        try:
            self.ttsq.push_filler()
        except Exception as e:  # noqa: BLE001
            log.warning("ttsq.push_filler failed: %s", e)
    # ----- helpers -----
    def _resolve_speaker_name(self, speaker_id: int) -> str:
        """Best-effort display name lookup via the bot user cache."""
        try:
            if self.bot is not None and hasattr(self.bot, "get_user"):
                user = self.bot.get_user(speaker_id)
                if user is not None:
                    name = getattr(user, "display_name", None) or getattr(
                        user, "name", None,
                    )
                    if name:
                        return str(name)
        except Exception:  # noqa: BLE001
            pass
        return str(speaker_id)
    def _log_metric(self, payload: dict) -> None:
        """Append a structured event to ``logs/voice_metrics.jsonl``."""
        event = {"ts": time.time(), "channel_id": self.channel_id, **payload}
        try:
            LOGS_DIR.mkdir(parents=True, exist_ok=True)
            with open(VOICE_METRICS_PATH, "a", buffering=1, encoding="utf-8") as f:
                f.write(json.dumps(event, ensure_ascii=False) + "\n")
        except OSError:
            pass
 # ---------- EchoVoiceSink ----------
 class EchoVoiceSink(AudioSink):
    """PCM-in sink: per-user 20ms buffer -> 100ms VAD windows -> 800ms
    silence triggers Whisper STT -> schedules ``on_segment_done`` on the
    session loop.
    Lives in the voice_recv reader thread; uses ``threading`` primitives
    only (no asyncio in the hot path).
    """
    def __init__(self, session: VoiceSession, bot_user_id: int):
        super().__init__()
        self.session = session
        self.bot_user_id = int(bot_user_id) if bot_user_id is not None else 0
        self.whitelist: set = set(session.whitelist or set())
        self._user_buffers: dict[int, bytearray] = {}
        self._packet_accum: dict[int, bytearray] = {}
        self._last_speech_ts: dict[int, float] = {}
        self._has_speech: dict[int, bool] = {}
        self._sink_lock = threading.Lock()
    def wants_opus(self) -> bool:
        return False
    def cleanup(self) -> None:
        with self._sink_lock:
            self._user_buffers.clear()
            self._packet_accum.clear()
            self._last_speech_ts.clear()
            self._has_speech.clear()
    def write(self, user, voice_data: VoiceData) -> None:
        # ---- FIRST GUARD (LOAD-BEARING): bot's own voice ---------------
        if user is None:
            return
        uid = int(getattr(user, "id", 0) or 0)
        if uid == 0:
            return
        if uid == self.bot_user_id:
            return
        # ---- SECOND GUARD: whitelist filter ----------------------------
        if self.whitelist and uid not in self.whitelist:
            return
        pcm = getattr(voice_data, "pcm", None)
        if not pcm:
            return
        window_pcm: Optional[bytes] = None
        pcm_for_stt: Optional[bytes] = None
        try:
            with self._sink_lock:
                buf = self._user_buffers.setdefault(uid, bytearray())
                accum = self._packet_accum.setdefault(uid, bytearray())
                buf.extend(pcm)
                accum.extend(pcm)
                if len(accum) >= VAD_WINDOW_BYTES:
                    window_pcm = bytes(accum[:VAD_WINDOW_BYTES])
                    del accum[:VAD_WINDOW_BYTES]
            if window_pcm is not None:
                if self._vad_detects_speech(window_pcm):
                    with self._sink_lock:
                        self._last_speech_ts[uid] = time.monotonic()
                        self._has_speech[uid] = True
            with self._sink_lock:
                if self._has_speech.get(uid):
                    last = self._last_speech_ts.get(uid, 0.0)
                    silence_ms = (time.monotonic() - last) * 1000.0
                    if silence_ms >= SILENCE_FLUSH_MS:
                        pcm_for_stt = bytes(self._user_buffers.get(uid, b""))
                        self._user_buffers[uid] = bytearray()
                        self._packet_accum[uid] = bytearray()
                        self._has_speech[uid] = False
            if pcm_for_stt:
                self._flush_to_stt(uid, pcm_for_stt)
        except Exception as e:  # noqa: BLE001
            log.warning("EchoVoiceSink.write failed: %s", e)
    # ----- VAD -----
    def _vad_detects_speech(self, pcm48_stereo: bytes) -> bool:
        """Run silero-vad on a 100ms window. Falls back to an RMS energy
        threshold if torch / silero are unavailable."""
        try:
            mono16 = _pcm48_stereo_to_16_mono(pcm48_stereo)
            if mono16.size == 0:
                return False
            try:
                import torch
            except ImportError:
                rms = float(np.sqrt(np.mean(mono16.astype(np.float64) ** 2)))
                return rms > 0.02
            model, _ = _get_silero_vad()
            with torch.no_grad():
                prob = float(model(torch.from_numpy(mono16),
                                   SAMPLE_RATE_WHISPER).item())
            return prob >= VAD_THRESHOLD
        except Exception as e:  # noqa: BLE001
            log.debug("VAD inference failed: %s", e)
            return False
    # ----- STT flush -----
    def _flush_to_stt(self, user_id: int, pcm48_stereo: bytes) -> None:
        """Downsample, Whisper-transcribe RO, drop hallucinations, dispatch."""
        try:
            mono16 = _pcm48_stereo_to_16_mono(pcm48_stereo)
            if mono16.size == 0:
                return
            model = _get_whisper_model()
            segments, _info = model.transcribe(
                mono16, language="ro", beam_size=1,
            )
            text_parts: list[str] = []
            worst_no_speech = 0.0
            for seg in segments:
                no_sp = float(getattr(seg, "no_speech_prob", 0.0) or 0.0)
                if no_sp > worst_no_speech:
                    worst_no_speech = no_sp
                if no_sp > NO_SPEECH_DROP_THRESHOLD:
                    continue
                seg_text = (getattr(seg, "text", "") or "").strip()
                if seg_text:
                    text_parts.append(seg_text)
            if not text_parts:
                return
            text = " ".join(text_parts).strip()
            if not text:
                return
            self._schedule_segment_done(user_id, text, worst_no_speech)
        except Exception as e:  # noqa: BLE001
            log.warning("Whisper transcribe failed: %s", e)
    def _schedule_segment_done(
        self, user_id: int, text: str, no_speech_prob: float,
    ) -> None:
        loop = self.session.loop
        if loop is None or not loop.is_running():
            log.debug("voice session loop missing — dropping segment")
            return
        try:
            asyncio.run_coroutine_threadsafe(
                self.session.on_segment_done(user_id, text, no_speech_prob),
                loop,
            )
        except Exception as e:  # noqa: BLE001
            log.warning("voice segment dispatch failed: %s", e)
 __all__ = [
    "VoiceSession",
    "EchoVoiceSink",
    "FILLER_DELAY_S",
    "SILENCE_FLUSH_MS",
    "VAD_THRESHOLD",
    "VAD_WINDOW_MS",
    "NO_SPEECH_DROP_THRESHOLD",
 ]
--- a/tests/test_voice_session_cleanup.py
+++ b/tests/test_voice_session_cleanup.py
@@ -0,0 +1,319 @@
 """Cleanup-path tests for ``src/voice/pipeline.py::VoiceSession``.
 Pins the centralized ``cleanup()`` contract from the voice plan
 (Engineering decision #5): every one of the FIVE exit paths must drain
 state cleanly and idempotently — lock released, JSONL flushed or
 discarded, presence cleared, ``voice_client.cleanup()`` invoked,
 ``ttsq.stop()`` invoked, and a second call to ``cleanup()`` MUST be a
 no-op (side effects happen exactly once).
 The 5 paths under test:
    1. ``test_cleanup_on_voice_leave``           — explicit ``/voice leave``
    2. ``test_cleanup_on_disconnect``            — Discord-level disconnect
    3. ``test_cleanup_on_crash``                 — exception via ``__exit__``
    4. ``test_cleanup_on_auto_leave``            — 5-min inactivity timer
    5. ``test_cleanup_on_user_leaves_channel``   — user leaves voice channel
 """
 from __future__ import annotations
 import json
 from pathlib import Path
 from unittest.mock import AsyncMock, MagicMock
 import pytest
 from src.voice.pipeline import VoiceSession
 # ---------------------------------------------------------------------------
 # Fixtures
 # ---------------------------------------------------------------------------
@pytest.fixture
 def mock_bot():
    bot = MagicMock(name="bot")
    bot.user = MagicMock()
    bot.user.id = 999_999
    bot.change_presence = AsyncMock(name="change_presence")
    bot.get_user = MagicMock(return_value=None)
    return bot
@pytest.fixture
 def mock_voice_client():
    vc = MagicMock(name="voice_client")
    vc.cleanup = MagicMock(name="vc_cleanup")
    return vc
@pytest.fixture
 def mock_ttsq():
    ttsq = MagicMock(name="ttsq")
    ttsq.stop = MagicMock(name="ttsq_stop")
    return ttsq
@pytest.fixture
 def mock_text_channel():
    tc = MagicMock(name="text_channel")
    tc.send = AsyncMock(name="text_send")
    return tc
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
 def _make_session(
    tmp_path: Path,
    mock_bot,
    mock_voice_client,
    mock_ttsq,
    mock_text_channel,
    *,
    record_enabled: bool = True,
 ) -> VoiceSession:
    jsonl = tmp_path / ("transcripts.jsonl" if record_enabled else "noop.jsonl")
    return VoiceSession(
        channel_id=1001,
        guild_id=42,
        text_channel=mock_text_channel,
        voice_client=mock_voice_client,
        bot=mock_bot,
        ttsq=mock_ttsq,
        whitelist={1234},
        record_enabled=record_enabled,
        mirror_enabled=True,
        transcripts_jsonl_path=jsonl,
        loop=None,
        router_route_message=MagicMock(name="route_message"),
    )
 def _assert_clean_post_cleanup(
    session: VoiceSession,
    voice_client,
    ttsq,
    bot,
    jsonl_path: Path,
    record_enabled: bool,
 ) -> None:
    """Assertions shared across all five cleanup-path tests."""
    # 1. Lock released — non-blocking acquire from this thread returns True.
    acquired = session._lock.acquire(blocking=False)
    assert acquired, "session._lock must be released after cleanup()"
    session._lock.release()
    # 2. voice_client.cleanup() called exactly once.
    assert voice_client.cleanup.call_count == 1, (
        f"voice_client.cleanup() called {voice_client.cleanup.call_count}x, "
        f"expected 1"
    )
    # 3. ttsq.stop() called exactly once.
    assert ttsq.stop.call_count == 1, (
        f"ttsq.stop() called {ttsq.stop.call_count}x, expected 1"
    )
    # 4. bot.change_presence(activity=None) called at least once with that kwarg.
    assert bot.change_presence.call_count >= 1, (
        "bot.change_presence was never called — presence not restored"
    )
    bot.change_presence.assert_called_with(activity=None)
    # 5. JSONL flushed (record=on) OR absent (record=off).
    if record_enabled:
        assert jsonl_path.exists(), (
            "record=on: JSONL file must exist (was created by __enter__ and "
            "left in place by cleanup so transcript can be persisted)"
        )
    else:
        # record=off: cleanup unlinks the file if it ever existed.
        assert not jsonl_path.exists() or jsonl_path.stat().st_size == 0
 # ---------------------------------------------------------------------------
 # Scenario 1 — explicit /voice leave
 # ---------------------------------------------------------------------------
 class TestCleanupOnVoiceLeave:
    def test_cleanup_on_voice_leave(
        self, tmp_path, mock_bot, mock_voice_client, mock_ttsq, mock_text_channel,
    ):
        session = _make_session(
            tmp_path, mock_bot, mock_voice_client, mock_ttsq, mock_text_channel,
            record_enabled=True,
        )
        jsonl_path = session.transcripts_jsonl_path
        with session:
            # Simulate one transcript line.
            session._jsonl_fh.write(json.dumps({"text": "salut"}) + "\n")
            session.cleanup("voice_leave")
            assert session._cleaned_up is True
        # __exit__ called cleanup("exit") — must be a no-op the second time.
        _assert_clean_post_cleanup(
            session, mock_voice_client, mock_ttsq, mock_bot,
            jsonl_path, record_enabled=True,
        )
        # Idempotency: a third explicit call still doesn't bump counts.
        session.cleanup("redundant")
        assert mock_voice_client.cleanup.call_count == 1
        assert mock_ttsq.stop.call_count == 1
 # ---------------------------------------------------------------------------
 # Scenario 2 — Discord-level voice disconnect
 # ---------------------------------------------------------------------------
 class TestCleanupOnDisconnect:
    def test_cleanup_on_disconnect(
        self, tmp_path, mock_bot, mock_voice_client, mock_ttsq, mock_text_channel,
    ):
        session = _make_session(
            tmp_path, mock_bot, mock_voice_client, mock_ttsq, mock_text_channel,
            record_enabled=False,
        )
        jsonl_path = session.transcripts_jsonl_path
        session.__enter__()
        # Network drop arrives outside the with-block.
        session.cleanup("disconnect")
        _assert_clean_post_cleanup(
            session, mock_voice_client, mock_ttsq, mock_bot,
            jsonl_path, record_enabled=False,
        )
        # Idempotency.
        session.cleanup("disconnect-again")
        assert mock_voice_client.cleanup.call_count == 1
        assert mock_ttsq.stop.call_count == 1
 # ---------------------------------------------------------------------------
 # Scenario 3 — crash / exception via __exit__
 # ---------------------------------------------------------------------------
 class TestCleanupOnCrash:
    def test_cleanup_on_crash(
        self, tmp_path, mock_bot, mock_voice_client, mock_ttsq, mock_text_channel,
    ):
        session = _make_session(
            tmp_path, mock_bot, mock_voice_client, mock_ttsq, mock_text_channel,
            record_enabled=True,
        )
        jsonl_path = session.transcripts_jsonl_path
        with pytest.raises(RuntimeError, match="simulated crash"):
            with session:
                # Pipeline raises mid-call.
                raise RuntimeError("simulated crash")
        # __exit__ must have driven cleanup — every side effect happened once.
        _assert_clean_post_cleanup(
            session, mock_voice_client, mock_ttsq, mock_bot,
            jsonl_path, record_enabled=True,
        )
        # Idempotency: explicit follow-up call (e.g. an outer error handler
        # also tries to cleanup) MUST be a no-op.
        session.cleanup("post-crash")
        assert mock_voice_client.cleanup.call_count == 1
        assert mock_ttsq.stop.call_count == 1
 # ---------------------------------------------------------------------------
 # Scenario 4 — auto-leave timer fires after 5 min inactivity
 # ---------------------------------------------------------------------------
 class TestCleanupOnAutoLeave:
    def test_cleanup_on_auto_leave(
        self, tmp_path, mock_bot, mock_voice_client, mock_ttsq, mock_text_channel,
    ):
        session = _make_session(
            tmp_path, mock_bot, mock_voice_client, mock_ttsq, mock_text_channel,
            record_enabled=True,
        )
        jsonl_path = session.transcripts_jsonl_path
        session.__enter__()
        # The auto-leave timer trips outside the with-block.
        session.cleanup("auto_leave")
        _assert_clean_post_cleanup(
            session, mock_voice_client, mock_ttsq, mock_bot,
            jsonl_path, record_enabled=True,
        )
        # Idempotency.
        session.cleanup("auto_leave_redundant")
        assert mock_voice_client.cleanup.call_count == 1
        assert mock_ttsq.stop.call_count == 1
 # ---------------------------------------------------------------------------
 # Scenario 5 — user leaves voice channel themselves
 # ---------------------------------------------------------------------------
 class TestCleanupOnUserLeaves:
    def test_cleanup_on_user_leaves_channel(
        self, tmp_path, mock_bot, mock_voice_client, mock_ttsq, mock_text_channel,
    ):
        session = _make_session(
            tmp_path, mock_bot, mock_voice_client, mock_ttsq, mock_text_channel,
            record_enabled=False,
        )
        jsonl_path = session.transcripts_jsonl_path
        session.__enter__()
        # voice_state_update event handler invokes cleanup directly.
        session.cleanup("user_left_channel")
        _assert_clean_post_cleanup(
            session, mock_voice_client, mock_ttsq, mock_bot,
            jsonl_path, record_enabled=False,
        )
        # Idempotency.
        session.cleanup("user_left_again")
        assert mock_voice_client.cleanup.call_count == 1
        assert mock_ttsq.stop.call_count == 1
 # ---------------------------------------------------------------------------
 # Cross-cutting: failures inside cleanup don't propagate
 # ---------------------------------------------------------------------------
 class TestCleanupRobustness:
    def test_cleanup_swallows_voice_client_errors(
        self, tmp_path, mock_bot, mock_voice_client, mock_ttsq, mock_text_channel,
    ):
        """If voice_client.cleanup() raises, ttsq.stop() must still run and
        the lock must still release — otherwise a broken Discord state would
        deadlock the channel forever."""
        mock_voice_client.cleanup.side_effect = RuntimeError("vc died")
        session = _make_session(
            tmp_path, mock_bot, mock_voice_client, mock_ttsq, mock_text_channel,
            record_enabled=False,
        )
        with session:
            session.cleanup("voice_leave")
        # ttsq.stop still ran exactly once.
        assert mock_ttsq.stop.call_count == 1
        # Lock released.
        acquired = session._lock.acquire(blocking=False)
        assert acquired, "lock must release even when voice_client.cleanup raises"
        session._lock.release()