feat(voice): unify Discord voice↔text session (squash of voice/text-unify)

Voice utterances and text messages on the same Discord channel now share one Claude session, and Echo's voice replies are mirrored back into the text channel. Replaces the old voice:<id> session-key split.

Changes:
- src/adapters/_text_chunks.py: new leaf module for split_message (used by both discord_bot and voice pipeline)
- src/router.py: drop voice: prefix from session_key; add [voice] marker; strip leading [speaker:...] / [voice] tokens from user input (anti-jailbreak); remove dead double-clear of voice: key
- src/claude_session.py: include personality/VOICE_MODE.md unconditionally (rules become per-turn-aware via [speaker:] prefix instead of session flag)
- src/voice/pipeline.py: VoiceSession splits text_channel_id + voice_channel_id; resolve text channel per-send (no stale refs); mirror Echo's reply text into the text channel after route_message returns
- src/adapters/discord_voice.py: /voice join passes both channel ids
- src/adapters/discord_bot.py: import split_message from leaf module
- personality/VOICE_MODE.md: rewrite as per-turn dynamic rules; add synthesis instructions for text turns after voice turns

Tests:
- tests/test_router.py: 4 new cases (plain channel_id, anti-jailbreak, text-adapter regression, no-double-clear)
- tests/test_pipeline_mirror.py: new — Echo reply mirror chunking, empty guard, mirror_enabled=False, send-raises resilience
- tests/test_voice_session_channel_ids.py: new — split-attr contract + metrics payload schema
- tests/test_voice_session_cleanup.py: update for new kwargs

Plan: /home/moltbot/.claude/plans/vreau-ca-tot-textul-greedy-rivest.md

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 14:24:15 +00:00
parent 4be70440e8
commit e79bed7afe
11 changed files with 468 additions and 76 deletions
--- a/src/adapters/_text_chunks.py
+++ b/src/adapters/_text_chunks.py
@@ -0,0 +1,19 @@
+"""Leaf module — message chunking helper for Discord (2000 char limit). Zero deps."""
+
+
+def split_message(text: str, limit: int = 2000) -> list[str]:
+    """Split *text* into chunks of at most *limit* characters for Discord.
+
+    Cuts at the last newline inside the window, else hard-cuts at *limit*;
+    never emits an empty chunk mid-stream. Raises ``ValueError`` if *limit* < 1.
+    """
+    if limit < 1:
+        raise ValueError("limit must be >= 1")
+    chunks = []
+    while len(text) > limit:
+        split_at = text.rfind('\n', 0, limit)
+        if split_at <= 0:  # no newline, or only a leading one: hard cut
+            split_at = limit
+        chunks.append(text[:split_at])
+        text = text[split_at:].lstrip('\n')
+    return chunks + [text] if (text or not chunks) else chunks
--- a/src/adapters/discord_bot.py
+++ b/src/adapters/discord_bot.py
@@ -28,6 +28,7 @@ from src.router import (
    planning_cancel,
    start_planning_session,
 )
+from src.adapters._text_chunks import split_message
 from src.adapters.discord_views import (
    RalphRootView,
    PlanningActiveView,
@@ -80,28 +81,6 @@ def _channel_alias_for_id(channel_id: str) -> str | None:
    return None


-# --- Message splitting helper ---
-
-
-def split_message(text: str, limit: int = 2000) -> list[str]:
-    """Split text into chunks that fit Discord's message limit."""
-    if len(text) <= limit:
-        return [text]
-
-    chunks = []
-    while text:
-        if len(text) <= limit:
-            chunks.append(text)
-            break
-        # Find last newline before limit
-        split_at = text.rfind('\n', 0, limit)
-        if split_at == -1:
-            split_at = limit
-        chunks.append(text[:split_at])
-        text = text[split_at:].lstrip('\n')
-    return chunks
-
-
 # --- Factory ---


--- a/src/adapters/discord_voice.py
+++ b/src/adapters/discord_voice.py
@@ -157,10 +157,10 @@ def register(tree: app_commands.CommandTree, bot: discord.Client) -> app_command
        ttsq.start()
        try:
            session = VoiceSession(
-                channel_id=channel.id,
+                text_channel_id=int(interaction.channel.id),
+                voice_channel_id=int(channel.id),
                guild_id=guild_id,
                voice_client=vc,
-                text_channel=interaction.channel,
                record_enabled=False,
                mirror_enabled=True,
                whitelist=whitelist,
--- a/src/claude_session.py
+++ b/src/claude_session.py
@@ -402,8 +402,10 @@ def _run_claude(
 def build_system_prompt(voice_mode: bool = False) -> str:
    """Concatenate personality/*.md files into a single system prompt.

-    When ``voice_mode=True``, appends ``VOICE_MODE.md`` so the model knows
-    its reply will be read aloud (terse, no markdown, no abbreviations, etc.).
+    ``VOICE_MODE.md`` is always appended; its rules self-gate on the
+    ``[voice]`` / ``[speaker:...]`` prefix injected per-turn by the router.
+    The ``voice_mode`` parameter is retained for callers but no longer
+    influences prompt assembly.
    """
    if not PERSONALITY_DIR.is_dir():
        raise FileNotFoundError(
@@ -411,8 +413,7 @@ def build_system_prompt(voice_mode: bool = False) -> str:
        )

    files = list(PERSONALITY_FILES)
-    if voice_mode:
-        files.append("VOICE_MODE.md")
+    files.append("VOICE_MODE.md")

    parts: list[str] = []
    for filename in files:
@@ -451,8 +452,9 @@ def start_session(
    If *on_text* is provided, each intermediate Claude text block is passed
    to the callback as soon as it arrives.

-    *voice_mode* — when True, ``VOICE_MODE.md`` is appended to the system
-    prompt so the model produces short, TTS-friendly responses.
+    *voice_mode* — retained for the router's per-turn ``[voice]`` /
+    ``[speaker:...]`` prefix logic; no longer gates ``VOICE_MODE.md``
+    inclusion (the file is now part of every system prompt).
    """
    if model not in VALID_MODELS:
        raise ValueError(
--- a/src/router.py
+++ b/src/router.py
@@ -3,6 +3,7 @@
 import json
 import logging
 import os
+import re
 import signal
 from datetime import datetime, timezone
 from pathlib import Path
@@ -31,6 +32,20 @@ log = logging.getLogger(__name__)

 APPROVED_TASKS_FILE = Path(__file__).parent.parent / "approved-tasks.json"

+# Anti-jailbreak guard: only the router may emit the [voice] / [speaker:...]
+# prefix, so any copy the user typed at the start of a message is removed.
+_LEADING_VOICE_TOKEN_RE = re.compile(
+    r'^\s*(?:\[voice\]|\[speaker:[^\]]*\])\s*', re.IGNORECASE
+)
+
+
+def _strip_leading_voice_tokens(text: str) -> str:
+    """Remove every leading ``[voice]`` / ``[speaker:...]`` token from *text*."""
+    # Loop rather than match once: tokens may be stacked back to back.
+    while (hit := _LEADING_VOICE_TOKEN_RE.match(text)) is not None:
+        text = text[hit.end():]
+    return text
+
 # Module-level config instance (lazy singleton)
 _config: Config | None = None

@@ -63,6 +78,7 @@ def route_message(
    adapter-specific response shaping (e.g., redirect line on WhatsApp).
    """
    text = text.strip()
+    text = _strip_leading_voice_tokens(text)

    # ---- Planning state-aware routing -----------------------------------
    # If the channel is in an active planning session, the user's message is
@@ -124,8 +140,6 @@ def route_message(
    if text.lower() == "/clear":
        default_model = _get_config().get("bot.default_model", "sonnet")
        cleared_text = clear_session(channel_id)
-        # Also drop the isolated voice session if one exists on this channel.
-        clear_session(f"voice:{channel_id}")
        if cleared_text:
            return f"Session cleared. Model reset to {default_model}.", True
        return "No active session.", True
@@ -156,18 +170,15 @@ def route_message(
            channel_cfg = _get_channel_config(channel_id)
            model = (channel_cfg or {}).get("default_model") or _get_config().get("bot.default_model", "sonnet")

-    # Voice-mode augment: prepend speaker prefix so Claude knows who spoke
-    # in a voice channel. Cheap now, future-proof for multi-speaker later.
-    # (Engineering decision #14 in the plan.) Only the discord-voice adapter
-    # triggers it — text adapters keep the message verbatim.
+    # Voice turns get a system-controlled [voice] [speaker:NAME] prefix so
+    # VOICE_MODE.md rules self-activate per-turn. Session key is the plain
+    # channel_id — voice + text share one Claude session on the same channel.
    claude_text = text
    voice_mode = adapter_name == "discord-voice"
    if voice_mode:
        user_name = _get_config().get("voice.user_name", "user") or "user"
-        claude_text = f"[speaker:{user_name}] {text}"
-    # Voice sessions use an isolated session key so they start fresh with
-    # VOICE_MODE.md and don't pollute the text channel's conversation.
-    session_key = f"voice:{channel_id}" if voice_mode else channel_id
+        claude_text = f"[voice] [speaker:{user_name}] {text}"
+    session_key = channel_id

    try:
        response = send_message(
--- a/src/voice/pipeline.py
+++ b/src/voice/pipeline.py
@@ -128,9 +128,9 @@ class VoiceSession:
    def __init__(
        self,
        *,
-        channel_id: int,
+        text_channel_id: int,
+        voice_channel_id: int,
        guild_id: int,
-        text_channel: Any,
        voice_client: Any,
        bot: Any,
        ttsq: Any,
@@ -141,9 +141,9 @@ class VoiceSession:
        loop: Optional[asyncio.AbstractEventLoop] = None,
        router_route_message: Optional[Callable] = None,
    ):
-        self.channel_id = int(channel_id)
+        self.text_channel_id = int(text_channel_id)
+        self.voice_channel_id = int(voice_channel_id)
        self.guild_id = int(guild_id)
-        self.text_channel = text_channel
        self.voice_client = voice_client
        self.bot = bot
        self.ttsq = ttsq
@@ -256,6 +256,29 @@ class VoiceSession:

    # ----- segment completion (scheduled from sink) -----

+    async def _resolve_text_channel(self) -> Any:
+        """Look up the text channel for ``text_channel_id`` on demand.
+
+        Asks the bot's ``get_channel`` first, then ``fetch_channel``. Returns
+        ``None`` when there is no bot, no usable lookup, or the lookup raises.
+        """
+        if self.bot is None:
+            return None
+        try:
+            lookup = getattr(self.bot, "get_channel", None)
+            if callable(lookup):
+                cached = lookup(self.text_channel_id)
+                if cached is not None:
+                    return cached
+            fetcher = getattr(self.bot, "fetch_channel", None)
+            if not callable(fetcher):
+                return None
+            fetched = fetcher(self.text_channel_id)
+            return await fetched if asyncio.iscoroutine(fetched) else fetched
+        except Exception as exc:  # noqa: BLE001
+            log.warning("voice text_channel resolve failed: %s", exc)
+            return None
+
    async def on_segment_done(
        self,
        speaker_id: int,
@@ -281,10 +304,11 @@ class VoiceSession:
            await self._handle_voice_change(speaker_name, text, new_voice)
            return

-        # 1. Mirror to text channel (one Unicode 🎤 — exception per plan).
-        if self.mirror_enabled and self.text_channel is not None:
+        # 1. Mirror user utterance to text channel.
+        text_channel = await self._resolve_text_channel() if self.mirror_enabled else None
+        if self.mirror_enabled and text_channel is not None:
            try:
-                send = getattr(self.text_channel, "send", None)
+                send = getattr(text_channel, "send", None)
                if callable(send):
                    coro = send(f"\U0001f3a4 {speaker_name}: \"{text}\"")
                    if asyncio.iscoroutine(coro):
@@ -321,19 +345,39 @@ class VoiceSession:

        # Dispatch to Claude. send_message is sync subprocess, run on
        # a worker thread so the loop stays responsive for mirror/TTS.
+        response_text = ""
        try:
-            await asyncio.to_thread(
+            result = await asyncio.to_thread(
                self._route_message,
-                str(self.channel_id),
+                str(self.text_channel_id),
                str(speaker_id),
                text,
                None,                       # model
                voice_stream_callback,      # on_text
                "discord-voice",            # adapter_name
            )
+            if isinstance(result, tuple) and result:
+                response_text = result[0] or ""
        except Exception as e:  # noqa: BLE001
            log.error("route_message voice path failed: %s", e)

+        # 3. Mirror Echo's reply back into the text channel so voice + text
+        # stay aligned. Resolved per-send to avoid stale refs after reconnect.
+        if self.mirror_enabled and response_text and response_text.strip():
+            reply_channel = await self._resolve_text_channel()
+            if reply_channel is not None:
+                from src.adapters._text_chunks import split_message
+                try:
+                    for chunk in split_message(response_text):
+                        send = getattr(reply_channel, "send", None)
+                        if not callable(send):
+                            break
+                        coro = send(chunk)
+                        if asyncio.iscoroutine(coro):
+                            await coro
+                except Exception as e:  # noqa: BLE001
+                    log.warning("voice echo-reply mirror send failed: %s", e)
+
    async def _handle_voice_change(
        self, speaker_name: str, original_text: str, new_voice: str,
    ) -> None:
@@ -354,18 +398,20 @@ class VoiceSession:
        except Exception as e:  # noqa: BLE001
            log.warning("voice default persist failed: %s", e)
        # 3. Mirror what was heard + show the swap in the text channel.
-        if self.mirror_enabled and self.text_channel is not None:
-            try:
-                send = getattr(self.text_channel, "send", None)
-                if callable(send):
-                    coro = send(
-                        f"\U0001f3a4 {speaker_name}: \"{original_text}\"\n"
-                        f"\U0001f50a Voce → **{new_voice}**"
-                    )
-                    if asyncio.iscoroutine(coro):
-                        await coro
-            except Exception as e:  # noqa: BLE001
-                log.warning("voice mirror send failed: %s", e)
+        if self.mirror_enabled:
+            text_channel = await self._resolve_text_channel()
+            if text_channel is not None:
+                try:
+                    send = getattr(text_channel, "send", None)
+                    if callable(send):
+                        coro = send(
+                            f"\U0001f3a4 {speaker_name}: \"{original_text}\"\n"
+                            f"\U0001f50a Voce → **{new_voice}**"
+                        )
+                        if asyncio.iscoroutine(coro):
+                            await coro
+                except Exception as e:  # noqa: BLE001
+                    log.warning("voice mirror send failed: %s", e)
        # 4. Verbal acknowledgment in the NEW voice.
        try:
            self.ttsq.push_text(f"Vocea {new_voice}.")
@@ -391,8 +437,18 @@ class VoiceSession:
        return str(speaker_id)

    def _log_metric(self, payload: dict) -> None:
-        """Append a structured event to ``logs/voice_metrics.jsonl``."""
-        event = {"ts": time.time(), "channel_id": self.channel_id, **payload}
+        """Append a structured event to ``logs/voice_metrics.jsonl``.
+
+        ``claude_session_key`` is the channel id used to key the unified
+        Claude session (text channel where the user invoked /voice join);
+        ``voice_channel_id`` is the actual Discord voice channel id.
+        """
+        event = {
+            "ts": time.time(),
+            "claude_session_key": str(self.text_channel_id),
+            "voice_channel_id": self.voice_channel_id,
+            **payload,
+        }
        try:
            LOGS_DIR.mkdir(parents=True, exist_ok=True)
            with open(VOICE_METRICS_PATH, "a", buffering=1, encoding="utf-8") as f: