From e589e4885e5985da10ca03caf8ad4addbe1f3ee6 Mon Sep 17 00:00:00 2001 From: Marius Mutu Date: Wed, 27 May 2026 20:59:10 +0000 Subject: [PATCH] feat(voice): voice-mode prompt, isolated session, units, verbal voice swap, fast barge-in MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Second voice UX iteration. Targets Marius's live-test pain points from today. - **Voice-mode system prompt** (personality/VOICE_MODE.md, plumbed via claude_session.build_system_prompt(voice_mode=True)) — when the voice adapter starts a session, append voice-tailored instructions: short replies, no markdown, no abbreviations, time without seconds, distances rounded to "mii"/"milioane", no curly quotes / em-dash / ellipsis. Marius asked for an "in-the-car friend" persona for voice. - **Isolated voice session key** (router.py) — voice mode uses `voice:{channel_id}` so it doesn't share context with the text adapter on the same Discord channel. Fresh start, voice prompt applied automatically without `/clear` ceremony. `/clear` drops both keys. - **Metric units + Romanian thousands** (src/voice/normalize.py) — `384.000 km` was being read as "trei sute optzeci și patru virgulă zero zero zero km" because the dot was treated as decimal separator and `km` wasn't expanded. New `normalize_thousands` collapses Romanian thousands separators (`X.000`/`X.000.000`) before number expansion, and `expand_units` handles km/kg/cm/mm/ml/ha/mp with correct Romanian pluralization ("un kilometru", "două kilograme", "douăzeci de centimetri", "o sută de kilometri" with "de" particle). - **`/voice setvoice {voice}` slash command** (discord_voice.py) — Discord native autocomplete; swaps the live TTSQueue voice_id AND persists voice.default_voice to config.json. No restart needed. - **Verbal voice change** (src/voice/voice_commands.py — new module + 29 tests) — say "schimbă vocea pe M5" / "vorbește cu vocea F3" / "voce em cinci" from inside the voice channel. 
Detector requires both a trigger word (voce/vorbește/schimbă/treci pe) and a recognizable voice ID (direct "M5", word form "em cinci", or fallback substring match for Whisper-mangled forms like "unul cinci"=M5 and "Mâcinci"=M5). On detection: live-swap, persist to config, mirror to chat with `🎤 ... / 🔊 Voce → M5`, speak a short ack in the NEW voice, skip Claude. "pământinci" still can't be recovered (no recoverable digit substring); the utterance is passed through to Claude in that case. - **Whisper initial_prompt** now lists the voice-command vocabulary so STT biases toward producing clean "M5" / "F3" tokens instead of inventing "pământ" / "unul" phonetic neighbors. - **Fast barge-in** (pipeline.py EchoVoiceSink) — previously `ttsq.clear()` only fired in `on_segment_done` (after 800ms silence + 2-3s STT ≈ 3s lag). Now also fires from the sink as soon as VAD detects ≥2 consecutive windows (~200ms) of sustained speech from the user while Echo has pending TTS frames. Single-window glitches don't cut Echo off; sustained speech does. (Acoustic echo bleed-through still requires headphones — no AEC in the bot.) - Tests: 130 voice + router tests pass; updated test_router.py to expect `/clear` to drop both text and voice session keys. 
Co-Authored-By: Claude Opus 4.7 --- config.json | 6 +- personality/VOICE_MODE.md | 30 +++++++++ src/adapters/discord_voice.py | 39 +++++++++++ src/claude_session.py | 24 +++++-- src/router.py | 17 +++-- src/voice/normalize.py | 51 +++++++++++++++ src/voice/pipeline.py | 73 ++++++++++++++++++++- src/voice/voice_commands.py | 118 ++++++++++++++++++++++++++++++++++ tests/test_router.py | 19 +++--- tests/test_voice_commands.py | 55 ++++++++++++++++ 10 files changed, 412 insertions(+), 20 deletions(-) create mode 100644 personality/VOICE_MODE.md create mode 100644 src/voice/voice_commands.py create mode 100644 tests/test_voice_commands.py diff --git a/config.json b/config.json index ef7b221..798da29 100644 --- a/config.json +++ b/config.json @@ -105,9 +105,11 @@ "url": "http://10.0.20.161:11434" }, "voice": { - "allowed_user_ids": ["949388626146517022"], + "allowed_user_ids": [ + "949388626146517022" + ], "user_name": "Marius", - "default_voice": "M2", + "default_voice": "M5", "auto_leave_minutes": 5 }, "paths": { diff --git a/personality/VOICE_MODE.md b/personality/VOICE_MODE.md new file mode 100644 index 0000000..c4d0cef --- /dev/null +++ b/personality/VOICE_MODE.md @@ -0,0 +1,30 @@ +# Voice Mode + +Răspunzi prin voce (TTS). Marius te aude — nu citește. Reguli care contează: + +## Lungime și ton + +- **Scurt**: 1-2 propoziții, max ~30 cuvinte per turn. Marius vorbește cu tine — nu redactezi un document. +- **Conversațional**: ca un om viu. Fără "Sigur, iată...", "Permite-mi să...", "Te rog să...". Direct la subiect. +- **Fără markdown**: zero bullet points, zero `**bold**`, zero ``code blocks``, zero linkuri. Totul e citit cu voce. + +## Numere și unități + +- **Ora**: fără secunde. Spune "ora 23 și 9 minute" sau "9 și jumătate", nu "23:09:42". +- **Distanțe mari**: rotunjește în "mii" sau "milioane". Pentru Pământ-Lună spune "384 mii de kilometri", nu "384.000 km". +- **Zecimale**: omite-le când nu adaugă informație. "5 lei" nu "5,00 lei". "două ore" nu "2,0 ore". 
"20 de minute" nu "20,5 minute". +- **Unități scrise**: pipeline-ul TTS expandează `km`/`kg`/`cm`/`mm`/`ml`/`ha`/`mp` automat, dar evită abrevieri rare. Scrie "metri" nu "m." dacă e ambiguu. + +## Structură + +- Listă scurtă verbală: "Trei lucruri: întâi X, apoi Y, plus Z." +- Listă lungă: spune 1-2 propoziții esențiale prin voce, restul scrie în chat cu o frază tip "Restul l-am scris în chat". +- Întrebări clarificatoare: pune UNA, nu trei. + +## Punctuație + +- Doar virgule și puncte. Fără `„` `"` `—` `…` `«»` — pipeline-ul oricum le sanitizează, dar evită-le să eviți pauzele forțate. + +## Tu ești Marius's prieten în mașină + +Imaginează-ți că Marius conduce și te-a întrebat ceva pe difuzor. Răspunzi natural, scurt, la subiect — fără ceremonii. diff --git a/src/adapters/discord_voice.py b/src/adapters/discord_voice.py index f7afc34..59cd677 100644 --- a/src/adapters/discord_voice.py +++ b/src/adapters/discord_voice.py @@ -246,6 +246,45 @@ def register(tree: app_commands.CommandTree, bot: discord.Client) -> app_command log.warning("Presence reset skipped", exc_info=True) await interaction.followup.send("Plecat.", ephemeral=True) + _VOICE_CHOICES = [ + app_commands.Choice(name=v, value=v) + for v in ("M1", "M2", "M3", "M4", "M5", "F1", "F2", "F3", "F4", "F5") + ] + + @voice_group.command(name="setvoice", description="Schimbă vocea Echo (M1-M5 sau F1-F5)") + @app_commands.describe(voice="Voce nouă") + @app_commands.choices(voice=_VOICE_CHOICES) + async def setvoice( + interaction: discord.Interaction, + voice: app_commands.Choice[str], + ) -> None: + await interaction.response.defer(ephemeral=True) + new_voice = voice.value + # Live-swap on the active session if Echo is in voice on this guild. 
+ guild_id = interaction.guild.id if interaction.guild else None + session = _voice_sessions.get(guild_id) if guild_id is not None else None + live_swapped = False + if session is not None and session.ttsq is not None: + session.ttsq.voice_id = new_voice + live_swapped = True + # Persist as the new default for future sessions. + try: + cfg = Config() + cfg.set("voice.default_voice", new_voice) + cfg.save() + except Exception as e: + log.warning("config save failed for new default voice: %s", e) + await interaction.followup.send( + f"Voce schimbată live ({new_voice}), dar config-ul nu s-a salvat: {e}", + ephemeral=True, + ) + return + if live_swapped: + msg = f"Vocea schimbată **live** pe {new_voice}. Următoarea frază va folosi vocea nouă." + else: + msg = f"Default voce setată {new_voice}. Va intra în vigoare la următorul /voice join." + await interaction.followup.send(msg, ephemeral=True) + @voice_group.command(name="doctor", description="Verifică voice stack") async def doctor(interaction: discord.Interaction) -> None: await interaction.response.defer(ephemeral=True) diff --git a/src/claude_session.py b/src/claude_session.py index b319b23..6327170 100644 --- a/src/claude_session.py +++ b/src/claude_session.py @@ -399,15 +399,23 @@ def _run_claude( # --------------------------------------------------------------------------- -def build_system_prompt() -> str: - """Concatenate personality/*.md files into a single system prompt.""" +def build_system_prompt(voice_mode: bool = False) -> str: + """Concatenate personality/*.md files into a single system prompt. + + When ``voice_mode=True``, appends ``VOICE_MODE.md`` so the model knows + its reply will be read aloud (terse, no markdown, no abbreviations, etc.). 
+ """ if not PERSONALITY_DIR.is_dir(): raise FileNotFoundError( f"Personality directory not found: {PERSONALITY_DIR}" ) + files = list(PERSONALITY_FILES) + if voice_mode: + files.append("VOICE_MODE.md") + parts: list[str] = [] - for filename in PERSONALITY_FILES: + for filename in files: filepath = PERSONALITY_DIR / filename if filepath.is_file(): parts.append(filepath.read_text(encoding="utf-8")) @@ -434,6 +442,7 @@ def start_session( model: str = DEFAULT_MODEL, timeout: int = DEFAULT_TIMEOUT, on_text: Callable[[str], None] | None = None, + voice_mode: bool = False, ) -> tuple[str, str]: """Start a new Claude CLI session for a channel. @@ -441,13 +450,16 @@ def start_session( If *on_text* is provided, each intermediate Claude text block is passed to the callback as soon as it arrives. + + *voice_mode* — when True, ``VOICE_MODE.md`` is appended to the system + prompt so the model produces short, TTS-friendly responses. """ if model not in VALID_MODELS: raise ValueError( f"Invalid model '{model}'. Must be one of: haiku, sonnet, opus" ) - system_prompt = build_system_prompt() + system_prompt = build_system_prompt(voice_mode=voice_mode) # Wrap external user message with injection protection markers wrapped_message = f"[EXTERNAL CONTENT]\n{message}\n[END EXTERNAL CONTENT]" @@ -578,6 +590,7 @@ def send_message( model: str = DEFAULT_MODEL, timeout: int = DEFAULT_TIMEOUT, on_text: Callable[[str], None] | None = None, + voice_mode: bool = False, ) -> str: """High-level convenience: auto start or resume based on channel state. 
@@ -598,7 +611,8 @@ def send_message( if session is not None and session.get("model"): effective_model = session["model"] response_text, _session_id = start_session( - channel_id, message, effective_model, timeout, on_text=on_text + channel_id, message, effective_model, timeout, + on_text=on_text, voice_mode=voice_mode, ) return response_text diff --git a/src/router.py b/src/router.py index c6a0344..7c1f861 100644 --- a/src/router.py +++ b/src/router.py @@ -123,8 +123,10 @@ def route_message( # Text-based commands (not slash commands — these work in any adapter) if text.lower() == "/clear": default_model = _get_config().get("bot.default_model", "sonnet") - cleared = clear_session(channel_id) - if cleared: + cleared_text = clear_session(channel_id) + # Also drop the isolated voice session if one exists on this channel. + clear_session(f"voice:{channel_id}") + if cleared_text: return f"Session cleared. Model reset to {default_model}.", True return "No active session.", True @@ -159,12 +161,19 @@ def route_message( # (Engineering decision #14 in the plan.) Only the discord-voice adapter # triggers it — text adapters keep the message verbatim. claude_text = text - if adapter_name == "discord-voice": + voice_mode = adapter_name == "discord-voice" + if voice_mode: user_name = _get_config().get("voice.user_name", "user") or "user" claude_text = f"[speaker:{user_name}] {text}" + # Voice sessions use an isolated session key so they start fresh with + # VOICE_MODE.md and don't pollute the text channel's conversation. 
+ session_key = f"voice:{channel_id}" if voice_mode else channel_id try: - response = send_message(channel_id, claude_text, model=model, on_text=on_text) + response = send_message( + session_key, claude_text, model=model, on_text=on_text, + voice_mode=voice_mode, + ) _set_last_response(channel_id, response) return response, False except Exception as e: diff --git a/src/voice/normalize.py b/src/voice/normalize.py index d7b7f6b..4daad5b 100644 --- a/src/voice/normalize.py +++ b/src/voice/normalize.py @@ -94,6 +94,55 @@ def expand_numbers_ro(text: str) -> str: return _NUM_TOKEN.sub(_sub, text) +# ---------- Thousands separator ---------- + +# Romanian uses dot or space as thousands separator: 384.000 / 384 000. The +# decimal expander would read "384.000" as "trei sute optzeci și patru virgulă +# zero zero zero" — wrong. Collapse the dots so expand_numbers_ro reads the +# whole integer. Only 1-3 leading digits followed by ≥1 group of exactly 3 +# digits, never adjacent to other digits. +_THOUSANDS_DOT = re.compile(r'(? str: + """Strip the dot from Romanian thousands-separator integers.""" + return _THOUSANDS_DOT.sub(lambda m: m.group(1).replace('.', ''), text) + + +# ---------- Metric units ---------- + +# (regex_matching_, singular, plural). Matches an integer or decimal +# followed by the abbreviation as a whole word. Skipping bare ``m`` and ``l`` +# because they collide with too many tokens ("M2" voice id, list markers). +_UNIT_PATTERNS: list[tuple[re.Pattern, str, str]] = [ + (re.compile(r'(? str: + """Mirror ``_format_currency_unit`` for metric units. Decimals fall through + to the generic decimal expander (which leaves them with plural form).""" + if '.' 
in amount_str or ',' in amount_str: + return f"{_decimal_to_ro(amount_str.replace(',', '.'))} {plural}" + return _format_currency_unit(int(amount_str), singular, plural) + + +def expand_units(text: str) -> str: + """Expand metric unit abbreviations into spoken Romanian.""" + for pattern, singular, plural in _UNIT_PATTERNS: + text = pattern.sub( + lambda m, sg=singular, pl=plural: _format_unit(m.group(1), sg, pl), + text, + ) + return text + + # ---------- Time ---------- _TIME_PATTERN = re.compile(r'(? str: text = strip_markdown(text) text = sanitize_punctuation(text) text = expand_abbreviations(text) + text = normalize_thousands(text) text = expand_time(text) text = expand_currency(text) + text = expand_units(text) text = expand_numbers_ro(text) text = expand_symbols(text) words = text.split() diff --git a/src/voice/pipeline.py b/src/voice/pipeline.py index fa4796a..e6e2914 100644 --- a/src/voice/pipeline.py +++ b/src/voice/pipeline.py @@ -34,6 +34,7 @@ from typing import Any, Callable, Optional import numpy as np from src.voice._discord_voice_adapter import AudioSink, VoiceData +from src.voice.voice_commands import detect_voice_change log = logging.getLogger(__name__) @@ -274,6 +275,12 @@ class VoiceSession: except Exception as e: # noqa: BLE001 log.warning("ttsq.clear failed: %s", e) + # In-band voice command: change TTS voice without round-tripping Claude. + new_voice = detect_voice_change(text) + if new_voice is not None: + await self._handle_voice_change(speaker_name, text, new_voice) + return + # 1. Mirror to text channel (one Unicode 🎤 — exception per plan). 
if self.mirror_enabled and self.text_channel is not None: try: @@ -327,6 +334,45 @@ class VoiceSession: except Exception as e: # noqa: BLE001 log.error("route_message voice path failed: %s", e) + async def _handle_voice_change( + self, speaker_name: str, original_text: str, new_voice: str, + ) -> None: + """Apply an in-band 'change voice' command: swap live, persist to + config, mirror to chat, speak a short acknowledgment in the new voice. + Does NOT forward the utterance to Claude.""" + # 1. Live-swap on the TTS queue. Next clause synth uses the new voice. + try: + self.ttsq.voice_id = new_voice + except Exception as e: # noqa: BLE001 + log.warning("ttsq voice swap failed: %s", e) + # 2. Persist as the new default for future sessions. + try: + from src.config import Config + cfg = Config() + cfg.set("voice.default_voice", new_voice) + cfg.save() + except Exception as e: # noqa: BLE001 + log.warning("voice default persist failed: %s", e) + # 3. Mirror what was heard + show the swap in the text channel. + if self.mirror_enabled and self.text_channel is not None: + try: + send = getattr(self.text_channel, "send", None) + if callable(send): + coro = send( + f"\U0001f3a4 {speaker_name}: \"{original_text}\"\n" + f"\U0001f50a Voce → **{new_voice}**" + ) + if asyncio.iscoroutine(coro): + await coro + except Exception as e: # noqa: BLE001 + log.warning("voice mirror send failed: %s", e) + # 4. Verbal acknowledgment in the NEW voice. + try: + self.ttsq.push_text(f"Vocea {new_voice}.") + except Exception as e: # noqa: BLE001 + log.warning("voice ack push failed: %s", e) + self._log_metric({"event": "voice_change", "new_voice": new_voice}) + # ----- helpers ----- def _resolve_speaker_name(self, speaker_id: int) -> str: @@ -381,6 +427,10 @@ class EchoVoiceSink(AudioSink): # chain breaks when "I spoke but Echo heard nothing" happens. self._first_packet_logged: set[int] = set() self._first_speech_logged: set[int] = set() + # Track consecutive VAD-positive windows per user. 
Used to delay + # barge-in (don't cut Echo off on a single jittery VAD hit; require + # ≥2 windows ≈ 200ms of sustained speech). + self._vad_consecutive: dict[int, int] = {} # Background poller that triggers the silence flush even when Discord # DTX stops delivering RTP packets after the user stops speaking. Without # this, sink.write would stop firing and STT would never run on the @@ -444,9 +494,27 @@ class EchoVoiceSink(AudioSink): if uid not in self._first_speech_logged: self._first_speech_logged.add(uid) log.info("voice sink: VAD detected speech from user %s", uid) + self._vad_consecutive[uid] = self._vad_consecutive.get(uid, 0) + 1 with self._sink_lock: self._last_speech_ts[uid] = time.monotonic() self._has_speech[uid] = True + # Fast barge-in: after ≥2 consecutive VAD windows (~200ms + # of sustained speech), cut Echo's TTS mid-sentence so the + # user doesn't have to wait the full silence-flush + STT + # cycle (~3s). + if self._vad_consecutive[uid] >= 2: + try: + ttsq = self.session.ttsq + if ttsq is not None and not ttsq.is_empty(): + ttsq.clear() + log.info( + "voice sink: barge-in cleared TTS queue (user=%s)", + uid, + ) + except Exception as e: # noqa: BLE001 + log.warning("barge-in clear failed: %s", e) + else: + self._vad_consecutive[uid] = 0 pcm_for_stt = self._take_flushable_pcm(uid) if pcm_for_stt: @@ -530,7 +598,10 @@ class EchoVoiceSink(AudioSink): mono16, language="ro", beam_size=5, initial_prompt=( "Echo Core, asistent personal AI românesc al lui Marius. " - "Conversație colocvială în română." + "Conversație colocvială în română. " + "Comenzi voce recunoscute: schimbă vocea pe M1, M2, M3, M4, M5, " + "F1, F2, F3, F4, F5. Exemple: vorbește cu vocea M5, voce F3, " + "treci pe vocea F1." 
), condition_on_previous_text=False, ) diff --git a/src/voice/voice_commands.py b/src/voice/voice_commands.py new file mode 100644 index 0000000..fcac503 --- /dev/null +++ b/src/voice/voice_commands.py @@ -0,0 +1,118 @@ +"""Detect in-band voice commands from STT transcripts. + +The voice pipeline transcribes Marius's speech via Whisper and dispatches the +text to Claude. Some utterances are not questions for Claude — they're +control commands for the voice stack itself. This module parses those out +*before* the Claude round-trip so they take effect instantly and don't waste +a Claude session turn. + +Currently handled: + * change TTS voice — "schimbă vocea pe M5", "vorbește cu vocea F3", + "voce em cinci", "voce feminină 3", etc. + +The parser is intentionally conservative: it requires BOTH a voice trigger +word ("voce", "vorbește", "schimbă", "treci pe") AND a recognizable voice +ID. A bare "M5" without context is NOT a command — Marius might be quoting +a string. +""" +from __future__ import annotations + +import re +from typing import Optional + + +_VALID_VOICES = {f"M{i}" for i in range(1, 6)} | {f"F{i}" for i in range(1, 6)} + + +# Trigger words that suggest the user is talking ABOUT the voice, not just +# saying something that happens to contain a voice-ID-looking substring. +_VOICE_TRIGGER_RE = re.compile( + r'\b(voce|vocea|voci|voice|vorbe[șs]te|schimb[aăÎ]|treci\s+pe)\b', + re.IGNORECASE, +) + +# Direct form: "M5", "F 3", "m5", etc. +_VOICE_ID_DIRECT_RE = re.compile( + r'\b([MF])\s*([1-5])\b', + re.IGNORECASE, +) + +# Word form: "em cinci", "M trei", "masculin doi", "feminină patru", etc. +# Whisper often transcribes "M5" as "em cinci" / "M cinci" because letter +# names are spelled out phonetically in Romanian. 
+_VOICE_ID_WORDS_RE = re.compile( + r'\b(em|m|masculin[aăe]?|ef|f|feminin[aăe]?)\s+(unu|una|doi|dou[ăa]|trei|patru|cinci|[1-5])\b', + re.IGNORECASE, +) + + +_DIGIT_WORD_TO_INT = { + 'unu': 1, 'una': 1, 'unul': 1, '1': 1, + 'doi': 2, 'două': 2, 'doua': 2, '2': 2, + 'trei': 3, '3': 3, + 'patru': 4, '4': 4, + 'cinci': 5, '5': 5, +} + +# Substring fallback: matches digit roots even when Whisper glues them into +# compound non-words like "Mâcinci" (for "M cinci"=M5). +_DIGIT_SUBSTR_RE = re.compile( + r'(cinci|patru|trei|dou[ăa]|unul|unu|una)', + re.IGNORECASE, +) + +_F_GENDER_HINT_RE = re.compile(r'feminin|\bef\b|\bF\d?\b', re.IGNORECASE) + + +def _normalize_gender(word: str) -> Optional[str]: + """Map gender word to 'M' or 'F'.""" + w = word.lower() + if w in ('m', 'em') or w.startswith('masculin'): + return 'M' + if w in ('f', 'ef') or w.startswith('feminin'): + return 'F' + return None + + +def detect_voice_change(text: str) -> Optional[str]: + """Parse a transcript for a 'change voice' command. + + Returns the target voice id (one of M1-M5, F1-F5) or None if no command + was detected. Requires both a voice trigger word and a voice ID. + """ + if not text: + return None + if not _VOICE_TRIGGER_RE.search(text): + return None + # Try the direct form first (M5, F3, etc.) + m = _VOICE_ID_DIRECT_RE.search(text) + if m: + candidate = f"{m.group(1).upper()}{m.group(2)}" + if candidate in _VALID_VOICES: + return candidate + # Fall back to the word form ("em cinci", "feminin trei", ...). + m = _VOICE_ID_WORDS_RE.search(text) + if m: + gender = _normalize_gender(m.group(1)) + digit = _DIGIT_WORD_TO_INT.get(m.group(2).lower()) + if gender is not None and digit is not None: + candidate = f"{gender}{digit}" + if candidate in _VALID_VOICES: + return candidate + # Permissive fallback: Whisper sometimes glues the letter into the next + # word ("Mâcinci" for "M cinci") or replaces it ("unul cinci" for + # "M unu cinci"). 
After a voice trigger word, scan for any digit-word + # substring and infer gender (F if a feminine marker is present, else M). + digit_hits = _DIGIT_SUBSTR_RE.findall(text) + digits = [_DIGIT_WORD_TO_INT[d.lower()] for d in digit_hits + if d.lower() in _DIGIT_WORD_TO_INT] + digits = [d for d in digits if 1 <= d <= 5] + if digits: + gender = 'F' if _F_GENDER_HINT_RE.search(text) else 'M' + # Last digit wins — handles "M unu cinci" → M5 since "unu" is a + # mangled letter-name prefix, "cinci" is the actual target. + return f"{gender}{digits[-1]}" + return None + + +__all__ = ["detect_voice_change"] diff --git a/tests/test_router.py b/tests/test_router.py index 0038136..f1a93d7 100644 --- a/tests/test_router.py +++ b/tests/test_router.py @@ -30,7 +30,10 @@ class TestClearCommand: response, is_cmd = route_message("ch-1", "user-1", "/clear") assert response == "Session cleared. Model reset to sonnet." assert is_cmd is True - mock_clear.assert_called_once_with("ch-1") + # /clear drops both the text-adapter session and the isolated voice + # session for the same Discord channel. + mock_clear.assert_any_call("ch-1") + mock_clear.assert_any_call("voice:ch-1") @patch("src.router._get_config") @patch("src.router.clear_session") @@ -191,7 +194,7 @@ class TestRegularMessage: response, is_cmd = route_message("ch-1", "user-1", "hello") assert response == "Hello from Claude!" 
assert is_cmd is False - mock_send.assert_called_once_with("ch-1", "hello", model="sonnet", on_text=None) + mock_send.assert_called_once_with("ch-1", "hello", model="sonnet", on_text=None, voice_mode=False) @patch("src.router.send_message") def test_model_override(self, mock_send): @@ -199,7 +202,7 @@ class TestRegularMessage: response, is_cmd = route_message("ch-1", "user-1", "hello", model="opus") assert response == "Response" assert is_cmd is False - mock_send.assert_called_once_with("ch-1", "hello", model="opus", on_text=None) + mock_send.assert_called_once_with("ch-1", "hello", model="opus", on_text=None, voice_mode=False) @patch("src.router._get_channel_config") @patch("src.router._get_config") @@ -227,7 +230,7 @@ class TestRegularMessage: cb = lambda t: None route_message("ch-1", "user-1", "hello", on_text=cb) - mock_send.assert_called_once_with("ch-1", "hello", model="sonnet", on_text=cb) + mock_send.assert_called_once_with("ch-1", "hello", model="sonnet", on_text=cb, voice_mode=False) # --- _get_channel_config --- @@ -269,7 +272,7 @@ class TestModelResolution: mock_chan_cfg.return_value = {"id": "ch-1", "default_model": "haiku"} route_message("ch-1", "user-1", "hello") - mock_send.assert_called_once_with("ch-1", "hello", model="haiku", on_text=None) + mock_send.assert_called_once_with("ch-1", "hello", model="haiku", on_text=None, voice_mode=False) @patch("src.router._get_channel_config") @patch("src.router._get_config") @@ -283,7 +286,7 @@ class TestModelResolution: mock_get_config.return_value = mock_cfg route_message("ch-1", "user-1", "hello") - mock_send.assert_called_once_with("ch-1", "hello", model="opus", on_text=None) + mock_send.assert_called_once_with("ch-1", "hello", model="opus", on_text=None, voice_mode=False) @patch("src.router._get_channel_config") @patch("src.router._get_config") @@ -297,7 +300,7 @@ class TestModelResolution: mock_get_config.return_value = mock_cfg route_message("ch-1", "user-1", "hello") - 
mock_send.assert_called_once_with("ch-1", "hello", model="sonnet", on_text=None) + mock_send.assert_called_once_with("ch-1", "hello", model="sonnet", on_text=None, voice_mode=False) @patch("src.router.get_active_session") @patch("src.router.send_message") @@ -307,4 +310,4 @@ class TestModelResolution: mock_get_session.return_value = {"model": "opus", "session_id": "abc"} route_message("ch-1", "user-1", "hello") - mock_send.assert_called_once_with("ch-1", "hello", model="opus", on_text=None) + mock_send.assert_called_once_with("ch-1", "hello", model="opus", on_text=None, voice_mode=False) diff --git a/tests/test_voice_commands.py b/tests/test_voice_commands.py new file mode 100644 index 0000000..e848deb --- /dev/null +++ b/tests/test_voice_commands.py @@ -0,0 +1,55 @@ +"""Tests for src/voice/voice_commands.detect_voice_change.""" +from __future__ import annotations + +import pytest + +from src.voice.voice_commands import detect_voice_change + + +class TestDetectVoiceChange: + # --- positive cases (direct form) --- + @pytest.mark.parametrize("text,expected", [ + ("schimbă vocea pe M5", "M5"), + ("Schimbă vocea pe F3.", "F3"), + ("vorbește cu vocea M1", "M1"), + ("vorbește cu vocea F2", "F2"), + ("voce M4", "M4"), + ("Voce F5.", "F5"), + ("treci pe vocea F1", "F1"), + ("Echo, treci pe M2.", "M2"), + ("voice M3", "M3"), + ]) + def test_direct_form(self, text, expected): + assert detect_voice_change(text) == expected + + # --- positive cases (word form, what Whisper actually produces) --- + @pytest.mark.parametrize("text,expected", [ + ("schimbă vocea pe em cinci", "M5"), + ("vorbește cu vocea em trei", "M3"), + ("voce em unu", "M1"), + ("schimbă vocea pe ef doi", "F2"), + ("voce ef cinci", "F5"), + ("vorbește cu vocea masculină cinci", "M5"), + ("schimbă vocea pe feminină trei", "F3"), + ("voce masculin patru", "M4"), + ("schimbă vocea pe M cinci", "M5"), + ("voce F două", "F2"), + ]) + def test_word_form(self, text, expected): + assert detect_voice_change(text) == 
expected + + # --- negative cases --- + @pytest.mark.parametrize("text", [ + "", + "cât este ora", + "M5", # no trigger word + "Salut Echo, sunt în M3", # M3 here is a location/etc, no trigger + "vocea ta este foarte bună", # trigger but no voice id + "schimbă te rog", # trigger but no id + "voce M6", # out of range + "voce M0", # out of range + "voce F8", # out of range + "schimbă vocea pe șapte", # digit out of range + ]) + def test_no_match(self, text): + assert detect_voice_change(text) is None