feat(voice): voice-mode prompt, isolated session, units, verbal voice swap, fast barge-in
Second voice UX iteration. Targets Marius's live-test pain points from today.
- **Voice-mode system prompt** (personality/VOICE_MODE.md, plumbed via
claude_session.build_system_prompt(voice_mode=True)) — when the voice
adapter starts a session, append voice-tailored instructions: short replies,
no markdown, no abbreviations, time without seconds, distances rounded
to "mii"/"milioane", no curly quotes / em-dash / ellipsis. Marius asked
for an "in-the-car friend" persona for voice.
- **Isolated voice session key** (router.py) — voice mode uses
`voice:<channel_id>` so it doesn't share context with the text adapter
on the same Discord channel. Fresh start, voice prompt applied
automatically without `/clear` ceremony. `/clear` drops both keys.
- **Metric units + Romanian thousands** (src/voice/normalize.py) —
`384.000 km` was being read as "trei sute optzeci și patru virgulă zero
zero zero km" because the dot was treated as decimal separator and `km`
wasn't expanded. New `normalize_thousands` collapses Romanian thousands
separators (`X.000`/`X.000.000`) before number expansion, and
`expand_units` handles km/kg/cm/mm/ml/ha/mp with correct Romanian
pluralization ("un kilometru", "două kilograme", "douăzeci de
centimetri", "o sută de kilometri" with "de" particle).
- **`/voice setvoice <M1-F5>` slash command** (discord_voice.py) — Discord
native autocomplete; swaps the live TTSQueue voice_id AND persists
voice.default_voice to config.json. No restart needed.
- **Verbal voice change** (src/voice/voice_commands.py — new module +
29 tests) — say "schimbă vocea pe M5" / "vorbește cu vocea F3" / "voce
em cinci" from inside the voice channel. Detector requires both a
trigger word (voce/vorbește/schimbă/treci pe) and a recognizable voice
ID (direct "M5", word form "em cinci", or fallback substring match for
Whisper-mangled forms like "unul cinci"=M5 and "Mâcinci"=M5). On
detection: live-swap, persist to config, mirror to chat with
`🎤 ... / 🔊 Voce → M5`, speak short ack in the NEW voice, skip
Claude. "pământinci" still can't be recovered (no recoverable digit
substring); in that case the utterance is passed through to Claude unchanged.
- **Whisper initial_prompt** now lists the voice-command vocabulary so
STT biases toward producing clean "M5" / "F3" tokens instead of
inventing "pământ" / "unul" phonetic neighbors.
- **Fast barge-in** (pipeline.py EchoVoiceSink) — previously `ttsq.clear()`
only fired in `on_segment_done` (after 800ms silence + 2-3s STT ≈ 3s lag).
Now also fires from the sink as soon as VAD detects ≥2 consecutive
windows (~200ms) of sustained speech from Marius's user while Echo has
pending TTS frames. Single-window glitches don't cut Echo off; sustained
speech does. (Acoustic echo bleed-through still requires headphones —
no AEC in the bot.)
- Tests: 130 voice + router tests pass; updated test_router.py to expect
`/clear` to drop both text and voice session keys.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -30,7 +30,10 @@ class TestClearCommand:
|
||||
response, is_cmd = route_message("ch-1", "user-1", "/clear")
|
||||
assert response == "Session cleared. Model reset to sonnet."
|
||||
assert is_cmd is True
|
||||
mock_clear.assert_called_once_with("ch-1")
|
||||
# /clear drops both the text-adapter session and the isolated voice
|
||||
# session for the same Discord channel.
|
||||
mock_clear.assert_any_call("ch-1")
|
||||
mock_clear.assert_any_call("voice:ch-1")
|
||||
|
||||
@patch("src.router._get_config")
|
||||
@patch("src.router.clear_session")
|
||||
@@ -191,7 +194,7 @@ class TestRegularMessage:
|
||||
response, is_cmd = route_message("ch-1", "user-1", "hello")
|
||||
assert response == "Hello from Claude!"
|
||||
assert is_cmd is False
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="sonnet", on_text=None)
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="sonnet", on_text=None, voice_mode=False)
|
||||
|
||||
@patch("src.router.send_message")
|
||||
def test_model_override(self, mock_send):
|
||||
@@ -199,7 +202,7 @@ class TestRegularMessage:
|
||||
response, is_cmd = route_message("ch-1", "user-1", "hello", model="opus")
|
||||
assert response == "Response"
|
||||
assert is_cmd is False
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="opus", on_text=None)
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="opus", on_text=None, voice_mode=False)
|
||||
|
||||
@patch("src.router._get_channel_config")
|
||||
@patch("src.router._get_config")
|
||||
@@ -227,7 +230,7 @@ class TestRegularMessage:
|
||||
|
||||
cb = lambda t: None
|
||||
route_message("ch-1", "user-1", "hello", on_text=cb)
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="sonnet", on_text=cb)
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="sonnet", on_text=cb, voice_mode=False)
|
||||
|
||||
|
||||
# --- _get_channel_config ---
|
||||
@@ -269,7 +272,7 @@ class TestModelResolution:
|
||||
mock_chan_cfg.return_value = {"id": "ch-1", "default_model": "haiku"}
|
||||
|
||||
route_message("ch-1", "user-1", "hello")
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="haiku", on_text=None)
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="haiku", on_text=None, voice_mode=False)
|
||||
|
||||
@patch("src.router._get_channel_config")
|
||||
@patch("src.router._get_config")
|
||||
@@ -283,7 +286,7 @@ class TestModelResolution:
|
||||
mock_get_config.return_value = mock_cfg
|
||||
|
||||
route_message("ch-1", "user-1", "hello")
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="opus", on_text=None)
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="opus", on_text=None, voice_mode=False)
|
||||
|
||||
@patch("src.router._get_channel_config")
|
||||
@patch("src.router._get_config")
|
||||
@@ -297,7 +300,7 @@ class TestModelResolution:
|
||||
mock_get_config.return_value = mock_cfg
|
||||
|
||||
route_message("ch-1", "user-1", "hello")
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="sonnet", on_text=None)
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="sonnet", on_text=None, voice_mode=False)
|
||||
|
||||
@patch("src.router.get_active_session")
|
||||
@patch("src.router.send_message")
|
||||
@@ -307,4 +310,4 @@ class TestModelResolution:
|
||||
mock_get_session.return_value = {"model": "opus", "session_id": "abc"}
|
||||
|
||||
route_message("ch-1", "user-1", "hello")
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="opus", on_text=None)
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="opus", on_text=None, voice_mode=False)
|
||||
|
||||
55
tests/test_voice_commands.py
Normal file
55
tests/test_voice_commands.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""Tests for src/voice/voice_commands.detect_voice_change."""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from src.voice.voice_commands import detect_voice_change
|
||||
|
||||
|
||||
class TestDetectVoiceChange:
    """Behavioral checks for ``detect_voice_change``.

    Every case feeds one transcript string to the detector and compares the
    result against the expected voice ID, or ``None`` when the utterance
    should not be treated as a voice-change command.
    """

    # Utterances that contain the voice ID literally ("M5", "F3", ...).
    DIRECT_CASES = [
        ("schimbă vocea pe M5", "M5"),
        ("Schimbă vocea pe F3.", "F3"),
        ("vorbește cu vocea M1", "M1"),
        ("vorbește cu vocea F2", "F2"),
        ("voce M4", "M4"),
        ("Voce F5.", "F5"),
        ("treci pe vocea F1", "F1"),
        ("Echo, treci pe M2.", "M2"),
        ("voice M3", "M3"),
    ]

    # Utterances with the ID spelled out in words (what Whisper actually
    # produces).
    WORD_CASES = [
        ("schimbă vocea pe em cinci", "M5"),
        ("vorbește cu vocea em trei", "M3"),
        ("voce em unu", "M1"),
        ("schimbă vocea pe ef doi", "F2"),
        ("voce ef cinci", "F5"),
        ("vorbește cu vocea masculină cinci", "M5"),
        ("schimbă vocea pe feminină trei", "F3"),
        ("voce masculin patru", "M4"),
        ("schimbă vocea pe M cinci", "M5"),
        ("voce F două", "F2"),
    ]

    # Utterances that must NOT be detected as a voice change.
    REJECTED_CASES = [
        "",
        "cât este ora",
        "M5",  # no trigger word
        "Salut Echo, sunt în M3",  # M3 here is a location/etc, no trigger
        "vocea ta este foarte bună",  # trigger but no voice id
        "schimbă te rog",  # trigger but no id
        "voce M6",  # out of range
        "voce M0",  # out of range
        "voce F8",  # out of range
        "schimbă vocea pe șapte",  # digit out of range
    ]

    # --- positive cases (direct form) ---
    @pytest.mark.parametrize(("text", "expected"), DIRECT_CASES)
    def test_direct_form(self, text, expected):
        """A trigger phrase plus a literal voice ID yields that ID."""
        detected = detect_voice_change(text)
        assert detected == expected

    # --- positive cases (word form, what Whisper actually produces) ---
    @pytest.mark.parametrize(("text", "expected"), WORD_CASES)
    def test_word_form(self, text, expected):
        """A trigger phrase plus a spelled-out voice ID yields the canonical ID."""
        detected = detect_voice_change(text)
        assert detected == expected

    # --- negative cases ---
    @pytest.mark.parametrize("text", REJECTED_CASES)
    def test_no_match(self, text):
        """Missing trigger, missing ID, or out-of-range ID yields ``None``."""
        detected = detect_voice_change(text)
        assert detected is None
|
||||
Reference in New Issue
Block a user