"""Detect in-band voice commands from STT transcripts. The voice pipeline transcribes Marius's speech via Whisper and dispatches the text to Claude. Some utterances are not questions for Claude — they're control commands for the voice stack itself. This module parses those out *before* the Claude round-trip so they take effect instantly and don't waste a Claude session turn. Currently handled: * change TTS voice — "schimbă vocea pe M5", "vorbește cu vocea F3", "voce em cinci", "voce feminină 3", etc. The parser is intentionally conservative: it requires BOTH a voice trigger word ("voce", "vorbește", "schimbă", "treci pe") AND a recognizable voice ID. A bare "M5" without context is NOT a command — Marius might be quoting a string. """ from __future__ import annotations import re from typing import Optional _VALID_VOICES = {f"M{i}" for i in range(1, 6)} | {f"F{i}" for i in range(1, 6)} # Trigger words that suggest the user is talking ABOUT the voice, not just # saying something that happens to contain a voice-ID-looking substring. _VOICE_TRIGGER_RE = re.compile( r'\b(voce|vocea|voci|voice|vorbe[șs]te|schimb[aăÎ]|treci\s+pe)\b', re.IGNORECASE, ) # Direct form: "M5", "F 3", "m5", etc. _VOICE_ID_DIRECT_RE = re.compile( r'\b([MF])\s*([1-5])\b', re.IGNORECASE, ) # Word form: "em cinci", "M trei", "masculin doi", "feminină patru", etc. # Whisper often transcribes "M5" as "em cinci" / "M cinci" because letter # names are spelled out phonetically in Romanian. _VOICE_ID_WORDS_RE = re.compile( r'\b(em|m|masculin[aăe]?|ef|f|feminin[aăe]?)\s+(unu|una|doi|dou[ăa]|trei|patru|cinci|[1-5])\b', re.IGNORECASE, ) _DIGIT_WORD_TO_INT = { 'unu': 1, 'una': 1, 'unul': 1, '1': 1, 'doi': 2, 'două': 2, 'doua': 2, '2': 2, 'trei': 3, '3': 3, 'patru': 4, '4': 4, 'cinci': 5, '5': 5, } # Substring fallback: matches digit roots even when Whisper glues them into # compound non-words like "Mâcinci" (for "M cinci"=M5). _DIGIT_SUBSTR_RE = re.compile( r'(cinci|patru|trei|dou[ăa]|unul|unu|una)', re.IGNORECASE, ) _F_GENDER_HINT_RE = re.compile(r'feminin|\bef\b|\bF\d?\b', re.IGNORECASE) def _normalize_gender(word: str) -> Optional[str]: """Map gender word to 'M' or 'F'.""" w = word.lower() if w in ('m', 'em') or w.startswith('masculin'): return 'M' if w in ('f', 'ef') or w.startswith('feminin'): return 'F' return None def detect_voice_change(text: str) -> Optional[str]: """Parse a transcript for a 'change voice' command. Returns the target voice id (one of M1-M5, F1-F5) or None if no command was detected. Requires both a voice trigger word and a voice ID. """ if not text: return None if not _VOICE_TRIGGER_RE.search(text): return None # Try the direct form first (M5, F3, etc.) m = _VOICE_ID_DIRECT_RE.search(text) if m: candidate = f"{m.group(1).upper()}{m.group(2)}" if candidate in _VALID_VOICES: return candidate # Fall back to the word form ("em cinci", "feminin trei", ...). m = _VOICE_ID_WORDS_RE.search(text) if m: gender = _normalize_gender(m.group(1)) digit = _DIGIT_WORD_TO_INT.get(m.group(2).lower()) if gender is not None and digit is not None: candidate = f"{gender}{digit}" if candidate in _VALID_VOICES: return candidate # Permissive fallback: Whisper sometimes glues the letter into the next # word ("Mâcinci" for "M cinci") or replaces it ("unul cinci" for # "M unu cinci"). After a voice trigger word, scan for any digit-word # substring and infer gender (F if a feminine marker is present, else M). digit_hits = _DIGIT_SUBSTR_RE.findall(text) digits = [_DIGIT_WORD_TO_INT[d.lower()] for d in digit_hits if d.lower() in _DIGIT_WORD_TO_INT] digits = [d for d in digits if 1 <= d <= 5] if digits: gender = 'F' if _F_GENDER_HINT_RE.search(text) else 'M' # Last digit wins — handles "M unu cinci" → M5 since "unu" is a # mangled letter-name prefix, "cinci" is the actual target. return f"{gender}{digits[-1]}" return None __all__ = ["detect_voice_change"]