# echo-core/src/voice/voice_commands.py

"""Detect in-band voice commands from STT transcripts.

The voice pipeline transcribes Marius's speech via Whisper and dispatches the
text to Claude. Some utterances are not questions for Claude — they're
control commands for the voice stack itself. This module parses those out
*before* the Claude round-trip so they take effect instantly and don't waste
a Claude session turn.

Currently handled:
  * change TTS voice — "schimbă vocea pe M5", "vorbește cu vocea F3",
    "voce em cinci", "voce feminină 3", etc.

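The parser is intentionally conservative: it requires BOTH a voice trigger
word ("voce", "vorbește", "schimbă", "treci pe") AND a recognizable voice
ID. A bare "M5" without context is NOT a command — Marius might be quoting
a string. The one deliberate loosening is a last-resort fallback: when a
trigger word is present but no voice ID parses cleanly, any spelled-out
Romanian digit found in the transcript selects that voice number, defaulting
to a male voice unless a feminine marker is also present.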
"""
from __future__ import annotations

import re
from typing import Optional


# The ten selectable TTS voice ids: "M1".."M5" and "F1".."F5".
_VALID_VOICES = {f"{letter}{number}" for letter in "MF" for number in range(1, 6)}


# Trigger words that suggest the user is talking ABOUT the voice, not just
# saying something that happens to contain a voice-ID-looking substring.
#
# "vorbește" is accepted with a plain "s", with the comma-below "ș" (U+0219)
# and with the cedilla "ş" (U+015F). The two accented letters render almost
# identically, so a transcript may carry either one; they are written as
# escapes here so they cannot be confused when this pattern is edited.
# NOTE(review): the "Î" inside the "schimb" alternative is kept unchanged for
# backward compatibility; it looks unintended — confirm with the author.
_VOICE_TRIGGER_RE = re.compile(
    r'\b(voce|vocea|voci|voice|vorbe[s\u0219\u015f]te|schimb[aăÎ]|treci\s+pe)\b',
    re.IGNORECASE,
)

# Compact spelling of a voice id: a standalone "M" or "F" (any case),
# optional whitespace, then a single digit 1-5 — e.g. "M5", "F 3", "m5".
# Group 1 captures the letter, group 2 the digit.
_VOICE_ID_DIRECT_RE = re.compile(r'\b([MF])\s*([1-5])\b', flags=re.I)

# Word form: "em cinci", "M trei", "masculin doi", "feminină patru", etc.
# Whisper often transcribes "M5" as "em cinci" / "M cinci" because letter
# names are spelled out phonetically in Romanian.
# Group 1 is the gender token, group 2 the digit word or numeral. The digit
# alternation includes "unul" so that every spelling of "one" known to
# _DIGIT_WORD_TO_INT can be captured here rather than only by the permissive
# substring fallback.
_VOICE_ID_WORDS_RE = re.compile(
    r'\b(em|m|masculin[aăe]?|ef|f|feminin[aăe]?)\s+(unul|unu|una|doi|dou[ăa]|trei|patru|cinci|[1-5])\b',
    re.IGNORECASE,
)


# Every accepted spelling of the digits 1-5 — Romanian number words (with and
# without diacritics) plus the bare numeral — mapped to its integer value.
_DIGIT_WORD_TO_INT = {
    spelling: value
    for value, spellings in (
        (1, ('unu', 'una', 'unul', '1')),
        (2, ('doi', 'două', 'doua', '2')),
        (3, ('trei', '3')),
        (4, ('patru', '4')),
        (5, ('cinci', '5')),
    )
    for spelling in spellings
}

# Substring fallback: matches digit roots even when Whisper glues them into
# compound non-words like "Mâcinci" (for "M cinci"=M5).
# No word boundaries on purpose — the root may sit inside a longer token.
# "doi" is listed alongside "două"/"doua" so that voice number 2 is reachable
# through this path just like 1, 3, 4 and 5. "unul" precedes "unu" so the
# longer spelling is captured whole.
_DIGIT_SUBSTR_RE = re.compile(
    r'(cinci|patru|trei|dou[ăa]|doi|unul|unu|una)',
    re.IGNORECASE,
)

# Any of these marks the request as targeting a female voice: the stem
# "feminin" anywhere in the text, a standalone "ef", or a standalone "F"
# optionally followed by a single digit. Matching ignores case.
_F_GENDER_HINT_RE = re.compile(r'feminin|\bef\b|\bF\d?\b', flags=re.I)


def _normalize_gender(word: str) -> Optional[str]:
    """Translate a gender token into the voice-id letter 'M' or 'F'.

    Matching is case-insensitive. "m" / "em" and anything beginning with
    "masculin" give 'M'; "f" / "ef" and anything beginning with "feminin"
    give 'F'. Any other token yields None.
    """
    token = word.lower()
    # One row per gender: (voice letter, exact letter spellings, word stem).
    rules = (
        ('M', ('m', 'em'), 'masculin'),
        ('F', ('f', 'ef'), 'feminin'),
    )
    for letter, letter_names, stem in rules:
        if token in letter_names or token.startswith(stem):
            return letter
    return None


def detect_voice_change(text: str) -> Optional[str]:
    """Extract the target voice from a 'change voice' transcript, if any.

    Nothing is reported unless the transcript contains a voice trigger word.
    Once it does, three strategies are tried in order and the first hit wins:

    1. a compact id such as "M5" or "f 3";
    2. a gender token followed by a digit word or numeral, such as
       "em cinci" or "feminină 3";
    3. a permissive fallback that takes the LAST spelled-out digit found
       anywhere in the transcript and pairs it with 'F' when a feminine
       marker is present, 'M' otherwise.

    Returns the voice id ("M1".."M5" or "F1".."F5"), or None when the text
    is empty, has no trigger word, or yields no digit at all.
    """
    # Guard: empty transcript, or the user is not talking about the voice.
    if not text or _VOICE_TRIGGER_RE.search(text) is None:
        return None

    # Strategy 1 — compact id (M5, F3, ...).
    direct = _VOICE_ID_DIRECT_RE.search(text)
    if direct is not None:
        letter, number = direct.groups()
        voice_id = letter.upper() + number
        if voice_id in _VALID_VOICES:
            return voice_id

    # Strategy 2 — spelled-out form ("em cinci", "feminin trei", ...).
    spoken = _VOICE_ID_WORDS_RE.search(text)
    if spoken is not None:
        letter = _normalize_gender(spoken.group(1))
        number = _DIGIT_WORD_TO_INT.get(spoken.group(2).lower())
        if letter is not None and number is not None:
            voice_id = f"{letter}{number}"
            if voice_id in _VALID_VOICES:
                return voice_id

    # Strategy 3 — permissive fallback. Whisper sometimes glues the letter
    # into the next word ("Mâcinci" for "M cinci") or replaces it ("unul
    # cinci" for "M unu cinci"), so look for digit roots as plain substrings.
    # Keeping only the last usable hit handles "M unu cinci" → M5: "unu" is
    # a mangled letter-name prefix, "cinci" is the actual target.
    last_number = None
    for hit in _DIGIT_SUBSTR_RE.findall(text):
        value = _DIGIT_WORD_TO_INT.get(hit.lower())
        if value is not None and 1 <= value <= 5:
            last_number = value
    if last_number is None:
        return None

    # No explicit gender could be parsed: F only if a feminine marker is
    # present somewhere in the transcript, otherwise default to M.
    letter = 'F' if _F_GENDER_HINT_RE.search(text) else 'M'
    return f"{letter}{last_number}"


# Public API: only the detector is exported; the regexes, lookup table and
# gender helper above are implementation details.
__all__ = ["detect_voice_change"]