feat(voice): Pas 2 — install voice deps, vendor discord-ext-voice-recv, setup assets

Foundation pentru Discord voice-to-voice pipeline. - requirements.txt: faster-whisper, silero-vad, num2words, numpy, PyNaCl - vendor/discord-ext-voice-recv/: vendored la commit ac04ea7b09 (bump version 0.5.3a) — Discord voice protocol fragil, upstream hobby fork. Adapter layer in src/voice/_discord_voice_adapter.py izolează churn (swap la py-cord = doar acel fișier rescris). VENDOR_INFO.md documentează update procedure. - tools/voice_setup.py: idempotent setup script — libopus check, ffmpeg check, Supertonic reachable, faster-whisper/silero-vad warm, assets generation. Exit 0 = green, 1 = needs human (currently libopus missing needs `sudo apt install -y libopus0`). - assets/voice/: thinking.wav (filler "Stai puțin să-mi adun gândurile", ~2.8s), mhm.wav (listener noise), beep_200ms.wav (wake-up tone 880Hz). - src/voice/__init__.py: package stub. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-27 14:42:27 +00:00
parent c6d11bdf9f
commit af5af8133f
32 changed files with 4259 additions and 0 deletions
--- a/tools/voice_setup.py
+++ b/tools/voice_setup.py
@@ -0,0 +1,273 @@
+"""
+voice_setup.py — One-shot setup for Discord voice pipeline.
+
+Run after `pip install -r requirements.txt`. Idempotent.
+
+Steps:
+1. Verify libopus is loaded by discord.py (prints the `apt install libopus0` hint if missing)
+2. Verify ffmpeg in PATH
+3. Verify Supertonic TTS reachable at :7788
+4. Warm faster-whisper small int8 (downloads to ~/.cache/huggingface/ if cold)
+5. Warm silero-vad
+6. Generate assets/voice/{beep_200ms,mhm,thinking}.wav via Supertonic + ffmpeg
+
+Exit code: 0 = all green, 1 = something needs human intervention.
+"""
+
+from __future__ import annotations
+
+import os
+import shutil
+import subprocess
+import sys
+import time
+import urllib.request
+import urllib.error
+import json
+from pathlib import Path
+
+
+# Parent of the directory that contains this (resolved) file.
+REPO_ROOT = Path(__file__).resolve().parent.parent
+# Directory where the generated voice WAV assets are stored.
+ASSETS_DIR = REPO_ROOT / "assets" / "voice"
+# Endpoint that receives the POSTed synthesis requests.
+SUPERTONIC_URL = "http://127.0.0.1:7788/v1/audio/speech"
+# Value sent as the "voice" field of every synthesis request.
+SUPERTONIC_VOICE = "M2"
+
+# ANSI escape sequences used to colour the status tags printed by this script.
+GREEN = "\033[32m"
+RED = "\033[31m"
+YELLOW = "\033[33m"
+RESET = "\033[0m"
+
+
+def _ok(msg: str) -> None:
+    """Print ``msg`` to stdout behind a green ``[ OK ]`` tag."""
+    tag = f"{GREEN}[ OK ]{RESET}"
+    # print() joins its arguments with a single space, matching the layout
+    # "<tag> <message>".
+    print(tag, msg)
+
+
+def _fail(msg: str) -> None:
+    """Print ``msg`` to stdout behind a red ``[FAIL]`` tag."""
+    tag = f"{RED}[FAIL]{RESET}"
+    # print() joins its arguments with a single space, matching the layout
+    # "<tag> <message>".
+    print(tag, msg)
+
+
+def _warn(msg: str) -> None:
+    """Print ``msg`` to stdout behind a yellow ``[WARN]`` tag."""
+    tag = f"{YELLOW}[WARN]{RESET}"
+    # print() joins its arguments with a single space, matching the layout
+    # "<tag> <message>".
+    print(tag, msg)
+
+
+def check_libopus() -> bool:
+    """Verify that discord.py has the Opus codec library loaded.
+
+    If the library is not loaded yet, one attempt is made to load it through
+    ``discord.opus._load_default()`` before giving up.
+
+    Returns:
+        True when ``discord.opus.is_loaded()`` reports the library as loaded
+        (immediately or after the fallback attempt); False when discord.py
+        cannot be imported or libopus still is not loaded.
+    """
+    try:
+        import discord
+    except ImportError:
+        _fail("discord.py not installed — run `pip install -r requirements.txt`")
+        return False
+
+    if discord.opus.is_loaded():
+        _ok("libopus loaded (discord.py)")
+        return True
+
+    # Best-effort fallback. `_load_default` is a private discord.py helper, so
+    # any error it raises (including the attribute being absent) is kept for
+    # the report below instead of aborting the remaining checks.
+    load_error: Exception | None = None
+    try:
+        discord.opus._load_default()
+    except Exception as e:
+        load_error = e
+
+    if discord.opus.is_loaded():
+        _ok("libopus loaded after fallback")
+        return True
+
+    detail = f" (load error: {load_error})" if load_error is not None else ""
+    _fail(
+        "libopus NOT loaded — Discord voice will fail silent. "
+        f"Run: sudo apt install -y libopus0{detail}"
+    )
+    return False
+
+
+def check_ffmpeg() -> bool:
+    """Report whether an ``ffmpeg`` executable can be found on PATH.
+
+    Returns:
+        True (after printing its location) when ``shutil.which`` finds
+        ffmpeg, False (after printing a failure line) otherwise.
+    """
+    ffmpeg_path = shutil.which("ffmpeg")
+    if ffmpeg_path:
+        _ok(f"ffmpeg at {ffmpeg_path}")
+        return True
+    _fail("ffmpeg not in PATH — required for audio asset generation")
+    return False
+
+
+def check_supertonic() -> bool:
+    """Probe the Supertonic TTS endpoint with a tiny synthesis request.
+
+    Returns:
+        True only when the POST to ``SUPERTONIC_URL`` is answered with
+        HTTP 200. False (after printing a failure line) for any other status,
+        a failed connection, or a timeout.
+    """
+    body = json.dumps(
+        {
+            "model": "supertonic-3",
+            "input": "test",
+            "voice": SUPERTONIC_VOICE,
+            "response_format": "wav",
+            "lang": "ro",
+        }
+    ).encode("utf-8")
+    req = urllib.request.Request(
+        SUPERTONIC_URL,
+        data=body,
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=5) as resp:
+            status = resp.status
+    except urllib.error.HTTPError as e:
+        # HTTPError subclasses URLError, so it has to be handled first: the
+        # server is up and answered, it just rejected the request.
+        _fail(f"Supertonic returned HTTP {e.code} at {SUPERTONIC_URL}")
+        return False
+    except (urllib.error.URLError, OSError) as e:
+        # OSError also covers a timeout raised while waiting for the response,
+        # which urlopen does not wrap in URLError.
+        _fail(f"Supertonic unreachable at :7788 — {e}. Start: systemctl --user start supertonic-tts")
+        return False
+
+    if status == 200:
+        _ok(f"Supertonic up at {SUPERTONIC_URL}")
+        return True
+    _fail(f"Supertonic returned non-200 (HTTP {status})")
+    return False
+
+
+def warm_whisper() -> bool:
+    """Instantiate the faster-whisper ``small`` int8 CPU model once.
+
+    The model object is discarded; only the success of constructing it and
+    the elapsed time are reported.
+
+    Returns:
+        True when the model was constructed, False when faster-whisper is not
+        importable or construction raised.
+    """
+    try:
+        from faster_whisper import WhisperModel
+    except ImportError:
+        _fail("faster-whisper not installed")
+        return False
+
+    print("    Warming faster-whisper small int8 (downloads if cold)...")
+    warmed = False
+    started = time.perf_counter()
+    try:
+        WhisperModel("small", device="cpu", compute_type="int8", cpu_threads=4)
+        took = time.perf_counter() - started
+        _ok(f"faster-whisper small int8 warm ({took:.1f}s)")
+        warmed = True
+    except Exception as exc:
+        _fail(f"faster-whisper warm failed: {exc}")
+    return warmed
+
+
+def warm_silero() -> bool:
+    """Load the silero-vad model once and report how long it took.
+
+    Returns:
+        True when ``load_silero_vad()`` completed, False when silero-vad is
+        not importable or loading raised.
+    """
+    try:
+        from silero_vad import load_silero_vad
+    except ImportError:
+        _fail("silero-vad not installed")
+        return False
+
+    print("    Warming silero-vad...")
+    warmed = False
+    started = time.perf_counter()
+    try:
+        load_silero_vad()
+        took = time.perf_counter() - started
+        _ok(f"silero-vad warm ({took:.1f}s)")
+        warmed = True
+    except Exception as exc:
+        _fail(f"silero-vad warm failed: {exc}")
+    return warmed
+
+
+def _supertonic_synth(text: str, out_path: Path) -> bool:
+    """Synthesise ``text`` with Supertonic and store the WAV at ``out_path``.
+
+    Args:
+        text: Sentence sent as the ``input`` field of the request.
+        out_path: Destination file; its parent directories are created on
+            demand.
+
+    Returns:
+        True once a WAV response has been written to ``out_path``; False
+        (after printing a failure line) on any error, including a response
+        body that is not WAV audio.
+    """
+    payload = {
+        "model": "supertonic-3",
+        "input": text,
+        "voice": SUPERTONIC_VOICE,
+        "response_format": "wav",
+        "lang": "ro",
+    }
+    req = urllib.request.Request(
+        SUPERTONIC_URL,
+        data=json.dumps(payload).encode("utf-8"),
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    # Stage into a sibling file and rename afterwards, so a failed write never
+    # leaves a truncated asset at out_path for a later run to mistake as done.
+    staging = out_path.with_name(out_path.name + ".part")
+    try:
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            wav_bytes = resp.read()
+        # An empty body or an error payload must not be saved as an asset.
+        if wav_bytes[:4] != b"RIFF" or wav_bytes[8:12] != b"WAVE":
+            raise ValueError(f"response is not WAV audio ({len(wav_bytes)} bytes)")
+        out_path.parent.mkdir(parents=True, exist_ok=True)
+        staging.write_bytes(wav_bytes)
+        staging.replace(out_path)
+        return True
+    except Exception as e:
+        # Best-effort cleanup of the staging file; it may not exist at all.
+        try:
+            staging.unlink()
+        except OSError:
+            pass
+        _fail(f"Supertonic synth failed for {out_path.name}: {e}")
+        return False
+
+
+def gen_thinking_wav() -> bool:
+    """Ensure ``thinking.wav`` exists in the assets directory.
+
+    A file larger than 1024 bytes is treated as already generated; otherwise
+    the filler phrase is synthesised through Supertonic.
+
+    Returns:
+        True when the asset already existed or was generated, False when
+        synthesis failed.
+    """
+    wav = ASSETS_DIR / "thinking.wav"
+    already_there = wav.exists() and wav.stat().st_size > 1024
+    if not already_there:
+        print("    Generating thinking.wav via Supertonic...")
+        if not _supertonic_synth("Stai puțin să-mi adun gândurile.", wav):
+            return False
+        _ok(f"thinking.wav generated ({wav.stat().st_size} bytes)")
+        return True
+    _ok(f"thinking.wav exists ({wav.stat().st_size} bytes)")
+    return True
+
+
+def gen_mhm_wav() -> bool:
+    """Ensure ``mhm.wav`` exists in the assets directory.
+
+    A file larger than 512 bytes is treated as already generated; otherwise
+    the short utterance is synthesised through Supertonic.
+
+    Returns:
+        True when the asset already existed or was generated, False when
+        synthesis failed.
+    """
+    wav = ASSETS_DIR / "mhm.wav"
+    already_there = wav.exists() and wav.stat().st_size > 512
+    if not already_there:
+        print("    Generating mhm.wav via Supertonic...")
+        if not _supertonic_synth("Mhm.", wav):
+            return False
+        _ok(f"mhm.wav generated ({wav.stat().st_size} bytes)")
+        return True
+    _ok(f"mhm.wav exists ({wav.stat().st_size} bytes)")
+    return True
+
+
+def gen_beep_wav() -> bool:
+    """Create ``beep_200ms.wav`` with ffmpeg unless a usable copy exists.
+
+    A file larger than 512 bytes is treated as already generated. Otherwise
+    ffmpeg renders the tone described by the arguments below: an 880 Hz sine,
+    0.2 s long at 48 kHz, faded out over the last 50 ms, volume 0.3, two
+    channels.
+
+    Returns:
+        True when the asset already existed or was generated; False (after
+        printing a failure line) when ffmpeg could not be started, exited
+        non-zero, timed out, or left no output file.
+    """
+    path = ASSETS_DIR / "beep_200ms.wav"
+    if path.exists() and path.stat().st_size > 512:
+        _ok(f"beep_200ms.wav exists ({path.stat().st_size} bytes)")
+        return True
+    print("    Generating beep_200ms.wav via ffmpeg (880Hz sine, 200ms)...")
+    cmd = [
+        "ffmpeg",
+        "-y",
+        "-loglevel",
+        "error",
+        "-f",
+        "lavfi",
+        "-i",
+        "sine=frequency=880:duration=0.2:sample_rate=48000",
+        "-af",
+        "afade=t=out:st=0.15:d=0.05,volume=0.3",
+        "-ac",
+        "2",
+        str(path),
+    ]
+    try:
+        path.parent.mkdir(parents=True, exist_ok=True)
+        # stdin is detached so ffmpeg cannot wait on (or consume) terminal
+        # input; the timeout keeps a wedged ffmpeg from hanging the setup.
+        subprocess.run(cmd, check=True, stdin=subprocess.DEVNULL, timeout=30)
+        _ok(f"beep_200ms.wav generated ({path.stat().st_size} bytes)")
+        return True
+    except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError) as e:
+        # OSError covers ffmpeg being absent/unlaunchable, an unwritable
+        # assets directory, and a missing output file at stat() time.
+        _fail(f"ffmpeg beep gen failed: {e}")
+        return False
+
+
+def main() -> int:
+    """Run every setup step in order and print a summary.
+
+    The Supertonic-backed assets are only attempted when the Supertonic probe
+    passed, and the beep only when ffmpeg was found; skipped assets count as
+    failures.
+
+    Returns:
+        0 when every check and asset step succeeded, 1 otherwise.
+    """
+    print("voice_setup.py — Discord voice pipeline setup\n")
+
+    libopus_ok = check_libopus()
+    ffmpeg_ok = check_ffmpeg()
+    supertonic_ok = check_supertonic()
+    whisper_ok = warm_whisper()
+    silero_ok = warm_silero()
+
+    if supertonic_ok:
+        thinking_ok = gen_thinking_wav()
+        mhm_ok = gen_mhm_wav()
+    else:
+        _warn("Skipping thinking.wav / mhm.wav generation — Supertonic down")
+        thinking_ok = mhm_ok = False
+
+    if ffmpeg_ok:
+        beep_ok = gen_beep_wav()
+    else:
+        _warn("Skipping beep_200ms.wav — ffmpeg missing")
+        beep_ok = False
+
+    # Insertion order is the reporting order.
+    results = {
+        "libopus": libopus_ok,
+        "ffmpeg": ffmpeg_ok,
+        "Supertonic": supertonic_ok,
+        "faster-whisper": whisper_ok,
+        "silero-vad": silero_ok,
+        "thinking.wav": thinking_ok,
+        "mhm.wav": mhm_ok,
+        "beep_200ms.wav": beep_ok,
+    }
+
+    print()
+    failed = [name for name, passed in results.items() if not passed]
+    if not failed:
+        print(f"{GREEN}ALL GREEN{RESET} ({len(results)} checks). Voice pipeline ready.")
+        return 0
+
+    print(f"{RED}FAILED:{RESET} {len(failed)}/{len(results)} — fix above before /voice join works:")
+    for name in failed:
+        print(f"  - {name}")
+    return 1
+
+
+if __name__ == "__main__":
+    # Propagate main()'s return value as the process exit status.
+    raise SystemExit(main())