Files
echo-core/tools/voice_setup.py
Marius Mutu af5af8133f feat(voice): Pas 2 — install voice deps, vendor discord-ext-voice-recv, setup assets
Foundation pentru Discord voice-to-voice pipeline.

- requirements.txt: faster-whisper, silero-vad, num2words, numpy, PyNaCl
- vendor/discord-ext-voice-recv/: vendored la commit ac04ea7b09 (bump version
  0.5.3a) — Discord voice protocol fragil, upstream hobby fork. Adapter layer
  in src/voice/_discord_voice_adapter.py izolează churn (swap la py-cord =
  doar acel fișier rescris). VENDOR_INFO.md documentează update procedure.
- tools/voice_setup.py: idempotent setup script — libopus check, ffmpeg
  check, Supertonic reachable, faster-whisper/silero-vad warm, assets
  generation. Exit 0 = all green, 1 = needs human intervention (currently:
  libopus is missing and needs `sudo apt install -y libopus0`).
- assets/voice/: thinking.wav (filler "Stai puțin să-mi adun gândurile",
  ~2.8s), mhm.wav (listener noise), beep_200ms.wav (wake-up tone 880Hz).
- src/voice/__init__.py: package stub.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-27 14:42:27 +00:00

274 lines
7.8 KiB
Python

"""
voice_setup.py — One-shot setup for Discord voice pipeline.
Run after `pip install -r requirements.txt`. Idempotent.
Steps:
1. Verify libopus is loaded by discord.py (if missing, prints the `sudo apt install -y libopus0` hint; nothing is installed automatically)
2. Verify ffmpeg in PATH
3. Verify Supertonic TTS reachable at :7788
4. Warm faster-whisper small int8 (downloads to ~/.cache/huggingface/ if cold)
5. Warm silero-vad
6. Generate assets/voice/{beep_200ms,mhm,thinking}.wav via Supertonic + ffmpeg
Exit code: 0 = all green, 1 = something needs human intervention.
"""
from __future__ import annotations
import os
import shutil
import subprocess
import sys
import time
import urllib.request
import urllib.error
import json
from pathlib import Path
# Repository root: two directory levels above this file's resolved location.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Directory where the generated voice assets are looked up and written.
ASSETS_DIR = REPO_ROOT / "assets" / "voice"
# Supertonic speech endpoint this script POSTs its requests to.
SUPERTONIC_URL = "http://127.0.0.1:7788/v1/audio/speech"
# Value sent as the "voice" field of every Supertonic request.
SUPERTONIC_VOICE = "M2"
# ANSI escape sequences used to colour the status tags printed below.
GREEN = "\033[32m"
RED = "\033[31m"
YELLOW = "\033[33m"
RESET = "\033[0m"
def _ok(msg: str) -> None:
    """Print *msg* to stdout behind a green ``[ OK ]`` tag."""
    line = f"{GREEN}[ OK ]{RESET} {msg}"
    print(line)
def _fail(msg: str) -> None:
    """Print *msg* to stdout behind a red ``[FAIL]`` tag."""
    line = f"{RED}[FAIL]{RESET} {msg}"
    print(line)
def _warn(msg: str) -> None:
    """Print *msg* to stdout behind a yellow ``[WARN]`` tag."""
    line = f"{YELLOW}[WARN]{RESET} {msg}"
    print(line)
def check_libopus() -> bool:
    """Report whether discord.py has the Opus library loaded.

    Returns:
        True when ``discord.opus.is_loaded()`` is true, either immediately or
        after a single ``discord.opus._load_default()`` attempt. False when
        discord.py cannot be imported or Opus is still not loaded afterwards.
    """
    try:
        import discord
    except ImportError:
        _fail("discord.py not installed — run `pip install -r requirements.txt`")
        return False

    opus = discord.opus
    if not opus.is_loaded():
        # Best-effort fallback: any error raised here is ignored on purpose,
        # because the is_loaded() re-check below decides the outcome.
        try:
            opus._load_default()
        except Exception:
            pass

        if not opus.is_loaded():
            _fail(
                "libopus NOT loaded — Discord voice will fail silent. "
                "Run: sudo apt install -y libopus0"
            )
            return False

        _ok("libopus loaded after fallback")
        return True

    _ok("libopus loaded (discord.py)")
    return True
def check_ffmpeg() -> bool:
    """Report whether an ``ffmpeg`` executable can be found on PATH.

    Returns:
        True (after printing the resolved path) when ``shutil.which`` finds
        ffmpeg, False (after printing a failure line) otherwise.
    """
    # Resolve once: the reported path is then exactly the one that was tested,
    # and PATH is not scanned a second time just to build the message.
    ffmpeg_path = shutil.which("ffmpeg")
    if not ffmpeg_path:
        _fail("ffmpeg not in PATH — required for audio asset generation")
        return False
    _ok(f"ffmpeg at {ffmpeg_path}")
    return True
def check_supertonic() -> bool:
    """Probe the Supertonic TTS endpoint with a one-word synthesis request.

    POSTs a JSON payload to ``SUPERTONIC_URL`` with a 5 second timeout.

    Returns:
        True only when the server answers HTTP 200. False (after printing a
        failure line) when the server answers with any other status or the
        request fails at the transport/protocol level.
    """
    # Local import: the file's top-level imports do not bind ``http.client``.
    import http.client

    body = json.dumps(
        {
            "model": "supertonic-3",
            "input": "test",
            "voice": SUPERTONIC_VOICE,
            "response_format": "wav",
            "lang": "ro",
        }
    ).encode("utf-8")
    req = urllib.request.Request(
        SUPERTONIC_URL,
        data=body,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=5) as resp:
            status = resp.status
    except urllib.error.HTTPError as e:
        # urlopen raises HTTPError for error statuses: the server is running
        # but rejected the request, so "start the service" would mislead.
        _fail(f"Supertonic returned non-200 (HTTP {e.code})")
        return False
    except (OSError, http.client.HTTPException) as e:
        # OSError covers URLError, ConnectionError and socket timeouts (which
        # can surface unwrapped while waiting for the response); HTTPException
        # covers a peer on that port that does not speak valid HTTP.
        _fail(f"Supertonic unreachable at :7788 — {e}. Start: systemctl --user start supertonic-tts")
        return False

    if status == 200:
        _ok(f"Supertonic up at {SUPERTONIC_URL}")
        return True
    # Successful-but-not-200 answers (e.g. 204) land here.
    _fail(f"Supertonic returned non-200 (HTTP {status})")
    return False
def warm_whisper() -> bool:
    """Instantiate the faster-whisper ``small`` int8 CPU model once.

    The model object is discarded; the call exists only for its loading side
    effect and to time how long it takes.

    Returns:
        True when the model was constructed, False when faster-whisper is not
        importable or construction raised.
    """
    try:
        from faster_whisper import WhisperModel
    except ImportError:
        _fail("faster-whisper not installed")
        return False

    print(" Warming faster-whisper small int8 (downloads if cold)...")
    model_options = {"device": "cpu", "compute_type": "int8", "cpu_threads": 4}
    started = time.perf_counter()
    try:
        WhisperModel("small", **model_options)
        elapsed = time.perf_counter() - started
        _ok(f"faster-whisper small int8 warm ({elapsed:.1f}s)")
    except Exception as e:
        _fail(f"faster-whisper warm failed: {e}")
        return False
    return True
def warm_silero() -> bool:
    """Load the silero-vad model once and report how long it took.

    The loaded model is discarded; only the loading side effect matters here.

    Returns:
        True when ``load_silero_vad()`` completed, False when silero-vad is
        not importable or loading raised.
    """
    try:
        from silero_vad import load_silero_vad
    except ImportError:
        _fail("silero-vad not installed")
        return False

    print(" Warming silero-vad...")
    started = time.perf_counter()
    try:
        load_silero_vad()
        elapsed = time.perf_counter() - started
        _ok(f"silero-vad warm ({elapsed:.1f}s)")
    except Exception as e:
        _fail(f"silero-vad warm failed: {e}")
        return False
    return True
def _supertonic_synth(text: str, out_path: Path) -> bool:
    """Synthesise *text* with Supertonic and store the audio at *out_path*.

    Args:
        text: Sentence sent as the ``input`` field of the request.
        out_path: Destination file; parent directories are created as needed.

    Returns:
        True when a non-empty response body was written to *out_path*, False
        (after printing a failure line) on any error or an empty body.
    """
    payload = {
        "model": "supertonic-3",
        "input": text,
        "voice": SUPERTONIC_VOICE,
        "response_format": "wav",
        "lang": "ro",
    }
    req = urllib.request.Request(
        SUPERTONIC_URL,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    # Write to a sibling temp file and rename it into place, so an interrupted
    # write never leaves a truncated file under the final name (callers only
    # check the file size on later runs and would accept it).
    tmp_path = out_path.with_name(out_path.name + ".part")
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            wav_bytes = resp.read()
        if not wav_bytes:
            # An empty body is not usable audio; do not report it as success.
            _fail(f"Supertonic synth failed for {out_path.name}: empty response body")
            return False
        out_path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path.write_bytes(wav_bytes)
        tmp_path.replace(out_path)
        return True
    except Exception as e:
        _fail(f"Supertonic synth failed for {out_path.name}: {e}")
        return False
    finally:
        # Remove a leftover partial file. After a successful replace() there
        # is nothing to remove, so the resulting error is expected and ignored.
        try:
            tmp_path.unlink()
        except OSError:
            pass
def gen_thinking_wav() -> bool:
    """Make sure ``thinking.wav`` exists in ``ASSETS_DIR``.

    An existing file larger than 1024 bytes is kept as-is; otherwise the
    filler sentence is synthesised through ``_supertonic_synth``.

    Returns:
        True when the file already existed or was generated, False when
        synthesis failed.
    """
    path = ASSETS_DIR / "thinking.wav"
    already_present = path.exists() and path.stat().st_size > 1024
    if already_present:
        _ok(f"thinking.wav exists ({path.stat().st_size} bytes)")
        return True

    print(" Generating thinking.wav via Supertonic...")
    synthesised = _supertonic_synth("Stai puțin să-mi adun gândurile.", path)
    if not synthesised:
        return False
    _ok(f"thinking.wav generated ({path.stat().st_size} bytes)")
    return True
def gen_mhm_wav() -> bool:
    """Make sure ``mhm.wav`` exists in ``ASSETS_DIR``.

    An existing file larger than 512 bytes is kept as-is; otherwise the
    text "Mhm." is synthesised through ``_supertonic_synth``.

    Returns:
        True when the file already existed or was generated, False when
        synthesis failed.
    """
    path = ASSETS_DIR / "mhm.wav"
    already_present = path.exists() and path.stat().st_size > 512
    if already_present:
        _ok(f"mhm.wav exists ({path.stat().st_size} bytes)")
        return True

    print(" Generating mhm.wav via Supertonic...")
    synthesised = _supertonic_synth("Mhm.", path)
    if not synthesised:
        return False
    _ok(f"mhm.wav generated ({path.stat().st_size} bytes)")
    return True
def gen_beep_wav() -> bool:
    """Make sure ``beep_200ms.wav`` exists in ``ASSETS_DIR``.

    An existing file larger than 512 bytes is kept as-is; otherwise ffmpeg
    renders a 200 ms, 880 Hz sine tone with a short fade-out.

    Returns:
        True when the file already existed or was generated, False (after
        printing a failure line) when ffmpeg could not be run or failed.
    """
    path = ASSETS_DIR / "beep_200ms.wav"
    if path.exists() and path.stat().st_size > 512:
        _ok(f"beep_200ms.wav exists ({path.stat().st_size} bytes)")
        return True

    print(" Generating beep_200ms.wav via ffmpeg (880Hz sine, 200ms)...")
    command = [
        "ffmpeg",
        "-y",
        "-loglevel",
        "error",
        "-f",
        "lavfi",
        "-i",
        "sine=frequency=880:duration=0.2:sample_rate=48000",
        "-af",
        "afade=t=out:st=0.15:d=0.05,volume=0.3",
        "-ac",
        "2",
        str(path),
    ]
    try:
        path.parent.mkdir(parents=True, exist_ok=True)
        subprocess.run(command, check=True)
        _ok(f"beep_200ms.wav generated ({path.stat().st_size} bytes)")
        return True
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable ffmpeg binary, a failed
        # mkdir, and a missing output file at stat() time; each of these must
        # yield False rather than a traceback.
        _fail(f"ffmpeg beep gen failed: {e}")
        return False
def main() -> int:
    """Run every setup check in order and print a summary.

    Asset generation is attempted only when its prerequisite check passed:
    Supertonic for ``thinking.wav`` / ``mhm.wav``, ffmpeg for
    ``beep_200ms.wav``. Skipped assets are recorded as failed.

    Returns:
        0 when every check passed, 1 otherwise.
    """
    print(f"voice_setup.py — Discord voice pipeline setup\n")

    # Evaluated in this fixed order so the printed report keeps its sequence.
    libopus_ok = check_libopus()
    ffmpeg_ok = check_ffmpeg()
    supertonic_ok = check_supertonic()
    whisper_ok = warm_whisper()
    silero_ok = warm_silero()

    checks: list[tuple[str, bool]] = [
        ("libopus", libopus_ok),
        ("ffmpeg", ffmpeg_ok),
        ("Supertonic", supertonic_ok),
        ("faster-whisper", whisper_ok),
        ("silero-vad", silero_ok),
    ]

    if supertonic_ok:
        checks += [
            ("thinking.wav", gen_thinking_wav()),
            ("mhm.wav", gen_mhm_wav()),
        ]
    else:
        _warn("Skipping thinking.wav / mhm.wav generation — Supertonic down")
        checks += [("thinking.wav", False), ("mhm.wav", False)]

    if ffmpeg_ok:
        checks.append(("beep_200ms.wav", gen_beep_wav()))
    else:
        _warn("Skipping beep_200ms.wav — ffmpeg missing")
        checks.append(("beep_200ms.wav", False))

    print()
    failed = [name for name, ok in checks if not ok]
    if not failed:
        print(f"{GREEN}ALL GREEN{RESET} ({len(checks)} checks). Voice pipeline ready.")
        return 0

    print(f"{RED}FAILED:{RESET} {len(failed)}/{len(checks)} — fix above before /voice join works:")
    for name in failed:
        print(f" - {name}")
    return 1
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())