Foundation pentru Discord voice-to-voice pipeline. - requirements.txt: faster-whisper, silero-vad, num2words, numpy, PyNaCl - vendor/discord-ext-voice-recv/: vendored la commit ac04ea7b09 (bump version 0.5.3a) — Discord voice protocol fragil, upstream hobby fork. Adapter layer in src/voice/_discord_voice_adapter.py izolează churn (swap la py-cord = doar acel fișier rescris). VENDOR_INFO.md documentează update procedure. - tools/voice_setup.py: idempotent setup script — libopus check, ffmpeg check, Supertonic reachable, faster-whisper/silero-vad warm, assets generation. Exit 0 = green, 1 = needs human (currently libopus missing needs `sudo apt install -y libopus0`). - assets/voice/: thinking.wav (filler "Stai puțin să-mi adun gândurile", ~2.8s), mhm.wav (listener noise), beep_200ms.wav (wake-up tone 880Hz). - src/voice/__init__.py: package stub. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
274 lines
7.8 KiB
Python
274 lines
7.8 KiB
Python
"""
|
|
voice_setup.py — One-shot setup for Discord voice pipeline.
|
|
|
|
Run after `pip install -r requirements.txt`. Idempotent.
|
|
|
|
Steps:
|
|
1. Verify libopus0 loaded by discord.py (apt install libopus0 if missing)
|
|
2. Verify ffmpeg in PATH
|
|
3. Verify Supertonic TTS reachable at :7788
|
|
4. Warm faster-whisper small int8 (downloads to ~/.cache/huggingface/ if cold)
|
|
5. Warm silero-vad
|
|
6. Generate assets/voice/{beep_200ms,mhm,thinking}.wav via Supertonic + ffmpeg
|
|
|
|
Exit code: 0 = all green, 1 = something needs human intervention.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
import urllib.request
|
|
import urllib.error
|
|
import json
|
|
from pathlib import Path
|
|
|
|
|
|
# Two directory levels above this file (the script's grandparent directory).
REPO_ROOT = Path(__file__).resolve().parent.parent
# Directory that receives the generated .wav assets.
ASSETS_DIR = REPO_ROOT.joinpath("assets", "voice")

# Local Supertonic TTS endpoint and the voice id sent with every request.
SUPERTONIC_URL = "http://127.0.0.1:7788/v1/audio/speech"
SUPERTONIC_VOICE = "M2"

# ANSI escape sequences used to colour the status tags printed to the terminal.
GREEN = "\033[32m"
RED = "\033[31m"
YELLOW = "\033[33m"
RESET = "\033[0m"
|
|
|
|
|
|
def _ok(msg: str) -> None:
    """Print *msg* to stdout behind a green ``[ OK ]`` tag."""
    tag = f"{GREEN}[ OK ]{RESET}"
    print(f"{tag} {msg}")
|
|
|
|
|
|
def _fail(msg: str) -> None:
    """Print *msg* to stdout behind a red ``[FAIL]`` tag."""
    tag = f"{RED}[FAIL]{RESET}"
    print(f"{tag} {msg}")
|
|
|
|
|
|
def _warn(msg: str) -> None:
    """Print *msg* to stdout behind a yellow ``[WARN]`` tag."""
    tag = f"{YELLOW}[WARN]{RESET}"
    print(f"{tag} {msg}")
|
|
|
|
|
|
def check_libopus() -> bool:
    """Report whether discord.py has libopus loaded.

    Returns:
        False when ``discord`` cannot be imported, or when
        ``discord.opus.is_loaded()`` is still false after one attempt to load
        the default library; True otherwise. A status line is printed in
        every case.
    """
    try:
        import discord
    except ImportError:
        _fail("discord.py not installed — run `pip install -r requirements.txt`")
        return False

    opus = discord.opus
    if opus.is_loaded():
        _ok("libopus loaded (discord.py)")
        return True

    # Best effort: ask discord.py to load its default libopus. Any error is
    # deliberately ignored because the is_loaded() re-check below decides the
    # outcome. NOTE(review): _load_default is a private discord.py helper.
    try:
        opus._load_default()
    except Exception:
        pass

    if not opus.is_loaded():
        _fail(
            "libopus NOT loaded — Discord voice will fail silent. "
            "Run: sudo apt install -y libopus0"
        )
        return False

    _ok("libopus loaded after fallback")
    return True
|
|
|
|
|
|
def check_ffmpeg() -> bool:
    """Check that an ``ffmpeg`` executable can be found on PATH.

    Returns:
        True (after printing the resolved location) when ffmpeg is found,
        False (after printing a failure line) otherwise.
    """
    # Resolve once so the path that is reported is the one that was tested,
    # instead of searching PATH a second time for the success message.
    ffmpeg_path = shutil.which("ffmpeg")
    if not ffmpeg_path:
        _fail("ffmpeg not in PATH — required for audio asset generation")
        return False
    _ok(f"ffmpeg at {ffmpeg_path}")
    return True
|
|
|
|
|
|
def check_supertonic() -> bool:
    """Probe the Supertonic TTS endpoint with a one-word synthesis request.

    POSTs a small JSON request to ``SUPERTONIC_URL`` with a 5 second timeout.

    Returns:
        True only when the server answers HTTP 200. False (after printing the
        reason) for any other status, a connection failure, a timeout or a
        malformed HTTP response.
    """
    # Local import: only the exception type is needed, and only here.
    from http.client import HTTPException

    payload = {
        "model": "supertonic-3",
        "input": "test",
        "voice": SUPERTONIC_VOICE,
        "response_format": "wav",
        "lang": "ro",
    }
    req = urllib.request.Request(
        SUPERTONIC_URL,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )

    try:
        with urllib.request.urlopen(req, timeout=5) as resp:
            status = resp.status
    except urllib.error.HTTPError as e:
        # HTTPError is a URLError subclass, so it has to be handled first:
        # the server is reachable, it just answered with an error status.
        _fail(f"Supertonic returned non-200 (HTTP {e.code}) at {SUPERTONIC_URL}")
        return False
    except (OSError, HTTPException) as e:
        # URLError, ConnectionError and socket timeouts are all OSError
        # subclasses; HTTPException covers malformed responses.
        _fail(f"Supertonic unreachable at :7788 — {e}. Start: systemctl --user start supertonic-tts")
        return False

    if status == 200:
        _ok(f"Supertonic up at {SUPERTONIC_URL}")
        return True

    # Other 2xx statuses do not raise but are still not the expected answer.
    _fail(f"Supertonic returned non-200 (HTTP {status})")
    return False
|
|
|
|
|
|
def warm_whisper() -> bool:
    """Construct the faster-whisper ``small`` int8 CPU model once and time it.

    Returns:
        False when ``faster_whisper`` cannot be imported or when constructing
        the model raises; True otherwise. A status line is printed either way.
    """
    try:
        from faster_whisper import WhisperModel
    except ImportError:
        _fail("faster-whisper not installed")
        return False

    print(" Warming faster-whisper small int8 (downloads if cold)...")
    model_options = {"device": "cpu", "compute_type": "int8", "cpu_threads": 4}
    started = time.perf_counter()
    try:
        # The instance is discarded; only the construction matters here.
        WhisperModel("small", **model_options)
        took = time.perf_counter() - started
        _ok(f"faster-whisper small int8 warm ({took:.1f}s)")
    except Exception as exc:
        _fail(f"faster-whisper warm failed: {exc}")
        return False
    return True
|
|
|
|
|
|
def warm_silero() -> bool:
    """Load the silero-vad model once and report how long it took.

    Returns:
        False when ``silero_vad`` cannot be imported or when loading raises;
        True otherwise. A status line is printed either way.
    """
    try:
        from silero_vad import load_silero_vad
    except ImportError:
        _fail("silero-vad not installed")
        return False

    print(" Warming silero-vad...")
    started = time.perf_counter()
    try:
        # The returned model is discarded; only the load itself matters here.
        load_silero_vad()
        took = time.perf_counter() - started
        _ok(f"silero-vad warm ({took:.1f}s)")
    except Exception as exc:
        _fail(f"silero-vad warm failed: {exc}")
        return False
    return True
|
|
|
|
|
|
def _supertonic_synth(text: str, out_path: Path) -> bool:
    """Synthesize *text* with Supertonic and store the response at *out_path*.

    Args:
        text: Text sent as the ``input`` field of the request.
        out_path: Destination file; missing parent directories are created.

    Returns:
        True when a non-empty response body was written to *out_path*.
        False (after printing the reason) when the request fails, the body is
        empty, or the file cannot be written.
    """
    payload = {
        "model": "supertonic-3",
        "input": text,
        "voice": SUPERTONIC_VOICE,
        "response_format": "wav",
        "lang": "ro",
    }
    req = urllib.request.Request(
        SUPERTONIC_URL,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    # Sibling temp file: the final rename stays on one filesystem, so an
    # interrupted run never leaves a half-written asset under the real name.
    tmp_path = out_path.with_name(out_path.name + ".part")
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            wav_bytes = resp.read()
        if not wav_bytes:
            # An empty body would otherwise be saved and reported as success.
            _fail(f"Supertonic synth failed for {out_path.name}: empty response body")
            return False
        out_path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path.write_bytes(wav_bytes)
        tmp_path.replace(out_path)
        return True
    except Exception as e:
        _fail(f"Supertonic synth failed for {out_path.name}: {e}")
        # Best-effort cleanup of a leftover temp file; the failure itself has
        # already been reported above.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        return False
|
|
|
|
|
|
def gen_thinking_wav() -> bool:
    """Ensure ``thinking.wav`` exists in ``ASSETS_DIR``, synthesizing it if needed.

    An existing file larger than 1024 bytes is reused. Otherwise the filler
    phrase is synthesized via Supertonic, and the new file must pass the same
    size check before it is reported as OK.

    Returns:
        True when a usable file is present afterwards, False otherwise.
    """
    min_bytes = 1024  # files at or below this size are treated as unusable
    path = ASSETS_DIR / "thinking.wav"

    size = path.stat().st_size if path.exists() else 0
    if size > min_bytes:
        _ok(f"thinking.wav exists ({size} bytes)")
        return True

    print(" Generating thinking.wav via Supertonic...")
    if not _supertonic_synth("Stai puțin să-mi adun gândurile.", path):
        return False

    # Apply the same threshold to the fresh file, so a truncated response is
    # not reported green now only to be regenerated on the next run.
    size = path.stat().st_size
    if size <= min_bytes:
        _fail(f"thinking.wav generated but too small ({size} bytes)")
        return False
    _ok(f"thinking.wav generated ({size} bytes)")
    return True
|
|
|
|
|
|
def gen_mhm_wav() -> bool:
    """Ensure ``mhm.wav`` exists in ``ASSETS_DIR``, synthesizing it if needed.

    An existing file larger than 512 bytes is reused. Otherwise "Mhm." is
    synthesized via Supertonic, and the new file must pass the same size
    check before it is reported as OK.

    Returns:
        True when a usable file is present afterwards, False otherwise.
    """
    min_bytes = 512  # files at or below this size are treated as unusable
    path = ASSETS_DIR / "mhm.wav"

    size = path.stat().st_size if path.exists() else 0
    if size > min_bytes:
        _ok(f"mhm.wav exists ({size} bytes)")
        return True

    print(" Generating mhm.wav via Supertonic...")
    if not _supertonic_synth("Mhm.", path):
        return False

    # Apply the same threshold to the fresh file, so a truncated response is
    # not reported green now only to be regenerated on the next run.
    size = path.stat().st_size
    if size <= min_bytes:
        _fail(f"mhm.wav generated but too small ({size} bytes)")
        return False
    _ok(f"mhm.wav generated ({size} bytes)")
    return True
|
|
|
|
|
|
def gen_beep_wav() -> bool:
    """Ensure ``beep_200ms.wav`` exists in ``ASSETS_DIR``, rendering it with ffmpeg if needed.

    An existing file larger than 512 bytes is reused. Otherwise ffmpeg renders
    an 880 Hz sine of 0.2 s at 48 kHz, faded out over the last 0.05 s, at
    volume 0.3 and with 2 channels.

    Returns:
        True when a usable file is present afterwards. False (after printing
        the reason) when ffmpeg cannot be started, exits non-zero, or leaves
        no usable file behind.
    """
    min_bytes = 512  # files at or below this size are treated as unusable
    path = ASSETS_DIR / "beep_200ms.wav"

    size = path.stat().st_size if path.exists() else 0
    if size > min_bytes:
        _ok(f"beep_200ms.wav exists ({size} bytes)")
        return True

    print(" Generating beep_200ms.wav via ffmpeg (880Hz sine, 200ms)...")
    cmd = [
        "ffmpeg",
        "-y",
        "-loglevel",
        "error",
        "-f",
        "lavfi",
        "-i",
        "sine=frequency=880:duration=0.2:sample_rate=48000",
        "-af",
        "afade=t=out:st=0.15:d=0.05,volume=0.3",
        "-ac",
        "2",
        str(path),
    ]
    try:
        path.parent.mkdir(parents=True, exist_ok=True)
        subprocess.run(cmd, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable ffmpeg binary and a
        # directory that cannot be created; report it instead of crashing.
        _fail(f"ffmpeg beep gen failed: {e}")
        return False

    # ffmpeg can exit 0 without leaving a usable file, so verify the result.
    size = path.stat().st_size if path.exists() else 0
    if size <= min_bytes:
        _fail(f"ffmpeg beep gen failed: output missing or too small ({size} bytes)")
        return False
    _ok(f"beep_200ms.wav generated ({size} bytes)")
    return True
|
|
|
|
|
|
def main() -> int:
    """Run every setup step in order, print a summary and return the exit code.

    Asset generation is gated on its prerequisite: the two synthesized clips
    need Supertonic and the beep needs ffmpeg. A skipped asset is recorded as
    a failed check.

    Returns:
        0 when every check passed, 1 when at least one failed or was skipped.
    """
    print("voice_setup.py — Discord voice pipeline setup\n")

    # Keep the prerequisite results in named variables so the gating below
    # does not depend on their position in the summary list.
    libopus_ok = check_libopus()
    ffmpeg_ok = check_ffmpeg()
    supertonic_ok = check_supertonic()

    checks: list[tuple[str, bool]] = [
        ("libopus", libopus_ok),
        ("ffmpeg", ffmpeg_ok),
        ("Supertonic", supertonic_ok),
        ("faster-whisper", warm_whisper()),
        ("silero-vad", warm_silero()),
    ]

    if supertonic_ok:
        checks.append(("thinking.wav", gen_thinking_wav()))
        checks.append(("mhm.wav", gen_mhm_wav()))
    else:
        _warn("Skipping thinking.wav / mhm.wav generation — Supertonic down")
        checks.append(("thinking.wav", False))
        checks.append(("mhm.wav", False))

    if ffmpeg_ok:
        checks.append(("beep_200ms.wav", gen_beep_wav()))
    else:
        _warn("Skipping beep_200ms.wav — ffmpeg missing")
        checks.append(("beep_200ms.wav", False))

    print()
    failed = [name for name, ok in checks if not ok]
    if failed:
        print(f"{RED}FAILED:{RESET} {len(failed)}/{len(checks)} — fix above before /voice join works:")
        for name in failed:
            print(f" - {name}")
        return 1

    print(f"{GREEN}ALL GREEN{RESET} ({len(checks)} checks). Voice pipeline ready.")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
|