Compare commits
11 Commits
44cf0001bb
...
4be70440e8
| Author | SHA1 | Date | |
|---|---|---|---|
| 4be70440e8 | |||
| 13931db953 | |||
| 23666f7910 | |||
| 217da65417 | |||
| 0cc01c1450 | |||
| c93c4f822e | |||
| 3af6bcaea4 | |||
| a3eefbc799 | |||
| a48562b2f5 | |||
| af5af8133f | |||
| c6d11bdf9f |
BIN
assets/voice/beep_200ms.wav
Normal file
BIN
assets/voice/beep_200ms.wav
Normal file
Binary file not shown.
BIN
assets/voice/mhm.wav
Normal file
BIN
assets/voice/mhm.wav
Normal file
Binary file not shown.
BIN
assets/voice/thinking.wav
Normal file
BIN
assets/voice/thinking.wav
Normal file
Binary file not shown.
101
cli.py
101
cli.py
@@ -114,6 +114,104 @@ def _load_sessions_file() -> dict:
|
||||
return {}
|
||||
|
||||
|
||||
def _voice_doctor_checks() -> list[tuple[str, bool]]:
    """Run the voice-stack health checks (step 10).

    Mirrors the logic in tools/voice_setup.py but returns ``(label, ok)``
    tuples so the results plug straight into cmd_doctor's PASS/FAIL output.

    Every probe degrades to ``False`` instead of raising: a missing optional
    voice dependency, an unreachable TTS server, or an unreadable asset is
    reported as a failed check so the rest of `eco doctor` is unaffected.

    Returns:
        One ``(label, ok)`` tuple per check, in a fixed order: libopus,
        ffmpeg, Supertonic TTS, faster-whisper, silero-vad,
        discord.ext.voice_recv, then one entry per voice asset.
    """
    import importlib.util
    import json as _json
    import urllib.request

    def _importable(module_name: str) -> bool:
        """Return True when *module_name* can be located, never raising."""
        try:
            return importlib.util.find_spec(module_name) is not None
        except Exception:
            # find_spec raises ModuleNotFoundError when a parent package is
            # missing and ValueError when sys.modules holds an entry without
            # a usable __spec__; both simply mean "not importable" here.
            return False

    results: list[tuple[str, bool]] = []

    # 1. libopus0 loaded by discord.py
    try:
        import discord

        if not discord.opus.is_loaded():
            try:
                discord.opus._load_default()
            except Exception:
                # Best effort only: the is_loaded() call below reports the
                # outcome whether or not the default loader succeeded.
                pass
        opus_ok = bool(discord.opus.is_loaded())
    except Exception:
        # Covers a missing discord.py (ImportError) as well as any loader error.
        opus_ok = False
    results.append(("libopus loaded (discord.py)", opus_ok))

    # 2. ffmpeg in PATH
    results.append(("ffmpeg in PATH", shutil.which("ffmpeg") is not None))

    # 3. Supertonic TTS reachable: POST a tiny synthesis request to the
    #    local speech endpoint and expect HTTP 200.
    supertonic_url = "http://127.0.0.1:7788/v1/audio/speech"
    supertonic_ok = False
    try:
        payload = _json.dumps({
            "model": "supertonic-3",
            "input": "test",
            "voice": "M2",
            "response_format": "wav",
            "lang": "ro",
        }).encode("utf-8")
        req = urllib.request.Request(
            supertonic_url,
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=5) as resp:
            supertonic_ok = resp.status == 200
    except Exception:
        # URLError/HTTPError, refused connections, timeouts and anything
        # unexpected all count as "not reachable".
        supertonic_ok = False
    results.append(("Supertonic TTS reachable at :7788", supertonic_ok))

    # 4. faster-whisper importable (don't load the model — too slow)
    results.append(("faster-whisper importable", _importable("faster_whisper")))

    # 5. silero-vad importable
    results.append(("silero-vad importable", _importable("silero_vad")))

    # 6. discord.ext.voice_recv importable (vendor package)
    results.append((
        "discord.ext.voice_recv importable",
        _importable("discord.ext.voice_recv"),
    ))

    # 7-9. Voice assets present and larger than a minimum byte size
    voice_assets = [
        ("assets/voice/thinking.wav", 1024),
        ("assets/voice/beep_200ms.wav", 512),
        ("assets/voice/mhm.wav", 512),
    ]
    for rel_path, min_bytes in voice_assets:
        path = PROJECT_ROOT / rel_path
        try:
            ok = path.exists() and path.stat().st_size > min_bytes
        except OSError:
            ok = False
        results.append((f"{rel_path} (>{min_bytes}B)", ok))

    return results
|
||||
|
||||
|
||||
def cmd_doctor(args):
|
||||
"""Run diagnostic checks."""
|
||||
import re
|
||||
@@ -227,6 +325,9 @@ def cmd_doctor(args):
|
||||
else:
|
||||
checks.append(("WhatsApp bridge (optional)", True))
|
||||
|
||||
# ---- Voice stack checks (Pas 10) ----
|
||||
checks.extend(_voice_doctor_checks())
|
||||
|
||||
# Print results
|
||||
all_pass = True
|
||||
for label, passed in checks:
|
||||
|
||||
@@ -104,6 +104,14 @@
|
||||
"ollama": {
|
||||
"url": "http://10.0.20.161:11434"
|
||||
},
|
||||
"voice": {
|
||||
"allowed_user_ids": [
|
||||
"949388626146517022"
|
||||
],
|
||||
"user_name": "Marius",
|
||||
"default_voice": "M5",
|
||||
"auto_leave_minutes": 5
|
||||
},
|
||||
"paths": {
|
||||
"personality": "personality/",
|
||||
"tools": "tools/",
|
||||
|
||||
@@ -189,4 +189,16 @@ Când lansez sub-agent, îi dau context: AGENTS.md, SOUL.md, USER.md + relevant
|
||||
- Discord links: `<url>` pentru a suprima embed-uri
|
||||
- Cand primesc o sarcina mai mare de executat, raspund intotdeauna cu o reactie sau confirmare si apoi trec la executie
|
||||
- **Link-uri:** Folosesc `https://moltbot.tailf7372d.ts.net/echo/` (NU IP 100.120.119.70) pentru ca WhatsApp să le recunoască ca link-uri
|
||||
- **Link-uri fișiere salvate:** Când salvez/menționez fișiere din `memory/kb/`, ofer automat link către `files.html#memory/kb/path/to/file.md` pentru preview
|
||||
- **Link-uri fișiere salvate:** Când salvez/menționez fișiere din `memory/kb/`, ofer automat link către `files.html#memory/kb/path/to/file.md` pentru preview
|
||||
|
||||
## Voice mode
|
||||
|
||||
Reguli aplicate când `adapter_name == "discord-voice"` — Marius mă ascultă, nu citește. Vocea nu tolerează răspunsuri lungi sau structurate.
|
||||
|
||||
- **1-3 propoziții max per răspuns.** Dacă am mai mult de spus, condensez sau mut în chat.
|
||||
- **Fără markdown.** Niciun bold, italic, cod cu backticks, headere. Text plat, atât.
|
||||
- **Fără bullet lists, nici numerotate.** Le pronunț natural ca propoziții: "trei lucruri: în primul rând..., apoi..., și la final..."
|
||||
- **Fără linkuri.** Nu rostesc URL-uri. Dacă e relevant: "îți trimit linkul în chat".
|
||||
- **Numere și valute formulate conversațional.** Scriu "treizeci de lei", nu "30 RON"; "douăzeci și cinci la sută", nu "25%". Modulul `normalize.py` face curățare tehnică, dar eu formulez deja natural — un om vorbește, nu citește tabelul.
|
||||
- **Lung sau structurat → mută în chat.** Dacă răspunsul cere listă, cod, linkuri sau peste 3 propoziții, închei rostit cu "L-am scris în chat." iar restul ajunge în text channel mirror.
|
||||
- **Ton:** cum vorbesc cu Marius la o cafea, nu cum scriu raport. Contracții, pauze, "păi" sau "stai puțin" dacă mă ajută să sune uman. Concis, fără tic-uri robotice.
|
||||
@@ -63,6 +63,13 @@
|
||||
- **Venv:** ~/echo-core/.venv/ | **Model:** base
|
||||
- **Utilizare:** `whisper.load_model('base').transcribe(path, language='ro')`
|
||||
|
||||
### Discord Voice
|
||||
- **Ce este:** Bot conectat la un voice channel Discord — ascultă microfonul lui Marius, transcrie cu faster-whisper (`small` int8, RO), rutează prin router și răspunde rostit cu Supertonic TTS.
|
||||
- **Cum sunt "în voce":** Slash command `/voice join` mă cheamă în channel; cât stau acolo, presence-ul arată că ascult. `/voice leave` sau auto-leave după 5 minute fără voce.
|
||||
- **Latență așteptată:** ~5 secunde perceput end-to-end (STT p50 2.25s + LLM + TTS first chunk). Peste 3s pornesc un filler audio ("Stai să-mi adun gândurile") ca să nu pară mort.
|
||||
- **Streaming TTS:** răspunsul iese pe clauze, nu cuvânt-cu-cuvânt și nu frază întreagă — primul sunet pleacă imediat ce am o propoziție scurtă.
|
||||
- **Limitări:** 1-3 propoziții max (vezi AGENTS.md § Voice mode). Cuvinte rare, nume proprii sau acronime pot apărea ciudat în STT — dacă sună greșit, cer reformulare în loc să ghicesc.
|
||||
|
||||
### Pauze respirație
|
||||
- **Script:** `python3 tools/pauza_random.py`
|
||||
- **Bancă:** memory/kb/tehnici-pauza.md
|
||||
|
||||
30
personality/VOICE_MODE.md
Normal file
30
personality/VOICE_MODE.md
Normal file
@@ -0,0 +1,30 @@
|
||||
# Voice Mode
|
||||
|
||||
Răspunzi prin voce (TTS). Marius te aude — nu citește. Reguli care contează:
|
||||
|
||||
## Lungime și ton
|
||||
|
||||
- **Scurt**: 1-2 propoziții, max ~30 cuvinte per turn. Marius vorbește cu tine — nu redactezi un document.
|
||||
- **Conversațional**: ca un om viu. Fără "Sigur, iată...", "Permite-mi să...", "Te rog să...". Direct la subiect.
|
||||
- **Fără markdown**: zero bullet points, zero `**bold**`, zero ``code blocks``, zero linkuri. Totul e citit cu voce.
|
||||
|
||||
## Numere și unități
|
||||
|
||||
- **Ora**: fără secunde. Spune "ora 23 și 9 minute" sau "9 și jumătate", nu "23:09:42".
|
||||
- **Distanțe mari**: rotunjește în "mii" sau "milioane". Pentru Pământ-Lună spune "384 mii de kilometri", nu "384.000 km".
|
||||
- **Zecimale**: omite-le când nu adaugă informație. "5 lei" nu "5,00 lei". "două ore" nu "2,0 ore". "20 de minute" nu "20,5 minute".
|
||||
- **Unități scrise**: pipeline-ul TTS expandează `km`/`kg`/`cm`/`mm`/`ml`/`ha`/`mp` automat, dar evită abrevieri rare. Scrie "metri" nu "m." dacă e ambiguu.
|
||||
|
||||
## Structură
|
||||
|
||||
- Listă scurtă verbală: "Trei lucruri: întâi X, apoi Y, plus Z."
|
||||
- Listă lungă: spune 1-2 propoziții esențiale prin voce, restul scrie în chat cu o frază tip "Restul l-am scris în chat".
|
||||
- Întrebări clarificatoare: pune UNA, nu trei.
|
||||
|
||||
## Punctuație
|
||||
|
||||
- Doar virgule și puncte. Fără `„` `"` `—` `…` `«»` — pipeline-ul oricum le sanitizează, dar evită-le ca să eviți pauzele forțate.
|
||||
|
||||
## Ești prietenul lui Marius din mașină
|
||||
|
||||
Imaginează-ți că Marius conduce și te-a întrebat ceva pe difuzor. Răspunzi natural, scurt, la subiect — fără ceremonii.
|
||||
@@ -7,3 +7,14 @@ httpx>=0.27
|
||||
pytest>=8.0
|
||||
supertonic[serve]>=1.3.1
|
||||
trafilatura>=1.8
|
||||
|
||||
# Voice pipeline (Pas 2 setup)
|
||||
faster-whisper>=1.0
|
||||
silero-vad>=5.1
|
||||
num2words>=0.5
|
||||
numpy>=1.24
|
||||
PyNaCl>=1.5
|
||||
# discord-ext-voice-recv vendored at vendor/discord-ext-voice-recv/
|
||||
# pinned commit: ac04ea7b0941112e83767cf1c1469b408fa06748
|
||||
# install: pip install -e vendor/discord-ext-voice-recv
|
||||
# System deps (NOT pip): libopus0 (apt), ffmpeg
|
||||
|
||||
@@ -112,6 +112,7 @@ def create_bot(config: Config) -> discord.Client:
|
||||
|
||||
intents = discord.Intents.default()
|
||||
intents.message_content = True
|
||||
intents.voice_states = True
|
||||
|
||||
client = discord.Client(intents=intents)
|
||||
tree = app_commands.CommandTree(client)
|
||||
@@ -958,6 +959,11 @@ def create_bot(config: Config) -> discord.Client:
|
||||
else:
|
||||
await interaction.followup.send(result or "Eroare TTS.")
|
||||
|
||||
# Voice slash group (Pas 7)
|
||||
from src.adapters.discord_voice import register as register_voice
|
||||
voice_group = register_voice(tree, client)
|
||||
tree.add_command(voice_group)
|
||||
|
||||
# --- Ralph commands (autonomous project execution) ---
|
||||
|
||||
async def _autocomplete_by_status(
|
||||
@@ -1118,6 +1124,11 @@ def create_bot(config: Config) -> discord.Client:
|
||||
from datetime import datetime, timezone
|
||||
client._ready_at = datetime.now(timezone.utc)
|
||||
logger.info("Echo Core online as %s", client.user)
|
||||
# Voice models eager warmup (Pas 7)
|
||||
from src.adapters import discord_voice
|
||||
discord_voice._models_warmup_future = asyncio.create_task(
|
||||
discord_voice.warmup_models()
|
||||
)
|
||||
|
||||
async def _handle_chat(message: discord.Message) -> None:
|
||||
"""Process a chat message through the router and send the response."""
|
||||
|
||||
361
src/adapters/discord_voice.py
Normal file
361
src/adapters/discord_voice.py
Normal file
@@ -0,0 +1,361 @@
|
||||
"""Discord voice slash commands (Pas 7 — CONVERGENCE wiring).
|
||||
|
||||
Registers the `/voice` slash command group on the existing CommandTree and
|
||||
exposes an async `warmup_models()` for eager model load at bot startup.
|
||||
|
||||
Owns nothing in `src/voice/*` — purely the Discord-facing wiring. Defers
|
||||
heavy lifting to:
|
||||
|
||||
- ``src.voice.pipeline.VoiceSession`` — per-guild session state machine
|
||||
- ``src.voice.pipeline.EchoVoiceSink`` — discord-ext-voice-recv sink
|
||||
- ``src.voice.tts_stream.TTSQueue`` / ``EchoStreamingAudioSource``
|
||||
- ``src.voice._discord_voice_adapter.connect_voice``
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import discord
|
||||
from discord import app_commands
|
||||
|
||||
# Optional DAVE dep (mandatory at runtime when discord.py 2.7.1 is paired with
|
||||
# Discord voice gateway v=8; tolerated missing in tests / dev environments).
|
||||
try:
|
||||
import davey
|
||||
_HAS_DAVE = True
|
||||
except ImportError:
|
||||
_HAS_DAVE = False
|
||||
|
||||
from src.config import Config
|
||||
from src.voice.pipeline import (
|
||||
VoiceSession,
|
||||
EchoVoiceSink,
|
||||
_get_whisper_model,
|
||||
_get_silero_vad,
|
||||
)
|
||||
from src.voice.tts_stream import TTSQueue, EchoStreamingAudioSource
|
||||
from src.voice._discord_voice_adapter import connect_voice
|
||||
|
||||
log = logging.getLogger("echo-core.discord.voice")
|
||||
|
||||
# Per-guild voice session registry. Key = guild_id.
|
||||
_voice_sessions: dict[int, VoiceSession] = {}
|
||||
|
||||
# Set if model warmup failed; surfaces as ephemeral error on /voice join.
|
||||
_voice_load_error: Optional[str] = None
|
||||
|
||||
# Reference to the eager warmup task created in on_ready, so /voice join can
|
||||
# await it if the user is faster than the background load.
|
||||
_models_warmup_future: Optional[asyncio.Task] = None
|
||||
|
||||
|
||||
async def warmup_models() -> None:
    """Eagerly load the voice models; meant to run as a background task.

    Loads libopus if discord.py has not loaded it yet, logs DAVE availability
    when the optional ``davey`` package was imported, then runs the two
    synchronous model loaders on worker threads via ``asyncio.to_thread`` so
    the event loop stays responsive.

    Never raises for ordinary errors: on failure the message is stored in
    ``_voice_load_error`` so `/voice join` can report it. On success any error
    left behind by an earlier failed attempt is cleared, so a later successful
    warmup re-enables `/voice join`.
    """
    global _voice_load_error
    try:
        if not discord.opus.is_loaded():
            discord.opus.load_opus("libopus.so.0")
        if _HAS_DAVE:
            log.info("DAVE protocol v%d available (davey %s)",
                     davey.DAVE_PROTOCOL_VERSION, davey.__version__)
        await asyncio.to_thread(_get_whisper_model)
        await asyncio.to_thread(_get_silero_vad)
    except Exception as e:
        _voice_load_error = f"{type(e).__name__}: {e}"
        # exc_info keeps the traceback, which the one-line summary loses.
        log.error("Voice models load failed: %s", _voice_load_error, exc_info=True)
    else:
        # Drop a stale error from a previous attempt now that loading worked.
        _voice_load_error = None
        log.info("Voice models warm")
|
||||
|
||||
|
||||
def _get_whitelist() -> set[int]:
    """Read `voice.allowed_user_ids` from config and coerce it to a set of ints.

    A fresh ``Config()`` is built on every call so that edits made between
    bot start and `/voice join` are picked up (presumably by re-reading the
    config file — confirm against ``src.config``).

    The function fails closed: if the config cannot be read, or the value is
    missing or unusable, the result is an empty set and nobody is allowed.

    Returns:
        The allowed Discord user IDs. Entries that cannot be converted with
        ``int()`` are skipped.
    """
    try:
        raw = Config().get("voice.allowed_user_ids", [])
    except Exception:
        raw = []

    if not raw:
        return set()

    # A single ID written as a scalar must be treated as one entry. Iterating
    # a string would otherwise whitelist each digit as its own user ID, and a
    # bare int is not iterable at all.
    if isinstance(raw, (str, bytes, int)):
        raw = [raw]

    out: set[int] = set()
    try:
        entries = iter(raw)
    except TypeError:
        # Unsupported config shape (e.g. a float): allow nobody.
        return out
    for v in entries:
        try:
            out.add(int(v))
        except (TypeError, ValueError):
            continue
    return out
|
||||
|
||||
|
||||
def _get_default_voice() -> str:
    """Return the configured `voice.default_voice`, falling back to "M2".

    The fallback is used when the config lookup raises or when the stored
    value is empty/falsy.
    """
    try:
        configured = Config().get("voice.default_voice", "M2")
    except Exception:
        return "M2"
    if configured:
        return configured
    return "M2"
|
||||
|
||||
|
||||
def register(tree: app_commands.CommandTree, bot: discord.Client) -> app_commands.Group:
    """Build the `/voice` slash command group and return it (caller registers).

    Commands defined on the group: ``join``, ``leave``, ``setvoice``,
    ``doctor``, plus the ``mirror on|off`` and ``record on|off`` subgroups.
    Active sessions are tracked per guild in the module-level
    ``_voice_sessions`` registry.

    Args:
        tree: The bot's command tree. It is not used here; the caller adds
            the returned group to it.
        bot: The Discord client, used for presence updates and passed to the
            voice session.

    Returns:
        The populated ``/voice`` command group.

    NOTE(review): only ``join`` checks the whitelist. ``leave``, ``setvoice``,
    ``mirror`` and ``record`` can be invoked by any member who can see the
    commands — confirm this is intended.
    """
    voice_group = app_commands.Group(
        name="voice", description="Echo Core voice channel"
    )

    async def _report_load_error(interaction: discord.Interaction) -> bool:
        """Send the stored warmup error, if any; return True when one was sent."""
        if not _voice_load_error:
            return False
        await interaction.followup.send(
            f"Voice unavailable: {_voice_load_error}", ephemeral=True
        )
        return True

    def _active_session(interaction: discord.Interaction) -> Optional[VoiceSession]:
        """Return the voice session of the interaction's guild, or None."""
        guild_id = interaction.guild.id if interaction.guild else None
        if guild_id is None:
            return None
        return _voice_sessions.get(guild_id)

    async def _set_session_flag(
        interaction: discord.Interaction, attr: str, value: bool, confirmation: str
    ) -> None:
        """Set a boolean attribute on the guild's active session and confirm."""
        await interaction.response.defer(ephemeral=True)
        s = _active_session(interaction)
        if s is None:
            await interaction.followup.send("Nu sunt în voice.", ephemeral=True)
            return
        setattr(s, attr, value)
        await interaction.followup.send(confirmation, ephemeral=True)

    @voice_group.command(name="join", description="Echo intră în voice channel-ul tău")
    async def join(interaction: discord.Interaction) -> None:
        await interaction.response.defer(ephemeral=True)
        # Fail fast when an earlier warmup already recorded an error.
        if await _report_load_error(interaction):
            return
        if _models_warmup_future is not None and not _models_warmup_future.done():
            try:
                await _models_warmup_future
            except Exception as e:
                await interaction.followup.send(
                    f"Voice unavailable: {type(e).__name__}: {e}", ephemeral=True
                )
                return
            # warmup_models() swallows its own exceptions and records them in
            # _voice_load_error, so a failure during the await above does not
            # raise. Re-check, otherwise we would join with unloaded models.
            if await _report_load_error(interaction):
                return
        user = interaction.user
        if not isinstance(user, discord.Member) or user.voice is None or user.voice.channel is None:
            await interaction.followup.send(
                "Intră într-un voice channel întâi.", ephemeral=True
            )
            return
        channel = user.voice.channel
        whitelist = _get_whitelist()
        if user.id not in whitelist:
            await interaction.followup.send(
                "Nu ești pe whitelist voice.", ephemeral=True
            )
            return
        # Reject double-join on the same guild.
        guild_id = channel.guild.id
        if guild_id in _voice_sessions:
            await interaction.followup.send(
                "Sunt deja în voice pe acest server. Folosește /voice leave întâi.",
                ephemeral=True,
            )
            return
        # Connect
        try:
            vc = await connect_voice(channel)
        except Exception as e:
            log.exception("connect_voice failed")
            await interaction.followup.send(
                f"Conectare eșuată: {type(e).__name__}: {e}", ephemeral=True
            )
            return
        # Build TTS queue + session
        ttsq = TTSQueue(voice_id=_get_default_voice(), lang="ro")
        ttsq.start()
        try:
            session = VoiceSession(
                channel_id=channel.id,
                guild_id=guild_id,
                voice_client=vc,
                text_channel=interaction.channel,
                record_enabled=False,
                mirror_enabled=True,
                whitelist=whitelist,
                ttsq=ttsq,
                bot=bot,
                loop=asyncio.get_running_loop(),
            )
        except Exception as e:
            log.exception("VoiceSession construction failed")
            # Undo the partial setup: stop the queue and drop the connection.
            ttsq.stop()
            try:
                await vc.disconnect(force=True)
            except Exception:
                pass
            await interaction.followup.send(
                f"Sesiune voice eșuată: {type(e).__name__}: {e}", ephemeral=True
            )
            return
        _voice_sessions[guild_id] = session

        # Start TTS streaming source for the entire session. Chain the
        # wake-up beep via `after=` so streaming takes over when beep ends.
        def _start_stream(error: Optional[Exception] = None) -> None:
            if error is not None:
                log.warning("Beep playback ended with error: %s", error)
            try:
                vc.play(EchoStreamingAudioSource(ttsq))
                log.info("TTS streaming source attached")
            except Exception:
                log.exception("EchoStreamingAudioSource attach failed")

        try:
            # NOTE(review): relative path, resolved against the process's
            # working directory — confirm the bot always starts from the
            # project root.
            vc.play(
                discord.FFmpegPCMAudio("assets/voice/beep_200ms.wav"),
                after=_start_stream,
            )
        except Exception:
            log.warning("Beep playback skipped, starting stream directly", exc_info=True)
            _start_stream()
        # Attach sink
        try:
            bot_user_id = int(bot.user.id) if bot.user is not None else 0
            sink = EchoVoiceSink(session=session, bot_user_id=bot_user_id)
            vc.listen(sink)
        except Exception as e:
            log.exception("Sink attach failed")
            _voice_sessions.pop(guild_id, None)
            try:
                session.cleanup("sink_attach_failed")
            except Exception:
                pass
            await interaction.followup.send(
                f"Atașare sink eșuată: {type(e).__name__}: {e}", ephemeral=True
            )
            return
        # Presence
        try:
            await bot.change_presence(activity=discord.Activity(
                type=discord.ActivityType.listening,
                name=f"{user.display_name} în #{channel.name}",
            ))
        except Exception:
            log.warning("Presence update skipped", exc_info=True)
        await interaction.followup.send(
            f"În voce în #{channel.name}.", ephemeral=True
        )

    @voice_group.command(name="leave", description="Echo iese din voice channel")
    async def leave(interaction: discord.Interaction) -> None:
        await interaction.response.defer(ephemeral=True)
        guild_id = interaction.guild.id if interaction.guild else None
        # Pop first so the registry is clear even if cleanup raises below.
        session = _voice_sessions.pop(guild_id, None) if guild_id is not None else None
        if session is None:
            await interaction.followup.send(
                "Nu sunt în niciun voice channel aici.", ephemeral=True
            )
            return
        try:
            session.cleanup("user_leave")
        except Exception:
            log.exception("session.cleanup raised")
        try:
            await bot.change_presence(activity=None)
        except Exception:
            log.warning("Presence reset skipped", exc_info=True)
        await interaction.followup.send("Plecat.", ephemeral=True)

    _VOICE_CHOICES = [
        app_commands.Choice(name=v, value=v)
        for v in ("M1", "M2", "M3", "M4", "M5", "F1", "F2", "F3", "F4", "F5")
    ]

    @voice_group.command(name="setvoice", description="Schimbă vocea Echo (M1-M5 sau F1-F5)")
    @app_commands.describe(voice="Voce nouă")
    @app_commands.choices(voice=_VOICE_CHOICES)
    async def setvoice(
        interaction: discord.Interaction,
        voice: app_commands.Choice[str],
    ) -> None:
        await interaction.response.defer(ephemeral=True)
        new_voice = voice.value
        # Live-swap on the active session if Echo is in voice on this guild.
        session = _active_session(interaction)
        live_swapped = False
        if session is not None and session.ttsq is not None:
            session.ttsq.voice_id = new_voice
            live_swapped = True
        # Persist as the new default for future sessions.
        try:
            cfg = Config()
            cfg.set("voice.default_voice", new_voice)
            cfg.save()
        except Exception as e:
            log.warning("config save failed for new default voice: %s", e)
            await interaction.followup.send(
                f"Voce schimbată live ({new_voice}), dar config-ul nu s-a salvat: {e}",
                ephemeral=True,
            )
            return
        if live_swapped:
            msg = f"Vocea schimbată **live** pe {new_voice}. Următoarea frază va folosi vocea nouă."
        else:
            msg = f"Default voce setată {new_voice}. Va intra în vigoare la următorul /voice join."
        await interaction.followup.send(msg, ephemeral=True)

    @voice_group.command(name="doctor", description="Verifică voice stack")
    async def doctor(interaction: discord.Interaction) -> None:
        await interaction.response.defer(ephemeral=True)
        checks: list[tuple[str, bool]] = []
        # libopus
        try:
            checks.append(("libopus", bool(discord.opus.is_loaded())))
        except Exception:
            checks.append(("libopus", False))
        # warmup: the check passes when no load error was recorded
        checks.append(("voice load error", _voice_load_error is None))
        # Build response
        lines = ["**Voice doctor:**"]
        for label, ok in checks:
            lines.append(f"{'OK' if ok else 'FAIL'} — {label}")
        if _voice_load_error:
            lines.append(f" details: {_voice_load_error}")
        await interaction.followup.send("\n".join(lines), ephemeral=True)

    # --- /voice mirror on|off ---
    mirror_group = app_commands.Group(
        name="mirror", description="Text mirror", parent=voice_group
    )

    @mirror_group.command(name="on", description="Activează text mirror în canal")
    async def mirror_on(interaction: discord.Interaction) -> None:
        await _set_session_flag(interaction, "mirror_enabled", True, "Mirror ON.")

    @mirror_group.command(name="off", description="Dezactivează text mirror")
    async def mirror_off(interaction: discord.Interaction) -> None:
        await _set_session_flag(interaction, "mirror_enabled", False, "Mirror OFF.")

    # --- /voice record on|off ---
    record_group = app_commands.Group(
        name="record", description="KB recording", parent=voice_group
    )

    @record_group.command(name="on", description="Activează înregistrare în KB")
    async def record_on(interaction: discord.Interaction) -> None:
        await _set_session_flag(interaction, "record_enabled", True, "Record ON.")

    @record_group.command(name="off", description="Dezactivează înregistrare")
    async def record_off(interaction: discord.Interaction) -> None:
        await _set_session_flag(interaction, "record_enabled", False, "Record OFF.")

    return voice_group
|
||||
@@ -37,6 +37,42 @@ DEFAULT_TIMEOUT = 300 # seconds
|
||||
|
||||
CLAUDE_BIN = os.environ.get("CLAUDE_BIN", "claude")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-channel mutex for send_message
|
||||
# ---------------------------------------------------------------------------
|
||||
#
|
||||
# Two paths can hit `send_message(channel_id, ...)` concurrently for the same
|
||||
# channel: a text adapter (Discord/Telegram/WhatsApp) and the voice adapter
|
||||
# (`adapter_name="discord-voice"`). The underlying Claude CLI subprocess is
|
||||
# blocking (`subprocess.Popen` with stream-json read loop) and stateful via
|
||||
# `--resume <session_id>` — interleaving two concurrent invocations on the
|
||||
# same channel would corrupt the conversation order.
|
||||
#
|
||||
# We use `threading.Lock` (NOT `asyncio.Lock`) because `send_message` is sync
|
||||
# code typically run from `asyncio.to_thread` in async adapters. asyncio.Lock
|
||||
# only serializes coroutines, not threads — it would NOT protect this path.
|
||||
#
|
||||
# Each channel gets its own lock so DIFFERENT channels still run in parallel.
|
||||
# Locks are created lazily on first use; the dict itself is guarded by a
|
||||
# small bootstrap lock so two concurrent first-uses don't race on creation.
|
||||
_session_locks: dict[str, threading.Lock] = {}
|
||||
_session_locks_bootstrap = threading.Lock()
|
||||
|
||||
|
||||
def _get_session_lock(channel_id: str) -> threading.Lock:
|
||||
"""Return the channel's mutex, creating it on first access.
|
||||
|
||||
Two threads racing to create the same channel's lock would otherwise
|
||||
end up with different lock objects (setdefault is not atomic across
|
||||
the read-modify-write under all interpreter conditions — defensive).
|
||||
"""
|
||||
lock = _session_locks.get(channel_id)
|
||||
if lock is not None:
|
||||
return lock
|
||||
with _session_locks_bootstrap:
|
||||
return _session_locks.setdefault(channel_id, threading.Lock())
|
||||
|
||||
|
||||
PERSONALITY_FILES = [
|
||||
"IDENTITY.md",
|
||||
"SOUL.md",
|
||||
@@ -363,15 +399,23 @@ def _run_claude(
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def build_system_prompt() -> str:
|
||||
"""Concatenate personality/*.md files into a single system prompt."""
|
||||
def build_system_prompt(voice_mode: bool = False) -> str:
|
||||
"""Concatenate personality/*.md files into a single system prompt.
|
||||
|
||||
When ``voice_mode=True``, appends ``VOICE_MODE.md`` so the model knows
|
||||
its reply will be read aloud (terse, no markdown, no abbreviations, etc.).
|
||||
"""
|
||||
if not PERSONALITY_DIR.is_dir():
|
||||
raise FileNotFoundError(
|
||||
f"Personality directory not found: {PERSONALITY_DIR}"
|
||||
)
|
||||
|
||||
files = list(PERSONALITY_FILES)
|
||||
if voice_mode:
|
||||
files.append("VOICE_MODE.md")
|
||||
|
||||
parts: list[str] = []
|
||||
for filename in PERSONALITY_FILES:
|
||||
for filename in files:
|
||||
filepath = PERSONALITY_DIR / filename
|
||||
if filepath.is_file():
|
||||
parts.append(filepath.read_text(encoding="utf-8"))
|
||||
@@ -398,6 +442,7 @@ def start_session(
|
||||
model: str = DEFAULT_MODEL,
|
||||
timeout: int = DEFAULT_TIMEOUT,
|
||||
on_text: Callable[[str], None] | None = None,
|
||||
voice_mode: bool = False,
|
||||
) -> tuple[str, str]:
|
||||
"""Start a new Claude CLI session for a channel.
|
||||
|
||||
@@ -405,13 +450,16 @@ def start_session(
|
||||
|
||||
If *on_text* is provided, each intermediate Claude text block is passed
|
||||
to the callback as soon as it arrives.
|
||||
|
||||
*voice_mode* — when True, ``VOICE_MODE.md`` is appended to the system
|
||||
prompt so the model produces short, TTS-friendly responses.
|
||||
"""
|
||||
if model not in VALID_MODELS:
|
||||
raise ValueError(
|
||||
f"Invalid model '{model}'. Must be one of: haiku, sonnet, opus"
|
||||
)
|
||||
|
||||
system_prompt = build_system_prompt()
|
||||
system_prompt = build_system_prompt(voice_mode=voice_mode)
|
||||
|
||||
# Wrap external user message with injection protection markers
|
||||
wrapped_message = f"[EXTERNAL CONTENT]\n{message}\n[END EXTERNAL CONTENT]"
|
||||
@@ -542,20 +590,31 @@ def send_message(
|
||||
model: str = DEFAULT_MODEL,
|
||||
timeout: int = DEFAULT_TIMEOUT,
|
||||
on_text: Callable[[str], None] | None = None,
|
||||
voice_mode: bool = False,
|
||||
) -> str:
|
||||
"""High-level convenience: auto start or resume based on channel state."""
|
||||
session = get_active_session(channel_id)
|
||||
# Only resume if session has a valid session_id (not a pre-set model placeholder)
|
||||
if session is not None and session.get("session_id"):
|
||||
return resume_session(session["session_id"], message, timeout, on_text=on_text)
|
||||
# Use model from pre-set session if available, otherwise use provided model
|
||||
effective_model = model
|
||||
if session is not None and session.get("model"):
|
||||
effective_model = session["model"]
|
||||
response_text, _session_id = start_session(
|
||||
channel_id, message, effective_model, timeout, on_text=on_text
|
||||
)
|
||||
return response_text
|
||||
"""High-level convenience: auto start or resume based on channel state.
|
||||
|
||||
Concurrency: a per-`channel_id` `threading.Lock` serializes invocations
|
||||
that hit the same channel (e.g. text adapter + voice adapter racing on
|
||||
the same Discord guild text channel). Different channels run in
|
||||
parallel — each holds its own lock. Lock is acquired blocking; we rely
|
||||
on `timeout` (default 5 minutes) to bound the worst case rather than
|
||||
a non-blocking acquire (loss of fairness vs adapter-side queueing).
|
||||
"""
|
||||
with _get_session_lock(channel_id):
|
||||
session = get_active_session(channel_id)
|
||||
# Only resume if session has a valid session_id (not a pre-set model placeholder)
|
||||
if session is not None and session.get("session_id"):
|
||||
return resume_session(session["session_id"], message, timeout, on_text=on_text)
|
||||
# Use model from pre-set session if available, otherwise use provided model
|
||||
effective_model = model
|
||||
if session is not None and session.get("model"):
|
||||
effective_model = session["model"]
|
||||
response_text, _session_id = start_session(
|
||||
channel_id, message, effective_model, timeout,
|
||||
on_text=on_text, voice_mode=voice_mode,
|
||||
)
|
||||
return response_text
|
||||
|
||||
|
||||
def clear_session(channel_id: str) -> bool:
|
||||
|
||||
@@ -123,8 +123,10 @@ def route_message(
|
||||
# Text-based commands (not slash commands — these work in any adapter)
|
||||
if text.lower() == "/clear":
|
||||
default_model = _get_config().get("bot.default_model", "sonnet")
|
||||
cleared = clear_session(channel_id)
|
||||
if cleared:
|
||||
cleared_text = clear_session(channel_id)
|
||||
# Also drop the isolated voice session if one exists on this channel.
|
||||
clear_session(f"voice:{channel_id}")
|
||||
if cleared_text:
|
||||
return f"Session cleared. Model reset to {default_model}.", True
|
||||
return "No active session.", True
|
||||
|
||||
@@ -154,8 +156,24 @@ def route_message(
|
||||
channel_cfg = _get_channel_config(channel_id)
|
||||
model = (channel_cfg or {}).get("default_model") or _get_config().get("bot.default_model", "sonnet")
|
||||
|
||||
# Voice-mode augment: prepend speaker prefix so Claude knows who spoke
|
||||
# in a voice channel. Cheap now, future-proof for multi-speaker later.
|
||||
# (Engineering decision #14 in the plan.) Only the discord-voice adapter
|
||||
# triggers it — text adapters keep the message verbatim.
|
||||
claude_text = text
|
||||
voice_mode = adapter_name == "discord-voice"
|
||||
if voice_mode:
|
||||
user_name = _get_config().get("voice.user_name", "user") or "user"
|
||||
claude_text = f"[speaker:{user_name}] {text}"
|
||||
# Voice sessions use an isolated session key so they start fresh with
|
||||
# VOICE_MODE.md and don't pollute the text channel's conversation.
|
||||
session_key = f"voice:{channel_id}" if voice_mode else channel_id
|
||||
|
||||
try:
|
||||
response = send_message(channel_id, text, model=model, on_text=on_text)
|
||||
response = send_message(
|
||||
session_key, claude_text, model=model, on_text=on_text,
|
||||
voice_mode=voice_mode,
|
||||
)
|
||||
_set_last_response(channel_id, response)
|
||||
return response, False
|
||||
except Exception as e:
|
||||
|
||||
1
src/voice/__init__.py
Normal file
1
src/voice/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Discord voice pipeline modules — Pas 3-7 in voice plan."""
|
||||
67
src/voice/_discord_voice_adapter.py
Normal file
67
src/voice/_discord_voice_adapter.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Adapter layer over `discord-ext-voice-recv` (vendored at vendor/).
|
||||
|
||||
If discord-ext-voice-recv breaks, swap to py-cord by rewriting only this file.
|
||||
Contract test in tests/test_voice_adapter_contract.py guards drift.
|
||||
|
||||
Downstream consumers (`src/voice/*`, `src/adapters/discord_voice.py`) MUST
|
||||
import from this file — never from `discord.ext.voice_recv` directly.
|
||||
|
||||
## Public API surface (stable across upstream changes)
|
||||
|
||||
- ``VoiceReceiveClient`` — alias for ``voice_recv.VoiceRecvClient``. Subclass
|
||||
of ``discord.VoiceClient`` with extra audio-receive plumbing.
|
||||
Key methods used by the pipeline:
|
||||
* ``await client.disconnect(force: bool = False)`` (from discord.VoiceClient)
|
||||
* ``client.listen(sink, *, after=None)`` — attach an ``AudioSink``;
|
||||
raises ``discord.ClientException`` if not connected or already listening
|
||||
* ``client.stop_listening()`` — detach the current sink
|
||||
* ``client.is_listening() -> bool``
|
||||
* ``client.stop()`` — stop both playing and listening
|
||||
* ``client.sink`` (property, getter+setter) — swap the active sink in place
|
||||
|
||||
- ``AudioSink`` — abstract base. Subclasses MUST implement:
|
||||
* ``write(user: Optional[discord.User|Member], data: VoiceData) -> None``
|
||||
* ``wants_opus() -> bool`` (True → receive opus bytes; False → receive PCM)
|
||||
* ``cleanup() -> None``
|
||||
|
||||
- ``VoiceData`` — per-packet container. Slots: ``packet``, ``source``, ``pcm``.
|
||||
``.pcm`` is decoded 48kHz s16le stereo bytes when ``wants_opus()`` is False.
|
||||
``.opus`` property returns the raw opus bytes from the underlying RTP packet.
|
||||
|
||||
- ``connect_voice(channel) -> VoiceReceiveClient`` — async helper, returns a
|
||||
connected receive-capable voice client.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from discord.ext import voice_recv
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import discord
|
||||
|
||||
|
||||
# --- Stable re-exports -------------------------------------------------------
# Local aliases for the voice_recv classes, so downstream modules can import
# them from this file instead of from ``discord.ext.voice_recv``.

VoiceReceiveClient = voice_recv.VoiceRecvClient
AudioSink = voice_recv.AudioSink
VoiceData = voice_recv.VoiceData


# Public names of this module (the three aliases plus the connect helper).
__all__ = [
    "VoiceReceiveClient",
    "AudioSink",
    "VoiceData",
    "connect_voice",
]
|
||||
|
||||
|
||||
async def connect_voice(channel: "discord.VoiceChannel") -> VoiceReceiveClient:
    """Connect to a Discord voice channel with the receive-capable client.

    Thin wrapper around ``channel.connect(cls=VoiceRecvClient)`` so callers
    don't have to import the vendored class directly.

    Args:
        channel: The voice channel to join.

    Returns:
        The object ``channel.connect`` resolves to when asked to build a
        ``VoiceReceiveClient``.
    """
    # Any exception raised by channel.connect propagates to the caller.
    return await channel.connect(cls=VoiceReceiveClient)
|
||||
318
src/voice/normalize.py
Normal file
318
src/voice/normalize.py
Normal file
@@ -0,0 +1,318 @@
|
||||
"""Voice mode text normalization for Romanian TTS.
|
||||
|
||||
Pure functions — no side effects, no I/O, no logging. Strip markdown,
|
||||
expand numbers / currency / symbols / abbreviations into natural-sounding
|
||||
Romanian text. See plan: src/voice/normalize.py (Pas 3).
|
||||
|
||||
Pipeline order in normalize_for_tts:
    strip_markdown -> sanitize_punctuation -> expand_abbreviations
    -> normalize_thousands -> expand_time -> expand_currency
    -> expand_units -> expand_numbers_ro -> expand_symbols
    -> truncate(200 words)
|
||||
|
||||
Currency runs BEFORE generic number expansion so "12.50 RON" becomes
|
||||
"doisprezece lei și cincizeci de bani" rather than
|
||||
"doisprezece virgulă cincizeci RON".
|
||||
"""
|
||||
import re
|
||||
|
||||
from num2words import num2words
|
||||
|
||||
|
||||
# ---------- Markdown ----------
|
||||
|
||||
_MARKDOWN_LINK = re.compile(r'\[([^\]]+)\]\([^)]+\)')
_MARKDOWN_BOLD = re.compile(r'\*\*([^*]+)\*\*')
_MARKDOWN_CODE = re.compile(r'`([^`\n]+)`')
_MARKDOWN_ITALIC = re.compile(r'(?<!\*)\*([^*\n]+)\*(?!\*)')
_MARKDOWN_HEADING = re.compile(r'^[ \t]*#{1,6}[ \t]+', re.MULTILINE)
_MARKDOWN_LIST = re.compile(r'^[ \t]*[-*+][ \t]+', re.MULTILINE)


def strip_markdown(text: str) -> str:
    """Drop common markdown markup while keeping the text a reader would see.

    Links, bold, inline code and italics are reduced to their inner text;
    heading markers (``#`` .. ``######``) and bullet markers (``-``, ``*``,
    ``+``) at the start of a line are removed.
    """
    # Wrappers whose capture group 1 is the visible text; applied in this order.
    keep_inner = (
        _MARKDOWN_LINK,
        _MARKDOWN_BOLD,
        _MARKDOWN_CODE,
        _MARKDOWN_ITALIC,
    )
    # Line prefixes that are removed outright.
    drop_prefix = (_MARKDOWN_HEADING, _MARKDOWN_LIST)

    for rx in keep_inner:
        text = rx.sub(r'\1', text)
    for rx in drop_prefix:
        text = rx.sub('', text)
    return text
|
||||
|
||||
|
||||
# ---------- Number helpers ----------
|
||||
|
||||
def _needs_de(n: int) -> bool:
    """Tell whether Romanian puts 'de' between the numeral *n* and its noun.

    'de' is used from 20 upwards, except when the last two digits fall in
    1..19 (105, 119 -> no 'de'; 120, 200 -> 'de').
    """
    if n < 20:
        return False
    tail = n % 100
    return not 1 <= tail <= 19
|
||||
|
||||
|
||||
def _int_to_ro(n: int) -> str:
    """Spell the integer *n* in Romanian words via ``num2words(n, lang='ro')``."""
    return num2words(n, lang='ro')
|
||||
|
||||
|
||||
def _decimal_to_ro(s: str) -> str:
    """Spell a decimal string ``'X.Y'`` in Romanian, joined by 'virgulă'.

    The fractional digits are read as one number ('3.14' -> 'trei virgulă
    paisprezece'). When they start with a zero and are more than one digit
    long ('3.05'), they are read digit by digit ('trei virgulă zero cinci')
    so the leading zeros are not lost.
    """
    whole_digits, frac_digits = s.split('.', 1)
    spoken_whole = _int_to_ro(int(whole_digits))

    digit_by_digit = len(frac_digits) > 1 and frac_digits.startswith('0')
    if digit_by_digit:
        spoken_frac = ' '.join(_int_to_ro(int(ch)) for ch in frac_digits)
    else:
        spoken_frac = _int_to_ro(int(frac_digits))

    return f"{spoken_whole} virgulă {spoken_frac}"
|
||||
|
||||
|
||||
# ---------- Numbers ----------
|
||||
|
||||
_NUM_TOKEN = re.compile(r'(?<!\w)(\d+(?:\.\d+)?)(?!\w)')


def expand_numbers_ro(text: str) -> str:
    """Replace stand-alone numeric tokens with their Romanian spelling.

    A token is a run of digits, optionally followed by ``.`` and more digits,
    that is not glued to a word character on either side. Tokens containing a
    dot are read as decimals (with 'virgulă'); the rest as integers.
    Currency amounts are expected to have been expanded by expand_currency
    before this runs.
    """
    def _spell(hit: re.Match) -> str:
        digits = hit.group(1)
        return _decimal_to_ro(digits) if '.' in digits else _int_to_ro(int(digits))

    return _NUM_TOKEN.sub(_spell, text)
|
||||
|
||||
|
||||
# ---------- Thousands separator ----------
|
||||
|
||||
# Romanian uses dot or space as thousands separator: 384.000 / 384 000. The
# decimal expander would read "384.000" as "trei sute optzeci și patru virgulă
# zero zero zero" — wrong. Collapse the dots so expand_numbers_ro reads the
# whole integer. Only 1-3 leading digits followed by ≥1 group of exactly 3
# digits, never adjacent to other digits.
#
# Two guards keep real decimals and dotted identifiers intact:
#   * the leading group must start with 1-9, so "0.125" stays a decimal
#     instead of being collapsed to "0125" (which would be read as 125);
#   * the match may not start right after a dot, so the tail of a dotted
#     string such as "1.2.345" is not rewritten to "1.2345".
_THOUSANDS_DOT = re.compile(r'(?<![\d.])([1-9]\d{0,2}(?:\.\d{3})+)(?!\d)')


def normalize_thousands(text: str) -> str:
    """Strip the dots from Romanian thousands-separated integers.

    Args:
        text: Arbitrary text that may contain numbers such as ``384.000``
            or ``1.234.567``.

    Returns:
        The text with each such number rewritten without dots (``384000``).
        Numbers that do not match the grouped form (``3.14``, ``0.125``,
        ``12.3456``) are left untouched.
    """
    return _THOUSANDS_DOT.sub(lambda m: m.group(1).replace('.', ''), text)
|
||||
|
||||
|
||||
# ---------- Metric units ----------
|
||||
|
||||
# (regex_matching_<n><unit>, singular, plural). Each regex matches an integer
# or decimal (dot or comma) followed by the abbreviation as a whole word,
# case-insensitively. Bare ``m`` and ``l`` are deliberately left out because
# they collide with too many tokens ("M2" voice id, list markers).
_UNIT_PATTERNS: list[tuple[re.Pattern, str, str]] = [
    (
        re.compile(rf'(?<!\w)(\d+(?:[.,]\d+)?)\s*{abbr}\b', re.IGNORECASE),
        one,
        many,
    )
    for abbr, one, many in (
        ('km', 'kilometru', 'kilometri'),
        ('kg', 'kilogram', 'kilograme'),
        ('cm', 'centimetru', 'centimetri'),
        ('mm', 'milimetru', 'milimetri'),
        ('ml', 'mililitru', 'mililitri'),
        ('ha', 'hectar', 'hectare'),
        ('mp', 'metru pătrat', 'metri pătrați'),
    )
]
|
||||
|
||||
|
||||
def _format_unit(amount_str: str, singular: str, plural: str) -> str:
    """Spell ``<amount> <unit>`` for a metric unit.

    Integer amounts are delegated to ``_format_currency_unit`` (singular for
    1, 'de' particle where needed). Amounts containing ``.`` or ``,`` are read
    as decimals and always take the plural noun.
    """
    is_decimal = '.' in amount_str or ',' in amount_str
    if not is_decimal:
        return _format_currency_unit(int(amount_str), singular, plural)
    # The decimal reader only understands '.', so normalise the comma first.
    spoken_amount = _decimal_to_ro(amount_str.replace(',', '.'))
    return f"{spoken_amount} {plural}"
|
||||
|
||||
|
||||
def expand_units(text: str) -> str:
    """Rewrite ``<number><unit abbreviation>`` occurrences as spoken Romanian.

    Every entry of ``_UNIT_PATTERNS`` is applied in turn over the whole text.
    """
    for rx, one, many in _UNIT_PATTERNS:
        # Bind this iteration's nouns as defaults so the callback does not
        # see a later iteration's values.
        def _spell(hit: re.Match, _one: str = one, _many: str = many) -> str:
            return _format_unit(hit.group(1), _one, _many)

        text = rx.sub(_spell, text)
    return text
|
||||
|
||||
|
||||
# ---------- Time ----------
|
||||
|
||||
_TIME_PATTERN = re.compile(r'(?<!\d)([01]?\d|2[0-3]):([0-5]?\d)(?!\d)')

# Clock hours agree with the feminine noun "ora" ("ora două", "ora
# douăsprezece"). The explicit "două" cases in this module imply that
# _int_to_ro returns the masculine cardinal ("doi", "doisprezece"), so the
# hours that differ by gender are spelled here. Hours built on "unu" (1, 21)
# are left to _int_to_ro.
_FEMININE_HOURS = {
    2: "două",
    12: "douăsprezece",
    22: "douăzeci și două",
}


def _format_minutes_ro(n: int) -> str:
    """Spell a minute count (0-59) with the noun 'minut' / 'minute'.

    'minut' is a neuter noun, so its plural takes the feminine numeral forms
    ("două minute", "douăsprezece minute"). Counts from 20 upwards insert the
    'de' particle ("douăzeci de minute").
    """
    if n == 1:
        return "un minut"
    if n == 2:
        return "două minute"
    if n == 12:
        # Without this case the masculine "doisprezece" would be used.
        return "douăsprezece minute"
    if n < 20:
        return f"{_int_to_ro(n)} minute"
    last = n % 10
    rest = n - last
    if last == 0:
        return f"{_int_to_ro(n)} de minute"
    if last == 1:
        return f"{_int_to_ro(rest)} și una de minute"
    if last == 2:
        return f"{_int_to_ro(rest)} și două de minute"
    return f"{_int_to_ro(rest)} și {_int_to_ro(last)} de minute"


def expand_time(text: str) -> str:
    """Expand ``HH:MM`` clock times into colloquial Romanian.

    23:09 -> "douăzeci și trei și nouă minute"
    23:00 -> "douăzeci și trei fix"
    2:00  -> "două fix"

    Args:
        text: Text that may contain clock times (hours 0-23, minutes 0-59).

    Returns:
        The text with every recognised time replaced by its spoken form;
        everything else is unchanged.
    """
    def _sub(match: re.Match) -> str:
        h = int(match.group(1))
        m = int(match.group(2))
        hour_str = _FEMININE_HOURS.get(h) or _int_to_ro(h)
        if m == 0:
            return f"{hour_str} fix"
        return f"{hour_str} și {_format_minutes_ro(m)}"

    return _TIME_PATTERN.sub(_sub, text)
|
||||
|
||||
|
||||
# ---------- Currency ----------
|
||||
|
||||
# Currency code -> (singular, plural) of the main unit.
_CURRENCY_MAIN = {
    'RON': ('leu', 'lei'),
    'USD': ('dolar', 'dolari'),
    'EUR': ('euro', 'euro'),
    'GBP': ('liră', 'lire'),
}

# Currency code -> (singular, plural) of the sub-unit (1/100 of the main unit).
_CURRENCY_SUB = {
    'RON': ('ban', 'bani'),
    'USD': ('cent', 'cenți'),
    'EUR': ('cent', 'cenți'),
    'GBP': ('penny', 'pence'),
}

# Amount = integer with an optional fraction. The fraction is either ".<digits>"
# or the Romanian decimal comma followed by exactly 1-2 digits ("12,50").
# Limiting the comma form to 1-2 digits keeps "1,000" from being read as
# "1 and 00 sub-units". The metric-unit patterns in this module accept a
# comma decimal as well.
_CURRENCY_PATTERNS = [
    # RON suffix (case-insensitive: RON, ron, lei)
    (re.compile(r'(?<!\w)(\d+(?:\.\d+|,\d{1,2}(?!\d))?)\s+(?:RON|lei)\b', re.IGNORECASE), 'RON'),
    # Prefix currencies
    (re.compile(r'\$(\d+(?:\.\d+|,\d{1,2}(?!\d))?)'), 'USD'),
    (re.compile(r'€(\d+(?:\.\d+|,\d{1,2}(?!\d))?)'), 'EUR'),
    (re.compile(r'£(\d+(?:\.\d+|,\d{1,2}(?!\d))?)'), 'GBP'),
]


def _format_currency_unit(n: int, singular: str, plural: str) -> str:
    """Format integer amount + noun with proper RO singular/plural and the
    'de' particle. Uses 'un' (article) for n=1, not 'unu' (cardinal).

    NOTE(review): the article and numeral are always masculine here, so a
    feminine noun such as 'liră' yields "un liră" — gender agreement is not
    handled.
    """
    if n == 1:
        return f"un {singular}"
    word = _int_to_ro(n)
    if _needs_de(n):
        return f"{word} de {plural}"
    return f"{word} {plural}"


def _format_currency(amount: str, code: str) -> str:
    """Spell *amount* (digits with optional '.'/',' fraction) of currency *code*.

    The fraction is interpreted as sub-units: it is padded or truncated to
    two digits, and omitted from the output when it is zero.

    Raises:
        KeyError: if *code* is not a key of ``_CURRENCY_MAIN``.
    """
    main_sg, main_pl = _CURRENCY_MAIN[code]
    # Accept the Romanian decimal comma: "12,50" is handled like "12.50".
    amount = amount.replace(',', '.')
    if '.' not in amount:
        return _format_currency_unit(int(amount), main_sg, main_pl)

    whole_s, frac_s = amount.split('.', 1)
    # Normalize fractional part to 2 digits so "12.5 RON" reads as
    # 50 bani, not 5 bani; extra digits are truncated, not rounded.
    frac_s = frac_s[:2].ljust(2, '0')
    whole = int(whole_s)
    frac = int(frac_s)
    whole_part = _format_currency_unit(whole, main_sg, main_pl)
    if frac == 0:
        return whole_part
    sub_sg, sub_pl = _CURRENCY_SUB[code]
    frac_part = _format_currency_unit(frac, sub_sg, sub_pl)
    return f"{whole_part} și {frac_part}"


def expand_currency(text: str) -> str:
    """Expand currency amounts into natural Romanian.

    Recognises ``<n> RON`` / ``<n> lei`` suffix and ``$``, ``€``, ``£`` prefix
    forms with an optional fractional part written with a dot or a decimal
    comma (treated as sub-unit: bani / cenți / pence).

    Args:
        text: Text that may contain currency amounts.

    Returns:
        The text with every recognised amount replaced by its spoken form.
    """
    for pattern, code in _CURRENCY_PATTERNS:
        text = pattern.sub(lambda m, c=code: _format_currency(m.group(1), c), text)
    return text
|
||||
|
||||
|
||||
# ---------- Symbols ----------
|
||||
|
||||
def expand_symbols(text: str) -> str:
    """Spell out ``%``, ``&``, ``@`` and ``°`` in Romanian.

    After the substitutions, runs of whitespace are collapsed to a single
    space and the result is stripped at both ends.
    """
    spoken_forms = (
        ('%', ' la sută'),
        ('&', ' și '),
        ('@', ' la '),
        ('°', ' grade'),
    )
    for symbol, spoken in spoken_forms:
        text = text.replace(symbol, spoken)
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
|
||||
|
||||
|
||||
from tools.tts import sanitize_for_supertonic as sanitize_punctuation
|
||||
|
||||
|
||||
# ---------- Abbreviations ----------
|
||||
|
||||
# (pattern, spoken form). Longer patterns come first so 'ș.a.m.d.' wins over
# 'ș.a.'. Every pattern is case-insensitive and must not follow a word
# character; both the comma-below and the cedilla form of 'ș' are accepted.
_ABBREVIATIONS = [
    (re.compile(rf'(?<!\w){abbr}', re.IGNORECASE), spoken)
    for abbr, spoken in (
        (r'[șş]\.a\.m\.d\.', 'și așa mai departe'),
        (r'[șş]\.a\.', 'și altele'),
        (r'etc\.', 'etcetera'),
        (r'dl\.', 'domnul'),
        (r'dna\.', 'doamna'),
        (r'nr\.', 'numărul'),
    )
]


def expand_abbreviations(text: str) -> str:
    """Replace the known Romanian abbreviations with their spoken form."""
    expanded = text
    for rx, spoken in _ABBREVIATIONS:
        expanded = rx.sub(spoken, expanded)
    return expanded
|
||||
|
||||
|
||||
# ---------- Top-level pipeline ----------
|
||||
|
||||
_MAX_WORDS = 200
_TRUNCATE_SUFFIX = "Restul l-am scris în chat."


def normalize_for_tts(text: str) -> str:
    """Run every normalization stage in order, then cap the length.

    Stages: strip_markdown, sanitize_punctuation, expand_abbreviations,
    normalize_thousands, expand_time, expand_currency, expand_units,
    expand_numbers_ro, expand_symbols.

    When the result has more than 200 whitespace-separated words, only the
    first 200 are kept and "Restul l-am scris în chat." is appended, so the
    listener knows the response continues in the text channel mirror.
    """
    stages = (
        strip_markdown,
        sanitize_punctuation,
        expand_abbreviations,
        normalize_thousands,
        expand_time,
        expand_currency,
        expand_units,
        expand_numbers_ro,
        expand_symbols,
    )
    for stage in stages:
        text = stage(text)

    tokens = text.split()
    if len(tokens) > _MAX_WORDS:
        kept = ' '.join(tokens[:_MAX_WORDS])
        text = kept + f" {_TRUNCATE_SUFFIX}"
    return text.strip()
|
||||
651
src/voice/pipeline.py
Normal file
651
src/voice/pipeline.py
Normal file
@@ -0,0 +1,651 @@
|
||||
"""Central voice pipeline: VAD -> STT -> Claude -> TTS for Discord voice.
|
||||
|
||||
``VoiceSession`` binds per-call state — voice_client, TTS queue, transcript
|
||||
JSONL buffer, whitelist, presence — and exposes a single idempotent
|
||||
``cleanup()`` invoked from every exit path (user /voice leave, network
|
||||
disconnect, crash via ``__exit__``, auto-leave timer, user leaves channel).
|
||||
|
||||
``EchoVoiceSink`` is the discord-ext-voice-recv ``AudioSink`` subclass that
|
||||
runs in the voice_recv reader thread. It batches 20ms PCM packets into
|
||||
100ms windows for silero-vad inference, marks per-user speech timestamps,
|
||||
and on 800ms cumulative silence flushes the accumulated audio through
|
||||
faster-whisper. Hallucinated segments (``no_speech_prob > 0.6``) are
|
||||
dropped. Valid transcripts are scheduled onto the session's event loop
|
||||
via ``asyncio.run_coroutine_threadsafe``.
|
||||
|
||||
The bot's own ``user.id`` is filtered FIRST inside ``write()`` — load-bearing
|
||||
echo prevention so a future whitelist expansion (Bianca, etc.) never lets
|
||||
the bot transcribe itself.
|
||||
|
||||
See plan: ``src/voice/pipeline.py`` (Pas 5), Engineering decisions #4
|
||||
(VAD 100ms batched), #5 (cleanup centralizat), #7 (bot.user.id explicit
|
||||
guard).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from src.voice._discord_voice_adapter import AudioSink, VoiceData
|
||||
from src.voice.voice_commands import detect_voice_change
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Discord delivers 48kHz s16le stereo PCM, 20ms per packet (3840 bytes).
SAMPLE_RATE_DISCORD = 48000
SAMPLE_RATE_WHISPER = 16000  # 48000 / 3, the rate after downsampling
PACKET_MS = 20
PACKET_BYTES = 3840  # 48000 Hz * 0.020 s * 2 channels * 2 bytes
VAD_WINDOW_MS = 100  # batch 5 * 20ms packets per VAD inference (Decision #4)
VAD_WINDOW_BYTES = PACKET_BYTES * (VAD_WINDOW_MS // PACKET_MS)  # 5 packets = 19200 bytes
VAD_THRESHOLD = 0.5
SILENCE_FLUSH_MS = 800  # silence duration that triggers the STT flush (see module docstring)
NO_SPEECH_DROP_THRESHOLD = 0.6  # segments with no_speech_prob above this are dropped

# Three directory levels above this file.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
LOGS_DIR = PROJECT_ROOT / "logs"
VOICE_METRICS_PATH = LOGS_DIR / "voice_metrics.jsonl"
|
||||
|
||||
|
||||
# ---------- Lazy model singletons ----------
|
||||
|
||||
_whisper_model: Any = None
_whisper_lock = threading.Lock()
_silero_model: Any = None
_silero_get_timestamps: Any = None
_silero_lock = threading.Lock()


def _get_whisper_model() -> Any:
    """Return the shared faster-whisper model, creating it on first use.

    The model is ``small`` / int8 on CPU with the spike-validated
    ``cpu_threads=4`` (see ``tasks/voice-bench-results.md``) and
    ``local_files_only=True``.
    """
    global _whisper_model
    if _whisper_model is None:
        with _whisper_lock:
            # Re-check under the lock: another thread may have finished
            # loading while this one was waiting.
            if _whisper_model is None:
                from faster_whisper import WhisperModel
                _whisper_model = WhisperModel(
                    "small",
                    device="cpu",
                    compute_type="int8",
                    cpu_threads=4,
                    local_files_only=True,
                )
    return _whisper_model
|
||||
|
||||
|
||||
def _get_silero_vad():
    """Return ``(model, get_speech_timestamps)`` for silero-vad, loading once."""
    global _silero_model, _silero_get_timestamps
    if _silero_model is None:
        with _silero_lock:
            # Re-check under the lock in case another thread loaded it first.
            if _silero_model is None:
                from silero_vad import get_speech_timestamps, load_silero_vad
                _silero_model = load_silero_vad()
                _silero_get_timestamps = get_speech_timestamps
    return _silero_model, _silero_get_timestamps
|
||||
|
||||
|
||||
# ---------- Audio helpers ----------
|
||||
|
||||
def _pcm48_stereo_to_16_mono(pcm: bytes) -> np.ndarray:
    """Discord 48kHz s16le stereo bytes -> 16kHz mono float32 in [-1, 1].

    Cheap downsample: average the two channels, then average every 3
    samples (48k / 3 = 16k). faster-whisper + silero-vad accept the
    resulting ``np.float32`` array directly.

    Args:
        pcm: Interleaved 16-bit stereo PCM. A trailing partial stereo frame
            (any byte count that is not a multiple of 4) is ignored.

    Returns:
        A 1-D ``float32`` array with one value per 3 complete stereo frames;
        empty when fewer than 3 complete frames are supplied.
    """
    frame_bytes = 4  # 2 channels * 2 bytes per int16 sample
    n_frames = len(pcm) // frame_bytes
    if n_frames == 0:
        return np.zeros(0, dtype=np.float32)

    # Decode only whole frames: np.frombuffer raises ValueError when the
    # buffer length is not a multiple of the element size, so a truncated
    # packet must be trimmed before decoding, not after.
    samples = np.frombuffer(pcm, dtype=np.int16, count=n_frames * 2)
    stereo = samples.reshape(-1, 2)
    mono = stereo.mean(axis=1).astype(np.float32) / 32768.0

    trim = (mono.size // 3) * 3
    if trim == 0:
        return np.zeros(0, dtype=np.float32)
    mono = mono[:trim].reshape(-1, 3).mean(axis=1)
    return mono.astype(np.float32)
|
||||
|
||||
|
||||
# ---------- VoiceSession ----------
|
||||
|
||||
class VoiceSession:
|
||||
"""Per-voice-call state with a single idempotent ``cleanup()``."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
channel_id: int,
|
||||
guild_id: int,
|
||||
text_channel: Any,
|
||||
voice_client: Any,
|
||||
bot: Any,
|
||||
ttsq: Any,
|
||||
whitelist: Optional[set] = None,
|
||||
record_enabled: bool = False,
|
||||
mirror_enabled: bool = True,
|
||||
transcripts_jsonl_path: Optional[Path] = None,
|
||||
loop: Optional[asyncio.AbstractEventLoop] = None,
|
||||
router_route_message: Optional[Callable] = None,
|
||||
):
|
||||
self.channel_id = int(channel_id)
|
||||
self.guild_id = int(guild_id)
|
||||
self.text_channel = text_channel
|
||||
self.voice_client = voice_client
|
||||
self.bot = bot
|
||||
self.ttsq = ttsq
|
||||
self.whitelist: set = set(whitelist or set())
|
||||
self.record_enabled = bool(record_enabled)
|
||||
self.mirror_enabled = bool(mirror_enabled)
|
||||
self.transcripts_jsonl_path = transcripts_jsonl_path
|
||||
self.loop = loop
|
||||
# Injection seam so tests can replace router.route_message without
|
||||
# mocking the whole module.
|
||||
if router_route_message is None:
|
||||
from src.router import route_message as _rm
|
||||
self._route_message = _rm
|
||||
else:
|
||||
self._route_message = router_route_message
|
||||
|
||||
self.last_activity_ts = time.monotonic()
|
||||
self._jsonl_fh = None
|
||||
self._lock = threading.Lock()
|
||||
self._cleaned_up = False
|
||||
self._lock_owner_thread: Optional[int] = None
|
||||
|
||||
# ----- context manager -----
|
||||
|
||||
def __enter__(self) -> "VoiceSession":
|
||||
self._lock.acquire()
|
||||
self._lock_owner_thread = threading.get_ident()
|
||||
if self.record_enabled and self.transcripts_jsonl_path is not None:
|
||||
try:
|
||||
self.transcripts_jsonl_path.parent.mkdir(
|
||||
parents=True, exist_ok=True,
|
||||
)
|
||||
self._jsonl_fh = open(
|
||||
self.transcripts_jsonl_path, "a",
|
||||
buffering=1, encoding="utf-8",
|
||||
)
|
||||
except OSError as e:
|
||||
log.warning("voice transcript open failed: %s", e)
|
||||
self._jsonl_fh = None
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb) -> bool:
|
||||
self.cleanup("exit")
|
||||
return False # never suppress exceptions
|
||||
|
||||
# ----- cleanup (centralized, idempotent) -----
|
||||
|
||||
def cleanup(self, reason: str) -> None:
|
||||
"""Single drain path for ALL 5 exit scenarios. Safe to call twice."""
|
||||
if self._cleaned_up:
|
||||
return
|
||||
self._cleaned_up = True
|
||||
|
||||
# 1. Flush or discard JSONL transcript.
|
||||
if self._jsonl_fh is not None:
|
||||
try:
|
||||
self._jsonl_fh.flush()
|
||||
self._jsonl_fh.close()
|
||||
except Exception as e: # noqa: BLE001
|
||||
log.warning("voice transcript flush failed: %s", e)
|
||||
self._jsonl_fh = None
|
||||
if (not self.record_enabled
|
||||
and self.transcripts_jsonl_path is not None
|
||||
and self.transcripts_jsonl_path.exists()):
|
||||
try:
|
||||
self.transcripts_jsonl_path.unlink()
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
# 2. Restore bot presence (clear Listening activity).
|
||||
if self.bot is not None:
|
||||
try:
|
||||
change = getattr(self.bot, "change_presence", None)
|
||||
if callable(change):
|
||||
coro = change(activity=None)
|
||||
if asyncio.iscoroutine(coro):
|
||||
if self.loop is not None and self.loop.is_running():
|
||||
asyncio.run_coroutine_threadsafe(coro, self.loop)
|
||||
else:
|
||||
# Best-effort: close the coroutine so Python
|
||||
# doesn't emit "coroutine was never awaited".
|
||||
coro.close()
|
||||
except Exception as e: # noqa: BLE001
|
||||
log.warning("voice presence restore failed: %s", e)
|
||||
|
||||
# 3. Tear down the voice client.
|
||||
if self.voice_client is not None:
|
||||
try:
|
||||
self.voice_client.cleanup()
|
||||
except Exception as e: # noqa: BLE001
|
||||
log.warning("voice_client.cleanup failed: %s", e)
|
||||
|
||||
# 4. Stop the TTS queue worker.
|
||||
if self.ttsq is not None:
|
||||
try:
|
||||
self.ttsq.stop()
|
||||
except Exception as e: # noqa: BLE001
|
||||
log.warning("ttsq.stop failed: %s", e)
|
||||
|
||||
# 5. Release the session lock (held since __enter__).
|
||||
try:
|
||||
if self._lock.locked():
|
||||
self._lock.release()
|
||||
except RuntimeError:
|
||||
# Released from a different thread than acquired it — already
|
||||
# free for the next caller; nothing to do.
|
||||
pass
|
||||
|
||||
self._log_metric({"event": "cleanup", "reason": reason})
|
||||
|
||||
# ----- segment completion (scheduled from sink) -----
|
||||
|
||||
async def on_segment_done(
|
||||
self,
|
||||
speaker_id: int,
|
||||
text: str,
|
||||
no_speech_prob: float,
|
||||
) -> None:
|
||||
"""Mirror, persist, route to Claude, drive TTS via streaming callback."""
|
||||
if self._cleaned_up:
|
||||
return
|
||||
self.last_activity_ts = time.monotonic()
|
||||
speaker_name = self._resolve_speaker_name(speaker_id)
|
||||
|
||||
# Drop any TTS frames from the previous turn so a new utterance cuts off
|
||||
# stale Echo speech (barge-in) and never mixes with the new response.
|
||||
try:
|
||||
self.ttsq.clear()
|
||||
except Exception as e: # noqa: BLE001
|
||||
log.warning("ttsq.clear failed: %s", e)
|
||||
|
||||
# In-band voice command: change TTS voice without round-tripping Claude.
|
||||
new_voice = detect_voice_change(text)
|
||||
if new_voice is not None:
|
||||
await self._handle_voice_change(speaker_name, text, new_voice)
|
||||
return
|
||||
|
||||
# 1. Mirror to text channel (one Unicode 🎤 — exception per plan).
|
||||
if self.mirror_enabled and self.text_channel is not None:
|
||||
try:
|
||||
send = getattr(self.text_channel, "send", None)
|
||||
if callable(send):
|
||||
coro = send(f"\U0001f3a4 {speaker_name}: \"{text}\"")
|
||||
if asyncio.iscoroutine(coro):
|
||||
await coro
|
||||
except Exception as e: # noqa: BLE001
|
||||
log.warning("voice mirror send failed: %s", e)
|
||||
|
||||
# 2. Append to JSONL transcript buffer if recording.
|
||||
if self._jsonl_fh is not None:
|
||||
try:
|
||||
self._jsonl_fh.write(
|
||||
json.dumps({
|
||||
"ts": time.time(),
|
||||
"speaker_id": speaker_id,
|
||||
"speaker": speaker_name,
|
||||
"text": text,
|
||||
"no_speech_prob": no_speech_prob,
|
||||
}, ensure_ascii=False) + "\n"
|
||||
)
|
||||
except Exception as e: # noqa: BLE001
|
||||
log.warning("voice transcript write failed: %s", e)
|
||||
|
||||
block_count = [0]
|
||||
|
||||
def voice_stream_callback(block: str) -> None:
|
||||
"""Called once per Claude streamed text block — pushes to TTS."""
|
||||
block_count[0] += 1
|
||||
log.info("voice stream block #%d (%d chars): %r",
|
||||
block_count[0], len(block or ""), (block or "")[:80])
|
||||
try:
|
||||
self.ttsq.push_text(block)
|
||||
except Exception as e: # noqa: BLE001
|
||||
log.warning("ttsq.push_text failed: %s", e)
|
||||
|
||||
# Dispatch to Claude. send_message is sync subprocess, run on
|
||||
# a worker thread so the loop stays responsive for mirror/TTS.
|
||||
try:
|
||||
await asyncio.to_thread(
|
||||
self._route_message,
|
||||
str(self.channel_id),
|
||||
str(speaker_id),
|
||||
text,
|
||||
None, # model
|
||||
voice_stream_callback, # on_text
|
||||
"discord-voice", # adapter_name
|
||||
)
|
||||
except Exception as e: # noqa: BLE001
|
||||
log.error("route_message voice path failed: %s", e)
|
||||
|
||||
async def _handle_voice_change(
|
||||
self, speaker_name: str, original_text: str, new_voice: str,
|
||||
) -> None:
|
||||
"""Apply an in-band 'change voice' command: swap live, persist to
|
||||
config, mirror to chat, speak a short acknowledgment in the new voice.
|
||||
Does NOT forward the utterance to Claude."""
|
||||
# 1. Live-swap on the TTS queue. Next clause synth uses the new voice.
|
||||
try:
|
||||
self.ttsq.voice_id = new_voice
|
||||
except Exception as e: # noqa: BLE001
|
||||
log.warning("ttsq voice swap failed: %s", e)
|
||||
# 2. Persist as the new default for future sessions.
|
||||
try:
|
||||
from src.config import Config
|
||||
cfg = Config()
|
||||
cfg.set("voice.default_voice", new_voice)
|
||||
cfg.save()
|
||||
except Exception as e: # noqa: BLE001
|
||||
log.warning("voice default persist failed: %s", e)
|
||||
# 3. Mirror what was heard + show the swap in the text channel.
|
||||
if self.mirror_enabled and self.text_channel is not None:
|
||||
try:
|
||||
send = getattr(self.text_channel, "send", None)
|
||||
if callable(send):
|
||||
coro = send(
|
||||
f"\U0001f3a4 {speaker_name}: \"{original_text}\"\n"
|
||||
f"\U0001f50a Voce → **{new_voice}**"
|
||||
)
|
||||
if asyncio.iscoroutine(coro):
|
||||
await coro
|
||||
except Exception as e: # noqa: BLE001
|
||||
log.warning("voice mirror send failed: %s", e)
|
||||
# 4. Verbal acknowledgment in the NEW voice.
|
||||
try:
|
||||
self.ttsq.push_text(f"Vocea {new_voice}.")
|
||||
except Exception as e: # noqa: BLE001
|
||||
log.warning("voice ack push failed: %s", e)
|
||||
self._log_metric({"event": "voice_change", "new_voice": new_voice})
|
||||
|
||||
# ----- helpers -----
|
||||
|
||||
def _resolve_speaker_name(self, speaker_id: int) -> str:
|
||||
"""Best-effort display name lookup via the bot user cache."""
|
||||
try:
|
||||
if self.bot is not None and hasattr(self.bot, "get_user"):
|
||||
user = self.bot.get_user(speaker_id)
|
||||
if user is not None:
|
||||
name = getattr(user, "display_name", None) or getattr(
|
||||
user, "name", None,
|
||||
)
|
||||
if name:
|
||||
return str(name)
|
||||
except Exception: # noqa: BLE001
|
||||
pass
|
||||
return str(speaker_id)
|
||||
|
||||
def _log_metric(self, payload: dict) -> None:
    """Append one structured event line to ``logs/voice_metrics.jsonl``.

    The record carries the current timestamp and ``self.channel_id``;
    keys in ``payload`` are merged on top and therefore win on conflict.
    Filesystem errors are swallowed on purpose: metrics are best effort
    and must not disturb the voice session.
    """
    base = {"ts": time.time(), "channel_id": self.channel_id}
    record = {**base, **payload}
    try:
        LOGS_DIR.mkdir(parents=True, exist_ok=True)
        # buffering=1 -> line-buffered text mode, so each event is flushed
        # as soon as its trailing newline is written.
        with open(VOICE_METRICS_PATH, "a", buffering=1, encoding="utf-8") as sink:
            line = json.dumps(record, ensure_ascii=False)
            sink.write(line + "\n")
    except OSError:
        pass
|
||||
|
||||
|
||||
# ---------- EchoVoiceSink ----------
|
||||
|
||||
class EchoVoiceSink(AudioSink):
    """PCM-in sink: per-user 20ms buffer -> 100ms VAD windows -> 800ms
    silence triggers Whisper STT -> schedules ``on_segment_done`` on the
    session loop.

    Lives in the voice_recv reader thread; uses ``threading`` primitives
    only (no asyncio in the hot path).

    Per-user state (audio buffer, VAD accumulator, last-speech timestamp,
    has-speech flag) is guarded by ``_sink_lock`` because it is shared
    between ``write()`` and the background silence-flush poller.

    NOTE(review): ``_flush_to_stt`` runs the transcription synchronously on
    whichever thread notices the silence (``write()`` caller or the poller).
    """

    # Upper bound, in VAD windows, on the audio retained for a user while the
    # VAD has not fired for them yet. The buffer is only emptied by a flush and
    # a flush requires detected speech, so without this cap a stream that never
    # crosses the VAD threshold would grow the buffer for the whole session.
    PRE_SPEECH_MAX_WINDOWS = 10

    # Trim granularity in bytes so a trimmed buffer stays sample-aligned.
    # assumes 16-bit stereo input (2 channels x 2 bytes) — TODO confirm
    _PCM_FRAME_ALIGN = 4

    def __init__(self, session: VoiceSession, bot_user_id: int):
        """Bind the sink to ``session`` and start the silence-flush poller.

        Args:
            session: owning voice session; provides ``whitelist``, ``ttsq``,
                ``loop`` and ``on_segment_done``.
            bot_user_id: the bot's own user id (``None`` is stored as 0);
                audio from this id is ignored by ``write()``.
        """
        super().__init__()
        self.session = session
        self.bot_user_id = int(bot_user_id) if bot_user_id is not None else 0
        # Snapshot taken at construction time; empty set means "accept everyone".
        self.whitelist: set = set(session.whitelist or set())
        self._user_buffers: dict[int, bytearray] = {}
        self._packet_accum: dict[int, bytearray] = {}
        self._last_speech_ts: dict[int, float] = {}
        self._has_speech: dict[int, bool] = {}
        self._sink_lock = threading.Lock()
        # Diagnostics: log once-per-user when packets first arrive and when
        # VAD first detects speech. Cheap, but tells us exactly where the
        # chain breaks when "I spoke but Echo heard nothing" happens.
        self._first_packet_logged: set[int] = set()
        self._first_speech_logged: set[int] = set()
        # Track consecutive VAD-positive windows per user. Used to delay
        # barge-in (don't cut Echo off on a single jittery VAD hit; require
        # ≥2 windows ≈ 200ms of sustained speech).
        self._vad_consecutive: dict[int, int] = {}
        # Background poller that triggers the silence flush even when Discord
        # DTX stops delivering RTP packets after the user stops speaking. Without
        # this, sink.write would stop firing and STT would never run on the
        # final utterance.
        self._poller_stop = threading.Event()
        self._poller_thread = threading.Thread(
            target=self._silence_flush_poller,
            name="echo-voice-flush-poller",
            daemon=True,
        )
        self._poller_thread.start()

    def wants_opus(self) -> bool:
        """Return False: this sink consumes ``voice_data.pcm``, not Opus."""
        return False

    def cleanup(self) -> None:
        """Stop the poller and drop all buffered per-user audio state."""
        self._poller_stop.set()
        with self._sink_lock:
            self._user_buffers.clear()
            self._packet_accum.clear()
            self._last_speech_ts.clear()
            self._has_speech.clear()
            # Reset the barge-in streaks too so no per-user state outlives
            # teardown; write() works on a local copy and tolerates this.
            self._vad_consecutive.clear()

    def write(self, user, voice_data: VoiceData) -> None:
        """Ingest one PCM packet for ``user``.

        Appends the packet to the user's utterance buffer, runs the VAD on
        every completed window, clears the TTS queue on sustained speech
        (barge-in) and flushes the utterance to STT once the user has been
        silent long enough. All failures are logged and swallowed.
        """
        # ---- FIRST GUARD (LOAD-BEARING): bot's own voice ---------------
        if user is None:
            return
        uid = int(getattr(user, "id", 0) or 0)
        if uid == 0:
            return
        if uid == self.bot_user_id:
            return

        # ---- SECOND GUARD: whitelist filter ----------------------------
        if self.whitelist and uid not in self.whitelist:
            return

        pcm = getattr(voice_data, "pcm", None)
        if not pcm:
            return

        if uid not in self._first_packet_logged:
            self._first_packet_logged.add(uid)
            log.info("voice sink: first PCM packet from user %s (%d bytes)", uid, len(pcm))

        window_pcm: Optional[bytes] = None
        pcm_for_stt: Optional[bytes] = None

        try:
            with self._sink_lock:
                buf = self._user_buffers.setdefault(uid, bytearray())
                accum = self._packet_accum.setdefault(uid, bytearray())
                buf.extend(pcm)
                accum.extend(pcm)
                if not self._has_speech.get(uid):
                    # Nothing detected yet: keep only a bounded pre-roll.
                    self._trim_pre_speech(buf)
                if len(accum) >= VAD_WINDOW_BYTES:
                    window_pcm = bytes(accum[:VAD_WINDOW_BYTES])
                    del accum[:VAD_WINDOW_BYTES]

            if window_pcm is not None:
                if self._vad_detects_speech(window_pcm):
                    if uid not in self._first_speech_logged:
                        self._first_speech_logged.add(uid)
                        log.info("voice sink: VAD detected speech from user %s", uid)
                    # Local copy so a concurrent cleanup() cannot turn the
                    # threshold check below into a KeyError.
                    streak = self._vad_consecutive.get(uid, 0) + 1
                    self._vad_consecutive[uid] = streak
                    with self._sink_lock:
                        self._last_speech_ts[uid] = time.monotonic()
                        self._has_speech[uid] = True
                    # Fast barge-in: after ≥2 consecutive VAD windows (~200ms
                    # of sustained speech), cut Echo's TTS mid-sentence so the
                    # user doesn't have to wait the full silence-flush + STT
                    # cycle (~3s).
                    if streak >= 2:
                        try:
                            ttsq = self.session.ttsq
                            if ttsq is not None and not ttsq.is_empty():
                                ttsq.clear()
                                log.info(
                                    "voice sink: barge-in cleared TTS queue (user=%s)",
                                    uid,
                                )
                        except Exception as e:  # noqa: BLE001
                            log.warning("barge-in clear failed: %s", e)
                else:
                    self._vad_consecutive[uid] = 0

            pcm_for_stt = self._take_flushable_pcm(uid)
            if pcm_for_stt:
                self._flush_to_stt(uid, pcm_for_stt)
        except Exception as e:  # noqa: BLE001
            log.warning("EchoVoiceSink.write failed: %s", e)

    def _trim_pre_speech(self, buf: bytearray) -> None:
        """Drop the oldest bytes of ``buf`` beyond the pre-speech cap.

        Must be called with ``_sink_lock`` held. The amount removed is rounded
        down to ``_PCM_FRAME_ALIGN`` so the remaining audio stays aligned.
        """
        limit = self.PRE_SPEECH_MAX_WINDOWS * VAD_WINDOW_BYTES
        excess = len(buf) - limit
        excess -= excess % self._PCM_FRAME_ALIGN
        if excess > 0:
            del buf[:excess]

    def _take_flushable_pcm(self, uid: int) -> Optional[bytes]:
        """If user `uid` has buffered speech that's been silent ≥SILENCE_FLUSH_MS,
        consume the buffer and return it. Otherwise return None."""
        with self._sink_lock:
            if not self._has_speech.get(uid):
                return None
            last = self._last_speech_ts.get(uid, 0.0)
            silence_ms = (time.monotonic() - last) * 1000.0
            if silence_ms < SILENCE_FLUSH_MS:
                return None
            pcm = bytes(self._user_buffers.get(uid, b""))
            # Reset everything for the next utterance while still locked so
            # write() and the poller can never both take the same audio.
            self._user_buffers[uid] = bytearray()
            self._packet_accum[uid] = bytearray()
            self._has_speech[uid] = False
        return pcm if pcm else None

    def _silence_flush_poller(self) -> None:
        """Background tick: Discord DTX stops sending RTP packets when the user
        goes silent, so the inline flush check in `write()` never fires for the
        last utterance. Poll every 200ms so the trailing audio actually reaches
        Whisper."""
        while not self._poller_stop.wait(0.2):
            try:
                with self._sink_lock:
                    pending = [uid for uid, has in self._has_speech.items() if has]
                for uid in pending:
                    pcm = self._take_flushable_pcm(uid)
                    if pcm:
                        self._flush_to_stt(uid, pcm)
            except Exception as e:  # noqa: BLE001
                log.warning("silence flush poller iter failed: %s", e)

    # ----- VAD -----

    def _vad_detects_speech(self, pcm48_stereo: bytes) -> bool:
        """Run silero-vad on a 100ms window. silero-vad v5+ requires exactly
        512 samples per call at 16kHz, so we slice the window into 512-sample
        chunks and return True if any chunk crosses the threshold.

        When ``torch`` cannot be imported, falls back to an RMS energy check.
        Any failure is logged at debug level and reported as "no speech".
        """
        try:
            mono16 = _pcm48_stereo_to_16_mono(pcm48_stereo)
            if mono16.size == 0:
                return False
            try:
                import torch
            except ImportError:
                rms = float(np.sqrt(np.mean(mono16.astype(np.float64) ** 2)))
                return rms > 0.02
            model, _ = _get_silero_vad()
            chunk = 512  # silero-vad v5+ hard requirement at 16kHz
            with torch.no_grad():
                # Trailing samples that do not fill a whole chunk are ignored.
                for start in range(0, mono16.size - chunk + 1, chunk):
                    seg = mono16[start:start + chunk]
                    p = float(model(torch.from_numpy(seg), SAMPLE_RATE_WHISPER).item())
                    if p >= VAD_THRESHOLD:
                        return True
            return False
        except Exception as e:  # noqa: BLE001
            log.debug("VAD inference failed: %s", e)
            return False

    # ----- STT flush -----

    def _flush_to_stt(self, user_id: int, pcm48_stereo: bytes) -> None:
        """Downsample, Whisper-transcribe RO, drop hallucinations, dispatch.

        Segments whose ``no_speech_prob`` exceeds ``NO_SPEECH_DROP_THRESHOLD``
        are discarded. The highest ``no_speech_prob`` seen across all segments
        (kept or dropped) is forwarded alongside the joined text. Nothing is
        dispatched when no text survives. Failures are logged and swallowed.
        """
        try:
            mono16 = _pcm48_stereo_to_16_mono(pcm48_stereo)
            if mono16.size == 0:
                return
            model = _get_whisper_model()
            segments, _info = model.transcribe(
                mono16, language="ro", beam_size=5,
                initial_prompt=(
                    "Echo Core, asistent personal AI românesc al lui Marius. "
                    "Conversație colocvială în română. "
                    "Comenzi voce recunoscute: schimbă vocea pe M1, M2, M3, M4, M5, "
                    "F1, F2, F3, F4, F5. Exemple: vorbește cu vocea M5, voce F3, "
                    "treci pe vocea F1."
                ),
                condition_on_previous_text=False,
            )
            text_parts: list[str] = []
            worst_no_speech = 0.0
            for seg in segments:
                no_sp = float(getattr(seg, "no_speech_prob", 0.0) or 0.0)
                if no_sp > worst_no_speech:
                    worst_no_speech = no_sp
                if no_sp > NO_SPEECH_DROP_THRESHOLD:
                    continue
                seg_text = (getattr(seg, "text", "") or "").strip()
                if seg_text:
                    text_parts.append(seg_text)
            if not text_parts:
                return
            text = " ".join(text_parts).strip()
            if not text:
                return
            self._schedule_segment_done(user_id, text, worst_no_speech)
        except Exception as e:  # noqa: BLE001
            log.warning("Whisper transcribe failed: %s", e)

    def _schedule_segment_done(
        self, user_id: int, text: str, no_speech_prob: float,
    ) -> None:
        """Hand a finished transcript to ``session.on_segment_done`` on the
        session's event loop.

        The segment is dropped when the loop is missing or not running. If
        scheduling fails the coroutine is closed so it is not left un-awaited,
        and an exception raised later by the handler is logged instead of
        being lost with the discarded future.
        """
        loop = self.session.loop
        if loop is None or not loop.is_running():
            log.debug("voice session loop missing — dropping segment")
            return
        try:
            coro = self.session.on_segment_done(user_id, text, no_speech_prob)
        except Exception as e:  # noqa: BLE001
            log.warning("voice segment dispatch failed: %s", e)
            return
        try:
            future = asyncio.run_coroutine_threadsafe(coro, loop)
        except Exception as e:  # noqa: BLE001
            if asyncio.iscoroutine(coro):
                coro.close()
            log.warning("voice segment dispatch failed: %s", e)
            return
        future.add_done_callback(self._log_segment_failure)

    @staticmethod
    def _log_segment_failure(future) -> None:
        """Done-callback: log an exception raised by the segment handler."""
        if future.cancelled():
            return
        exc = future.exception()
        if exc is not None:
            log.warning("voice segment handler failed: %s", exc)
|
||||
|
||||
|
||||
__all__ = [
|
||||
"VoiceSession",
|
||||
"EchoVoiceSink",
|
||||
"SILENCE_FLUSH_MS",
|
||||
"VAD_THRESHOLD",
|
||||
"VAD_WINDOW_MS",
|
||||
"NO_SPEECH_DROP_THRESHOLD",
|
||||
]
|
||||
333
src/voice/tts_stream.py
Normal file
333
src/voice/tts_stream.py
Normal file
@@ -0,0 +1,333 @@
|
||||
"""Streaming TTS with clause-level chunking for Discord voice mode.
|
||||
|
||||
A worker thread consumes text -> produces 20ms PCM frames on a queue.Queue.
|
||||
``EchoStreamingAudioSource`` pulls frames into Discord's audio thread so a
|
||||
single ``voice_client.play()`` call lasts the whole turn (eliminates the
|
||||
RTP gap between successive ``play()`` calls and the race with barge-in
|
||||
``stop()``). See plan: src/voice/tts_stream.py (Pas 6 / Lane TTS),
|
||||
Engineering decisions #6, #8, #15.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import logging
|
||||
import queue
|
||||
import re
|
||||
import subprocess
|
||||
import threading
|
||||
import wave
|
||||
from pathlib import Path
|
||||
from typing import Iterator, List, Optional
|
||||
|
||||
import discord
|
||||
|
||||
from src.voice.normalize import normalize_for_tts
|
||||
from tools.tts import synthesize
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Discord wants 20ms of 16-bit 48kHz stereo PCM per frame.
|
||||
# 48000 Hz * 0.020 s * 2 channels * 2 bytes = 3840 bytes.
|
||||
FRAME_BYTES = 3840
|
||||
TARGET_SAMPLE_RATE = 48000
|
||||
TARGET_CHANNELS = 2
|
||||
TARGET_SAMPLE_WIDTH = 2
|
||||
|
||||
# Sentinel pushed onto the text queue to ask the worker to exit cleanly.
|
||||
_POISON = object()
|
||||
|
||||
|
||||
# ---------- Clause segmentation ----------
|
||||
|
||||
# Split at Romanian sentence punctuation followed by whitespace. The
|
||||
# trailing whitespace requirement protects mid-number (1.000), mid-decimal
|
||||
# (12.5), and mid-abbreviation (M.D.) tokens, since none of those have a
|
||||
# space right after the inner punctuation.
|
||||
_CLAUSE_SPLIT = re.compile(r'(?<=[,;:.!?])\s+')


def clause_segments(text: str, min_words: int = 8) -> Iterator[str]:
    """Break ``text`` into clause-sized chunks suitable for streaming TTS.

    Boundaries are ``, ; : . ! ?`` followed by whitespace, which leaves
    numbers, decimals and dotted abbreviations in one piece. Clauses are
    accumulated until the running chunk holds at least ``min_words``
    words, then emitted. Whatever is left at the end is emitted as well,
    however short, so the tail of a response is never lost. ``None`` and
    blank input produce nothing.
    """
    stripped = text.strip() if text is not None else ''
    if not stripped:
        return
    pending = []
    word_total = 0
    for raw in _CLAUSE_SPLIT.split(stripped):
        piece = raw.strip() if raw else ''
        if not piece:
            continue
        pending.append(piece)
        word_total += len(piece.split())
        if word_total < min_words:
            continue
        yield ' '.join(pending)
        pending = []
        word_total = 0
    if pending:
        yield ' '.join(pending)
|
||||
|
||||
|
||||
# ---------- WAV -> PCM frame conversion ----------
|
||||
|
||||
def _ffmpeg_resample(wav_bytes: bytes, timeout: float = 30.0) -> bytes:
    """Convert any WAV payload to raw 48kHz stereo s16le PCM via ffmpeg.

    ffmpeg is already an Echo Core hard dependency (heartbeat, video
    transcription). Using a stdin/stdout pipe keeps the synth tempfile
    short-lived and avoids extra disk traffic.

    Args:
        wav_bytes: audio payload fed to ffmpeg on stdin.
        timeout: seconds to wait for ffmpeg before giving up.

    Returns:
        The raw PCM bytes ffmpeg wrote to stdout.

    Raises:
        RuntimeError: ffmpeg exited non-zero, could not be launched (for
            example it is not on PATH) or did not finish within ``timeout``.
            Every failure is reported as ``RuntimeError`` so callers that
            guard the conversion with ``except RuntimeError`` see all of them.
    """
    cmd = [
        'ffmpeg', '-hide_banner', '-loglevel', 'error',
        '-i', 'pipe:0',
        '-f', 's16le',
        '-ar', str(TARGET_SAMPLE_RATE),
        '-ac', str(TARGET_CHANNELS),
        '-acodec', 'pcm_s16le',
        'pipe:1',
    ]
    try:
        proc = subprocess.run(
            cmd,
            input=wav_bytes,
            capture_output=True,
            check=False,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as e:
        # subprocess.run kills the child before re-raising, so nothing leaks.
        raise RuntimeError(f"ffmpeg resample timed out after {timeout}s") from e
    except OSError as e:
        # Covers FileNotFoundError / PermissionError when launching ffmpeg.
        raise RuntimeError(f"ffmpeg resample could not run: {e}") from e
    if proc.returncode != 0:
        err = proc.stderr.decode('utf-8', errors='replace')[:200]
        raise RuntimeError(f"ffmpeg resample failed (rc={proc.returncode}): {err}")
    return proc.stdout
|
||||
|
||||
|
||||
def _is_target_format(wav_bytes: bytes) -> bool:
    """Tell whether ``wav_bytes`` is already uncompressed PCM in the target
    sample rate, channel count and sample width.

    Payloads the ``wave`` module cannot parse are reported as False.
    """
    wanted = (
        TARGET_SAMPLE_RATE,
        TARGET_CHANNELS,
        TARGET_SAMPLE_WIDTH,
        'NONE',
    )
    try:
        with wave.open(io.BytesIO(wav_bytes), 'rb') as reader:
            found = (
                reader.getframerate(),
                reader.getnchannels(),
                reader.getsampwidth(),
                reader.getcomptype(),
            )
    except (wave.Error, EOFError):
        return False
    return found == wanted
|
||||
|
||||
|
||||
def _extract_pcm_native(wav_bytes: bytes) -> bytes:
    """Return the sample data of a WAV payload without its header.

    No format conversion happens here; the caller is expected to have
    verified that the payload is already in the target format.
    """
    reader = wave.open(io.BytesIO(wav_bytes), 'rb')
    try:
        frame_count = reader.getnframes()
        return reader.readframes(frame_count)
    finally:
        reader.close()
|
||||
|
||||
|
||||
def wav_to_pcm_20ms_frames(wav_bytes: bytes) -> List[bytes]:
    """Turn a WAV blob into a list of fixed-size PCM frames.

    Payloads already in the target format only have their header removed;
    anything else goes through ffmpeg. The PCM is then cut into
    ``FRAME_BYTES``-sized pieces and the last piece is padded with zero
    bytes so every returned frame has the same length. Empty input, or a
    conversion that yields no audio, gives an empty list.
    """
    if not wav_bytes:
        return []
    if _is_target_format(wav_bytes):
        raw = _extract_pcm_native(wav_bytes)
    else:
        raw = _ffmpeg_resample(wav_bytes)
    if not raw:
        return []
    # ljust is a no-op on full-length slices and zero-pads the short tail.
    return [
        raw[start:start + FRAME_BYTES].ljust(FRAME_BYTES, b'\x00')
        for start in range(0, len(raw), FRAME_BYTES)
    ]
|
||||
|
||||
|
||||
# ---------- TTS worker queue ----------
|
||||
|
||||
class TTSQueue:
    """Worker thread: text in -> 20ms PCM frames out.

    Usage::

        ttsq = TTSQueue(voice_id="M2", lang="ro")
        ttsq.start()
        ttsq.push_text("salut Marius, ce mai faci?")
        voice_client.play(EchoStreamingAudioSource(ttsq))
        # ... barge-in detected:
        ttsq.clear()
        # ... session over:
        ttsq.stop()

    Cancellation: every ``clear()`` starts a new *epoch*. The worker records
    the epoch when it dequeues a clause and discards that clause's audio if
    the epoch has changed by the time the frames are ready, so a clause that
    was being synthesized while ``clear()`` ran never reaches the PCM queue.
    """

    def __init__(self, voice_id: str = "M2", lang: str = "ro"):
        """Create an idle queue; call ``start()`` to launch the worker.

        Args:
            voice_id: voice passed to ``synthesize``; read again for every
                clause, so assigning the attribute takes effect on the next one.
            lang: language code passed to ``synthesize``.
        """
        self.voice_id = voice_id
        self.lang = lang
        self._text_queue: queue.Queue = queue.Queue()
        self._pcm_queue: queue.Queue = queue.Queue()
        self._worker_thread: Optional[threading.Thread] = None
        self._stop_event = threading.Event()
        # Guards _epoch / _active_epoch and makes "dequeue a clause" and
        # "clear everything" mutually exclusive.
        self._state_lock = threading.Lock()
        # Set whenever there may be something for the worker to pick up.
        self._wake = threading.Event()
        # Incremented by clear(); identifies the current generation of text.
        self._epoch = 0
        # Epoch of the clause the worker is processing, None while idle.
        self._active_epoch: Optional[int] = None

    # --- lifecycle ---

    def start(self) -> None:
        """Launch the worker thread unless one is already alive."""
        if self._worker_thread is not None and self._worker_thread.is_alive():
            return
        self._stop_event.clear()
        self._worker_thread = threading.Thread(
            target=self._worker_loop,
            name=f"tts-worker-{self.voice_id}",
            daemon=True,
        )
        self._worker_thread.start()

    def stop(self) -> None:
        """Signal the worker to exit, drain queues, join (timeout 5s)."""
        self._stop_event.set()
        # Wake the worker if it's waiting for text.
        self._text_queue.put(_POISON)
        self._wake.set()
        thread = self._worker_thread
        if thread is not None:
            thread.join(timeout=5.0)
        self._worker_thread = None
        self.clear()

    # --- producer side ---

    def push_text(self, text: str) -> None:
        """Normalize, segment into clauses, enqueue each clause for synthesis."""
        if not text:
            return
        cleaned = normalize_for_tts(text)
        n = 0
        for clause in clause_segments(cleaned):
            clause = clause.strip()
            if clause:
                # Enqueue and signal under the lock so the wake-up cannot be
                # erased by a worker that has just observed an empty queue.
                with self._state_lock:
                    self._text_queue.put(clause)
                    self._wake.set()
                n += 1
        log.info("ttsq.push_text: input %d chars → %d clauses queued", len(text), n)

    def clear(self) -> None:
        """Drop everything pending (used for barge-in).

        Besides emptying both queues this invalidates the clause the worker
        is currently synthesizing, if any: its frames are discarded instead of
        being enqueued after the clear.
        """
        with self._state_lock:
            self._epoch += 1
            self._drain(self._text_queue)
            self._drain(self._pcm_queue)

    def is_empty(self) -> bool:
        """Return True when no audio is queued or about to be produced.

        A clause that is still being synthesized counts as pending unless it
        has been cancelled by ``clear()``.
        """
        with self._state_lock:
            synthesizing = (
                self._active_epoch is not None
                and self._active_epoch == self._epoch
            )
            return (
                not synthesizing
                and self._text_queue.empty()
                and self._pcm_queue.empty()
            )

    # --- consumer side (called by EchoStreamingAudioSource) ---

    def get_frame_nowait(self) -> Optional[bytes]:
        """Return the next PCM frame if available, else None — no blocking.

        Blocking inside the player's read() loop wrecks Discord's 20ms cadence
        and the client interprets the stream as stuttering / out-of-order.
        """
        try:
            return self._pcm_queue.get_nowait()
        except queue.Empty:
            return None

    # --- internals ---

    @staticmethod
    def _drain(q: queue.Queue) -> None:
        """Remove every item currently in ``q`` without blocking."""
        while True:
            try:
                q.get_nowait()
            except queue.Empty:
                return

    def _is_stale(self, epoch: int) -> bool:
        """Return True when ``clear()`` has run since ``epoch`` was recorded."""
        with self._state_lock:
            return epoch != self._epoch

    def _worker_loop(self) -> None:
        """Worker body: dequeue clauses and synthesize them until stopped.

        Any exception raised while handling one clause is logged and the loop
        moves on, so a single bad clause (or a missing ffmpeg binary) cannot
        kill the thread and silence every later response.
        """
        while not self._stop_event.is_set():
            epoch = 0
            with self._state_lock:
                try:
                    item = self._text_queue.get_nowait()
                except queue.Empty:
                    item = None
                    self._wake.clear()
                else:
                    # Recorded atomically with the dequeue so a concurrent
                    # clear() either removes the clause or outdates it.
                    epoch = self._epoch
                    if isinstance(item, str):
                        self._active_epoch = epoch
            if item is None:
                self._wake.wait(timeout=0.1)
                continue
            if item is _POISON:
                break
            if not isinstance(item, str):
                continue
            try:
                self._synthesize_clause(item, epoch)
            except Exception as e:  # noqa: BLE001
                log.warning("TTS worker failed for %r: %s", item[:60], e)
            finally:
                with self._state_lock:
                    self._active_epoch = None

    def _synthesize_clause(self, item: str, epoch: int) -> None:
        """Synthesize one clause and enqueue its frames unless it was cancelled."""
        preview = item[:60]
        try:
            result = synthesize(item, voice=self.voice_id, lang=self.lang)
        except Exception as e:  # noqa: BLE001
            log.warning("TTS synth raised for %r: %s", preview, e)
            return
        if not result.get('ok'):
            log.warning("TTS synth not ok for %r: %s", preview, result.get('error'))
            return
        path = result.get('path')
        if not path:
            log.warning("TTS synth ok but no path for %r", preview)
            return
        wav_bytes = b''
        try:
            wav_bytes = Path(path).read_bytes()
        except OSError as e:
            log.warning("TTS WAV read failed for %r: %s", preview, e)
        finally:
            # The synth output is a temp file we own; always remove it.
            try:
                Path(path).unlink(missing_ok=True)
            except OSError:
                pass
        if not wav_bytes:
            return
        if self._is_stale(epoch):
            # Cancelled while synthesizing: skip the conversion entirely.
            log.info("TTS dropped cancelled clause %r", preview)
            return
        try:
            frames = wav_to_pcm_20ms_frames(wav_bytes)
        except Exception as e:  # noqa: BLE001
            log.warning("TTS WAV-to-PCM failed for %r: %s", preview, e)
            return
        if not frames:
            log.warning("TTS WAV-to-PCM produced 0 frames for %r", preview)
            return
        for frame in frames:
            if self._stop_event.is_set():
                return
            with self._state_lock:
                cancelled = epoch != self._epoch
                if not cancelled:
                    # Put under the lock so clear() cannot interleave and
                    # leave a frame behind.
                    self._pcm_queue.put(frame)
            if cancelled:
                log.info("TTS dropped cancelled clause %r", preview)
                return
        log.info("TTS pushed %d frames (%.1fs) for %r",
                 len(frames), len(frames) * 0.02, preview)
|
||||
|
||||
|
||||
# ---------- Discord audio source ----------
|
||||
|
||||
class EchoStreamingAudioSource(discord.AudioSource):
    """Audio source that feeds Discord's player from a ``TTSQueue``.

    One ``voice_client.play(EchoStreamingAudioSource(ttsq))`` call is meant
    to cover the whole session. Whenever the queue has no frame ready,
    ``read()`` hands back 20ms of silence rather than an empty result, so
    the player keeps running and frames pushed later are still played.
    Only ``cleanup()`` (voice session teardown) ends the stream: after it,
    ``read()`` returns ``b''``.
    """

    # One 20ms frame of s16le stereo silence at 48kHz
    # (960 samples × 2 channels × 2 bytes).
    _SILENCE_FRAME = bytes(960 * 2 * 2)

    def __init__(self, ttsq: TTSQueue):
        """Wrap ``ttsq``; the source starts in the open state."""
        self._ttsq = ttsq
        self._closed = False

    def read(self) -> bytes:
        """Return the next frame, silence if none is ready, ``b''`` once closed."""
        if not self._closed:
            pcm = self._ttsq.get_frame_nowait()
            return pcm if pcm is not None else self._SILENCE_FRAME
        return b''

    def is_opus(self) -> bool:
        """Return False: frames are raw PCM, not Opus packets."""
        return False

    def cleanup(self) -> None:
        """Mark the source closed and discard whatever the queue still holds."""
        self._closed = True
        try:
            self._ttsq.clear()
        except Exception:
            # Teardown is best effort; a failing clear must not propagate.
            pass
|
||||
118
src/voice/voice_commands.py
Normal file
118
src/voice/voice_commands.py
Normal file
@@ -0,0 +1,118 @@
|
||||
"""Detect in-band voice commands from STT transcripts.
|
||||
|
||||
The voice pipeline transcribes Marius's speech via Whisper and dispatches the
|
||||
text to Claude. Some utterances are not questions for Claude — they're
|
||||
control commands for the voice stack itself. This module parses those out
|
||||
*before* the Claude round-trip so they take effect instantly and don't waste
|
||||
a Claude session turn.
|
||||
|
||||
Currently handled:
|
||||
* change TTS voice — "schimbă vocea pe M5", "vorbește cu vocea F3",
|
||||
"voce em cinci", "voce feminină 3", etc.
|
||||
|
||||
The parser is intentionally conservative: it requires BOTH a voice trigger
|
||||
word ("voce", "vorbește", "schimbă", "treci pe") AND a recognizable voice
|
||||
ID. A bare "M5" without context is NOT a command — Marius might be quoting
|
||||
a string.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
|
||||
_VALID_VOICES = {f"M{i}" for i in range(1, 6)} | {f"F{i}" for i in range(1, 6)}


# Trigger words that suggest the user is talking ABOUT the voice, not just
# saying something that happens to contain a voice-ID-looking substring.
# NOTE(review): the ``Î`` in ``schimb[aăÎ]`` looks like a typo — confirm
# whether it is there on purpose before removing it.
_VOICE_TRIGGER_RE = re.compile(
    r'\b(voce|vocea|voci|voice|vorbe[șs]te|schimb[aăÎ]|treci\s+pe)\b',
    re.IGNORECASE,
)

# Direct form: "M5", "F 3", "m5", etc.
_VOICE_ID_DIRECT_RE = re.compile(
    r'\b([MF])\s*([1-5])\b',
    re.IGNORECASE,
)

# Word form: "em cinci", "M trei", "masculin doi", "feminină patru", etc.
# Whisper often transcribes "M5" as "em cinci" / "M cinci" because letter
# names are spelled out phonetically in Romanian.
_VOICE_ID_WORDS_RE = re.compile(
    r'\b(em|m|masculin[aăe]?|ef|f|feminin[aăe]?)\s+(unu|una|doi|dou[ăa]|trei|patru|cinci|[1-5])\b',
    re.IGNORECASE,
)


_DIGIT_WORD_TO_INT = {
    'unu': 1, 'una': 1, 'unul': 1, '1': 1,
    'doi': 2, 'două': 2, 'doua': 2, '2': 2,
    'trei': 3, '3': 3,
    'patru': 4, '4': 4,
    'cinci': 5, '5': 5,
}

# Substring fallback: matches digit roots even when Whisper glues them into
# compound non-words like "Mâcinci" (for "M cinci"=M5).
# Kept for backward compatibility; detect_voice_change() now uses the
# token-anchored _DIGIT_TOKEN_RE below, because a bare substring search also
# fires inside ordinary words ("una" in "întotdeauna", "trei" in "treizeci").
_DIGIT_SUBSTR_RE = re.compile(
    r'(cinci|patru|trei|dou[ăa]|unul|unu|una)',
    re.IGNORECASE,
)

# Token-anchored fallback. A digit word counts only when it is
#   * a whole word ("unul", "cinci"), or
#   * the tail of a word whose head is a mangled letter name: optional "e",
#     then "m"/"f", then at most two filler characters ("Mâcinci", "efpatru").
# Glued forms are limited to the longer roots so that real words ending in
# "unu"/"una" (e.g. "fauna") are not mistaken for a voice id.
_DIGIT_TOKEN_RE = re.compile(
    r'\b(?:(?P<prefix>e?[mf]\w{0,2}?)(?=(?:cinci|patru|trei|dou[ăa]|unul)\b))?'
    r'(?P<digit>cinci|patru|trei|dou[ăa]|unul|unu|una)\b',
    re.IGNORECASE,
)

_F_GENDER_HINT_RE = re.compile(r'feminin|\bef\b|\bF\d?\b', re.IGNORECASE)


def _normalize_gender(word: str) -> Optional[str]:
    """Map gender word to 'M' or 'F'.

    Accepts the letter ("m"/"f"), its spelled-out name ("em"/"ef") or any
    word starting with "masculin"/"feminin", case-insensitively. Returns
    None for anything else.
    """
    w = word.lower()
    if w in ('m', 'em') or w.startswith('masculin'):
        return 'M'
    if w in ('f', 'ef') or w.startswith('feminin'):
        return 'F'
    return None


def detect_voice_change(text: str) -> Optional[str]:
    """Parse a transcript for a 'change voice' command.

    Returns the target voice id (one of M1-M5, F1-F5) or None if no command
    was detected. Requires both a voice trigger word and a voice ID.

    Three recognizers are tried in order: the direct form ("M5"), the word
    form ("em cinci", "feminină 3") and a permissive fallback for transcripts
    where the letter was glued onto or replaced next to the digit word. The
    fallback only accepts digit words that stand alone or end a token with a
    letter-name head, so digit roots buried in unrelated words are ignored.
    """
    if not text:
        return None
    if not _VOICE_TRIGGER_RE.search(text):
        return None
    # Try the direct form first (M5, F3, etc.)
    m = _VOICE_ID_DIRECT_RE.search(text)
    if m:
        candidate = f"{m.group(1).upper()}{m.group(2)}"
        if candidate in _VALID_VOICES:
            return candidate
    # Fall back to the word form ("em cinci", "feminin trei", ...).
    m = _VOICE_ID_WORDS_RE.search(text)
    if m:
        gender = _normalize_gender(m.group(1))
        digit = _DIGIT_WORD_TO_INT.get(m.group(2).lower())
        if gender is not None and digit is not None:
            candidate = f"{gender}{digit}"
            if candidate in _VALID_VOICES:
                return candidate
    # Permissive fallback: Whisper sometimes glues the letter into the next
    # word ("Mâcinci" for "M cinci") or replaces it ("unul cinci" for
    # "M unu cinci"). After a voice trigger word, scan for digit-word tokens
    # and infer gender (F if a feminine marker is present, else M).
    hits = list(_DIGIT_TOKEN_RE.finditer(text))
    if not hits:
        return None
    # Last digit wins — handles "M unu cinci" → M5 since "unu" is a
    # mangled letter-name prefix, "cinci" is the actual target.
    last = hits[-1]
    digit = _DIGIT_WORD_TO_INT.get(last.group('digit').lower())
    if digit is None:
        return None
    # A glued head such as "Fâ" / "ef" carries the gender itself.
    head = (last.group('prefix') or '').lower()
    if head.startswith('e'):
        head = head[1:]
    feminine = head.startswith('f') or bool(_F_GENDER_HINT_RE.search(text))
    candidate = f"{'F' if feminine else 'M'}{digit}"
    return candidate if candidate in _VALID_VOICES else None
|
||||
|
||||
|
||||
__all__ = ["detect_voice_change"]
|
||||
@@ -17,6 +17,20 @@ Lecții capturate din corectările lui Marius. Citește acest fișier la începu
|
||||
|
||||
<!-- Lecțiile se adaugă mai jos, cele mai noi sus. -->
|
||||
|
||||
## Supertonic rejectează ghilimelele curly (Unicode) cu HTTP 500
|
||||
**Data:** 2026-05-27
|
||||
**Context:** Marius a dat o comandă audio pe Discord cu un URL, iar răspunsul lui Claude conținea `„foo"` (ghilimele românești curly). Supertonic a returnat `HTTP 500: synthesis failed: Found 1 unsupported character(s): ['„']` și răspunsul nu s-a mai auzit. Fără retry logic vizibil în UX — pur și simplu tace.
|
||||
**Greșeala:** Am presupus că `normalize_for_tts` produce text deja "TTS-safe" pentru Supertonic. În realitate `strip_markdown` păstrează ghilimelele Unicode (`„` U+201E, `"` U+201D, `—` U+2014, `…` U+2026, etc.) pe care Supertonic le refuză.
|
||||
**Regula:** Înainte de orice apel HTTP la Supertonic, **sanitizează punctuația Unicode** la echivalentele ASCII (`„` `"` `"` → `"`, `'` `'` `‚` → `'`, `–` `—` → `-`, `…` → `...`, `«` `»` → `"`). Funcția `sanitize_punctuation` în `src/voice/normalize.py` face asta și e apelată chiar după `strip_markdown` în pipeline. Dacă apar caractere noi care crapă Supertonic (ex: simboluri matematice, săgeți), adaugă-le în `_TTS_PUNCT_MAP`.
|
||||
**Când se aplică:** Orice cod care trimite text la Supertonic (`tools/tts.py`, `src/voice/tts_stream.py`). Inclusiv testare manuală cu `curl` — folosește text românesc realistic (include `„foo"`, em-dash `—`, ellipsis `…`).
|
||||
|
||||
## Mai multe threads ≠ mai rapid — fitează `cpu_threads` pe physical cores, nu logical
|
||||
**Data:** 2026-05-27
|
||||
**Context:** Benchmark `tools/voice_bench.py` pentru faster-whisper `small` int8 pe i7-6700T (4 physical / 8 logical cores). Marius a urcat VM-ul de la 2 → 4 → 6 cores online, așteptând că mai multe = mai rapid.
|
||||
**Greșeala:** Presupoziție implicită că `cpu_threads=N` scalează liniar cu N. La 6 threads `small.p50` a regresat la 2.79s vs 2.25s la 4 threads (+24% MAI LENT). Era ușor de ratat dacă rulam doar un singur pass.
|
||||
**Regula:** Pentru workload-uri compute-bound (int8/fp16 ML inference, video encode, criptografie) setează `cpu_threads = numărul de PHYSICAL cores`, NU logical. Hyperthreads adaugă synchronization overhead și memory bandwidth contention fără paralelism real. Sweet spot tipic: `min(num_physical_cores, $optimal_threads)`. Verifică cu `lscpu` (Core(s) per socket × Socket(s) = physical; CPU(s) = logical). Dacă faci benchmark, rulează SWEEP nu single point — 2/4/6/8 threads să vezi unde e curba reală.
|
||||
**Când se aplică:** Configurare `cpu_threads`, `OMP_NUM_THREADS`, `MKL_NUM_THREADS`, `torch.set_num_threads()`, ffmpeg `-threads`, sau orice runtime ML/inference. Mai ales pe Proxmox VM-uri unde "more cores online" sună ca îmbunătățire. Întreabă-te: e workload compute-bound (yes → physical only) sau IO-bound (yes → logical OK)?
|
||||
|
||||
## Nu șterge crontab-uri din sistem fără confirmare explicită
|
||||
**Data:** 2026-05-20
|
||||
**Context:** Marius a cerut să șteargă "newsletter test din cron jobs". Am interpretat că `check_newsletter_cercetasi.py` din crontab de sistem face parte din "newsletter test".
|
||||
|
||||
53
tasks/voice-bench-results-threads2.md
Normal file
53
tasks/voice-bench-results-threads2.md
Normal file
@@ -0,0 +1,53 @@
|
||||
# Voice Bench Results — Discord Voice-to-Voice Spike
|
||||
|
||||
Generated: 2026-05-27 12:23:08 UTC
|
||||
Budget: STT p50 < 1.50s (per CEO plan + eng review)
|
||||
Trials per sample: 3
|
||||
|
||||
## Decision: **FALLBACK_TINY**
|
||||
|
||||
small.p50=3.25s >= budget; tiny.p50=0.50s < budget 1.50s. Document fallback la 'tiny' în plan (accuracy mai slabă, latency OK).
|
||||
|
||||
## Per-Model Summary
|
||||
|
||||
| Model | p50 (s) | p95 (s) | Mean RTF | Load (s) | Threads |
|
||||
|-------|--------:|--------:|---------:|---------:|--------:|
|
||||
| small | 3.25 (FAIL) | 3.61 | 0.80 | 10.63 | 2 |
|
||||
| tiny | 0.50 (PASS) | 0.56 | 0.12 | 3.15 | 2 |
|
||||
|
||||
## Per-Utterance Detail
|
||||
|
||||
### small
|
||||
|
||||
| Sample | Audio (s) | Median lat (s) | RTF | Trials | Transcript |
|
||||
|--------|----------:|---------------:|----:|--------|------------|
|
||||
| short | 1.88 | 2.95 | 1.57 | 3.24, 2.95, 2.94 | Salut ce mai faci! |
|
||||
| conversational | 2.93 | 3.10 | 1.06 | 3.09, 3.10, 3.13 | Stai puțin să mă gândesc la asta. |
|
||||
| medium | 5.99 | 3.42 | 0.57 | 3.44, 3.42, 3.34 | Am verificat în calendari și avem sedință cu echipa la 3 după amiază. |
|
||||
| numbers | 5.64 | 3.24 | 0.57 | 3.24, 3.21, 3.24 | Costul total este 120 și 3 delei și 5-10 de bani. |
|
||||
| question | 5.09 | 3.28 | 0.64 | 3.33, 3.27, 3.28 | Marius, vrei să-ți spun pe agenda de mâine să suni la noa? |
|
||||
| longer | 9.26 | 3.61 | 0.39 | 3.63, 3.61, 3.56 | Vreau să mi-reamintești, di seară, să verific dacă scriptul de bacup a rulat cor |
|
||||
|
||||
### tiny
|
||||
|
||||
| Sample | Audio (s) | Median lat (s) | RTF | Trials | Transcript |
|
||||
|--------|----------:|---------------:|----:|--------|------------|
|
||||
| short | 1.88 | 0.44 | 0.24 | 0.44, 0.45, 0.44 | Salute mai face? |
|
||||
| conversational | 2.93 | 0.48 | 0.16 | 0.48, 0.48, 0.47 | Stei putin să mă gândesc la asta. |
|
||||
| medium | 5.99 | 0.51 | 0.08 | 0.51, 0.51, 0.51 | Am verificat în calendar și avem sedeință cu equipala 3 dupa am iază. |
|
||||
| numbers | 5.64 | 0.50 | 0.09 | 0.50, 0.52, 0.49 | Costul total este o suta doozec și trei de lei și 50 de bani. |
|
||||
| question | 5.09 | 0.51 | 0.10 | 0.51, 0.50, 0.53 | Marius, vrei să-ți pun pe agenda de muină să sunilă nu a. |
|
||||
| longer | 9.26 | 0.56 | 0.06 | 0.56, 0.54, 0.57 | Vreau să mire am in test, disiară să verific dacă scriptul de backup a rulat cor |
|
||||
|
||||
## Hardware Context
|
||||
|
||||
- Platform: Linux-6.8.12-15-pve-x86_64-with-glibc2.39
|
||||
- CPU count (logical): 4
|
||||
- model name : Intel(R) Core(TM) i7-6700T CPU @ 2.80GHz
|
||||
- MemTotal: 6291456 kB
|
||||
- MemFree: 295808 kB
|
||||
- MemAvailable: 1737392 kB
|
||||
|
||||
## Raw Data
|
||||
|
||||
Vezi `tools/voice_bench_results.json` pentru JSON complet.
|
||||
65
tasks/voice-bench-results-threads4.md
Normal file
65
tasks/voice-bench-results-threads4.md
Normal file
@@ -0,0 +1,65 @@
|
||||
# Voice Bench Results — Discord Voice-to-Voice Spike
|
||||
|
||||
Generated: 2026-05-27 (BLOCKING Pas 1 din test plan)
|
||||
Hardware: i7-6700T (Skylake mobile), Proxmox VM, no GPU
|
||||
Budget original: STT p50 < 1.50s (per CEO plan aspirational)
|
||||
Budget honest: 1.5-3s (per Outside Voice #1, baked in CEO plan)
|
||||
|
||||
## Final Recommendation: **PASS cu `small` model**
|
||||
|
||||
Script-ul a returnat auto-decision `FALLBACK_TINY` pentru că `small.p50=2.25s > 1.5s` literal. **Override manual**: `tiny` produce transcript ilizibil în RO ("muină să sun la nu a", "să mream in test de seare", "Stei putin") — inutilizabil pentru produs. `small @ 4 threads` cade în honest range-ul "1.5-3s" deja acceptat în CEO plan și produce transcript clean modulo normalizare numerică (deja în scope: `src/voice/normalize.py`).
|
||||
|
||||
**Implicații pentru implementare:**
|
||||
1. Folosește `WhisperModel("small", device="cpu", compute_type="int8", cpu_threads=4)` în `src/voice/pipeline.py`.
|
||||
2. Update plan latency budget: STT p50 = 2.25s (era 1.5s); perceived round-trip estimate = 3.5-5s (STT 2.25s + Claude TTFB 0.5-1s + streaming TTS first clause ~0.5s).
|
||||
3. Streaming Claude→TTS rămâne critic — fără el, total perceived = 6-8s, peste limita conversațională.
|
||||
4. Filler audio "Stai să-mi adun gândurile" (deja în plan) maschează cazurile p95 (>3s).
|
||||
5. Document fallback la `tiny` DOAR pentru `/voice doctor` mode degraded (Whisper OOM etc.), nu pentru happy path.
|
||||
|
||||
## Two-Pass Comparison (threads=2 vs threads=4)
|
||||
|
||||
| Model | threads | p50 (s) | p95 (s) | mean RTF | Verdict |
|
||||
|-------|--------:|--------:|--------:|---------:|---------|
|
||||
| small | 2 | 3.25 | 3.63 | 0.67 | FAIL latency |
|
||||
| **small** | **4** | **2.25** | **2.64** | **0.46** | **CHOSEN** (quality + honest range) |
|
||||
| tiny | 2 | 0.50 | 0.57 | 0.10 | FAIL quality |
|
||||
| tiny | 4 | 0.48 | 0.57 | 0.10 | FAIL quality |
|
||||
|
||||
CPU upgrade 2→4 cores: **`small` got 31% faster** (3.25s → 2.25s), `tiny` essentially unchanged (CPU-light enough că nu beneficiază). Confirmă că `small` e CPU-bound, `tiny` nu.
|
||||
|
||||
## Transcript Quality Side-by-Side (4 threads)
|
||||
|
||||
| Input | small @ 4t | tiny @ 4t |
|
||||
|-------|-----------|-----------|
|
||||
| "Salut, ce mai faci?" | "Salut ce mai faci!" | "Salut, ce mai fac?" |
|
||||
| "Stai puțin să mă gândesc la asta." | "Stai putin să mă gândesc la asta." | "Stei putin să mă gândesc la asta." |
|
||||
| "Am verificat în calendar și avem ședință cu echipa la trei după-amiază." | "Am verificat în calendari și avem sedință cu echipa la 3 după amiază." | "Am verificat în calendar și avem sedeință cu equipala 3 du pămiază." |
|
||||
| "Costul total este o sută douăzeci și trei de lei și cincizeci de bani." | "Costul total este 120 și 3 delei și 50 de bani." | "Costul total este o suta 20 și 3 de lei și 50 de bani." |
|
||||
| "Marius, vrei să-ți pun pe agenda de mâine să suni la NOAA?" | "Marius, vrei să-ți spun pe agenda de mâine să suni la noa a." | "Marius, vrei să-ți pun pe agenda de muină să sun la nu a." |
|
||||
| "Vreau să-mi reamintești diseară..." | "Vreau să mi-răimintești di seară..." | "Vreau să mream in test de seare..." |
|
||||
|
||||
**Observații:**
|
||||
- `small` greșeli: diacritice (`putin`/`puțin`, `sedință`/`ședință`), numere ca digiti ("3" în loc de "trei"), acronime (NOAA→noa), aglutinare ("delei"/"de lei", "răimintești"/"reamintești").
|
||||
- `tiny` greșeli: cuvinte INVENTATE ("mream", "muină", "equipala", "sunilă") — hallucination, nu doar misspell.
|
||||
|
||||
## Hardware Context
|
||||
|
||||
- Intel(R) Core(TM) i7-6700T CPU @ 2.80GHz (Skylake mobile, 2015)
|
||||
- Cores online: 4 logical (din 8), upgrade de la 2 în timpul benchmark-ului
|
||||
- RAM: 6.0Gi total, ~2.5Gi available
|
||||
- No NVIDIA GPU (CPU-only inference)
|
||||
- ctranslate2 4.7.2 + faster-whisper 1.2.1 + int8 quantization
|
||||
|
||||
## Open Questions pentru Decision Lock
|
||||
|
||||
1. **Budget relax oficial:** acceptăm 2.25s p50 în plan și comunicăm honest user-facing? Sau încercăm:
|
||||
- **Groq Whisper Large-v3 API** (~0.3s, free tier 14k req/day) — vine cu network dependency
|
||||
- **Deepgram Nova-2 RO streaming** ($, dar 0.2s streaming partial transcripts)
|
||||
- **Whisper.cpp + AVX2** (același small model, optimizat C++) — ~30% boost suplimentar potențial
|
||||
2. **CPU bump:** dacă activăm restul de 4 cores offline (3-6) ar coborî `small.p50` la ~1.5s? Worth investigat (probabil VM resource cap, nu hardware limit).
|
||||
|
||||
## Raw Data
|
||||
|
||||
- `tools/voice_bench_results.json` — run curent (threads=4)
|
||||
- `tools/voice_bench_results_threads2.json` — baseline (threads=2)
|
||||
- `tasks/voice-bench-results-threads2.md` — narrative pentru baseline
|
||||
79
tasks/voice-bench-results.md
Normal file
79
tasks/voice-bench-results.md
Normal file
@@ -0,0 +1,79 @@
|
||||
# Voice Bench Results — Discord Voice-to-Voice Spike (BLOCKING Pas 1)
|
||||
|
||||
Generated: 2026-05-27
|
||||
Hardware: i7-6700T (4 physical cores / 8 logical), Proxmox VM, no GPU
|
||||
Budget original: STT p50 < 1.50s (per CEO plan aspirational)
|
||||
Budget honest range: 1.5-3s (per Outside Voice #1, baked in CEO plan)
|
||||
|
||||
## Final Recommendation: **PASS cu `small` model + `cpu_threads=4`**
|
||||
|
||||
`small @ 4t` → p50 **2.25s**, p95 **2.64s**, mean RTF **0.46**. Cade în honest range "1.5-3s" deja acceptat. Transcript clean modulo normalizare numerică (deja în scope: `src/voice/normalize.py`).
|
||||
|
||||
**Auto-decision script-ul** (`FALLBACK_TINY`) **este override-uit manual**: `tiny` produce transcript ilizibil ("Stei putin", "muină să sun la nu a", "să mream in test de seare") — neutilizabil în RO. Latency-ul rapid nu compensează lipsa de înțelegere.
|
||||
|
||||
## Surprise Finding: Threads Sweet Spot = 4, nu 6
|
||||
|
||||
Sweep complet:
|
||||
|
||||
| cpu_threads | small.p50 | small.p95 | mean RTF | Δ p50 vs threads=4 |
|
||||
|------------:|---------:|---------:|---------:|-------------------:|
|
||||
| 2 | 3.25s | 3.63s | 0.67 | +44% (slower) |
|
||||
| **4** | **2.25s** | **2.64s** | **0.46** | **baseline** |
|
||||
| 6 | 2.79s | 3.31s | 0.70 | +24% (slower!) |
|
||||
|
||||
`tiny` essentially flat (~0.5s) la orice thread count — CPU-light enough că nu beneficiază.
|
||||
|
||||
**Explicație:** i7-6700T = 4 physical cores + 4 hyperthreads. `cpu_threads=4` fitează exact pe physical cores (no hyperthread contention). `cpu_threads=6` spill-uiește pe hyperthreads care HURT compute-bound int8 inference (memory bandwidth contention, fără parallelism real). **Lock în plan: `cpu_threads=4` regardless of VM core count.** Adăugarea de cores în VM nu mai accelerează `small` peste 4 threads.
|
||||
|
||||
## Implicații pentru implementare
|
||||
|
||||
1. `src/voice/pipeline.py` →
|
||||
```python
|
||||
WhisperModel("small", device="cpu", compute_type="int8", cpu_threads=4)
|
||||
```
|
||||
2. **Plan budget update:** STT p50 = 2.25s (era 1.5s); perceived round-trip estimate = **3.5-5s** (STT 2.25s + Claude TTFB 0.5-1s + streaming TTS first clause ~0.5s).
|
||||
3. **Streaming Claude→TTS rămâne critic** — fără el, total perceived = 6-8s, peste limita conversațională.
|
||||
4. **Filler audio** "Stai să-mi adun gândurile" (deja în plan) maschează cazurile p95 (>3s).
|
||||
5. **Tiny model** rămâne instalat dar doar pentru `/voice doctor` degraded mode (Whisper OOM, low memory), NU pentru happy path.
|
||||
|
||||
## Transcript Quality (4 threads run)
|
||||
|
||||
| Input | `small` output | `tiny` output |
|
||||
|-------|----------------|---------------|
|
||||
| "Salut, ce mai faci?" | "Salut ce mai faci!" | "Salut, ce mai fac?" |
|
||||
| "Stai puțin să mă gândesc la asta." | "Stai putin să mă gândesc la asta." | "Stei putin să mă gândesc la asta." |
|
||||
| "Am verificat în calendar și avem ședință cu echipa la trei după-amiază." | "Am verificat în calendari și avem sedință cu echipa la 3 după amiază." | "Am verificat în calendar și avem sedeință cu equipala 3 du pămiază." |
|
||||
| "Costul total este o sută douăzeci și trei de lei și cincizeci de bani." | "Costul total este 120 și 3 delei și 50 de bani." | "Costul total este o suta 20 și 3 de lei și 50 de bani." |
|
||||
| "Marius, vrei să-ți pun pe agenda de mâine să suni la NOAA?" | "Marius, vrei să-ți spun pe agenda de mâine să suni la noa a." | "Marius, vrei să-ți pun pe agenda de muină să sun la nu a." |
|
||||
| "Vreau să-mi reamintești diseară..." | "Vreau să mi-răimintești di seară..." | "Vreau să mream in test de seare..." |
|
||||
|
||||
**Pattern erori:**
|
||||
- `small`: diacritice missing (`putin`/`puțin`, `sedință`/`ședință`), numere ca digiti ("3" în loc de "trei" — normalizator inverse din scope), acronime ("noa" pentru NOAA — expected, deferred), aglutinare minoră ("delei", "răimintești").
|
||||
- `tiny`: cuvinte INVENTATE ("mream", "muină", "equipala", "sunilă"). Hallucination, nu doar misspell. **Unusable.**
|
||||
|
||||
## Open Questions (pentru decizie finală)
|
||||
|
||||
1. **Acceptăm 2.25s p50?** YES — în honest range CEO plan deja aprobat. User-facing communication: "Echo gândește 2-3 secunde înainte să răspundă" (vs. aspirational sub-secundă).
|
||||
2. **Activate restul de 2 cores offline (5,6)?** Marginal — nu va îmbunătăți peste threads=4 sweet spot. Worth doar pentru concurrent workloads (TTS + STT simultan, alte servicii).
|
||||
3. **Network STT alternative (Groq/Deepgram)?** Deferred — `small @ 4t` confirmat sufficient. Reconsiderăm DOAR dacă post-implementation p95 perceived >7s.
|
||||
|
||||
## Hardware Context
|
||||
|
||||
- Intel(R) Core(TM) i7-6700T CPU @ 2.80GHz (Skylake mobile, 2015)
|
||||
- Cores online (final): 6 logical (0-4, 7), 2 offline (5, 6)
|
||||
- Physical cores: 4 (8 logical via HT)
|
||||
- RAM: 6.0Gi total, ~2.0Gi available
|
||||
- No GPU (CPU-only int8 inference)
|
||||
- ctranslate2 4.7.2 + faster-whisper 1.2.1
|
||||
|
||||
## Raw Data
|
||||
|
||||
- `tools/voice_bench_results.json` — last run (threads=6)
|
||||
- `tools/voice_bench_results_threads4.json` — **WINNING config** (threads=4)
|
||||
- `tools/voice_bench_results_threads2.json` — baseline (threads=2)
|
||||
- `tasks/voice-bench-results-threads2.md` — narrative threads=2
|
||||
- `tasks/voice-bench-results-threads4.md` — narrative threads=4
|
||||
|
||||
## Status
|
||||
|
||||
**BLOCKING Pas 1 → CLEARED.** Sweet spot identificat. Plan file ready pentru update.
|
||||
307
tests/test_claude_session_mutex.py
Normal file
307
tests/test_claude_session_mutex.py
Normal file
@@ -0,0 +1,307 @@
|
||||
"""Regression-critical tests for per-channel mutex in src/claude_session.py.
|
||||
|
||||
Three scenarios from the eng-review test plan (2026-05-27):
|
||||
|
||||
1. Concurrent `send_message` calls on the SAME channel_id serialize —
|
||||
the second waits for the first to finish before its subprocess runs.
|
||||
2. Concurrent `send_message` calls on DIFFERENT channel_ids run in parallel
|
||||
— independent channels never block each other.
|
||||
3. Acquisition contract is documented and consistent: the lock is acquired
|
||||
blocking (no acquire timeout), which means a hung subprocess on
|
||||
channel X delays subsequent X messages but never X' (X != X'). This
|
||||
test pins that behavior so future refactors must preserve it.
|
||||
|
||||
The mutex is `threading.Lock`, NOT `asyncio.Lock`, because `send_message`
|
||||
is a sync function typically dispatched via `asyncio.to_thread` from
|
||||
async adapters. asyncio.Lock would serialize coroutines only — not the
|
||||
subprocess invocation. See plan section "Engineering decisions" #2.
|
||||
"""
|
||||
|
||||
import json
|
||||
import threading
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from src import claude_session
|
||||
from src.claude_session import (
|
||||
_get_session_lock,
|
||||
_session_locks,
|
||||
send_message,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _clear_session_locks():
    """Empty the per-channel lock registry around every test.

    Autouse: the shared `_session_locks` mapping is cleared before the test
    body runs and again afterwards, so no test observes locks created by
    another test.
    """
    _session_locks.clear()  # setup: start from an empty registry
    yield
    _session_locks.clear()  # teardown: leave nothing behind for the next test
|
||||
|
||||
|
||||
@pytest.fixture
def temp_sessions(tmp_path, monkeypatch):
    """Redirect `claude_session` to a throwaway sessions directory.

    Creates `<tmp_path>/sessions/active.json` containing an empty JSON object
    and patches `SESSIONS_DIR` and `_SESSIONS_FILE` on the `claude_session`
    module to point at it, keeping real session state untouched.

    Returns:
        Path of the temporary `active.json` file.
    """
    root = tmp_path / "sessions"
    root.mkdir()

    active_file = root / "active.json"
    active_file.write_text("{}")

    # monkeypatch restores both attributes automatically after the test.
    for attr_name, replacement in (
        ("SESSIONS_DIR", root),
        ("_SESSIONS_FILE", active_file),
    ):
        monkeypatch.setattr(claude_session, attr_name, replacement)

    return active_file
|
||||
|
||||
|
||||
def _slow_run_claude(sleep_seconds: float, in_critical: threading.Event,
                     concurrent_seen: threading.Event):
    """Create a stand-in for `_run_claude` that detects overlapping calls.

    The returned callable pretends to be a subprocess that runs for
    `sleep_seconds`. It sets `in_critical` once it is inside the simulated
    subprocess, and sets `concurrent_seen` whenever it starts while another
    invocation of the same fake is still running.

    Args:
        sleep_seconds: How long each fake invocation blocks.
        in_critical: Set on every invocation after the overlap check.
        concurrent_seen: Set when more than one invocation is active at once.

    Returns:
        A function with the signature `(cmd, timeout, on_text=None, cwd=None)`
        that returns a canned successful result dict.
    """
    guard = threading.Lock()
    running = [0]  # single-element list so the closure can mutate the counter

    def fake(cmd, timeout, on_text=None, cwd=None):
        # Count ourselves in; anyone already inside means an overlap.
        with guard:
            running[0] += 1
            overlapping = running[0] > 1
            if overlapping:
                concurrent_seen.set()

        in_critical.set()
        time.sleep(sleep_seconds)

        with guard:
            running[0] -= 1

        payload = {
            "result": "Hello from Claude!",
            "session_id": "sess-abc-123",
            "usage": {"input_tokens": 10, "output_tokens": 5},
            "total_cost_usd": 0.001,
            "cost_usd": 0.001,
            "duration_ms": int(sleep_seconds * 1000),
            "num_turns": 1,
            "intermediate_count": 0,
            "subtype": "success",
            "is_error": False,
        }
        return payload

    return fake
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scenario 1 — same channel serializes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestSameChannelSerializes:
    """Scenario 1: calls sharing a channel_id run strictly one at a time."""

    def test_two_concurrent_calls_same_channel_run_one_at_a_time(
        self, temp_sessions
    ):
        """Two parallel `send_message` calls on one channel must not overlap.

        `_run_claude` is replaced by a fake that flags any moment where more
        than one invocation is inside it. The per-channel mutex has to keep
        that flag unset.
        """
        entered = threading.Event()
        overlapped = threading.Event()
        fake_run = _slow_run_claude(0.25, entered, overlapped)

        with patch.object(claude_session, "_run_claude", side_effect=fake_run):
            began = time.monotonic()
            with ThreadPoolExecutor(max_workers=2) as pool:
                pending = []
                for i in range(2):
                    pending.append(
                        pool.submit(send_message, "ch-same", f"msg-{i}")
                    )
                results = [fut.result(timeout=10) for fut in pending]
            elapsed = time.monotonic() - began

        assert not overlapped.is_set(), (
            "Two send_message calls on the same channel ran concurrently — "
            "mutex did not serialize them."
        )
        assert all(reply == "Hello from Claude!" for reply in results)
        # Two back-to-back 0.25s fakes need roughly 0.5s; the floor is kept
        # slightly below that to tolerate timer granularity.
        assert elapsed >= 0.45, f"Expected serialized ~0.5s, got {elapsed:.3f}s"

    def test_lock_is_reentrant_per_channel_dict(self, temp_sessions):
        """`_get_session_lock` hands out one lock object per channel id.

        Despite the test name, this checks object identity only: the same
        channel yields the same lock, a different channel yields another one.
        """
        first_a = _get_session_lock("channel-A")
        second_a = _get_session_lock("channel-A")
        other = _get_session_lock("channel-B")

        assert first_a is second_a
        assert first_a is not other
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scenario 2 — different channels parallel
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestDifferentChannelsParallel:
    """Scenario 2: calls on distinct channel_ids never block one another."""

    def test_two_concurrent_calls_different_channels_run_in_parallel(
        self, temp_sessions
    ):
        """Different channels MUST NOT block each other.

        We measure elapsed wall-clock: two 0.4s subprocesses on different
        channels should finish in ~0.4s (parallel), NOT ~0.8s (serialized).
        """
        in_critical = threading.Event()
        # `concurrent_seen` is OK to fire here — we WANT them to overlap.
        concurrent_seen = threading.Event()
        slow = _slow_run_claude(0.4, in_critical, concurrent_seen)

        with patch.object(claude_session, "_run_claude", side_effect=slow):
            start = time.monotonic()
            with ThreadPoolExecutor(max_workers=2) as pool:
                f1 = pool.submit(send_message, "ch-A", "msg-A")
                f2 = pool.submit(send_message, "ch-B", "msg-B")
                results = [f1.result(timeout=10), f2.result(timeout=10)]
            elapsed = time.monotonic() - start

        assert all(r == "Hello from Claude!" for r in results)
        # Parallel execution: total time should be close to 0.4s, well under
        # 0.7s (would mean serialization). 0.65s ceiling allows for GIL +
        # scheduler jitter on a busy test box.
        assert elapsed < 0.65, (
            f"Different channels appear serialized: elapsed {elapsed:.3f}s "
            f"(expected ~0.4s parallel, <0.65s ceiling)"
        )
        assert concurrent_seen.is_set(), (
            "Different channels did not overlap — mutex is too coarse "
            "(should be per-channel, not global)."
        )

    def test_three_channels_all_overlap(self, temp_sessions):
        """Stress: three concurrent channels all run in parallel.

        Checks both the wall-clock ceiling and the overlap detector. The
        detector only reports that at least two invocations were active at
        once; it does not prove all three were.
        """
        in_critical = threading.Event()
        concurrent_seen = threading.Event()
        slow = _slow_run_claude(0.3, in_critical, concurrent_seen)

        with patch.object(claude_session, "_run_claude", side_effect=slow):
            start = time.monotonic()
            with ThreadPoolExecutor(max_workers=3) as pool:
                futures = [
                    pool.submit(send_message, f"ch-{i}", f"msg-{i}")
                    for i in range(3)
                ]
                for f in as_completed(futures, timeout=10):
                    assert f.result() == "Hello from Claude!"
            elapsed = time.monotonic() - start

        # 3 × 0.3s in parallel ≈ 0.3s; serial would be ~0.9s.
        assert elapsed < 0.6, (
            f"Three channels serialized: {elapsed:.3f}s (expected <0.6s)"
        )
        # Previously the detector was created but never checked, so the
        # "overlap" in the test name rested on timing alone.
        assert concurrent_seen.is_set(), (
            "Three channels never overlapped — mutex is too coarse "
            "(should be per-channel, not global)."
        )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scenario 3 — acquisition behavior documented and consistent
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestAcquisitionBehavior:
    """Pin the chosen acquisition policy: blocking, no timeout.

    Project style is to bound subprocess execution via `timeout` (default
    5 min) rather than fail-fast on lock acquire. Reasons:

    - Adapter callers (Discord/Telegram/voice) already serialize work via
      asyncio.to_thread; queue depth is naturally bounded.
    - A non-blocking acquire would surface a timing error to the user
      ("busy, try again") for an entirely transient and self-resolving
      condition. Blocking gives FIFO-ish ordering with simple semantics.
    - If a subprocess truly hangs past `timeout`, _run_claude raises
      TimeoutError → the held lock releases (via `with`) → queued
      callers proceed.

    This test pins that: a second caller waits and eventually proceeds; it
    does not raise an exception on contention.

    Worker threads are created with `daemon=True` so that a genuine deadlock
    regression fails the assertion instead of also hanging the interpreter
    at exit while it waits for a stuck non-daemon thread.
    """

    def test_contested_acquire_blocks_then_proceeds(self, temp_sessions):
        """A second caller on a busy channel waits, then gets its reply."""
        in_critical = threading.Event()
        concurrent_seen = threading.Event()
        slow = _slow_run_claude(0.3, in_critical, concurrent_seen)

        results: list[str | BaseException] = []

        def run(label: str):
            # Capture anything raised so the main thread can report it.
            try:
                results.append(send_message("ch-contend", label))
            except BaseException as e:
                results.append(e)

        with patch.object(claude_session, "_run_claude", side_effect=slow):
            t1 = threading.Thread(target=run, args=("first",), daemon=True)
            t1.start()
            # Wait until the first call is inside the critical section so
            # the second is GUARANTEED to contend on the lock.
            assert in_critical.wait(timeout=2.0), "first call never entered"
            in_critical.clear()
            t2 = threading.Thread(target=run, args=("second",), daemon=True)
            t2.start()
            t1.join(timeout=5.0)
            t2.join(timeout=5.0)

        # join() with a timeout returns silently on expiry; check explicitly.
        assert not t1.is_alive() and not t2.is_alive(), (
            "A send_message call is still blocked after 5s — possible deadlock."
        )
        assert len(results) == 2
        # Both must return the canned response — no exception, no error.
        assert all(r == "Hello from Claude!" for r in results), (
            f"Contended acquire surfaced an error instead of blocking: {results}"
        )
        # Critical-section overlap check: contended calls MUST serialize.
        assert not concurrent_seen.is_set(), (
            "Contended same-channel calls ran concurrently — mutex broken."
        )

    def test_lock_released_on_subprocess_exception(self, temp_sessions):
        """If `_run_claude` raises, the lock MUST be released so the next
        caller can proceed (otherwise a single error deadlocks the channel
        forever)."""

        call_count = {"n": 0}

        def flaky(cmd, timeout, on_text=None, cwd=None):
            # First invocation crashes; every later one succeeds.
            call_count["n"] += 1
            if call_count["n"] == 1:
                raise RuntimeError("simulated subprocess crash")
            return {
                "result": "Hello from Claude!",
                "session_id": "sess-abc-123",
                "usage": {"input_tokens": 10, "output_tokens": 5},
                "total_cost_usd": 0.001,
                "cost_usd": 0.001,
                "duration_ms": 50,
                "num_turns": 1,
                "intermediate_count": 0,
                "subtype": "success",
                "is_error": False,
            }

        with patch.object(claude_session, "_run_claude", side_effect=flaky):
            with pytest.raises(RuntimeError, match="simulated subprocess crash"):
                send_message("ch-recover", "first")

            # Second call MUST acquire the lock (proves the first released it).
            # We use a short timeout via a thread so a deadlock would fail loudly.
            done = threading.Event()
            result_box: list[str | BaseException] = []

            def second():
                # Always signal completion, and keep any exception, so a
                # crash here is not misreported as a deadlock below.
                try:
                    result_box.append(send_message("ch-recover", "second"))
                except BaseException as e:
                    result_box.append(e)
                finally:
                    done.set()

            t = threading.Thread(target=second, daemon=True)
            t.start()
            assert done.wait(timeout=3.0), (
                "Second call deadlocked — lock was not released on exception."
            )
            t.join(timeout=1.0)
            assert result_box == ["Hello from Claude!"]
|
||||
@@ -30,7 +30,10 @@ class TestClearCommand:
|
||||
response, is_cmd = route_message("ch-1", "user-1", "/clear")
|
||||
assert response == "Session cleared. Model reset to sonnet."
|
||||
assert is_cmd is True
|
||||
mock_clear.assert_called_once_with("ch-1")
|
||||
# /clear drops both the text-adapter session and the isolated voice
|
||||
# session for the same Discord channel.
|
||||
mock_clear.assert_any_call("ch-1")
|
||||
mock_clear.assert_any_call("voice:ch-1")
|
||||
|
||||
@patch("src.router._get_config")
|
||||
@patch("src.router.clear_session")
|
||||
@@ -191,7 +194,7 @@ class TestRegularMessage:
|
||||
response, is_cmd = route_message("ch-1", "user-1", "hello")
|
||||
assert response == "Hello from Claude!"
|
||||
assert is_cmd is False
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="sonnet", on_text=None)
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="sonnet", on_text=None, voice_mode=False)
|
||||
|
||||
@patch("src.router.send_message")
|
||||
def test_model_override(self, mock_send):
|
||||
@@ -199,7 +202,7 @@ class TestRegularMessage:
|
||||
response, is_cmd = route_message("ch-1", "user-1", "hello", model="opus")
|
||||
assert response == "Response"
|
||||
assert is_cmd is False
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="opus", on_text=None)
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="opus", on_text=None, voice_mode=False)
|
||||
|
||||
@patch("src.router._get_channel_config")
|
||||
@patch("src.router._get_config")
|
||||
@@ -227,7 +230,7 @@ class TestRegularMessage:
|
||||
|
||||
cb = lambda t: None
|
||||
route_message("ch-1", "user-1", "hello", on_text=cb)
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="sonnet", on_text=cb)
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="sonnet", on_text=cb, voice_mode=False)
|
||||
|
||||
|
||||
# --- _get_channel_config ---
|
||||
@@ -269,7 +272,7 @@ class TestModelResolution:
|
||||
mock_chan_cfg.return_value = {"id": "ch-1", "default_model": "haiku"}
|
||||
|
||||
route_message("ch-1", "user-1", "hello")
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="haiku", on_text=None)
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="haiku", on_text=None, voice_mode=False)
|
||||
|
||||
@patch("src.router._get_channel_config")
|
||||
@patch("src.router._get_config")
|
||||
@@ -283,7 +286,7 @@ class TestModelResolution:
|
||||
mock_get_config.return_value = mock_cfg
|
||||
|
||||
route_message("ch-1", "user-1", "hello")
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="opus", on_text=None)
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="opus", on_text=None, voice_mode=False)
|
||||
|
||||
@patch("src.router._get_channel_config")
|
||||
@patch("src.router._get_config")
|
||||
@@ -297,7 +300,7 @@ class TestModelResolution:
|
||||
mock_get_config.return_value = mock_cfg
|
||||
|
||||
route_message("ch-1", "user-1", "hello")
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="sonnet", on_text=None)
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="sonnet", on_text=None, voice_mode=False)
|
||||
|
||||
@patch("src.router.get_active_session")
|
||||
@patch("src.router.send_message")
|
||||
@@ -307,4 +310,4 @@ class TestModelResolution:
|
||||
mock_get_session.return_value = {"model": "opus", "session_id": "abc"}
|
||||
|
||||
route_message("ch-1", "user-1", "hello")
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="opus", on_text=None)
|
||||
mock_send.assert_called_once_with("ch-1", "hello", model="opus", on_text=None, voice_mode=False)
|
||||
|
||||
222
tests/test_voice_adapter_contract.py
Normal file
222
tests/test_voice_adapter_contract.py
Normal file
@@ -0,0 +1,222 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Contract test for `src/voice/_discord_voice_adapter.py`.
|
||||
|
||||
Purpose: catch drift when the vendored `discord-ext-voice-recv` is upgraded.
|
||||
If upstream renames/removes a method we depend on, this test fails LOUDLY
|
||||
before any downstream code breaks at runtime in a Discord voice call.
|
||||
|
||||
Per VENDOR_INFO.md: this test MUST PASS after every vendor upgrade.
|
||||
|
||||
Plain `import` + `hasattr` / `callable` checks — no mocks. We're verifying
|
||||
the SHAPE of the API surface, not behavior.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import inspect
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# --- Adapter re-exports import cleanly --------------------------------------
|
||||
|
||||
|
||||
def test_adapter_exports_voice_receive_client():
    """The adapter module re-exports `VoiceReceiveClient`, and it is a class."""
    from src.voice._discord_voice_adapter import VoiceReceiveClient as exported

    assert exported is not None
    assert inspect.isclass(exported)
|
||||
|
||||
|
||||
def test_adapter_exports_audio_sink():
    """The adapter module re-exports `AudioSink`, and it is a class."""
    from src.voice._discord_voice_adapter import AudioSink as exported

    assert exported is not None
    assert inspect.isclass(exported)
|
||||
|
||||
|
||||
def test_adapter_exports_voice_data():
    """The adapter module re-exports `VoiceData`, and it is a class."""
    from src.voice._discord_voice_adapter import VoiceData as exported

    assert exported is not None
    assert inspect.isclass(exported)
|
||||
|
||||
|
||||
def test_adapter_exports_connect_helper():
    """The adapter module exposes `connect_voice` as a coroutine function."""
    from src.voice._discord_voice_adapter import connect_voice as helper

    assert callable(helper)
    assert inspect.iscoroutinefunction(helper)
|
||||
|
||||
|
||||
# --- Re-exports point at the real vendored classes (no accidental shadowing) -
|
||||
|
||||
|
||||
def test_voice_receive_client_is_voice_recv_client():
    """`VoiceReceiveClient` is the very same object as `voice_recv.VoiceRecvClient`."""
    from discord.ext import voice_recv

    from src.voice._discord_voice_adapter import VoiceReceiveClient as exported

    upstream = voice_recv.VoiceRecvClient
    assert exported is upstream
|
||||
|
||||
|
||||
def test_audio_sink_is_voice_recv_audio_sink():
    """`AudioSink` is the very same object as `voice_recv.AudioSink`."""
    from discord.ext import voice_recv

    from src.voice._discord_voice_adapter import AudioSink as exported

    upstream = voice_recv.AudioSink
    assert exported is upstream
|
||||
|
||||
|
||||
def test_voice_data_is_voice_recv_voice_data():
    """`VoiceData` is the very same object as `voice_recv.VoiceData`."""
    from discord.ext import voice_recv

    from src.voice._discord_voice_adapter import VoiceData as exported

    upstream = voice_recv.VoiceData
    assert exported is upstream
|
||||
|
||||
|
||||
# --- VoiceReceiveClient API surface used by the pipeline --------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method_name",
    [
        "connect",         # inherited from discord.VoiceClient
        "disconnect",      # inherited from discord.VoiceClient
        "listen",          # voice_recv extension
        "stop_listening",  # voice_recv extension
        "is_listening",    # voice_recv extension
        "stop",            # voice_recv extension (stops play+listen)
        "cleanup",         # voice_recv extension
    ],
)
def test_voice_receive_client_has_method(method_name):
    """Each listed method exists on `VoiceReceiveClient` and is callable."""
    from src.voice._discord_voice_adapter import VoiceReceiveClient

    member = getattr(VoiceReceiveClient, method_name, None)

    assert member is not None, f"VoiceReceiveClient is missing `.{method_name}()`"
    assert callable(member), f"VoiceReceiveClient.{method_name} is not callable"
|
||||
|
||||
|
||||
def test_voice_receive_client_listen_accepts_sink_and_after():
    """`.listen(sink, *, after=None)` is the canonical call shape.

    Only the presence of the `sink` and `after` parameter names is checked.
    """
    from src.voice._discord_voice_adapter import VoiceReceiveClient

    params = inspect.signature(VoiceReceiveClient.listen).parameters

    assert "sink" in params, (
        f"VoiceReceiveClient.listen missing `sink` param; got {list(params)}"
    )
    assert "after" in params, (
        f"VoiceReceiveClient.listen missing `after` kwarg; got {list(params)}"
    )
|
||||
|
||||
|
||||
def test_voice_receive_client_has_sink_property():
    """`.sink` is a property with both a getter and a setter.

    Read/write access is what lets a sink be swapped in place.
    """
    from src.voice._discord_voice_adapter import VoiceReceiveClient

    # getattr_static avoids triggering the descriptor, so we see the property
    # object itself rather than its value.
    descriptor = inspect.getattr_static(VoiceReceiveClient, "sink", None)

    assert isinstance(descriptor, property), "VoiceReceiveClient.sink must be a property"
    assert descriptor.fget is not None, "VoiceReceiveClient.sink property missing getter"
    assert descriptor.fset is not None, "VoiceReceiveClient.sink property missing setter"
|
||||
|
||||
|
||||
# --- AudioSink API surface --------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method_name",
    [
        "write",       # write(user, voice_data) — the hot path
        "cleanup",
        "wants_opus",  # bool: opus bytes vs decoded PCM
    ],
)
def test_audio_sink_has_method(method_name):
    """Each listed method exists on `AudioSink` and is callable."""
    from src.voice._discord_voice_adapter import AudioSink

    member = getattr(AudioSink, method_name, None)

    assert member is not None, f"AudioSink is missing `.{method_name}()`"
    assert callable(member), f"AudioSink.{method_name} is not callable"
|
||||
|
||||
|
||||
def test_audio_sink_write_signature():
    """`AudioSink.write` must take at least three parameters: (self, user, data).

    Only the parameter count is checked, not the parameter names.
    """
    from src.voice._discord_voice_adapter import AudioSink

    params = list(inspect.signature(AudioSink.write).parameters)
    # Unbound method, so `self` is counted: self, user, data.
    assert len(params) >= 3, f"AudioSink.write expected (self, user, data), got {params}"
|
||||
|
||||
|
||||
# --- VoiceData attributes ---------------------------------------------------
|
||||
|
||||
|
||||
def test_voice_data_slots():
    """`VoiceData` must declare `__slots__` containing the names the pipeline reads.

    Required slot names: `packet`, `source` and `pcm`.
    """
    from src.voice._discord_voice_adapter import VoiceData

    assert hasattr(VoiceData, "__slots__"), "VoiceData lost __slots__ — perf regression risk"

    # Documented attributes the pipeline depends on.
    slots = set(VoiceData.__slots__)
    assert "packet" in slots, f"VoiceData missing `packet` slot; got {slots}"
    assert "source" in slots, f"VoiceData missing `source` slot (speaker user); got {slots}"
    assert "pcm" in slots, f"VoiceData missing `pcm` slot (decoded audio); got {slots}"
|
||||
|
||||
|
||||
def test_voice_data_has_opus_property():
    """`VoiceData.opus` must be declared as a property on the class."""
    from src.voice._discord_voice_adapter import VoiceData

    # Static lookup: inspect the descriptor itself, never invoke it.
    descriptor = inspect.getattr_static(VoiceData, "opus", None)
    assert isinstance(descriptor, property), "VoiceData.opus must be a property"
|
||||
|
||||
|
||||
# --- Echo-core DAVE-decrypt fork guards -------------------------------------
|
||||
#
|
||||
# Two contract tests pinned by the DAVE receive-side decrypt patch.
|
||||
# See plan: /home/moltbot/.claude/plans/wiggly-exploring-glade.md
|
||||
#
|
||||
# These fail fast on either:
|
||||
# 1. An upstream voice-recv re-install wiping the fork's version marker
|
||||
# (i.e. our patch is gone), OR
|
||||
# 2. A discord.py upgrade renaming the connection-level DAVE attrs the
|
||||
# patch reads (`dave_session`, `dave_protocol_version`).
|
||||
|
||||
|
||||
def test_voice_recv_fork_version():
    """Guard the Echo-core fork marker on the vendored voice-recv package.

    Lane A's DAVE-decrypt patch bumps `voice_recv.__version__` to
    `'0.5.3a+echo.dave1'` (a PEP 440 local version segment). A mismatch after
    a vendor reinstall means the fork patch was lost: re-apply
    `_maybe_dave_decrypt` and the `callback()` hook before deploying, otherwise
    live voice regresses to the `opus_decode: corrupted stream` error chain.
    """
    from discord.ext import voice_recv

    expected_tag = "0.5.3a+echo.dave1"
    assert voice_recv.__version__ == expected_tag, (
        f"voice_recv.__version__ is {voice_recv.__version__!r}; expected "
        "'0.5.3a+echo.dave1'. The DAVE-decrypt fork patch has been "
        "overwritten — re-apply before reinstalling the vendored package."
    )
|
||||
|
||||
|
||||
def test_voice_connection_state_has_dave_attrs():
    """Check that discord.py's `VoiceConnectionState` still mentions the DAVE attrs.

    `_maybe_dave_decrypt` reads `dave_session` and `dave_protocol_version` off
    the connection state. The check is textual (a substring search over the
    class source), so a rename in a future discord.py upgrade fails loudly here
    rather than in a live voice call, where the symptom is silent packet drops.
    """
    from discord import voice_state

    state_source = inspect.getsource(voice_state.VoiceConnectionState)

    assert "dave_session" in state_source, (
        "discord.voice_state.VoiceConnectionState source no longer mentions "
        "'dave_session' — discord.py may have renamed the attr. Update "
        "vendor/discord-ext-voice-recv/.../reader.py::_maybe_dave_decrypt."
    )
    assert "dave_protocol_version" in state_source, (
        "discord.voice_state.VoiceConnectionState source no longer mentions "
        "'dave_protocol_version' — discord.py may have renamed the attr. "
        "Update _maybe_dave_decrypt accordingly."
    )
|
||||
55
tests/test_voice_commands.py
Normal file
55
tests/test_voice_commands.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""Tests for src/voice/voice_commands.detect_voice_change."""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from src.voice.voice_commands import detect_voice_change
|
||||
|
||||
|
||||
class TestDetectVoiceChange:
    """Table-driven checks for `detect_voice_change`.

    Each case feeds one utterance to the detector and compares the result with
    the expected voice id, or with `None` when no voice change should be
    detected.
    """

    # --- positive cases (direct form) ---
    @pytest.mark.parametrize(
        "text,expected",
        [
            ("schimbă vocea pe M5", "M5"),
            ("Schimbă vocea pe F3.", "F3"),
            ("vorbește cu vocea M1", "M1"),
            ("vorbește cu vocea F2", "F2"),
            ("voce M4", "M4"),
            ("Voce F5.", "F5"),
            ("treci pe vocea F1", "F1"),
            ("Echo, treci pe M2.", "M2"),
            ("voice M3", "M3"),
        ],
    )
    def test_direct_form(self, text, expected):
        """Voice id written literally (letter + digit) after a trigger phrase."""
        detected = detect_voice_change(text)
        assert detected == expected

    # --- positive cases (word form, what Whisper actually produces) ---
    @pytest.mark.parametrize(
        "text,expected",
        [
            ("schimbă vocea pe em cinci", "M5"),
            ("vorbește cu vocea em trei", "M3"),
            ("voce em unu", "M1"),
            ("schimbă vocea pe ef doi", "F2"),
            ("voce ef cinci", "F5"),
            ("vorbește cu vocea masculină cinci", "M5"),
            ("schimbă vocea pe feminină trei", "F3"),
            ("voce masculin patru", "M4"),
            ("schimbă vocea pe M cinci", "M5"),
            ("voce F două", "F2"),
        ],
    )
    def test_word_form(self, text, expected):
        """Voice id spelled out in words, as it appears in transcripts."""
        detected = detect_voice_change(text)
        assert detected == expected

    # --- negative cases ---
    @pytest.mark.parametrize(
        "text",
        [
            "",
            "cât este ora",
            "M5",  # no trigger word
            "Salut Echo, sunt în M3",  # M3 here is a location/etc, no trigger
            "vocea ta este foarte bună",  # trigger but no voice id
            "schimbă te rog",  # trigger but no id
            "voce M6",  # out of range
            "voce M0",  # out of range
            "voce F8",  # out of range
            "schimbă vocea pe șapte",  # digit out of range
        ],
    )
    def test_no_match(self, text):
        """Utterances that must not be treated as a voice-change command."""
        detected = detect_voice_change(text)
        assert detected is None
|
||||
137
tests/test_voice_normalize.py
Normal file
137
tests/test_voice_normalize.py
Normal file
@@ -0,0 +1,137 @@
|
||||
"""Tests for src/voice/normalize.py — 35 Romanian cases.
|
||||
|
||||
Categories:
|
||||
markdown strip (5), numbers cardinals (6), decimals (4),
|
||||
currency natural (8), symbols (4), abbreviations (4),
|
||||
truncation boundary (2), edge cases empty / whitespace (2).
|
||||
|
||||
Total: 35.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from src.voice.normalize import (
|
||||
expand_abbreviations,
|
||||
expand_currency,
|
||||
expand_numbers_ro,
|
||||
expand_symbols,
|
||||
normalize_for_tts,
|
||||
strip_markdown,
|
||||
)
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Markdown stripping (5)
|
||||
# ============================================================
|
||||
@pytest.mark.parametrize(
    "text,expected",
    [
        ("**bold text**", "bold text"),
        ("*italic text*", "italic text"),
        ("`code snippet`", "code snippet"),
        ("[click here](https://example.com)", "click here"),
        ("# Heading text", "Heading text"),
    ],
)
def test_strip_markdown(text, expected):
    """Bold, italic, inline code, links and headings reduce to their plain text."""
    stripped = strip_markdown(text)
    assert stripped == expected
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Numbers cardinals (6)
|
||||
# ============================================================
|
||||
@pytest.mark.parametrize(
    "text,expected",
    [
        ("21", "douăzeci și unu"),
        ("81", "optzeci și unu"),
        ("100", "o sută"),
        ("3", "trei"),
        ("0", "zero"),
        ("200", "două sute"),
    ],
)
def test_expand_numbers_cardinals(text, expected):
    """Whole numbers are spelled out as Romanian cardinals."""
    spoken = expand_numbers_ro(text)
    assert spoken == expected
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Decimals (4)
|
||||
# ============================================================
|
||||
@pytest.mark.parametrize(
    "text,expected",
    [
        ("3.14", "trei virgulă paisprezece"),
        ("12.5", "doisprezece virgulă cinci"),
        ("0.5", "zero virgulă cinci"),
        ("99.99", "nouăzeci și nouă virgulă nouăzeci și nouă"),
    ],
)
def test_expand_numbers_decimals(text, expected):
    """Decimals are read as `<integer part> virgulă <fractional part>`."""
    spoken = expand_numbers_ro(text)
    assert spoken == expected
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Currency natural RO (8) — RON / USD / EUR / GBP mix
|
||||
# ============================================================
|
||||
@pytest.mark.parametrize(
    "text,expected",
    [
        ("12.50 RON", "doisprezece lei și cincizeci de bani"),
        ("$25.99", "douăzeci și cinci de dolari și nouăzeci și nouă de cenți"),
        ("€100.50", "o sută de euro și cincizeci de cenți"),
        ("£200", "două sute de lire"),
        ("100 RON", "o sută de lei"),
        ("$1", "un dolar"),
        ("€50", "cincizeci de euro"),
        ("1 RON", "un leu"),
    ],
)
def test_expand_currency(text, expected):
    """RON / USD / EUR / GBP amounts are expanded into natural Romanian phrases."""
    spoken = expand_currency(text)
    assert spoken == expected
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Symbols (4)
|
||||
# ============================================================
|
||||
@pytest.mark.parametrize(
    "text,expected",
    [
        ("25%", "25 la sută"),
        ("foo & bar", "foo și bar"),
        ("Marius @ home", "Marius la home"),
        ("30°", "30 grade"),
    ],
)
def test_expand_symbols(text, expected):
    """`%`, `&`, `@` and `°` are replaced by their spoken Romanian words."""
    spoken = expand_symbols(text)
    assert spoken == expected
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Abbreviations (4)
|
||||
# ============================================================
|
||||
@pytest.mark.parametrize(
    "text,expected",
    [
        ("etc.", "etcetera"),
        ("dl. Popescu", "domnul Popescu"),
        ("dna. Ionescu", "doamna Ionescu"),
        ("nr. 5", "numărul 5"),
    ],
)
def test_expand_abbreviations(text, expected):
    """Common Romanian abbreviations are expanded to the full word."""
    spoken = expand_abbreviations(text)
    assert spoken == expected
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Truncation boundary (2)
|
||||
# ============================================================
|
||||
def test_truncate_exactly_200_words_unchanged():
    """An input of exactly 200 words passes through with no truncation suffix."""
    expected_words = ["cuvant"] * 200
    out = normalize_for_tts(" ".join(expected_words))

    # The chat-deferral phrase must not be appended at the boundary.
    assert "Restul l-am scris în chat." not in out
    assert out.split() == expected_words
|
||||
|
||||
|
||||
def test_truncate_over_200_words_appends_suffix():
    """A 250-word input keeps its first 200 words, then the chat-deferral phrase."""
    out = normalize_for_tts(" ".join(["cuvant"] * 250))
    assert out.endswith("Restul l-am scris în chat.")

    words = out.split()
    kept, suffix = words[:200], words[200:]
    # First 200 tokens survive untouched; the tail is the 5-word suffix.
    assert kept == ["cuvant"] * 200
    assert suffix == ["Restul", "l-am", "scris", "în", "chat."]
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Edge cases (2)
|
||||
# ============================================================
|
||||
@pytest.mark.parametrize(
    "text,expected",
    [
        ("", ""),
        (" ", ""),
    ],
)
def test_normalize_edge_cases(text, expected):
    """Empty and whitespace-only inputs normalize to the empty string."""
    normalized = normalize_for_tts(text)
    assert normalized == expected
|
||||
302
tests/test_voice_recv_dave.py
Normal file
302
tests/test_voice_recv_dave.py
Normal file
@@ -0,0 +1,302 @@
|
||||
"""DAVE receive-side decrypt tests for the vendored voice-recv fork.
|
||||
|
||||
Exercises Lane A's patch on
|
||||
`vendor/discord-ext-voice-recv/discord/ext/voice_recv/reader.py`:
|
||||
|
||||
* `_maybe_dave_decrypt(rtp_packet)` — DAVE E2E layer sandwiched between the
|
||||
transport-layer decrypt and the routing into the opus decoder. No-op when
|
||||
the room is non-DAVE, when davey isn't installed, or when the SSRC map
|
||||
hasn't caught up to a new speaker yet.
|
||||
* `callback()` hook — feeds the DAVE-unwrapped plaintext into
|
||||
`packet_router.feed_rtp()` on success, drops the packet on failure WITHOUT
|
||||
killing the reader thread.
|
||||
|
||||
The test fixtures mirror `tests/test_voice_session_cleanup.py:33-54`:
|
||||
* Construct `AudioReader` via `AudioReader.__new__(AudioReader)` + manual
|
||||
attr set so the reader thread is never started.
|
||||
* `MagicMock` everything below the unit under test.
|
||||
|
||||
`_HAS_DAVE` / `_MEDIA_TYPE_AUDIO` on the reader module are monkey-patched per
|
||||
test so the suite passes whether or not `davey` is importable in the venv.
|
||||
The assertions only become meaningful once Lane A's patch has landed and the
|
||||
package has been re-installed (`pip install -e vendor/discord-ext-voice-recv
|
||||
--force-reinstall`); the FILE itself is valid Python regardless.
|
||||
|
||||
See plan: /home/moltbot/.claude/plans/wiggly-exploring-glade.md
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from discord.ext.voice_recv.reader import AudioReader
|
||||
|
||||
|
||||
# Sentinel for `_MEDIA_TYPE_AUDIO`. Using a plain object() keeps the tests
|
||||
# independent of whether davey is importable — we just assert the value
|
||||
# flows through to `dave_session.decrypt()` unchanged.
|
||||
_FAKE_MEDIA_TYPE_AUDIO = object()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture
def fake_dave_session():
    """Mock DAVE session: ready, not in passthrough, with a canned `decrypt` result."""
    session = MagicMock(name="dave_session")
    session.ready = True
    # Default: this user is NOT in passthrough, so DAVE decrypt must run.
    # Individual tests override this with True to exercise the passthrough path.
    session.can_passthrough = MagicMock(return_value=False)
    session.decrypt = MagicMock(return_value=b"plaintext_opus")
    return session
|
||||
|
||||
|
||||
@pytest.fixture
def fake_connection(fake_dave_session):
    """Mock connection state with DAVE protocol version 1 and the fake session attached."""
    return MagicMock(
        name="_connection",
        dave_protocol_version=1,
        dave_session=fake_dave_session,
    )
|
||||
|
||||
|
||||
@pytest.fixture
def fake_voice_client(fake_connection):
    """Mock voice client wired to the fake connection, with one known SSRC mapping."""
    client = MagicMock(name="voice_client")
    client._connection = fake_connection
    # SSRC 12345 (the one `fake_rtp_packet` carries) maps to id 999_000.
    client._ssrc_to_id = {12345: 999_000}
    return client
|
||||
|
||||
|
||||
@pytest.fixture
def fake_rtp_packet():
    """Mock RTP packet: SSRC 12345, a placeholder payload, and not a silence frame."""
    packet = MagicMock(name="rtp_packet")
    packet.ssrc = 12345
    packet.decrypted_data = b"ciphertext_after_transport_decrypt"
    packet.is_silence = MagicMock(return_value=False)
    return packet
|
||||
|
||||
|
||||
@pytest.fixture
def reader(fake_voice_client):
    """Build an `AudioReader` without running `__init__`.

    The instance is allocated via `__new__` and only the attributes the tests
    need (`voice_client`, `error`) are set by hand, so the public surface can
    be driven against pure mocks. The module docstring notes this is the same
    pattern `tests/test_voice_session_cleanup.py` uses.
    """
    bare_reader = AudioReader.__new__(AudioReader)
    bare_reader.voice_client = fake_voice_client
    bare_reader.error = None
    return bare_reader
|
||||
|
||||
|
||||
@pytest.fixture
def dave_enabled(monkeypatch):
    """Force the reader module's DAVE-availability flags ON and return the module.

    `_HAS_DAVE` is set to True and `_MEDIA_TYPE_AUDIO` is pinned to the
    module-level sentinel, so the happy-path test can assert exactly what is
    passed to `dave_session.decrypt`. `raising=False` keeps the patch valid
    even when the attributes do not exist on the module yet.
    """
    import discord.ext.voice_recv.reader as reader_mod

    overrides = (
        ("_HAS_DAVE", True),
        ("_MEDIA_TYPE_AUDIO", _FAKE_MEDIA_TYPE_AUDIO),
    )
    for attr_name, value in overrides:
        monkeypatch.setattr(reader_mod, attr_name, value, raising=False)
    return reader_mod
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Unit tests: `_maybe_dave_decrypt`
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestMaybeDaveDecrypt:
    """Nine unit tests on the DAVE-decrypt gate.

    The gate mirrors `voice_client.can_encrypt` in discord.py 2.7.1 exactly
    (`voice_state.py:272-273`). Bypass semantics on every "DAVE inactive"
    branch let non-DAVE rooms and davey-less environments keep working.

    Outcomes pinned here:
      * bypass  — the transport-decrypted payload is returned unchanged (same
        object) and `dave_session.decrypt` is never called;
      * drop    — `None` is returned and `reader.error` stays untouched;
      * decrypt — the bytes produced by `dave_session.decrypt` are returned.
    """

    def test_protocol_version_zero_bypasses_decrypt(
        self, dave_enabled, reader, fake_connection, fake_dave_session, fake_rtp_packet,
    ):
        """`dave_protocol_version == 0` → return the transport-decrypted
        payload unchanged; never touch `dave_session.decrypt`."""
        fake_connection.dave_protocol_version = 0
        result = reader._maybe_dave_decrypt(fake_rtp_packet)
        assert result is fake_rtp_packet.decrypted_data
        fake_dave_session.decrypt.assert_not_called()

    def test_dave_session_none_bypasses_decrypt(
        self, dave_enabled, reader, fake_connection, fake_rtp_packet,
    ):
        """`dave_session is None` → bypass. Pre-MLS-handshake state."""
        fake_connection.dave_session = None
        result = reader._maybe_dave_decrypt(fake_rtp_packet)
        assert result is fake_rtp_packet.decrypted_data

    def test_dave_session_not_ready_bypasses_decrypt(
        self, dave_enabled, reader, fake_dave_session, fake_rtp_packet,
    ):
        """`dave_session.ready is False` → bypass. Pre-MLS-epoch-1 packets
        are transport-only on the wire."""
        fake_dave_session.ready = False
        result = reader._maybe_dave_decrypt(fake_rtp_packet)
        assert result is fake_rtp_packet.decrypted_data
        fake_dave_session.decrypt.assert_not_called()

    def test_unknown_ssrc_returns_none(
        self, dave_enabled, reader, fake_voice_client, fake_dave_session, fake_rtp_packet,
    ):
        """SSRC not in `_ssrc_to_id` → drop (return None).

        Accepted regression: davey requires per-user keys; when SPEAKING
        events race behind the first audio packet, 1-5 packets per new
        speaker per session are dropped. See plan §Edge cases.
        """
        fake_voice_client._ssrc_to_id.clear()
        result = reader._maybe_dave_decrypt(fake_rtp_packet)
        assert result is None
        fake_dave_session.decrypt.assert_not_called()

    def test_happy_path_invokes_decrypt_and_returns_plaintext(
        self, dave_enabled, reader, fake_dave_session, fake_rtp_packet,
    ):
        """Full DAVE-active path: `decrypt(user_id, MediaType.audio, ciphertext)`
        called exactly once with the expected args; method returns the
        davey plaintext bytes verbatim."""
        # Capture the payload first: the packet attribute may be rewritten later.
        ciphertext = fake_rtp_packet.decrypted_data
        result = reader._maybe_dave_decrypt(fake_rtp_packet)
        assert result == b"plaintext_opus"
        fake_dave_session.decrypt.assert_called_once_with(
            999_000, _FAKE_MEDIA_TYPE_AUDIO, ciphertext,
        )

    def test_decrypt_raises_returns_none_no_crash(
        self, dave_enabled, reader, fake_dave_session, fake_rtp_packet,
    ):
        """davey.decrypt raising → drop the packet, don't propagate, and
        leave `reader.error` untouched so the reader thread stays alive.

        MLS epoch transitions can produce transient decrypt failures —
        bumping `reader.error` would call `self.stop()` and kill the whole
        receive pipeline."""
        fake_dave_session.decrypt.side_effect = RuntimeError(
            "simulated MLS epoch transition fail"
        )
        result = reader._maybe_dave_decrypt(fake_rtp_packet)
        assert result is None
        # Prove the None came from the decrypt-failure branch and not from an
        # earlier drop (e.g. the unknown-SSRC path), which also returns None.
        fake_dave_session.decrypt.assert_called()
        assert reader.error is None

    def test_has_dave_false_bypasses_even_with_session_present(
        self, monkeypatch, reader, fake_dave_session, fake_rtp_packet,
    ):
        """`_HAS_DAVE = False` → bypass everything, even if a real session
        somehow showed up on the connection. Defensive shim that keeps the
        tests (and any davey-less deploys) green."""
        import discord.ext.voice_recv.reader as reader_mod

        monkeypatch.setattr(reader_mod, "_HAS_DAVE", False, raising=False)
        result = reader._maybe_dave_decrypt(fake_rtp_packet)
        assert result is fake_rtp_packet.decrypted_data
        fake_dave_session.decrypt.assert_not_called()

    def test_can_passthrough_true_returns_payload_without_decrypt(
        self, dave_enabled, reader, fake_dave_session, fake_rtp_packet,
    ):
        """`can_passthrough(user_id) == True` → return the transport-decrypted
        payload as-is; never call `decrypt`. Mirrors Discord's protocol where
        a passthrough-mode peer sends non-DAVE-wrapped packets that the
        receiver must accept verbatim."""
        fake_dave_session.can_passthrough = MagicMock(return_value=True)
        result = reader._maybe_dave_decrypt(fake_rtp_packet)
        assert result is fake_rtp_packet.decrypted_data
        fake_dave_session.can_passthrough.assert_called_once_with(999_000)
        fake_dave_session.decrypt.assert_not_called()

    def test_can_passthrough_raises_falls_through_to_decrypt(
        self, dave_enabled, reader, fake_dave_session, fake_rtp_packet,
    ):
        """`can_passthrough` raising → swallow the error and try `decrypt`.
        Defensive: an older davey build or transient internal state shouldn't
        break the receive pipeline."""
        fake_dave_session.can_passthrough = MagicMock(
            side_effect=RuntimeError("simulated davey internal error")
        )
        result = reader._maybe_dave_decrypt(fake_rtp_packet)
        assert result == b"plaintext_opus"
        fake_dave_session.decrypt.assert_called_once()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Integration tests: `callback()` exercises the DAVE hook
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestCallbackIntegration:
    """Two integration tests for the hook Lane A inserts between transport
    decrypt (reader.py:141) and the post-decrypt routing (reader.py:159).

    Strategy: stub the transport-decrypt and RTP parsing path so `callback()`
    reaches the hook, then replace `_maybe_dave_decrypt` on the reader instance
    with a mock. `test_callback_feeds_when_dave_returns_bytes` expects
    `feed_rtp` to be called; `test_callback_drops_when_dave_returns_none`
    expects it not to be. Transport-path correctness is left to voice-recv's
    own upstream tests.
    """

    @staticmethod
    def _wire_callback(reader, monkeypatch, fake_rtp_packet):
        """Stub everything `callback()` touches apart from the DAVE hook itself."""
        import discord.ext.voice_recv.reader as reader_mod

        # Force the RTP branch (not RTCP) and hand back the fake packet.
        rtp_module = reader_mod.rtp
        monkeypatch.setattr(rtp_module, "is_rtcp", lambda data: False)
        monkeypatch.setattr(rtp_module, "decode_rtp", lambda data: fake_rtp_packet)

        # Collaborators used by callback() around the hook.
        decryptor = MagicMock(name="decryptor")
        decryptor.decrypt_rtp = MagicMock(return_value=b"ciphertext")
        reader.decryptor = decryptor

        router = MagicMock(name="packet_router")
        router.feed_rtp = MagicMock()
        reader.packet_router = router

        reader.speaking_timer = MagicMock(name="speaking_timer")
        reader.sink = MagicMock(name="sink")

    def test_callback_feeds_when_dave_returns_bytes(
        self, monkeypatch, reader, fake_rtp_packet,
    ):
        """Hook returns bytes → `feed_rtp` is called once with the packet, whose
        `decrypted_data` now equals the post-DAVE plaintext."""
        plaintext = b"dave_unwrapped_opus_payload"
        self._wire_callback(reader, monkeypatch, fake_rtp_packet)
        reader._maybe_dave_decrypt = MagicMock(return_value=plaintext)

        reader.callback(b"raw_packet_bytes")

        reader._maybe_dave_decrypt.assert_called_once_with(fake_rtp_packet)
        feed_rtp = reader.packet_router.feed_rtp
        assert feed_rtp.call_count == 1
        fed_packet = feed_rtp.call_args.args[0]
        assert fed_packet is fake_rtp_packet
        assert fake_rtp_packet.decrypted_data == plaintext
        assert reader.error is None

    def test_callback_drops_when_dave_returns_none(
        self, monkeypatch, reader, fake_rtp_packet,
    ):
        """Hook returns None → `feed_rtp` is never called, nothing propagates,
        and `reader.error` stays None (the reader survives the drop)."""
        self._wire_callback(reader, monkeypatch, fake_rtp_packet)
        reader._maybe_dave_decrypt = MagicMock(return_value=None)

        reader.callback(b"raw_packet_bytes")

        reader._maybe_dave_decrypt.assert_called_once_with(fake_rtp_packet)
        reader.packet_router.feed_rtp.assert_not_called()
        assert reader.error is None
|
||||
319
tests/test_voice_session_cleanup.py
Normal file
319
tests/test_voice_session_cleanup.py
Normal file
@@ -0,0 +1,319 @@
|
||||
"""Cleanup-path tests for ``src/voice/pipeline.py::VoiceSession``.
|
||||
|
||||
Pins the centralized ``cleanup()`` contract from the voice plan
|
||||
(Engineering decision #5): every one of the FIVE exit paths must drain
|
||||
state cleanly and idempotently — lock released, JSONL flushed or
|
||||
discarded, presence cleared, ``voice_client.cleanup()`` invoked,
|
||||
``ttsq.stop()`` invoked, and a second call to ``cleanup()`` MUST be a
|
||||
no-op (side effects happen exactly once).
|
||||
|
||||
The 5 paths under test:
|
||||
1. ``test_cleanup_on_voice_leave`` — explicit ``/voice leave``
|
||||
2. ``test_cleanup_on_disconnect`` — Discord-level disconnect
|
||||
3. ``test_cleanup_on_crash`` — exception via ``__exit__``
|
||||
4. ``test_cleanup_on_auto_leave`` — 5-min inactivity timer
|
||||
5. ``test_cleanup_on_user_leaves_channel`` — user leaves voice channel
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from src.voice.pipeline import VoiceSession
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture
def mock_bot():
    """Mock bot with a user id, an awaitable `change_presence`, and a `get_user` returning None."""
    bot_user = MagicMock()
    bot_user.id = 999_999

    bot = MagicMock(name="bot")
    bot.user = bot_user
    # change_presence is awaited by the code under test, hence AsyncMock.
    bot.change_presence = AsyncMock(name="change_presence")
    bot.get_user = MagicMock(return_value=None)
    return bot
|
||||
|
||||
|
||||
@pytest.fixture
def mock_voice_client():
    """Mock voice client exposing a separately named `cleanup` mock for call counting."""
    return MagicMock(name="voice_client", cleanup=MagicMock(name="vc_cleanup"))
|
||||
|
||||
|
||||
@pytest.fixture
def mock_ttsq():
    """Mock TTS queue exposing a separately named `stop` mock for call counting."""
    return MagicMock(name="ttsq", stop=MagicMock(name="ttsq_stop"))
|
||||
|
||||
|
||||
@pytest.fixture
def mock_text_channel():
    """Mock text channel whose `send` is awaitable."""
    channel = MagicMock(name="text_channel")
    channel.send = AsyncMock(name="text_send")
    return channel
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _make_session(
    tmp_path: Path,
    mock_bot,
    mock_voice_client,
    mock_ttsq,
    mock_text_channel,
    *,
    record_enabled: bool = True,
) -> VoiceSession:
    """Build a `VoiceSession` wired entirely to mocks.

    The transcript path lives under `tmp_path`; its file name depends on
    `record_enabled` ("transcripts.jsonl" when on, "noop.jsonl" when off).
    """
    if record_enabled:
        file_name = "transcripts.jsonl"
    else:
        file_name = "noop.jsonl"

    return VoiceSession(
        channel_id=1001,
        guild_id=42,
        text_channel=mock_text_channel,
        voice_client=mock_voice_client,
        bot=mock_bot,
        ttsq=mock_ttsq,
        whitelist={1234},
        record_enabled=record_enabled,
        mirror_enabled=True,
        transcripts_jsonl_path=tmp_path / file_name,
        loop=None,
        router_route_message=MagicMock(name="route_message"),
    )
|
||||
|
||||
|
||||
def _assert_clean_post_cleanup(
    session: VoiceSession,
    voice_client,
    ttsq,
    bot,
    jsonl_path: Path,
    record_enabled: bool,
) -> None:
    """Run the post-cleanup assertions shared by the cleanup-path tests.

    Checks, in order: the session lock can be acquired, `voice_client.cleanup`
    and `ttsq.stop` were each called exactly once, `bot.change_presence` was
    called (most recently with `activity=None`), and the JSONL file is present
    (record on) or absent/empty (record off).
    """
    # 1. Lock released — a non-blocking acquire from this thread must succeed.
    lock = session._lock
    got_lock = lock.acquire(blocking=False)
    assert got_lock, "session._lock must be released after cleanup()"
    lock.release()

    # 2. voice_client.cleanup() called exactly once.
    assert voice_client.cleanup.call_count == 1, (
        f"voice_client.cleanup() called {voice_client.cleanup.call_count}x, "
        f"expected 1"
    )

    # 3. ttsq.stop() called exactly once.
    assert ttsq.stop.call_count == 1, (
        f"ttsq.stop() called {ttsq.stop.call_count}x, expected 1"
    )

    # 4. Presence restored: at least one call, the last one with activity=None.
    assert bot.change_presence.call_count >= 1, (
        "bot.change_presence was never called — presence not restored"
    )
    bot.change_presence.assert_called_with(activity=None)

    # 5. JSONL absent or empty (record=off) OR still on disk (record=on).
    if not record_enabled:
        # record=off: cleanup unlinks the file if it ever existed.
        assert not jsonl_path.exists() or jsonl_path.stat().st_size == 0
        return

    assert jsonl_path.exists(), (
        "record=on: JSONL file must exist (was created by __enter__ and "
        "left in place by cleanup so transcript can be persisted)"
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scenario 1 — explicit /voice leave
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestCleanupOnVoiceLeave:
    """Scenario 1: an explicit `/voice leave` issued while the session is open."""

    def test_cleanup_on_voice_leave(
        self, tmp_path, mock_bot, mock_voice_client, mock_ttsq, mock_text_channel,
    ):
        """`cleanup("voice_leave")` inside the with-block drains state once;
        later cleanup calls must not repeat any side effect."""
        session = _make_session(
            tmp_path, mock_bot, mock_voice_client, mock_ttsq, mock_text_channel,
            record_enabled=True,
        )
        jsonl_path = session.transcripts_jsonl_path

        with session:
            # Simulate one transcript line.
            transcript_line = json.dumps({"text": "salut"}) + "\n"
            session._jsonl_fh.write(transcript_line)
            session.cleanup("voice_leave")
            assert session._cleaned_up is True

        # Leaving the with-block triggers cleanup again via __exit__ — the
        # shared assertions verify that second call was a no-op.
        _assert_clean_post_cleanup(
            session, mock_voice_client, mock_ttsq, mock_bot,
            jsonl_path, record_enabled=True,
        )

        # Idempotency: a third explicit call still doesn't bump counts.
        session.cleanup("redundant")
        cleanup_calls = mock_voice_client.cleanup.call_count
        stop_calls = mock_ttsq.stop.call_count
        assert cleanup_calls == 1
        assert stop_calls == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scenario 2 — Discord-level voice disconnect
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestCleanupOnDisconnect:
    """Scenario 2: a Discord-level voice disconnect, with recording disabled."""

    def test_cleanup_on_disconnect(
        self, tmp_path, mock_bot, mock_voice_client, mock_ttsq, mock_text_channel,
    ):
        """`cleanup("disconnect")` called outside any with-block drains state
        once; a repeated call must not repeat any side effect."""
        session = _make_session(
            tmp_path, mock_bot, mock_voice_client, mock_ttsq, mock_text_channel,
            record_enabled=False,
        )
        jsonl_path = session.transcripts_jsonl_path

        # Enter manually: the network drop arrives outside the with-block,
        # so __exit__ never runs in this scenario.
        session.__enter__()
        session.cleanup("disconnect")
        _assert_clean_post_cleanup(
            session, mock_voice_client, mock_ttsq, mock_bot,
            jsonl_path, record_enabled=False,
        )

        # Idempotency.
        session.cleanup("disconnect-again")
        cleanup_calls = mock_voice_client.cleanup.call_count
        stop_calls = mock_ttsq.stop.call_count
        assert cleanup_calls == 1
        assert stop_calls == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scenario 3 — crash / exception via __exit__
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestCleanupOnCrash:
    """Scenario 3: an exception escapes the with-block and __exit__ cleans up."""

    def test_cleanup_on_crash(
        self, tmp_path, mock_bot, mock_voice_client, mock_ttsq, mock_text_channel,
    ):
        """A RuntimeError raised inside the session still results in one cleanup."""
        collaborators = (mock_bot, mock_voice_client, mock_ttsq, mock_text_channel)
        session = _make_session(tmp_path, *collaborators, record_enabled=True)
        transcript_path = session.transcripts_jsonl_path

        with pytest.raises(RuntimeError, match="simulated crash"):
            with session:
                # Stand-in for the pipeline failing in the middle of a call.
                raise RuntimeError("simulated crash")

        # The context manager exit is the only cleanup trigger here, so every
        # side effect must have been performed exactly once.
        _assert_clean_post_cleanup(
            session,
            mock_voice_client,
            mock_ttsq,
            mock_bot,
            transcript_path,
            record_enabled=True,
        )

        # An outer error handler calling cleanup afterwards must be a no-op.
        session.cleanup("post-crash")
        assert mock_voice_client.cleanup.call_count == 1
        assert mock_ttsq.stop.call_count == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scenario 4 — auto-leave timer fires after 5 min inactivity
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestCleanupOnAutoLeave:
    """Scenario 4: the inactivity auto-leave timer requests cleanup."""

    def test_cleanup_on_auto_leave(
        self, tmp_path, mock_bot, mock_voice_client, mock_ttsq, mock_text_channel,
    ):
        """Timer-driven cleanup outside the with-block, recording enabled."""
        collaborators = (mock_bot, mock_voice_client, mock_ttsq, mock_text_channel)
        session = _make_session(tmp_path, *collaborators, record_enabled=True)
        transcript_path = session.transcripts_jsonl_path

        # Enter by hand: the timer fires outside of any with-block.
        session.__enter__()
        session.cleanup("auto_leave")

        _assert_clean_post_cleanup(
            session,
            mock_voice_client,
            mock_ttsq,
            mock_bot,
            transcript_path,
            record_enabled=True,
        )

        # A redundant call must not change the recorded call counts.
        session.cleanup("auto_leave_redundant")
        assert mock_voice_client.cleanup.call_count == 1
        assert mock_ttsq.stop.call_count == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scenario 5 — user leaves voice channel themselves
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestCleanupOnUserLeaves:
    """Scenario 5: the user leaves the voice channel on their own."""

    def test_cleanup_on_user_leaves_channel(
        self, tmp_path, mock_bot, mock_voice_client, mock_ttsq, mock_text_channel,
    ):
        """Cleanup called directly, as an event handler would, recording disabled."""
        collaborators = (mock_bot, mock_voice_client, mock_ttsq, mock_text_channel)
        session = _make_session(tmp_path, *collaborators, record_enabled=False)
        transcript_path = session.transcripts_jsonl_path

        # Enter by hand, then call cleanup the way the voice-state handler does.
        session.__enter__()
        session.cleanup("user_left_channel")

        _assert_clean_post_cleanup(
            session,
            mock_voice_client,
            mock_ttsq,
            mock_bot,
            transcript_path,
            record_enabled=False,
        )

        # A second notification must not call the collaborators again.
        session.cleanup("user_left_again")
        assert mock_voice_client.cleanup.call_count == 1
        assert mock_ttsq.stop.call_count == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cross-cutting: failures inside cleanup don't propagate
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestCleanupRobustness:
    """Cross-cutting check: errors raised inside cleanup do not propagate."""

    def test_cleanup_swallows_voice_client_errors(
        self, tmp_path, mock_bot, mock_voice_client, mock_ttsq, mock_text_channel,
    ):
        """A failing voice_client.cleanup() must not block the remaining steps.

        ttsq.stop() still has to run and the session lock still has to be
        released; otherwise a broken Discord state would leave the channel
        locked for good.
        """
        mock_voice_client.cleanup.side_effect = RuntimeError("vc died")
        collaborators = (mock_bot, mock_voice_client, mock_ttsq, mock_text_channel)
        session = _make_session(tmp_path, *collaborators, record_enabled=False)

        with session:
            session.cleanup("voice_leave")

        # The TTS queue was stopped exactly once despite the earlier failure.
        assert mock_ttsq.stop.call_count == 1

        # A non-blocking acquire succeeds only if cleanup released the lock.
        lock = session._lock
        acquired = lock.acquire(blocking=False)
        assert acquired, "lock must release even when voice_client.cleanup raises"
        lock.release()
|
||||
20
tools/tts.py
20
tools/tts.py
@@ -23,6 +23,24 @@ VOICES = {"M1", "M2", "M3", "M4", "M5", "F1", "F2", "F3", "F4", "F5"}
|
||||
DEFAULT_VOICE = "M2"
|
||||
DEFAULT_LANG = "ro"
|
||||
|
||||
# Punctuation Supertonic synthesis rejects with HTTP 500 (Romanian curly quotes,
|
||||
# smart dashes, ellipsis, angle quotes). Mapped to ASCII so a stray „foo" in
|
||||
# any caller's text doesn't kill the whole request.
|
||||
_TTS_PUNCT_MAP = {
    '„': '"', '“': '"', '”': '"',
    '‘': "'", '’': "'", '‚': "'",
    '«': '"', '»': '"',
    '–': '-', '—': '-',
    '…': '...',
}

# Translation table built once at import time, so every call is a single
# pass over the text instead of one full scan per mapped character.
_TTS_PUNCT_TABLE = str.maketrans(_TTS_PUNCT_MAP)


def sanitize_for_supertonic(text: str) -> str:
    """Replace Unicode punctuation Supertonic rejects with ASCII equivalents.

    Args:
        text: Text about to be sent for synthesis.

    Returns:
        ``text`` with every key of ``_TTS_PUNCT_MAP`` replaced by its mapped
        ASCII string; all other characters are returned unchanged.
    """
    # No replacement string contains a mapped character, so one translate
    # pass gives the same result as applying the replacements one by one.
    return text.translate(_TTS_PUNCT_TABLE)
|
||||
|
||||
|
||||
def synthesize(text: str, voice: str = DEFAULT_VOICE, lang: str = DEFAULT_LANG) -> dict:
|
||||
"""Call Supertonic server and save audio to a temp WAV file.
|
||||
@@ -34,6 +52,8 @@ def synthesize(text: str, voice: str = DEFAULT_VOICE, lang: str = DEFAULT_LANG)
|
||||
if not text or not text.strip():
|
||||
return {"ok": False, "error": "Text gol."}
|
||||
|
||||
text = sanitize_for_supertonic(text)
|
||||
|
||||
voice = voice.upper()
|
||||
if voice not in VOICES:
|
||||
voice = DEFAULT_VOICE
|
||||
|
||||
375
tools/voice_bench.py
Normal file
375
tools/voice_bench.py
Normal file
@@ -0,0 +1,375 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Voice latency spike benchmark — BLOCKING Pas 1 pentru voice-to-voice Discord.
|
||||
|
||||
Confirmă (sau infirmă) budget-ul STT p50 <1.5s pe hardware-ul curent.
|
||||
Generează audio RO via Supertonic la :7788, rulează faster-whisper pe sample-uri,
|
||||
raportează p50/p95 per model.
|
||||
|
||||
Decision logic:
|
||||
small.p50 < 1.5s → PASS (use small)
|
||||
small fail, tiny.p50 < 1.5s → FALLBACK_TINY (use tiny, document trade-off)
|
||||
ambele fail → FAIL (re-plan model sau hardware)
|
||||
|
||||
Output:
|
||||
tools/voice_bench_results.json — raw per-utterance + summary
|
||||
tasks/voice-bench-results.md — sumar uman cu decizie + recomandări
|
||||
exit 0 (PASS/FALLBACK_TINY) sau 1 (FAIL)
|
||||
|
||||
Usage:
|
||||
python3 tools/voice_bench.py
|
||||
python3 tools/voice_bench.py --models small,tiny --trials 3 --budget-s 1.5
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import statistics
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
|
||||
# Repository root: two directory levels above this file.
PROJECT_ROOT = Path(__file__).resolve().parents[1]

# Base URL of the Supertonic TTS server used to generate the benchmark audio.
SUPERTONIC_URL = "http://127.0.0.1:7788"

# Defaults for the CLI options defined in main().
DEFAULT_BUDGET_S = 1.5
DEFAULT_MODELS = ("small", "tiny")
DEFAULT_TRIALS = 3

# Output artifacts: raw JSON results and the human-readable markdown summary.
RESULTS_JSON = PROJECT_ROOT / "tools" / "voice_bench_results.json"
RESULTS_MD = PROJECT_ROOT / "tasks" / "voice-bench-results.md"

# (sample name, Romanian text) pairs; the name is also the WAV file stem.
UTTERANCES_RO: list[tuple[str, str]] = [
    ("short", "Salut, ce mai faci?"),
    ("conversational", "Stai puțin să mă gândesc la asta."),
    ("medium", "Am verificat în calendar și avem ședință cu echipa la trei după-amiază."),
    ("numbers", "Costul total este o sută douăzeci și trei de lei și cincizeci de bani."),
    ("question", "Marius, vrei să-ți pun pe agenda de mâine să suni la NOAA?"),
    (
        "longer",
        "Vreau să-mi reamintești diseară să verific dacă scriptul de backup a rulat corect și să trimit raportul către echipă.",
    ),
]
|
||||
|
||||
|
||||
@dataclass
class SampleResult:
    """One benchmark utterance together with the timings recorded for it."""

    name: str
    text: str
    wav_path: str
    audio_duration_s: float
    # One entry per transcription trial, in seconds.
    transcribe_latencies_s: list[float] = field(default_factory=list)
    transcribed_text: str = ""

    @property
    def median_latency_s(self) -> float:
        """Median of the recorded latencies, or ``inf`` when there are none."""
        if not self.transcribe_latencies_s:
            return float("inf")
        return statistics.median(self.transcribe_latencies_s)

    @property
    def real_time_factor(self) -> float:
        """Median latency divided by audio duration; ``inf`` for zero duration."""
        duration = self.audio_duration_s
        if duration:
            return self.median_latency_s / duration
        return float("inf")
|
||||
|
||||
|
||||
@dataclass
class ModelSummary:
    """Benchmark outcome for one model across all samples."""

    model: str
    sample_results: list[SampleResult]
    load_time_s: float
    cpu_threads: int

    @property
    def all_latencies(self) -> list[float]:
        """Every trial latency from every sample, flattened in sample order."""
        return [
            latency
            for sample in self.sample_results
            for latency in sample.transcribe_latencies_s
        ]

    @property
    def p50_s(self) -> float:
        """Median over all latencies, or ``inf`` when none were recorded."""
        latencies = self.all_latencies
        if not latencies:
            return float("inf")
        return statistics.median(latencies)

    @property
    def p95_s(self) -> float:
        """Latency at sorted index ``round(0.95 * (n - 1))``; ``inf`` if empty."""
        ordered = sorted(self.all_latencies)
        if not ordered:
            return float("inf")
        rank = int(round(0.95 * (len(ordered) - 1)))
        return ordered[max(0, rank)]

    @property
    def mean_rtf(self) -> float:
        """Mean of the per-sample real-time factors; ``inf`` with no samples."""
        factors = [sample.real_time_factor for sample in self.sample_results]
        if not factors:
            return float("inf")
        return statistics.mean(factors)
|
||||
|
||||
|
||||
def log(msg: str) -> None:
    """Print ``msg`` to stdout with a ``[voice_bench]`` prefix, flushing at once."""
    line = f"[voice_bench] {msg}"
    print(line, flush=True)
|
||||
|
||||
|
||||
def check_supertonic() -> None:
    """Probe the Supertonic speech endpoint and exit with status 2 if it fails.

    Sends one tiny synthesis request. Any exception (connection error, HTTP
    error status, timeout) is logged along with a start-up hint and the
    process exits via ``sys.exit(2)``.
    """
    endpoint = f"{SUPERTONIC_URL}/v1/audio/speech"
    probe = {
        "model": "supertonic-3",
        "input": "test",
        "voice": "M2",
        "response_format": "wav",
        "lang": "ro",
    }
    try:
        response = httpx.post(endpoint, json=probe, timeout=10.0)
        response.raise_for_status()
    except Exception as e:
        log(f"FATAL: Supertonic la {SUPERTONIC_URL} nu răspunde: {e}")
        log("Pornește cu: systemctl --user start supertonic-tts")
        sys.exit(2)
|
||||
|
||||
|
||||
def synthesize_sample(name: str, text: str, out_dir: Path) -> tuple[Path, float]:
    """Synthesize ``text`` to ``<out_dir>/<name>.wav`` and measure its duration.

    The duration is read with the stdlib ``wave`` module, so no ffmpeg is
    needed.

    Args:
        name: Sample name, used as the WAV file stem.
        text: Text sent to Supertonic as the synthesis input.
        out_dir: Directory that receives the WAV file.

    Returns:
        ``(path, duration_seconds)`` for the written WAV file.

    Raises:
        Whatever ``httpx.post`` / ``raise_for_status`` raise on request
        failure, and whatever ``wave.open`` raises on an unreadable payload.
    """
    import wave

    out_path = out_dir / f"{name}.wav"
    request_body = {
        "model": "supertonic-3",
        "input": text,
        "voice": "M2",
        "response_format": "wav",
        "lang": "ro",
    }
    response = httpx.post(
        f"{SUPERTONIC_URL}/v1/audio/speech",
        json=request_body,
        timeout=60.0,
    )
    response.raise_for_status()
    out_path.write_bytes(response.content)

    # Duration = frame count / sample rate, both taken from the WAV header.
    with wave.open(str(out_path), "rb") as wav_file:
        frame_count = wav_file.getnframes()
        sample_rate = wav_file.getframerate()
        duration = frame_count / float(sample_rate)
    return out_path, duration
|
||||
|
||||
|
||||
def benchmark_model(model_name: str, samples: list[SampleResult], trials: int, threads: int) -> ModelSummary:
    """Load one faster-whisper model and time transcription of every sample.

    Each sample is transcribed ``trials`` times. Every latency is appended to
    the sample's ``transcribe_latencies_s`` and the first trial's text is
    stored in ``transcribed_text``, so the given ``samples`` are mutated in
    place.

    Args:
        model_name: Model identifier passed to ``WhisperModel``.
        samples: Samples to transcribe.
        trials: Number of timed transcriptions per sample.
        threads: Value passed as ``cpu_threads``.

    Returns:
        A ``ModelSummary`` wrapping the same ``samples`` list plus the
        measured model load time.
    """
    from faster_whisper import WhisperModel

    log(f"Loading model '{model_name}' (compute_type=int8, threads={threads})…")
    load_started = time.perf_counter()
    model = WhisperModel(model_name, device="cpu", compute_type="int8", cpu_threads=threads)
    load_time = time.perf_counter() - load_started
    log(f" loaded in {load_time:.2f}s")

    transcribe_options = {
        "language": "ro",
        "beam_size": 1,
        "vad_filter": False,
        "without_timestamps": True,
    }
    for sample in samples:
        log(f" → '{sample.name}' ({sample.audio_duration_s:.2f}s audio) ×{trials} trials")
        for trial in range(trials):
            trial_started = time.perf_counter()
            segments, _info = model.transcribe(sample.wav_path, **transcribe_options)
            # NOTE(review): the join consumes `segments` inside the timed
            # region; presumably segments are produced lazily, so moving it
            # out would change what is measured. Confirm before reordering.
            text = " ".join(seg.text.strip() for seg in segments)
            latency = time.perf_counter() - trial_started

            sample.transcribe_latencies_s.append(latency)
            if trial == 0:
                sample.transcribed_text = text.strip()
            log(f" trial {trial+1}: {latency:.2f}s → \"{text.strip()[:70]}\"")

    return ModelSummary(
        model=model_name,
        sample_results=samples,
        load_time_s=load_time,
        cpu_threads=threads,
    )
|
||||
|
||||
|
||||
def decide(summaries: dict[str, ModelSummary], budget_s: float) -> tuple[str, str]:
    """Pick the benchmark verdict from the ``small`` and ``tiny`` p50 latencies.

    Args:
        summaries: Per-model summaries keyed by model name; only the keys
            ``"small"`` and ``"tiny"`` are consulted.
        budget_s: Latency budget in seconds; a p50 must be strictly below it.

    Returns:
        ``(decision, rationale)`` where decision is ``"PASS"`` (small fits),
        ``"FALLBACK_TINY"`` (only tiny fits) or ``"FAIL"`` (neither fits).
        A missing model is reported with a p50 of ``inf``.
    """
    missing = float("inf")
    small = summaries.get("small")
    tiny = summaries.get("tiny")

    if small and small.p50_s < budget_s:
        rationale = (
            f"small.p50={small.p50_s:.2f}s < budget {budget_s:.2f}s. "
            f"Folosește 'small'. RTF mediu {small.mean_rtf:.2f}."
        )
        return "PASS", rationale

    small_p50 = small.p50_s if small else missing

    if tiny and tiny.p50_s < budget_s:
        rationale = (
            f"small.p50={small_p50:.2f}s >= budget; "
            f"tiny.p50={tiny.p50_s:.2f}s < budget {budget_s:.2f}s. "
            f"Document fallback la 'tiny' în plan (accuracy mai slabă, latency OK)."
        )
        return "FALLBACK_TINY", rationale

    tiny_p50 = tiny.p50_s if tiny else missing
    rationale = (
        f"Ambele modele depășesc budget-ul {budget_s:.2f}s "
        f"(small.p50={small_p50:.2f}s, tiny.p50={tiny_p50:.2f}s). "
        f"Re-plan: model extern (Groq/Deepgram), upgrade hardware, sau "
        f"acceptă latență mai mare."
    )
    return "FAIL", rationale
|
||||
|
||||
|
||||
def write_json(summaries: dict[str, ModelSummary], decision: str, rationale: str,
               budget_s: float, trials: int) -> None:
    """Write the raw benchmark results to ``RESULTS_JSON``.

    The payload holds the decision, rationale, budget, trial count and, per
    model, the aggregate metrics plus a per-sample breakdown. Metrics are
    rounded to 3 decimals.

    Two safeguards keep the file readable by strict JSON parsers on any host:
    aggregate metrics that are not finite (``inf`` from a sample with no
    latencies or zero-length audio) are written as ``null`` instead of the
    non-standard token ``Infinity``, and the file is always encoded as UTF-8
    because ``ensure_ascii=False`` leaves Romanian diacritics unescaped.

    Args:
        summaries: Per-model summaries keyed by model name.
        decision: Verdict string to store.
        rationale: Human-readable explanation to store.
        budget_s: Latency budget in seconds.
        trials: Number of trials run per sample.
    """
    import math  # local import: only this function needs it

    def metric(value: float) -> Any:
        """Round to 3 decimals, mapping inf/nan to None (JSON null)."""
        return round(value, 3) if math.isfinite(value) else None

    models: dict[str, Any] = {}
    for name, s in summaries.items():
        models[name] = {
            "p50_s": metric(s.p50_s),
            "p95_s": metric(s.p95_s),
            "mean_rtf": metric(s.mean_rtf),
            "load_time_s": round(s.load_time_s, 3),
            "cpu_threads": s.cpu_threads,
            "samples": [
                {
                    "name": sr.name,
                    "text_in": sr.text,
                    "text_out": sr.transcribed_text,
                    "audio_duration_s": round(sr.audio_duration_s, 3),
                    "latencies_s": [round(x, 3) for x in sr.transcribe_latencies_s],
                    "median_latency_s": metric(sr.median_latency_s),
                    "rtf": metric(sr.real_time_factor),
                }
                for sr in s.sample_results
            ],
        }

    payload: dict[str, Any] = {
        "schema_version": 1,
        "timestamp_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "decision": decision,
        "rationale": rationale,
        "budget_s": budget_s,
        "trials_per_sample": trials,
        "models": models,
    }

    RESULTS_JSON.parent.mkdir(parents=True, exist_ok=True)
    RESULTS_JSON.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    log(f"Wrote {RESULTS_JSON}")
|
||||
|
||||
|
||||
def _hardware_context_lines() -> list[str]:
    """Collect best-effort host details (platform, CPU, memory) as markdown bullets."""
    import multiprocessing
    import platform

    bullets: list[str] = []
    # Each probe is optional: a failure only drops its own bullets.
    try:
        bullets.append(f"- Platform: {platform.platform()}")
        bullets.append(f"- CPU count (logical): {multiprocessing.cpu_count()}")
    except (NotImplementedError, OSError):
        pass
    try:
        with open("/proc/cpuinfo", encoding="utf-8", errors="replace") as f:
            model_lines = [ln for ln in f.read().split("\n") if "model name" in ln]
        if model_lines:
            bullets.append(f"- {model_lines[0].strip()}")
    except OSError:
        # /proc is absent on non-Linux hosts.
        pass
    try:
        with open("/proc/meminfo", encoding="utf-8", errors="replace") as f:
            for ln in f.read().split("\n")[:3]:
                bullets.append(f"- {ln.strip()}")
    except OSError:
        pass
    return bullets


def write_markdown(summaries: dict[str, ModelSummary], decision: str, rationale: str,
                   budget_s: float, trials: int) -> None:
    """Write the human-readable benchmark report to ``RESULTS_MD``.

    The report contains the decision and rationale, a per-model summary table
    (each p50 marked PASS/FAIL against ``budget_s``), a per-utterance table
    for every model, best-effort hardware details and a pointer to the JSON
    file. It is written as UTF-8 regardless of the host locale, since it
    contains non-ASCII punctuation and Romanian transcripts.

    Args:
        summaries: Per-model summaries keyed by model name.
        decision: Verdict string shown in the report heading.
        rationale: Explanation printed under the decision.
        budget_s: Latency budget in seconds.
        trials: Number of trials run per sample.
    """
    generated = time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime())
    lines: list[str] = [
        "# Voice Bench Results — Discord Voice-to-Voice Spike",
        "",
        f"Generated: {generated}",
        f"Budget: STT p50 < {budget_s:.2f}s (per CEO plan + eng review)",
        f"Trials per sample: {trials}",
        "",
        f"## Decision: **{decision}**",
        "",
        rationale,
        "",
        "## Per-Model Summary",
        "",
        "| Model | p50 (s) | p95 (s) | Mean RTF | Load (s) | Threads |",
        "|-------|--------:|--------:|---------:|---------:|--------:|",
    ]
    for name, s in summaries.items():
        pass_mark = "PASS" if s.p50_s < budget_s else "FAIL"
        lines.append(
            f"| {name} | {s.p50_s:.2f} ({pass_mark}) | {s.p95_s:.2f} | "
            f"{s.mean_rtf:.2f} | {s.load_time_s:.2f} | {s.cpu_threads} |"
        )

    lines += ["", "## Per-Utterance Detail", ""]
    for name, s in summaries.items():
        lines += [
            f"### {name}",
            "",
            "| Sample | Audio (s) | Median lat (s) | RTF | Trials | Transcript |",
            "|--------|----------:|---------------:|----:|--------|------------|",
        ]
        for sr in s.sample_results:
            trials_str = ", ".join(f"{x:.2f}" for x in sr.transcribe_latencies_s)
            # Escape pipes so a transcript cannot break the table row.
            transcript = sr.transcribed_text[:80].replace("|", "\\|")
            lines.append(
                f"| {sr.name} | {sr.audio_duration_s:.2f} | {sr.median_latency_s:.2f} | "
                f"{sr.real_time_factor:.2f} | {trials_str} | {transcript} |"
            )
        lines.append("")

    lines += ["## Hardware Context", ""]
    lines += _hardware_context_lines()
    lines += [
        "",
        "## Raw Data",
        "",
        f"Vezi `{RESULTS_JSON.relative_to(PROJECT_ROOT)}` pentru JSON complet.",
        "",
    ]

    RESULTS_MD.parent.mkdir(parents=True, exist_ok=True)
    RESULTS_MD.write_text("\n".join(lines), encoding="utf-8")
    log(f"Wrote {RESULTS_MD}")
|
||||
|
||||
|
||||
def main() -> int:
    """Run the voice latency benchmark from the command line.

    Stages: check that Supertonic answers, synthesize the Romanian samples
    into a temp directory, benchmark each requested model, then decide and
    write the JSON and markdown artifacts.

    The temp WAVs and their directory are removed in a ``finally`` block, so
    they are cleaned up even when synthesis or benchmarking raises; pass
    ``--keep-wavs`` to keep them.

    Returns:
        0 when the decision is PASS or FALLBACK_TINY, 1 otherwise.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's layout (decision table, usage) in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--models", default=",".join(DEFAULT_MODELS),
                    help="CSV listă de modele faster-whisper (default: small,tiny)")
    ap.add_argument("--trials", type=int, default=DEFAULT_TRIALS,
                    help=f"Trials per sample (default {DEFAULT_TRIALS})")
    ap.add_argument("--budget-s", type=float, default=DEFAULT_BUDGET_S,
                    help=f"STT p50 budget secunde (default {DEFAULT_BUDGET_S})")
    ap.add_argument("--threads", type=int, default=int(os.environ.get("VOICE_BENCH_THREADS", "2")),
                    help="cpu_threads pentru faster-whisper (default 2 — Proxmox VM)")
    ap.add_argument("--keep-wavs", action="store_true", help="Nu șterge WAV-urile temp")
    args = ap.parse_args()

    # With no trials every latency list stays empty and all percentiles
    # become inf, which would be reported as a benchmark FAIL.
    if args.trials < 1:
        ap.error("--trials trebuie să fie >= 1")

    log(f"Budget: p50 < {args.budget_s:.2f}s | Models: {args.models} | Trials: {args.trials}")
    check_supertonic()

    work_dir = Path(tempfile.mkdtemp(prefix="voice_bench_"))
    log(f"Working dir: {work_dir}")

    try:
        log("Stage 1/3: Generating RO audio samples via Supertonic…")
        samples: list[SampleResult] = []
        for name, text in UTTERANCES_RO:
            log(f" TTS '{name}': {text!r}")
            path, duration = synthesize_sample(name, text, work_dir)
            log(f" → {path.name} ({duration:.2f}s)")
            samples.append(SampleResult(name=name, text=text, wav_path=str(path),
                                        audio_duration_s=duration))

        log("Stage 2/3: Running faster-whisper benchmarks…")
        summaries: dict[str, ModelSummary] = {}
        for model_name in args.models.split(","):
            model_name = model_name.strip()
            if not model_name:
                continue
            # Fresh result objects per model so latencies are not mixed.
            fresh_samples = [
                SampleResult(name=s.name, text=s.text, wav_path=s.wav_path,
                             audio_duration_s=s.audio_duration_s)
                for s in samples
            ]
            summaries[model_name] = benchmark_model(model_name, fresh_samples,
                                                    args.trials, args.threads)

        log("Stage 3/3: Decision & artifacts…")
        decision, rationale = decide(summaries, args.budget_s)
        log(f"DECISION: {decision}")
        log(f"WHY: {rationale}")

        write_json(summaries, decision, rationale, args.budget_s, args.trials)
        write_markdown(summaries, decision, rationale, args.budget_s, args.trials)
    finally:
        if not args.keep_wavs:
            # Glob instead of walking `samples`: a WAV written just before a
            # failure is not in the list yet but must still be removed.
            for wav in work_dir.glob("*.wav"):
                try:
                    wav.unlink(missing_ok=True)
                except OSError:
                    pass
            try:
                work_dir.rmdir()
            except OSError:
                # Directory not empty or already gone; leaving it is harmless.
                pass

    return 0 if decision in ("PASS", "FALLBACK_TINY") else 1


if __name__ == "__main__":
    sys.exit(main())
|
||||
184
tools/voice_bench_results.json
Normal file
184
tools/voice_bench_results.json
Normal file
@@ -0,0 +1,184 @@
|
||||
{
|
||||
"schema_version": 1,
|
||||
"timestamp_utc": "2026-05-27T12:30:17Z",
|
||||
"decision": "FALLBACK_TINY",
|
||||
"rationale": "small.p50=2.79s >= budget; tiny.p50=0.54s < budget 1.50s. Document fallback la 'tiny' în plan (accuracy mai slabă, latency OK).",
|
||||
"budget_s": 1.5,
|
||||
"trials_per_sample": 3,
|
||||
"models": {
|
||||
"small": {
|
||||
"p50_s": 2.793,
|
||||
"p95_s": 3.308,
|
||||
"mean_rtf": 0.699,
|
||||
"load_time_s": 1.505,
|
||||
"cpu_threads": 6,
|
||||
"samples": [
|
||||
{
|
||||
"name": "short",
|
||||
"text_in": "Salut, ce mai faci?",
|
||||
"text_out": "Salut ce mai faci!",
|
||||
"audio_duration_s": 1.881,
|
||||
"latencies_s": [
|
||||
2.586,
|
||||
2.666,
|
||||
2.538
|
||||
],
|
||||
"median_latency_s": 2.586,
|
||||
"rtf": 1.375
|
||||
},
|
||||
{
|
||||
"name": "conversational",
|
||||
"text_in": "Stai puțin să mă gândesc la asta.",
|
||||
"text_out": "Stai puțin să mă gândesc la asta.",
|
||||
"audio_duration_s": 2.926,
|
||||
"latencies_s": [
|
||||
2.739,
|
||||
2.697,
|
||||
2.683
|
||||
],
|
||||
"median_latency_s": 2.697,
|
||||
"rtf": 0.922
|
||||
},
|
||||
{
|
||||
"name": "medium",
|
||||
"text_in": "Am verificat în calendar și avem ședință cu echipa la trei după-amiază.",
|
||||
"text_out": "Am verificat în calendari și avem ședință cu echipa la trei după amiază.",
|
||||
"audio_duration_s": 5.991,
|
||||
"latencies_s": [
|
||||
3.005,
|
||||
3.013,
|
||||
3.023
|
||||
],
|
||||
"median_latency_s": 3.013,
|
||||
"rtf": 0.503
|
||||
},
|
||||
{
|
||||
"name": "numbers",
|
||||
"text_in": "Costul total este o sută douăzeci și trei de lei și cincizeci de bani.",
|
||||
"text_out": "Costul total este 120 și 3 delei și 50 de bani.",
|
||||
"audio_duration_s": 5.642,
|
||||
"latencies_s": [
|
||||
2.657,
|
||||
2.698,
|
||||
2.677
|
||||
],
|
||||
"median_latency_s": 2.677,
|
||||
"rtf": 0.475
|
||||
},
|
||||
{
|
||||
"name": "question",
|
||||
"text_in": "Marius, vrei să-ți pun pe agenda de mâine să suni la NOAA?",
|
||||
"text_out": "Marius, vrei să-ți spun pe agenda de mâine să suni la noa?",
|
||||
"audio_duration_s": 5.085,
|
||||
"latencies_s": [
|
||||
2.883,
|
||||
2.85,
|
||||
2.847
|
||||
],
|
||||
"median_latency_s": 2.85,
|
||||
"rtf": 0.561
|
||||
},
|
||||
{
|
||||
"name": "longer",
|
||||
"text_in": "Vreau să-mi reamintești diseară să verific dacă scriptul de backup a rulat corect și să trimit raportul către echipă.",
|
||||
"text_out": "Vreau să mi-reamintești di seară să verific dacă scriptul de bacup a rulat corect și să trimit raportul către echipă.",
|
||||
"audio_duration_s": 9.265,
|
||||
"latencies_s": [
|
||||
3.277,
|
||||
3.428,
|
||||
3.308
|
||||
],
|
||||
"median_latency_s": 3.308,
|
||||
"rtf": 0.357
|
||||
}
|
||||
]
|
||||
},
|
||||
"tiny": {
|
||||
"p50_s": 0.541,
|
||||
"p95_s": 0.662,
|
||||
"mean_rtf": 0.138,
|
||||
"load_time_s": 0.576,
|
||||
"cpu_threads": 6,
|
||||
"samples": [
|
||||
{
|
||||
"name": "short",
|
||||
"text_in": "Salut, ce mai faci?",
|
||||
"text_out": "Salut ce mai faci",
|
||||
"audio_duration_s": 1.881,
|
||||
"latencies_s": [
|
||||
0.669,
|
||||
0.542,
|
||||
0.557
|
||||
],
|
||||
"median_latency_s": 0.557,
|
||||
"rtf": 0.296
|
||||
},
|
||||
{
|
||||
"name": "conversational",
|
||||
"text_in": "Stai puțin să mă gândesc la asta.",
|
||||
"text_out": "Stei putin să mă gândest la asta.",
|
||||
"audio_duration_s": 2.926,
|
||||
"latencies_s": [
|
||||
0.499,
|
||||
0.475,
|
||||
0.497
|
||||
],
|
||||
"median_latency_s": 0.497,
|
||||
"rtf": 0.17
|
||||
},
|
||||
{
|
||||
"name": "medium",
|
||||
"text_in": "Am verificat în calendar și avem ședință cu echipa la trei după-amiază.",
|
||||
"text_out": "Am verificat în calendar și avem sedeință cu equipala 3 dupa amiază.",
|
||||
"audio_duration_s": 5.991,
|
||||
"latencies_s": [
|
||||
0.569,
|
||||
0.606,
|
||||
0.599
|
||||
],
|
||||
"median_latency_s": 0.599,
|
||||
"rtf": 0.1
|
||||
},
|
||||
{
|
||||
"name": "numbers",
|
||||
"text_in": "Costul total este o sută douăzeci și trei de lei și cincizeci de bani.",
|
||||
"text_out": "Costul total este o suta 20 și 3 de lei și 50 de bani.",
|
||||
"audio_duration_s": 5.642,
|
||||
"latencies_s": [
|
||||
0.519,
|
||||
0.51,
|
||||
0.54
|
||||
],
|
||||
"median_latency_s": 0.519,
|
||||
"rtf": 0.092
|
||||
},
|
||||
{
|
||||
"name": "question",
|
||||
"text_in": "Marius, vrei să-ți pun pe agenda de mâine să suni la NOAA?",
|
||||
"text_out": "Marius, vrei să-ți pun pe agenda de muine să sunt la nu a.",
|
||||
"audio_duration_s": 5.085,
|
||||
"latencies_s": [
|
||||
0.51,
|
||||
0.524,
|
||||
0.522
|
||||
],
|
||||
"median_latency_s": 0.522,
|
||||
"rtf": 0.103
|
||||
},
|
||||
{
|
||||
"name": "longer",
|
||||
"text_in": "Vreau să-mi reamintești diseară să verific dacă scriptul de backup a rulat corect și să trimit raportul către echipă.",
|
||||
"text_out": "Vreau sămi rea minstești diseare să verific daca scriptul de backup a rulat correct și să trimitra portul către e kipă.",
|
||||
"audio_duration_s": 9.265,
|
||||
"latencies_s": [
|
||||
0.662,
|
||||
0.646,
|
||||
0.627
|
||||
],
|
||||
"median_latency_s": 0.646,
|
||||
"rtf": 0.07
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
184
tools/voice_bench_results_threads2.json
Normal file
184
tools/voice_bench_results_threads2.json
Normal file
@@ -0,0 +1,184 @@
|
||||
{
|
||||
"schema_version": 1,
|
||||
"timestamp_utc": "2026-05-27T12:23:08Z",
|
||||
"decision": "FALLBACK_TINY",
|
||||
"rationale": "small.p50=3.25s >= budget; tiny.p50=0.50s < budget 1.50s. Document fallback la 'tiny' în plan (accuracy mai slabă, latency OK).",
|
||||
"budget_s": 1.5,
|
||||
"trials_per_sample": 3,
|
||||
"models": {
|
||||
"small": {
|
||||
"p50_s": 3.255,
|
||||
"p95_s": 3.611,
|
||||
"mean_rtf": 0.801,
|
||||
"load_time_s": 10.633,
|
||||
"cpu_threads": 2,
|
||||
"samples": [
|
||||
{
|
||||
"name": "short",
|
||||
"text_in": "Salut, ce mai faci?",
|
||||
"text_out": "Salut ce mai faci!",
|
||||
"audio_duration_s": 1.881,
|
||||
"latencies_s": [
|
||||
3.236,
|
||||
2.952,
|
||||
2.945
|
||||
],
|
||||
"median_latency_s": 2.952,
|
||||
"rtf": 1.569
|
||||
},
|
||||
{
|
||||
"name": "conversational",
|
||||
"text_in": "Stai puțin să mă gândesc la asta.",
|
||||
"text_out": "Stai puțin să mă gândesc la asta.",
|
||||
"audio_duration_s": 2.926,
|
||||
"latencies_s": [
|
||||
3.095,
|
||||
3.099,
|
||||
3.126
|
||||
],
|
||||
"median_latency_s": 3.099,
|
||||
"rtf": 1.059
|
||||
},
|
||||
{
|
||||
"name": "medium",
|
||||
"text_in": "Am verificat în calendar și avem ședință cu echipa la trei după-amiază.",
|
||||
"text_out": "Am verificat în calendari și avem sedință cu echipa la 3 după amiază.",
|
||||
"audio_duration_s": 5.991,
|
||||
"latencies_s": [
|
||||
3.437,
|
||||
3.419,
|
||||
3.342
|
||||
],
|
||||
"median_latency_s": 3.419,
|
||||
"rtf": 0.571
|
||||
},
|
||||
{
|
||||
"name": "numbers",
|
||||
"text_in": "Costul total este o sută douăzeci și trei de lei și cincizeci de bani.",
|
||||
"text_out": "Costul total este 120 și 3 delei și 5-10 de bani.",
|
||||
"audio_duration_s": 5.642,
|
||||
"latencies_s": [
|
||||
3.24,
|
||||
3.207,
|
||||
3.237
|
||||
],
|
||||
"median_latency_s": 3.237,
|
||||
"rtf": 0.574
|
||||
},
|
||||
{
|
||||
"name": "question",
|
||||
"text_in": "Marius, vrei să-ți pun pe agenda de mâine să suni la NOAA?",
|
||||
"text_out": "Marius, vrei să-ți spun pe agenda de mâine să suni la noa?",
|
||||
"audio_duration_s": 5.085,
|
||||
"latencies_s": [
|
||||
3.329,
|
||||
3.27,
|
||||
3.278
|
||||
],
|
||||
"median_latency_s": 3.278,
|
||||
"rtf": 0.645
|
||||
},
|
||||
{
|
||||
"name": "longer",
|
||||
"text_in": "Vreau să-mi reamintești diseară să verific dacă scriptul de backup a rulat corect și să trimit raportul către echipă.",
|
||||
"text_out": "Vreau să mi-reamintești, di seară, să verific dacă scriptul de bacup a rulat corect și să trimit raportul către echipă.",
|
||||
"audio_duration_s": 9.265,
|
||||
"latencies_s": [
|
||||
3.626,
|
||||
3.611,
|
||||
3.563
|
||||
],
|
||||
"median_latency_s": 3.611,
|
||||
"rtf": 0.39
|
||||
}
|
||||
]
|
||||
},
|
||||
"tiny": {
|
||||
"p50_s": 0.505,
|
||||
"p95_s": 0.556,
|
||||
"mean_rtf": 0.122,
|
||||
"load_time_s": 3.15,
|
||||
"cpu_threads": 2,
|
||||
"samples": [
|
||||
{
|
||||
"name": "short",
|
||||
"text_in": "Salut, ce mai faci?",
|
||||
"text_out": "Salute mai face?",
|
||||
"audio_duration_s": 1.881,
|
||||
"latencies_s": [
|
||||
0.438,
|
||||
0.449,
|
||||
0.443
|
||||
],
|
||||
"median_latency_s": 0.443,
|
||||
"rtf": 0.235
|
||||
},
|
||||
{
|
||||
"name": "conversational",
|
||||
"text_in": "Stai puțin să mă gândesc la asta.",
|
||||
"text_out": "Stei putin să mă gândesc la asta.",
|
||||
"audio_duration_s": 2.926,
|
||||
"latencies_s": [
|
||||
0.477,
|
||||
0.476,
|
||||
0.47
|
||||
],
|
||||
"median_latency_s": 0.476,
|
||||
"rtf": 0.163
|
||||
},
|
||||
{
|
||||
"name": "medium",
|
||||
"text_in": "Am verificat în calendar și avem ședință cu echipa la trei după-amiază.",
|
||||
"text_out": "Am verificat în calendar și avem sedeință cu equipala 3 dupa am iază.",
|
||||
"audio_duration_s": 5.991,
|
||||
"latencies_s": [
|
||||
0.506,
|
||||
0.514,
|
||||
0.505
|
||||
],
|
||||
"median_latency_s": 0.506,
|
||||
"rtf": 0.084
|
||||
},
|
||||
{
|
||||
"name": "numbers",
|
||||
"text_in": "Costul total este o sută douăzeci și trei de lei și cincizeci de bani.",
|
||||
"text_out": "Costul total este o suta doozec și trei de lei și 50 de bani.",
|
||||
"audio_duration_s": 5.642,
|
||||
"latencies_s": [
|
||||
0.504,
|
||||
0.522,
|
||||
0.493
|
||||
],
|
||||
"median_latency_s": 0.504,
|
||||
"rtf": 0.089
|
||||
},
|
||||
{
|
||||
"name": "question",
|
||||
"text_in": "Marius, vrei să-ți pun pe agenda de mâine să suni la NOAA?",
|
||||
"text_out": "Marius, vrei să-ți pun pe agenda de muină să sunilă nu a.",
|
||||
"audio_duration_s": 5.085,
|
||||
"latencies_s": [
|
||||
0.509,
|
||||
0.504,
|
||||
0.529
|
||||
],
|
||||
"median_latency_s": 0.509,
|
||||
"rtf": 0.1
|
||||
},
|
||||
{
|
||||
"name": "longer",
|
||||
"text_in": "Vreau să-mi reamintești diseară să verific dacă scriptul de backup a rulat corect și să trimit raportul către echipă.",
|
||||
"text_out": "Vreau să mire am in test, disiară să verific dacă scriptul de backup a rulat correct și să trimitra portul că trea equipă.",
|
||||
"audio_duration_s": 9.265,
|
||||
"latencies_s": [
|
||||
0.556,
|
||||
0.535,
|
||||
0.571
|
||||
],
|
||||
"median_latency_s": 0.556,
|
||||
"rtf": 0.06
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
184
tools/voice_bench_results_threads4.json
Normal file
184
tools/voice_bench_results_threads4.json
Normal file
@@ -0,0 +1,184 @@
|
||||
{
|
||||
"schema_version": 1,
|
||||
"timestamp_utc": "2026-05-27T12:24:48Z",
|
||||
"decision": "FALLBACK_TINY",
|
||||
"rationale": "small.p50=2.25s >= budget; tiny.p50=0.48s < budget 1.50s. Document fallback la 'tiny' în plan (accuracy mai slabă, latency OK).",
|
||||
"budget_s": 1.5,
|
||||
"trials_per_sample": 3,
|
||||
"models": {
|
||||
"small": {
|
||||
"p50_s": 2.249,
|
||||
"p95_s": 2.532,
|
||||
"mean_rtf": 0.54,
|
||||
"load_time_s": 1.339,
|
||||
"cpu_threads": 4,
|
||||
"samples": [
|
||||
{
|
||||
"name": "short",
|
||||
"text_in": "Salut, ce mai faci?",
|
||||
"text_out": "Salut ce mai faci!",
|
||||
"audio_duration_s": 1.881,
|
||||
"latencies_s": [
|
||||
2.068,
|
||||
1.951,
|
||||
1.947
|
||||
],
|
||||
"median_latency_s": 1.951,
|
||||
"rtf": 1.038
|
||||
},
|
||||
{
|
||||
"name": "conversational",
|
||||
"text_in": "Stai puțin să mă gândesc la asta.",
|
||||
"text_out": "Stai putin să mă gândesc la asta.",
|
||||
"audio_duration_s": 2.926,
|
||||
"latencies_s": [
|
||||
2.092,
|
||||
2.06,
|
||||
2.072
|
||||
],
|
||||
"median_latency_s": 2.072,
|
||||
"rtf": 0.708
|
||||
},
|
||||
{
|
||||
"name": "medium",
|
||||
"text_in": "Am verificat în calendar și avem ședință cu echipa la trei după-amiază.",
|
||||
"text_out": "Am verificat în calendari și avem sedință cu echipa la 3 după amiază.",
|
||||
"audio_duration_s": 5.991,
|
||||
"latencies_s": [
|
||||
2.235,
|
||||
2.283,
|
||||
2.48
|
||||
],
|
||||
"median_latency_s": 2.283,
|
||||
"rtf": 0.381
|
||||
},
|
||||
{
|
||||
"name": "numbers",
|
||||
"text_in": "Costul total este o sută douăzeci și trei de lei și cincizeci de bani.",
|
||||
"text_out": "Costul total este 120 și 3 delei și 50 de bani.",
|
||||
"audio_duration_s": 5.642,
|
||||
"latencies_s": [
|
||||
2.285,
|
||||
2.264,
|
||||
2.303
|
||||
],
|
||||
"median_latency_s": 2.285,
|
||||
"rtf": 0.405
|
||||
},
|
||||
{
|
||||
"name": "question",
|
||||
"text_in": "Marius, vrei să-ți pun pe agenda de mâine să suni la NOAA?",
|
||||
"text_out": "Marius, vrei să-ți spun pe agenda de mâine să suni la noa a.",
|
||||
"audio_duration_s": 5.085,
|
||||
"latencies_s": [
|
||||
2.279,
|
||||
2.205,
|
||||
2.21
|
||||
],
|
||||
"median_latency_s": 2.21,
|
||||
"rtf": 0.435
|
||||
},
|
||||
{
|
||||
"name": "longer",
|
||||
"text_in": "Vreau să-mi reamintești diseară să verific dacă scriptul de backup a rulat corect și să trimit raportul către echipă.",
|
||||
"text_out": "Vreau să mi-răimintești di seară să verific dacă scriptul de bacup a rulat corect și să trimit raportul către echipă.",
|
||||
"audio_duration_s": 9.265,
|
||||
"latencies_s": [
|
||||
2.639,
|
||||
2.532,
|
||||
2.528
|
||||
],
|
||||
"median_latency_s": 2.532,
|
||||
"rtf": 0.273
|
||||
}
|
||||
]
|
||||
},
|
||||
"tiny": {
|
||||
"p50_s": 0.481,
|
||||
"p95_s": 0.574,
|
||||
"mean_rtf": 0.117,
|
||||
"load_time_s": 0.541,
|
||||
"cpu_threads": 4,
|
||||
"samples": [
|
||||
{
|
||||
"name": "short",
|
||||
"text_in": "Salut, ce mai faci?",
|
||||
"text_out": "Salut, ce mai fac?",
|
||||
"audio_duration_s": 1.881,
|
||||
"latencies_s": [
|
||||
0.453,
|
||||
0.417,
|
||||
0.411
|
||||
],
|
||||
"median_latency_s": 0.417,
|
||||
"rtf": 0.222
|
||||
},
|
||||
{
|
||||
"name": "conversational",
|
||||
"text_in": "Stai puțin să mă gândesc la asta.",
|
||||
"text_out": "Stei putin să mă gândesc la asta.",
|
||||
"audio_duration_s": 2.926,
|
||||
"latencies_s": [
|
||||
0.429,
|
||||
0.449,
|
||||
0.463
|
||||
],
|
||||
"median_latency_s": 0.449,
|
||||
"rtf": 0.153
|
||||
},
|
||||
{
|
||||
"name": "medium",
|
||||
"text_in": "Am verificat în calendar și avem ședință cu echipa la trei după-amiază.",
|
||||
"text_out": "Am verificat în calendar și avem sedeință cu equipala 3 du pămiază.",
|
||||
"audio_duration_s": 5.991,
|
||||
"latencies_s": [
|
||||
0.499,
|
||||
0.495,
|
||||
0.504
|
||||
],
|
||||
"median_latency_s": 0.499,
|
||||
"rtf": 0.083
|
||||
},
|
||||
{
|
||||
"name": "numbers",
|
||||
"text_in": "Costul total este o sută douăzeci și trei de lei și cincizeci de bani.",
|
||||
"text_out": "Costul total este o suta 20 și 3 de lei și 50 de bani.",
|
||||
"audio_duration_s": 5.642,
|
||||
"latencies_s": [
|
||||
0.491,
|
||||
0.487,
|
||||
0.456
|
||||
],
|
||||
"median_latency_s": 0.487,
|
||||
"rtf": 0.086
|
||||
},
|
||||
{
|
||||
"name": "question",
|
||||
"text_in": "Marius, vrei să-ți pun pe agenda de mâine să suni la NOAA?",
|
||||
"text_out": "Marius, vrei să-ți pun pe agenda de muină să sun la nu a.",
|
||||
"audio_duration_s": 5.085,
|
||||
"latencies_s": [
|
||||
0.474,
|
||||
0.468,
|
||||
0.505
|
||||
],
|
||||
"median_latency_s": 0.474,
|
||||
"rtf": 0.093
|
||||
},
|
||||
{
|
||||
"name": "longer",
|
||||
"text_in": "Vreau să-mi reamintești diseară să verific dacă scriptul de backup a rulat corect și să trimit raportul către echipă.",
|
||||
"text_out": "Vreau să mream in test de seare să verific dacă scriptul de bakup a rulat correct și să trimitra portul că trea equipă.",
|
||||
"audio_duration_s": 9.265,
|
||||
"latencies_s": [
|
||||
0.574,
|
||||
0.532,
|
||||
0.575
|
||||
],
|
||||
"median_latency_s": 0.574,
|
||||
"rtf": 0.062
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
184
tools/voice_bench_results_threads6.json
Normal file
184
tools/voice_bench_results_threads6.json
Normal file
@@ -0,0 +1,184 @@
|
||||
{
|
||||
"schema_version": 1,
|
||||
"timestamp_utc": "2026-05-27T12:30:17Z",
|
||||
"decision": "FALLBACK_TINY",
|
||||
"rationale": "small.p50=2.79s >= budget; tiny.p50=0.54s < budget 1.50s. Document fallback la 'tiny' în plan (accuracy mai slabă, latency OK).",
|
||||
"budget_s": 1.5,
|
||||
"trials_per_sample": 3,
|
||||
"models": {
|
||||
"small": {
|
||||
"p50_s": 2.793,
|
||||
"p95_s": 3.308,
|
||||
"mean_rtf": 0.699,
|
||||
"load_time_s": 1.505,
|
||||
"cpu_threads": 6,
|
||||
"samples": [
|
||||
{
|
||||
"name": "short",
|
||||
"text_in": "Salut, ce mai faci?",
|
||||
"text_out": "Salut ce mai faci!",
|
||||
"audio_duration_s": 1.881,
|
||||
"latencies_s": [
|
||||
2.586,
|
||||
2.666,
|
||||
2.538
|
||||
],
|
||||
"median_latency_s": 2.586,
|
||||
"rtf": 1.375
|
||||
},
|
||||
{
|
||||
"name": "conversational",
|
||||
"text_in": "Stai puțin să mă gândesc la asta.",
|
||||
"text_out": "Stai puțin să mă gândesc la asta.",
|
||||
"audio_duration_s": 2.926,
|
||||
"latencies_s": [
|
||||
2.739,
|
||||
2.697,
|
||||
2.683
|
||||
],
|
||||
"median_latency_s": 2.697,
|
||||
"rtf": 0.922
|
||||
},
|
||||
{
|
||||
"name": "medium",
|
||||
"text_in": "Am verificat în calendar și avem ședință cu echipa la trei după-amiază.",
|
||||
"text_out": "Am verificat în calendari și avem ședință cu echipa la trei după amiază.",
|
||||
"audio_duration_s": 5.991,
|
||||
"latencies_s": [
|
||||
3.005,
|
||||
3.013,
|
||||
3.023
|
||||
],
|
||||
"median_latency_s": 3.013,
|
||||
"rtf": 0.503
|
||||
},
|
||||
{
|
||||
"name": "numbers",
|
||||
"text_in": "Costul total este o sută douăzeci și trei de lei și cincizeci de bani.",
|
||||
"text_out": "Costul total este 120 și 3 delei și 50 de bani.",
|
||||
"audio_duration_s": 5.642,
|
||||
"latencies_s": [
|
||||
2.657,
|
||||
2.698,
|
||||
2.677
|
||||
],
|
||||
"median_latency_s": 2.677,
|
||||
"rtf": 0.475
|
||||
},
|
||||
{
|
||||
"name": "question",
|
||||
"text_in": "Marius, vrei să-ți pun pe agenda de mâine să suni la NOAA?",
|
||||
"text_out": "Marius, vrei să-ți spun pe agenda de mâine să suni la noa?",
|
||||
"audio_duration_s": 5.085,
|
||||
"latencies_s": [
|
||||
2.883,
|
||||
2.85,
|
||||
2.847
|
||||
],
|
||||
"median_latency_s": 2.85,
|
||||
"rtf": 0.561
|
||||
},
|
||||
{
|
||||
"name": "longer",
|
||||
"text_in": "Vreau să-mi reamintești diseară să verific dacă scriptul de backup a rulat corect și să trimit raportul către echipă.",
|
||||
"text_out": "Vreau să mi-reamintești di seară să verific dacă scriptul de bacup a rulat corect și să trimit raportul către echipă.",
|
||||
"audio_duration_s": 9.265,
|
||||
"latencies_s": [
|
||||
3.277,
|
||||
3.428,
|
||||
3.308
|
||||
],
|
||||
"median_latency_s": 3.308,
|
||||
"rtf": 0.357
|
||||
}
|
||||
]
|
||||
},
|
||||
"tiny": {
|
||||
"p50_s": 0.541,
|
||||
"p95_s": 0.662,
|
||||
"mean_rtf": 0.138,
|
||||
"load_time_s": 0.576,
|
||||
"cpu_threads": 6,
|
||||
"samples": [
|
||||
{
|
||||
"name": "short",
|
||||
"text_in": "Salut, ce mai faci?",
|
||||
"text_out": "Salut ce mai faci",
|
||||
"audio_duration_s": 1.881,
|
||||
"latencies_s": [
|
||||
0.669,
|
||||
0.542,
|
||||
0.557
|
||||
],
|
||||
"median_latency_s": 0.557,
|
||||
"rtf": 0.296
|
||||
},
|
||||
{
|
||||
"name": "conversational",
|
||||
"text_in": "Stai puțin să mă gândesc la asta.",
|
||||
"text_out": "Stei putin să mă gândest la asta.",
|
||||
"audio_duration_s": 2.926,
|
||||
"latencies_s": [
|
||||
0.499,
|
||||
0.475,
|
||||
0.497
|
||||
],
|
||||
"median_latency_s": 0.497,
|
||||
"rtf": 0.17
|
||||
},
|
||||
{
|
||||
"name": "medium",
|
||||
"text_in": "Am verificat în calendar și avem ședință cu echipa la trei după-amiază.",
|
||||
"text_out": "Am verificat în calendar și avem sedeință cu equipala 3 dupa amiază.",
|
||||
"audio_duration_s": 5.991,
|
||||
"latencies_s": [
|
||||
0.569,
|
||||
0.606,
|
||||
0.599
|
||||
],
|
||||
"median_latency_s": 0.599,
|
||||
"rtf": 0.1
|
||||
},
|
||||
{
|
||||
"name": "numbers",
|
||||
"text_in": "Costul total este o sută douăzeci și trei de lei și cincizeci de bani.",
|
||||
"text_out": "Costul total este o suta 20 și 3 de lei și 50 de bani.",
|
||||
"audio_duration_s": 5.642,
|
||||
"latencies_s": [
|
||||
0.519,
|
||||
0.51,
|
||||
0.54
|
||||
],
|
||||
"median_latency_s": 0.519,
|
||||
"rtf": 0.092
|
||||
},
|
||||
{
|
||||
"name": "question",
|
||||
"text_in": "Marius, vrei să-ți pun pe agenda de mâine să suni la NOAA?",
|
||||
"text_out": "Marius, vrei să-ți pun pe agenda de muine să sunt la nu a.",
|
||||
"audio_duration_s": 5.085,
|
||||
"latencies_s": [
|
||||
0.51,
|
||||
0.524,
|
||||
0.522
|
||||
],
|
||||
"median_latency_s": 0.522,
|
||||
"rtf": 0.103
|
||||
},
|
||||
{
|
||||
"name": "longer",
|
||||
"text_in": "Vreau să-mi reamintești diseară să verific dacă scriptul de backup a rulat corect și să trimit raportul către echipă.",
|
||||
"text_out": "Vreau sămi rea minstești diseare să verific daca scriptul de backup a rulat correct și să trimitra portul către e kipă.",
|
||||
"audio_duration_s": 9.265,
|
||||
"latencies_s": [
|
||||
0.662,
|
||||
0.646,
|
||||
0.627
|
||||
],
|
||||
"median_latency_s": 0.646,
|
||||
"rtf": 0.07
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
273
tools/voice_setup.py
Normal file
273
tools/voice_setup.py
Normal file
@@ -0,0 +1,273 @@
|
||||
"""
|
||||
voice_setup.py — One-shot setup for Discord voice pipeline.
|
||||
|
||||
Run after `pip install -r requirements.txt`. Idempotent.
|
||||
|
||||
Steps:
|
||||
1. Verify libopus0 loaded by discord.py (apt install libopus0 if missing)
|
||||
2. Verify ffmpeg in PATH
|
||||
3. Verify Supertonic TTS reachable at :7788
|
||||
4. Warm faster-whisper small int8 (downloads to ~/.cache/huggingface/ if cold)
|
||||
5. Warm silero-vad
|
||||
6. Generate assets/voice/{beep_200ms,mhm,thinking}.wav via Supertonic + ffmpeg
|
||||
|
||||
Exit code: 0 = all green, 1 = something needs human intervention.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parent.parent
|
||||
ASSETS_DIR = REPO_ROOT / "assets" / "voice"
|
||||
SUPERTONIC_URL = "http://127.0.0.1:7788/v1/audio/speech"
|
||||
SUPERTONIC_VOICE = "M2"
|
||||
|
||||
GREEN = "\033[32m"
|
||||
RED = "\033[31m"
|
||||
YELLOW = "\033[33m"
|
||||
RESET = "\033[0m"
|
||||
|
||||
|
||||
def _ok(msg: str) -> None:
    """Print *msg* behind a green ``[ OK ]`` status tag."""
    tag = f"{GREEN}[ OK ]{RESET}"
    print(f"{tag} {msg}")
|
||||
|
||||
|
||||
def _fail(msg: str) -> None:
    """Print *msg* behind a red ``[FAIL]`` status tag."""
    tag = f"{RED}[FAIL]{RESET}"
    print(f"{tag} {msg}")
|
||||
|
||||
|
||||
def _warn(msg: str) -> None:
    """Print *msg* behind a yellow ``[WARN]`` status tag."""
    tag = f"{YELLOW}[WARN]{RESET}"
    print(f"{tag} {msg}")
|
||||
|
||||
|
||||
def check_libopus() -> bool:
    """Report whether discord.py has the opus library loaded.

    If opus is not loaded yet, one best-effort call to discord.py's default
    loader is made before giving up.

    Returns:
        True when opus is loaded (immediately or after the fallback load),
        False when discord.py is not importable or opus cannot be loaded.
    """
    try:
        import discord
    except ImportError:
        _fail("discord.py not installed — run `pip install -r requirements.txt`")
        return False

    opus = discord.opus
    if not opus.is_loaded():
        # Best effort: any loader error is ignored, the re-check below decides.
        try:
            opus._load_default()
        except Exception:
            pass

        if opus.is_loaded():
            _ok("libopus loaded after fallback")
            return True

        _fail(
            "libopus NOT loaded — Discord voice will fail silent. "
            "Run: sudo apt install -y libopus0"
        )
        return False

    _ok("libopus loaded (discord.py)")
    return True
|
||||
|
||||
|
||||
def check_ffmpeg() -> bool:
    """Report whether an ``ffmpeg`` executable can be found on PATH.

    Returns:
        True (and prints the resolved path) when found, False otherwise.
    """
    ffmpeg_path = shutil.which("ffmpeg")
    if ffmpeg_path:
        _ok(f"ffmpeg at {ffmpeg_path}")
        return True
    _fail("ffmpeg not in PATH — required for audio asset generation")
    return False
|
||||
|
||||
|
||||
def check_supertonic() -> bool:
    """Probe the Supertonic TTS endpoint with a tiny synthesis request.

    POSTs a one-word request to ``SUPERTONIC_URL`` with a 5-second timeout.

    Returns:
        True only when the server answers with HTTP 200. False, after
        printing a FAIL line, on an HTTP error status, a network failure
        or a timeout. This function never raises for those conditions.
    """
    payload = {
        "model": "supertonic-3",
        "input": "test",
        "voice": SUPERTONIC_VOICE,
        "response_format": "wav",
        "lang": "ro",
    }
    req = urllib.request.Request(
        SUPERTONIC_URL,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=5) as resp:
            status = resp.status
    except urllib.error.HTTPError as e:
        # HTTPError subclasses URLError, so it has to be handled first: the
        # server IS reachable here, it just rejected the request.
        _fail(f"Supertonic returned HTTP {e.code} at {SUPERTONIC_URL}")
        return False
    except OSError as e:
        # URLError, ConnectionError and socket timeouts are all OSError
        # subclasses. A timeout while waiting for the response is raised
        # unwrapped, so catching URLError alone would let it escape.
        _fail(f"Supertonic unreachable at :7788 — {e}. Start: systemctl --user start supertonic-tts")
        return False

    if status == 200:
        _ok(f"Supertonic up at {SUPERTONIC_URL}")
        return True

    _fail("Supertonic returned non-200")
    return False
|
||||
|
||||
|
||||
def warm_whisper() -> bool:
    """Instantiate the faster-whisper ``small`` int8 CPU model once.

    Constructing the model is what triggers the download when it is not
    cached yet, per the progress message printed below.

    Returns:
        True when the model was constructed, False when faster-whisper is
        not installed or construction raised.
    """
    try:
        from faster_whisper import WhisperModel
    except ImportError:
        _fail("faster-whisper not installed")
        return False

    print("  Warming faster-whisper small int8 (downloads if cold)...")
    started = time.perf_counter()
    try:
        WhisperModel("small", device="cpu", compute_type="int8", cpu_threads=4)
        took = time.perf_counter() - started
        _ok(f"faster-whisper small int8 warm ({took:.1f}s)")
    except Exception as e:
        _fail(f"faster-whisper warm failed: {e}")
        return False
    return True
|
||||
|
||||
|
||||
def warm_silero() -> bool:
    """Load the silero-vad model once and report how long it took.

    Returns:
        True when the model loaded, False when silero-vad is not installed
        or loading raised.
    """
    try:
        from silero_vad import load_silero_vad
    except ImportError:
        _fail("silero-vad not installed")
        return False

    print("  Warming silero-vad...")
    started = time.perf_counter()
    try:
        load_silero_vad()
        took = time.perf_counter() - started
        _ok(f"silero-vad warm ({took:.1f}s)")
    except Exception as e:
        _fail(f"silero-vad warm failed: {e}")
        return False
    return True
|
||||
|
||||
|
||||
def _supertonic_synth(text: str, out_path: Path) -> bool:
    """Synthesize *text* via Supertonic and write the response to *out_path*.

    Args:
        text: Text sent as the ``input`` field of the synthesis request.
        out_path: Destination file; parent directories are created.

    Returns:
        True when a non-empty response body was written to *out_path*.
        False, after printing a FAIL line, on any request or write error,
        or when the server returned an empty body.
    """
    payload = {
        "model": "supertonic-3",
        "input": text,
        "voice": SUPERTONIC_VOICE,
        "response_format": "wav",
        "lang": "ro",
    }
    req = urllib.request.Request(
        SUPERTONIC_URL,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            wav_bytes = resp.read()
        if not wav_bytes:
            # An empty body would otherwise be written as a 0-byte file and
            # reported to the caller as a successful generation.
            _fail(f"Supertonic synth failed for {out_path.name}: empty response body")
            return False
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_bytes(wav_bytes)
        return True
    except Exception as e:
        _fail(f"Supertonic synth failed for {out_path.name}: {e}")
        return False
|
||||
|
||||
|
||||
def gen_thinking_wav() -> bool:
    """Ensure ``thinking.wav`` exists under ``ASSETS_DIR``.

    An existing file larger than 1024 bytes is kept as-is; otherwise the
    clip is synthesized through Supertonic.

    Returns:
        True when the file already existed or was generated, else False.
    """
    target = ASSETS_DIR / "thinking.wav"
    if target.exists():
        size = target.stat().st_size
        if size > 1024:
            _ok(f"thinking.wav exists ({size} bytes)")
            return True

    print("  Generating thinking.wav via Supertonic...")
    if not _supertonic_synth("Stai puțin să-mi adun gândurile.", target):
        return False
    _ok(f"thinking.wav generated ({target.stat().st_size} bytes)")
    return True
|
||||
|
||||
|
||||
def gen_mhm_wav() -> bool:
    """Ensure ``mhm.wav`` exists under ``ASSETS_DIR``.

    An existing file larger than 512 bytes is kept as-is; otherwise the
    clip is synthesized through Supertonic.

    Returns:
        True when the file already existed or was generated, else False.
    """
    target = ASSETS_DIR / "mhm.wav"
    if target.exists():
        size = target.stat().st_size
        if size > 512:
            _ok(f"mhm.wav exists ({size} bytes)")
            return True

    print("  Generating mhm.wav via Supertonic...")
    if not _supertonic_synth("Mhm.", target):
        return False
    _ok(f"mhm.wav generated ({target.stat().st_size} bytes)")
    return True
|
||||
|
||||
|
||||
def gen_beep_wav() -> bool:
    """Ensure ``beep_200ms.wav`` exists under ``ASSETS_DIR``.

    An existing file larger than 512 bytes is kept as-is; otherwise ffmpeg
    renders a 200 ms, 880 Hz sine tone (48 kHz, stereo, faded out, at 0.3
    volume).

    Returns:
        True when the file already existed or was generated. False, after
        printing a FAIL line, when ffmpeg exits non-zero or cannot be
        started at all.
    """
    path = ASSETS_DIR / "beep_200ms.wav"
    if path.exists() and path.stat().st_size > 512:
        _ok(f"beep_200ms.wav exists ({path.stat().st_size} bytes)")
        return True

    print("  Generating beep_200ms.wav via ffmpeg (880Hz sine, 200ms)...")
    path.parent.mkdir(parents=True, exist_ok=True)
    command = [
        "ffmpeg",
        "-y",
        "-loglevel",
        "error",
        "-f",
        "lavfi",
        "-i",
        "sine=frequency=880:duration=0.2:sample_rate=48000",
        "-af",
        "afade=t=out:st=0.15:d=0.05,volume=0.3",
        "-ac",
        "2",
        str(path),
    ]
    try:
        subprocess.run(command, check=True)
        _ok(f"beep_200ms.wav generated ({path.stat().st_size} bytes)")
        return True
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers ffmpeg being missing or not executable when this
        # function is called without a prior PATH check, and a missing
        # output file on stat(); report FAIL instead of a traceback.
        _fail(f"ffmpeg beep gen failed: {e}")
        return False
|
||||
|
||||
|
||||
def main() -> int:
    """Run every setup check in order and print a summary.

    Asset generation is skipped (and recorded as failed) when the tool it
    depends on did not pass: Supertonic for ``thinking.wav`` / ``mhm.wav``,
    ffmpeg for ``beep_200ms.wav``.

    Returns:
        0 when every check passed, 1 when at least one failed.
    """
    print("voice_setup.py — Discord voice pipeline setup\n")

    # Keep the two prerequisite results by name instead of by list position.
    opus_ok = check_libopus()
    ffmpeg_ok = check_ffmpeg()
    supertonic_ok = check_supertonic()

    checks: list[tuple[str, bool]] = [
        ("libopus", opus_ok),
        ("ffmpeg", ffmpeg_ok),
        ("Supertonic", supertonic_ok),
        ("faster-whisper", warm_whisper()),
        ("silero-vad", warm_silero()),
    ]

    if supertonic_ok:
        checks.append(("thinking.wav", gen_thinking_wav()))
        checks.append(("mhm.wav", gen_mhm_wav()))
    else:
        _warn("Skipping thinking.wav / mhm.wav generation — Supertonic down")
        checks.extend([("thinking.wav", False), ("mhm.wav", False)])

    if ffmpeg_ok:
        checks.append(("beep_200ms.wav", gen_beep_wav()))
    else:
        _warn("Skipping beep_200ms.wav — ffmpeg missing")
        checks.append(("beep_200ms.wav", False))

    print()
    failed = [label for label, passed in checks if not passed]
    if not failed:
        print(f"{GREEN}ALL GREEN{RESET} ({len(checks)} checks). Voice pipeline ready.")
        return 0

    print(f"{RED}FAILED:{RESET} {len(failed)}/{len(checks)} — fix above before /voice join works:")
    for label in failed:
        print(f"  - {label}")
    return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
132
vendor/discord-ext-voice-recv/.gitignore
vendored
Normal file
132
vendor/discord-ext-voice-recv/.gitignore
vendored
Normal file
@@ -0,0 +1,132 @@
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
pip-wheel-metadata/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
.python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
.vscode/
|
||||
*.code-*
|
||||
21
vendor/discord-ext-voice-recv/LICENSE
vendored
Normal file
21
vendor/discord-ext-voice-recv/LICENSE
vendored
Normal file
@@ -0,0 +1,21 @@
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015-present Imayhaveborkedit
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a
|
||||
copy of this software and associated documentation files (the "Software"),
|
||||
to deal in the Software without restriction, including without limitation
|
||||
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
and/or sell copies of the Software, and to permit persons to whom the
|
||||
Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
230
vendor/discord-ext-voice-recv/README.md
vendored
Normal file
230
vendor/discord-ext-voice-recv/README.md
vendored
Normal file
@@ -0,0 +1,230 @@
|
||||

|
||||
|
||||
# discord-ext-voice-recv
|
||||
Voice receive extension package for discord.py
|
||||
|
||||
## Warning
|
||||
**This extension should be more or less functional, but the code is not yet feature complete. No guarantees are given for stability or random breaking changes.**
|
||||
|
||||
See the [update notes](update_notes.md) for a poor excuse for a changelog.
|
||||
|
||||
## Installing
|
||||
**Python 3.8 or higher is required**, preferably at least 3.11 or whatever is latest
|
||||
|
||||
```
|
||||
python -m pip install discord-ext-voice-recv
|
||||
```
|
||||
|
||||
To install directly from github:
|
||||
```
|
||||
python -m pip install git+https://github.com/imayhaveborkedit/discord-ext-voice-recv
|
||||
```
|
||||
|
||||
Naturally, this extension depends on `discord.py` being installed with voice support (`pynacl`).
|
||||
|
||||
## Example
|
||||
See the [example script](examples/recv.py).
|
||||
|
||||
## Feature overview
|
||||
### Custom VoiceProtocol client
|
||||
No monkey patching or bizarre hacks required. Simply use the library feature to use `VoiceRecvClient` as the voice client class. See [Usage](#usage).
|
||||
|
||||
### New events
|
||||
This extension adds the unimplemented voice websocket events and three virtual events. See [New Events](#new-events).
|
||||
|
||||
### Speaking state
|
||||
It is now possible to determine if a member is speaking or not, using `VoiceRecvClient.get_speaking()`, or using the speaking events inside an `AudioSink`.
|
||||
|
||||
### Simple and familiar API
|
||||
The overall API is designed to mirror the discord.py voice send API, with `AudioSink` being the counterpart to the existing `AudioSource`. See [Sinks](#sinks).
|
||||
|
||||
### Convenient included utilities
|
||||
Batteries included in the form of useful built in `AudioSinks`. Some to match their `AudioSource` counterpart, some I merely considered useful. See... uh... TODO.
|
||||
|
||||
### Optional extras
|
||||
Slightly more complex included batteries that depend on external packages. These live in `voice_recv.extras`. They can be installed by adding their optional dependency during install, ex: `pip install discord-ext-voice-recv[extras_thing]`, or all of them can be installed by specifying `extras` instead. See [Extras](#extras).
|
||||
|
||||
### More or less typed
|
||||
It's probably fine.
|
||||
|
||||
## Usage
|
||||
### VoiceRecvClient
|
||||
The class `voice_recv.VoiceRecvClient` must be used in `VoiceChannel.connect()` to enable voice receive functionality.
|
||||
```python
|
||||
from discord.ext import voice_recv
|
||||
|
||||
voice_client = await voice_channel.connect(cls=voice_recv.VoiceRecvClient)
|
||||
```
|
||||
|
||||
### New voice client functions
|
||||
```python
|
||||
def listen(sink: voice_recv.AudioSink, *, after=None) -> None
|
||||
```
|
||||
Receives audio data into an `AudioSink`. A sink is similar to the `AudioSource` class, where most of the logic is done in a single callback function, but in reverse. Sinks are explained in detail in the [Sinks](#sinks) section below.
|
||||
|
||||
The finalizer, `after` is called after the sink has been exhausted or an error occurred. The callback signature is the same as the after callback for `play()`, one parameter for an optional Exception object.
|
||||
|
||||
```python
|
||||
def is_listening() -> bool
|
||||
```
|
||||
Returns `True` if the voice client is currently receiving audio. Specifically, if the bot is reading from the voice socket.
|
||||
|
||||
```python
|
||||
def stop() -> None
|
||||
```
|
||||
This function now stops both receiving and sending of audio.
|
||||
|
||||
```python
|
||||
def stop_listening() -> None
|
||||
```
|
||||
Stops receiving audio.
|
||||
|
||||
```python
|
||||
def stop_playing() -> None
|
||||
```
|
||||
Stops playing audio. This function is identical to `discord.VoiceClient.stop()`.
|
||||
|
||||
```python
|
||||
def get_speaking(member: discord.Member | discord.User) -> bool | None
|
||||
```
|
||||
Gets the speaking state (voice activity, the green circle) of a member. User is typed in for convenience. Returns None if the member was not found.
|
||||
|
||||
## Sinks
|
||||
The API of this extension is designed to mirror the discord.py voice send API.  Sending audio uses the `AudioSource` class, while receiving audio uses the `AudioSink` class.  A sink is designed to be the inverse of a source.  Essentially, a source is a callback called by discord.py to produce a chunk of audio data.  Conversely, a sink is a callback called by the library to handle a chunk of audio.  Sinks can be composed in the same fashion as sources, creating an audio processing pipeline.  Sources and sinks can even be combined into one object to handle both tasks, such as creating a feedback loop.
|
||||
|
||||
Special care should be taken not to write excessively computationally expensive code, as python is not particularly well suited to real-time audio processing.
|
||||
|
||||
Due to voice receive being somewhat more complex than voice sending, sinks have additional functionality compared to sources. However, the core sink functions should look relatively familiar.
|
||||
|
||||
```python
|
||||
class MySink(voice_recv.AudioSink):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def wants_opus(self) -> bool:
|
||||
return False
|
||||
|
||||
def write(self, user: User | Member | None, data: VoiceData):
|
||||
...
|
||||
|
||||
def cleanup(self):
|
||||
...
|
||||
```
|
||||
|
||||
These are the main functions of a sink, names and purpose reflecting that of their source counterparts. It is important to note that `super().__init__()` must be called when inheriting from `AudioSink`, in contrast to `AudioSource` which does not have a default `__init__` function.
|
||||
|
||||
- The `wants_opus()` function determines if the sink should receive opus packets or decoded PCM packets. Care should be taken not to unintentionally mix sinks that want different types.
|
||||
- The `write()` function is the main callback, where the sink logic takes place. In a sink pipeline, this could alter, inspect, or log a packet, and then write it to a child sink. `VoiceData` is a simple container class with attributes for the origin member, opus data, optionally pcm data, and raw audio packet.
|
||||
- The `cleanup()` function is identical to `AudioSource.cleanup()`, a finalizer to cleanup any loose ends when the sink has finished its job.
|
||||
|
||||
Additionally, sinks also have properties for their `client` and `voice_client`, as well as `parent` and `child`/`children` sinks.
|
||||
|
||||
### Built in Sinks
|
||||
This extension comes with several useful built in sinks, as well as a few [extras](#extras) mentioned later.  For more information, you will have to [source dive](discord/ext/voice_recv/sinks.py) for now.
|
||||
|
||||
- `AudioSink` - The base class for most sinks, similar in purpose to the discord.py `AudioSource`.
|
||||
- `MultiAudioSink` - A sink that supports writing to multiple destination sinks. Has no subclass implementations currently. Generally intended to be extended by the user.
|
||||
- `BasicSink` - A simple sink that operates based on a user provided callback. Useful for testing or simple tasks not performed by other sinks.
|
||||
- `WaveSink` - Writes audio data to a .wav file. It does not fill in silence or mix audio from multiple users on its own. `WavSink` is an alias for this sink.
|
||||
- `FFmpegSink` - Uses ffmpeg to convert the audio stream to an arbitrary format, or whatever else ffmpeg can do to it. Requires ffmpeg, but you should already have it working for discord.py.
|
||||
- `PCMVolumeTransformer` - The AudioSink analog to the discord.py AudioSource version. Does exactly the same thing: controls the volume.
|
||||
- `ConditionalFilter` - Filters audio data based on a given predicate. If the predicate fails for a packet, it is not written to the destination sink.
|
||||
- `UserFilter` - A conditional filter to check if data is from a given user.
|
||||
- `TimedFilter` - A conditional filter with a timer for how long it should operate.
|
||||
- `SilenceGeneratorSink` - Generates silence to fill in audio transmission downtime for a continuous data stream. **Note: This sink is pretty broken and buggy right now and slated for rewrite. Usage is not advised.**
|
||||
|
||||
### Sink event listeners
|
||||
With AudioSinks being potentially more complex and stateful than AudioSources and the addition of new events, it is sometimes necessary to handle events in the context of a sink. It would be rather awkward to have to register a sink function with `commands.Bot.add_listener()` while dealing with thread safety, and even more so using `discord.Client`. To remedy this, listeners can be defined within sinks, similarly to how they work in Cogs.
|
||||
|
||||
```python
|
||||
class MySink(AudioSink):
|
||||
@AudioSink.listener()
|
||||
def on_voice_member_disconnect(self, member: discord.Member, ssrc: int | None):
|
||||
print(f"{member} has disconnected")
|
||||
self.do_something_like_handle_disconnect(ssrc)
|
||||
```
|
||||
|
||||
Note that these functions must be sync functions, as they are dispatched from a thread. Trying to use an async function will result in an error. This restriction only applies to sink listeners, and normal async event listeners will function as per usual. The event listener dispatch thread is different from the one used to dispatch the `write()` callback so potential thread safety issues should be considered. A decorator argument to run the event callback in the other thread *may* be added later.
|
||||
|
||||
## New events
|
||||
```python
|
||||
async def on_voice_member_speaking_state(member: discord.Member, ssrc: int, state: SpeakingState | int)
|
||||
```
|
||||
First and foremost, this event does **NOT** refer to the speaking indicator in discord (the green circle). For voice activity, see `on_voice_member_speaking_start`.
|
||||
This event is fired when the speaking state (speaking mode) of a member changes. This happens when:
|
||||
- A member first speaks (transmits audio) in a voice channel, but only once per session
|
||||
- A member activates or deactivates priority speaker mode
|
||||
|
||||
This event is fired once initially to reveal the ssrc of a member, an identifier to map packets to their originating member. Any packets received from this member before this event fires can (probably) be safely ignored since they are likely just silence packets.
|
||||
|
||||
```python
|
||||
async def on_voice_member_connect(member: discord.Member)
|
||||
```
|
||||
|
||||
Called when a member connects to a voice channel. Also called on initial connection for every member in the channel.
|
||||
|
||||
```python
|
||||
async def on_voice_member_disconnect(member: discord.Member, ssrc: int | None)
|
||||
```
|
||||
Called when a member disconnects from a voice channel. The `ssrc` parameter is the unique id a member has to identify which packets belong to them. This is useful when using custom sinks, particularly those that handle packets from multiple members.
|
||||
|
||||
```python
|
||||
async def on_voice_member_video(member: discord.Member, data: voice_recv.VoiceVideoStreams)
|
||||
```
|
||||
Called when a member in a voice channel toggles their webcam on or off, NOT screenshare. Screenshare status is only indicated in the `self_video` attribute of `discord.VoiceState`.
|
||||
|
||||
```python
|
||||
async def on_voice_member_flags(member: discord.Member, flags: voice_recv.VoiceFlags)
|
||||
```
|
||||
An undocumented event dispatched when a member joins a voice channel containing a flags bitfield. Also called on initial connection for every member in the channel.
|
||||
|
||||
Flags:
|
||||
- `VoiceFlags.clips_enabled`: User has [clips](https://support.discord.com/hc/en-us/articles/16861982215703-Clips) enabled
|
||||
- `VoiceFlags.allow_voice_recording`: User has consented to their voice being clipped
|
||||
- `VoiceFlags.allow_any_viewer_clips`: User has consented to stream viewers clipping them
|
||||
|
||||
```python
|
||||
async def on_voice_member_platform(member: discord.Member, platform: voice_recv.VoicePlatform | None)
|
||||
```
|
||||
An undocumented event dispatched when a member joins a voice channel containing the member's platform. Also called on initial connection for every member in the channel.
|
||||
|
||||
Values:
|
||||
- `VoicePlatform.desktop`
|
||||
- `VoicePlatform.mobile`
|
||||
- `VoicePlatform.xbox`
|
||||
- `VoicePlatform.playstation`
|
||||
|
||||
```python
|
||||
def on_rtcp_packet(packet: RTCPPacket, guild: discord.Guild)
|
||||
```
|
||||
A virtual event for when an RTCP packet is received. This event only works inside of sinks, so it cannot be async.
|
||||
|
||||
```python
|
||||
def on_voice_member_speaking_start(member: discord.Member)
|
||||
def on_voice_member_speaking_stop(member: discord.Member)
|
||||
```
|
||||
Virtual events for the state of the speaking indicator (the green circle). These events are synthesized from packet activity and may not exactly match what is displayed in the discord client. Due to performance issues with asyncio, these events are sink only and cannot be async.
|
||||
|
||||
## Extras
|
||||
|
||||
### `voice_recv.extras.speechrecognition`
|
||||
- Optional dependency: `extras_speech`
|
||||
- Requires package: `SpeechRecognition`
|
||||
- Provides: `SpeechRecognitionSink`
|
||||
|
||||
A helper sink for using `SpeechRecognition` to perform speech-to-text conversion. Generally depends on third party services for reasonable quality. Results may vary.
|
||||
|
||||
### `voice_recv.extras.localplayback`
|
||||
- Optional dependency: `extras_local`
|
||||
- Requires package: `pyaudio`
|
||||
- Provides: `LocalPlaybackSink`, `SimpleLocalPlaybackSink`
|
||||
|
||||
Helper sinks for playing audio through an audio output device on the local system. Defaults to the system default device, but other output devices can also be specified.
|
||||
|
||||
## Currently missing or WIP features
|
||||
- Silence generation (WIP, pending rewrite)
|
||||
|
||||
## Future plans
|
||||
- Muxer AudioSink (mixes multiple audio streams into a single stream)
|
||||
- Rust implementations of some components for improved performance
|
||||
- Alternative voice client implementation with a minimal interface intended for use with external data processing
|
||||
76
vendor/discord-ext-voice-recv/VENDOR_INFO.md
vendored
Normal file
76
vendor/discord-ext-voice-recv/VENDOR_INFO.md
vendored
Normal file
@@ -0,0 +1,76 @@
|
||||
# Vendored: discord-ext-voice-recv
|
||||
|
||||
**Upstream:** https://github.com/imayhaveborkedit/discord-ext-voice-recv
|
||||
**Pinned commit:** `ac04ea7b0941112e83767cf1c1469b408fa06748` (bump version 0.5.3a, master HEAD Jun 2025)
|
||||
**Vendored at:** 2026-05-27
|
||||
**Echo Core fork version:** `0.5.3a+echo.dave1` (PEP 440 local segment)
|
||||
**Reason:** The Discord voice protocol is fragile and upstream is a hobby fork. The adapter
|
||||
layer in `src/voice/_discord_voice_adapter.py` isolates upstream churn — if this
|
||||
package breaks, swap to py-cord by rewriting only that file.
|
||||
|
||||
## Echo Core patch: `+echo.dave1` (DAVE E2E receive-side decrypt)
|
||||
|
||||
### Why
|
||||
|
||||
Discord enforces DAVE (E2E media encryption) on voice gateway `v=8` whenever the
|
||||
bot advertises `max_dave_protocol_version > 0` in IDENTIFY. discord.py 2.7.1 (the
|
||||
version Echo Core pins) does so unconditionally — Discord then closes the WS
|
||||
with code **4017** if the bot opts out by sending `max_dave_protocol_version=0`.
|
||||
DAVE is **mandatory**.
|
||||
|
||||
Audio received from a DAVE-active room is **dual-wrapped**: transport layer
|
||||
(`aead_xchacha20_poly1305_rtpsize`) + DAVE E2E. Upstream voice-recv decrypts
|
||||
only the transport layer, then hands DAVE ciphertext to libopus, which raises
|
||||
`OpusError: corrupted stream` on every packet.
|
||||
|
||||
### Patch shape
|
||||
|
||||
~30 lines, all in `discord/ext/voice_recv/reader.py`:
|
||||
|
||||
1. Module-level optional `davey` import (no-op when missing).
|
||||
2. `AudioReader._maybe_dave_decrypt(rtp_packet) -> Optional[bytes]` — gate logic
|
||||
mirrors discord.py 2.7.1 send-side `can_encrypt` exactly. Returns the
|
||||
DAVE-unwrapped payload, the original payload (DAVE inactive), or `None` to
|
||||
drop the packet (unknown SSRC, decrypt failure).
|
||||
3. 4-line hook in `callback()` between transport-decrypt and `feed_rtp`:
|
||||
overwrites `rtp_packet.decrypted_data` in place, or returns early to drop.
|
||||
|
||||
The post-decrypt `is_silence()` check (formerly at reader.py:172) still works
|
||||
because we overwrite `decrypted_data` in place — silence frames produced by
|
||||
davey reach the existing check unchanged.
|
||||
|
||||
### Dependency
|
||||
|
||||
`davey==0.1.5` — matches discord.py 2.7.1 expectation. Pin in
|
||||
`echo-core/requirements.txt`. The import is optional at module level so tests
|
||||
and non-DAVE environments still run; the gate degrades to a bypass.
|
||||
|
||||
### Re-sync strategy
|
||||
|
||||
When upstream voice-recv adds DAVE support natively:
|
||||
|
||||
1. Drop the three patch hunks in `reader.py` (davey import block,
|
||||
`_maybe_dave_decrypt` method, hook in `callback()`).
|
||||
2. Revert `__version__` to upstream value in `__init__.py`.
|
||||
3. Update `Pinned commit` at the top of this file.
|
||||
4. Run `pytest tests/test_voice_recv_dave.py tests/test_voice_adapter_contract.py`.
|
||||
|
||||
The contract test `test_voice_recv_fork_version` asserts `__version__ ==
|
||||
'0.5.3a+echo.dave1'` and will fail fast on any accidental wipe during a careless
|
||||
upstream sync — forcing a conscious decision to either re-port or drop the
|
||||
patch.
|
||||
|
||||
## Update procedure (vanilla upstream sync)
|
||||
|
||||
```bash
|
||||
cd vendor/discord-ext-voice-recv
|
||||
git fetch origin master
|
||||
git log HEAD..origin/master --oneline # review what changed
|
||||
git checkout <new-commit>
|
||||
# RE-APPLY the +echo.dave1 patch if upstream still lacks DAVE
|
||||
cd ../..
|
||||
source .venv/bin/activate && pip install -e vendor/discord-ext-voice-recv --force-reinstall
|
||||
pytest tests/test_voice_adapter_contract.py tests/test_voice_recv_dave.py -v # MUST PASS — contract + DAVE guards
|
||||
```
|
||||
|
||||
Update this file's `Pinned commit` after a successful upgrade.
|
||||
20
vendor/discord-ext-voice-recv/discord/ext/voice_recv/__init__.py
vendored
Normal file
20
vendor/discord-ext-voice-recv/discord/ext/voice_recv/__init__.py
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from .voice_client import *
|
||||
from .reader import *
|
||||
from .sinks import *
|
||||
from .video import *
|
||||
from .opus import *
|
||||
from .rtp import *
|
||||
from .enums import *
|
||||
|
||||
from . import (
|
||||
rtp as rtp,
|
||||
extras as extras,
|
||||
)
|
||||
|
||||
__title__ = 'discord.ext.voice_recv'
|
||||
__author__ = 'Imayhaveborkedit'
|
||||
__license__ = 'MIT'
|
||||
__copyright__ = 'Copyright 2021-present Imayhaveborkedit'
|
||||
__version__ = '0.5.3a+echo.dave1'
|
||||
249
vendor/discord-ext-voice-recv/discord/ext/voice_recv/buffer.py
vendored
Normal file
249
vendor/discord-ext-voice-recv/discord/ext/voice_recv/buffer.py
vendored
Normal file
@@ -0,0 +1,249 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import heapq
|
||||
import logging
|
||||
import threading
|
||||
|
||||
from .utils import gap_wrapped, add_wrapped
|
||||
|
||||
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Protocol,
|
||||
TypeVar,
|
||||
)
|
||||
|
||||
from .rtp import _PacketCmpMixin
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import Optional, List
|
||||
from .rtp import AudioPacket
|
||||
|
||||
__all__ = [
|
||||
'HeapJitterBuffer',
|
||||
]
|
||||
|
||||
|
||||
_T = TypeVar('_T')
|
||||
PacketT = TypeVar('PacketT', bound=_PacketCmpMixin)
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Buffer(Protocol[_T]):
    """Structural interface for a simple buffer with no extra features."""

    def __len__(self) -> int:
        """Return the number of items in the buffer."""
        ...

    def push(self, item: _T) -> None:
        """Add an item to the buffer."""
        ...

    def pop(self) -> Optional[_T]:
        """Remove and return an item, or None."""
        ...

    def peek(self) -> Optional[_T]:
        """Return an item without removing it, or None."""
        ...

    def flush(self) -> List[_T]:
        """Return the buffered items as a list."""
        ...

    def reset(self) -> None:
        """Reset the buffer."""
        ...
|
||||
|
||||
|
||||
class BaseBuffer(Buffer[PacketT]):
    """A basic list-backed buffer.

    Items are appended to the end of an internal list, and `pop`/`peek`
    operate on that same end, so the most recently pushed item comes out first.
    """

    def __init__(self):
        self._buffer: List[PacketT] = []

    def __len__(self) -> int:
        """Return the number of buffered items."""
        return len(self._buffer)

    def push(self, item: PacketT) -> None:
        """Append `item` to the buffer."""
        self._buffer.append(item)

    def pop(self) -> Optional[PacketT]:
        """Remove and return the most recently pushed item, or None if the buffer is empty."""
        # Guard the empty case so the Optional return type is honoured and the
        # behaviour matches peek(), instead of leaking an IndexError from list.pop().
        return self._buffer.pop() if self._buffer else None

    def peek(self) -> Optional[PacketT]:
        """Return the most recently pushed item without removing it, or None if empty."""
        return self._buffer[-1] if self._buffer else None

    def flush(self) -> List[PacketT]:
        """Empty the buffer and return its former contents in insertion order."""
        buf = self._buffer.copy()
        self._buffer.clear()
        return buf

    def reset(self) -> None:
        """Discard all buffered items."""
        self._buffer.clear()
|
||||
|
||||
|
||||
class HeapJitterBuffer(BaseBuffer[PacketT]):
    """Push item in, pop items out

    A jitter buffer backed by a heap (`heapq`), so packets come out in the
    order defined by the packets' own comparison operators regardless of the
    order they were pushed in.

    Parameters
    ----------
    maxsize:
        Maximum number of packets held; pushing beyond this drops packets
        from the front of the heap. Must be at least 1.
    prefsize:
        Number of packets to keep held back. A packet only becomes ready once
        the buffer holds more than `prefsize` packets.
    prefill:
        Number of packets that must be pushed (after creation, `flush()` or
        `reset()`) before anything can be popped.
    """

    # A wrapped sequence gap larger than this marks an incoming packet as old.
    _threshold: int = 10000

    def __init__(self, maxsize: int = 10, *, prefsize: int = 1, prefill: int = 1):
        if maxsize < 1:
            raise ValueError(f'maxsize ({maxsize}) must be greater than 0')

        if not 0 <= prefsize <= maxsize:
            raise ValueError(f'prefsize must be between 0 and maxsize ({maxsize})')

        self.maxsize: int = maxsize
        self.prefsize: int = prefsize
        self.prefill: int = prefill
        # Remaining number of pushes before the buffer counts as prefilled.
        self._prefill: int = prefill

        # Sequence of the last packet handed out; -1 means nothing was sent yet.
        self._last_tx_seq: int = -1

        self._has_item: threading.Event = threading.Event()
        # I sure hope I dont need to add a lock to this
        self._buffer: List[AudioPacket] = []

    def _push(self, packet: AudioPacket) -> None:
        heapq.heappush(self._buffer, packet)

    def _pop(self) -> AudioPacket:
        return heapq.heappop(self._buffer)

    def _get_packet_if_ready(self) -> Optional[AudioPacket]:
        return self._buffer[0] if len(self._buffer) > self.prefsize else None

    def _pop_if_ready(self) -> Optional[AudioPacket]:
        return self._pop() if len(self._buffer) > self.prefsize else None

    def _update_has_item(self) -> None:
        """Set or clear the `_has_item` event to reflect whether `pop()` can return a packet."""

        prefilled = self._prefill == 0
        packet_ready = len(self._buffer) > self.prefsize

        if not prefilled or not packet_ready:
            self._has_item.clear()
            return

        next_packet = self._buffer[0]
        sequential = add_wrapped(self._last_tx_seq, 1) == next_packet.sequence
        positive_seq = self._last_tx_seq >= 0

        # We have the next packet ready
        # OR we havent sent a packet out yet
        # OR the buffer is full
        if sequential or not positive_seq or len(self._buffer) >= self.maxsize:
            self._has_item.set()
        else:
            self._has_item.clear()

    def _cleanup(self) -> None:
        """Drop packets from the front of the heap until the buffer fits within `maxsize`."""

        # Logging overfills/drops here is pointless until the stale remote
        # buffer issue is fixed, so extra packets are discarded silently.
        while len(self._buffer) > self.maxsize:
            heapq.heappop(self._buffer)

    def push(self, packet: AudioPacket) -> bool:
        """
        Push a packet into the buffer. If the packet would make the buffer
        exceed its maxsize, the oldest packet will be dropped.

        Returns False if the packet was rejected as too old, True otherwise.
        """

        seq = packet.sequence

        # for the gap between _last_tx_seq and the current one, a large gap is old, a small gap is new
        # the gap for old packets will generally be very large since they wrap all the way around
        if gap_wrapped(self._last_tx_seq, seq) > self._threshold and self._last_tx_seq != -1:
            log.debug("Dropping old packet %s", packet)
            return False

        self._push(packet)

        if self._prefill > 0:
            self._prefill -= 1

        self._cleanup()
        self._update_has_item()

        return True

    def pop(self, *, timeout: float | None = 0) -> Optional[AudioPacket]:
        """
        If timeout is a positive number, wait as long as timeout for a packet
        to be ready and return that packet, otherwise return None.

        The timeout is passed straight to `threading.Event.wait`, so None
        blocks until a packet is ready.
        """

        ok = self._has_item.wait(timeout)
        if not ok:
            return None

        if self._prefill > 0:
            return None

        # This function should actually be redundant but i'll leave it for now
        packet = self._pop_if_ready()

        if packet is not None:
            self._last_tx_seq = packet.sequence

        self._update_has_item()
        return packet

    def peek(self, *, all: bool = False) -> Optional[AudioPacket]:
        """
        Returns the next packet in the buffer only if it is ready, meaning it can
        be popped. When `all` is set to True, it returns the next packet, if any.
        """

        if not self._buffer:
            return None

        if all:
            return self._buffer[0]

        return self._get_packet_if_ready()

    def peek_next(self) -> Optional[AudioPacket]:
        """
        Returns the next packet in the buffer only if it is sequential
        (or if no packet has been sent out yet), otherwise None.
        """

        packet = self.peek(all=True)

        if packet is None:
            return None

        if packet.sequence == add_wrapped(self._last_tx_seq, 1) or self._last_tx_seq < 0:
            return packet

        return None

    def gap(self) -> int:
        """
        Returns the number of missing packets between the last packet to be
        popped and the currently held next packet. Returns 0 otherwise.
        """

        # Sequence 0 is a valid "last sent" value; only -1 means nothing has
        # been sent yet, matching the checks in the other methods of this class.
        if self._buffer and self._last_tx_seq >= 0:
            return gap_wrapped(self._last_tx_seq, self._buffer[0].sequence)

        return 0

    def flush(self) -> List[AudioPacket]:
        """
        Return all remaining packets in sorted order and empty the buffer.

        The last flushed packet becomes the last sent sequence and the
        prefill counter is re-armed.
        """

        packets = sorted(self._buffer)
        self._buffer.clear()

        if packets:
            self._last_tx_seq = packets[-1].sequence

        self._prefill = self.prefill
        self._has_item.clear()

        return packets

    def reset(self) -> None:
        """
        Clear buffer and reset internal counters.
        """

        self._buffer.clear()
        self._has_item.clear()
        self._prefill = self.prefill
        self._last_tx_seq = -1
|
||||
30
vendor/discord-ext-voice-recv/discord/ext/voice_recv/enums.py
vendored
Normal file
30
vendor/discord-ext-voice-recv/discord/ext/voice_recv/enums.py
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
from discord.flags import BaseFlags, fill_with_flags, flag_value
|
||||
from discord.enums import Enum
|
||||
|
||||
__all__ = (
|
||||
'VoiceFlags',
|
||||
'VoicePlatform',
|
||||
)
|
||||
|
||||
@fill_with_flags()
class VoiceFlags(BaseFlags):
    """Bitfield of per-member voice flags (clips related consent settings)."""

    __slots__ = ()

    @flag_value
    def clips_enabled(self):
        """User has clips enabled (bit 0)."""
        return 0b001

    @flag_value
    def allow_voice_recording(self):
        """User has consented to their voice being clipped (bit 1)."""
        return 0b010

    @flag_value
    def allow_any_viewer_clips(self):
        """User has consented to stream viewers clipping them (bit 2)."""
        return 0b100
|
||||
|
||||
|
||||
class VoicePlatform(Enum):
    # Platform a voice member is connected from, as delivered with the
    # `on_voice_member_platform` event. The integer values are presumably
    # the raw values sent by the voice gateway -- TODO confirm.
    desktop = 0
    mobile = 1
    xbox = 2
    playstation = 3
|
||||
2
vendor/discord-ext-voice-recv/discord/ext/voice_recv/extras/__init__.py
vendored
Normal file
2
vendor/discord-ext-voice-recv/discord/ext/voice_recv/extras/__init__.py
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
from . import speechrecognition
|
||||
from . import localplayback
|
||||
132
vendor/discord-ext-voice-recv/discord/ext/voice_recv/extras/localplayback.py
vendored
Normal file
132
vendor/discord-ext-voice-recv/discord/ext/voice_recv/extras/localplayback.py
vendored
Normal file
@@ -0,0 +1,132 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from ..sinks import AudioSink
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ..opus import VoiceData
|
||||
from ..types import MemberOrUser
|
||||
|
||||
|
||||
__all__ = [
|
||||
'LocalPlaybackSink',
|
||||
'SimpleLocalPlaybackSink',
|
||||
]
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import pyaudio
|
||||
except ImportError:
|
||||
|
||||
def __getattr__(name: str):
    """Module-level attribute hook used when `pyaudio` is not installed.

    Accessing one of the sinks listed in `__all__` raises a RuntimeError
    explaining the missing dependency. Any other unknown name raises
    AttributeError, as a module `__getattr__` is expected to, so that
    `hasattr()`/`getattr(..., default)` keep working instead of silently
    yielding None.
    """
    if name in __all__:
        raise RuntimeError('The pyaudio module is required to use this sink.')

    raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
|
||||
|
||||
else:
|
||||
if TYPE_CHECKING:
|
||||
from typing import Optional, Dict
|
||||
|
||||
from discord import Member
|
||||
|
||||
PyAudioStream = pyaudio._Stream
|
||||
|
||||
class _BaseLocalPlaybackSink(AudioSink):
    """Shared base for the local playback sinks.

    Holds one `PyAudio` instance at class level (set on whichever class the
    sink is instantiated from) and resolves the output device to play to.
    Subclasses implement `write()`.
    """

    # Class-level PyAudio instance; None until the first sink is created
    # and again after terminate_pyaudio().
    pa: pyaudio.PyAudio = None  # type: ignore

    def __init__(self, output_device_id: Optional[int] = None, *, py_audio: Optional[pyaudio.PyAudio] = None):
        self._init_pa(py_audio)

        if output_device_id is None:
            # Fall back to the system default output device.
            output_device_id = self.pa.get_default_output_device_info().get("index")  # type: ignore
        self.output_device_id = output_device_id

    @classmethod
    def _init_pa(cls, pa: Optional[pyaudio.PyAudio]) -> None:
        """Install the class-level PyAudio instance.

        Creates a new instance when none was given and none exists yet.
        Raises RuntimeError if a different instance is already installed.
        """
        if pa is None:
            if cls.pa is None:
                cls.pa = pyaudio.PyAudio()
        else:
            if cls.pa is None:
                cls.pa = pa
            elif cls.pa is not pa:
                raise RuntimeError("Conflicting PyAudio objects")

    def write(self, user: Optional[MemberOrUser], data: VoiceData) -> None:
        raise NotImplementedError

    def wants_opus(self) -> bool:
        # These sinks write `data.pcm`, so decoded audio is requested.
        return False

    @classmethod
    def terminate_pyaudio(cls):
        """Call this when you are completely done using all instances of LocalPlayback sinks."""

        pa = cls.pa
        if pa is None:
            # Nothing to terminate: never initialised, or already terminated.
            return

        pa.terminate()
        cls.pa = None  # type: ignore
|
||||
|
||||
class SimpleLocalPlaybackSink(_BaseLocalPlaybackSink):
    """
    A single-stream variant of LocalPlaybackSink: every write goes to the same
    output stream, regardless of the user. Handy once one member's audio has
    already been isolated upstream.
    """

    def __init__(self, output_device_id: Optional[int] = None, *, py_audio: Optional[pyaudio.PyAudio] = None):
        super().__init__(output_device_id, py_audio=py_audio)

        # 48kHz, 2 channel, 16 bit output stream opened once up front.
        stream_options = {
            'rate': 48000,
            'channels': 2,
            'format': pyaudio.paInt16,
            'output': True,
            'output_device_index': output_device_id,
        }
        self._stream: PyAudioStream = self.pa.open(**stream_options)

    def write(self, user: Optional[MemberOrUser], data: VoiceData) -> None:
        pcm = data.pcm
        self._stream.write(pcm)

    def cleanup(self) -> None:
        self._stream.close()
|
||||
|
||||
class LocalPlaybackSink(_BaseLocalPlaybackSink):
    """
    An AudioSink for playing received audio directly to one of the system's audio output devices using PyAudio.
    This sink can handle playback of multiple users' audio without additional stream mixing beforehand.

    The `output_device_id` parameter defaults to the system's default audio device, and can otherwise be
    acquired via PyAudio functions. A specific `PyAudio` instance can also be passed to use a specific instance.
    """

    def __init__(self, output_device_id: Optional[int] = None, *, py_audio: Optional[pyaudio.PyAudio] = None):
        super().__init__(output_device_id, py_audio=py_audio)
        # One output stream per user id, opened lazily on first write.
        self._streams: Dict[int, PyAudioStream] = {}

    def _get_stream(self, user: MemberOrUser) -> PyAudioStream:
        """Return the output stream for `user`, opening it on first use."""
        try:
            return self._streams[user.id]
        except KeyError:
            stream = self._streams[user.id] = self.pa.open(
                rate=48000,
                channels=2,
                format=pyaudio.paInt16,
                output=True,
                output_device_index=self.output_device_id,
            )
            return stream

    def write(self, user: Optional[MemberOrUser], data: VoiceData) -> None:
        # Audio without a known user has no stream to go to and is dropped.
        if user:
            self._get_stream(user).write(data.pcm)

    def cleanup(self) -> None:
        """Close every open stream and forget it."""
        # Streams are removed as they are closed so that a closed stream can
        # never be written to, or closed a second time by the disconnect listener.
        while self._streams:
            try:
                _, stream = self._streams.popitem()
            except KeyError:
                # Emptied concurrently by the listener thread.
                break
            stream.close()

    @AudioSink.listener()
    def on_voice_member_disconnect(self, member: Member, ssrc: Optional[int]) -> None:
        """Close and drop the stream of a member that left the channel."""
        stream = self._streams.pop(member.id, None)
        if stream is not None:
            stream.close()
|
||||
237
vendor/discord-ext-voice-recv/discord/ext/voice_recv/extras/speechrecognition.py
vendored
Normal file
237
vendor/discord-ext-voice-recv/discord/ext/voice_recv/extras/speechrecognition.py
vendored
Normal file
@@ -0,0 +1,237 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from ..sinks import AudioSink
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
__all__ = [
|
||||
'SpeechRecognitionSink',
|
||||
]
|
||||
|
||||
try:
|
||||
import speech_recognition as sr
|
||||
except ImportError:
|
||||
|
||||
def __getattr__(name: str):
    """Module-level attribute hook used when `speech_recognition` is not installed.

    Accessing the sink listed in `__all__` raises a RuntimeError explaining
    the missing dependency. Any other unknown name raises AttributeError, as a
    module `__getattr__` is expected to, so that `hasattr()`/`getattr(..., default)`
    keep working instead of silently yielding None.
    """
    if name in __all__:
        raise RuntimeError('The SpeechRecognition module is required to use this sink.')

    raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
|
||||
|
||||
else:
|
||||
import time
|
||||
import array
|
||||
import asyncio
|
||||
import audioop
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
from ..rtp import SilencePacket
|
||||
|
||||
from typing import TYPE_CHECKING, TypedDict
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from concurrent.futures import Future as CFuture
|
||||
from typing import Literal, Callable, Optional, Any, Final, Protocol, Awaitable, TypeVar
|
||||
|
||||
from discord import Member
|
||||
|
||||
from ..opus import VoiceData
|
||||
from ..types import MemberOrUser as User
|
||||
|
||||
T = TypeVar('T')
|
||||
|
||||
# [r.split('_', 1)[1] for r in dir(sr.Recognizer()) if r.startswith("recognize")]
|
||||
SRRecognizerMethod = Literal[
|
||||
'amazon',
|
||||
'api',
|
||||
'assemblyai',
|
||||
'azure',
|
||||
'bing',
|
||||
'faster_whisper',
|
||||
'google',
|
||||
'google_cloud',
|
||||
'groq',
|
||||
'houndify',
|
||||
'ibm',
|
||||
'lex',
|
||||
'openai',
|
||||
'sphinx',
|
||||
'tensorflow',
|
||||
'vosk',
|
||||
'whisper',
|
||||
'wit',
|
||||
]
|
||||
|
||||
class SRStopper(Protocol):
    """Signature of the stop function stored as a stream's `stopper`."""

    def __call__(self, wait: bool = True, /) -> None:
        ...
|
||||
|
||||
SRProcessDataCB = Callable[[sr.Recognizer, sr.AudioData, User], Optional[str]]
|
||||
SRTextCB = Callable[[User, str], Any]
|
||||
|
||||
class _StreamData(TypedDict):
    # Per-user state kept by SpeechRecognitionSink, keyed by user id.

    # Stop function returned by `listen_in_background`; None until the
    # background listener has been started for this user.
    stopper: Optional[SRStopper]
    # Recognizer instance dedicated to this user's audio.
    recognizer: sr.Recognizer
    # Pending PCM data, appended to by the sink's write() and consumed by
    # DiscordSRAudioSource.read().
    buffer: array.array[int]
|
||||
|
||||
class SpeechRecognitionSink(AudioSink):
    """A sink that feeds each user's audio into `SpeechRecognition` for speech-to-text.

    A background listener is started per user on their first written packet.
    Recognised audio is handed to `process_cb`, and any text it returns is
    handed to `text_cb`.

    Parameters
    ----------
    process_cb:
        Called with `(recognizer, audio, user)`; returns the recognised text
        or None. Defaults to calling `recognize_<default_recognizer>`.
    text_cb:
        Called with `(user, text)` for every non-None result. Defaults to
        logging the text.
    default_recognizer:
        Suffix of the `Recognizer.recognize_*` method used by the default
        process callback.
    phrase_time_limit:
        Passed to `Recognizer.listen_in_background`.
    ignore_silence_packets:
        When True, data whose packet is a `SilencePacket` is not buffered.
    """

    def __init__(
        self,
        *,
        process_cb: Optional[SRProcessDataCB] = None,
        text_cb: Optional[SRTextCB] = None,
        default_recognizer: SRRecognizerMethod = 'google',
        phrase_time_limit: int = 10,
        ignore_silence_packets: bool = True,
    ):
        super().__init__(None)
        self.process_cb: Optional[SRProcessDataCB] = process_cb
        self.text_cb: Optional[SRTextCB] = text_cb
        # NOTE: the attribute name is misspelled ("limmit"); it is kept as-is
        # because it is a public attribute that external code may already use.
        self.phrase_time_limmit: int = phrase_time_limit
        self.ignore_silence_packets: bool = ignore_silence_packets

        self.default_recognizer: SRRecognizerMethod = default_recognizer
        # Per-user state, created on first access for a user id.
        self._stream_data: defaultdict[int, _StreamData] = defaultdict(
            lambda: _StreamData(stopper=None, recognizer=sr.Recognizer(), buffer=array.array('B'))
        )

    def _await(self, coro: Awaitable[T]) -> CFuture[T]:
        """Schedule `coro` on the client's event loop from another thread."""
        assert self.client is not None
        return asyncio.run_coroutine_threadsafe(coro, self.client.loop)  # type: ignore

    def wants_opus(self) -> bool:
        # write() buffers `data.pcm`, so decoded audio is requested.
        return False

    def write(self, user: Optional[User], data: VoiceData) -> None:
        if self.ignore_silence_packets and isinstance(data.packet, SilencePacket):
            return

        if user is None:
            return

        sdata = self._stream_data[user.id]
        sdata['buffer'].extend(data.pcm)

        # Start the background listener lazily, once per user.
        if not sdata['stopper']:
            sdata['stopper'] = sdata['recognizer'].listen_in_background(
                DiscordSRAudioSource(sdata['buffer']), self.background_listener(user), self.phrase_time_limmit
            )

    def background_listener(self, user: User):
        """Build the `listen_in_background` callback for `user`."""
        process_cb = self.process_cb or self.get_default_process_callback()
        text_cb = self.text_cb or self.get_default_text_callback()

        def callback(_recognizer: sr.Recognizer, _audio: sr.AudioData):
            output = process_cb(_recognizer, _audio, user)
            if output is not None:
                text_cb(user, output)

        return callback

    def get_default_process_callback(self) -> SRProcessDataCB:
        """Return a callback that runs the configured `recognize_*` method on the audio."""

        def cb(recognizer: sr.Recognizer, audio: sr.AudioData, user: Optional[User]) -> Optional[str]:
            log.debug("Got %s, %s, %s", audio, audio.sample_rate, audio.sample_width)

            # they changed recognize_google to be optionally assigned at runtime...
            # so only touch it when the requested recognizer is actually missing.
            func = getattr(recognizer, 'recognize_' + self.default_recognizer, None)
            if func is None:
                func = recognizer.recognize_google  # type: ignore

            text: Optional[str] = None
            try:
                text = func(audio)
            except sr.UnknownValueError:
                log.debug("Bad speech chunk")
                # self._debug_audio_chunk(audio)
            except sr.RequestError as e:
                # Raised when the recognition service cannot be reached or
                # rejects the request. Swallow it here so one failed request
                # does not propagate out of the background listener callback.
                log.warning("Speech recognition request failed: %s", e)

            return text

        return cb

    def get_default_text_callback(self) -> SRTextCB:
        """Return a callback that logs the recognised text."""

        def cb(user: Optional[User], text: Optional[str]) -> Any:
            log.info("%s said: %s", user.display_name if user else 'Someone', text)

        return cb

    @AudioSink.listener()
    def on_voice_member_disconnect(self, member: Member, ssrc: Optional[int]) -> None:
        if member is not None:
            self._drop(member.id)

    def cleanup(self) -> None:
        # Iterate over a snapshot since _drop() removes entries.
        for user_id in tuple(self._stream_data.keys()):
            self._drop(user_id)

    def _drop(self, user_id: int) -> None:
        """Stop the listener for `user_id` and discard its buffered audio."""
        data = self._stream_data.pop(user_id, None)
        if data is None:
            log.debug("Cannot drop user id: %s, no data", user_id)
            return

        stopper = data.get('stopper')
        if stopper:
            stopper()

        buffer = data.get('buffer')
        if buffer:
            # arrays don't have a clear function
            del buffer[:]

    def _debug_audio_chunk(self, audio: sr.AudioData, filename: str = 'sound.wav') -> None:
        """Debug helper: send `audio` as a WAV file to the voice channel."""
        import io

        import discord

        # get_wav_data() already returns a complete WAV file (header included,
        # using the AudioData's own sample rate and width), so it is uploaded
        # as-is rather than being wrapped in a second WAV container.
        # The BytesIO is deliberately not closed here: the send is only
        # scheduled on the event loop and reads the file later.
        # NOTE(review): assumes discord.py closes the file once the send finishes -- confirm.
        buffer = io.BytesIO(audio.get_wav_data())
        f = discord.File(buffer, filename)
        self._await(self.voice_client.channel.send(file=f))  # type: ignore
|
||||
|
||||
class DiscordSRAudioSource(sr.AudioSource):
    """An `sr.AudioSource` that reads from a shared, externally filled PCM buffer.

    The object acts as its own `stream`: `read()` takes data off the front of
    the buffer and downmixes it to mono.
    """

    little_endian: Final[bool] = True
    SAMPLE_RATE: Final[int] = 48_000
    SAMPLE_WIDTH: Final[int] = 2
    CHANNELS: Final[int] = 2
    CHUNK: Final[int] = 960

    def __init__(self, buffer: array.array[int]):
        self.buffer = buffer
        self._entered: bool = False

    @property
    def stream(self):
        return self

    def __enter__(self):
        if self._entered:
            log.warning('Already entered sr audio source')
        self._entered = True
        return self

    def __exit__(self, *exc) -> None:
        self._entered = False
        if any(exc):
            # Pass the exception explicitly instead of relying on the
            # interpreter's "currently handled exception" state.
            log.error('Error closing sr audio source', exc_info=exc)

    def read(self, size: int) -> bytes:
        """Return up to `size * CHANNELS` buffer items, downmixed to mono.

        Waits up to ten 0.1 second intervals for enough data to arrive. After
        that, whatever is buffered is returned, or `b''` if nothing is.
        """
        chunksize = size * self.CHANNELS

        # TODO: make this timeout configurable
        for _ in range(10):
            if len(self.buffer) >= chunksize:
                break
            time.sleep(0.1)
        else:
            if len(self.buffer) == 0:
                return b''

        audiochunk = self.buffer[:chunksize].tobytes()
        del self.buffer[: min(chunksize, len(audiochunk))]
        return audioop.tomono(audiochunk, 2, 1, 1)

    def close(self) -> None:
        # array.array only gained clear() in Python 3.13; slice deletion
        # empties the buffer in place on every version.
        del self.buffer[:]
|
||||
122
vendor/discord-ext-voice-recv/discord/ext/voice_recv/gateway.py
vendored
Normal file
122
vendor/discord-ext-voice-recv/discord/ext/voice_recv/gateway.py
vendored
Normal file
@@ -0,0 +1,122 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from discord.enums import SpeakingState, try_enum
|
||||
|
||||
from .enums import VoiceFlags, VoicePlatform
|
||||
from .video import VoiceVideoStreams
|
||||
|
||||
from typing import TYPE_CHECKING, cast
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import Dict, Any
|
||||
|
||||
from discord.gateway import DiscordVoiceWebSocket
|
||||
from .voice_client import VoiceRecvClient
|
||||
from .video import VoiceVideoPayload
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Voice gateway opcodes, as listed in:
# https://cdn.discordapp.com/attachments/381887113391505410/1094473412623204533/image.png
# fmt: off
IDENTIFY               = 0
SELECT_PROTOCOL        = 1
READY                  = 2
HEARTBEAT              = 3
SESSION_DESCRIPTION    = 4   # also known as SELECT_PROTOCOL_ACK
SPEAKING               = 5
HEARTBEAT_ACK          = 6
RESUME                 = 7
HELLO                  = 8
RESUMED                = 9
CLIENT_CONNECT         = 11
VIDEO                  = 12
CLIENT_DISCONNECT      = 13
SESSION_UPDATE         = 14  # marked "useless" upstream
MEDIA_SINK_WANTS       = 15  # marked "useless" upstream
VOICE_BACKEND_VERSION  = 16  # marked "useless" upstream
CHANNEL_OPTIONS_UPDATE = 17  # marked "dead" upstream
FLAGS                  = 18
SPEED_TEST             = 19  # marked "dead" upstream
PLATFORM               = 20
# fmt: on
|
||||
|
||||
|
||||
async def hook(self: DiscordVoiceWebSocket, msg: Dict[str, Any]):
    """Voice websocket hook: turn gateway payloads into receive-side state and events.

    Parameters
    ----------
    self:
        The voice websocket the payload arrived on; its
        ``_connection.voice_client`` is the receiving voice client.
    msg:
        The decoded gateway payload.  ``msg['op']`` is required, ``msg['d']``
        is optional and defaults to an empty dict.

    Depending on the opcode this records ssrc <-> user mappings on the voice
    client, refreshes the reader's secret key, tears down decoders for
    departed users and dispatches the matching ``voice_member_*`` event.
    Opcodes not handled below are ignored (apart from logging).
    """
    op: int = msg['op']
    data: Dict[str, Any] = msg.get('d', {})
    vc: VoiceRecvClient = self._connection.voice_client  # type: ignore

    # Heartbeat traffic is skipped.  The isEnabledFor() guard avoids
    # pretty-printing every payload when debug logging is switched off.
    if op not in (HEARTBEAT, HEARTBEAT_ACK) and log.isEnabledFor(logging.DEBUG):
        from pprint import pformat

        log.debug("Received op %s: \n%s", op, pformat(data, compact=True))

    if len(msg) > 2:
        # Report anything besides 'op' and 'd'.  Built by filtering so that a
        # payload without a 'd' key cannot raise KeyError here.
        extras = {key: value for key, value in msg.items() if key not in ('op', 'd')}
        log.info("WS payload has extra keys: %s", extras)

    if op == READY:
        # Our own ssrc.
        vc._add_ssrc(vc.guild.me.id, data['ssrc'])

    elif op == SESSION_DESCRIPTION:
        if vc._reader:
            # TODO: remove bytes cast once type is fixed in dpy
            vc._reader.update_secret_key(bytes(self.secret_key))  # type: ignore

    elif op == SPEAKING:
        # this event refers to the speaking MODE, e.g. priority speaker
        # it also sends the user's ssrc
        uid = int(data['user_id'])
        ssrc = data['ssrc']
        vc._add_ssrc(uid, ssrc)
        member = vc.guild.get_member(uid)
        state = try_enum(SpeakingState, data['speaking'])
        vc.dispatch("voice_member_speaking_state", member, ssrc, state)

    elif op == CLIENT_CONNECT:
        uids = [int(uid) for uid in data['user_ids']]

        # Multiple user IDs means this is the initial member list
        for uid in uids:
            member = vc.guild.get_member(uid)
            vc.dispatch("voice_member_connect", member)

    elif op == VIDEO:
        uid = int(data['user_id'])
        vc._add_ssrc(uid, data['audio_ssrc'])
        member = vc.guild.get_member(uid)
        streams = VoiceVideoStreams(data=cast('VoiceVideoPayload', data), vc=vc)
        vc.dispatch("voice_member_video", member, streams)

    elif op == CLIENT_DISCONNECT:
        uid = int(data['user_id'])
        # Look the ssrc up before the mapping is removed below.
        ssrc = vc._get_ssrc_from_id(uid)

        if vc._reader and ssrc is not None:
            log.debug("Destroying decoder for %s, ssrc=%s", uid, ssrc)
            vc._reader.packet_router.destroy_decoder(ssrc)

        vc._remove_ssrc(user_id=uid)
        member = vc.guild.get_member(uid)
        vc.dispatch("voice_member_disconnect", member, ssrc)

    elif op == FLAGS:
        uid = int(data['user_id'])
        member = vc.guild.get_member(uid)
        # A null/absent-valued flags field is treated as "no flags".
        vc.dispatch("voice_member_flags", member, VoiceFlags._from_value(data['flags'] or 0))

    elif op == PLATFORM:
        uid = int(data['user_id'])
        member = vc.guild.get_member(uid)
        platform = data['platform']
        vc.dispatch(
            "voice_member_platform",
            member,
            try_enum(VoicePlatform, platform) if platform is not None else None,
        )
|
||||
174
vendor/discord-ext-voice-recv/discord/ext/voice_recv/opus.py
vendored
Normal file
174
vendor/discord-ext-voice-recv/discord/ext/voice_recv/opus.py
vendored
Normal file
@@ -0,0 +1,174 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from typing import TYPE_CHECKING, Final
|
||||
|
||||
from .buffer import HeapJitterBuffer as JitterBuffer
|
||||
from .rtp import FakePacket
|
||||
from .utils import add_wrapped
|
||||
|
||||
from discord.opus import Decoder
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import Optional, Tuple, Dict, Callable, Any
|
||||
from .rtp import AudioPacket
|
||||
from .sinks import AudioSink
|
||||
from .router import PacketRouter
|
||||
from .voice_client import VoiceRecvClient
|
||||
from .types import MemberOrUser as User
|
||||
|
||||
EventCB = Callable[..., Any]
|
||||
EventData = Tuple[str, Tuple[Any, ...], Dict[str, Any]]
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
__all__ = [
|
||||
'VoiceData',
|
||||
]
|
||||
|
||||
|
||||
class VoiceData:
    """Bundle of one audio packet, the user it came from and its decoded PCM.

    ``pcm`` is always ``bytes``: a missing or empty value is stored as ``b''``.
    """

    __slots__ = ('packet', 'source', 'pcm')

    def __init__(self, packet: AudioPacket, source: Optional[User], *, pcm: Optional[bytes] = None):
        self.packet: AudioPacket = packet
        self.source: Optional[User] = source
        self.pcm: bytes = pcm or b''

    @property
    def opus(self) -> Optional[bytes]:
        """The packet's ``decrypted_data`` payload."""
        payload = self.packet.decrypted_data
        return payload
|
||||
|
||||
|
||||
class PacketDecoder:
    """Per-ssrc stage between the packet router and the sink.

    Packets are pushed into a jitter buffer and popped back out one at a
    time; unless the sink asks for raw opus, each popped packet is decoded to
    PCM before being wrapped in a ``VoiceData``.
    """

    def __init__(self, router: PacketRouter, ssrc: int):
        self.router: PacketRouter = router
        self.ssrc: int = ssrc

        self._decoder: Optional[Decoder] = self._new_decoder()
        self._buffer: JitterBuffer = JitterBuffer()
        self._cached_id: Optional[int] = None

        # Sequence/timestamp of the last packet handed out; -1 until the first one.
        self._last_seq: int = -1
        self._last_ts: int = -1

    @property
    def sink(self) -> AudioSink:
        """The router's current sink."""
        return self.router.sink

    def _new_decoder(self) -> Optional[Decoder]:
        # No opus decoder is needed when the sink consumes opus directly.
        if self.sink.wants_opus():
            return None
        return Decoder()

    def _get_user(self, user_id: int) -> Optional[User]:
        vc: VoiceRecvClient = self.sink.voice_client  # type: ignore
        member = vc.guild.get_member(user_id)
        return member or vc.client.get_user(user_id)

    def _get_cached_member(self) -> Optional[User]:
        if not self._cached_id:
            return None
        return self._get_user(self._cached_id)

    def _flag_ready_state(self):
        """Register with the router's waiter while the buffer has something to peek at."""
        waiter = self.router.waiter
        if self._buffer.peek():
            waiter.register(self)
        else:
            waiter.unregister(self)

    def push_packet(self, packet: AudioPacket) -> None:
        """Queue a packet and refresh the ready flag."""
        self._buffer.push(packet)
        self._flag_ready_state()

    def pop_data(self, *, timeout: float = 0) -> Optional[VoiceData]:
        """Return the next processed packet, or ``None`` if nothing is available."""
        packet = self._get_next_packet(timeout)
        self._flag_ready_state()

        return None if packet is None else self._process_packet(packet)

    def set_user_id(self, user_id: int) -> None:
        self._cached_id = user_id

    def reset(self) -> None:
        """Drop buffered packets and start over with a fresh decoder state."""
        self._buffer.reset()
        self._decoder = self._new_decoder()
        self._last_seq = self._last_ts = -1
        self._flag_ready_state()

    def destroy(self) -> None:
        """Drop buffered packets and release the decoder."""
        self._buffer.reset()
        self._decoder = None
        self._flag_ready_state()

    def _get_next_packet(self, timeout: float) -> Optional[AudioPacket]:
        packet = self._buffer.pop(timeout=timeout)

        if packet is not None:
            # A falsy, non-None result is replaced by a FakePacket that
            # continues the sequence/timestamp of the last packet handed out.
            return packet if packet else self._make_fakepacket()

        # Gets the last (buffered) packet out (i think)
        # TODO: revisit this, might be an issue
        if not self._buffer:
            return None

        packets = self._buffer.flush()
        if any(packets[1:]):
            log.warning(
                "%s packets were lost being flushed in decoder-%s\n  --> (last=%s) %s",
                len(packets) - 1,
                self.ssrc,
                self._last_seq,
                [p.sequence for p in packets],
            )
        return packets[0]

    def _make_fakepacket(self) -> FakePacket:
        next_seq = add_wrapped(self._last_seq, 1)
        next_ts = add_wrapped(self._last_ts, Decoder.SAMPLES_PER_FRAME, wrap=2**32)
        return FakePacket(self.ssrc, next_seq, next_ts)

    def _process_packet(self, packet: AudioPacket) -> VoiceData:
        pcm = None
        if not self.sink.wants_opus():
            packet, pcm = self._decode_packet(packet)

        member = self._get_cached_member()
        if member is None:
            # Refresh the cached user id from the voice client's ssrc mapping and retry.
            self._cached_id = self.sink.voice_client._get_id_from_ssrc(self.ssrc)  # type: ignore
            member = self._get_cached_member()

        result = VoiceData(packet, member, pcm=pcm)
        self._last_seq = packet.sequence
        self._last_ts = packet.timestamp

        return result

    def _decode_packet(self, packet: AudioPacket) -> Tuple[AudioPacket, bytes]:
        assert self._decoder is not None
        decoder = self._decoder

        # Decode as per usual
        if packet:
            return packet, decoder.decode(packet.decrypted_data, fec=False)

        # Fake packet, need to check next one to use fec
        upcoming = self._buffer.peek_next()

        # Need to drop a packet
        if upcoming is None:
            return packet, decoder.decode(None, fec=False)

        fec_source: bytes = upcoming.decrypted_data  # type: ignore

        log.debug(
            "Generating fec packet: fake=%s, fec=%s",
            packet.sequence,
            upcoming.sequence,
        )
        return packet, decoder.decode(fec_source, fec=True)
|
||||
482
vendor/discord-ext-voice-recv/discord/ext/voice_recv/reader.py
vendored
Normal file
482
vendor/discord-ext-voice-recv/discord/ext/voice_recv/reader.py
vendored
Normal file
@@ -0,0 +1,482 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
import logging
|
||||
import threading
|
||||
|
||||
from operator import itemgetter
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from . import rtp
|
||||
from .sinks import AudioSink
|
||||
from .router import PacketRouter, SinkEventRouter
|
||||
|
||||
try:
|
||||
import nacl.secret
|
||||
from nacl.exceptions import CryptoError
|
||||
except ImportError as e:
|
||||
raise RuntimeError("pynacl is required") from e
|
||||
|
||||
# Echo Core +echo.dave1 patch: DAVE E2E receive-side decrypt. See VENDOR_INFO.md.
|
||||
try:
|
||||
import davey
|
||||
_MEDIA_TYPE_AUDIO = davey.MediaType.audio
|
||||
_HAS_DAVE = True
|
||||
except ImportError:
|
||||
_MEDIA_TYPE_AUDIO = None
|
||||
_HAS_DAVE = False
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import Optional, Callable, Any, Dict, Literal, Union
|
||||
|
||||
from discord import Member
|
||||
from discord.types.voice import SupportedModes
|
||||
from .voice_client import VoiceRecvClient
|
||||
from .rtp import RTPPacket
|
||||
|
||||
DecryptRTP = Callable[[RTPPacket], bytes]
|
||||
DecryptRTCP = Callable[[bytes], bytes]
|
||||
AfterCB = Callable[[Optional[Exception]], Any]
|
||||
SpeakingEvent = Literal['voice_member_speaking_start', 'voice_member_speaking_stop']
|
||||
EncryptionBox = Union[nacl.secret.SecretBox, nacl.secret.Aead]
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
__all__ = [
|
||||
'AudioReader',
|
||||
]
|
||||
|
||||
|
||||
class AudioReader:
    """Receive raw voice socket data, decrypt it and feed the routers.

    Owns the packet router, the sink event router, the transport decryptor,
    the speaking timer and the UDP keepalive, and starts/stops them together.
    ``callback`` is registered as a socket listener on the voice connection
    while the reader is active.
    """

    def __init__(self, sink: AudioSink, voice_client: VoiceRecvClient, *, after: Optional[AfterCB] = None):
        """
        Parameters
        ----------
        sink:
            The sink that receives the audio.
        voice_client:
            The voice client whose connection is being listened to.
        after:
            Optional callable invoked with ``self.error`` (possibly ``None``)
            once the reader has been stopped.

        Raises
        ------
        TypeError
            If ``after`` is given but is not callable.
        """
        if after is not None and not callable(after):
            raise TypeError('Expected a callable for the "after" parameter.')

        self.sink: AudioSink = sink
        self.voice_client: VoiceRecvClient = voice_client
        self.after: Optional[AfterCB] = after

        # No need for the whole set_sink() call
        self.sink._voice_client = voice_client

        self.active: bool = False
        self.error: Optional[Exception] = None
        self.packet_router: PacketRouter = PacketRouter(sink, self)
        self.event_router: SinkEventRouter = SinkEventRouter(sink, self)
        self.decryptor: PacketDecryptor = PacketDecryptor(voice_client.mode, bytes(voice_client.secret_key))
        self.speaking_timer: SpeakingTimer = SpeakingTimer(self)
        self.keepalive: UDPKeepAlive = UDPKeepAlive(voice_client)

    def is_listening(self) -> bool:
        """Return whether the reader is currently started."""
        return self.active

    def update_secret_key(self, secret_key: bytes) -> None:
        """Forward a new transport secret key to the decryptor."""
        self.decryptor.update_secret_key(secret_key)

    def start(self) -> None:
        """Start the worker threads and begin listening on the socket; no-op if already active."""
        if self.active:
            log.debug('Reader is already started', exc_info=True)
            return

        self.speaking_timer.start()
        self.event_router.start()
        self.packet_router.start()
        self.voice_client._connection.add_socket_listener(self.callback)
        self.keepalive.start()
        self.active = True

    def stop(self) -> None:
        """Stop listening; the actual teardown runs on a separate thread (``_stop``)."""
        if not self.active:
            log.debug('Tried to stop an inactive reader', exc_info=True)
            return

        self.voice_client._connection.remove_socket_listener(self.callback)
        self.active = False
        self.speaking_timer.notify()

        threading.Thread(target=self._stop, name=f'audioreader-stopper-{id(self):x}').start()

    def _stop(self) -> None:
        """Stop the routers and helper threads, call ``after`` and clean up every sink."""
        try:
            self.packet_router.stop()
        except Exception as e:
            self.error = e
            log.exception('Error stopping packet router')

        try:
            self.event_router.stop()
        except Exception as e:
            self.error = e
            log.exception('Error stopping event router')

        self.speaking_timer.stop()
        self.keepalive.stop()

        if self.after:
            try:
                self.after(self.error)
            except Exception:
                log.exception('Error calling listener after function')

        for sink in self.sink.root.walk_children(with_self=True):
            try:
                sink.cleanup()
            except Exception:
                log.exception('Error calling cleanup() for %s', sink)

    def set_sink(self, sink: AudioSink) -> AudioSink:
        """Sets the new sink for the reader and returns the old one.
        Does not call cleanup()
        """
        # This whole function is potentially very racy
        old_sink = self.sink
        old_sink._voice_client = None
        sink._voice_client = self.voice_client
        self.packet_router.set_sink(sink)
        self.sink = sink

        return old_sink

    def _is_ip_discovery_packet(self, data: bytes) -> bool:
        return len(data) == 74 and data[1] == 0x02

    def _maybe_dave_decrypt(self, rtp_packet) -> Optional[bytes]:
        """DAVE E2E layer applied after transport decrypt.

        Returns the (possibly DAVE-unwrapped) opus payload, or None to drop the
        packet. No-op when DAVE is inactive — non-DAVE rooms and environments
        without `davey` installed pass through unchanged.

        NOTE: `is_silence()` is NOT checked here. In a DAVE-active room the
        transport-decrypted payload is ciphertext, so `is_silence()` (which
        compares to plaintext OPUS_SILENCE ``b'\\xf8\\xff\\xfe'``) never matches.
        Silence frames are handled either by davey.decrypt returning plaintext
        silence (then caught at the existing post-decrypt silence check on
        ``decrypted_data``), or dropped via the decrypt-raises path. The
        existing post-decrypt silence check continues to work because we
        overwrite ``decrypted_data`` in place.
        """
        if not _HAS_DAVE:
            return rtp_packet.decrypted_data
        conn = self.voice_client._connection
        if getattr(conn, 'dave_protocol_version', 0) == 0:
            return rtp_packet.decrypted_data
        dave = getattr(conn, 'dave_session', None)
        if dave is None or not dave.ready:
            return rtp_packet.decrypted_data
        user_id = self.voice_client._ssrc_to_id.get(rtp_packet.ssrc)
        if user_id is None:
            # ACCEPTED REGRESSION: davey requires per-user key. When SPEAKING
            # event races behind the first audio packet, we drop 1-5 packets
            # (~40-200ms) per new speaker per session.
            return None
        # can_passthrough(user_id) mirrors Discord's protocol: when this user's
        # decryptor is in passthrough mode, packets are not DAVE-wrapped and
        # must be returned as-is. Otherwise davey.decrypt unwraps DAVE E2E.
        try:
            if dave.can_passthrough(user_id):
                return rtp_packet.decrypted_data
        except Exception as e:
            # A failing passthrough check is only logged; decryption is still attempted.
            log.debug("can_passthrough check failed for ssrc=%s user=%s: %s: %s",
                      rtp_packet.ssrc, user_id, type(e).__name__, e)
        try:
            return dave.decrypt(user_id, _MEDIA_TYPE_AUDIO, rtp_packet.decrypted_data)
        except Exception as e:
            log.debug("DAVE decrypt failed for ssrc=%s user=%s: %s: %s",
                      rtp_packet.ssrc, user_id, type(e).__name__, e)
            return None

    def callback(self, packet_data: bytes) -> None:
        """Socket listener: decode and decrypt one datagram, then route it.

        RTCP packets go to ``packet_router.feed_rtcp``; RTP packets refresh
        the speaking timer and go to ``packet_router.feed_rtp``.  Packets that
        cannot be decrypted are dropped.  An error while feeding an RTP packet
        is stored in ``self.error`` and stops the reader.
        """
        packet = rtp_packet = rtcp_packet = None
        try:
            if not rtp.is_rtcp(packet_data):
                packet = rtp_packet = rtp.decode_rtp(packet_data)
                packet.decrypted_data = self.decryptor.decrypt_rtp(packet)
                # Echo Core +echo.dave1: DAVE E2E layer (no-op when inactive).
                dave_payload = self._maybe_dave_decrypt(rtp_packet)
                if dave_payload is None:
                    return  # drop packet, do not feed_rtp; reader thread stays alive
                rtp_packet.decrypted_data = dave_payload
            else:
                packet = rtcp_packet = rtp.decode_rtcp(self.decryptor.decrypt_rtcp(packet_data))

                if not isinstance(packet, rtp.ReceiverReportPacket):
                    log.info("Received unexpected rtcp packet: type=%s, %s", packet.type, type(packet))
                    log.debug("Packet info:\n  packet=%s\n  data=%s", packet, packet_data)
        except CryptoError:
            log.error("CryptoError decoding packet data")
            # The secret key is deliberately NOT logged: together with the
            # captured packet data it would let anyone with log access
            # decrypt the transport layer.
            log.debug("CryptoError details:\n  data=%s", packet_data)
            return
        except Exception:
            if self._is_ip_discovery_packet(packet_data):
                log.debug("Ignoring ip discovery packet")
                return

            log.exception("Error unpacking packet")
            log.debug("Packet data: len=%s data=%s", len(packet_data), packet_data)
        finally:
            # Runs on every exit path of the try block above, including the
            # early returns.
            if self.error:
                self.stop()
                return
            if not packet:
                return

        if rtcp_packet:
            self.packet_router.feed_rtcp(rtcp_packet)
        elif rtp_packet:
            ssrc = rtp_packet.ssrc

            if ssrc not in self.voice_client._ssrc_to_id:
                if rtp_packet.is_silence():
                    # TODO: buffer packets from unknown ssrcs, 50 max?
                    #       also remove this log later its pointless
                    log.debug("Skipping silence packet for unknown ssrc %s", ssrc)
                    return
                else:
                    log.info("Received packet for unknown ssrc %s:\n%s", ssrc, rtp_packet)

            self.speaking_timer.notify(ssrc)
            try:
                self.packet_router.feed_rtp(rtp_packet)
            except Exception as e:
                log.exception('Error processing rtp packet')
                self.error = e
                self.stop()
|
||||
|
||||
|
||||
class PacketDecryptor:
    """Transport-layer decryption of voice RTP and RTCP data.

    The routines to use are looked up by name from the encryption mode
    (``_decrypt_rtp_<mode>`` / ``_decrypt_rtcp_<mode>``) and exposed as
    ``decrypt_rtp`` and ``decrypt_rtcp``.
    """

    supported_modes: list[SupportedModes] = [
        'aead_xchacha20_poly1305_rtpsize',
        'xsalsa20_poly1305_lite',
        'xsalsa20_poly1305_suffix',
        'xsalsa20_poly1305',
    ]

    def __init__(self, mode: SupportedModes, secret_key: bytes) -> None:
        """Bind the decrypt routines for ``mode``.

        Raises ``NotImplementedError(mode)`` when no routines exist for it.
        """
        self.mode: SupportedModes = mode
        try:
            self.decrypt_rtp: DecryptRTP = getattr(self, '_decrypt_rtp_' + mode)
            self.decrypt_rtcp: DecryptRTCP = getattr(self, '_decrypt_rtcp_' + mode)
        except AttributeError as e:
            raise NotImplementedError(mode) from e

        self.box: EncryptionBox = self._make_box(secret_key)

    def _make_box(self, secret_key: bytes) -> EncryptionBox:
        # "aead*" modes use the AEAD construction, everything else a SecretBox.
        box_type = nacl.secret.Aead if self.mode.startswith("aead") else nacl.secret.SecretBox
        return box_type(secret_key)

    def update_secret_key(self, secret_key: bytes) -> None:
        """Rebuild the box around a new secret key."""
        self.box = self._make_box(secret_key)

    @staticmethod
    def _nonce24(prefix, width: int) -> bytes:
        """Return a zeroed 24-byte buffer whose first ``width`` bytes are replaced by ``prefix``."""
        buf = bytearray(24)
        buf[:width] = prefix
        return bytes(buf)

    @staticmethod
    def _strip_extension(packet: RTPPacket, plaintext: bytes) -> bytes:
        """For extended packets, let the packet parse its extension headers and cut them off."""
        if not packet.extended:
            return plaintext
        offset = packet.update_ext_headers(plaintext)
        return plaintext[offset:]

    def _decrypt_rtp_xsalsa20_poly1305(self, packet: RTPPacket) -> bytes:
        # Nonce: the packet header in the first 12 bytes.
        nonce = self._nonce24(packet.header, 12)
        plaintext = self.box.decrypt(bytes(packet.data), nonce)
        return self._strip_extension(packet, plaintext)

    def _decrypt_rtcp_xsalsa20_poly1305(self, data: bytes) -> bytes:
        # Nonce: the first 8 bytes of the datagram, which stay unencrypted.
        nonce = self._nonce24(data[:8], 8)
        plaintext = self.box.decrypt(data[8:], nonce)
        return data[:8] + plaintext

    def _decrypt_rtp_xsalsa20_poly1305_suffix(self, packet: RTPPacket) -> bytes:
        # Nonce: the trailing 24 bytes of the payload.
        payload = packet.data
        nonce, ciphertext = payload[-24:], payload[:-24]
        plaintext = self.box.decrypt(bytes(ciphertext), bytes(nonce))
        return self._strip_extension(packet, plaintext)

    def _decrypt_rtcp_xsalsa20_poly1305_suffix(self, data: bytes) -> bytes:
        nonce = data[-24:]
        header = data[:8]
        plaintext = self.box.decrypt(data[8:-24], nonce)
        return header + plaintext

    def _decrypt_rtp_xsalsa20_poly1305_lite(self, packet: RTPPacket) -> bytes:
        # Nonce: the trailing 4 bytes of the payload, zero padded.
        nonce = self._nonce24(packet.data[-4:], 4)
        ciphertext = packet.data[:-4]
        plaintext = self.box.decrypt(bytes(ciphertext), nonce)
        return self._strip_extension(packet, plaintext)

    def _decrypt_rtcp_xsalsa20_poly1305_lite(self, data: bytes) -> bytes:
        nonce = self._nonce24(data[-4:], 4)
        header = data[:8]
        plaintext = self.box.decrypt(data[8:-4], nonce)
        return header + plaintext

    def _decrypt_rtp_aead_xchacha20_poly1305_rtpsize(self, packet: RTPPacket) -> bytes:
        # Must run first: header, data and nonce are read from the packet afterwards.
        packet.adjust_rtpsize()

        nonce = self._nonce24(packet.nonce, 4)
        ciphertext = packet.data

        # Blob vomit
        assert isinstance(self.box, nacl.secret.Aead)
        # The packet header is passed as the additional authenticated data.
        plaintext = self.box.decrypt(bytes(ciphertext), bytes(packet.header), nonce)
        return self._strip_extension(packet, plaintext)

    def _decrypt_rtcp_aead_xchacha20_poly1305_rtpsize(self, data: bytes) -> bytes:
        nonce = self._nonce24(data[-4:], 4)
        header = data[:8]

        assert isinstance(self.box, nacl.secret.Aead)
        plaintext = self.box.decrypt(data[8:-4], bytes(header), nonce)
        return header + plaintext
|
||||
|
||||
|
||||
class SpeakingTimer(threading.Thread):
    """Daemon thread that turns packet arrival times into speaking start/stop events.

    ``notify(ssrc)`` marks an ssrc as speaking and records the time; the
    thread dispatches ``voice_member_speaking_stop`` once an ssrc has not been
    notified for ``speaking_timeout_delay`` seconds.
    """

    def __init__(self, reader: AudioReader):
        super().__init__(daemon=True, name=f'speaking-timer-{id(self):x}')

        self.reader: AudioReader = reader
        self.voice_client = reader.voice_client
        self.speaking_timeout_delay: float = 0.2
        # ssrc -> currently considered speaking?
        self.last_speaking_state: Dict[int, bool] = {}
        # ssrc -> perf_counter() value of the most recent notify()
        self.speaking_cache: Dict[int, float] = {}
        self.speaking_timer_event: threading.Event = threading.Event()
        self._end_thread: threading.Event = threading.Event()

    def _lookup_member(self, ssrc: int) -> Optional[Member]:
        user_id = self.voice_client._get_id_from_ssrc(ssrc)
        if not user_id:
            return None
        return self.voice_client.guild.get_member(user_id)

    def maybe_dispatch_speaking_start(self, ssrc: int) -> None:
        """Dispatch a speaking-start event unless the ssrc was heard within the timeout window."""
        last_heard = self.speaking_cache.get(ssrc)
        if last_heard is not None and last_heard + self.speaking_timeout_delay >= time.perf_counter():
            return
        self.dispatch('voice_member_speaking_start', ssrc)

    def dispatch(self, event: SpeakingEvent, ssrc: int) -> None:
        """Send ``event`` to the sink for the member behind ``ssrc``; skipped if no member is found."""
        member = self._lookup_member(ssrc)
        if member:
            self.voice_client.dispatch_sink(event, member)

    def notify(self, ssrc: Optional[int] = None) -> None:
        """Record activity for ``ssrc`` (if given) and wake the timer thread."""
        if ssrc is not None:
            self.last_speaking_state[ssrc] = True
            # Must run before the timestamp below is refreshed, since it
            # compares against the previous one.
            self.maybe_dispatch_speaking_start(ssrc)
            self.speaking_cache[ssrc] = time.perf_counter()

        wakeup = self.speaking_timer_event
        wakeup.set()
        wakeup.clear()

    def drop_ssrc(self, ssrc: int) -> None:
        """Forget ``ssrc``, dispatching a speaking-stop event first if it was speaking."""
        self.speaking_cache.pop(ssrc, None)
        was_speaking = self.last_speaking_state.pop(ssrc, None)
        if was_speaking:
            self.dispatch('voice_member_speaking_stop', ssrc)
        self.notify()

    def get_speaking(self, ssrc: int) -> Optional[bool]:
        """Return the last known speaking state of ``ssrc`` (``None`` if unknown)."""
        return self.last_speaking_state.get(ssrc)

    def stop(self) -> None:
        """Ask the thread to finish and wake it up."""
        self._end_thread.set()
        self.notify()

    def run(self) -> None:
        by_time = itemgetter(1)

        def oldest_speaking():
            # Entries ordered by last-heard time; only speaking ssrcs qualify.
            ordered = sorted(self.speaking_cache.items(), key=by_time)
            return next(
                ((ssrc, tlast) for ssrc, tlast in ordered if self.last_speaking_state.get(ssrc)),
                (None, None),
            )

        self.speaking_timer_event.wait()
        while not self._end_thread.is_set():
            if not self.speaking_cache:
                self.speaking_timer_event.wait()

            tnow = time.perf_counter()
            ssrc, tlast = oldest_speaking()

            # no ssrc has been speaking, nothing to timeout
            if ssrc is None or tlast is None:
                self.speaking_timer_event.wait()
                continue

            deadline = tlast + self.speaking_timeout_delay
            self.speaking_timer_event.wait(deadline - tnow)

            # Woken early: re-evaluate from scratch.
            if time.perf_counter() < deadline:
                continue

            self.dispatch('voice_member_speaking_stop', ssrc)
            self.last_speaking_state[ssrc] = False
|
||||
|
||||
|
||||
# TODO: unify into a single thread that does all keepalives
|
||||
class UDPKeepAlive(threading.Thread):
    """Daemon thread that periodically sends a counter packet over the voice UDP socket.

    Each packet is the current counter encoded as 8 big-endian bytes; the
    counter is incremented after every successful send.
    """

    # Interval between keepalives, in milliseconds.
    # NOTE(review): the unit is inferred from the value. The original passed
    # it to time.sleep() unchanged, i.e. slept 5000 seconds between packets.
    delay: int = 5000

    def __init__(self, voice_client: VoiceRecvClient):
        super().__init__(daemon=True, name=f"voice-udp-keepalive-{id(self):x}")

        self.voice_client: VoiceRecvClient = voice_client

        self.last_time: float = 0
        self.counter: int = 0
        self._end_thread: threading.Event = threading.Event()

    def run(self) -> None:
        """Send keepalives until ``stop()`` is called or the client stays disconnected."""
        self.voice_client.wait_until_connected()

        while not self._end_thread.is_set():
            vc = self.voice_client
            try:
                packet = self.counter.to_bytes(8, 'big')
            except OverflowError:
                # Counter no longer fits in 8 bytes: wrap around and retry.
                self.counter = 0
                continue

            try:
                vc._connection.socket.sendto(packet, (vc._connection.endpoint_ip, vc._connection.voice_port))
            except Exception as e:
                log.debug("Error sending keepalive to socket: %s: %s", e.__class__.__name__, e)
                # TODO: test connection interruptions
                vc.wait_until_connected()
                if vc.is_connected():
                    continue
                break
            else:
                self.counter += 1

            # Wait on the stop event rather than time.sleep() so that stop()
            # ends the thread promptly; Event.wait() takes seconds.
            self._end_thread.wait(self.delay / 1000)

    def stop(self) -> None:
        """Ask the thread to finish; wakes it if it is waiting between keepalives."""
        self._end_thread.set()
|
||||
203
vendor/discord-ext-voice-recv/discord/ext/voice_recv/router.py
vendored
Normal file
203
vendor/discord-ext-voice-recv/discord/ext/voice_recv/router.py
vendored
Normal file
@@ -0,0 +1,203 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import queue
|
||||
import logging
|
||||
import threading
|
||||
|
||||
from collections import deque
|
||||
|
||||
from .utils import MultiDataEvent
|
||||
from .opus import PacketDecoder
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import Tuple, Dict, List, Callable, Any, Optional
|
||||
from .rtp import RTPPacket, RTCPPacket
|
||||
from .sinks import AudioSink
|
||||
from .reader import AudioReader
|
||||
|
||||
EventCB = Callable[..., Any]
|
||||
EventData = Tuple[str, Tuple[Any, ...], Dict[str, Any]]
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PacketRouter(threading.Thread):
    """Daemon thread that hands incoming packets to per-ssrc decoders and writes their output to the sink."""

    def __init__(self, sink: AudioSink, reader: AudioReader):
        super().__init__(daemon=True, name=f"packet-router-{id(self):x}")

        self.sink: AudioSink = sink
        self.decoders: Dict[int, PacketDecoder] = {}
        self.reader: AudioReader = reader
        # Decoders register themselves here while they have data to pop.
        self.waiter: MultiDataEvent[PacketDecoder] = MultiDataEvent()

        self._lock: threading.RLock = threading.RLock()
        self._end_thread: threading.Event = threading.Event()
        # Most recently destroyed ssrcs (at most 16); their packets are ignored.
        self._dropped_ssrcs: deque[int] = deque(maxlen=16)

    def feed_rtp(self, packet: RTPPacket) -> None:
        """Queue an RTP packet on the decoder for its ssrc, unless that ssrc was dropped."""
        # TODO: stale packet check

        source = packet.ssrc
        if source in self._dropped_ssrcs:
            log.debug("Ignoring packet from dropped ssrc %s", source)
            return

        with self._lock:
            decoder = self.get_decoder(source)
            if decoder is not None:
                decoder.push_packet(packet)

    def feed_rtcp(self, packet: RTCPPacket) -> None:
        """Dispatch an ``rtcp_packet`` event (with the guild, if known) through the event router."""
        voice_client = self.sink.voice_client
        guild = voice_client.guild if voice_client else None
        self.reader.event_router.dispatch('rtcp_packet', packet, guild)

    def get_decoder(self, ssrc: int) -> Optional[PacketDecoder]:
        """Return the decoder for ``ssrc``, creating it on first use."""
        with self._lock:
            existing = self.decoders.get(ssrc)
            if existing is not None:
                return existing

            created = self.decoders[ssrc] = PacketDecoder(self, ssrc)
            return created

    def set_sink(self, sink: AudioSink) -> None:
        with self._lock:
            self.sink = sink

    def set_user_id(self, ssrc: int, user_id: int) -> None:
        """Un-drop ``ssrc`` and tell its decoder (if one exists) which user it belongs to."""
        with self._lock:
            if ssrc in self._dropped_ssrcs:
                self._dropped_ssrcs.remove(ssrc)

            decoder = self.decoders.get(ssrc)
            if decoder is not None:
                decoder.set_user_id(user_id)

    def destroy_decoder(self, ssrc: int) -> None:
        """Remove and destroy the decoder for ``ssrc`` and remember the ssrc as dropped."""
        with self._lock:
            decoder = self.decoders.pop(ssrc, None)
            if decoder is None:
                return
            self._dropped_ssrcs.append(ssrc)
            decoder.destroy()

    def destroy_all_decoders(self) -> None:
        with self._lock:
            # Snapshot the keys: destroy_decoder() mutates the dict.
            for ssrc in tuple(self.decoders):
                self.destroy_decoder(ssrc)

    def stop(self) -> None:
        """Ask the thread to finish and wake it up."""
        self._end_thread.set()
        self.waiter.notify()

    def run(self) -> None:
        try:
            self._do_run()
        except Exception as e:
            log.exception("Error in %s loop", self)
            self.reader.error = e
        finally:
            # Whether the loop ended normally or by error, listening stops.
            self.reader.voice_client.stop_listening()
            self.waiter.clear()

    def _do_run(self) -> None:
        while not self._end_thread.is_set():
            self.waiter.wait()
            with self._lock:
                for ready in self.waiter.items:
                    voice_data = ready.pop_data()
                    if voice_data is None:
                        continue
                    self.sink.write(voice_data.source, voice_data)
|
||||
|
||||
|
||||
class SinkEventRouter(threading.Thread):
    """Daemon thread that relays queued events to sink listener methods.

    Events are queued by :meth:`dispatch` and delivered from this thread to
    the methods listed in ``__sink_listeners__`` of the sink and of every sink
    returned by ``sink.walk_children()``.
    """

    def __init__(self, sink: AudioSink, reader: AudioReader):
        super().__init__(daemon=True, name=f"sink-event-router-{id(self):x}")

        self.sink: AudioSink = sink
        self.reader: AudioReader = reader

        # listener name (e.g. "on_rtcp_packet") -> bound listener callables
        self._event_listeners: Dict[str, List[EventCB]] = {}
        # pending (event, args, kwargs) tuples awaiting delivery
        self._buffer: queue.SimpleQueue[EventData] = queue.SimpleQueue()
        self._lock = threading.RLock()
        self._end_thread: threading.Event = threading.Event()

        self.register_events()

    def dispatch(self, event: str, /, *args: Any, **kwargs: Any) -> None:
        """Queue ``event`` with its arguments for delivery on the router thread."""
        log.debug("Dispatching voice_client event %s", event)
        self._buffer.put_nowait((event, args, kwargs))

    def set_sink(self, sink: AudioSink) -> None:
        """Swap the routed sink, moving listener registrations to the new one."""
        with self._lock:
            self.unregister_events()
            self.sink = sink
            self.register_events()

    def register_events(self) -> None:
        """Register the listeners of the current sink and all of its children."""
        with self._lock:
            self._register_listeners(self.sink)
            for descendant in self.sink.walk_children():
                self._register_listeners(descendant)

    def unregister_events(self) -> None:
        """Remove the listeners of the current sink and all of its children."""
        with self._lock:
            self._unregister_listeners(self.sink)
            for descendant in self.sink.walk_children():
                self._unregister_listeners(descendant)

    def _register_listeners(self, sink: AudioSink) -> None:
        """Add every ``(event name, method name)`` pair declared by ``sink``."""
        log.debug("Registering events for %s: %s ", sink, sink.__sink_listeners__)

        for name, method_name in sink.__sink_listeners__:
            bound = getattr(sink, method_name)

            log.debug("Registering event: %r, func: %r", name, method_name)
            self._event_listeners.setdefault(name, []).append(bound)

    def _unregister_listeners(self, sink: AudioSink):
        """Remove the listeners declared by ``sink``; unknown ones are ignored."""
        for name, method_name in sink.__sink_listeners__:
            bound = getattr(sink, method_name)

            registered = self._event_listeners.get(name)
            if registered is None:
                continue

            try:
                registered.remove(bound)
            except ValueError:
                # this listener was never registered (or was already removed)
                pass

    def _dispatch_to_listeners(self, event: str, *args: Any, **kwargs: Any) -> None:
        """Call every listener registered under ``on_<event>``.

        A failing listener is logged and does not stop the remaining ones.
        """
        listeners = self._event_listeners.get(f'on_{event}', [])
        for listener in listeners:
            try:
                listener(*args, **kwargs)
            except Exception:
                log.exception("Unhandled exception dispatching voice listener event %r", event)
                log.debug("event=%r, args=%r, kwargs=%r, listener=%r", event, args, kwargs, listener)

    def stop(self) -> None:
        """Ask the delivery loop to exit; it notices within its poll timeout."""
        self._end_thread.set()

    def run(self) -> None:
        """Thread entry point; on failure, record the error and stop listening."""
        try:
            self._do_run()
        except Exception as exc:
            log.exception("Error in %s", self.name)
            self.reader.error = exc
            self.reader.voice_client.stop_listening()

    def _do_run(self) -> None:
        """Deliver queued events until :meth:`stop` is called."""
        while not self._end_thread.is_set():
            try:
                # Short timeout so the end flag is re-checked regularly.
                event, args, kwargs = self._buffer.get(timeout=0.5)
            except queue.Empty:
                continue

            # Both this router's lock and the packet router's lock are held
            # for the duration of the listener calls.
            # NOTE(review): presumably this keeps listeners from running
            # concurrently with packet processing - confirm.
            with self._lock, self.reader.packet_router._lock:
                self._dispatch_to_listeners(event, *args, **kwargs)
|
||||
471
vendor/discord-ext-voice-recv/discord/ext/voice_recv/rtp.py
vendored
Normal file
471
vendor/discord-ext-voice-recv/discord/ext/voice_recv/rtp.py
vendored
Normal file
@@ -0,0 +1,471 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import struct
|
||||
import logging
|
||||
|
||||
from math import ceil
|
||||
from collections import namedtuple
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import Optional, Literal, Union, Final, Dict, Any, Tuple
|
||||
|
||||
AudioPacket = Union['RTPPacket', 'FakePacket', 'SilencePacket']
|
||||
RealPacket = Union['RTPPacket', 'RTCPPacket']
|
||||
Packet = Union[RealPacket, 'FakePacket', 'SilencePacket']
|
||||
|
||||
PacketTypes = Union[
|
||||
'SenderReportPacket',
|
||||
'ReceiverReportPacket',
|
||||
'SDESPacket',
|
||||
'BYEPacket',
|
||||
'APPPacket',
|
||||
]
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
__all__ = [
|
||||
'RTPPacket',
|
||||
'RTCPPacket',
|
||||
'FakePacket',
|
||||
'SilencePacket',
|
||||
'ExtensionID',
|
||||
]
|
||||
|
||||
OPUS_SILENCE: Final = b'\xf8\xff\xfe'
|
||||
|
||||
|
||||
class ExtensionID:
    """Named integer ids used as keys for RTP header extension elements."""

    audio_power: Final = 1
    speaking_state: Final = 9
|
||||
|
||||
|
||||
def decode(data: bytes) -> RealPacket:
    """Creates an :class:`RTPPacket` or an :class:`RTCPPacket`.

    Parameters
    -----------
    data : bytes
        The raw packet data.

    Raises
    -------
    ValueError
        The two version bits of the first byte are not ``2``.
    """

    # The second byte decides the packet class: values present in
    # ``_rtcp_map`` (200-204) select an RTCP packet type, anything else is
    # treated as RTP.  Technically unreliable, but discord RTP packets
    # should never carry 200-204 in that position.

    first_byte = data[0]
    if first_byte >> 6 != 2:
        raise ValueError(f'Invalid packet header 0b{first_byte:0>8b}')

    packet_cls = _rtcp_map.get(data[1], RTPPacket)
    return packet_cls(data)
|
||||
|
||||
|
||||
def decode_rtp(data: bytes) -> RTPPacket:
    """Typed convenience wrapper around :func:`decode` for RTP data.

    No extra validation is done; the result is whatever :func:`decode` returns.
    """
    packet = decode(data)
    return packet  # type: ignore
|
||||
|
||||
|
||||
def decode_rtcp(data: bytes) -> RTCPPacket:
    """Typed convenience wrapper around :func:`decode` for RTCP data.

    No extra validation is done; the result is whatever :func:`decode` returns.
    """
    packet = decode(data)
    return packet  # type: ignore
|
||||
|
||||
|
||||
def is_rtcp(data: bytes) -> bool:
    """Return True when the second byte of ``data`` is an RTCP type (200-204)."""
    packet_type = data[1]
    return packet_type in range(200, 205)
|
||||
|
||||
|
||||
def _parse_low(x: int, bitlen: int = 32) -> float:
    """Scale integer ``x`` down by ``2**bitlen``, returning a float fraction."""
    scale = 2.0**bitlen
    return x / scale
|
||||
|
||||
|
||||
def _into_low(x: float, bitlen: int = 32) -> int:
    """Scale ``x`` up by ``2**bitlen`` and truncate to int (inverse of ``_parse_low``)."""
    scaled = x * 2.0**bitlen
    return int(scaled)
|
||||
|
||||
|
||||
class _PacketCmpMixin:
    """Comparison helpers shared by packet classes.

    Ordering is only defined between packets of the same ssrc; ``<`` and
    ``>`` raise ``TypeError`` otherwise, while ``==`` simply returns False.
    """

    __slots__ = ('ssrc', 'sequence', 'timestamp')

    def __lt__(self, other: _PacketCmpMixin) -> bool:
        """True when both sequence and timestamp are lower than ``other``'s."""
        if self.ssrc != other.ssrc:
            raise TypeError("packet ssrc mismatch (%s, %s)" % (self.ssrc, other.ssrc))
        if not self.sequence < other.sequence:
            return False
        return self.timestamp < other.timestamp

    def __gt__(self, other: _PacketCmpMixin) -> bool:
        """True when sequence or timestamp is higher than ``other``'s."""
        if self.ssrc != other.ssrc:
            raise TypeError("packet ssrc mismatch (%s, %s)" % (self.ssrc, other.ssrc))
        if self.sequence > other.sequence:
            return True
        return self.timestamp > other.timestamp

    def __eq__(self, other: _PacketCmpMixin) -> bool:
        """True when ssrc, sequence and timestamp all match."""
        if self.ssrc != other.ssrc:
            return False
        same_sequence = self.sequence == other.sequence
        return same_sequence and self.timestamp == other.timestamp

    def is_silence(self) -> bool:
        """True when ``decrypted_data`` (if present) equals ``OPUS_SILENCE``."""
        payload = getattr(self, 'decrypted_data', None)
        return payload == OPUS_SILENCE
|
||||
|
||||
|
||||
class FakePacket(_PacketCmpMixin):
    """Placeholder packet carrying only ssrc, sequence and timestamp.

    Instances are always falsy and expose empty ``decrypted_data`` and
    ``extension_data`` through class-level defaults.
    """

    __slots__ = ('ssrc', 'sequence', 'timestamp')
    decrypted_data: bytes = b''
    # class-level default shared by every instance
    extension_data: dict = {}

    def __init__(self, ssrc: int, sequence: int, timestamp: int):
        self.ssrc: int = ssrc
        self.sequence: int = sequence
        self.timestamp: int = timestamp

    def __repr__(self) -> str:
        return '<FakePacket ssrc={}, sequence={}, timestamp={}>'.format(
            self.ssrc, self.sequence, self.timestamp
        )

    def __bool__(self) -> Literal[False]:
        return False
|
||||
|
||||
|
||||
class SilencePacket(_PacketCmpMixin):
    """Synthetic packet whose payload is the opus silence frame.

    ``sequence`` is a class-level ``-1`` and :meth:`is_silence` is always True.
    """

    __slots__ = ('ssrc', 'timestamp')
    decrypted_data: Final = OPUS_SILENCE
    # class-level default shared by every instance
    extension_data: Final[Dict[int, Any]] = {}
    sequence: int = -1

    def __init__(self, ssrc: int, timestamp: int):
        self.ssrc: int = ssrc
        self.timestamp: int = timestamp

    def __repr__(self) -> str:
        return '<SilencePacket ssrc={}, timestamp={}>'.format(self.ssrc, self.timestamp)

    def is_silence(self) -> bool:
        return True
|
||||
|
||||
|
||||
class RTPPacket(_PacketCmpMixin):
    """An RTP packet parsed from raw bytes.

    The constructor splits the fixed 12 byte header (plus any CSRC list) from
    the remaining data.  ``decrypted_data`` starts as ``None``; the extension
    fields are filled in by :meth:`update_ext_headers`.
    """

    __slots__ = (
        'version',
        'padding',
        'extended',
        'cc',
        'marker',
        'payload',
        'sequence',
        'timestamp',
        'ssrc',
        'csrcs',
        'header',
        'data',
        'decrypted_data',
        'nonce',
        'extension',
        'extension_data',
        '_rtpsize',
    )

    # skips the first two header bytes, then sequence (H), timestamp (I), ssrc (I)
    _hstruct = struct.Struct('>xxHII')
    _ext_header = namedtuple("Extension", 'profile length values')
    _ext_magic = b'\xbe\xde'

    def __init__(self, data: bytes):
        data = bytearray(data)  # type: ignore
        first_byte, second_byte = data[0], data[1]

        # fmt: off
        self.version: int   = first_byte >> 6
        self.padding: bool  = bool(first_byte & 0b00100000)
        self.extended: bool = bool(first_byte & 0b00010000)
        self.cc: int        = first_byte & 0b00001111

        self.marker: bool   = bool(second_byte & 0b10000000)
        self.payload: int   = second_byte & 0b01111111
        # fmt: on

        self.sequence, self.timestamp, self.ssrc = self._hstruct.unpack_from(data)

        self.csrcs: Tuple[int, ...] = ()
        self.extension = None
        self.extension_data: Dict[int, bytes] = {}

        self.header = data[:12]
        self.data = data[12:]
        self.decrypted_data: Optional[bytes] = None

        self.nonce: bytes = b''
        self._rtpsize: bool = False

        if self.cc:
            # the CSRC list follows the fixed header: ``cc`` 32-bit ids
            csrc_fmt = '>%sI' % self.cc
            payload_start = struct.calcsize(csrc_fmt) + 12
            self.csrcs = struct.unpack(csrc_fmt, data[12:payload_start])
            self.data = data[payload_start:]

        # TODO?: impl padding calculations (though discord doesn't seem to use that bit)

    def adjust_rtpsize(self):
        """Adjusts the packet header and data based on the rtpsize format."""

        self._rtpsize = True
        # the last four bytes of the data are the nonce
        self.nonce = self.data[-4:]

        if self.extended:
            # rtpsize based formats are laid out similarly to SRTP packets, which
            # includes the ext header now; the nonce also needs to be removed
            # from the end
            self.header += self.data[:4]
            self.data = self.data[4:-4]
        else:
            self.data = self.data[:-4]

    def update_ext_headers(self, data: bytes) -> int:
        """Adds extended header data to this packet, returns payload offset"""

        if not self.extended:
            return 0

        # rtpsize formats have the extension header in the rtp header instead of payload
        if self._rtpsize:
            data = self.header[-4:] + data

        # data is the decrypted packet payload containing the extension header and opus data
        profile, length = struct.unpack_from('>2sH', data)

        if profile == self._ext_magic:
            self._parse_bede_header(data, length)

        # ``length`` counts 32-bit words following the 4 byte extension header
        ext_end = 4 + length * 4
        values = struct.unpack('>%sI' % length, data[4:ext_end])
        self.extension = self._ext_header(profile, length, values)

        if self._rtpsize:
            # remove the extra offset from adding the header in
            return ext_end - 4
        return ext_end

    # https://www.rfcreader.com/#rfc5285_line186
    def _parse_bede_header(self, data: bytes, length: int) -> None:
        """Collect one-byte-header extension elements into ``extension_data``.

        Parsing starts after the 4 byte extension header and stops once
        ``length`` elements have been stored; single null bytes are skipped.
        """
        cursor = 4
        parsed = 0

        while parsed < length:
            lead = data[cursor : cursor + 1]

            if lead == b'\x00':
                # padding byte
                cursor += 1
                continue

            header = struct.unpack('>B', lead)[0]

            # high nibble: element id, low nibble: element length minus one
            element_id = header >> 4
            element_len = 1 + (header & 0b0000_1111)

            value_start = cursor + 1
            self.extension_data[element_id] = data[value_start : value_start + element_len]
            cursor = value_start + element_len
            parsed += 1

    def _dump_info(self) -> str:
        """Return a debug string listing every slot and its current value."""
        parts = ['{}={}, '.format(slot, getattr(self, slot)) for slot in self.__slots__]
        return "<RTPPacket " + ''.join(parts) + '>'

    def __repr__(self) -> str:
        return '<RTPPacket ssrc={}, sequence={}, timestamp={}, size={}, ext={}>'.format(
            self.ssrc,
            self.sequence,
            self.timestamp,
            len(self.data),
            set(self.extension_data),
        )
|
||||
|
||||
|
||||
# http://www.rfcreader.com/#rfc3550_line855
|
||||
class RTCPPacket:
    """Base class for RTCP packets; parses the common 4 byte header.

    The header's first byte holds the version (2 bits), the padding flag
    (1 bit) and a 5 bit field whose meaning depends on the packet type.
    Subclasses name that field by listing it FIRST in their own
    ``__slots__`` (``report_count``, ``source_count``, ``subtype``).
    """

    __slots__ = ('version', 'padding', 'length')
    _header = struct.Struct('>BBH')
    _ssrc_fmt = struct.Struct('>I')
    type = None

    def __init__(self, data: bytes):
        self.length: int
        head, _, self.length = self._header.unpack_from(data)
        self.version: int = head >> 6
        self.padding: bool = bool(head & 0b00100000)

        # dubious, yet devious: store the 5 bit field in the first slot the
        # concrete subclass declared.  When the class has no slots of its own
        # the lookup resolves to this base class' 'version' slot; skip the
        # assignment then so the parsed version is not overwritten.
        count_slot = self.__slots__[0]
        if count_slot != RTCPPacket.__slots__[0]:
            setattr(self, count_slot, head & 0b00011111)

    def __repr__(self) -> str:
        content = ', '.join("{}: {}".format(k, getattr(self, k, None)) for k in self.__slots__)
        return "<{} {}>".format(self.__class__.__name__, content)

    @classmethod
    def from_data(cls, data: bytes) -> PacketTypes:
        """Build the packet class registered in ``_rtcp_map`` for the type byte.

        Raises ``KeyError`` when the packet type is not in ``_rtcp_map``.
        """
        _, ptype, _ = cls._header.unpack_from(data)
        return _rtcp_map[ptype](data)
|
||||
|
||||
|
||||
# TODO?: consider moving repeated code to a ReportPacket type
|
||||
# http://www.rfcreader.com/#rfc3550_line1614
|
||||
class SenderReportPacket(RTCPPacket):
    """RTCP sender report (type 200).

    Layout as parsed here: 4 byte header, sender ssrc at offset 4, 20 bytes
    of sender info at offset 8, then ``report_count`` report blocks of 24
    bytes each starting at offset 28.  Trailing bytes are kept in ``extension``.
    """

    __slots__ = ('report_count', 'ssrc', 'info', 'reports', 'extension')
    _info_fmt = struct.Struct('>5I')
    _report_fmt = struct.Struct('>IB3x4I')
    _24bit_int_fmt = struct.Struct('>4xI')
    _info = namedtuple('RRSenderInfo', 'ntp_ts rtp_ts packet_count octet_count')
    _report = namedtuple("RReport", 'ssrc perc_loss total_lost last_seq jitter lsr dlsr')
    type = 200

    def __init__(self, data):
        super().__init__(data)
        self.ssrc = self._ssrc_fmt.unpack_from(data, 4)[0]
        self.info = self._read_sender_info(data, 8)

        block_count = self.report_count
        self.reports = tuple(self._read_report(data, 28 + 24 * index) for index in range(block_count))

        # anything after the last report block is kept verbatim
        blocks_end = 28 + 24 * block_count
        self.extension = data[blocks_end:] if len(data) > blocks_end else None

    def _read_sender_info(self, data, offset):
        """Read the sender info; the two NTP words are merged into one float."""
        ntp_high, ntp_low, rtp_ts, packet_count, octet_count = self._info_fmt.unpack_from(data, offset)
        ntp_ts = ntp_high + _parse_low(ntp_low)
        return self._info(ntp_ts, rtp_ts, packet_count, octet_count)

    def _read_report(self, data, offset):
        """Read one 24 byte report block starting at ``offset``."""
        ssrc, fraction_lost, last_seq, jitter, lsr, dlsr = self._report_fmt.unpack_from(data, offset)
        # low 24 bits of the second word: cumulative number of packets lost
        total_lost = self._24bit_int_fmt.unpack_from(data, offset)[0] & 0xFFFFFF
        return self._report(ssrc, fraction_lost, total_lost, last_seq, jitter, lsr, dlsr)
|
||||
|
||||
|
||||
# http://www.rfcreader.com/#rfc3550_line1879
|
||||
class ReceiverReportPacket(RTCPPacket):
    """RTCP receiver report (type 201).

    Layout as parsed here: 4 byte header, reporter ssrc at offset 4, then
    ``report_count`` report blocks of 24 bytes each starting at offset 8.
    Trailing bytes are kept in ``extension``.
    """

    __slots__ = ('report_count', 'ssrc', 'reports', 'extension')
    _report_fmt = struct.Struct('>IB3x4I')
    _24bit_int_fmt = struct.Struct('>4xI')
    _report = namedtuple("RReport", 'ssrc perc_loss total_lost last_seq jitter lsr dlsr')
    type = 201

    reports: Tuple[_report, ...]

    def __init__(self, data: bytes):
        super().__init__(data)
        self.ssrc: int = self._ssrc_fmt.unpack_from(data, 4)[0]

        block_count = self.report_count
        self.reports = tuple(self._read_report(data, 8 + 24 * index) for index in range(block_count))

        # anything after the last report block is kept verbatim
        blocks_end = 8 + 24 * block_count
        self.extension: Optional[bytes] = data[blocks_end:] if len(data) > blocks_end else None

    def _read_report(self, data: bytes, offset: int) -> _report:
        """Read one 24 byte report block starting at ``offset``."""
        ssrc, fraction_lost, last_seq, jitter, lsr, dlsr = self._report_fmt.unpack_from(data, offset)
        # low 24 bits of the second word: cumulative number of packets lost
        total_lost = self._24bit_int_fmt.unpack_from(data, offset)[0] & 0xFFFFFF
        return self._report(ssrc, fraction_lost, total_lost, last_seq, jitter, lsr, dlsr)
|
||||
|
||||
|
||||
# UNFORTUNATELY it seems discord only uses the above ~~two packet types~~ packet type.
|
||||
# Good thing I knew that when I made the rest of these. Haha yes.
|
||||
|
||||
|
||||
# http://www.rfcreader.com/#rfc3550_line2024
|
||||
class SDESPacket(RTCPPacket):
    """RTCP source description packet (type 202).

    The body is ``source_count`` chunks.  Each chunk is an ssrc followed by
    items (type byte, length byte, text) and is terminated by an END item,
    then null-padded to the next 32-bit boundary.
    """

    __slots__ = ('source_count', 'chunks', '_pos')
    _item_header = struct.Struct('>BB')
    _chunk = namedtuple("SDESChunk", 'ssrc items')
    _item = namedtuple("SDESItem", 'type size length text')
    type = 202

    def __init__(self, data):
        super().__init__(data)
        _chunks = []
        # read cursor into ``data``; starts right after the 4 byte header
        self._pos = 4

        for _ in range(self.source_count):
            _chunks.append(self._read_chunk(data))

        self.chunks = tuple(_chunks)

    def _read_chunk(self, data):
        """Read one chunk at the cursor and leave the cursor 4-byte aligned."""
        ssrc = self._ssrc_fmt.unpack_from(data, self._pos)[0]
        self._pos += 4

        # check for chunk with no items
        if data[self._pos : self._pos + 4] == b'\x00\x00\x00\x00':
            self._pos += 4
            return self._chunk(ssrc, ())

        items = [self._read_item(data)]

        # Read items until END type is found
        while items[-1].type != 0:
            items.append(self._read_item(data))

        # pad chunk to 4 bytes
        if self._pos % 4:
            self._pos = ceil(self._pos / 4) * 4

        return self._chunk(ssrc, items)

    def _read_item(self, data):
        """Read one item at the cursor and advance past it.

        The END item (type 0) is a single null octet with no length byte
        (RFC 3550 section 6.5), so only one byte is consumed for it.  Reading
        a two byte header there would swallow a padding byte or, when END is
        the last octet of a word, the first byte of the following chunk.
        """
        itype = struct.unpack_from('>B', data, self._pos)[0]
        if itype == 0:
            self._pos += 1
            return self._item(0, 1, 0, None)

        itype, ilen = self._item_header.unpack_from(data, self._pos)
        self._pos += 2
        text = None

        if ilen:
            text = data[self._pos : self._pos + ilen].decode()
            self._pos += ilen

        return self._item(itype, ilen + 2, ilen, text)

    def _get_chunk_size(self, chunk):
        """Return the ssrc size plus the summed item sizes (at least 4)."""
        return 4 + max(4, sum(i.size for i in chunk.items))  # + padding?
|
||||
|
||||
|
||||
# http://www.rfcreader.com/#rfc3550_line2311
|
||||
class BYEPacket(RTCPPacket):
    """RTCP goodbye packet (type 203).

    Holds the ``source_count`` ssrcs that are leaving and, when bytes follow
    the ssrc list, the length-prefixed reason text decoded to ``str``.
    """

    __slots__ = ('source_count', 'ssrcs', 'reason')
    type = 203

    def __init__(self, data):
        super().__init__(data)
        self.ssrcs = struct.unpack_from('>%sI' % self.source_count, data, 4)
        self.reason = None

        body_length = 4 + len(self.ssrcs) * 4
        if len(data) > body_length:
            # one length byte, then that many bytes of reason text
            extra_len = struct.unpack_from('B', data, body_length)[0]
            # unpack_from always returns a tuple; take its single element
            # before decoding
            (reason,) = struct.unpack_from('%ss' % extra_len, data, body_length + 1)
            self.reason = reason.decode()
|
||||
|
||||
|
||||
# http://www.rfcreader.com/#rfc3550_line2353
|
||||
class APPPacket(RTCPPacket):
    """RTCP application-defined packet (type 204).

    After the 4 byte header come the ssrc and a 4 byte ASCII name; the
    remaining bytes are stored untouched in ``data``.
    """

    __slots__ = ('subtype', 'ssrc', 'name', 'data')
    _packet_info = struct.Struct('>I4s')
    type = 204

    def __init__(self, data):
        super().__init__(data)
        ssrc, raw_name = self._packet_info.unpack_from(data, 4)
        self.ssrc = ssrc
        self.name = raw_name.decode('ascii')
        self.data = data[12:]  # should be a multiple of 32 bits but idc
|
||||
|
||||
|
||||
# RTCP packet type byte (200-204) -> class used to parse that packet type.
_rtcp_map = {
    packet_cls.type: packet_cls
    for packet_cls in (
        SenderReportPacket,
        ReceiverReportPacket,
        SDESPacket,
        BYEPacket,
        APPPacket,
    )
}
|
||||
152
vendor/discord-ext-voice-recv/discord/ext/voice_recv/silence.py
vendored
Normal file
152
vendor/discord-ext-voice-recv/discord/ext/voice_recv/silence.py
vendored
Normal file
@@ -0,0 +1,152 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
import logging
|
||||
import threading
|
||||
|
||||
from .opus import VoiceData
|
||||
from .rtp import SilencePacket
|
||||
|
||||
from discord.utils import MISSING
|
||||
from discord.opus import Decoder
|
||||
|
||||
from typing import TYPE_CHECKING, Tuple
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import Callable, Any, Dict, Optional, Final, Union
|
||||
from .rtp import AudioPacket
|
||||
from .types import MemberOrUser as User
|
||||
|
||||
SilenceGenFN = Callable[[Optional[User], VoiceData], Any]
|
||||
SSRCData = Tuple[float, Optional[User], AudioPacket]
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
__all__ = [
|
||||
'SilenceGenerator',
|
||||
]
|
||||
|
||||
SILENCE_PCM: Final = b'\0' * Decoder.FRAME_SIZE
|
||||
PACKET_INTERVAL: Final = 0.02
|
||||
|
||||
|
||||
class SilenceGenerator(threading.Thread):
    """Generates and sends silence packets.

    For every ssrc registered through :meth:`push`, ``callback`` is invoked
    with a silence :class:`VoiceData` whenever no newer packet for that ssrc
    arrived within ``PACKET_INTERVAL + grace_period`` seconds of the last one.
    """

    def __init__(self, callback: SilenceGenFN, *, grace_period: float = 0.015):
        super().__init__(daemon=True, name=f'silencegen-{id(self):x}')
        self.callback: SilenceGenFN = callback
        self.grace_period: float = grace_period

        self._ssrc_data: Dict[int, SSRCData] = {}  # {ssrc: (time, _, _)}
        self._last_timestamp: Dict[int, int] = {}  # {ssrc: timestamp}
        self._user_map_backup: Dict[int, int] = {}  # {id: ssrc}
        self._end: threading.Event = threading.Event()
        self._has_data: threading.Event = threading.Event()
        self._lock: threading.Lock = threading.Lock()

    def push(self, user: Optional[User], packet: AudioPacket) -> None:
        """Updates the last time a packet was received and from whom.
        Calling this function will start generating silence packets for `packet.ssrc`
        until `drop(ssrc)` or `stop()` is called.
        """

        with self._lock:
            self._ssrc_data[packet.ssrc] = (time.perf_counter(), user, packet)
            self._last_timestamp[packet.ssrc] = packet.timestamp

            if user:
                self._user_map_backup[user.id] = packet.ssrc

        self._has_data.set()

    def _get_next_info(self) -> SSRCData:
        """Return the entry with the oldest time; caller must hold the lock.

        Only the time is compared.  Comparing whole tuples would fall through
        to the user objects when two times tie and raise ``TypeError``.
        """
        return min(self._ssrc_data.values(), key=lambda entry: entry[0])

    def drop(self, *, ssrc: Optional[int] = None, user: User = MISSING) -> None:
        """Stop generating silence packets for `ssrc`, or whatever is cached for `user`
        if `ssrc` is None, if any.

        Unknown ssrcs and users are ignored, as is a call with neither argument.
        """

        has_user = user is not MISSING and user is not None

        with self._lock:
            if ssrc is None:
                if not has_user:
                    return  # nothing identifies what to drop
                ssrc = self._user_map_backup.pop(user.id, None)
                if ssrc is None:
                    return  # weird but ok

            self._last_timestamp.pop(ssrc, None)
            last_data = self._ssrc_data.pop(ssrc, None)
            if last_data is None and has_user:
                # Fall back to the ssrc cached for the user.  The entry may
                # already be gone (it is popped above when ssrc was None), so
                # a default is required to avoid a KeyError.
                backup_ssrc = self._user_map_backup.pop(user.id, None)
                if backup_ssrc is not None:
                    self._ssrc_data.pop(backup_ssrc, None)
                    self._last_timestamp.pop(backup_ssrc, None)

            if not self._ssrc_data:
                self._has_data.clear()

    def stop(self) -> None:
        """Stops generating silence for everything and clears the cache."""

        self._end.set()
        # wake the worker so it can observe the end flag
        self._has_data.set()

        with self._lock:
            self._ssrc_data.clear()
            self._user_map_backup.clear()
            self._last_timestamp.clear()
            self._has_data.clear()

        # join() raises RuntimeError for a thread that was never started and
        # for the calling thread itself; skip it in both cases.
        if self.is_alive() and threading.current_thread() is not self:
            self.join(1)

    def start(self) -> None:
        self._end.clear()
        super().start()

    def run(self) -> None:
        """Thread entry point; logs any exception that ends the loop."""
        try:
            self._do_run()
        except Exception:
            log.exception("Error in %s", self)

    def _do_run(self) -> None:
        """Emit silence for the stalest ssrc each cycle until stopped."""
        while not self._end.is_set():
            self._has_data.wait()
            if self._end.is_set():
                return

            with self._lock:
                if not self._ssrc_data:
                    # Everything was dropped between the wake-up and taking
                    # the lock; go back to waiting instead of calling min()
                    # on an empty mapping, which would end the thread.
                    self._has_data.clear()
                    continue

                tlast, user, packet = self._get_next_info()
                ssrc = packet.ssrc

                # prepare the object before the sleep as a little micro optimization
                next_packet = SilencePacket(
                    ssrc, self._last_timestamp.get(ssrc, packet.timestamp) + Decoder.SAMPLES_PER_FRAME
                )
                # TODO: check if destination wants opus or not
                next_data = VoiceData(next_packet, user, pcm=SILENCE_PCM)

            tnext = tlast + PACKET_INTERVAL
            tnow = time.perf_counter()
            # wait a little bit longer than when the next one should be
            # so we don't have to race with the next packet
            delay = tnext + self.grace_period - tnow

            if delay > 0:
                time.sleep(delay)

            with self._lock:
                tlast2, luser, lpacket = self._ssrc_data.get(ssrc, (-1, None, packet))

            if next_packet.ssrc != lpacket.ssrc or tlast != tlast2 or self._end.is_set():
                continue  # another packet came in and bumped up the time

            next_data.source = luser  # is there any point in doing this?
            self.callback(luser, next_data)

            with self._lock:
                # If there was no packet update during the sleep...
                if tlast == tlast2 and ssrc in self._ssrc_data:
                    # update the existing packet time for the next window
                    self._ssrc_data[ssrc] = (tlast + PACKET_INTERVAL, user, packet)
                    self._last_timestamp[ssrc] = (
                        self._last_timestamp.get(ssrc, packet.timestamp) + Decoder.SAMPLES_PER_FRAME
                    )
|
||||
634
vendor/discord-ext-voice-recv/discord/ext/voice_recv/sinks.py
vendored
Normal file
634
vendor/discord-ext-voice-recv/discord/ext/voice_recv/sinks.py
vendored
Normal file
@@ -0,0 +1,634 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import abc
|
||||
import time
|
||||
import wave
|
||||
import shlex
|
||||
import inspect
|
||||
import audioop
|
||||
import logging
|
||||
import threading
|
||||
import subprocess
|
||||
|
||||
from .opus import VoiceData
|
||||
from .silence import SilenceGenerator
|
||||
|
||||
import discord
|
||||
|
||||
from discord.utils import MISSING, SequenceProxy
|
||||
from discord.opus import Decoder as OpusDecoder
|
||||
|
||||
from typing import TYPE_CHECKING, overload
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import Callable, Optional, Any, IO, Sequence, Tuple, Generator, Union, Dict, List
|
||||
|
||||
from .rtp import AudioPacket, RTCPPacket
|
||||
from .voice_client import VoiceRecvClient
|
||||
from .opus import VoiceData
|
||||
from .types import MemberOrUser as User
|
||||
|
||||
BasicSinkWriteCB = Callable[[Optional[User], VoiceData], Any]
|
||||
BasicSinkWriteRTCPCB = Callable[[RTCPPacket], Any]
|
||||
ConditionalFilterFn = Callable[[Optional[User], VoiceData], bool]
|
||||
FFmpegErrorCB = Callable[['FFmpegSink', Exception, Optional[VoiceData]], Any]
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
__all__ = [
|
||||
'AudioSink',
|
||||
'MultiAudioSink',
|
||||
'BasicSink',
|
||||
'WaveSink',
|
||||
'FFmpegSink',
|
||||
'PCMVolumeTransformer',
|
||||
'ConditionalFilter',
|
||||
'TimedFilter',
|
||||
'UserFilter',
|
||||
'SilenceGeneratorSink',
|
||||
]
|
||||
|
||||
|
||||
# TODO: use this in more places
|
||||
class VoiceRecvException(discord.DiscordException):
    """Generic exception for voice recv related errors"""

    def __init__(self, message: str):
        # Pass the message up so str(exc), exc.args and tracebacks show it;
        # without this call they would be empty.
        super().__init__(message)
        self.message: str = message
|
||||
|
||||
|
||||
class SinkMeta(abc.ABCMeta):
    """Metaclass that collects listener methods into ``__sink_listeners__``.

    Every attribute flagged with ``__sink_listener__`` anywhere in the MRO
    contributes one ``(event name, method name)`` pair per entry of its
    ``__sink_listener_names__``.  An attribute redefined further down the
    hierarchy replaces the inherited one, and drops it entirely when the
    redefinition is not a listener.
    """

    __sink_listeners__: List[Tuple[str, str]]

    def __new__(cls, name: str, bases: Tuple[type, ...], attrs: Dict[str, Any], **kwargs):
        new_cls = super().__new__(cls, name, bases, attrs, **kwargs)
        found: Dict[str, Any] = {}

        # Walk from the most basic class to the new one so that later
        # definitions win.
        for klass in reversed(new_cls.__mro__):
            for attr_name, attr_value in klass.__dict__.items():
                # If it exists in a subclass, delete the higher level one
                found.pop(attr_name, None)

                candidate = attr_value.__func__ if isinstance(attr_value, staticmethod) else attr_value
                if hasattr(candidate, '__sink_listener__'):
                    found[attr_name] = candidate

        new_cls.__sink_listeners__ = [
            (event_name, func.__name__)
            for func in found.values()
            for event_name in func.__sink_listener_names__
        ]
        return new_cls
|
||||
|
||||
|
||||
class SinkABC(metaclass=SinkMeta):
    """Abstract interface that every sink has to implement."""

    __sink_listeners__: List[Tuple[str, str]]

    @property
    @abc.abstractmethod
    def root(self) -> AudioSink:
        """The topmost sink of the chain this sink belongs to."""
        raise NotImplementedError

    @property
    @abc.abstractmethod
    def parent(self) -> Optional[AudioSink]:
        """The sink this one is registered on, if any."""
        raise NotImplementedError

    @property
    @abc.abstractmethod
    def child(self) -> Optional[AudioSink]:
        """A single child sink, if any."""
        raise NotImplementedError

    @property
    @abc.abstractmethod
    def children(self) -> Sequence[AudioSink]:
        """All direct child sinks."""
        raise NotImplementedError

    @property
    @abc.abstractmethod
    def voice_client(self) -> Optional[VoiceRecvClient]:
        """The voice client associated with this sink, if any."""
        raise NotImplementedError

    # handling opus vs pcm is not strictly mutually exclusive
    # a sink could handle both but idk about that pattern
    @abc.abstractmethod
    def wants_opus(self) -> bool:
        """If sink handles opus data"""
        raise NotImplementedError

    @abc.abstractmethod
    def write(self, user: Optional[User], data: VoiceData):
        """Callback for when the sink receives data"""
        raise NotImplementedError

    @abc.abstractmethod
    def cleanup(self):
        """Release whatever the sink holds on to."""
        raise NotImplementedError

    @abc.abstractmethod
    def _register_child(self, child: AudioSink) -> None:
        """Attach ``child`` to this sink."""
        raise NotImplementedError
|
||||
|
||||
|
||||
class AudioSink(SinkABC):
    """Base sink with at most one child sink.

    Sinks form a chain through ``_parent`` / ``_child``.  ``voice_client`` is
    resolved through the parents, so only the root's ``_voice_client`` is read.
    """

    _voice_client: Optional[VoiceRecvClient]
    _parent: Optional[AudioSink] = None
    _child: Optional[AudioSink] = None

    def __init__(self, destination: Optional[AudioSink] = None, /):
        if destination is None:
            self._child = None
        else:
            self._register_child(destination)

    def __del__(self):
        self.cleanup()

    def _register_child(self, child: AudioSink) -> None:
        """Attach ``child`` below this sink.

        Raises ``RuntimeError`` when ``child`` is already somewhere below the root.
        """
        already_registered = child in self.root.walk_children()
        if already_registered:
            raise RuntimeError('Sink is already registered.')

        self._child = child
        child._parent = self

    @property
    def root(self) -> AudioSink:
        parent = self.parent
        return self if parent is None else parent.root

    @property
    def parent(self) -> Optional[AudioSink]:
        return self._parent

    @property
    def child(self) -> Optional[AudioSink]:
        return self._child

    @property
    def children(self) -> Sequence[AudioSink]:
        if self._child:
            return [self._child]
        return []

    @property
    def voice_client(self) -> Optional[VoiceRecvClient]:
        """Guaranteed to not be None inside write()"""

        parent = self.parent
        if parent is None:
            return self._voice_client
        return parent.voice_client

    @property
    def client(self) -> Optional[discord.Client]:
        """Guaranteed to not be None inside write()"""
        voice_client = self.voice_client
        return voice_client and voice_client.client

    def walk_children(self, *, with_self: bool = False) -> Generator[AudioSink, None, None]:
        """Returns a generator of all the children of this sink, recursively, depth first."""

        if with_self:
            yield self

        for direct_child in self.children:
            yield direct_child
            yield from direct_child.walk_children()

    @classmethod
    def listener(cls, name: str = MISSING):
        """Marks a function as an event listener.

        ``name`` is the event name to listen for and defaults to the function
        name.  Raises ``TypeError`` when ``name`` is not a string or when the
        decorated function is a coroutine function.
        """

        if name is not MISSING and not isinstance(name, str):
            raise TypeError(f'AudioSink.listener expected str but received {type(name).__name__} instead.')

        def decorator(func):
            target = func.__func__ if isinstance(func, staticmethod) else func

            if inspect.iscoroutinefunction(target):
                raise TypeError('Listener function must not be a coroutine function.')

            target.__sink_listener__ = True
            event_name = name or target.__name__

            # stacked decorators accumulate names on the same function
            known_names = getattr(target, '__sink_listener_names__', None)
            if known_names is None:
                target.__sink_listener_names__ = [event_name]
            else:
                known_names.append(event_name)

            return func

        return decorator
|
||||
|
||||
|
||||
class MultiAudioSink(AudioSink):
    """Sink with any number of child sinks.

    ``destinations`` are registered in order.  Registering a sink that is
    already part of the tree (including a duplicate within ``destinations``)
    raises ``RuntimeError``.
    """

    def __init__(self, destinations: Sequence[AudioSink], /):
        # Intentionally not calling super().__init__ here

        # Must exist before the first _register_child call: its duplicate
        # check walks self.children, which reads self._children.
        self._children: List[AudioSink] = []

        if destinations is not None:
            for dest in destinations:
                self._register_child(dest)

    def _register_child(self, child: AudioSink) -> None:
        """Attach ``child`` as an additional child of this sink."""
        if child in self.root.walk_children():
            raise RuntimeError('Sink is already registered.')

        child._parent = self
        self._children.append(child)

    @property
    def child(self) -> Optional[AudioSink]:
        """The first child, or None when there are no children."""
        return self._children[0] if self._children else None

    @property
    def children(self) -> Sequence[AudioSink]:
        """Read-only view of the children."""
        return SequenceProxy(self._children)
|
||||
|
||||
# TODO: add functions to add/remove children?
|
||||
|
||||
|
||||
class BasicSink(AudioSink):
    """Simple callback based sink.

    ``event`` is called for every write.  ``rtcp_event``, when given, is
    called for every rtcp packet event.  ``decode=False`` makes the sink ask
    for opus instead of decoded data.
    """

    def __init__(
        self,
        event: BasicSinkWriteCB,
        *,
        rtcp_event: Optional[BasicSinkWriteRTCPCB] = None,
        decode: bool = True,
    ):
        super().__init__()

        self.cb = event
        self.cb_rtcp = rtcp_event
        self.decode = decode

    def wants_opus(self) -> bool:
        wants_decoded = self.decode
        return not wants_decoded

    def write(self, user: Optional[User], data: VoiceData) -> None:
        self.cb(user, data)

    @AudioSink.listener()
    def on_rtcp_packet(self, packet: RTCPPacket, guild: discord.Guild) -> None:
        # the rtcp callback is optional
        if self.cb_rtcp:
            self.cb_rtcp(packet)

    def cleanup(self) -> None:
        # nothing to release
        pass
|
||||
|
||||
|
||||
class WaveSink(AudioSink):
    """Endpoint AudioSink that generates a wav file.

    The wav header is configured from the ``OpusDecoder`` constants below, and
    every write appends the PCM payload to the file regardless of which user
    it came from.

    Best used in conjunction with a silence generating sink. (TBD)
    """

    CHANNELS = OpusDecoder.CHANNELS
    SAMPLE_WIDTH = OpusDecoder.SAMPLE_SIZE // OpusDecoder.CHANNELS
    SAMPLING_RATE = OpusDecoder.SAMPLING_RATE

    def __init__(self, destination: wave._File):
        super().__init__()

        writer: wave.Wave_write = wave.open(destination, 'wb')
        writer.setnchannels(self.CHANNELS)
        writer.setsampwidth(self.SAMPLE_WIDTH)
        writer.setframerate(self.SAMPLING_RATE)
        self._file: wave.Wave_write = writer

    def wants_opus(self) -> bool:
        """A wav file stores PCM, so Opus is never requested."""
        return False

    def write(self, user: Optional[User], data: VoiceData) -> None:
        """Append the PCM frames of ``data`` to the wav file."""
        pcm = data.pcm
        self._file.writeframes(pcm)

    def cleanup(self) -> None:
        """Close the wav file; a failure is logged rather than raised."""
        try:
            self._file.close()
        except Exception:
            log.warning("WaveSink got error closing file on cleanup", exc_info=True)


WavSink = WaveSink
|
||||
|
||||
|
||||
class FFmpegSink(AudioSink):
    """AudioSink that feeds received PCM audio to an ffmpeg subprocess.

    ffmpeg is started with its stdin declared as raw ``s16le`` audio at
    48000 Hz with 2 channels.  The output target is ``filename``; when no
    filename is given it is ``pipe:1`` (ffmpeg's stdout).  When ``buffer`` is
    given, ffmpeg's stdout is piped and a daemon thread copies it into
    ``buffer``.

    Parameters
    ----------
    filename:
        Output target appended as the last ffmpeg argument.
    buffer:
        Writable binary stream that receives whatever ffmpeg writes to stdout.
    executable:
        The ffmpeg executable to run.
    stderr:
        Destination for ffmpeg's stderr.  A stream with a usable ``fileno()``
        is handed to the subprocess directly; any other stream is filled by a
        daemon reader thread.
    before_options:
        Extra arguments placed before the input specification (shlex-split).
    options:
        Extra arguments placed after the input specification (shlex-split).
    on_error:
        Called as ``on_error(sink, exception, data)`` after a failed write.
        The default stops listening on the sink's voice client.
    """

    @overload
    def __init__(
        self,
        *,
        filename: str,
        executable: str = 'ffmpeg',
        stderr: Optional[IO[bytes]] = None,
        before_options: Optional[str] = None,
        options: Optional[str] = None,
        on_error: Optional[FFmpegErrorCB] = None,
    ): ...

    @overload
    def __init__(
        self,
        *,
        buffer: IO[bytes],
        executable: str = 'ffmpeg',
        stderr: Optional[IO[bytes]] = None,
        before_options: Optional[str] = None,
        options: Optional[str] = None,
        on_error: Optional[FFmpegErrorCB] = None,
    ): ...

    def __init__(
        self,
        *,
        filename: str = MISSING,
        buffer: IO[bytes] = MISSING,
        executable: str = 'ffmpeg',
        stderr: Optional[IO[bytes]] = None,
        before_options: Optional[str] = None,
        options: Optional[str] = None,
        on_error: Optional[FFmpegErrorCB] = None,
    ):
        super().__init__()

        self.filename: str = filename or 'pipe:1'
        self.buffer: IO[bytes] = buffer
        self.on_error: FFmpegErrorCB = on_error or self._on_error

        args = [executable, '-hide_banner']
        subprocess_kwargs: Dict[str, Any] = {'stdin': subprocess.PIPE}
        if self.buffer is not MISSING:
            subprocess_kwargs['stdout'] = subprocess.PIPE

        piping_stderr = False
        if stderr is not None:
            try:
                stderr.fileno()
            except Exception:
                # No real file descriptor: pipe stderr and copy it in a thread.
                piping_stderr = True
                subprocess_kwargs['stderr'] = subprocess.PIPE
            else:
                # A real file descriptor can be given to the subprocess as-is.
                # Without this the caller's stderr stream was silently ignored.
                subprocess_kwargs['stderr'] = stderr

        if isinstance(before_options, str):
            args.extend(shlex.split(before_options))

        # fmt: off
        args.extend((
            '-f', 's16le',
            '-ar', '48000',
            '-ac', '2',
            '-i', 'pipe:0',
            '-loglevel', 'warning',
            '-blocksize', str(discord.FFmpegAudio.BLOCKSIZE)
        ))
        # fmt: on

        if isinstance(options, str):
            args.extend(shlex.split(options))

        args.append(self.filename)

        # Set the attribute before spawning so _kill_process always finds it,
        # even when spawning raises.
        self._process: subprocess.Popen = MISSING
        self._process = self._spawn_process(args, **subprocess_kwargs)

        self._stdin: IO[bytes] = self._process.stdin  # type: ignore
        self._stdout: Optional[IO[bytes]] = None
        self._stderr: Optional[IO[bytes]] = None
        self._stdout_reader_thread: Optional[threading.Thread] = None
        self._stderr_reader_thread: Optional[threading.Thread] = None

        if self.buffer:
            n = f'popen-stout-reader:pid-{self._process.pid}'
            self._stdout = self._process.stdout
            _args = (self._stdout, self.buffer)
            self._stdout_reader_thread = threading.Thread(target=self._pipe_reader, args=_args, daemon=True, name=n)
            self._stdout_reader_thread.start()

        if piping_stderr:
            n = f'popen-stderr-reader:pid-{self._process.pid}'
            self._stderr = self._process.stderr
            _args = (self._stderr, stderr)
            self._stderr_reader_thread = threading.Thread(target=self._pipe_reader, args=_args, daemon=True, name=n)
            self._stderr_reader_thread.start()

    @staticmethod
    def _on_error(_self: FFmpegSink, error: Exception, data: Optional[VoiceData]) -> None:
        """Default error callback: stop listening on the sink's voice client."""
        _self.voice_client.stop_listening()  # type: ignore

    def wants_opus(self) -> bool:
        """ffmpeg is fed raw PCM, so Opus is never requested."""
        return False

    def cleanup(self):
        """Terminate ffmpeg and drop the references to the process and its pipes."""
        self._kill_process()
        self._process = self._stdout = self._stdin = self._stderr = MISSING

    def write(self, user: Optional[User], data: VoiceData):
        """Write the audio of ``data`` to ffmpeg's stdin.

        Does nothing once the process is gone or stdin is closed.  On a write
        failure the process is killed and ``on_error`` is invoked.
        """
        if self._process and not self._stdin.closed:
            audio = data.opus if self.wants_opus() else data.pcm
            assert audio is not None
            try:
                self._stdin.write(audio)
            except Exception as e:
                log.exception('Error writing data to ffmpeg')
                self._kill_process()
                self.on_error(self, e, data)

    def _spawn_process(self, args: Any, **subprocess_kwargs: Any) -> subprocess.Popen:
        """Start ffmpeg, turning spawn failures into plain ``Exception``s."""
        log.debug('Spawning ffmpeg process with command: %s, kwargs: %s', args, subprocess_kwargs)
        process = None
        try:
            process = subprocess.Popen(args, creationflags=discord.player.CREATE_NO_WINDOW, **subprocess_kwargs)
        except FileNotFoundError:
            executable = args.partition(' ')[0] if isinstance(args, str) else args[0]
            raise Exception(executable + ' was not found.') from None
        except subprocess.SubprocessError as exc:
            raise Exception(f'Popen failed: {exc.__class__.__name__}: {exc}') from exc
        else:
            return process

    def _kill_process(self) -> None:
        """Close stdin, give ffmpeg up to 5 seconds to exit, then kill it."""
        # this function gets called in __del__ so instance attributes might not even exist
        proc: subprocess.Popen = getattr(self, '_process', MISSING)
        if proc is MISSING:
            return

        log.debug('Terminating ffmpeg process %s.', proc.pid)

        # Closing stdin signals end of input so ffmpeg can finish its output.
        try:
            self._stdin.close()
        except Exception:
            pass

        # TODO: extract wait time
        log.debug('Waiting for ffmpeg process %s for up to 5 seconds.', proc.pid)
        try:
            proc.wait(5)
        except Exception:
            pass

        try:
            proc.kill()
        except Exception:
            log.exception('Ignoring error attempting to kill ffmpeg process %s', proc.pid)

        if proc.poll() is None:
            log.info('ffmpeg process %s has not terminated. Waiting to terminate...', proc.pid)
            proc.communicate()
            log.info('ffmpeg process %s should have terminated with a return code of %s.', proc.pid, proc.returncode)
        else:
            log.info('ffmpeg process %s successfully terminated with return code of %s.', proc.pid, proc.returncode)

        self._process = MISSING

    def _pipe_reader(self, source: IO[bytes], dest: IO[bytes]) -> None:
        """Thread target: copy ``source`` into ``dest`` until EOF or an error.

        A failed write to ``dest`` kills the process and invokes ``on_error``.
        """
        while self._process:
            if source.closed:
                return
            try:
                data = source.read(discord.FFmpegAudio.BLOCKSIZE)
            except (OSError, ValueError) as e:
                log.debug('FFmpeg stdin pipe closed: %s', e)
                return
            except Exception:
                log.debug('Read error for %s, this is probably not a problem', self, exc_info=True)
                return
            # A blocking pipe returns b'' at EOF (None only for a non-blocking
            # stream with no data).  Stopping on any empty read keeps the loop
            # from spinning after ffmpeg closes its output.
            if not data:
                return
            try:
                dest.write(data)
            except Exception as e:
                log.exception('Write error for %s', self)
                self._kill_process()
                self.on_error(self, e, None)
                return
|
||||
|
||||
|
||||
class PCMVolumeTransformer(AudioSink):
    """AudioSink used to change the volume of PCM data, just like
    :class:`discord.PCMVolumeTransformer`.

    The scaled PCM replaces ``data.pcm`` before the data is passed on to the
    destination sink.
    """

    def __init__(self, destination: AudioSink, volume: float = 1.0):
        if not isinstance(destination, AudioSink):
            raise TypeError(f'expected AudioSink not {type(destination).__name__}')

        # Scaling works on PCM, so a destination that asks for Opus is rejected.
        if destination.wants_opus():
            raise VoiceRecvException('AudioSink must not request Opus encoding.')

        super().__init__(destination)

        self.destination: AudioSink = destination
        self._volume: float = volume

    def wants_opus(self) -> bool:
        """Always PCM."""
        return False

    @property
    def volume(self) -> float:
        """Retrieves or sets the volume as a floating point percentage (e.g. 1.0 for 100%)."""
        return self._volume

    @volume.setter
    def volume(self, value: float):
        # Values below zero are stored as 0.0.
        clamped = max(value, 0.0)
        self._volume = clamped

    def write(self, user: Optional[User], data: VoiceData) -> None:
        """Scale the PCM (factor capped at 2.0) and forward it to the destination."""
        factor = min(self._volume, 2.0)
        data.pcm = audioop.mul(data.pcm, 2, factor)
        self.destination.write(user, data)

    def cleanup(self) -> None:
        """Nothing to release."""
        pass
|
||||
|
||||
|
||||
class ConditionalFilter(AudioSink):
    """AudioSink that only passes data on when a predicate function approves it.

    The predicate is called as ``predicate(user, data)``; a truthy result
    forwards the write to ``destination``.
    """

    def __init__(self, destination: AudioSink, predicate: ConditionalFilterFn):
        super().__init__(destination)

        self.destination: AudioSink = destination
        self.predicate: ConditionalFilterFn = predicate

    def wants_opus(self) -> bool:
        """Defer to whatever the destination wants."""
        return self.destination.wants_opus()

    def write(self, user: Optional[User], data: VoiceData) -> None:
        """Forward ``data`` to the destination unless the predicate rejects it."""
        if not self.predicate(user, data):
            return
        self.destination.write(user, data)

    def cleanup(self) -> None:
        """Drop the stored predicate."""
        del self.predicate
|
||||
|
||||
|
||||
class UserFilter(ConditionalFilter):
    """ConditionalFilter that only lets through data written for one given user."""

    def __init__(self, destination: AudioSink, user: User):
        super().__init__(destination, self._predicate)
        self.user: User = user

    def _predicate(self, user: Optional[User], data: VoiceData) -> bool:
        """Accept the write only when it comes from the configured user."""
        is_target = user == self.user
        return is_target
|
||||
|
||||
|
||||
class TimedFilter(ConditionalFilter):
    """ConditionalFilter that passes data for ``duration`` seconds after its start time.

    With ``start_on_init`` the start time is taken at construction; otherwise
    it is taken when the first write arrives.
    """

    def __init__(self, destination: AudioSink, duration: float, *, start_on_init: bool = False):
        super().__init__(destination, self.predicate)
        self.duration: float = duration
        self.start_time: Optional[float]

        if not start_on_init:
            self.start_time = None
            # The first write goes through _write_once, which starts the timer.
            self.write = self._write_once
        else:
            self.start_time = self.get_time()

    def _write_once(self, user: Optional[User], data: VoiceData):
        """Handle the first write: start the timer, then restore the regular write."""
        regular_write = super().write
        self.start_time = self.get_time()
        regular_write(user, data)
        self.write = regular_write

    def predicate(self, user: Optional[User], data: VoiceData) -> bool:
        """True while the timer has started and fewer than ``duration`` seconds have elapsed."""
        if self.start_time is None:
            return False
        elapsed = self.get_time() - self.start_time
        return elapsed < self.duration

    def get_time(self) -> float:
        """Function to generate a timestamp. Defaults to `time.perf_counter()`.
        Can be overridden.
        """
        return time.perf_counter()
|
||||
|
||||
|
||||
class SilenceGeneratorSink(AudioSink):
    """Generates intermittent silence packets during transmission downtime.

    A ``SilenceGenerator`` is given the destination's ``write`` and started on
    construction; every packet written here is pushed to it before the data is
    forwarded to the destination.
    """

    def __init__(self, destination: AudioSink):
        super().__init__(destination)

        self.destination: AudioSink = destination
        generator = SilenceGenerator(self.destination.write)
        self.silencegen: SilenceGenerator = generator
        generator.start()

    def wants_opus(self) -> bool:
        """Defer to whatever the destination wants."""
        return self.destination.wants_opus()

    def write(self, user: Optional[User], data: VoiceData) -> None:
        """Push the packet to the silence generator, then forward the data."""
        packet = data.packet
        self.silencegen.push(user, packet)
        self.destination.write(user, data)

    @AudioSink.listener()
    def on_voice_member_disconnect(self, member: discord.Member, ssrc: Optional[int]) -> None:
        """Drop the disconnected member from the silence generator."""
        self.silencegen.drop(ssrc=ssrc, user=member)

    def cleanup(self) -> None:
        """Stop the silence generator."""
        self.silencegen.stop()
|
||||
59
vendor/discord-ext-voice-recv/discord/ext/voice_recv/types.py
vendored
Normal file
59
vendor/discord-ext-voice-recv/discord/ext/voice_recv/types.py
vendored
Normal file
@@ -0,0 +1,59 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, List, Literal, Optional, TypedDict
|
||||
|
||||
from discord.types.snowflake import Snowflake
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import Union
|
||||
import discord
|
||||
|
||||
MemberOrUser = Union[discord.Member, discord.User]
|
||||
|
||||
ResolutionTypes = Literal['fixed', 'source']
|
||||
StreamTypes = Literal['audio', 'video', 'screen', 'test'] # only video appears to be used
|
||||
|
||||
|
||||
class VideoResolution(TypedDict):
    """Typed dict for a video resolution (the ``max_resolution`` value of ``VideoStream``)."""

    height: int
    width: int
    # One of the ``ResolutionTypes`` literals: 'fixed' or 'source'.
    type: ResolutionTypes
|
||||
|
||||
|
||||
class VideoStream(TypedDict):
    """Typed dict for one entry of ``VoiceVideoPayload['streams']``."""

    # One of the ``StreamTypes`` literals.
    type: StreamTypes
    active: bool
    max_bitrate: int
    max_framerate: int
    max_resolution: VideoResolution
    quality: int
    rid: int
    # NOTE(review): presumably the SSRC of the retransmission stream -- confirm.
    rtx_ssrc: int
    ssrc: int
|
||||
|
||||
|
||||
class VoiceVideoPayload(TypedDict):
    """Typed dict pairing a user's audio and video SSRCs with their video streams."""

    audio_ssrc: int
    video_ssrc: int
    user_id: Snowflake
    streams: list[VideoStream]
|
||||
|
||||
|
||||
class VoiceClientConnectPayload(TypedDict):
    """Typed dict carrying a list of user ids.

    NOTE(review): presumably the users that connected to voice -- confirm.
    """

    user_ids: List[Snowflake]
|
||||
|
||||
|
||||
class VoiceClientDisconnectPayload(TypedDict):
    """Typed dict carrying a single user id.

    NOTE(review): presumably the user that disconnected from voice -- confirm.
    """

    user_id: Snowflake
|
||||
|
||||
|
||||
class VoiceFlagsPayload(TypedDict):
    """Typed dict carrying an optional integer ``flags`` value for a user."""

    # May be None.
    flags: Optional[int]
    user_id: Snowflake
|
||||
|
||||
|
||||
class VoicePlatformPayload(TypedDict):
    """Typed dict carrying an optional ``platform`` code for a user."""

    # One of 0, 1, 2, 3, or None.  The meaning of each code is not defined here.
    platform: Optional[Literal[0, 1, 2, 3]]
    user_id: Snowflake
|
||||
205
vendor/discord-ext-voice-recv/discord/ext/voice_recv/utils.py
vendored
Normal file
205
vendor/discord-ext-voice-recv/discord/ext/voice_recv/utils.py
vendored
Normal file
@@ -0,0 +1,205 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
import threading
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
from typing import TYPE_CHECKING, Generic, TypeVar
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import Callable, Sequence
|
||||
|
||||
TimeFunc = Callable[[], float]
|
||||
|
||||
_dataT = TypeVar("_dataT")
|
||||
|
||||
|
||||
def gap_wrapped(a: int, b: int, *, wrap: int = 65536) -> int:
    """
    Returns the number of values strictly between ``a`` and ``b`` when counting
    up from ``a``, accounting for unsigned integer wraparound at ``wrap``.
    """
    distance = b - (a + 1)
    return (distance + wrap) % wrap
|
||||
|
||||
|
||||
def add_wrapped(a: int, b: int, *, wrap: int = 65536) -> int:
    """
    Returns ``a + b`` reduced modulo ``wrap``, i.e. the sum with unsigned
    integer wraparound applied.
    """
    total = a + b
    return total % wrap
|
||||
|
||||
|
||||
# May not even be needed if i dont use the dict subclasses
|
||||
class Bidict(dict):
    """A bi-directional dict.

    Every ``key -> value`` pair is stored together with its reverse
    ``value -> key`` in the same underlying dict, so a lookup works from
    either side.  Setting a pair first removes any existing pair involving
    either member.  A self-mapped pair (``key == value``) occupies a single
    entry.
    """

    # Sentinel for "no default given" in pop(), since None is a valid default.
    _None = object()

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Add the reverse mapping for everything the dict constructor stored.
        super().update({v: k for k, v in self.items()})

    def __setitem__(self, key, value):
        # Delete related mappings
        # if we have 1 <-> 2 and we set 2 <-> 3, 2 is now unrelated to 1

        if key in self:
            del self[key]
        if value in self:
            del self[value]

        super().__setitem__(key, value)
        super().__setitem__(value, key)

    def __delitem__(self, key):
        """Delete ``key`` together with its reverse entry."""
        value = super().__getitem__(key)
        super().__delitem__(value)

        # A self-mapped pair has only one entry, which is already gone.
        if key == value:
            return

        super().__delitem__(key)

    def to_dict(self):
        """Return a plain ``dict`` copy holding both directions."""
        return super().copy()

    def pop(self, k, d=_None):
        """Remove ``k`` and its reverse entry and return the value for ``k``.

        Returns ``d`` when ``k`` is missing and a default was given; raises
        ``KeyError`` otherwise.
        """
        try:
            v = super().pop(k)
            super().pop(v, d)
            return v
        except KeyError:
            if d is not self._None:
                return d
            raise

    def popitem(self):
        """Remove and return a ``(key, value)`` pair along with its reverse entry."""
        item = super().popitem()
        # For a self-mapped pair the single entry was just removed, so the
        # reverse delete must tolerate a missing key instead of raising.
        super().pop(item[1], None)
        return item

    def setdefault(self, k, d=None):
        """Return the value for ``k``.

        When ``k`` is missing, ``k <-> d`` is stored and ``d`` returned, unless
        ``d`` is already part of another pair, in which case nothing is stored
        and ``d`` is returned as-is.
        """
        try:
            return self[k]
        except KeyError:
            if d in self:
                return d

            self[k] = d
            return d

    def update(self, *args, **F):
        """Store every pair from a mapping or an iterable of pairs, then from the keyword arguments."""
        try:
            E = args[0]
            if callable(getattr(E, 'keys', None)):
                for k in E:
                    self[k] = E[k]
            else:
                for k, v in E:
                    self[k] = v
        except IndexError:
            # No positional argument was given.
            pass
        finally:
            for k in F:
                self[k] = F[k]

    def copy(self):
        """Return a new instance of the same class with the same pairs."""
        return self.__class__(super().copy())

    # incompatible
    # https://docs.python.org/3/library/exceptions.html#NotImplementedError, Note 1
    fromkeys = None  # type: ignore
|
||||
|
||||
|
||||
class Defaultdict(defaultdict):
    """``defaultdict`` variant whose factory receives the missing key as its argument."""

    def __missing__(self, key):
        factory = self.default_factory
        if factory is None:
            raise KeyError((key,))

        # Build the value from the key, store it, and hand it back.
        value = factory(key)  # type: ignore
        self[key] = value
        return value
|
||||
|
||||
|
||||
class LoopTimer:
    """Keeps a loop on a fixed schedule of one iteration every ``delay`` seconds.

    ``start()`` records the start time, ``mark()`` counts a finished iteration
    and ``sleep()`` blocks for whatever time remains until the next one.
    ``timefunc`` supplies the clock used for the schedule.
    """

    def __init__(self, delay: float, *, timefunc: TimeFunc = time.perf_counter):
        self._delay: float = delay
        self._time: TimeFunc = timefunc
        self._start: float = 0
        self._loops: int = 0

    @property
    def delay(self) -> float:
        return self._delay

    @property
    def loops(self) -> int:
        return self._loops

    @property
    def start_time(self) -> float:
        return self._start

    @property
    def remaining_time(self) -> float:
        """Seconds from now until one ``delay`` past the current iteration's scheduled time.

        Negative when that moment has already passed.
        """
        scheduled = self._start + self._delay * self._loops
        behind_or_ahead = scheduled - self._time()
        return self._delay + behind_or_ahead

    def start(self) -> None:
        """Reset the iteration count and record the current time as the start."""
        self._loops = 0
        self._start = self._time()

    def mark(self) -> None:
        """Count one completed iteration."""
        self._loops += 1

    def sleep(self) -> None:
        """Sleep for the remaining time, or not at all when already late."""
        pause = max(0, self.remaining_time)
        time.sleep(pause)
|
||||
|
||||
|
||||
class MultiDataEvent(Generic[_dataT]):
    """
    Something like the inverse of a Condition. A 1-waiting-on-N type of object,
    with accompanying data object for convenience.

    Items are registered while they have something ready; the internal event
    is set while at least one item is registered.
    """

    def __init__(self):
        self._items: list[_dataT] = []
        self._ready: threading.Event = threading.Event()

    @property
    def items(self) -> list[_dataT]:
        """A shallow copy of the currently ready objects."""
        return self._items.copy()

    def is_ready(self) -> bool:
        return self._ready.is_set()

    def _check_ready(self) -> None:
        """Bring the event in line with whether any items are registered."""
        if not self._items:
            self._ready.clear()
            return
        self._ready.set()

    def notify(self) -> None:
        """Set the event, then re-sync it with the registered items."""
        self._ready.set()
        self._check_ready()

    def wait(self, timeout: float | None = None) -> bool:
        """Block until an item is registered or ``timeout`` elapses; returns the event state."""
        self._check_ready()
        return self._ready.wait(timeout)

    def register(self, item: _dataT) -> None:
        """Add ``item`` and mark the event as ready."""
        self._items.append(item)
        self._ready.set()

    def unregister(self, item: _dataT) -> None:
        """Remove ``item`` if present, then re-sync the event."""
        try:
            self._items.remove(item)
        except ValueError:
            # Not registered; nothing to remove.
            pass
        self._check_ready()

    def clear(self) -> None:
        """Forget every item and clear the event."""
        self._items.clear()
        self._ready.clear()
|
||||
95
vendor/discord-ext-voice-recv/discord/ext/voice_recv/video.py
vendored
Normal file
95
vendor/discord-ext-voice-recv/discord/ext/voice_recv/video.py
vendored
Normal file
@@ -0,0 +1,95 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .types import (
|
||||
VoiceVideoPayload,
|
||||
VideoStream as VideoStreamPayload,
|
||||
VideoResolution as VideoResolutionPayload,
|
||||
)
|
||||
from .voice_client import VoiceRecvClient
|
||||
|
||||
__all__ = [
|
||||
'VoiceVideoStreams',
|
||||
]
|
||||
|
||||
|
||||
class VoiceVideoStreams:
    """A member's audio/video SSRCs and video streams, built from a ``VoiceVideoPayload``.

    ``member`` is looked up in the voice client's guild from the payload's
    ``user_id``.
    """

    __slots__ = (
        'audio_ssrc',
        'video_ssrc',
        'member',
        'streams',
    )

    def __init__(self, *, data: VoiceVideoPayload, vc: VoiceRecvClient):
        self.audio_ssrc = data['audio_ssrc']
        self.video_ssrc = data['video_ssrc']
        member_id = int(data['user_id'])
        self.member = vc.guild.get_member(member_id)
        self.streams = self._get_streams(data['streams'])

    def __repr__(self) -> str:
        return f"<VoiceVideoStreams member={self.member!s} streams={self._minify_streams()}>"

    def _get_streams(self, data: list[VideoStreamPayload]) -> list[VideoStreamInfo]:
        """Wrap each raw stream payload in a ``VideoStreamInfo``."""
        wrapped = []
        for stream in data:
            wrapped.append(VideoStreamInfo(data=stream))
        return wrapped

    def _minify_streams(self) -> str:
        """Compact listing of each stream's rid and active flag, used by ``__repr__``."""
        streams = []
        for s in self.streams:
            streams.append(f"<rid={s.rid} active={s.active}>")
        return f"[{', '.join(streams)}]"
|
||||
|
||||
|
||||
class VideoStreamInfo:
    """Attribute-style view of a single ``VideoStream`` payload.

    ``type`` falls back to ``'video'`` and ``max_bitrate`` to ``0`` when the
    payload lacks them; every other key is required.
    """

    __slots__ = (
        'type',
        'active',
        'max_bitrate',
        'max_framerate',
        'max_resolution',
        'quality',
        'rid',
        'rtx_ssrc',
        'ssrc',
    )

    def __init__(self, *, data: VideoStreamPayload):
        self.type: str = data.get('type', 'video')
        self.active = data['active']
        self.max_bitrate = data.get('max_bitrate', 0)
        self.max_framerate = data['max_framerate']
        resolution = data['max_resolution']
        self.max_resolution = VideoStreamResolution(resolution)
        self.quality = data['quality']
        self.rid = data['rid']
        self.rtx_ssrc = data['rtx_ssrc']
        self.ssrc = data['ssrc']

    def __repr__(self) -> str:
        # Only this subset of attributes is shown, in this order.
        shown = (
            ('ssrc', self.ssrc),
            ('active', self.active),
            ('quality', self.quality),
            ('max_bitrate', self.max_bitrate),
            ('max_framerate', self.max_framerate),
            ('max_resolution', self.max_resolution),
        )
        inner = ' '.join('%s=%r' % (label, value) for label, value in shown)
        return f'<{self.__class__.__name__} {inner}>'
|
||||
|
||||
|
||||
class VideoStreamResolution:
    """Attribute-style view of a ``VideoResolution`` payload (height, width, type)."""

    __slots__ = (
        'height',
        'width',
        'type',
    )

    def __init__(self, data: VideoResolutionPayload):
        # Copy the three required keys, in the original order.
        for field in ('height', 'width', 'type'):
            setattr(self, field, data[field])

    def __repr__(self) -> str:
        return f"<VideoStreamResolution width={self.width!r} height={self.height!r} type={self.type!r}>"
|
||||
196
vendor/discord-ext-voice-recv/discord/ext/voice_recv/voice_client.py
vendored
Normal file
196
vendor/discord-ext-voice-recv/discord/ext/voice_recv/voice_client.py
vendored
Normal file
@@ -0,0 +1,196 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
import discord
|
||||
from discord.voice_state import VoiceConnectionState
|
||||
from discord.utils import MISSING
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from .gateway import hook
|
||||
from .reader import AudioReader
|
||||
from .sinks import AudioSink
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import Optional, Dict, Any, Union
|
||||
from discord.ext.commands._types import CoroFunc
|
||||
from .reader import AfterCB
|
||||
|
||||
from pprint import pformat
|
||||
|
||||
__all__ = [
|
||||
'VoiceRecvClient',
|
||||
]
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class VoiceRecvClient(discord.VoiceClient):
    """``discord.VoiceClient`` subclass that can also receive audio into an ``AudioSink``.

    Receiving is started with :meth:`listen` and stopped with
    :meth:`stop_listening`.  While no audio is being received the internal
    reader is ``MISSING``, so every access to it is guarded.
    """

    endpoint_ip: str
    voice_port: int

    def __init__(self, client: discord.Client, channel: discord.abc.Connectable):
        super().__init__(client, channel)

        self._reader: AudioReader = MISSING
        # Two-way mapping between voice SSRCs and user ids.
        self._ssrc_to_id: Dict[int, int] = {}
        self._id_to_ssrc: Dict[int, int] = {}
        # Coroutine listeners keyed by event name (e.g. "on_<event>").
        self._event_listeners: Dict[str, list] = {}

    def create_connection_state(self) -> VoiceConnectionState:
        """Create the connection state with this extension's gateway hook installed."""
        return VoiceConnectionState(self, hook=hook)

    async def on_voice_state_update(self, data) -> None:
        """Run the base handling, then reset the decoders if the channel changed."""
        old_channel_id = self.channel.id if self.channel else None

        await super().on_voice_state_update(data)

        log.debug("Got voice_client VSU: \n%s", pformat(data, compact=True))

        # this can be None
        try:
            channel_id = int(data['channel_id'])
        except TypeError:
            return

        # if we joined, left, or switched channels, reset the decoders
        if self._reader and channel_id != old_channel_id:
            log.debug("Destroying all decoders in guild %s", self.guild.id)
            self._reader.packet_router.destroy_all_decoders()

    def add_listener(self, func: CoroFunc, *, name: str = MISSING) -> None:
        """Register coroutine function ``func`` under ``name`` (default: its ``__name__``).

        Raises
        ------
        TypeError
            If ``func`` is not a coroutine function.
        """
        name = func.__name__ if name is MISSING else name

        if not asyncio.iscoroutinefunction(func):
            raise TypeError('Listeners must be coroutines')

        self._event_listeners.setdefault(name, []).append(func)

    def remove_listener(self, func: CoroFunc, *, name: str = MISSING) -> None:
        """Remove a previously added listener; unknown listeners are ignored."""
        name = func.__name__ if name is MISSING else name

        if name in self._event_listeners:
            try:
                self._event_listeners[name].remove(func)
            except ValueError:
                pass

    async def _run_event(self, coro: CoroFunc, event_name: str, *args: Any, **kwargs: Any) -> None:
        """Await a listener, swallowing cancellation and logging any other error."""
        try:
            await coro(*args, **kwargs)
        except asyncio.CancelledError:
            pass
        except Exception:
            log.exception("Error calling %s", event_name)

    def _schedule_event(self, coro: CoroFunc, event_name: str, *args: Any, **kwargs: Any) -> asyncio.Task:
        """Schedule a listener as a task on the client's event loop."""
        wrapped = self._run_event(coro, event_name, *args, **kwargs)
        return self.client.loop.create_task(wrapped, name=f"ext.voice_recv: {event_name}")

    def dispatch(self, event: str, /, *args: Any, **kwargs: Any) -> None:
        """Dispatch ``event`` to this client's listeners, the sink, and the discord client."""
        log.debug("Dispatching voice_client event %s", event)

        event_name = f"on_{event}"
        for coro in self._event_listeners.get(event_name, []):
            self._schedule_event(coro, event_name, *args, **kwargs)

        self.dispatch_sink(event, *args, **kwargs)
        self.client.dispatch(event, *args, **kwargs)

    def dispatch_sink(self, event: str, /, *args: Any, **kwargs: Any) -> None:
        """Dispatch ``event`` through the reader's event router, if receiving."""
        if self._reader:
            self._reader.event_router.dispatch(event, *args, **kwargs)

    def cleanup(self) -> None:
        """Run the base cleanup, forget all listeners and stop playing/receiving."""
        # TODO: Does the order here matter?
        super().cleanup()
        self._event_listeners.clear()
        self.stop()

    def _add_ssrc(self, user_id: int, ssrc: int) -> None:
        """Record the ``ssrc <-> user_id`` association and tell the reader about it."""
        self._ssrc_to_id[ssrc] = user_id
        self._id_to_ssrc[user_id] = ssrc

        if self._reader:
            self._reader.packet_router.set_user_id(ssrc, user_id)

    def _remove_ssrc(self, *, user_id: int) -> None:
        """Forget the SSRC association of ``user_id``, if there is one."""
        ssrc = self._id_to_ssrc.pop(user_id, None)
        # Compare against None so that an ssrc of 0 is cleaned up as well.
        if ssrc is not None:
            # The reader is MISSING while not receiving; guard it like
            # every other access in this class.
            if self._reader:
                self._reader.speaking_timer.drop_ssrc(ssrc)
            self._ssrc_to_id.pop(ssrc, None)

    def _get_ssrc_from_id(self, user_id: int) -> Optional[int]:
        return self._id_to_ssrc.get(user_id)

    def _get_id_from_ssrc(self, ssrc: int) -> Optional[int]:
        return self._ssrc_to_id.get(ssrc)

    def listen(self, sink: AudioSink, *, after: Optional[AfterCB] = None) -> None:
        """Receives audio into a :class:`AudioSink`.

        ``after`` is passed on to the ``AudioReader``.

        Raises
        ------
        discord.ClientException
            If not connected to voice, or already receiving audio.
        TypeError
            If ``sink`` is not an :class:`AudioSink`.
        """
        # TODO: more info

        if not self.is_connected():
            raise discord.ClientException('Not connected to voice.')

        if not isinstance(sink, AudioSink):
            raise TypeError('sink must be an AudioSink not {0.__class__.__name__}'.format(sink))

        if self.is_listening():
            raise discord.ClientException('Already receiving audio.')

        self._reader = AudioReader(sink, self, after=after)
        self._reader.start()

    def is_listening(self) -> bool:
        """Indicates if we're currently receiving audio."""
        # bool() so that the MISSING sentinel is never handed back to callers.
        return bool(self._reader and self._reader.is_listening())

    def stop_listening(self) -> None:
        """Stops receiving audio."""
        if self._reader:
            self._reader.stop()
            self._reader = MISSING

    def stop_playing(self) -> None:
        """Stops playing audio."""
        if self._player:
            self._player.stop()
            self._player = None

    def stop(self) -> None:
        """Stops playing and receiving audio."""
        self.stop_playing()
        self.stop_listening()

    @property
    def sink(self) -> Optional[AudioSink]:
        """The sink audio is currently received into, or ``None`` when not receiving.

        Setting it swaps the sink on the running reader; it raises
        ``TypeError`` for a non-``AudioSink`` and ``ValueError`` when nothing
        is being received.
        """
        return self._reader.sink if self._reader else None

    @sink.setter
    def sink(self, sink: AudioSink) -> None:
        if not isinstance(sink, AudioSink):
            raise TypeError('expected AudioSink not {0.__class__.__name__}.'.format(sink))

        if not self._reader:
            raise ValueError('Not receiving anything.')

        self._reader.set_sink(sink)

    def get_speaking(self, member: Union[discord.Member, discord.User]) -> Optional[bool]:
        """Returns if a member is speaking (approximately), or None if not found."""

        ssrc = self._get_ssrc_from_id(member.id)
        if ssrc is None:
            return None

        if self._reader:
            return self._reader.speaking_timer.get_speaking(ssrc)

        # Not receiving audio, so there is no speaking information.
        return None
|
||||
47
vendor/discord-ext-voice-recv/examples/recv.py
vendored
Normal file
47
vendor/discord-ext-voice-recv/examples/recv.py
vendored
Normal file
@@ -0,0 +1,47 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import discord
|
||||
from discord.ext import commands, voice_recv
|
||||
|
||||
discord.opus._load_default()
|
||||
|
||||
bot = commands.Bot(command_prefix=commands.when_mentioned, intents=discord.Intents.all())
|
||||
|
||||
class Testing(commands.Cog):
    # Example cog: `test` joins the author's voice channel and prints a line
    # for every received packet, `stop` disconnects, `die` shuts the bot down.
    # (Comments rather than docstrings, so the commands' help text is unchanged.)

    def __init__(self, bot):
        self.bot = bot

    @commands.command()
    async def test(self, ctx):
        def on_packet(user, data: voice_recv.VoiceData):
            print(f"Got packet from {user}")

            ## voice power level, how loud the user is speaking
            # ext_data = packet.extension_data.get(voice_recv.ExtensionID.audio_power)
            # value = int.from_bytes(ext_data, 'big')
            # power = 127-(value & 127)
            # print('#' * int(power * (79/128)))
            ## instead of 79 you can use shutil.get_terminal_size().columns-1

        channel = ctx.author.voice.channel
        vc = await channel.connect(cls=voice_recv.VoiceRecvClient)
        sink = voice_recv.BasicSink(on_packet)
        vc.listen(sink)

    @commands.command()
    async def stop(self, ctx):
        await ctx.voice_client.disconnect()

    @commands.command()
    async def die(self, ctx):
        # Stop playing/receiving before closing the bot.
        ctx.voice_client.stop()
        await ctx.bot.close()
|
||||
|
||||
@bot.event
async def on_ready():
    # Announce which account the bot logged in as.
    me = bot.user
    banner = 'Logged in as {0.id}/{0}'.format(me)
    print(banner)
    print('------')
|
||||
|
||||
@bot.event
async def setup_hook():
    # Register the example cog on the bot.
    cog = Testing(bot)
    await bot.add_cog(cog)
|
||||
|
||||
bot.run("token")
|
||||
27
vendor/discord-ext-voice-recv/pyproject.toml
vendored
Normal file
27
vendor/discord-ext-voice-recv/pyproject.toml
vendored
Normal file
@@ -0,0 +1,27 @@
|
||||
[build-system]
|
||||
requires = ["setuptools", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[tool.black]
|
||||
line-length = 125
|
||||
skip-string-normalization = true
|
||||
|
||||
[tool.isort]
|
||||
profile = "black"
|
||||
combine_as_imports = true
|
||||
combine_star = true
|
||||
line_length = 125
|
||||
|
||||
[tool.pyright]
|
||||
include = [
|
||||
"discord/ext/voice_recv",
|
||||
]
|
||||
exclude = [
|
||||
"**/__pycache__",
|
||||
"build",
|
||||
"dist",
|
||||
]
|
||||
reportUnnecessaryTypeIgnoreComment = "warning"
|
||||
# reportUnusedImport = "error"
|
||||
pythonVersion = "3.8"
|
||||
typeCheckingMode = "basic"
|
||||
1
vendor/discord-ext-voice-recv/requirements.txt
vendored
Normal file
1
vendor/discord-ext-voice-recv/requirements.txt
vendored
Normal file
@@ -0,0 +1 @@
|
||||
discord.py[voice]>=2.2.0
|
||||
70
vendor/discord-ext-voice-recv/setup.py
vendored
Normal file
70
vendor/discord-ext-voice-recv/setup.py
vendored
Normal file
@@ -0,0 +1,70 @@
|
||||
# -*- coding: utf-8 -*-
"""Packaging script for the discord-ext-voice_recv extension.

Reads the version from the package's ``__init__.py``, appends the git commit
count to pre-release versions when git is available, and calls ``setup()``.
"""

from setuptools import setup
import re

# Read explicitly as UTF-8 so the build does not depend on the locale's
# default encoding.
with open('discord/ext/voice_recv/__init__.py', encoding='utf-8') as f:
    version_match = re.search(r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', f.read(), re.MULTILINE)

# re.search returns None when __version__ is absent; map that to an empty
# version so the RuntimeError below is raised instead of an AttributeError.
version = version_match.group(1) if version_match else ''

if not version:
    raise RuntimeError('version is not set')

if version.endswith(('a', 'b', 'rc')):
    # append version identifier based on commit count
    try:
        import subprocess

        p = subprocess.Popen(['git', 'rev-list', '--count', 'HEAD'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = p.communicate()
        if out:
            version = version + out.decode('utf-8').strip()
    except Exception:
        # Best effort only: if git is missing or fails, keep the bare
        # pre-release version rather than failing the build.
        pass

with open('README.md', encoding='utf-8') as f:
    readme = f.read()

# One optional-dependency group per extras module; 'extras' installs all of them.
extras_require = {
    'extras_speech': [
        'SpeechRecognition',
    ],
    'extras_local': [
        'pyaudio',
    ],
    'extras': [
        'SpeechRecognition',
        'pyaudio',
    ],
}

setup(
    name='discord-ext-voice_recv',
    author='Imayhaveborkedit',
    url='https://github.com/imayhaveborkedit/discord-ext-voice-recv',
    version=version,
    packages=['discord.ext.voice_recv', 'discord.ext.voice_recv.extras'],
    license='MIT',
    description='Experimental voice receive extension for discord.py',
    long_description=readme,
    long_description_content_type='text/markdown',
    include_package_data=True,
    python_requires='>=3.8',
    install_requires=['discord.py[voice]>=2.5'],
    extras_require=extras_require,
    zip_safe=False,
    classifiers=[
        'Development Status :: 3 - Alpha',
        'License :: OSI Approved :: MIT License',
        'Intended Audience :: Developers',
        'Natural Language :: English',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Programming Language :: Python :: 3.11',
        'Programming Language :: Python :: 3.12',
        'Operating System :: POSIX',
        'Operating System :: Microsoft :: Windows',
        'Operating System :: MacOS',
        'Topic :: Multimedia :: Sound/Audio :: Capture/Recording',
    ],
)
|
||||
21
vendor/discord-ext-voice-recv/update_notes.md
vendored
Normal file
21
vendor/discord-ext-voice-recv/update_notes.md
vendored
Normal file
@@ -0,0 +1,21 @@
|
||||
# Update notes
|
||||
Notably, not a changelog, just notes.
|
||||
|
||||
## 0.5.2
|
||||
- Adds `extras.localplayback` module
|
||||
- Adds info about the extras modules to the readme
|
||||
- Adds `WavSink` as an alias to `WaveSink`
|
||||
- Fixes a member cleanup error in `SpeechRecognitionSink`
|
||||
- Changes the optional dependency format
|
||||
- Previously it was a single optional dep, `extras`. Now there is a dependency per module, with `extras` installing all of them. See the readme for details.
|
||||
|
||||
## 0.5.1
|
||||
- Fixes a build process related error
|
||||
- Changes `voice_recv.extras` import semantics
|
||||
- The `__all__` contents of the extras modules are no longer `*` imported into `voice_recv.extras` (this was only `extras.SpeechRecognitionSink`). You will have to access them directly, or import that specific extra module. Example:
|
||||
```py
|
||||
from discord.ext.voice_recv.extras.speechrecognition import SpeechRecognitionSink
|
||||
# or
|
||||
from discord.ext.voice_recv.extras import speechrecognition
|
||||
sink = speechrecognition.SpeechRecognitionSink(...)
|
||||
```
|
||||
Reference in New Issue
Block a user