feat(voice): Pas 5 — voice/pipeline.py VoiceSession + EchoVoiceSink + cleanup
Central voice pipeline (~250 LOC + docstrings = ~430 lines):
VoiceSession (context manager + idempotent cleanup pe 5 căi):
- __enter__: acquire _lock, open JSONL (record=on)
- __exit__: calls cleanup("exit"), nu suprimă exceptions
- cleanup(reason): IDEMPOTENT, side effects o singură dată — JSONL
flush+close (record=on) sau delete (record=off), bot presence cleared,
voice_client.cleanup(), ttsq.stop(), cancel filler task, lock release,
structured log la logs/voice_metrics.jsonl
- on_segment_done(speaker_id, text, no_speech_prob): mirror text channel,
append JSONL, arm 3s filler timer, route_message cu on_text callback
+ cancel filler la first block
- last_activity_ts: time.monotonic() — caller-driven 5min auto-leave
EchoVoiceSink(session, bot_user_id):
- wants_opus() False (PCM)
- write() runs în voice_recv reader thread (threading primitives only):
- GUARD 1: user None/id==0/id==bot_user_id → return (load-bearing
echo prevention)
- GUARD 2: whitelist filter (empty = allow all)
- Buffer 20ms packets per-user → batch 100ms (5×20ms = 19200 bytes)
→ silero-vad threshold 0.5 → 800ms cumulative silence flush
- _flush_to_stt: faster-whisper small int8 cpu_threads=4 lang=ro
beam_size=1, no_speech_prob > 0.6 drop, schedule on_segment_done
via run_coroutine_threadsafe pe session.loop
Module helpers (lazy thread-safe singletons): _get_whisper_model,
_get_silero_vad. Constants: FILLER_DELAY_S=3.0, SILENCE_FLUSH_MS=800,
VAD_THRESHOLD=0.5, VAD_WINDOW_MS=100, NO_SPEECH_DROP_THRESHOLD=0.6.
Decisions:
- STT runs in audio thread — acceptable la 2.25s p50 (user just stopped
talking, no batching contention). Wrap în ThreadPoolExecutor.submit
if perf bites later.
- Downsample 48k→16k via 3-sample averaging (no scipy dep). Whisper
robust la mild aliasing.
- Energy-RMS VAD fallback dacă torch import fail — graceful degrade.
- router_route_message injection seam ca kwarg pentru testabilitate.
- bot.change_presence handling cross-thread via run_coroutine_threadsafe.
tests/test_voice_session_cleanup.py — 6 tests:
- voice_leave / disconnect / crash via __exit__ / auto_leave /
user_left_channel (5 cleanup paths each verified for: JSONL state,
presence cleared, voice_client.cleanup, ttsq.stop, lock release,
idempotency)
- 1 robustness cross-cut (double-cleanup safety)
6/6 PASS. Regression suite 63/63 PASS (normalize + adapter + mutex).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
551
src/voice/pipeline.py
Normal file
551
src/voice/pipeline.py
Normal file
@@ -0,0 +1,551 @@
|
||||
"""Central voice pipeline: VAD -> STT -> Claude -> TTS for Discord voice.
|
||||
|
||||
``VoiceSession`` binds per-call state — voice_client, TTS queue, transcript
|
||||
JSONL buffer, whitelist, presence — and exposes a single idempotent
|
||||
``cleanup()`` invoked from every exit path (user /voice leave, network
|
||||
disconnect, crash via ``__exit__``, auto-leave timer, user leaves channel).
|
||||
|
||||
``EchoVoiceSink`` is the discord-ext-voice-recv ``AudioSink`` subclass that
|
||||
runs in the voice_recv reader thread. It batches 20ms PCM packets into
|
||||
100ms windows for silero-vad inference, marks per-user speech timestamps,
|
||||
and on 800ms cumulative silence flushes the accumulated audio through
|
||||
faster-whisper. Hallucinated segments (``no_speech_prob > 0.6``) are
|
||||
dropped. Valid transcripts are scheduled onto the session's event loop
|
||||
via ``asyncio.run_coroutine_threadsafe``.
|
||||
|
||||
The bot's own ``user.id`` is filtered FIRST inside ``write()`` — load-bearing
|
||||
echo prevention so a future whitelist expansion (Bianca, etc.) never lets
|
||||
the bot transcribe itself.
|
||||
|
||||
See plan: ``src/voice/pipeline.py`` (Pas 5), Engineering decisions #4
|
||||
(VAD 100ms batched), #5 (cleanup centralizat), #7 (bot.user.id explicit
|
||||
guard), #8 (filler audio ``thinking.wav`` at 3s pre-first-block).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from src.voice._discord_voice_adapter import AudioSink, VoiceData
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Audio format constants. Discord delivers 48 kHz s16le stereo PCM in
# 20 ms packets; Whisper and the VAD consume 16 kHz mono.
SAMPLE_RATE_DISCORD = 48000
SAMPLE_RATE_WHISPER = 16000
PACKET_MS = 20
# samples per packet * 2 channels * 2 bytes per sample (= 3840 bytes)
PACKET_BYTES = (SAMPLE_RATE_DISCORD * PACKET_MS // 1000) * 2 * 2

# VAD runs on batches of 5 * 20 ms packets (Decision #4).
VAD_WINDOW_MS = 100
VAD_WINDOW_BYTES = (VAD_WINDOW_MS // PACKET_MS) * PACKET_BYTES
# Speech probability at or above this marks a window as speech.
VAD_THRESHOLD = 0.5
# Time since the last speech window after which a user's audio is flushed
# to STT.
SILENCE_FLUSH_MS = 800
# Whisper segments whose no_speech_prob exceeds this are dropped.
NO_SPEECH_DROP_THRESHOLD = 0.6
# Seconds to wait for the first response block before pushing the filler.
FILLER_DELAY_S = 3.0

# Metrics are appended under <project root>/logs/.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
LOGS_DIR = PROJECT_ROOT.joinpath("logs")
VOICE_METRICS_PATH = LOGS_DIR.joinpath("voice_metrics.jsonl")
|
||||
|
||||
|
||||
# ---------- Lazy model singletons ----------
|
||||
|
||||
# Process-wide model caches. Each cache starts empty and is filled on first
# use by its accessor function, which takes the matching lock while loading.
_whisper_lock = threading.Lock()
_whisper_model: Optional[Any] = None

_silero_lock = threading.Lock()
_silero_model: Optional[Any] = None
_silero_get_timestamps: Optional[Callable[..., Any]] = None
|
||||
|
||||
|
||||
def _get_whisper_model() -> Any:
    """Return the shared faster-whisper model, loading it on first use.

    The model is ``small`` / int8 on CPU with ``cpu_threads=4`` (the
    spike-validated setting, see ``tasks/voice-bench-results.md``). The
    cache is checked once without the lock and once more under
    ``_whisper_lock`` so the model is constructed at most once.
    """
    global _whisper_model
    model = _whisper_model
    if model is None:
        with _whisper_lock:
            # Re-read under the lock: another thread may have finished
            # loading while we were waiting.
            model = _whisper_model
            if model is None:
                # Imported lazily so importing this module stays cheap.
                from faster_whisper import WhisperModel

                model = WhisperModel(
                    "small",
                    device="cpu",
                    compute_type="int8",
                    cpu_threads=4,
                )
                _whisper_model = model
    return model
|
||||
|
||||
|
||||
def _get_silero_vad() -> tuple[Any, Any]:
    """Return ``(model, get_speech_timestamps)``, loading silero-vad lazily.

    The lock-free fast path keys off ``_silero_model`` alone, so the model
    is published LAST: any thread that observes a non-``None`` model is
    guaranteed to also observe ``_silero_get_timestamps``.

    Raises:
        ImportError: if the ``silero_vad`` package is not installed.
    """
    global _silero_model, _silero_get_timestamps
    model = _silero_model
    if model is not None:
        return model, _silero_get_timestamps
    with _silero_lock:
        if _silero_model is None:
            # Imported lazily so importing this module stays cheap.
            from silero_vad import get_speech_timestamps, load_silero_vad

            loaded = load_silero_vad()
            # Order matters: helper first, model (the "ready" flag) second.
            _silero_get_timestamps = get_speech_timestamps
            _silero_model = loaded
        return _silero_model, _silero_get_timestamps
|
||||
|
||||
|
||||
# ---------- Audio helpers ----------
|
||||
|
||||
def _pcm48_stereo_to_16_mono(pcm: bytes) -> np.ndarray:
    """Convert 48 kHz s16le stereo bytes to 16 kHz mono float32 in [-1, 1].

    Cheap downsample: average the two channels, then average every 3
    consecutive samples (48k / 3 = 16k). Trailing bytes that do not form a
    whole stereo frame, and trailing mono samples that do not form a whole
    group of 3, are discarded.

    Args:
        pcm: Interleaved little-endian int16 stereo PCM. May be empty,
            ``None`` or of arbitrary length.

    Returns:
        A 1-D ``np.float32`` array; empty when the input holds fewer than
        three complete stereo frames.
    """
    empty = np.zeros(0, dtype=np.float32)
    if not pcm:
        return empty

    # np.frombuffer rejects buffers whose size is not a multiple of the
    # item size, so cut to whole frames (2 channels * 2 bytes) up front.
    frame_bytes = 4
    usable = len(pcm) - (len(pcm) % frame_bytes)
    if usable == 0:
        return empty

    stereo = np.frombuffer(
        memoryview(pcm)[:usable], dtype=np.int16,
    ).reshape(-1, 2)
    mono = stereo.mean(axis=1).astype(np.float32) / 32768.0

    trim = (mono.size // 3) * 3
    if trim == 0:
        return empty
    return mono[:trim].reshape(-1, 3).mean(axis=1).astype(np.float32)
|
||||
|
||||
|
||||
# ---------- VoiceSession ----------
|
||||
|
||||
class VoiceSession:
    """Per-voice-call state with a single idempotent ``cleanup()``.

    Usable as a context manager: ``__enter__`` takes the session lock and,
    when recording is enabled, opens the transcript JSONL; ``__exit__``
    calls ``cleanup("exit")`` and never suppresses exceptions.
    """

    def __init__(
        self,
        *,
        channel_id: int,
        guild_id: int,
        text_channel: Any,
        voice_client: Any,
        bot: Any,
        ttsq: Any,
        whitelist: Optional[set] = None,
        record_enabled: bool = False,
        mirror_enabled: bool = True,
        transcripts_jsonl_path: Optional[Path] = None,
        loop: Optional[asyncio.AbstractEventLoop] = None,
        router_route_message: Optional[Callable] = None,
    ):
        """Bind the collaborators for one voice call.

        Args:
            channel_id: Voice channel id; sent to the router as a string.
            guild_id: Guild id (stored only).
            text_channel: Object with an optional ``send`` used to mirror
                transcripts; may be ``None``.
            voice_client: Object whose ``cleanup()`` is called on teardown.
            bot: Object providing optional ``change_presence`` and
                ``get_user``; may be ``None``.
            ttsq: TTS queue exposing ``push_text``, ``push_filler`` and
                ``stop``.
            whitelist: User ids allowed to speak; copied into a new set.
            record_enabled: Keep the transcript JSONL when true.
            mirror_enabled: Mirror transcripts to ``text_channel`` when true.
            transcripts_jsonl_path: Transcript file location, if any.
            loop: Event loop used for cross-thread scheduling.
            router_route_message: Replacement for ``src.router.route_message``
                (injection seam for tests).
        """
        self.channel_id = int(channel_id)
        self.guild_id = int(guild_id)
        self.text_channel = text_channel
        self.voice_client = voice_client
        self.bot = bot
        self.ttsq = ttsq
        self.whitelist: set = set(whitelist or set())
        self.record_enabled = bool(record_enabled)
        self.mirror_enabled = bool(mirror_enabled)
        self.transcripts_jsonl_path = transcripts_jsonl_path
        self.loop = loop
        # Injection seam so tests can replace router.route_message without
        # mocking the whole module.
        if router_route_message is None:
            from src.router import route_message as _rm

            self._route_message = _rm
        else:
            self._route_message = router_route_message

        self.last_activity_ts = time.monotonic()
        self._jsonl_fh = None
        self._lock = threading.Lock()
        self._cleaned_up = False
        self._lock_owner_thread: Optional[int] = None
        self._filler_task: Optional[asyncio.Task] = None
        self._first_block_seen = False

    # ----- context manager -----

    def __enter__(self) -> "VoiceSession":
        """Acquire the session lock and open the transcript file if recording.

        A failure to open the transcript is logged and recording is simply
        skipped; the session still starts.
        """
        self._lock.acquire()
        self._lock_owner_thread = threading.get_ident()
        if self.record_enabled and self.transcripts_jsonl_path is not None:
            try:
                self.transcripts_jsonl_path.parent.mkdir(
                    parents=True, exist_ok=True,
                )
                # Line-buffered so each transcript line reaches the OS as
                # soon as it is written.
                self._jsonl_fh = open(
                    self.transcripts_jsonl_path, "a",
                    buffering=1, encoding="utf-8",
                )
            except OSError as e:
                log.warning("voice transcript open failed: %s", e)
                self._jsonl_fh = None
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> bool:
        """Run ``cleanup("exit")``; always returns ``False``."""
        self.cleanup("exit")
        return False  # never suppress exceptions

    # ----- cleanup (centralized, idempotent) -----

    def cleanup(self, reason: str) -> None:
        """Tear the session down; only the first call has any effect.

        Every step is best-effort: a failure is logged and the remaining
        steps still run.

        Args:
            reason: Label recorded in the metrics event.
        """
        if self._cleaned_up:
            return
        self._cleaned_up = True

        # 1. Flush or discard the JSONL transcript. The handle is detached
        # first so nothing else writes to it while it is being closed.
        fh, self._jsonl_fh = self._jsonl_fh, None
        if fh is not None:
            try:
                fh.flush()
            except Exception as e:  # noqa: BLE001
                log.warning("voice transcript flush failed: %s", e)
            finally:
                # Close even when flush failed so the descriptor is not
                # leaked.
                try:
                    fh.close()
                except Exception as e:  # noqa: BLE001
                    log.warning("voice transcript close failed: %s", e)
        if (not self.record_enabled
                and self.transcripts_jsonl_path is not None
                and self.transcripts_jsonl_path.exists()):
            try:
                self.transcripts_jsonl_path.unlink()
            except OSError:
                pass

        # 2. Restore bot presence (clear the activity).
        if self.bot is not None:
            try:
                change = getattr(self.bot, "change_presence", None)
                if callable(change):
                    coro = change(activity=None)
                    if asyncio.iscoroutine(coro):
                        if self.loop is not None and self.loop.is_running():
                            asyncio.run_coroutine_threadsafe(coro, self.loop)
                        else:
                            # Best-effort: close the coroutine so Python
                            # doesn't emit "coroutine was never awaited".
                            coro.close()
            except Exception as e:  # noqa: BLE001
                log.warning("voice presence restore failed: %s", e)

        # 3. Tear down the voice client.
        if self.voice_client is not None:
            try:
                self.voice_client.cleanup()
            except Exception as e:  # noqa: BLE001
                log.warning("voice_client.cleanup failed: %s", e)

        # 4. Stop the TTS queue worker.
        if self.ttsq is not None:
            try:
                self.ttsq.stop()
            except Exception as e:  # noqa: BLE001
                log.warning("ttsq.stop failed: %s", e)

        # 5. Cancel the pending filler task (thread-aware).
        self._cancel_filler_task(self._filler_task)

        # 6. Release the session lock (held since __enter__).
        try:
            if self._lock.locked():
                self._lock.release()
        except RuntimeError:
            # Someone released it between the check and our release; the
            # lock is free either way.
            pass
        self._lock_owner_thread = None

        self._log_metric({"event": "cleanup", "reason": reason})

    def _cancel_filler_task(self, task: Optional[asyncio.Task]) -> None:
        """Cancel ``task`` without touching it from a foreign thread.

        asyncio tasks are not thread-safe. When ``self.loop`` is running and
        the caller is not executing inside it, the cancel is handed to the
        loop via ``call_soon_threadsafe``; otherwise it is called directly.
        """
        if task is None or task.done():
            return
        try:
            try:
                current = asyncio.get_running_loop()
            except RuntimeError:
                current = None
            owner = self.loop
            if owner is not None and owner is not current and owner.is_running():
                owner.call_soon_threadsafe(task.cancel)
            else:
                task.cancel()
        except Exception as e:  # noqa: BLE001
            log.debug("voice filler cancel failed: %s", e)

    # ----- segment completion (scheduled from sink) -----

    async def on_segment_done(
        self,
        speaker_id: int,
        text: str,
        no_speech_prob: float,
    ) -> None:
        """Mirror, persist and route one transcribed segment.

        Response blocks delivered through the router's ``on_text`` callback
        are pushed to the TTS queue. A filler timer is armed before the
        router call and cancelled on the first block, and in any case once
        the router call has returned or failed.

        Args:
            speaker_id: Id of the user who spoke.
            text: Transcribed text.
            no_speech_prob: Value stored alongside the transcript line.
        """
        if self._cleaned_up:
            return
        self.last_activity_ts = time.monotonic()
        speaker_name = self._resolve_speaker_name(speaker_id)

        # 1. Mirror to the text channel (one Unicode microphone emoji).
        if self.mirror_enabled and self.text_channel is not None:
            try:
                send = getattr(self.text_channel, "send", None)
                if callable(send):
                    coro = send(f"\U0001f3a4 {speaker_name}: \"{text}\"")
                    if asyncio.iscoroutine(coro):
                        await coro
            except Exception as e:  # noqa: BLE001
                log.warning("voice mirror send failed: %s", e)

        # 2. Append to the JSONL transcript if recording.
        if self._jsonl_fh is not None:
            try:
                self._jsonl_fh.write(
                    json.dumps({
                        "ts": time.time(),
                        "speaker_id": speaker_id,
                        "speaker": speaker_name,
                        "text": text,
                        "no_speech_prob": no_speech_prob,
                    }, ensure_ascii=False) + "\n"
                )
            except Exception as e:  # noqa: BLE001
                log.warning("voice transcript write failed: %s", e)

        # 3. Arm the filler timer; it fires only if no block arrives first.
        self._first_block_seen = False
        previous = self._filler_task
        if previous is not None and not previous.done():
            previous.cancel()
        loop: Optional[asyncio.AbstractEventLoop] = None
        filler: Optional[asyncio.Task] = None
        try:
            loop = asyncio.get_running_loop()
            filler = loop.create_task(self._filler_after_delay())
        except RuntimeError:
            # No running loop (test path). Skip the timer.
            filler = None
        self._filler_task = filler

        def voice_stream_callback(block: str) -> None:
            """Push one streamed text block to TTS; the first block also
            cancels this segment's filler timer."""
            if not self._first_block_seen:
                self._first_block_seen = True
                if filler is not None and loop is not None and not filler.done():
                    try:
                        # The router runs on a worker thread (see step 4),
                        # so the cancel must be marshalled onto the loop.
                        loop.call_soon_threadsafe(filler.cancel)
                    except RuntimeError:
                        # Loop already closed; nothing left to cancel.
                        pass
            try:
                self.ttsq.push_text(block)
            except Exception as e:  # noqa: BLE001
                log.warning("ttsq.push_text failed: %s", e)

        # 4. Dispatch to the router on a worker thread so the loop stays
        # responsive for mirror/TTS.
        try:
            await asyncio.to_thread(
                self._route_message,
                str(self.channel_id),
                str(speaker_id),
                text,
                None,                    # model
                voice_stream_callback,   # on_text
                "discord-voice",         # adapter_name
            )
        except Exception as e:  # noqa: BLE001
            log.error("route_message voice path failed: %s", e, exc_info=True)
        finally:
            # The turn is over (answered, empty or failed): a filler that
            # is still pending would otherwise play after the fact.
            if filler is not None and not filler.done():
                filler.cancel()

    async def _filler_after_delay(self) -> None:
        """Push the filler audio after ``FILLER_DELAY_S`` unless a first
        block has been seen, the session was cleaned up, or the task was
        cancelled while waiting."""
        try:
            await asyncio.sleep(FILLER_DELAY_S)
        except asyncio.CancelledError:
            return
        if self._first_block_seen or self._cleaned_up:
            return
        try:
            self.ttsq.push_filler()
        except Exception as e:  # noqa: BLE001
            log.warning("ttsq.push_filler failed: %s", e)

    # ----- helpers -----

    def _resolve_speaker_name(self, speaker_id: int) -> str:
        """Return a display name from ``bot.get_user``; falls back to the
        id rendered as a string when the lookup yields nothing or fails."""
        try:
            if self.bot is not None and hasattr(self.bot, "get_user"):
                user = self.bot.get_user(speaker_id)
                if user is not None:
                    name = getattr(user, "display_name", None) or getattr(
                        user, "name", None,
                    )
                    if name:
                        return str(name)
        except Exception:  # noqa: BLE001
            pass
        return str(speaker_id)

    def _log_metric(self, payload: dict) -> None:
        """Append a structured event to ``logs/voice_metrics.jsonl``.

        The event carries a wall-clock ``ts`` and ``channel_id`` merged with
        ``payload``. Filesystem errors are ignored: metrics are best-effort.
        """
        event = {"ts": time.time(), "channel_id": self.channel_id, **payload}
        try:
            LOGS_DIR.mkdir(parents=True, exist_ok=True)
            with open(VOICE_METRICS_PATH, "a", buffering=1, encoding="utf-8") as f:
                f.write(json.dumps(event, ensure_ascii=False) + "\n")
        except OSError:
            pass
|
||||
|
||||
|
||||
# ---------- EchoVoiceSink ----------
|
||||
|
||||
class EchoVoiceSink(AudioSink):
    """PCM sink: per-user packets -> 100 ms VAD windows -> flush to Whisper
    after 800 ms without speech -> ``on_segment_done`` scheduled on the
    session loop.

    Intended to run in the voice_recv reader thread; shared state is
    guarded with a ``threading.Lock`` and asyncio is touched only to
    schedule the finished segment.
    """

    # Audio kept per user while no speech has been detected yet. Bounds
    # memory for users who only ever send non-speech audio, while leaving
    # ample lead-in ahead of the window that first triggers the VAD.
    PRE_SPEECH_KEEP_MS = 1000
    # Chunk length fed to the VAD model. Current silero-vad releases accept
    # only 512-sample chunks at 16 kHz -- TODO confirm against the pinned
    # version.
    VAD_CHUNK_SAMPLES = 512

    def __init__(self, session: VoiceSession, bot_user_id: int):
        """Bind the sink to ``session``.

        Args:
            session: Owning session; its whitelist is copied at construction
                time and its ``loop`` receives finished segments.
            bot_user_id: The bot's own user id (``None`` is treated as 0);
                audio from this id is always ignored.
        """
        super().__init__()
        self.session = session
        self.bot_user_id = int(bot_user_id) if bot_user_id is not None else 0
        self.whitelist: set = set(session.whitelist or set())
        self._user_buffers: dict[int, bytearray] = {}
        self._packet_accum: dict[int, bytearray] = {}
        self._last_speech_ts: dict[int, float] = {}
        self._has_speech: dict[int, bool] = {}
        self._sink_lock = threading.Lock()

    def wants_opus(self) -> bool:
        """Return ``False``: this sink consumes decoded PCM."""
        return False

    def cleanup(self) -> None:
        """Drop all buffered per-user state."""
        with self._sink_lock:
            self._user_buffers.clear()
            self._packet_accum.clear()
            self._last_speech_ts.clear()
            self._has_speech.clear()

    def write(self, user, voice_data: VoiceData) -> None:
        """Buffer one packet, run the VAD on full windows, flush on silence.

        Never raises: any failure in the processing path is logged.

        NOTE(review): silence is evaluated only when a packet arrives. If
        the transport stops delivering packets while a user is quiet, the
        flush waits for that user's next packet -- confirm against
        voice_recv behaviour.
        """
        # ---- FIRST GUARD (LOAD-BEARING): bot's own voice ---------------
        if user is None:
            return
        uid = int(getattr(user, "id", 0) or 0)
        if uid == 0 or uid == self.bot_user_id:
            return

        # ---- SECOND GUARD: whitelist filter (empty = allow all) --------
        if self.whitelist and uid not in self.whitelist:
            return

        pcm = getattr(voice_data, "pcm", None)
        if not pcm:
            return

        window_pcm: Optional[bytes] = None
        pcm_for_stt: Optional[bytes] = None

        try:
            with self._sink_lock:
                buf = self._user_buffers.setdefault(uid, bytearray())
                accum = self._packet_accum.setdefault(uid, bytearray())
                buf.extend(pcm)
                accum.extend(pcm)
                if not self._has_speech.get(uid):
                    self._trim_pre_speech(buf)
                if len(accum) >= VAD_WINDOW_BYTES:
                    window_pcm = bytes(accum[:VAD_WINDOW_BYTES])
                    del accum[:VAD_WINDOW_BYTES]

            # VAD inference runs outside the lock.
            if window_pcm is not None and self._vad_detects_speech(window_pcm):
                with self._sink_lock:
                    self._last_speech_ts[uid] = time.monotonic()
                    self._has_speech[uid] = True

            with self._sink_lock:
                if self._has_speech.get(uid):
                    last = self._last_speech_ts.get(uid, 0.0)
                    silence_ms = (time.monotonic() - last) * 1000.0
                    if silence_ms >= SILENCE_FLUSH_MS:
                        pcm_for_stt = bytes(self._user_buffers.get(uid, b""))
                        self._user_buffers[uid] = bytearray()
                        self._packet_accum[uid] = bytearray()
                        self._has_speech[uid] = False

            if pcm_for_stt:
                self._flush_to_stt(uid, pcm_for_stt)
        except Exception as e:  # noqa: BLE001
            log.warning("EchoVoiceSink.write failed: %s", e)

    def _trim_pre_speech(self, buf: bytearray) -> None:
        """Cap a not-yet-speaking user's buffer to the most recent audio.

        Trimming starts only once the buffer holds twice the pre-roll, so
        the copy cost is amortised; cuts land on whole 4-byte stereo frames.
        """
        keep = (self.PRE_SPEECH_KEEP_MS // PACKET_MS) * PACKET_BYTES
        if len(buf) <= 2 * keep:
            return
        excess = len(buf) - keep
        excess -= excess % 4
        if excess > 0:
            del buf[:excess]

    # ----- VAD -----

    def _vad_detects_speech(self, pcm48_stereo: bytes) -> bool:
        """Return whether a VAD window contains speech.

        The window is downsampled and scored in ``VAD_CHUNK_SAMPLES``-sized
        chunks; it counts as speech when the highest chunk probability
        reaches ``VAD_THRESHOLD``. A trailing partial chunk is skipped, and
        a window shorter than one chunk is zero-padded. If ``torch`` or
        ``silero_vad`` cannot be imported, an RMS energy threshold is used
        instead. Any other failure is logged at debug level and reported as
        "no speech".
        """
        try:
            mono16 = _pcm48_stereo_to_16_mono(pcm48_stereo)
            if mono16.size == 0:
                return False
            try:
                import torch

                model, _ = _get_silero_vad()
            except ImportError:
                rms = float(np.sqrt(np.mean(mono16.astype(np.float64) ** 2)))
                return rms > 0.02

            chunk = self.VAD_CHUNK_SAMPLES
            if mono16.size < chunk:
                mono16 = np.pad(mono16, (0, chunk - mono16.size))
            best = 0.0
            with torch.no_grad():
                # Every whole chunk is scored (no early exit) so the model
                # sees a contiguous stream.
                for start in range(0, mono16.size - chunk + 1, chunk):
                    piece = torch.from_numpy(mono16[start:start + chunk])
                    prob = float(model(piece, SAMPLE_RATE_WHISPER).item())
                    if prob > best:
                        best = prob
            return best >= VAD_THRESHOLD
        except Exception as e:  # noqa: BLE001
            log.debug("VAD inference failed: %s", e)
            return False

    # ----- STT flush -----

    def _flush_to_stt(self, user_id: int, pcm48_stereo: bytes) -> None:
        """Transcribe a flushed buffer (Romanian) and dispatch the text.

        Segments whose ``no_speech_prob`` exceeds
        ``NO_SPEECH_DROP_THRESHOLD`` are discarded. Nothing is dispatched
        when no text remains. The highest ``no_speech_prob`` seen across
        ALL segments, dropped ones included, is passed along with the text.
        """
        try:
            mono16 = _pcm48_stereo_to_16_mono(pcm48_stereo)
            if mono16.size == 0:
                return
            model = _get_whisper_model()
            segments, _info = model.transcribe(
                mono16, language="ro", beam_size=1,
            )
            kept: list[str] = []
            worst_no_speech = 0.0
            for seg in segments:
                no_sp = float(getattr(seg, "no_speech_prob", 0.0) or 0.0)
                worst_no_speech = max(worst_no_speech, no_sp)
                if no_sp > NO_SPEECH_DROP_THRESHOLD:
                    continue
                seg_text = (getattr(seg, "text", "") or "").strip()
                if seg_text:
                    kept.append(seg_text)
            text = " ".join(kept).strip()
            if not text:
                return
            self._schedule_segment_done(user_id, text, worst_no_speech)
        except Exception as e:  # noqa: BLE001
            log.warning("Whisper transcribe failed: %s", e)

    def _schedule_segment_done(
        self, user_id: int, text: str, no_speech_prob: float,
    ) -> None:
        """Schedule ``session.on_segment_done`` on the session loop.

        The segment is dropped when the loop is missing or not running.
        Failures raised later inside the coroutine are logged through a
        done-callback rather than lost with the unobserved future.
        """
        loop = self.session.loop
        if loop is None or not loop.is_running():
            log.debug("voice session loop missing — dropping segment")
            return
        coro = None
        try:
            coro = self.session.on_segment_done(user_id, text, no_speech_prob)
            future = asyncio.run_coroutine_threadsafe(coro, loop)
        except Exception as e:  # noqa: BLE001
            # Close the unscheduled coroutine so Python does not warn that
            # it was never awaited.
            close = getattr(coro, "close", None)
            if callable(close):
                close()
            log.warning("voice segment dispatch failed: %s", e)
            return
        future.add_done_callback(self._log_segment_failure)

    @staticmethod
    def _log_segment_failure(future: Any) -> None:
        """Log the exception, if any, carried by a finished segment future."""
        if future.cancelled():
            return
        exc = future.exception()
        if exc is not None:
            log.warning("voice on_segment_done failed: %s", exc)
|
||||
|
||||
|
||||
# Public API: the session, the sink, and the tuning constants.
__all__ = [
    "VoiceSession", "EchoVoiceSink",
    "FILLER_DELAY_S", "SILENCE_FLUSH_MS",
    "VAD_THRESHOLD", "VAD_WINDOW_MS",
    "NO_SPEECH_DROP_THRESHOLD",
]
|
||||
Reference in New Issue
Block a user