feat(voice): improve Romanian STT — hallucination gate + finetuned model
Gemma 4 cloud audio was infeasible (31b-cloud has no audio; E4B broken upstream, no deploy host), so improve faster-whisper instead.
- Pin temperature=0.0 to disable the fallback ladder that re-decoded unclear audio up to 6x (the source of the 16-24s latency outliers); reject hallucinated segments via avg_logprob/compression_ratio in the new pure function _filter_segments.
- Adopt mikr/whisper-small-ro-cv11 (CT2 int8) via the configurable voice.stt_model setting: the spike showed WER 24% -> 10%, number transcription fixed at the source, and +0.33s p50 latency (within budget).
- Add tools/voice_stt_mine.py (log mining) and tools/voice_stt_spike.py (model eval with diacritic scoring), plus tests for the gate and the miner.
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -49,6 +49,14 @@ VAD_WINDOW_BYTES = PACKET_BYTES * (VAD_WINDOW_MS // PACKET_MS)
|
||||
VAD_THRESHOLD = 0.5
|
||||
SILENCE_FLUSH_MS = 800
|
||||
NO_SPEECH_DROP_THRESHOLD = 0.6
|
||||
# Hallucination rejection (no re-decode). faster-whisper's default temperature
|
||||
# is a fallback ladder (0.0, 0.2, 0.4, 0.6, 0.8, 1.0); on unclear audio it re-decodes the segment up
|
||||
# to 6x, which is what produced the 16-24s outliers in voice_stt_log.jsonl
|
||||
# against a >7s conversational-abort budget. We pin temperature=0.0 (no fallback)
|
||||
# and instead REJECT low-quality segments using the avg_logprob / compression_ratio
|
||||
# that faster-whisper already computes per segment — zero extra latency.
|
||||
AVG_LOGPROB_DROP_THRESHOLD = -1.0 # drop seg if avg_logprob below this
|
||||
COMPRESSION_RATIO_DROP_THRESHOLD = 2.4 # drop seg if gzip ratio above this (repetition/garbage)
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
|
||||
LOGS_DIR = PROJECT_ROOT / "logs"
|
||||
@@ -83,19 +91,28 @@ _silero_lock = threading.Lock()
|
||||
|
||||
|
||||
def _get_whisper_model() -> Any:
    """Return the process-wide faster-whisper model, loading it on first use.

    The model is created once with ``device="cpu"``, ``compute_type="int8"``
    and the spike-validated ``cpu_threads=4`` (see
    ``tasks/voice-bench-results.md``), then cached in the module-level
    ``_whisper_model``.

    The model is configurable via ``voice.stt_model`` (default ``"small"``).
    It may be a faster-whisper model name or a path to a local CT2 directory,
    e.g. the Romanian Common-Voice finetune that halved WER and fixed number
    transcription in the D1 spike (``tools/voice_stt_spike.py``). Loading
    always uses ``local_files_only=True``; custom paths live on disk, so they
    load the same way.

    Returns:
        The cached ``WhisperModel`` instance.
    """
    global _whisper_model
    # Fast path: skip the lock once the model has been created.
    if _whisper_model is not None:
        return _whisper_model
    with _whisper_lock:
        # Re-check under the lock: another thread may have finished loading
        # while this one was waiting.
        if _whisper_model is not None:
            return _whisper_model
        # Imported lazily so importing this module does not pull in the
        # config layer or faster-whisper until STT is actually needed.
        from src.config import Config
        # ``or "small"`` also covers an empty or null config value.
        model_id = Config().get("voice.stt_model", "small") or "small"
        from faster_whisper import WhisperModel
        _whisper_model = WhisperModel(
            model_id, device="cpu", compute_type="int8", cpu_threads=4,
            local_files_only=True,
        )
        log.info("STT model loaded: %s", model_id)
    return _whisper_model
|
||||
|
||||
|
||||
@@ -145,6 +162,38 @@ def _pcm48_stereo_to_16_mono(pcm: bytes) -> np.ndarray:
|
||||
return np.ascontiguousarray(mono16, dtype=np.float32)
|
||||
|
||||
|
||||
def _filter_segments(segments: Any) -> tuple[list[str], float]:
|
||||
"""Keep transcribable segments, drop silence and hallucinations.
|
||||
|
||||
Pure + side-effect free (no model, no I/O) so the rejection thresholds are
|
||||
unit-testable with fake segment objects. A segment is dropped when:
|
||||
- ``no_speech_prob`` is high (silence/non-speech), OR
|
||||
- ``avg_logprob`` is below ``AVG_LOGPROB_DROP_THRESHOLD`` (decoder unsure), OR
|
||||
- ``compression_ratio`` exceeds ``COMPRESSION_RATIO_DROP_THRESHOLD`` (looped/garbage).
|
||||
The avg_logprob/compression checks replace faster-whisper's temperature-fallback
|
||||
re-decode (the source of the 16-24s latency outliers) with zero-cost rejection.
|
||||
Returns ``(kept_text_parts, worst_no_speech_prob)``.
|
||||
"""
|
||||
text_parts: list[str] = []
|
||||
worst_no_speech = 0.0
|
||||
for seg in segments:
|
||||
no_sp = float(getattr(seg, "no_speech_prob", 0.0) or 0.0)
|
||||
if no_sp > worst_no_speech:
|
||||
worst_no_speech = no_sp
|
||||
if no_sp > NO_SPEECH_DROP_THRESHOLD:
|
||||
continue
|
||||
avg_lp = getattr(seg, "avg_logprob", None)
|
||||
if avg_lp is not None and float(avg_lp) < AVG_LOGPROB_DROP_THRESHOLD:
|
||||
continue
|
||||
comp = getattr(seg, "compression_ratio", None)
|
||||
if comp is not None and float(comp) > COMPRESSION_RATIO_DROP_THRESHOLD:
|
||||
continue
|
||||
seg_text = (getattr(seg, "text", "") or "").strip()
|
||||
if seg_text:
|
||||
text_parts.append(seg_text)
|
||||
return text_parts, worst_no_speech
|
||||
|
||||
|
||||
# ---------- VoiceSession ----------
|
||||
|
||||
class VoiceSession:
|
||||
@@ -679,6 +728,7 @@ class EchoVoiceSink(AudioSink):
|
||||
model = _get_whisper_model()
|
||||
segments, _info = model.transcribe(
|
||||
mono16, language="ro", beam_size=5,
|
||||
temperature=0.0, # no fallback ladder — reject bad segments instead (see thresholds above)
|
||||
initial_prompt=(
|
||||
"Conversatie in romana cu asistentul Eco (Echo Core). "
|
||||
"Marius i se adreseaza cu 'Salut, Eco', 'Eco' sau 'Echo Core' "
|
||||
@@ -689,20 +739,10 @@ class EchoVoiceSink(AudioSink):
|
||||
"F1, F2, F3, F4, F5. Exemple: vorbeste cu vocea M5, voce F3, "
|
||||
"treci pe vocea F1."
|
||||
),
|
||||
hotwords="Eco Echo Core Marius Bianca",
|
||||
hotwords="Eco Echo Core Marius Bianca Bitcoin",
|
||||
condition_on_previous_text=False,
|
||||
)
|
||||
text_parts: list[str] = []
|
||||
worst_no_speech = 0.0
|
||||
for seg in segments:
|
||||
no_sp = float(getattr(seg, "no_speech_prob", 0.0) or 0.0)
|
||||
if no_sp > worst_no_speech:
|
||||
worst_no_speech = no_sp
|
||||
if no_sp > NO_SPEECH_DROP_THRESHOLD:
|
||||
continue
|
||||
seg_text = (getattr(seg, "text", "") or "").strip()
|
||||
if seg_text:
|
||||
text_parts.append(seg_text)
|
||||
text_parts, worst_no_speech = _filter_segments(segments)
|
||||
if not text_parts:
|
||||
return
|
||||
text = " ".join(text_parts).strip()
|
||||
|
||||
Reference in New Issue
Block a user