feat(voice): improve Romanian STT — hallucination gate + finetuned model

Gemma 4 cloud audio was infeasible (31b-cloud has no audio; E4B broken
upstream, no deploy host), so improve faster-whisper instead.

- Pin temperature=0.0 to disable the fallback ladder that re-decoded unclear
  audio up to 6x (source of the 16-24s latency outliers); reject hallucinated
  segments via avg_logprob/compression_ratio in the new pure _filter_segments.
- Adopt mikr/whisper-small-ro-cv11 (CT2 int8) via configurable voice.stt_model:
  spike showed WER 24%->10%, number transcription fixed at source, +0.33s p50
  (in budget).
- Add tools/voice_stt_mine.py (log mining) + tools/voice_stt_spike.py (model
  eval with diacritic scoring) + tests for the gate and miner.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
2026-06-27 18:16:16 +00:00
parent ec23d188ec
commit ce273d14db
9 changed files with 664 additions and 16 deletions

View File

@@ -49,6 +49,14 @@ VAD_WINDOW_BYTES = PACKET_BYTES * (VAD_WINDOW_MS // PACKET_MS)
# NOTE(review): presumably the speech-probability cutoff used by the VAD; its
# use site is outside this view — confirm.
VAD_THRESHOLD = 0.5
# NOTE(review): by its name, milliseconds of silence before buffered audio is
# flushed — confirm at the use site.
SILENCE_FLUSH_MS = 800
# ``_filter_segments`` drops a segment whose ``no_speech_prob`` is strictly
# above this value.
NO_SPEECH_DROP_THRESHOLD = 0.6
# Hallucination rejection (no re-decode). faster-whisper's default temperature
# is a fallback ladder [0.0..1.0]; on unclear audio it re-decodes the segment up
# to 6x, which is what produced the 16-24s outliers in voice_stt_log.jsonl
# against a >7s conversational-abort budget. We pin temperature=0.0 (no fallback)
# and instead REJECT low-quality segments using the avg_logprob / compression_ratio
# that faster-whisper already computes per segment — zero extra latency.
# Both comparisons in ``_filter_segments`` are strict, so a segment sitting
# exactly on a threshold is kept.
AVG_LOGPROB_DROP_THRESHOLD = -1.0  # drop seg if avg_logprob below this
COMPRESSION_RATIO_DROP_THRESHOLD = 2.4  # drop seg if compression_ratio above this (repetition/garbage)
# Three directory levels above this file's resolved location.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
# ``logs`` directory directly under PROJECT_ROOT.
LOGS_DIR = PROJECT_ROOT / "logs"
@@ -83,19 +91,28 @@ _silero_lock = threading.Lock()
def _get_whisper_model() -> Any:
    """Return the shared faster-whisper model, loading it on first use.

    The model is loaded as int8 on CPU with the spike-validated
    ``cpu_threads=4`` (see ``tasks/voice-bench-results.md``).

    The model id comes from the ``voice.stt_model`` config key; a missing or
    empty value falls back to ``"small"``. It may be a faster-whisper model
    name or a path to a local CT2 directory — e.g. the Romanian Common-Voice
    finetune that halved WER and fixed number transcription in the D1 spike
    (``tools/voice_stt_spike.py``). Custom paths still load with
    ``local_files_only=True`` since they live on disk.

    Returns:
        The cached ``WhisperModel`` instance stored in ``_whisper_model``.
    """
    global _whisper_model
    # Fast path: already loaded, no need to take the lock.
    if _whisper_model is not None:
        return _whisper_model
    with _whisper_lock:
        # Re-check under the lock so concurrent first callers load only once.
        if _whisper_model is not None:
            return _whisper_model
        # Imported lazily so the dependencies are only needed when a model is
        # actually loaded.
        from src.config import Config
        from faster_whisper import WhisperModel

        model_id = Config().get("voice.stt_model", "small") or "small"
        _whisper_model = WhisperModel(
            model_id, device="cpu", compute_type="int8", cpu_threads=4,
            local_files_only=True,
        )
        log.info("STT model loaded: %s", model_id)
    return _whisper_model
@@ -145,6 +162,38 @@ def _pcm48_stereo_to_16_mono(pcm: bytes) -> np.ndarray:
return np.ascontiguousarray(mono16, dtype=np.float32)
def _filter_segments(segments: Any) -> tuple[list[str], float]:
    """Collect the text of usable segments, skipping silence and hallucinations.

    Only iterates ``segments`` and reads attributes off each item (via
    ``getattr`` with defaults), so plain fake objects are enough to exercise
    the thresholds in unit tests.

    A segment is skipped as soon as one of these holds, checked in this order:

    1. ``no_speech_prob`` > ``NO_SPEECH_DROP_THRESHOLD`` (a missing or falsy
       value counts as ``0.0``);
    2. ``avg_logprob`` is present and < ``AVG_LOGPROB_DROP_THRESHOLD``;
    3. ``compression_ratio`` is present and >
       ``COMPRESSION_RATIO_DROP_THRESHOLD``.

    Checks 2 and 3 stand in for faster-whisper's temperature-fallback
    re-decode: bad segments are rejected instead of being decoded again.
    Segments whose stripped text is empty are skipped as well.

    Returns:
        ``(texts, peak_no_speech)``: the stripped texts of the surviving
        segments in input order, and the highest ``no_speech_prob`` seen
        across all segments, dropped ones included (``0.0`` for empty input).
    """
    kept: list[str] = []
    peak_no_speech = 0.0

    for segment in segments:
        no_speech = float(getattr(segment, "no_speech_prob", 0.0) or 0.0)
        # Track the maximum before any rejection so dropped segments count too.
        peak_no_speech = max(peak_no_speech, no_speech)
        if no_speech > NO_SPEECH_DROP_THRESHOLD:
            continue

        logprob = getattr(segment, "avg_logprob", None)
        if logprob is not None and float(logprob) < AVG_LOGPROB_DROP_THRESHOLD:
            continue

        ratio = getattr(segment, "compression_ratio", None)
        if ratio is not None and float(ratio) > COMPRESSION_RATIO_DROP_THRESHOLD:
            continue

        cleaned = (getattr(segment, "text", "") or "").strip()
        if not cleaned:
            continue
        kept.append(cleaned)

    return kept, peak_no_speech
# ---------- VoiceSession ----------
class VoiceSession:
@@ -679,6 +728,7 @@ class EchoVoiceSink(AudioSink):
model = _get_whisper_model()
segments, _info = model.transcribe(
mono16, language="ro", beam_size=5,
temperature=0.0, # no fallback ladder — reject bad segments instead (see thresholds above)
initial_prompt=(
"Conversatie in romana cu asistentul Eco (Echo Core). "
"Marius i se adreseaza cu 'Salut, Eco', 'Eco' sau 'Echo Core' "
@@ -689,20 +739,10 @@ class EchoVoiceSink(AudioSink):
"F1, F2, F3, F4, F5. Exemple: vorbeste cu vocea M5, voce F3, "
"treci pe vocea F1."
),
hotwords="Eco Echo Core Marius Bianca",
hotwords="Eco Echo Core Marius Bianca Bitcoin",
condition_on_previous_text=False,
)
text_parts: list[str] = []
worst_no_speech = 0.0
for seg in segments:
no_sp = float(getattr(seg, "no_speech_prob", 0.0) or 0.0)
if no_sp > worst_no_speech:
worst_no_speech = no_sp
if no_sp > NO_SPEECH_DROP_THRESHOLD:
continue
seg_text = (getattr(seg, "text", "") or "").strip()
if seg_text:
text_parts.append(seg_text)
text_parts, worst_no_speech = _filter_segments(segments)
if not text_parts:
return
text = " ".join(text_parts).strip()