feat(voice): improve Romanian STT — hallucination gate + finetuned model

Gemma 4 cloud audio was infeasible (31b-cloud has no audio; E4B broken upstream, no deploy host), so improve faster-whisper instead.

- Pin temperature=0.0 to disable the fallback ladder that decoded unclear audio up to 6x (the source of the 16-24s latency outliers); reject hallucinated segments via avg_logprob/compression_ratio in the new pure _filter_segments.
- Adopt mikr/whisper-small-ro-cv11 (CT2 int8) via the configurable voice.stt_model: the spike showed WER 24% -> 10%, number transcription fixed at the source, +0.33s p50 latency (in budget).
- Add tools/voice_stt_mine.py (log mining) + tools/voice_stt_spike.py (model eval with diacritic scoring) + tests for the gate and miner.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-27 18:16:16 +00:00
parent ec23d188ec
commit ce273d14db
9 changed files with 664 additions and 16 deletions
--- a/src/voice/pipeline.py
+++ b/src/voice/pipeline.py
@@ -49,6 +49,14 @@ VAD_WINDOW_BYTES = PACKET_BYTES * (VAD_WINDOW_MS // PACKET_MS)
 VAD_THRESHOLD = 0.5
 SILENCE_FLUSH_MS = 800
 NO_SPEECH_DROP_THRESHOLD = 0.6
+# Hallucination rejection (no re-decode). faster-whisper's default temperature
+# is a fallback ladder [0.0, 0.2, ..., 1.0]; on unclear audio it decodes a segment
+# up to 6x, which is what produced the 16-24s outliers in voice_stt_log.jsonl
+# against a >7s conversational-abort budget. We pin temperature=0.0 (no fallback)
+# and instead REJECT low-quality segments using the avg_logprob / compression_ratio
+# that faster-whisper already computes per segment — zero extra latency.
+AVG_LOGPROB_DROP_THRESHOLD = -1.0      # drop seg if avg_logprob below this
+COMPRESSION_RATIO_DROP_THRESHOLD = 2.4  # drop seg if gzip ratio above this (repetition/garbage)

 PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
 LOGS_DIR = PROJECT_ROOT / "logs"
@@ -83,19 +91,28 @@ _silero_lock = threading.Lock()


 def _get_whisper_model() -> Any:
-    """Lazy-load faster-whisper ``small`` int8 with the spike-validated
-    ``cpu_threads=4`` (see ``tasks/voice-bench-results.md``)."""
+    """Lazy-load faster-whisper int8 with the spike-validated ``cpu_threads=4``
+    (see ``tasks/voice-bench-results.md``).
+
+    Model is configurable via ``voice.stt_model`` (default ``"small"``). It may be
+    a faster-whisper model name or a path to a local CT2 dir — e.g. the Romanian
+    Common-Voice finetune that halved WER and fixed number transcription in the
+    D1 spike (``tools/voice_stt_spike.py``). Custom paths still load with
+    ``local_files_only=True`` since they live on disk."""
    global _whisper_model
    if _whisper_model is not None:
        return _whisper_model
    with _whisper_lock:
        if _whisper_model is not None:
            return _whisper_model
+        from src.config import Config
+        model_id = Config().get("voice.stt_model", "small") or "small"
        from faster_whisper import WhisperModel
        _whisper_model = WhisperModel(
-            "small", device="cpu", compute_type="int8", cpu_threads=4,
+            model_id, device="cpu", compute_type="int8", cpu_threads=4,
            local_files_only=True,
        )
+        log.info("STT model loaded: %s", model_id)
        return _whisper_model


@@ -145,6 +162,38 @@ def _pcm48_stereo_to_16_mono(pcm: bytes) -> np.ndarray:
    return np.ascontiguousarray(mono16, dtype=np.float32)


+def _filter_segments(segments: Any) -> tuple[list[str], float]:
+    """Keep transcribable segments, drop silence and hallucinations.
+
+    Pure + side-effect free (no model, no I/O) so the rejection thresholds are
+    unit-testable with fake segment objects. A segment is dropped when:
+      - ``no_speech_prob`` is high (silence/non-speech), OR
+      - ``avg_logprob`` is below ``AVG_LOGPROB_DROP_THRESHOLD`` (decoder unsure), OR
+      - ``compression_ratio`` exceeds ``COMPRESSION_RATIO_DROP_THRESHOLD`` (looped/garbage).
+    The avg_logprob/compression checks replace faster-whisper's temperature-fallback
+    re-decode (the source of the 16-24s latency outliers) with zero-cost rejection.
+    Returns ``(kept_text_parts, worst_no_speech_prob)``.
+    """
+    text_parts: list[str] = []
+    worst_no_speech = 0.0
+    for seg in segments:
+        no_sp = float(getattr(seg, "no_speech_prob", 0.0) or 0.0)
+        if no_sp > worst_no_speech:
+            worst_no_speech = no_sp
+        if no_sp > NO_SPEECH_DROP_THRESHOLD:
+            continue
+        avg_lp = getattr(seg, "avg_logprob", None)
+        if avg_lp is not None and float(avg_lp) < AVG_LOGPROB_DROP_THRESHOLD:
+            continue
+        comp = getattr(seg, "compression_ratio", None)
+        if comp is not None and float(comp) > COMPRESSION_RATIO_DROP_THRESHOLD:
+            continue
+        seg_text = (getattr(seg, "text", "") or "").strip()
+        if seg_text:
+            text_parts.append(seg_text)
+    return text_parts, worst_no_speech
+
+
 # ---------- VoiceSession ----------

 class VoiceSession:
@@ -679,6 +728,7 @@ class EchoVoiceSink(AudioSink):
            model = _get_whisper_model()
            segments, _info = model.transcribe(
                mono16, language="ro", beam_size=5,
+                temperature=0.0,  # no fallback ladder — reject bad segments instead (see thresholds above)
                initial_prompt=(
                    "Conversatie in romana cu asistentul Eco (Echo Core). "
                    "Marius i se adreseaza cu 'Salut, Eco', 'Eco' sau 'Echo Core' "
@@ -689,20 +739,10 @@ class EchoVoiceSink(AudioSink):
                    "F1, F2, F3, F4, F5. Exemple: vorbeste cu vocea M5, voce F3, "
                    "treci pe vocea F1."
                ),
-                hotwords="Eco Echo Core Marius Bianca",
+                hotwords="Eco Echo Core Marius Bianca Bitcoin",
                condition_on_previous_text=False,
            )
-            text_parts: list[str] = []
-            worst_no_speech = 0.0
-            for seg in segments:
-                no_sp = float(getattr(seg, "no_speech_prob", 0.0) or 0.0)
-                if no_sp > worst_no_speech:
-                    worst_no_speech = no_sp
-                if no_sp > NO_SPEECH_DROP_THRESHOLD:
-                    continue
-                seg_text = (getattr(seg, "text", "") or "").strip()
-                if seg_text:
-                    text_parts.append(seg_text)
+            text_parts, worst_no_speech = _filter_segments(segments)
            if not text_parts:
                return
            text = " ".join(text_parts).strip()