feat(voice): improve Romanian STT — hallucination gate + finetuned model

Gemma 4 cloud audio was infeasible (31b-cloud has no audio; E4B is broken upstream and there is no deploy host), so improve faster-whisper instead.

- Pin temperature=0.0 to disable the fallback ladder that re-decoded unclear audio up to 6x (the source of the 16-24s latency outliers); reject hallucinated segments via avg_logprob/compression_ratio in the new pure _filter_segments.
- Adopt mikr/whisper-small-ro-cv11 (CT2 int8) via configurable voice.stt_model: spike showed WER 24%->10%, numbers fixed at source, +0.33s p50 (in budget).
- Add tools/voice_stt_mine.py (log mining) + tools/voice_stt_spike.py (model eval with diacritic scoring) + tests for the gate and miner.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-27 18:16:16 +00:00
parent ec23d188ec
commit ce273d14db
9 changed files with 664 additions and 16 deletions
--- a/tests/test_voice_pipeline_filter.py
+++ b/tests/test_voice_pipeline_filter.py
@@ -0,0 +1,85 @@
+"""Tests for src/voice/pipeline.py::_filter_segments — STT hallucination gate.
+
+The gate replaces faster-whisper's temperature-fallback re-decode (the source of
+16-24s latency outliers) with zero-cost segment rejection on no_speech_prob,
+avg_logprob, and compression_ratio.
+"""
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional
+
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+
+from src.voice.pipeline import (  # noqa: E402
+    AVG_LOGPROB_DROP_THRESHOLD,
+    COMPRESSION_RATIO_DROP_THRESHOLD,
+    NO_SPEECH_DROP_THRESHOLD,
+    _filter_segments,
+)
+
+
+@dataclass
+class FakeSeg:
+    text: str = ""
+    no_speech_prob: float = 0.0
+    avg_logprob: Optional[float] = 0.0
+    compression_ratio: Optional[float] = 1.0
+
+
+def test_keeps_clean_segment():
+    parts, worst = _filter_segments([FakeSeg(text="salut eco", avg_logprob=-0.3, compression_ratio=1.5)])
+    assert parts == ["salut eco"]
+    assert worst == 0.0
+
+
+def test_drops_high_no_speech():
+    seg = FakeSeg(text="hmm", no_speech_prob=NO_SPEECH_DROP_THRESHOLD + 0.1)
+    parts, worst = _filter_segments([seg])
+    assert parts == []
+    assert worst == NO_SPEECH_DROP_THRESHOLD + 0.1  # still tracked for logging
+
+
+def test_drops_low_avg_logprob_hallucination():
+    # "Care pune o zana judiciul tugea" style: decoder unsure
+    seg = FakeSeg(text="zana judiciul tugea", avg_logprob=AVG_LOGPROB_DROP_THRESHOLD - 0.5)
+    parts, _ = _filter_segments([seg])
+    assert parts == []
+
+
+def test_drops_high_compression_ratio_loop():
+    seg = FakeSeg(text="da da da da da", compression_ratio=COMPRESSION_RATIO_DROP_THRESHOLD + 1.0)
+    parts, _ = _filter_segments([seg])
+    assert parts == []
+
+
+def test_keeps_when_metrics_missing():
+    # Older/edge segments may not expose avg_logprob/compression_ratio
+    seg = FakeSeg(text="ok", avg_logprob=None, compression_ratio=None)
+    parts, _ = _filter_segments([seg])
+    assert parts == ["ok"]
+
+
+def test_drops_empty_text():
+    parts, _ = _filter_segments([FakeSeg(text="   ", avg_logprob=-0.2)])
+    assert parts == []
+
+
+def test_worst_no_speech_is_max_across_segments():
+    segs = [
+        FakeSeg(text="a", no_speech_prob=0.1, avg_logprob=-0.2),
+        FakeSeg(text="b", no_speech_prob=0.4, avg_logprob=-0.2),
+    ]
+    parts, worst = _filter_segments(segs)
+    assert parts == ["a", "b"]
+    assert worst == 0.4
+
+
+def test_mixed_keep_and_drop():
+    segs = [
+        FakeSeg(text="bun venit", avg_logprob=-0.3),
+        FakeSeg(text="garbage", avg_logprob=-3.0),       # dropped: low logprob
+        FakeSeg(text="la revedere", avg_logprob=-0.5),
+    ]
+    parts, _ = _filter_segments(segs)
+    assert parts == ["bun venit", "la revedere"]
--- a/tests/test_voice_stt_mine.py
+++ b/tests/test_voice_stt_mine.py
@@ -0,0 +1,100 @@
+"""Tests for tools/voice_stt_mine.py — STT log mining helpers.
+
+Pure-function coverage: tokenize, token_frequency, rare_tokens,
+missing_diacritic_candidates, suspect_rows, row_text (back-compat with rows
+that predate the text_corrected field).
+"""
+import sys
+from pathlib import Path
+
+import pytest
+
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+
+from tools.voice_stt_mine import (  # noqa: E402
+    missing_diacritic_candidates,
+    rare_tokens,
+    row_text,
+    suspect_rows,
+    token_frequency,
+    tokenize,
+)
+
+
+def test_tokenize_lowercases_and_drops_punct():
+    assert tokenize("Salut, Eco!") == ["salut", "eco"]
+
+
+def test_tokenize_keeps_diacritics():
+    assert tokenize("ședință și prețul") == ["ședință", "și", "prețul"]
+
+
+def test_tokenize_drops_digits():
+    # Digits are not word characters: "M3" keeps only its letter, "120" is dropped entirely
+    assert tokenize("M3 are 120 lei") == ["m", "are", "lei"]
+
+
+def test_tokenize_empty_and_none():
+    assert tokenize("") == []
+    assert tokenize(None) == []
+
+
+def test_row_text_prefers_raw_text_field():
+    # Mining always wants raw STT output (the `text` field), even once
+    # newer rows add `text_corrected`.
+    assert row_text({"text": "cat", "text_corrected": "cât"}) == "cat"
+
+
+def test_row_text_missing_field():
+    assert row_text({}) == ""
+
+
+def test_token_frequency_counts_across_rows():
+    rows = [{"text": "eco eco"}, {"text": "Eco salut"}]
+    freq = token_frequency(rows)
+    assert freq["eco"] == 3
+    assert freq["salut"] == 1
+
+
+def test_rare_tokens_returns_singletons_sorted():
+    rows = [{"text": "eco eco salut bitcoin"}]
+    rare = rare_tokens(token_frequency(rows))
+    assert rare == ["bitcoin", "salut"]  # eco appears twice -> excluded
+    assert "eco" not in rare
+
+
+def test_missing_diacritic_candidates_flags_ascii_words():
+    rows = [{"text": "pretul este mare"}, {"text": "ședință corectă"}]
+    cands = missing_diacritic_candidates(token_frequency(rows), min_len=4)
+    assert "pretul" in cands
+    assert "mare" in cands
+    # words carrying diacritics are NOT restore candidates
+    assert "ședință" not in cands
+    assert "corectă" not in cands
+
+
+def test_missing_diacritic_respects_min_len():
+    rows = [{"text": "cat de bun"}]
+    cands = missing_diacritic_candidates(token_frequency(rows), min_len=4)
+    assert "cat" not in cands  # len 3 < 4
+    assert "bun" not in cands
+
+
+def test_suspect_rows_flags_high_latency():
+    rows = [
+        {"text": "ok", "stt_latency_s": 2.0, "no_speech_prob": 0.0},
+        {"text": "M3.", "stt_latency_s": 24.4, "no_speech_prob": 0.58},
+    ]
+    suspects = suspect_rows(rows)
+    assert len(suspects) == 1
+    assert suspects[0]["text"] == "M3."
+
+
+def test_suspect_rows_flags_borderline_no_speech():
+    rows = [{"text": "x", "stt_latency_s": 1.0, "no_speech_prob": 0.55}]
+    assert len(suspect_rows(rows)) == 1
+
+
+def test_suspect_rows_tolerates_missing_fields():
+    # rows without latency/no_speech must not crash
+    assert suspect_rows([{"text": "x"}]) == []