feat(voice): improve Romanian STT — hallucination gate + finetuned model
Gemma 4 cloud audio was infeasible (31b-cloud has no audio; E4B broken upstream, no deploy host), so improve faster-whisper instead.

- Pin temperature=0.0 to disable the fallback ladder that re-decoded unclear audio up to 6x (source of the 16-24s latency outliers); reject hallucinated segments via avg_logprob/compression_ratio in the new pure _filter_segments.
- Adopt mikr/whisper-small-ro-cv11 (CT2 int8) via configurable voice.stt_model: spike showed WER 24%->10%, numbers fixed at source, +0.33s p50 (in budget).
- Add tools/voice_stt_mine.py (log mining) + tools/voice_stt_spike.py (model eval with diacritic scoring) + tests for the gate and miner.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
100
tests/test_voice_stt_mine.py
Normal file
100
tests/test_voice_stt_mine.py
Normal file
@@ -0,0 +1,100 @@
|
||||
"""Tests for tools/voice_stt_mine.py — STT log mining helpers.
|
||||
|
||||
Pure-function coverage: tokenize, token_frequency, rare_tokens,
|
||||
missing_diacritic_candidates, suspect_rows, row_text (back-compat with rows
|
||||
that predate the text_corrected field).
|
||||
"""
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
|
||||
from tools.voice_stt_mine import ( # noqa: E402
|
||||
missing_diacritic_candidates,
|
||||
rare_tokens,
|
||||
row_text,
|
||||
suspect_rows,
|
||||
token_frequency,
|
||||
tokenize,
|
||||
)
|
||||
|
||||
|
||||
def test_tokenize_lowercases_and_drops_punct():
    """Tokens come back lowercased with punctuation removed."""
    tokens = tokenize("Salut, Eco!")
    assert tokens == ["salut", "eco"]
|
||||
|
||||
|
||||
def test_tokenize_keeps_diacritics():
    """Letters carrying diacritics pass through tokenization unchanged."""
    expected = ["ședință", "și", "prețul"]
    assert tokenize("ședință și prețul") == expected
|
||||
|
||||
|
||||
def test_tokenize_drops_digits():
    """Digits never appear in tokens: "M3" yields "m" and "120" disappears."""
    tokens = tokenize("M3 are 120 lei")
    # Only the alphabetic parts of the input count as word tokens.
    assert tokens == ["m", "are", "lei"]
|
||||
|
||||
|
||||
def test_tokenize_empty_and_none():
    """Both the empty string and None tokenize to an empty list."""
    for blank in ("", None):
        assert tokenize(blank) == []
|
||||
|
||||
|
||||
def test_row_text_prefers_raw_text_field():
    """row_text returns `text` even when `text_corrected` is also present."""
    # Mining is meant to look at the raw STT output, so the corrected
    # transcript carried by newer rows must not take precedence.
    row = {"text": "cat", "text_corrected": "cât"}
    assert row_text(row) == "cat"
|
||||
|
||||
|
||||
def test_row_text_missing_field():
    """A row with no text field at all yields the empty string."""
    empty_row = {}
    assert row_text(empty_row) == ""
|
||||
|
||||
|
||||
def test_token_frequency_counts_across_rows():
    """Counts accumulate over every row, with "Eco" folded into "eco"."""
    counts = token_frequency([{"text": "eco eco"}, {"text": "Eco salut"}])
    for token, expected in (("eco", 3), ("salut", 1)):
        assert counts[token] == expected
|
||||
|
||||
|
||||
def test_rare_tokens_returns_singletons_sorted():
    """Only tokens seen once are returned, in sorted order."""
    freq = token_frequency([{"text": "eco eco salut bitcoin"}])
    singletons = rare_tokens(freq)
    # "eco" occurs twice, so it must be left out of the result.
    assert singletons == ["bitcoin", "salut"]
    assert "eco" not in singletons
|
||||
|
||||
|
||||
def test_missing_diacritic_candidates_flags_ascii_words():
    """Plain-ASCII words are candidates; words with diacritics are not."""
    freq = token_frequency(
        [{"text": "pretul este mare"}, {"text": "ședință corectă"}]
    )
    candidates = missing_diacritic_candidates(freq, min_len=4)
    for ascii_word in ("pretul", "mare"):
        assert ascii_word in candidates
    # Words that already carry diacritics are not restore candidates.
    for accented_word in ("ședință", "corectă"):
        assert accented_word not in candidates
|
||||
|
||||
|
||||
def test_missing_diacritic_respects_min_len():
    """Three-letter words are excluded when min_len is 4."""
    freq = token_frequency([{"text": "cat de bun"}])
    candidates = missing_diacritic_candidates(freq, min_len=4)
    for short_word in ("cat", "bun"):  # both have length 3 < 4
        assert short_word not in candidates
|
||||
|
||||
|
||||
def test_suspect_rows_flags_high_latency():
    """Of a 2.0 s row and a 24.4 s row, only the slow one is flagged."""
    fast_row = {"text": "ok", "stt_latency_s": 2.0, "no_speech_prob": 0.0}
    slow_row = {"text": "M3.", "stt_latency_s": 24.4, "no_speech_prob": 0.58}
    flagged = suspect_rows([fast_row, slow_row])
    assert len(flagged) == 1
    assert flagged[0]["text"] == "M3."
|
||||
|
||||
|
||||
def test_suspect_rows_flags_borderline_no_speech():
    """A 1.0 s row with no_speech_prob 0.55 is still flagged."""
    borderline = {"text": "x", "stt_latency_s": 1.0, "no_speech_prob": 0.55}
    flagged = suspect_rows([borderline])
    assert len(flagged) == 1
|
||||
|
||||
|
||||
def test_suspect_rows_tolerates_missing_fields():
    """Rows lacking latency and no-speech fields neither crash nor get flagged."""
    bare_row = {"text": "x"}
    assert suspect_rows([bare_row]) == []
|
||||
Reference in New Issue
Block a user