Files
echo-core/tests/test_voice_stt_mine.py
Marius Mutu ce273d14db feat(voice): improve Romanian STT — hallucination gate + finetuned model
Gemma 4 cloud audio was infeasible (31b-cloud has no audio; E4B broken
upstream, no deploy host), so improve faster-whisper instead.

- Pin temperature=0.0 to disable the fallback ladder that re-decoded unclear
  audio up to 6x (source of the 16-24s latency outliers); reject hallucinated
  segments via avg_logprob/compression_ratio in the new pure _filter_segments.
- Adopt mikr/whisper-small-ro-cv11 (CT2 int8) via configurable voice.stt_model:
  spike showed WER 24%->10%, numbers fixed at source, +0.33s p50 (in budget).
- Add tools/voice_stt_mine.py (log mining) + tools/voice_stt_spike.py (model
  eval with diacritic scoring) + tests for the gate and miner.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-27 18:16:16 +00:00

101 lines
2.9 KiB
Python

"""Tests for tools/voice_stt_mine.py — STT log mining helpers.
Pure-function coverage: tokenize, token_frequency, rare_tokens,
missing_diacritic_candidates, suspect_rows, row_text (back-compat with rows
that predate the text_corrected field).
"""
import sys
from pathlib import Path
import pytest
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from tools.voice_stt_mine import ( # noqa: E402
missing_diacritic_candidates,
rare_tokens,
row_text,
suspect_rows,
token_frequency,
tokenize,
)
def test_tokenize_lowercases_and_drops_punct():
    """Mixed-case input with punctuation should yield bare lowercase words."""
    tokens = tokenize("Salut, Eco!")
    assert tokens == ["salut", "eco"]
def test_tokenize_keeps_diacritics():
    """Romanian diacritics must come through tokenization unchanged."""
    expected = ["ședință", "și", "prețul"]
    assert tokenize("ședință și prețul") == expected
def test_tokenize_drops_digits():
    """Digits are stripped: "M3" shrinks to "m" and "120" disappears."""
    # Only the alphabetic parts of the input are expected to survive.
    tokens = tokenize("M3 are 120 lei")
    assert tokens == ["m", "are", "lei"]
def test_tokenize_empty_and_none():
    """Both the empty string and None should tokenize to an empty list."""
    for blank in ("", None):
        assert tokenize(blank) == []
def test_row_text_prefers_raw_text_field():
    """row_text should return `text` even when `text_corrected` is present."""
    # Mining works on the raw STT output, so the corrected variant carried
    # by newer rows must not win.
    row = {"text": "cat", "text_corrected": "cât"}
    assert row_text(row) == "cat"
def test_row_text_missing_field():
    """A row with no text field at all should produce an empty string."""
    empty_row = {}
    assert row_text(empty_row) == ""
def test_token_frequency_counts_across_rows():
    """Counts should accumulate over all rows, ignoring letter case."""
    counts = token_frequency([{"text": "eco eco"}, {"text": "Eco salut"}])
    # "eco" appears twice in the first row and once (capitalised) in the second.
    assert counts["eco"] == 3
    assert counts["salut"] == 1
def test_rare_tokens_returns_singletons_sorted():
    """Only tokens seen once should be returned, in sorted order."""
    counts = token_frequency([{"text": "eco eco salut bitcoin"}])
    singletons = rare_tokens(counts)
    # "eco" occurs twice in the input, so it must be left out.
    assert singletons == ["bitcoin", "salut"]
    assert "eco" not in singletons
def test_missing_diacritic_candidates_flags_ascii_words():
    """Plain-ASCII words are candidates; words with diacritics are not."""
    counts = token_frequency(
        [{"text": "pretul este mare"}, {"text": "ședință corectă"}]
    )
    candidates = missing_diacritic_candidates(counts, min_len=4)
    for plain_word in ("pretul", "mare"):
        assert plain_word in candidates
    # Words that already carry diacritics need no restoration.
    for accented_word in ("ședință", "corectă"):
        assert accented_word not in candidates
def test_missing_diacritic_respects_min_len():
    """Words shorter than min_len must not be reported as candidates."""
    counts = token_frequency([{"text": "cat de bun"}])
    candidates = missing_diacritic_candidates(counts, min_len=4)
    # Both checked words have 3 letters, below the min_len of 4.
    for short_word in ("cat", "bun"):
        assert short_word not in candidates
def test_suspect_rows_flags_high_latency():
    """A row whose only anomaly is a very long STT latency must be flagged.

    The slow row deliberately uses no_speech_prob=0.0. With a borderline
    value there (the sibling borderline test shows 0.55 alone gets a row
    flagged), this test could pass without the latency check ever firing.
    """
    rows = [
        {"text": "ok", "stt_latency_s": 2.0, "no_speech_prob": 0.0},
        # Latency is the sole reason this row can be suspect.
        {"text": "M3.", "stt_latency_s": 24.4, "no_speech_prob": 0.0},
    ]
    suspects = suspect_rows(rows)
    assert len(suspects) == 1
    assert suspects[0]["text"] == "M3."
def test_suspect_rows_flags_borderline_no_speech():
    """A row with a no_speech_prob of 0.55 should be reported as suspect."""
    # Latency is kept small (1.0 s), presumably below the latency cutoff,
    # so no_speech_prob is the field under test.
    borderline = {"text": "x", "stt_latency_s": 1.0, "no_speech_prob": 0.55}
    flagged = suspect_rows([borderline])
    assert len(flagged) == 1
def test_suspect_rows_tolerates_missing_fields():
    """Rows lacking latency and no_speech fields must not raise or be flagged."""
    sparse_rows = [{"text": "x"}]
    assert suspect_rows(sparse_rows) == []