"""Tests for tools/voice_stt_mine.py — STT log mining helpers. Pure-function coverage: tokenize, token_frequency, rare_tokens, missing_diacritic_candidates, suspect_rows, row_text (back-compat with rows that predate the text_corrected field). """ import sys from pathlib import Path import pytest sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from tools.voice_stt_mine import ( # noqa: E402 missing_diacritic_candidates, rare_tokens, row_text, suspect_rows, token_frequency, tokenize, ) def test_tokenize_lowercases_and_drops_punct(): assert tokenize("Salut, Eco!") == ["salut", "eco"] def test_tokenize_keeps_diacritics(): assert tokenize("ședință și prețul") == ["ședință", "și", "prețul"] def test_tokenize_drops_digits(): # M3, numbers etc. are not alphabetic word tokens assert tokenize("M3 are 120 lei") == ["m", "are", "lei"] def test_tokenize_empty_and_none(): assert tokenize("") == [] assert tokenize(None) == [] def test_row_text_prefers_raw_text_field(): # Mining always wants raw STT output (the `text` field), even once # newer rows add `text_corrected`. assert row_text({"text": "cat", "text_corrected": "cât"}) == "cat" def test_row_text_missing_field(): assert row_text({}) == "" def test_token_frequency_counts_across_rows(): rows = [{"text": "eco eco"}, {"text": "Eco salut"}] freq = token_frequency(rows) assert freq["eco"] == 3 assert freq["salut"] == 1 def test_rare_tokens_returns_singletons_sorted(): rows = [{"text": "eco eco salut bitcoin"}] rare = rare_tokens(token_frequency(rows)) assert rare == ["bitcoin", "salut"] # eco appears twice -> excluded assert "eco" not in rare def test_missing_diacritic_candidates_flags_ascii_words(): rows = [{"text": "pretul este mare"}, {"text": "ședință corectă"}] cands = missing_diacritic_candidates(token_frequency(rows), min_len=4) assert "pretul" in cands assert "mare" in cands # words carrying diacritics are NOT restore candidates assert "ședință" not in cands assert "corectă" not in cands def test_missing_diacritic_respects_min_len(): rows = [{"text": "cat de bun"}] cands = missing_diacritic_candidates(token_frequency(rows), min_len=4) assert "cat" not in cands # len 3 < 4 assert "bun" not in cands def test_suspect_rows_flags_high_latency(): rows = [ {"text": "ok", "stt_latency_s": 2.0, "no_speech_prob": 0.0}, {"text": "M3.", "stt_latency_s": 24.4, "no_speech_prob": 0.58}, ] suspects = suspect_rows(rows) assert len(suspects) == 1 assert suspects[0]["text"] == "M3." def test_suspect_rows_flags_borderline_no_speech(): rows = [{"text": "x", "stt_latency_s": 1.0, "no_speech_prob": 0.55}] assert len(suspect_rows(rows)) == 1 def test_suspect_rows_tolerates_missing_fields(): # rows without latency/no_speech must not crash assert suspect_rows([{"text": "x"}]) == []