# echo-core/tests/test_voice_stt_mine.py

"""Tests for tools/voice_stt_mine.py — STT log mining helpers.

Pure-function coverage: tokenize, token_frequency, rare_tokens,
missing_diacritic_candidates, suspect_rows, row_text (back-compat with rows
that predate the text_corrected field).
"""
import sys
from pathlib import Path

import pytest

# Put the directory two levels above this file (the parent of the tests
# directory) at the front of sys.path so the `tools` package imports below
# resolve when the tests run without an installed package. The import must
# come after this line, hence the E402 suppression.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from tools.voice_stt_mine import (  # noqa: E402
    missing_diacritic_candidates,
    rare_tokens,
    row_text,
    suspect_rows,
    token_frequency,
    tokenize,
)


def test_tokenize_lowercases_and_drops_punct():
    """Mixed-case input with punctuation yields only lowercase word tokens."""
    tokens = tokenize("Salut, Eco!")
    assert tokens == ["salut", "eco"]


def test_tokenize_keeps_diacritics():
    """Letters carrying diacritics come through tokenization unchanged."""
    expected = ["ședință", "și", "prețul"]
    assert tokenize("ședință și prețul") == expected


def test_tokenize_drops_digits():
    """Digits are not word characters: 'M3' leaves 'm', '120' leaves nothing."""
    tokens = tokenize("M3 are 120 lei")
    assert tokens == ["m", "are", "lei"]


def test_tokenize_empty_and_none():
    """Both the empty string and None tokenize to an empty list."""
    for blank in ("", None):
        assert tokenize(blank) == []


def test_row_text_prefers_raw_text_field():
    """row_text returns the raw `text` value even when `text_corrected` exists."""
    # Mining works on the raw STT output, so the corrected variant carried by
    # newer rows must be ignored here.
    row = {"text": "cat", "text_corrected": "cât"}
    assert row_text(row) == "cat"


def test_row_text_missing_field():
    """A row with no `text` key yields the empty string."""
    empty_row = {}
    assert row_text(empty_row) == ""


def test_token_frequency_counts_across_rows():
    """Counts accumulate over all rows and fold case ('Eco' counts as 'eco')."""
    counts = token_frequency([{"text": "eco eco"}, {"text": "Eco salut"}])
    expected = {"eco": 3, "salut": 1}
    for token, occurrences in expected.items():
        assert counts[token] == occurrences


def test_rare_tokens_returns_singletons_sorted():
    """Only tokens seen exactly once are returned, in alphabetical order."""
    freq = token_frequency([{"text": "eco eco salut bitcoin"}])
    singletons = rare_tokens(freq)
    # "eco" occurs twice, so it must be left out of the result.
    assert singletons == ["bitcoin", "salut"]
    assert "eco" not in singletons


def test_missing_diacritic_candidates_flags_ascii_words():
    """Plain-ASCII words are candidates; words already carrying diacritics are not."""
    freq = token_frequency(
        [{"text": "pretul este mare"}, {"text": "ședință corectă"}]
    )
    found = missing_diacritic_candidates(freq, min_len=4)
    for ascii_word in ("pretul", "mare"):
        assert ascii_word in found
    # Tokens that already contain diacritics need no restoration.
    for accented_word in ("ședință", "corectă"):
        assert accented_word not in found


def test_missing_diacritic_respects_min_len():
    """Words shorter than `min_len` are never reported as candidates."""
    freq = token_frequency([{"text": "cat de bun"}])
    found = missing_diacritic_candidates(freq, min_len=4)
    # Both words are 3 characters long, below the min_len of 4.
    for short_word in ("cat", "bun"):
        assert short_word not in found


def test_suspect_rows_flags_high_latency():
    """Of a fast row and a slow row, only the slow one is reported as suspect."""
    fast_row = {"text": "ok", "stt_latency_s": 2.0, "no_speech_prob": 0.0}
    # NOTE(review): the slow row also carries no_speech_prob=0.58, so this
    # case may not isolate latency as the trigger — confirm against suspect_rows.
    slow_row = {"text": "M3.", "stt_latency_s": 24.4, "no_speech_prob": 0.58}
    flagged = suspect_rows([fast_row, slow_row])
    assert len(flagged) == 1
    assert flagged[0]["text"] == "M3."


def test_suspect_rows_flags_borderline_no_speech():
    """A low-latency row with no_speech_prob of 0.55 is still reported."""
    borderline_row = {"text": "x", "stt_latency_s": 1.0, "no_speech_prob": 0.55}
    flagged = suspect_rows([borderline_row])
    assert len(flagged) == 1


def test_suspect_rows_tolerates_missing_fields():
    """A row lacking latency and no-speech fields neither crashes nor is flagged."""
    bare_row = {"text": "x"}
    assert suspect_rows([bare_row]) == []