echo-core/tools/voice_stt_mine.py

#!/usr/bin/env python3
"""Mine logs/voice_stt_log.jsonl for STT correction candidates.

Read-only analysis tool. Surfaces what the always-on STT log has captured so
Marius can decide hotwords, spot recurring mistranscriptions, and judge whether
a model swap (e.g. a Romanian-finetuned Whisper) actually helps.

Pure helpers (tokenize / aggregate) are importable and tested; the CLI just
prints reports. Mining always reads the raw STT output in `text` and never
`text_corrected`, so rows written before that field existed need no special
handling.

Usage:
    python3 tools/voice_stt_mine.py                # full report
    python3 tools/voice_stt_mine.py --tokens       # token frequency only
    python3 tools/voice_stt_mine.py --rare         # one-off tokens (candidates)
    python3 tools/voice_stt_mine.py --suspect      # likely hallucination rows
    python3 tools/voice_stt_mine.py --log PATH     # custom log path
"""
from __future__ import annotations

import argparse
import json
import re
import sys
from collections import Counter
from pathlib import Path
from typing import Iterable, Iterator

# Two levels up from this file; the default log is looked up under logs/ there.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DEFAULT_LOG = PROJECT_ROOT / "logs" / "voice_stt_log.jsonl"

# Latency above this (s) almost always means the decoder thrashed on unclear
# audio — a strong hallucination signal worth reviewing. Mirrors the >7s
# conversational-abort budget from tasks/voice-bench-results.md.
SUSPECT_LATENCY_S = 7.0
# A row whose no_speech_prob is at or above this is flagged as suspect.
SUSPECT_NO_SPEECH = 0.5

# Standard Romanian diacritic letters (comma-below Ș/Ț).
_RO_STANDARD_DIACRITICS = "ĂÂÎȘȚăâîșț"
# Legacy cedilla look-alikes Ş ş Ţ ţ. They are different code points from the
# comma-below letters but may still show up in transcripts; without them a
# word such as "\u015ftiu" would lose its first letter when tokenized.
# Written as escapes because the two families are visually indistinguishable
# in many fonts.
_RO_LEGACY_CEDILLA = "\u015e\u015f\u0162\u0163"
_RO_DIACRITIC_CHARS = _RO_STANDARD_DIACRITICS + _RO_LEGACY_CEDILLA

# A token is a maximal run of ASCII letters and Romanian diacritic letters.
_TOKEN_RE = re.compile(rf"[A-Za-z{_RO_DIACRITIC_CHARS}]+", re.UNICODE)
# Romanian diacritic letters; a token with none of these is a diacritic-restore
# candidate worth a human glance (not auto-corrected — see plan D2).
_DIACRITICS = set(_RO_DIACRITIC_CHARS)


def read_log(path: Path) -> list[dict]:
    """Parse the JSONL log into a list of row dicts, in file order.

    Tolerant by design so one bad line never kills a report: a missing file
    yields an empty list; blank lines, lines that are not valid JSON, and
    JSON values that are not objects (a bare list, number, string or null)
    are skipped; undecodable bytes are replaced rather than raising."""
    rows: list[dict] = []
    try:
        # Open directly instead of checking exists() first: no race between
        # the check and the open. NotADirectoryError covers a path whose
        # parent component is a regular file (exists() reported False there).
        handle = path.open(encoding="utf-8", errors="replace")
    except (FileNotFoundError, NotADirectoryError):
        return rows
    with handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue
            try:
                parsed = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Downstream helpers call .get() on every row, so anything that
            # is not a JSON object counts as malformed.
            if isinstance(parsed, dict):
                rows.append(parsed)
    return rows


def row_text(row: dict) -> str:
    """Return the row's raw transcript with surrounding whitespace removed.

    Only the `text` field is consulted; a `text_corrected` field, if present,
    is ignored because mining wants the raw STT output. A missing or empty
    `text` gives an empty string."""
    raw = row.get("text")
    if not raw:
        return ""
    return raw.strip()


def tokenize(text: str) -> list[str]:
    """Return the lowercased word tokens of `text`, in order of appearance.

    A token is whatever `_TOKEN_RE` matches; everything else (digits,
    punctuation, whitespace) only separates tokens. Falsy input yields an
    empty list."""
    source = text if text else ""
    return [match.group(0).lower() for match in _TOKEN_RE.finditer(source)]


def token_frequency(rows: Iterable[dict]) -> Counter:
    """Count how often each token occurs across the raw transcripts of all
    rows (tokens as produced by `tokenize` on `row_text`)."""
    return Counter(
        token
        for entry in rows
        for token in tokenize(row_text(entry))
    )


def rare_tokens(freq: Counter, max_count: int = 1) -> list[str]:
    """Return, alphabetically sorted, every token whose count does not exceed
    `max_count`. These are the review candidates: possible mistranscriptions,
    proper nouns worth adding as hotwords, or code-switch garbage."""
    infrequent = [token for token in freq if freq[token] <= max_count]
    infrequent.sort()
    return infrequent


def missing_diacritic_candidates(freq: Counter, min_len: int = 4) -> list[str]:
    """List the tokens a diacritic-restore pass would look at.

    A token qualifies when it is at least `min_len` characters long, is
    purely alphabetic, and contains none of the letters in `_DIACRITICS`.
    The result is ordered by descending count, ties broken alphabetically.
    This is a review list only (v1 does not auto-restore, per plan D2)."""

    def _qualifies(token: str) -> bool:
        if len(token) < min_len:
            return False
        if not _DIACRITICS.isdisjoint(token):
            return False
        return token.isalpha()

    return sorted(
        (token for token in freq if _qualifies(token)),
        key=lambda token: (-freq[token], token),
    )


def suspect_rows(rows: Iterable[dict]) -> list[dict]:
    """Pick out the rows that look like hallucinations, preserving order.

    A row is flagged when its `stt_latency_s` reaches SUSPECT_LATENCY_S or its
    `no_speech_prob` reaches SUSPECT_NO_SPEECH. A missing or falsy value for
    either field counts as 0.0."""

    def _is_suspect(entry: dict) -> bool:
        # Both values are converted before comparing, so a bad value in
        # either field surfaces regardless of the other one.
        latency = float(entry.get("stt_latency_s") or 0.0)
        no_speech = float(entry.get("no_speech_prob") or 0.0)
        return latency >= SUSPECT_LATENCY_S or no_speech >= SUSPECT_NO_SPEECH

    return [entry for entry in rows if _is_suspect(entry)]


def _iter_report(rows: list[dict]) -> Iterator[str]:
    """Yield the full report line by line: entry count and latency summary,
    the 20 most common tokens, the rare tokens, up to 30 missing-diacritic
    candidates, and finally the likely-hallucination rows."""
    counts = token_frequency(rows)

    # Header: the latency summary is only emitted when there is data.
    yield f"entries: {len(rows)}"
    if rows:
        latencies = [float(entry.get("stt_latency_s") or 0.0) for entry in rows]
        mean_latency = sum(latencies) / len(latencies)
        worst_latency = max(latencies)
        yield f"latency: mean={mean_latency:.2f}s max={worst_latency:.2f}s"
    yield ""

    yield "== top tokens =="
    yield from (
        f"  {count:>3}  {token}" for token, count in counts.most_common(20)
    )
    yield ""

    yield "== rare tokens (<=1, candidate corrections / hotwords) =="
    one_offs = rare_tokens(counts)
    if one_offs:
        yield "  " + ", ".join(one_offs)
    else:
        yield "  (none)"
    yield ""

    yield "== missing-diacritic candidates (review only) =="
    review_list = missing_diacritic_candidates(counts)[:30]
    if review_list:
        yield "  " + ", ".join(review_list)
    else:
        yield "  (none)"
    yield ""

    flagged = suspect_rows(rows)
    yield f"== likely-hallucination rows ({len(flagged)}) =="
    for entry in flagged:
        latency = float(entry.get("stt_latency_s") or 0)
        no_speech = float(entry.get("no_speech_prob") or 0)
        yield f"  lat={latency:.1f}s nsp={no_speech:.2f}  {row_text(entry)!r}"


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: read the log and print the requested view to stdout.

    `argv` is the argument list to parse; None lets argparse use the process
    arguments. With no mode flag the full report is printed. If several mode
    flags are given, the first of --tokens, --rare, --suspect wins.

    Returns 0 on success, or 1 (with a message on stderr) when no rows could
    be read from the log."""
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--log", type=Path, default=DEFAULT_LOG, help="path to voice_stt_log.jsonl")
    ap.add_argument("--tokens", action="store_true", help="token frequency only")
    ap.add_argument("--rare", action="store_true", help="one-off tokens only")
    ap.add_argument("--suspect", action="store_true", help="likely-hallucination rows only")
    args = ap.parse_args(argv)

    rows = read_log(args.log)
    if not rows:
        print(f"no entries in {args.log}", file=sys.stderr)
        return 1

    # Token frequency is built only in the branches that print it: the
    # --suspect view never uses it and the full report computes its own, so
    # building it up front would tokenize the whole log for nothing.
    if args.tokens:
        for tok, cnt in token_frequency(rows).most_common():
            print(f"{cnt:>4}  {tok}")
    elif args.rare:
        print("\n".join(rare_tokens(token_frequency(rows))))
    elif args.suspect:
        for r in suspect_rows(rows):
            print(f"lat={float(r.get('stt_latency_s') or 0):.1f}s "
                  f"nsp={float(r.get('no_speech_prob') or 0):.2f}  {row_text(r)!r}")
    else:
        print("\n".join(_iter_report(rows)))
    return 0


if __name__ == "__main__":
    # Run the CLI and use main()'s return value as the process exit status.
    sys.exit(main())