#!/usr/bin/env python3
"""Mine logs/voice_stt_log.jsonl for STT correction candidates.

Read-only analysis tool. Surfaces what the always-on STT log has captured so
Marius can decide hotwords, spot recurring mistranscriptions, and judge whether
a model swap (e.g. a Romanian-finetuned Whisper) actually helps.

Pure helpers (tokenize / aggregate) are importable and tested; the CLI just
prints reports. Mining always reads the raw `text` field, so rows written
before the `text_corrected` field existed need no special handling.

Usage:
    python3 tools/voice_stt_mine.py              # full report
    python3 tools/voice_stt_mine.py --tokens     # token frequency only
    python3 tools/voice_stt_mine.py --rare       # one-off tokens (candidates)
    python3 tools/voice_stt_mine.py --suspect    # likely hallucination rows
    python3 tools/voice_stt_mine.py --log PATH   # custom log path
"""
from __future__ import annotations

import argparse
import json
import re
import sys
from collections import Counter
from pathlib import Path
from typing import Iterable, Iterator

PROJECT_ROOT = Path(__file__).resolve().parent.parent
DEFAULT_LOG = PROJECT_ROOT / "logs" / "voice_stt_log.jsonl"

# Latency above this (s) almost always means the decoder thrashed on unclear
# audio — a strong hallucination signal worth reviewing. Mirrors the >7s
# conversational-abort budget from tasks/voice-bench-results.md.
SUSPECT_LATENCY_S = 7.0
SUSPECT_NO_SPEECH = 0.5

# Legacy cedilla spellings (Ş Ţ ş ţ). Plenty of Romanian text still uses them
# instead of the comma-below letters, so they must count as word characters;
# otherwise a word such as "şi" is split and only "i" is counted.
_LEGACY_CEDILLA = "\u015e\u0162\u015f\u0163"

# Romanian diacritic letters; a token with none of these is a diacritic-restore
# candidate worth a human glance (not auto-corrected — see plan D2).
_DIACRITIC_CHARS = "ĂÂÎȘȚăâîșț" + _LEGACY_CEDILLA
_DIACRITICS = set(_DIACRITIC_CHARS)

# Built from the same character list so the tokenizer and the diacritic check
# can never disagree about what a Romanian letter is.
_TOKEN_RE = re.compile("[A-Za-z" + _DIACRITIC_CHARS + "]+", re.UNICODE)


def _as_float(value: object) -> float:
    """Coerce a logged numeric field to float.

    Missing, empty, or unparseable values (e.g. ``None``, ``"n/a"``, a list)
    become 0.0 so that one odd row cannot abort a whole report.
    """
    try:
        return float(value or 0.0)
    except (TypeError, ValueError):
        return 0.0


def read_log(path: Path) -> list[dict]:
    """Parse the JSONL log; skip malformed lines instead of crashing.

    Returns an empty list when `path` does not exist. Blank lines, lines that
    are not valid JSON, and lines whose JSON value is not an object are
    skipped. Undecodable bytes are replaced rather than raising, so a single
    corrupt byte cannot hide the rest of the log.
    """
    rows: list[dict] = []
    try:
        handle = path.open(encoding="utf-8", errors="replace")
    except FileNotFoundError:
        return rows
    with handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Every consumer calls row.get(); a bare list/str/number would
            # crash them, so only JSON objects are kept.
            if isinstance(row, dict):
                rows.append(row)
    return rows


def row_text(row: dict) -> str:
    """Raw transcript for a row, stripped of surrounding whitespace.

    New rows may add `text_corrected`; mining always wants the raw STT
    output, which lives in `text`. A missing or non-string `text` yields "".
    """
    text = row.get("text")
    return text.strip() if isinstance(text, str) else ""


def tokenize(text: str) -> list[str]:
    """Split into alphabetic word tokens, lowercased. Drops digits/punct.

    A falsy `text` (``None`` or "") yields an empty list.
    """
    return [t.lower() for t in _TOKEN_RE.findall(text or "")]


def token_frequency(rows: Iterable[dict]) -> Counter:
    """Count how often each token occurs across the transcripts of `rows`."""
    counter: Counter = Counter()
    for row in rows:
        counter.update(tokenize(row_text(row)))
    return counter


def rare_tokens(freq: Counter, max_count: int = 1) -> list[str]:
    """Tokens seen at most `max_count` times, sorted alphabetically.

    These are candidate mistranscriptions, proper nouns to add as hotwords,
    or code-switch garbage.
    """
    return sorted(t for t, c in freq.items() if c <= max_count)


def missing_diacritic_candidates(freq: Counter, min_len: int = 4) -> list[str]:
    """All-ASCII alphabetic tokens of at least `min_len` characters.

    Sorted by descending frequency, ties broken alphabetically. These are the
    words a diacritic-restore pass would target — kept as a review list only
    (v1 does not auto-restore, per plan D2).
    """
    out = [
        (t, c)
        for t, c in freq.items()
        # isascii() rules out every diacritic (and any other non-ASCII
        # letter), which is exactly the "all-ASCII" contract above.
        if len(t) >= min_len and t.isascii() and t.isalpha()
    ]
    out.sort(key=lambda tc: (-tc[1], tc[0]))
    return [t for t, _ in out]


def suspect_rows(rows: Iterable[dict]) -> list[dict]:
    """Rows that look like hallucinations.

    A row is flagged when `stt_latency_s` >= SUSPECT_LATENCY_S or
    `no_speech_prob` >= SUSPECT_NO_SPEECH. Missing or unparseable values count
    as 0.0. The transcript itself is not inspected. Input order is preserved.
    """
    return [
        row
        for row in rows
        if _as_float(row.get("stt_latency_s")) >= SUSPECT_LATENCY_S
        or _as_float(row.get("no_speech_prob")) >= SUSPECT_NO_SPEECH
    ]


def _format_suspect(row: dict) -> str:
    """One-line summary of a suspect row: latency, no_speech_prob, transcript."""
    return (f"lat={_as_float(row.get('stt_latency_s')):.1f}s "
            f"nsp={_as_float(row.get('no_speech_prob')):.2f} "
            f"{row_text(row)!r}")


def _iter_report(rows: list[dict]) -> Iterator[str]:
    """Yield the lines of the full report, one string per output line."""
    freq = token_frequency(rows)
    yield f"entries: {len(rows)}"
    if rows:
        # Guarded because mean/max are undefined for an empty log.
        lats = [_as_float(r.get("stt_latency_s")) for r in rows]
        yield f"latency: mean={sum(lats)/len(lats):.2f}s max={max(lats):.2f}s"
    yield ""
    yield "== top tokens =="
    for tok, cnt in freq.most_common(20):
        yield f" {cnt:>3} {tok}"
    yield ""
    yield "== rare tokens (<=1, candidate corrections / hotwords) =="
    rare = rare_tokens(freq)
    yield " " + (", ".join(rare) if rare else "(none)")
    yield ""
    yield "== missing-diacritic candidates (review only) =="
    cands = missing_diacritic_candidates(freq)[:30]
    yield " " + (", ".join(cands) if cands else "(none)")
    yield ""
    suspects = suspect_rows(rows)
    yield f"== likely-hallucination rows ({len(suspects)}) =="
    for r in suspects:
        yield " " + _format_suspect(r)


def main(argv: list[str] | None = None) -> int:
    """CLI entry point.

    Parses `argv` (``sys.argv[1:]`` when None), reads the log and prints the
    requested report. When several mode flags are given, the first of
    --tokens, --rare, --suspect wins. Returns 1 when the log has no usable
    entries, otherwise 0.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--log", type=Path, default=DEFAULT_LOG,
                    help="path to voice_stt_log.jsonl")
    ap.add_argument("--tokens", action="store_true",
                    help="token frequency only")
    ap.add_argument("--rare", action="store_true",
                    help="one-off tokens only")
    ap.add_argument("--suspect", action="store_true",
                    help="likely-hallucination rows only")
    args = ap.parse_args(argv)

    rows = read_log(args.log)
    if not rows:
        print(f"no entries in {args.log}", file=sys.stderr)
        return 1

    freq = token_frequency(rows)
    if args.tokens:
        for tok, cnt in freq.most_common():
            print(f"{cnt:>4} {tok}")
    elif args.rare:
        print("\n".join(rare_tokens(freq)))
    elif args.suspect:
        for r in suspect_rows(rows):
            print(_format_suspect(r))
    else:
        print("\n".join(_iter_report(rows)))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())