Gemma 4 cloud audio was infeasible (31b-cloud has no audio; E4B broken upstream, no deploy host), so improve faster-whisper instead. - Pin temperature=0.0 to disable the fallback ladder that re-decoded unclear audio up to 6x (source of the 16-24s latency outliers); reject hallucinated segments via avg_logprob/compression_ratio in the new pure _filter_segments. - Adopt mikr/whisper-small-ro-cv11 (CT2 int8) via configurable voice.stt_model: spike showed WER 24%->10%, numbers fixed at source, +0.33s p50 (in budget). - Add tools/voice_stt_mine.py (log mining) + tools/voice_stt_spike.py (model eval with diacritic scoring) + tests for the gate and miner. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
167 lines
6.1 KiB
Python
167 lines
6.1 KiB
Python
#!/usr/bin/env python3
|
|
"""Mine logs/voice_stt_log.jsonl for STT correction candidates.
|
|
|
|
Read-only analysis tool. Surfaces what the always-on STT log has captured so
|
|
Marius can decide hotwords, spot recurring mistranscriptions, and judge whether
|
|
a model swap (e.g. a Romanian-finetuned Whisper) actually helps.
|
|
|
|
Pure helpers (tokenize / aggregate) are importable and tested; the CLI just
|
|
prints reports. Always mines the raw `text` field, so rows written before the
`text_corrected` field existed are handled exactly like newer ones.
|
|
|
|
Usage:
|
|
python3 tools/voice_stt_mine.py # full report
|
|
python3 tools/voice_stt_mine.py --tokens # token frequency only
|
|
python3 tools/voice_stt_mine.py --rare # one-off tokens (candidates)
|
|
python3 tools/voice_stt_mine.py --suspect # likely hallucination rows
|
|
python3 tools/voice_stt_mine.py --log PATH # custom log path
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sys
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
from typing import Iterable, Iterator
|
|
|
|
# Repository root: two levels up from this file's resolved path.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Log mined when --log is not given.
DEFAULT_LOG = PROJECT_ROOT / "logs" / "voice_stt_log.jsonl"

# Latency above this (s) almost always means the decoder thrashed on unclear
# audio — a strong hallucination signal worth reviewing. Mirrors the >7s
# conversational-abort budget from tasks/voice-bench-results.md.
SUSPECT_LATENCY_S = 7.0
# no_speech_prob at or above this marks a row as suspect.
SUSPECT_NO_SPEECH = 0.5

# Romanian diacritic letters. S/T exist in two encodings: the comma-below
# forms (the first string) and the legacy cedilla forms U+015E/U+015F/
# U+0162/U+0163 that older corpora still emit. Both are accepted so that a
# cedilla-encoded word is neither split apart by the tokenizer nor misreported
# as lacking diacritics.
_RO_DIACRITIC_CHARS = "ĂÂÎȘȚăâîșț" + "\u015e\u0162\u015f\u0163"

# One token = a maximal run of ASCII letters and Romanian diacritic letters.
_TOKEN_RE = re.compile("[A-Za-z" + _RO_DIACRITIC_CHARS + "]+", re.UNICODE)
# A token with none of these is a diacritic-restore candidate worth a human
# glance (not auto-corrected — see plan D2).
_DIACRITICS = set(_RO_DIACRITIC_CHARS)
|
|
|
|
|
|
def read_log(path: Path) -> list[dict]:
    """Parse the JSONL log; skip malformed lines instead of crashing.

    Best-effort by design: a missing file yields an empty list, and blank
    lines, lines that are not valid JSON, and lines whose JSON value is not an
    object (``null``, a number, a list, ...) are all skipped, so one bad line
    never hides the rest of the log.

    Returns the parsed rows in file order; every element is a ``dict``.
    """
    rows: list[dict] = []
    if not path.exists():
        return rows
    with path.open(encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Valid JSON that is not an object would blow up later on
            # row.get(...), so treat it as malformed too.
            if isinstance(record, dict):
                rows.append(record)
    return rows
|
|
|
|
|
|
def row_text(row: dict) -> str:
    """Return the raw STT transcript stored under `text`, whitespace-trimmed.

    Only `text` is consulted — a `text_corrected` field, if present, is
    ignored because mining targets the uncorrected STT output. A missing or
    empty `text` gives an empty string.
    """
    raw = row.get("text")
    if not raw:
        return ""
    return raw.strip()
|
|
|
|
|
|
def tokenize(text: str) -> list[str]:
    """Return the lowercased word tokens of `text`, in order of appearance.

    Tokens are whatever `_TOKEN_RE` matches, so digits and punctuation act as
    separators and are dropped. A falsy `text` (``None`` or ``""``) gives an
    empty list.
    """
    words = _TOKEN_RE.findall(text or "")
    return list(map(str.lower, words))
|
|
|
|
|
|
def token_frequency(rows: Iterable[dict]) -> Counter:
    """Count how often each token occurs across the transcripts of `rows`."""
    return Counter(
        token
        for row in rows
        for token in tokenize(row_text(row))
    )
|
|
|
|
|
|
def rare_tokens(freq: Counter, max_count: int = 1) -> list[str]:
    """Return, alphabetically sorted, the tokens counted at most `max_count`
    times — candidate mistranscriptions, proper nouns to add as hotwords, or
    code-switch garbage."""
    infrequent = [token for token, seen in freq.items() if seen <= max_count]
    infrequent.sort()
    return infrequent
|
|
|
|
|
|
def missing_diacritic_candidates(freq: Counter, min_len: int = 4) -> list[str]:
    """Return tokens that contain no Romanian diacritic letter.

    Only purely alphabetic tokens of at least `min_len` characters qualify.
    The result is ordered by descending count, ties broken alphabetically.
    These are the words a diacritic-restore pass would target — kept as a
    review list only (v1 does not auto-restore, per plan D2).
    """
    def _qualifies(token: str) -> bool:
        return (
            len(token) >= min_len
            and _DIACRITICS.isdisjoint(token)
            and token.isalpha()
        )

    plain = [token for token in freq if _qualifies(token)]
    plain.sort(key=lambda token: (-freq[token], token))
    return plain
|
|
|
|
|
|
def suspect_rows(rows: Iterable[dict]) -> list[dict]:
    """Rows that look like hallucinations.

    A row is flagged when either:
      * its `stt_latency_s` is at least `SUSPECT_LATENCY_S`, or
      * its `no_speech_prob` is at least `SUSPECT_NO_SPEECH` and the row
        still produced transcript text.

    Missing or null numeric fields count as 0.0. Flagged rows are returned in
    input order.
    """
    flagged = []
    for row in rows:
        latency = float(row.get("stt_latency_s") or 0.0)
        no_speech = float(row.get("no_speech_prob") or 0.0)
        too_slow = latency >= SUSPECT_LATENCY_S
        # A high no_speech_prob with an empty transcript is silence that was
        # correctly rejected, not a hallucination — only flag it when text
        # came out anyway.
        text_over_silence = no_speech >= SUSPECT_NO_SPEECH and bool(row_text(row))
        if too_slow or text_over_silence:
            flagged.append(row)
    return flagged
|
|
|
|
|
|
def _iter_report(rows: list[dict]) -> Iterator[str]:
    """Yield the lines of the full report, one string per output line.

    Sections, in order: entry count (plus mean/max latency when there are
    rows), the 20 most common tokens, rare tokens, up to 30 missing-diacritic
    candidates, and the likely-hallucination rows.
    """
    counts = token_frequency(rows)

    # -- summary ---------------------------------------------------------
    yield f"entries: {len(rows)}"
    if rows:
        latencies = [float(entry.get("stt_latency_s") or 0.0) for entry in rows]
        mean_latency = sum(latencies) / len(latencies)
        yield f"latency: mean={mean_latency:.2f}s max={max(latencies):.2f}s"
    yield ""

    # -- most frequent tokens ---------------------------------------------
    yield "== top tokens =="
    yield from (f" {seen:>3} {word}" for word, seen in counts.most_common(20))
    yield ""

    # -- one-off tokens ----------------------------------------------------
    yield "== rare tokens (<=1, candidate corrections / hotwords) =="
    one_offs = rare_tokens(counts)
    rare_line = ", ".join(one_offs) if one_offs else "(none)"
    yield " " + rare_line
    yield ""

    # -- diacritic review list (capped at 30) -----------------------------
    yield "== missing-diacritic candidates (review only) =="
    plain = missing_diacritic_candidates(counts)[:30]
    plain_line = ", ".join(plain) if plain else "(none)"
    yield " " + plain_line
    yield ""

    # -- suspect rows ------------------------------------------------------
    flagged = suspect_rows(rows)
    yield f"== likely-hallucination rows ({len(flagged)}) =="
    for entry in flagged:
        latency = float(entry.get("stt_latency_s") or 0)
        no_speech = float(entry.get("no_speech_prob") or 0)
        yield f" lat={latency:.1f}s nsp={no_speech:.2f} {row_text(entry)!r}"
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, load the log, print one report.

    `argv` is handed to argparse unchanged (``None`` means the process
    arguments). The mode flags are checked in the order --tokens, --rare,
    --suspect; with none of them set the full report is printed.

    Returns 1 (after a message on stderr) when the log yields no rows,
    otherwise 0.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--log", type=Path, default=DEFAULT_LOG, help="path to voice_stt_log.jsonl")
    mode_flags = (
        ("--tokens", "token frequency only"),
        ("--rare", "one-off tokens only"),
        ("--suspect", "likely-hallucination rows only"),
    )
    for flag, blurb in mode_flags:
        parser.add_argument(flag, action="store_true", help=blurb)
    opts = parser.parse_args(argv)

    entries = read_log(opts.log)
    if not entries:
        print(f"no entries in {opts.log}", file=sys.stderr)
        return 1

    counts = token_frequency(entries)

    if opts.tokens:
        for word, seen in counts.most_common():
            print(f"{seen:>4} {word}")
        return 0

    if opts.rare:
        print("\n".join(rare_tokens(counts)))
        return 0

    if opts.suspect:
        for entry in suspect_rows(entries):
            latency = float(entry.get("stt_latency_s") or 0)
            no_speech = float(entry.get("no_speech_prob") or 0)
            print(f"lat={latency:.1f}s nsp={no_speech:.2f} {row_text(entry)!r}")
        return 0

    print("\n".join(_iter_report(entries)))
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the CLI and use main()'s return value as the process exit status.
    sys.exit(main())
|