From 763999f3a903f1ef1cb820b515583a7b74b146da Mon Sep 17 00:00:00 2001
From: Marius Mutu
Date: Tue, 24 Mar 2026 21:17:14 +0200
Subject: [PATCH] feat: anti-hallucination params + retranscribe script for
 fixing broken transcripts

- transcribe.py: add --max-context 0, --entropy-thold 2.4, --max-len 60,
  --suppress-nst, --no-fallback to whisper.cpp to prevent hallucination loops
- transcribe.py: remove interactive quality gate (runs unattended now)
- run.bat: remove pause prompts for unattended operation
- retranscribe_tail.py: new script that detects hallucination bursts in SRT
  files, extracts and re-transcribes only the affected audio segments, then
  splices the result back together. Drops segments that re-hallucinate
  (silence/music). Backs up originals to transcripts/backup/.
- fix_hallucinations.bat: Windows wrapper for retranscribe_tail.py

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 fix_hallucinations.bat |  77 +++++++
 retranscribe_tail.py   | 448 +++++++++++++++++++++++++++++++++++++++++
 run.bat                |   4 +-
 transcribe.py          |  17 +-
 4 files changed, 533 insertions(+), 13 deletions(-)
 create mode 100644 fix_hallucinations.bat
 create mode 100644 retranscribe_tail.py

diff --git a/fix_hallucinations.bat b/fix_hallucinations.bat
new file mode 100644
index 0000000..3d0ed18
--- /dev/null
+++ b/fix_hallucinations.bat
@@ -0,0 +1,77 @@
+@echo off
+setlocal enabledelayedexpansion
+cd /d "%~dp0"
+
+echo ============================================================
+echo Fix Hallucinated Transcripts
+echo ============================================================
+echo.
+
+:: --- Find Python ---
+set "PYTHON_CMD="
+where py >nul 2>&1
+if not errorlevel 1 (
+    set "PYTHON_CMD=py"
+)
+if not defined PYTHON_CMD (
+    for /f "delims=" %%p in ('where python 2^>nul ^| findstr /v /i "WindowsApps"') do (
+        if not defined PYTHON_CMD set "PYTHON_CMD=%%p"
+    )
+)
+if not defined PYTHON_CMD (
+    echo [X] Python not found. Install from https://www.python.org/downloads/
+    pause
+    exit /b 1
+)
+
+:: --- Find whisper-cli.exe ---
+if not defined WHISPER_BIN (
+    if exist "whisper-cli.exe" (
+        set "WHISPER_BIN=whisper-cli.exe"
+    ) else if exist "whisper-bin\whisper-cli.exe" (
+        set "WHISPER_BIN=whisper-bin\whisper-cli.exe"
+    ) else if exist "whisper.cpp\build\bin\Release\whisper-cli.exe" (
+        set "WHISPER_BIN=whisper.cpp\build\bin\Release\whisper-cli.exe"
+    ) else (
+        echo [X] whisper-cli.exe not found
+        pause
+        exit /b 1
+    )
+)
+echo Using whisper: %WHISPER_BIN%
+
+:: --- Find model ---
+if not defined WHISPER_MODEL (
+    set "WHISPER_MODEL=models\ggml-medium-q5_0.bin"
+)
+
+:: --- Activate venv if available and prefer its interpreter ---
+if exist ".venv\Scripts\activate.bat" (
+    call .venv\Scripts\activate.bat
+)
+if exist ".venv\Scripts\python.exe" (
+    set "PYTHON_CMD=.venv\Scripts\python.exe"
+)
+
+:: --- Dry run first ---
+echo.
+echo [1/2] Scanning for hallucinations...
+echo.
+set "WHISPER_BIN=%WHISPER_BIN%"
+set "WHISPER_MODEL=%WHISPER_MODEL%"
+"%PYTHON_CMD%" retranscribe_tail.py --dry-run
+echo.
+
+:: --- Fix ---
+echo [2/2] Fixing hallucinated transcripts...
+echo.
+"%PYTHON_CMD%" retranscribe_tail.py
+if errorlevel 1 (
+    echo.
+    echo WARNING: Some fixes failed. Check output above.
+)
+
+echo.
+echo ============================================================
+echo Done! Originals backed up to transcripts\backup\
+echo ============================================================
diff --git a/retranscribe_tail.py b/retranscribe_tail.py
new file mode 100644
index 0000000..ed6e32d
--- /dev/null
+++ b/retranscribe_tail.py
@@ -0,0 +1,448 @@
+"""
+Re-transcribe only the hallucinated portions of a transcript.
+
+Detects hallucination bursts (repeated lines in SRT), classifies each as:
+  - "burst": short hallucination with good content after → extract just that segment
+  - "tail": hallucination runs to end of file → extract from burst start to end
+
+Extracts audio for each bad segment, re-transcribes with anti-hallucination
+parameters, and splices everything back together.
+
+Usage:
+    python retranscribe_tail.py                      # auto-detect all broken transcripts
+    python retranscribe_tail.py "Master 25M1 Z2B"    # fix a specific file
+    python retranscribe_tail.py --dry-run            # show what would be fixed, don't run
+
+Exit status is 0 on success and 1 if any transcript could not be fixed.
+"""
+
+import os
+import re
+import shutil
+import subprocess
+import sys
+import logging
+from pathlib import Path
+
+TRANSCRIPTS_DIR = Path("transcripts")
+AUDIO_DIR = Path("audio")
+WAV_CACHE_DIR = Path("audio_wav")
+TEMP_DIR = Path("retranscribe_tmp")
+
+WHISPER_BIN = os.getenv("WHISPER_BIN", r"whisper-cli.exe")
+WHISPER_MODEL = os.getenv("WHISPER_MODEL", r"models\ggml-medium-q5_0.bin")
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+)
+log = logging.getLogger(__name__)
+
+MIN_REPEATS = 4  # consecutive identical lines to count as hallucination
+# If fewer than this many good entries remain after the last burst, treat as tail
+TAIL_THRESHOLD = 50
+
+
+# --- SRT parsing ---
+
+def parse_srt(srt_path: Path) -> list[dict]:
+    """Parse SRT file into list of {index, start, end, start_sec, end_sec, text}."""
+    content = srt_path.read_text(encoding="utf-8")
+    blocks = re.split(r"\n\n+", content.strip())
+    entries = []
+    for block in blocks:
+        lines = block.strip().split("\n")
+        if len(lines) < 3:
+            continue
+        try:
+            idx = int(lines[0])
+        except ValueError:
+            continue
+        ts_match = re.match(
+            r"(\d{2}):(\d{2}):(\d{2})[,.](\d{3})\s*-->\s*(\d{2}):(\d{2}):(\d{2})[,.](\d{3})",
+            lines[1],
+        )
+        if not ts_match:
+            continue
+        h, m, s, ms = (int(x) for x in ts_match.groups()[:4])
+        start_sec = h * 3600 + m * 60 + s + ms / 1000
+        h2, m2, s2, ms2 = (int(x) for x in ts_match.groups()[4:])
+        end_sec = h2 * 3600 + m2 * 60 + s2 + ms2 / 1000
+        text = "\n".join(lines[2:]).strip()
+        entries.append({
+            "index": idx,
+            "start": lines[1].split("-->")[0].strip(),
+            "end": lines[1].split("-->")[1].strip(),
+            "start_sec": start_sec,
+            "end_sec": end_sec,
+            "text": text,
+        })
+    return entries
+
+
+def _fmt_ts(sec: float) -> str:
+    """Format a time in seconds as an SRT timestamp (HH:MM:SS,mmm)."""
+    # Round to whole milliseconds first: int((sec % 1) * 1000) truncates
+    # float error, so e.g. 1.001 would be rendered as 00:00:01,000.
+    total_ms = round(sec * 1000)
+    h, rem = divmod(total_ms, 3_600_000)
+    m, rem = divmod(rem, 60_000)
+    s, ms = divmod(rem, 1000)
+    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
+
+
+# --- Hallucination detection ---
+
+def detect_bursts(entries: list[dict]) -> list[dict]:
+    """
+    Find all hallucination bursts. Returns list of:
+        {start_idx, end_idx, start_sec, end_sec, text, count, type}
+    where type is "burst" (good content follows) or "tail" (nothing useful after).
+    """
+    bursts = []
+    i = 0
+    # "<=" so that a run of exactly MIN_REPEATS entries ending at the last
+    # entry is still examined (a strict "<" never reached its first entry).
+    while i <= len(entries) - MIN_REPEATS:
+        text = entries[i]["text"].strip()
+        if not text:
+            i += 1
+            continue
+        run = 1
+        while i + run < len(entries) and entries[i + run]["text"].strip() == text:
+            run += 1
+        if run >= MIN_REPEATS:
+            bursts.append({
+                "start_idx": i,
+                "end_idx": i + run - 1,
+                "start_sec": entries[i]["start_sec"],
+                "end_sec": entries[i + run - 1]["end_sec"],
+                "text": text,
+                "count": run,
+            })
+            i += run
+        else:
+            i += 1
+
+    # Classify each burst
+    for burst in bursts:
+        remaining_after = len(entries) - burst["end_idx"] - 1
+        burst["type"] = "burst" if remaining_after >= TAIL_THRESHOLD else "tail"
+
+    return bursts
+
+
+# --- Audio / transcription ---
+
+def find_ffmpeg() -> str:
+    """Find ffmpeg binary."""
+    for candidate in ["ffmpeg", "ffmpeg.exe", r"ffmpeg\ffmpeg.exe"]:
+        if Path(candidate).exists():
+            return candidate
+        try:
+            if subprocess.run([candidate, "-version"], capture_output=True).returncode == 0:
+                return candidate
+        except FileNotFoundError:
+            continue
+    return "ffmpeg"
+
+
+def extract_audio_segment(wav_path: str, start_sec: float, end_sec: float | None,
+                          output_path: str):
+    """Extract audio segment. If end_sec is None, extract to end of file."""
+    ffmpeg = find_ffmpeg()
+    cmd = [
+        ffmpeg,
+        "-i", wav_path,
+        "-ss", f"{start_sec:.3f}",
+    ]
+    if end_sec is not None:
+        duration = end_sec - start_sec
+        cmd += ["-t", f"{duration:.3f}"]
+    cmd += [
+        "-acodec", "pcm_s16le",
+        "-ar", "16000",
+        "-ac", "1",
+        "-y",
+        output_path,
+    ]
+    label = f"{start_sec:.1f}s-{'end' if end_sec is None else f'{end_sec:.1f}s'}"
+    log.info(f"  Extracting audio [{label}]: {Path(output_path).name}")
+    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
+    if result.returncode != 0:
+        raise RuntimeError(f"ffmpeg failed: {result.stderr[:300]}")
+
+
+def transcribe_chunk(audio_path: str, output_base: str) -> bool:
+    """Run whisper.cpp with anti-hallucination params on a chunk."""
+    cmd = [
+        WHISPER_BIN,
+        "--model", WHISPER_MODEL,
+        "--language", "ro",
+        "--no-gpu",
+        "--threads", str(os.cpu_count() or 4),
+        "--beam-size", "1",
+        "--best-of", "1",
+        "--max-context", "0",  # don't carry context between segments
+        "--entropy-thold", "2.4",
+        "--max-len", "60",
+        "--suppress-nst",  # suppress non-speech tokens (reduces hallucination on silence)
+        "--no-fallback",  # don't retry with higher temperature (prevents hallucination amplification)
+        "--output-txt",
+        "--output-srt",
+        "--output-file", output_base,
+        "--file", audio_path,
+    ]
+
+    log.info(f"  CMD: {' '.join(cmd)}")
+
+    env = os.environ.copy()
+    whisper_dir = str(Path(WHISPER_BIN).resolve().parent)
+    env["PATH"] = whisper_dir + os.pathsep + env.get("PATH", "")
+
+    result = subprocess.run(
+        cmd,
+        stdout=sys.stdout,
+        stderr=sys.stderr,
+        timeout=7200,
+        env=env,
+    )
+    return result.returncode == 0
+
+
+# --- Splice ---
+
+def build_srt_block(idx: int, entry: dict) -> str:
+    """Format a single SRT entry."""
+    return f"{idx}\n{entry['start']} --> {entry['end']}\n{entry['text']}\n"
+
+
+def build_srt_block_offset(idx: int, entry: dict, offset_sec: float) -> str:
+    """Format a single SRT entry with timestamp offset."""
+    new_start = entry["start_sec"] + offset_sec
+    new_end = entry["end_sec"] + offset_sec
+    return f"{idx}\n{_fmt_ts(new_start)} --> {_fmt_ts(new_end)}\n{entry['text']}\n"
+
+
+def fix_transcript(stem: str, dry_run: bool = False) -> bool:
+    """Fix one hallucinated transcript. Returns True on success."""
+    srt_path = TRANSCRIPTS_DIR / f"{stem}.srt"
+    txt_path = TRANSCRIPTS_DIR / f"{stem}.txt"
+    if not srt_path.exists():
+        log.error(f"SRT not found: {srt_path}")
+        return False
+
+    entries = parse_srt(srt_path)
+    bursts = detect_bursts(entries)
+    if not bursts:
+        log.info(f"  {stem}: no hallucination detected, skipping")
+        return True
+
+    # Report findings
+    for b in bursts:
+        log.info(
+            f"  {b['type'].upper()} (RE-TRANSCRIBE): entries {b['start_idx']}-{b['end_idx']} "
+            f"({_fmt_ts(b['start_sec'])} - {_fmt_ts(b['end_sec'])}), "
+            f"\"{b['text'][:50]}\" x{b['count']}"
+        )
+
+    if dry_run:
+        return True
+
+    # Find audio source (WAV)
+    audio_src = find_wav_for_stem(stem)
+    if not audio_src:
+        log.error(f"  No audio found for {stem}")
+        return False
+
+    TEMP_DIR.mkdir(exist_ok=True)
+
+    # Backup originals
+    backup_dir = TRANSCRIPTS_DIR / "backup"
+    backup_dir.mkdir(exist_ok=True)
+    for p in [txt_path, srt_path]:
+        if p.exists():
+            backup = backup_dir / p.name
+            if backup.exists():
+                backup.unlink()
+            shutil.copy2(p, backup)
+            log.info(f"  Backed up: {backup}")
+
+    # Build the fixed transcript by processing segments between bursts.
+    # Strategy: keep good entries as-is, replace each burst with re-transcription.
+    #
+    # Segments to process:
+    #   [0 .. burst0.start)          → keep (good)
+    #   [burst0.start .. burst0.end] → re-transcribe
+    #   (burst0.end .. burst1.start) → keep (good)
+    #   [burst1.start .. burst1.end] → re-transcribe
+    #   ...
+    #   after last burst:
+    #     if type=burst → keep remaining entries
+    #     if type=tail  → re-transcribe from burst start to end of audio
+
+    result_txt_parts = []
+    result_srt_entries = []  # list of (start_sec, end_sec, start_ts, end_ts, text)
+    chunk_idx = 0
+
+    prev_end_idx = -1  # last processed entry index
+
+    for burst in bursts:
+        # 1) Keep good entries before this burst
+        for i in range(prev_end_idx + 1, burst["start_idx"]):
+            e = entries[i]
+            result_srt_entries.append(e)
+            result_txt_parts.append(e["text"])
+
+        # 2) Re-transcribe the hallucinated segment
+        chunk_wav = str(TEMP_DIR / f"{stem}_chunk{chunk_idx}.wav")
+        if burst["type"] == "tail":
+            extract_audio_segment(audio_src, burst["start_sec"], None, chunk_wav)
+        else:
+            extract_audio_segment(
+                audio_src, burst["start_sec"], burst["end_sec"], chunk_wav
+            )
+
+        chunk_base = str(TEMP_DIR / f"{stem}_chunk{chunk_idx}")
+        success = transcribe_chunk(chunk_wav, chunk_base)
+
+        chunk_srt = Path(f"{chunk_base}.srt")
+        chunk_usable = False
+
+        if success and chunk_srt.exists():
+            chunk_entries = parse_srt(chunk_srt)
+            # Check if the retranscription itself hallucinated
+            if detect_bursts(chunk_entries):
+                log.warning(f"  Chunk {chunk_idx} hallucinated again — "
+                            f"likely silence/music, dropping segment")
+            elif not chunk_entries:
+                log.warning(f"  Chunk {chunk_idx} produced empty output, dropping segment")
+            else:
+                chunk_usable = True
+        else:
+            log.warning(f"  Whisper failed on chunk {chunk_idx}, dropping segment")
+
+        if chunk_usable:
+            # Read re-transcribed entries and offset timestamps
+            offset = burst["start_sec"]
+            for ce in chunk_entries:
+                ce["start_sec"] += offset
+                ce["end_sec"] += offset
+                ce["start"] = _fmt_ts(ce["start_sec"])
+                ce["end"] = _fmt_ts(ce["end_sec"])
+                result_srt_entries.append(ce)
+                result_txt_parts.append(ce["text"])
+        else:
+            log.info(f"  Segment {_fmt_ts(burst['start_sec'])} - "
+                     f"{_fmt_ts(burst['end_sec'])} removed (no usable speech)")
+
+        chunk_idx += 1
+        if burst["type"] == "tail":
+            # The tail chunk already covers the audio through end of file.
+            # Any later burst lies inside it, so re-transcribing that burst
+            # to end of file as well would append duplicate entries.
+            prev_end_idx = len(entries)
+            break
+        prev_end_idx = burst["end_idx"]
+
+    # 3) Keep any remaining good entries after last burst
+    if prev_end_idx < len(entries) - 1:
+        for i in range(prev_end_idx + 1, len(entries)):
+            e = entries[i]
+            result_srt_entries.append(e)
+            result_txt_parts.append(e["text"])
+
+    # Write final TXT
+    with open(txt_path, "w", encoding="utf-8") as f:
+        f.write("\n".join(result_txt_parts))
+        f.write("\n")
+    log.info(f"  Written: {txt_path} ({txt_path.stat().st_size} bytes)")
+
+    # Write final SRT
+    with open(srt_path, "w", encoding="utf-8") as f:
+        for i, e in enumerate(result_srt_entries, 1):
+            f.write(f"{i}\n{e['start']} --> {e['end']}\n{e['text']}\n\n")
+    log.info(f"  Written: {srt_path} ({srt_path.stat().st_size} bytes)")
+
+    log.info(f"  {stem}: FIXED ({len(result_srt_entries)} entries, "
+             f"{chunk_idx} chunk(s) re-transcribed)")
+    return True
+
+
+def find_wav_for_stem(stem: str) -> str | None:
+    """Find the WAV file corresponding to a transcript stem."""
+    # Direct match
+    wav = WAV_CACHE_DIR / f"{stem}.wav"
+    if wav.exists():
+        return str(wav)
+    # Try with [Audio] suffix (original download names)
+    wav_audio = WAV_CACHE_DIR / f"{stem} [Audio].wav"
+    if wav_audio.exists():
+        return str(wav_audio)
+    # Glob for partial match
+    for w in WAV_CACHE_DIR.glob(f"{stem}*.wav"):
+        return str(w)
+    return None
+
+
+def find_broken_transcripts() -> list[str]:
+    """Scan all SRT files and return stems with hallucination."""
+    broken = []
+    for srt_file in sorted(TRANSCRIPTS_DIR.glob("*.srt")):
+        entries = parse_srt(srt_file)
+        if detect_bursts(entries):
+            broken.append(srt_file.stem)
+    return broken
+
+
+def main() -> int:
+    """Command-line entry point.
+
+    Positional arguments are transcript stems to fix; with none, every SRT in
+    TRANSCRIPTS_DIR is scanned. ``--dry-run`` only reports what was detected.
+
+    Returns:
+        0 when every transcript was processed successfully, 1 otherwise
+        (fix_hallucinations.bat checks this via ``if errorlevel 1``).
+    """
+    args = sys.argv[1:]
+    dry_run = "--dry-run" in args
+    args = [a for a in args if a != "--dry-run"]
+
+    if args:
+        stems = args
+    else:
+        log.info("Scanning for hallucinated transcripts...")
+        stems = find_broken_transcripts()
+        if not stems:
+            log.info("All transcripts look clean!")
+            return 0
+        log.info(f"Found {len(stems)} broken transcript(s): {stems}")
+
+    failed = []
+    for stem in stems:
+        log.info(f"\n{'='*60}")
+        log.info(f"Processing: {stem}")
+        log.info(f"{'='*60}")
+        try:
+            ok = fix_transcript(stem, dry_run=dry_run)
+        except (RuntimeError, OSError, subprocess.SubprocessError):
+            # One bad file (ffmpeg error, timeout, missing binary) must not
+            # abort the rest of the batch.
+            log.exception(f"  {stem}: failed")
+            ok = False
+        if not ok:
+            failed.append(stem)
+
+    if not dry_run and TEMP_DIR.exists():
+        shutil.rmtree(TEMP_DIR)
+        log.info("Cleaned up temp directory")
+
+    if failed:
+        log.error(f"{len(failed)} transcript(s) failed: {failed}")
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/run.bat b/run.bat
index 24bbe5a..3fb5be8 100644
--- a/run.bat
+++ b/run.bat
@@ -274,8 +274,7 @@ if "%~1"=="" (
 if errorlevel 1 (
     echo.
     echo WARNING: Some downloads failed. Check download_errors.log
-    echo Press any key to continue to transcription anyway, or Ctrl+C to abort.
-    pause >nul
+    echo Continuing to transcription automatically...
 )
 
 :: ============================================================
@@ -312,4 +311,3 @@ echo.
 echo Next step: generate summaries from WSL2 with Claude Code
 echo python summarize.py
 echo ============================================================
-pause
diff --git a/transcribe.py b/transcribe.py
index f5ba638..05c2af2 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -131,6 +131,11 @@ def transcribe_file(audio_path: str, output_base: str) -> bool:
         "--threads", str(os.cpu_count() or 4),
         "--beam-size", "1",
         "--best-of", "1",
+        "--max-context", "0",  # don't carry context between segments (prevents hallucination loops)
+        "--entropy-thold", "2.4",  # reject high-entropy (hallucinated) segments
+        "--max-len", "60",  # shorter segments reduce drift
+        "--suppress-nst",  # suppress non-speech tokens (reduces hallucination on silence)
+        "--no-fallback",  # don't retry with higher temperature
         "--output-txt",
         "--output-srt",
         "--output-file", output_base,
@@ -260,17 +265,9 @@ def main():
             # Save manifest after each file (checkpoint)
             save_manifest(manifest)
 
-        # Quality gate: pause after first module
+        # Log milestone after first module (no longer pauses)
         if mod == manifest["modules"][0] and transcribed > 0:
-            log.info("\n" + "!" * 60)
-            log.info("QUALITY GATE: First module complete.")
-            log.info("Spot-check 2-3 transcripts in transcripts/ before continuing.")
-            log.info("Press Enter to continue, or Ctrl+C to abort...")
-            log.info("!" * 60)
-            try:
-                input()
-            except EOFError:
-                pass  # Non-interactive mode, continue
+            log.info(f"First module complete ({transcribed} files). Continuing automatically...")
 
     # Validation
     empty_outputs = [