From 763999f3a903f1ef1cb820b515583a7b74b146da Mon Sep 17 00:00:00 2001
From: Marius Mutu <mmarius28@gmail.com>
Date: Tue, 24 Mar 2026 21:17:14 +0200
Subject: [PATCH] feat: anti-hallucination params + retranscribe script for
 fixing broken transcripts

- transcribe.py: add --max-context 0, --entropy-thold 2.4, --max-len 60,
  --suppress-nst, --no-fallback to whisper.cpp to prevent hallucination loops
- transcribe.py: remove interactive quality gate (runs unattended now)
- run.bat: remove pause prompts for unattended operation
- retranscribe_tail.py: new script that detects hallucination bursts in SRT
  files, extracts and re-transcribes only the affected audio segments, then
  splices the result back together. Drops segments that re-hallucinate
  (silence/music). Backs up originals to transcripts/backup/.
- fix_hallucinations.bat: Windows wrapper for retranscribe_tail.py

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 fix_hallucinations.bat |  74 ++++++++
 retranscribe_tail.py   | 415 +++++++++++++++++++++++++++++++++++++++++
 run.bat                |   4 +-
 transcribe.py          |  17 +-
 4 files changed, 497 insertions(+), 13 deletions(-)
 create mode 100644 fix_hallucinations.bat
 create mode 100644 retranscribe_tail.py

diff --git a/fix_hallucinations.bat b/fix_hallucinations.bat
new file mode 100644
index 0000000..3d0ed18
--- /dev/null
+++ b/fix_hallucinations.bat
@@ -0,0 +1,74 @@
+@echo off
+setlocal enabledelayedexpansion
+cd /d "%~dp0"
+
+echo ============================================================
+echo  Fix Hallucinated Transcripts
+echo ============================================================
+echo.
+
+:: --- Find Python ---
+set "PYTHON_CMD="
+where py >nul 2>&1
+if not errorlevel 1 (
+    set "PYTHON_CMD=py"
+)
+if not defined PYTHON_CMD (
+    for /f "delims=" %%p in ('where python 2^>nul ^| findstr /v /i "WindowsApps"') do (
+        if not defined PYTHON_CMD set "PYTHON_CMD=%%p"
+    )
+)
+if not defined PYTHON_CMD (
+    echo [X] Python not found. Install from https://www.python.org/downloads/
+    pause
+    exit /b 1
+)
+
+:: --- Find whisper-cli.exe ---
+if not defined WHISPER_BIN (
+    if exist "whisper-cli.exe" (
+        set "WHISPER_BIN=whisper-cli.exe"
+    ) else if exist "whisper-bin\whisper-cli.exe" (
+        set "WHISPER_BIN=whisper-bin\whisper-cli.exe"
+    ) else if exist "whisper.cpp\build\bin\Release\whisper-cli.exe" (
+        set "WHISPER_BIN=whisper.cpp\build\bin\Release\whisper-cli.exe"
+    ) else (
+        echo [X] whisper-cli.exe not found
+        pause
+        exit /b 1
+    )
+)
+echo Using whisper: %WHISPER_BIN%
+
+:: --- Find model ---
+if not defined WHISPER_MODEL (
+    set "WHISPER_MODEL=models\ggml-medium-q5_0.bin"
+)
+
+:: --- Activate venv if available ---
+if exist ".venv\Scripts\activate.bat" (
+    call .venv\Scripts\activate.bat
+)
+
+:: --- Dry run first ---
+echo.
+echo [1/2] Scanning for hallucinations...
+echo.
+set "WHISPER_BIN=%WHISPER_BIN%"
+set "WHISPER_MODEL=%WHISPER_MODEL%"
+.venv\Scripts\python retranscribe_tail.py --dry-run
+echo.
+
+:: --- Fix ---
+echo [2/2] Fixing hallucinated transcripts...
+echo.
+.venv\Scripts\python retranscribe_tail.py
+if errorlevel 1 (
+    echo.
+    echo WARNING: Some fixes failed. Check output above.
+)
+
+echo.
+echo ============================================================
+echo  Done! Originals backed up to transcripts\backup\
+echo ============================================================
diff --git a/retranscribe_tail.py b/retranscribe_tail.py
new file mode 100644
index 0000000..ed6e32d
--- /dev/null
+++ b/retranscribe_tail.py
@@ -0,0 +1,415 @@
+"""
+Re-transcribe only the hallucinated portions of a transcript.
+
+Detects hallucination bursts (repeated lines in SRT), classifies each as:
+  - "burst": short hallucination with good content after → extract just that segment
+  - "tail":  hallucination runs to end of file → extract from burst start to end
+
+Extracts audio for each bad segment, re-transcribes with anti-hallucination
+parameters, and splices everything back together.
+
+Usage:
+    python retranscribe_tail.py                     # auto-detect all broken transcripts
+    python retranscribe_tail.py "Master 25M1 Z2B"   # fix a specific file
+    python retranscribe_tail.py --dry-run            # show what would be fixed, don't run
+"""
+
+import os
+import re
+import shutil
+import subprocess
+import sys
+import logging
+from pathlib import Path
+
+TRANSCRIPTS_DIR = Path("transcripts")
+AUDIO_DIR = Path("audio")
+WAV_CACHE_DIR = Path("audio_wav")
+TEMP_DIR = Path("retranscribe_tmp")
+
+WHISPER_BIN = os.getenv("WHISPER_BIN", r"whisper-cli.exe")
+WHISPER_MODEL = os.getenv("WHISPER_MODEL", r"models\ggml-medium-q5_0.bin")
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+)
+log = logging.getLogger(__name__)
+
+MIN_REPEATS = 4  # consecutive identical lines to count as hallucination
+# If fewer than this many good entries remain after the last burst, treat as tail
+TAIL_THRESHOLD = 50
+
+
+# --- SRT parsing ---
+
+def parse_srt(srt_path: Path) -> list[dict]:
+    """Parse SRT file into list of {index, start, end, start_sec, end_sec, text}."""
+    content = srt_path.read_text(encoding="utf-8")
+    blocks = re.split(r"\n\n+", content.strip())
+    entries = []
+    for block in blocks:
+        lines = block.strip().split("\n")
+        if len(lines) < 3:
+            continue
+        try:
+            idx = int(lines[0])
+        except ValueError:
+            continue
+        ts_match = re.match(
+            r"(\d{2}):(\d{2}):(\d{2})[,.](\d{3})\s*-->\s*(\d{2}):(\d{2}):(\d{2})[,.](\d{3})",
+            lines[1],
+        )
+        if not ts_match:
+            continue
+        h, m, s, ms = (int(x) for x in ts_match.groups()[:4])
+        start_sec = h * 3600 + m * 60 + s + ms / 1000
+        h2, m2, s2, ms2 = (int(x) for x in ts_match.groups()[4:])
+        end_sec = h2 * 3600 + m2 * 60 + s2 + ms2 / 1000
+        text = "\n".join(lines[2:]).strip()
+        entries.append({
+            "index": idx,
+            "start": lines[1].split("-->")[0].strip(),
+            "end": lines[1].split("-->")[1].strip(),
+            "start_sec": start_sec,
+            "end_sec": end_sec,
+            "text": text,
+        })
+    return entries
+
+
+def _fmt_ts(sec: float) -> str:
+    h = int(sec // 3600)
+    m = int((sec % 3600) // 60)
+    s = int(sec % 60)
+    ms = int((sec % 1) * 1000)
+    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
+
+
+# --- Hallucination detection ---
+
+def detect_bursts(entries: list[dict]) -> list[dict]:
+    """
+    Find all hallucination bursts. Returns list of:
+    {start_idx, end_idx, start_sec, end_sec, text, count, type}
+    where type is "burst" (good content follows) or "tail" (nothing useful after).
+    """
+    bursts = []
+    i = 0
+    while i < len(entries) - MIN_REPEATS:
+        text = entries[i]["text"].strip()
+        if not text:
+            i += 1
+            continue
+        run = 1
+        while i + run < len(entries) and entries[i + run]["text"].strip() == text:
+            run += 1
+        if run >= MIN_REPEATS:
+            bursts.append({
+                "start_idx": i,
+                "end_idx": i + run - 1,
+                "start_sec": entries[i]["start_sec"],
+                "end_sec": entries[i + run - 1]["end_sec"],
+                "text": text,
+                "count": run,
+            })
+            i += run
+        else:
+            i += 1
+
+    # Classify each burst
+    for burst in bursts:
+        remaining_after = len(entries) - burst["end_idx"] - 1
+        burst["type"] = "burst" if remaining_after >= TAIL_THRESHOLD else "tail"
+
+    return bursts
+
+
+# --- Audio / transcription ---
+
+def find_ffmpeg() -> str:
+    """Find ffmpeg binary."""
+    for candidate in ["ffmpeg", "ffmpeg.exe", r"ffmpeg\ffmpeg.exe"]:
+        if Path(candidate).exists():
+            return candidate
+        try:
+            if subprocess.run([candidate, "-version"], capture_output=True).returncode == 0:
+                return candidate
+        except FileNotFoundError:
+            continue
+    return "ffmpeg"
+
+
+def extract_audio_segment(wav_path: str, start_sec: float, end_sec: float | None,
+                          output_path: str):
+    """Extract audio segment. If end_sec is None, extract to end of file."""
+    ffmpeg = find_ffmpeg()
+    cmd = [
+        ffmpeg,
+        "-i", wav_path,
+        "-ss", f"{start_sec:.3f}",
+    ]
+    if end_sec is not None:
+        duration = end_sec - start_sec
+        cmd += ["-t", f"{duration:.3f}"]
+    cmd += [
+        "-acodec", "pcm_s16le",
+        "-ar", "16000",
+        "-ac", "1",
+        "-y",
+        output_path,
+    ]
+    label = f"{start_sec:.1f}s-{'end' if end_sec is None else f'{end_sec:.1f}s'}"
+    log.info(f"  Extracting audio [{label}]: {Path(output_path).name}")
+    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
+    if result.returncode != 0:
+        raise RuntimeError(f"ffmpeg failed: {result.stderr[:300]}")
+
+
+def transcribe_chunk(audio_path: str, output_base: str) -> bool:
+    """Run whisper.cpp with anti-hallucination params on a chunk."""
+    cmd = [
+        WHISPER_BIN,
+        "--model", WHISPER_MODEL,
+        "--language", "ro",
+        "--no-gpu",
+        "--threads", str(os.cpu_count() or 4),
+        "--beam-size", "1",
+        "--best-of", "1",
+        "--max-context", "0",      # don't carry context between segments
+        "--entropy-thold", "2.4",
+        "--max-len", "60",
+        "--suppress-nst",          # suppress non-speech tokens (reduces hallucination on silence)
+        "--no-fallback",           # don't retry with higher temperature (prevents hallucination amplification)
+        "--output-txt",
+        "--output-srt",
+        "--output-file", output_base,
+        "--file", audio_path,
+    ]
+
+    log.info(f"  CMD: {' '.join(cmd)}")
+
+    env = os.environ.copy()
+    whisper_dir = str(Path(WHISPER_BIN).resolve().parent)
+    env["PATH"] = whisper_dir + os.pathsep + env.get("PATH", "")
+
+    result = subprocess.run(
+        cmd,
+        stdout=sys.stdout,
+        stderr=sys.stderr,
+        timeout=7200,
+        env=env,
+    )
+    return result.returncode == 0
+
+
+# --- Splice ---
+
+def build_srt_block(idx: int, entry: dict) -> str:
+    """Format a single SRT entry."""
+    return f"{idx}\n{entry['start']} --> {entry['end']}\n{entry['text']}\n"
+
+
+def build_srt_block_offset(idx: int, entry: dict, offset_sec: float) -> str:
+    """Format a single SRT entry with timestamp offset."""
+    new_start = entry["start_sec"] + offset_sec
+    new_end = entry["end_sec"] + offset_sec
+    return f"{idx}\n{_fmt_ts(new_start)} --> {_fmt_ts(new_end)}\n{entry['text']}\n"
+
+
+def fix_transcript(stem: str, dry_run: bool = False) -> bool:
+    """Fix one hallucinated transcript. Returns True on success."""
+    srt_path = TRANSCRIPTS_DIR / f"{stem}.srt"
+    txt_path = TRANSCRIPTS_DIR / f"{stem}.txt"
+    if not srt_path.exists():
+        log.error(f"SRT not found: {srt_path}")
+        return False
+
+    entries = parse_srt(srt_path)
+    bursts = detect_bursts(entries)
+    if not bursts:
+        log.info(f"  {stem}: no hallucination detected, skipping")
+        return True
+
+    # Report findings
+    for b in bursts:
+        log.info(
+            f"  {b['type'].upper()} (RE-TRANSCRIBE): entries {b['start_idx']}-{b['end_idx']} "
+            f"({_fmt_ts(b['start_sec'])} - {_fmt_ts(b['end_sec'])}), "
+            f"\"{b['text'][:50]}\" x{b['count']}"
+        )
+
+    if dry_run:
+        return True
+
+    # Find audio source (WAV)
+    audio_src = find_wav_for_stem(stem)
+    if not audio_src:
+        log.error(f"  No audio found for {stem}")
+        return False
+
+    TEMP_DIR.mkdir(exist_ok=True)
+
+    # Backup originals
+    backup_dir = TRANSCRIPTS_DIR / "backup"
+    backup_dir.mkdir(exist_ok=True)
+    for p in [txt_path, srt_path]:
+        if p.exists():
+            backup = backup_dir / p.name
+            if backup.exists():
+                backup.unlink()
+            shutil.copy2(p, backup)
+            log.info(f"  Backed up: {backup}")
+
+    # Build the fixed transcript by processing segments between bursts.
+    # Strategy: keep good entries as-is, replace each burst with re-transcription.
+    #
+    # Segments to process:
+    #   [0 .. burst0.start)          → keep (good)
+    #   [burst0.start .. burst0.end] → re-transcribe
+    #   (burst0.end .. burst1.start) → keep (good)
+    #   [burst1.start .. burst1.end] → re-transcribe
+    #   ...
+    #   after last burst:
+    #     if type=burst → keep remaining entries
+    #     if type=tail  → re-transcribe from burst start to end of audio
+
+    result_txt_parts = []
+    result_srt_entries = []  # list of (start_sec, end_sec, start_ts, end_ts, text)
+    chunk_idx = 0
+
+    prev_end_idx = -1  # last processed entry index
+
+    for burst in bursts:
+        # 1) Keep good entries before this burst
+        for i in range(prev_end_idx + 1, burst["start_idx"]):
+            e = entries[i]
+            result_srt_entries.append(e)
+            result_txt_parts.append(e["text"])
+
+        # 2) Re-transcribe the hallucinated segment
+        chunk_wav = str(TEMP_DIR / f"{stem}_chunk{chunk_idx}.wav")
+        if burst["type"] == "tail":
+            extract_audio_segment(audio_src, burst["start_sec"], None, chunk_wav)
+        else:
+            extract_audio_segment(
+                audio_src, burst["start_sec"], burst["end_sec"], chunk_wav
+            )
+
+        chunk_base = str(TEMP_DIR / f"{stem}_chunk{chunk_idx}")
+        success = transcribe_chunk(chunk_wav, chunk_base)
+
+        chunk_srt = Path(f"{chunk_base}.srt")
+        chunk_usable = False
+
+        if success and chunk_srt.exists():
+            chunk_entries = parse_srt(chunk_srt)
+            # Check if the retranscription itself hallucinated
+            if detect_bursts(chunk_entries):
+                log.warning(f"  Chunk {chunk_idx} hallucinated again — "
+                            f"likely silence/music, dropping segment")
+            elif not chunk_entries:
+                log.warning(f"  Chunk {chunk_idx} produced empty output, dropping segment")
+            else:
+                chunk_usable = True
+        else:
+            log.warning(f"  Whisper failed on chunk {chunk_idx}, dropping segment")
+
+        if chunk_usable:
+            # Read re-transcribed entries and offset timestamps
+            offset = burst["start_sec"]
+            for ce in chunk_entries:
+                ce["start_sec"] += offset
+                ce["end_sec"] += offset
+                ce["start"] = _fmt_ts(ce["start_sec"])
+                ce["end"] = _fmt_ts(ce["end_sec"])
+                result_srt_entries.append(ce)
+                result_txt_parts.append(ce["text"])
+        else:
+            log.info(f"  Segment {_fmt_ts(burst['start_sec'])} - "
+                     f"{_fmt_ts(burst['end_sec'])} removed (no usable speech)")
+
+        if burst["type"] == "tail":
+            prev_end_idx = len(entries)
+        else:
+            prev_end_idx = burst["end_idx"]
+        chunk_idx += 1
+
+    # 3) Keep any remaining good entries after last burst
+    if prev_end_idx < len(entries) - 1:
+        for i in range(prev_end_idx + 1, len(entries)):
+            e = entries[i]
+            result_srt_entries.append(e)
+            result_txt_parts.append(e["text"])
+
+    # Write final TXT
+    with open(txt_path, "w", encoding="utf-8") as f:
+        f.write("\n".join(result_txt_parts))
+        f.write("\n")
+    log.info(f"  Written: {txt_path} ({txt_path.stat().st_size} bytes)")
+
+    # Write final SRT
+    with open(srt_path, "w", encoding="utf-8") as f:
+        for i, e in enumerate(result_srt_entries, 1):
+            f.write(f"{i}\n{e['start']} --> {e['end']}\n{e['text']}\n\n")
+    log.info(f"  Written: {srt_path} ({srt_path.stat().st_size} bytes)")
+
+    log.info(f"  {stem}: FIXED ({len(result_srt_entries)} entries, "
+             f"{chunk_idx} chunk(s) re-transcribed)")
+    return True
+
+
+def find_wav_for_stem(stem: str) -> str | None:
+    """Find the WAV file corresponding to a transcript stem."""
+    # Direct match
+    wav = WAV_CACHE_DIR / f"{stem}.wav"
+    if wav.exists():
+        return str(wav)
+    # Try with [Audio] suffix (original download names)
+    wav_audio = WAV_CACHE_DIR / f"{stem} [Audio].wav"
+    if wav_audio.exists():
+        return str(wav_audio)
+    # Glob for partial match
+    for w in WAV_CACHE_DIR.glob(f"{stem}*.wav"):
+        return str(w)
+    return None
+
+
+def find_broken_transcripts() -> list[str]:
+    """Scan all SRT files and return stems with hallucination."""
+    broken = []
+    for srt_file in sorted(TRANSCRIPTS_DIR.glob("*.srt")):
+        entries = parse_srt(srt_file)
+        if detect_bursts(entries):
+            broken.append(srt_file.stem)
+    return broken
+
+
+def main():
+    args = sys.argv[1:]
+    dry_run = "--dry-run" in args
+    args = [a for a in args if a != "--dry-run"]
+
+    if args:
+        stems = args
+    else:
+        log.info("Scanning for hallucinated transcripts...")
+        stems = find_broken_transcripts()
+        if not stems:
+            log.info("All transcripts look clean!")
+            return
+        log.info(f"Found {len(stems)} broken transcript(s): {stems}")
+
+    for stem in stems:
+        log.info(f"\n{'='*60}")
+        log.info(f"Processing: {stem}")
+        log.info(f"{'='*60}")
+        fix_transcript(stem, dry_run=dry_run)
+
+    if not dry_run and TEMP_DIR.exists():
+        shutil.rmtree(TEMP_DIR)
+        log.info("Cleaned up temp directory")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/run.bat b/run.bat
index 24bbe5a..3fb5be8 100644
--- a/run.bat
+++ b/run.bat
@@ -274,8 +274,7 @@ if "%~1"=="" (
 if errorlevel 1 (
     echo.
     echo WARNING: Some downloads failed. Check download_errors.log
-    echo Press any key to continue to transcription anyway, or Ctrl+C to abort.
-    pause >nul
+    echo Continuing to transcription automatically...
 )
 
 :: ============================================================
@@ -312,4 +311,3 @@ echo.
 echo  Next step: generate summaries from WSL2 with Claude Code
 echo    python summarize.py
 echo ============================================================
-pause
diff --git a/transcribe.py b/transcribe.py
index f5ba638..05c2af2 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -131,6 +131,11 @@ def transcribe_file(audio_path: str, output_base: str) -> bool:
         "--threads", str(os.cpu_count() or 4),
         "--beam-size", "1",
         "--best-of", "1",
+        "--max-context", "0",     # don't carry context between segments (prevents hallucination loops)
+        "--entropy-thold", "2.4", # reject high-entropy (hallucinated) segments
+        "--max-len", "60",        # shorter segments reduce drift
+        "--suppress-nst",         # suppress non-speech tokens (reduces hallucination on silence)
+        "--no-fallback",          # don't retry with higher temperature
         "--output-txt",
         "--output-srt",
         "--output-file", output_base,
@@ -260,17 +265,9 @@ def main():
             # Save manifest after each file (checkpoint)
             save_manifest(manifest)
 
-        # Quality gate: pause after first module
+        # Log milestone after first module (no longer pauses)
         if mod == manifest["modules"][0] and transcribed > 0:
-            log.info("\n" + "!" * 60)
-            log.info("QUALITY GATE: First module complete.")
-            log.info("Spot-check 2-3 transcripts in transcripts/ before continuing.")
-            log.info("Press Enter to continue, or Ctrl+C to abort...")
-            log.info("!" * 60)
-            try:
-                input()
-            except EOFError:
-                pass  # Non-interactive mode, continue
+            log.info(f"First module complete ({transcribed} files). Continuing automatically...")
 
     # Validation
     empty_outputs = [