feat: anti-hallucination params + retranscribe script for fixing broken transcripts

- transcribe.py: pass --max-context 0, --entropy-thold 2.4, --max-len 60,
  --suppress-nst, and --no-fallback to whisper.cpp to prevent hallucination loops
- transcribe.py: remove interactive quality gate (runs unattended now)
- run.bat: remove pause prompts for unattended operation
- retranscribe_tail.py: new script that detects hallucination bursts in SRT
  files, extracts and re-transcribes only the affected audio segments, then
  splices the results back into the transcript. Segments that hallucinate
  again on re-transcription (typically silence or music) are dropped.
  Originals are backed up to transcripts/backup/.
- fix_hallucinations.bat: Windows wrapper for retranscribe_tail.py

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-24 21:17:14 +02:00
parent 56e676618f
commit 763999f3a9
4 changed files with 497 additions and 13 deletions

View File

@@ -131,6 +131,11 @@ def transcribe_file(audio_path: str, output_base: str) -> bool:
"--threads", str(os.cpu_count() or 4),
"--beam-size", "1",
"--best-of", "1",
"--max-context", "0", # don't carry context between segments (prevents hallucination loops)
"--entropy-thold", "2.4", # reject high-entropy (hallucinated) segments
"--max-len", "60", # shorter segments reduce drift
"--suppress-nst", # suppress non-speech tokens (reduces hallucination on silence)
"--no-fallback", # don't retry with higher temperature
"--output-txt",
"--output-srt",
"--output-file", output_base,
@@ -260,17 +265,9 @@ def main():
# Save manifest after each file (checkpoint)
save_manifest(manifest)
# Quality gate: pause after first module
# Log milestone after first module (no longer pauses)
if mod == manifest["modules"][0] and transcribed > 0:
log.info("\n" + "!" * 60)
log.info("QUALITY GATE: First module complete.")
log.info("Spot-check 2-3 transcripts in transcripts/ before continuing.")
log.info("Press Enter to continue, or Ctrl+C to abort...")
log.info("!" * 60)
try:
input()
except EOFError:
pass # Non-interactive mode, continue
log.info(f"First module complete ({transcribed} files). Continuing automatically...")
# Validation
empty_outputs = [