""" Batch transcription using whisper.cpp. Reads manifest.json, transcribes each audio file in module order, outputs .txt and .srt files, updates manifest status. Resumable: skips files with existing transcripts. Converts MP3 -> WAV (16kHz mono) via ffmpeg before transcription. """ import json import logging import os import shutil import subprocess import sys from pathlib import Path MANIFEST_PATH = Path("manifest.json") TRANSCRIPTS_DIR = Path("transcripts") WAV_CACHE_DIR = Path("audio_wav") # whisper.cpp defaults — override with env vars or CLI args WHISPER_BIN = os.getenv("WHISPER_BIN", r"whisper-cli.exe") WHISPER_MODEL = os.getenv("WHISPER_MODEL", r"models\ggml-medium-q5_0.bin") logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", handlers=[ logging.StreamHandler(), logging.FileHandler("transcribe_errors.log"), ], ) log = logging.getLogger(__name__) def find_ffmpeg() -> str: """Find ffmpeg executable.""" if shutil.which("ffmpeg"): return "ffmpeg" # Check local directories for p in [Path("ffmpeg.exe"), Path("ffmpeg-bin/ffmpeg.exe")]: if p.exists(): return str(p.resolve()) # Try imageio-ffmpeg (pip fallback) try: import imageio_ffmpeg return imageio_ffmpeg.get_ffmpeg_exe() except ImportError: pass return "" def convert_to_wav(audio_path: str) -> str: """ Convert audio file to WAV 16kHz mono (optimal for whisper.cpp). Returns path to WAV file. Skips if WAV already exists. """ src = Path(audio_path) # Already a WAV file, skip if src.suffix.lower() == ".wav": return audio_path WAV_CACHE_DIR.mkdir(exist_ok=True) wav_path = WAV_CACHE_DIR / (src.stem + ".wav") # Skip if already converted if wav_path.exists() and wav_path.stat().st_size > 0: log.info(f" WAV cache hit: {wav_path}") return str(wav_path) ffmpeg = find_ffmpeg() if not ffmpeg: log.warning(" ffmpeg not found, using original file (may cause bad transcription)") return audio_path log.info(f" Converting to WAV: {src.name} -> {wav_path.name}") cmd = [ ffmpeg, "-i", audio_path, "-vn", # no video "-acodec", "pcm_s16le", # 16-bit PCM "-ar", "16000", # 16kHz sample rate (whisper standard) "-ac", "1", # mono "-y", # overwrite str(wav_path), ] try: result = subprocess.run( cmd, capture_output=True, text=True, timeout=300, # 5 min max for conversion ) if result.returncode != 0: log.error(f" ffmpeg failed: {result.stderr[:300]}") return audio_path log.info(f" WAV ready: {wav_path.name} ({wav_path.stat().st_size / 1_048_576:.0f} MB)") return str(wav_path) except FileNotFoundError: log.warning(f" ffmpeg not found at: {ffmpeg}") return audio_path except subprocess.TimeoutExpired: log.error(f" ffmpeg conversion timeout for {audio_path}") return audio_path def load_manifest() -> dict: with open(MANIFEST_PATH, encoding="utf-8") as f: return json.load(f) def save_manifest(manifest: dict): with open(MANIFEST_PATH, "w", encoding="utf-8") as f: json.dump(manifest, f, indent=2, ensure_ascii=False) def transcribe_file(audio_path: str, output_base: str) -> bool: """ Run whisper.cpp on a single file. Returns True on success. """ cmd = [ WHISPER_BIN, "--model", WHISPER_MODEL, "--language", "ro", "--no-gpu", "--threads", str(os.cpu_count() or 4), "--beam-size", "1", "--best-of", "1", "--max-context", "0", # don't carry context between segments (prevents hallucination loops) "--entropy-thold", "2.4", # reject high-entropy (hallucinated) segments "--max-len", "60", # shorter segments reduce drift "--suppress-nst", # suppress non-speech tokens (reduces hallucination on silence) "--no-fallback", # don't retry with higher temperature "--output-txt", "--output-srt", "--output-file", output_base, "--file", audio_path, ] log.info(f" CMD: {' '.join(cmd)}") try: # Add whisper.exe's directory to PATH so Windows finds its DLLs env = os.environ.copy() whisper_dir = str(Path(WHISPER_BIN).resolve().parent) env["PATH"] = whisper_dir + os.pathsep + env.get("PATH", "") result = subprocess.run( cmd, stdout=sys.stdout, stderr=sys.stderr, timeout=7200, # 2 hour timeout per file env=env, ) if result.returncode != 0: log.error(f" whisper.cpp failed (exit {result.returncode})") return False # Verify output exists and is non-empty txt_path = Path(f"{output_base}.txt") srt_path = Path(f"{output_base}.srt") if not txt_path.exists() or txt_path.stat().st_size == 0: log.error(f" Empty or missing transcript: {txt_path}") return False log.info(f" Output: {txt_path.name} ({txt_path.stat().st_size} bytes)") if srt_path.exists(): log.info(f" Output: {srt_path.name} ({srt_path.stat().st_size} bytes)") return True except subprocess.TimeoutExpired: log.error(f" Timeout (>2h) for {audio_path}") return False except FileNotFoundError: log.error(f" whisper.cpp not found at: {WHISPER_BIN}") log.error(f" Set WHISPER_BIN env var or put whisper-cli.exe in PATH") return False except Exception as e: log.error(f" Error: {e}") return False def parse_module_filter(arg: str) -> set[int]: """Parse module filter like '1-3' or '4,5' or '1-3,5' into a set of 1-based indices.""" result = set() for part in arg.split(","): part = part.strip() if "-" in part: a, b = part.split("-", 1) result.update(range(int(a), int(b) + 1)) else: result.add(int(part)) return result def main(): if not MANIFEST_PATH.exists(): log.error("manifest.json not found. Run download.py first.") sys.exit(1) # Parse --modules filter module_filter = None if "--modules" in sys.argv: idx = sys.argv.index("--modules") if idx + 1 < len(sys.argv): module_filter = parse_module_filter(sys.argv[idx + 1]) log.info(f"Module filter: {sorted(module_filter)}") manifest = load_manifest() TRANSCRIPTS_DIR.mkdir(exist_ok=True) total = 0 transcribed = 0 skipped = 0 failed = 0 for mod_idx, mod in enumerate(manifest["modules"], 1): if module_filter and mod_idx not in module_filter: log.info(f"\nSkipping module {mod_idx}: {mod['name']}") continue log.info(f"\n{'='*60}") log.info(f"Module: {mod['name']}") log.info(f"{'='*60}") for lec in mod["lectures"]: total += 1 if lec.get("download_status") != "complete": log.warning(f" Skipping (not downloaded): {lec['title']}") continue audio_path = lec["audio_path"] stem = Path(lec["original_filename"]).stem.replace(" [Audio]", "") output_base = str(TRANSCRIPTS_DIR / stem) # Check if already transcribed txt_path = Path(f"{output_base}.txt") if txt_path.exists() and txt_path.stat().st_size > 0: lec["transcribe_status"] = "complete" skipped += 1 log.info(f" Skipping (exists): {stem}.txt") continue log.info(f" Transcribing: {lec['title']}") log.info(f" File: {audio_path}") # Convert MP3 -> WAV 16kHz mono for reliable whisper.cpp input wav_path = convert_to_wav(audio_path) if transcribe_file(wav_path, output_base): lec["transcribe_status"] = "complete" transcribed += 1 else: lec["transcribe_status"] = "failed" failed += 1 # Save manifest after each file (checkpoint) save_manifest(manifest) # Log milestone after first module (no longer pauses) if mod == manifest["modules"][0] and transcribed > 0: log.info(f"First module complete ({transcribed} files). Continuing automatically...") # Validation empty_outputs = [ lec["title"] for mod in manifest["modules"] for lec in mod["lectures"] if lec.get("transcribe_status") == "complete" and not Path(lec["transcript_path"]).exists() ] log.info("\n" + "=" * 60) log.info(f"Transcribed {transcribed}/{total} files, {skipped} skipped, {failed} failures.") log.info(f"No empty outputs: {'PASS' if not empty_outputs else 'FAIL'}") if empty_outputs: for t in empty_outputs: log.error(f" Missing transcript: {t}") log.info("=" * 60) save_manifest(manifest) if failed: sys.exit(1) if __name__ == "__main__": main()