NLP Master: pipeline download + transcribe + summarize

- run.bat: one-click pipeline (download, convert, transcribe)
- download.py: fetch audio from course platform
- transcribe.py: whisper.cpp batch transcription (CPU, WAV 16kHz)
- MP3->WAV conversion via ffmpeg
- --modules filter for splitting work across machines
- summarize.py: generate summaries from transcripts
- setup_whisper.py: auto-download whisper.cpp, ffmpeg, and model
- Medium model (q5_0) instead of large to avoid VRAM crashes

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-24 01:37:13 +02:00
commit bbc5884545
10 changed files with 2203 additions and 0 deletions
--- a/transcribe.py
+++ b/transcribe.py
@@ -0,0 +1,299 @@
+"""
+Batch transcription using whisper.cpp.
+Reads manifest.json, transcribes each audio file in module order,
+outputs .txt and .srt files, updates manifest status.
+Resumable: skips files with existing transcripts.
+Converts MP3 -> WAV (16kHz mono) via ffmpeg before transcription.
+"""
+
+import json
+import logging
+import os
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+# Pipeline locations, all relative to the current working directory.
+# MANIFEST_PATH: JSON manifest that this script reads and rewrites.
+# TRANSCRIPTS_DIR: where the whisper.cpp .txt/.srt outputs are written.
+# WAV_CACHE_DIR: cache of 16kHz mono WAV files produced by convert_to_wav().
+MANIFEST_PATH = Path("manifest.json")
+TRANSCRIPTS_DIR = Path("transcripts")
+WAV_CACHE_DIR = Path("audio_wav")
+
+# whisper.cpp defaults — override with the WHISPER_BIN / WHISPER_MODEL env vars.
+# (No CLI arguments are parsed for these two settings in this file.)
+# NOTE: the default model path uses a backslash separator, i.e. a Windows-style path.
+WHISPER_BIN = os.getenv("WHISPER_BIN", r"whisper-cli.exe")
+WHISPER_MODEL = os.getenv("WHISPER_MODEL", r"models\ggml-medium-q5_0.bin")
+
+# Configure the root logger at import time: INFO and above go to both the
+# console stream and transcribe_errors.log. Despite its name, the log file
+# receives every INFO-level message, not only errors.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+    handlers=[
+        logging.StreamHandler(),
+        logging.FileHandler("transcribe_errors.log"),
+    ],
+)
+log = logging.getLogger(__name__)
+
+
+def find_ffmpeg() -> str:
+    """Locate an ffmpeg executable.
+
+    Lookup order:
+      1. ``ffmpeg`` on the system PATH (returned as the bare command name),
+      2. ``ffmpeg.exe`` or ``ffmpeg-bin/ffmpeg.exe`` relative to the working
+         directory (returned as an absolute path),
+      3. the binary reported by the optional ``imageio_ffmpeg`` package.
+
+    Returns:
+        A command name or path to run, or "" when nothing was found.
+    """
+    if shutil.which("ffmpeg") is not None:
+        return "ffmpeg"
+
+    # Binaries shipped next to the script.
+    local_candidates = (Path("ffmpeg.exe"), Path("ffmpeg-bin/ffmpeg.exe"))
+    bundled = next((c for c in local_candidates if c.exists()), None)
+    if bundled is not None:
+        return str(bundled.resolve())
+
+    # Last resort: the pip-installable imageio-ffmpeg package, if present.
+    try:
+        import imageio_ffmpeg
+
+        located = imageio_ffmpeg.get_ffmpeg_exe()
+    except ImportError:
+        located = ""
+    return located
+
+
+def convert_to_wav(audio_path: str) -> str:
+    """
+    Convert an audio file to WAV 16kHz mono (optimal for whisper.cpp).
+
+    The result is cached in WAV_CACHE_DIR under the source file's stem.
+    ffmpeg writes to a temporary file that is renamed into the cache only
+    after a successful run, so a failed or timed-out conversion can never be
+    picked up as a cache hit on the next (resumed) run.
+
+    Args:
+        audio_path: Path to the source audio file.
+
+    Returns:
+        Path to the WAV file. Falls back to ``audio_path`` unchanged when the
+        input already is a WAV file, ffmpeg cannot be found, or the
+        conversion fails or exceeds the 5 minute timeout.
+    """
+    src = Path(audio_path)
+
+    # Already a WAV file, skip
+    if src.suffix.lower() == ".wav":
+        return audio_path
+
+    WAV_CACHE_DIR.mkdir(exist_ok=True)
+    wav_path = WAV_CACHE_DIR / (src.stem + ".wav")
+    # Keep the .wav extension on the temp name so ffmpeg still infers the
+    # output container from it.
+    tmp_path = WAV_CACHE_DIR / (src.stem + ".partial.wav")
+
+    # Skip if already converted
+    if wav_path.exists() and wav_path.stat().st_size > 0:
+        log.info(f"  WAV cache hit: {wav_path}")
+        return str(wav_path)
+
+    ffmpeg = find_ffmpeg()
+    if not ffmpeg:
+        log.warning("  ffmpeg not found, using original file (may cause bad transcription)")
+        return audio_path
+
+    def discard_partial() -> None:
+        # Best-effort cleanup; a leftover temp file is harmless because it is
+        # never treated as a cache entry and is overwritten (-y) next time.
+        try:
+            tmp_path.unlink(missing_ok=True)
+        except OSError:
+            pass
+
+    log.info(f"  Converting to WAV: {src.name} -> {wav_path.name}")
+    cmd = [
+        ffmpeg,
+        "-i", audio_path,
+        "-vn",                   # no video
+        "-acodec", "pcm_s16le",  # 16-bit PCM
+        "-ar", "16000",          # 16kHz sample rate (whisper standard)
+        "-ac", "1",              # mono
+        "-y",                    # overwrite
+        str(tmp_path),
+    ]
+
+    try:
+        result = subprocess.run(
+            cmd,
+            stdin=subprocess.DEVNULL,  # ffmpeg must not consume our console input
+            capture_output=True,
+            text=True,
+            errors="replace",  # never crash on undecodable ffmpeg output
+            timeout=300,  # 5 min max for conversion
+        )
+    except FileNotFoundError:
+        log.warning(f"  ffmpeg not found at: {ffmpeg}")
+        return audio_path
+    except subprocess.TimeoutExpired:
+        log.error(f"  ffmpeg conversion timeout for {audio_path}")
+        discard_partial()
+        return audio_path
+
+    produced = tmp_path.exists() and tmp_path.stat().st_size > 0
+    if result.returncode != 0 or not produced:
+        # ffmpeg prints its banner first; the actual error is at the end.
+        log.error(f"  ffmpeg failed: {result.stderr[-300:]}")
+        discard_partial()
+        return audio_path
+
+    # Publish the finished file into the cache in one step.
+    tmp_path.replace(wav_path)
+    log.info(f"  WAV ready: {wav_path.name} ({wav_path.stat().st_size / 1_048_576:.0f} MB)")
+    return str(wav_path)
+
+
+def load_manifest() -> dict:
+    """Read MANIFEST_PATH as UTF-8 text and return the parsed JSON document."""
+    raw_json = MANIFEST_PATH.read_text(encoding="utf-8")
+    return json.loads(raw_json)
+
+
+def save_manifest(manifest: dict) -> None:
+    """Write the manifest to MANIFEST_PATH as indented UTF-8 JSON.
+
+    The JSON is first written to a sibling ``<name>.tmp`` file and then moved
+    over MANIFEST_PATH. This function is the per-file checkpoint of a
+    long-running job, so an interruption or serialization error in the middle
+    of a write must not leave a truncated manifest behind.
+
+    Args:
+        manifest: The manifest document to serialize.
+    """
+    tmp_path = MANIFEST_PATH.with_name(MANIFEST_PATH.name + ".tmp")
+    with open(tmp_path, "w", encoding="utf-8") as f:
+        json.dump(manifest, f, indent=2, ensure_ascii=False)
+    # Path.replace overwrites the destination in a single rename step.
+    tmp_path.replace(MANIFEST_PATH)
+
+
+def transcribe_file(audio_path: str, output_base: str) -> bool:
+    """
+    Transcribe a single audio file with whisper.cpp.
+
+    Runs WHISPER_BIN with WHISPER_MODEL, language "ro", GPU disabled, beam
+    size 1 and best-of 1, asking for .txt and .srt output at ``output_base``
+    (path without extension). The child process shares this process's
+    stdout/stderr and is limited to 7200 seconds.
+
+    Returns:
+        True only when whisper.cpp exits with 0 and ``<output_base>.txt``
+        exists and is non-empty. Every failure is logged and reported as False.
+    """
+    thread_count = os.cpu_count() or 4
+    whisper_args = [WHISPER_BIN]
+    whisper_args += ["--model", WHISPER_MODEL]
+    whisper_args += ["--language", "ro"]
+    whisper_args += ["--no-gpu"]
+    whisper_args += ["--threads", str(thread_count)]
+    whisper_args += ["--beam-size", "1", "--best-of", "1"]
+    whisper_args += ["--output-txt", "--output-srt"]
+    whisper_args += ["--output-file", output_base]
+    whisper_args += ["--file", audio_path]
+
+    log.info("  CMD: " + " ".join(whisper_args))
+
+    try:
+        # Put the directory of the whisper binary first on PATH so that
+        # Windows can locate the DLLs shipped next to it.
+        bin_dir = Path(WHISPER_BIN).resolve().parent
+        child_env = dict(os.environ)
+        child_env["PATH"] = os.pathsep.join([str(bin_dir), child_env.get("PATH", "")])
+
+        completed = subprocess.run(
+            whisper_args,
+            stdout=sys.stdout,
+            stderr=sys.stderr,
+            timeout=7200,  # 2 hour timeout per file
+            env=child_env,
+        )
+
+        exit_code = completed.returncode
+        if exit_code != 0:
+            log.error(f"  whisper.cpp failed (exit {exit_code})")
+            return False
+
+        # A zero exit code is not enough: require a non-empty .txt transcript.
+        transcript = Path(output_base + ".txt")
+        subtitles = Path(output_base + ".srt")
+
+        if not (transcript.exists() and transcript.stat().st_size != 0):
+            log.error(f"  Empty or missing transcript: {transcript}")
+            return False
+
+        log.info(f"  Output: {transcript.name} ({transcript.stat().st_size} bytes)")
+        # The .srt file is reported when present but is not required.
+        if subtitles.exists():
+            log.info(f"  Output: {subtitles.name} ({subtitles.stat().st_size} bytes)")
+
+        return True
+
+    except subprocess.TimeoutExpired:
+        log.error(f"  Timeout (>2h) for {audio_path}")
+        return False
+    except FileNotFoundError:
+        log.error(f"  whisper.cpp not found at: {WHISPER_BIN}")
+        log.error("  Set WHISPER_BIN env var or put whisper-cli.exe in PATH")
+        return False
+    except Exception as exc:
+        log.error(f"  Error: {exc}")
+        return False
+
+
+def parse_module_filter(arg: str) -> set[int]:
+    """Parse a module filter like '1-3' or '4,5' or '1-3,5' into a set of 1-based indices.
+
+    Whitespace around parts and empty parts (e.g. a trailing comma) are
+    ignored. A reversed range such as '3-1' is treated like '1-3' instead of
+    silently selecting nothing.
+
+    Args:
+        arg: Comma-separated list of module numbers and inclusive ranges.
+
+    Returns:
+        The selected module indices; never empty.
+
+    Raises:
+        ValueError: If a part is not a number or range, or nothing is selected.
+    """
+    selected: set[int] = set()
+    for part in arg.split(","):
+        part = part.strip()
+        if not part:
+            continue
+        first_text, dash, last_text = part.partition("-")
+        try:
+            first = int(first_text)
+            last = int(last_text) if dash else first
+        except ValueError:
+            raise ValueError(f"invalid module filter part: {part!r}") from None
+        if first > last:
+            first, last = last, first
+        selected.update(range(first, last + 1))
+    if not selected:
+        raise ValueError(f"module filter selects no modules: {arg!r}")
+    return selected
+
+
+def _read_module_filter(argv):
+    """Extract the --modules selection from argv.
+
+    Accepts both ``--modules 1-3,5`` and ``--modules=1-3,5``.
+
+    Returns:
+        A non-empty set of 1-based module indices, or None when the flag is
+        absent. Logs an error and exits with status 2 when the value is
+        missing, malformed, or selects nothing.
+    """
+    spec = None
+    for pos, arg in enumerate(argv):
+        if arg == "--modules":
+            if pos + 1 >= len(argv):
+                log.error("--modules requires a value, e.g. --modules 1-3,5")
+                sys.exit(2)
+            spec = argv[pos + 1]
+            break
+        if arg.startswith("--modules="):
+            spec = arg.split("=", 1)[1]
+            break
+    if spec is None:
+        return None
+
+    try:
+        selected = parse_module_filter(spec)
+    except ValueError as exc:
+        log.error(f"Invalid --modules value {spec!r}: {exc}")
+        sys.exit(2)
+    if not selected:
+        # An empty filter must not silently turn into "process everything".
+        log.error(f"--modules value {spec!r} selects no modules")
+        sys.exit(2)
+    return selected
+
+
+def _transcript_stem(lec: dict) -> str:
+    """Return the output file stem for a lecture (original name without ' [Audio]')."""
+    return Path(lec["original_filename"]).stem.replace(" [Audio]", "")
+
+
+def _has_transcript(lec: dict) -> bool:
+    """Tell whether a non-empty transcript file exists for the lecture.
+
+    Uses the manifest's ``transcript_path`` when present; this script never
+    writes that key itself, so it falls back to the path it writes to.
+    """
+    recorded = lec.get("transcript_path")
+    if recorded:
+        path = Path(recorded)
+    else:
+        path = TRANSCRIPTS_DIR / f"{_transcript_stem(lec)}.txt"
+    return path.exists() and path.stat().st_size > 0
+
+
+def main():
+    """Transcribe every downloaded lecture listed in the manifest.
+
+    Walks the manifest modules in order (optionally restricted by --modules),
+    skips lectures that are not downloaded or already have a transcript,
+    converts the rest to WAV and runs whisper.cpp on them, saving the
+    manifest after each file. Pauses for a manual quality check after the
+    first module, validates the outputs at the end, and exits with status 1
+    if any transcription failed.
+    """
+    if not MANIFEST_PATH.exists():
+        log.error("manifest.json not found. Run download.py first.")
+        sys.exit(1)
+
+    # Parse --modules filter
+    module_filter = _read_module_filter(sys.argv)
+    if module_filter is not None:
+        log.info(f"Module filter: {sorted(module_filter)}")
+
+    manifest = load_manifest()
+    TRANSCRIPTS_DIR.mkdir(exist_ok=True)
+
+    if module_filter is not None:
+        module_count = len(manifest["modules"])
+        unknown = sorted(i for i in module_filter if not 1 <= i <= module_count)
+        if unknown:
+            log.warning(f"Module filter entries outside 1..{module_count} ignored: {unknown}")
+
+    total = 0
+    transcribed = 0
+    skipped = 0
+    failed = 0
+
+    for mod_idx, mod in enumerate(manifest["modules"], 1):
+        if module_filter is not None and mod_idx not in module_filter:
+            log.info(f"\nSkipping module {mod_idx}: {mod['name']}")
+            continue
+        log.info(f"\n{'='*60}")
+        log.info(f"Module: {mod['name']}")
+        log.info(f"{'='*60}")
+
+        for lec in mod["lectures"]:
+            total += 1
+
+            if lec.get("download_status") != "complete":
+                log.warning(f"  Skipping (not downloaded): {lec['title']}")
+                continue
+
+            audio_path = lec["audio_path"]
+            stem = _transcript_stem(lec)
+            output_base = str(TRANSCRIPTS_DIR / stem)
+
+            # Check if already transcribed
+            txt_path = Path(f"{output_base}.txt")
+            if txt_path.exists() and txt_path.stat().st_size > 0:
+                lec["transcribe_status"] = "complete"
+                skipped += 1
+                log.info(f"  Skipping (exists): {stem}.txt")
+                continue
+
+            log.info(f"  Transcribing: {lec['title']}")
+            log.info(f"  File: {audio_path}")
+
+            # Convert MP3 -> WAV 16kHz mono for reliable whisper.cpp input
+            wav_path = convert_to_wav(audio_path)
+
+            if transcribe_file(wav_path, output_base):
+                lec["transcribe_status"] = "complete"
+                transcribed += 1
+            else:
+                lec["transcribe_status"] = "failed"
+                failed += 1
+
+            # Save manifest after each file (checkpoint)
+            save_manifest(manifest)
+
+        # Quality gate: pause after first module
+        if mod_idx == 1 and transcribed > 0:
+            log.info("\n" + "!" * 60)
+            log.info("QUALITY GATE: First module complete.")
+            log.info("Spot-check 2-3 transcripts in transcripts/ before continuing.")
+            log.info("Press Enter to continue, or Ctrl+C to abort...")
+            log.info("!" * 60)
+            try:
+                input()
+            except EOFError:
+                pass  # Non-interactive mode, continue
+            except KeyboardInterrupt:
+                # The prompt invites Ctrl+C: persist progress and leave cleanly.
+                log.info("Aborted by user at quality gate.")
+                save_manifest(manifest)
+                sys.exit(1)
+
+    # Validation: everything marked complete must have a non-empty transcript
+    empty_outputs = [
+        lec["title"]
+        for mod in manifest["modules"]
+        for lec in mod["lectures"]
+        if lec.get("transcribe_status") == "complete"
+        and not _has_transcript(lec)
+    ]
+
+    log.info("\n" + "=" * 60)
+    log.info(f"Transcribed {transcribed}/{total} files, {skipped} skipped, {failed} failures.")
+    log.info(f"No empty outputs: {'PASS' if not empty_outputs else 'FAIL'}")
+    for title in empty_outputs:
+        log.error(f"  Missing transcript: {title}")
+    log.info("=" * 60)
+
+    save_manifest(manifest)
+
+    if failed:
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()