# nlp-master/transcribe.py

"""
Batch transcription using whisper.cpp.
Reads manifest.json, transcribes each audio file in module order,
outputs .txt and .srt files, updates manifest status.
Resumable: skips files with existing transcripts.
Converts MP3 -> WAV (16kHz mono) via ffmpeg before transcription.
"""

import json
import logging
import os
import shutil
import subprocess
import sys
from pathlib import Path

# All three paths are relative, so they resolve against the current working
# directory the script is started from.
# Manifest that load_manifest() reads and save_manifest() rewrites.
MANIFEST_PATH = Path("manifest.json")
# Directory where main() places the .txt / .srt transcripts.
TRANSCRIPTS_DIR = Path("transcripts")
# Directory where convert_to_wav() caches ffmpeg-converted WAV files.
WAV_CACHE_DIR = Path("audio_wav")

# whisper.cpp defaults — override with the WHISPER_BIN / WHISPER_MODEL
# environment variables (no command-line flag for them is parsed in this file).
# NOTE: both defaults are Windows-style (".exe" binary, backslash in the model path).
WHISPER_BIN = os.getenv("WHISPER_BIN", r"whisper-cli.exe")
WHISPER_MODEL = os.getenv("WHISPER_MODEL", r"models\ggml-medium-q5_0.bin")

# Log every INFO+ record to both the console and a persistent log file.
# The file handler is opened as UTF-8 explicitly: without it the platform
# locale encoding is used (cp1252 on many Windows setups), which cannot encode
# characters such as the Romanian "ș"/"ț" that appear in lecture titles, so
# those log lines would be dropped with a "Logging error" traceback.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("transcribe_errors.log", encoding="utf-8"),
    ],
)
log = logging.getLogger(__name__)


def find_ffmpeg() -> str:
    """
    Locate an ffmpeg executable.

    Lookup order:
      1. ``ffmpeg`` on PATH (returned as the bare command name),
      2. ``ffmpeg.exe`` or ``ffmpeg-bin/ffmpeg.exe`` relative to the current
         working directory (returned as an absolute path),
      3. the binary bundled with the optional ``imageio-ffmpeg`` package.

    Returns:
        The command/path to run, or an empty string when nothing was found.
    """
    if shutil.which("ffmpeg"):
        return "ffmpeg"

    # Check local directories; is_file() so a directory with that name is not
    # mistaken for the executable.
    for candidate in (Path("ffmpeg.exe"), Path("ffmpeg-bin/ffmpeg.exe")):
        if candidate.is_file():
            return str(candidate.resolve())

    # Try imageio-ffmpeg (pip fallback). The package may be missing
    # (ImportError) or installed without a usable binary, in which case
    # get_ffmpeg_exe() raises RuntimeError; both mean "not found" here.
    try:
        import imageio_ffmpeg
        return imageio_ffmpeg.get_ffmpeg_exe()
    except (ImportError, RuntimeError):
        return ""


def _remove_quietly(path: Path) -> None:
    """Best-effort delete of a leftover partial file; a failure here must not abort the batch."""
    try:
        path.unlink()
    except OSError:
        # Already gone or still locked: nothing useful to do, and the file
        # name is never treated as a cache entry anyway.
        pass


def convert_to_wav(audio_path: str) -> str:
    """
    Convert an audio file to WAV 16kHz mono 16-bit PCM (optimal for whisper.cpp).

    The converted file is cached in WAV_CACHE_DIR under the source file's stem,
    and an existing non-empty cache entry is reused. ffmpeg writes to a
    ``<stem>.partial.wav`` file that is renamed to the final cache name only
    after a successful run, so a timed-out or failed conversion can never be
    picked up later as a (truncated) cache hit.

    Args:
        audio_path: Path of the source audio file.

    Returns:
        Path to the WAV file to transcribe. Falls back to ``audio_path``
        unchanged when the input is already a ``.wav``, when ffmpeg is not
        available, or when the conversion fails or times out.
    """
    src = Path(audio_path)

    # Already a WAV file, skip
    if src.suffix.lower() == ".wav":
        return audio_path

    WAV_CACHE_DIR.mkdir(exist_ok=True)
    wav_path = WAV_CACHE_DIR / (src.stem + ".wav")

    # Skip if already converted
    if wav_path.exists() and wav_path.stat().st_size > 0:
        log.info(f"  WAV cache hit: {wav_path}")
        return str(wav_path)

    ffmpeg = find_ffmpeg()
    if not ffmpeg:
        log.warning("  ffmpeg not found, using original file (may cause bad transcription)")
        return audio_path

    # Keep the ".wav" extension on the temporary name: ffmpeg picks the
    # output container from the file extension.
    partial_path = WAV_CACHE_DIR / (src.stem + ".partial.wav")

    log.info(f"  Converting to WAV: {src.name} -> {wav_path.name}")
    cmd = [
        ffmpeg,
        "-i", audio_path,
        "-vn",                   # no video
        "-acodec", "pcm_s16le",  # 16-bit PCM
        "-ar", "16000",          # 16kHz sample rate (whisper standard)
        "-ac", "1",              # mono
        "-y",                    # overwrite
        str(partial_path),
    ]

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",  # never let undecodable ffmpeg output abort the batch
            timeout=300,  # 5 min max for conversion
        )
    except FileNotFoundError:
        log.warning(f"  ffmpeg not found at: {ffmpeg}")
        return audio_path
    except subprocess.TimeoutExpired:
        log.error(f"  ffmpeg conversion timeout for {audio_path}")
        _remove_quietly(partial_path)
        return audio_path

    if result.returncode != 0:
        # ffmpeg prints its version/configuration banner first; the actual
        # error message is at the end of stderr.
        log.error(f"  ffmpeg failed: {result.stderr.strip()[-300:]}")
        _remove_quietly(partial_path)
        return audio_path

    if not partial_path.exists() or partial_path.stat().st_size == 0:
        log.error(f"  ffmpeg produced no audio for {audio_path}")
        _remove_quietly(partial_path)
        return audio_path

    # Promote the finished file to its cache name in one step.
    partial_path.replace(wav_path)

    log.info(f"  WAV ready: {wav_path.name} ({wav_path.stat().st_size / 1_048_576:.0f} MB)")
    return str(wav_path)


def load_manifest() -> dict:
    """Read MANIFEST_PATH as UTF-8 text and return the parsed JSON document."""
    raw_text = MANIFEST_PATH.read_text(encoding="utf-8")
    return json.loads(raw_text)


def save_manifest(manifest: dict) -> None:
    """
    Write the manifest to MANIFEST_PATH as indented UTF-8 JSON.

    The JSON is written to a sibling ``<name>.tmp`` file first and then moved
    over the real manifest with os.replace(). This function is used as a
    checkpoint after every transcribed file, so an interrupt or crash in the
    middle of a write must not leave a truncated, unparseable manifest behind.

    Args:
        manifest: The manifest document to serialize.
    """
    tmp_path = MANIFEST_PATH.with_name(MANIFEST_PATH.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, MANIFEST_PATH)
    except BaseException:
        # Includes KeyboardInterrupt: drop the half-written temp file, keep
        # the previous manifest untouched, and let the error propagate.
        tmp_path.unlink(missing_ok=True)
        raise


def transcribe_file(audio_path: str, output_base: str, language: str = "ro") -> bool:
    """
    Run whisper.cpp on a single audio file.

    Args:
        audio_path: Audio file passed to whisper.cpp via ``--file``.
        output_base: Output path without extension, passed via
            ``--output-file``; ``<output_base>.txt`` and ``<output_base>.srt``
            are the files checked afterwards.
        language: Language code passed via ``--language``. Defaults to
            ``"ro"``, the value that was previously hard-coded.

    Returns:
        True when whisper.cpp exits with status 0 and a non-empty ``.txt``
        transcript exists. False on a non-zero exit, a missing/empty
        transcript, a timeout, a missing binary, or any other error; failures
        are logged rather than raised.
    """
    cmd = [
        WHISPER_BIN,
        "--model", WHISPER_MODEL,
        "--language", language,
        "--no-gpu",
        "--threads", str(os.cpu_count() or 4),
        "--beam-size", "1",
        "--best-of", "1",
        "--max-context", "0",     # don't carry context between segments (prevents hallucination loops)
        "--entropy-thold", "2.4", # reject high-entropy (hallucinated) segments
        "--max-len", "60",        # shorter segments reduce drift
        "--suppress-nst",         # suppress non-speech tokens (reduces hallucination on silence)
        "--no-fallback",          # don't retry with higher temperature
        "--output-txt",
        "--output-srt",
        "--output-file", output_base,
        "--file", audio_path,
    ]

    log.info(f"  CMD: {' '.join(cmd)}")

    try:
        # Add whisper.exe's directory to PATH so Windows finds its DLLs
        env = os.environ.copy()
        whisper_dir = str(Path(WHISPER_BIN).resolve().parent)
        env["PATH"] = whisper_dir + os.pathsep + env.get("PATH", "")

        # whisper.cpp output is streamed straight to this process's
        # stdout/stderr so progress is visible during long runs.
        result = subprocess.run(
            cmd,
            stdout=sys.stdout,
            stderr=sys.stderr,
            timeout=7200,  # 2 hour timeout per file
            env=env,
        )

        if result.returncode != 0:
            log.error(f"  whisper.cpp failed (exit {result.returncode})")
            return False

        # Verify output exists and is non-empty
        txt_path = Path(f"{output_base}.txt")
        srt_path = Path(f"{output_base}.srt")

        if not txt_path.exists() or txt_path.stat().st_size == 0:
            log.error(f"  Empty or missing transcript: {txt_path}")
            return False

        log.info(f"  Output: {txt_path.name} ({txt_path.stat().st_size} bytes)")
        if srt_path.exists():
            log.info(f"  Output: {srt_path.name} ({srt_path.stat().st_size} bytes)")

        return True

    except subprocess.TimeoutExpired:
        log.error(f"  Timeout (>2h) for {audio_path}")
        return False
    except FileNotFoundError:
        log.error(f"  whisper.cpp not found at: {WHISPER_BIN}")
        log.error("  Set WHISPER_BIN env var or put whisper-cli.exe in PATH")
        return False
    except Exception as e:
        # Batch boundary: one bad file must not stop the run, but keep the
        # traceback in the log so the cause can be diagnosed afterwards.
        log.exception(f"  Error: {e}")
        return False


def parse_module_filter(arg: str) -> set[int]:
    """
    Parse a module filter such as '1-3', '4,5' or '1-3,5' into a set of 1-based indices.

    Comma-separated parts are either a single number or an inclusive range
    ``A-B``. Whitespace around parts is ignored, empty parts (e.g. from a
    trailing comma) are skipped, and a reversed range such as ``3-1`` is
    treated the same as ``1-3`` instead of silently selecting nothing.

    Args:
        arg: The raw filter text.

    Returns:
        The set of selected module indices.

    Raises:
        ValueError: If a part is neither an integer nor an ``A-B`` range.
    """
    selected: set[int] = set()
    for part in arg.split(","):
        part = part.strip()
        if not part:
            continue
        try:
            if "-" in part:
                low_text, high_text = part.split("-", 1)
                low, high = int(low_text), int(high_text)
                if low > high:
                    low, high = high, low
                selected.update(range(low, high + 1))
            else:
                selected.add(int(part))
        except ValueError:
            raise ValueError(
                f"Invalid module filter part {part!r}: expected N or N-M"
            ) from None
    return selected


def _lecture_output_base(lec: dict) -> str:
    """Return the transcript path (without extension) for a manifest lecture entry."""
    stem = Path(lec["original_filename"]).stem.replace(" [Audio]", "")
    return str(TRANSCRIPTS_DIR / stem)


def main():
    """
    Transcribe every downloaded lecture listed in the manifest.

    Command line:
        --modules SPEC   Only process the given 1-based module indices
                         (e.g. ``1-3,5``); all other modules are skipped.

    For each lecture whose ``download_status`` is ``"complete"`` the audio is
    converted to WAV and transcribed, unless a non-empty ``.txt`` transcript
    already exists. ``transcribe_status`` is updated in the manifest, which is
    saved after every processed file so an interrupted run can be resumed.

    Exits with status 1 when the manifest is missing or any transcription
    failed, and with status 2 when ``--modules`` is given without a usable
    value.
    """
    if not MANIFEST_PATH.exists():
        log.error("manifest.json not found. Run download.py first.")
        sys.exit(1)

    # Parse --modules filter
    module_filter = None
    if "--modules" in sys.argv:
        idx = sys.argv.index("--modules")
        if idx + 1 >= len(sys.argv):
            # Previously ignored silently, which transcribed every module.
            log.error("--modules requires a value, e.g. --modules 1-3,5")
            sys.exit(2)
        try:
            module_filter = parse_module_filter(sys.argv[idx + 1])
        except ValueError as exc:
            log.error(f"Invalid --modules value {sys.argv[idx + 1]!r}: {exc}")
            sys.exit(2)
        log.info(f"Module filter: {sorted(module_filter)}")

    manifest = load_manifest()
    TRANSCRIPTS_DIR.mkdir(exist_ok=True)

    total = 0
    transcribed = 0
    skipped = 0
    failed = 0

    for mod_idx, mod in enumerate(manifest["modules"], 1):
        # "is not None": an empty filter set must select nothing rather than
        # fall through to processing every module.
        if module_filter is not None and mod_idx not in module_filter:
            log.info(f"\nSkipping module {mod_idx}: {mod['name']}")
            continue
        log.info(f"\n{'='*60}")
        log.info(f"Module: {mod['name']}")
        log.info(f"{'='*60}")

        for lec in mod["lectures"]:
            total += 1

            if lec.get("download_status") != "complete":
                log.warning(f"  Skipping (not downloaded): {lec['title']}")
                continue

            audio_path = lec["audio_path"]
            output_base = _lecture_output_base(lec)
            txt_path = Path(f"{output_base}.txt")

            # Check if already transcribed
            if txt_path.exists() and txt_path.stat().st_size > 0:
                lec["transcribe_status"] = "complete"
                # Keep any transcript_path already stored in the manifest;
                # only fill it in when absent.
                lec.setdefault("transcript_path", str(txt_path))
                skipped += 1
                log.info(f"  Skipping (exists): {txt_path.name}")
                continue

            log.info(f"  Transcribing: {lec['title']}")
            log.info(f"  File: {audio_path}")

            # Convert MP3 -> WAV 16kHz mono for reliable whisper.cpp input
            wav_path = convert_to_wav(audio_path)

            if transcribe_file(wav_path, output_base):
                lec["transcribe_status"] = "complete"
                lec.setdefault("transcript_path", str(txt_path))
                transcribed += 1
            else:
                lec["transcribe_status"] = "failed"
                failed += 1

            # Save manifest after each file (checkpoint)
            save_manifest(manifest)

        # Log milestone after first module (no longer pauses)
        if mod_idx == 1 and transcribed > 0:
            log.info(f"First module complete ({transcribed} files). Continuing automatically...")

    # Persist status changes (including "skipped, already exists" entries)
    # before running the validation pass.
    save_manifest(manifest)

    # Validation: every lecture marked complete must have a non-empty
    # transcript on disk. Entries without a stored transcript_path fall back
    # to the path this script would have written.
    empty_outputs = []
    for mod in manifest["modules"]:
        for lec in mod["lectures"]:
            if lec.get("transcribe_status") != "complete":
                continue
            transcript = Path(
                lec.get("transcript_path") or f"{_lecture_output_base(lec)}.txt"
            )
            if not transcript.exists() or transcript.stat().st_size == 0:
                empty_outputs.append(lec["title"])

    log.info("\n" + "=" * 60)
    log.info(f"Transcribed {transcribed}/{total} files, {skipped} skipped, {failed} failures.")
    log.info(f"No empty outputs: {'PASS' if not empty_outputs else 'FAIL'}")
    for t in empty_outputs:
        log.error(f"  Missing transcript: {t}")
    log.info("=" * 60)

    if failed:
        sys.exit(1)


# Run the batch only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()