NLP Master: pipeline download + transcribe + summarize
- run.bat: one-click pipeline (download, convert, transcribe) - download.py: fetch audio from course platform - transcribe.py: whisper.cpp batch transcription (CPU, WAV 16kHz) - MP3->WAV conversion via ffmpeg - --modules filter for splitting work across machines - summarize.py: generate summaries from transcripts - setup_whisper.py: auto-download whisper.cpp, ffmpeg, and model - Medium model (q5_0) instead of large to avoid VRAM crashes Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
299
transcribe.py
Normal file
299
transcribe.py
Normal file
@@ -0,0 +1,299 @@
|
||||
"""
|
||||
Batch transcription using whisper.cpp.
|
||||
Reads manifest.json, transcribes each audio file in module order,
|
||||
outputs .txt and .srt files, updates manifest status.
|
||||
Resumable: skips files with existing transcripts.
|
||||
Converts MP3 -> WAV (16kHz mono) via ffmpeg before transcription.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Working files and directories, all relative to the current working directory.
MANIFEST_PATH = Path("manifest.json")
TRANSCRIPTS_DIR = Path("transcripts")
WAV_CACHE_DIR = Path("audio_wav")

# whisper.cpp executable and model file. Each value can be overridden through
# the environment variable of the same name.
WHISPER_BIN = os.environ.get("WHISPER_BIN", "whisper-cli.exe")
WHISPER_MODEL = os.environ.get("WHISPER_MODEL", r"models\ggml-medium-q5_0.bin")
|
||||
|
||||
# Log to the console and to a file in the current working directory.
# The file handler is opened as UTF-8 explicitly: messages include lecture
# titles and file paths, which may contain non-ASCII characters that the
# platform default encoding (a legacy code page on Windows) cannot represent.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("transcribe_errors.log", encoding="utf-8"),
    ],
)
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def find_ffmpeg() -> str:
    """
    Locate an ffmpeg executable.

    Search order:
      1. ``ffmpeg`` on PATH,
      2. ``ffmpeg.exe`` or ``ffmpeg-bin/ffmpeg.exe`` relative to the current
         working directory,
      3. the binary provided by the optional ``imageio-ffmpeg`` package.

    Returns:
        The command name or path to run, or an empty string when no ffmpeg
        could be located.
    """
    if shutil.which("ffmpeg"):
        return "ffmpeg"

    # Check local directories
    for candidate in (Path("ffmpeg.exe"), Path("ffmpeg-bin/ffmpeg.exe")):
        if candidate.exists():
            return str(candidate.resolve())

    # Try imageio-ffmpeg (pip fallback)
    try:
        import imageio_ffmpeg
    except ImportError:
        return ""

    try:
        return imageio_ffmpeg.get_ffmpeg_exe()
    except RuntimeError:
        # The package is installed but could not find a usable binary.
        # Report "not found" like every other miss instead of crashing.
        return ""
|
||||
|
||||
|
||||
def _discard_partial(path: Path) -> None:
    """Best-effort removal of an incomplete ffmpeg output file."""
    try:
        path.unlink(missing_ok=True)
    except OSError:
        # Not fatal: the partial file uses a different name than the cache
        # entry, so it is not picked up as a converted WAV.
        log.debug(f" Could not remove partial WAV: {path}")


def convert_to_wav(audio_path: str) -> str:
    """
    Convert an audio file to WAV 16kHz mono 16-bit PCM for whisper.cpp.

    Converted files are cached in WAV_CACHE_DIR under the source file's stem;
    an existing non-empty cache entry is returned without running ffmpeg.
    ffmpeg writes to a temporary sibling file that is renamed into place only
    after a successful run, so a failed or timed-out conversion never leaves
    a truncated WAV behind to be mistaken for a cache hit on the next run.

    Args:
        audio_path: Path to the source audio file.

    Returns:
        Path to the WAV file on success. ``audio_path`` is returned unchanged
        when the input already has a ``.wav`` suffix, when ffmpeg cannot be
        found, or when the conversion fails or times out.
    """
    src = Path(audio_path)

    # Already a WAV file, skip
    if src.suffix.lower() == ".wav":
        return audio_path

    WAV_CACHE_DIR.mkdir(exist_ok=True)
    wav_path = WAV_CACHE_DIR / (src.stem + ".wav")

    # Skip if already converted
    if wav_path.exists() and wav_path.stat().st_size > 0:
        log.info(f" WAV cache hit: {wav_path}")
        return str(wav_path)

    ffmpeg = find_ffmpeg()
    if not ffmpeg:
        log.warning(" ffmpeg not found, using original file (may cause bad transcription)")
        return audio_path

    # Temporary output; keeps the .wav suffix so ffmpeg still infers the
    # container format from the file name.
    tmp_path = wav_path.with_name(src.stem + ".partial.wav")

    log.info(f" Converting to WAV: {src.name} -> {wav_path.name}")
    cmd = [
        ffmpeg,
        "-i", audio_path,
        "-vn",                   # no video
        "-acodec", "pcm_s16le",  # 16-bit PCM
        "-ar", "16000",          # 16kHz sample rate (whisper standard)
        "-ac", "1",              # mono
        "-y",                    # overwrite
        str(tmp_path),
    ]

    try:
        result = subprocess.run(
            cmd,
            stdin=subprocess.DEVNULL,  # ffmpeg must not consume console input
            capture_output=True,
            text=True,
            errors="replace",  # tolerate undecodable bytes in ffmpeg output
            timeout=300,  # 5 min max for conversion
        )
    except FileNotFoundError:
        log.warning(f" ffmpeg not found at: {ffmpeg}")
        return audio_path
    except subprocess.TimeoutExpired:
        log.error(f" ffmpeg conversion timeout for {audio_path}")
        _discard_partial(tmp_path)
        return audio_path

    if result.returncode != 0:
        # ffmpeg prints its banner first; the actual error is at the end.
        log.error(f" ffmpeg failed: {result.stderr[-300:]}")
        _discard_partial(tmp_path)
        return audio_path

    try:
        # Publish the finished file under its cache name in one step.
        tmp_path.replace(wav_path)
    except OSError as exc:
        log.error(f" Could not move converted WAV into cache: {exc}")
        _discard_partial(tmp_path)
        return audio_path

    log.info(f" WAV ready: {wav_path.name} ({wav_path.stat().st_size / 1_048_576:.0f} MB)")
    return str(wav_path)
|
||||
|
||||
|
||||
def load_manifest() -> dict:
    """Read MANIFEST_PATH as UTF-8 text and return the parsed JSON content."""
    raw_text = MANIFEST_PATH.read_text(encoding="utf-8")
    return json.loads(raw_text)
|
||||
|
||||
|
||||
def save_manifest(manifest: dict) -> None:
    """
    Write the manifest to MANIFEST_PATH as indented UTF-8 JSON.

    The data is first written to a temporary file next to the manifest and
    then renamed over it, so an interruption or a serialization error in the
    middle of a write cannot leave a truncated manifest behind.

    Args:
        manifest: The JSON-serializable manifest structure to persist.
    """
    tmp_path = MANIFEST_PATH.with_name(MANIFEST_PATH.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)
    # Replace the previous manifest only once the new one is fully written.
    tmp_path.replace(MANIFEST_PATH)
|
||||
|
||||
|
||||
def transcribe_file(audio_path: str, output_base: str, language: str = "ro") -> bool:
    """
    Run whisper.cpp on a single audio file.

    whisper.cpp is asked to write ``<output_base>.txt`` and
    ``<output_base>.srt``. Its stdout/stderr are passed through to this
    process's own streams.

    Args:
        audio_path: Path of the audio file to transcribe.
        output_base: Output path without extension.
        language: Language code passed to whisper.cpp's ``--language`` option.

    Returns:
        True when whisper.cpp exits with code 0 and the ``.txt`` output exists
        and is non-empty; False on any failure (non-zero exit, missing or
        empty transcript, timeout, executable not found, or another error).
    """
    cmd = [
        WHISPER_BIN,
        "--model", WHISPER_MODEL,
        "--language", language,
        "--no-gpu",
        "--threads", str(os.cpu_count() or 4),
        "--beam-size", "1",
        "--best-of", "1",
        "--output-txt",
        "--output-srt",
        "--output-file", output_base,
        "--file", audio_path,
    ]

    log.info(f" CMD: {' '.join(cmd)}")

    try:
        # Add whisper.exe's directory to PATH so Windows finds its DLLs
        env = os.environ.copy()
        whisper_dir = str(Path(WHISPER_BIN).resolve().parent)
        env["PATH"] = whisper_dir + os.pathsep + env.get("PATH", "")

        result = subprocess.run(
            cmd,
            stdout=sys.stdout,
            stderr=sys.stderr,
            timeout=7200,  # 2 hour timeout per file
            env=env,
        )

        if result.returncode != 0:
            log.error(f" whisper.cpp failed (exit {result.returncode})")
            return False

        # Verify output exists and is non-empty
        txt_path = Path(f"{output_base}.txt")
        srt_path = Path(f"{output_base}.srt")

        if not txt_path.exists() or txt_path.stat().st_size == 0:
            log.error(f" Empty or missing transcript: {txt_path}")
            return False

        log.info(f" Output: {txt_path.name} ({txt_path.stat().st_size} bytes)")
        if srt_path.exists():
            log.info(f" Output: {srt_path.name} ({srt_path.stat().st_size} bytes)")

        return True

    except subprocess.TimeoutExpired:
        log.error(f" Timeout (>2h) for {audio_path}")
        return False
    except FileNotFoundError:
        log.error(f" whisper.cpp not found at: {WHISPER_BIN}")
        log.error(" Set WHISPER_BIN env var or put whisper-cli.exe in PATH")
        return False
    except Exception as e:
        # Last-resort guard so one bad file does not abort the whole batch;
        # log the traceback so the cause is not lost.
        log.exception(f" Error: {e}")
        return False
|
||||
|
||||
|
||||
def parse_module_filter(arg: str) -> set[int]:
    """
    Parse a module filter like '1-3' or '4,5' or '1-3,5' into a set of
    1-based indices.

    Entries are comma-separated; each entry is either a single number or an
    inclusive ``start-end`` range. Whitespace around entries is ignored,
    empty entries (e.g. from a trailing comma) are skipped, and a reversed
    range such as '3-1' is treated the same as '1-3'.

    Args:
        arg: The raw filter string.

    Returns:
        The set of selected indices (empty if ``arg`` contains no entries).

    Raises:
        ValueError: If an entry is not a number or a ``start-end`` range.
    """
    selected: set[int] = set()
    for token in arg.split(","):
        token = token.strip()
        if not token:
            # Tolerate stray or trailing commas.
            continue
        if "-" in token:
            start_text, end_text = token.split("-", 1)
            start, end = int(start_text), int(end_text)
            if start > end:
                # A reversed range would otherwise silently select nothing.
                start, end = end, start
            selected.update(range(start, end + 1))
        else:
            selected.add(int(token))
    return selected
|
||||
|
||||
|
||||
def main() -> None:
    """
    Transcribe every downloaded lecture listed in the manifest.

    Walks the manifest's modules in order (optionally restricted with
    ``--modules``, e.g. ``--modules 1-3,5``), skips lectures that are not
    downloaded or already have a non-empty transcript, converts the audio to
    WAV, runs whisper.cpp, and records the outcome in each lecture's
    ``transcribe_status``. The manifest is saved after every processed file.
    After the first module the run pauses so transcripts can be spot-checked.

    Exits with status 1 when the manifest is missing, the ``--modules`` value
    is invalid or selects nothing, or at least one transcription failed.
    """
    if not MANIFEST_PATH.exists():
        log.error("manifest.json not found. Run download.py first.")
        sys.exit(1)

    # Parse --modules filter
    module_filter = None
    if "--modules" in sys.argv:
        idx = sys.argv.index("--modules")
        if idx + 1 < len(sys.argv):
            raw_filter = sys.argv[idx + 1]
            try:
                module_filter = parse_module_filter(raw_filter)
            except ValueError:
                log.error(f"Invalid --modules value: {raw_filter!r} (expected e.g. 1-3,5)")
                sys.exit(1)
            if not module_filter:
                # An empty selection must not fall through to "process everything".
                log.error(f"--modules value selects no modules: {raw_filter!r}")
                sys.exit(1)
            log.info(f"Module filter: {sorted(module_filter)}")
        else:
            log.warning("--modules given without a value; processing all modules")

    manifest = load_manifest()
    TRANSCRIPTS_DIR.mkdir(exist_ok=True)

    total = 0
    transcribed = 0
    skipped = 0
    failed = 0

    for mod_idx, mod in enumerate(manifest["modules"], 1):
        if module_filter is not None and mod_idx not in module_filter:
            log.info(f"\nSkipping module {mod_idx}: {mod['name']}")
            continue
        log.info(f"\n{'='*60}")
        log.info(f"Module: {mod['name']}")
        log.info(f"{'='*60}")

        for lec in mod["lectures"]:
            total += 1

            if lec.get("download_status") != "complete":
                log.warning(f" Skipping (not downloaded): {lec['title']}")
                continue

            audio_path = lec["audio_path"]
            stem = Path(lec["original_filename"]).stem.replace(" [Audio]", "")
            output_base = str(TRANSCRIPTS_DIR / stem)

            # Check if already transcribed
            txt_path = Path(f"{output_base}.txt")
            if txt_path.exists() and txt_path.stat().st_size > 0:
                lec["transcribe_status"] = "complete"
                # Record where the transcript lives unless the manifest
                # already says so; the validation step below reads this key.
                if not lec.get("transcript_path"):
                    lec["transcript_path"] = str(txt_path)
                skipped += 1
                log.info(f" Skipping (exists): {stem}.txt")
                continue

            log.info(f" Transcribing: {lec['title']}")
            log.info(f" File: {audio_path}")

            # Convert MP3 -> WAV 16kHz mono for reliable whisper.cpp input
            wav_path = convert_to_wav(audio_path)

            if transcribe_file(wav_path, output_base):
                lec["transcribe_status"] = "complete"
                if not lec.get("transcript_path"):
                    lec["transcript_path"] = str(txt_path)
                transcribed += 1
            else:
                lec["transcribe_status"] = "failed"
                failed += 1

            # Save manifest after each file (checkpoint)
            save_manifest(manifest)

        # Quality gate: pause after first module
        if mod_idx == 1 and transcribed > 0:
            log.info("\n" + "!" * 60)
            log.info("QUALITY GATE: First module complete.")
            log.info("Spot-check 2-3 transcripts in transcripts/ before continuing.")
            log.info("Press Enter to continue, or Ctrl+C to abort...")
            log.info("!" * 60)
            try:
                input()
            except EOFError:
                pass  # Non-interactive mode, continue

    # Validation: every lecture marked complete must have a transcript on disk.
    empty_outputs = [
        lec["title"]
        for mod in manifest["modules"]
        for lec in mod["lectures"]
        if lec.get("transcribe_status") == "complete"
        and not (lec.get("transcript_path") and Path(lec["transcript_path"]).exists())
    ]

    log.info("\n" + "=" * 60)
    log.info(f"Transcribed {transcribed}/{total} files, {skipped} skipped, {failed} failures.")
    log.info(f"No empty outputs: {'PASS' if not empty_outputs else 'FAIL'}")
    if empty_outputs:
        for t in empty_outputs:
            log.error(f" Missing transcript: {t}")
    log.info("=" * 60)

    save_manifest(manifest)

    if failed:
        sys.exit(1)
|
||||
|
||||
|
||||
# Script entry point: run the batch only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user