refactor: parametrize pipeline cu --course flag + suport Vimeo/text

Un singur set de scripturi acum rulează pe orice curs configurat în
courses.py. Master rămâne la rădăcina repo (backward-compat M1-M6);
cursuri noi (ex. practitioner la shop.cursnlp.ro) primesc un root
dedicat (nlp-practitioner/) cu propriile artefacte.

- courses.py: config dict (master, practitioner) + course_paths() +
  validate_manifest_course() (manifest fără course_key = master).
- download.py: --course + --modules; trei tipuri de lecții (audio HTTP,
  Vimeo iframe via yt-dlp audio-only, text-only cu captură HTML);
  merge cu manifest existent în loc de replace; strip [Audio] pentru
  backward-compat paths.
- transcribe.py: --course + --modules; skip type==text; path-uri prin
  course_paths(); validare course_key.
- summarize.py: --course + --compile; template prompt folosește
  course['name']; scrie SUPORT_CURS.md cu LF explicit (WSL2 baseline).
- md_to_pdf.py: --course resolv-ă summaries_dir / pdf_dir per curs.
- run.bat: detectează master|practitioner ca primul argument,
  propagă --course la sub-scripturi; backward-compat run.bat [modules].
- requirements.txt: + yt-dlp.
- .gitignore: nlp-practitioner/audio/, audio_wav/, scratch_recon.py, tmp_recon/.
- tests/test_regression.sh: 5 gate-uri read-only (import, schema,
  disk-coherence, SUPORT_CURS byte-identic, cross-course isolation).

Regression curs master: PASS (manifest + SUPORT_CURS.md hash
identic cu baseline /tmp/suport_before.md).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-22 14:33:19 +03:00
parent ada00e380d
commit d22038d002
9 changed files with 1192 additions and 795 deletions

View File

@@ -1,296 +1,279 @@
"""
Batch transcription using whisper.cpp.
Reads manifest.json, transcribes each audio file in module order,
outputs .txt and .srt files, updates manifest status.
Resumable: skips files with existing transcripts.
Converts MP3 -> WAV (16kHz mono) via ffmpeg before transcription.
"""
import json
import logging
import os
import shutil
import subprocess
import sys
from pathlib import Path
# Default locations, all relative to the current working directory.
MANIFEST_PATH = Path("manifest.json")
TRANSCRIPTS_DIR = Path("transcripts")
WAV_CACHE_DIR = Path("audio_wav")

# whisper.cpp binary and model; override with the WHISPER_BIN / WHISPER_MODEL
# environment variables.
WHISPER_BIN = os.getenv("WHISPER_BIN", r"whisper-cli.exe")
WHISPER_MODEL = os.getenv("WHISPER_MODEL", r"models\ggml-medium-q5_0.bin")

# Log to the console and to a file. The file handler is opened as UTF-8
# explicitly: otherwise the platform default encoding is used, and a record
# containing characters that encoding cannot represent (e.g. lecture titles
# with diacritics) fails to be written.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("transcribe_errors.log", encoding="utf-8"),
    ],
)
log = logging.getLogger(__name__)
def find_ffmpeg() -> str:
    """Locate an ffmpeg executable.

    Lookup order: ``ffmpeg`` on PATH (returned as the bare command name), then
    ``ffmpeg.exe`` / ``ffmpeg-bin/ffmpeg.exe`` relative to the current
    directory (returned as an absolute path), then the binary provided by the
    optional ``imageio_ffmpeg`` package.

    Returns:
        The command or path to run, or an empty string when nothing was found.
    """
    on_path = shutil.which("ffmpeg")
    if on_path:
        return "ffmpeg"

    # Local copies next to the working directory.
    local_candidates = (Path("ffmpeg.exe"), Path("ffmpeg-bin/ffmpeg.exe"))
    local_hit = next((c for c in local_candidates if c.exists()), None)
    if local_hit is not None:
        return str(local_hit.resolve())

    # Last resort: the pip-installable imageio-ffmpeg package, if present.
    try:
        import imageio_ffmpeg
        located = imageio_ffmpeg.get_ffmpeg_exe()
    except ImportError:
        located = ""
    return located
def _discard_partial(path: Path) -> None:
    """Best-effort removal of an unfinished ffmpeg output file."""
    try:
        path.unlink(missing_ok=True)
    except OSError as exc:
        # Harmless if it stays: only the final cache name is ever reused.
        log.warning(f" Could not remove partial file {path}: {exc}")


def convert_to_wav(audio_path: str) -> str:
    """
    Convert an audio file to WAV 16kHz mono (the input whisper.cpp is fed here).

    The result is cached in WAV_CACHE_DIR under ``<stem>.wav``; an existing
    non-empty cache file is reused. ffmpeg writes to a temporary
    ``<stem>.partial.wav`` that is renamed to the cache name only after a
    successful conversion, so a timed-out or failed run can never leave a
    truncated file that a later run would mistake for a cache hit.

    Args:
        audio_path: Path of the source audio file.

    Returns:
        Path of the WAV file, or ``audio_path`` unchanged when the source is
        already a ``.wav``, ffmpeg is unavailable, or the conversion fails.
    """
    src = Path(audio_path)

    # Already a WAV file, nothing to do.
    if src.suffix.lower() == ".wav":
        return audio_path

    WAV_CACHE_DIR.mkdir(exist_ok=True)
    wav_path = WAV_CACHE_DIR / (src.stem + ".wav")

    # Reuse a previous conversion.
    if wav_path.exists() and wav_path.stat().st_size > 0:
        log.info(f" WAV cache hit: {wav_path}")
        return str(wav_path)

    ffmpeg = find_ffmpeg()
    if not ffmpeg:
        log.warning(" ffmpeg not found, using original file (may cause bad transcription)")
        return audio_path

    log.info(f" Converting to WAV: {src.name} -> {wav_path.name}")
    # Keep a .wav extension on the temporary name so ffmpeg still infers the
    # output container from it.
    partial_path = WAV_CACHE_DIR / (src.stem + ".partial.wav")
    cmd = [
        ffmpeg,
        "-i", audio_path,
        "-vn",                   # no video
        "-acodec", "pcm_s16le",  # 16-bit PCM
        "-ar", "16000",          # 16kHz sample rate (whisper standard)
        "-ac", "1",              # mono
        "-y",                    # overwrite
        str(partial_path),
    ]
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            errors="replace",  # undecodable bytes in ffmpeg output must not raise
            timeout=300,       # 5 min max for conversion
        )
    except FileNotFoundError:
        log.warning(f" ffmpeg not found at: {ffmpeg}")
        return audio_path
    except subprocess.TimeoutExpired:
        log.error(f" ffmpeg conversion timeout for {audio_path}")
        _discard_partial(partial_path)
        return audio_path

    if result.returncode != 0:
        log.error(f" ffmpeg failed: {result.stderr[:300]}")
        _discard_partial(partial_path)
        return audio_path
    if not partial_path.exists() or partial_path.stat().st_size == 0:
        log.error(f" ffmpeg produced no output for {audio_path}")
        _discard_partial(partial_path)
        return audio_path

    # Publish the finished file under its cache name.
    partial_path.replace(wav_path)
    log.info(f" WAV ready: {wav_path.name} ({wav_path.stat().st_size / 1_048_576:.0f} MB)")
    return str(wav_path)
def load_manifest() -> dict:
    """Read the UTF-8 JSON file at MANIFEST_PATH and return the parsed content."""
    raw_text = MANIFEST_PATH.read_text(encoding="utf-8")
    return json.loads(raw_text)
def save_manifest(manifest: dict):
    """Write *manifest* to MANIFEST_PATH as indented UTF-8 JSON, atomically.

    The JSON is written to a sibling ``<name>.tmp`` file and then renamed over
    the real manifest. This function is used as a checkpoint, so an
    interruption or serialization error must not leave a truncated manifest
    behind: the previous file stays intact until the new one is complete.

    Args:
        manifest: JSON-serializable manifest data.

    Raises:
        TypeError: If *manifest* contains values json cannot serialize.
        OSError: If the file cannot be written or renamed.
    """
    tmp_path = MANIFEST_PATH.with_name(MANIFEST_PATH.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)
        tmp_path.replace(MANIFEST_PATH)
    except BaseException:
        # Drop the half-written temp file; the existing manifest is untouched.
        tmp_path.unlink(missing_ok=True)
        raise
def transcribe_file(audio_path: str, output_base: str) -> bool:
    """
    Run whisper.cpp on one audio file and check that a transcript was written.

    Args:
        audio_path: Audio file passed to whisper.cpp via ``--file``.
        output_base: Output path without extension; ``.txt`` and ``.srt``
            outputs are requested for it.

    Returns:
        True when whisper.cpp exits with code 0 and ``<output_base>.txt``
        exists and is non-empty; False on any failure (all failures are logged).
    """
    decoding_flags = [
        "--model", WHISPER_MODEL,
        "--language", "ro",
        "--no-gpu",
        "--threads", str(os.cpu_count() or 4),
        "--beam-size", "1",
        "--best-of", "1",
        "--max-context", "0",      # don't carry context between segments (prevents hallucination loops)
        "--entropy-thold", "2.4",  # reject high-entropy (hallucinated) segments
        "--max-len", "60",         # shorter segments reduce drift
        "--suppress-nst",          # suppress non-speech tokens (reduces hallucination on silence)
        "--no-fallback",           # don't retry with higher temperature
    ]
    io_flags = [
        "--output-txt",
        "--output-srt",
        "--output-file", output_base,
        "--file", audio_path,
    ]
    cmd = [WHISPER_BIN, *decoding_flags, *io_flags]
    log.info(f" CMD: {' '.join(cmd)}")

    try:
        # Put the whisper binary's directory first on PATH so Windows finds its DLLs.
        child_env = dict(os.environ)
        binary_dir = str(Path(WHISPER_BIN).resolve().parent)
        child_env["PATH"] = os.pathsep.join((binary_dir, child_env.get("PATH", "")))

        proc = subprocess.run(
            cmd,
            stdout=sys.stdout,
            stderr=sys.stderr,
            timeout=7200,  # 2 hour timeout per file
            env=child_env,
        )
        if proc.returncode != 0:
            log.error(f" whisper.cpp failed (exit {proc.returncode})")
            return False

        # The run only counts if a non-empty .txt transcript exists.
        transcript = Path(f"{output_base}.txt")
        subtitles = Path(f"{output_base}.srt")
        if not (transcript.exists() and transcript.stat().st_size != 0):
            log.error(f" Empty or missing transcript: {transcript}")
            return False

        log.info(f" Output: {transcript.name} ({transcript.stat().st_size} bytes)")
        if subtitles.exists():
            log.info(f" Output: {subtitles.name} ({subtitles.stat().st_size} bytes)")
        return True
    except subprocess.TimeoutExpired:
        log.error(f" Timeout (>2h) for {audio_path}")
    except FileNotFoundError:
        log.error(f" whisper.cpp not found at: {WHISPER_BIN}")
        log.error(" Set WHISPER_BIN env var or put whisper-cli.exe in PATH")
    except Exception as e:
        log.error(f" Error: {e}")
    return False
def parse_module_filter(arg: str) -> set[int]:
    """Parse a module filter like '1-3' or '4,5' or '1-3,5' into a set of 1-based indices.

    Empty tokens (e.g. a trailing comma or ``"1,,3"``) are ignored, and a
    reversed range such as ``"3-1"`` is read as ``1-3``. Without this, a
    reversed range produced an empty set, which callers treat as "no filter".

    Args:
        arg: Comma-separated module numbers and inclusive ``a-b`` ranges.

    Returns:
        The set of selected module indices (never empty).

    Raises:
        ValueError: If a token is not an integer or range, or if the filter
            selects nothing at all.
    """
    selected: set[int] = set()
    for token in arg.split(","):
        token = token.strip()
        if not token:
            continue
        if "-" in token:
            first_text, last_text = token.split("-", 1)
            first, last = int(first_text), int(last_text)
            if first > last:
                first, last = last, first
            selected.update(range(first, last + 1))
        else:
            selected.add(int(token))
    if not selected:
        raise ValueError(f"empty module filter: {arg!r}")
    return selected
def _transcript_missing(lec: dict, transcripts_dir: Path) -> bool:
    """Return True when no transcript file can be found on disk for *lec*."""
    recorded = lec.get("transcript_path")
    if recorded:
        return not Path(recorded).exists()
    # No path recorded in the manifest: check where this script writes its
    # output instead of raising on the missing key.
    original = lec.get("original_filename")
    if not original:
        return True
    stem = Path(original).stem.replace(" [Audio]", "")
    return not Path(f"{transcripts_dir / stem}.txt").exists()


def main():
    """Transcribe every downloaded lecture listed in the manifest.

    Walks the manifest modules in order (optionally limited by
    ``--modules``), skips lectures that are not downloaded or already have a
    non-empty transcript, converts the rest to WAV and runs whisper.cpp on
    them. The manifest is saved after each processed file so a run can be
    resumed. Exits with status 1 when the manifest is missing or any
    transcription failed, and with status 2 on an invalid ``--modules`` value.
    """
    if not MANIFEST_PATH.exists():
        log.error("manifest.json not found. Run download.py first.")
        sys.exit(1)

    # Parse --modules filter
    module_filter = None
    if "--modules" in sys.argv:
        idx = sys.argv.index("--modules")
        if idx + 1 < len(sys.argv):
            try:
                module_filter = parse_module_filter(sys.argv[idx + 1])
            except ValueError as exc:
                log.error(f"Invalid --modules value {sys.argv[idx + 1]!r}: {exc}")
                sys.exit(2)
            log.info(f"Module filter: {sorted(module_filter)}")
        else:
            # Kept non-fatal so existing wrappers that pass a bare flag still run.
            log.warning("--modules given without a value; processing all modules")

    manifest = load_manifest()
    TRANSCRIPTS_DIR.mkdir(exist_ok=True)

    total = 0
    transcribed = 0
    skipped = 0
    failed = 0

    for mod_idx, mod in enumerate(manifest["modules"], 1):
        if module_filter and mod_idx not in module_filter:
            log.info(f"\nSkipping module {mod_idx}: {mod['name']}")
            continue

        log.info(f"\n{'='*60}")
        log.info(f"Module: {mod['name']}")
        log.info(f"{'='*60}")

        for lec in mod["lectures"]:
            total += 1

            if lec.get("download_status") != "complete":
                log.warning(f" Skipping (not downloaded): {lec['title']}")
                continue

            audio_path = lec["audio_path"]
            stem = Path(lec["original_filename"]).stem.replace(" [Audio]", "")
            output_base = str(TRANSCRIPTS_DIR / stem)

            # Check if already transcribed
            txt_path = Path(f"{output_base}.txt")
            if txt_path.exists() and txt_path.stat().st_size > 0:
                lec["transcribe_status"] = "complete"
                skipped += 1
                log.info(f" Skipping (exists): {stem}.txt")
                continue

            log.info(f" Transcribing: {lec['title']}")
            log.info(f" File: {audio_path}")

            # Convert MP3 -> WAV 16kHz mono for reliable whisper.cpp input
            wav_path = convert_to_wav(audio_path)

            if transcribe_file(wav_path, output_base):
                lec["transcribe_status"] = "complete"
                transcribed += 1
            else:
                lec["transcribe_status"] = "failed"
                failed += 1

            # Save manifest after each file (checkpoint)
            save_manifest(manifest)

        # Log milestone after first module (no longer pauses)
        if mod_idx == 1 and transcribed > 0:
            log.info(f"First module complete ({transcribed} files). Continuing automatically...")

    # Validation: every lecture marked complete must have a transcript on disk.
    empty_outputs = [
        lec["title"]
        for mod in manifest["modules"]
        for lec in mod["lectures"]
        if lec.get("transcribe_status") == "complete"
        and _transcript_missing(lec, TRANSCRIPTS_DIR)
    ]

    log.info("\n" + "=" * 60)
    log.info(f"Transcribed {transcribed}/{total} files, {skipped} skipped, {failed} failures.")
    log.info(f"No empty outputs: {'PASS' if not empty_outputs else 'FAIL'}")
    if empty_outputs:
        for t in empty_outputs:
            log.error(f" Missing transcript: {t}")
    log.info("=" * 60)

    save_manifest(manifest)

    if failed:
        sys.exit(1)


if __name__ == "__main__":
    main()
"""
Batch transcription using whisper.cpp.
Reads <root>/manifest.json, transcribes each audio file in module order,
outputs .txt and .srt files, updates manifest status.
Resumable: skips files with existing transcripts.
Converts MP3 -> WAV (16kHz mono) via ffmpeg before transcription.
Text lectures (type=="text") are skipped — their transcript files are
written directly by download.py.
"""
import argparse
import json
import logging
import os
import shutil
import subprocess
import sys
from pathlib import Path
from courses import course_paths, get_course, validate_manifest_course
# whisper.cpp binary and model; override with the WHISPER_BIN / WHISPER_MODEL
# environment variables. Shared across courses (same model + binary).
WHISPER_BIN = os.getenv("WHISPER_BIN", r"whisper-cli.exe")
WHISPER_MODEL = os.getenv("WHISPER_MODEL", r"models\ggml-medium-q5_0.bin")

# Log to the console and to a file. The file handler is opened as UTF-8
# explicitly: otherwise the platform default encoding is used, and a record
# containing characters that encoding cannot represent (e.g. lecture titles
# with diacritics) fails to be written.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("transcribe_errors.log", encoding="utf-8"),
    ],
)
log = logging.getLogger(__name__)
def find_ffmpeg() -> str:
    """Return a runnable ffmpeg command/path, or "" when none is available.

    ``ffmpeg`` on PATH wins (the bare name is returned). Otherwise a local
    ``ffmpeg.exe`` or ``ffmpeg-bin/ffmpeg.exe`` under the current directory is
    returned as an absolute path, and finally the binary provided by the
    optional ``imageio_ffmpeg`` package.
    """
    if shutil.which("ffmpeg") is None:
        for relative in ("ffmpeg.exe", "ffmpeg-bin/ffmpeg.exe"):
            candidate = Path(relative)
            if candidate.exists():
                return str(candidate.resolve())
        # pip fallback; a missing package simply means "not found".
        try:
            import imageio_ffmpeg
            return imageio_ffmpeg.get_ffmpeg_exe()
        except ImportError:
            return ""
    return "ffmpeg"
def _discard_partial(path: Path) -> None:
    """Best-effort removal of an unfinished ffmpeg output file."""
    try:
        path.unlink(missing_ok=True)
    except OSError as exc:
        # Harmless if it stays: only the final cache name is ever reused.
        log.warning(f" Could not remove partial file {path}: {exc}")


def convert_to_wav(audio_path: str, wav_cache_dir: Path) -> str:
    """Convert an audio file to 16 kHz mono 16-bit PCM WAV, cached per course.

    The result is stored in *wav_cache_dir* as ``<stem>.wav``; an existing
    non-empty cache file is reused. ffmpeg writes to a temporary
    ``<stem>.partial.wav`` that is renamed to the cache name only after a
    successful conversion, so a timed-out or failed run can never leave a
    truncated file that a later run would mistake for a cache hit.

    Args:
        audio_path: Path of the source audio file.
        wav_cache_dir: Directory holding converted WAV files (created if needed).

    Returns:
        Path of the WAV file, or ``audio_path`` unchanged when the source is
        already a ``.wav``, ffmpeg is unavailable, or the conversion fails.
    """
    src = Path(audio_path)
    if src.suffix.lower() == ".wav":
        return audio_path

    wav_cache_dir.mkdir(parents=True, exist_ok=True)
    wav_path = wav_cache_dir / (src.stem + ".wav")
    if wav_path.exists() and wav_path.stat().st_size > 0:
        log.info(f" WAV cache hit: {wav_path}")
        return str(wav_path)

    ffmpeg = find_ffmpeg()
    if not ffmpeg:
        log.warning(" ffmpeg not found, using original file (may cause bad transcription)")
        return audio_path

    log.info(f" Converting to WAV: {src.name} -> {wav_path.name}")
    # Keep a .wav extension on the temporary name so ffmpeg still infers the
    # output container from it.
    partial_path = wav_cache_dir / (src.stem + ".partial.wav")
    cmd = [
        ffmpeg, "-i", audio_path,
        "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
        "-y", str(partial_path),
    ]
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            errors="replace",  # undecodable bytes in ffmpeg output must not raise
            timeout=300,
        )
    except FileNotFoundError:
        log.warning(f" ffmpeg not found at: {ffmpeg}")
        return audio_path
    except subprocess.TimeoutExpired:
        log.error(f" ffmpeg conversion timeout for {audio_path}")
        _discard_partial(partial_path)
        return audio_path

    if result.returncode != 0:
        log.error(f" ffmpeg failed: {result.stderr[:300]}")
        _discard_partial(partial_path)
        return audio_path
    if not partial_path.exists() or partial_path.stat().st_size == 0:
        log.error(f" ffmpeg produced no output for {audio_path}")
        _discard_partial(partial_path)
        return audio_path

    # Publish the finished file under its cache name.
    partial_path.replace(wav_path)
    log.info(f" WAV ready: {wav_path.name} ({wav_path.stat().st_size / 1_048_576:.0f} MB)")
    return str(wav_path)
def load_manifest(manifest_path: Path) -> dict:
    """Read the UTF-8 JSON manifest at *manifest_path* and return the parsed content."""
    raw_text = Path(manifest_path).read_text(encoding="utf-8")
    return json.loads(raw_text)
def save_manifest(manifest: dict, manifest_path: Path):
    """Write *manifest* to *manifest_path* as indented UTF-8 JSON, atomically.

    The JSON is written to a sibling ``<name>.tmp`` file and then renamed over
    the real manifest. This function is used as a checkpoint, so an
    interruption or serialization error must not leave a truncated manifest
    behind: the previous file stays intact until the new one is complete.

    Args:
        manifest: JSON-serializable manifest data.
        manifest_path: Destination file; parent directories are created.

    Raises:
        TypeError: If *manifest* contains values json cannot serialize.
        OSError: If the file cannot be written or renamed.
    """
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = manifest_path.with_name(manifest_path.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)
        tmp_path.replace(manifest_path)
    except BaseException:
        # Drop the half-written temp file; the existing manifest is untouched.
        tmp_path.unlink(missing_ok=True)
        raise
def transcribe_file(audio_path: str, output_base: str) -> bool:
    """Transcribe one audio file with whisper.cpp.

    Args:
        audio_path: Audio file passed to whisper.cpp via ``--file``.
        output_base: Output path without extension; ``.txt`` and ``.srt``
            outputs are requested for it.

    Returns:
        True when whisper.cpp exits with code 0 and ``<output_base>.txt``
        exists and is non-empty; False otherwise. Failures are logged, not raised.
    """
    cmd = [WHISPER_BIN]
    cmd += ["--model", WHISPER_MODEL, "--language", "ro", "--no-gpu"]
    cmd += ["--threads", str(os.cpu_count() or 4)]
    cmd += ["--beam-size", "1", "--best-of", "1", "--max-context", "0"]
    cmd += ["--entropy-thold", "2.4", "--max-len", "60"]
    cmd += ["--suppress-nst", "--no-fallback", "--output-txt", "--output-srt"]
    cmd += ["--output-file", output_base, "--file", audio_path]
    log.info(f" CMD: {' '.join(cmd)}")

    try:
        # The binary's own directory goes first on PATH for the child process.
        run_env = dict(os.environ)
        binary_dir = str(Path(WHISPER_BIN).resolve().parent)
        run_env["PATH"] = os.pathsep.join((binary_dir, run_env.get("PATH", "")))

        completed = subprocess.run(
            cmd, stdout=sys.stdout, stderr=sys.stderr, timeout=7200, env=run_env
        )
        exit_code = completed.returncode
        if exit_code != 0:
            log.error(f" whisper.cpp failed (exit {exit_code})")
            return False

        txt_out = Path(f"{output_base}.txt")
        srt_out = Path(f"{output_base}.srt")
        if not (txt_out.exists() and txt_out.stat().st_size != 0):
            log.error(f" Empty or missing transcript: {txt_out}")
            return False

        log.info(f" Output: {txt_out.name} ({txt_out.stat().st_size} bytes)")
        if srt_out.exists():
            log.info(f" Output: {srt_out.name} ({srt_out.stat().st_size} bytes)")
        return True
    except FileNotFoundError:
        log.error(f" whisper.cpp not found at: {WHISPER_BIN}")
        log.error(" Set WHISPER_BIN env var or put whisper-cli.exe in PATH")
        return False
    except subprocess.TimeoutExpired:
        log.error(f" Timeout (>2h) for {audio_path}")
        return False
    except Exception as e:
        log.error(f" Error: {e}")
        return False
def parse_module_filter(arg: str) -> set[int]:
    """Parse a module filter such as '1-3', '4,5' or '1-3,5' into 1-based indices.

    Empty tokens (e.g. a trailing comma or ``"1,,3"``) are ignored, and a
    reversed range such as ``"3-1"`` is read as ``1-3``. Without this, a
    reversed range produced an empty set, which callers treat as "no filter".

    Args:
        arg: Comma-separated module numbers and inclusive ``a-b`` ranges.

    Returns:
        The set of selected module indices (never empty).

    Raises:
        ValueError: If a token is not an integer or range, or if the filter
            selects nothing at all.
    """
    selected: set[int] = set()
    for token in arg.split(","):
        token = token.strip()
        if not token:
            continue
        if "-" in token:
            first_text, last_text = token.split("-", 1)
            first, last = int(first_text), int(last_text)
            if first > last:
                first, last = last, first
            selected.update(range(first, last + 1))
        else:
            selected.add(int(token))
    if not selected:
        raise ValueError(f"empty module filter: {arg!r}")
    return selected
def parse_args(argv=None):
    """Parse the command-line options of the transcription script.

    Args:
        argv: Argument list to parse. ``None`` (the default) parses
            ``sys.argv[1:]`` exactly as before; passing a list allows the
            parser to be driven programmatically.

    Returns:
        An ``argparse.Namespace`` with ``course`` (default ``"master"``) and
        ``modules`` (default ``None``).
    """
    parser = argparse.ArgumentParser(description="Transcribe lecture audio for a course")
    parser.add_argument("--course", default="master", help="Course key (see courses.py)")
    parser.add_argument("--modules", default=None, help="Module filter, e.g. '1-3' or '2,4,5'")
    return parser.parse_args(argv)
def _transcript_missing(lec: dict, transcripts_dir: Path) -> bool:
    """Return True when no transcript file can be found on disk for *lec*."""
    recorded = lec.get("transcript_path")
    if recorded:
        return not Path(recorded).exists()
    # No path recorded in the manifest. Path("") would resolve to "." and
    # always "exist", so check where this script writes its output instead.
    original = lec.get("original_filename")
    if not original:
        return True
    stem = Path(original).stem.replace(" [Audio]", "")
    return not Path(f"{transcripts_dir / stem}.txt").exists()


def main():
    """Transcribe every downloaded audio lecture of the selected course.

    Resolves the course and its paths from ``--course``, loads and validates
    the course manifest, then walks the modules in order (optionally limited
    by ``--modules``). Text lectures are marked complete without running
    whisper; lectures that are not downloaded or already have a non-empty
    transcript are skipped; the rest are converted to WAV and transcribed.
    The manifest is saved after each processed file so a run can be resumed.
    Exits with status 1 when the manifest is missing or any transcription
    failed, and with status 2 on an invalid ``--modules`` value.
    """
    args = parse_args()
    course = get_course(args.course)
    paths = course_paths(course)

    if not paths["manifest"].exists():
        log.error(f"{paths['manifest']} not found. Run download.py --course {course['key']} first.")
        sys.exit(1)

    module_filter = None
    if args.modules:
        try:
            module_filter = parse_module_filter(args.modules)
        except ValueError as exc:
            log.error(f"Invalid --modules value {args.modules!r}: {exc}")
            sys.exit(2)
    if module_filter:
        log.info(f"Module filter: {sorted(module_filter)}")

    manifest = load_manifest(paths["manifest"])
    validate_manifest_course(manifest, course["key"])
    paths["transcripts_dir"].mkdir(parents=True, exist_ok=True)

    total = 0
    transcribed = 0
    skipped = 0
    failed = 0

    for mod_idx, mod in enumerate(manifest["modules"], 1):
        if module_filter and mod_idx not in module_filter:
            log.info(f"\nSkipping module {mod_idx}: {mod['name']}")
            continue

        log.info(f"\n{'='*60}")
        log.info(f"Module: {mod['name']}")
        log.info(f"{'='*60}")

        for lec in mod["lectures"]:
            total += 1

            # Text lectures bypass whisper — transcript written by download.py.
            if lec.get("type") == "text":
                lec["transcribe_status"] = "complete"
                skipped += 1
                log.info(f" Skipping text: {lec['title']}")
                continue

            if lec.get("download_status") != "complete":
                log.warning(f" Skipping (not downloaded): {lec['title']}")
                continue

            audio_path = lec["audio_path"]
            # Reuse the stem already recorded in the manifest for backward-compat
            # with M1-M6 paths (strips ' [Audio]' for aresens filenames).
            stem = Path(lec["original_filename"]).stem.replace(" [Audio]", "")
            output_base = str(paths["transcripts_dir"] / stem)

            txt_path = Path(f"{output_base}.txt")
            if txt_path.exists() and txt_path.stat().st_size > 0:
                lec["transcribe_status"] = "complete"
                skipped += 1
                log.info(f" Skipping (exists): {stem}.txt")
                continue

            log.info(f" Transcribing: {lec['title']}")
            log.info(f" File: {audio_path}")

            wav_path = convert_to_wav(audio_path, paths["wav_cache_dir"])

            if transcribe_file(wav_path, output_base):
                lec["transcribe_status"] = "complete"
                transcribed += 1
            else:
                lec["transcribe_status"] = "failed"
                failed += 1

            # Checkpoint after each processed file.
            save_manifest(manifest, paths["manifest"])

        if mod_idx == 1 and transcribed > 0:
            log.info(f"First module complete ({transcribed} files). Continuing automatically...")

    # Validation: every audio lecture marked complete must have a transcript on disk.
    empty_outputs = [
        lec["title"]
        for mod in manifest["modules"]
        for lec in mod["lectures"]
        if lec.get("transcribe_status") == "complete"
        and lec.get("type") != "text"
        and _transcript_missing(lec, paths["transcripts_dir"])
    ]

    log.info("\n" + "=" * 60)
    log.info(f"Transcribed {transcribed}/{total} files, {skipped} skipped, {failed} failures.")
    log.info(f"No empty outputs: {'PASS' if not empty_outputs else 'FAIL'}")
    if empty_outputs:
        for t in empty_outputs:
            log.error(f" Missing transcript: {t}")
    log.info("=" * 60)

    save_manifest(manifest, paths["manifest"])

    if failed:
        sys.exit(1)


if __name__ == "__main__":
    main()