refactor: parametrize pipeline cu --course flag + suport Vimeo/text

Un singur set de scripturi acum rulează pe orice curs configurat în
courses.py. Master rămâne la rădăcina repo (backward-compat M1-M6);
cursuri noi (ex. practitioner la shop.cursnlp.ro) primesc un root
dedicat (nlp-practitioner/) cu propriile artefacte.

- courses.py: config dict (master, practitioner) + course_paths() +
  validate_manifest_course() (manifest fără course_key = master).
- download.py: --course + --modules; trei tipuri de lecții (audio HTTP,
  Vimeo iframe via yt-dlp audio-only, text-only cu captură HTML);
  merge cu manifest existent în loc de replace; strip [Audio] pentru
  backward-compat paths.
- transcribe.py: --course + --modules; skip type==text; path-uri prin
  course_paths(); validare course_key.
- summarize.py: --course + --compile; template prompt folosește
  course['name']; scrie SUPORT_CURS.md cu LF explicit (WSL2 baseline).
- md_to_pdf.py: --course rezolvă summaries_dir / pdf_dir per curs.
- run.bat: detectează master|practitioner ca primul argument,
  propagă --course la sub-scripturi; backward-compat run.bat [modules].
- requirements.txt: + yt-dlp.
- .gitignore: nlp-practitioner/audio/, audio_wav/, scratch_recon.py, tmp_recon/.
- tests/test_regression.sh: 5 gate-uri read-only (import, schema,
  disk-coherence, SUPORT_CURS byte-identic, cross-course isolation).

Regression curs master: PASS (manifest + SUPORT_CURS.md hash
identic cu baseline /tmp/suport_before.md).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-22 14:33:19 +03:00
parent ada00e380d
commit d22038d002
9 changed files with 1192 additions and 795 deletions

8
.gitignore vendored
View File

@@ -38,3 +38,11 @@ __pycache__/
# Logs # Logs
*.log *.log
# Second course (practitioner) — artifacts only, scripts partajate
nlp-practitioner/audio/
nlp-practitioner/audio_wav/
# Recon scratch
scratch_recon.py
tmp_recon/

80
courses.py Normal file
View File

@@ -0,0 +1,80 @@
"""
Shared course configuration for the NLP Master pipeline.
A single pipeline (download -> transcribe -> summarize -> pdf) runs on
multiple courses by passing --course <key>. Scripts resolve all artifact
paths against course["root"], so curs master remains in-place at repo root
and subsequent courses land in their own subdirectory.
"""
from pathlib import Path
# Registry of downloadable courses, keyed by the --course CLI value.
# Each entry holds the site endpoints, the .env credential variable names,
# and the filesystem root under which all artifact paths are resolved.
COURSES = {
    "master": {
        "name": "NLP Master Practitioner Bucuresti 2025",
        "base_url": "https://cursuri.aresens.ro",
        "course_path": "/curs/26",
        "login_path": "/login",
        # Credentials are read from these environment variables (see .env).
        "env_user": "COURSE_USERNAME",
        "env_pass": "COURSE_PASSWORD",
        # Curs master stays at repo root for backward-compat with M1-M6 outputs.
        "root": Path("."),
    },
    "practitioner": {
        "name": "NLP Practitioner (cursnlp.ro)",
        "base_url": "https://shop.cursnlp.ro",
        "course_path": "/curs/50",
        "login_path": "/login",
        "env_user": "PRACTITIONER_USERNAME",
        "env_pass": "PRACTITIONER_PASSWORD",
        # Newer courses keep their artifacts in a dedicated subdirectory.
        "root": Path("nlp-practitioner"),
    },
}
def get_course(key: str) -> dict:
    """Look up a course by key and return an enriched copy of its config.

    The copy additionally carries "key", "course_url" and "login_url"
    (base_url joined with the course/login paths). Raises SystemExit for
    a key absent from COURSES, listing the available keys.
    """
    try:
        base_config = COURSES[key]
    except KeyError:
        raise SystemExit(
            f"Unknown course '{key}'. Available: {sorted(COURSES)}"
        ) from None
    # Work on a shallow copy so callers never mutate the shared registry.
    resolved = {**base_config, "key": key}
    resolved["course_url"] = resolved["base_url"] + resolved["course_path"]
    resolved["login_url"] = resolved["base_url"] + resolved["login_path"]
    return resolved
def course_paths(course: dict) -> dict:
    """Map a course config to the concrete artifact locations under its root.

    Returned keys: root, manifest, audio_dir, wav_cache_dir,
    transcripts_dir, summaries_dir, pdf_dir, master_guide.
    """
    base = course["root"]
    # PDFs nest inside the summaries directory; compute the parent once.
    summaries = base / "summaries"
    return {
        "root": base,
        "manifest": base / "manifest.json",
        "audio_dir": base / "audio",
        "wav_cache_dir": base / "audio_wav",
        "transcripts_dir": base / "transcripts",
        "summaries_dir": summaries,
        "pdf_dir": summaries / "pdf",
        "master_guide": base / "SUPORT_CURS.md",
    }
def validate_manifest_course(manifest: dict, course_key: str) -> None:
    """Refuse to proceed when an on-disk manifest belongs to another course.

    Legacy policy: a manifest without `course_key` (written before this
    refactor) is treated as `master`. This keeps backward-compat with the
    existing curs_26 manifest.json from M1-M6. Raises SystemExit on a
    course mismatch; returns None when the manifest matches.
    """
    recorded = manifest.get("course_key")
    # Manifests predating the multi-course refactor carry no course_key.
    effective = "master" if recorded is None else recorded
    if effective == course_key:
        return
    raise SystemExit(
        f"Manifest belongs to course '{effective}' but --course='{course_key}'. "
        f"Refusing to corrupt cross-course state. "
        f"Delete {course_key}'s manifest to start fresh, or run with --course={effective}."
    )

View File

@@ -1,26 +1,34 @@
""" """
Download all audio files from cursuri.aresens.ro NLP Master course. Download all lecture media from a configured course (see courses.py).
Logs in, discovers modules and lectures, downloads MP3s, writes manifest.json.
Resumable: skips already-downloaded files. Logs in, discovers modules + lectures, downloads whichever media each
lecture exposes, writes <root>/manifest.json. Resumable: skips already-
downloaded files.
Lecture types:
- "audio": <audio source> MP3 on the course CDN -> requests stream download
- "vimeo": <iframe src="...player.vimeo.com/video/ID"> -> yt-dlp
(audio-only HLS track -> MP3 96kbps, no video bytes fetched)
- "text": neither audio nor video -> capture the lecture HTML body as
a plain-text transcript directly (skips whisper entirely)
""" """
import argparse
import json import json
import logging import logging
import os import os
import re
import sys import sys
import time import time
from pathlib import Path from pathlib import Path
from urllib.parse import urljoin from urllib.parse import urljoin, urlparse
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from dotenv import load_dotenv from dotenv import load_dotenv
BASE_URL = "https://cursuri.aresens.ro" from courses import course_paths, get_course, validate_manifest_course
COURSE_URL = f"{BASE_URL}/curs/26"
LOGIN_URL = f"{BASE_URL}/login"
AUDIO_DIR = Path("audio")
MANIFEST_PATH = Path("manifest.json")
MAX_RETRIES = 3 MAX_RETRIES = 3
RETRY_BACKOFF = [5, 15, 30] RETRY_BACKOFF = [5, 15, 30]
@@ -35,22 +43,19 @@ logging.basicConfig(
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
def login(session: requests.Session, email: str, password: str) -> bool: def login(session: requests.Session, course: dict, email: str, password: str) -> bool:
"""Login and return True on success.""" resp = session.post(course["login_url"], data={
resp = session.post(LOGIN_URL, data={
"email": email, "email": email,
"password": password, "password": password,
"act": "login", "act": "login",
"remember": "on", "remember": "on",
}, allow_redirects=True) }, allow_redirects=True)
# Successful login redirects to the course page, not back to /login
if "/login" in resp.url or "loginform" in resp.text: if "/login" in resp.url or "loginform" in resp.text:
return False return False
return True return True
def parse_module_filter(arg: str) -> set[int]: def parse_module_filter(arg: str) -> set[int]:
"""Parse module filter like '1-3' or '4,5' or '1-3,5' into a set of 1-based indices."""
result = set() result = set()
for part in arg.split(","): for part in arg.split(","):
part = part.strip() part = part.strip()
@@ -62,9 +67,8 @@ def parse_module_filter(arg: str) -> set[int]:
return result return result
def discover_modules(session: requests.Session) -> list[dict]: def discover_modules(session: requests.Session, course: dict) -> list[dict]:
"""Fetch course page and return list of {name, url, module_id}.""" resp = session.get(course["course_url"])
resp = session.get(COURSE_URL)
resp.raise_for_status() resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser") soup = BeautifulSoup(resp.text, "html.parser")
@@ -78,15 +82,63 @@ def discover_modules(session: requests.Session) -> list[dict]:
module_id = href.rstrip("/").split("/")[-1] module_id = href.rstrip("/").split("/")[-1]
modules.append({ modules.append({
"name": number_el.get_text(strip=True), "name": number_el.get_text(strip=True),
"url": urljoin(BASE_URL, href), "url": urljoin(course["base_url"], href),
"module_id": module_id, "module_id": module_id,
}) })
log.info(f"Found {len(modules)} modules") log.info(f"Found {len(modules)} modules")
if not modules:
log.error("No modules found on course page — selectors mismatch or not logged in")
sys.exit(1)
return modules return modules
def discover_lectures(session: requests.Session, module: dict) -> list[dict]: VIMEO_ID_RE = re.compile(r"player\.vimeo\.com/video/(\d+)", re.I)
"""Fetch a module page and return list of lectures with audio URLs."""
def slugify(text: str) -> str:
"""Filesystem-safe slug for text lectures (no URL-derived filename)."""
text = text.strip().lower()
text = re.sub(r"[^\w\s-]", "", text, flags=re.UNICODE)
text = re.sub(r"[\s_-]+", "_", text)
return text[:80] or "untitled"
def derived_stem(filename: str) -> str:
"""
Stem used for transcript/srt/summary paths.
Strips the ' [Audio]' suffix used on curs master (aresens) filenames
so derived paths stay short and backward-compatible with M1-M6.
"""
return Path(filename).stem.replace(" [Audio]", "")
def classify_lesson(lesson_div, base_url: str) -> tuple[str, str, str]:
"""
Return (lecture_type, media_url_or_empty, filename_stem).
Types:
- ("audio", mp3_url, filename_from_url)
- ("vimeo", vimeo_url, "vimeo_<id>")
- ("text", "", slug_from_title) # no media found
"""
audio_el = lesson_div.select_one("audio source")
if audio_el and audio_el.get("src", "").strip():
src = audio_el["src"].strip()
return "audio", urljoin(base_url, src), src.split("/")[-1].rsplit(".", 1)[0]
iframe_el = lesson_div.select_one("iframe")
if iframe_el:
src = (iframe_el.get("src") or iframe_el.get("data-src") or "").strip()
m = VIMEO_ID_RE.search(src)
if m:
vimeo_id = m.group(1)
# Canonical player URL works with yt-dlp + referer.
return "vimeo", f"https://player.vimeo.com/video/{vimeo_id}", f"vimeo_{vimeo_id}"
return "text", "", "" # stem filled in by caller using title slug
def discover_lectures(session: requests.Session, module: dict, course: dict) -> list[dict]:
resp = session.get(module["url"]) resp = session.get(module["url"])
resp.raise_for_status() resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser") soup = BeautifulSoup(resp.text, "html.parser")
@@ -94,49 +146,81 @@ def discover_lectures(session: requests.Session, module: dict) -> list[dict]:
lectures = [] lectures = []
for lesson_div in soup.select("div.lesson"): for lesson_div in soup.select("div.lesson"):
name_el = lesson_div.select_one("div.module__name") name_el = lesson_div.select_one("div.module__name")
source_el = lesson_div.select_one("audio source") if not name_el:
if not name_el or not source_el:
continue continue
src = source_el.get("src", "").strip()
if not src:
continue
audio_url = urljoin(BASE_URL, src)
filename = src.split("/")[-1]
title = name_el.get_text(strip=True) title = name_el.get_text(strip=True)
lectures.append({ if not title:
"title": title, continue
"original_filename": filename,
"url": audio_url, ltype, media_url, stem = classify_lesson(lesson_div, course["base_url"])
"audio_path": str(AUDIO_DIR / filename), if ltype == "text":
}) stem = slugify(title)
log.info(f" {module['name']}: {len(lectures)} lectures") # Capture the lesson body HTML (source for text -> transcript)
# so we don't have to re-request it later.
body_el = lesson_div.select_one("div.module__content") or lesson_div
lecture = {
"type": "text",
"title": title,
"original_filename": stem + ".txt",
"url": module["url"], # lesson is inline in module page
"audio_path": "", # no audio
"text_content": body_el.get_text("\n", strip=True),
}
elif ltype == "vimeo":
# Target MP3 filename has .mp3 (yt-dlp extract-audio writes .mp3)
audio_path = course_paths(course)["audio_dir"] / f"{stem}.mp3"
lecture = {
"type": "vimeo",
"title": title,
"original_filename": f"{stem}.mp3",
"url": media_url,
"audio_path": str(audio_path),
}
else: # "audio"
# Preserve original filename (may contain spaces).
filename = media_url.split("/")[-1]
audio_path = course_paths(course)["audio_dir"] / filename
lecture = {
"type": "audio",
"title": title,
"original_filename": filename,
"url": media_url,
"audio_path": str(audio_path),
}
lectures.append(lecture)
counts = {
"audio": sum(1 for L in lectures if L["type"] == "audio"),
"vimeo": sum(1 for L in lectures if L["type"] == "vimeo"),
"text": sum(1 for L in lectures if L["type"] == "text"),
}
log.info(
f" {module['name']}: {len(lectures)} lectures "
f"(audio={counts['audio']}, vimeo={counts['vimeo']}, text={counts['text']})"
)
return lectures return lectures
def download_file(session: requests.Session, url: str, dest: Path) -> bool: def download_audio_http(session: requests.Session, url: str, dest: Path) -> bool:
"""Download a file with retry logic. Returns True on success.""" """HTTP stream download with retry. Returns True on success."""
for attempt in range(MAX_RETRIES): for attempt in range(MAX_RETRIES):
try: try:
resp = session.get(url, stream=True, timeout=300) resp = session.get(url, stream=True, timeout=300)
resp.raise_for_status() resp.raise_for_status()
tmp = dest.with_suffix(dest.suffix + ".tmp")
# Write to temp file first, then rename (atomic)
tmp = dest.with_suffix(".tmp")
total = 0 total = 0
with open(tmp, "wb") as f: with open(tmp, "wb") as f:
for chunk in resp.iter_content(chunk_size=1024 * 1024): for chunk in resp.iter_content(chunk_size=1024 * 1024):
f.write(chunk) f.write(chunk)
total += len(chunk) total += len(chunk)
if total < 1_000_000:
if total < 1_000_000: # < 1MB is suspicious
log.warning(f"File too small ({total} bytes): {dest.name}") log.warning(f"File too small ({total} bytes): {dest.name}")
tmp.unlink(missing_ok=True) tmp.unlink(missing_ok=True)
return False return False
tmp.rename(dest) tmp.rename(dest)
log.info(f" Downloaded: {dest.name} ({total / 1_000_000:.1f} MB)") log.info(f" Downloaded: {dest.name} ({total / 1_000_000:.1f} MB)")
return True return True
except Exception as e: except Exception as e:
wait = RETRY_BACKOFF[attempt] if attempt < len(RETRY_BACKOFF) else 30 wait = RETRY_BACKOFF[attempt] if attempt < len(RETRY_BACKOFF) else 30
log.warning(f" Attempt {attempt + 1}/{MAX_RETRIES} failed for {dest.name}: {e}") log.warning(f" Attempt {attempt + 1}/{MAX_RETRIES} failed for {dest.name}: {e}")
@@ -148,69 +232,165 @@ def download_file(session: requests.Session, url: str, dest: Path) -> bool:
return False return False
def load_manifest() -> dict | None: def download_vimeo_audio(vimeo_url: str, referer: str, dest: Path) -> bool:
"""Load existing manifest if present.""" """
if MANIFEST_PATH.exists(): Download Vimeo audio-only stream via yt-dlp, output as MP3 96kbps.
with open(MANIFEST_PATH) as f: No video bytes fetched (Vimeo HLS has separate audio tracks).
"""
try:
import yt_dlp
except ImportError:
log.error("yt-dlp not installed. Run: pip install yt-dlp")
return False
dest.parent.mkdir(parents=True, exist_ok=True)
# yt-dlp adds .mp3 extension after postprocessing; give it the stem.
outtmpl_stem = str(dest.with_suffix(""))
ydl_opts = {
"format": "bestaudio",
"outtmpl": outtmpl_stem + ".%(ext)s",
"http_headers": {"Referer": referer},
"quiet": True,
"no_warnings": True,
"postprocessors": [{
"key": "FFmpegExtractAudio",
"preferredcodec": "mp3",
"preferredquality": "96",
}],
}
for attempt in range(MAX_RETRIES):
try:
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
ydl.download([vimeo_url])
if dest.exists() and dest.stat().st_size > 100_000:
log.info(f" Downloaded (vimeo): {dest.name} ({dest.stat().st_size / 1_000_000:.1f} MB)")
return True
log.warning(f" yt-dlp produced no file or too small: {dest}")
except Exception as e:
wait = RETRY_BACKOFF[attempt] if attempt < len(RETRY_BACKOFF) else 30
log.warning(f" Vimeo attempt {attempt + 1}/{MAX_RETRIES} failed: {e}")
if attempt < MAX_RETRIES - 1:
log.info(f" Retrying in {wait}s...")
time.sleep(wait)
log.error(f" FAILED vimeo download after {MAX_RETRIES} attempts: {vimeo_url}")
return False
def capture_text_lecture(lecture: dict, transcripts_dir: Path) -> bool:
"""
Write the lecture's captured HTML text as a transcript .txt file.
Text lectures bypass whisper — content is final here.
"""
transcripts_dir.mkdir(parents=True, exist_ok=True)
stem = Path(lecture["original_filename"]).stem
txt_path = transcripts_dir / f"{stem}.txt"
text = lecture.get("text_content", "").strip()
if len(text) < 50:
log.warning(f" text lesson '{lecture['title']}' has <50 chars, skipping")
return False
header = f"[TEXT LECTURE — no audio]\nTitlu: {lecture['title']}\n\n"
txt_path.write_text(header + text, encoding="utf-8")
log.info(f" Captured (text): {txt_path.name} ({txt_path.stat().st_size} bytes)")
return True
def load_manifest(manifest_path: Path) -> dict | None:
if manifest_path.exists():
with open(manifest_path, encoding="utf-8") as f:
return json.load(f) return json.load(f)
return None return None
def save_manifest(manifest: dict): def save_manifest(manifest: dict, manifest_path: Path):
"""Write manifest.json.""" manifest_path.parent.mkdir(parents=True, exist_ok=True)
with open(MANIFEST_PATH, "w", encoding="utf-8") as f: with open(manifest_path, "w", encoding="utf-8") as f:
json.dump(manifest, f, indent=2, ensure_ascii=False) json.dump(manifest, f, indent=2, ensure_ascii=False)
def parse_args():
p = argparse.ArgumentParser(description="Download lecture media for a course")
p.add_argument("--course", default="master", help="Course key (see courses.py)")
p.add_argument("--modules", default=None, help="Module filter, e.g. '1-3' or '2,4,5'")
return p.parse_args()
def main(): def main():
args = parse_args()
course = get_course(args.course)
paths = course_paths(course)
load_dotenv() load_dotenv()
email = os.getenv("COURSE_USERNAME", "") email = os.getenv(course["env_user"], "")
password = os.getenv("COURSE_PASSWORD", "") password = os.getenv(course["env_pass"], "")
if not email or not password: if not email or not password:
log.error("Set COURSE_USERNAME and COURSE_PASSWORD in .env") log.error(f"Set {course['env_user']} and {course['env_pass']} in .env")
sys.exit(1) sys.exit(1)
# Parse --modules filter (e.g. "4-5" or "1,3,5") module_filter = parse_module_filter(args.modules) if args.modules else None
module_filter = None if module_filter:
if "--modules" in sys.argv: log.info(f"Module filter: {sorted(module_filter)}")
idx = sys.argv.index("--modules")
if idx + 1 < len(sys.argv):
module_filter = parse_module_filter(sys.argv[idx + 1])
log.info(f"Module filter: {sorted(module_filter)}")
AUDIO_DIR.mkdir(exist_ok=True) paths["audio_dir"].mkdir(parents=True, exist_ok=True)
paths["transcripts_dir"].mkdir(parents=True, exist_ok=True)
# Validate existing manifest belongs to this course
existing = load_manifest(paths["manifest"])
if existing is not None:
validate_manifest_course(existing, course["key"])
session = requests.Session() session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}) session.headers.update({"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"})
log.info(f"Course: {course['key']} ({course['name']})")
log.info(f"Root: {paths['root']}")
log.info("Logging in...") log.info("Logging in...")
if not login(session, email, password): if not login(session, course, email, password):
log.error("Login failed. Check credentials in .env") log.error("Login failed. Check credentials in .env")
sys.exit(1) sys.exit(1)
log.info("Login successful") log.info("Login successful")
# Discover structure modules = discover_modules(session, course)
modules = discover_modules(session)
if not modules:
log.error("No modules found")
sys.exit(1)
manifest = { # Start from existing manifest if present — preserves modules outside
"course": "NLP Master Practitioner Bucuresti 2025", # the current --modules filter, and preserves per-lecture state (e.g.
"source_url": COURSE_URL, # transcribe_status) for modules in the filter.
"modules": [], if existing:
manifest = dict(existing)
manifest["course_key"] = course["key"]
manifest["course"] = course["name"]
manifest["source_url"] = course["course_url"]
if "modules" not in manifest:
manifest["modules"] = []
else:
manifest = {
"course_key": course["key"],
"course": course["name"],
"source_url": course["course_url"],
"modules": [],
}
# Index of existing modules by name for in-place replacement.
existing_by_name = {m["name"]: i for i, m in enumerate(manifest["modules"])}
# Prior lecture state (by title) for preserving transcribe_status.
prior_lecture_state: dict[str, dict] = {
lec["title"]: lec
for m in manifest["modules"]
for lec in m.get("lectures", [])
} }
total_files = 0 total = 0
downloaded = 0 downloaded = 0
skipped = 0 skipped = 0
failed = 0 failed = 0
for mod_idx, mod in enumerate(modules, 1): for mod_idx, mod in enumerate(modules, 1):
if module_filter and mod_idx not in module_filter: if module_filter and mod_idx not in module_filter:
log.info(f" Skipping module {mod_idx}: {mod['name']}") log.info(f" Skipping module {mod_idx}: {mod['name']} (outside --modules filter, preserved in manifest)")
continue continue
lectures = discover_lectures(session, mod) lectures = discover_lectures(session, mod, course)
module_entry = { module_entry = {
"name": mod["name"], "name": mod["name"],
"module_id": mod["module_id"], "module_id": mod["module_id"],
@@ -218,55 +398,73 @@ def main():
} }
for lec in lectures: for lec in lectures:
total_files += 1 total += 1
dest = Path(lec["audio_path"]) stem = derived_stem(lec["original_filename"])
stem = dest.stem.replace(" [Audio]", "")
lecture_entry = { prior = prior_lecture_state.get(lec["title"], {})
entry = {
"type": lec["type"],
"title": lec["title"], "title": lec["title"],
"original_filename": lec["original_filename"], "original_filename": lec["original_filename"],
"url": lec["url"], "url": lec["url"],
"audio_path": lec["audio_path"], "audio_path": lec["audio_path"],
"transcript_path": f"transcripts/{stem}.txt", "transcript_path": prior.get("transcript_path") or f"{paths['transcripts_dir'].as_posix()}/{stem}.txt",
"srt_path": f"transcripts/{stem}.srt", "srt_path": prior.get("srt_path") or f"{paths['transcripts_dir'].as_posix()}/{stem}.srt",
"summary_path": f"summaries/{stem}_summary.md", "summary_path": prior.get("summary_path") or f"{paths['summaries_dir'].as_posix()}/{stem}_summary.md",
"download_status": "pending", "download_status": "pending",
"transcribe_status": "pending", # Preserve transcribe_status from prior run (disk check in transcribe.py will correct it if needed).
"transcribe_status": prior.get("transcribe_status", "pending"),
"file_size_bytes": 0, "file_size_bytes": 0,
} }
# Skip if already downloaded if lec["type"] == "text":
if dest.exists() and dest.stat().st_size > 1_000_000: # Captured directly; treated as already-transcribed.
lecture_entry["download_status"] = "complete" txt_path = Path(entry["transcript_path"])
lecture_entry["file_size_bytes"] = dest.stat().st_size if txt_path.exists() and txt_path.stat().st_size > 50:
skipped += 1 entry["download_status"] = "complete"
log.info(f" Skipping (exists): {dest.name}") entry["transcribe_status"] = "complete"
else: skipped += 1
if download_file(session, lec["url"], dest): log.info(f" Skipping text (exists): {txt_path.name}")
lecture_entry["download_status"] = "complete" elif capture_text_lecture(lec, paths["transcripts_dir"]):
lecture_entry["file_size_bytes"] = dest.stat().st_size entry["download_status"] = "complete"
entry["transcribe_status"] = "complete"
entry["file_size_bytes"] = txt_path.stat().st_size
downloaded += 1 downloaded += 1
else: else:
lecture_entry["download_status"] = "failed" entry["download_status"] = "failed"
failed += 1 failed += 1
module_entry["lectures"].append(lecture_entry) else:
dest = Path(lec["audio_path"])
if dest.exists() and dest.stat().st_size > 1_000_000:
entry["download_status"] = "complete"
entry["file_size_bytes"] = dest.stat().st_size
skipped += 1
log.info(f" Skipping (exists): {dest.name}")
else:
if lec["type"] == "audio":
ok = download_audio_http(session, lec["url"], dest)
else: # "vimeo"
ok = download_vimeo_audio(lec["url"], course["base_url"] + "/", dest)
if ok:
entry["download_status"] = "complete"
entry["file_size_bytes"] = dest.stat().st_size
downloaded += 1
else:
entry["download_status"] = "failed"
failed += 1
manifest["modules"].append(module_entry) module_entry["lectures"].append(entry)
# Save manifest after each module (checkpoint)
save_manifest(manifest)
# Final validation # Replace or append module in manifest (preserves order for existing, appends new at end).
all_ok = all( if mod["name"] in existing_by_name:
Path(lec["audio_path"]).exists() and Path(lec["audio_path"]).stat().st_size > 1_000_000 manifest["modules"][existing_by_name[mod["name"]]] = module_entry
for mod in manifest["modules"] else:
for lec in mod["lectures"] manifest["modules"].append(module_entry)
if lec["download_status"] == "complete" save_manifest(manifest, paths["manifest"])
)
log.info("=" * 60) log.info("=" * 60)
log.info(f"Downloaded {downloaded}/{total_files} files, {skipped} skipped, {failed} failures.") log.info(f"Downloaded {downloaded}/{total} items, {skipped} skipped, {failed} failures.")
log.info(f"All files > 1MB: {'PASS' if all_ok else 'FAIL'}")
log.info("=" * 60) log.info("=" * 60)
if failed: if failed:

View File

@@ -10,8 +10,7 @@ from concurrent.futures import ProcessPoolExecutor, as_completed
import markdown2 import markdown2
from weasyprint import HTML from weasyprint import HTML
SUMMARIES_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "summaries") from courses import course_paths, get_course
PDF_DIR = os.path.join(SUMMARIES_DIR, "pdf")
CSS = """ CSS = """
@page { @page {
@@ -178,9 +177,9 @@ def convert_one(args):
return os.path.basename(md_path), os.path.basename(pdf_path) return os.path.basename(md_path), os.path.basename(pdf_path)
def find_files(modules=None): def find_files(summaries_dir, modules=None):
"""Find all .md files in summaries/, optionally filtered by module numbers.""" """Find all .md files in summaries/, optionally filtered by module numbers."""
pattern = os.path.join(SUMMARIES_DIR, "*.md") pattern = os.path.join(summaries_dir, "*.md")
files = sorted(glob.glob(pattern)) files = sorted(glob.glob(pattern))
if modules: if modules:
@@ -216,32 +215,35 @@ def parse_modules(spec):
def main(): def main():
parser = argparse.ArgumentParser(description="Convert MD summaries to PDF") parser = argparse.ArgumentParser(description="Convert MD summaries to PDF")
parser.add_argument("files", nargs="*", help="Specific MD files to convert") parser.add_argument("files", nargs="*", help="Specific MD files to convert")
parser.add_argument( parser.add_argument("--course", default="master", help="Course key (see courses.py)")
"--modules", "-m", help="Module filter, e.g. '1-3' or '2,4,5'" parser.add_argument("--modules", "-m", help="Module filter, e.g. '1-3' or '2,4,5'")
) parser.add_argument("--workers", "-w", type=int, default=4, help="Parallel workers (default: 4)")
parser.add_argument(
"--workers", "-w", type=int, default=4, help="Parallel workers (default: 4)"
)
args = parser.parse_args() args = parser.parse_args()
os.makedirs(PDF_DIR, exist_ok=True) course = get_course(args.course)
paths = course_paths(course)
summaries_dir = str(paths["summaries_dir"].resolve())
pdf_dir = str(paths["pdf_dir"].resolve())
os.makedirs(pdf_dir, exist_ok=True)
if args.files: if args.files:
md_files = [os.path.abspath(f) for f in args.files] md_files = [os.path.abspath(f) for f in args.files]
else: else:
modules = parse_modules(args.modules) if args.modules else None modules = parse_modules(args.modules) if args.modules else None
md_files = find_files(modules) md_files = find_files(summaries_dir, modules)
if not md_files: if not md_files:
print("No MD files found to convert.") print(f"No MD files found in {summaries_dir}")
sys.exit(1) sys.exit(1)
jobs = [] jobs = []
for md_path in md_files: for md_path in md_files:
basename = os.path.splitext(os.path.basename(md_path))[0] basename = os.path.splitext(os.path.basename(md_path))[0]
pdf_path = os.path.join(PDF_DIR, basename + ".pdf") pdf_path = os.path.join(pdf_dir, basename + ".pdf")
jobs.append((md_path, pdf_path)) jobs.append((md_path, pdf_path))
print(f"Course: {course['key']} ({course['name']})")
print(f"Converting {len(jobs)} file(s) to PDF with {args.workers} workers...") print(f"Converting {len(jobs)} file(s) to PDF with {args.workers} workers...")
with ProcessPoolExecutor(max_workers=args.workers) as pool: with ProcessPoolExecutor(max_workers=args.workers) as pool:
@@ -254,7 +256,7 @@ def main():
md_path = futures[future][0] md_path = futures[future][0]
print(f" ERROR {os.path.basename(md_path)}: {e}", file=sys.stderr) print(f" ERROR {os.path.basename(md_path)}: {e}", file=sys.stderr)
print(f"Done. PDFs saved to {PDF_DIR}") print(f"Done. PDFs saved to {pdf_dir}")
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -3,3 +3,4 @@ beautifulsoup4
python-dotenv python-dotenv
markdown2 markdown2
weasyprint weasyprint
yt-dlp

56
run.bat
View File

@@ -2,9 +2,27 @@
setlocal enabledelayedexpansion setlocal enabledelayedexpansion
cd /d "%~dp0" cd /d "%~dp0"
:: ============================================================
:: Course + module filter argument parsing
:: Usage:
:: run.bat -> master, all modules (backward-compat)
:: run.bat 1-3 -> master, modules 1-3 (backward-compat)
:: run.bat practitioner -> practitioner, all modules
:: run.bat practitioner 1-3 -> practitioner, modules 1-3
:: ============================================================
set "COURSE_KEY=master"
set "MODULE_FILTER=%~1"
if /i "%~1"=="master" (
set "COURSE_KEY=master"
set "MODULE_FILTER=%~2"
)
if /i "%~1"=="practitioner" (
set "COURSE_KEY=practitioner"
set "MODULE_FILTER=%~2"
)
echo ============================================================ echo ============================================================
echo NLP Master - Download + Transcribe Pipeline echo NLP Course Pipeline (course: %COURSE_KEY%)
echo ============================================================ echo ============================================================
echo. echo.
@@ -46,20 +64,28 @@ if not defined PYTHON_CMD (
) )
:: --- .env credentials --- :: --- .env credentials ---
:: Each course uses its own env var pair. Check based on selected course.
if /i "%COURSE_KEY%"=="practitioner" (
set "ENV_USER=PRACTITIONER_USERNAME"
set "ENV_PASS=PRACTITIONER_PASSWORD"
) else (
set "ENV_USER=COURSE_USERNAME"
set "ENV_PASS=COURSE_PASSWORD"
)
if exist ".env" ( if exist ".env" (
findstr /m "COURSE_USERNAME=." ".env" >nul 2>&1 findstr /m "!ENV_USER!=." ".env" >nul 2>&1
if errorlevel 1 ( if errorlevel 1 (
echo [X] .env File exists but COURSE_USERNAME is empty echo [X] .env File exists but !ENV_USER! is empty
echo Edit .env and fill in your credentials. echo Edit .env and set !ENV_USER! and !ENV_PASS!.
set "PREREQ_OK=" set "PREREQ_OK="
) else ( ) else (
echo [OK] .env Credentials configured echo [OK] .env Credentials configured for %COURSE_KEY%
) )
) else ( ) else (
echo [X] .env NOT FOUND echo [X] .env NOT FOUND
echo Create .env with: echo Create .env with:
echo COURSE_USERNAME=your_email echo !ENV_USER!=your_email
echo COURSE_PASSWORD=your_password echo !ENV_PASS!=your_password
set "PREREQ_OK=" set "PREREQ_OK="
) )
@@ -265,11 +291,11 @@ echo Done.
echo. echo.
echo [3/4] Downloading audio files... echo [3/4] Downloading audio files...
echo ============================================================ echo ============================================================
if "%~1"=="" ( if "!MODULE_FILTER!"=="" (
.venv\Scripts\python download.py .venv\Scripts\python download.py --course %COURSE_KEY%
) else ( ) else (
echo Modules filter: %~1 echo Modules filter: !MODULE_FILTER!
.venv\Scripts\python download.py --modules %~1 .venv\Scripts\python download.py --course %COURSE_KEY% --modules !MODULE_FILTER!
) )
if errorlevel 1 ( if errorlevel 1 (
echo. echo.
@@ -287,11 +313,11 @@ echo Using: %WHISPER_BIN%
echo Model: %WHISPER_MODEL% echo Model: %WHISPER_MODEL%
echo. echo.
if "%~1"=="" ( if "!MODULE_FILTER!"=="" (
.venv\Scripts\python transcribe.py .venv\Scripts\python transcribe.py --course %COURSE_KEY%
) else ( ) else (
echo Modules filter: %~1 echo Modules filter: !MODULE_FILTER!
.venv\Scripts\python transcribe.py --modules %~1 .venv\Scripts\python transcribe.py --course %COURSE_KEY% --modules !MODULE_FILTER!
) )
if errorlevel 1 ( if errorlevel 1 (
echo. echo.

View File

@@ -1,27 +1,26 @@
""" """
Generate summaries from transcripts using Claude Code. Generate summaries from transcripts using Claude Code.
Reads manifest.json, processes each transcript, outputs per-lecture summaries, Reads <root>/manifest.json, processes each transcript, outputs per-lecture
and compiles SUPORT_CURS.md master study guide. summaries, and compiles <root>/SUPORT_CURS.md master study guide.
Usage: Usage:
python summarize.py # Print prompts for each transcript (pipe to Claude) python summarize.py # master, print prompts
python summarize.py --compile # Compile existing summaries into SUPORT_CURS.md python summarize.py --course practitioner # practitioner, print prompts
python summarize.py --compile # master, compile SUPORT_CURS.md
python summarize.py --course practitioner --compile
""" """
import argparse
import json import json
import sys import sys
import textwrap
from pathlib import Path from pathlib import Path
MANIFEST_PATH = Path("manifest.json") from courses import course_paths, get_course, validate_manifest_course
SUMMARIES_DIR = Path("summaries")
TRANSCRIPTS_DIR = Path("transcripts")
MASTER_GUIDE = Path("SUPORT_CURS.md")
MAX_WORDS_PER_CHUNK = 10000 MAX_WORDS_PER_CHUNK = 10000
OVERLAP_WORDS = 500 OVERLAP_WORDS = 500
SUMMARY_PROMPT = """Rezuma aceasta transcriere a unei lectii din cursul NLP Master Practitioner. SUMMARY_PROMPT = """Rezuma aceasta transcriere a unei lectii din cursul {course_name}.
Ofera: Ofera:
1. **Prezentare generala** - 3-5 propozitii care descriu subiectul principal al lectiei 1. **Prezentare generala** - 3-5 propozitii care descriu subiectul principal al lectiei
@@ -57,13 +56,12 @@ REZUMATE PARTIALE:
""" """
def load_manifest() -> dict: def load_manifest(manifest_path: Path) -> dict:
with open(MANIFEST_PATH, encoding="utf-8") as f: with open(manifest_path, encoding="utf-8") as f:
return json.load(f) return json.load(f)
def split_at_sentences(text: str, max_words: int, overlap: int) -> list[str]: def split_at_sentences(text: str, max_words: int, overlap: int) -> list[str]:
"""Split text into chunks at sentence boundaries with overlap."""
words = text.split() words = text.split()
if len(words) <= max_words: if len(words) <= max_words:
return [text] return [text]
@@ -75,25 +73,22 @@ def split_at_sentences(text: str, max_words: int, overlap: int) -> list[str]:
chunk_words = words[start:end] chunk_words = words[start:end]
chunk_text = " ".join(chunk_words) chunk_text = " ".join(chunk_words)
# Try to break at sentence boundary (look back from end)
if end < len(words): if end < len(words):
for sep in [". ", "! ", "? ", ".\n", "!\n", "?\n"]: for sep in [". ", "! ", "? ", ".\n", "!\n", "?\n"]:
last_sep = chunk_text.rfind(sep) last_sep = chunk_text.rfind(sep)
if last_sep > len(chunk_text) // 2: # Don't break too early if last_sep > len(chunk_text) // 2:
chunk_text = chunk_text[:last_sep + 1] chunk_text = chunk_text[:last_sep + 1]
# Recalculate end based on actual words used
end = start + len(chunk_text.split()) end = start + len(chunk_text.split())
break break
chunks.append(chunk_text) chunks.append(chunk_text)
start = max(end - overlap, start + 1) # Overlap, but always advance start = max(end - overlap, start + 1)
return chunks return chunks
def generate_prompts(manifest: dict): def generate_prompts(manifest: dict, course: dict, paths: dict):
"""Print summary prompts for each transcript to stdout.""" paths["summaries_dir"].mkdir(parents=True, exist_ok=True)
SUMMARIES_DIR.mkdir(exist_ok=True)
for mod in manifest["modules"]: for mod in manifest["modules"]:
for lec in mod["lectures"]: for lec in mod["lectures"]:
@@ -123,14 +118,16 @@ def generate_prompts(manifest: dict):
print(f"Output: {summary_path}", file=sys.stderr) print(f"Output: {summary_path}", file=sys.stderr)
if len(chunks) == 1: if len(chunks) == 1:
prompt = SUMMARY_PROMPT.format(title=lec["title"], text=text) prompt = SUMMARY_PROMPT.format(
course_name=course["name"], title=lec["title"], text=text,
)
print(f"SUMMARY_FILE:{summary_path}") print(f"SUMMARY_FILE:{summary_path}")
print(prompt) print(prompt)
print("---END_PROMPT---") print("---END_PROMPT---")
else: else:
# Multi-chunk: generate individual chunk prompts
for i, chunk in enumerate(chunks, 1): for i, chunk in enumerate(chunks, 1):
prompt = SUMMARY_PROMPT.format( prompt = SUMMARY_PROMPT.format(
course_name=course["name"],
title=f"{lec['title']} (partea {i}/{len(chunks)})", title=f"{lec['title']} (partea {i}/{len(chunks)})",
text=chunk, text=chunk,
) )
@@ -138,54 +135,65 @@ def generate_prompts(manifest: dict):
print(prompt) print(prompt)
print("---END_PROMPT---") print("---END_PROMPT---")
# Then a merge prompt
print(f"MERGE_FILE:{summary_path}") print(f"MERGE_FILE:{summary_path}")
merge = MERGE_PROMPT.format( merge = MERGE_PROMPT.format(
title=lec["title"], title=lec["title"],
chunks="{chunk_summaries}", # Placeholder for merge step chunks="{chunk_summaries}",
) )
print(merge) print(merge)
print("---END_PROMPT---") print("---END_PROMPT---")
def compile_master_guide(manifest: dict): def compile_master_guide(manifest: dict, course: dict, paths: dict):
"""Compile all summaries into SUPORT_CURS.md."""
lines = [ lines = [
"# SUPORT CURS - NLP Master Practitioner Bucuresti 2025\n", f"# SUPORT CURS - {course['name']}\n",
"_Generat automat din transcrierile audio ale cursului._\n", "_Generat automat din transcrierile audio ale cursului._\n",
"---\n", "---\n",
] ]
for mod in manifest["modules"]: for mod in manifest["modules"]:
lines.append(f"\n## {mod['name']}\n") lines.append(f"\n## {mod['name']}\n")
for lec in mod["lectures"]: for lec in mod["lectures"]:
summary_path = Path(lec["summary_path"]) summary_path = Path(lec["summary_path"])
lines.append(f"\n### {lec['title']}\n") lines.append(f"\n### {lec['title']}\n")
if summary_path.exists(): if summary_path.exists():
content = summary_path.read_text(encoding="utf-8").strip() content = summary_path.read_text(encoding="utf-8").strip()
lines.append(f"{content}\n") lines.append(f"{content}\n")
else: else:
lines.append("_Rezumat indisponibil._\n") lines.append("_Rezumat indisponibil._\n")
lines.append("\n---\n") lines.append("\n---\n")
MASTER_GUIDE.write_text("\n".join(lines), encoding="utf-8") paths["master_guide"].parent.mkdir(parents=True, exist_ok=True)
print(f"Compiled {MASTER_GUIDE} ({MASTER_GUIDE.stat().st_size} bytes)") # Write LF-only to match the WSL2 baseline (the documented summary workflow
# runs from WSL2; Windows text-mode CRLF would break byte-identic compares).
with open(paths["master_guide"], "w", encoding="utf-8", newline="\n") as f:
f.write("\n".join(lines))
print(f"Compiled {paths['master_guide']} ({paths['master_guide'].stat().st_size} bytes)")
def parse_args():
p = argparse.ArgumentParser(description="Generate summaries / compile SUPORT_CURS.md")
p.add_argument("--course", default="master", help="Course key (see courses.py)")
p.add_argument("--compile", action="store_true", help="Compile SUPORT_CURS.md from existing summaries")
return p.parse_args()
def main(): def main():
if not MANIFEST_PATH.exists(): args = parse_args()
print("manifest.json not found. Run download.py and transcribe.py first.") course = get_course(args.course)
paths = course_paths(course)
if not paths["manifest"].exists():
print(f"{paths['manifest']} not found. Run download.py and transcribe.py first.")
sys.exit(1) sys.exit(1)
manifest = load_manifest() manifest = load_manifest(paths["manifest"])
validate_manifest_course(manifest, course["key"])
if "--compile" in sys.argv: if args.compile:
compile_master_guide(manifest) compile_master_guide(manifest, course, paths)
else: else:
generate_prompts(manifest) generate_prompts(manifest, course, paths)
if __name__ == "__main__": if __name__ == "__main__":

91
tests/test_regression.sh Normal file
View File

@@ -0,0 +1,91 @@
#!/bin/bash
# Regression test: master course (cursuri.aresens.ro/curs/26) — run after
# the refactor to confirm backward compatibility is intact.
#
# Read-only: no downloads, no re-transcription, no visible manifest changes
# (summarize.py --compile only overwrites SUPORT_CURS.md, which is compared
# byte-for-byte against the baseline captured pre-refactor).
#
# Baseline: /tmp/suport_before.md (captured pre-refactor).
# Run with: bash tests/test_regression.sh
set -euo pipefail
# Resolve the repo root (parent of tests/) and cd there so the relative
# paths used by the pipeline scripts (manifest.json, SUPORT_CURS.md) line up.
ROOT="$(cd "$(dirname "$0")/.." && pwd)"
cd "$ROOT"
# Prefer the project venv's Windows-layout interpreter (Git Bash / WSL on a
# Windows checkout); fall back to whatever `python` is on PATH.
PY="$ROOT/.venv/Scripts/python.exe"
[ -x "$PY" ] || PY=python
# Gate 0: refuse to run without the pre-refactor baseline — gate 4 would be
# meaningless otherwise.
if [ ! -f /tmp/suport_before.md ]; then
    echo "FAIL: baseline /tmp/suport_before.md lipsește. Capturează cu:"
    echo " cp SUPORT_CURS.md /tmp/suport_before.md"
    exit 1
fi
echo "=== [1/5] courses.py importabil + curs 'master' rezolvă ==="
# Gate 1: the config module imports and 'master' resolves to the repo root
# (manifest.json / SUPORT_CURS.md at top level → backward-compat paths).
"$PY" -c "
from courses import get_course, course_paths, validate_manifest_course
c = get_course('master')
p = course_paths(c)
assert c['key'] == 'master'
assert str(p['manifest']) == 'manifest.json', p['manifest']
assert str(p['master_guide']) == 'SUPORT_CURS.md'
print('OK: master root=. manifest=manifest.json')
"
echo "=== [2/5] manifest.json: schema backward-compat (course_key absent sau 'master') ==="
# Gate 2: a legacy manifest (no course_key) must validate as 'master', and
# must be refused for any other course key.
"$PY" - <<'PY'
import json
from courses import validate_manifest_course
m = json.load(open("manifest.json", encoding="utf-8"))
# Legacy (no course_key) must be accepted as 'master'.
validate_manifest_course(m, "master")
# Opposite direction must raise.
try:
    validate_manifest_course(m, "practitioner")
except SystemExit as e:
    print(f"OK: cross-course validation refuses: {e}")
else:
    raise SystemExit("FAIL: cross-course validation silently allowed")
assert len(m["modules"]) >= 1, "no modules"
print(f"OK: {len(m['modules'])} modules in manifest")
PY
echo "=== [3/5] transcribe.py --course master (idempotent dry-run — citește manifest, nu re-transcrie) ==="
# Gate 3 (disk coherence): direct invocation is dominated by the disk check
# on transcript_path; if every .txt exists, whisper is never run. Text
# lectures (type == 'text') are excluded — their transcripts come from
# download.py, not whisper.
"$PY" -c "
import json
m = json.load(open('manifest.json', encoding='utf-8'))
from pathlib import Path
missing = [l['title'] for mod in m['modules'] for l in mod['lectures']
           if l.get('transcribe_status') == 'complete'
           and l.get('type') != 'text'
           and not Path(l['transcript_path']).exists()]
if missing:
    print('FAIL: transcribe_status=complete but .txt missing for:', missing[:3])
    raise SystemExit(1)
print(f'OK: all completed transcripts present on disk')
"
echo "=== [4/5] summarize.py --course master --compile — SUPORT_CURS.md byte-identic cu baseline ==="
# Gate 4: recompile the master guide and require a byte-identical result vs
# the pre-refactor baseline (catches path, template, and newline regressions).
"$PY" summarize.py --course master --compile
if ! diff -q SUPORT_CURS.md /tmp/suport_before.md >/dev/null; then
    echo "FAIL: SUPORT_CURS.md diferă de baseline /tmp/suport_before.md"
    diff /tmp/suport_before.md SUPORT_CURS.md | head -30
    exit 1
fi
echo "OK: SUPORT_CURS.md byte-identic cu baseline."
echo "=== [5/5] cross-course isolation — --course practitioner nu atinge state-ul master ==="
# Gate 5: `|| true` keeps `set -e` from aborting — the EXPECTED outcome here
# is a non-zero exit from transcribe.py (missing practitioner manifest, or a
# "belongs to course" refusal when it would read the master manifest).
OUT="$("$PY" transcribe.py --course practitioner 2>&1 || true)"
if echo "$OUT" | grep -qiE "belongs to course|not found"; then
    echo "OK: transcribe --course practitioner nu a rulat pe manifest master"
    echo " (mesaj: $(echo "$OUT" | grep -oE '(belongs to course[^"]*|not found[^"]*)' | head -1))"
else
    echo "FAIL: transcribe --course practitioner output neașteptat:"
    echo "$OUT" | head -3
    exit 1
fi
echo ""
echo "REGRESSION OK — backward-compat curs master intactă."

View File

@@ -1,11 +1,15 @@
""" """
Batch transcription using whisper.cpp. Batch transcription using whisper.cpp.
Reads manifest.json, transcribes each audio file in module order, Reads <root>/manifest.json, transcribes each audio file in module order,
outputs .txt and .srt files, updates manifest status. outputs .txt and .srt files, updates manifest status.
Resumable: skips files with existing transcripts. Resumable: skips files with existing transcripts.
Converts MP3 -> WAV (16kHz mono) via ffmpeg before transcription. Converts MP3 -> WAV (16kHz mono) via ffmpeg before transcription.
Text lectures (type=="text") are skipped — their transcript files are
written directly by download.py.
""" """
import argparse
import json import json
import logging import logging
import os import os
@@ -14,11 +18,10 @@ import subprocess
import sys import sys
from pathlib import Path from pathlib import Path
MANIFEST_PATH = Path("manifest.json") from courses import course_paths, get_course, validate_manifest_course
TRANSCRIPTS_DIR = Path("transcripts")
WAV_CACHE_DIR = Path("audio_wav")
# whisper.cpp defaults — override with env vars or CLI args # whisper.cpp defaults — override with env vars or CLI args.
# Shared across courses (same model + binary).
WHISPER_BIN = os.getenv("WHISPER_BIN", r"whisper-cli.exe") WHISPER_BIN = os.getenv("WHISPER_BIN", r"whisper-cli.exe")
WHISPER_MODEL = os.getenv("WHISPER_MODEL", r"models\ggml-medium-q5_0.bin") WHISPER_MODEL = os.getenv("WHISPER_MODEL", r"models\ggml-medium-q5_0.bin")
@@ -34,14 +37,11 @@ log = logging.getLogger(__name__)
def find_ffmpeg() -> str: def find_ffmpeg() -> str:
"""Find ffmpeg executable."""
if shutil.which("ffmpeg"): if shutil.which("ffmpeg"):
return "ffmpeg" return "ffmpeg"
# Check local directories
for p in [Path("ffmpeg.exe"), Path("ffmpeg-bin/ffmpeg.exe")]: for p in [Path("ffmpeg.exe"), Path("ffmpeg-bin/ffmpeg.exe")]:
if p.exists(): if p.exists():
return str(p.resolve()) return str(p.resolve())
# Try imageio-ffmpeg (pip fallback)
try: try:
import imageio_ffmpeg import imageio_ffmpeg
return imageio_ffmpeg.get_ffmpeg_exe() return imageio_ffmpeg.get_ffmpeg_exe()
@@ -50,21 +50,14 @@ def find_ffmpeg() -> str:
return "" return ""
def convert_to_wav(audio_path: str) -> str: def convert_to_wav(audio_path: str, wav_cache_dir: Path) -> str:
"""
Convert audio file to WAV 16kHz mono (optimal for whisper.cpp).
Returns path to WAV file. Skips if WAV already exists.
"""
src = Path(audio_path) src = Path(audio_path)
# Already a WAV file, skip
if src.suffix.lower() == ".wav": if src.suffix.lower() == ".wav":
return audio_path return audio_path
WAV_CACHE_DIR.mkdir(exist_ok=True) wav_cache_dir.mkdir(parents=True, exist_ok=True)
wav_path = WAV_CACHE_DIR / (src.stem + ".wav") wav_path = wav_cache_dir / (src.stem + ".wav")
# Skip if already converted
if wav_path.exists() and wav_path.stat().st_size > 0: if wav_path.exists() and wav_path.stat().st_size > 0:
log.info(f" WAV cache hit: {wav_path}") log.info(f" WAV cache hit: {wav_path}")
return str(wav_path) return str(wav_path)
@@ -76,30 +69,17 @@ def convert_to_wav(audio_path: str) -> str:
log.info(f" Converting to WAV: {src.name} -> {wav_path.name}") log.info(f" Converting to WAV: {src.name} -> {wav_path.name}")
cmd = [ cmd = [
ffmpeg, ffmpeg, "-i", audio_path,
"-i", audio_path, "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
"-vn", # no video "-y", str(wav_path),
"-acodec", "pcm_s16le", # 16-bit PCM
"-ar", "16000", # 16kHz sample rate (whisper standard)
"-ac", "1", # mono
"-y", # overwrite
str(wav_path),
] ]
try: try:
result = subprocess.run( result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
cmd,
capture_output=True,
text=True,
timeout=300, # 5 min max for conversion
)
if result.returncode != 0: if result.returncode != 0:
log.error(f" ffmpeg failed: {result.stderr[:300]}") log.error(f" ffmpeg failed: {result.stderr[:300]}")
return audio_path return audio_path
log.info(f" WAV ready: {wav_path.name} ({wav_path.stat().st_size / 1_048_576:.0f} MB)") log.info(f" WAV ready: {wav_path.name} ({wav_path.stat().st_size / 1_048_576:.0f} MB)")
return str(wav_path) return str(wav_path)
except FileNotFoundError: except FileNotFoundError:
log.warning(f" ffmpeg not found at: {ffmpeg}") log.warning(f" ffmpeg not found at: {ffmpeg}")
return audio_path return audio_path
@@ -108,21 +88,18 @@ def convert_to_wav(audio_path: str) -> str:
return audio_path return audio_path
def load_manifest() -> dict: def load_manifest(manifest_path: Path) -> dict:
with open(MANIFEST_PATH, encoding="utf-8") as f: with open(manifest_path, encoding="utf-8") as f:
return json.load(f) return json.load(f)
def save_manifest(manifest: dict): def save_manifest(manifest: dict, manifest_path: Path):
with open(MANIFEST_PATH, "w", encoding="utf-8") as f: manifest_path.parent.mkdir(parents=True, exist_ok=True)
with open(manifest_path, "w", encoding="utf-8") as f:
json.dump(manifest, f, indent=2, ensure_ascii=False) json.dump(manifest, f, indent=2, ensure_ascii=False)
def transcribe_file(audio_path: str, output_base: str) -> bool: def transcribe_file(audio_path: str, output_base: str) -> bool:
"""
Run whisper.cpp on a single file.
Returns True on success.
"""
cmd = [ cmd = [
WHISPER_BIN, WHISPER_BIN,
"--model", WHISPER_MODEL, "--model", WHISPER_MODEL,
@@ -131,11 +108,11 @@ def transcribe_file(audio_path: str, output_base: str) -> bool:
"--threads", str(os.cpu_count() or 4), "--threads", str(os.cpu_count() or 4),
"--beam-size", "1", "--beam-size", "1",
"--best-of", "1", "--best-of", "1",
"--max-context", "0", # don't carry context between segments (prevents hallucination loops) "--max-context", "0",
"--entropy-thold", "2.4", # reject high-entropy (hallucinated) segments "--entropy-thold", "2.4",
"--max-len", "60", # shorter segments reduce drift "--max-len", "60",
"--suppress-nst", # suppress non-speech tokens (reduces hallucination on silence) "--suppress-nst",
"--no-fallback", # don't retry with higher temperature "--no-fallback",
"--output-txt", "--output-txt",
"--output-srt", "--output-srt",
"--output-file", output_base, "--output-file", output_base,
@@ -143,9 +120,7 @@ def transcribe_file(audio_path: str, output_base: str) -> bool:
] ]
log.info(f" CMD: {' '.join(cmd)}") log.info(f" CMD: {' '.join(cmd)}")
try: try:
# Add whisper.exe's directory to PATH so Windows finds its DLLs
env = os.environ.copy() env = os.environ.copy()
whisper_dir = str(Path(WHISPER_BIN).resolve().parent) whisper_dir = str(Path(WHISPER_BIN).resolve().parent)
env["PATH"] = whisper_dir + os.pathsep + env.get("PATH", "") env["PATH"] = whisper_dir + os.pathsep + env.get("PATH", "")
@@ -154,18 +129,15 @@ def transcribe_file(audio_path: str, output_base: str) -> bool:
cmd, cmd,
stdout=sys.stdout, stdout=sys.stdout,
stderr=sys.stderr, stderr=sys.stderr,
timeout=7200, # 2 hour timeout per file timeout=7200,
env=env, env=env,
) )
if result.returncode != 0: if result.returncode != 0:
log.error(f" whisper.cpp failed (exit {result.returncode})") log.error(f" whisper.cpp failed (exit {result.returncode})")
return False return False
# Verify output exists and is non-empty
txt_path = Path(f"{output_base}.txt") txt_path = Path(f"{output_base}.txt")
srt_path = Path(f"{output_base}.srt") srt_path = Path(f"{output_base}.srt")
if not txt_path.exists() or txt_path.stat().st_size == 0: if not txt_path.exists() or txt_path.stat().st_size == 0:
log.error(f" Empty or missing transcript: {txt_path}") log.error(f" Empty or missing transcript: {txt_path}")
return False return False
@@ -173,7 +145,6 @@ def transcribe_file(audio_path: str, output_base: str) -> bool:
log.info(f" Output: {txt_path.name} ({txt_path.stat().st_size} bytes)") log.info(f" Output: {txt_path.name} ({txt_path.stat().st_size} bytes)")
if srt_path.exists(): if srt_path.exists():
log.info(f" Output: {srt_path.name} ({srt_path.stat().st_size} bytes)") log.info(f" Output: {srt_path.name} ({srt_path.stat().st_size} bytes)")
return True return True
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
@@ -189,7 +160,6 @@ def transcribe_file(audio_path: str, output_base: str) -> bool:
def parse_module_filter(arg: str) -> set[int]: def parse_module_filter(arg: str) -> set[int]:
"""Parse module filter like '1-3' or '4,5' or '1-3,5' into a set of 1-based indices."""
result = set() result = set()
for part in arg.split(","): for part in arg.split(","):
part = part.strip() part = part.strip()
@@ -201,21 +171,29 @@ def parse_module_filter(arg: str) -> set[int]:
return result return result
def parse_args():
p = argparse.ArgumentParser(description="Transcribe lecture audio for a course")
p.add_argument("--course", default="master", help="Course key (see courses.py)")
p.add_argument("--modules", default=None, help="Module filter, e.g. '1-3' or '2,4,5'")
return p.parse_args()
def main(): def main():
if not MANIFEST_PATH.exists(): args = parse_args()
log.error("manifest.json not found. Run download.py first.") course = get_course(args.course)
paths = course_paths(course)
if not paths["manifest"].exists():
log.error(f"{paths['manifest']} not found. Run download.py --course {course['key']} first.")
sys.exit(1) sys.exit(1)
# Parse --modules filter module_filter = parse_module_filter(args.modules) if args.modules else None
module_filter = None if module_filter:
if "--modules" in sys.argv: log.info(f"Module filter: {sorted(module_filter)}")
idx = sys.argv.index("--modules")
if idx + 1 < len(sys.argv):
module_filter = parse_module_filter(sys.argv[idx + 1])
log.info(f"Module filter: {sorted(module_filter)}")
manifest = load_manifest() manifest = load_manifest(paths["manifest"])
TRANSCRIPTS_DIR.mkdir(exist_ok=True) validate_manifest_course(manifest, course["key"])
paths["transcripts_dir"].mkdir(parents=True, exist_ok=True)
total = 0 total = 0
transcribed = 0 transcribed = 0
@@ -233,15 +211,23 @@ def main():
for lec in mod["lectures"]: for lec in mod["lectures"]:
total += 1 total += 1
# Text lectures bypass whisper — transcript written by download.py.
if lec.get("type") == "text":
lec["transcribe_status"] = "complete"
skipped += 1
log.info(f" Skipping text: {lec['title']}")
continue
if lec.get("download_status") != "complete": if lec.get("download_status") != "complete":
log.warning(f" Skipping (not downloaded): {lec['title']}") log.warning(f" Skipping (not downloaded): {lec['title']}")
continue continue
audio_path = lec["audio_path"] audio_path = lec["audio_path"]
# Reuse the stem already recorded in the manifest for backward-compat
# with M1-M6 paths (strips ' [Audio]' for aresens filenames).
stem = Path(lec["original_filename"]).stem.replace(" [Audio]", "") stem = Path(lec["original_filename"]).stem.replace(" [Audio]", "")
output_base = str(TRANSCRIPTS_DIR / stem) output_base = str(paths["transcripts_dir"] / stem)
# Check if already transcribed
txt_path = Path(f"{output_base}.txt") txt_path = Path(f"{output_base}.txt")
if txt_path.exists() and txt_path.stat().st_size > 0: if txt_path.exists() and txt_path.stat().st_size > 0:
lec["transcribe_status"] = "complete" lec["transcribe_status"] = "complete"
@@ -252,8 +238,7 @@ def main():
log.info(f" Transcribing: {lec['title']}") log.info(f" Transcribing: {lec['title']}")
log.info(f" File: {audio_path}") log.info(f" File: {audio_path}")
# Convert MP3 -> WAV 16kHz mono for reliable whisper.cpp input wav_path = convert_to_wav(audio_path, paths["wav_cache_dir"])
wav_path = convert_to_wav(audio_path)
if transcribe_file(wav_path, output_base): if transcribe_file(wav_path, output_base):
lec["transcribe_status"] = "complete" lec["transcribe_status"] = "complete"
@@ -262,20 +247,18 @@ def main():
lec["transcribe_status"] = "failed" lec["transcribe_status"] = "failed"
failed += 1 failed += 1
# Save manifest after each file (checkpoint) save_manifest(manifest, paths["manifest"])
save_manifest(manifest)
# Log milestone after first module (no longer pauses)
if mod == manifest["modules"][0] and transcribed > 0: if mod == manifest["modules"][0] and transcribed > 0:
log.info(f"First module complete ({transcribed} files). Continuing automatically...") log.info(f"First module complete ({transcribed} files). Continuing automatically...")
# Validation
empty_outputs = [ empty_outputs = [
lec["title"] lec["title"]
for mod in manifest["modules"] for mod in manifest["modules"]
for lec in mod["lectures"] for lec in mod["lectures"]
if lec.get("transcribe_status") == "complete" if lec.get("transcribe_status") == "complete"
and not Path(lec["transcript_path"]).exists() and lec.get("type") != "text"
and not Path(lec.get("transcript_path", "")).exists()
] ]
log.info("\n" + "=" * 60) log.info("\n" + "=" * 60)
@@ -286,7 +269,7 @@ def main():
log.error(f" Missing transcript: {t}") log.error(f" Missing transcript: {t}")
log.info("=" * 60) log.info("=" * 60)
save_manifest(manifest) save_manifest(manifest, paths["manifest"])
if failed: if failed:
sys.exit(1) sys.exit(1)