# Patch context carried over from the collapsed diff that this block replaces:
#   * .gitignore (index 6bcabcc..e1d415d, hunk @@ -38,3 +38,11 @@) gains
#       nlp-practitioner/audio/ and nlp-practitioner/audio_wav/
#         (second course, "practitioner": artifacts only, scripts are shared)
#       scratch_recon.py and tmp_recon/
#         (recon scratch)
#   * courses.py is a new file (mode 100644, index 0000000..3125398,
#     hunk @@ -0,0 +1,80 @@); its content follows.
"""
Shared course configuration for the NLP Master pipeline.

A single pipeline (download -> transcribe -> summarize -> pdf) runs on
multiple courses by passing --course <key>. Scripts resolve all artifact
paths against course["root"], so the master course remains in place at the
repo root and subsequent courses land in their own subdirectory.
"""
from pathlib import Path

# Static per-course settings, keyed by the value accepted by --course.
COURSES = {
    "master": {
        "name": "NLP Master Practitioner Bucuresti 2025",
        "base_url": "https://cursuri.aresens.ro",
        "course_path": "/curs/26",
        "login_path": "/login",
        "env_user": "COURSE_USERNAME",
        "env_pass": "COURSE_PASSWORD",
        # The master course stays at repo root for backward-compat with
        # the M1-M6 outputs.
        "root": Path("."),
    },
    "practitioner": {
        "name": "NLP Practitioner (cursnlp.ro)",
        "base_url": "https://shop.cursnlp.ro",
        "course_path": "/curs/50",
        "login_path": "/login",
        "env_user": "PRACTITIONER_USERNAME",
        "env_pass": "PRACTITIONER_PASSWORD",
        "root": Path("nlp-practitioner"),
    },
}


def get_course(key: str) -> dict:
    """Return the configuration for course *key*.

    The result is a copy of the ``COURSES`` entry, extended with three
    derived fields: ``key``, ``course_url`` (``base_url + course_path``) and
    ``login_url`` (``base_url + login_path``). ``COURSES`` itself is never
    modified.

    Raises:
        SystemExit: if *key* is not one of the configured courses. A
            non-string key is reported the same way instead of surfacing a
            ``TypeError`` from the dictionary lookup.
    """
    # isinstance guard: an unhashable key (e.g. a list) would otherwise make
    # the membership test itself raise TypeError.
    if not isinstance(key, str) or key not in COURSES:
        raise SystemExit(
            f"Unknown course '{key}'. Available: {sorted(COURSES)}"
        )
    course = dict(COURSES[key])
    course["key"] = key
    course["course_url"] = course["base_url"] + course["course_path"]
    course["login_url"] = course["base_url"] + course["login_path"]
    return course


def course_paths(course: dict) -> dict:
    """Resolve every artifact path under ``course['root']``.

    ``course['root']`` may be a ``Path`` or a plain string (for instance
    after a round trip through JSON); it is coerced to ``Path`` so both work.

    Returns:
        A dict of ``Path`` objects with the keys ``root``, ``manifest``,
        ``audio_dir``, ``wav_cache_dir``, ``transcripts_dir``,
        ``summaries_dir``, ``pdf_dir`` and ``master_guide``.
    """
    root = Path(course["root"])
    summaries_dir = root / "summaries"
    return {
        "root": root,
        "manifest": root / "manifest.json",
        "audio_dir": root / "audio",
        "wav_cache_dir": root / "audio_wav",
        "transcripts_dir": root / "transcripts",
        "summaries_dir": summaries_dir,
        "pdf_dir": summaries_dir / "pdf",
        "master_guide": root / "SUPORT_CURS.md",
    }


def validate_manifest_course(manifest: dict, course_key: str) -> None:
    """
    Ensure a pre-existing manifest belongs to the course currently being run.

    Legacy policy: a manifest without `course_key` (written before this
    refactor) is treated as `master`. This keeps backward-compat with the
    existing curs_26 manifest.json from M1-M6.

    A manifest of ``None`` (nothing on disk yet) has nothing to validate and
    is accepted for any course.

    Raises:
        SystemExit: if the manifest's effective course differs from
            *course_key*.
    """
    if manifest is None:
        return
    recorded = manifest.get("course_key")
    # Only a missing/None key falls back to "master"; any other value,
    # including an empty string, is compared as-is.
    effective = "master" if recorded is None else recorded
    if effective != course_key:
        raise SystemExit(
            f"Manifest belongs to course '{effective}' but --course='{course_key}'. "
            f"Refusing to corrupt cross-course state. "
            f"Delete {course_key}'s manifest to start fresh, or run with --course={effective}."
        )


# Patch context: the diff continues with download.py
# (index 8932344..b2c1219, hunk @@ -1,277 +1,475 @@). The module docstring it
# removes opens with: "Download all audio files from cursuri.aresens.ro NLP
# Master course. Logs in, discovers modules and lectures, downloads MP3s,
# writes manifest.json. Resumable: skips already-downloaded files."
-""" - -import json -import logging -import os -import sys -import time -from pathlib import Path -from urllib.parse import urljoin - -import requests -from bs4 import BeautifulSoup -from dotenv import load_dotenv - -BASE_URL = "https://cursuri.aresens.ro" -COURSE_URL = f"{BASE_URL}/curs/26" -LOGIN_URL = f"{BASE_URL}/login" -AUDIO_DIR = Path("audio") -MANIFEST_PATH = Path("manifest.json") -MAX_RETRIES = 3 -RETRY_BACKOFF = [5, 15, 30] - -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s [%(levelname)s] %(message)s", - handlers=[ - logging.StreamHandler(), - logging.FileHandler("download_errors.log"), - ], -) -log = logging.getLogger(__name__) - - -def login(session: requests.Session, email: str, password: str) -> bool: - """Login and return True on success.""" - resp = session.post(LOGIN_URL, data={ - "email": email, - "password": password, - "act": "login", - "remember": "on", - }, allow_redirects=True) - # Successful login redirects to the course page, not back to /login - if "/login" in resp.url or "loginform" in resp.text: - return False - return True - - -def parse_module_filter(arg: str) -> set[int]: - """Parse module filter like '1-3' or '4,5' or '1-3,5' into a set of 1-based indices.""" - result = set() - for part in arg.split(","): - part = part.strip() - if "-" in part: - a, b = part.split("-", 1) - result.update(range(int(a), int(b) + 1)) - else: - result.add(int(part)) - return result - - -def discover_modules(session: requests.Session) -> list[dict]: - """Fetch course page and return list of {name, url, module_id}.""" - resp = session.get(COURSE_URL) - resp.raise_for_status() - soup = BeautifulSoup(resp.text, "html.parser") - - modules = [] - for div in soup.select("div.module"): - number_el = div.select_one("div.module__number") - link_el = div.select_one("a.btn") - if not number_el or not link_el: - continue - href = link_el.get("href", "") - module_id = href.rstrip("/").split("/")[-1] - modules.append({ - "name": 
number_el.get_text(strip=True), - "url": urljoin(BASE_URL, href), - "module_id": module_id, - }) - log.info(f"Found {len(modules)} modules") - return modules - - -def discover_lectures(session: requests.Session, module: dict) -> list[dict]: - """Fetch a module page and return list of lectures with audio URLs.""" - resp = session.get(module["url"]) - resp.raise_for_status() - soup = BeautifulSoup(resp.text, "html.parser") - - lectures = [] - for lesson_div in soup.select("div.lesson"): - name_el = lesson_div.select_one("div.module__name") - source_el = lesson_div.select_one("audio source") - if not name_el or not source_el: - continue - src = source_el.get("src", "").strip() - if not src: - continue - audio_url = urljoin(BASE_URL, src) - filename = src.split("/")[-1] - title = name_el.get_text(strip=True) - lectures.append({ - "title": title, - "original_filename": filename, - "url": audio_url, - "audio_path": str(AUDIO_DIR / filename), - }) - log.info(f" {module['name']}: {len(lectures)} lectures") - return lectures - - -def download_file(session: requests.Session, url: str, dest: Path) -> bool: - """Download a file with retry logic. 
Returns True on success.""" - for attempt in range(MAX_RETRIES): - try: - resp = session.get(url, stream=True, timeout=300) - resp.raise_for_status() - - # Write to temp file first, then rename (atomic) - tmp = dest.with_suffix(".tmp") - total = 0 - with open(tmp, "wb") as f: - for chunk in resp.iter_content(chunk_size=1024 * 1024): - f.write(chunk) - total += len(chunk) - - if total < 1_000_000: # < 1MB is suspicious - log.warning(f"File too small ({total} bytes): {dest.name}") - tmp.unlink(missing_ok=True) - return False - - tmp.rename(dest) - log.info(f" Downloaded: {dest.name} ({total / 1_000_000:.1f} MB)") - return True - - except Exception as e: - wait = RETRY_BACKOFF[attempt] if attempt < len(RETRY_BACKOFF) else 30 - log.warning(f" Attempt {attempt + 1}/{MAX_RETRIES} failed for {dest.name}: {e}") - if attempt < MAX_RETRIES - 1: - log.info(f" Retrying in {wait}s...") - time.sleep(wait) - - log.error(f" FAILED after {MAX_RETRIES} attempts: {dest.name}") - return False - - -def load_manifest() -> dict | None: - """Load existing manifest if present.""" - if MANIFEST_PATH.exists(): - with open(MANIFEST_PATH) as f: - return json.load(f) - return None - - -def save_manifest(manifest: dict): - """Write manifest.json.""" - with open(MANIFEST_PATH, "w", encoding="utf-8") as f: - json.dump(manifest, f, indent=2, ensure_ascii=False) - - -def main(): - load_dotenv() - email = os.getenv("COURSE_USERNAME", "") - password = os.getenv("COURSE_PASSWORD", "") - if not email or not password: - log.error("Set COURSE_USERNAME and COURSE_PASSWORD in .env") - sys.exit(1) - - # Parse --modules filter (e.g. 
"4-5" or "1,3,5") - module_filter = None - if "--modules" in sys.argv: - idx = sys.argv.index("--modules") - if idx + 1 < len(sys.argv): - module_filter = parse_module_filter(sys.argv[idx + 1]) - log.info(f"Module filter: {sorted(module_filter)}") - - AUDIO_DIR.mkdir(exist_ok=True) - - session = requests.Session() - session.headers.update({"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}) - - log.info("Logging in...") - if not login(session, email, password): - log.error("Login failed. Check credentials in .env") - sys.exit(1) - log.info("Login successful") - - # Discover structure - modules = discover_modules(session) - if not modules: - log.error("No modules found") - sys.exit(1) - - manifest = { - "course": "NLP Master Practitioner Bucuresti 2025", - "source_url": COURSE_URL, - "modules": [], - } - - total_files = 0 - downloaded = 0 - skipped = 0 - failed = 0 - - for mod_idx, mod in enumerate(modules, 1): - if module_filter and mod_idx not in module_filter: - log.info(f" Skipping module {mod_idx}: {mod['name']}") - continue - lectures = discover_lectures(session, mod) - module_entry = { - "name": mod["name"], - "module_id": mod["module_id"], - "lectures": [], - } - - for lec in lectures: - total_files += 1 - dest = Path(lec["audio_path"]) - stem = dest.stem.replace(" [Audio]", "") - - lecture_entry = { - "title": lec["title"], - "original_filename": lec["original_filename"], - "url": lec["url"], - "audio_path": lec["audio_path"], - "transcript_path": f"transcripts/{stem}.txt", - "srt_path": f"transcripts/{stem}.srt", - "summary_path": f"summaries/{stem}_summary.md", - "download_status": "pending", - "transcribe_status": "pending", - "file_size_bytes": 0, - } - - # Skip if already downloaded - if dest.exists() and dest.stat().st_size > 1_000_000: - lecture_entry["download_status"] = "complete" - lecture_entry["file_size_bytes"] = dest.stat().st_size - skipped += 1 - log.info(f" Skipping (exists): {dest.name}") - else: - if download_file(session, 
lec["url"], dest): - lecture_entry["download_status"] = "complete" - lecture_entry["file_size_bytes"] = dest.stat().st_size - downloaded += 1 - else: - lecture_entry["download_status"] = "failed" - failed += 1 - - module_entry["lectures"].append(lecture_entry) - - manifest["modules"].append(module_entry) - # Save manifest after each module (checkpoint) - save_manifest(manifest) - - # Final validation - all_ok = all( - Path(lec["audio_path"]).exists() and Path(lec["audio_path"]).stat().st_size > 1_000_000 - for mod in manifest["modules"] - for lec in mod["lectures"] - if lec["download_status"] == "complete" - ) - - log.info("=" * 60) - log.info(f"Downloaded {downloaded}/{total_files} files, {skipped} skipped, {failed} failures.") - log.info(f"All files > 1MB: {'PASS' if all_ok else 'FAIL'}") - log.info("=" * 60) - - if failed: - sys.exit(1) - - -if __name__ == "__main__": - main() +""" +Download all lecture media from a configured course (see courses.py). + +Logs in, discovers modules + lectures, downloads whichever media each +lecture exposes, writes /manifest.json. Resumable: skips already- +downloaded files. + +Lecture types: + - "audio":