# nlp-master/download.py

"""
Download all audio files from cursuri.aresens.ro NLP Master course.
Logs in, discovers modules and lectures, downloads MP3s, writes manifest.json.
Resumable: skips already-downloaded files.
"""

import json
import logging
import os
import sys
import time
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Site root; relative links scraped from the pages are resolved against it.
BASE_URL = "https://cursuri.aresens.ro"
# Course overview page from which the module list is scraped.
COURSE_URL = f"{BASE_URL}/curs/26"
# Endpoint the login form is posted to.
LOGIN_URL = f"{BASE_URL}/login"
# Directory (relative to the working directory) that receives the audio files.
AUDIO_DIR = Path("audio")
# JSON manifest, rewritten after each module has been processed.
MANIFEST_PATH = Path("manifest.json")
# Number of download attempts per file before it is reported as failed.
MAX_RETRIES = 3
# Seconds to sleep after the 1st, 2nd, ... failed attempt. No sleep follows the
# final attempt, so with MAX_RETRIES = 3 only the first two entries are used.
RETRY_BACKOFF = [5, 15, 30]

# Send log records to both the console and download_errors.log. Neither handler
# sets a level of its own, so despite its name the file receives every record
# at INFO and above, not only errors.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("download_errors.log"),
    ],
)
log = logging.getLogger(__name__)


def login(
    session: requests.Session,
    email: str,
    password: str,
    timeout: float = 30.0,
) -> bool:
    """Submit the login form and report whether authentication succeeded.

    Args:
        session: Session used to send the request; later requests made with
            the same session reuse whatever cookies the server sets here.
        email: Value sent in the ``email`` form field.
        password: Value sent in the ``password`` form field.
        timeout: Seconds to wait for the server. Without a timeout an
            unresponsive host would block the script indefinitely.

    Returns:
        True when the final response (after redirects) is a non-error page
        that is neither under ``/login`` nor contains ``loginform``;
        False otherwise.

    Raises:
        requests.RequestException: If the request cannot be completed
            (connection failure, timeout, ...).
    """
    resp = session.post(
        LOGIN_URL,
        data={
            "email": email,
            "password": password,
            "act": "login",
            "remember": "on",
        },
        allow_redirects=True,
        timeout=timeout,
    )

    # An HTTP error page is not under /login and carries no login form, so
    # the heuristic below would otherwise mistake it for a successful login.
    if not resp.ok:
        log.warning("Login request returned HTTP %s", resp.status_code)
        return False

    # Successful login redirects to the course page, not back to /login
    return "/login" not in resp.url and "loginform" not in resp.text


def discover_modules(session: requests.Session, timeout: float = 30.0) -> list[dict]:
    """Fetch the course page and list the modules it links to.

    Args:
        session: Authenticated session used to fetch ``COURSE_URL``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        One dict per ``div.module`` element that has both a
        ``div.module__number`` label and an ``a.btn`` link with a usable
        target, with keys ``name`` (label text), ``url`` (absolute module
        URL) and ``module_id`` (last path segment of the link).

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status for the course page.
    """
    resp = session.get(COURSE_URL, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    modules = []
    for div in soup.select("div.module"):
        number_el = div.select_one("div.module__number")
        link_el = div.select_one("a.btn")
        if number_el is None or link_el is None:
            continue

        href = (link_el.get("href") or "").strip()
        module_id = href.rstrip("/").split("/")[-1]
        if not module_id:
            # A button without a target would otherwise be recorded as a
            # module that points at the site root with an empty id.
            log.warning("Skipping module without a link target: %s", number_el.get_text(strip=True))
            continue

        modules.append({
            "name": number_el.get_text(strip=True),
            "url": urljoin(BASE_URL, href),
            "module_id": module_id,
        })

    log.info("Found %d modules", len(modules))
    return modules


def discover_lectures(
    session: requests.Session,
    module: dict,
    timeout: float = 30.0,
) -> list[dict]:
    """Fetch a module page and list the lectures that have an audio source.

    Args:
        session: Authenticated session used to fetch the module page.
        module: Module record; its ``url`` is fetched and its ``name`` is
            used in the log line.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        One dict per ``div.lesson`` element that has a ``div.module__name``
        title and an ``audio source`` with a non-empty ``src``, with keys
        ``title``, ``original_filename`` (last path segment of ``src``),
        ``url`` (absolute audio URL) and ``audio_path`` (target path under
        ``AUDIO_DIR``).

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status for the module page.
    """
    resp = session.get(module["url"], timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    lectures = []
    for lesson_div in soup.select("div.lesson"):
        name_el = lesson_div.select_one("div.module__name")
        source_el = lesson_div.select_one("audio source")
        if name_el is None or source_el is None:
            continue

        src = (source_el.get("src") or "").strip()
        if not src:
            continue

        filename = src.split("/")[-1]
        if not filename:
            # A src ending in "/" has no file name; AUDIO_DIR / "" would be
            # the audio directory itself, which cannot be written as a file.
            log.warning("Skipping audio source without a file name: %s", src)
            continue

        lectures.append({
            "title": name_el.get_text(strip=True),
            "original_filename": filename,
            "url": urljoin(BASE_URL, src),
            "audio_path": str(AUDIO_DIR / filename),
        })

    log.info("  %s: %d lectures", module["name"], len(lectures))
    return lectures


def download_file(session: requests.Session, url: str, dest: Path) -> bool:
    """Download ``url`` to ``dest``, retrying on errors.

    The body is streamed into a sibling ``.tmp`` file which is moved over
    ``dest`` only after the transfer has been validated, so ``dest`` never
    holds a partial download.

    Args:
        session: Session used for the request.
        url: Absolute URL of the file.
        dest: Final location of the downloaded file.

    Returns:
        True when the file was downloaded and moved into place. False when
        every attempt failed, or when the received body is smaller than 1 MB
        (treated as suspicious and not retried).
    """
    tmp = dest.with_suffix(".tmp")

    for attempt in range(MAX_RETRIES):
        try:
            total = 0
            # Using the response as a context manager releases the connection
            # even when the transfer fails part-way through.
            with session.get(url, stream=True, timeout=300) as resp:
                resp.raise_for_status()
                declared = resp.headers.get("Content-Length", "")
                # Content-Length counts bytes on the wire; it is comparable to
                # what iter_content yields only when the body is not encoded.
                encoded = bool(resp.headers.get("Content-Encoding"))

                with open(tmp, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
                        total += len(chunk)

            # A connection dropped mid-transfer can end the stream early
            # without an error; raise so the attempt is retried instead of
            # keeping a truncated file.
            if declared.isdigit() and not encoded and total != int(declared):
                raise OSError(f"incomplete download: got {total} of {declared} bytes")

            if total < 1_000_000:  # < 1MB is suspicious
                log.warning(f"File too small ({total} bytes): {dest.name}")
                tmp.unlink(missing_ok=True)
                return False

            # replace() overwrites an existing dest on every platform;
            # rename() raises on Windows when dest exists, which happens when
            # a previous small/partial file is being re-downloaded.
            tmp.replace(dest)
            log.info(f"  Downloaded: {dest.name} ({total / 1_000_000:.1f} MB)")
            return True

        # Deliberately broad: any failure of one attempt should lead to a
        # retry rather than abort the whole run.
        except Exception as e:
            try:
                # Do not leave a partial temp file behind.
                tmp.unlink(missing_ok=True)
            except OSError as cleanup_error:
                log.warning(f"  Could not remove {tmp.name}: {cleanup_error}")

            log.warning(f"  Attempt {attempt + 1}/{MAX_RETRIES} failed for {dest.name}: {e}")
            if attempt < MAX_RETRIES - 1:
                wait = RETRY_BACKOFF[attempt] if attempt < len(RETRY_BACKOFF) else 30
                log.info(f"  Retrying in {wait}s...")
                time.sleep(wait)

    log.error(f"  FAILED after {MAX_RETRIES} attempts: {dest.name}")
    return False


def load_manifest() -> dict | None:
    """Load existing manifest if present."""
    if MANIFEST_PATH.exists():
        with open(MANIFEST_PATH) as f:
            return json.load(f)
    return None


def save_manifest(manifest: dict) -> None:
    """Write ``manifest`` to ``MANIFEST_PATH`` as indented UTF-8 JSON.

    The JSON is first written to a sibling ``.tmp`` file and then moved over
    the target, so an interruption in the middle of a write cannot leave a
    truncated manifest behind.

    Args:
        manifest: JSON-serialisable manifest data.
    """
    tmp_path = MANIFEST_PATH.with_name(MANIFEST_PATH.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)
        tmp_path.replace(MANIFEST_PATH)
    finally:
        # No-op after a successful replace; removes the partial file when
        # serialisation or the move failed.
        tmp_path.unlink(missing_ok=True)


def _prior_transcribe_statuses() -> dict[str, str]:
    """Return ``{audio_path: transcribe_status}`` from an earlier manifest."""
    try:
        previous = load_manifest()
    except (OSError, ValueError) as exc:
        # Unreadable or corrupt manifest: carry nothing over and rebuild it.
        log.warning(f"Ignoring unreadable {MANIFEST_PATH}: {exc}")
        return {}

    statuses: dict[str, str] = {}
    modules = previous.get("modules") if isinstance(previous, dict) else None
    if not isinstance(modules, list):
        return statuses

    for module in modules:
        lectures = module.get("lectures") if isinstance(module, dict) else None
        if not isinstance(lectures, list):
            continue
        for lecture in lectures:
            if not isinstance(lecture, dict):
                continue
            path = lecture.get("audio_path")
            status = lecture.get("transcribe_status")
            if isinstance(path, str) and isinstance(status, str):
                statuses[path] = status
    return statuses


def _new_lecture_entry(lec: dict) -> dict:
    """Build the manifest record for one lecture, initially marked pending."""
    stem = Path(lec["audio_path"]).stem.replace(" [Audio]", "")
    return {
        "title": lec["title"],
        "original_filename": lec["original_filename"],
        "url": lec["url"],
        "audio_path": lec["audio_path"],
        "transcript_path": f"transcripts/{stem}.txt",
        "srt_path": f"transcripts/{stem}.srt",
        "summary_path": f"summaries/{stem}_summary.md",
        "download_status": "pending",
        "transcribe_status": "pending",
        "file_size_bytes": 0,
    }


def main() -> None:
    """Log in, download every lecture's audio file and write the manifest.

    Credentials are read from ``COURSE_USERNAME`` / ``COURSE_PASSWORD``
    (loaded from ``.env``). Files that already exist and are larger than
    1 MB are skipped. The manifest is saved after each module as a
    checkpoint.

    Exits with status 1 when credentials are missing, login fails, no
    modules are found, or at least one download failed.
    """
    load_dotenv()
    email = os.getenv("COURSE_USERNAME", "")
    password = os.getenv("COURSE_PASSWORD", "")
    if not email or not password:
        log.error("Set COURSE_USERNAME and COURSE_PASSWORD in .env")
        sys.exit(1)

    AUDIO_DIR.mkdir(exist_ok=True)

    # The manifest is rebuilt from scratch below. Read the previous one first
    # so transcription progress recorded for audio that is left untouched is
    # not reset to "pending" on every re-run.
    # NOTE(review): assumes a later pipeline stage updates transcribe_status
    # in manifest.json - confirm. If none does, this carries over "pending".
    prior_statuses = _prior_transcribe_statuses()

    # Same "suspiciously small" threshold that download_file applies.
    min_bytes = 1_000_000

    # The context manager closes pooled connections on every exit path,
    # including sys.exit().
    with requests.Session() as session:
        session.headers.update({"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"})

        log.info("Logging in...")
        if not login(session, email, password):
            log.error("Login failed. Check credentials in .env")
            sys.exit(1)
        log.info("Login successful")

        # Discover structure
        modules = discover_modules(session)
        if not modules:
            log.error("No modules found")
            sys.exit(1)

        manifest = {
            "course": "NLP Master Practitioner Bucuresti 2025",
            "source_url": COURSE_URL,
            "modules": [],
        }

        total_files = 0
        downloaded = 0
        skipped = 0
        failed = 0

        for mod in modules:
            module_entry = {
                "name": mod["name"],
                "module_id": mod["module_id"],
                "lectures": [],
            }

            for lec in discover_lectures(session, mod):
                total_files += 1
                dest = Path(lec["audio_path"])
                lecture_entry = _new_lecture_entry(lec)

                existing_size = dest.stat().st_size if dest.exists() else 0
                if existing_size > min_bytes:
                    # Skip if already downloaded
                    lecture_entry["download_status"] = "complete"
                    lecture_entry["file_size_bytes"] = existing_size
                    # Audio is unchanged, so earlier transcription state
                    # still applies to it.
                    lecture_entry["transcribe_status"] = prior_statuses.get(
                        lec["audio_path"], "pending"
                    )
                    skipped += 1
                    log.info(f"  Skipping (exists): {dest.name}")
                elif download_file(session, lec["url"], dest):
                    lecture_entry["download_status"] = "complete"
                    lecture_entry["file_size_bytes"] = dest.stat().st_size
                    downloaded += 1
                else:
                    lecture_entry["download_status"] = "failed"
                    failed += 1

                module_entry["lectures"].append(lecture_entry)

            manifest["modules"].append(module_entry)
            # Save manifest after each module (checkpoint)
            save_manifest(manifest)

    # Final validation: every file recorded as complete must still be on disk
    # and above the size threshold.
    all_ok = all(
        Path(lec["audio_path"]).exists() and Path(lec["audio_path"]).stat().st_size > min_bytes
        for mod in manifest["modules"]
        for lec in mod["lectures"]
        if lec["download_status"] == "complete"
    )

    log.info("=" * 60)
    log.info(f"Downloaded {downloaded}/{total_files} files, {skipped} skipped, {failed} failures.")
    log.info(f"All files > 1MB: {'PASS' if all_ok else 'FAIL'}")
    log.info("=" * 60)

    if failed:
        sys.exit(1)


if __name__ == "__main__":
    main()