254 lines
8.3 KiB
Python
254 lines
8.3 KiB
Python
"""
|
|
Download all audio files from the cursuri.aresens.ro NLP Master course.
|
|
Logs in, discovers modules and lectures, downloads MP3s, writes manifest.json.
|
|
Resumable: skips already-downloaded files.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
from urllib.parse import urljoin
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from dotenv import load_dotenv
|
|
|
|
# --- Remote endpoints -------------------------------------------------------
BASE_URL = "https://cursuri.aresens.ro"
COURSE_URL = BASE_URL + "/curs/26"
LOGIN_URL = BASE_URL + "/login"

# --- Local outputs ----------------------------------------------------------
AUDIO_DIR = Path("audio")
MANIFEST_PATH = Path("manifest.json")

# --- Download retry policy --------------------------------------------------
# MAX_RETRIES is the number of attempts per file; RETRY_BACKOFF[i] is the
# number of seconds slept after failed attempt i.
MAX_RETRIES = 3
RETRY_BACKOFF = [5, 15, 30]
|
|
|
|
# Send every record (INFO and above) both to the console and to a log file.
# The file handler is pinned to UTF-8: logged lecture file names come from the
# remote site and may contain non-ASCII characters that the platform default
# encoding cannot represent.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("download_errors.log", encoding="utf-8"),
    ],
)
log = logging.getLogger(__name__)
|
|
|
|
|
|
def login(
    session: requests.Session,
    email: str,
    password: str,
    timeout: float = 30,
) -> bool:
    """Submit the login form and report whether it appears to have succeeded.

    Args:
        session: Session used for the POST request.
        email: Value sent as the ``email`` form field.
        password: Value sent as the ``password`` form field.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        True if the response is a non-error page that neither ends up on
        ``/login`` nor contains the ``loginform`` marker; False otherwise.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    form = {
        "email": email,
        "password": password,
        "act": "login",
        "remember": "on",
    }
    resp = session.post(LOGIN_URL, data=form, allow_redirects=True, timeout=timeout)

    # An HTTP error page matches neither marker below, so without this check
    # a 4xx/5xx response would be reported as a successful login.
    if not resp.ok:
        return False

    # Successful login redirects to the course page, not back to /login
    return "/login" not in resp.url and "loginform" not in resp.text
|
|
|
|
|
|
def discover_modules(session: requests.Session, timeout: float = 30) -> list[dict]:
    """Fetch the course page and list the modules linked from it.

    Args:
        session: Session used to fetch ``COURSE_URL``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        One ``{"name", "url", "module_id"}`` dict per ``div.module`` element
        that has both a ``div.module__number`` and an ``a.btn`` with a
        non-empty ``href``. ``module_id`` is the last path segment of the href.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    resp = session.get(COURSE_URL, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    modules = []
    for div in soup.select("div.module"):
        number_el = div.select_one("div.module__number")
        link_el = div.select_one("a.btn")
        if number_el is None or link_el is None:
            continue

        # An anchor without a usable href would otherwise produce a module
        # with an empty id whose URL is just the site root.
        href = (link_el.get("href") or "").strip()
        if not href:
            continue

        modules.append({
            "name": number_el.get_text(strip=True),
            "url": urljoin(BASE_URL, href),
            "module_id": href.rstrip("/").split("/")[-1],
        })

    log.info("Found %d modules", len(modules))
    return modules
|
|
|
|
|
|
def discover_lectures(
    session: requests.Session,
    module: dict,
    timeout: float = 30,
) -> list[dict]:
    """Fetch a module page and list the lectures that expose an audio source.

    Args:
        session: Session used to fetch the module page.
        module: Module dict; its ``"url"`` is fetched and its ``"name"`` is
            used in the log line.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        One ``{"title", "original_filename", "url", "audio_path"}`` dict per
        ``div.lesson`` element that has a ``div.module__name`` and an
        ``audio source`` with a usable ``src``.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    resp = session.get(module["url"], timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    lectures = []
    for lesson_div in soup.select("div.lesson"):
        name_el = lesson_div.select_one("div.module__name")
        source_el = lesson_div.select_one("audio source")
        if name_el is None or source_el is None:
            continue

        src = (source_el.get("src") or "").strip()
        if not src:
            continue

        # Derive the local file name from the path only: a "?query" or
        # "#fragment" is not part of the file name and "?" is not a legal
        # file-name character on Windows.
        path_part = src.partition("#")[0].partition("?")[0]
        filename = path_part.split("/")[-1]
        if not filename:
            # src ends in "/": AUDIO_DIR / "" would be the directory itself.
            continue

        lectures.append({
            "title": name_el.get_text(strip=True),
            "original_filename": filename,
            "url": urljoin(BASE_URL, src),
            "audio_path": str(AUDIO_DIR / filename),
        })

    log.info(" %s: %d lectures", module["name"], len(lectures))
    return lectures
|
|
|
|
|
|
def download_file(session: requests.Session, url: str, dest: Path) -> bool:
    """Stream ``url`` into ``dest``, retrying on errors.

    The body is written to a sibling temp file and moved over ``dest`` only
    once the whole download has been received, so ``dest`` never holds a
    partial file.

    Args:
        session: Session used for the GET request.
        url: Address of the file to download.
        dest: Final location of the downloaded file.

    Returns:
        True if the file was downloaded and moved into place. False if the
        payload was smaller than 1 MB (not retried) or if all
        ``MAX_RETRIES`` attempts raised.
    """
    # Append ".tmp" instead of swapping the suffix so that e.g. "a.mp3" and
    # "a.wav" can never share a temp file.
    tmp = dest.with_name(dest.name + ".tmp")

    for attempt in range(MAX_RETRIES):
        try:
            total = 0
            # The context manager releases the streamed connection even when
            # the transfer fails half-way.
            with session.get(url, stream=True, timeout=300) as resp:
                resp.raise_for_status()
                with open(tmp, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
                        total += len(chunk)

            if total < 1_000_000:  # < 1MB is suspicious
                log.warning("File too small (%d bytes): %s", total, dest.name)
                tmp.unlink(missing_ok=True)
                return False

            # replace() overwrites an existing (e.g. stale, truncated) dest on
            # every platform; rename() raises on Windows in that case.
            tmp.replace(dest)
            log.info(" Downloaded: %s (%.1f MB)", dest.name, total / 1_000_000)
            return True

        except Exception as e:
            # Deliberately broad: any failure of this attempt (HTTP, network,
            # disk) is logged and retried rather than aborting the whole run.
            try:
                tmp.unlink(missing_ok=True)  # drop the partial download
            except OSError:
                pass  # best-effort cleanup; the next attempt truncates it anyway

            wait = RETRY_BACKOFF[attempt] if attempt < len(RETRY_BACKOFF) else 30
            log.warning(
                " Attempt %d/%d failed for %s: %s",
                attempt + 1, MAX_RETRIES, dest.name, e,
            )
            if attempt < MAX_RETRIES - 1:
                log.info(" Retrying in %ss...", wait)
                time.sleep(wait)

    log.error(" FAILED after %d attempts: %s", MAX_RETRIES, dest.name)
    return False
|
|
|
|
|
|
def load_manifest() -> dict | None:
    """Load the manifest written by a previous run.

    The file is read as UTF-8, matching how ``save_manifest`` writes it
    (UTF-8 with ``ensure_ascii=False``); relying on the platform default
    encoding could mis-decode non-ASCII titles.

    Returns:
        The parsed JSON content, or None when ``MANIFEST_PATH`` does not exist.

    Raises:
        json.JSONDecodeError: If the file is not valid JSON.
    """
    try:
        with open(MANIFEST_PATH, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return None
|
|
|
|
|
|
def save_manifest(manifest: dict) -> None:
    """Write ``manifest`` to ``MANIFEST_PATH`` as indented UTF-8 JSON.

    The JSON is written to a sibling temp file and then moved over the real
    path, so an interruption or serialization error mid-write cannot leave a
    truncated manifest behind.

    Args:
        manifest: JSON-serializable manifest structure.
    """
    tmp_path = MANIFEST_PATH.with_name(MANIFEST_PATH.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)
        tmp_path.replace(MANIFEST_PATH)
    except BaseException:
        # Do not leave a half-written temp file around; the previous
        # manifest (if any) is still intact at MANIFEST_PATH.
        tmp_path.unlink(missing_ok=True)
        raise
|
|
|
|
|
|
def _prior_transcribe_statuses() -> dict[str, str]:
    """Return ``{audio_path: transcribe_status}`` from a previous manifest, if readable."""
    try:
        previous = load_manifest()
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        log.warning("Ignoring unreadable %s: %s", MANIFEST_PATH, exc)
        return {}
    if previous is None:
        return {}

    statuses: dict[str, str] = {}
    try:
        for module in previous["modules"]:
            for lecture in module["lectures"]:
                statuses[lecture["audio_path"]] = lecture["transcribe_status"]
    except (KeyError, TypeError):
        log.warning("Ignoring %s: unexpected structure", MANIFEST_PATH)
        return {}
    return statuses


def main() -> None:
    """Log in, discover all lectures, download missing audio, write the manifest.

    Credentials are read from ``COURSE_USERNAME`` / ``COURSE_PASSWORD``
    (loaded via ``load_dotenv``). Files already present and larger than 1 MB
    are skipped. The manifest is saved after every module as a checkpoint.
    A ``transcribe_status`` recorded in an existing manifest is carried over
    instead of being reset to ``"pending"`` on every run.

    Exits with status 1 when credentials are missing, login fails, no
    modules are found, or at least one download failed.
    """
    load_dotenv()
    email = os.getenv("COURSE_USERNAME", "")
    password = os.getenv("COURSE_PASSWORD", "")
    if not email or not password:
        log.error("Set COURSE_USERNAME and COURSE_PASSWORD in .env")
        sys.exit(1)

    AUDIO_DIR.mkdir(exist_ok=True)

    # NOTE(review): assumes a later pipeline stage updates transcribe_status
    # in manifest.json — confirm. Rebuilding the manifest from scratch would
    # otherwise wipe that state on each run.
    prior_status = _prior_transcribe_statuses()
    min_bytes = 1_000_000  # files at or below this size are treated as missing

    manifest = {
        "course": "NLP Master Practitioner Bucuresti 2025",
        "source_url": COURSE_URL,
        "modules": [],
    }
    total_files = downloaded = skipped = failed = 0

    # The context manager closes the session's connections on every exit
    # path, including sys.exit().
    with requests.Session() as session:
        session.headers.update({"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"})

        log.info("Logging in...")
        if not login(session, email, password):
            log.error("Login failed. Check credentials in .env")
            sys.exit(1)
        log.info("Login successful")

        # Discover structure
        modules = discover_modules(session)
        if not modules:
            log.error("No modules found")
            sys.exit(1)

        for mod in modules:
            lectures = discover_lectures(session, mod)
            module_entry = {
                "name": mod["name"],
                "module_id": mod["module_id"],
                "lectures": [],
            }

            for lec in lectures:
                total_files += 1
                dest = Path(lec["audio_path"])
                stem = dest.stem.replace(" [Audio]", "")

                lecture_entry = {
                    "title": lec["title"],
                    "original_filename": lec["original_filename"],
                    "url": lec["url"],
                    "audio_path": lec["audio_path"],
                    "transcript_path": f"transcripts/{stem}.txt",
                    "srt_path": f"transcripts/{stem}.srt",
                    "summary_path": f"summaries/{stem}_summary.md",
                    "download_status": "pending",
                    "transcribe_status": prior_status.get(lec["audio_path"], "pending"),
                    "file_size_bytes": 0,
                }

                # Skip if already downloaded
                existing_size = dest.stat().st_size if dest.exists() else 0
                if existing_size > min_bytes:
                    lecture_entry["download_status"] = "complete"
                    lecture_entry["file_size_bytes"] = existing_size
                    skipped += 1
                    log.info(" Skipping (exists): %s", dest.name)
                elif download_file(session, lec["url"], dest):
                    lecture_entry["download_status"] = "complete"
                    lecture_entry["file_size_bytes"] = dest.stat().st_size
                    downloaded += 1
                else:
                    lecture_entry["download_status"] = "failed"
                    failed += 1

                module_entry["lectures"].append(lecture_entry)

            manifest["modules"].append(module_entry)
            # Save manifest after each module (checkpoint)
            save_manifest(manifest)

    # Final validation: every lecture marked complete must exist on disk and
    # exceed the size threshold.
    all_ok = all(
        Path(lec["audio_path"]).exists() and Path(lec["audio_path"]).stat().st_size > min_bytes
        for mod in manifest["modules"]
        for lec in mod["lectures"]
        if lec["download_status"] == "complete"
    )

    log.info("=" * 60)
    log.info(
        "Downloaded %d/%d files, %d skipped, %d failures.",
        downloaded, total_files, skipped, failed,
    )
    log.info("All files > 1MB: %s", "PASS" if all_ok else "FAIL")
    log.info("=" * 60)

    if failed:
        sys.exit(1)
|
|
|
|
|
|
# Script entry point: run the downloader only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|