NLP Master: pipeline download + transcribe + summarize
- run.bat: one-click pipeline (download, convert, transcribe)
- download.py: fetch audio from course platform
- transcribe.py: whisper.cpp batch transcription (CPU, WAV 16kHz)
- MP3->WAV conversion via ffmpeg
- --modules filter for splitting work across machines
- summarize.py: generate summaries from transcripts
- setup_whisper.py: auto-download whisper.cpp, ffmpeg, and model
- Medium model (q5_0) instead of large to avoid VRAM crashes

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
253
download.py
Normal file
253
download.py
Normal file
@@ -0,0 +1,253 @@
|
||||
"""
|
||||
Download all audio files from cursuri.aresens.ro NLP Master course.
|
||||
Logs in, discovers modules and lectures, downloads MP3s, writes manifest.json.
|
||||
Resumable: skips already-downloaded files.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Root of the course platform; the other URLs are derived from it.
BASE_URL = "https://cursuri.aresens.ro"
COURSE_URL = BASE_URL + "/curs/26"  # course page that lists the modules
LOGIN_URL = BASE_URL + "/login"  # endpoint the credentials are POSTed to

# Output locations, relative to the current working directory.
AUDIO_DIR = Path("audio")
MANIFEST_PATH = Path("manifest.json")

# Retry policy for file downloads: at most MAX_RETRIES attempts, sleeping
# RETRY_BACKOFF[i] seconds after failed attempt i (no sleep after the last).
MAX_RETRIES = 3
RETRY_BACKOFF = [5, 15, 30]
|
||||
|
||||
# Log INFO and above to both the console and download_errors.log.
# The log file is opened as UTF-8 explicitly: module and file names are
# logged, and the platform default encoding (e.g. cp1252 on Windows) cannot
# represent every character they may contain.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("download_errors.log", encoding="utf-8"),
    ],
)
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def login(session: requests.Session, email: str, password: str) -> bool:
    """Log in to the course platform using ``session``.

    POSTs the credentials to LOGIN_URL and follows redirects.

    Args:
        session: Session that is reused for all later requests.
        email: Account e-mail address.
        password: Account password.

    Returns:
        True when the login appears to have succeeded. False when the
        server answers with an HTTP error status, when the final URL still
        contains "/login", or when the returned page contains "loginform".

    Raises:
        requests.RequestException: On a network failure or timeout.
    """
    resp = session.post(
        LOGIN_URL,
        data={
            "email": email,
            "password": password,
            "act": "login",
            "remember": "on",
        },
        allow_redirects=True,
        # Without a timeout a stalled server would hang the script forever.
        timeout=30,
    )
    if not resp.ok:
        # An error page does not contain the login form, so it would
        # otherwise be mistaken for a successful login.
        log.warning(f"Login request returned HTTP {resp.status_code}")
        return False
    # Successful login redirects to the course page, not back to /login
    if "/login" in resp.url or "loginform" in resp.text:
        return False
    return True
|
||||
|
||||
|
||||
def discover_modules(session: requests.Session) -> list[dict]:
    """Fetch the course page and list the modules it links to.

    Every ``div.module`` element that has both a ``div.module__number``
    child and an ``a.btn`` link with a non-empty ``href`` becomes one entry.

    Args:
        session: Logged-in session used for the request.

    Returns:
        A list of dicts with the keys ``name`` (text of the number element),
        ``url`` (absolute module URL) and ``module_id`` (last path segment
        of the link).

    Raises:
        requests.RequestException: On a network failure, timeout, or an
            HTTP error status.
    """
    # Without a timeout a stalled server would hang the script forever.
    resp = session.get(COURSE_URL, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    modules = []
    for div in soup.select("div.module"):
        number_el = div.select_one("div.module__number")
        link_el = div.select_one("a.btn")
        if number_el is None or link_el is None:
            continue
        href = link_el.get("href", "").strip()
        if not href:
            # An empty target would resolve to BASE_URL with a blank id.
            continue
        modules.append({
            "name": number_el.get_text(strip=True),
            "url": urljoin(BASE_URL, href),
            "module_id": href.rstrip("/").split("/")[-1],
        })
    log.info(f"Found {len(modules)} modules")
    return modules
|
||||
|
||||
|
||||
def discover_lectures(session: requests.Session, module: dict) -> list[dict]:
    """Fetch a module page and list its lectures that have an audio source.

    Every ``div.lesson`` element that has both a ``div.module__name`` child
    and an ``audio source`` element with a non-empty ``src`` becomes one
    entry.

    Args:
        session: Logged-in session used for the request.
        module: Module dict; its ``url`` and ``name`` keys are read.

    Returns:
        A list of dicts with the keys ``title``, ``original_filename``,
        ``url`` (absolute audio URL) and ``audio_path`` (target path inside
        AUDIO_DIR, as a string).

    Raises:
        requests.RequestException: On a network failure, timeout, or an
            HTTP error status.
    """
    # Imported locally so this function does not depend on an edit to the
    # file-level import block.
    from urllib.parse import urlsplit

    # Without a timeout a stalled server would hang the script forever.
    resp = session.get(module["url"], timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    lectures = []
    for lesson_div in soup.select("div.lesson"):
        name_el = lesson_div.select_one("div.module__name")
        source_el = lesson_div.select_one("audio source")
        if name_el is None or source_el is None:
            continue
        src = source_el.get("src", "").strip()
        if not src:
            continue
        # Name the local file after the URL path only, so a query string or
        # fragment never ends up in the file name ("?" is not even a legal
        # file-name character on Windows).
        filename = urlsplit(src).path.split("/")[-1]
        if not filename:
            # src ends in "/" - there is nothing to name the file after.
            continue
        lectures.append({
            "title": name_el.get_text(strip=True),
            "original_filename": filename,
            "url": urljoin(BASE_URL, src),
            "audio_path": str(AUDIO_DIR / filename),
        })
    log.info(f" {module['name']}: {len(lectures)} lectures")
    return lectures
|
||||
|
||||
|
||||
def download_file(session: requests.Session, url: str, dest: Path) -> bool:
    """Download ``url`` to ``dest``, retrying on errors.

    The body is streamed into a sibling ``<dest name>.tmp`` file and moved
    over ``dest`` only after it has been written completely, so ``dest``
    never holds a partial download.

    Args:
        session: Logged-in session used for the request.
        url: Absolute URL of the file.
        dest: Final path of the downloaded file.

    Returns:
        True when the file was downloaded and moved into place. False when
        the body was smaller than 1,000,000 bytes (treated as suspicious and
        not retried) or when all MAX_RETRIES attempts raised an error.
    """
    # Appending to the full name (instead of swapping the suffix) keeps the
    # temp name unique per destination, whatever dots the name contains.
    tmp = dest.with_name(dest.name + ".tmp")

    for attempt in range(MAX_RETRIES):
        try:
            total = 0
            # The context manager closes the streamed response even when
            # the transfer fails half-way, releasing the connection.
            with session.get(url, stream=True, timeout=300) as resp:
                resp.raise_for_status()
                with open(tmp, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
                        total += len(chunk)

            if total < 1_000_000:  # < 1MB is suspicious
                log.warning(f"File too small ({total} bytes): {dest.name}")
                tmp.unlink(missing_ok=True)
                return False

            # replace() overwrites an existing dest on every platform;
            # rename() raises FileExistsError on Windows, which made the
            # re-download of an undersized existing file fail every time.
            tmp.replace(dest)
            log.info(f" Downloaded: {dest.name} ({total / 1_000_000:.1f} MB)")
            return True

        except Exception as e:
            # Deliberately broad: any failure of one attempt is logged and
            # retried instead of aborting the whole run.
            try:
                tmp.unlink(missing_ok=True)
            except OSError:
                pass  # best-effort cleanup; the next attempt truncates it
            wait = RETRY_BACKOFF[attempt] if attempt < len(RETRY_BACKOFF) else 30
            log.warning(f" Attempt {attempt + 1}/{MAX_RETRIES} failed for {dest.name}: {e}")
            if attempt < MAX_RETRIES - 1:
                log.info(f" Retrying in {wait}s...")
                time.sleep(wait)

    log.error(f" FAILED after {MAX_RETRIES} attempts: {dest.name}")
    return False
|
||||
|
||||
|
||||
def load_manifest() -> dict | None:
    """Load the manifest stored at MANIFEST_PATH.

    Returns:
        The parsed JSON content, or None when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file is not valid JSON.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    if not MANIFEST_PATH.exists():
        return None
    # save_manifest writes UTF-8 with ensure_ascii=False, so the file must be
    # read as UTF-8 too; the platform default (e.g. cp1252 on Windows) would
    # garble or reject non-ASCII titles.
    with open(MANIFEST_PATH, encoding="utf-8") as f:
        return json.load(f)
|
||||
|
||||
|
||||
def save_manifest(manifest: dict) -> None:
    """Write ``manifest`` to MANIFEST_PATH as indented UTF-8 JSON.

    The JSON is written to a sibling ``.tmp`` file first and then moved over
    MANIFEST_PATH, so an interruption in the middle of a write cannot leave
    a truncated manifest behind.

    Args:
        manifest: JSON-serialisable manifest structure.
    """
    tmp = MANIFEST_PATH.with_name(MANIFEST_PATH.name + ".tmp")
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)
    # replace() overwrites the previous manifest on every platform.
    tmp.replace(MANIFEST_PATH)
|
||||
|
||||
|
||||
def _index_previous_lectures() -> dict[str, dict]:
    """Return the lecture entries of the existing manifest keyed by audio_path.

    Best effort: a missing, unreadable or unexpectedly shaped manifest
    yields an empty (or partial) index instead of an error.
    """
    try:
        previous = load_manifest()
    except (OSError, ValueError) as e:
        # ValueError covers both invalid JSON and undecodable bytes.
        log.warning(f"Ignoring unreadable {MANIFEST_PATH}: {e}")
        return {}

    index: dict[str, dict] = {}
    if not isinstance(previous, dict):
        return index
    modules = previous.get("modules")
    for mod in modules if isinstance(modules, list) else []:
        lectures = mod.get("lectures") if isinstance(mod, dict) else None
        for lec in lectures if isinstance(lectures, list) else []:
            if isinstance(lec, dict) and isinstance(lec.get("audio_path"), str):
                index[lec["audio_path"]] = lec
    return index


def main() -> None:
    """Log in, discover all lectures, download missing audio, write the manifest.

    Credentials are read from the COURSE_USERNAME and COURSE_PASSWORD
    environment variables (after loading ``.env``). Audio files that already
    exist with at least 1,000,000 bytes are skipped. The manifest is saved
    after every module as a checkpoint.

    Exits with status 1 when credentials are missing, the login fails, no
    modules are found, or at least one download failed.
    """
    # Smallest size accepted as a real audio file; download_file rejects
    # anything below 1,000,000 bytes, so exactly that size must count as OK.
    min_size = 1_000_000

    load_dotenv()
    email = os.getenv("COURSE_USERNAME", "")
    password = os.getenv("COURSE_PASSWORD", "")
    if not email or not password:
        log.error("Set COURSE_USERNAME and COURSE_PASSWORD in .env")
        sys.exit(1)

    AUDIO_DIR.mkdir(exist_ok=True)

    session = requests.Session()
    session.headers.update({"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"})

    log.info("Logging in...")
    if not login(session, email, password):
        log.error("Login failed. Check credentials in .env")
        sys.exit(1)
    log.info("Login successful")

    # Discover structure
    modules = discover_modules(session)
    if not modules:
        log.error("No modules found")
        sys.exit(1)

    # Entries of the previous run, used to carry transcribe_status forward.
    previous_lectures = _index_previous_lectures()

    manifest = {
        "course": "NLP Master Practitioner Bucuresti 2025",
        "source_url": COURSE_URL,
        "modules": [],
    }

    total_files = 0
    downloaded = 0
    skipped = 0
    failed = 0

    for mod in modules:
        lectures = discover_lectures(session, mod)
        module_entry = {
            "name": mod["name"],
            "module_id": mod["module_id"],
            "lectures": [],
        }

        for lec in lectures:
            total_files += 1
            dest = Path(lec["audio_path"])
            stem = dest.stem.replace(" [Audio]", "")

            lecture_entry = {
                "title": lec["title"],
                "original_filename": lec["original_filename"],
                "url": lec["url"],
                "audio_path": lec["audio_path"],
                "transcript_path": f"transcripts/{stem}.txt",
                "srt_path": f"transcripts/{stem}.srt",
                "summary_path": f"summaries/{stem}_summary.md",
                "download_status": "pending",
                "transcribe_status": "pending",
                "file_size_bytes": 0,
            }

            existing_size = dest.stat().st_size if dest.exists() else 0
            if existing_size >= min_size:
                # Skip if already downloaded
                lecture_entry["download_status"] = "complete"
                lecture_entry["file_size_bytes"] = existing_size
                # The audio is untouched, so keep the status recorded by an
                # earlier run instead of resetting it to "pending".
                # NOTE(review): presumably the transcription stage updates
                # this field in manifest.json - confirm against transcribe.py.
                previous = previous_lectures.get(lec["audio_path"], {})
                lecture_entry["transcribe_status"] = previous.get(
                    "transcribe_status", "pending"
                )
                skipped += 1
                log.info(f" Skipping (exists): {dest.name}")
            elif download_file(session, lec["url"], dest):
                lecture_entry["download_status"] = "complete"
                lecture_entry["file_size_bytes"] = dest.stat().st_size
                downloaded += 1
            else:
                lecture_entry["download_status"] = "failed"
                failed += 1

            module_entry["lectures"].append(lecture_entry)

        manifest["modules"].append(module_entry)
        # Save manifest after each module (checkpoint)
        save_manifest(manifest)

    # Final validation: every entry marked complete must exist on disk with
    # an acceptable size.
    all_ok = all(
        Path(lec["audio_path"]).exists()
        and Path(lec["audio_path"]).stat().st_size >= min_size
        for mod in manifest["modules"]
        for lec in mod["lectures"]
        if lec["download_status"] == "complete"
    )

    log.info("=" * 60)
    log.info(f"Downloaded {downloaded}/{total_files} files, {skipped} skipped, {failed} failures.")
    log.info(f"All files > 1MB: {'PASS' if all_ok else 'FAIL'}")
    log.info("=" * 60)

    if failed:
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user