Files
nlp-master/download.py
Marius Mutu 2e4bb88624 feat: al 4-lea tip de lecție — PDF (extract text cu pypdf)
Recon-ul pe practitioner M1 arată că unele lecții n-au nici audio nici
Vimeo iframe — doar un link "Descarcă rezumat PDF" (/resurse/*.pdf).
Scraperul vechi le clasifica drept "text" și le marca failed (HTML body
avea <50 chars).

- classify_lesson: detectează acum a[href$=".pdf"] → type="pdf".
- download_pdf_and_extract: download PDF via session autentificat
  (pypdf reader) → transcript .txt cu header + conținut pe pagini →
  șterge PDF sursă (preferință utilizator: nu păstrez sursele).
- Branch în main loop pentru type=="pdf".
- requirements.txt: + pypdf.
- transcribe.py: skip type in ("text", "pdf") — transcript e deja scris
  de download.py.

Limitări: PDF-uri cu conținut vizual (infografice, diagrame) extrag
puțin text. Titlul și textul inline sunt capturate; restul rămâne
pentru review manual.

Testat pe 4 PDF-uri M1 practitioner (Premisele NLP, Forme de Pacing,
Gesturi de calmare, Exercitiu Pacing): 3/4 extract bun (877-3068 bytes),
1/4 conținut predominant grafic (203 bytes).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 23:01:09 +03:00

588 lines
22 KiB
Python

"""
Download all lecture media from a configured course (see courses.py).
Logs in, discovers modules + lectures, downloads whichever media each
lecture exposes, writes <root>/manifest.json. Resumable: skips already-
downloaded files.
Lecture types:
- "audio": <audio source> MP3 on the course CDN -> requests stream download
- "vimeo": <iframe src="...player.vimeo.com/video/ID"> -> yt-dlp
(audio-only HLS track -> MP3 96kbps, no video bytes fetched)
- "text": neither audio nor video -> capture the lecture HTML body as
a plain-text transcript directly (skips whisper entirely)
"""
import argparse
import json
import logging
import os
import re
import sys
import time
from pathlib import Path
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from courses import course_paths, get_course, validate_manifest_course
# Retry policy shared by all downloaders: up to MAX_RETRIES attempts,
# waiting RETRY_BACKOFF[attempt] seconds between them (30s past the list).
MAX_RETRIES = 3
RETRY_BACKOFF = [5, 15, 30]
# Log to the console and mirror everything into a file for post-run triage.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("download_errors.log"),
    ],
)
log = logging.getLogger(__name__)
def login(session: requests.Session, course: dict, email: str, password: str) -> bool:
    """Authenticate against the course site; return True iff login succeeded."""
    payload = {
        "email": email,
        "password": password,
        "act": "login",
        "remember": "on",
    }
    resp = session.post(course["login_url"], data=payload, allow_redirects=True)
    # A failed login either redirects back to /login or re-renders the form.
    failed = "/login" in resp.url or "loginform" in resp.text
    return not failed
def parse_module_filter(arg: str) -> set[int]:
    """Parse a '1-3' / '2,4,5' style selection string into a set of module numbers."""
    selected: set[int] = set()
    for token in (piece.strip() for piece in arg.split(",")):
        if "-" in token:
            lo, hi = token.split("-", 1)
            selected |= set(range(int(lo), int(hi) + 1))
        else:
            selected.add(int(token))
    return selected
def discover_modules(session: requests.Session, course: dict) -> list[dict]:
    """Scrape the course landing page and return [{name, url, module_id}, ...]."""
    resp = session.get(course["course_url"])
    resp.raise_for_status()
    page = BeautifulSoup(resp.text, "html.parser")
    modules: list[dict] = []
    for block in page.select("div.module"):
        number_el = block.select_one("div.module__number")
        link_el = block.select_one("a.btn")
        if number_el is None or link_el is None:
            continue  # decorative div without a lesson link
        href = link_el.get("href", "")
        modules.append({
            "name": number_el.get_text(strip=True),
            "url": urljoin(course["base_url"], href),
            # Last non-empty path segment acts as the module id.
            "module_id": href.rstrip("/").split("/")[-1],
        })
    log.info(f"Found {len(modules)} modules")
    if not modules:
        # Either the markup changed or the session isn't authenticated; abort.
        log.error("No modules found on course page — selectors mismatch or not logged in")
        sys.exit(1)
    return modules
VIMEO_ID_RE = re.compile(r"player\.vimeo\.com/video/(\d+)", re.I)
def slugify(text: str) -> str:
    """Filesystem-safe slug for text lectures (no URL-derived filename)."""
    lowered = text.strip().lower()
    # Drop punctuation, then collapse whitespace/underscores/hyphens to "_".
    cleaned = re.sub(r"[^\w\s-]", "", lowered, flags=re.UNICODE)
    slug = re.sub(r"[\s_-]+", "_", cleaned)[:80]
    return slug if slug else "untitled"
def derived_stem(filename: str) -> str:
    """
    Stem used for transcript/srt/summary paths.
    The ' [Audio]' suffix used on curs master (aresens) filenames is removed
    so derived paths stay short and backward-compatible with M1-M6.
    """
    stem = Path(filename).stem
    return stem.replace(" [Audio]", "")
def classify_lesson(lesson_div, base_url: str) -> tuple[str, str, str]:
    """
    Return (lecture_type, media_url_or_empty, filename_stem).
    Types:
    - ("audio", mp3_url, filename_from_url)
    - ("vimeo", vimeo_url, "vimeo_<id>")
    - ("pdf", pdf_url, filename_from_url)   # only attachment is a PDF
    - ("text", "", slug_from_title)         # no media or PDF found
    """
    # 1) Hosted MP3: <audio><source src=...> with a non-empty src.
    audio = lesson_div.select_one("audio source")
    if audio:
        src = audio.get("src", "").strip()
        if src:
            stem = src.split("/")[-1].rsplit(".", 1)[0]
            return "audio", urljoin(base_url, src), stem
    # 2) Embedded Vimeo player (src may be lazy-loaded via data-src).
    iframe = lesson_div.select_one("iframe")
    if iframe:
        embed_src = (iframe.get("src") or iframe.get("data-src") or "").strip()
        match = VIMEO_ID_RE.search(embed_src)
        if match:
            vid = match.group(1)
            # Canonical player URL works with yt-dlp + referer.
            return "vimeo", f"https://player.vimeo.com/video/{vid}", f"vimeo_{vid}"
    # 3) PDF-only lecture: first attachment link ending in .pdf wins.
    for anchor in lesson_div.select("a[href]"):
        href = (anchor.get("href") or "").strip()
        if not href.lower().endswith(".pdf"):
            continue
        stem = href.rsplit("/", 1)[-1].rsplit(".", 1)[0]
        return "pdf", urljoin(base_url, href), stem
    # 4) Plain text lesson; the caller fills the stem from the title slug.
    return "text", "", ""
def discover_lectures(session: requests.Session, module: dict, course: dict) -> list[dict]:
    """Scrape one module page and return lecture dicts ready for downloading."""
    resp = session.get(module["url"])
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    audio_dir = course_paths(course)["audio_dir"]
    lectures: list[dict] = []
    for lesson_div in soup.select("div.lesson"):
        name_el = lesson_div.select_one("div.module__name")
        title = name_el.get_text(strip=True) if name_el else ""
        if not title:
            continue  # unnamed divs aren't real lessons
        ltype, media_url, stem = classify_lesson(lesson_div, course["base_url"])
        if ltype == "text":
            # Inline lesson: capture the body text now so it never needs
            # to be re-requested later.
            body_el = lesson_div.select_one("div.module__content") or lesson_div
            lecture = {
                "type": "text",
                "title": title,
                "original_filename": slugify(title) + ".txt",
                "url": module["url"],  # lesson is inline in module page
                "audio_path": "",  # no audio
                "text_content": body_el.get_text("\n", strip=True),
            }
        elif ltype == "pdf":
            # No audio for PDF lectures: "audio_path" doubles as the PDF
            # cache path, and the file is deleted after extraction
            # (user preference: sources are not kept).
            lecture = {
                "type": "pdf",
                "title": title,
                "original_filename": f"{stem}.pdf",
                "url": media_url,
                "audio_path": str(audio_dir / f"{stem}.pdf"),
            }
        elif ltype == "vimeo":
            # yt-dlp's extract-audio step writes an .mp3 next to the stem.
            lecture = {
                "type": "vimeo",
                "title": title,
                "original_filename": f"{stem}.mp3",
                "url": media_url,
                "audio_path": str(audio_dir / f"{stem}.mp3"),
            }
        else:  # "audio"
            # Keep the original filename (may contain spaces).
            filename = media_url.split("/")[-1]
            lecture = {
                "type": "audio",
                "title": title,
                "original_filename": filename,
                "url": media_url,
                "audio_path": str(audio_dir / filename),
            }
        lectures.append(lecture)
    counts = {t: 0 for t in ("audio", "vimeo", "pdf", "text")}
    for lec in lectures:
        counts[lec["type"]] += 1
    log.info(
        f" {module['name']}: {len(lectures)} lectures "
        f"(audio={counts['audio']}, vimeo={counts['vimeo']}, "
        f"pdf={counts['pdf']}, text={counts['text']})"
    )
    return lectures
def download_audio_http(session: requests.Session, url: str, dest: Path) -> bool:
    """
    Stream *url* to *dest* over HTTP with retry/backoff.

    Writes to a '<dest>.tmp' sidecar first and renames on success so an
    interrupted run never leaves a truncated file at the final path.
    A body under 1 MB is treated as an error page rather than audio and
    fails immediately (no retry). Returns True on success.

    Fixes vs. previous version: the streamed response is now closed on
    every path (it leaked the connection on the too-small early return),
    and a partial .tmp file is removed when an exception interrupts the
    write instead of being left on disk.
    """
    tmp = dest.with_suffix(dest.suffix + ".tmp")
    for attempt in range(MAX_RETRIES):
        try:
            # Context manager guarantees the streamed connection is released.
            with session.get(url, stream=True, timeout=300) as resp:
                resp.raise_for_status()
                total = 0
                with open(tmp, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
                        total += len(chunk)
            if total < 1_000_000:
                log.warning(f"File too small ({total} bytes): {dest.name}")
                tmp.unlink(missing_ok=True)
                return False
            tmp.rename(dest)
            log.info(f" Downloaded: {dest.name} ({total / 1_000_000:.1f} MB)")
            return True
        except Exception as e:
            # Don't leave partial .tmp files behind between attempts.
            tmp.unlink(missing_ok=True)
            wait = RETRY_BACKOFF[attempt] if attempt < len(RETRY_BACKOFF) else 30
            log.warning(f" Attempt {attempt + 1}/{MAX_RETRIES} failed for {dest.name}: {e}")
            if attempt < MAX_RETRIES - 1:
                log.info(f" Retrying in {wait}s...")
                time.sleep(wait)
    log.error(f" FAILED after {MAX_RETRIES} attempts: {dest.name}")
    return False
def download_vimeo_audio(vimeo_url: str, referer: str, dest: Path) -> bool:
    """
    Download Vimeo audio-only stream via yt-dlp, output as MP3 96kbps.
    No video bytes fetched (Vimeo HLS has separate audio tracks).
    """
    try:
        import yt_dlp
    except ImportError:
        log.error("yt-dlp not installed. Run: pip install yt-dlp")
        return False
    dest.parent.mkdir(parents=True, exist_ok=True)
    # yt-dlp appends the final extension after postprocessing, so it gets
    # only the stem of the target path.
    options = {
        "format": "bestaudio",
        "outtmpl": str(dest.with_suffix("")) + ".%(ext)s",
        "http_headers": {"Referer": referer},
        "quiet": True,
        "no_warnings": True,
        "postprocessors": [{
            "key": "FFmpegExtractAudio",
            "preferredcodec": "mp3",
            "preferredquality": "96",
        }],
    }
    for attempt in range(MAX_RETRIES):
        try:
            with yt_dlp.YoutubeDL(options) as ydl:
                ydl.download([vimeo_url])
            if dest.exists() and dest.stat().st_size > 100_000:
                size_mb = dest.stat().st_size / 1_000_000
                log.info(f" Downloaded (vimeo): {dest.name} ({size_mb:.1f} MB)")
                return True
            log.warning(f" yt-dlp produced no file or too small: {dest}")
        except Exception as e:
            wait = RETRY_BACKOFF[attempt] if attempt < len(RETRY_BACKOFF) else 30
            log.warning(f" Vimeo attempt {attempt + 1}/{MAX_RETRIES} failed: {e}")
            if attempt < MAX_RETRIES - 1:
                log.info(f" Retrying in {wait}s...")
                time.sleep(wait)
    log.error(f" FAILED vimeo download after {MAX_RETRIES} attempts: {vimeo_url}")
    return False
def capture_text_lecture(lecture: dict, transcripts_dir: Path) -> bool:
    """
    Persist a text lecture's captured HTML body as its final transcript.
    Text lectures never go through whisper — what is written here is final.
    """
    transcripts_dir.mkdir(parents=True, exist_ok=True)
    body = lecture.get("text_content", "").strip()
    if len(body) < 50:
        # Almost certainly a scraping miss, not a real lesson body.
        log.warning(f" text lesson '{lecture['title']}' has <50 chars, skipping")
        return False
    out_path = transcripts_dir / (Path(lecture["original_filename"]).stem + ".txt")
    header = f"[TEXT LECTURE — no audio]\nTitlu: {lecture['title']}\n\n"
    out_path.write_text(header + body, encoding="utf-8")
    log.info(f" Captured (text): {out_path.name} ({out_path.stat().st_size} bytes)")
    return True
def download_pdf_and_extract(session: requests.Session, lecture: dict,
                             pdf_cache: Path, transcripts_dir: Path) -> bool:
    """
    Download PDF resource via authenticated session, extract text via pypdf,
    write as transcript .txt. Delete PDF after extraction (no source retention).

    Params:
        session: authenticated requests session (PDF is behind login).
        lecture: lecture dict; reads "url", "original_filename", "title".
        pdf_cache: where the PDF is written temporarily (caller passes the
            lecture's "audio_path", reused for this purpose).
        transcripts_dir: output dir for the extracted-text transcript.
    Returns True only when both download and extraction succeed.
    """
    # Import lazily so the rest of the script works without pypdf installed.
    try:
        from pypdf import PdfReader
    except ImportError:
        log.error("pypdf not installed. Run: pip install pypdf")
        return False
    pdf_cache.parent.mkdir(parents=True, exist_ok=True)
    transcripts_dir.mkdir(parents=True, exist_ok=True)
    # Download PDF (resource is small — typically <5 MB rezumat)
    for attempt in range(MAX_RETRIES):
        try:
            resp = session.get(lecture["url"], stream=True, timeout=120)
            resp.raise_for_status()
            # Write to a .tmp sidecar, rename only after a full download.
            tmp = pdf_cache.with_suffix(".pdf.tmp")
            total = 0
            with open(tmp, "wb") as f:
                for chunk in resp.iter_content(chunk_size=256 * 1024):
                    f.write(chunk)
                    total += len(chunk)
            # <1 KB is an error page, not a PDF — fail permanently (no retry).
            if total < 1000:
                log.warning(f" PDF too small ({total} bytes): {pdf_cache.name}")
                tmp.unlink(missing_ok=True)
                return False
            tmp.rename(pdf_cache)
            log.info(f" Downloaded (pdf): {pdf_cache.name} ({total / 1024:.0f} KB)")
            break
        except Exception as e:
            wait = RETRY_BACKOFF[attempt] if attempt < len(RETRY_BACKOFF) else 30
            log.warning(f" PDF attempt {attempt + 1}/{MAX_RETRIES} failed: {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(wait)
            else:
                log.error(f" FAILED PDF download: {lecture['url']}")
                return False
    # Extract text page by page; pages with no extractable text are skipped.
    stem = Path(lecture["original_filename"]).stem
    txt_path = transcripts_dir / f"{stem}.txt"
    try:
        reader = PdfReader(str(pdf_cache))
        pages_text = []
        for i, page in enumerate(reader.pages, 1):
            t = page.extract_text() or ""
            if t.strip():
                pages_text.append(f"--- pagina {i} ---\n{t.strip()}")
        body = "\n\n".join(pages_text).strip()
    except Exception as e:
        log.error(f" pypdf extract failed on {pdf_cache.name}: {e}")
        return False
    # Mostly-graphical PDFs yield almost no text: keep the PDF for manual
    # review instead of writing a useless transcript.
    if len(body) < 50:
        log.warning(f" PDF '{lecture['title']}' extracted <50 chars; keeping file, skipping transcript")
        return False
    header = f"[PDF LECTURE — extracted from {lecture['url']}]\nTitlu: {lecture['title']}\n\n"
    txt_path.write_text(header + body, encoding="utf-8")
    log.info(f" Extracted (pdf): {txt_path.name} ({txt_path.stat().st_size} bytes, {len(reader.pages)} pages)")
    # Delete source PDF (user preference: sources are not kept)
    pdf_cache.unlink(missing_ok=True)
    return True
def load_manifest(manifest_path: Path) -> dict | None:
if manifest_path.exists():
with open(manifest_path, encoding="utf-8") as f:
return json.load(f)
return None
def save_manifest(manifest: dict, manifest_path: Path):
    """Write the manifest as pretty-printed UTF-8 JSON, creating parent dirs."""
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(manifest, indent=2, ensure_ascii=False)
    manifest_path.write_text(payload, encoding="utf-8")
def parse_args():
    """CLI: --course picks the course key; --modules optionally filters modules."""
    parser = argparse.ArgumentParser(description="Download lecture media for a course")
    parser.add_argument("--course", default="master", help="Course key (see courses.py)")
    parser.add_argument("--modules", default=None, help="Module filter, e.g. '1-3' or '2,4,5'")
    return parser.parse_args()
def main():
    """
    Entry point: log in, discover modules + lectures, fetch each lecture's
    media (stream MP3 / yt-dlp Vimeo audio / capture text / extract PDF),
    and persist an updated manifest.json after every module.
    Exits non-zero on missing credentials, login failure, or any failed item.
    """
    args = parse_args()
    course = get_course(args.course)
    paths = course_paths(course)
    load_dotenv()
    # Credentials come from .env; env var names are per-course.
    email = os.getenv(course["env_user"], "")
    password = os.getenv(course["env_pass"], "")
    if not email or not password:
        log.error(f"Set {course['env_user']} and {course['env_pass']} in .env")
        sys.exit(1)
    module_filter = parse_module_filter(args.modules) if args.modules else None
    if module_filter:
        log.info(f"Module filter: {sorted(module_filter)}")
    paths["audio_dir"].mkdir(parents=True, exist_ok=True)
    paths["transcripts_dir"].mkdir(parents=True, exist_ok=True)
    # Validate existing manifest belongs to this course
    existing = load_manifest(paths["manifest"])
    if existing is not None:
        validate_manifest_course(existing, course["key"])
    session = requests.Session()
    session.headers.update({"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"})
    log.info(f"Course: {course['key']} ({course['name']})")
    log.info(f"Root: {paths['root']}")
    log.info("Logging in...")
    if not login(session, course, email, password):
        log.error("Login failed. Check credentials in .env")
        sys.exit(1)
    log.info("Login successful")
    modules = discover_modules(session, course)
    # Start from existing manifest if present — preserves modules outside
    # the current --modules filter, and preserves per-lecture state (e.g.
    # transcribe_status) for modules in the filter.
    if existing:
        manifest = dict(existing)
        manifest["course_key"] = course["key"]
        manifest["course"] = course["name"]
        manifest["source_url"] = course["course_url"]
        if "modules" not in manifest:
            manifest["modules"] = []
    else:
        manifest = {
            "course_key": course["key"],
            "course": course["name"],
            "source_url": course["course_url"],
            "modules": [],
        }
    # Index of existing modules by name for in-place replacement.
    existing_by_name = {m["name"]: i for i, m in enumerate(manifest["modules"])}
    # Prior lecture state (by title) for preserving transcribe_status.
    prior_lecture_state: dict[str, dict] = {
        lec["title"]: lec
        for m in manifest["modules"]
        for lec in m.get("lectures", [])
    }
    # Run counters for the final summary line.
    total = 0
    downloaded = 0
    skipped = 0
    failed = 0
    for mod_idx, mod in enumerate(modules, 1):
        if module_filter and mod_idx not in module_filter:
            log.info(f" Skipping module {mod_idx}: {mod['name']} (outside --modules filter, preserved in manifest)")
            continue
        lectures = discover_lectures(session, mod, course)
        module_entry = {
            "name": mod["name"],
            "module_id": mod["module_id"],
            "lectures": [],
        }
        for lec in lectures:
            total += 1
            stem = derived_stem(lec["original_filename"])
            prior = prior_lecture_state.get(lec["title"], {})
            # Manifest entry; derived paths fall back to convention when
            # no prior run recorded them.
            entry = {
                "type": lec["type"],
                "title": lec["title"],
                "original_filename": lec["original_filename"],
                "url": lec["url"],
                "audio_path": lec["audio_path"],
                "transcript_path": prior.get("transcript_path") or f"{paths['transcripts_dir'].as_posix()}/{stem}.txt",
                "srt_path": prior.get("srt_path") or f"{paths['transcripts_dir'].as_posix()}/{stem}.srt",
                "summary_path": prior.get("summary_path") or f"{paths['summaries_dir'].as_posix()}/{stem}_summary.md",
                "download_status": "pending",
                # Preserve transcribe_status from prior run (disk check in transcribe.py will correct it if needed).
                "transcribe_status": prior.get("transcribe_status", "pending"),
                "file_size_bytes": 0,
            }
            if lec["type"] == "text":
                # Captured directly; treated as already-transcribed.
                txt_path = Path(entry["transcript_path"])
                if txt_path.exists() and txt_path.stat().st_size > 50:
                    entry["download_status"] = "complete"
                    entry["transcribe_status"] = "complete"
                    skipped += 1
                    log.info(f" Skipping text (exists): {txt_path.name}")
                elif capture_text_lecture(lec, paths["transcripts_dir"]):
                    entry["download_status"] = "complete"
                    entry["transcribe_status"] = "complete"
                    entry["file_size_bytes"] = txt_path.stat().st_size
                    downloaded += 1
                else:
                    entry["download_status"] = "failed"
                    failed += 1
            elif lec["type"] == "pdf":
                # PDF -> download, extract text, save as transcript, delete source.
                txt_path = Path(entry["transcript_path"])
                if txt_path.exists() and txt_path.stat().st_size > 50:
                    entry["download_status"] = "complete"
                    entry["transcribe_status"] = "complete"
                    skipped += 1
                    log.info(f" Skipping pdf (transcript exists): {txt_path.name}")
                elif download_pdf_and_extract(session, lec, Path(lec["audio_path"]),
                                              paths["transcripts_dir"]):
                    entry["download_status"] = "complete"
                    entry["transcribe_status"] = "complete"
                    entry["file_size_bytes"] = txt_path.stat().st_size if txt_path.exists() else 0
                    downloaded += 1
                else:
                    entry["download_status"] = "failed"
                    failed += 1
            else:
                # "audio" and "vimeo" both produce a local audio file.
                dest = Path(lec["audio_path"])
                if dest.exists() and dest.stat().st_size > 1_000_000:
                    entry["download_status"] = "complete"
                    entry["file_size_bytes"] = dest.stat().st_size
                    skipped += 1
                    log.info(f" Skipping (exists): {dest.name}")
                else:
                    if lec["type"] == "audio":
                        ok = download_audio_http(session, lec["url"], dest)
                    else:  # "vimeo"
                        ok = download_vimeo_audio(lec["url"], course["base_url"] + "/", dest)
                    if ok:
                        entry["download_status"] = "complete"
                        entry["file_size_bytes"] = dest.stat().st_size
                        downloaded += 1
                    else:
                        entry["download_status"] = "failed"
                        failed += 1
            module_entry["lectures"].append(entry)
        # Replace or append module in manifest (preserves order for existing, appends new at end).
        if mod["name"] in existing_by_name:
            manifest["modules"][existing_by_name[mod["name"]]] = module_entry
        else:
            manifest["modules"].append(module_entry)
        # Persist after every module so an interrupted run loses little work.
        save_manifest(manifest, paths["manifest"])
    log.info("=" * 60)
    log.info(f"Downloaded {downloaded}/{total} items, {skipped} skipped, {failed} failures.")
    log.info("=" * 60)
    if failed:
        sys.exit(1)
if __name__ == "__main__":
main()