feat: al 4-lea tip de lecție — PDF (extract text cu pypdf)
Recon-ul pe practitioner M1 arată că unele lecții n-au nici audio nici
Vimeo iframe — doar un link "Descarcă rezumat PDF" (/resurse/*.pdf).
Scraperul vechi le clasifica drept "text" și le marca failed (HTML body
avea <50 chars).
- classify_lesson: detectează acum a[href$=".pdf"] → type="pdf".
- download_pdf_and_extract: download PDF via session autentificat
(pypdf reader) → transcript .txt cu header + conținut pe pagini →
șterge PDF sursă (preferință utilizator: nu păstrez sursele).
- Branch în main loop pentru type=="pdf".
- requirements.txt: + pypdf.
- transcribe.py: skip type in ("text", "pdf") — transcript e deja scris
de download.py.
Limitări: PDF-uri cu conținut vizual (infografice, diagrame) extrag
puțin text. Titlul și textul inline sunt capturate; restul rămâne
pentru review manual.
Testat pe 4 PDF-uri M1 practitioner (Premisele NLP, Forme de Pacing,
Gesturi de calmare, Exercițiu Pacing): 3/4 extract bun (877-3068 bytes),
1/4 conținut predominant grafic (203 bytes).
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
116
download.py
116
download.py
@@ -119,7 +119,8 @@ def classify_lesson(lesson_div, base_url: str) -> tuple[str, str, str]:
|
||||
Types:
|
||||
- ("audio", mp3_url, filename_from_url)
|
||||
- ("vimeo", vimeo_url, "vimeo_<id>")
|
||||
- ("text", "", slug_from_title) # no media found
|
||||
- ("pdf", pdf_url, filename_from_url) # only attachment is a PDF
|
||||
- ("text", "", slug_from_title) # no media or PDF found
|
||||
"""
|
||||
audio_el = lesson_div.select_one("audio source")
|
||||
if audio_el and audio_el.get("src", "").strip():
|
||||
@@ -135,6 +136,15 @@ def classify_lesson(lesson_div, base_url: str) -> tuple[str, str, str]:
|
||||
# Canonical player URL works with yt-dlp + referer.
|
||||
return "vimeo", f"https://player.vimeo.com/video/{vimeo_id}", f"vimeo_{vimeo_id}"
|
||||
|
||||
# PDF-only lecture: look for an attachment link ending in .pdf.
|
||||
for a in lesson_div.select("a[href]"):
|
||||
href = (a.get("href") or "").strip()
|
||||
if href.lower().endswith(".pdf"):
|
||||
pdf_url = urljoin(base_url, href)
|
||||
# Stem from filename without extension, keep readable name.
|
||||
stem = href.rsplit("/", 1)[-1].rsplit(".", 1)[0]
|
||||
return "pdf", pdf_url, stem
|
||||
|
||||
return "text", "", "" # stem filled in by caller using title slug
|
||||
|
||||
|
||||
@@ -166,6 +176,18 @@ def discover_lectures(session: requests.Session, module: dict, course: dict) ->
|
||||
"audio_path": "", # no audio
|
||||
"text_content": body_el.get_text("\n", strip=True),
|
||||
}
|
||||
elif ltype == "pdf":
|
||||
# Transcript derives from extracted PDF text. No audio file; the
|
||||
# "audio_path" is reused as the PDF cache path (deleted after
|
||||
# extraction to honor the 'nu pastrez sursele' preference).
|
||||
pdf_cache = course_paths(course)["audio_dir"] / f"{stem}.pdf"
|
||||
lecture = {
|
||||
"type": "pdf",
|
||||
"title": title,
|
||||
"original_filename": f"{stem}.pdf",
|
||||
"url": media_url,
|
||||
"audio_path": str(pdf_cache),
|
||||
}
|
||||
elif ltype == "vimeo":
|
||||
# Target MP3 filename has .mp3 (yt-dlp extract-audio writes .mp3)
|
||||
audio_path = course_paths(course)["audio_dir"] / f"{stem}.mp3"
|
||||
@@ -193,11 +215,13 @@ def discover_lectures(session: requests.Session, module: dict, course: dict) ->
|
||||
counts = {
|
||||
"audio": sum(1 for L in lectures if L["type"] == "audio"),
|
||||
"vimeo": sum(1 for L in lectures if L["type"] == "vimeo"),
|
||||
"pdf": sum(1 for L in lectures if L["type"] == "pdf"),
|
||||
"text": sum(1 for L in lectures if L["type"] == "text"),
|
||||
}
|
||||
log.info(
|
||||
f" {module['name']}: {len(lectures)} lectures "
|
||||
f"(audio={counts['audio']}, vimeo={counts['vimeo']}, text={counts['text']})"
|
||||
f"(audio={counts['audio']}, vimeo={counts['vimeo']}, "
|
||||
f"pdf={counts['pdf']}, text={counts['text']})"
|
||||
)
|
||||
return lectures
|
||||
|
||||
@@ -297,6 +321,76 @@ def capture_text_lecture(lecture: dict, transcripts_dir: Path) -> bool:
|
||||
return True
|
||||
|
||||
|
||||
def download_pdf_and_extract(session: requests.Session, lecture: dict,
                             pdf_cache: Path, transcripts_dir: Path) -> bool:
    """
    Download a PDF lecture resource and turn it into a plain-text transcript.

    Fetches lecture["url"] through the authenticated session into pdf_cache,
    extracts the text with pypdf, and writes it (with a small provenance
    header) to <transcripts_dir>/<stem>.txt. On success the source PDF is
    deleted (user preference: sources are not retained).

    Args:
        session: Authenticated requests session (login cookies already set).
        lecture: Lecture dict with "url", "title" and "original_filename".
        pdf_cache: Local path the PDF is downloaded to (temporary).
        transcripts_dir: Directory the transcript .txt is written into.

    Returns:
        True when a usable transcript was written; False on download failure,
        non-PDF response (e.g. login redirect), extraction failure, or when
        the extracted text is shorter than 50 characters (graphics-heavy PDF
        — the source file is kept in that case for manual review).
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        log.error("pypdf not installed. Run: pip install pypdf")
        return False

    pdf_cache.parent.mkdir(parents=True, exist_ok=True)
    transcripts_dir.mkdir(parents=True, exist_ok=True)

    # Download to a .tmp sibling first so an interrupted transfer never
    # leaves a half-written file at the final path.
    tmp = pdf_cache.with_suffix(".pdf.tmp")

    # Download PDF (resource is small — typically <5 MB summary).
    for attempt in range(MAX_RETRIES):
        try:
            # Context manager closes the streamed connection even when
            # iter_content raises mid-transfer (previously it leaked).
            with session.get(lecture["url"], stream=True, timeout=120) as resp:
                resp.raise_for_status()
                total = 0
                with open(tmp, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=256 * 1024):
                        f.write(chunk)
                        total += len(chunk)
            if total < 1000:
                log.warning(f" PDF too small ({total} bytes): {pdf_cache.name}")
                tmp.unlink(missing_ok=True)
                return False
            # An expired session typically returns an HTML login page with a
            # 200 status — reject anything without the %PDF magic bytes
            # before handing it to pypdf.
            with open(tmp, "rb") as f:
                if f.read(5) != b"%PDF-":
                    log.warning(f" Not a PDF (login redirect?): {pdf_cache.name}")
                    tmp.unlink(missing_ok=True)
                    return False
            tmp.rename(pdf_cache)
            log.info(f" Downloaded (pdf): {pdf_cache.name} ({total / 1024:.0f} KB)")
            break
        except Exception as e:
            # Remove any partial .tmp file before retrying / giving up.
            tmp.unlink(missing_ok=True)
            log.warning(f" PDF attempt {attempt + 1}/{MAX_RETRIES} failed: {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_BACKOFF[attempt] if attempt < len(RETRY_BACKOFF) else 30)
            else:
                log.error(f" FAILED PDF download: {lecture['url']}")
                return False

    # Extract text page by page; pages with no extractable text are dropped.
    stem = Path(lecture["original_filename"]).stem
    txt_path = transcripts_dir / f"{stem}.txt"
    try:
        reader = PdfReader(str(pdf_cache))
        pages_text = []
        for i, page in enumerate(reader.pages, 1):
            t = (page.extract_text() or "").strip()
            if t:
                pages_text.append(f"--- pagina {i} ---\n{t}")
        body = "\n\n".join(pages_text).strip()
    except Exception as e:
        log.error(f" pypdf extract failed on {pdf_cache.name}: {e}")
        return False

    # Mostly-graphical PDFs (infographics, diagrams) yield almost no text;
    # keep the source file for manual review instead of writing a stub.
    if len(body) < 50:
        log.warning(f" PDF '{lecture['title']}' extracted <50 chars; keeping file, skipping transcript")
        return False

    header = f"[PDF LECTURE — extracted from {lecture['url']}]\nTitlu: {lecture['title']}\n\n"
    txt_path.write_text(header + body, encoding="utf-8")
    log.info(f" Extracted (pdf): {txt_path.name} ({txt_path.stat().st_size} bytes, {len(reader.pages)} pages)")

    # Delete source PDF (user preference: sources are not retained).
    pdf_cache.unlink(missing_ok=True)
    return True
|
||||
|
||||
|
||||
def load_manifest(manifest_path: Path) -> dict | None:
|
||||
if manifest_path.exists():
|
||||
with open(manifest_path, encoding="utf-8") as f:
|
||||
@@ -434,6 +528,24 @@ def main():
|
||||
entry["download_status"] = "failed"
|
||||
failed += 1
|
||||
|
||||
elif lec["type"] == "pdf":
|
||||
# PDF -> download, extract text, save as transcript, delete source.
|
||||
txt_path = Path(entry["transcript_path"])
|
||||
if txt_path.exists() and txt_path.stat().st_size > 50:
|
||||
entry["download_status"] = "complete"
|
||||
entry["transcribe_status"] = "complete"
|
||||
skipped += 1
|
||||
log.info(f" Skipping pdf (transcript exists): {txt_path.name}")
|
||||
elif download_pdf_and_extract(session, lec, Path(lec["audio_path"]),
|
||||
paths["transcripts_dir"]):
|
||||
entry["download_status"] = "complete"
|
||||
entry["transcribe_status"] = "complete"
|
||||
entry["file_size_bytes"] = txt_path.stat().st_size if txt_path.exists() else 0
|
||||
downloaded += 1
|
||||
else:
|
||||
entry["download_status"] = "failed"
|
||||
failed += 1
|
||||
|
||||
else:
|
||||
dest = Path(lec["audio_path"])
|
||||
if dest.exists() and dest.stat().st_size > 1_000_000:
|
||||
|
||||
@@ -4,3 +4,4 @@ python-dotenv
|
||||
markdown2
|
||||
weasyprint
|
||||
yt-dlp
|
||||
pypdf
|
||||
|
||||
@@ -211,11 +211,11 @@ def main():
|
||||
for lec in mod["lectures"]:
|
||||
total += 1
|
||||
|
||||
# Text lectures bypass whisper — transcript written by download.py.
|
||||
if lec.get("type") == "text":
|
||||
# Text and PDF lectures bypass whisper — transcript written by download.py.
|
||||
if lec.get("type") in ("text", "pdf"):
|
||||
lec["transcribe_status"] = "complete"
|
||||
skipped += 1
|
||||
log.info(f" Skipping text: {lec['title']}")
|
||||
log.info(f" Skipping {lec.get('type')}: {lec['title']}")
|
||||
continue
|
||||
|
||||
if lec.get("download_status") != "complete":
|
||||
|
||||
Reference in New Issue
Block a user