feat: al 4-lea tip de lecție — PDF (extract text cu pypdf)
Recon-ul pe practitioner M1 arată că unele lecții n-au nici audio nici
Vimeo iframe — doar un link "Descarcă rezumat PDF" (/resurse/*.pdf).
Scraperul vechi le clasifica drept "text" și le marca failed (HTML body
avea <50 chars).
- classify_lesson: detectează acum a[href$=".pdf"] → type="pdf".
- download_pdf_and_extract: download PDF via session autentificat
(pypdf reader) → transcript .txt cu header + conținut pe pagini →
șterge PDF sursă (preferință utilizator: nu păstrez sursele).
- Branch în main loop pentru type=="pdf".
- requirements.txt: + pypdf.
- transcribe.py: skip type in ("text", "pdf") — transcript e deja scris
de download.py.
Limitări: PDF-uri cu conținut vizual (infografice, diagrame) extrag
puțin text. Titlul și textul inline sunt capturate; restul rămâne
pentru review manual.
Testat pe 4 PDF-uri M1 practitioner (Premisele NLP, Forme de Pacing,
Gesturi de calmare, Exercițiu Pacing): 3/4 extract bun (877-3068 bytes),
1/4 conținut predominant grafic (203 bytes).
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
116
download.py
116
download.py
@@ -119,7 +119,8 @@ def classify_lesson(lesson_div, base_url: str) -> tuple[str, str, str]:
|
||||
Types:
|
||||
- ("audio", mp3_url, filename_from_url)
|
||||
- ("vimeo", vimeo_url, "vimeo_<id>")
|
||||
- ("text", "", slug_from_title) # no media found
|
||||
- ("pdf", pdf_url, filename_from_url) # only attachment is a PDF
|
||||
- ("text", "", slug_from_title) # no media or PDF found
|
||||
"""
|
||||
audio_el = lesson_div.select_one("audio source")
|
||||
if audio_el and audio_el.get("src", "").strip():
|
||||
@@ -135,6 +136,15 @@ def classify_lesson(lesson_div, base_url: str) -> tuple[str, str, str]:
|
||||
# Canonical player URL works with yt-dlp + referer.
|
||||
return "vimeo", f"https://player.vimeo.com/video/{vimeo_id}", f"vimeo_{vimeo_id}"
|
||||
|
||||
# PDF-only lecture: look for an attachment link ending in .pdf.
|
||||
for a in lesson_div.select("a[href]"):
|
||||
href = (a.get("href") or "").strip()
|
||||
if href.lower().endswith(".pdf"):
|
||||
pdf_url = urljoin(base_url, href)
|
||||
# Stem from filename without extension, keep readable name.
|
||||
stem = href.rsplit("/", 1)[-1].rsplit(".", 1)[0]
|
||||
return "pdf", pdf_url, stem
|
||||
|
||||
return "text", "", "" # stem filled in by caller using title slug
|
||||
|
||||
|
||||
@@ -166,6 +176,18 @@ def discover_lectures(session: requests.Session, module: dict, course: dict) ->
|
||||
"audio_path": "", # no audio
|
||||
"text_content": body_el.get_text("\n", strip=True),
|
||||
}
|
||||
elif ltype == "pdf":
|
||||
# Transcript derives from extracted PDF text. No audio file; the
|
||||
# "audio_path" is reused as the PDF cache path (deleted after
|
||||
# extraction to honor the 'nu pastrez sursele' preference).
|
||||
pdf_cache = course_paths(course)["audio_dir"] / f"{stem}.pdf"
|
||||
lecture = {
|
||||
"type": "pdf",
|
||||
"title": title,
|
||||
"original_filename": f"{stem}.pdf",
|
||||
"url": media_url,
|
||||
"audio_path": str(pdf_cache),
|
||||
}
|
||||
elif ltype == "vimeo":
|
||||
# Target MP3 filename has .mp3 (yt-dlp extract-audio writes .mp3)
|
||||
audio_path = course_paths(course)["audio_dir"] / f"{stem}.mp3"
|
||||
@@ -193,11 +215,13 @@ def discover_lectures(session: requests.Session, module: dict, course: dict) ->
|
||||
counts = {
|
||||
"audio": sum(1 for L in lectures if L["type"] == "audio"),
|
||||
"vimeo": sum(1 for L in lectures if L["type"] == "vimeo"),
|
||||
"pdf": sum(1 for L in lectures if L["type"] == "pdf"),
|
||||
"text": sum(1 for L in lectures if L["type"] == "text"),
|
||||
}
|
||||
log.info(
|
||||
f" {module['name']}: {len(lectures)} lectures "
|
||||
f"(audio={counts['audio']}, vimeo={counts['vimeo']}, text={counts['text']})"
|
||||
f"(audio={counts['audio']}, vimeo={counts['vimeo']}, "
|
||||
f"pdf={counts['pdf']}, text={counts['text']})"
|
||||
)
|
||||
return lectures
|
||||
|
||||
@@ -297,6 +321,76 @@ def capture_text_lecture(lecture: dict, transcripts_dir: Path) -> bool:
|
||||
return True
|
||||
|
||||
|
||||
def download_pdf_and_extract(session: requests.Session, lecture: dict,
                             pdf_cache: Path, transcripts_dir: Path) -> bool:
    """
    Download a PDF lecture resource and turn it into a plain-text transcript.

    Fetches lecture["url"] through the authenticated session into pdf_cache,
    extracts the text with pypdf, and writes it (with a small provenance
    header) to <transcripts_dir>/<stem>.txt. On success the source PDF is
    deleted (user preference: sources are not retained).

    Args:
        session: Authenticated requests session (login cookies already set).
        lecture: Lecture dict with "url", "title" and "original_filename".
        pdf_cache: Local path the PDF is downloaded to (temporary).
        transcripts_dir: Directory the transcript .txt is written into.

    Returns:
        True when a usable transcript was written; False on download failure,
        non-PDF response (e.g. login redirect), extraction failure, or when
        the extracted text is shorter than 50 characters (graphics-heavy PDF
        — the source file is kept in that case for manual review).
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        log.error("pypdf not installed. Run: pip install pypdf")
        return False

    pdf_cache.parent.mkdir(parents=True, exist_ok=True)
    transcripts_dir.mkdir(parents=True, exist_ok=True)

    # Download to a .tmp sibling first so an interrupted transfer never
    # leaves a half-written file at the final path.
    tmp = pdf_cache.with_suffix(".pdf.tmp")

    # Download PDF (resource is small — typically <5 MB summary).
    for attempt in range(MAX_RETRIES):
        try:
            # Context manager closes the streamed connection even when
            # iter_content raises mid-transfer (previously it leaked).
            with session.get(lecture["url"], stream=True, timeout=120) as resp:
                resp.raise_for_status()
                total = 0
                with open(tmp, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=256 * 1024):
                        f.write(chunk)
                        total += len(chunk)
            if total < 1000:
                log.warning(f" PDF too small ({total} bytes): {pdf_cache.name}")
                tmp.unlink(missing_ok=True)
                return False
            # An expired session typically returns an HTML login page with a
            # 200 status — reject anything without the %PDF magic bytes
            # before handing it to pypdf.
            with open(tmp, "rb") as f:
                if f.read(5) != b"%PDF-":
                    log.warning(f" Not a PDF (login redirect?): {pdf_cache.name}")
                    tmp.unlink(missing_ok=True)
                    return False
            tmp.rename(pdf_cache)
            log.info(f" Downloaded (pdf): {pdf_cache.name} ({total / 1024:.0f} KB)")
            break
        except Exception as e:
            # Remove any partial .tmp file before retrying / giving up.
            tmp.unlink(missing_ok=True)
            log.warning(f" PDF attempt {attempt + 1}/{MAX_RETRIES} failed: {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_BACKOFF[attempt] if attempt < len(RETRY_BACKOFF) else 30)
            else:
                log.error(f" FAILED PDF download: {lecture['url']}")
                return False

    # Extract text page by page; pages with no extractable text are dropped.
    stem = Path(lecture["original_filename"]).stem
    txt_path = transcripts_dir / f"{stem}.txt"
    try:
        reader = PdfReader(str(pdf_cache))
        pages_text = []
        for i, page in enumerate(reader.pages, 1):
            t = (page.extract_text() or "").strip()
            if t:
                pages_text.append(f"--- pagina {i} ---\n{t}")
        body = "\n\n".join(pages_text).strip()
    except Exception as e:
        log.error(f" pypdf extract failed on {pdf_cache.name}: {e}")
        return False

    # Mostly-graphical PDFs (infographics, diagrams) yield almost no text;
    # keep the source file for manual review instead of writing a stub.
    if len(body) < 50:
        log.warning(f" PDF '{lecture['title']}' extracted <50 chars; keeping file, skipping transcript")
        return False

    header = f"[PDF LECTURE — extracted from {lecture['url']}]\nTitlu: {lecture['title']}\n\n"
    txt_path.write_text(header + body, encoding="utf-8")
    log.info(f" Extracted (pdf): {txt_path.name} ({txt_path.stat().st_size} bytes, {len(reader.pages)} pages)")

    # Delete source PDF (user preference: sources are not retained).
    pdf_cache.unlink(missing_ok=True)
    return True
|
||||
|
||||
|
||||
def load_manifest(manifest_path: Path) -> dict | None:
|
||||
if manifest_path.exists():
|
||||
with open(manifest_path, encoding="utf-8") as f:
|
||||
@@ -434,6 +528,24 @@ def main():
|
||||
entry["download_status"] = "failed"
|
||||
failed += 1
|
||||
|
||||
elif lec["type"] == "pdf":
|
||||
# PDF -> download, extract text, save as transcript, delete source.
|
||||
txt_path = Path(entry["transcript_path"])
|
||||
if txt_path.exists() and txt_path.stat().st_size > 50:
|
||||
entry["download_status"] = "complete"
|
||||
entry["transcribe_status"] = "complete"
|
||||
skipped += 1
|
||||
log.info(f" Skipping pdf (transcript exists): {txt_path.name}")
|
||||
elif download_pdf_and_extract(session, lec, Path(lec["audio_path"]),
|
||||
paths["transcripts_dir"]):
|
||||
entry["download_status"] = "complete"
|
||||
entry["transcribe_status"] = "complete"
|
||||
entry["file_size_bytes"] = txt_path.stat().st_size if txt_path.exists() else 0
|
||||
downloaded += 1
|
||||
else:
|
||||
entry["download_status"] = "failed"
|
||||
failed += 1
|
||||
|
||||
else:
|
||||
dest = Path(lec["audio_path"])
|
||||
if dest.exists() and dest.stat().st_size > 1_000_000:
|
||||
|
||||
@@ -4,3 +4,4 @@ python-dotenv
|
||||
markdown2
|
||||
weasyprint
|
||||
yt-dlp
|
||||
pypdf
|
||||
|
||||
@@ -211,11 +211,11 @@ def main():
|
||||
for lec in mod["lectures"]:
|
||||
total += 1
|
||||
|
||||
# Text lectures bypass whisper — transcript written by download.py.
|
||||
if lec.get("type") == "text":
|
||||
# Text and PDF lectures bypass whisper — transcript written by download.py.
|
||||
if lec.get("type") in ("text", "pdf"):
|
||||
lec["transcribe_status"] = "complete"
|
||||
skipped += 1
|
||||
log.info(f" Skipping text: {lec['title']}")
|
||||
log.info(f" Skipping {lec.get('type')}: {lec['title']}")
|
||||
continue
|
||||
|
||||
if lec.get("download_status") != "complete":
|
||||
|
||||
Reference in New Issue
Block a user