From 2e4bb8862497848dbaf163e754e83242aa52b720 Mon Sep 17 00:00:00 2001
From: Marius Mutu <mmarius28@gmail.com>
Date: Wed, 22 Apr 2026 23:01:09 +0300
Subject: [PATCH] =?UTF-8?q?feat:=20al=204-lea=20tip=20de=20lec=C8=9Bie=20?=
 =?UTF-8?q?=E2=80=94=20PDF=20(extract=20text=20cu=20pypdf)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Recon-ul pe practitioner M1 arată că unele lecții n-au nici audio nici
Vimeo iframe — doar un link "Descarcă rezumat PDF" (/resurse/*.pdf).
Scraperul vechi le clasifica drept "text" și le marca failed (HTML body
avea <50 chars).

- classify_lesson: detectează acum a[href$=".pdf"] → type="pdf".
- download_pdf_and_extract: download PDF via session autentificat →
  extrage textul cu pypdf → transcript .txt cu header + conținut pe
  pagini → șterge PDF-ul sursă doar după o extragere reușită (preferință
  utilizator: nu păstrez sursele); la eșec sau <50 chars PDF-ul rămâne.
- Branch în main loop pentru type=="pdf".
- requirements.txt: + pypdf.
- transcribe.py: skip type in ("text", "pdf") — transcript e deja scris
  de download.py.

Limitări: PDF-uri cu conținut vizual (infografice, diagrame) extrag
puțin text. Titlul și textul inline sunt capturate; restul rămâne
pentru review manual.

Testat pe 4 PDF-uri M1 practitioner (Premisele NLP, Forme de Pacing,
Gesturi de calmare, Exercitiu Pacing): 3/4 extract bun (877-3068 bytes),
1/4 conținut predominant grafic (203 bytes).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 download.py      | 116 ++++++++++++++++++++++++++++++++++++++++++++++-
 requirements.txt |   1 +
 transcribe.py    |   6 +--
 3 files changed, 118 insertions(+), 5 deletions(-)
diff --git a/download.py b/download.py
index b2c1219..d2cad1d 100644
--- a/download.py
+++ b/download.py
@@ -119,7 +119,8 @@ def classify_lesson(lesson_div, base_url: str) -> tuple[str, str, str]:
     Types:
       - ("audio",  mp3_url,    filename_from_url)
       - ("vimeo",  vimeo_url,  "vimeo_<id>")
-      - ("text",   "",         slug_from_title)   # no media found
+      - ("pdf",    pdf_url,    filename_from_url)   # only attachment is a PDF
+      - ("text",   "",         slug_from_title)     # no media or PDF found
     """
     audio_el = lesson_div.select_one("audio source")
     if audio_el and audio_el.get("src", "").strip():
@@ -135,6 +136,15 @@ def classify_lesson(lesson_div, base_url: str) -> tuple[str, str, str]:
             # Canonical player URL works with yt-dlp + referer.
             return "vimeo", f"https://player.vimeo.com/video/{vimeo_id}", f"vimeo_{vimeo_id}"
 
+    # PDF-only lecture: look for an attachment link ending in .pdf.
+    for a in lesson_div.select("a[href]"):
+        href = (a.get("href") or "").strip()
+        if href.lower().endswith(".pdf"):
+            pdf_url = urljoin(base_url, href)
+            # Stem from filename without extension, keep readable name.
+            stem = href.rsplit("/", 1)[-1].rsplit(".", 1)[0]
+            return "pdf", pdf_url, stem
+
     return "text", "", ""  # stem filled in by caller using title slug
 
 
@@ -166,6 +176,18 @@ def discover_lectures(session: requests.Session, module: dict, course: dict) ->
                 "audio_path": "",      # no audio
                 "text_content": body_el.get_text("\n", strip=True),
             }
+        elif ltype == "pdf":
+            # Transcript derives from extracted PDF text. No audio file; the
+            # "audio_path" is reused as the PDF cache path (deleted after
+            # extraction to honor the 'nu pastrez sursele' preference).
+            pdf_cache = course_paths(course)["audio_dir"] / f"{stem}.pdf"
+            lecture = {
+                "type": "pdf",
+                "title": title,
+                "original_filename": f"{stem}.pdf",
+                "url": media_url,
+                "audio_path": str(pdf_cache),
+            }
         elif ltype == "vimeo":
             # Target MP3 filename has .mp3 (yt-dlp extract-audio writes .mp3)
             audio_path = course_paths(course)["audio_dir"] / f"{stem}.mp3"
@@ -193,11 +215,13 @@ def discover_lectures(session: requests.Session, module: dict, course: dict) ->
     counts = {
         "audio": sum(1 for L in lectures if L["type"] == "audio"),
         "vimeo": sum(1 for L in lectures if L["type"] == "vimeo"),
+        "pdf":   sum(1 for L in lectures if L["type"] == "pdf"),
         "text":  sum(1 for L in lectures if L["type"] == "text"),
     }
     log.info(
         f"  {module['name']}: {len(lectures)} lectures "
-        f"(audio={counts['audio']}, vimeo={counts['vimeo']}, text={counts['text']})"
+        f"(audio={counts['audio']}, vimeo={counts['vimeo']}, "
+        f"pdf={counts['pdf']}, text={counts['text']})"
     )
     return lectures
 
@@ -297,6 +321,76 @@ def capture_text_lecture(lecture: dict, transcripts_dir: Path) -> bool:
     return True
 
 
+def download_pdf_and_extract(session: requests.Session, lecture: dict,
+                              pdf_cache: Path, transcripts_dir: Path) -> bool:
+    """
+    Download lecture["url"] via the given session to pdf_cache, extract per-page text with pypdf and
+    write transcripts_dir/<stem>.txt. Returns True and deletes the PDF on success; False on handled failures.
+    """
+    try:
+        from pypdf import PdfReader
+    except ImportError:
+        log.error("pypdf not installed. Run: pip install pypdf")
+        return False
+
+    pdf_cache.parent.mkdir(parents=True, exist_ok=True)
+    transcripts_dir.mkdir(parents=True, exist_ok=True)
+
+    # Download PDF with retries via a .pdf.tmp file (resource is small — typically a <5 MB summary)
+    for attempt in range(MAX_RETRIES):
+        try:
+            resp = session.get(lecture["url"], stream=True, timeout=120)
+            resp.raise_for_status()
+            tmp = pdf_cache.with_suffix(".pdf.tmp")
+            total = 0
+            with open(tmp, "wb") as f:
+                for chunk in resp.iter_content(chunk_size=256 * 1024):
+                    f.write(chunk)
+                    total += len(chunk)
+            if total < 1000:  # undersized body: give up immediately, no retry
+                log.warning(f"  PDF too small ({total} bytes): {pdf_cache.name}")
+                tmp.unlink(missing_ok=True)
+                return False
+            tmp.rename(pdf_cache)
+            log.info(f"  Downloaded (pdf): {pdf_cache.name} ({total / 1024:.0f} KB)")
+            break
+        except Exception as e:
+            wait = RETRY_BACKOFF[attempt] if attempt < len(RETRY_BACKOFF) else 30
+            log.warning(f"  PDF attempt {attempt + 1}/{MAX_RETRIES} failed: {e}")
+            if attempt < MAX_RETRIES - 1:
+                time.sleep(wait)
+    else:  # loop ended without break: every attempt raised
+        log.error(f"  FAILED PDF download: {lecture['url']}")
+        return False
+
+    # Extract text page by page; pages with no text are skipped, the rest get a "--- pagina N ---" header
+    stem = Path(lecture["original_filename"]).stem
+    txt_path = transcripts_dir / f"{stem}.txt"
+    try:
+        reader = PdfReader(str(pdf_cache))
+        pages_text = []
+        for i, page in enumerate(reader.pages, 1):
+            t = page.extract_text() or ""
+            if t.strip():
+                pages_text.append(f"--- pagina {i} ---\n{t.strip()}")
+        body = "\n\n".join(pages_text).strip()
+    except Exception as e:
+        log.error(f"  pypdf extract failed on {pdf_cache.name}: {e}")
+        return False
+
+    if len(body) < 50:  # the downloaded PDF is left at pdf_cache on this path
+        log.warning(f"  PDF '{lecture['title']}' extracted <50 chars; keeping file, skipping transcript")
+        return False
+
+    header = f"[PDF LECTURE — extracted from {lecture['url']}]\nTitlu: {lecture['title']}\n\n"
+    txt_path.write_text(header + body, encoding="utf-8")
+    log.info(f"  Extracted (pdf): {txt_path.name} ({txt_path.stat().st_size} bytes, {len(reader.pages)} pages)")
+
+    # Delete source PDF only after the transcript is written (user preference: do not keep source files)
+    pdf_cache.unlink(missing_ok=True)
+    return True
+
+
 def load_manifest(manifest_path: Path) -> dict | None:
     if manifest_path.exists():
         with open(manifest_path, encoding="utf-8") as f:
@@ -434,6 +528,24 @@ def main():
                     entry["download_status"] = "failed"
                     failed += 1
 
+            elif lec["type"] == "pdf":
+                # PDF -> download, extract text, save as transcript, delete source.
+                txt_path = Path(entry["transcript_path"])
+                if txt_path.exists() and txt_path.stat().st_size > 50:
+                    entry["download_status"] = "complete"
+                    entry["transcribe_status"] = "complete"
+                    skipped += 1
+                    log.info(f"  Skipping pdf (transcript exists): {txt_path.name}")
+                elif download_pdf_and_extract(session, lec, Path(lec["audio_path"]),
+                                               paths["transcripts_dir"]):
+                    entry["download_status"] = "complete"
+                    entry["transcribe_status"] = "complete"
+                    entry["file_size_bytes"] = txt_path.stat().st_size if txt_path.exists() else 0
+                    downloaded += 1
+                else:
+                    entry["download_status"] = "failed"
+                    failed += 1
+
             else:
                 dest = Path(lec["audio_path"])
                 if dest.exists() and dest.stat().st_size > 1_000_000:
diff --git a/requirements.txt b/requirements.txt
index 28b5ce9..c877bcf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ python-dotenv
 markdown2
 weasyprint
 yt-dlp
+pypdf
diff --git a/transcribe.py b/transcribe.py
index 79db143..25c9266 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -211,11 +211,11 @@ def main():
         for lec in mod["lectures"]:
             total += 1
 
-            # Text lectures bypass whisper — transcript written by download.py.
-            if lec.get("type") == "text":
+            # Text and PDF lectures bypass whisper — transcript written by download.py.
+            if lec.get("type") in ("text", "pdf"):
                 lec["transcribe_status"] = "complete"
                 skipped += 1
-                log.info(f"  Skipping text: {lec['title']}")
+                log.info(f"  Skipping {lec.get('type')}: {lec['title']}")
                 continue
 
             if lec.get("download_status") != "complete":