From 2e4bb8862497848dbaf163e754e83242aa52b720 Mon Sep 17 00:00:00 2001
From: Marius Mutu
Date: Wed, 22 Apr 2026 23:01:09 +0300
Subject: [PATCH] =?UTF-8?q?feat:=20al=204-lea=20tip=20de=20lec=C8=9Bie=20?=
 =?UTF-8?q?=E2=80=94=20PDF=20(extract=20text=20cu=20pypdf)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Recon-ul pe practitioner M1 arată că unele lecții n-au nici audio, nici
Vimeo iframe — doar un link "Descarcă rezumat PDF" (/resurse/*.pdf).
Scraperul vechi le clasifica drept "text" și le marca failed (HTML body
avea <50 chars).

- classify_lesson: detectează acum orice a[href] care se termină în
  ".pdf" (case-insensitive) → type="pdf".
- download_pdf_and_extract: download PDF via session autentificat,
  extragere text cu pypdf (PdfReader) → transcript .txt cu header +
  conținut pe pagini → șterge PDF-ul sursă după o extragere reușită
  (preferință utilizator: nu păstrez sursele).
- Branch în main loop pentru type=="pdf".
- requirements.txt: + pypdf.
- transcribe.py: skip type in ("text", "pdf") — transcript e deja scris
  de download.py.

Limitări: PDF-uri cu conținut vizual (infografice, diagrame) extrag puțin
text. Titlul și textul inline sunt capturate; restul rămâne pentru review
manual.

Testat pe 4 PDF-uri M1 practitioner (Premisele NLP, Forme de Pacing,
Gesturi de calmare, Exercitiu Pacing): 3/4 extract bun (877-3068 bytes),
1/4 conținut predominant grafic (203 bytes).

Co-Authored-By: Claude Sonnet 4.6 --- download.py | 116 ++++++++++++++++++++++++++++++++++++++++++++++- requirements.txt | 1 + transcribe.py | 6 +-- 3 files changed, 118 insertions(+), 5 deletions(-) diff --git a/download.py b/download.py index b2c1219..d2cad1d 100644 --- a/download.py +++ b/download.py @@ -119,7 +119,8 @@ def classify_lesson(lesson_div, base_url: str) -> tuple[str, str, str]: Types: - ("audio", mp3_url, filename_from_url) - ("vimeo", vimeo_url, "vimeo_") - - ("text", "", slug_from_title) # no media found + - ("pdf", pdf_url, filename_from_url) # only attachment is a PDF + - ("text", "", slug_from_title) # no media or PDF found """ audio_el = lesson_div.select_one("audio source") if audio_el and audio_el.get("src", "").strip(): @@ -135,6 +136,15 @@ def classify_lesson(lesson_div, base_url: str) -> tuple[str, str, str]: # Canonical player URL works with yt-dlp + referer. return "vimeo", f"https://player.vimeo.com/video/{vimeo_id}", f"vimeo_{vimeo_id}" + # PDF-only lecture: look for an attachment link ending in .pdf. + for a in lesson_div.select("a[href]"): + href = (a.get("href") or "").strip() + if href.lower().endswith(".pdf"): + pdf_url = urljoin(base_url, href) + # Stem from filename without extension, keep readable name. + stem = href.rsplit("/", 1)[-1].rsplit(".", 1)[0] + return "pdf", pdf_url, stem + return "text", "", "" # stem filled in by caller using title slug @@ -166,6 +176,18 @@ def discover_lectures(session: requests.Session, module: dict, course: dict) -> "audio_path": "", # no audio "text_content": body_el.get_text("\n", strip=True), } + elif ltype == "pdf": + # Transcript derives from extracted PDF text. No audio file; the + # "audio_path" is reused as the PDF cache path (deleted after + # extraction to honor the 'nu pastrez sursele' preference). 
+ pdf_cache = course_paths(course)["audio_dir"] / f"{stem}.pdf" + lecture = { + "type": "pdf", + "title": title, + "original_filename": f"{stem}.pdf", + "url": media_url, + "audio_path": str(pdf_cache), + } elif ltype == "vimeo": # Target MP3 filename has .mp3 (yt-dlp extract-audio writes .mp3) audio_path = course_paths(course)["audio_dir"] / f"{stem}.mp3" @@ -193,11 +215,13 @@ def discover_lectures(session: requests.Session, module: dict, course: dict) -> counts = { "audio": sum(1 for L in lectures if L["type"] == "audio"), "vimeo": sum(1 for L in lectures if L["type"] == "vimeo"), + "pdf": sum(1 for L in lectures if L["type"] == "pdf"), "text": sum(1 for L in lectures if L["type"] == "text"), } log.info( f" {module['name']}: {len(lectures)} lectures " - f"(audio={counts['audio']}, vimeo={counts['vimeo']}, text={counts['text']})" + f"(audio={counts['audio']}, vimeo={counts['vimeo']}, " + f"pdf={counts['pdf']}, text={counts['text']})" ) return lectures @@ -297,6 +321,76 @@ def capture_text_lecture(lecture: dict, transcripts_dir: Path) -> bool: return True +def download_pdf_and_extract(session: requests.Session, lecture: dict, + pdf_cache: Path, transcripts_dir: Path) -> bool: + """ + Download PDF resource via authenticated session, extract text via pypdf, + write as transcript .txt. Delete PDF after extraction (no source retention). + """ + try: + from pypdf import PdfReader + except ImportError: + log.error("pypdf not installed. 
Run: pip install pypdf") + return False + + pdf_cache.parent.mkdir(parents=True, exist_ok=True) + transcripts_dir.mkdir(parents=True, exist_ok=True) + + # Download PDF (resource is small — typically <5 MB rezumat) + for attempt in range(MAX_RETRIES): + try: + resp = session.get(lecture["url"], stream=True, timeout=120) + resp.raise_for_status() + tmp = pdf_cache.with_suffix(".pdf.tmp") + total = 0 + with open(tmp, "wb") as f: + for chunk in resp.iter_content(chunk_size=256 * 1024): + f.write(chunk) + total += len(chunk) + if total < 1000: + log.warning(f" PDF too small ({total} bytes): {pdf_cache.name}") + tmp.unlink(missing_ok=True) + return False + tmp.rename(pdf_cache) + log.info(f" Downloaded (pdf): {pdf_cache.name} ({total / 1024:.0f} KB)") + break + except Exception as e: + wait = RETRY_BACKOFF[attempt] if attempt < len(RETRY_BACKOFF) else 30 + log.warning(f" PDF attempt {attempt + 1}/{MAX_RETRIES} failed: {e}") + if attempt < MAX_RETRIES - 1: + time.sleep(wait) + else: + log.error(f" FAILED PDF download: {lecture['url']}") + return False + + # Extract text + stem = Path(lecture["original_filename"]).stem + txt_path = transcripts_dir / f"{stem}.txt" + try: + reader = PdfReader(str(pdf_cache)) + pages_text = [] + for i, page in enumerate(reader.pages, 1): + t = page.extract_text() or "" + if t.strip(): + pages_text.append(f"--- pagina {i} ---\n{t.strip()}") + body = "\n\n".join(pages_text).strip() + except Exception as e: + log.error(f" pypdf extract failed on {pdf_cache.name}: {e}") + return False + + if len(body) < 50: + log.warning(f" PDF '{lecture['title']}' extracted <50 chars; keeping file, skipping transcript") + return False + + header = f"[PDF LECTURE — extracted from {lecture['url']}]\nTitlu: {lecture['title']}\n\n" + txt_path.write_text(header + body, encoding="utf-8") + log.info(f" Extracted (pdf): {txt_path.name} ({txt_path.stat().st_size} bytes, {len(reader.pages)} pages)") + + # Delete source PDF (user preference: nu pastrez sursele) + 
pdf_cache.unlink(missing_ok=True) + return True + + def load_manifest(manifest_path: Path) -> dict | None: if manifest_path.exists(): with open(manifest_path, encoding="utf-8") as f: @@ -434,6 +528,24 @@ def main(): entry["download_status"] = "failed" failed += 1 + elif lec["type"] == "pdf": + # PDF -> download, extract text, save as transcript, delete source. + txt_path = Path(entry["transcript_path"]) + if txt_path.exists() and txt_path.stat().st_size > 50: + entry["download_status"] = "complete" + entry["transcribe_status"] = "complete" + skipped += 1 + log.info(f" Skipping pdf (transcript exists): {txt_path.name}") + elif download_pdf_and_extract(session, lec, Path(lec["audio_path"]), + paths["transcripts_dir"]): + entry["download_status"] = "complete" + entry["transcribe_status"] = "complete" + entry["file_size_bytes"] = txt_path.stat().st_size if txt_path.exists() else 0 + downloaded += 1 + else: + entry["download_status"] = "failed" + failed += 1 + else: dest = Path(lec["audio_path"]) if dest.exists() and dest.stat().st_size > 1_000_000: diff --git a/requirements.txt b/requirements.txt index 28b5ce9..c877bcf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ python-dotenv markdown2 weasyprint yt-dlp +pypdf diff --git a/transcribe.py b/transcribe.py index 79db143..25c9266 100644 --- a/transcribe.py +++ b/transcribe.py @@ -211,11 +211,11 @@ def main(): for lec in mod["lectures"]: total += 1 - # Text lectures bypass whisper — transcript written by download.py. - if lec.get("type") == "text": + # Text and PDF lectures bypass whisper — transcript written by download.py. + if lec.get("type") in ("text", "pdf"): lec["transcribe_status"] = "complete" skipped += 1 - log.info(f" Skipping text: {lec['title']}") + log.info(f" Skipping {lec.get('type')}: {lec['title']}") continue if lec.get("download_status") != "complete":