feat(practitioner): structură per-modul + PDF-uri sursă + split 2-PC

- audio/Modul {N}/filename.mp3 — fiecare modul în subdirector separat pentru copiere pe telefon și transfer între PC-uri.
- PDF-urile se păstrează ca sursă în summaries/pdf/ (fără extract txt).
- transcribe_status="pdf_source_only" pentru lecțiile PDF → summarize.py le filtrează automat.
- Fix coliziune manifest transcript_path (stem-based, nu preserve prior).
- .bat per modul (M2-M8) + dispatchers run_pc1_all (M2-M5) + run_pc2_all (M6-M8) pentru partajare work pe 2 PC-uri.
- prepare_pc2_bundle.py: zip cu scripts + manifest + .env + PDFs pentru PC2 (self-installs whisper.cpp/model/ffmpeg la primul run).
- M1 whisper complete (49/49 audio+vimeo transcrise).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 08:48:58 +03:00
parent 2e4bb88624
commit 6ee53133b7
132 changed files with 28904 additions and 74 deletions
--- a/download.py
+++ b/download.py
@@ -148,11 +148,15 @@ def classify_lesson(lesson_div, base_url: str) -> tuple[str, str, str]:
    return "text", "", ""  # stem filled in by caller using title slug


-def discover_lectures(session: requests.Session, module: dict, course: dict) -> list[dict]:
+def discover_lectures(session: requests.Session, module: dict, course: dict, mod_idx: int) -> list[dict]:
    resp = session.get(module["url"])
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

+    paths = course_paths(course)
+    audio_mod_dir = paths["audio_dir"] / f"Modul {mod_idx}"
+    pdf_dir = paths["pdf_dir"]
+
    lectures = []
    for lesson_div in soup.select("div.lesson"):
        name_el = lesson_div.select_one("div.module__name")
@@ -177,20 +181,18 @@ def discover_lectures(session: requests.Session, module: dict, course: dict) ->
                "text_content": body_el.get_text("\n", strip=True),
            }
        elif ltype == "pdf":
-            # Transcript derives from extracted PDF text. No audio file; the
-            # "audio_path" is reused as the PDF cache path (deleted after
-            # extraction to honor the 'nu pastrez sursele' preference).
-            pdf_cache = course_paths(course)["audio_dir"] / f"{stem}.pdf"
+            # PDF source is kept (not extracted / not deleted). Lives flat in
+            # summaries/pdf/ — user reads PDFs directly, no whisper/no txt.
+            pdf_path = pdf_dir / f"{stem}.pdf"
            lecture = {
                "type": "pdf",
                "title": title,
                "original_filename": f"{stem}.pdf",
                "url": media_url,
-                "audio_path": str(pdf_cache),
+                "audio_path": str(pdf_path),
            }
        elif ltype == "vimeo":
-            # Target MP3 filename has .mp3 (yt-dlp extract-audio writes .mp3)
-            audio_path = course_paths(course)["audio_dir"] / f"{stem}.mp3"
+            audio_path = audio_mod_dir / f"{stem}.mp3"
            lecture = {
                "type": "vimeo",
                "title": title,
@@ -199,9 +201,8 @@ def discover_lectures(session: requests.Session, module: dict, course: dict) ->
                "audio_path": str(audio_path),
            }
        else:  # "audio"
-            # Preserve original filename (may contain spaces).
            filename = media_url.split("/")[-1]
-            audio_path = course_paths(course)["audio_dir"] / filename
+            audio_path = audio_mod_dir / filename
            lecture = {
                "type": "audio",
                "title": title,
@@ -321,74 +322,37 @@ def capture_text_lecture(lecture: dict, transcripts_dir: Path) -> bool:
    return True


-def download_pdf_and_extract(session: requests.Session, lecture: dict,
-                              pdf_cache: Path, transcripts_dir: Path) -> bool:
+def download_pdf(session: requests.Session, lecture: dict, pdf_path: Path) -> bool:
    """
-    Download PDF resource via authenticated session, extract text via pypdf,
-    write as transcript .txt. Delete PDF after extraction (no source retention).
+    Download PDF resource via authenticated session, save source file.
+    No text extraction (user reads PDFs directly — many are infographics).
    """
-    try:
-        from pypdf import PdfReader
-    except ImportError:
-        log.error("pypdf not installed. Run: pip install pypdf")
-        return False
+    pdf_path.parent.mkdir(parents=True, exist_ok=True)

-    pdf_cache.parent.mkdir(parents=True, exist_ok=True)
-    transcripts_dir.mkdir(parents=True, exist_ok=True)
-
-    # Download PDF (resource is small — typically <5 MB rezumat)
    for attempt in range(MAX_RETRIES):
        try:
            resp = session.get(lecture["url"], stream=True, timeout=120)
            resp.raise_for_status()
-            tmp = pdf_cache.with_suffix(".pdf.tmp")
+            tmp = pdf_path.with_suffix(".pdf.tmp")
            total = 0
            with open(tmp, "wb") as f:
                for chunk in resp.iter_content(chunk_size=256 * 1024):
                    f.write(chunk)
                    total += len(chunk)
            if total < 1000:
-                log.warning(f"  PDF too small ({total} bytes): {pdf_cache.name}")
+                log.warning(f"  PDF too small ({total} bytes): {pdf_path.name}")
                tmp.unlink(missing_ok=True)
                return False
-            tmp.rename(pdf_cache)
-            log.info(f"  Downloaded (pdf): {pdf_cache.name} ({total / 1024:.0f} KB)")
-            break
+            tmp.rename(pdf_path)
+            log.info(f"  Downloaded (pdf): {pdf_path.name} ({total / 1024:.0f} KB)")
+            return True
        except Exception as e:
            wait = RETRY_BACKOFF[attempt] if attempt < len(RETRY_BACKOFF) else 30
            log.warning(f"  PDF attempt {attempt + 1}/{MAX_RETRIES} failed: {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(wait)
-    else:
-        log.error(f"  FAILED PDF download: {lecture['url']}")
-        return False
-
-    # Extract text
-    stem = Path(lecture["original_filename"]).stem
-    txt_path = transcripts_dir / f"{stem}.txt"
-    try:
-        reader = PdfReader(str(pdf_cache))
-        pages_text = []
-        for i, page in enumerate(reader.pages, 1):
-            t = page.extract_text() or ""
-            if t.strip():
-                pages_text.append(f"--- pagina {i} ---\n{t.strip()}")
-        body = "\n\n".join(pages_text).strip()
-    except Exception as e:
-        log.error(f"  pypdf extract failed on {pdf_cache.name}: {e}")
-        return False
-
-    if len(body) < 50:
-        log.warning(f"  PDF '{lecture['title']}' extracted <50 chars; keeping file, skipping transcript")
-        return False
-
-    header = f"[PDF LECTURE — extracted from {lecture['url']}]\nTitlu: {lecture['title']}\n\n"
-    txt_path.write_text(header + body, encoding="utf-8")
-    log.info(f"  Extracted (pdf): {txt_path.name} ({txt_path.stat().st_size} bytes, {len(reader.pages)} pages)")
-
-    # Delete source PDF (user preference: nu pastrez sursele)
-    pdf_cache.unlink(missing_ok=True)
-    return True
+    log.error(f"  FAILED PDF download: {lecture['url']}")
+    return False


 def load_manifest(manifest_path: Path) -> dict | None:
@@ -484,7 +448,7 @@ def main():
        if module_filter and mod_idx not in module_filter:
            log.info(f"  Skipping module {mod_idx}: {mod['name']} (outside --modules filter, preserved in manifest)")
            continue
-        lectures = discover_lectures(session, mod, course)
+        lectures = discover_lectures(session, mod, course, mod_idx)
        module_entry = {
            "name": mod["name"],
            "module_id": mod["module_id"],
@@ -502,9 +466,9 @@ def main():
                "original_filename": lec["original_filename"],
                "url": lec["url"],
                "audio_path": lec["audio_path"],
-                "transcript_path": prior.get("transcript_path") or f"{paths['transcripts_dir'].as_posix()}/{stem}.txt",
-                "srt_path":        prior.get("srt_path")        or f"{paths['transcripts_dir'].as_posix()}/{stem}.srt",
-                "summary_path":    prior.get("summary_path")    or f"{paths['summaries_dir'].as_posix()}/{stem}_summary.md",
+                "transcript_path": f"{paths['transcripts_dir'].as_posix()}/{stem}.txt",
+                "srt_path":        f"{paths['transcripts_dir'].as_posix()}/{stem}.srt",
+                "summary_path":    f"{paths['summaries_dir'].as_posix()}/{stem}_summary.md",
                "download_status": "pending",
                # Preserve transcribe_status from prior run (disk check in transcribe.py will correct it if needed).
                "transcribe_status": prior.get("transcribe_status", "pending"),
@@ -529,18 +493,19 @@ def main():
                    failed += 1

            elif lec["type"] == "pdf":
-                # PDF -> download, extract text, save as transcript, delete source.
-                txt_path = Path(entry["transcript_path"])
-                if txt_path.exists() and txt_path.stat().st_size > 50:
+                # PDF -> download source to summaries/pdf/, keep as-is. No
+                # transcript, no whisper. User reads PDFs directly.
+                pdf_path = Path(lec["audio_path"])  # now points to pdf_dir
+                if pdf_path.exists() and pdf_path.stat().st_size > 1000:
                    entry["download_status"] = "complete"
-                    entry["transcribe_status"] = "complete"
+                    entry["transcribe_status"] = "pdf_source_only"
+                    entry["file_size_bytes"] = pdf_path.stat().st_size
                    skipped += 1
-                    log.info(f"  Skipping pdf (transcript exists): {txt_path.name}")
-                elif download_pdf_and_extract(session, lec, Path(lec["audio_path"]),
-                                               paths["transcripts_dir"]):
+                    log.info(f"  Skipping pdf (source exists): {pdf_path.name}")
+                elif download_pdf(session, lec, pdf_path):
                    entry["download_status"] = "complete"
-                    entry["transcribe_status"] = "complete"
-                    entry["file_size_bytes"] = txt_path.stat().st_size if txt_path.exists() else 0
+                    entry["transcribe_status"] = "pdf_source_only"
+                    entry["file_size_bytes"] = pdf_path.stat().st_size
                    downloaded += 1
                else:
                    entry["download_status"] = "failed"