feat(practitioner): structură per-modul + PDF-uri sursă + split 2-PC

- audio/Modul {N}/filename.mp3 — fiecare modul în subdirector separat
  pentru copiere pe telefon și transfer între PC-uri.
- PDF-urile se păstrează ca sursă în summaries/pdf/ (fără extract txt).
- transcribe_status="pdf_source_only" pentru lecțiile PDF → summarize.py
  le filtrează automat.
- Fix coliziune transcript_path în manifest: căile transcript/srt/summary se
  derivă mereu din stem, nu se mai păstrează valorile din rularea anterioară.
- Câte un .bat per modul (M2-M8) + dispatchere run_pc1_all (M2-M5) și
  run_pc2_all (M6-M8) pentru împărțirea lucrului pe 2 PC-uri.
- prepare_pc2_bundle.py: zip cu scripts + manifest + .env + PDFs pentru
  PC2 (self-installs whisper.cpp/model/ffmpeg la primul run).
- M1 whisper complete (49/49 audio+vimeo transcrise).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-23 08:48:58 +03:00
parent 2e4bb88624
commit 6ee53133b7
132 changed files with 28904 additions and 74 deletions

View File

@@ -148,11 +148,15 @@ def classify_lesson(lesson_div, base_url: str) -> tuple[str, str, str]:
return "text", "", "" # stem filled in by caller using title slug
def discover_lectures(session: requests.Session, module: dict, course: dict) -> list[dict]:
def discover_lectures(session: requests.Session, module: dict, course: dict, mod_idx: int) -> list[dict]:
resp = session.get(module["url"])
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")
paths = course_paths(course)
audio_mod_dir = paths["audio_dir"] / f"Modul {mod_idx}"
pdf_dir = paths["pdf_dir"]
lectures = []
for lesson_div in soup.select("div.lesson"):
name_el = lesson_div.select_one("div.module__name")
@@ -177,20 +181,18 @@ def discover_lectures(session: requests.Session, module: dict, course: dict) ->
"text_content": body_el.get_text("\n", strip=True),
}
elif ltype == "pdf":
# Transcript derives from extracted PDF text. No audio file; the
# "audio_path" is reused as the PDF cache path (deleted after
# extraction to honor the 'nu pastrez sursele' preference).
pdf_cache = course_paths(course)["audio_dir"] / f"{stem}.pdf"
# PDF source is kept (not extracted / not deleted). Lives flat in
# summaries/pdf/ — user reads PDFs directly, no whisper/no txt.
pdf_path = pdf_dir / f"{stem}.pdf"
lecture = {
"type": "pdf",
"title": title,
"original_filename": f"{stem}.pdf",
"url": media_url,
"audio_path": str(pdf_cache),
"audio_path": str(pdf_path),
}
elif ltype == "vimeo":
# Target MP3 filename has .mp3 (yt-dlp extract-audio writes .mp3)
audio_path = course_paths(course)["audio_dir"] / f"{stem}.mp3"
audio_path = audio_mod_dir / f"{stem}.mp3"
lecture = {
"type": "vimeo",
"title": title,
@@ -199,9 +201,8 @@ def discover_lectures(session: requests.Session, module: dict, course: dict) ->
"audio_path": str(audio_path),
}
else: # "audio"
# Preserve original filename (may contain spaces).
filename = media_url.split("/")[-1]
audio_path = course_paths(course)["audio_dir"] / filename
audio_path = audio_mod_dir / filename
lecture = {
"type": "audio",
"title": title,
@@ -321,74 +322,37 @@ def capture_text_lecture(lecture: dict, transcripts_dir: Path) -> bool:
return True
def download_pdf(session: requests.Session, lecture: dict, pdf_path: Path) -> bool:
    """
    Download a PDF resource via the authenticated session and keep it on disk.

    No text extraction is performed (user reads PDFs directly — many are
    infographics). The body is streamed to a sibling ``.pdf.tmp`` file and
    only moved to ``pdf_path`` once the whole response has been written, so
    a partially downloaded file never appears under the final name.

    Args:
        session: Session used to issue the GET request.
        lecture: Lecture entry; only ``lecture["url"]`` is read here.
        pdf_path: Final destination of the PDF. Missing parent directories
            are created.

    Returns:
        True once the PDF is in place at ``pdf_path``. False when the
        downloaded body is smaller than 1000 bytes (not retried) or when
        all ``MAX_RETRIES`` attempts raised.
    """
    pdf_path.parent.mkdir(parents=True, exist_ok=True)
    tmp = pdf_path.with_suffix(".pdf.tmp")

    for attempt in range(MAX_RETRIES):
        try:
            total = 0
            # Context manager releases the streamed connection even when
            # writing the file fails half-way through.
            with session.get(lecture["url"], stream=True, timeout=120) as resp:
                resp.raise_for_status()
                with open(tmp, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=256 * 1024):
                        f.write(chunk)
                        total += len(chunk)
            if total < 1000:
                log.warning(f" PDF too small ({total} bytes): {pdf_path.name}")
                tmp.unlink(missing_ok=True)
                return False
            # replace() overwrites an existing destination on every platform;
            # rename() raises FileExistsError on Windows in that case.
            tmp.replace(pdf_path)
            log.info(f" Downloaded (pdf): {pdf_path.name} ({total / 1024:.0f} KB)")
            return True
        except Exception as e:
            # Retry boundary: any failure of this attempt is logged and retried.
            try:
                # Do not leave a partial download behind.
                tmp.unlink(missing_ok=True)
            except OSError:
                pass
            wait = RETRY_BACKOFF[attempt] if attempt < len(RETRY_BACKOFF) else 30
            log.warning(f" PDF attempt {attempt + 1}/{MAX_RETRIES} failed: {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(wait)

    log.error(f" FAILED PDF download: {lecture['url']}")
    return False
def load_manifest(manifest_path: Path) -> dict | None:
@@ -484,7 +448,7 @@ def main():
if module_filter and mod_idx not in module_filter:
log.info(f" Skipping module {mod_idx}: {mod['name']} (outside --modules filter, preserved in manifest)")
continue
lectures = discover_lectures(session, mod, course)
lectures = discover_lectures(session, mod, course, mod_idx)
module_entry = {
"name": mod["name"],
"module_id": mod["module_id"],
@@ -502,9 +466,9 @@ def main():
"original_filename": lec["original_filename"],
"url": lec["url"],
"audio_path": lec["audio_path"],
"transcript_path": prior.get("transcript_path") or f"{paths['transcripts_dir'].as_posix()}/{stem}.txt",
"srt_path": prior.get("srt_path") or f"{paths['transcripts_dir'].as_posix()}/{stem}.srt",
"summary_path": prior.get("summary_path") or f"{paths['summaries_dir'].as_posix()}/{stem}_summary.md",
"transcript_path": f"{paths['transcripts_dir'].as_posix()}/{stem}.txt",
"srt_path": f"{paths['transcripts_dir'].as_posix()}/{stem}.srt",
"summary_path": f"{paths['summaries_dir'].as_posix()}/{stem}_summary.md",
"download_status": "pending",
# Preserve transcribe_status from prior run (disk check in transcribe.py will correct it if needed).
"transcribe_status": prior.get("transcribe_status", "pending"),
@@ -529,18 +493,19 @@ def main():
failed += 1
elif lec["type"] == "pdf":
# PDF -> download, extract text, save as transcript, delete source.
txt_path = Path(entry["transcript_path"])
if txt_path.exists() and txt_path.stat().st_size > 50:
# PDF -> download source to summaries/pdf/, keep as-is. No
# transcript, no whisper. User reads PDFs directly.
pdf_path = Path(lec["audio_path"]) # now points to pdf_dir
if pdf_path.exists() and pdf_path.stat().st_size > 1000:
entry["download_status"] = "complete"
entry["transcribe_status"] = "complete"
entry["transcribe_status"] = "pdf_source_only"
entry["file_size_bytes"] = pdf_path.stat().st_size
skipped += 1
log.info(f" Skipping pdf (transcript exists): {txt_path.name}")
elif download_pdf_and_extract(session, lec, Path(lec["audio_path"]),
paths["transcripts_dir"]):
log.info(f" Skipping pdf (source exists): {pdf_path.name}")
elif download_pdf(session, lec, pdf_path):
entry["download_status"] = "complete"
entry["transcribe_status"] = "complete"
entry["file_size_bytes"] = txt_path.stat().st_size if txt_path.exists() else 0
entry["transcribe_status"] = "pdf_source_only"
entry["file_size_bytes"] = pdf_path.stat().st_size
downloaded += 1
else:
entry["download_status"] = "failed"