feat(practitioner): structură per-modul + PDF-uri sursă + split 2-PC
- audio/Modul {N}/filename.mp3 — fiecare modul în subdirector separat
pentru copiere pe telefon și transfer între PC-uri.
- PDF-urile se păstrează ca sursă în summaries/pdf/ (fără extragere de text în .txt).
- transcribe_status="pdf_source_only" pentru lecțiile PDF → summarize.py
le filtrează automat.
- Fix coliziune transcript_path în manifest: căile se derivă mereu din stem, fără a mai păstra valorile din rularea anterioară.
- Câte un .bat per modul (M2-M8) + dispatchere run_pc1_all (M2-M5) și run_pc2_all
(M6-M8) pentru împărțirea lucrului pe 2 PC-uri.
- prepare_pc2_bundle.py: arhivă zip cu scripts + manifest + .env + PDF-uri pentru
PC2 (instalează singur whisper.cpp/model/ffmpeg la prima rulare).
- M1: transcriere whisper completă (49/49 lecții audio+vimeo transcrise).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
107
download.py
107
download.py
@@ -148,11 +148,15 @@ def classify_lesson(lesson_div, base_url: str) -> tuple[str, str, str]:
|
||||
return "text", "", "" # stem filled in by caller using title slug
|
||||
|
||||
|
||||
def discover_lectures(session: requests.Session, module: dict, course: dict) -> list[dict]:
|
||||
def discover_lectures(session: requests.Session, module: dict, course: dict, mod_idx: int) -> list[dict]:
|
||||
resp = session.get(module["url"])
|
||||
resp.raise_for_status()
|
||||
soup = BeautifulSoup(resp.text, "html.parser")
|
||||
|
||||
paths = course_paths(course)
|
||||
audio_mod_dir = paths["audio_dir"] / f"Modul {mod_idx}"
|
||||
pdf_dir = paths["pdf_dir"]
|
||||
|
||||
lectures = []
|
||||
for lesson_div in soup.select("div.lesson"):
|
||||
name_el = lesson_div.select_one("div.module__name")
|
||||
@@ -177,20 +181,18 @@ def discover_lectures(session: requests.Session, module: dict, course: dict) ->
|
||||
"text_content": body_el.get_text("\n", strip=True),
|
||||
}
|
||||
elif ltype == "pdf":
|
||||
# Transcript derives from extracted PDF text. No audio file; the
|
||||
# "audio_path" is reused as the PDF cache path (deleted after
|
||||
# extraction to honor the 'nu pastrez sursele' preference).
|
||||
pdf_cache = course_paths(course)["audio_dir"] / f"{stem}.pdf"
|
||||
# PDF source is kept (not extracted / not deleted). Lives flat in
|
||||
# summaries/pdf/ — user reads PDFs directly, no whisper/no txt.
|
||||
pdf_path = pdf_dir / f"{stem}.pdf"
|
||||
lecture = {
|
||||
"type": "pdf",
|
||||
"title": title,
|
||||
"original_filename": f"{stem}.pdf",
|
||||
"url": media_url,
|
||||
"audio_path": str(pdf_cache),
|
||||
"audio_path": str(pdf_path),
|
||||
}
|
||||
elif ltype == "vimeo":
|
||||
# Target MP3 filename has .mp3 (yt-dlp extract-audio writes .mp3)
|
||||
audio_path = course_paths(course)["audio_dir"] / f"{stem}.mp3"
|
||||
audio_path = audio_mod_dir / f"{stem}.mp3"
|
||||
lecture = {
|
||||
"type": "vimeo",
|
||||
"title": title,
|
||||
@@ -199,9 +201,8 @@ def discover_lectures(session: requests.Session, module: dict, course: dict) ->
|
||||
"audio_path": str(audio_path),
|
||||
}
|
||||
else: # "audio"
|
||||
# Preserve original filename (may contain spaces).
|
||||
filename = media_url.split("/")[-1]
|
||||
audio_path = course_paths(course)["audio_dir"] / filename
|
||||
audio_path = audio_mod_dir / filename
|
||||
lecture = {
|
||||
"type": "audio",
|
||||
"title": title,
|
||||
@@ -321,74 +322,37 @@ def capture_text_lecture(lecture: dict, transcripts_dir: Path) -> bool:
|
||||
return True
|
||||
|
||||
|
||||
def download_pdf_and_extract(session: requests.Session, lecture: dict,
|
||||
pdf_cache: Path, transcripts_dir: Path) -> bool:
|
||||
def download_pdf(session: requests.Session, lecture: dict, pdf_path: Path) -> bool:
|
||||
"""
|
||||
Download PDF resource via authenticated session, extract text via pypdf,
|
||||
write as transcript .txt. Delete PDF after extraction (no source retention).
|
||||
Download PDF resource via authenticated session, save source file.
|
||||
No text extraction (user reads PDFs directly — many are infographics).
|
||||
"""
|
||||
try:
|
||||
from pypdf import PdfReader
|
||||
except ImportError:
|
||||
log.error("pypdf not installed. Run: pip install pypdf")
|
||||
return False
|
||||
pdf_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
pdf_cache.parent.mkdir(parents=True, exist_ok=True)
|
||||
transcripts_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Download PDF (resource is small — typically <5 MB rezumat)
|
||||
for attempt in range(MAX_RETRIES):
|
||||
try:
|
||||
resp = session.get(lecture["url"], stream=True, timeout=120)
|
||||
resp.raise_for_status()
|
||||
tmp = pdf_cache.with_suffix(".pdf.tmp")
|
||||
tmp = pdf_path.with_suffix(".pdf.tmp")
|
||||
total = 0
|
||||
with open(tmp, "wb") as f:
|
||||
for chunk in resp.iter_content(chunk_size=256 * 1024):
|
||||
f.write(chunk)
|
||||
total += len(chunk)
|
||||
if total < 1000:
|
||||
log.warning(f" PDF too small ({total} bytes): {pdf_cache.name}")
|
||||
log.warning(f" PDF too small ({total} bytes): {pdf_path.name}")
|
||||
tmp.unlink(missing_ok=True)
|
||||
return False
|
||||
tmp.rename(pdf_cache)
|
||||
log.info(f" Downloaded (pdf): {pdf_cache.name} ({total / 1024:.0f} KB)")
|
||||
break
|
||||
tmp.rename(pdf_path)
|
||||
log.info(f" Downloaded (pdf): {pdf_path.name} ({total / 1024:.0f} KB)")
|
||||
return True
|
||||
except Exception as e:
|
||||
wait = RETRY_BACKOFF[attempt] if attempt < len(RETRY_BACKOFF) else 30
|
||||
log.warning(f" PDF attempt {attempt + 1}/{MAX_RETRIES} failed: {e}")
|
||||
if attempt < MAX_RETRIES - 1:
|
||||
time.sleep(wait)
|
||||
else:
|
||||
log.error(f" FAILED PDF download: {lecture['url']}")
|
||||
return False
|
||||
|
||||
# Extract text
|
||||
stem = Path(lecture["original_filename"]).stem
|
||||
txt_path = transcripts_dir / f"{stem}.txt"
|
||||
try:
|
||||
reader = PdfReader(str(pdf_cache))
|
||||
pages_text = []
|
||||
for i, page in enumerate(reader.pages, 1):
|
||||
t = page.extract_text() or ""
|
||||
if t.strip():
|
||||
pages_text.append(f"--- pagina {i} ---\n{t.strip()}")
|
||||
body = "\n\n".join(pages_text).strip()
|
||||
except Exception as e:
|
||||
log.error(f" pypdf extract failed on {pdf_cache.name}: {e}")
|
||||
return False
|
||||
|
||||
if len(body) < 50:
|
||||
log.warning(f" PDF '{lecture['title']}' extracted <50 chars; keeping file, skipping transcript")
|
||||
return False
|
||||
|
||||
header = f"[PDF LECTURE — extracted from {lecture['url']}]\nTitlu: {lecture['title']}\n\n"
|
||||
txt_path.write_text(header + body, encoding="utf-8")
|
||||
log.info(f" Extracted (pdf): {txt_path.name} ({txt_path.stat().st_size} bytes, {len(reader.pages)} pages)")
|
||||
|
||||
# Delete source PDF (user preference: nu pastrez sursele)
|
||||
pdf_cache.unlink(missing_ok=True)
|
||||
return True
|
||||
log.error(f" FAILED PDF download: {lecture['url']}")
|
||||
return False
|
||||
|
||||
|
||||
def load_manifest(manifest_path: Path) -> dict | None:
|
||||
@@ -484,7 +448,7 @@ def main():
|
||||
if module_filter and mod_idx not in module_filter:
|
||||
log.info(f" Skipping module {mod_idx}: {mod['name']} (outside --modules filter, preserved in manifest)")
|
||||
continue
|
||||
lectures = discover_lectures(session, mod, course)
|
||||
lectures = discover_lectures(session, mod, course, mod_idx)
|
||||
module_entry = {
|
||||
"name": mod["name"],
|
||||
"module_id": mod["module_id"],
|
||||
@@ -502,9 +466,9 @@ def main():
|
||||
"original_filename": lec["original_filename"],
|
||||
"url": lec["url"],
|
||||
"audio_path": lec["audio_path"],
|
||||
"transcript_path": prior.get("transcript_path") or f"{paths['transcripts_dir'].as_posix()}/{stem}.txt",
|
||||
"srt_path": prior.get("srt_path") or f"{paths['transcripts_dir'].as_posix()}/{stem}.srt",
|
||||
"summary_path": prior.get("summary_path") or f"{paths['summaries_dir'].as_posix()}/{stem}_summary.md",
|
||||
"transcript_path": f"{paths['transcripts_dir'].as_posix()}/{stem}.txt",
|
||||
"srt_path": f"{paths['transcripts_dir'].as_posix()}/{stem}.srt",
|
||||
"summary_path": f"{paths['summaries_dir'].as_posix()}/{stem}_summary.md",
|
||||
"download_status": "pending",
|
||||
# Preserve transcribe_status from prior run (disk check in transcribe.py will correct it if needed).
|
||||
"transcribe_status": prior.get("transcribe_status", "pending"),
|
||||
@@ -529,18 +493,19 @@ def main():
|
||||
failed += 1
|
||||
|
||||
elif lec["type"] == "pdf":
|
||||
# PDF -> download, extract text, save as transcript, delete source.
|
||||
txt_path = Path(entry["transcript_path"])
|
||||
if txt_path.exists() and txt_path.stat().st_size > 50:
|
||||
# PDF -> download source to summaries/pdf/, keep as-is. No
|
||||
# transcript, no whisper. User reads PDFs directly.
|
||||
pdf_path = Path(lec["audio_path"]) # now points to pdf_dir
|
||||
if pdf_path.exists() and pdf_path.stat().st_size > 1000:
|
||||
entry["download_status"] = "complete"
|
||||
entry["transcribe_status"] = "complete"
|
||||
entry["transcribe_status"] = "pdf_source_only"
|
||||
entry["file_size_bytes"] = pdf_path.stat().st_size
|
||||
skipped += 1
|
||||
log.info(f" Skipping pdf (transcript exists): {txt_path.name}")
|
||||
elif download_pdf_and_extract(session, lec, Path(lec["audio_path"]),
|
||||
paths["transcripts_dir"]):
|
||||
log.info(f" Skipping pdf (source exists): {pdf_path.name}")
|
||||
elif download_pdf(session, lec, pdf_path):
|
||||
entry["download_status"] = "complete"
|
||||
entry["transcribe_status"] = "complete"
|
||||
entry["file_size_bytes"] = txt_path.stat().st_size if txt_path.exists() else 0
|
||||
entry["transcribe_status"] = "pdf_source_only"
|
||||
entry["file_size_bytes"] = pdf_path.stat().st_size
|
||||
downloaded += 1
|
||||
else:
|
||||
entry["download_status"] = "failed"
|
||||
|
||||
Reference in New Issue
Block a user