NLP Master: pipeline download + transcribe + summarize
- run.bat: one-click pipeline (download, convert, transcribe) - download.py: fetch audio from course platform - transcribe.py: whisper.cpp batch transcription (CPU, WAV 16kHz) - MP3->WAV conversion via ffmpeg - --modules filter for splitting work across machines - summarize.py: generate summaries from transcripts - setup_whisper.py: auto-download whisper.cpp, ffmpeg, and model - Medium model (q5_0) instead of large to avoid VRAM crashes Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
192
summarize.py
Normal file
192
summarize.py
Normal file
@@ -0,0 +1,192 @@
|
||||
"""
|
||||
Generate summaries from transcripts using Claude Code.
|
||||
Reads manifest.json, processes each transcript, outputs per-lecture summaries,
|
||||
and compiles SUPORT_CURS.md master study guide.
|
||||
|
||||
Usage:
|
||||
python summarize.py # Print prompts for each transcript (pipe to Claude)
|
||||
python summarize.py --compile # Compile existing summaries into SUPORT_CURS.md
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
import textwrap
|
||||
from pathlib import Path
|
||||
|
||||
# Input/output locations; all are relative paths, so they resolve against the
# directory the script is run from.
MANIFEST_PATH = Path("manifest.json")
SUMMARIES_DIR = Path("summaries")
TRANSCRIPTS_DIR = Path("transcripts")
MASTER_GUIDE = Path("SUPORT_CURS.md")

# Chunking limits passed to split_at_sentences(): maximum words per chunk and
# the number of words shared between consecutive chunks.
MAX_WORDS_PER_CHUNK = 10_000
OVERLAP_WORDS = 500
|
||||
|
||||
# Prompt template for summarising one transcript (or one chunk of it).
# Filled with str.format(); expects the fields ``title`` and ``text``.
SUMMARY_PROMPT = """Rezuma aceasta transcriere a unei lectii din cursul NLP Master Practitioner.

Ofera:
1. **Prezentare generala** - 3-5 propozitii care descriu subiectul principal al lectiei
2. **Concepte cheie** - lista cu definitii scurte pentru fiecare concept important
3. **Detalii si exemple importante** - informatii concrete, exercitii practice, exemple relevante mentionate de trainer
4. **Citate memorabile** - fraze sau idei remarcabile (daca exista)

Raspunde in limba romana. Formateaza ca Markdown.

---
TITLU LECTIE: {title}
---
TRANSCRIERE:
{text}
"""
|
||||
|
||||
# Prompt template for merging the partial summaries of a lecture that was
# split into several chunks. Filled with str.format(); expects the fields
# ``title`` and ``chunks``.
MERGE_PROMPT = """Am mai multe rezumate partiale ale aceleiasi lectii (a fost prea lunga pentru un singur rezumat).
Combina-le intr-un singur rezumat coerent, eliminand duplicatele.

Pastreaza structura:
1. Prezentare generala (3-5 propozitii)
2. Concepte cheie cu definitii
3. Detalii si exemple importante
4. Citate memorabile

Raspunde in limba romana. Formateaza ca Markdown.

---
TITLU LECTIE: {title}
---
REZUMATE PARTIALE:
{chunks}
"""
|
||||
|
||||
|
||||
def load_manifest() -> dict:
    """Return the JSON document stored at MANIFEST_PATH, decoded as UTF-8."""
    raw = MANIFEST_PATH.read_text(encoding="utf-8")
    return json.loads(raw)
|
||||
|
||||
|
||||
def split_at_sentences(text: str, max_words: int, overlap: int) -> list[str]:
    """Split text into overlapping chunks, cutting at sentence ends when possible.

    Args:
        text: The text to split.
        max_words: Maximum number of whitespace-separated words per chunk.
        overlap: Number of trailing words of a chunk that are repeated at the
            start of the next chunk.

    Returns:
        ``[text]`` unchanged when it has at most ``max_words`` words.
        Otherwise a list of chunks whose words are re-joined with single
        spaces. Every chunk except the last is trimmed back to the latest
        sentence end (``.``, ``!`` or ``?`` followed by a space) found in its
        second half; if there is none, the chunk is cut at the word limit.
    """
    words = text.split()
    total = len(words)
    if total <= max_words:
        return [text]

    # Chunks are rebuilt with single spaces, so a sentence end can only ever
    # show up as punctuation followed by a space (never by a newline).
    separators = (". ", "! ", "? ")

    chunks = []
    start = 0
    while start < total:
        end = min(start + max_words, total)
        chunk_text = " ".join(words[start:end])

        if end >= total:
            # This chunk reaches the end of the text. Stop here: stepping
            # back by `overlap` again would only re-emit shrinking tails of
            # words that are already covered.
            chunks.append(chunk_text)
            break

        # Latest sentence end of any kind; -1 when there is none.
        last_sep = max(chunk_text.rfind(sep) for sep in separators)
        if last_sep > len(chunk_text) // 2:  # Don't break too early
            chunk_text = chunk_text[:last_sep + 1]
            # Recalculate end from the words actually kept.
            end = start + len(chunk_text.split())

        chunks.append(chunk_text)
        # Step back for the overlap, but always advance by at least one word.
        start = max(end - overlap, start + 1)

    return chunks
|
||||
|
||||
|
||||
def generate_prompts(manifest: dict):
    """Write one summary prompt per pending transcript to stdout.

    Walks every lecture under ``manifest["modules"]``. A lecture is skipped
    when its ``transcribe_status`` is not ``"complete"``, when its summary
    file already exists and is non-empty, or when its transcript file is
    missing or empty. Skip notices and per-lecture statistics go to stderr.

    Stdout receives records terminated by ``---END_PROMPT---``:
    a single ``SUMMARY_FILE:`` record for a transcript that fits in one
    chunk, otherwise one ``CHUNK_PROMPT:i/n:`` record per chunk followed by a
    ``MERGE_FILE:`` record whose body still contains the literal
    ``{chunk_summaries}`` placeholder.
    """
    SUMMARIES_DIR.mkdir(exist_ok=True)

    def log(message):
        # Diagnostics go to stderr so stdout carries only prompt records.
        print(message, file=sys.stderr)

    def emit(header, body):
        # One stdout record: routing header, prompt text, end marker.
        print(header)
        print(body)
        print("---END_PROMPT---")

    lectures = (lec for mod in manifest["modules"] for lec in mod["lectures"])
    for lec in lectures:
        if lec.get("transcribe_status") != "complete":
            continue

        summary_path = Path(lec["summary_path"])
        already_summarized = summary_path.exists() and summary_path.stat().st_size > 0
        if already_summarized:
            log(f"# SKIP (exists): {lec['title']}")
            continue

        txt_path = Path(lec["transcript_path"])
        if not txt_path.exists():
            log(f"# SKIP (no transcript): {lec['title']}")
            continue

        text = txt_path.read_text(encoding="utf-8").strip()
        if not text:
            log(f"# SKIP (empty): {lec['title']}")
            continue

        chunks = split_at_sentences(text, MAX_WORDS_PER_CHUNK, OVERLAP_WORDS)

        log(f"\n{'='*60}")
        log(f"Lecture: {lec['title']}")
        log(f"Words: {len(text.split())}, Chunks: {len(chunks)}")
        log(f"Output: {summary_path}")

        if len(chunks) != 1:
            # Long transcript: one prompt per chunk ...
            for i, chunk in enumerate(chunks, 1):
                prompt = SUMMARY_PROMPT.format(
                    title=f"{lec['title']} (partea {i}/{len(chunks)})",
                    text=chunk,
                )
                emit(f"CHUNK_PROMPT:{i}/{len(chunks)}:{summary_path}", prompt)

            # ... then a merge prompt; the chunk summaries are filled in by a
            # later step, so the placeholder is emitted verbatim.
            print(f"MERGE_FILE:{summary_path}")
            merge = MERGE_PROMPT.format(
                title=lec["title"],
                chunks="{chunk_summaries}",
            )
            print(merge)
            print("---END_PROMPT---")
            continue

        prompt = SUMMARY_PROMPT.format(title=lec["title"], text=text)
        emit(f"SUMMARY_FILE:{summary_path}", prompt)
|
||||
|
||||
|
||||
def compile_master_guide(manifest: dict):
    """Compile all lecture summaries into the MASTER_GUIDE Markdown file.

    Emits a document header, then one ``##`` section per entry in
    ``manifest["modules"]`` and one ``###`` subsection per lecture, each
    module followed by a horizontal rule. A lecture whose summary file is
    missing, empty or whitespace-only gets the "_Rezumat indisponibil._"
    placeholder, matching generate_prompts(), which also treats an empty
    summary file as not yet written.

    Args:
        manifest: Parsed manifest; each lecture needs ``title`` and
            ``summary_path``, each module needs ``name`` and ``lectures``.
    """
    lines = [
        "# SUPORT CURS - NLP Master Practitioner Bucuresti 2025\n",
        "_Generat automat din transcrierile audio ale cursului._\n",
        "---\n",
    ]

    for mod in manifest["modules"]:
        lines.append(f"\n## {mod['name']}\n")

        for lec in mod["lectures"]:
            summary_path = Path(lec["summary_path"])
            lines.append(f"\n### {lec['title']}\n")

            content = ""
            if summary_path.exists():
                content = summary_path.read_text(encoding="utf-8").strip()

            # An empty summary carries no information; show the placeholder
            # instead of a bare heading.
            if content:
                lines.append(f"{content}\n")
            else:
                lines.append("_Rezumat indisponibil._\n")

        lines.append("\n---\n")

    MASTER_GUIDE.write_text("\n".join(lines), encoding="utf-8")
    print(f"Compiled {MASTER_GUIDE} ({MASTER_GUIDE.stat().st_size} bytes)")
|
||||
|
||||
|
||||
def main():
    """Entry point: load the manifest, then compile the guide or print prompts.

    Exits with status 1 when MANIFEST_PATH does not exist. With ``--compile``
    anywhere in ``sys.argv`` the master guide is compiled; otherwise the
    summary prompts are generated.
    """
    if not MANIFEST_PATH.exists():
        print("manifest.json not found. Run download.py and transcribe.py first.")
        sys.exit(1)

    manifest = load_manifest()

    wants_compile = "--compile" in sys.argv
    action = compile_master_guide if wants_compile else generate_prompts
    action(manifest)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user