"""
Generate summaries from transcripts using Claude Code.

Reads manifest.json, processes each transcript, outputs per-lecture summaries,
and compiles SUPORT_CURS.md master study guide.

Usage:
    python summarize.py             # Print prompts for each transcript (pipe to Claude)
    python summarize.py --compile   # Compile existing summaries into SUPORT_CURS.md
"""

import json
import sys
import textwrap
from pathlib import Path

MANIFEST_PATH = Path("manifest.json")
SUMMARIES_DIR = Path("summaries")
TRANSCRIPTS_DIR = Path("transcripts")
MASTER_GUIDE = Path("SUPORT_CURS.md")

# Transcripts longer than MAX_WORDS_PER_CHUNK words are split into chunks;
# consecutive chunks share up to OVERLAP_WORDS words.
MAX_WORDS_PER_CHUNK = 10000
OVERLAP_WORDS = 500

# Prompt for summarizing one transcript (or one chunk of it).
# Placeholders: {title}, {text}.
SUMMARY_PROMPT = """Rezuma aceasta transcriere a unei lectii din cursul NLP Master Practitioner.

Ofera:
1. **Prezentare generala** - 3-5 propozitii care descriu subiectul principal al lectiei
2. **Concepte cheie** - lista cu definitii scurte pentru fiecare concept important
3. **Detalii si exemple importante** - informatii concrete, exercitii practice, exemple relevante mentionate de trainer
4. **Citate memorabile** - fraze sau idei remarcabile (daca exista)

Raspunde in limba romana. Formateaza ca Markdown.

---
TITLU LECTIE: {title}
---

TRANSCRIERE:
{text}
"""

# Prompt for merging per-chunk summaries of one lecture.
# Placeholders: {title}, {chunks}.
MERGE_PROMPT = """Am mai multe rezumate partiale ale aceleiasi lectii (a fost prea lunga pentru un singur rezumat).
Combina-le intr-un singur rezumat coerent, eliminand duplicatele.

Pastreaza structura:
1. Prezentare generala (3-5 propozitii)
2. Concepte cheie cu definitii
3. Detalii si exemple importante
4. Citate memorabile

Raspunde in limba romana. Formateaza ca Markdown.

---
TITLU LECTIE: {title}
---

REZUMATE PARTIALE:
{chunks}
"""

# Line printed to stdout after every prompt so a consumer can split the stream.
_END_MARKER = "---END_PROMPT---"

# Sentence terminators followed by a space. Chunks are rebuilt with
# " ".join(...), so a terminator followed by a newline can never occur.
_SENTENCE_SEPARATORS = (". ", "! ", "? ")


def load_manifest() -> dict:
    """Load and return the parsed JSON content of MANIFEST_PATH."""
    with open(MANIFEST_PATH, encoding="utf-8") as f:
        return json.load(f)


def split_at_sentences(text: str, max_words: int, overlap: int) -> list[str]:
    """Split text into chunks at sentence boundaries with overlap.

    Args:
        text: The text to split.
        max_words: Maximum number of whitespace-separated words per chunk.
        overlap: Number of words a chunk shares with the previous one.

    Returns:
        ``[text]`` unchanged when it has at most ``max_words`` words.
        Otherwise a list of chunks whose words are joined by single spaces
        (original line breaks are not preserved). Every chunk except the
        last is cut after the latest ". ", "! " or "? " found in its second
        half, when there is one.
    """
    words = text.split()
    total = len(words)
    if total <= max_words:
        return [text]

    chunks = []
    start = 0
    while start < total:
        end = min(start + max_words, total)
        chunk_text = " ".join(words[start:end])

        if end < total:
            # Use the latest sentence end of any kind, not the first
            # separator type that happens to match.
            cut = max(chunk_text.rfind(sep) for sep in _SENTENCE_SEPARATORS)
            if cut > len(chunk_text) // 2:  # Don't break too early
                chunk_text = chunk_text[:cut + 1]  # keep the punctuation mark
                # Recalculate end based on the words actually used.
                end = start + len(chunk_text.split())

        chunks.append(chunk_text)

        if end >= total:
            # The text is fully covered. Without this exit the overlap
            # step-back would keep re-emitting ever shorter tail chunks.
            break

        start = max(end - overlap, start + 1)  # Overlap, but always advance

    return chunks


def _emit_prompt(header: str, prompt: str) -> None:
    """Print one prompt record to stdout: header line, prompt, end marker."""
    print(header)
    print(prompt)
    print(_END_MARKER)


def generate_prompts(manifest: dict) -> None:
    """Print summary prompts for each transcript to stdout.

    Only lectures whose ``transcribe_status`` is ``"complete"`` are
    considered. A lecture is skipped (with a note on stderr) when its
    summary file already exists and is non-empty, when its transcript file
    is missing, or when the transcript is empty.

    Prompts go to stdout and progress information to stderr, so stdout can
    be piped. A single-chunk lecture produces one ``SUMMARY_FILE:`` record.
    A multi-chunk lecture produces one ``CHUNK_PROMPT:`` record per chunk,
    then a ``MERGE_FILE:`` record whose prompt contains the literal
    placeholder ``{chunk_summaries}``.
    """
    SUMMARIES_DIR.mkdir(exist_ok=True)

    for mod in manifest["modules"]:
        for lec in mod["lectures"]:
            if lec.get("transcribe_status") != "complete":
                continue

            summary_path = Path(lec["summary_path"])
            if summary_path.exists() and summary_path.stat().st_size > 0:
                print(f"# SKIP (exists): {lec['title']}", file=sys.stderr)
                continue

            txt_path = Path(lec["transcript_path"])
            if not txt_path.exists():
                print(f"# SKIP (no transcript): {lec['title']}", file=sys.stderr)
                continue

            text = txt_path.read_text(encoding="utf-8").strip()
            if not text:
                print(f"# SKIP (empty): {lec['title']}", file=sys.stderr)
                continue

            chunks = split_at_sentences(text, MAX_WORDS_PER_CHUNK, OVERLAP_WORDS)

            print(f"\n{'='*60}", file=sys.stderr)
            print(f"Lecture: {lec['title']}", file=sys.stderr)
            print(f"Words: {len(text.split())}, Chunks: {len(chunks)}", file=sys.stderr)
            print(f"Output: {summary_path}", file=sys.stderr)

            if len(chunks) == 1:
                prompt = SUMMARY_PROMPT.format(title=lec["title"], text=text)
                _emit_prompt(f"SUMMARY_FILE:{summary_path}", prompt)
                continue

            # Multi-chunk: generate individual chunk prompts
            for i, chunk in enumerate(chunks, 1):
                prompt = SUMMARY_PROMPT.format(
                    title=f"{lec['title']} (partea {i}/{len(chunks)})",
                    text=chunk,
                )
                _emit_prompt(f"CHUNK_PROMPT:{i}/{len(chunks)}:{summary_path}", prompt)

            # Then a merge prompt
            merge = MERGE_PROMPT.format(
                title=lec["title"],
                chunks="{chunk_summaries}",  # Placeholder for merge step
            )
            _emit_prompt(f"MERGE_FILE:{summary_path}", merge)


def compile_master_guide(manifest: dict) -> None:
    """Compile all summaries into SUPORT_CURS.md.

    Writes one ``##`` section per module and one ``###`` section per
    lecture. A lecture whose summary file is missing or empty gets a
    "summary unavailable" note, as does a lecture with no ``summary_path``
    entry. Prints the size of the written file to stdout.
    """
    lines = [
        "# SUPORT CURS - NLP Master Practitioner Bucuresti 2025\n",
        "_Generat automat din transcrierile audio ale cursului._\n",
        "---\n",
    ]

    for mod in manifest["modules"]:
        lines.append(f"\n## {mod['name']}\n")

        for lec in mod["lectures"]:
            lines.append(f"\n### {lec['title']}\n")

            # NOTE(review): lectures that were never transcribed presumably
            # may lack "summary_path"; handled defensively - confirm schema.
            raw_path = lec.get("summary_path")
            content = ""
            if raw_path and Path(raw_path).exists():
                content = Path(raw_path).read_text(encoding="utf-8").strip()

            # An empty summary file counts as missing, matching the
            # "exists and non-empty" check in generate_prompts.
            if content:
                lines.append(f"{content}\n")
            else:
                lines.append("_Rezumat indisponibil._\n")

        lines.append("\n---\n")

    MASTER_GUIDE.write_text("\n".join(lines), encoding="utf-8")
    print(f"Compiled {MASTER_GUIDE} ({MASTER_GUIDE.stat().st_size} bytes)")


def main() -> None:
    """Entry point: compile the guide with ``--compile``, else print prompts.

    Exits with status 1 when MANIFEST_PATH does not exist.
    """
    if not MANIFEST_PATH.exists():
        # stderr, not stdout: stdout is the prompt stream meant for piping.
        print(
            "manifest.json not found. Run download.py and transcribe.py first.",
            file=sys.stderr,
        )
        sys.exit(1)

    manifest = load_manifest()

    if "--compile" in sys.argv:
        compile_master_guide(manifest)
    else:
        generate_prompts(manifest)


if __name__ == "__main__":
    main()