NLP Master: pipeline download + transcribe + summarize

- run.bat: one-click pipeline (download, convert, transcribe)
- download.py: fetch audio from course platform
- transcribe.py: whisper.cpp batch transcription (CPU, WAV 16kHz)
- MP3->WAV conversion via ffmpeg
- --modules filter for splitting work across machines
- summarize.py: generate summaries from transcripts
- setup_whisper.py: auto-download whisper.cpp, ffmpeg, and model
- Medium model (q5_0) instead of large to avoid VRAM crashes

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-24 01:37:13 +02:00
commit bbc5884545
10 changed files with 2203 additions and 0 deletions
--- a/summarize.py
+++ b/summarize.py
@@ -0,0 +1,192 @@
+"""
+Generate summaries from transcripts using Claude Code.
+Reads manifest.json, processes each transcript, outputs per-lecture summaries,
+and compiles SUPORT_CURS.md master study guide.
+
+Usage:
+  python summarize.py              # Print prompts for each transcript (pipe to Claude)
+  python summarize.py --compile    # Compile existing summaries into SUPORT_CURS.md
+"""
+
+import json
+import sys
+import textwrap
+from pathlib import Path
+
+MANIFEST_PATH = Path("manifest.json")  # read by load_manifest()
+SUMMARIES_DIR = Path("summaries")  # created by generate_prompts() if missing
+TRANSCRIPTS_DIR = Path("transcripts")  # NOTE(review): not referenced in the visible code
+MASTER_GUIDE = Path("SUPORT_CURS.md")  # output file of compile_master_guide()
+
+MAX_WORDS_PER_CHUNK = 10000  # max_words passed to split_at_sentences()
+OVERLAP_WORDS = 500  # overlap (in words) passed to split_at_sentences()
+
+SUMMARY_PROMPT = """Rezuma aceasta transcriere a unei lectii din cursul NLP Master Practitioner.
+
+Ofera:
+1. **Prezentare generala** - 3-5 propozitii care descriu subiectul principal al lectiei
+2. **Concepte cheie** - lista cu definitii scurte pentru fiecare concept important
+3. **Detalii si exemple importante** - informatii concrete, exercitii practice, exemple relevante mentionate de trainer
+4. **Citate memorabile** - fraze sau idei remarcabile (daca exista)
+
+Raspunde in limba romana. Formateaza ca Markdown.
+
+---
+TITLU LECTIE: {title}
+---
+TRANSCRIERE:
+{text}
+"""  # str.format template; fields: {title}, {text}
+
+MERGE_PROMPT = """Am mai multe rezumate partiale ale aceleiasi lectii (a fost prea lunga pentru un singur rezumat).
+Combina-le intr-un singur rezumat coerent, eliminand duplicatele.
+
+Pastreaza structura:
+1. Prezentare generala (3-5 propozitii)
+2. Concepte cheie cu definitii
+3. Detalii si exemple importante
+4. Citate memorabile
+
+Raspunde in limba romana. Formateaza ca Markdown.
+
+---
+TITLU LECTIE: {title}
+---
+REZUMATE PARTIALE:
+{chunks}
+"""  # str.format template; fields: {title}, {chunks}
+
+
+def load_manifest() -> dict:
+    """Return the parsed JSON content of MANIFEST_PATH (decoded as UTF-8)."""
+    return json.loads(MANIFEST_PATH.read_text(encoding="utf-8"))
+
+
+def split_at_sentences(text: str, max_words: int, overlap: int) -> list[str]:
+    """Split text into chunks of at most max_words words, cut at sentence ends.
+
+    Consecutive chunks share up to `overlap` words. Text that already fits is
+    returned unchanged as the only chunk; otherwise chunks are space-joined.
+    """
+    words = text.split()
+    if len(words) <= max_words:
+        return [text]
+
+    chunks, start = [], 0
+    while start < len(words):
+        end = min(start + max_words, len(words))
+        chunk_text = " ".join(words[start:end])
+        if end == len(words):
+            # Final chunk: stop, or the overlap step would re-emit the tail.
+            chunks.append(chunk_text)
+            break
+        # Words are space-joined, so a sentence end is always mark + " ".
+        # Cut at the latest one, but only if it lies in the second half.
+        last_sep = max(chunk_text.rfind(sep) for sep in (". ", "! ", "? "))
+        if last_sep > len(chunk_text) // 2:
+            chunk_text = chunk_text[:last_sep + 1]
+            end = start + len(chunk_text.split())
+        chunks.append(chunk_text)
+        start = max(end - overlap, start + 1)  # Overlap, but always advance
+    return chunks
+
+
+def generate_prompts(manifest: dict):
+    """Print a summary prompt for every lecture that still needs a summary.
+
+    A lecture is processed when its transcribe_status is "complete", its
+    summary file is missing or empty, and its transcript file exists and is
+    non-empty. Skip notices and per-lecture stats go to stderr. Stdout gets
+    one record per prompt: a marker line (SUMMARY_FILE, CHUNK_PROMPT or
+    MERGE_FILE), the prompt text, then a ---END_PROMPT--- line.
+    """
+
+    def emit(marker: str, prompt: str) -> None:
+        # One stdout record: marker line, prompt body, terminator line.
+        print(marker)
+        print(prompt)
+        print("---END_PROMPT---")
+
+    SUMMARIES_DIR.mkdir(exist_ok=True)
+    for lec in (lec for mod in manifest["modules"] for lec in mod["lectures"]):
+        if lec.get("transcribe_status") != "complete":
+            continue
+
+        # Checks run in order; the first one that matches names the skip reason.
+        out_path = Path(lec["summary_path"])
+        if out_path.exists() and out_path.stat().st_size > 0:
+            skip = "exists"
+        elif not (src := Path(lec["transcript_path"])).exists():
+            skip = "no transcript"
+        elif not (text := src.read_text(encoding="utf-8").strip()):
+            skip = "empty"
+        else:
+            skip = None
+        if skip is not None:
+            print(f"# SKIP ({skip}): {lec['title']}", file=sys.stderr)
+            continue
+
+        parts = split_at_sentences(text, MAX_WORDS_PER_CHUNK, OVERLAP_WORDS)
+        total = len(parts)
+        print(f"\n{'='*60}", file=sys.stderr)
+        print(f"Lecture: {lec['title']}", file=sys.stderr)
+        print(f"Words: {len(text.split())}, Chunks: {total}", file=sys.stderr)
+        print(f"Output: {out_path}", file=sys.stderr)
+
+        if total == 1:
+            whole = SUMMARY_PROMPT.format(title=lec["title"], text=text)
+            emit(f"SUMMARY_FILE:{out_path}", whole)
+            continue
+
+        # Too long for one prompt: one prompt per chunk, then a merge prompt.
+        for idx, part in enumerate(parts, 1):
+            label = f"{lec['title']} (partea {idx}/{total})"
+            partial = SUMMARY_PROMPT.format(title=label, text=part)
+            emit(f"CHUNK_PROMPT:{idx}/{total}:{out_path}", partial)
+        # The literal {chunk_summaries} is left in the output for the merge step.
+        merged = MERGE_PROMPT.format(title=lec["title"], chunks="{chunk_summaries}")
+        emit(f"MERGE_FILE:{out_path}", merged)
+
+
+def compile_master_guide(manifest: dict):
+    """Compile all lecture summaries into the MASTER_GUIDE Markdown file.
+
+    Emits a "##" heading per module and a "###" heading per lecture, in
+    manifest order, followed by the lecture's summary text. A summary file
+    that is missing or blank is replaced by a placeholder line; blank files
+    are treated as missing, the same way generate_prompts treats them.
+    """
+    lines = [
+        "# SUPORT CURS - NLP Master Practitioner Bucuresti 2025\n",
+        "_Generat automat din transcrierile audio ale cursului._\n",
+        "---\n",
+    ]
+    for mod in manifest["modules"]:
+        lines.append(f"\n## {mod['name']}\n")
+        for lec in mod["lectures"]:
+            summary_path = Path(lec["summary_path"])
+            lines.append(f"\n### {lec['title']}\n")
+            content = ""
+            if summary_path.exists():
+                content = summary_path.read_text(encoding="utf-8").strip()
+            lines.append(f"{content}\n" if content else "_Rezumat indisponibil._\n")
+        lines.append("\n---\n")
+    MASTER_GUIDE.write_text("\n".join(lines), encoding="utf-8")
+    print(f"Compiled {MASTER_GUIDE} ({MASTER_GUIDE.stat().st_size} bytes)")
+
+
+def main():
+    """Run the mode selected on the command line.
+
+    --compile compiles the master guide; the default prints the prompts.
+    Exits with status 1 when the manifest file does not exist.
+    """
+    if not MANIFEST_PATH.exists():
+        print("manifest.json not found. Run download.py and transcribe.py first.")
+        sys.exit(1)
+    action = compile_master_guide if "--compile" in sys.argv else generate_prompts
+    action(load_manifest())
+
+
+if __name__ == "__main__":
+    main()