Files
nlp-master/summarize.py
Marius Mutu d22038d002 refactor: parametrize pipeline cu --course flag + suport Vimeo/text
Un singur set de scripturi acum rulează pe orice curs configurat în
courses.py. Master rămâne la rădăcina repo (backward-compat M1-M6);
cursuri noi (ex. practitioner la shop.cursnlp.ro) primesc un root
dedicat (nlp-practitioner/) cu propriile artefacte.

- courses.py: config dict (master, practitioner) + course_paths() +
  validate_manifest_course() (manifest fără course_key = master).
- download.py: --course + --modules; trei tipuri de lecții (audio HTTP,
  Vimeo iframe via yt-dlp audio-only, text-only cu captură HTML);
  merge cu manifest existent în loc de replace; strip [Audio] pentru
  backward-compat paths.
- transcribe.py: --course + --modules; skip type==text; path-uri prin
  course_paths(); validare course_key.
- summarize.py: --course + --compile; template prompt folosește
  course['name']; scrie SUPORT_CURS.md cu LF explicit (WSL2 baseline).
- md_to_pdf.py: --course resolv-ă summaries_dir / pdf_dir per curs.
- run.bat: detectează master|practitioner ca primul argument,
  propagă --course la sub-scripturi; backward-compat run.bat [modules].
- requirements.txt: + yt-dlp.
- .gitignore: nlp-practitioner/audio/, audio_wav/, scratch_recon.py, tmp_recon/.
- tests/test_regression.sh: 5 gate-uri read-only (import, schema,
  disk-coherence, SUPORT_CURS byte-identic, cross-course isolation).

Regression curs master: PASS (manifest + SUPORT_CURS.md hash
identic cu baseline /tmp/suport_before.md).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 14:33:19 +03:00

201 lines
6.8 KiB
Python

"""
Generate summaries from transcripts using Claude Code.
Reads <root>/manifest.json, processes each transcript, outputs per-lecture
summaries, and compiles <root>/SUPORT_CURS.md master study guide.
Usage:
python summarize.py # master, print prompts
python summarize.py --course practitioner # practitioner, print prompts
python summarize.py --compile # master, compile SUPORT_CURS.md
python summarize.py --course practitioner --compile
"""
import argparse
import json
import sys
from pathlib import Path
from courses import course_paths, get_course, validate_manifest_course
# Chunking limits for long transcripts: lectures above MAX_WORDS_PER_CHUNK
# words are summarized in pieces, with OVERLAP_WORDS of shared context
# carried between consecutive pieces so ideas at a boundary are not lost.
MAX_WORDS_PER_CHUNK = 10000
OVERLAP_WORDS = 500

# Per-lecture summarization prompt (Romanian, intentionally — the course
# content is Romanian). {course_name}, {title} and {text} are filled via
# str.format in generate_prompts().
SUMMARY_PROMPT = """Rezuma aceasta transcriere a unei lectii din cursul {course_name}.
Ofera:
1. **Prezentare generala** - 3-5 propozitii care descriu subiectul principal al lectiei
2. **Concepte cheie** - lista cu definitii scurte pentru fiecare concept important
3. **Detalii si exemple importante** - informatii concrete, exercitii practice, exemple relevante mentionate de trainer
4. **Citate memorabile** - fraze sau idei remarcabile (daca exista)
Raspunde in limba romana. Formateaza ca Markdown.
---
TITLU LECTIE: {title}
---
TRANSCRIERE:
{text}
"""

# Merge prompt used when a lecture was split into chunks. generate_prompts()
# fills {chunks} with the literal "{chunk_summaries}" placeholder, which the
# downstream driver later substitutes with the per-chunk summaries.
MERGE_PROMPT = """Am mai multe rezumate partiale ale aceleiasi lectii (a fost prea lunga pentru un singur rezumat).
Combina-le intr-un singur rezumat coerent, eliminand duplicatele.
Pastreaza structura:
1. Prezentare generala (3-5 propozitii)
2. Concepte cheie cu definitii
3. Detalii si exemple importante
4. Citate memorabile
Raspunde in limba romana. Formateaza ca Markdown.
---
TITLU LECTIE: {title}
---
REZUMATE PARTIALE:
{chunks}
"""
def load_manifest(manifest_path: Path) -> dict:
    """Read and parse the course manifest JSON file."""
    return json.loads(manifest_path.read_text(encoding="utf-8"))
def split_at_sentences(text: str, max_words: int, overlap: int) -> list[str]:
    """Split *text* into word-bounded chunks of at most *max_words* words.

    Consecutive chunks share roughly *overlap* words of context. When a chunk
    is not the last one, it is trimmed back to the last sentence boundary in
    its second half (if any), so chunks tend to end on complete sentences.

    Returns [text] unchanged when the whole text fits in a single chunk.

    Bug fix: the original loop did not terminate after emitting the final
    chunk — `start = max(end - overlap, start + 1)` stepped back inside the
    tail and re-emitted near-duplicate sliver chunks (advancing one word at a
    time) until `start` crawled past the end. We now break as soon as a chunk
    reaches the end of the text.
    """
    words = text.split()
    if len(words) <= max_words:
        return [text]
    chunks = []
    start = 0
    while start < len(words):
        end = min(start + max_words, len(words))
        chunk_text = " ".join(words[start:end])
        if end < len(words):
            # Prefer ending on a sentence boundary, but only when doing so
            # keeps at least half the chunk (avoids degenerate tiny chunks).
            for sep in (". ", "! ", "? ", ".\n", "!\n", "?\n"):
                last_sep = chunk_text.rfind(sep)
                if last_sep > len(chunk_text) // 2:
                    chunk_text = chunk_text[:last_sep + 1]
                    end = start + len(chunk_text.split())
                    break
        chunks.append(chunk_text)
        if end >= len(words):
            # Final chunk reached the end of the text: stop. Without this the
            # overlap step below would restart inside the tail and duplicate it.
            break
        # Step back `overlap` words for context; always advance by >= 1 word
        # so the loop cannot stall.
        start = max(end - overlap, start + 1)
    return chunks
def generate_prompts(manifest: dict, course: dict, paths: dict):
    """Print summarization prompts for every lecture that still needs one.

    Emits a line protocol on stdout that a downstream driver parses:
    - "SUMMARY_FILE:<path>" + prompt + "---END_PROMPT---" for lectures that
      fit in a single prompt;
    - one "CHUNK_PROMPT:i/n:<path>" block per chunk followed by a
      "MERGE_FILE:<path>" block for lectures too long for one pass.
    All diagnostics go to stderr so they never mix into the protocol stream.
    """
    paths["summaries_dir"].mkdir(parents=True, exist_ok=True)
    for mod in manifest["modules"]:
        for lec in mod["lectures"]:
            # Only lectures whose transcription finished can be summarized.
            if lec.get("transcribe_status") != "complete":
                continue
            summary_path = Path(lec["summary_path"])
            # Idempotent: never regenerate a non-empty existing summary.
            if summary_path.exists() and summary_path.stat().st_size > 0:
                print(f"# SKIP (exists): {lec['title']}", file=sys.stderr)
                continue
            txt_path = Path(lec["transcript_path"])
            if not txt_path.exists():
                print(f"# SKIP (no transcript): {lec['title']}", file=sys.stderr)
                continue
            text = txt_path.read_text(encoding="utf-8").strip()
            if not text:
                print(f"# SKIP (empty): {lec['title']}", file=sys.stderr)
                continue
            chunks = split_at_sentences(text, MAX_WORDS_PER_CHUNK, OVERLAP_WORDS)
            print(f"\n{'='*60}", file=sys.stderr)
            print(f"Lecture: {lec['title']}", file=sys.stderr)
            print(f"Words: {len(text.split())}, Chunks: {len(chunks)}", file=sys.stderr)
            print(f"Output: {summary_path}", file=sys.stderr)
            if len(chunks) == 1:
                # Short lecture: a single prompt produces the whole summary.
                prompt = SUMMARY_PROMPT.format(
                    course_name=course["name"], title=lec["title"], text=text,
                )
                print(f"SUMMARY_FILE:{summary_path}")
                print(prompt)
                print("---END_PROMPT---")
            else:
                # Long lecture: one prompt per chunk, then a merge prompt.
                for i, chunk in enumerate(chunks, 1):
                    prompt = SUMMARY_PROMPT.format(
                        course_name=course["name"],
                        title=f"{lec['title']} (partea {i}/{len(chunks)})",
                        text=chunk,
                    )
                    print(f"CHUNK_PROMPT:{i}/{len(chunks)}:{summary_path}")
                    print(prompt)
                    print("---END_PROMPT---")
                print(f"MERGE_FILE:{summary_path}")
                # "{chunk_summaries}" is passed through as a literal
                # placeholder; the downstream driver substitutes it with the
                # per-chunk summary results.
                merge = MERGE_PROMPT.format(
                    title=lec["title"],
                    chunks="{chunk_summaries}",
                )
                print(merge)
                print("---END_PROMPT---")
def compile_master_guide(manifest: dict, course: dict, paths: dict):
    """Assemble the per-lecture summaries into one SUPORT_CURS.md study guide.

    Walks the manifest module by module; a missing summary file yields a
    placeholder section rather than an error. The guide is always written
    with LF line endings, on any platform.
    """
    parts = [
        f"# SUPORT CURS - {course['name']}\n",
        "_Generat automat din transcrierile audio ale cursului._\n",
        "---\n",
    ]
    for module in manifest["modules"]:
        parts.append(f"\n## {module['name']}\n")
        for lecture in module["lectures"]:
            parts.append(f"\n### {lecture['title']}\n")
            summary_file = Path(lecture["summary_path"])
            if not summary_file.exists():
                parts.append("_Rezumat indisponibil._\n")
            else:
                body = summary_file.read_text(encoding="utf-8").strip()
                parts.append(f"{body}\n")
        parts.append("\n---\n")
    guide = paths["master_guide"]
    guide.parent.mkdir(parents=True, exist_ok=True)
    # Write LF-only to match the WSL2 baseline (the documented summary workflow
    # runs from WSL2; Windows text-mode CRLF would break byte-identic compares).
    with open(guide, "w", encoding="utf-8", newline="\n") as fh:
        fh.write("\n".join(parts))
    print(f"Compiled {guide} ({guide.stat().st_size} bytes)")
def parse_args():
    """Parse CLI flags: --course picks the course config, --compile the mode."""
    parser = argparse.ArgumentParser(
        description="Generate summaries / compile SUPORT_CURS.md"
    )
    parser.add_argument(
        "--course", default="master", help="Course key (see courses.py)"
    )
    parser.add_argument(
        "--compile",
        action="store_true",
        help="Compile SUPORT_CURS.md from existing summaries",
    )
    return parser.parse_args()
def main():
    """CLI entry point: resolve the course, verify the manifest, dispatch."""
    options = parse_args()
    course = get_course(options.course)
    paths = course_paths(course)
    manifest_path = paths["manifest"]
    if not manifest_path.exists():
        print(f"{manifest_path} not found. Run download.py and transcribe.py first.")
        sys.exit(1)
    manifest = load_manifest(manifest_path)
    validate_manifest_course(manifest, course["key"])
    # --compile assembles existing summaries; the default mode prints prompts.
    action = compile_master_guide if options.compile else generate_prompts
    action(manifest, course, paths)
# Script entry point: only run the CLI when executed directly, not on import.
if __name__ == "__main__":
    main()