Un singur set de scripturi acum rulează pe orice curs configurat în courses.py. Master rămâne la rădăcina repo (backward-compat M1-M6); cursuri noi (ex. practitioner la shop.cursnlp.ro) primesc un root dedicat (nlp-practitioner/) cu propriile artefacte. - courses.py: config dict (master, practitioner) + course_paths() + validate_manifest_course() (manifest fără course_key = master). - download.py: --course + --modules; trei tipuri de lecții (audio HTTP, Vimeo iframe via yt-dlp audio-only, text-only cu captură HTML); merge cu manifest existent în loc de replace; strip [Audio] pentru backward-compat paths. - transcribe.py: --course + --modules; skip type==text; path-uri prin course_paths(); validare course_key. - summarize.py: --course + --compile; template prompt folosește course['name']; scrie SUPORT_CURS.md cu LF explicit (WSL2 baseline). - md_to_pdf.py: --course resolv-ă summaries_dir / pdf_dir per curs. - run.bat: detectează master|practitioner ca primul argument, propagă --course la sub-scripturi; backward-compat run.bat [modules]. - requirements.txt: + yt-dlp. - .gitignore: nlp-practitioner/audio/, audio_wav/, scratch_recon.py, tmp_recon/. - tests/test_regression.sh: 5 gate-uri read-only (import, schema, disk-coherence, SUPORT_CURS byte-identic, cross-course isolation). Regression curs master: PASS (manifest + SUPORT_CURS.md hash identic cu baseline /tmp/suport_before.md). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
201 lines
6.8 KiB
Python
201 lines
6.8 KiB
Python
"""
|
|
Generate summaries from transcripts using Claude Code.
|
|
Reads <root>/manifest.json, processes each transcript, outputs per-lecture
|
|
summaries, and compiles <root>/SUPORT_CURS.md master study guide.
|
|
|
|
Usage:
|
|
python summarize.py # master, print prompts
|
|
python summarize.py --course practitioner # practitioner, print prompts
|
|
python summarize.py --compile # master, compile SUPORT_CURS.md
|
|
python summarize.py --course practitioner --compile
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
from courses import course_paths, get_course, validate_manifest_course
|
|
|
|
# Largest number of whitespace-separated words sent in a single prompt;
# passed to split_at_sentences() as max_words.
MAX_WORDS_PER_CHUNK = 10000
# Number of words at the end of one chunk that are repeated at the start of
# the next one; passed to split_at_sentences() as overlap.
OVERLAP_WORDS = 500
|
|
|
|
SUMMARY_PROMPT = """Rezuma aceasta transcriere a unei lectii din cursul {course_name}.
|
|
|
|
Ofera:
|
|
1. **Prezentare generala** - 3-5 propozitii care descriu subiectul principal al lectiei
|
|
2. **Concepte cheie** - lista cu definitii scurte pentru fiecare concept important
|
|
3. **Detalii si exemple importante** - informatii concrete, exercitii practice, exemple relevante mentionate de trainer
|
|
4. **Citate memorabile** - fraze sau idei remarcabile (daca exista)
|
|
|
|
Raspunde in limba romana. Formateaza ca Markdown.
|
|
|
|
---
|
|
TITLU LECTIE: {title}
|
|
---
|
|
TRANSCRIERE:
|
|
{text}
|
|
"""
|
|
|
|
MERGE_PROMPT = """Am mai multe rezumate partiale ale aceleiasi lectii (a fost prea lunga pentru un singur rezumat).
|
|
Combina-le intr-un singur rezumat coerent, eliminand duplicatele.
|
|
|
|
Pastreaza structura:
|
|
1. Prezentare generala (3-5 propozitii)
|
|
2. Concepte cheie cu definitii
|
|
3. Detalii si exemple importante
|
|
4. Citate memorabile
|
|
|
|
Raspunde in limba romana. Formateaza ca Markdown.
|
|
|
|
---
|
|
TITLU LECTIE: {title}
|
|
---
|
|
REZUMATE PARTIALE:
|
|
{chunks}
|
|
"""
|
|
|
|
|
|
def load_manifest(manifest_path: Path) -> dict:
    """Read the UTF-8 encoded JSON file at *manifest_path* and return its parsed content."""
    raw = Path(manifest_path).read_text(encoding="utf-8")
    return json.loads(raw)
|
|
|
|
|
|
def split_at_sentences(text: str, max_words: int, overlap: int) -> list[str]:
    """Split *text* into overlapping chunks of at most *max_words* words.

    Words are whatever ``str.split()`` yields. If the text already fits in
    ``max_words`` words it is returned unchanged as a single-element list.
    Otherwise the words are re-joined with single spaces and cut into chunks;
    every chunk except the last is shortened to end at the latest sentence
    terminator (``.``, ``!`` or ``?`` followed by a space) provided that
    terminator lies past the midpoint of the chunk. Each following chunk
    starts ``overlap`` words before the end of the previous one, and always
    at least one word further than the previous chunk started.

    Args:
        text: The text to split.
        max_words: Maximum number of words per chunk; must be >= 1.
        overlap: Number of words shared between consecutive chunks; must be >= 0.

    Returns:
        The list of chunk strings, in order.

    Raises:
        ValueError: If ``max_words`` is smaller than 1 or ``overlap`` is negative.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")
    if overlap < 0:
        raise ValueError("overlap must not be negative")

    words = text.split()
    total = len(words)
    if total <= max_words:
        return [text]

    # The chunk text is rebuilt with single spaces, so a sentence end can only
    # ever appear as "<punctuation><space>" (never followed by a newline).
    sentence_ends = (". ", "! ", "? ")

    chunks = []
    start = 0
    while start < total:
        end = min(start + max_words, total)
        chunk_text = " ".join(words[start:end])

        if end < total:
            # Cut at the latest sentence end, but only if that keeps more than
            # half of the chunk; otherwise keep the hard word-count cut.
            cut = max(chunk_text.rfind(sep) for sep in sentence_ends)
            if cut > len(chunk_text) // 2:
                chunk_text = chunk_text[:cut + 1]
                end = start + len(chunk_text.split())

        chunks.append(chunk_text)

        # Once the tail of the text has been emitted we are done. Stepping back
        # by ``overlap`` here would re-emit ever shorter copies of the tail.
        if end >= total:
            break

        # Step back for the overlap, but always advance by at least one word
        # so the loop terminates even when overlap >= chunk length.
        start = max(end - overlap, start + 1)

    return chunks
|
|
|
|
|
|
def generate_prompts(manifest: dict, course: dict, paths: dict):
    """Print summarisation prompts for every lecture that still needs a summary.

    Creates ``paths["summaries_dir"]`` if needed, then walks all lectures of
    all modules in *manifest*. A lecture is skipped when its
    ``transcribe_status`` is not ``"complete"``, when its summary file already
    exists and is non-empty, when its transcript file is missing, or when the
    transcript is empty after stripping whitespace.

    Prompts are written to stdout, each preceded by a marker line
    (``SUMMARY_FILE:``, ``CHUNK_PROMPT:`` or ``MERGE_FILE:``) and followed by
    ``---END_PROMPT---``. Skip notices and per-lecture statistics go to stderr.

    Args:
        manifest: Parsed manifest with a ``"modules"`` list; each module has a
            ``"lectures"`` list of dicts carrying ``title``, ``summary_path``
            and ``transcript_path``.
        course: Course configuration; only ``course["name"]`` is read here.
        paths: Course paths; only ``paths["summaries_dir"]`` is read here.
    """
    paths["summaries_dir"].mkdir(parents=True, exist_ok=True)

    all_lectures = (
        lecture
        for module in manifest["modules"]
        for lecture in module["lectures"]
    )

    for lecture in all_lectures:
        if lecture.get("transcribe_status") != "complete":
            continue

        summary_path = Path(lecture["summary_path"])
        already_summarised = summary_path.exists() and summary_path.stat().st_size > 0
        if already_summarised:
            print(f"# SKIP (exists): {lecture['title']}", file=sys.stderr)
            continue

        transcript_file = Path(lecture["transcript_path"])
        if not transcript_file.exists():
            print(f"# SKIP (no transcript): {lecture['title']}", file=sys.stderr)
            continue

        text = transcript_file.read_text(encoding="utf-8").strip()
        if not text:
            print(f"# SKIP (empty): {lecture['title']}", file=sys.stderr)
            continue

        chunks = split_at_sentences(text, MAX_WORDS_PER_CHUNK, OVERLAP_WORDS)
        total = len(chunks)

        print(f"\n{'='*60}", file=sys.stderr)
        print(f"Lecture: {lecture['title']}", file=sys.stderr)
        print(f"Words: {len(text.split())}, Chunks: {total}", file=sys.stderr)
        print(f"Output: {summary_path}", file=sys.stderr)

        # Each job is (marker line, title shown in the prompt, transcript text).
        if total == 1:
            jobs = [(f"SUMMARY_FILE:{summary_path}", lecture["title"], text)]
        else:
            jobs = [
                (
                    f"CHUNK_PROMPT:{index}/{total}:{summary_path}",
                    f"{lecture['title']} (partea {index}/{total})",
                    chunk,
                )
                for index, chunk in enumerate(chunks, 1)
            ]

        for marker, prompt_title, body in jobs:
            prompt = SUMMARY_PROMPT.format(
                course_name=course["name"],
                title=prompt_title,
                text=body,
            )
            print(marker)
            print(prompt)
            print("---END_PROMPT---")

        if total != 1:
            # Chunked lectures additionally get a merge prompt; the literal
            # "{chunk_summaries}" is left in the output as a placeholder.
            print(f"MERGE_FILE:{summary_path}")
            merge_prompt = MERGE_PROMPT.format(
                title=lecture["title"],
                chunks="{chunk_summaries}",
            )
            print(merge_prompt)
            print("---END_PROMPT---")
|
|
|
|
|
|
def compile_master_guide(manifest: dict, course: dict, paths: dict):
    """Assemble all per-lecture summaries into the course study guide.

    The guide starts with a title built from ``course["name"]``, then lists
    every module of *manifest* as a level-2 heading and every lecture as a
    level-3 heading. Under each lecture goes the stripped content of its
    ``summary_path`` file, or a "summary unavailable" placeholder when that
    file does not exist. The fragments are joined with ``"\\n"`` and written
    to ``paths["master_guide"]`` as UTF-8; parent directories are created as
    needed. A one-line confirmation with the resulting file size is printed.
    """
    fragments = [
        f"# SUPORT CURS - {course['name']}\n",
        "_Generat automat din transcrierile audio ale cursului._\n",
        "---\n",
    ]

    for module in manifest["modules"]:
        fragments.append(f"\n## {module['name']}\n")
        for lecture in module["lectures"]:
            summary_file = Path(lecture["summary_path"])
            fragments.append(f"\n### {lecture['title']}\n")
            if not summary_file.exists():
                body = "_Rezumat indisponibil._\n"
            else:
                stripped = summary_file.read_text(encoding="utf-8").strip()
                body = f"{stripped}\n"
            fragments.extend([body, "\n---\n"])

    guide_path = paths["master_guide"]
    guide_path.parent.mkdir(parents=True, exist_ok=True)

    # Force LF line endings so the output matches the WSL2 baseline; the
    # default text mode on Windows would emit CRLF and break byte-for-byte
    # comparisons against it.
    document = "\n".join(fragments)
    with open(guide_path, "w", encoding="utf-8", newline="\n") as out:
        out.write(document)

    size_in_bytes = guide_path.stat().st_size
    print(f"Compiled {guide_path} ({size_in_bytes} bytes)")
|
|
|
|
|
|
def parse_args():
    """Parse the command line and return the resulting namespace.

    Recognised options:
        --course   Course key, defaults to ``"master"``.
        --compile  Flag; when present the study guide is compiled instead of
                   printing prompts.
    """
    parser = argparse.ArgumentParser(
        description="Generate summaries / compile SUPORT_CURS.md",
    )
    parser.add_argument(
        "--course",
        default="master",
        help="Course key (see courses.py)",
    )
    parser.add_argument(
        "--compile",
        action="store_true",
        help="Compile SUPORT_CURS.md from existing summaries",
    )
    namespace = parser.parse_args()
    return namespace
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Resolves the selected course and its paths, loads and validates the
    course manifest, then either compiles the study guide (``--compile``) or
    prints summarisation prompts.

    Exits with status 1 if the manifest file does not exist.
    """
    args = parse_args()
    course = get_course(args.course)
    paths = course_paths(course)

    manifest_path = paths["manifest"]
    if not manifest_path.exists():
        # stdout is the machine-readable prompt stream, so diagnostics go to
        # stderr like every other notice emitted by this script.
        print(
            f"{manifest_path} not found. Run download.py and transcribe.py first.",
            file=sys.stderr,
        )
        sys.exit(1)

    manifest = load_manifest(manifest_path)
    # Checks that the manifest belongs to the selected course (see courses.py
    # for the exact rules and failure behaviour).
    validate_manifest_course(manifest, course["key"])

    if args.compile:
        compile_master_guide(manifest, course, paths)
    else:
        generate_prompts(manifest, course, paths)
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|