Un singur set de scripturi acum rulează pe orice curs configurat în courses.py. Master rămâne la rădăcina repo (backward-compat M1-M6); cursuri noi (ex. practitioner la shop.cursnlp.ro) primesc un root dedicat (nlp-practitioner/) cu propriile artefacte. - courses.py: config dict (master, practitioner) + course_paths() + validate_manifest_course() (manifest fără course_key = master). - download.py: --course + --modules; trei tipuri de lecții (audio HTTP, Vimeo iframe via yt-dlp audio-only, text-only cu captură HTML); merge cu manifest existent în loc de replace; strip [Audio] pentru backward-compat paths. - transcribe.py: --course + --modules; skip type==text; path-uri prin course_paths(); validare course_key. - summarize.py: --course + --compile; template prompt folosește course['name']; scrie SUPORT_CURS.md cu LF explicit (WSL2 baseline). - md_to_pdf.py: --course resolv-ă summaries_dir / pdf_dir per curs. - run.bat: detectează master|practitioner ca primul argument, propagă --course la sub-scripturi; backward-compat run.bat [modules]. - requirements.txt: + yt-dlp. - .gitignore: nlp-practitioner/audio/, audio_wav/, scratch_recon.py, tmp_recon/. - tests/test_regression.sh: 5 gate-uri read-only (import, schema, disk-coherence, SUPORT_CURS byte-identic, cross-course isolation). Regression curs master: PASS (manifest + SUPORT_CURS.md hash identic cu baseline /tmp/suport_before.md). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
264 lines
5.9 KiB
Python
264 lines
5.9 KiB
Python
#!/usr/bin/env python3
|
|
"""Convert Markdown summaries to print-friendly PDFs."""
|
|
|
|
import argparse
|
|
import glob
|
|
import os
|
|
import sys
|
|
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
|
|
import markdown2
|
|
from weasyprint import HTML
|
|
|
|
from courses import course_paths, get_course
|
|
|
|
# Print stylesheet embedded in the <style> element of every generated HTML
# document: A4 pages with 2cm margins and a page counter in the bottom-right
# margin box, plus compact typography for headings, lists, tables and code.
CSS = """
@page {
    size: A4;
    margin: 2cm;
    @bottom-right {
        content: counter(page);
        font-size: 9pt;
        color: #666;
    }
}

body {
    font-family: "Segoe UI", "Noto Sans", "DejaVu Sans", sans-serif;
    font-size: 11pt;
    line-height: 1.5;
    color: #111;
}

h1 {
    font-size: 18pt;
    font-weight: bold;
    margin-top: 0.5em;
    margin-bottom: 0.3em;
    border-bottom: 1.5pt solid #333;
    padding-bottom: 0.2em;
    page-break-before: auto;
}

h1:first-of-type {
    page-break-before: avoid;
}

h2 {
    font-size: 14pt;
    font-weight: bold;
    margin-top: 1em;
    margin-bottom: 0.3em;
    color: #222;
}

h3 {
    font-size: 12pt;
    font-weight: bold;
    margin-top: 0.8em;
    margin-bottom: 0.2em;
    color: #333;
}

p {
    margin: 0.4em 0;
}

ul, ol {
    margin: 0.3em 0;
    padding-left: 1.5em;
}

li {
    margin: 0.15em 0;
}

strong {
    font-weight: bold;
}

em {
    font-style: italic;
}

hr {
    border: none;
    border-top: 0.5pt solid #999;
    margin: 1em 0;
}

table {
    width: 100%;
    border-collapse: collapse;
    margin: 0.5em 0;
    font-size: 8.5pt;
    page-break-inside: avoid;
    table-layout: fixed;
    word-wrap: break-word;
    overflow-wrap: break-word;
}

th {
    background-color: #e8e8e8;
    font-weight: bold;
    text-align: left;
    padding: 3pt 4pt;
    border: 0.5pt solid #bbb;
}

td {
    padding: 3pt 4pt;
    border: 0.5pt solid #ccc;
    word-break: break-all;
}

tr:nth-child(even) td {
    background-color: #f5f5f5;
}

pre {
    background-color: #f5f5f5;
    border: 0.5pt solid #ccc;
    border-radius: 3pt;
    padding: 8pt 10pt;
    font-family: "Consolas", "DejaVu Sans Mono", monospace;
    font-size: 9pt;
    line-height: 1.35;
    white-space: pre-wrap;
    word-wrap: break-word;
    page-break-inside: avoid;
}

code {
    font-family: "Consolas", "DejaVu Sans Mono", monospace;
    font-size: 9.5pt;
    background-color: #f0f0f0;
    padding: 1pt 3pt;
    border-radius: 2pt;
}

pre code {
    background: none;
    padding: 0;
}

blockquote {
    border-left: 2pt solid #999;
    margin: 0.5em 0;
    padding: 0.3em 0 0.3em 1em;
    color: #333;
}
"""
|
|
|
|
|
|
def convert_one(args):
    """Render one Markdown file to a PDF.

    Takes a single ``(md_path, pdf_path)`` tuple rather than two parameters
    so a whole job can be handed to a process pool as one argument.

    Returns a ``(source_basename, pdf_basename)`` tuple for progress output.
    """
    source_path, target_path = args

    with open(source_path, "r", encoding="utf-8") as handle:
        markdown_source = handle.read()

    extras = ["tables", "fenced-code-blocks", "header-ids", "break-on-newline", "code-friendly"]
    body = markdown2.markdown(markdown_source, extras=extras)

    # Wrap the rendered fragment in a minimal standalone document that
    # carries the module-level print stylesheet.
    document = "\n".join(
        [
            "<!DOCTYPE html>",
            '<html lang="ro">',
            "<head>",
            '<meta charset="utf-8">',
            f"<style>{CSS}</style>",
            "</head>",
            "<body>",
            body,
            "</body>",
            "</html>",
        ]
    )

    HTML(string=document).write_pdf(target_path)
    return os.path.basename(source_path), os.path.basename(target_path)
|
|
|
|
|
|
def find_files(summaries_dir, modules=None):
    """Return the sorted ``*.md`` paths in ``summaries_dir``.

    Args:
        summaries_dir: Directory to scan (non-recursive).
        modules: Optional collection of module numbers. When given and
            non-empty, only files named ``MODUL<number>`` optionally followed
            by ``_<anything>`` are kept, and only if ``<number>`` is in
            ``modules``. When ``None`` or empty, every ``.md`` file is
            returned.

    Returns:
        A list of matching paths in lexicographic order.
    """
    files = sorted(glob.glob(os.path.join(summaries_dir, "*.md")))
    if not modules:
        return files

    prefix = "MODUL"
    selected = []
    for path in files:
        # Parse the number from the extension-less stem so that names without
        # an underscore (e.g. "MODUL3.md") are recognised too; splitting the
        # raw basename would leave "3.md", which is not an integer.
        stem = os.path.splitext(os.path.basename(path))[0]
        if not stem.startswith(prefix):
            continue
        token = stem.split("_", 1)[0][len(prefix):]
        try:
            number = int(token)
        except ValueError:
            # Not a numbered module file (e.g. "MODULx_notes.md"); skip it.
            continue
        if number in modules:
            selected.append(path)
    return selected
|
|
|
|
|
|
def parse_modules(spec):
    """Parse a module spec like ``'1-3'`` or ``'2,4,5'`` into a set of ints.

    The spec is a comma-separated list of single numbers and inclusive
    ``start-end`` ranges. Whitespace around items is ignored, empty items
    (such as those produced by a trailing comma) are skipped, and a reversed
    range like ``'3-1'`` is treated the same as ``'1-3'``.

    Args:
        spec: The module specification string.

    Returns:
        The set of module numbers named by ``spec``.

    Raises:
        ValueError: If an item is neither an integer nor an integer range.
    """
    modules = set()
    for part in spec.split(","):
        part = part.strip()
        if not part:
            continue
        try:
            if "-" in part:
                start, end = (int(bound) for bound in part.split("-", 1))
                low, high = sorted((start, end))
                modules.update(range(low, high + 1))
            else:
                modules.add(int(part))
        except ValueError:
            # Re-raise with the offending item so the user can see what to fix.
            raise ValueError(f"Invalid module spec {part!r} in {spec!r}") from None
    return modules
|
|
|
|
|
|
def main():
    """Command-line entry point: convert a course's Markdown summaries to PDF.

    Resolves the summaries and PDF directories for the selected course,
    collects the Markdown files to convert (explicit positional files win
    over the ``--modules`` filter), and converts them in parallel worker
    processes.

    Exits with status 1 when no input files are found or when at least one
    conversion fails, and with a usage error when ``--modules`` or
    ``--workers`` is invalid.
    """
    parser = argparse.ArgumentParser(description="Convert MD summaries to PDF")
    parser.add_argument("files", nargs="*", help="Specific MD files to convert")
    parser.add_argument("--course", default="master", help="Course key (see courses.py)")
    parser.add_argument("--modules", "-m", help="Module filter, e.g. '1-3' or '2,4,5'")
    parser.add_argument("--workers", "-w", type=int, default=4, help="Parallel workers (default: 4)")
    args = parser.parse_args()

    # ProcessPoolExecutor rejects a non-positive worker count with a bare
    # traceback; report it as a usage error instead.
    if args.workers < 1:
        parser.error("--workers must be at least 1")

    course = get_course(args.course)
    paths = course_paths(course)
    summaries_dir = str(paths["summaries_dir"].resolve())
    pdf_dir = str(paths["pdf_dir"].resolve())

    os.makedirs(pdf_dir, exist_ok=True)

    if args.files:
        md_files = [os.path.abspath(f) for f in args.files]
    else:
        modules = None
        if args.modules:
            try:
                modules = parse_modules(args.modules)
            except ValueError as e:
                parser.error(f"invalid --modules value {args.modules!r}: {e}")
        md_files = find_files(summaries_dir, modules)

    if not md_files:
        print(f"No MD files found in {summaries_dir}")
        sys.exit(1)

    # Every PDF lands in the course's pdf_dir under the source file's stem.
    jobs = []
    for md_path in md_files:
        basename = os.path.splitext(os.path.basename(md_path))[0]
        pdf_path = os.path.join(pdf_dir, basename + ".pdf")
        jobs.append((md_path, pdf_path))

    print(f"Course: {course['key']} ({course['name']})")
    print(f"Converting {len(jobs)} file(s) to PDF with {args.workers} workers...")

    failed = 0
    with ProcessPoolExecutor(max_workers=args.workers) as pool:
        futures = {pool.submit(convert_one, job): job for job in jobs}
        for future in as_completed(futures):
            try:
                src, dst = future.result()
            except Exception as e:
                # One bad file must not abort the batch: report it, keep
                # going, and reflect the failure in the exit status below.
                failed += 1
                md_path = futures[future][0]
                print(f"  ERROR {os.path.basename(md_path)}: {e}", file=sys.stderr)
            else:
                print(f"  {src} -> {dst}")

    if failed:
        print(
            f"Done with errors: {failed} of {len(jobs)} file(s) failed. PDFs saved to {pdf_dir}",
            file=sys.stderr,
        )
        sys.exit(1)

    print(f"Done. PDFs saved to {pdf_dir}")
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script. The guard also
# keeps main() from running again if worker processes import this module
# (as happens with the "spawn" start method used by ProcessPoolExecutor on
# some platforms).
if __name__ == "__main__":
    main()
|