nlp-master/md_to_pdf.py

#!/usr/bin/env python3
"""Convert Markdown summaries to print-friendly PDFs."""

import argparse
import glob
import os
import sys
from concurrent.futures import ProcessPoolExecutor, as_completed

import markdown2
from weasyprint import HTML

from courses import course_paths, get_course

# Print stylesheet that convert_one() embeds in the <style> element of every
# generated HTML document. It targets A4 pages with 2cm margins and a page
# counter in the bottom-right margin box, and sets typography for headings,
# lists, tables, code blocks and blockquotes.
# NOTE: the whole value is a runtime string; any change to its text changes
# the rendered PDFs.
CSS = """
@page {
    size: A4;
    margin: 2cm;
    @bottom-right {
        content: counter(page);
        font-size: 9pt;
        color: #666;
    }
}

body {
    font-family: "Segoe UI", "Noto Sans", "DejaVu Sans", sans-serif;
    font-size: 11pt;
    line-height: 1.5;
    color: #111;
}

h1 {
    font-size: 18pt;
    font-weight: bold;
    margin-top: 0.5em;
    margin-bottom: 0.3em;
    border-bottom: 1.5pt solid #333;
    padding-bottom: 0.2em;
    page-break-before: auto;
}

h1:first-of-type {
    page-break-before: avoid;
}

h2 {
    font-size: 14pt;
    font-weight: bold;
    margin-top: 1em;
    margin-bottom: 0.3em;
    color: #222;
}

h3 {
    font-size: 12pt;
    font-weight: bold;
    margin-top: 0.8em;
    margin-bottom: 0.2em;
    color: #333;
}

p {
    margin: 0.4em 0;
}

ul, ol {
    margin: 0.3em 0;
    padding-left: 1.5em;
}

li {
    margin: 0.15em 0;
}

strong {
    font-weight: bold;
}

em {
    font-style: italic;
}

hr {
    border: none;
    border-top: 0.5pt solid #999;
    margin: 1em 0;
}

table {
    width: 100%;
    border-collapse: collapse;
    margin: 0.5em 0;
    font-size: 8.5pt;
    page-break-inside: avoid;
    table-layout: fixed;
    word-wrap: break-word;
    overflow-wrap: break-word;
}

th {
    background-color: #e8e8e8;
    font-weight: bold;
    text-align: left;
    padding: 3pt 4pt;
    border: 0.5pt solid #bbb;
}

td {
    padding: 3pt 4pt;
    border: 0.5pt solid #ccc;
    word-break: break-all;
}

tr:nth-child(even) td {
    background-color: #f5f5f5;
}

pre {
    background-color: #f5f5f5;
    border: 0.5pt solid #ccc;
    border-radius: 3pt;
    padding: 8pt 10pt;
    font-family: "Consolas", "DejaVu Sans Mono", monospace;
    font-size: 9pt;
    line-height: 1.35;
    white-space: pre-wrap;
    word-wrap: break-word;
    page-break-inside: avoid;
}

code {
    font-family: "Consolas", "DejaVu Sans Mono", monospace;
    font-size: 9.5pt;
    background-color: #f0f0f0;
    padding: 1pt 3pt;
    border-radius: 2pt;
}

pre code {
    background: none;
    padding: 0;
}

blockquote {
    border-left: 2pt solid #999;
    margin: 0.5em 0;
    padding: 0.3em 0 0.3em 1em;
    color: #333;
}
"""


def convert_one(args):
    """Convert a single Markdown file to PDF. Designed for parallel execution.

    Args:
        args: A ``(md_path, pdf_path)`` pair. Both paths are packed into one
            tuple so the function can be submitted to an executor with a
            single positional argument.

    Returns:
        A ``(source_name, pdf_name)`` tuple with the base names of the
        Markdown input and the PDF output.

    Raises:
        OSError: If the Markdown file cannot be opened or read.
        Any exception raised by markdown2 or WeasyPrint propagates unchanged.
    """
    md_path, pdf_path = args
    with open(md_path, "r", encoding="utf-8") as f:
        md_text = f.read()

    html_body = markdown2.markdown(
        md_text,
        extras=["tables", "fenced-code-blocks", "header-ids", "break-on-newline", "code-friendly"],
    )

    html_doc = f"""<!DOCTYPE html>
<html lang="ro">
<head>
<meta charset="utf-8">
<style>{CSS}</style>
</head>
<body>
{html_body}
</body>
</html>"""

    # A document built from a string has no location of its own, so relative
    # links in the Markdown (e.g. images stored next to the summary) can only
    # be resolved if the source file's directory is given as the base URL.
    base_url = os.path.dirname(os.path.abspath(md_path))
    HTML(string=html_doc, base_url=base_url).write_pdf(pdf_path)
    return os.path.basename(md_path), os.path.basename(pdf_path)


def find_files(summaries_dir, modules=None):
    """Return the sorted ``*.md`` paths directly inside ``summaries_dir``.

    When ``modules`` is truthy, only files named ``MODUL<number>_...`` whose
    number is contained in ``modules`` are kept. Files that lack the
    ``MODUL`` prefix, or whose prefix does not hold a valid integer, are
    dropped. A falsy ``modules`` (``None`` or an empty collection) disables
    the filter.
    """
    md_paths = glob.glob(os.path.join(summaries_dir, "*.md"))
    md_paths.sort()

    # No filter requested: hand back every summary.
    if not modules:
        return md_paths

    selected = []
    for path in md_paths:
        name = os.path.basename(path)
        if name.startswith("MODUL"):
            # Everything before the first underscore is "MODUL<number>".
            prefix = name.split("_")[0]
            try:
                number = int(prefix.replace("MODUL", ""))
            except ValueError:
                continue
            if number in modules:
                selected.append(path)
    return selected


def parse_modules(spec):
    """Parse module spec like '1-3' or '2,4,5' into a set of ints.

    The spec is a comma-separated list whose items are either a single
    number or an inclusive ``a-b`` range; whitespace around items is
    ignored. A range written backwards ('3-1') is read as '1-3'.

    Args:
        spec: The module specification string.

    Returns:
        The set of module numbers named by ``spec``.

    Raises:
        ValueError: If an item or a range endpoint is not an integer; this
            includes empty items such as the middle one in '1,,2'.
    """
    modules = set()
    for part in spec.split(","):
        part = part.strip()
        if "-" in part:
            start, end = part.split("-", 1)
            # Order the endpoints: range(3, 2) would be empty, and an empty
            # (falsy) result is read by `if modules:` checks as "no filter".
            low, high = sorted((int(start), int(end)))
            modules.update(range(low, high + 1))
        else:
            modules.add(int(part))
    return modules


def main():
    """Command-line entry point: convert a course's Markdown summaries to PDF.

    Input files are either the paths given on the command line or, when none
    are given, the ``*.md`` files of the selected course's summaries
    directory (optionally narrowed with ``--modules``). Each input is
    converted in a worker process and written to the course's PDF directory
    under the same base name with a ``.pdf`` extension.

    Exit status: 0 when every file converted, 1 when no input files were
    found or at least one conversion failed, 2 (via argparse) for an invalid
    ``--workers`` or ``--modules`` value.
    """
    parser = argparse.ArgumentParser(description="Convert MD summaries to PDF")
    parser.add_argument("files", nargs="*", help="Specific MD files to convert")
    parser.add_argument("--course", default="master", help="Course key (see courses.py)")
    parser.add_argument("--modules", "-m", help="Module filter, e.g. '1-3' or '2,4,5'")
    parser.add_argument("--workers", "-w", type=int, default=4, help="Parallel workers (default: 4)")
    args = parser.parse_args()

    # ProcessPoolExecutor rejects max_workers <= 0 with a ValueError; report
    # that as a normal usage error instead of a traceback.
    if args.workers < 1:
        parser.error("--workers must be at least 1")

    course = get_course(args.course)
    paths = course_paths(course)
    # The path entries are resolved to absolute form and handled as plain
    # strings from here on.
    summaries_dir = str(paths["summaries_dir"].resolve())
    pdf_dir = str(paths["pdf_dir"].resolve())

    os.makedirs(pdf_dir, exist_ok=True)

    if args.files:
        # Explicit files take precedence; --modules is not consulted.
        md_files = [os.path.abspath(f) for f in args.files]
    else:
        modules = None
        if args.modules:
            try:
                modules = parse_modules(args.modules)
            except ValueError as exc:
                parser.error(f"invalid --modules value {args.modules!r}: {exc}")
        md_files = find_files(summaries_dir, modules)

    if not md_files:
        print(f"No MD files found in {summaries_dir}")
        sys.exit(1)

    jobs = []
    for md_path in md_files:
        basename = os.path.splitext(os.path.basename(md_path))[0]
        pdf_path = os.path.join(pdf_dir, basename + ".pdf")
        jobs.append((md_path, pdf_path))

    print(f"Course: {course['key']} ({course['name']})")
    print(f"Converting {len(jobs)} file(s) to PDF with {args.workers} workers...")

    failed = 0
    with ProcessPoolExecutor(max_workers=args.workers) as pool:
        futures = {pool.submit(convert_one, job): job for job in jobs}
        for future in as_completed(futures):
            try:
                src, dst = future.result()
                print(f"  {src} -> {dst}")
            except Exception as e:
                # One bad file must not abort the batch: report it, keep
                # going, and remember the failure for the exit status.
                failed += 1
                md_path = futures[future][0]
                print(f"  ERROR {os.path.basename(md_path)}: {e}", file=sys.stderr)

    if failed:
        print(
            f"Done with {failed} error(s) out of {len(jobs)} file(s). PDFs saved to {pdf_dir}",
            file=sys.stderr,
        )
        sys.exit(1)

    print(f"Done. PDFs saved to {pdf_dir}")


if __name__ == "__main__":
    main()