Files
nlp-master/md_to_pdf.py
Marius Mutu c2f37ddb8f fix: md_to_pdf — parallel conversion, fix tabele largi, fix underscores
- Conversie paralelă cu ProcessPoolExecutor (4 workers default)
- Tabele: font 8.5pt, padding compact, word-break pentru text lung
- code-friendly extra previne pierderea _ din nume de fișiere
- find_files caută toate *.md din summaries/, nu doar MODUL*
- .gitignore: adaugă .claude/ (local state)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-25 15:34:12 +02:00

262 lines
5.7 KiB
Python

#!/usr/bin/env python3
"""Convert Markdown summaries to print-friendly PDFs."""
import argparse
import glob
import os
import sys
from concurrent.futures import ProcessPoolExecutor, as_completed
import markdown2
from weasyprint import HTML
# Directory that holds the Markdown summaries: "summaries/" next to this script.
SUMMARIES_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "summaries")
# Output directory for the generated PDFs: "summaries/pdf/".
PDF_DIR = os.path.join(SUMMARIES_DIR, "pdf")
# Stylesheet interpolated into the <style> element of every HTML document that
# convert_one() builds.  It targets print output: A4 pages with 2cm margins and
# the page number at the bottom right, and tables rendered in a smaller font
# with a fixed layout and aggressive word breaking so wide tables fit the page.
CSS = """
@page {
size: A4;
margin: 2cm;
@bottom-right {
content: counter(page);
font-size: 9pt;
color: #666;
}
}
body {
font-family: "Segoe UI", "Noto Sans", "DejaVu Sans", sans-serif;
font-size: 11pt;
line-height: 1.5;
color: #111;
}
h1 {
font-size: 18pt;
font-weight: bold;
margin-top: 0.5em;
margin-bottom: 0.3em;
border-bottom: 1.5pt solid #333;
padding-bottom: 0.2em;
page-break-before: auto;
}
h1:first-of-type {
page-break-before: avoid;
}
h2 {
font-size: 14pt;
font-weight: bold;
margin-top: 1em;
margin-bottom: 0.3em;
color: #222;
}
h3 {
font-size: 12pt;
font-weight: bold;
margin-top: 0.8em;
margin-bottom: 0.2em;
color: #333;
}
p {
margin: 0.4em 0;
}
ul, ol {
margin: 0.3em 0;
padding-left: 1.5em;
}
li {
margin: 0.15em 0;
}
strong {
font-weight: bold;
}
em {
font-style: italic;
}
hr {
border: none;
border-top: 0.5pt solid #999;
margin: 1em 0;
}
table {
width: 100%;
border-collapse: collapse;
margin: 0.5em 0;
font-size: 8.5pt;
page-break-inside: avoid;
table-layout: fixed;
word-wrap: break-word;
overflow-wrap: break-word;
}
th {
background-color: #e8e8e8;
font-weight: bold;
text-align: left;
padding: 3pt 4pt;
border: 0.5pt solid #bbb;
}
td {
padding: 3pt 4pt;
border: 0.5pt solid #ccc;
word-break: break-all;
}
tr:nth-child(even) td {
background-color: #f5f5f5;
}
pre {
background-color: #f5f5f5;
border: 0.5pt solid #ccc;
border-radius: 3pt;
padding: 8pt 10pt;
font-family: "Consolas", "DejaVu Sans Mono", monospace;
font-size: 9pt;
line-height: 1.35;
white-space: pre-wrap;
word-wrap: break-word;
page-break-inside: avoid;
}
code {
font-family: "Consolas", "DejaVu Sans Mono", monospace;
font-size: 9.5pt;
background-color: #f0f0f0;
padding: 1pt 3pt;
border-radius: 2pt;
}
pre code {
background: none;
padding: 0;
}
blockquote {
border-left: 2pt solid #999;
margin: 0.5em 0;
padding: 0.3em 0 0.3em 1em;
color: #333;
}
"""
def convert_one(args):
    """Convert a single Markdown file to PDF. Designed for parallel execution.

    The job is passed as one tuple so the function can be submitted directly
    to an executor with a single positional argument.

    Args:
        args: A ``(md_path, pdf_path)`` tuple: the Markdown file to read and
            the PDF file to write.

    Returns:
        A ``(md_basename, pdf_basename)`` tuple, for progress reporting.

    Raises:
        OSError: If the Markdown file cannot be read.
        Exception: Whatever the Markdown or PDF renderer raises is propagated
            unchanged to the caller.
    """
    md_path, pdf_path = args
    with open(md_path, "r", encoding="utf-8") as f:
        md_text = f.read()
    html_body = markdown2.markdown(
        md_text,
        # "code-friendly" keeps underscores literal, so file names such as
        # my_file_name.py are not turned into emphasis markup.
        extras=["tables", "fenced-code-blocks", "header-ids", "break-on-newline", "code-friendly"],
    )
    html_doc = f"""<!DOCTYPE html>
<html lang="ro">
<head>
<meta charset="utf-8">
<style>{CSS}</style>
</head>
<body>
{html_body}
</body>
</html>"""
    # A document built from a string has no location of its own; pass the
    # source file as the base URL so relative references in the Markdown
    # (images, links) resolve against the directory of the .md file.
    HTML(string=html_doc, base_url=md_path).write_pdf(pdf_path)
    return os.path.basename(md_path), os.path.basename(pdf_path)
def find_files(modules=None, directory=None):
    """Find all .md files in summaries/, optionally filtered by module numbers.

    Args:
        modules: Optional collection of module numbers.  ``None`` disables
            filtering and returns every ``*.md`` file.  Otherwise only files
            named ``MODUL<n>...`` whose number ``n`` is in the collection are
            returned; an empty collection therefore matches nothing.
        directory: Directory to search.  Defaults to ``SUMMARIES_DIR``.

    Returns:
        A sorted list of matching file paths.
    """
    if directory is None:
        directory = SUMMARIES_DIR
    # Escape the directory so characters such as "[" in its path are matched
    # literally instead of being interpreted as glob syntax.
    pattern = os.path.join(glob.escape(directory), "*.md")
    files = sorted(glob.glob(pattern))
    if modules is None:
        return files
    return [path for path in files if _module_number(path) in modules]


def _module_number(path):
    """Return the module number encoded in a ``MODUL<n>[_...]`` file name, or None."""
    prefix = "MODUL"
    stem = os.path.splitext(os.path.basename(path))[0]
    if not stem.startswith(prefix):
        return None
    # Only the leading prefix is removed; the number ends at the first "_".
    number_text = stem.split("_")[0][len(prefix):]
    try:
        return int(number_text)
    except ValueError:
        return None
def parse_modules(spec):
    """Parse module spec like '1-3' or '2,4,5' into a set of ints.

    The spec is a comma-separated list of tokens.  Each token is either a
    single module number or an inclusive ``start-end`` range.  Whitespace
    around tokens is ignored, empty tokens (for example from a trailing
    comma) are skipped, and a reversed range such as ``'3-1'`` is treated the
    same as ``'1-3'``.

    Args:
        spec: The module specification string.

    Returns:
        A set containing every module number named by the spec.

    Raises:
        ValueError: If a token is neither an integer nor an integer range.
    """
    modules = set()
    for token in spec.split(","):
        token = token.strip()
        if not token:
            continue
        if "-" in token:
            start_text, end_text = token.split("-", 1)
            # Order the bounds: a reversed range would otherwise expand to
            # nothing and silently select no modules at all.
            low, high = sorted((int(start_text), int(end_text)))
            modules.update(range(low, high + 1))
        else:
            modules.add(int(token))
    return modules
def main(argv=None):
    """Command-line entry point: convert Markdown summaries to PDF in parallel.

    Explicit file arguments take precedence; otherwise the summaries directory
    is scanned, optionally restricted by ``--modules``.  Each PDF is written to
    ``PDF_DIR`` under the source file's base name.

    Args:
        argv: Optional argument list to parse instead of ``sys.argv[1:]``.

    Exits with status 1 when there is nothing to convert or when at least one
    conversion fails, and with status 2 (via argparse) on invalid options.
    """
    parser = argparse.ArgumentParser(description="Convert MD summaries to PDF")
    parser.add_argument("files", nargs="*", help="Specific MD files to convert")
    parser.add_argument(
        "--modules", "-m", help="Module filter, e.g. '1-3' or '2,4,5'"
    )
    parser.add_argument(
        "--workers", "-w", type=int, default=4, help="Parallel workers (default: 4)"
    )
    args = parser.parse_args(argv)
    # The process pool rejects a non-positive worker count with a bare
    # ValueError; report it as a usage error instead.
    if args.workers < 1:
        parser.error("--workers must be at least 1")

    if args.files:
        md_files = [os.path.abspath(f) for f in args.files]
    else:
        modules = None
        if args.modules:
            try:
                modules = parse_modules(args.modules)
            except ValueError:
                parser.error(f"invalid --modules value: {args.modules!r}")
        md_files = find_files(modules)
    if not md_files:
        print("No MD files found to convert.")
        sys.exit(1)

    os.makedirs(PDF_DIR, exist_ok=True)
    jobs = [
        (
            md_path,
            os.path.join(
                PDF_DIR, os.path.splitext(os.path.basename(md_path))[0] + ".pdf"
            ),
        )
        for md_path in md_files
    ]
    # Never ask for more worker processes than there are files to convert.
    workers = min(args.workers, len(jobs))
    print(f"Converting {len(jobs)} file(s) to PDF with {workers} workers...")
    failures = 0
    with ProcessPoolExecutor(max_workers=workers) as pool:
        futures = {pool.submit(convert_one, job): job for job in jobs}
        for future in as_completed(futures):
            try:
                src, dst = future.result()
            except Exception as e:
                # Top-level boundary: one failing file must not abort the
                # remaining conversions, so report it and carry on.
                failures += 1
                md_path = futures[future][0]
                print(f" ERROR {os.path.basename(md_path)}: {e}", file=sys.stderr)
            else:
                print(f" {src} -> {dst}")
    print(f"Done. PDFs saved to {PDF_DIR}")
    if failures:
        # Surface failures through the exit status so scripts can detect them.
        print(f"{failures} file(s) failed to convert.", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()