refactor: parametrize pipeline cu --course flag + suport Vimeo/text
Un singur set de scripturi acum rulează pe orice curs configurat în courses.py. Master rămâne la rădăcina repo (backward-compat M1-M6); cursuri noi (ex. practitioner la shop.cursnlp.ro) primesc un root dedicat (nlp-practitioner/) cu propriile artefacte.

- courses.py: config dict (master, practitioner) + course_paths() + validate_manifest_course() (manifest fără course_key = master).
- download.py: --course + --modules; trei tipuri de lecții (audio HTTP, Vimeo iframe via yt-dlp audio-only, text-only cu captură HTML); merge cu manifest existent în loc de replace; strip [Audio] pentru backward-compat paths.
- transcribe.py: --course + --modules; skip type==text; path-uri prin course_paths(); validare course_key.
- summarize.py: --course + --compile; template prompt folosește course['name']; scrie SUPORT_CURS.md cu LF explicit (WSL2 baseline).
- md_to_pdf.py: --course rezolvă summaries_dir / pdf_dir per curs.
- run.bat: detectează master|practitioner ca primul argument, propagă --course la sub-scripturi; backward-compat run.bat [modules].
- requirements.txt: + yt-dlp.
- .gitignore: nlp-practitioner/audio/, audio_wav/, scratch_recon.py, tmp_recon/.
- tests/test_regression.sh: 5 gate-uri read-only (import, schema, disk-coherence, SUPORT_CURS byte-identic, cross-course isolation).

Regression curs master: PASS (manifest + SUPORT_CURS.md hash identic cu baseline /tmp/suport_before.md).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
32
md_to_pdf.py
32
md_to_pdf.py
@@ -10,8 +10,7 @@ from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||
import markdown2
|
||||
from weasyprint import HTML
|
||||
|
||||
SUMMARIES_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "summaries")
|
||||
PDF_DIR = os.path.join(SUMMARIES_DIR, "pdf")
|
||||
from courses import course_paths, get_course
|
||||
|
||||
CSS = """
|
||||
@page {
|
||||
@@ -178,9 +177,9 @@ def convert_one(args):
|
||||
return os.path.basename(md_path), os.path.basename(pdf_path)
|
||||
|
||||
|
||||
def find_files(modules=None):
|
||||
def find_files(summaries_dir, modules=None):
|
||||
"""Find all .md files in summaries/, optionally filtered by module numbers."""
|
||||
pattern = os.path.join(SUMMARIES_DIR, "*.md")
|
||||
pattern = os.path.join(summaries_dir, "*.md")
|
||||
files = sorted(glob.glob(pattern))
|
||||
|
||||
if modules:
|
||||
@@ -216,32 +215,35 @@ def parse_modules(spec):
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Convert MD summaries to PDF")
|
||||
parser.add_argument("files", nargs="*", help="Specific MD files to convert")
|
||||
parser.add_argument(
|
||||
"--modules", "-m", help="Module filter, e.g. '1-3' or '2,4,5'"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--workers", "-w", type=int, default=4, help="Parallel workers (default: 4)"
|
||||
)
|
||||
parser.add_argument("--course", default="master", help="Course key (see courses.py)")
|
||||
parser.add_argument("--modules", "-m", help="Module filter, e.g. '1-3' or '2,4,5'")
|
||||
parser.add_argument("--workers", "-w", type=int, default=4, help="Parallel workers (default: 4)")
|
||||
args = parser.parse_args()
|
||||
|
||||
os.makedirs(PDF_DIR, exist_ok=True)
|
||||
course = get_course(args.course)
|
||||
paths = course_paths(course)
|
||||
summaries_dir = str(paths["summaries_dir"].resolve())
|
||||
pdf_dir = str(paths["pdf_dir"].resolve())
|
||||
|
||||
os.makedirs(pdf_dir, exist_ok=True)
|
||||
|
||||
if args.files:
|
||||
md_files = [os.path.abspath(f) for f in args.files]
|
||||
else:
|
||||
modules = parse_modules(args.modules) if args.modules else None
|
||||
md_files = find_files(modules)
|
||||
md_files = find_files(summaries_dir, modules)
|
||||
|
||||
if not md_files:
|
||||
print("No MD files found to convert.")
|
||||
print(f"No MD files found in {summaries_dir}")
|
||||
sys.exit(1)
|
||||
|
||||
jobs = []
|
||||
for md_path in md_files:
|
||||
basename = os.path.splitext(os.path.basename(md_path))[0]
|
||||
pdf_path = os.path.join(PDF_DIR, basename + ".pdf")
|
||||
pdf_path = os.path.join(pdf_dir, basename + ".pdf")
|
||||
jobs.append((md_path, pdf_path))
|
||||
|
||||
print(f"Course: {course['key']} ({course['name']})")
|
||||
print(f"Converting {len(jobs)} file(s) to PDF with {args.workers} workers...")
|
||||
|
||||
with ProcessPoolExecutor(max_workers=args.workers) as pool:
|
||||
@@ -254,7 +256,7 @@ def main():
|
||||
md_path = futures[future][0]
|
||||
print(f" ERROR {os.path.basename(md_path)}: {e}", file=sys.stderr)
|
||||
|
||||
print(f"Done. PDFs saved to {PDF_DIR}")
|
||||
print(f"Done. PDFs saved to {pdf_dir}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user