refactor: parametrize pipeline cu --course flag + suport Vimeo/text

Un singur set de scripturi acum rulează pe orice curs configurat în
courses.py. Master rămâne la rădăcina repo-ului (backward-compat M1-M6);
cursuri noi (ex. practitioner la shop.cursnlp.ro) primesc un root
dedicat (nlp-practitioner/) cu propriile artefacte.

- courses.py: config dict (master, practitioner) + course_paths() +
  validate_manifest_course() (manifest fără course_key = master).
- download.py: --course + --modules; trei tipuri de lecții (audio HTTP,
  Vimeo iframe via yt-dlp audio-only, text-only cu captură HTML);
  merge cu manifest existent în loc de replace; strip [Audio] pentru
  backward-compat paths.
- transcribe.py: --course + --modules; skip type==text; path-uri prin
  course_paths(); validare course_key.
- summarize.py: --course + --compile; template prompt folosește
  course['name']; scrie SUPORT_CURS.md cu LF explicit (WSL2 baseline).
- md_to_pdf.py: --course rezolvă summaries_dir / pdf_dir per curs.
- run.bat: detectează master|practitioner ca primul argument,
  propagă --course la sub-scripturi; backward-compat run.bat [modules].
- requirements.txt: + yt-dlp.
- .gitignore: nlp-practitioner/audio/, audio_wav/, scratch_recon.py, tmp_recon/.
- tests/test_regression.sh: 5 gate-uri read-only (import, schema,
  disk-coherence, SUPORT_CURS byte-identic, cross-course isolation).

Regression curs master: PASS (manifest + SUPORT_CURS.md hash
identic cu baseline /tmp/suport_before.md).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-22 14:33:19 +03:00
parent ada00e380d
commit d22038d002
9 changed files with 1192 additions and 795 deletions

View File

@@ -10,8 +10,7 @@ from concurrent.futures import ProcessPoolExecutor, as_completed
import markdown2
from weasyprint import HTML
SUMMARIES_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "summaries")
PDF_DIR = os.path.join(SUMMARIES_DIR, "pdf")
from courses import course_paths, get_course
CSS = """
@page {
@@ -178,9 +177,9 @@ def convert_one(args):
return os.path.basename(md_path), os.path.basename(pdf_path)
def find_files(modules=None):
def find_files(summaries_dir, modules=None):
"""Find all .md files in summaries/, optionally filtered by module numbers."""
pattern = os.path.join(SUMMARIES_DIR, "*.md")
pattern = os.path.join(summaries_dir, "*.md")
files = sorted(glob.glob(pattern))
if modules:
@@ -216,32 +215,35 @@ def parse_modules(spec):
def main():
parser = argparse.ArgumentParser(description="Convert MD summaries to PDF")
parser.add_argument("files", nargs="*", help="Specific MD files to convert")
parser.add_argument(
"--modules", "-m", help="Module filter, e.g. '1-3' or '2,4,5'"
)
parser.add_argument(
"--workers", "-w", type=int, default=4, help="Parallel workers (default: 4)"
)
parser.add_argument("--course", default="master", help="Course key (see courses.py)")
parser.add_argument("--modules", "-m", help="Module filter, e.g. '1-3' or '2,4,5'")
parser.add_argument("--workers", "-w", type=int, default=4, help="Parallel workers (default: 4)")
args = parser.parse_args()
os.makedirs(PDF_DIR, exist_ok=True)
course = get_course(args.course)
paths = course_paths(course)
summaries_dir = str(paths["summaries_dir"].resolve())
pdf_dir = str(paths["pdf_dir"].resolve())
os.makedirs(pdf_dir, exist_ok=True)
if args.files:
md_files = [os.path.abspath(f) for f in args.files]
else:
modules = parse_modules(args.modules) if args.modules else None
md_files = find_files(modules)
md_files = find_files(summaries_dir, modules)
if not md_files:
print("No MD files found to convert.")
print(f"No MD files found in {summaries_dir}")
sys.exit(1)
jobs = []
for md_path in md_files:
basename = os.path.splitext(os.path.basename(md_path))[0]
pdf_path = os.path.join(PDF_DIR, basename + ".pdf")
pdf_path = os.path.join(pdf_dir, basename + ".pdf")
jobs.append((md_path, pdf_path))
print(f"Course: {course['key']} ({course['name']})")
print(f"Converting {len(jobs)} file(s) to PDF with {args.workers} workers...")
with ProcessPoolExecutor(max_workers=args.workers) as pool:
@@ -254,7 +256,7 @@ def main():
md_path = futures[future][0]
print(f" ERROR {os.path.basename(md_path)}: {e}", file=sys.stderr)
print(f"Done. PDFs saved to {PDF_DIR}")
print(f"Done. PDFs saved to {pdf_dir}")
if __name__ == "__main__":