diff --git a/.gitignore b/.gitignore index 90a9b77..e284e29 100644 --- a/.gitignore +++ b/.gitignore @@ -1,34 +1,34 @@ -# Audio files -audio/ -*.mp3 -*.wav - -# Whisper models -models/ -*.bin - -# Credentials -.env - -# Transcripts and summaries (large generated content) -transcripts/ -summaries/ - -# Binaries (downloaded by setup_whisper.py) -whisper-bin/ -ffmpeg-bin/ - -# Temp files -.whisper_bin_path -.ffmpeg_bin_path - -# WAV cache (converted from MP3) -audio_wav/ - -# Python -__pycache__/ -*.pyc -.venv/ - -# Logs -*.log +# Audio files +audio/ +*.mp3 +*.wav + +# Whisper models +models/ +*.bin + +# Credentials +.env + +# Transcripts and summaries (large generated content) +transcripts/ +summaries/ + +# Binaries (downloaded by setup_whisper.py) +whisper-bin/ +ffmpeg-bin/ + +# Temp files +.whisper_bin_path +.ffmpeg_bin_path + +# WAV cache (converted from MP3) +audio_wav/ + +# Python +__pycache__/ +*.pyc +.venv/ + +# Logs +*.log diff --git a/PLAN.md b/PLAN.md index 86cd9aa..b4e3b3d 100644 --- a/PLAN.md +++ b/PLAN.md @@ -1,243 +1,243 @@ -# Design: NLP Master Course Audio Pipeline - -Generated by /office-hours on 2026-03-23 -Branch: unknown -Repo: nlp-master (local, no git) -Status: APPROVED -Mode: Builder - -## Problem Statement - -Marius has an NLP master course hosted at cursuri.aresens.ro/curs/26 with 35 audio recordings (5 modules x 7 lectures, ~95 minutes each, ~58 hours total) in Romanian. The audio is behind a password-protected website. He wants to download all audio files, transcribe them offline using his AMD Radeon RX 6600M 8GB GPU, and generate clean transcripts with per-lecture summaries as study materials. - -## What Makes This Cool - -58 hours of Romanian lecture audio turned into searchable, summarized study materials — completely automated. Download once, transcribe overnight, summarize with Claude Code. A pipeline that would take weeks of manual work happens in hours. 
- -## Constraints - -- **Hardware:** AMD Radeon RX 6600M 8GB (RDNA2) — no CUDA, needs Vulkan or ROCm -- **Language:** Romanian audio — Whisper large-v3 has decent but not perfect Romanian support (~95% accuracy on clean audio) -- **Source:** Password-protected website at cursuri.aresens.ro/curs/26 -- **Scale:** ~35 MP3 files, ~95 min each, ~58 hours total -- **Privacy:** Course content is for personal study use only -- **Tooling:** Claude Code available for summary generation (no separate API cost) -- **Platform:** Native Windows (Python + whisper.cpp + Vulkan). Claude Code runs from WSL2 for summaries. -- **Summaries language:** Romanian (matching source material) -- **Audio format:** MP3, 320kbps, 48kHz stereo, ~218MB per file (verified from sample: "Master 25M1 Z1A [Audio].mp3") - -## Premises - -1. Legitimate access to the course — downloading audio for personal study is within usage rights -2. whisper.cpp with Vulkan backend is the right tool for RX 6600M (avoids ROCm compatibility issues on RDNA2) -3. Audio quality is decent (recorded lectures) — Whisper large-v3 will produce usable Romanian transcripts -4. Summaries will be generated by Claude Code after transcription — separate step -5. Batch pipeline (download all → transcribe all → summarize all) is preferred over incremental processing - -## Approaches Considered - -### Approach A: Full Pipeline (CHOSEN) -Python script for website login + MP3 download. Shell script for whisper.cpp batch transcription (Vulkan, large-v3-q5_0). Claude Code for per-lecture summaries from transcripts. -- Effort: M (human: ~2 days / CC: ~30 min to build, ~8 hours to run transcription) -- Risk: Low -- Pros: Complete automation, reproducible for module 6, best quality -- Cons: whisper.cpp Vulkan build requires system setup - -### Approach B: Download + Transcribe Only -Same download + transcription, no automated summaries. Simpler but defers the valuable part. 
-- Effort: S (human: ~1 day / CC: ~20 min) -- Risk: Low - -### Approach C: Fully Offline (Local LLM summaries) -Everything offline including summaries via llama.cpp. Zero external costs but lower summary quality. -- Effort: M (human: ~2 days / CC: ~40 min) -- Risk: Medium (8GB VRAM shared between whisper.cpp and llama.cpp) - -## Recommended Approach - -**Approach A: Full Pipeline** — Download → whisper.cpp/Vulkan → Claude Code summaries. - -**Execution model:** Everything runs on native Windows (Python, whisper.cpp). Claude Code runs from WSL2 for the summary step. - -### Step 0: Project Setup -- Initialize git repo with `.gitignore` (exclude: `audio/`, `models/`, `.env`, `*.mp3`, `*.wav`, `*.bin`) -- Install Python on Windows (if not already) -- Install Vulkan SDK on Windows -- Create `.env` with course credentials (never committed) - -### Step 1: Site Recon + Download Audio Files -- **First:** Browse cursuri.aresens.ro/curs/26 to understand page structure (login form, module layout, MP3 link format) -- Based on recon, write `download.py` using the right scraping approach (requests+BS4 for static, playwright for JS-rendered — don't build both) -- Login with credentials from `.env` or interactive prompt -- Discover all modules dynamically (don't hardcode 5x7 — actual count may vary) -- Preserve original file names (e.g., "Master 25M1 Z1A [Audio].mp3") and extract lecture titles -- Write `manifest.json` mapping each file to: module, lecture title, original URL, file path, download status -- **Resumability:** skip already-downloaded files (check existence + file size). Retry 3x with backoff. Log to `download_errors.log`. -- **Validation:** after download completes, print summary: "Downloaded X/Y files, Z failures. All files > 1MB: pass/fail." 
- -### Step 2: Install whisper.cpp with Vulkan (Windows native) -- Option A: Download pre-built Windows binary with Vulkan from [whisper.cpp-windows-vulkan-bin](https://github.com/jerryshell/whisper.cpp-windows-vulkan-bin) -- Option B: Build from source with Visual Studio + `-DGGML_VULKAN=1` CMake flag -- Download model: `ggml-large-v3-q5_0.bin` (~1.5GB) from Hugging Face into `models/` -- **VRAM test:** transcribe a 2-min clip from the first lecture to verify GPU detection, measure speed, and validate MP3 input works. If MP3 fails (whisper.cpp built without ffmpeg libs), install ffmpeg or pre-convert with Python pydub. -- **Speed calibration:** RX 6600M is roughly half the speed of RX 9070 XT. Realistic estimate: **3-5x realtime** (~18-30 min per 90-min file). Total: **~12-18 hours** for all files. Plan for a full day, not overnight. -- **Fallback:** if large-v3-q5_0 OOMs on 8GB, try `ggml-large-v3-q4_0.bin` or `ggml-medium-q5_0.bin`. - -### Step 3: Batch Transcription -- `transcribe.py` (Python, cross-platform) reads `manifest.json`, processes files in module order -- Calls whisper.cpp with: `--language ro --model models\ggml-large-v3-q5_0.bin --output-txt --output-srt` -- Output .txt and .srt per file to `transcripts/{original_name_without_ext}/` -- Updates `manifest.json` with transcription status per file -- **Resumability:** skip files with existing .txt output. Log failures to `transcribe_errors.log`. -- **Quality gate:** after first module completes (~2 hours), STOP and spot-check 2-3 transcripts. If Romanian accuracy is poor (lots of garbled text), consider: switching to `large-v3` unquantized, adjusting `--beam-size`, or accepting lower quality. -- **Validation:** print summary: "Transcribed X/Y files. Z failures. No empty outputs: pass/fail." 
- -### Step 4: Summary Generation with Claude Code -- From WSL2, use Claude Code to process each transcript -- Use a Python script (`summarize.py`) that reads `manifest.json`, opens each .txt file, and prints the summary prompt for Claude Code -- Summary prompt (Romanian): "Rezuma aceasta transcriere. Ofera: (1) prezentare generala in 3-5 propozitii, (2) concepte cheie cu definitii, (3) detalii si exemple importante" -- **Chunking:** split transcripts > 10K words at sentence boundaries (not raw word count) with 500-word overlap. Summarize chunks, then merge. -- Output to `summaries/{original_name}_summary.md` -- Final: compile `SUPORT_CURS.md` — master study guide with lecture titles as headings - -### Manifest Schema -```json -{ - "course": "NLP Master 2025", - "source_url": "https://cursuri.aresens.ro/curs/26", - "modules": [ - { - "name": "Modul 1", - "lectures": [ - { - "title": "Master 25M1 Z1A", - "original_filename": "Master 25M1 Z1A [Audio].mp3", - "url": "https://...", - "audio_path": "audio/Master 25M1 Z1A [Audio].mp3", - "transcript_path": "transcripts/Master 25M1 Z1A.txt", - "srt_path": "transcripts/Master 25M1 Z1A.srt", - "summary_path": "summaries/Master 25M1 Z1A_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 228486429 - } - ] - } - ] -} -``` - -### Directory Structure -``` -nlp-master/ - .gitignore # Excludes audio/, models/, .env - .env # Course credentials (not committed) - manifest.json # Shared metadata for all scripts - download.py # Step 1: site recon + download - transcribe.py # Step 3: batch transcription - summarize.py # Step 4: summary generation helper - audio/ - Master 25M1 Z1A [Audio].mp3 - Master 25M1 Z1B [Audio].mp3 - ... - models/ - ggml-large-v3-q5_0.bin - transcripts/ - Master 25M1 Z1A.txt - Master 25M1 Z1A.srt - ... - summaries/ - Master 25M1 Z1A_summary.md - ... - SUPORT_CURS.md -``` - -## Open Questions - -1. 
~~What is the exact website structure?~~ Resolved: browse site first in Step 1. -2. ~~Are there lecture titles on the website?~~ Resolved: preserve original names + extract titles. -3. ~~Do you want the summaries in Romanian or English?~~ Resolved: Romanian. -4. Should the master study guide (SUPORT_CURS.md) include the full transcripts or just summaries? -5. Is there a 6th module coming? If so, the pipeline should be easily re-runnable. -6. Does whisper.cpp Windows binary support MP3 input natively? (Validated in Step 2 VRAM test) - -## Success Criteria - -- All ~35 MP3 files downloaded and organized by module -- All files transcribed to .txt and .srt with >90% accuracy -- Per-lecture summaries generated with key concepts extracted -- Master study guide (SUPORT_CURS.md) ready for reading/searching -- Pipeline is re-runnable for module 6 when it arrives - -## Next Steps - -1. **git init + .gitignore** — set up project, exclude audio/models/.env (~2 min) -2. **Browse cursuri.aresens.ro** — understand site structure before coding (~10 min) -3. **Build download.py** — login + scrape + download + manifest.json (~15 min with CC) -4. **Install whisper.cpp on Windows** — pre-built binary or build from source + Vulkan SDK (~15 min) -5. **Download whisper model** — large-v3-q5_0 from Hugging Face (~5 min) -6. **Test transcription** — 2-min clip, validate GPU, calibrate speed, check MP3 support (~5 min) -7. **Build transcribe.py** — reads manifest, processes in module order, updates status (~10 min with CC) -8. **Run batch transcription** — ~12-18 hours (leave running during workday) -9. **Spot-check quality** — review 2-3 transcripts after Module 1 completes -10. **Generate summaries with Claude Code** — via summarize.py helper (~30 min) -11. 
**Compile SUPORT_CURS.md** — master study guide (~10 min) - -## NOT in scope -- Building a web UI or search interface for transcripts — just flat files -- Automated quality scoring of transcriptions — manual spot-check is sufficient -- Speaker diarization (identifying different speakers) — single lecturer -- Translation to English — summaries stay in Romanian -- CI/CD or deployment — this is a local personal pipeline - -## What already exists -- Nothing — greenfield project. No existing code to reuse. -- The one existing file (`Master 25M1 Z1A [Audio].mp3`) confirms the naming pattern and audio specs. - -## Failure Modes -``` -FAILURE MODE | TEST? | HANDLING? | SILENT? -================================|=======|===========|======== -Session expires during download | No | Yes (retry)| No — logged -MP3 truncated (network drop) | Yes* | Yes (size) | No — validation -whisper.cpp OOM on large model | No | Yes (fallback)| No — logged -whisper.cpp can't read MP3 | No | No** | Yes — CRITICAL -Empty transcript output | Yes* | Yes (log) | No — validation -Poor Romanian accuracy | No | Yes (gate)| No — spot-check -Claude Code input too large | No | Yes (chunk)| No — script handles -manifest.json corruption | No | No | Yes — low risk - -* = covered by inline validation checks -** = validated in Step 2 test; if fails, install ffmpeg or use pydub -``` -**Critical gap:** whisper.cpp MP3 support must be validated in Step 2. If it fails silently (produces garbage), the entire batch is wasted. - -## Eng Review Decisions (2026-03-24) -1. Hybrid platform → **All Windows Python** (not WSL2 for scripts) -2. Browse site first → build the right scraper, not two fallback paths -3. Preserve original file names + extract lecture titles -4. Add manifest.json as shared metadata between scripts -5. Python for all scripts (download.py, transcribe.py, summarize.py) -6. Built-in validation checks in each script -7. Feed MP3s directly (no pre-convert) -8. Process in module order -9. 
Realistic transcription estimate: 12-18 hours (not 7-8) - -## What I noticed about how you think - -- You said "vreau offline transcription + claude code pentru summaries" — you immediately found the pragmatic middle path between fully offline and fully API-dependent. That's good engineering instinct: use the best tool for each step rather than forcing one tool to do everything. -- You gave concrete numbers upfront: "5 module din 6, fiecare cu 7 audio-uri" and "90-100 minute" — you'd already scoped the problem before sitting down. That's not how most people start; most people say "I have some audio files." -- You chose "transcripts + summaries" over "just transcripts" or "full study system" — you know what's useful without over-engineering. - -## GSTACK REVIEW REPORT - -| Review | Trigger | Why | Runs | Status | Findings | -|--------|---------|-----|------|--------|----------| -| CEO Review | `/plan-ceo-review` | Scope & strategy | 0 | — | — | -| Codex Review | `/codex review` | Independent 2nd opinion | 0 | — | — | -| Eng Review | `/plan-eng-review` | Architecture & tests (required) | 1 | CLEAR (PLAN) | 8 issues, 0 critical gaps | -| Design Review | `/plan-design-review` | UI/UX gaps | 0 | — | — | - -- **OUTSIDE VOICE:** Claude subagent ran — 10 findings, 3 cross-model tensions resolved (platform execution, speed estimate, module order) -- **UNRESOLVED:** 0 -- **VERDICT:** ENG CLEARED — ready to implement +# Design: NLP Master Course Audio Pipeline + +Generated by /office-hours on 2026-03-23 +Branch: unknown +Repo: nlp-master (local, no git) +Status: APPROVED +Mode: Builder + +## Problem Statement + +Marius has an NLP master course hosted at cursuri.aresens.ro/curs/26 with 35 audio recordings (5 modules x 7 lectures, ~95 minutes each, ~58 hours total) in Romanian. The audio is behind a password-protected website. 
He wants to download all audio files, transcribe them offline using his AMD Radeon RX 6600M 8GB GPU, and generate clean transcripts with per-lecture summaries as study materials. + +## What Makes This Cool + +58 hours of Romanian lecture audio turned into searchable, summarized study materials — completely automated. Download once, transcribe over about a day, summarize with Claude Code. A pipeline that would take weeks of manual work happens in hours. + +## Constraints + +- **Hardware:** AMD Radeon RX 6600M 8GB (RDNA2) — no CUDA, needs Vulkan or ROCm +- **Language:** Romanian audio — Whisper large-v3 has decent but not perfect Romanian support (~95% accuracy on clean audio) +- **Source:** Password-protected website at cursuri.aresens.ro/curs/26 +- **Scale:** ~35 MP3 files, ~95 min each, ~58 hours total +- **Privacy:** Course content is for personal study use only +- **Tooling:** Claude Code available for summary generation (no separate API cost) +- **Platform:** Native Windows (Python + whisper.cpp + Vulkan). Claude Code runs from WSL2 for summaries. +- **Summaries language:** Romanian (matching source material) +- **Audio format:** MP3, 320kbps, 48kHz stereo, ~218MB per file (verified from sample: "Master 25M1 Z1A [Audio].mp3") + +## Premises + +1. Legitimate access to the course — downloading audio for personal study is within usage rights +2. whisper.cpp with Vulkan backend is the right tool for RX 6600M (avoids ROCm compatibility issues on RDNA2) +3. Audio quality is decent (recorded lectures) — Whisper large-v3 will produce usable Romanian transcripts +4. Summaries will be generated by Claude Code after transcription — separate step +5. Batch pipeline (download all → transcribe all → summarize all) is preferred over incremental processing + +## Approaches Considered + +### Approach A: Full Pipeline (CHOSEN) +Python script for website login + MP3 download. Python script (`transcribe.py`) for whisper.cpp batch transcription (Vulkan, large-v3-q5_0). 
Claude Code for per-lecture summaries from transcripts. +- Effort: M (human: ~2 days / CC: ~30 min to build, ~12-18 hours to run transcription) +- Risk: Low +- Pros: Complete automation, reproducible for module 6, best quality +- Cons: whisper.cpp Vulkan build requires system setup + +### Approach B: Download + Transcribe Only +Same download + transcription, no automated summaries. Simpler but defers the valuable part. +- Effort: S (human: ~1 day / CC: ~20 min) +- Risk: Low + +### Approach C: Fully Offline (Local LLM summaries) +Everything offline including summaries via llama.cpp. Zero external costs but lower summary quality. +- Effort: M (human: ~2 days / CC: ~40 min) +- Risk: Medium (8GB VRAM shared between whisper.cpp and llama.cpp) + +## Recommended Approach + +**Approach A: Full Pipeline** — Download → whisper.cpp/Vulkan → Claude Code summaries. + +**Execution model:** Everything runs on native Windows (Python, whisper.cpp). Claude Code runs from WSL2 for the summary step. + +### Step 0: Project Setup +- Initialize git repo with `.gitignore` (exclude: `audio/`, `models/`, `.env`, `*.mp3`, `*.wav`, `*.bin`) +- Install Python on Windows (if not already) +- Install Vulkan SDK on Windows +- Create `.env` with course credentials (never committed) + +### Step 1: Site Recon + Download Audio Files +- **First:** Browse cursuri.aresens.ro/curs/26 to understand page structure (login form, module layout, MP3 link format) +- Based on recon, write `download.py` using the right scraping approach (requests+BS4 for static, playwright for JS-rendered — don't build both) +- Login with credentials from `.env` or interactive prompt +- Discover all modules dynamically (don't hardcode 5x7 — actual count may vary) +- Preserve original file names (e.g., "Master 25M1 Z1A [Audio].mp3") and extract lecture titles +- Write `manifest.json` mapping each file to: module, lecture title, original URL, file path, download status +- **Resumability:** skip already-downloaded files (check existence 
+ file size). Retry 3x with backoff. Log to `download_errors.log`. +- **Validation:** after download completes, print summary: "Downloaded X/Y files, Z failures. All files > 1MB: pass/fail." + +### Step 2: Install whisper.cpp with Vulkan (Windows native) +- Option A: Download pre-built Windows binary with Vulkan from [whisper.cpp-windows-vulkan-bin](https://github.com/jerryshell/whisper.cpp-windows-vulkan-bin) +- Option B: Build from source with Visual Studio + `-DGGML_VULKAN=1` CMake flag +- Download model: `ggml-large-v3-q5_0.bin` (~1.1GB) from Hugging Face into `models/` +- **VRAM test:** transcribe a 2-min clip from the first lecture to verify GPU detection, measure speed, and validate MP3 input works. If MP3 fails (whisper.cpp built without ffmpeg libs), install ffmpeg or pre-convert with Python pydub. +- **Speed calibration:** RX 6600M is roughly half the speed of RX 9070 XT. Realistic estimate: **3-5x realtime** (~18-30 min per 90-min file). Total: **~12-18 hours** for all files. Plan for a full day, not overnight. +- **Fallback:** if large-v3-q5_0 OOMs on 8GB, try `ggml-large-v3-q4_0.bin` or `ggml-medium-q5_0.bin`. + +### Step 3: Batch Transcription +- `transcribe.py` (Python, cross-platform) reads `manifest.json`, processes files in module order +- Calls whisper.cpp with: `--language ro --model models\ggml-large-v3-q5_0.bin --output-txt --output-srt` +- Output .txt and .srt per file directly into `transcripts/` as `{title}.txt` and `{title}.srt`, matching the manifest's `transcript_path` and `srt_path` (no per-file subdirectory) +- Updates `manifest.json` with transcription status per file +- **Resumability:** skip files with existing .txt output. Log failures to `transcribe_errors.log`. +- **Quality gate:** after first module completes (~2 hours), STOP and spot-check 2-3 transcripts. If Romanian accuracy is poor (lots of garbled text), consider: switching to `large-v3` unquantized, adjusting `--beam-size`, or accepting lower quality. +- **Validation:** print summary: "Transcribed X/Y files. Z failures. No empty outputs: pass/fail." 
+ +### Step 4: Summary Generation with Claude Code +- From WSL2, use Claude Code to process each transcript +- Use a Python script (`summarize.py`) that reads `manifest.json`, opens each .txt file, and prints the summary prompt for Claude Code +- Summary prompt (Romanian): "Rezuma aceasta transcriere. Ofera: (1) prezentare generala in 3-5 propozitii, (2) concepte cheie cu definitii, (3) detalii si exemple importante" +- **Chunking:** split transcripts > 10K words at sentence boundaries (not raw word count) with 500-word overlap. Summarize chunks, then merge. +- Output to `summaries/{original_name}_summary.md` +- Final: compile `SUPORT_CURS.md` — master study guide with lecture titles as headings + +### Manifest Schema +```json +{ + "course": "NLP Master 2025", + "source_url": "https://cursuri.aresens.ro/curs/26", + "modules": [ + { + "name": "Modul 1", + "lectures": [ + { + "title": "Master 25M1 Z1A", + "original_filename": "Master 25M1 Z1A [Audio].mp3", + "url": "https://...", + "audio_path": "audio/Master 25M1 Z1A [Audio].mp3", + "transcript_path": "transcripts/Master 25M1 Z1A.txt", + "srt_path": "transcripts/Master 25M1 Z1A.srt", + "summary_path": "summaries/Master 25M1 Z1A_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 228486429 + } + ] + } + ] +} +``` + +### Directory Structure +``` +nlp-master/ + .gitignore # Excludes audio/, models/, .env + .env # Course credentials (not committed) + manifest.json # Shared metadata for all scripts + download.py # Step 1: site recon + download + transcribe.py # Step 3: batch transcription + summarize.py # Step 4: summary generation helper + audio/ + Master 25M1 Z1A [Audio].mp3 + Master 25M1 Z1B [Audio].mp3 + ... + models/ + ggml-large-v3-q5_0.bin + transcripts/ + Master 25M1 Z1A.txt + Master 25M1 Z1A.srt + ... + summaries/ + Master 25M1 Z1A_summary.md + ... + SUPORT_CURS.md +``` + +## Open Questions + +1. 
~~What is the exact website structure?~~ Resolved: browse site first in Step 1. +2. ~~Are there lecture titles on the website?~~ Resolved: preserve original names + extract titles. +3. ~~Do you want the summaries in Romanian or English?~~ Resolved: Romanian. +4. Should the master study guide (SUPORT_CURS.md) include the full transcripts or just summaries? +5. Is there a 6th module coming? If so, the pipeline should be easily re-runnable. +6. Does whisper.cpp Windows binary support MP3 input natively? (Validated in Step 2 VRAM test) + +## Success Criteria + +- All ~35 MP3 files downloaded and organized by module +- All files transcribed to .txt and .srt with >90% accuracy +- Per-lecture summaries generated with key concepts extracted +- Master study guide (SUPORT_CURS.md) ready for reading/searching +- Pipeline is re-runnable for module 6 when it arrives + +## Next Steps + +1. **git init + .gitignore** — set up project, exclude audio/models/.env (~2 min) +2. **Browse cursuri.aresens.ro** — understand site structure before coding (~10 min) +3. **Build download.py** — login + scrape + download + manifest.json (~15 min with CC) +4. **Install whisper.cpp on Windows** — pre-built binary or build from source + Vulkan SDK (~15 min) +5. **Download whisper model** — large-v3-q5_0 from Hugging Face (~5 min) +6. **Test transcription** — 2-min clip, validate GPU, calibrate speed, check MP3 support (~5 min) +7. **Build transcribe.py** — reads manifest, processes in module order, updates status (~10 min with CC) +8. **Run batch transcription** — ~12-18 hours (leave running during workday) +9. **Spot-check quality** — review 2-3 transcripts after Module 1 completes +10. **Generate summaries with Claude Code** — via summarize.py helper (~30 min) +11. 
**Compile SUPORT_CURS.md** — master study guide (~10 min) + +## NOT in scope +- Building a web UI or search interface for transcripts — just flat files +- Automated quality scoring of transcriptions — manual spot-check is sufficient +- Speaker diarization (identifying different speakers) — single lecturer +- Translation to English — summaries stay in Romanian +- CI/CD or deployment — this is a local personal pipeline + +## What already exists +- Nothing — greenfield project. No existing code to reuse. +- The one existing file (`Master 25M1 Z1A [Audio].mp3`) confirms the naming pattern and audio specs. + +## Failure Modes +``` +FAILURE MODE | TEST? | HANDLING? | SILENT? +================================|=======|===========|======== +Session expires during download | No | Yes (retry)| No — logged +MP3 truncated (network drop) | Yes* | Yes (size) | No — validation +whisper.cpp OOM on large model | No | Yes (fallback)| No — logged +whisper.cpp can't read MP3 | No | No** | Yes — CRITICAL +Empty transcript output | Yes* | Yes (log) | No — validation +Poor Romanian accuracy | No | Yes (gate)| No — spot-check +Claude Code input too large | No | Yes (chunk)| No — script handles +manifest.json corruption | No | No | Yes — low risk + +* = covered by inline validation checks +** = validated in Step 2 test; if fails, install ffmpeg or use pydub +``` +**Critical gap:** whisper.cpp MP3 support must be validated in Step 2. If it fails silently (produces garbage), the entire batch is wasted. + +## Eng Review Decisions (2026-03-24) +1. Hybrid platform → **All Windows Python** (not WSL2 for scripts) +2. Browse site first → build the right scraper, not two fallback paths +3. Preserve original file names + extract lecture titles +4. Add manifest.json as shared metadata between scripts +5. Python for all scripts (download.py, transcribe.py, summarize.py) +6. Built-in validation checks in each script +7. Feed MP3s directly (no pre-convert) +8. Process in module order +9. 
Realistic transcription estimate: 12-18 hours (not 7-8) + +## What I noticed about how you think + +- You said "vreau offline transcription + claude code pentru summaries" — you immediately found the pragmatic middle path between fully offline and fully API-dependent. That's good engineering instinct: use the best tool for each step rather than forcing one tool to do everything. +- You gave concrete numbers upfront: "5 module din 6, fiecare cu 7 audio-uri" and "90-100 minute" — you'd already scoped the problem before sitting down. That's not how most people start; most people say "I have some audio files." +- You chose "transcripts + summaries" over "just transcripts" or "full study system" — you know what's useful without over-engineering. + +## GSTACK REVIEW REPORT + +| Review | Trigger | Why | Runs | Status | Findings | +|--------|---------|-----|------|--------|----------| +| CEO Review | `/plan-ceo-review` | Scope & strategy | 0 | — | — | +| Codex Review | `/codex review` | Independent 2nd opinion | 0 | — | — | +| Eng Review | `/plan-eng-review` | Architecture & tests (required) | 1 | CLEAR (PLAN) | 8 issues, 0 critical gaps | +| Design Review | `/plan-design-review` | UI/UX gaps | 0 | — | — | + +- **OUTSIDE VOICE:** Claude subagent ran — 10 findings, 3 cross-model tensions resolved (platform execution, speed estimate, module order) +- **UNRESOLVED:** 0 +- **VERDICT:** ENG CLEARED — ready to implement diff --git a/TODOS.md b/TODOS.md index d636cef..0f78274 100644 --- a/TODOS.md +++ b/TODOS.md @@ -1,8 +1,8 @@ -# TODOS - -## Re-run pipeline for Module 6 -- **What:** Re-run `download.py` when module 6 becomes available on cursuri.aresens.ro/curs/26 -- **Why:** Course has 6 modules total, only 5 are currently available. Pipeline is designed to be re-runnable — manifest.json + resumability means it discovers new modules and skips already-downloaded files. 
-- **How:** Run `python download.py` → check manifest for new files → run `python transcribe.py` → generate summaries → update SUPORT_CURS.md -- **Depends on:** Course provider publishing module 6 -- **Added:** 2026-03-24 +# TODOS + +## Re-run pipeline for Module 6 +- **What:** Re-run `download.py` when module 6 becomes available on cursuri.aresens.ro/curs/26 +- **Why:** Course has 6 modules total, only 5 are currently available. Pipeline is designed to be re-runnable — manifest.json + resumability means it discovers new modules and skips already-downloaded files. +- **How:** Run `python download.py` → check manifest for new files → run `python transcribe.py` → generate summaries → update SUPORT_CURS.md +- **Depends on:** Course provider publishing module 6 +- **Added:** 2026-03-24 diff --git a/download.py b/download.py index ce4869e..591bd7e 100644 --- a/download.py +++ b/download.py @@ -1,253 +1,253 @@ -""" -Download all audio files from cursuri.aresens.ro NLP Master course. -Logs in, discovers modules and lectures, downloads MP3s, writes manifest.json. -Resumable: skips already-downloaded files. 
-""" - -import json -import logging -import os -import sys -import time -from pathlib import Path -from urllib.parse import urljoin - -import requests -from bs4 import BeautifulSoup -from dotenv import load_dotenv - -BASE_URL = "https://cursuri.aresens.ro" -COURSE_URL = f"{BASE_URL}/curs/26" -LOGIN_URL = f"{BASE_URL}/login" -AUDIO_DIR = Path("audio") -MANIFEST_PATH = Path("manifest.json") -MAX_RETRIES = 3 -RETRY_BACKOFF = [5, 15, 30] - -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s [%(levelname)s] %(message)s", - handlers=[ - logging.StreamHandler(), - logging.FileHandler("download_errors.log"), - ], -) -log = logging.getLogger(__name__) - - -def login(session: requests.Session, email: str, password: str) -> bool: - """Login and return True on success.""" - resp = session.post(LOGIN_URL, data={ - "email": email, - "password": password, - "act": "login", - "remember": "on", - }, allow_redirects=True) - # Successful login redirects to the course page, not back to /login - if "/login" in resp.url or "loginform" in resp.text: - return False - return True - - -def discover_modules(session: requests.Session) -> list[dict]: - """Fetch course page and return list of {name, url, module_id}.""" - resp = session.get(COURSE_URL) - resp.raise_for_status() - soup = BeautifulSoup(resp.text, "html.parser") - - modules = [] - for div in soup.select("div.module"): - number_el = div.select_one("div.module__number") - link_el = div.select_one("a.btn") - if not number_el or not link_el: - continue - href = link_el.get("href", "") - module_id = href.rstrip("/").split("/")[-1] - modules.append({ - "name": number_el.get_text(strip=True), - "url": urljoin(BASE_URL, href), - "module_id": module_id, - }) - log.info(f"Found {len(modules)} modules") - return modules - - -def discover_lectures(session: requests.Session, module: dict) -> list[dict]: - """Fetch a module page and return list of lectures with audio URLs.""" - resp = session.get(module["url"]) - 
resp.raise_for_status() - soup = BeautifulSoup(resp.text, "html.parser") - - lectures = [] - for lesson_div in soup.select("div.lesson"): - name_el = lesson_div.select_one("div.module__name") - source_el = lesson_div.select_one("audio source") - if not name_el or not source_el: - continue - src = source_el.get("src", "").strip() - if not src: - continue - audio_url = urljoin(BASE_URL, src) - filename = src.split("/")[-1] - title = name_el.get_text(strip=True) - lectures.append({ - "title": title, - "original_filename": filename, - "url": audio_url, - "audio_path": str(AUDIO_DIR / filename), - }) - log.info(f" {module['name']}: {len(lectures)} lectures") - return lectures - - -def download_file(session: requests.Session, url: str, dest: Path) -> bool: - """Download a file with retry logic. Returns True on success.""" - for attempt in range(MAX_RETRIES): - try: - resp = session.get(url, stream=True, timeout=300) - resp.raise_for_status() - - # Write to temp file first, then rename (atomic) - tmp = dest.with_suffix(".tmp") - total = 0 - with open(tmp, "wb") as f: - for chunk in resp.iter_content(chunk_size=1024 * 1024): - f.write(chunk) - total += len(chunk) - - if total < 1_000_000: # < 1MB is suspicious - log.warning(f"File too small ({total} bytes): {dest.name}") - tmp.unlink(missing_ok=True) - return False - - tmp.rename(dest) - log.info(f" Downloaded: {dest.name} ({total / 1_000_000:.1f} MB)") - return True - - except Exception as e: - wait = RETRY_BACKOFF[attempt] if attempt < len(RETRY_BACKOFF) else 30 - log.warning(f" Attempt {attempt + 1}/{MAX_RETRIES} failed for {dest.name}: {e}") - if attempt < MAX_RETRIES - 1: - log.info(f" Retrying in {wait}s...") - time.sleep(wait) - - log.error(f" FAILED after {MAX_RETRIES} attempts: {dest.name}") - return False - - -def load_manifest() -> dict | None: - """Load existing manifest if present.""" - if MANIFEST_PATH.exists(): - with open(MANIFEST_PATH) as f: - return json.load(f) - return None - - -def 
save_manifest(manifest: dict): - """Write manifest.json.""" - with open(MANIFEST_PATH, "w", encoding="utf-8") as f: - json.dump(manifest, f, indent=2, ensure_ascii=False) - - -def main(): - load_dotenv() - email = os.getenv("COURSE_USERNAME", "") - password = os.getenv("COURSE_PASSWORD", "") - if not email or not password: - log.error("Set COURSE_USERNAME and COURSE_PASSWORD in .env") - sys.exit(1) - - AUDIO_DIR.mkdir(exist_ok=True) - - session = requests.Session() - session.headers.update({"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}) - - log.info("Logging in...") - if not login(session, email, password): - log.error("Login failed. Check credentials in .env") - sys.exit(1) - log.info("Login successful") - - # Discover structure - modules = discover_modules(session) - if not modules: - log.error("No modules found") - sys.exit(1) - - manifest = { - "course": "NLP Master Practitioner Bucuresti 2025", - "source_url": COURSE_URL, - "modules": [], - } - - total_files = 0 - downloaded = 0 - skipped = 0 - failed = 0 - - for mod in modules: - lectures = discover_lectures(session, mod) - module_entry = { - "name": mod["name"], - "module_id": mod["module_id"], - "lectures": [], - } - - for lec in lectures: - total_files += 1 - dest = Path(lec["audio_path"]) - stem = dest.stem.replace(" [Audio]", "") - - lecture_entry = { - "title": lec["title"], - "original_filename": lec["original_filename"], - "url": lec["url"], - "audio_path": lec["audio_path"], - "transcript_path": f"transcripts/{stem}.txt", - "srt_path": f"transcripts/{stem}.srt", - "summary_path": f"summaries/{stem}_summary.md", - "download_status": "pending", - "transcribe_status": "pending", - "file_size_bytes": 0, - } - - # Skip if already downloaded - if dest.exists() and dest.stat().st_size > 1_000_000: - lecture_entry["download_status"] = "complete" - lecture_entry["file_size_bytes"] = dest.stat().st_size - skipped += 1 - log.info(f" Skipping (exists): {dest.name}") - else: - if 
download_file(session, lec["url"], dest): - lecture_entry["download_status"] = "complete" - lecture_entry["file_size_bytes"] = dest.stat().st_size - downloaded += 1 - else: - lecture_entry["download_status"] = "failed" - failed += 1 - - module_entry["lectures"].append(lecture_entry) - - manifest["modules"].append(module_entry) - # Save manifest after each module (checkpoint) - save_manifest(manifest) - - # Final validation - all_ok = all( - Path(lec["audio_path"]).exists() and Path(lec["audio_path"]).stat().st_size > 1_000_000 - for mod in manifest["modules"] - for lec in mod["lectures"] - if lec["download_status"] == "complete" - ) - - log.info("=" * 60) - log.info(f"Downloaded {downloaded}/{total_files} files, {skipped} skipped, {failed} failures.") - log.info(f"All files > 1MB: {'PASS' if all_ok else 'FAIL'}") - log.info("=" * 60) - - if failed: - sys.exit(1) - - -if __name__ == "__main__": - main() +""" +Download all audio files from cursuri.aresens.ro NLP Master course. +Logs in, discovers modules and lectures, downloads MP3s, writes manifest.json. +Resumable: skips already-downloaded files. 
+""" + +import json +import logging +import os +import sys +import time +from pathlib import Path +from urllib.parse import urljoin + +import requests +from bs4 import BeautifulSoup +from dotenv import load_dotenv + +BASE_URL = "https://cursuri.aresens.ro" +COURSE_URL = f"{BASE_URL}/curs/26" +LOGIN_URL = f"{BASE_URL}/login" +AUDIO_DIR = Path("audio") +MANIFEST_PATH = Path("manifest.json") +MAX_RETRIES = 3 +RETRY_BACKOFF = [5, 15, 30] + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + handlers=[ + logging.StreamHandler(), + logging.FileHandler("download_errors.log"), + ], +) +log = logging.getLogger(__name__) + + +def login(session: requests.Session, email: str, password: str) -> bool: + """Login and return True on success.""" + resp = session.post(LOGIN_URL, data={ + "email": email, + "password": password, + "act": "login", + "remember": "on", + }, allow_redirects=True) + # Successful login redirects to the course page, not back to /login + if "/login" in resp.url or "loginform" in resp.text: + return False + return True + + +def discover_modules(session: requests.Session) -> list[dict]: + """Fetch course page and return list of {name, url, module_id}.""" + resp = session.get(COURSE_URL) + resp.raise_for_status() + soup = BeautifulSoup(resp.text, "html.parser") + + modules = [] + for div in soup.select("div.module"): + number_el = div.select_one("div.module__number") + link_el = div.select_one("a.btn") + if not number_el or not link_el: + continue + href = link_el.get("href", "") + module_id = href.rstrip("/").split("/")[-1] + modules.append({ + "name": number_el.get_text(strip=True), + "url": urljoin(BASE_URL, href), + "module_id": module_id, + }) + log.info(f"Found {len(modules)} modules") + return modules + + +def discover_lectures(session: requests.Session, module: dict) -> list[dict]: + """Fetch a module page and return list of lectures with audio URLs.""" + resp = session.get(module["url"]) + 
resp.raise_for_status() + soup = BeautifulSoup(resp.text, "html.parser") + + lectures = [] + for lesson_div in soup.select("div.lesson"): + name_el = lesson_div.select_one("div.module__name") + source_el = lesson_div.select_one("audio source") + if not name_el or not source_el: + continue + src = source_el.get("src", "").strip() + if not src: + continue + audio_url = urljoin(BASE_URL, src) + filename = src.split("/")[-1] + title = name_el.get_text(strip=True) + lectures.append({ + "title": title, + "original_filename": filename, + "url": audio_url, + "audio_path": str(AUDIO_DIR / filename), + }) + log.info(f" {module['name']}: {len(lectures)} lectures") + return lectures + + +def download_file(session: requests.Session, url: str, dest: Path) -> bool: + """Download a file with retry logic. Returns True on success.""" + for attempt in range(MAX_RETRIES): + try: + resp = session.get(url, stream=True, timeout=300) + resp.raise_for_status() + + # Write to temp file first, then rename (atomic) + tmp = dest.with_suffix(".tmp") + total = 0 + with open(tmp, "wb") as f: + for chunk in resp.iter_content(chunk_size=1024 * 1024): + f.write(chunk) + total += len(chunk) + + if total < 1_000_000: # < 1MB is suspicious + log.warning(f"File too small ({total} bytes): {dest.name}") + tmp.unlink(missing_ok=True) + return False + + tmp.rename(dest) + log.info(f" Downloaded: {dest.name} ({total / 1_000_000:.1f} MB)") + return True + + except Exception as e: + wait = RETRY_BACKOFF[attempt] if attempt < len(RETRY_BACKOFF) else 30 + log.warning(f" Attempt {attempt + 1}/{MAX_RETRIES} failed for {dest.name}: {e}") + if attempt < MAX_RETRIES - 1: + log.info(f" Retrying in {wait}s...") + time.sleep(wait) + + log.error(f" FAILED after {MAX_RETRIES} attempts: {dest.name}") + return False + + +def load_manifest() -> dict | None: + """Load existing manifest if present.""" + if MANIFEST_PATH.exists(): + with open(MANIFEST_PATH) as f: + return json.load(f) + return None + + +def 
save_manifest(manifest: dict): + """Write manifest.json.""" + with open(MANIFEST_PATH, "w", encoding="utf-8") as f: + json.dump(manifest, f, indent=2, ensure_ascii=False) + + +def main(): + load_dotenv() + email = os.getenv("COURSE_USERNAME", "") + password = os.getenv("COURSE_PASSWORD", "") + if not email or not password: + log.error("Set COURSE_USERNAME and COURSE_PASSWORD in .env") + sys.exit(1) + + AUDIO_DIR.mkdir(exist_ok=True) + + session = requests.Session() + session.headers.update({"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}) + + log.info("Logging in...") + if not login(session, email, password): + log.error("Login failed. Check credentials in .env") + sys.exit(1) + log.info("Login successful") + + # Discover structure + modules = discover_modules(session) + if not modules: + log.error("No modules found") + sys.exit(1) + + manifest = { + "course": "NLP Master Practitioner Bucuresti 2025", + "source_url": COURSE_URL, + "modules": [], + } + + total_files = 0 + downloaded = 0 + skipped = 0 + failed = 0 + + for mod in modules: + lectures = discover_lectures(session, mod) + module_entry = { + "name": mod["name"], + "module_id": mod["module_id"], + "lectures": [], + } + + for lec in lectures: + total_files += 1 + dest = Path(lec["audio_path"]) + stem = dest.stem.replace(" [Audio]", "") + + lecture_entry = { + "title": lec["title"], + "original_filename": lec["original_filename"], + "url": lec["url"], + "audio_path": lec["audio_path"], + "transcript_path": f"transcripts/{stem}.txt", + "srt_path": f"transcripts/{stem}.srt", + "summary_path": f"summaries/{stem}_summary.md", + "download_status": "pending", + "transcribe_status": "pending", + "file_size_bytes": 0, + } + + # Skip if already downloaded + if dest.exists() and dest.stat().st_size > 1_000_000: + lecture_entry["download_status"] = "complete" + lecture_entry["file_size_bytes"] = dest.stat().st_size + skipped += 1 + log.info(f" Skipping (exists): {dest.name}") + else: + if 
download_file(session, lec["url"], dest): + lecture_entry["download_status"] = "complete" + lecture_entry["file_size_bytes"] = dest.stat().st_size + downloaded += 1 + else: + lecture_entry["download_status"] = "failed" + failed += 1 + + module_entry["lectures"].append(lecture_entry) + + manifest["modules"].append(module_entry) + # Save manifest after each module (checkpoint) + save_manifest(manifest) + + # Final validation + all_ok = all( + Path(lec["audio_path"]).exists() and Path(lec["audio_path"]).stat().st_size > 1_000_000 + for mod in manifest["modules"] + for lec in mod["lectures"] + if lec["download_status"] == "complete" + ) + + log.info("=" * 60) + log.info(f"Downloaded {downloaded}/{total_files} files, {skipped} skipped, {failed} failures.") + log.info(f"All files > 1MB: {'PASS' if all_ok else 'FAIL'}") + log.info("=" * 60) + + if failed: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/manifest.json b/manifest.json index eaa579f..a529e35 100644 --- a/manifest.json +++ b/manifest.json @@ -1,533 +1,533 @@ -{ - "course": "NLP Master Practitioner Bucuresti 2025", - "source_url": "https://cursuri.aresens.ro/curs/26", - "modules": [ - { - "name": "Modul 1", - "module_id": "41", - "lectures": [ - { - "title": "Master 2025 Modulul 1 - Ziua 1 - partea 1", - "original_filename": "Master 25M1 Z1A [Audio].mp3", - "url": "https://cursuri.aresens.ro/resurse/Master 25M1 Z1A [Audio].mp3", - "audio_path": "audio\\Master 25M1 Z1A [Audio].mp3", - "transcript_path": "transcripts/Master 25M1 Z1A.txt", - "srt_path": "transcripts/Master 25M1 Z1A.srt", - "summary_path": "summaries/Master 25M1 Z1A_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 228486429 - }, - { - "title": "Master 2025 Modulul 1 - Ziua 1 - partea 2", - "original_filename": "Master 25M1 Z1B [Audio].mp3", - "url": "https://cursuri.aresens.ro/resurse/Master 25M1 Z1B [Audio].mp3", - "audio_path": "audio\\Master 25M1 Z1B [Audio].mp3", - 
"transcript_path": "transcripts/Master 25M1 Z1B.txt", - "srt_path": "transcripts/Master 25M1 Z1B.srt", - "summary_path": "summaries/Master 25M1 Z1B_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 237397902 - }, - { - "title": "Master 2025 Modulul 1 - Ziua 1 - partea 3", - "original_filename": "Master 25M1 Z1C [Audio].mp3", - "url": "https://cursuri.aresens.ro/resurse/Master 25M1 Z1C [Audio].mp3", - "audio_path": "audio\\Master 25M1 Z1C [Audio].mp3", - "transcript_path": "transcripts/Master 25M1 Z1C.txt", - "srt_path": "transcripts/Master 25M1 Z1C.srt", - "summary_path": "summaries/Master 25M1 Z1C_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 235260881 - }, - { - "title": "Master 2025 Modulul 1 - Ziua 1 - partea 4", - "original_filename": "Master 25M1 Z1D [Audio].mp3", - "url": "https://cursuri.aresens.ro/resurse/Master 25M1 Z1D [Audio].mp3", - "audio_path": "audio\\Master 25M1 Z1D [Audio].mp3", - "transcript_path": "transcripts/Master 25M1 Z1D.txt", - "srt_path": "transcripts/Master 25M1 Z1D.srt", - "summary_path": "summaries/Master 25M1 Z1D_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 194361634 - }, - { - "title": "Master 2025 Modulul 1 Ziua 2 - partea 1", - "original_filename": "Master 25M1 Z2A [Audio].mp3", - "url": "https://cursuri.aresens.ro/resurse/Master 25M1 Z2A [Audio].mp3", - "audio_path": "audio\\Master 25M1 Z2A [Audio].mp3", - "transcript_path": "transcripts/Master 25M1 Z2A.txt", - "srt_path": "transcripts/Master 25M1 Z2A.srt", - "summary_path": "summaries/Master 25M1 Z2A_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 220578693 - }, - { - "title": "Master 2025 Modulul 1 - Ziua 2 - partea 2", - "original_filename": "Master 25M1 Z2B [Audio].mp3", - "url": "https://cursuri.aresens.ro/resurse/Master 25M1 Z2B [Audio].mp3", - "audio_path": 
"audio\\Master 25M1 Z2B [Audio].mp3", - "transcript_path": "transcripts/Master 25M1 Z2B.txt", - "srt_path": "transcripts/Master 25M1 Z2B.srt", - "summary_path": "summaries/Master 25M1 Z2B_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 226529515 - }, - { - "title": "Master 2025 Modulul 1 - Ziua 2 - partea 3", - "original_filename": "Master 25M1 Z2C [Audio].mp3", - "url": "https://cursuri.aresens.ro/resurse/Master 25M1 Z2C [Audio].mp3", - "audio_path": "audio\\Master 25M1 Z2C [Audio].mp3", - "transcript_path": "transcripts/Master 25M1 Z2C.txt", - "srt_path": "transcripts/Master 25M1 Z2C.srt", - "summary_path": "summaries/Master 25M1 Z2C_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 257356313 - } - ] - }, - { - "name": "Modul 2", - "module_id": "42", - "lectures": [ - { - "title": "Master 2025 Modulul 2 - Ziua 1 - partea 1", - "original_filename": "Audio Master 2025 M2 Z1A.mp3", - "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M2 Z1A.mp3", - "audio_path": "audio\\Audio Master 2025 M2 Z1A.mp3", - "transcript_path": "transcripts/Audio Master 2025 M2 Z1A.txt", - "srt_path": "transcripts/Audio Master 2025 M2 Z1A.srt", - "summary_path": "summaries/Audio Master 2025 M2 Z1A_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 258054778 - }, - { - "title": "Master 2025 Modulul 2 - Ziua 1 - partea 2", - "original_filename": "Audio Master 2025 M2 Z1B.mp3", - "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M2 Z1B.mp3", - "audio_path": "audio\\Audio Master 2025 M2 Z1B.mp3", - "transcript_path": "transcripts/Audio Master 2025 M2 Z1B.txt", - "srt_path": "transcripts/Audio Master 2025 M2 Z1B.srt", - "summary_path": "summaries/Audio Master 2025 M2 Z1B_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 216209835 - }, - { - "title": "Master 2025 Modulul 2 
- Ziua 1 - partea 3", - "original_filename": "Audio Master 2025 M2 Z1C.mp3", - "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M2 Z1C.mp3", - "audio_path": "audio\\Audio Master 2025 M2 Z1C.mp3", - "transcript_path": "transcripts/Audio Master 2025 M2 Z1C.txt", - "srt_path": "transcripts/Audio Master 2025 M2 Z1C.srt", - "summary_path": "summaries/Audio Master 2025 M2 Z1C_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 193398314 - }, - { - "title": "Master 2025 Modulul 2 - Ziua 1 - partea 4", - "original_filename": "Audio Master 2025 M2 Z1D.mp3", - "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M2 Z1D.mp3", - "audio_path": "audio\\Audio Master 2025 M2 Z1D.mp3", - "transcript_path": "transcripts/Audio Master 2025 M2 Z1D.txt", - "srt_path": "transcripts/Audio Master 2025 M2 Z1D.srt", - "summary_path": "summaries/Audio Master 2025 M2 Z1D_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 208041363 - }, - { - "title": "Master 2025 Modulul 2 - Ziua 2 - partea 1", - "original_filename": "Audio Master 2025 M2 Z2A.mp3", - "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M2 Z2A.mp3", - "audio_path": "audio\\Audio Master 2025 M2 Z2A.mp3", - "transcript_path": "transcripts/Audio Master 2025 M2 Z2A.txt", - "srt_path": "transcripts/Audio Master 2025 M2 Z2A.srt", - "summary_path": "summaries/Audio Master 2025 M2 Z2A_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 211064486 - }, - { - "title": "Master 2025 Modulul 2 - Ziua 2 - partea 2", - "original_filename": "Audio Master 2025 M2 Z2B.mp3", - "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M2 Z2B.mp3", - "audio_path": "audio\\Audio Master 2025 M2 Z2B.mp3", - "transcript_path": "transcripts/Audio Master 2025 M2 Z2B.txt", - "srt_path": "transcripts/Audio Master 2025 M2 Z2B.srt", - "summary_path": "summaries/Audio Master 2025 M2 
Z2B_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 207999984 - }, - { - "title": "Master 2025 Modulul 2 - Ziua 2 - partea 3", - "original_filename": "Audio Master 2025 M2 Z2C.mp3", - "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M2 Z2C.mp3", - "audio_path": "audio\\Audio Master 2025 M2 Z2C.mp3", - "transcript_path": "transcripts/Audio Master 2025 M2 Z2C.txt", - "srt_path": "transcripts/Audio Master 2025 M2 Z2C.srt", - "summary_path": "summaries/Audio Master 2025 M2 Z2C_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 184772743 - } - ] - }, - { - "name": "Modul 3", - "module_id": "43", - "lectures": [ - { - "title": "Master 2025 Modulul 3 - Ziua 1 - partea 1", - "original_filename": "Audio Master 2025 M3 Z1A.mp3", - "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M3 Z1A.mp3", - "audio_path": "audio\\Audio Master 2025 M3 Z1A.mp3", - "transcript_path": "transcripts/Audio Master 2025 M3 Z1A.txt", - "srt_path": "transcripts/Audio Master 2025 M3 Z1A.srt", - "summary_path": "summaries/Audio Master 2025 M3 Z1A_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 210586002 - }, - { - "title": "Master 2025 Modulul 3 - Ziua 1 - partea 2", - "original_filename": "Audio Master 2025 M3 Z1B.mp3", - "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M3 Z1B.mp3", - "audio_path": "audio\\Audio Master 2025 M3 Z1B.mp3", - "transcript_path": "transcripts/Audio Master 2025 M3 Z1B.txt", - "srt_path": "transcripts/Audio Master 2025 M3 Z1B.srt", - "summary_path": "summaries/Audio Master 2025 M3 Z1B_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 255087574 - }, - { - "title": "Master 2025 Modulul 3 - Ziua 1 - partea 3", - "original_filename": "Audio Master 2025 M3 Z1C.mp3", - "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M3 Z1C.mp3", - 
"audio_path": "audio\\Audio Master 2025 M3 Z1C.mp3", - "transcript_path": "transcripts/Audio Master 2025 M3 Z1C.txt", - "srt_path": "transcripts/Audio Master 2025 M3 Z1C.srt", - "summary_path": "summaries/Audio Master 2025 M3 Z1C_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 205669023 - }, - { - "title": "Master 2025 Modulul 3 - Ziua 1 - partea 4", - "original_filename": "Audio Master 2025 M3 Z1D.mp3", - "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M3 Z1D.mp3", - "audio_path": "audio\\Audio Master 2025 M3 Z1D.mp3", - "transcript_path": "transcripts/Audio Master 2025 M3 Z1D.txt", - "srt_path": "transcripts/Audio Master 2025 M3 Z1D.srt", - "summary_path": "summaries/Audio Master 2025 M3 Z1D_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 103415954 - }, - { - "title": "Master 2025 Modulul 3 - Ziua 2 - partea 1", - "original_filename": "Audio Master 2025 M3 Z2A.mp3", - "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M3 Z2A.mp3", - "audio_path": "audio\\Audio Master 2025 M3 Z2A.mp3", - "transcript_path": "transcripts/Audio Master 2025 M3 Z2A.txt", - "srt_path": "transcripts/Audio Master 2025 M3 Z2A.srt", - "summary_path": "summaries/Audio Master 2025 M3 Z2A_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 208181205 - }, - { - "title": "Master 2025 Modulul 3 - Ziua 2 - partea 2", - "original_filename": "Audio Master 2025 M3 Z2B.mp3", - "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M3 Z2B.mp3", - "audio_path": "audio\\Audio Master 2025 M3 Z2B.mp3", - "transcript_path": "transcripts/Audio Master 2025 M3 Z2B.txt", - "srt_path": "transcripts/Audio Master 2025 M3 Z2B.srt", - "summary_path": "summaries/Audio Master 2025 M3 Z2B_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 237693327 - }, - { - "title": "Master 2025 Modulul 
3 - Ziua 2 - partea 3", - "original_filename": "Audio Master 2025 M3 Z2C.mp3", - "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M3 Z2C.mp3", - "audio_path": "audio\\Audio Master 2025 M3 Z2C.mp3", - "transcript_path": "transcripts/Audio Master 2025 M3 Z2C.txt", - "srt_path": "transcripts/Audio Master 2025 M3 Z2C.srt", - "summary_path": "summaries/Audio Master 2025 M3 Z2C_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 197203071 - }, - { - "title": "Master 2025 Modulul 3 - Ziua 2 - partea 4", - "original_filename": "Audio Master 2025 M3 Z2D.mp3", - "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M3 Z2D.mp3", - "audio_path": "audio\\Audio Master 2025 M3 Z2D.mp3", - "transcript_path": "transcripts/Audio Master 2025 M3 Z2D.txt", - "srt_path": "transcripts/Audio Master 2025 M3 Z2D.srt", - "summary_path": "summaries/Audio Master 2025 M3 Z2D_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 120281751 - }, - { - "title": "Master 2025 Modulul 3 - Ziua 3 - partea 1", - "original_filename": "Audio Master 2025 M3 Z3A.mp3", - "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M3 Z3A.mp3", - "audio_path": "audio\\Audio Master 2025 M3 Z3A.mp3", - "transcript_path": "transcripts/Audio Master 2025 M3 Z3A.txt", - "srt_path": "transcripts/Audio Master 2025 M3 Z3A.srt", - "summary_path": "summaries/Audio Master 2025 M3 Z3A_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 203818014 - }, - { - "title": "Master 2025 Modulul 3 - Ziua 3 - partea 2", - "original_filename": "Audio Master 2025 M3 Z3B.mp3", - "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M3 Z3B.mp3", - "audio_path": "audio\\Audio Master 2025 M3 Z3B.mp3", - "transcript_path": "transcripts/Audio Master 2025 M3 Z3B.txt", - "srt_path": "transcripts/Audio Master 2025 M3 Z3B.srt", - "summary_path": "summaries/Audio Master 2025 M3 
Z3B_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 141426812 - }, - { - "title": "Master 2025 Modulul 3 - Ziua 3 - partea 3", - "original_filename": "Audio Master 2025 M3 Z3C.mp3", - "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M3 Z3C.mp3", - "audio_path": "audio\\Audio Master 2025 M3 Z3C.mp3", - "transcript_path": "transcripts/Audio Master 2025 M3 Z3C.txt", - "srt_path": "transcripts/Audio Master 2025 M3 Z3C.srt", - "summary_path": "summaries/Audio Master 2025 M3 Z3C_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 252042855 - } - ] - }, - { - "name": "Modul 4", - "module_id": "44", - "lectures": [ - { - "title": "Master 2025 Modulul 4 - Ziua 1 - partea 1", - "original_filename": "Master 2025 M4 Z1A -Audio-.mp3", - "url": "https://cursuri.aresens.ro/resurse/Master 2025 M4 Z1A -Audio-.mp3", - "audio_path": "audio\\Master 2025 M4 Z1A -Audio-.mp3", - "transcript_path": "transcripts/Master 2025 M4 Z1A -Audio-.txt", - "srt_path": "transcripts/Master 2025 M4 Z1A -Audio-.srt", - "summary_path": "summaries/Master 2025 M4 Z1A -Audio-_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 248913052 - }, - { - "title": "Master 2025 Modulul 4 - Ziua 1 - partea 2", - "original_filename": "Master 2025 M4 Z1B -Audio-.mp3", - "url": "https://cursuri.aresens.ro/resurse/Master 2025 M4 Z1B -Audio-.mp3", - "audio_path": "audio\\Master 2025 M4 Z1B -Audio-.mp3", - "transcript_path": "transcripts/Master 2025 M4 Z1B -Audio-.txt", - "srt_path": "transcripts/Master 2025 M4 Z1B -Audio-.srt", - "summary_path": "summaries/Master 2025 M4 Z1B -Audio-_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 210927678 - }, - { - "title": "Master 2025 Modulul 4 - Ziua 1 - partea 3", - "original_filename": "Master 2025 M4 Z1C -Audio-.mp3", - "url": "https://cursuri.aresens.ro/resurse/Master 
2025 M4 Z1C -Audio-.mp3", - "audio_path": "audio\\Master 2025 M4 Z1C -Audio-.mp3", - "transcript_path": "transcripts/Master 2025 M4 Z1C -Audio-.txt", - "srt_path": "transcripts/Master 2025 M4 Z1C -Audio-.srt", - "summary_path": "summaries/Master 2025 M4 Z1C -Audio-_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 227319425 - }, - { - "title": "Master 2025 Modulul 4 - Ziua 1 - partea 4", - "original_filename": "Master 2025 M4 Z1D -Audio-.mp3", - "url": "https://cursuri.aresens.ro/resurse/Master 2025 M4 Z1D -Audio-.mp3", - "audio_path": "audio\\Master 2025 M4 Z1D -Audio-.mp3", - "transcript_path": "transcripts/Master 2025 M4 Z1D -Audio-.txt", - "srt_path": "transcripts/Master 2025 M4 Z1D -Audio-.srt", - "summary_path": "summaries/Master 2025 M4 Z1D -Audio-_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 183335625 - }, - { - "title": "Master 2025 Modulul 4 - Ziua 2 - partea 1", - "original_filename": "Master 2025 M4 Z2A -Audio-.mp3", - "url": "https://cursuri.aresens.ro/resurse/Master 2025 M4 Z2A -Audio-.mp3", - "audio_path": "audio\\Master 2025 M4 Z2A -Audio-.mp3", - "transcript_path": "transcripts/Master 2025 M4 Z2A -Audio-.txt", - "srt_path": "transcripts/Master 2025 M4 Z2A -Audio-.srt", - "summary_path": "summaries/Master 2025 M4 Z2A -Audio-_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 220160622 - }, - { - "title": "Master 2025 Modulul 4 - Ziua 2 - partea 2", - "original_filename": "Master 2025 M4 Z2B -Audio-.mp3", - "url": "https://cursuri.aresens.ro/resurse/Master 2025 M4 Z2B -Audio-.mp3", - "audio_path": "audio\\Master 2025 M4 Z2B -Audio-.mp3", - "transcript_path": "transcripts/Master 2025 M4 Z2B -Audio-.txt", - "srt_path": "transcripts/Master 2025 M4 Z2B -Audio-.srt", - "summary_path": "summaries/Master 2025 M4 Z2B -Audio-_summary.md", - "download_status": "complete", - "transcribe_status": "pending", 
- "file_size_bytes": 172078409 - }, - { - "title": "Master 2025 Modulul 4 - Ziua 2 - partea 3", - "original_filename": "Master 2025 M4 Z2C -Audio-.mp3", - "url": "https://cursuri.aresens.ro/resurse/Master 2025 M4 Z2C -Audio-.mp3", - "audio_path": "audio\\Master 2025 M4 Z2C -Audio-.mp3", - "transcript_path": "transcripts/Master 2025 M4 Z2C -Audio-.txt", - "srt_path": "transcripts/Master 2025 M4 Z2C -Audio-.srt", - "summary_path": "summaries/Master 2025 M4 Z2C -Audio-_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 195028889 - } - ] - }, - { - "name": "Modul 5", - "module_id": "45", - "lectures": [ - { - "title": "Master 2025 Modulul 5 - Ziua 1 - partea 1", - "original_filename": "Master25 M5 4K Z1A -Audio-.mp3", - "url": "https://cursuri.aresens.ro/resurse/Master25 M5 4K Z1A -Audio-.mp3", - "audio_path": "audio\\Master25 M5 4K Z1A -Audio-.mp3", - "transcript_path": "transcripts/Master25 M5 4K Z1A -Audio-.txt", - "srt_path": "transcripts/Master25 M5 4K Z1A -Audio-.srt", - "summary_path": "summaries/Master25 M5 4K Z1A -Audio-_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 181683315 - }, - { - "title": "Master 2025 Modulul 5 - Ziua 1 - partea 2", - "original_filename": "Master25 M5 4K Z1B -Audio-.mp3", - "url": "https://cursuri.aresens.ro/resurse/Master25 M5 4K Z1B -Audio-.mp3", - "audio_path": "audio\\Master25 M5 4K Z1B -Audio-.mp3", - "transcript_path": "transcripts/Master25 M5 4K Z1B -Audio-.txt", - "srt_path": "transcripts/Master25 M5 4K Z1B -Audio-.srt", - "summary_path": "summaries/Master25 M5 4K Z1B -Audio-_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 215257187 - }, - { - "title": "Master 2025 Modulul 5 - Ziua 1 - partea 3", - "original_filename": "Master25 M5 4K Z1C -Audio-.mp3", - "url": "https://cursuri.aresens.ro/resurse/Master25 M5 4K Z1C -Audio-.mp3", - "audio_path": "audio\\Master25 M5 4K Z1C 
-Audio-.mp3", - "transcript_path": "transcripts/Master25 M5 4K Z1C -Audio-.txt", - "srt_path": "transcripts/Master25 M5 4K Z1C -Audio-.srt", - "summary_path": "summaries/Master25 M5 4K Z1C -Audio-_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 233139107 - }, - { - "title": "Master 2025 Modulul 5 - Ziua 1 - partea 4", - "original_filename": "Master25 M5 4K Z1D -Audio-.mp3", - "url": "https://cursuri.aresens.ro/resurse/Master25 M5 4K Z1D -Audio-.mp3", - "audio_path": "audio\\Master25 M5 4K Z1D -Audio-.mp3", - "transcript_path": "transcripts/Master25 M5 4K Z1D -Audio-.txt", - "srt_path": "transcripts/Master25 M5 4K Z1D -Audio-.srt", - "summary_path": "summaries/Master25 M5 4K Z1D -Audio-_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 268886625 - }, - { - "title": "Master 2025 Modulul 5 - Ziua 2 - partea 1", - "original_filename": "Master25 M5 4K Z2A -Audio-.mp3", - "url": "https://cursuri.aresens.ro/resurse/Master25 M5 4K Z2A -Audio-.mp3", - "audio_path": "audio\\Master25 M5 4K Z2A -Audio-.mp3", - "transcript_path": "transcripts/Master25 M5 4K Z2A -Audio-.txt", - "srt_path": "transcripts/Master25 M5 4K Z2A -Audio-.srt", - "summary_path": "summaries/Master25 M5 4K Z2A -Audio-_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 131424916 - }, - { - "title": "Master 2025 Modulul 5 - Ziua 3 - partea 1", - "original_filename": "Master25 M5 4K Z3A -Audio-.mp3", - "url": "https://cursuri.aresens.ro/resurse/Master25 M5 4K Z3A -Audio-.mp3", - "audio_path": "audio\\Master25 M5 4K Z3A -Audio-.mp3", - "transcript_path": "transcripts/Master25 M5 4K Z3A -Audio-.txt", - "srt_path": "transcripts/Master25 M5 4K Z3A -Audio-.srt", - "summary_path": "summaries/Master25 M5 4K Z3A -Audio-_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 212945322 - }, - { - "title": "Master 2025 
Modulul 5 - Ziua 3 - partea 2", - "original_filename": "Master25 M5 4K Z3B -Audio-.mp3", - "url": "https://cursuri.aresens.ro/resurse/Master25 M5 4K Z3B -Audio-.mp3", - "audio_path": "audio\\Master25 M5 4K Z3B -Audio-.mp3", - "transcript_path": "transcripts/Master25 M5 4K Z3B -Audio-.txt", - "srt_path": "transcripts/Master25 M5 4K Z3B -Audio-.srt", - "summary_path": "summaries/Master25 M5 4K Z3B -Audio-_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 226194511 - }, - { - "title": "Master 2025 Modulul 5 - Ziua 3 - partea 3", - "original_filename": "Master25 M5 4K Z3C -Audio-.mp3", - "url": "https://cursuri.aresens.ro/resurse/Master25 M5 4K Z3C -Audio-.mp3", - "audio_path": "audio\\Master25 M5 4K Z3C -Audio-.mp3", - "transcript_path": "transcripts/Master25 M5 4K Z3C -Audio-.txt", - "srt_path": "transcripts/Master25 M5 4K Z3C -Audio-.srt", - "summary_path": "summaries/Master25 M5 4K Z3C -Audio-_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 289911605 - }, - { - "title": "Master 2025 Modulul 5 - Ziua 3 - partea 4", - "original_filename": "Master25 M5 4K Z3D -Audio-.mp3", - "url": "https://cursuri.aresens.ro/resurse/Master25 M5 4K Z3D -Audio-.mp3", - "audio_path": "audio\\Master25 M5 4K Z3D -Audio-.mp3", - "transcript_path": "transcripts/Master25 M5 4K Z3D -Audio-.txt", - "srt_path": "transcripts/Master25 M5 4K Z3D -Audio-.srt", - "summary_path": "summaries/Master25 M5 4K Z3D -Audio-_summary.md", - "download_status": "complete", - "transcribe_status": "pending", - "file_size_bytes": 222780725 - } - ] - }, - { - "name": "Modul 6", - "module_id": "46", - "lectures": [] - } - ] +{ + "course": "NLP Master Practitioner Bucuresti 2025", + "source_url": "https://cursuri.aresens.ro/curs/26", + "modules": [ + { + "name": "Modul 1", + "module_id": "41", + "lectures": [ + { + "title": "Master 2025 Modulul 1 - Ziua 1 - partea 1", + "original_filename": "Master 25M1 Z1A 
[Audio].mp3", + "url": "https://cursuri.aresens.ro/resurse/Master 25M1 Z1A [Audio].mp3", + "audio_path": "audio\\Master 25M1 Z1A [Audio].mp3", + "transcript_path": "transcripts/Master 25M1 Z1A.txt", + "srt_path": "transcripts/Master 25M1 Z1A.srt", + "summary_path": "summaries/Master 25M1 Z1A_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 228486429 + }, + { + "title": "Master 2025 Modulul 1 - Ziua 1 - partea 2", + "original_filename": "Master 25M1 Z1B [Audio].mp3", + "url": "https://cursuri.aresens.ro/resurse/Master 25M1 Z1B [Audio].mp3", + "audio_path": "audio\\Master 25M1 Z1B [Audio].mp3", + "transcript_path": "transcripts/Master 25M1 Z1B.txt", + "srt_path": "transcripts/Master 25M1 Z1B.srt", + "summary_path": "summaries/Master 25M1 Z1B_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 237397902 + }, + { + "title": "Master 2025 Modulul 1 - Ziua 1 - partea 3", + "original_filename": "Master 25M1 Z1C [Audio].mp3", + "url": "https://cursuri.aresens.ro/resurse/Master 25M1 Z1C [Audio].mp3", + "audio_path": "audio\\Master 25M1 Z1C [Audio].mp3", + "transcript_path": "transcripts/Master 25M1 Z1C.txt", + "srt_path": "transcripts/Master 25M1 Z1C.srt", + "summary_path": "summaries/Master 25M1 Z1C_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 235260881 + }, + { + "title": "Master 2025 Modulul 1 - Ziua 1 - partea 4", + "original_filename": "Master 25M1 Z1D [Audio].mp3", + "url": "https://cursuri.aresens.ro/resurse/Master 25M1 Z1D [Audio].mp3", + "audio_path": "audio\\Master 25M1 Z1D [Audio].mp3", + "transcript_path": "transcripts/Master 25M1 Z1D.txt", + "srt_path": "transcripts/Master 25M1 Z1D.srt", + "summary_path": "summaries/Master 25M1 Z1D_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 194361634 + }, + { + "title": "Master 2025 Modulul 1 Ziua 2 - partea 1", + 
"original_filename": "Master 25M1 Z2A [Audio].mp3", + "url": "https://cursuri.aresens.ro/resurse/Master 25M1 Z2A [Audio].mp3", + "audio_path": "audio\\Master 25M1 Z2A [Audio].mp3", + "transcript_path": "transcripts/Master 25M1 Z2A.txt", + "srt_path": "transcripts/Master 25M1 Z2A.srt", + "summary_path": "summaries/Master 25M1 Z2A_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 220578693 + }, + { + "title": "Master 2025 Modulul 1 - Ziua 2 - partea 2", + "original_filename": "Master 25M1 Z2B [Audio].mp3", + "url": "https://cursuri.aresens.ro/resurse/Master 25M1 Z2B [Audio].mp3", + "audio_path": "audio\\Master 25M1 Z2B [Audio].mp3", + "transcript_path": "transcripts/Master 25M1 Z2B.txt", + "srt_path": "transcripts/Master 25M1 Z2B.srt", + "summary_path": "summaries/Master 25M1 Z2B_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 226529515 + }, + { + "title": "Master 2025 Modulul 1 - Ziua 2 - partea 3", + "original_filename": "Master 25M1 Z2C [Audio].mp3", + "url": "https://cursuri.aresens.ro/resurse/Master 25M1 Z2C [Audio].mp3", + "audio_path": "audio\\Master 25M1 Z2C [Audio].mp3", + "transcript_path": "transcripts/Master 25M1 Z2C.txt", + "srt_path": "transcripts/Master 25M1 Z2C.srt", + "summary_path": "summaries/Master 25M1 Z2C_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 257356313 + } + ] + }, + { + "name": "Modul 2", + "module_id": "42", + "lectures": [ + { + "title": "Master 2025 Modulul 2 - Ziua 1 - partea 1", + "original_filename": "Audio Master 2025 M2 Z1A.mp3", + "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M2 Z1A.mp3", + "audio_path": "audio\\Audio Master 2025 M2 Z1A.mp3", + "transcript_path": "transcripts/Audio Master 2025 M2 Z1A.txt", + "srt_path": "transcripts/Audio Master 2025 M2 Z1A.srt", + "summary_path": "summaries/Audio Master 2025 M2 Z1A_summary.md", + "download_status": 
"complete", + "transcribe_status": "pending", + "file_size_bytes": 258054778 + }, + { + "title": "Master 2025 Modulul 2 - Ziua 1 - partea 2", + "original_filename": "Audio Master 2025 M2 Z1B.mp3", + "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M2 Z1B.mp3", + "audio_path": "audio\\Audio Master 2025 M2 Z1B.mp3", + "transcript_path": "transcripts/Audio Master 2025 M2 Z1B.txt", + "srt_path": "transcripts/Audio Master 2025 M2 Z1B.srt", + "summary_path": "summaries/Audio Master 2025 M2 Z1B_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 216209835 + }, + { + "title": "Master 2025 Modulul 2 - Ziua 1 - partea 3", + "original_filename": "Audio Master 2025 M2 Z1C.mp3", + "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M2 Z1C.mp3", + "audio_path": "audio\\Audio Master 2025 M2 Z1C.mp3", + "transcript_path": "transcripts/Audio Master 2025 M2 Z1C.txt", + "srt_path": "transcripts/Audio Master 2025 M2 Z1C.srt", + "summary_path": "summaries/Audio Master 2025 M2 Z1C_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 193398314 + }, + { + "title": "Master 2025 Modulul 2 - Ziua 1 - partea 4", + "original_filename": "Audio Master 2025 M2 Z1D.mp3", + "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M2 Z1D.mp3", + "audio_path": "audio\\Audio Master 2025 M2 Z1D.mp3", + "transcript_path": "transcripts/Audio Master 2025 M2 Z1D.txt", + "srt_path": "transcripts/Audio Master 2025 M2 Z1D.srt", + "summary_path": "summaries/Audio Master 2025 M2 Z1D_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 208041363 + }, + { + "title": "Master 2025 Modulul 2 - Ziua 2 - partea 1", + "original_filename": "Audio Master 2025 M2 Z2A.mp3", + "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M2 Z2A.mp3", + "audio_path": "audio\\Audio Master 2025 M2 Z2A.mp3", + "transcript_path": "transcripts/Audio Master 2025 M2 
Z2A.txt", + "srt_path": "transcripts/Audio Master 2025 M2 Z2A.srt", + "summary_path": "summaries/Audio Master 2025 M2 Z2A_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 211064486 + }, + { + "title": "Master 2025 Modulul 2 - Ziua 2 - partea 2", + "original_filename": "Audio Master 2025 M2 Z2B.mp3", + "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M2 Z2B.mp3", + "audio_path": "audio\\Audio Master 2025 M2 Z2B.mp3", + "transcript_path": "transcripts/Audio Master 2025 M2 Z2B.txt", + "srt_path": "transcripts/Audio Master 2025 M2 Z2B.srt", + "summary_path": "summaries/Audio Master 2025 M2 Z2B_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 207999984 + }, + { + "title": "Master 2025 Modulul 2 - Ziua 2 - partea 3", + "original_filename": "Audio Master 2025 M2 Z2C.mp3", + "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M2 Z2C.mp3", + "audio_path": "audio\\Audio Master 2025 M2 Z2C.mp3", + "transcript_path": "transcripts/Audio Master 2025 M2 Z2C.txt", + "srt_path": "transcripts/Audio Master 2025 M2 Z2C.srt", + "summary_path": "summaries/Audio Master 2025 M2 Z2C_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 184772743 + } + ] + }, + { + "name": "Modul 3", + "module_id": "43", + "lectures": [ + { + "title": "Master 2025 Modulul 3 - Ziua 1 - partea 1", + "original_filename": "Audio Master 2025 M3 Z1A.mp3", + "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M3 Z1A.mp3", + "audio_path": "audio\\Audio Master 2025 M3 Z1A.mp3", + "transcript_path": "transcripts/Audio Master 2025 M3 Z1A.txt", + "srt_path": "transcripts/Audio Master 2025 M3 Z1A.srt", + "summary_path": "summaries/Audio Master 2025 M3 Z1A_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 210586002 + }, + { + "title": "Master 2025 Modulul 3 - Ziua 1 - partea 2", + 
"original_filename": "Audio Master 2025 M3 Z1B.mp3", + "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M3 Z1B.mp3", + "audio_path": "audio\\Audio Master 2025 M3 Z1B.mp3", + "transcript_path": "transcripts/Audio Master 2025 M3 Z1B.txt", + "srt_path": "transcripts/Audio Master 2025 M3 Z1B.srt", + "summary_path": "summaries/Audio Master 2025 M3 Z1B_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 255087574 + }, + { + "title": "Master 2025 Modulul 3 - Ziua 1 - partea 3", + "original_filename": "Audio Master 2025 M3 Z1C.mp3", + "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M3 Z1C.mp3", + "audio_path": "audio\\Audio Master 2025 M3 Z1C.mp3", + "transcript_path": "transcripts/Audio Master 2025 M3 Z1C.txt", + "srt_path": "transcripts/Audio Master 2025 M3 Z1C.srt", + "summary_path": "summaries/Audio Master 2025 M3 Z1C_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 205669023 + }, + { + "title": "Master 2025 Modulul 3 - Ziua 1 - partea 4", + "original_filename": "Audio Master 2025 M3 Z1D.mp3", + "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M3 Z1D.mp3", + "audio_path": "audio\\Audio Master 2025 M3 Z1D.mp3", + "transcript_path": "transcripts/Audio Master 2025 M3 Z1D.txt", + "srt_path": "transcripts/Audio Master 2025 M3 Z1D.srt", + "summary_path": "summaries/Audio Master 2025 M3 Z1D_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 103415954 + }, + { + "title": "Master 2025 Modulul 3 - Ziua 2 - partea 1", + "original_filename": "Audio Master 2025 M3 Z2A.mp3", + "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M3 Z2A.mp3", + "audio_path": "audio\\Audio Master 2025 M3 Z2A.mp3", + "transcript_path": "transcripts/Audio Master 2025 M3 Z2A.txt", + "srt_path": "transcripts/Audio Master 2025 M3 Z2A.srt", + "summary_path": "summaries/Audio Master 2025 M3 Z2A_summary.md", + 
"download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 208181205 + }, + { + "title": "Master 2025 Modulul 3 - Ziua 2 - partea 2", + "original_filename": "Audio Master 2025 M3 Z2B.mp3", + "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M3 Z2B.mp3", + "audio_path": "audio\\Audio Master 2025 M3 Z2B.mp3", + "transcript_path": "transcripts/Audio Master 2025 M3 Z2B.txt", + "srt_path": "transcripts/Audio Master 2025 M3 Z2B.srt", + "summary_path": "summaries/Audio Master 2025 M3 Z2B_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 237693327 + }, + { + "title": "Master 2025 Modulul 3 - Ziua 2 - partea 3", + "original_filename": "Audio Master 2025 M3 Z2C.mp3", + "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M3 Z2C.mp3", + "audio_path": "audio\\Audio Master 2025 M3 Z2C.mp3", + "transcript_path": "transcripts/Audio Master 2025 M3 Z2C.txt", + "srt_path": "transcripts/Audio Master 2025 M3 Z2C.srt", + "summary_path": "summaries/Audio Master 2025 M3 Z2C_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 197203071 + }, + { + "title": "Master 2025 Modulul 3 - Ziua 2 - partea 4", + "original_filename": "Audio Master 2025 M3 Z2D.mp3", + "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M3 Z2D.mp3", + "audio_path": "audio\\Audio Master 2025 M3 Z2D.mp3", + "transcript_path": "transcripts/Audio Master 2025 M3 Z2D.txt", + "srt_path": "transcripts/Audio Master 2025 M3 Z2D.srt", + "summary_path": "summaries/Audio Master 2025 M3 Z2D_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 120281751 + }, + { + "title": "Master 2025 Modulul 3 - Ziua 3 - partea 1", + "original_filename": "Audio Master 2025 M3 Z3A.mp3", + "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M3 Z3A.mp3", + "audio_path": "audio\\Audio Master 2025 M3 Z3A.mp3", + "transcript_path": 
"transcripts/Audio Master 2025 M3 Z3A.txt", + "srt_path": "transcripts/Audio Master 2025 M3 Z3A.srt", + "summary_path": "summaries/Audio Master 2025 M3 Z3A_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 203818014 + }, + { + "title": "Master 2025 Modulul 3 - Ziua 3 - partea 2", + "original_filename": "Audio Master 2025 M3 Z3B.mp3", + "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M3 Z3B.mp3", + "audio_path": "audio\\Audio Master 2025 M3 Z3B.mp3", + "transcript_path": "transcripts/Audio Master 2025 M3 Z3B.txt", + "srt_path": "transcripts/Audio Master 2025 M3 Z3B.srt", + "summary_path": "summaries/Audio Master 2025 M3 Z3B_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 141426812 + }, + { + "title": "Master 2025 Modulul 3 - Ziua 3 - partea 3", + "original_filename": "Audio Master 2025 M3 Z3C.mp3", + "url": "https://cursuri.aresens.ro/resurse/Audio Master 2025 M3 Z3C.mp3", + "audio_path": "audio\\Audio Master 2025 M3 Z3C.mp3", + "transcript_path": "transcripts/Audio Master 2025 M3 Z3C.txt", + "srt_path": "transcripts/Audio Master 2025 M3 Z3C.srt", + "summary_path": "summaries/Audio Master 2025 M3 Z3C_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 252042855 + } + ] + }, + { + "name": "Modul 4", + "module_id": "44", + "lectures": [ + { + "title": "Master 2025 Modulul 4 - Ziua 1 - partea 1", + "original_filename": "Master 2025 M4 Z1A -Audio-.mp3", + "url": "https://cursuri.aresens.ro/resurse/Master 2025 M4 Z1A -Audio-.mp3", + "audio_path": "audio\\Master 2025 M4 Z1A -Audio-.mp3", + "transcript_path": "transcripts/Master 2025 M4 Z1A -Audio-.txt", + "srt_path": "transcripts/Master 2025 M4 Z1A -Audio-.srt", + "summary_path": "summaries/Master 2025 M4 Z1A -Audio-_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 248913052 + }, + { + "title": "Master 2025 
Modulul 4 - Ziua 1 - partea 2", + "original_filename": "Master 2025 M4 Z1B -Audio-.mp3", + "url": "https://cursuri.aresens.ro/resurse/Master 2025 M4 Z1B -Audio-.mp3", + "audio_path": "audio\\Master 2025 M4 Z1B -Audio-.mp3", + "transcript_path": "transcripts/Master 2025 M4 Z1B -Audio-.txt", + "srt_path": "transcripts/Master 2025 M4 Z1B -Audio-.srt", + "summary_path": "summaries/Master 2025 M4 Z1B -Audio-_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 210927678 + }, + { + "title": "Master 2025 Modulul 4 - Ziua 1 - partea 3", + "original_filename": "Master 2025 M4 Z1C -Audio-.mp3", + "url": "https://cursuri.aresens.ro/resurse/Master 2025 M4 Z1C -Audio-.mp3", + "audio_path": "audio\\Master 2025 M4 Z1C -Audio-.mp3", + "transcript_path": "transcripts/Master 2025 M4 Z1C -Audio-.txt", + "srt_path": "transcripts/Master 2025 M4 Z1C -Audio-.srt", + "summary_path": "summaries/Master 2025 M4 Z1C -Audio-_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 227319425 + }, + { + "title": "Master 2025 Modulul 4 - Ziua 1 - partea 4", + "original_filename": "Master 2025 M4 Z1D -Audio-.mp3", + "url": "https://cursuri.aresens.ro/resurse/Master 2025 M4 Z1D -Audio-.mp3", + "audio_path": "audio\\Master 2025 M4 Z1D -Audio-.mp3", + "transcript_path": "transcripts/Master 2025 M4 Z1D -Audio-.txt", + "srt_path": "transcripts/Master 2025 M4 Z1D -Audio-.srt", + "summary_path": "summaries/Master 2025 M4 Z1D -Audio-_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 183335625 + }, + { + "title": "Master 2025 Modulul 4 - Ziua 2 - partea 1", + "original_filename": "Master 2025 M4 Z2A -Audio-.mp3", + "url": "https://cursuri.aresens.ro/resurse/Master 2025 M4 Z2A -Audio-.mp3", + "audio_path": "audio\\Master 2025 M4 Z2A -Audio-.mp3", + "transcript_path": "transcripts/Master 2025 M4 Z2A -Audio-.txt", + "srt_path": "transcripts/Master 2025 M4 Z2A 
-Audio-.srt", + "summary_path": "summaries/Master 2025 M4 Z2A -Audio-_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 220160622 + }, + { + "title": "Master 2025 Modulul 4 - Ziua 2 - partea 2", + "original_filename": "Master 2025 M4 Z2B -Audio-.mp3", + "url": "https://cursuri.aresens.ro/resurse/Master 2025 M4 Z2B -Audio-.mp3", + "audio_path": "audio\\Master 2025 M4 Z2B -Audio-.mp3", + "transcript_path": "transcripts/Master 2025 M4 Z2B -Audio-.txt", + "srt_path": "transcripts/Master 2025 M4 Z2B -Audio-.srt", + "summary_path": "summaries/Master 2025 M4 Z2B -Audio-_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 172078409 + }, + { + "title": "Master 2025 Modulul 4 - Ziua 2 - partea 3", + "original_filename": "Master 2025 M4 Z2C -Audio-.mp3", + "url": "https://cursuri.aresens.ro/resurse/Master 2025 M4 Z2C -Audio-.mp3", + "audio_path": "audio\\Master 2025 M4 Z2C -Audio-.mp3", + "transcript_path": "transcripts/Master 2025 M4 Z2C -Audio-.txt", + "srt_path": "transcripts/Master 2025 M4 Z2C -Audio-.srt", + "summary_path": "summaries/Master 2025 M4 Z2C -Audio-_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 195028889 + } + ] + }, + { + "name": "Modul 5", + "module_id": "45", + "lectures": [ + { + "title": "Master 2025 Modulul 5 - Ziua 1 - partea 1", + "original_filename": "Master25 M5 4K Z1A -Audio-.mp3", + "url": "https://cursuri.aresens.ro/resurse/Master25 M5 4K Z1A -Audio-.mp3", + "audio_path": "audio\\Master25 M5 4K Z1A -Audio-.mp3", + "transcript_path": "transcripts/Master25 M5 4K Z1A -Audio-.txt", + "srt_path": "transcripts/Master25 M5 4K Z1A -Audio-.srt", + "summary_path": "summaries/Master25 M5 4K Z1A -Audio-_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 181683315 + }, + { + "title": "Master 2025 Modulul 5 - Ziua 1 - partea 2", + "original_filename": 
"Master25 M5 4K Z1B -Audio-.mp3", + "url": "https://cursuri.aresens.ro/resurse/Master25 M5 4K Z1B -Audio-.mp3", + "audio_path": "audio\\Master25 M5 4K Z1B -Audio-.mp3", + "transcript_path": "transcripts/Master25 M5 4K Z1B -Audio-.txt", + "srt_path": "transcripts/Master25 M5 4K Z1B -Audio-.srt", + "summary_path": "summaries/Master25 M5 4K Z1B -Audio-_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 215257187 + }, + { + "title": "Master 2025 Modulul 5 - Ziua 1 - partea 3", + "original_filename": "Master25 M5 4K Z1C -Audio-.mp3", + "url": "https://cursuri.aresens.ro/resurse/Master25 M5 4K Z1C -Audio-.mp3", + "audio_path": "audio\\Master25 M5 4K Z1C -Audio-.mp3", + "transcript_path": "transcripts/Master25 M5 4K Z1C -Audio-.txt", + "srt_path": "transcripts/Master25 M5 4K Z1C -Audio-.srt", + "summary_path": "summaries/Master25 M5 4K Z1C -Audio-_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 233139107 + }, + { + "title": "Master 2025 Modulul 5 - Ziua 1 - partea 4", + "original_filename": "Master25 M5 4K Z1D -Audio-.mp3", + "url": "https://cursuri.aresens.ro/resurse/Master25 M5 4K Z1D -Audio-.mp3", + "audio_path": "audio\\Master25 M5 4K Z1D -Audio-.mp3", + "transcript_path": "transcripts/Master25 M5 4K Z1D -Audio-.txt", + "srt_path": "transcripts/Master25 M5 4K Z1D -Audio-.srt", + "summary_path": "summaries/Master25 M5 4K Z1D -Audio-_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 268886625 + }, + { + "title": "Master 2025 Modulul 5 - Ziua 2 - partea 1", + "original_filename": "Master25 M5 4K Z2A -Audio-.mp3", + "url": "https://cursuri.aresens.ro/resurse/Master25 M5 4K Z2A -Audio-.mp3", + "audio_path": "audio\\Master25 M5 4K Z2A -Audio-.mp3", + "transcript_path": "transcripts/Master25 M5 4K Z2A -Audio-.txt", + "srt_path": "transcripts/Master25 M5 4K Z2A -Audio-.srt", + "summary_path": "summaries/Master25 M5 4K Z2A 
-Audio-_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 131424916 + }, + { + "title": "Master 2025 Modulul 5 - Ziua 3 - partea 1", + "original_filename": "Master25 M5 4K Z3A -Audio-.mp3", + "url": "https://cursuri.aresens.ro/resurse/Master25 M5 4K Z3A -Audio-.mp3", + "audio_path": "audio\\Master25 M5 4K Z3A -Audio-.mp3", + "transcript_path": "transcripts/Master25 M5 4K Z3A -Audio-.txt", + "srt_path": "transcripts/Master25 M5 4K Z3A -Audio-.srt", + "summary_path": "summaries/Master25 M5 4K Z3A -Audio-_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 212945322 + }, + { + "title": "Master 2025 Modulul 5 - Ziua 3 - partea 2", + "original_filename": "Master25 M5 4K Z3B -Audio-.mp3", + "url": "https://cursuri.aresens.ro/resurse/Master25 M5 4K Z3B -Audio-.mp3", + "audio_path": "audio\\Master25 M5 4K Z3B -Audio-.mp3", + "transcript_path": "transcripts/Master25 M5 4K Z3B -Audio-.txt", + "srt_path": "transcripts/Master25 M5 4K Z3B -Audio-.srt", + "summary_path": "summaries/Master25 M5 4K Z3B -Audio-_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 226194511 + }, + { + "title": "Master 2025 Modulul 5 - Ziua 3 - partea 3", + "original_filename": "Master25 M5 4K Z3C -Audio-.mp3", + "url": "https://cursuri.aresens.ro/resurse/Master25 M5 4K Z3C -Audio-.mp3", + "audio_path": "audio\\Master25 M5 4K Z3C -Audio-.mp3", + "transcript_path": "transcripts/Master25 M5 4K Z3C -Audio-.txt", + "srt_path": "transcripts/Master25 M5 4K Z3C -Audio-.srt", + "summary_path": "summaries/Master25 M5 4K Z3C -Audio-_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 289911605 + }, + { + "title": "Master 2025 Modulul 5 - Ziua 3 - partea 4", + "original_filename": "Master25 M5 4K Z3D -Audio-.mp3", + "url": "https://cursuri.aresens.ro/resurse/Master25 M5 4K Z3D -Audio-.mp3", + "audio_path": 
"audio\\Master25 M5 4K Z3D -Audio-.mp3", + "transcript_path": "transcripts/Master25 M5 4K Z3D -Audio-.txt", + "srt_path": "transcripts/Master25 M5 4K Z3D -Audio-.srt", + "summary_path": "summaries/Master25 M5 4K Z3D -Audio-_summary.md", + "download_status": "complete", + "transcribe_status": "pending", + "file_size_bytes": 222780725 + } + ] + }, + { + "name": "Modul 6", + "module_id": "46", + "lectures": [] + } + ] } \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 97a8e63..f8fbc0e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -requests -beautifulsoup4 -python-dotenv +requests +beautifulsoup4 +python-dotenv diff --git a/run.bat b/run.bat index 6e81de8..6ed5f72 100644 --- a/run.bat +++ b/run.bat @@ -1,313 +1,313 @@ -@echo off -setlocal enabledelayedexpansion -cd /d "%~dp0" - -:: Prevent Vulkan from exhausting VRAM — overflow to system RAM instead of crashing -set "GGML_VK_PREFER_HOST_MEMORY=ON" - -echo ============================================================ -echo NLP Master - Download + Transcribe Pipeline -echo ============================================================ -echo. - -:: ============================================================ -:: PREREQUISITES CHECK -:: ============================================================ -echo Checking prerequisites... -echo. -set "PREREQ_OK=1" -set "NEED_WHISPER=" -set "NEED_MODEL=" - -:: --- Python --- -python --version >nul 2>&1 -if errorlevel 1 ( - echo [X] Python NOT FOUND - echo Install from: https://www.python.org/downloads/ - echo Make sure to check "Add Python to PATH" during install. - echo. - echo Cannot continue without Python. Install it and re-run. - pause - exit /b 1 -) else ( - for /f "tokens=2" %%v in ('python --version 2^>^&1') do echo [OK] Python %%v -) - -:: --- .env credentials --- -if exist ".env" ( - findstr /m "COURSE_USERNAME=." 
".env" >nul 2>&1 - if errorlevel 1 ( - echo [X] .env File exists but COURSE_USERNAME is empty - echo Edit .env and fill in your credentials. - set "PREREQ_OK=" - ) else ( - echo [OK] .env Credentials configured - ) -) else ( - echo [X] .env NOT FOUND - echo Create .env with: - echo COURSE_USERNAME=your_email - echo COURSE_PASSWORD=your_password - set "PREREQ_OK=" -) - -:: --- ffmpeg --- -set "FFMPEG_FOUND=" -set "NEED_FFMPEG=" -where ffmpeg >nul 2>&1 -if not errorlevel 1 ( - set "FFMPEG_FOUND=1" - for /f "delims=" %%p in ('where ffmpeg 2^>nul') do set "FFMPEG_LOCATION=%%p" - echo [OK] ffmpeg !FFMPEG_LOCATION! -) else ( - if exist "ffmpeg.exe" ( - set "FFMPEG_FOUND=1" - echo [OK] ffmpeg .\ffmpeg.exe (local^) - ) else ( - echo [--] ffmpeg Not found - will auto-install - set "NEED_FFMPEG=1" - ) -) - -:: --- whisper-cli.exe --- -set "WHISPER_FOUND=" -set "WHISPER_LOCATION=" -if defined WHISPER_BIN ( - if exist "%WHISPER_BIN%" ( - set "WHISPER_FOUND=1" - set "WHISPER_LOCATION=%WHISPER_BIN% (env var)" - ) -) -if not defined WHISPER_FOUND ( - where whisper-cli.exe >nul 2>&1 - if not errorlevel 1 ( - set "WHISPER_FOUND=1" - for /f "delims=" %%p in ('where whisper-cli.exe 2^>nul') do set "WHISPER_LOCATION=%%p (PATH)" - ) -) -if not defined WHISPER_FOUND ( - if exist "whisper-cli.exe" ( - set "WHISPER_FOUND=1" - set "WHISPER_BIN=whisper-cli.exe" - set "WHISPER_LOCATION=.\whisper-cli.exe (local)" - ) -) -if not defined WHISPER_FOUND ( - if exist "whisper-bin\whisper-cli.exe" ( - set "WHISPER_FOUND=1" - set "WHISPER_BIN=whisper-bin\whisper-cli.exe" - set "WHISPER_LOCATION=whisper-bin\whisper-cli.exe (auto-installed)" - ) -) -if not defined WHISPER_FOUND ( - if exist "whisper.cpp\build\bin\Release\whisper-cli.exe" ( - set "WHISPER_FOUND=1" - set "WHISPER_BIN=whisper.cpp\build\bin\Release\whisper-cli.exe" - set "WHISPER_LOCATION=whisper.cpp\build\... (local build)" - ) -) - -if defined WHISPER_FOUND ( - echo [OK] whisper-cli !WHISPER_LOCATION! 
-) else ( - echo [--] whisper-cli Not found - will auto-download - set "NEED_WHISPER=1" -) - -:: --- Whisper model --- -if not defined WHISPER_MODEL set "WHISPER_MODEL=models\ggml-medium-q5_0.bin" -if exist "%WHISPER_MODEL%" ( - for %%F in ("%WHISPER_MODEL%") do ( - set /a "MODEL_MB=%%~zF / 1048576" - ) - echo [OK] Whisper model %WHISPER_MODEL% (!MODEL_MB! MB^) -) else ( - echo [--] Whisper model Not found - will auto-download (~500 MB^) - set "NEED_MODEL=1" -) - -:: --- Vulkan GPU support --- -set "VULKAN_FOUND=" -where vulkaninfo >nul 2>&1 -if not errorlevel 1 ( - set "VULKAN_FOUND=1" - echo [OK] Vulkan SDK Installed -) else ( - if exist "%VULKAN_SDK%\Bin\vulkaninfo.exe" ( - set "VULKAN_FOUND=1" - echo [OK] Vulkan SDK %VULKAN_SDK% - ) else ( - echo [!!] Vulkan SDK Not detected (whisper.cpp may use CPU fallback^) - echo Install from: https://vulkan.lunarg.com/sdk/home - ) -) - -:: --- Disk space --- -echo. -for /f "tokens=3" %%a in ('dir /-c "%~dp0." 2^>nul ^| findstr /c:"bytes free"') do ( - set /a "FREE_GB=%%a / 1073741824" 2>nul -) -if defined FREE_GB ( - if !FREE_GB! LSS 50 ( - echo [!!] Disk space ~!FREE_GB! GB free (need ~50 GB for all audio + transcripts^) - ) else ( - echo [OK] Disk space ~!FREE_GB! GB free - ) -) - -echo. - -:: --- Stop if .env is broken (can't auto-fix that) --- -if not defined PREREQ_OK ( - echo ============================================================ - echo MISSING PREREQUISITES - fix the [X] items above and re-run. - echo ============================================================ - pause - exit /b 1 -) - -:: ============================================================ -:: AUTO-INSTALL MISSING COMPONENTS -:: ============================================================ -if defined NEED_FFMPEG ( - echo ============================================================ - echo Auto-downloading ffmpeg... - echo ============================================================ - python setup_whisper.py ffmpeg - if errorlevel 1 ( - echo. 
- echo ERROR: Could not install ffmpeg. - echo Download manually from: https://www.gyan.dev/ffmpeg/builds/ - echo Extract ffmpeg.exe to ffmpeg-bin\ and re-run. - pause - exit /b 1 - ) - if exist ".ffmpeg_bin_path" del .ffmpeg_bin_path - echo. -) - -:: Add ffmpeg-bin to PATH if it exists -if exist "ffmpeg-bin\ffmpeg.exe" ( - set "PATH=%~dp0ffmpeg-bin;%PATH%" -) - -if defined NEED_WHISPER ( - echo ============================================================ - echo Auto-downloading whisper.cpp (Vulkan build^)... - echo ============================================================ - python setup_whisper.py whisper - if errorlevel 1 ( - echo. - echo ERROR: Failed to auto-download whisper.cpp. - echo Download manually from: https://github.com/ggml-org/whisper.cpp/releases - pause - exit /b 1 - ) - :: Read the path that setup_whisper.py wrote - if exist ".whisper_bin_path" ( - set /p WHISPER_BIN=<.whisper_bin_path - del .whisper_bin_path - echo Using: !WHISPER_BIN! - ) - echo. -) - -if defined NEED_MODEL ( - echo ============================================================ - echo Auto-downloading Whisper model (ggml-medium-q5_0, ~500 MB^)... - echo This will take a few minutes depending on your connection. - echo ============================================================ - python setup_whisper.py model - if errorlevel 1 ( - echo. - echo ERROR: Failed to download model. - echo Download manually from: - echo https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium-q5_0.bin - echo Save to: models\ggml-medium-q5_0.bin - pause - exit /b 1 - ) - echo. -) - -echo All prerequisites OK! -echo. -echo ============================================================ -echo Starting pipeline... -echo ============================================================ -echo. 
- -:: ============================================================ -:: STEP 1: VENV + DEPENDENCIES -:: ============================================================ -if not exist ".venv\Scripts\python.exe" ( - echo [1/4] Creating Python virtual environment... - python -m venv .venv - if errorlevel 1 ( - echo ERROR: Failed to create venv. - pause - exit /b 1 - ) - echo Done. -) else ( - echo [1/4] Virtual environment already exists. -) - -echo [2/4] Installing Python dependencies... -.venv\Scripts\pip install -q -r requirements.txt -if errorlevel 1 ( - echo ERROR: Failed to install dependencies. - pause - exit /b 1 -) -echo Done. - -:: ============================================================ -:: STEP 2: DOWNLOAD -:: ============================================================ -echo. -echo [3/4] Downloading audio files... -echo ============================================================ -.venv\Scripts\python download.py -if errorlevel 1 ( - echo. - echo WARNING: Some downloads failed. Check download_errors.log - echo Press any key to continue to transcription anyway, or Ctrl+C to abort. - pause >nul -) - -:: ============================================================ -:: STEP 3: TRANSCRIBE -:: ============================================================ -echo. -echo [4/4] Transcribing with whisper.cpp... -echo ============================================================ -echo Using: %WHISPER_BIN% -echo Model: %WHISPER_MODEL% -echo. - -if "%~1"=="" ( - .venv\Scripts\python transcribe.py -) else ( - echo Modules filter: %~1 - .venv\Scripts\python transcribe.py --modules %~1 -) -if errorlevel 1 ( - echo. - echo WARNING: Some transcriptions failed. Check transcribe_errors.log -) - -:: ============================================================ -:: DONE -:: ============================================================ -echo. -echo ============================================================ -echo Pipeline complete! 
-echo - Audio files: audio\ -echo - Transcripts: transcripts\ -echo - Manifest: manifest.json -echo. -echo Next step: generate summaries from WSL2 with Claude Code -echo python summarize.py -echo ============================================================ -pause +@echo off +setlocal enabledelayedexpansion +cd /d "%~dp0" + +:: Prevent Vulkan from exhausting VRAM — overflow to system RAM instead of crashing +set "GGML_VK_PREFER_HOST_MEMORY=ON" + +echo ============================================================ +echo NLP Master - Download + Transcribe Pipeline +echo ============================================================ +echo. + +:: ============================================================ +:: PREREQUISITES CHECK +:: ============================================================ +echo Checking prerequisites... +echo. +set "PREREQ_OK=1" +set "NEED_WHISPER=" +set "NEED_MODEL=" + +:: --- Python --- +python --version >nul 2>&1 +if errorlevel 1 ( + echo [X] Python NOT FOUND + echo Install from: https://www.python.org/downloads/ + echo Make sure to check "Add Python to PATH" during install. + echo. + echo Cannot continue without Python. Install it and re-run. + pause + exit /b 1 +) else ( + for /f "tokens=2" %%v in ('python --version 2^>^&1') do echo [OK] Python %%v +) + +:: --- .env credentials --- +if exist ".env" ( + findstr /m "COURSE_USERNAME=." ".env" >nul 2>&1 + if errorlevel 1 ( + echo [X] .env File exists but COURSE_USERNAME is empty + echo Edit .env and fill in your credentials. + set "PREREQ_OK=" + ) else ( + echo [OK] .env Credentials configured + ) +) else ( + echo [X] .env NOT FOUND + echo Create .env with: + echo COURSE_USERNAME=your_email + echo COURSE_PASSWORD=your_password + set "PREREQ_OK=" +) + +:: --- ffmpeg --- +set "FFMPEG_FOUND=" +set "NEED_FFMPEG=" +where ffmpeg >nul 2>&1 +if not errorlevel 1 ( + set "FFMPEG_FOUND=1" + for /f "delims=" %%p in ('where ffmpeg 2^>nul') do set "FFMPEG_LOCATION=%%p" + echo [OK] ffmpeg !FFMPEG_LOCATION! 
+) else ( + if exist "ffmpeg.exe" ( + set "FFMPEG_FOUND=1" + echo [OK] ffmpeg .\ffmpeg.exe (local^) + ) else ( + echo [--] ffmpeg Not found - will auto-install + set "NEED_FFMPEG=1" + ) +) + +:: --- whisper-cli.exe --- +set "WHISPER_FOUND=" +set "WHISPER_LOCATION=" +if defined WHISPER_BIN ( + if exist "%WHISPER_BIN%" ( + set "WHISPER_FOUND=1" + set "WHISPER_LOCATION=%WHISPER_BIN% (env var)" + ) +) +if not defined WHISPER_FOUND ( + where whisper-cli.exe >nul 2>&1 + if not errorlevel 1 ( + set "WHISPER_FOUND=1" + for /f "delims=" %%p in ('where whisper-cli.exe 2^>nul') do set "WHISPER_LOCATION=%%p (PATH)" + ) +) +if not defined WHISPER_FOUND ( + if exist "whisper-cli.exe" ( + set "WHISPER_FOUND=1" + set "WHISPER_BIN=whisper-cli.exe" + set "WHISPER_LOCATION=.\whisper-cli.exe (local)" + ) +) +if not defined WHISPER_FOUND ( + if exist "whisper-bin\whisper-cli.exe" ( + set "WHISPER_FOUND=1" + set "WHISPER_BIN=whisper-bin\whisper-cli.exe" + set "WHISPER_LOCATION=whisper-bin\whisper-cli.exe (auto-installed)" + ) +) +if not defined WHISPER_FOUND ( + if exist "whisper.cpp\build\bin\Release\whisper-cli.exe" ( + set "WHISPER_FOUND=1" + set "WHISPER_BIN=whisper.cpp\build\bin\Release\whisper-cli.exe" + set "WHISPER_LOCATION=whisper.cpp\build\... (local build)" + ) +) + +if defined WHISPER_FOUND ( + echo [OK] whisper-cli !WHISPER_LOCATION! +) else ( + echo [--] whisper-cli Not found - will auto-download + set "NEED_WHISPER=1" +) + +:: --- Whisper model --- +if not defined WHISPER_MODEL set "WHISPER_MODEL=models\ggml-medium-q5_0.bin" +if exist "%WHISPER_MODEL%" ( + for %%F in ("%WHISPER_MODEL%") do ( + set /a "MODEL_MB=%%~zF / 1048576" + ) + echo [OK] Whisper model %WHISPER_MODEL% (!MODEL_MB! 
MB^) +) else ( + echo [--] Whisper model Not found - will auto-download (~500 MB^) + set "NEED_MODEL=1" +) + +:: --- Vulkan GPU support --- +set "VULKAN_FOUND=" +where vulkaninfo >nul 2>&1 +if not errorlevel 1 ( + set "VULKAN_FOUND=1" + echo [OK] Vulkan SDK Installed +) else ( + if exist "%VULKAN_SDK%\Bin\vulkaninfo.exe" ( + set "VULKAN_FOUND=1" + echo [OK] Vulkan SDK %VULKAN_SDK% + ) else ( + echo [!!] Vulkan SDK Not detected (whisper.cpp may use CPU fallback^) + echo Install from: https://vulkan.lunarg.com/sdk/home + ) +) + +:: --- Disk space --- +echo. +for /f "tokens=3" %%a in ('dir /-c "%~dp0." 2^>nul ^| findstr /c:"bytes free"') do ( + set /a "FREE_GB=%%a / 1073741824" 2>nul +) +if defined FREE_GB ( + if !FREE_GB! LSS 50 ( + echo [!!] Disk space ~!FREE_GB! GB free (need ~50 GB for all audio + transcripts^) + ) else ( + echo [OK] Disk space ~!FREE_GB! GB free + ) +) + +echo. + +:: --- Stop if .env is broken (can't auto-fix that) --- +if not defined PREREQ_OK ( + echo ============================================================ + echo MISSING PREREQUISITES - fix the [X] items above and re-run. + echo ============================================================ + pause + exit /b 1 +) + +:: ============================================================ +:: AUTO-INSTALL MISSING COMPONENTS +:: ============================================================ +if defined NEED_FFMPEG ( + echo ============================================================ + echo Auto-downloading ffmpeg... + echo ============================================================ + python setup_whisper.py ffmpeg + if errorlevel 1 ( + echo. + echo ERROR: Could not install ffmpeg. + echo Download manually from: https://www.gyan.dev/ffmpeg/builds/ + echo Extract ffmpeg.exe to ffmpeg-bin\ and re-run. + pause + exit /b 1 + ) + if exist ".ffmpeg_bin_path" del .ffmpeg_bin_path + echo. 
+) + +:: Add ffmpeg-bin to PATH if it exists +if exist "ffmpeg-bin\ffmpeg.exe" ( + set "PATH=%~dp0ffmpeg-bin;%PATH%" +) + +if defined NEED_WHISPER ( + echo ============================================================ + echo Auto-downloading whisper.cpp (Vulkan build^)... + echo ============================================================ + python setup_whisper.py whisper + if errorlevel 1 ( + echo. + echo ERROR: Failed to auto-download whisper.cpp. + echo Download manually from: https://github.com/ggml-org/whisper.cpp/releases + pause + exit /b 1 + ) + :: Read the path that setup_whisper.py wrote + if exist ".whisper_bin_path" ( + set /p WHISPER_BIN=<.whisper_bin_path + del .whisper_bin_path + echo Using: !WHISPER_BIN! + ) + echo. +) + +if defined NEED_MODEL ( + echo ============================================================ + echo Auto-downloading Whisper model (ggml-medium-q5_0, ~500 MB^)... + echo This will take a few minutes depending on your connection. + echo ============================================================ + python setup_whisper.py model + if errorlevel 1 ( + echo. + echo ERROR: Failed to download model. + echo Download manually from: + echo https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium-q5_0.bin + echo Save to: models\ggml-medium-q5_0.bin + pause + exit /b 1 + ) + echo. +) + +echo All prerequisites OK! +echo. +echo ============================================================ +echo Starting pipeline... +echo ============================================================ +echo. + +:: ============================================================ +:: STEP 1: VENV + DEPENDENCIES +:: ============================================================ +if not exist ".venv\Scripts\python.exe" ( + echo [1/4] Creating Python virtual environment... + python -m venv .venv + if errorlevel 1 ( + echo ERROR: Failed to create venv. + pause + exit /b 1 + ) + echo Done. +) else ( + echo [1/4] Virtual environment already exists. 
+) + +echo [2/4] Installing Python dependencies... +.venv\Scripts\pip install -q -r requirements.txt +if errorlevel 1 ( + echo ERROR: Failed to install dependencies. + pause + exit /b 1 +) +echo Done. + +:: ============================================================ +:: STEP 2: DOWNLOAD +:: ============================================================ +echo. +echo [3/4] Downloading audio files... +echo ============================================================ +.venv\Scripts\python download.py +if errorlevel 1 ( + echo. + echo WARNING: Some downloads failed. Check download_errors.log + echo Press any key to continue to transcription anyway, or Ctrl+C to abort. + pause >nul +) + +:: ============================================================ +:: STEP 3: TRANSCRIBE +:: ============================================================ +echo. +echo [4/4] Transcribing with whisper.cpp... +echo ============================================================ +echo Using: %WHISPER_BIN% +echo Model: %WHISPER_MODEL% +echo. + +if "%~1"=="" ( + .venv\Scripts\python transcribe.py +) else ( + echo Modules filter: %~1 + .venv\Scripts\python transcribe.py --modules %~1 +) +if errorlevel 1 ( + echo. + echo WARNING: Some transcriptions failed. Check transcribe_errors.log +) + +:: ============================================================ +:: DONE +:: ============================================================ +echo. +echo ============================================================ +echo Pipeline complete! +echo - Audio files: audio\ +echo - Transcripts: transcripts\ +echo - Manifest: manifest.json +echo. +echo Next step: generate summaries from WSL2 with Claude Code +echo python summarize.py +echo ============================================================ +pause diff --git a/setup_whisper.py b/setup_whisper.py index d2fc730..cc7fa18 100644 --- a/setup_whisper.py +++ b/setup_whisper.py @@ -1,325 +1,325 @@ -""" -Auto-download and setup whisper.cpp (Vulkan) + model for Windows. 
-Called by run.bat when prerequisites are missing. -""" - -import io -import json -import os -import sys -import zipfile -from pathlib import Path -from urllib.request import urlopen, Request - -MODELS_DIR = Path("models") -MODEL_NAME = "ggml-medium-q5_0.bin" -MODEL_URL = f"https://huggingface.co/ggerganov/whisper.cpp/resolve/main/{MODEL_NAME}" - -GITHUB_API = "https://api.github.com/repos/ggml-org/whisper.cpp/releases/latest" -# Community Vulkan builds (for AMD GPUs) -VULKAN_BUILDS_API = "https://api.github.com/repos/jerryshell/whisper.cpp-windows-vulkan-bin/releases/latest" -WHISPER_DIR = Path("whisper-bin") - - -def progress_bar(current: int, total: int, width: int = 40): - if total <= 0: - return - pct = current / total - filled = int(width * pct) - bar = "=" * filled + "-" * (width - filled) - mb_done = current / 1_048_576 - mb_total = total / 1_048_576 - print(f"\r [{bar}] {pct:.0%} {mb_done:.0f}/{mb_total:.0f} MB", end="", flush=True) - - -def download_file(url: str, dest: Path, desc: str): - """Download a file with progress bar.""" - print(f"\n Downloading {desc}...") - print(f" URL: {url}") - - req = Request(url, headers={"User-Agent": "Mozilla/5.0"}) - resp = urlopen(req, timeout=60) - - total = int(resp.headers.get("Content-Length", 0)) - downloaded = 0 - tmp = dest.with_suffix(".tmp") - - with open(tmp, "wb") as f: - while True: - chunk = resp.read(1024 * 1024) - if not chunk: - break - f.write(chunk) - downloaded += len(chunk) - progress_bar(downloaded, total) - - print() # newline after progress bar - tmp.rename(dest) - print(f" Saved: {dest} ({downloaded / 1_048_576:.0f} MB)") - - -def fetch_release(api_url: str) -> dict | None: - """Fetch a GitHub release JSON.""" - req = Request(api_url, headers={"User-Agent": "Mozilla/5.0"}) - try: - resp = urlopen(req, timeout=30) - return json.loads(resp.read()) - except Exception as e: - print(f" Could not fetch from {api_url}: {e}") - return None - - -def extract_zip(zip_path: Path): - """Extract zip contents 
into WHISPER_DIR, flattened.""" - print(f"\n Extracting to {WHISPER_DIR}/...") - WHISPER_DIR.mkdir(exist_ok=True) - with zipfile.ZipFile(zip_path) as zf: - for member in zf.namelist(): - filename = Path(member).name - if not filename: - continue - target = WHISPER_DIR / filename - with zf.open(member) as src, open(target, "wb") as dst: - dst.write(src.read()) - print(f" {filename}") - zip_path.unlink() - - -def find_whisper_exe() -> str | None: - """Find whisper-cli.exe (or similar) in WHISPER_DIR.""" - whisper_exe = WHISPER_DIR / "whisper-cli.exe" - if whisper_exe.exists(): - return str(whisper_exe) - - # Try main.exe (older naming) - main_exe = WHISPER_DIR / "main.exe" - if main_exe.exists(): - return str(main_exe) - - exes = list(WHISPER_DIR.glob("*.exe")) - for exe in exes: - if "whisper" in exe.name.lower() and "cli" in exe.name.lower(): - return str(exe) - for exe in exes: - if "whisper" in exe.name.lower(): - return str(exe) - if exes: - return str(exes[0]) - return None - - -def try_community_vulkan_build() -> str | None: - """Try downloading Vulkan build from jerryshell's community repo.""" - print("\n Trying community Vulkan build (jerryshell/whisper.cpp-windows-vulkan-bin)...") - release = fetch_release(VULKAN_BUILDS_API) - if not release: - return None - - tag = release.get("tag_name", "unknown") - print(f" Community release: {tag}") - - # Find a zip asset - for asset in release.get("assets", []): - name = asset["name"].lower() - if name.endswith(".zip"): - print(f" Found: {asset['name']}") - zip_path = Path(asset["name"]) - download_file(asset["browser_download_url"], zip_path, asset["name"]) - extract_zip(zip_path) - return find_whisper_exe() - - print(" No zip asset found in community release") - return None - - -def try_official_vulkan_build() -> str | None: - """Try downloading Vulkan build from official ggml-org releases.""" - print("\n Fetching latest whisper.cpp release from ggml-org...") - release = fetch_release(GITHUB_API) - if not release: - 
return None - - tag = release.get("tag_name", "unknown") - print(f" Official release: {tag}") - - # Priority: vulkan > noavx (cpu-only, no CUDA deps) > skip CUDA entirely - vulkan_asset = None - cpu_asset = None - for asset in release.get("assets", []): - name = asset["name"].lower() - if not name.endswith(".zip"): - continue - # Must be Windows - if "win" not in name and "x64" not in name: - continue - # Absolutely skip CUDA builds - they won't work on AMD - if "cuda" in name: - continue - if "vulkan" in name: - vulkan_asset = asset - break - if "noavx" not in name and "openblas" not in name: - cpu_asset = asset - - chosen = vulkan_asset or cpu_asset - if not chosen: - print(" No Vulkan or CPU-only build found in official releases") - print(" Available assets:") - for asset in release.get("assets", []): - print(f" - {asset['name']}") - return None - - if vulkan_asset: - print(f" Found official Vulkan build: {chosen['name']}") - else: - print(f" No Vulkan build in official release, using CPU build: {chosen['name']}") - print(f" (Will work but without GPU acceleration)") - - zip_path = Path(chosen["name"]) - download_file(chosen["browser_download_url"], zip_path, chosen["name"]) - extract_zip(zip_path) - return find_whisper_exe() - - -def setup_whisper_bin() -> str | None: - """Download whisper.cpp Vulkan release. 
Returns path to whisper-cli.exe.""" - whisper_exe = WHISPER_DIR / "whisper-cli.exe" - if whisper_exe.exists(): - # Check if it's a CUDA build (has CUDA DLLs but no Vulkan DLL) - has_cuda = (WHISPER_DIR / "ggml-cuda.dll").exists() - has_vulkan = (WHISPER_DIR / "ggml-vulkan.dll").exists() - if has_cuda and not has_vulkan: - print(f" WARNING: Existing install is a CUDA build (won't work on AMD GPU)") - print(f" Removing and re-downloading Vulkan build...") - import shutil - shutil.rmtree(WHISPER_DIR) - else: - print(f" whisper-cli.exe already exists at {whisper_exe}") - return str(whisper_exe) - - # Strategy: try community Vulkan build first (reliable for AMD), - # then fall back to official release - exe_path = try_community_vulkan_build() - if exe_path: - print(f"\n whisper-cli.exe ready at: {exe_path} (Vulkan)") - return exe_path - - print("\n Community build failed, trying official release...") - exe_path = try_official_vulkan_build() - if exe_path: - print(f"\n whisper-cli.exe ready at: {exe_path}") - return exe_path - - print("\n ERROR: Could not download whisper.cpp") - print(" Manual install: https://github.com/ggml-org/whisper.cpp/releases") - print(" Build from source with: cmake -DGGML_VULKAN=1") - return None - - -FFMPEG_DIR = Path("ffmpeg-bin") -FFMPEG_URL = "https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-essentials.zip" - - -def setup_ffmpeg() -> str | None: - """Download ffmpeg if not found. Returns path to ffmpeg.exe.""" - import shutil - - # Already in PATH? - if shutil.which("ffmpeg"): - path = shutil.which("ffmpeg") - print(f" ffmpeg already in PATH: {path}") - return path - - # Already downloaded locally? 
- local_exe = FFMPEG_DIR / "ffmpeg.exe" - if local_exe.exists(): - print(f" ffmpeg already exists at {local_exe}") - return str(local_exe) - - print("\n Downloading ffmpeg (essentials build)...") - zip_path = Path("ffmpeg-essentials.zip") - download_file(FFMPEG_URL, zip_path, "ffmpeg") - - print(f"\n Extracting ffmpeg...") - FFMPEG_DIR.mkdir(exist_ok=True) - with zipfile.ZipFile(zip_path) as zf: - for member in zf.namelist(): - # Only extract the bin/*.exe files - if member.endswith(".exe"): - filename = Path(member).name - target = FFMPEG_DIR / filename - with zf.open(member) as src, open(target, "wb") as dst: - dst.write(src.read()) - print(f" {filename}") - - zip_path.unlink() - - if local_exe.exists(): - print(f"\n ffmpeg ready at: {local_exe}") - return str(local_exe) - - print(" ERROR: ffmpeg.exe not found after extraction") - return None - - -def setup_model() -> bool: - """Download whisper model. Returns True on success.""" - MODELS_DIR.mkdir(exist_ok=True) - model_path = MODELS_DIR / MODEL_NAME - - if model_path.exists() and model_path.stat().st_size > 100_000_000: - print(f" Model already exists: {model_path} ({model_path.stat().st_size / 1_048_576:.0f} MB)") - return True - - download_file(MODEL_URL, model_path, f"Whisper model ({MODEL_NAME})") - - if model_path.exists() and model_path.stat().st_size > 100_000_000: - return True - - print(" ERROR: Model file too small or missing after download") - return False - - -def main(): - what = sys.argv[1] if len(sys.argv) > 1 else "all" - - if what in ("all", "ffmpeg"): - print("=" * 60) - print(" Setting up ffmpeg") - print("=" * 60) - ffmpeg_path = setup_ffmpeg() - if ffmpeg_path: - Path(".ffmpeg_bin_path").write_text(ffmpeg_path) - else: - print("\nFAILED to set up ffmpeg") - if what == "ffmpeg": - sys.exit(1) - - if what in ("all", "whisper"): - print("=" * 60) - print(" Setting up whisper.cpp") - print("=" * 60) - exe_path = setup_whisper_bin() - if exe_path: - # Write path to temp file so run.bat can read 
it - Path(".whisper_bin_path").write_text(exe_path) - else: - print("\nFAILED to set up whisper.cpp") - if what == "whisper": - sys.exit(1) - - if what in ("all", "model"): - print() - print("=" * 60) - print(f" Downloading Whisper model: {MODEL_NAME}") - print("=" * 60) - if not setup_model(): - print("\nFAILED to download model") - sys.exit(1) - - print() - print("Setup complete!") - - -if __name__ == "__main__": - main() +""" +Auto-download and setup whisper.cpp (Vulkan) + model for Windows. +Called by run.bat when prerequisites are missing. +""" + +import io +import json +import os +import sys +import zipfile +from pathlib import Path +from urllib.request import urlopen, Request + +MODELS_DIR = Path("models") +MODEL_NAME = "ggml-medium-q5_0.bin" +MODEL_URL = f"https://huggingface.co/ggerganov/whisper.cpp/resolve/main/{MODEL_NAME}" + +GITHUB_API = "https://api.github.com/repos/ggml-org/whisper.cpp/releases/latest" +# Community Vulkan builds (for AMD GPUs) +VULKAN_BUILDS_API = "https://api.github.com/repos/jerryshell/whisper.cpp-windows-vulkan-bin/releases/latest" +WHISPER_DIR = Path("whisper-bin") + + +def progress_bar(current: int, total: int, width: int = 40): + if total <= 0: + return + pct = current / total + filled = int(width * pct) + bar = "=" * filled + "-" * (width - filled) + mb_done = current / 1_048_576 + mb_total = total / 1_048_576 + print(f"\r [{bar}] {pct:.0%} {mb_done:.0f}/{mb_total:.0f} MB", end="", flush=True) + + +def download_file(url: str, dest: Path, desc: str): + """Download a file with progress bar.""" + print(f"\n Downloading {desc}...") + print(f" URL: {url}") + + req = Request(url, headers={"User-Agent": "Mozilla/5.0"}) + resp = urlopen(req, timeout=60) + + total = int(resp.headers.get("Content-Length", 0)) + downloaded = 0 + tmp = dest.with_suffix(".tmp") + + with open(tmp, "wb") as f: + while True: + chunk = resp.read(1024 * 1024) + if not chunk: + break + f.write(chunk) + downloaded += len(chunk) + progress_bar(downloaded, total) + 
+ print() # newline after progress bar + tmp.rename(dest) + print(f" Saved: {dest} ({downloaded / 1_048_576:.0f} MB)") + + +def fetch_release(api_url: str) -> dict | None: + """Fetch a GitHub release JSON.""" + req = Request(api_url, headers={"User-Agent": "Mozilla/5.0"}) + try: + resp = urlopen(req, timeout=30) + return json.loads(resp.read()) + except Exception as e: + print(f" Could not fetch from {api_url}: {e}") + return None + + +def extract_zip(zip_path: Path): + """Extract zip contents into WHISPER_DIR, flattened.""" + print(f"\n Extracting to {WHISPER_DIR}/...") + WHISPER_DIR.mkdir(exist_ok=True) + with zipfile.ZipFile(zip_path) as zf: + for member in zf.namelist(): + filename = Path(member).name + if not filename: + continue + target = WHISPER_DIR / filename + with zf.open(member) as src, open(target, "wb") as dst: + dst.write(src.read()) + print(f" {filename}") + zip_path.unlink() + + +def find_whisper_exe() -> str | None: + """Find whisper-cli.exe (or similar) in WHISPER_DIR.""" + whisper_exe = WHISPER_DIR / "whisper-cli.exe" + if whisper_exe.exists(): + return str(whisper_exe) + + # Try main.exe (older naming) + main_exe = WHISPER_DIR / "main.exe" + if main_exe.exists(): + return str(main_exe) + + exes = list(WHISPER_DIR.glob("*.exe")) + for exe in exes: + if "whisper" in exe.name.lower() and "cli" in exe.name.lower(): + return str(exe) + for exe in exes: + if "whisper" in exe.name.lower(): + return str(exe) + if exes: + return str(exes[0]) + return None + + +def try_community_vulkan_build() -> str | None: + """Try downloading Vulkan build from jerryshell's community repo.""" + print("\n Trying community Vulkan build (jerryshell/whisper.cpp-windows-vulkan-bin)...") + release = fetch_release(VULKAN_BUILDS_API) + if not release: + return None + + tag = release.get("tag_name", "unknown") + print(f" Community release: {tag}") + + # Find a zip asset + for asset in release.get("assets", []): + name = asset["name"].lower() + if name.endswith(".zip"): + 
print(f" Found: {asset['name']}") + zip_path = Path(asset["name"]) + download_file(asset["browser_download_url"], zip_path, asset["name"]) + extract_zip(zip_path) + return find_whisper_exe() + + print(" No zip asset found in community release") + return None + + +def try_official_vulkan_build() -> str | None: + """Try downloading Vulkan build from official ggml-org releases.""" + print("\n Fetching latest whisper.cpp release from ggml-org...") + release = fetch_release(GITHUB_API) + if not release: + return None + + tag = release.get("tag_name", "unknown") + print(f" Official release: {tag}") + + # Priority: vulkan > noavx (cpu-only, no CUDA deps) > skip CUDA entirely + vulkan_asset = None + cpu_asset = None + for asset in release.get("assets", []): + name = asset["name"].lower() + if not name.endswith(".zip"): + continue + # Must be Windows + if "win" not in name and "x64" not in name: + continue + # Absolutely skip CUDA builds - they won't work on AMD + if "cuda" in name: + continue + if "vulkan" in name: + vulkan_asset = asset + break + if "noavx" not in name and "openblas" not in name: + cpu_asset = asset + + chosen = vulkan_asset or cpu_asset + if not chosen: + print(" No Vulkan or CPU-only build found in official releases") + print(" Available assets:") + for asset in release.get("assets", []): + print(f" - {asset['name']}") + return None + + if vulkan_asset: + print(f" Found official Vulkan build: {chosen['name']}") + else: + print(f" No Vulkan build in official release, using CPU build: {chosen['name']}") + print(f" (Will work but without GPU acceleration)") + + zip_path = Path(chosen["name"]) + download_file(chosen["browser_download_url"], zip_path, chosen["name"]) + extract_zip(zip_path) + return find_whisper_exe() + + +def setup_whisper_bin() -> str | None: + """Download whisper.cpp Vulkan release. 
Returns path to whisper-cli.exe.""" + whisper_exe = WHISPER_DIR / "whisper-cli.exe" + if whisper_exe.exists(): + # Check if it's a CUDA build (has CUDA DLLs but no Vulkan DLL) + has_cuda = (WHISPER_DIR / "ggml-cuda.dll").exists() + has_vulkan = (WHISPER_DIR / "ggml-vulkan.dll").exists() + if has_cuda and not has_vulkan: + print(f" WARNING: Existing install is a CUDA build (won't work on AMD GPU)") + print(f" Removing and re-downloading Vulkan build...") + import shutil + shutil.rmtree(WHISPER_DIR) + else: + print(f" whisper-cli.exe already exists at {whisper_exe}") + return str(whisper_exe) + + # Strategy: try community Vulkan build first (reliable for AMD), + # then fall back to official release + exe_path = try_community_vulkan_build() + if exe_path: + print(f"\n whisper-cli.exe ready at: {exe_path} (Vulkan)") + return exe_path + + print("\n Community build failed, trying official release...") + exe_path = try_official_vulkan_build() + if exe_path: + print(f"\n whisper-cli.exe ready at: {exe_path}") + return exe_path + + print("\n ERROR: Could not download whisper.cpp") + print(" Manual install: https://github.com/ggml-org/whisper.cpp/releases") + print(" Build from source with: cmake -DGGML_VULKAN=1") + return None + + +FFMPEG_DIR = Path("ffmpeg-bin") +FFMPEG_URL = "https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-essentials.zip" + + +def setup_ffmpeg() -> str | None: + """Download ffmpeg if not found. Returns path to ffmpeg.exe.""" + import shutil + + # Already in PATH? + if shutil.which("ffmpeg"): + path = shutil.which("ffmpeg") + print(f" ffmpeg already in PATH: {path}") + return path + + # Already downloaded locally? 
+ local_exe = FFMPEG_DIR / "ffmpeg.exe" + if local_exe.exists(): + print(f" ffmpeg already exists at {local_exe}") + return str(local_exe) + + print("\n Downloading ffmpeg (essentials build)...") + zip_path = Path("ffmpeg-essentials.zip") + download_file(FFMPEG_URL, zip_path, "ffmpeg") + + print(f"\n Extracting ffmpeg...") + FFMPEG_DIR.mkdir(exist_ok=True) + with zipfile.ZipFile(zip_path) as zf: + for member in zf.namelist(): + # Only extract the bin/*.exe files + if member.endswith(".exe"): + filename = Path(member).name + target = FFMPEG_DIR / filename + with zf.open(member) as src, open(target, "wb") as dst: + dst.write(src.read()) + print(f" {filename}") + + zip_path.unlink() + + if local_exe.exists(): + print(f"\n ffmpeg ready at: {local_exe}") + return str(local_exe) + + print(" ERROR: ffmpeg.exe not found after extraction") + return None + + +def setup_model() -> bool: + """Download whisper model. Returns True on success.""" + MODELS_DIR.mkdir(exist_ok=True) + model_path = MODELS_DIR / MODEL_NAME + + if model_path.exists() and model_path.stat().st_size > 100_000_000: + print(f" Model already exists: {model_path} ({model_path.stat().st_size / 1_048_576:.0f} MB)") + return True + + download_file(MODEL_URL, model_path, f"Whisper model ({MODEL_NAME})") + + if model_path.exists() and model_path.stat().st_size > 100_000_000: + return True + + print(" ERROR: Model file too small or missing after download") + return False + + +def main(): + what = sys.argv[1] if len(sys.argv) > 1 else "all" + + if what in ("all", "ffmpeg"): + print("=" * 60) + print(" Setting up ffmpeg") + print("=" * 60) + ffmpeg_path = setup_ffmpeg() + if ffmpeg_path: + Path(".ffmpeg_bin_path").write_text(ffmpeg_path) + else: + print("\nFAILED to set up ffmpeg") + if what == "ffmpeg": + sys.exit(1) + + if what in ("all", "whisper"): + print("=" * 60) + print(" Setting up whisper.cpp") + print("=" * 60) + exe_path = setup_whisper_bin() + if exe_path: + # Write path to temp file so run.bat can read 
it + Path(".whisper_bin_path").write_text(exe_path) + else: + print("\nFAILED to set up whisper.cpp") + if what == "whisper": + sys.exit(1) + + if what in ("all", "model"): + print() + print("=" * 60) + print(f" Downloading Whisper model: {MODEL_NAME}") + print("=" * 60) + if not setup_model(): + print("\nFAILED to download model") + sys.exit(1) + + print() + print("Setup complete!") + + +if __name__ == "__main__": + main() diff --git a/summarize.py b/summarize.py index 5f5fd10..42e7519 100644 --- a/summarize.py +++ b/summarize.py @@ -1,192 +1,192 @@ -""" -Generate summaries from transcripts using Claude Code. -Reads manifest.json, processes each transcript, outputs per-lecture summaries, -and compiles SUPORT_CURS.md master study guide. - -Usage: - python summarize.py # Print prompts for each transcript (pipe to Claude) - python summarize.py --compile # Compile existing summaries into SUPORT_CURS.md -""" - -import json -import sys -import textwrap -from pathlib import Path - -MANIFEST_PATH = Path("manifest.json") -SUMMARIES_DIR = Path("summaries") -TRANSCRIPTS_DIR = Path("transcripts") -MASTER_GUIDE = Path("SUPORT_CURS.md") - -MAX_WORDS_PER_CHUNK = 10000 -OVERLAP_WORDS = 500 - -SUMMARY_PROMPT = """Rezuma aceasta transcriere a unei lectii din cursul NLP Master Practitioner. - -Ofera: -1. **Prezentare generala** - 3-5 propozitii care descriu subiectul principal al lectiei -2. **Concepte cheie** - lista cu definitii scurte pentru fiecare concept important -3. **Detalii si exemple importante** - informatii concrete, exercitii practice, exemple relevante mentionate de trainer -4. **Citate memorabile** - fraze sau idei remarcabile (daca exista) - -Raspunde in limba romana. Formateaza ca Markdown. - ---- -TITLU LECTIE: {title} ---- -TRANSCRIERE: -{text} -""" - -MERGE_PROMPT = """Am mai multe rezumate partiale ale aceleiasi lectii (a fost prea lunga pentru un singur rezumat). -Combina-le intr-un singur rezumat coerent, eliminand duplicatele. - -Pastreaza structura: -1. 
Prezentare generala (3-5 propozitii) -2. Concepte cheie cu definitii -3. Detalii si exemple importante -4. Citate memorabile - -Raspunde in limba romana. Formateaza ca Markdown. - ---- -TITLU LECTIE: {title} ---- -REZUMATE PARTIALE: -{chunks} -""" - - -def load_manifest() -> dict: - with open(MANIFEST_PATH, encoding="utf-8") as f: - return json.load(f) - - -def split_at_sentences(text: str, max_words: int, overlap: int) -> list[str]: - """Split text into chunks at sentence boundaries with overlap.""" - words = text.split() - if len(words) <= max_words: - return [text] - - chunks = [] - start = 0 - while start < len(words): - end = min(start + max_words, len(words)) - chunk_words = words[start:end] - chunk_text = " ".join(chunk_words) - - # Try to break at sentence boundary (look back from end) - if end < len(words): - for sep in [". ", "! ", "? ", ".\n", "!\n", "?\n"]: - last_sep = chunk_text.rfind(sep) - if last_sep > len(chunk_text) // 2: # Don't break too early - chunk_text = chunk_text[:last_sep + 1] - # Recalculate end based on actual words used - end = start + len(chunk_text.split()) - break - - chunks.append(chunk_text) - start = max(end - overlap, start + 1) # Overlap, but always advance - - return chunks - - -def generate_prompts(manifest: dict): - """Print summary prompts for each transcript to stdout.""" - SUMMARIES_DIR.mkdir(exist_ok=True) - - for mod in manifest["modules"]: - for lec in mod["lectures"]: - if lec.get("transcribe_status") != "complete": - continue - - summary_path = Path(lec["summary_path"]) - if summary_path.exists() and summary_path.stat().st_size > 0: - print(f"# SKIP (exists): {lec['title']}", file=sys.stderr) - continue - - txt_path = Path(lec["transcript_path"]) - if not txt_path.exists(): - print(f"# SKIP (no transcript): {lec['title']}", file=sys.stderr) - continue - - text = txt_path.read_text(encoding="utf-8").strip() - if not text: - print(f"# SKIP (empty): {lec['title']}", file=sys.stderr) - continue - - chunks = 
split_at_sentences(text, MAX_WORDS_PER_CHUNK, OVERLAP_WORDS) - - print(f"\n{'='*60}", file=sys.stderr) - print(f"Lecture: {lec['title']}", file=sys.stderr) - print(f"Words: {len(text.split())}, Chunks: {len(chunks)}", file=sys.stderr) - print(f"Output: {summary_path}", file=sys.stderr) - - if len(chunks) == 1: - prompt = SUMMARY_PROMPT.format(title=lec["title"], text=text) - print(f"SUMMARY_FILE:{summary_path}") - print(prompt) - print("---END_PROMPT---") - else: - # Multi-chunk: generate individual chunk prompts - for i, chunk in enumerate(chunks, 1): - prompt = SUMMARY_PROMPT.format( - title=f"{lec['title']} (partea {i}/{len(chunks)})", - text=chunk, - ) - print(f"CHUNK_PROMPT:{i}/{len(chunks)}:{summary_path}") - print(prompt) - print("---END_PROMPT---") - - # Then a merge prompt - print(f"MERGE_FILE:{summary_path}") - merge = MERGE_PROMPT.format( - title=lec["title"], - chunks="{chunk_summaries}", # Placeholder for merge step - ) - print(merge) - print("---END_PROMPT---") - - -def compile_master_guide(manifest: dict): - """Compile all summaries into SUPORT_CURS.md.""" - lines = [ - "# SUPORT CURS - NLP Master Practitioner Bucuresti 2025\n", - "_Generat automat din transcrierile audio ale cursului._\n", - "---\n", - ] - - for mod in manifest["modules"]: - lines.append(f"\n## {mod['name']}\n") - - for lec in mod["lectures"]: - summary_path = Path(lec["summary_path"]) - lines.append(f"\n### {lec['title']}\n") - - if summary_path.exists(): - content = summary_path.read_text(encoding="utf-8").strip() - lines.append(f"{content}\n") - else: - lines.append("_Rezumat indisponibil._\n") - - lines.append("\n---\n") - - MASTER_GUIDE.write_text("\n".join(lines), encoding="utf-8") - print(f"Compiled {MASTER_GUIDE} ({MASTER_GUIDE.stat().st_size} bytes)") - - -def main(): - if not MANIFEST_PATH.exists(): - print("manifest.json not found. 
Run download.py and transcribe.py first.") - sys.exit(1) - - manifest = load_manifest() - - if "--compile" in sys.argv: - compile_master_guide(manifest) - else: - generate_prompts(manifest) - - -if __name__ == "__main__": - main() +""" +Generate summaries from transcripts using Claude Code. +Reads manifest.json, processes each transcript, outputs per-lecture summaries, +and compiles SUPORT_CURS.md master study guide. + +Usage: + python summarize.py # Print prompts for each transcript (pipe to Claude) + python summarize.py --compile # Compile existing summaries into SUPORT_CURS.md +""" + +import json +import sys +import textwrap +from pathlib import Path + +MANIFEST_PATH = Path("manifest.json") +SUMMARIES_DIR = Path("summaries") +TRANSCRIPTS_DIR = Path("transcripts") +MASTER_GUIDE = Path("SUPORT_CURS.md") + +MAX_WORDS_PER_CHUNK = 10000 +OVERLAP_WORDS = 500 + +SUMMARY_PROMPT = """Rezuma aceasta transcriere a unei lectii din cursul NLP Master Practitioner. + +Ofera: +1. **Prezentare generala** - 3-5 propozitii care descriu subiectul principal al lectiei +2. **Concepte cheie** - lista cu definitii scurte pentru fiecare concept important +3. **Detalii si exemple importante** - informatii concrete, exercitii practice, exemple relevante mentionate de trainer +4. **Citate memorabile** - fraze sau idei remarcabile (daca exista) + +Raspunde in limba romana. Formateaza ca Markdown. + +--- +TITLU LECTIE: {title} +--- +TRANSCRIERE: +{text} +""" + +MERGE_PROMPT = """Am mai multe rezumate partiale ale aceleiasi lectii (a fost prea lunga pentru un singur rezumat). +Combina-le intr-un singur rezumat coerent, eliminand duplicatele. + +Pastreaza structura: +1. Prezentare generala (3-5 propozitii) +2. Concepte cheie cu definitii +3. Detalii si exemple importante +4. Citate memorabile + +Raspunde in limba romana. Formateaza ca Markdown. 
+ +--- +TITLU LECTIE: {title} +--- +REZUMATE PARTIALE: +{chunks} +""" + + +def load_manifest() -> dict: + with open(MANIFEST_PATH, encoding="utf-8") as f: + return json.load(f) + + +def split_at_sentences(text: str, max_words: int, overlap: int) -> list[str]: + """Split text into chunks at sentence boundaries with overlap.""" + words = text.split() + if len(words) <= max_words: + return [text] + + chunks = [] + start = 0 + while start < len(words): + end = min(start + max_words, len(words)) + chunk_words = words[start:end] + chunk_text = " ".join(chunk_words) + + # Try to break at sentence boundary (look back from end) + if end < len(words): + for sep in [". ", "! ", "? ", ".\n", "!\n", "?\n"]: + last_sep = chunk_text.rfind(sep) + if last_sep > len(chunk_text) // 2: # Don't break too early + chunk_text = chunk_text[:last_sep + 1] + # Recalculate end based on actual words used + end = start + len(chunk_text.split()) + break + + chunks.append(chunk_text) + start = max(end - overlap, start + 1) # Overlap, but always advance + + return chunks + + +def generate_prompts(manifest: dict): + """Print summary prompts for each transcript to stdout.""" + SUMMARIES_DIR.mkdir(exist_ok=True) + + for mod in manifest["modules"]: + for lec in mod["lectures"]: + if lec.get("transcribe_status") != "complete": + continue + + summary_path = Path(lec["summary_path"]) + if summary_path.exists() and summary_path.stat().st_size > 0: + print(f"# SKIP (exists): {lec['title']}", file=sys.stderr) + continue + + txt_path = Path(lec["transcript_path"]) + if not txt_path.exists(): + print(f"# SKIP (no transcript): {lec['title']}", file=sys.stderr) + continue + + text = txt_path.read_text(encoding="utf-8").strip() + if not text: + print(f"# SKIP (empty): {lec['title']}", file=sys.stderr) + continue + + chunks = split_at_sentences(text, MAX_WORDS_PER_CHUNK, OVERLAP_WORDS) + + print(f"\n{'='*60}", file=sys.stderr) + print(f"Lecture: {lec['title']}", file=sys.stderr) + print(f"Words: 
{len(text.split())}, Chunks: {len(chunks)}", file=sys.stderr) + print(f"Output: {summary_path}", file=sys.stderr) + + if len(chunks) == 1: + prompt = SUMMARY_PROMPT.format(title=lec["title"], text=text) + print(f"SUMMARY_FILE:{summary_path}") + print(prompt) + print("---END_PROMPT---") + else: + # Multi-chunk: generate individual chunk prompts + for i, chunk in enumerate(chunks, 1): + prompt = SUMMARY_PROMPT.format( + title=f"{lec['title']} (partea {i}/{len(chunks)})", + text=chunk, + ) + print(f"CHUNK_PROMPT:{i}/{len(chunks)}:{summary_path}") + print(prompt) + print("---END_PROMPT---") + + # Then a merge prompt + print(f"MERGE_FILE:{summary_path}") + merge = MERGE_PROMPT.format( + title=lec["title"], + chunks="{chunk_summaries}", # Placeholder for merge step + ) + print(merge) + print("---END_PROMPT---") + + +def compile_master_guide(manifest: dict): + """Compile all summaries into SUPORT_CURS.md.""" + lines = [ + "# SUPORT CURS - NLP Master Practitioner Bucuresti 2025\n", + "_Generat automat din transcrierile audio ale cursului._\n", + "---\n", + ] + + for mod in manifest["modules"]: + lines.append(f"\n## {mod['name']}\n") + + for lec in mod["lectures"]: + summary_path = Path(lec["summary_path"]) + lines.append(f"\n### {lec['title']}\n") + + if summary_path.exists(): + content = summary_path.read_text(encoding="utf-8").strip() + lines.append(f"{content}\n") + else: + lines.append("_Rezumat indisponibil._\n") + + lines.append("\n---\n") + + MASTER_GUIDE.write_text("\n".join(lines), encoding="utf-8") + print(f"Compiled {MASTER_GUIDE} ({MASTER_GUIDE.stat().st_size} bytes)") + + +def main(): + if not MANIFEST_PATH.exists(): + print("manifest.json not found. 
Run download.py and transcribe.py first.") + sys.exit(1) + + manifest = load_manifest() + + if "--compile" in sys.argv: + compile_master_guide(manifest) + else: + generate_prompts(manifest) + + +if __name__ == "__main__": + main() diff --git a/transcribe.py b/transcribe.py index e3dce59..f5ba638 100644 --- a/transcribe.py +++ b/transcribe.py @@ -1,299 +1,299 @@ -""" -Batch transcription using whisper.cpp. -Reads manifest.json, transcribes each audio file in module order, -outputs .txt and .srt files, updates manifest status. -Resumable: skips files with existing transcripts. -Converts MP3 -> WAV (16kHz mono) via ffmpeg before transcription. -""" - -import json -import logging -import os -import shutil -import subprocess -import sys -from pathlib import Path - -MANIFEST_PATH = Path("manifest.json") -TRANSCRIPTS_DIR = Path("transcripts") -WAV_CACHE_DIR = Path("audio_wav") - -# whisper.cpp defaults — override with env vars or CLI args -WHISPER_BIN = os.getenv("WHISPER_BIN", r"whisper-cli.exe") -WHISPER_MODEL = os.getenv("WHISPER_MODEL", r"models\ggml-medium-q5_0.bin") - -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s [%(levelname)s] %(message)s", - handlers=[ - logging.StreamHandler(), - logging.FileHandler("transcribe_errors.log"), - ], -) -log = logging.getLogger(__name__) - - -def find_ffmpeg() -> str: - """Find ffmpeg executable.""" - if shutil.which("ffmpeg"): - return "ffmpeg" - # Check local directories - for p in [Path("ffmpeg.exe"), Path("ffmpeg-bin/ffmpeg.exe")]: - if p.exists(): - return str(p.resolve()) - # Try imageio-ffmpeg (pip fallback) - try: - import imageio_ffmpeg - return imageio_ffmpeg.get_ffmpeg_exe() - except ImportError: - pass - return "" - - -def convert_to_wav(audio_path: str) -> str: - """ - Convert audio file to WAV 16kHz mono (optimal for whisper.cpp). - Returns path to WAV file. Skips if WAV already exists. 
- """ - src = Path(audio_path) - - # Already a WAV file, skip - if src.suffix.lower() == ".wav": - return audio_path - - WAV_CACHE_DIR.mkdir(exist_ok=True) - wav_path = WAV_CACHE_DIR / (src.stem + ".wav") - - # Skip if already converted - if wav_path.exists() and wav_path.stat().st_size > 0: - log.info(f" WAV cache hit: {wav_path}") - return str(wav_path) - - ffmpeg = find_ffmpeg() - if not ffmpeg: - log.warning(" ffmpeg not found, using original file (may cause bad transcription)") - return audio_path - - log.info(f" Converting to WAV: {src.name} -> {wav_path.name}") - cmd = [ - ffmpeg, - "-i", audio_path, - "-vn", # no video - "-acodec", "pcm_s16le", # 16-bit PCM - "-ar", "16000", # 16kHz sample rate (whisper standard) - "-ac", "1", # mono - "-y", # overwrite - str(wav_path), - ] - - try: - result = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=300, # 5 min max for conversion - ) - if result.returncode != 0: - log.error(f" ffmpeg failed: {result.stderr[:300]}") - return audio_path - - log.info(f" WAV ready: {wav_path.name} ({wav_path.stat().st_size / 1_048_576:.0f} MB)") - return str(wav_path) - - except FileNotFoundError: - log.warning(f" ffmpeg not found at: {ffmpeg}") - return audio_path - except subprocess.TimeoutExpired: - log.error(f" ffmpeg conversion timeout for {audio_path}") - return audio_path - - -def load_manifest() -> dict: - with open(MANIFEST_PATH, encoding="utf-8") as f: - return json.load(f) - - -def save_manifest(manifest: dict): - with open(MANIFEST_PATH, "w", encoding="utf-8") as f: - json.dump(manifest, f, indent=2, ensure_ascii=False) - - -def transcribe_file(audio_path: str, output_base: str) -> bool: - """ - Run whisper.cpp on a single file. - Returns True on success. 
- """ - cmd = [ - WHISPER_BIN, - "--model", WHISPER_MODEL, - "--language", "ro", - "--no-gpu", - "--threads", str(os.cpu_count() or 4), - "--beam-size", "1", - "--best-of", "1", - "--output-txt", - "--output-srt", - "--output-file", output_base, - "--file", audio_path, - ] - - log.info(f" CMD: {' '.join(cmd)}") - - try: - # Add whisper.exe's directory to PATH so Windows finds its DLLs - env = os.environ.copy() - whisper_dir = str(Path(WHISPER_BIN).resolve().parent) - env["PATH"] = whisper_dir + os.pathsep + env.get("PATH", "") - - result = subprocess.run( - cmd, - stdout=sys.stdout, - stderr=sys.stderr, - timeout=7200, # 2 hour timeout per file - env=env, - ) - - if result.returncode != 0: - log.error(f" whisper.cpp failed (exit {result.returncode})") - return False - - # Verify output exists and is non-empty - txt_path = Path(f"{output_base}.txt") - srt_path = Path(f"{output_base}.srt") - - if not txt_path.exists() or txt_path.stat().st_size == 0: - log.error(f" Empty or missing transcript: {txt_path}") - return False - - log.info(f" Output: {txt_path.name} ({txt_path.stat().st_size} bytes)") - if srt_path.exists(): - log.info(f" Output: {srt_path.name} ({srt_path.stat().st_size} bytes)") - - return True - - except subprocess.TimeoutExpired: - log.error(f" Timeout (>2h) for {audio_path}") - return False - except FileNotFoundError: - log.error(f" whisper.cpp not found at: {WHISPER_BIN}") - log.error(f" Set WHISPER_BIN env var or put whisper-cli.exe in PATH") - return False - except Exception as e: - log.error(f" Error: {e}") - return False - - -def parse_module_filter(arg: str) -> set[int]: - """Parse module filter like '1-3' or '4,5' or '1-3,5' into a set of 1-based indices.""" - result = set() - for part in arg.split(","): - part = part.strip() - if "-" in part: - a, b = part.split("-", 1) - result.update(range(int(a), int(b) + 1)) - else: - result.add(int(part)) - return result - - -def main(): - if not MANIFEST_PATH.exists(): - log.error("manifest.json not 
found. Run download.py first.") - sys.exit(1) - - # Parse --modules filter - module_filter = None - if "--modules" in sys.argv: - idx = sys.argv.index("--modules") - if idx + 1 < len(sys.argv): - module_filter = parse_module_filter(sys.argv[idx + 1]) - log.info(f"Module filter: {sorted(module_filter)}") - - manifest = load_manifest() - TRANSCRIPTS_DIR.mkdir(exist_ok=True) - - total = 0 - transcribed = 0 - skipped = 0 - failed = 0 - - for mod_idx, mod in enumerate(manifest["modules"], 1): - if module_filter and mod_idx not in module_filter: - log.info(f"\nSkipping module {mod_idx}: {mod['name']}") - continue - log.info(f"\n{'='*60}") - log.info(f"Module: {mod['name']}") - log.info(f"{'='*60}") - - for lec in mod["lectures"]: - total += 1 - - if lec.get("download_status") != "complete": - log.warning(f" Skipping (not downloaded): {lec['title']}") - continue - - audio_path = lec["audio_path"] - stem = Path(lec["original_filename"]).stem.replace(" [Audio]", "") - output_base = str(TRANSCRIPTS_DIR / stem) - - # Check if already transcribed - txt_path = Path(f"{output_base}.txt") - if txt_path.exists() and txt_path.stat().st_size > 0: - lec["transcribe_status"] = "complete" - skipped += 1 - log.info(f" Skipping (exists): {stem}.txt") - continue - - log.info(f" Transcribing: {lec['title']}") - log.info(f" File: {audio_path}") - - # Convert MP3 -> WAV 16kHz mono for reliable whisper.cpp input - wav_path = convert_to_wav(audio_path) - - if transcribe_file(wav_path, output_base): - lec["transcribe_status"] = "complete" - transcribed += 1 - else: - lec["transcribe_status"] = "failed" - failed += 1 - - # Save manifest after each file (checkpoint) - save_manifest(manifest) - - # Quality gate: pause after first module - if mod == manifest["modules"][0] and transcribed > 0: - log.info("\n" + "!" 
* 60) - log.info("QUALITY GATE: First module complete.") - log.info("Spot-check 2-3 transcripts in transcripts/ before continuing.") - log.info("Press Enter to continue, or Ctrl+C to abort...") - log.info("!" * 60) - try: - input() - except EOFError: - pass # Non-interactive mode, continue - - # Validation - empty_outputs = [ - lec["title"] - for mod in manifest["modules"] - for lec in mod["lectures"] - if lec.get("transcribe_status") == "complete" - and not Path(lec["transcript_path"]).exists() - ] - - log.info("\n" + "=" * 60) - log.info(f"Transcribed {transcribed}/{total} files, {skipped} skipped, {failed} failures.") - log.info(f"No empty outputs: {'PASS' if not empty_outputs else 'FAIL'}") - if empty_outputs: - for t in empty_outputs: - log.error(f" Missing transcript: {t}") - log.info("=" * 60) - - save_manifest(manifest) - - if failed: - sys.exit(1) - - -if __name__ == "__main__": - main() +""" +Batch transcription using whisper.cpp. +Reads manifest.json, transcribes each audio file in module order, +outputs .txt and .srt files, updates manifest status. +Resumable: skips files with existing transcripts. +Converts MP3 -> WAV (16kHz mono) via ffmpeg before transcription. 
+""" + +import json +import logging +import os +import shutil +import subprocess +import sys +from pathlib import Path + +MANIFEST_PATH = Path("manifest.json") +TRANSCRIPTS_DIR = Path("transcripts") +WAV_CACHE_DIR = Path("audio_wav") + +# whisper.cpp defaults — override with env vars or CLI args +WHISPER_BIN = os.getenv("WHISPER_BIN", r"whisper-cli.exe") +WHISPER_MODEL = os.getenv("WHISPER_MODEL", r"models\ggml-medium-q5_0.bin") + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + handlers=[ + logging.StreamHandler(), + logging.FileHandler("transcribe_errors.log"), + ], +) +log = logging.getLogger(__name__) + + +def find_ffmpeg() -> str: + """Find ffmpeg executable.""" + if shutil.which("ffmpeg"): + return "ffmpeg" + # Check local directories + for p in [Path("ffmpeg.exe"), Path("ffmpeg-bin/ffmpeg.exe")]: + if p.exists(): + return str(p.resolve()) + # Try imageio-ffmpeg (pip fallback) + try: + import imageio_ffmpeg + return imageio_ffmpeg.get_ffmpeg_exe() + except ImportError: + pass + return "" + + +def convert_to_wav(audio_path: str) -> str: + """ + Convert audio file to WAV 16kHz mono (optimal for whisper.cpp). + Returns path to WAV file. Skips if WAV already exists. 
+ """ + src = Path(audio_path) + + # Already a WAV file, skip + if src.suffix.lower() == ".wav": + return audio_path + + WAV_CACHE_DIR.mkdir(exist_ok=True) + wav_path = WAV_CACHE_DIR / (src.stem + ".wav") + + # Skip if already converted + if wav_path.exists() and wav_path.stat().st_size > 0: + log.info(f" WAV cache hit: {wav_path}") + return str(wav_path) + + ffmpeg = find_ffmpeg() + if not ffmpeg: + log.warning(" ffmpeg not found, using original file (may cause bad transcription)") + return audio_path + + log.info(f" Converting to WAV: {src.name} -> {wav_path.name}") + cmd = [ + ffmpeg, + "-i", audio_path, + "-vn", # no video + "-acodec", "pcm_s16le", # 16-bit PCM + "-ar", "16000", # 16kHz sample rate (whisper standard) + "-ac", "1", # mono + "-y", # overwrite + str(wav_path), + ] + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=300, # 5 min max for conversion + ) + if result.returncode != 0: + log.error(f" ffmpeg failed: {result.stderr[:300]}") + return audio_path + + log.info(f" WAV ready: {wav_path.name} ({wav_path.stat().st_size / 1_048_576:.0f} MB)") + return str(wav_path) + + except FileNotFoundError: + log.warning(f" ffmpeg not found at: {ffmpeg}") + return audio_path + except subprocess.TimeoutExpired: + log.error(f" ffmpeg conversion timeout for {audio_path}") + return audio_path + + +def load_manifest() -> dict: + with open(MANIFEST_PATH, encoding="utf-8") as f: + return json.load(f) + + +def save_manifest(manifest: dict): + with open(MANIFEST_PATH, "w", encoding="utf-8") as f: + json.dump(manifest, f, indent=2, ensure_ascii=False) + + +def transcribe_file(audio_path: str, output_base: str) -> bool: + """ + Run whisper.cpp on a single file. + Returns True on success. 
+ """ + cmd = [ + WHISPER_BIN, + "--model", WHISPER_MODEL, + "--language", "ro", + "--no-gpu", + "--threads", str(os.cpu_count() or 4), + "--beam-size", "1", + "--best-of", "1", + "--output-txt", + "--output-srt", + "--output-file", output_base, + "--file", audio_path, + ] + + log.info(f" CMD: {' '.join(cmd)}") + + try: + # Add whisper.exe's directory to PATH so Windows finds its DLLs + env = os.environ.copy() + whisper_dir = str(Path(WHISPER_BIN).resolve().parent) + env["PATH"] = whisper_dir + os.pathsep + env.get("PATH", "") + + result = subprocess.run( + cmd, + stdout=sys.stdout, + stderr=sys.stderr, + timeout=7200, # 2 hour timeout per file + env=env, + ) + + if result.returncode != 0: + log.error(f" whisper.cpp failed (exit {result.returncode})") + return False + + # Verify output exists and is non-empty + txt_path = Path(f"{output_base}.txt") + srt_path = Path(f"{output_base}.srt") + + if not txt_path.exists() or txt_path.stat().st_size == 0: + log.error(f" Empty or missing transcript: {txt_path}") + return False + + log.info(f" Output: {txt_path.name} ({txt_path.stat().st_size} bytes)") + if srt_path.exists(): + log.info(f" Output: {srt_path.name} ({srt_path.stat().st_size} bytes)") + + return True + + except subprocess.TimeoutExpired: + log.error(f" Timeout (>2h) for {audio_path}") + return False + except FileNotFoundError: + log.error(f" whisper.cpp not found at: {WHISPER_BIN}") + log.error(f" Set WHISPER_BIN env var or put whisper-cli.exe in PATH") + return False + except Exception as e: + log.error(f" Error: {e}") + return False + + +def parse_module_filter(arg: str) -> set[int]: + """Parse module filter like '1-3' or '4,5' or '1-3,5' into a set of 1-based indices.""" + result = set() + for part in arg.split(","): + part = part.strip() + if "-" in part: + a, b = part.split("-", 1) + result.update(range(int(a), int(b) + 1)) + else: + result.add(int(part)) + return result + + +def main(): + if not MANIFEST_PATH.exists(): + log.error("manifest.json not 
found. Run download.py first.") + sys.exit(1) + + # Parse --modules filter + module_filter = None + if "--modules" in sys.argv: + idx = sys.argv.index("--modules") + if idx + 1 < len(sys.argv): + module_filter = parse_module_filter(sys.argv[idx + 1]) + log.info(f"Module filter: {sorted(module_filter)}") + + manifest = load_manifest() + TRANSCRIPTS_DIR.mkdir(exist_ok=True) + + total = 0 + transcribed = 0 + skipped = 0 + failed = 0 + + for mod_idx, mod in enumerate(manifest["modules"], 1): + if module_filter and mod_idx not in module_filter: + log.info(f"\nSkipping module {mod_idx}: {mod['name']}") + continue + log.info(f"\n{'='*60}") + log.info(f"Module: {mod['name']}") + log.info(f"{'='*60}") + + for lec in mod["lectures"]: + total += 1 + + if lec.get("download_status") != "complete": + log.warning(f" Skipping (not downloaded): {lec['title']}") + continue + + audio_path = lec["audio_path"] + stem = Path(lec["original_filename"]).stem.replace(" [Audio]", "") + output_base = str(TRANSCRIPTS_DIR / stem) + + # Check if already transcribed + txt_path = Path(f"{output_base}.txt") + if txt_path.exists() and txt_path.stat().st_size > 0: + lec["transcribe_status"] = "complete" + skipped += 1 + log.info(f" Skipping (exists): {stem}.txt") + continue + + log.info(f" Transcribing: {lec['title']}") + log.info(f" File: {audio_path}") + + # Convert MP3 -> WAV 16kHz mono for reliable whisper.cpp input + wav_path = convert_to_wav(audio_path) + + if transcribe_file(wav_path, output_base): + lec["transcribe_status"] = "complete" + transcribed += 1 + else: + lec["transcribe_status"] = "failed" + failed += 1 + + # Save manifest after each file (checkpoint) + save_manifest(manifest) + + # Quality gate: pause after first module + if mod == manifest["modules"][0] and transcribed > 0: + log.info("\n" + "!" 
* 60) + log.info("QUALITY GATE: First module complete.") + log.info("Spot-check 2-3 transcripts in transcripts/ before continuing.") + log.info("Press Enter to continue, or Ctrl+C to abort...") + log.info("!" * 60) + try: + input() + except EOFError: + pass # Non-interactive mode, continue + + # Validation + empty_outputs = [ + lec["title"] + for mod in manifest["modules"] + for lec in mod["lectures"] + if lec.get("transcribe_status") == "complete" + and not Path(lec["transcript_path"]).exists() + ] + + log.info("\n" + "=" * 60) + log.info(f"Transcribed {transcribed}/{total} files, {skipped} skipped, {failed} failures.") + log.info(f"No empty outputs: {'PASS' if not empty_outputs else 'FAIL'}") + if empty_outputs: + for t in empty_outputs: + log.error(f" Missing transcript: {t}") + log.info("=" * 60) + + save_manifest(manifest) + + if failed: + sys.exit(1) + + +if __name__ == "__main__": + main()