chore: normalize line endings from CRLF to LF across all files
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
68
.gitignore
vendored
68
.gitignore
vendored
@@ -1,34 +1,34 @@
|
|||||||
# Audio files
|
# Audio files
|
||||||
audio/
|
audio/
|
||||||
*.mp3
|
*.mp3
|
||||||
*.wav
|
*.wav
|
||||||
|
|
||||||
# Whisper models
|
# Whisper models
|
||||||
models/
|
models/
|
||||||
*.bin
|
*.bin
|
||||||
|
|
||||||
# Credentials
|
# Credentials
|
||||||
.env
|
.env
|
||||||
|
|
||||||
# Transcripts and summaries (large generated content)
|
# Transcripts and summaries (large generated content)
|
||||||
transcripts/
|
transcripts/
|
||||||
summaries/
|
summaries/
|
||||||
|
|
||||||
# Binaries (downloaded by setup_whisper.py)
|
# Binaries (downloaded by setup_whisper.py)
|
||||||
whisper-bin/
|
whisper-bin/
|
||||||
ffmpeg-bin/
|
ffmpeg-bin/
|
||||||
|
|
||||||
# Temp files
|
# Temp files
|
||||||
.whisper_bin_path
|
.whisper_bin_path
|
||||||
.ffmpeg_bin_path
|
.ffmpeg_bin_path
|
||||||
|
|
||||||
# WAV cache (converted from MP3)
|
# WAV cache (converted from MP3)
|
||||||
audio_wav/
|
audio_wav/
|
||||||
|
|
||||||
# Python
|
# Python
|
||||||
__pycache__/
|
__pycache__/
|
||||||
*.pyc
|
*.pyc
|
||||||
.venv/
|
.venv/
|
||||||
|
|
||||||
# Logs
|
# Logs
|
||||||
*.log
|
*.log
|
||||||
|
|||||||
486
PLAN.md
486
PLAN.md
@@ -1,243 +1,243 @@
|
|||||||
# Design: NLP Master Course Audio Pipeline
|
# Design: NLP Master Course Audio Pipeline
|
||||||
|
|
||||||
Generated by /office-hours on 2026-03-23
|
Generated by /office-hours on 2026-03-23
|
||||||
Branch: unknown
|
Branch: unknown
|
||||||
Repo: nlp-master (local, no git)
|
Repo: nlp-master (local, no git)
|
||||||
Status: APPROVED
|
Status: APPROVED
|
||||||
Mode: Builder
|
Mode: Builder
|
||||||
|
|
||||||
## Problem Statement
|
## Problem Statement
|
||||||
|
|
||||||
Marius has an NLP master course hosted at cursuri.aresens.ro/curs/26 with 35 audio recordings (5 modules x 7 lectures, ~95 minutes each, ~58 hours total) in Romanian. The audio is behind a password-protected website. He wants to download all audio files, transcribe them offline using his AMD Radeon RX 6600M 8GB GPU, and generate clean transcripts with per-lecture summaries as study materials.
|
Marius has an NLP master course hosted at cursuri.aresens.ro/curs/26 with 35 audio recordings (5 modules x 7 lectures, ~95 minutes each, ~58 hours total) in Romanian. The audio is behind a password-protected website. He wants to download all audio files, transcribe them offline using his AMD Radeon RX 6600M 8GB GPU, and generate clean transcripts with per-lecture summaries as study materials.
|
||||||
|
|
||||||
## What Makes This Cool
|
## What Makes This Cool
|
||||||
|
|
||||||
58 hours of Romanian lecture audio turned into searchable, summarized study materials — completely automated. Download once, transcribe in roughly a day of unattended GPU time, summarize with Claude Code. A pipeline that would take weeks of manual work happens in hours.
|
58 hours of Romanian lecture audio turned into searchable, summarized study materials — completely automated. Download once, transcribe in roughly a day of unattended GPU time, summarize with Claude Code. A pipeline that would take weeks of manual work happens in hours.
|
||||||
|
|
||||||
## Constraints
|
## Constraints
|
||||||
|
|
||||||
- **Hardware:** AMD Radeon RX 6600M 8GB (RDNA2) — no CUDA, needs Vulkan or ROCm
|
- **Hardware:** AMD Radeon RX 6600M 8GB (RDNA2) — no CUDA, needs Vulkan or ROCm
|
||||||
- **Language:** Romanian audio — Whisper large-v3 has decent but not perfect Romanian support (~95% accuracy on clean audio)
|
- **Language:** Romanian audio — Whisper large-v3 has decent but not perfect Romanian support (~95% accuracy on clean audio)
|
||||||
- **Source:** Password-protected website at cursuri.aresens.ro/curs/26
|
- **Source:** Password-protected website at cursuri.aresens.ro/curs/26
|
||||||
- **Scale:** ~35 MP3 files, ~95 min each, ~58 hours total
|
- **Scale:** ~35 MP3 files, ~95 min each, ~58 hours total
|
||||||
- **Privacy:** Course content is for personal study use only
|
- **Privacy:** Course content is for personal study use only
|
||||||
- **Tooling:** Claude Code available for summary generation (no separate API cost)
|
- **Tooling:** Claude Code available for summary generation (no separate API cost)
|
||||||
- **Platform:** Native Windows (Python + whisper.cpp + Vulkan). Claude Code runs from WSL2 for summaries.
|
- **Platform:** Native Windows (Python + whisper.cpp + Vulkan). Claude Code runs from WSL2 for summaries.
|
||||||
- **Summaries language:** Romanian (matching source material)
|
- **Summaries language:** Romanian (matching source material)
|
||||||
- **Audio format:** MP3, 320kbps, 48kHz stereo, ~218MB per file (verified from sample: "Master 25M1 Z1A [Audio].mp3")
|
- **Audio format:** MP3, 320kbps, 48kHz stereo, ~218MB per file (verified from sample: "Master 25M1 Z1A [Audio].mp3")
|
||||||
|
|
||||||
## Premises
|
## Premises
|
||||||
|
|
||||||
1. Legitimate access to the course — downloading audio for personal study is within usage rights
|
1. Legitimate access to the course — downloading audio for personal study is within usage rights
|
||||||
2. whisper.cpp with Vulkan backend is the right tool for RX 6600M (avoids ROCm compatibility issues on RDNA2)
|
2. whisper.cpp with Vulkan backend is the right tool for RX 6600M (avoids ROCm compatibility issues on RDNA2)
|
||||||
3. Audio quality is decent (recorded lectures) — Whisper large-v3 will produce usable Romanian transcripts
|
3. Audio quality is decent (recorded lectures) — Whisper large-v3 will produce usable Romanian transcripts
|
||||||
4. Summaries will be generated by Claude Code after transcription — separate step
|
4. Summaries will be generated by Claude Code after transcription — separate step
|
||||||
5. Batch pipeline (download all → transcribe all → summarize all) is preferred over incremental processing
|
5. Batch pipeline (download all → transcribe all → summarize all) is preferred over incremental processing
|
||||||
|
|
||||||
## Approaches Considered
|
## Approaches Considered
|
||||||
|
|
||||||
### Approach A: Full Pipeline (CHOSEN)
|
### Approach A: Full Pipeline (CHOSEN)
|
||||||
Python script for website login + MP3 download. Python script (`transcribe.py`) for whisper.cpp batch transcription (Vulkan, large-v3-q5_0). Claude Code for per-lecture summaries from transcripts.
|
Python script for website login + MP3 download. Python script (`transcribe.py`) for whisper.cpp batch transcription (Vulkan, large-v3-q5_0). Claude Code for per-lecture summaries from transcripts.
|
||||||
- Effort: M (human: ~2 days / CC: ~30 min to build, ~12-18 hours to run transcription)
|
- Effort: M (human: ~2 days / CC: ~30 min to build, ~12-18 hours to run transcription)
|
||||||
- Risk: Low
|
- Risk: Low
|
||||||
- Pros: Complete automation, reproducible for module 6, best quality
|
- Pros: Complete automation, reproducible for module 6, best quality
|
||||||
- Cons: whisper.cpp Vulkan build requires system setup
|
- Cons: whisper.cpp Vulkan build requires system setup
|
||||||
|
|
||||||
### Approach B: Download + Transcribe Only
|
### Approach B: Download + Transcribe Only
|
||||||
Same download + transcription, no automated summaries. Simpler but defers the valuable part.
|
Same download + transcription, no automated summaries. Simpler but defers the valuable part.
|
||||||
- Effort: S (human: ~1 day / CC: ~20 min)
|
- Effort: S (human: ~1 day / CC: ~20 min)
|
||||||
- Risk: Low
|
- Risk: Low
|
||||||
|
|
||||||
### Approach C: Fully Offline (Local LLM summaries)
|
### Approach C: Fully Offline (Local LLM summaries)
|
||||||
Everything offline including summaries via llama.cpp. Zero external costs but lower summary quality.
|
Everything offline including summaries via llama.cpp. Zero external costs but lower summary quality.
|
||||||
- Effort: M (human: ~2 days / CC: ~40 min)
|
- Effort: M (human: ~2 days / CC: ~40 min)
|
||||||
- Risk: Medium (8GB VRAM shared between whisper.cpp and llama.cpp)
|
- Risk: Medium (8GB VRAM shared between whisper.cpp and llama.cpp)
|
||||||
|
|
||||||
## Recommended Approach
|
## Recommended Approach
|
||||||
|
|
||||||
**Approach A: Full Pipeline** — Download → whisper.cpp/Vulkan → Claude Code summaries.
|
**Approach A: Full Pipeline** — Download → whisper.cpp/Vulkan → Claude Code summaries.
|
||||||
|
|
||||||
**Execution model:** Everything runs on native Windows (Python, whisper.cpp). Claude Code runs from WSL2 for the summary step.
|
**Execution model:** Everything runs on native Windows (Python, whisper.cpp). Claude Code runs from WSL2 for the summary step.
|
||||||
|
|
||||||
### Step 0: Project Setup
|
### Step 0: Project Setup
|
||||||
- Initialize git repo with `.gitignore` (exclude: `audio/`, `models/`, `.env`, `*.mp3`, `*.wav`, `*.bin`)
|
- Initialize git repo with `.gitignore` (exclude: `audio/`, `models/`, `.env`, `*.mp3`, `*.wav`, `*.bin`)
|
||||||
- Install Python on Windows (if not already)
|
- Install Python on Windows (if not already)
|
||||||
- Install Vulkan SDK on Windows
|
- Install Vulkan SDK on Windows
|
||||||
- Create `.env` with course credentials (never committed)
|
- Create `.env` with course credentials (never committed)
|
||||||
|
|
||||||
### Step 1: Site Recon + Download Audio Files
|
### Step 1: Site Recon + Download Audio Files
|
||||||
- **First:** Browse cursuri.aresens.ro/curs/26 to understand page structure (login form, module layout, MP3 link format)
|
- **First:** Browse cursuri.aresens.ro/curs/26 to understand page structure (login form, module layout, MP3 link format)
|
||||||
- Based on recon, write `download.py` using the right scraping approach (requests+BS4 for static, playwright for JS-rendered — don't build both)
|
- Based on recon, write `download.py` using the right scraping approach (requests+BS4 for static, playwright for JS-rendered — don't build both)
|
||||||
- Login with credentials from `.env` or interactive prompt
|
- Login with credentials from `.env` or interactive prompt
|
||||||
- Discover all modules dynamically (don't hardcode 5x7 — actual count may vary)
|
- Discover all modules dynamically (don't hardcode 5x7 — actual count may vary)
|
||||||
- Preserve original file names (e.g., "Master 25M1 Z1A [Audio].mp3") and extract lecture titles
|
- Preserve original file names (e.g., "Master 25M1 Z1A [Audio].mp3") and extract lecture titles
|
||||||
- Write `manifest.json` mapping each file to: module, lecture title, original URL, file path, download status
|
- Write `manifest.json` mapping each file to: module, lecture title, original URL, file path, download status
|
||||||
- **Resumability:** skip already-downloaded files (check existence + file size). Retry 3x with backoff. Log to `download_errors.log`.
|
- **Resumability:** skip already-downloaded files (check existence + file size). Retry 3x with backoff. Log to `download_errors.log`.
|
||||||
- **Validation:** after download completes, print summary: "Downloaded X/Y files, Z failures. All files > 1MB: pass/fail."
|
- **Validation:** after download completes, print summary: "Downloaded X/Y files, Z failures. All files > 1MB: pass/fail."
|
||||||
|
|
||||||
### Step 2: Install whisper.cpp with Vulkan (Windows native)
|
### Step 2: Install whisper.cpp with Vulkan (Windows native)
|
||||||
- Option A: Download pre-built Windows binary with Vulkan from [whisper.cpp-windows-vulkan-bin](https://github.com/jerryshell/whisper.cpp-windows-vulkan-bin)
|
- Option A: Download pre-built Windows binary with Vulkan from [whisper.cpp-windows-vulkan-bin](https://github.com/jerryshell/whisper.cpp-windows-vulkan-bin)
|
||||||
- Option B: Build from source with Visual Studio + `-DGGML_VULKAN=1` CMake flag
|
- Option B: Build from source with Visual Studio + `-DGGML_VULKAN=1` CMake flag
|
||||||
- Download model: `ggml-large-v3-q5_0.bin` (~1.5GB) from Hugging Face into `models/`
|
- Download model: `ggml-large-v3-q5_0.bin` (~1.5GB) from Hugging Face into `models/`
|
||||||
- **VRAM test:** transcribe a 2-min clip from the first lecture to verify GPU detection, measure speed, and validate MP3 input works. If MP3 fails (whisper.cpp built without ffmpeg libs), install ffmpeg or pre-convert with Python pydub.
|
- **VRAM test:** transcribe a 2-min clip from the first lecture to verify GPU detection, measure speed, and validate MP3 input works. If MP3 fails (whisper.cpp built without ffmpeg libs), install ffmpeg or pre-convert with Python pydub.
|
||||||
- **Speed calibration:** RX 6600M is roughly half the speed of RX 9070 XT. Realistic estimate: **3-5x realtime** (~18-30 min per 90-min file). Total: **~12-18 hours** for all files. Plan for a full day, not overnight.
|
- **Speed calibration:** RX 6600M is roughly half the speed of RX 9070 XT. Realistic estimate: **3-5x realtime** (~18-30 min per 90-min file). Total: **~12-18 hours** for all files. Plan for a full day, not overnight.
|
||||||
- **Fallback:** if large-v3-q5_0 OOMs on 8GB, try `ggml-large-v3-q4_0.bin` or `ggml-medium-q5_0.bin`.
|
- **Fallback:** if large-v3-q5_0 OOMs on 8GB, try `ggml-large-v3-q4_0.bin` or `ggml-medium-q5_0.bin`.
|
||||||
|
|
||||||
### Step 3: Batch Transcription
|
### Step 3: Batch Transcription
|
||||||
- `transcribe.py` (Python, cross-platform) reads `manifest.json`, processes files in module order
|
- `transcribe.py` (Python, cross-platform) reads `manifest.json`, processes files in module order
|
||||||
- Calls whisper.cpp with: `--language ro --model models\ggml-large-v3-q5_0.bin --output-txt --output-srt`
|
- Calls whisper.cpp with: `--language ro --model models\ggml-large-v3-q5_0.bin --output-txt --output-srt`
|
||||||
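- Output .txt and .srt per file directly into `transcripts/`, named after the lecture title (e.g. `transcripts/Master 25M1 Z1A.txt` and `transcripts/Master 25M1 Z1A.srt`)
|
- Output .txt and .srt per file directly into `transcripts/`, named after the lecture title (e.g. `transcripts/Master 25M1 Z1A.txt` and `transcripts/Master 25M1 Z1A.srt`)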
|
||||||
- Updates `manifest.json` with transcription status per file
|
- Updates `manifest.json` with transcription status per file
|
||||||
- **Resumability:** skip files with existing .txt output. Log failures to `transcribe_errors.log`.
|
- **Resumability:** skip files with existing .txt output. Log failures to `transcribe_errors.log`.
|
||||||
- **Quality gate:** after first module completes (~2-3.5 hours at the estimated 3-5x realtime), STOP and spot-check 2-3 transcripts. If Romanian accuracy is poor (lots of garbled text), consider: switching to `large-v3` unquantized, adjusting `--beam-size`, or accepting lower quality.
|
- **Quality gate:** after first module completes (~2-3.5 hours at the estimated 3-5x realtime), STOP and spot-check 2-3 transcripts. If Romanian accuracy is poor (lots of garbled text), consider: switching to `large-v3` unquantized, adjusting `--beam-size`, or accepting lower quality.
|
||||||
- **Validation:** print summary: "Transcribed X/Y files. Z failures. No empty outputs: pass/fail."
|
- **Validation:** print summary: "Transcribed X/Y files. Z failures. No empty outputs: pass/fail."
|
||||||
|
|
||||||
### Step 4: Summary Generation with Claude Code
|
### Step 4: Summary Generation with Claude Code
|
||||||
- From WSL2, use Claude Code to process each transcript
|
- From WSL2, use Claude Code to process each transcript
|
||||||
- Use a Python script (`summarize.py`) that reads `manifest.json`, opens each .txt file, and prints the summary prompt for Claude Code
|
- Use a Python script (`summarize.py`) that reads `manifest.json`, opens each .txt file, and prints the summary prompt for Claude Code
|
||||||
- Summary prompt (Romanian): "Rezuma aceasta transcriere. Ofera: (1) prezentare generala in 3-5 propozitii, (2) concepte cheie cu definitii, (3) detalii si exemple importante"
|
- Summary prompt (Romanian): "Rezuma aceasta transcriere. Ofera: (1) prezentare generala in 3-5 propozitii, (2) concepte cheie cu definitii, (3) detalii si exemple importante"
|
||||||
- **Chunking:** split transcripts > 10K words at sentence boundaries (not raw word count) with 500-word overlap. Summarize chunks, then merge.
|
- **Chunking:** split transcripts > 10K words at sentence boundaries (not raw word count) with 500-word overlap. Summarize chunks, then merge.
|
||||||
- Output to `summaries/{original_name}_summary.md`
|
- Output to `summaries/{original_name}_summary.md`
|
||||||
- Final: compile `SUPORT_CURS.md` — master study guide with lecture titles as headings
|
- Final: compile `SUPORT_CURS.md` — master study guide with lecture titles as headings
|
||||||
|
|
||||||
### Manifest Schema
|
### Manifest Schema
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"course": "NLP Master 2025",
|
"course": "NLP Master 2025",
|
||||||
"source_url": "https://cursuri.aresens.ro/curs/26",
|
"source_url": "https://cursuri.aresens.ro/curs/26",
|
||||||
"modules": [
|
"modules": [
|
||||||
{
|
{
|
||||||
"name": "Modul 1",
|
"name": "Modul 1",
|
||||||
"lectures": [
|
"lectures": [
|
||||||
{
|
{
|
||||||
"title": "Master 25M1 Z1A",
|
"title": "Master 25M1 Z1A",
|
||||||
"original_filename": "Master 25M1 Z1A [Audio].mp3",
|
"original_filename": "Master 25M1 Z1A [Audio].mp3",
|
||||||
"url": "https://...",
|
"url": "https://...",
|
||||||
"audio_path": "audio/Master 25M1 Z1A [Audio].mp3",
|
"audio_path": "audio/Master 25M1 Z1A [Audio].mp3",
|
||||||
"transcript_path": "transcripts/Master 25M1 Z1A.txt",
|
"transcript_path": "transcripts/Master 25M1 Z1A.txt",
|
||||||
"srt_path": "transcripts/Master 25M1 Z1A.srt",
|
"srt_path": "transcripts/Master 25M1 Z1A.srt",
|
||||||
"summary_path": "summaries/Master 25M1 Z1A_summary.md",
|
"summary_path": "summaries/Master 25M1 Z1A_summary.md",
|
||||||
"download_status": "complete",
|
"download_status": "complete",
|
||||||
"transcribe_status": "pending",
|
"transcribe_status": "pending",
|
||||||
"file_size_bytes": 228486429
|
"file_size_bytes": 228486429
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
### Directory Structure
|
### Directory Structure
|
||||||
```
|
```
|
||||||
nlp-master/
|
nlp-master/
|
||||||
.gitignore # Excludes audio/, models/, .env
|
.gitignore # Excludes audio/, models/, .env
|
||||||
.env # Course credentials (not committed)
|
.env # Course credentials (not committed)
|
||||||
manifest.json # Shared metadata for all scripts
|
manifest.json # Shared metadata for all scripts
|
||||||
download.py # Step 1: site recon + download
|
download.py # Step 1: site recon + download
|
||||||
transcribe.py # Step 3: batch transcription
|
transcribe.py # Step 3: batch transcription
|
||||||
summarize.py # Step 4: summary generation helper
|
summarize.py # Step 4: summary generation helper
|
||||||
audio/
|
audio/
|
||||||
Master 25M1 Z1A [Audio].mp3
|
Master 25M1 Z1A [Audio].mp3
|
||||||
Master 25M1 Z1B [Audio].mp3
|
Master 25M1 Z1B [Audio].mp3
|
||||||
...
|
...
|
||||||
models/
|
models/
|
||||||
ggml-large-v3-q5_0.bin
|
ggml-large-v3-q5_0.bin
|
||||||
transcripts/
|
transcripts/
|
||||||
Master 25M1 Z1A.txt
|
Master 25M1 Z1A.txt
|
||||||
Master 25M1 Z1A.srt
|
Master 25M1 Z1A.srt
|
||||||
...
|
...
|
||||||
summaries/
|
summaries/
|
||||||
Master 25M1 Z1A_summary.md
|
Master 25M1 Z1A_summary.md
|
||||||
...
|
...
|
||||||
SUPORT_CURS.md
|
SUPORT_CURS.md
|
||||||
```
|
```
|
||||||
|
|
||||||
## Open Questions
|
## Open Questions
|
||||||
|
|
||||||
1. ~~What is the exact website structure?~~ Resolved: browse site first in Step 1.
|
1. ~~What is the exact website structure?~~ Resolved: browse site first in Step 1.
|
||||||
2. ~~Are there lecture titles on the website?~~ Resolved: preserve original names + extract titles.
|
2. ~~Are there lecture titles on the website?~~ Resolved: preserve original names + extract titles.
|
||||||
3. ~~Do you want the summaries in Romanian or English?~~ Resolved: Romanian.
|
3. ~~Do you want the summaries in Romanian or English?~~ Resolved: Romanian.
|
||||||
4. Should the master study guide (SUPORT_CURS.md) include the full transcripts or just summaries?
|
4. Should the master study guide (SUPORT_CURS.md) include the full transcripts or just summaries?
|
||||||
5. Is there a 6th module coming? If so, the pipeline should be easily re-runnable.
|
5. Is there a 6th module coming? If so, the pipeline should be easily re-runnable.
|
||||||
6. Does whisper.cpp Windows binary support MP3 input natively? (Validated in Step 2 VRAM test)
|
6. Does whisper.cpp Windows binary support MP3 input natively? (Validated in Step 2 VRAM test)
|
||||||
|
|
||||||
## Success Criteria
|
## Success Criteria
|
||||||
|
|
||||||
- All ~35 MP3 files downloaded and organized by module
|
- All ~35 MP3 files downloaded and organized by module
|
||||||
- All files transcribed to .txt and .srt with >90% accuracy
|
- All files transcribed to .txt and .srt with >90% accuracy
|
||||||
- Per-lecture summaries generated with key concepts extracted
|
- Per-lecture summaries generated with key concepts extracted
|
||||||
- Master study guide (SUPORT_CURS.md) ready for reading/searching
|
- Master study guide (SUPORT_CURS.md) ready for reading/searching
|
||||||
- Pipeline is re-runnable for module 6 when it arrives
|
- Pipeline is re-runnable for module 6 when it arrives
|
||||||
|
|
||||||
## Next Steps
|
## Next Steps
|
||||||
|
|
||||||
1. **git init + .gitignore** — set up project, exclude audio/models/.env (~2 min)
|
1. **git init + .gitignore** — set up project, exclude audio/models/.env (~2 min)
|
||||||
2. **Browse cursuri.aresens.ro** — understand site structure before coding (~10 min)
|
2. **Browse cursuri.aresens.ro** — understand site structure before coding (~10 min)
|
||||||
3. **Build download.py** — login + scrape + download + manifest.json (~15 min with CC)
|
3. **Build download.py** — login + scrape + download + manifest.json (~15 min with CC)
|
||||||
4. **Install whisper.cpp on Windows** — pre-built binary or build from source + Vulkan SDK (~15 min)
|
4. **Install whisper.cpp on Windows** — pre-built binary or build from source + Vulkan SDK (~15 min)
|
||||||
5. **Download whisper model** — large-v3-q5_0 from Hugging Face (~5 min)
|
5. **Download whisper model** — large-v3-q5_0 from Hugging Face (~5 min)
|
||||||
6. **Test transcription** — 2-min clip, validate GPU, calibrate speed, check MP3 support (~5 min)
|
6. **Test transcription** — 2-min clip, validate GPU, calibrate speed, check MP3 support (~5 min)
|
||||||
7. **Build transcribe.py** — reads manifest, processes in module order, updates status (~10 min with CC)
|
7. **Build transcribe.py** — reads manifest, processes in module order, updates status (~10 min with CC)
|
||||||
8. **Run batch transcription** — ~12-18 hours (leave running during workday)
|
8. **Run batch transcription** — ~12-18 hours (leave running during workday)
|
||||||
9. **Spot-check quality** — review 2-3 transcripts after Module 1 completes
|
9. **Spot-check quality** — review 2-3 transcripts after Module 1 completes
|
||||||
10. **Generate summaries with Claude Code** — via summarize.py helper (~30 min)
|
10. **Generate summaries with Claude Code** — via summarize.py helper (~30 min)
|
||||||
11. **Compile SUPORT_CURS.md** — master study guide (~10 min)
|
11. **Compile SUPORT_CURS.md** — master study guide (~10 min)
|
||||||
|
|
||||||
## NOT in scope
|
## NOT in scope
|
||||||
- Building a web UI or search interface for transcripts — just flat files
|
- Building a web UI or search interface for transcripts — just flat files
|
||||||
- Automated quality scoring of transcriptions — manual spot-check is sufficient
|
- Automated quality scoring of transcriptions — manual spot-check is sufficient
|
||||||
- Speaker diarization (identifying different speakers) — single lecturer
|
- Speaker diarization (identifying different speakers) — single lecturer
|
||||||
- Translation to English — summaries stay in Romanian
|
- Translation to English — summaries stay in Romanian
|
||||||
- CI/CD or deployment — this is a local personal pipeline
|
- CI/CD or deployment — this is a local personal pipeline
|
||||||
|
|
||||||
## What already exists
|
## What already exists
|
||||||
- Nothing — greenfield project. No existing code to reuse.
|
- Nothing — greenfield project. No existing code to reuse.
|
||||||
- The one existing file (`Master 25M1 Z1A [Audio].mp3`) confirms the naming pattern and audio specs.
|
- The one existing file (`Master 25M1 Z1A [Audio].mp3`) confirms the naming pattern and audio specs.
|
||||||
|
|
||||||
## Failure Modes
|
## Failure Modes
|
||||||
```
|
```
|
||||||
FAILURE MODE | TEST? | HANDLING? | SILENT?
|
FAILURE MODE | TEST? | HANDLING? | SILENT?
|
||||||
================================|=======|===========|========
|
================================|=======|===========|========
|
||||||
Session expires during download | No | Yes (retry)| No — logged
|
Session expires during download | No | Yes (retry)| No — logged
|
||||||
MP3 truncated (network drop) | Yes* | Yes (size) | No — validation
|
MP3 truncated (network drop) | Yes* | Yes (size) | No — validation
|
||||||
whisper.cpp OOM on large model | No | Yes (fallback)| No — logged
|
whisper.cpp OOM on large model | No | Yes (fallback)| No — logged
|
||||||
whisper.cpp can't read MP3 | No | No** | Yes — CRITICAL
|
whisper.cpp can't read MP3 | No | No** | Yes — CRITICAL
|
||||||
Empty transcript output | Yes* | Yes (log) | No — validation
|
Empty transcript output | Yes* | Yes (log) | No — validation
|
||||||
Poor Romanian accuracy | No | Yes (gate)| No — spot-check
|
Poor Romanian accuracy | No | Yes (gate)| No — spot-check
|
||||||
Claude Code input too large | No | Yes (chunk)| No — script handles
|
Claude Code input too large | No | Yes (chunk)| No — script handles
|
||||||
manifest.json corruption | No | No | Yes — low risk
|
manifest.json corruption | No | No | Yes — low risk
|
||||||
|
|
||||||
* = covered by inline validation checks
|
* = covered by inline validation checks
|
||||||
** = validated in Step 2 test; if fails, install ffmpeg or use pydub
|
** = validated in Step 2 test; if fails, install ffmpeg or use pydub
|
||||||
```
|
```
|
||||||
**Critical gap:** whisper.cpp MP3 support must be validated in Step 2. If it fails silently (produces garbage), the entire batch is wasted.
|
**Critical gap:** whisper.cpp MP3 support must be validated in Step 2. If it fails silently (produces garbage), the entire batch is wasted.
|
||||||
|
|
||||||
## Eng Review Decisions (2026-03-24)
|
## Eng Review Decisions (2026-03-24)
|
||||||
1. Hybrid platform → **All Windows Python** (not WSL2 for scripts)
|
1. Hybrid platform → **All Windows Python** (not WSL2 for scripts)
|
||||||
2. Browse site first → build the right scraper, not two fallback paths
|
2. Browse site first → build the right scraper, not two fallback paths
|
||||||
3. Preserve original file names + extract lecture titles
|
3. Preserve original file names + extract lecture titles
|
||||||
4. Add manifest.json as shared metadata between scripts
|
4. Add manifest.json as shared metadata between scripts
|
||||||
5. Python for all scripts (download.py, transcribe.py, summarize.py)
|
5. Python for all scripts (download.py, transcribe.py, summarize.py)
|
||||||
6. Built-in validation checks in each script
|
6. Built-in validation checks in each script
|
||||||
7. Feed MP3s directly (no pre-convert)
|
7. Feed MP3s directly (no pre-convert)
|
||||||
8. Process in module order
|
8. Process in module order
|
||||||
9. Realistic transcription estimate: 12-18 hours (not 7-8)
|
9. Realistic transcription estimate: 12-18 hours (not 7-8)
|
||||||
|
|
||||||
## What I noticed about how you think
|
## What I noticed about how you think
|
||||||
|
|
||||||
- You said "vreau offline transcription + claude code pentru summaries" — you immediately found the pragmatic middle path between fully offline and fully API-dependent. That's good engineering instinct: use the best tool for each step rather than forcing one tool to do everything.
|
- You said "vreau offline transcription + claude code pentru summaries" — you immediately found the pragmatic middle path between fully offline and fully API-dependent. That's good engineering instinct: use the best tool for each step rather than forcing one tool to do everything.
|
||||||
- You gave concrete numbers upfront: "5 module din 6, fiecare cu 7 audio-uri" and "90-100 minute" — you'd already scoped the problem before sitting down. That's not how most people start; most people say "I have some audio files."
|
- You gave concrete numbers upfront: "5 module din 6, fiecare cu 7 audio-uri" and "90-100 minute" — you'd already scoped the problem before sitting down. That's not how most people start; most people say "I have some audio files."
|
||||||
- You chose "transcripts + summaries" over "just transcripts" or "full study system" — you know what's useful without over-engineering.
|
- You chose "transcripts + summaries" over "just transcripts" or "full study system" — you know what's useful without over-engineering.
|
||||||
|
|
||||||
## GSTACK REVIEW REPORT
|
## GSTACK REVIEW REPORT
|
||||||
|
|
||||||
| Review | Trigger | Why | Runs | Status | Findings |
|
| Review | Trigger | Why | Runs | Status | Findings |
|
||||||
|--------|---------|-----|------|--------|----------|
|
|--------|---------|-----|------|--------|----------|
|
||||||
| CEO Review | `/plan-ceo-review` | Scope & strategy | 0 | — | — |
|
| CEO Review | `/plan-ceo-review` | Scope & strategy | 0 | — | — |
|
||||||
| Codex Review | `/codex review` | Independent 2nd opinion | 0 | — | — |
|
| Codex Review | `/codex review` | Independent 2nd opinion | 0 | — | — |
|
||||||
| Eng Review | `/plan-eng-review` | Architecture & tests (required) | 1 | CLEAR (PLAN) | 8 issues, 0 critical gaps |
|
| Eng Review | `/plan-eng-review` | Architecture & tests (required) | 1 | CLEAR (PLAN) | 8 issues, 0 critical gaps |
|
||||||
| Design Review | `/plan-design-review` | UI/UX gaps | 0 | — | — |
|
| Design Review | `/plan-design-review` | UI/UX gaps | 0 | — | — |
|
||||||
|
|
||||||
- **OUTSIDE VOICE:** Claude subagent ran — 10 findings, 3 cross-model tensions resolved (platform execution, speed estimate, module order)
|
- **OUTSIDE VOICE:** Claude subagent ran — 10 findings, 3 cross-model tensions resolved (platform execution, speed estimate, module order)
|
||||||
- **UNRESOLVED:** 0
|
- **UNRESOLVED:** 0
|
||||||
- **VERDICT:** ENG CLEARED — ready to implement
|
- **VERDICT:** ENG CLEARED — ready to implement
|
||||||
|
|||||||
16
TODOS.md
16
TODOS.md
@@ -1,8 +1,8 @@
|
|||||||
# TODOS
|
# TODOS
|
||||||
|
|
||||||
## Re-run pipeline for Module 6
|
## Re-run pipeline for Module 6
|
||||||
- **What:** Re-run `download.py` when module 6 becomes available on cursuri.aresens.ro/curs/26
|
- **What:** Re-run `download.py` when module 6 becomes available on cursuri.aresens.ro/curs/26
|
||||||
- **Why:** Course has 6 modules total, only 5 are currently available. Pipeline is designed to be re-runnable — manifest.json + resumability means it discovers new modules and skips already-downloaded files.
|
- **Why:** Course has 6 modules total, only 5 are currently available. Pipeline is designed to be re-runnable — manifest.json + resumability means it discovers new modules and skips already-downloaded files.
|
||||||
- **How:** Run `python download.py` → check manifest for new files → run `python transcribe.py` → generate summaries → update SUPORT_CURS.md
|
- **How:** Run `python download.py` → check manifest for new files → run `python transcribe.py` → generate summaries → update SUPORT_CURS.md
|
||||||
- **Depends on:** Course provider publishing module 6
|
- **Depends on:** Course provider publishing module 6
|
||||||
- **Added:** 2026-03-24
|
- **Added:** 2026-03-24
|
||||||
|
|||||||
506
download.py
506
download.py
@@ -1,253 +1,253 @@
|
|||||||
"""
|
"""
|
||||||
Download all audio files from cursuri.aresens.ro NLP Master course.
|
Download all audio files from cursuri.aresens.ro NLP Master course.
|
||||||
Logs in, discovers modules and lectures, downloads MP3s, writes manifest.json.
|
Logs in, discovers modules and lectures, downloads MP3s, writes manifest.json.
|
||||||
Resumable: skips already-downloaded files.
|
Resumable: skips already-downloaded files.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
# Site endpoints for the course being mirrored.
BASE_URL = "https://cursuri.aresens.ro"
COURSE_URL = f"{BASE_URL}/curs/26"
LOGIN_URL = f"{BASE_URL}/login"

# Local outputs; both paths are relative to the current working directory.
AUDIO_DIR = Path("audio")
MANIFEST_PATH = Path("manifest.json")

# Retry policy for downloads: number of attempts, and the seconds to wait
# after each failed attempt (indexed by attempt number).
MAX_RETRIES = 3
RETRY_BACKOFF = [5, 15, 30]

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        # Log lines include file and module names scraped from the site.
        # Pin UTF-8 so non-ASCII characters are written the same way
        # regardless of the platform's default text encoding.
        logging.FileHandler("download_errors.log", encoding="utf-8"),
    ],
)
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def login(session: requests.Session, email: str, password: str) -> bool:
    """Submit the login form and report whether the site accepted it.

    Args:
        session: Session that will carry the authentication cookies for
            all later requests.
        email: Account e-mail address.
        password: Account password.

    Returns:
        True when the final response (after redirects) is a successful
        HTTP response that is neither the login URL nor a page containing
        the login form; False otherwise.

    Raises:
        requests.RequestException: On connection errors or when the server
            does not answer within the timeout.
    """
    resp = session.post(
        LOGIN_URL,
        data={
            "email": email,
            "password": password,
            "act": "login",
            "remember": "on",
        },
        allow_redirects=True,
        # Without a timeout a stalled server blocks the script forever.
        timeout=60,
    )
    # An HTTP error page does not contain the login form either, so it
    # must not be mistaken for a successful login.
    if not resp.ok:
        return False
    # Successful login redirects to the course page, not back to /login
    return "/login" not in resp.url and "loginform" not in resp.text
|
||||||
|
|
||||||
|
|
||||||
def discover_modules(session: requests.Session) -> list[dict]:
    """Fetch the course page and list the modules linked from it.

    Args:
        session: Authenticated session.

    Returns:
        One dict per ``div.module`` element that has both a number label
        and a link button, with keys ``name`` (label text), ``url``
        (link resolved against BASE_URL) and ``module_id`` (last path
        segment of the link).

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
    """
    # Bounded wait so a stalled server cannot hang discovery indefinitely.
    resp = session.get(COURSE_URL, timeout=60)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    modules = []
    for div in soup.select("div.module"):
        number_el = div.select_one("div.module__number")
        link_el = div.select_one("a.btn")
        if number_el is None or link_el is None:
            continue
        href = (link_el.get("href") or "").strip()
        if not href:
            # A button without a target would otherwise produce a module
            # with an empty id that points at the site root.
            continue
        modules.append({
            "name": number_el.get_text(strip=True),
            "url": urljoin(BASE_URL, href),
            "module_id": href.rstrip("/").split("/")[-1],
        })
    log.info(f"Found {len(modules)} modules")
    return modules
|
||||||
|
|
||||||
|
|
||||||
def discover_lectures(session: requests.Session, module: dict) -> list[dict]:
    """Fetch a module page and list the lectures that have an audio source.

    Args:
        session: Authenticated session.
        module: Module dict as produced by ``discover_modules``; its
            ``url`` and ``name`` keys are read.

    Returns:
        One dict per ``div.lesson`` element that has a title and a
        non-empty ``<audio><source src=...>``, with keys ``title``,
        ``original_filename``, ``url`` and ``audio_path`` (the local
        destination under AUDIO_DIR, as a string).

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
    """
    # Bounded wait so a stalled server cannot hang discovery indefinitely.
    resp = session.get(module["url"], timeout=60)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    lectures = []
    for lesson_div in soup.select("div.lesson"):
        name_el = lesson_div.select_one("div.module__name")
        source_el = lesson_div.select_one("audio source")
        if name_el is None or source_el is None:
            continue
        src = (source_el.get("src") or "").strip()
        if not src:
            continue
        filename = src.split("/")[-1]
        if not filename:
            # A src ending in "/" has no file name; AUDIO_DIR / "" would
            # be the audio directory itself, not a downloadable file.
            continue
        lectures.append({
            "title": name_el.get_text(strip=True),
            "original_filename": filename,
            "url": urljoin(BASE_URL, src),
            "audio_path": str(AUDIO_DIR / filename),
        })
    log.info(f"  {module['name']}: {len(lectures)} lectures")
    return lectures
|
||||||
|
|
||||||
|
|
||||||
def download_file(session: requests.Session, url: str, dest: Path) -> bool:
    """Download ``url`` to ``dest``, retrying on errors.

    The body is streamed into a sibling ``<name>.tmp`` file and moved over
    ``dest`` only once complete, so ``dest`` never holds a partial file.

    Args:
        session: Authenticated session.
        url: Absolute URL of the file.
        dest: Final path of the downloaded file.

    Returns:
        True when the file was downloaded and moved into place. False when
        all MAX_RETRIES attempts raised, or when the server returned fewer
        than 1,000,000 bytes (treated as a bad file and not retried).
    """
    # Append ".tmp" instead of swapping the suffix, so two destinations
    # that differ only in extension never share a temp file.
    tmp = dest.with_name(dest.name + ".tmp")

    for attempt in range(MAX_RETRIES):
        try:
            total = 0
            # The context manager releases the streamed connection even
            # when the transfer is interrupted.
            with session.get(url, stream=True, timeout=300) as resp:
                resp.raise_for_status()
                with open(tmp, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
                        total += len(chunk)

            if total < 1_000_000:  # < 1MB is suspicious
                log.warning(f"File too small ({total} bytes): {dest.name}")
                tmp.unlink(missing_ok=True)
                return False

            # replace() overwrites an existing dest on every platform;
            # rename() raises on Windows when dest already exists, which
            # happens when an undersized earlier file is re-downloaded.
            tmp.replace(dest)
            log.info(f"  Downloaded: {dest.name} ({total / 1_000_000:.1f} MB)")
            return True

        except Exception as e:
            # Deliberately broad: this is the retry boundary, and every
            # failure is logged below rather than swallowed.
            try:
                tmp.unlink(missing_ok=True)
            except OSError:
                pass  # best-effort cleanup; the next attempt truncates it
            log.warning(f"  Attempt {attempt + 1}/{MAX_RETRIES} failed for {dest.name}: {e}")
            if attempt < MAX_RETRIES - 1:
                wait = RETRY_BACKOFF[attempt] if attempt < len(RETRY_BACKOFF) else 30
                log.info(f"  Retrying in {wait}s...")
                time.sleep(wait)

    log.error(f"  FAILED after {MAX_RETRIES} attempts: {dest.name}")
    return False
|
||||||
|
|
||||||
|
|
||||||
def load_manifest() -> dict | None:
    """Load the manifest from MANIFEST_PATH.

    Returns:
        The parsed JSON content, or None when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file is not valid JSON.
        OSError: If the file exists but cannot be read.
    """
    try:
        # save_manifest writes UTF-8 with ensure_ascii=False, so the file
        # must be read back as UTF-8 rather than the locale's default.
        with open(MANIFEST_PATH, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return None
|
||||||
|
|
||||||
|
|
||||||
def save_manifest(manifest: dict) -> None:
    """Write ``manifest`` to MANIFEST_PATH as indented UTF-8 JSON.

    The data is written to a sibling ``<name>.tmp`` file first and then
    moved over the real manifest, so an interruption mid-write cannot
    leave a truncated manifest behind.

    Args:
        manifest: JSON-serializable manifest structure.
    """
    tmp = MANIFEST_PATH.with_name(MANIFEST_PATH.name + ".tmp")
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)
    # replace() overwrites the existing manifest on every platform.
    tmp.replace(MANIFEST_PATH)
|
||||||
|
|
||||||
|
|
||||||
def _previous_transcribe_status() -> dict[str, str]:
    """Map audio_path -> transcribe_status from the manifest on disk, if any."""
    try:
        previous = load_manifest()
    except (OSError, ValueError) as e:
        # An unreadable or corrupt manifest must not block a fresh download.
        log.warning(f"Ignoring unreadable {MANIFEST_PATH}: {e}")
        return {}

    statuses: dict[str, str] = {}
    if not isinstance(previous, dict):
        return statuses
    for mod in previous.get("modules") or []:
        if not isinstance(mod, dict):
            continue
        for lec in mod.get("lectures") or []:
            if not isinstance(lec, dict):
                continue
            path = lec.get("audio_path")
            status = lec.get("transcribe_status")
            if isinstance(path, str) and isinstance(status, str):
                statuses[path] = status
    return statuses


def main():
    """Log in, discover all lectures, download missing audio, write the manifest.

    Credentials are read from the COURSE_USERNAME / COURSE_PASSWORD
    environment variables (loaded from .env). Files already present and
    larger than 1 MB are skipped. The manifest is saved after every module
    as a checkpoint.

    Exits with status 1 when credentials are missing, login fails, no
    modules are found, any download fails, or the final size validation
    fails.
    """
    load_dotenv()
    email = os.getenv("COURSE_USERNAME", "")
    password = os.getenv("COURSE_PASSWORD", "")
    if not email or not password:
        log.error("Set COURSE_USERNAME and COURSE_PASSWORD in .env")
        sys.exit(1)

    AUDIO_DIR.mkdir(exist_ok=True)

    session = requests.Session()
    session.headers.update({"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"})

    log.info("Logging in...")
    if not login(session, email, password):
        log.error("Login failed. Check credentials in .env")
        sys.exit(1)
    log.info("Login successful")

    # Discover structure
    modules = discover_modules(session)
    if not modules:
        log.error("No modules found")
        sys.exit(1)

    # The manifest is rebuilt from scratch below. Read the old statuses
    # first so a re-run does not reset them to "pending".
    # NOTE(review): presumably the transcription step records its progress
    # in transcribe_status; confirm against transcribe.py.
    previous_status = _previous_transcribe_status()

    manifest = {
        "course": "NLP Master Practitioner Bucuresti 2025",
        "source_url": COURSE_URL,
        "modules": [],
    }

    total_files = 0
    downloaded = 0
    skipped = 0
    failed = 0

    for mod in modules:
        lectures = discover_lectures(session, mod)
        module_entry = {
            "name": mod["name"],
            "module_id": mod["module_id"],
            "lectures": [],
        }

        for lec in lectures:
            total_files += 1
            dest = Path(lec["audio_path"])
            # Derived output names drop the " [Audio]" marker from the stem.
            stem = dest.stem.replace(" [Audio]", "")

            lecture_entry = {
                "title": lec["title"],
                "original_filename": lec["original_filename"],
                "url": lec["url"],
                "audio_path": lec["audio_path"],
                "transcript_path": f"transcripts/{stem}.txt",
                "srt_path": f"transcripts/{stem}.srt",
                "summary_path": f"summaries/{stem}_summary.md",
                "download_status": "pending",
                "transcribe_status": previous_status.get(lec["audio_path"], "pending"),
                "file_size_bytes": 0,
            }

            # Skip if already downloaded
            existing_size = dest.stat().st_size if dest.exists() else 0
            if existing_size > 1_000_000:
                lecture_entry["download_status"] = "complete"
                lecture_entry["file_size_bytes"] = existing_size
                skipped += 1
                log.info(f"  Skipping (exists): {dest.name}")
            elif download_file(session, lec["url"], dest):
                lecture_entry["download_status"] = "complete"
                lecture_entry["file_size_bytes"] = dest.stat().st_size
                downloaded += 1
            else:
                lecture_entry["download_status"] = "failed"
                failed += 1

            module_entry["lectures"].append(lecture_entry)

        manifest["modules"].append(module_entry)
        # Save manifest after each module (checkpoint)
        save_manifest(manifest)

    # Final validation: every file recorded as complete must still exist
    # on disk and be larger than 1 MB.
    all_ok = all(
        Path(lec["audio_path"]).exists() and Path(lec["audio_path"]).stat().st_size > 1_000_000
        for mod in manifest["modules"]
        for lec in mod["lectures"]
        if lec["download_status"] == "complete"
    )

    log.info("=" * 60)
    log.info(f"Downloaded {downloaded}/{total_files} files, {skipped} skipped, {failed} failures.")
    log.info(f"All files > 1MB: {'PASS' if all_ok else 'FAIL'}")
    log.info("=" * 60)

    # A failed validation is reported through the exit code as well,
    # not only in the log.
    if failed or not all_ok:
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||||
|
|||||||
1064
manifest.json
1064
manifest.json
File diff suppressed because it is too large
Load Diff
@@ -1,3 +1,3 @@
|
|||||||
requests
|
requests
|
||||||
beautifulsoup4
|
beautifulsoup4
|
||||||
python-dotenv
|
python-dotenv
|
||||||
|
|||||||
626
run.bat
626
run.bat
@@ -1,313 +1,313 @@
|
|||||||
@echo off
|
@echo off
|
||||||
setlocal enabledelayedexpansion
|
setlocal enabledelayedexpansion
|
||||||
cd /d "%~dp0"
|
cd /d "%~dp0"
|
||||||
|
|
||||||
:: Prevent Vulkan from exhausting VRAM — overflow to system RAM instead of crashing
|
:: Prevent Vulkan from exhausting VRAM — overflow to system RAM instead of crashing
|
||||||
set "GGML_VK_PREFER_HOST_MEMORY=ON"
|
set "GGML_VK_PREFER_HOST_MEMORY=ON"
|
||||||
|
|
||||||
echo ============================================================
|
echo ============================================================
|
||||||
echo NLP Master - Download + Transcribe Pipeline
|
echo NLP Master - Download + Transcribe Pipeline
|
||||||
echo ============================================================
|
echo ============================================================
|
||||||
echo.
|
echo.
|
||||||
|
|
||||||
:: ============================================================
|
:: ============================================================
|
||||||
:: PREREQUISITES CHECK
|
:: PREREQUISITES CHECK
|
||||||
:: ============================================================
|
:: ============================================================
|
||||||
echo Checking prerequisites...
|
echo Checking prerequisites...
|
||||||
echo.
|
echo.
|
||||||
set "PREREQ_OK=1"
|
set "PREREQ_OK=1"
|
||||||
set "NEED_WHISPER="
|
set "NEED_WHISPER="
|
||||||
set "NEED_MODEL="
|
set "NEED_MODEL="
|
||||||
|
|
||||||
:: --- Python ---
|
:: --- Python ---
|
||||||
python --version >nul 2>&1
|
python --version >nul 2>&1
|
||||||
if errorlevel 1 (
|
if errorlevel 1 (
|
||||||
echo [X] Python NOT FOUND
|
echo [X] Python NOT FOUND
|
||||||
echo Install from: https://www.python.org/downloads/
|
echo Install from: https://www.python.org/downloads/
|
||||||
echo Make sure to check "Add Python to PATH" during install.
|
echo Make sure to check "Add Python to PATH" during install.
|
||||||
echo.
|
echo.
|
||||||
echo Cannot continue without Python. Install it and re-run.
|
echo Cannot continue without Python. Install it and re-run.
|
||||||
pause
|
pause
|
||||||
exit /b 1
|
exit /b 1
|
||||||
) else (
|
) else (
|
||||||
for /f "tokens=2" %%v in ('python --version 2^>^&1') do echo [OK] Python %%v
|
for /f "tokens=2" %%v in ('python --version 2^>^&1') do echo [OK] Python %%v
|
||||||
)
|
)
|
||||||
|
|
||||||
:: --- .env credentials ---
|
:: --- .env credentials ---
|
||||||
if exist ".env" (
|
if exist ".env" (
|
||||||
findstr /m "COURSE_USERNAME=." ".env" >nul 2>&1
|
findstr /m "COURSE_USERNAME=." ".env" >nul 2>&1
|
||||||
if errorlevel 1 (
|
if errorlevel 1 (
|
||||||
echo [X] .env File exists but COURSE_USERNAME is empty
|
echo [X] .env File exists but COURSE_USERNAME is empty
|
||||||
echo Edit .env and fill in your credentials.
|
echo Edit .env and fill in your credentials.
|
||||||
set "PREREQ_OK="
|
set "PREREQ_OK="
|
||||||
) else (
|
) else (
|
||||||
echo [OK] .env Credentials configured
|
echo [OK] .env Credentials configured
|
||||||
)
|
)
|
||||||
) else (
|
) else (
|
||||||
echo [X] .env NOT FOUND
|
echo [X] .env NOT FOUND
|
||||||
echo Create .env with:
|
echo Create .env with:
|
||||||
echo COURSE_USERNAME=your_email
|
echo COURSE_USERNAME=your_email
|
||||||
echo COURSE_PASSWORD=your_password
|
echo COURSE_PASSWORD=your_password
|
||||||
set "PREREQ_OK="
|
set "PREREQ_OK="
|
||||||
)
|
)
|
||||||
|
|
||||||
:: --- ffmpeg ---
|
:: --- ffmpeg ---
|
||||||
set "FFMPEG_FOUND="
|
set "FFMPEG_FOUND="
|
||||||
set "NEED_FFMPEG="
|
set "NEED_FFMPEG="
|
||||||
where ffmpeg >nul 2>&1
|
where ffmpeg >nul 2>&1
|
||||||
if not errorlevel 1 (
|
if not errorlevel 1 (
|
||||||
set "FFMPEG_FOUND=1"
|
set "FFMPEG_FOUND=1"
|
||||||
for /f "delims=" %%p in ('where ffmpeg 2^>nul') do set "FFMPEG_LOCATION=%%p"
|
for /f "delims=" %%p in ('where ffmpeg 2^>nul') do set "FFMPEG_LOCATION=%%p"
|
||||||
echo [OK] ffmpeg !FFMPEG_LOCATION!
|
echo [OK] ffmpeg !FFMPEG_LOCATION!
|
||||||
) else (
|
) else (
|
||||||
if exist "ffmpeg.exe" (
|
if exist "ffmpeg.exe" (
|
||||||
set "FFMPEG_FOUND=1"
|
set "FFMPEG_FOUND=1"
|
||||||
echo [OK] ffmpeg .\ffmpeg.exe (local^)
|
echo [OK] ffmpeg .\ffmpeg.exe (local^)
|
||||||
) else (
|
) else (
|
||||||
echo [--] ffmpeg Not found - will auto-install
|
echo [--] ffmpeg Not found - will auto-install
|
||||||
set "NEED_FFMPEG=1"
|
set "NEED_FFMPEG=1"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
:: --- whisper-cli.exe ---
|
:: --- whisper-cli.exe ---
|
||||||
set "WHISPER_FOUND="
|
set "WHISPER_FOUND="
|
||||||
set "WHISPER_LOCATION="
|
set "WHISPER_LOCATION="
|
||||||
if defined WHISPER_BIN (
|
if defined WHISPER_BIN (
|
||||||
if exist "%WHISPER_BIN%" (
|
if exist "%WHISPER_BIN%" (
|
||||||
set "WHISPER_FOUND=1"
|
set "WHISPER_FOUND=1"
|
||||||
set "WHISPER_LOCATION=%WHISPER_BIN% (env var)"
|
set "WHISPER_LOCATION=%WHISPER_BIN% (env var)"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
if not defined WHISPER_FOUND (
|
if not defined WHISPER_FOUND (
|
||||||
where whisper-cli.exe >nul 2>&1
|
where whisper-cli.exe >nul 2>&1
|
||||||
if not errorlevel 1 (
|
if not errorlevel 1 (
|
||||||
set "WHISPER_FOUND=1"
|
set "WHISPER_FOUND=1"
|
||||||
for /f "delims=" %%p in ('where whisper-cli.exe 2^>nul') do set "WHISPER_LOCATION=%%p (PATH)"
|
for /f "delims=" %%p in ('where whisper-cli.exe 2^>nul') do set "WHISPER_LOCATION=%%p (PATH)"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
if not defined WHISPER_FOUND (
|
if not defined WHISPER_FOUND (
|
||||||
if exist "whisper-cli.exe" (
|
if exist "whisper-cli.exe" (
|
||||||
set "WHISPER_FOUND=1"
|
set "WHISPER_FOUND=1"
|
||||||
set "WHISPER_BIN=whisper-cli.exe"
|
set "WHISPER_BIN=whisper-cli.exe"
|
||||||
set "WHISPER_LOCATION=.\whisper-cli.exe (local)"
|
set "WHISPER_LOCATION=.\whisper-cli.exe (local)"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
if not defined WHISPER_FOUND (
|
if not defined WHISPER_FOUND (
|
||||||
if exist "whisper-bin\whisper-cli.exe" (
|
if exist "whisper-bin\whisper-cli.exe" (
|
||||||
set "WHISPER_FOUND=1"
|
set "WHISPER_FOUND=1"
|
||||||
set "WHISPER_BIN=whisper-bin\whisper-cli.exe"
|
set "WHISPER_BIN=whisper-bin\whisper-cli.exe"
|
||||||
set "WHISPER_LOCATION=whisper-bin\whisper-cli.exe (auto-installed)"
|
set "WHISPER_LOCATION=whisper-bin\whisper-cli.exe (auto-installed)"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
if not defined WHISPER_FOUND (
|
if not defined WHISPER_FOUND (
|
||||||
if exist "whisper.cpp\build\bin\Release\whisper-cli.exe" (
|
if exist "whisper.cpp\build\bin\Release\whisper-cli.exe" (
|
||||||
set "WHISPER_FOUND=1"
|
set "WHISPER_FOUND=1"
|
||||||
set "WHISPER_BIN=whisper.cpp\build\bin\Release\whisper-cli.exe"
|
set "WHISPER_BIN=whisper.cpp\build\bin\Release\whisper-cli.exe"
|
||||||
set "WHISPER_LOCATION=whisper.cpp\build\... (local build)"
|
set "WHISPER_LOCATION=whisper.cpp\build\... (local build)"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
if defined WHISPER_FOUND (
|
if defined WHISPER_FOUND (
|
||||||
echo [OK] whisper-cli !WHISPER_LOCATION!
|
echo [OK] whisper-cli !WHISPER_LOCATION!
|
||||||
) else (
|
) else (
|
||||||
echo [--] whisper-cli Not found - will auto-download
|
echo [--] whisper-cli Not found - will auto-download
|
||||||
set "NEED_WHISPER=1"
|
set "NEED_WHISPER=1"
|
||||||
)
|
)
|
||||||
|
|
||||||
:: --- Whisper model ---
|
:: --- Whisper model ---
|
||||||
if not defined WHISPER_MODEL set "WHISPER_MODEL=models\ggml-medium-q5_0.bin"
|
if not defined WHISPER_MODEL set "WHISPER_MODEL=models\ggml-medium-q5_0.bin"
|
||||||
if exist "%WHISPER_MODEL%" (
|
if exist "%WHISPER_MODEL%" (
|
||||||
for %%F in ("%WHISPER_MODEL%") do (
|
for %%F in ("%WHISPER_MODEL%") do (
|
||||||
set /a "MODEL_MB=%%~zF / 1048576"
|
set /a "MODEL_MB=%%~zF / 1048576"
|
||||||
)
|
)
|
||||||
echo [OK] Whisper model %WHISPER_MODEL% (!MODEL_MB! MB^)
|
echo [OK] Whisper model %WHISPER_MODEL% (!MODEL_MB! MB^)
|
||||||
) else (
|
) else (
|
||||||
echo [--] Whisper model Not found - will auto-download (~500 MB^)
|
echo [--] Whisper model Not found - will auto-download (~500 MB^)
|
||||||
set "NEED_MODEL=1"
|
set "NEED_MODEL=1"
|
||||||
)
|
)
|
||||||
|
|
||||||
:: --- Vulkan GPU support ---
|
:: --- Vulkan GPU support ---
|
||||||
set "VULKAN_FOUND="
|
set "VULKAN_FOUND="
|
||||||
where vulkaninfo >nul 2>&1
|
where vulkaninfo >nul 2>&1
|
||||||
if not errorlevel 1 (
|
if not errorlevel 1 (
|
||||||
set "VULKAN_FOUND=1"
|
set "VULKAN_FOUND=1"
|
||||||
echo [OK] Vulkan SDK Installed
|
echo [OK] Vulkan SDK Installed
|
||||||
) else (
|
) else (
|
||||||
if exist "%VULKAN_SDK%\Bin\vulkaninfo.exe" (
|
if exist "%VULKAN_SDK%\Bin\vulkaninfo.exe" (
|
||||||
set "VULKAN_FOUND=1"
|
set "VULKAN_FOUND=1"
|
||||||
echo [OK] Vulkan SDK %VULKAN_SDK%
|
echo [OK] Vulkan SDK %VULKAN_SDK%
|
||||||
) else (
|
) else (
|
||||||
echo [!!] Vulkan SDK Not detected (whisper.cpp may use CPU fallback^)
|
echo [!!] Vulkan SDK Not detected (whisper.cpp may use CPU fallback^)
|
||||||
echo Install from: https://vulkan.lunarg.com/sdk/home
|
echo Install from: https://vulkan.lunarg.com/sdk/home
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
:: --- Disk space ---
|
:: --- Disk space ---
|
||||||
echo.
|
echo.
|
||||||
for /f "tokens=3" %%a in ('dir /-c "%~dp0." 2^>nul ^| findstr /c:"bytes free"') do (
|
for /f "tokens=3" %%a in ('dir /-c "%~dp0." 2^>nul ^| findstr /c:"bytes free"') do (
|
||||||
set /a "FREE_GB=%%a / 1073741824" 2>nul
|
set /a "FREE_GB=%%a / 1073741824" 2>nul
|
||||||
)
|
)
|
||||||
if defined FREE_GB (
|
if defined FREE_GB (
|
||||||
if !FREE_GB! LSS 50 (
|
if !FREE_GB! LSS 50 (
|
||||||
echo [!!] Disk space ~!FREE_GB! GB free (need ~50 GB for all audio + transcripts^)
|
echo [!!] Disk space ~!FREE_GB! GB free (need ~50 GB for all audio + transcripts^)
|
||||||
) else (
|
) else (
|
||||||
echo [OK] Disk space ~!FREE_GB! GB free
|
echo [OK] Disk space ~!FREE_GB! GB free
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
echo.
|
echo.
|
||||||
|
|
||||||
:: --- Stop if .env is broken (can't auto-fix that) ---
|
:: --- Stop if .env is broken (can't auto-fix that) ---
|
||||||
if not defined PREREQ_OK (
|
if not defined PREREQ_OK (
|
||||||
echo ============================================================
|
echo ============================================================
|
||||||
echo MISSING PREREQUISITES - fix the [X] items above and re-run.
|
echo MISSING PREREQUISITES - fix the [X] items above and re-run.
|
||||||
echo ============================================================
|
echo ============================================================
|
||||||
pause
|
pause
|
||||||
exit /b 1
|
exit /b 1
|
||||||
)
|
)
|
||||||
|
|
||||||
:: ============================================================
|
:: ============================================================
|
||||||
:: AUTO-INSTALL MISSING COMPONENTS
|
:: AUTO-INSTALL MISSING COMPONENTS
|
||||||
:: ============================================================
|
:: ============================================================
|
||||||
if defined NEED_FFMPEG (
|
if defined NEED_FFMPEG (
|
||||||
echo ============================================================
|
echo ============================================================
|
||||||
echo Auto-downloading ffmpeg...
|
echo Auto-downloading ffmpeg...
|
||||||
echo ============================================================
|
echo ============================================================
|
||||||
python setup_whisper.py ffmpeg
|
python setup_whisper.py ffmpeg
|
||||||
if errorlevel 1 (
|
if errorlevel 1 (
|
||||||
echo.
|
echo.
|
||||||
echo ERROR: Could not install ffmpeg.
|
echo ERROR: Could not install ffmpeg.
|
||||||
echo Download manually from: https://www.gyan.dev/ffmpeg/builds/
|
echo Download manually from: https://www.gyan.dev/ffmpeg/builds/
|
||||||
echo Extract ffmpeg.exe to ffmpeg-bin\ and re-run.
|
echo Extract ffmpeg.exe to ffmpeg-bin\ and re-run.
|
||||||
pause
|
pause
|
||||||
exit /b 1
|
exit /b 1
|
||||||
)
|
)
|
||||||
if exist ".ffmpeg_bin_path" del .ffmpeg_bin_path
|
if exist ".ffmpeg_bin_path" del .ffmpeg_bin_path
|
||||||
echo.
|
echo.
|
||||||
)
|
)
|
||||||
|
|
||||||
:: Add ffmpeg-bin to PATH if it exists
|
:: Add ffmpeg-bin to PATH if it exists
|
||||||
if exist "ffmpeg-bin\ffmpeg.exe" (
|
if exist "ffmpeg-bin\ffmpeg.exe" (
|
||||||
set "PATH=%~dp0ffmpeg-bin;%PATH%"
|
set "PATH=%~dp0ffmpeg-bin;%PATH%"
|
||||||
)
|
)
|
||||||
|
|
||||||
if defined NEED_WHISPER (
|
if defined NEED_WHISPER (
|
||||||
echo ============================================================
|
echo ============================================================
|
||||||
echo Auto-downloading whisper.cpp (Vulkan build^)...
|
echo Auto-downloading whisper.cpp (Vulkan build^)...
|
||||||
echo ============================================================
|
echo ============================================================
|
||||||
python setup_whisper.py whisper
|
python setup_whisper.py whisper
|
||||||
if errorlevel 1 (
|
if errorlevel 1 (
|
||||||
echo.
|
echo.
|
||||||
echo ERROR: Failed to auto-download whisper.cpp.
|
echo ERROR: Failed to auto-download whisper.cpp.
|
||||||
echo Download manually from: https://github.com/ggml-org/whisper.cpp/releases
|
echo Download manually from: https://github.com/ggml-org/whisper.cpp/releases
|
||||||
pause
|
pause
|
||||||
exit /b 1
|
exit /b 1
|
||||||
)
|
)
|
||||||
:: Read the path that setup_whisper.py wrote
|
:: Read the path that setup_whisper.py wrote
|
||||||
if exist ".whisper_bin_path" (
|
if exist ".whisper_bin_path" (
|
||||||
set /p WHISPER_BIN=<.whisper_bin_path
|
set /p WHISPER_BIN=<.whisper_bin_path
|
||||||
del .whisper_bin_path
|
del .whisper_bin_path
|
||||||
echo Using: !WHISPER_BIN!
|
echo Using: !WHISPER_BIN!
|
||||||
)
|
)
|
||||||
echo.
|
echo.
|
||||||
)
|
)
|
||||||
|
|
||||||
if defined NEED_MODEL (
|
if defined NEED_MODEL (
|
||||||
echo ============================================================
|
echo ============================================================
|
||||||
echo Auto-downloading Whisper model (ggml-medium-q5_0, ~500 MB^)...
|
echo Auto-downloading Whisper model (ggml-medium-q5_0, ~500 MB^)...
|
||||||
echo This will take a few minutes depending on your connection.
|
echo This will take a few minutes depending on your connection.
|
||||||
echo ============================================================
|
echo ============================================================
|
||||||
python setup_whisper.py model
|
python setup_whisper.py model
|
||||||
if errorlevel 1 (
|
if errorlevel 1 (
|
||||||
echo.
|
echo.
|
||||||
echo ERROR: Failed to download model.
|
echo ERROR: Failed to download model.
|
||||||
echo Download manually from:
|
echo Download manually from:
|
||||||
echo https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium-q5_0.bin
|
echo https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium-q5_0.bin
|
||||||
echo Save to: models\ggml-medium-q5_0.bin
|
echo Save to: models\ggml-medium-q5_0.bin
|
||||||
pause
|
pause
|
||||||
exit /b 1
|
exit /b 1
|
||||||
)
|
)
|
||||||
echo.
|
echo.
|
||||||
)
|
)
|
||||||
|
|
||||||
echo All prerequisites OK!
|
echo All prerequisites OK!
|
||||||
echo.
|
echo.
|
||||||
echo ============================================================
|
echo ============================================================
|
||||||
echo Starting pipeline...
|
echo Starting pipeline...
|
||||||
echo ============================================================
|
echo ============================================================
|
||||||
echo.
|
echo.
|
||||||
|
|
||||||
:: ============================================================
|
:: ============================================================
|
||||||
:: STEP 1: VENV + DEPENDENCIES
|
:: STEP 1: VENV + DEPENDENCIES
|
||||||
:: ============================================================
|
:: ============================================================
|
||||||
if not exist ".venv\Scripts\python.exe" (
|
if not exist ".venv\Scripts\python.exe" (
|
||||||
echo [1/4] Creating Python virtual environment...
|
echo [1/4] Creating Python virtual environment...
|
||||||
python -m venv .venv
|
python -m venv .venv
|
||||||
if errorlevel 1 (
|
if errorlevel 1 (
|
||||||
echo ERROR: Failed to create venv.
|
echo ERROR: Failed to create venv.
|
||||||
pause
|
pause
|
||||||
exit /b 1
|
exit /b 1
|
||||||
)
|
)
|
||||||
echo Done.
|
echo Done.
|
||||||
) else (
|
) else (
|
||||||
echo [1/4] Virtual environment already exists.
|
echo [1/4] Virtual environment already exists.
|
||||||
)
|
)
|
||||||
|
|
||||||
echo [2/4] Installing Python dependencies...
|
echo [2/4] Installing Python dependencies...
|
||||||
.venv\Scripts\pip install -q -r requirements.txt
|
.venv\Scripts\pip install -q -r requirements.txt
|
||||||
if errorlevel 1 (
|
if errorlevel 1 (
|
||||||
echo ERROR: Failed to install dependencies.
|
echo ERROR: Failed to install dependencies.
|
||||||
pause
|
pause
|
||||||
exit /b 1
|
exit /b 1
|
||||||
)
|
)
|
||||||
echo Done.
|
echo Done.
|
||||||
|
|
||||||
:: ============================================================
|
:: ============================================================
|
||||||
:: STEP 2: DOWNLOAD
|
:: STEP 2: DOWNLOAD
|
||||||
:: ============================================================
|
:: ============================================================
|
||||||
echo.
|
echo.
|
||||||
echo [3/4] Downloading audio files...
|
echo [3/4] Downloading audio files...
|
||||||
echo ============================================================
|
echo ============================================================
|
||||||
.venv\Scripts\python download.py
|
.venv\Scripts\python download.py
|
||||||
if errorlevel 1 (
|
if errorlevel 1 (
|
||||||
echo.
|
echo.
|
||||||
echo WARNING: Some downloads failed. Check download_errors.log
|
echo WARNING: Some downloads failed. Check download_errors.log
|
||||||
echo Press any key to continue to transcription anyway, or Ctrl+C to abort.
|
echo Press any key to continue to transcription anyway, or Ctrl+C to abort.
|
||||||
pause >nul
|
pause >nul
|
||||||
)
|
)
|
||||||
|
|
||||||
:: ============================================================
|
:: ============================================================
|
||||||
:: STEP 3: TRANSCRIBE
|
:: STEP 3: TRANSCRIBE
|
||||||
:: ============================================================
|
:: ============================================================
|
||||||
echo.
|
echo.
|
||||||
echo [4/4] Transcribing with whisper.cpp...
|
echo [4/4] Transcribing with whisper.cpp...
|
||||||
echo ============================================================
|
echo ============================================================
|
||||||
echo Using: %WHISPER_BIN%
|
echo Using: %WHISPER_BIN%
|
||||||
echo Model: %WHISPER_MODEL%
|
echo Model: %WHISPER_MODEL%
|
||||||
echo.
|
echo.
|
||||||
|
|
||||||
if "%~1"=="" (
|
if "%~1"=="" (
|
||||||
.venv\Scripts\python transcribe.py
|
.venv\Scripts\python transcribe.py
|
||||||
) else (
|
) else (
|
||||||
echo Modules filter: %~1
|
echo Modules filter: %~1
|
||||||
.venv\Scripts\python transcribe.py --modules %~1
|
.venv\Scripts\python transcribe.py --modules %~1
|
||||||
)
|
)
|
||||||
if errorlevel 1 (
|
if errorlevel 1 (
|
||||||
echo.
|
echo.
|
||||||
echo WARNING: Some transcriptions failed. Check transcribe_errors.log
|
echo WARNING: Some transcriptions failed. Check transcribe_errors.log
|
||||||
)
|
)
|
||||||
|
|
||||||
:: ============================================================
|
:: ============================================================
|
||||||
:: DONE
|
:: DONE
|
||||||
:: ============================================================
|
:: ============================================================
|
||||||
echo.
|
echo.
|
||||||
echo ============================================================
|
echo ============================================================
|
||||||
echo Pipeline complete!
|
echo Pipeline complete!
|
||||||
echo - Audio files: audio\
|
echo - Audio files: audio\
|
||||||
echo - Transcripts: transcripts\
|
echo - Transcripts: transcripts\
|
||||||
echo - Manifest: manifest.json
|
echo - Manifest: manifest.json
|
||||||
echo.
|
echo.
|
||||||
echo Next step: generate summaries from WSL2 with Claude Code
|
echo Next step: generate summaries from WSL2 with Claude Code
|
||||||
echo python summarize.py
|
echo python summarize.py
|
||||||
echo ============================================================
|
echo ============================================================
|
||||||
pause
|
pause
|
||||||
|
|||||||
650
setup_whisper.py
650
setup_whisper.py
@@ -1,325 +1,325 @@
|
|||||||
"""
|
"""
|
||||||
Auto-download and setup whisper.cpp (Vulkan) + model for Windows.
|
Auto-download and setup whisper.cpp (Vulkan) + model for Windows.
|
||||||
Called by run.bat when prerequisites are missing.
|
Called by run.bat when prerequisites are missing.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import zipfile
|
import zipfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from urllib.request import urlopen, Request
|
from urllib.request import urlopen, Request
|
||||||
|
|
||||||
MODELS_DIR = Path("models")
|
MODELS_DIR = Path("models")
|
||||||
MODEL_NAME = "ggml-medium-q5_0.bin"
|
MODEL_NAME = "ggml-medium-q5_0.bin"
|
||||||
MODEL_URL = f"https://huggingface.co/ggerganov/whisper.cpp/resolve/main/{MODEL_NAME}"
|
MODEL_URL = f"https://huggingface.co/ggerganov/whisper.cpp/resolve/main/{MODEL_NAME}"
|
||||||
|
|
||||||
GITHUB_API = "https://api.github.com/repos/ggml-org/whisper.cpp/releases/latest"
|
GITHUB_API = "https://api.github.com/repos/ggml-org/whisper.cpp/releases/latest"
|
||||||
# Community Vulkan builds (for AMD GPUs)
|
# Community Vulkan builds (for AMD GPUs)
|
||||||
VULKAN_BUILDS_API = "https://api.github.com/repos/jerryshell/whisper.cpp-windows-vulkan-bin/releases/latest"
|
VULKAN_BUILDS_API = "https://api.github.com/repos/jerryshell/whisper.cpp-windows-vulkan-bin/releases/latest"
|
||||||
WHISPER_DIR = Path("whisper-bin")
|
WHISPER_DIR = Path("whisper-bin")
|
||||||
|
|
||||||
|
|
||||||
def progress_bar(current: int, total: int, width: int = 40):
    """Draw a single-line progress bar on stdout.

    The line starts with a carriage return and ends without a newline so that
    successive calls overwrite each other; the caller prints the final newline.

    Args:
        current: Number of bytes transferred so far.
        total: Expected size in bytes. Nothing is printed when it is not
            positive (size unknown).
        width: Width of the bar in characters.
    """
    if total <= 0:
        return
    # Clamp the fraction: if more data arrives than was announced, the bar
    # must not grow past `width` characters or report more than 100%.
    pct = min(max(current / total, 0.0), 1.0)
    filled = int(width * pct)
    bar = "=" * filled + "-" * (width - filled)
    mb_done = current / 1_048_576
    mb_total = total / 1_048_576
    print(f"\r [{bar}] {pct:.0%} {mb_done:.0f}/{mb_total:.0f} MB", end="", flush=True)
|
||||||
|
|
||||||
|
|
||||||
def download_file(url: str, dest: Path, desc: str):
    """Download `url` to `dest`, drawing a progress bar while streaming.

    The payload is streamed into a sibling file with a ``.tmp`` suffix and is
    only moved onto `dest` once the transfer has finished, so `dest` never
    holds a partial download.

    Args:
        url: Address to fetch.
        dest: Final location of the downloaded file.
        desc: Human-readable label used in the progress messages.

    Raises:
        Any error raised by ``urlopen`` or by the file operations (for example
        ``URLError`` or ``OSError``). The partial ``.tmp`` file is removed
        before the error propagates.
    """
    print(f"\n Downloading {desc}...")
    print(f" URL: {url}")

    req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
    downloaded = 0
    tmp = dest.with_suffix(".tmp")

    try:
        # Both the HTTP response and the output file are closed on every path.
        with urlopen(req, timeout=60) as resp, open(tmp, "wb") as f:
            total = int(resp.headers.get("Content-Length", 0))
            while chunk := resp.read(1024 * 1024):
                f.write(chunk)
                downloaded += len(chunk)
                progress_bar(downloaded, total)
    except BaseException:
        # Covers Ctrl+C as well: do not leave a truncated download behind.
        tmp.unlink(missing_ok=True)
        raise

    print()  # newline after progress bar
    # replace() overwrites an existing destination; rename() raises on Windows
    # when `dest` is already there (e.g. a previous truncated download).
    tmp.replace(dest)
    print(f" Saved: {dest} ({downloaded / 1_048_576:.0f} MB)")
|
||||||
|
|
||||||
|
|
||||||
def fetch_release(api_url: str) -> dict | None:
|
def fetch_release(api_url: str) -> dict | None:
|
||||||
"""Fetch a GitHub release JSON."""
|
"""Fetch a GitHub release JSON."""
|
||||||
req = Request(api_url, headers={"User-Agent": "Mozilla/5.0"})
|
req = Request(api_url, headers={"User-Agent": "Mozilla/5.0"})
|
||||||
try:
|
try:
|
||||||
resp = urlopen(req, timeout=30)
|
resp = urlopen(req, timeout=30)
|
||||||
return json.loads(resp.read())
|
return json.loads(resp.read())
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f" Could not fetch from {api_url}: {e}")
|
print(f" Could not fetch from {api_url}: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def extract_zip(zip_path: Path):
    """Extract every file of `zip_path` into WHISPER_DIR, flattened.

    The directory structure inside the archive is discarded: each file is
    written directly into WHISPER_DIR under its base name. The archive is
    deleted after a successful extraction.

    Args:
        zip_path: Path of the zip archive to unpack.
    """
    import shutil

    print(f"\n Extracting to {WHISPER_DIR}/...")
    WHISPER_DIR.mkdir(exist_ok=True)
    with zipfile.ZipFile(zip_path) as zf:
        for info in zf.infolist():
            # Directory entries must be skipped explicitly: Path("dir/").name
            # is "dir", so a name check alone would create an empty file.
            if info.is_dir():
                continue
            filename = Path(info.filename).name
            if not filename:
                continue
            target = WHISPER_DIR / filename
            with zf.open(info) as src, open(target, "wb") as dst:
                # Stream instead of loading the whole member into memory.
                shutil.copyfileobj(src, dst)
            print(f" {filename}")
    zip_path.unlink()
|
||||||
|
|
||||||
|
|
||||||
def find_whisper_exe() -> str | None:
    """Locate the whisper executable inside WHISPER_DIR.

    Lookup order: ``whisper-cli.exe``, then ``main.exe`` (older naming), then
    any ``*.exe`` whose name contains both "whisper" and "cli", then any whose
    name contains "whisper", and finally the first ``*.exe`` found.

    Returns:
        The path as a string, or ``None`` when the directory has no ``.exe``.
    """
    # Well-known names first: current CLI name, then the older one.
    for known_name in ("whisper-cli.exe", "main.exe"):
        candidate = WHISPER_DIR / known_name
        if candidate.exists():
            return str(candidate)

    found = list(WHISPER_DIR.glob("*.exe"))
    lowered = [(exe, exe.name.lower()) for exe in found]

    for exe, lname in lowered:
        if "whisper" in lname and "cli" in lname:
            return str(exe)

    for exe, lname in lowered:
        if "whisper" in lname:
            return str(exe)

    return str(found[0]) if found else None
|
||||||
|
|
||||||
|
|
||||||
def try_community_vulkan_build() -> str | None:
    """Download and unpack the community Vulkan build of whisper.cpp.

    Uses the first ``.zip`` asset of the latest release published at
    VULKAN_BUILDS_API.

    Returns:
        Path to the whisper executable, or ``None`` when the release cannot be
        fetched, has no zip asset, or yields no executable.
    """
    print("\n Trying community Vulkan build (jerryshell/whisper.cpp-windows-vulkan-bin)...")
    release = fetch_release(VULKAN_BUILDS_API)
    if not release:
        return None

    print(f" Community release: {release.get('tag_name', 'unknown')}")

    # First asset whose (case-insensitive) name ends in .zip, if any.
    zip_asset = next(
        (a for a in release.get("assets", []) if a["name"].lower().endswith(".zip")),
        None,
    )
    if zip_asset is None:
        print(" No zip asset found in community release")
        return None

    asset_name = zip_asset["name"]
    print(f" Found: {asset_name}")
    archive = Path(asset_name)
    download_file(zip_asset["browser_download_url"], archive, asset_name)
    extract_zip(archive)
    return find_whisper_exe()
|
||||||
|
|
||||||
|
|
||||||
def try_official_vulkan_build() -> str | None:
    """Download a whisper.cpp build from the official ggml-org release.

    A Vulkan archive is preferred; otherwise a plain CPU archive is used as a
    fallback (it works, but without GPU acceleration). CUDA archives are never
    selected.

    Returns:
        Path to the whisper executable, or ``None`` when the release cannot be
        fetched, offers no usable archive, or yields no executable.
    """
    print("\n Fetching latest whisper.cpp release from ggml-org...")
    release = fetch_release(GITHUB_API)
    if not release:
        return None

    tag = release.get("tag_name", "unknown")
    print(f" Official release: {tag}")

    assets = release.get("assets", [])

    # Priority: a Vulkan build wins immediately; otherwise fall back to a CPU
    # build that is neither a "noavx" nor an "openblas" variant.
    vulkan_asset = None
    cpu_asset = None
    for asset in assets:
        name = asset["name"].lower()
        if not name.endswith(".zip"):
            continue
        # Keep only archives whose name mentions "win" or "x64".
        if "win" not in name and "x64" not in name:
            continue
        # Absolutely skip CUDA builds - they won't work on AMD. Archives may
        # be labelled "cublas" rather than "cuda", so filter both spellings.
        if "cuda" in name or "cublas" in name:
            continue
        if "vulkan" in name:
            vulkan_asset = asset
            break
        if "noavx" not in name and "openblas" not in name:
            # The last matching CPU archive in the list is the one kept.
            cpu_asset = asset

    chosen = vulkan_asset or cpu_asset
    if not chosen:
        print(" No Vulkan or CPU-only build found in official releases")
        print(" Available assets:")
        for asset in assets:
            print(f" - {asset['name']}")
        return None

    if vulkan_asset:
        print(f" Found official Vulkan build: {chosen['name']}")
    else:
        print(f" No Vulkan build in official release, using CPU build: {chosen['name']}")
        print(" (Will work but without GPU acceleration)")

    zip_path = Path(chosen["name"])
    download_file(chosen["browser_download_url"], zip_path, chosen["name"])
    extract_zip(zip_path)
    return find_whisper_exe()
|
||||||
|
|
||||||
|
|
||||||
def setup_whisper_bin() -> str | None:
    """Make sure a usable whisper.cpp binary is present in WHISPER_DIR.

    An existing ``whisper-cli.exe`` is reused unless the install looks like a
    CUDA-only build (``ggml-cuda.dll`` present, ``ggml-vulkan.dll`` absent), in
    which case the directory is wiped and the binary is downloaded again.
    The community Vulkan build is tried first, then the official release.

    Returns:
        Path to the executable, or ``None`` when every download source failed.
    """
    whisper_exe = WHISPER_DIR / "whisper-cli.exe"
    if whisper_exe.exists():
        cuda_only = (
            (WHISPER_DIR / "ggml-cuda.dll").exists()
            and not (WHISPER_DIR / "ggml-vulkan.dll").exists()
        )
        if not cuda_only:
            print(f" whisper-cli.exe already exists at {whisper_exe}")
            return str(whisper_exe)

        print(" WARNING: Existing install is a CUDA build (won't work on AMD GPU)")
        print(" Removing and re-downloading Vulkan build...")
        import shutil
        shutil.rmtree(WHISPER_DIR)

    # Community Vulkan build first (reliable for AMD), official release second.
    community_exe = try_community_vulkan_build()
    if community_exe:
        print(f"\n whisper-cli.exe ready at: {community_exe} (Vulkan)")
        return community_exe

    print("\n Community build failed, trying official release...")
    official_exe = try_official_vulkan_build()
    if official_exe:
        print(f"\n whisper-cli.exe ready at: {official_exe}")
        return official_exe

    print("\n ERROR: Could not download whisper.cpp")
    print(" Manual install: https://github.com/ggml-org/whisper.cpp/releases")
    print(" Build from source with: cmake -DGGML_VULKAN=1")
    return None
|
||||||
|
|
||||||
|
|
||||||
FFMPEG_DIR = Path("ffmpeg-bin")
|
FFMPEG_DIR = Path("ffmpeg-bin")
|
||||||
FFMPEG_URL = "https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-essentials.zip"
|
FFMPEG_URL = "https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-essentials.zip"
|
||||||
|
|
||||||
|
|
||||||
def setup_ffmpeg() -> str | None:
    """Locate ffmpeg, downloading it into FFMPEG_DIR when it is missing.

    Lookup order: an ``ffmpeg`` found on PATH, then ``ffmpeg.exe`` in
    FFMPEG_DIR, then a fresh download of the archive at FFMPEG_URL.

    Returns:
        Path to the ffmpeg executable, or ``None`` when ``ffmpeg.exe`` is still
        missing after the archive has been extracted.
    """
    import shutil

    # Already in PATH? (single lookup, reused for the message and the result)
    path = shutil.which("ffmpeg")
    if path:
        print(f" ffmpeg already in PATH: {path}")
        return path

    # Already downloaded locally?
    local_exe = FFMPEG_DIR / "ffmpeg.exe"
    if local_exe.exists():
        print(f" ffmpeg already exists at {local_exe}")
        return str(local_exe)

    print("\n Downloading ffmpeg (essentials build)...")
    zip_path = Path("ffmpeg-essentials.zip")
    download_file(FFMPEG_URL, zip_path, "ffmpeg")

    print("\n Extracting ffmpeg...")
    FFMPEG_DIR.mkdir(exist_ok=True)
    with zipfile.ZipFile(zip_path) as zf:
        for member in zf.namelist():
            # Every .exe member is extracted, wherever it sits in the archive,
            # and written flat into FFMPEG_DIR under its base name.
            if not member.endswith(".exe"):
                continue
            filename = Path(member).name
            target = FFMPEG_DIR / filename
            with zf.open(member) as src, open(target, "wb") as dst:
                # Stream instead of loading the whole executable into memory.
                shutil.copyfileobj(src, dst)
            print(f" {filename}")

    zip_path.unlink()

    if local_exe.exists():
        print(f"\n ffmpeg ready at: {local_exe}")
        return str(local_exe)

    print(" ERROR: ffmpeg.exe not found after extraction")
    return None
|
||||||
|
|
||||||
|
|
||||||
def setup_model() -> bool:
    """Ensure the Whisper model file exists under MODELS_DIR.

    A file larger than 100,000,000 bytes is treated as a complete model; if no
    such file is present it is downloaded from MODEL_URL and checked again.

    Returns:
        True when a sufficiently large model file is in place, else False.
    """
    MODELS_DIR.mkdir(exist_ok=True)
    model_path = MODELS_DIR / MODEL_NAME

    def looks_complete() -> bool:
        # Size check guards against truncated or placeholder files.
        return model_path.exists() and model_path.stat().st_size > 100_000_000

    if looks_complete():
        print(f" Model already exists: {model_path} ({model_path.stat().st_size / 1_048_576:.0f} MB)")
        return True

    download_file(MODEL_URL, model_path, f"Whisper model ({MODEL_NAME})")

    if not looks_complete():
        print(" ERROR: Model file too small or missing after download")
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Command-line entry point: set up ffmpeg, whisper.cpp and/or the model.

    The optional first argument selects what to set up: ``all`` (default),
    ``ffmpeg``, ``whisper`` or ``model``. The resolved ffmpeg and whisper
    paths are written to ``.ffmpeg_bin_path`` and ``.whisper_bin_path``.

    Exit status: 2 for an unknown target, 1 when a requested step failed,
    0 otherwise. In ``all`` mode a failed ffmpeg/whisper step does not stop
    the remaining steps, but it is reported at the end instead of claiming
    that setup completed.
    """
    what = sys.argv[1] if len(sys.argv) > 1 else "all"
    if what not in ("all", "ffmpeg", "whisper", "model"):
        # Previously an unknown target did nothing and still reported success.
        print(f"Unknown setup target: {what!r} (expected all, ffmpeg, whisper or model)")
        sys.exit(2)

    failed = []  # steps that failed while running in "all" mode

    if what in ("all", "ffmpeg"):
        print("=" * 60)
        print(" Setting up ffmpeg")
        print("=" * 60)
        ffmpeg_path = setup_ffmpeg()
        if ffmpeg_path:
            Path(".ffmpeg_bin_path").write_text(ffmpeg_path)
        else:
            print("\nFAILED to set up ffmpeg")
            if what == "ffmpeg":
                sys.exit(1)
            failed.append("ffmpeg")

    if what in ("all", "whisper"):
        print("=" * 60)
        print(" Setting up whisper.cpp")
        print("=" * 60)
        exe_path = setup_whisper_bin()
        if exe_path:
            # Write path to temp file so run.bat can read it
            Path(".whisper_bin_path").write_text(exe_path)
        else:
            print("\nFAILED to set up whisper.cpp")
            if what == "whisper":
                sys.exit(1)
            failed.append("whisper.cpp")

    if what in ("all", "model"):
        print()
        print("=" * 60)
        print(f" Downloading Whisper model: {MODEL_NAME}")
        print("=" * 60)
        if not setup_model():
            print("\nFAILED to download model")
            sys.exit(1)

    print()
    if failed:
        print(f"Setup finished with errors: {', '.join(failed)}")
        sys.exit(1)
    print("Setup complete!")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|||||||
384
summarize.py
384
summarize.py
@@ -1,192 +1,192 @@
|
|||||||
"""
|
"""
|
||||||
Generate summaries from transcripts using Claude Code.
|
Generate summaries from transcripts using Claude Code.
|
||||||
Reads manifest.json, processes each transcript, outputs per-lecture summaries,
|
Reads manifest.json, processes each transcript, outputs per-lecture summaries,
|
||||||
and compiles SUPORT_CURS.md master study guide.
|
and compiles SUPORT_CURS.md master study guide.
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
python summarize.py # Print prompts for each transcript (pipe to Claude)
|
python summarize.py # Print prompts for each transcript (pipe to Claude)
|
||||||
python summarize.py --compile # Compile existing summaries into SUPORT_CURS.md
|
python summarize.py --compile # Compile existing summaries into SUPORT_CURS.md
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
import textwrap
|
import textwrap
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
MANIFEST_PATH = Path("manifest.json")
|
MANIFEST_PATH = Path("manifest.json")
|
||||||
SUMMARIES_DIR = Path("summaries")
|
SUMMARIES_DIR = Path("summaries")
|
||||||
TRANSCRIPTS_DIR = Path("transcripts")
|
TRANSCRIPTS_DIR = Path("transcripts")
|
||||||
MASTER_GUIDE = Path("SUPORT_CURS.md")
|
MASTER_GUIDE = Path("SUPORT_CURS.md")
|
||||||
|
|
||||||
MAX_WORDS_PER_CHUNK = 10000
|
MAX_WORDS_PER_CHUNK = 10000
|
||||||
OVERLAP_WORDS = 500
|
OVERLAP_WORDS = 500
|
||||||
|
|
||||||
SUMMARY_PROMPT = """Rezuma aceasta transcriere a unei lectii din cursul NLP Master Practitioner.
|
SUMMARY_PROMPT = """Rezuma aceasta transcriere a unei lectii din cursul NLP Master Practitioner.
|
||||||
|
|
||||||
Ofera:
|
Ofera:
|
||||||
1. **Prezentare generala** - 3-5 propozitii care descriu subiectul principal al lectiei
|
1. **Prezentare generala** - 3-5 propozitii care descriu subiectul principal al lectiei
|
||||||
2. **Concepte cheie** - lista cu definitii scurte pentru fiecare concept important
|
2. **Concepte cheie** - lista cu definitii scurte pentru fiecare concept important
|
||||||
3. **Detalii si exemple importante** - informatii concrete, exercitii practice, exemple relevante mentionate de trainer
|
3. **Detalii si exemple importante** - informatii concrete, exercitii practice, exemple relevante mentionate de trainer
|
||||||
4. **Citate memorabile** - fraze sau idei remarcabile (daca exista)
|
4. **Citate memorabile** - fraze sau idei remarcabile (daca exista)
|
||||||
|
|
||||||
Raspunde in limba romana. Formateaza ca Markdown.
|
Raspunde in limba romana. Formateaza ca Markdown.
|
||||||
|
|
||||||
---
|
---
|
||||||
TITLU LECTIE: {title}
|
TITLU LECTIE: {title}
|
||||||
---
|
---
|
||||||
TRANSCRIERE:
|
TRANSCRIERE:
|
||||||
{text}
|
{text}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
MERGE_PROMPT = """Am mai multe rezumate partiale ale aceleiasi lectii (a fost prea lunga pentru un singur rezumat).
|
MERGE_PROMPT = """Am mai multe rezumate partiale ale aceleiasi lectii (a fost prea lunga pentru un singur rezumat).
|
||||||
Combina-le intr-un singur rezumat coerent, eliminand duplicatele.
|
Combina-le intr-un singur rezumat coerent, eliminand duplicatele.
|
||||||
|
|
||||||
Pastreaza structura:
|
Pastreaza structura:
|
||||||
1. Prezentare generala (3-5 propozitii)
|
1. Prezentare generala (3-5 propozitii)
|
||||||
2. Concepte cheie cu definitii
|
2. Concepte cheie cu definitii
|
||||||
3. Detalii si exemple importante
|
3. Detalii si exemple importante
|
||||||
4. Citate memorabile
|
4. Citate memorabile
|
||||||
|
|
||||||
Raspunde in limba romana. Formateaza ca Markdown.
|
Raspunde in limba romana. Formateaza ca Markdown.
|
||||||
|
|
||||||
---
|
---
|
||||||
TITLU LECTIE: {title}
|
TITLU LECTIE: {title}
|
||||||
---
|
---
|
||||||
REZUMATE PARTIALE:
|
REZUMATE PARTIALE:
|
||||||
{chunks}
|
{chunks}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
def load_manifest() -> dict:
    """Read MANIFEST_PATH as UTF-8 text and return the parsed JSON."""
    raw = MANIFEST_PATH.read_text(encoding="utf-8")
    return json.loads(raw)
|
||||||
|
|
||||||
|
|
||||||
def split_at_sentences(text: str, max_words: int, overlap: int) -> list[str]:
    """Split `text` into overlapping chunks, cutting at sentence ends.

    Words are whitespace-separated tokens. A text of at most `max_words`
    words is returned unchanged as a single chunk. Otherwise each chunk holds
    at most `max_words` words (re-joined with single spaces) and, when a
    sentence end (".", "!" or "?" followed by a space) lies in its second
    half, is cut right after the latest such sentence end. Consecutive chunks
    share up to `overlap` words.

    Args:
        text: Text to split.
        max_words: Maximum number of words per chunk.
        overlap: Number of words repeated at the start of the next chunk.

    Returns:
        The chunks, in order.
    """
    words = text.split()
    if len(words) <= max_words:
        return [text]

    chunks = []
    start = 0
    while start < len(words):
        end = min(start + max_words, len(words))
        chunk_text = " ".join(words[start:end])

        if end >= len(words):
            # Final chunk. Stop here: stepping back by `overlap` again would
            # only re-emit ever-shorter copies of this tail, one per word.
            chunks.append(chunk_text)
            break

        # Latest sentence end in the chunk. Only space-terminated separators
        # can occur because the chunk was re-joined with single spaces.
        last_sep = max(chunk_text.rfind(sep) for sep in (". ", "! ", "? "))
        if last_sep > len(chunk_text) // 2:  # Don't break too early
            chunk_text = chunk_text[:last_sep + 1]
            # Recalculate end based on actual words used
            end = start + len(chunk_text.split())

        chunks.append(chunk_text)
        start = max(end - overlap, start + 1)  # Overlap, but always advance

    return chunks
|
||||||
|
|
||||||
|
|
||||||
def generate_prompts(manifest: dict):
    """Write summarization prompts for pending lectures to stdout.

    A lecture is processed when its ``transcribe_status`` is ``"complete"``,
    its summary file is missing or empty, and its transcript file exists and
    is non-blank. Skip notices and per-lecture statistics go to stderr, so
    stdout carries only the marker lines (``SUMMARY_FILE:``, ``CHUNK_PROMPT:``,
    ``MERGE_FILE:``), the prompt bodies, and ``---END_PROMPT---`` terminators.
    """
    SUMMARIES_DIR.mkdir(exist_ok=True)

    for module in manifest["modules"]:
        for lecture in module["lectures"]:
            # Only lectures with a finished transcript are eligible.
            if lecture.get("transcribe_status") != "complete":
                continue

            out_path = Path(lecture["summary_path"])
            already_done = out_path.exists() and out_path.stat().st_size > 0
            if already_done:
                print(f"# SKIP (exists): {lecture['title']}", file=sys.stderr)
                continue

            transcript_file = Path(lecture["transcript_path"])
            if not transcript_file.exists():
                print(f"# SKIP (no transcript): {lecture['title']}", file=sys.stderr)
                continue

            transcript = transcript_file.read_text(encoding="utf-8").strip()
            if not transcript:
                print(f"# SKIP (empty): {lecture['title']}", file=sys.stderr)
                continue

            parts = split_at_sentences(transcript, MAX_WORDS_PER_CHUNK, OVERLAP_WORDS)
            part_count = len(parts)

            # Human-readable progress report, kept off stdout.
            print(f"\n{'='*60}", file=sys.stderr)
            print(f"Lecture: {lecture['title']}", file=sys.stderr)
            print(f"Words: {len(transcript.split())}, Chunks: {part_count}", file=sys.stderr)
            print(f"Output: {out_path}", file=sys.stderr)

            if part_count == 1:
                # Short transcript: one prompt covers the whole lecture.
                single_prompt = SUMMARY_PROMPT.format(title=lecture["title"], text=transcript)
                print(f"SUMMARY_FILE:{out_path}")
                print(single_prompt)
                print("---END_PROMPT---")
                continue

            # Long transcript: one prompt per chunk ...
            for part_no, part_text in enumerate(parts, start=1):
                part_prompt = SUMMARY_PROMPT.format(
                    title=f"{lecture['title']} (partea {part_no}/{part_count})",
                    text=part_text,
                )
                print(f"CHUNK_PROMPT:{part_no}/{part_count}:{out_path}")
                print(part_prompt)
                print("---END_PROMPT---")

            # ... followed by a merge prompt. The chunk summaries do not exist
            # yet, so a literal placeholder is left for the merge step.
            print(f"MERGE_FILE:{out_path}")
            merge_prompt = MERGE_PROMPT.format(
                title=lecture["title"],
                chunks="{chunk_summaries}",
            )
            print(merge_prompt)
            print("---END_PROMPT---")
|
||||||
|
|
||||||
|
|
||||||
def compile_master_guide(manifest: dict):
    """Compile all lecture summaries into the master guide (``MASTER_GUIDE``).

    Emits a ``##`` heading per module and a ``###`` heading per lecture,
    followed by the lecture's summary text. A lecture whose summary file is
    missing or contains only whitespace gets the placeholder line instead,
    matching ``generate_prompts``, which treats an empty summary file as not
    yet written.

    Args:
        manifest: Parsed manifest with a ``"modules"`` list; each module has
            ``"name"`` and ``"lectures"``, and each lecture has ``"title"``
            and ``"summary_path"``.
    """
    lines = [
        "# SUPORT CURS - NLP Master Practitioner Bucuresti 2025\n",
        "_Generat automat din transcrierile audio ale cursului._\n",
        "---\n",
    ]

    for mod in manifest["modules"]:
        lines.append(f"\n## {mod['name']}\n")

        for lec in mod["lectures"]:
            summary_path = Path(lec["summary_path"])
            lines.append(f"\n### {lec['title']}\n")

            content = ""
            if summary_path.exists():
                content = summary_path.read_text(encoding="utf-8").strip()

            if content:
                lines.append(f"{content}\n")
            else:
                # Missing or blank summary: say so explicitly instead of
                # leaving a heading with nothing under it.
                lines.append("_Rezumat indisponibil._\n")

        lines.append("\n---\n")

    MASTER_GUIDE.write_text("\n".join(lines), encoding="utf-8")
    print(f"Compiled {MASTER_GUIDE} ({MASTER_GUIDE.stat().st_size} bytes)")
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Entry point: compile the master guide or print summary prompts.

    With ``--compile`` on the command line, all existing summaries are merged
    via ``compile_master_guide``; otherwise prompts are written to stdout via
    ``generate_prompts``. Exits with status 1 when the manifest is missing.
    """
    if not MANIFEST_PATH.exists():
        # stdout is the prompt stream in the default mode, so the error goes
        # to stderr like every other diagnostic in this script.
        print(
            "manifest.json not found. Run download.py and transcribe.py first.",
            file=sys.stderr,
        )
        sys.exit(1)

    manifest = load_manifest()

    if "--compile" in sys.argv:
        compile_master_guide(manifest)
    else:
        generate_prompts(manifest)


if __name__ == "__main__":
    main()
|
||||||
|
|||||||
598
transcribe.py
598
transcribe.py
@@ -1,299 +1,299 @@
|
|||||||
"""
|
"""
|
||||||
Batch transcription using whisper.cpp.
|
Batch transcription using whisper.cpp.
|
||||||
Reads manifest.json, transcribes each audio file in module order,
|
Reads manifest.json, transcribes each audio file in module order,
|
||||||
outputs .txt and .srt files, updates manifest status.
|
outputs .txt and .srt files, updates manifest status.
|
||||||
Resumable: skips files with existing transcripts.
|
Resumable: skips files with existing transcripts.
|
||||||
Converts MP3 -> WAV (16kHz mono) via ffmpeg before transcription.
|
Converts MP3 -> WAV (16kHz mono) via ffmpeg before transcription.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
MANIFEST_PATH = Path("manifest.json")
TRANSCRIPTS_DIR = Path("transcripts")
WAV_CACHE_DIR = Path("audio_wav")

# whisper.cpp defaults; override with the WHISPER_BIN / WHISPER_MODEL env vars.
WHISPER_BIN = os.getenv("WHISPER_BIN", r"whisper-cli.exe")
WHISPER_MODEL = os.getenv("WHISPER_MODEL", r"models\ggml-medium-q5_0.bin")

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        # Explicit UTF-8: lecture titles are logged and may contain characters
        # that the platform's default file encoding cannot represent.
        logging.FileHandler("transcribe_errors.log", encoding="utf-8"),
    ],
)
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def find_ffmpeg() -> str:
    """Locate an ffmpeg executable.

    Search order: ``ffmpeg`` on ``PATH``, then ``ffmpeg.exe`` in the current
    directory or in ``ffmpeg-bin/``, then the binary provided by the optional
    ``imageio_ffmpeg`` package.

    Returns:
        A command name or absolute path usable as ``argv[0]``, or ``""`` when
        no ffmpeg could be found.
    """
    if shutil.which("ffmpeg"):
        return "ffmpeg"

    # Check local directories
    for candidate in (Path("ffmpeg.exe"), Path("ffmpeg-bin/ffmpeg.exe")):
        if candidate.exists():
            return str(candidate.resolve())

    # Try imageio-ffmpeg (pip fallback)
    try:
        import imageio_ffmpeg
        return imageio_ffmpeg.get_ffmpeg_exe()
    except (ImportError, RuntimeError):
        # ImportError: package not installed. RuntimeError: package installed
        # but it could not locate a binary. Either way, report "not found"
        # so the caller can fall back instead of crashing.
        pass
    return ""
|
||||||
|
|
||||||
|
|
||||||
def convert_to_wav(audio_path: str) -> str:
    """Convert an audio file to 16 kHz mono 16-bit PCM WAV for whisper.cpp.

    The result is cached in ``WAV_CACHE_DIR`` under the source file's stem;
    a non-empty cached file is reused without re-running ffmpeg. ffmpeg
    writes to a scratch file that is renamed into the cache only after a
    successful run, so a failed or timed-out conversion can never be mistaken
    for a cache hit on the next run.

    Args:
        audio_path: Path to the source audio file.

    Returns:
        Path to the WAV file on success. ``audio_path`` unchanged when the
        input is already a ``.wav`` file, ffmpeg is unavailable, or the
        conversion fails or times out.
    """
    src = Path(audio_path)

    # Already a WAV file, skip
    if src.suffix.lower() == ".wav":
        return audio_path

    WAV_CACHE_DIR.mkdir(exist_ok=True)
    wav_path = WAV_CACHE_DIR / (src.stem + ".wav")

    # Skip if already converted
    if wav_path.exists() and wav_path.stat().st_size > 0:
        log.info(f" WAV cache hit: {wav_path}")
        return str(wav_path)

    ffmpeg = find_ffmpeg()
    if not ffmpeg:
        log.warning(" ffmpeg not found, using original file (may cause bad transcription)")
        return audio_path

    # Scratch output. It keeps the .wav extension so ffmpeg still infers the
    # container format from the file name.
    tmp_path = wav_path.with_name(src.stem + ".part.wav")

    log.info(f" Converting to WAV: {src.name} -> {wav_path.name}")
    cmd = [
        ffmpeg,
        "-i", audio_path,
        "-vn",                   # no video
        "-acodec", "pcm_s16le",  # 16-bit PCM
        "-ar", "16000",          # 16kHz sample rate (whisper standard)
        "-ac", "1",              # mono
        "-y",                    # overwrite
        str(tmp_path),
    ]

    converted = False
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            errors="replace",  # never fail on undecodable ffmpeg output
            timeout=300,  # 5 min max for conversion
        )
        if result.returncode != 0:
            # ffmpeg prints its banner first; the actual error is at the end.
            log.error(f" ffmpeg failed: {result.stderr[-300:]}")
        elif not tmp_path.exists() or tmp_path.stat().st_size == 0:
            log.error(f" ffmpeg produced no output for {audio_path}")
        else:
            tmp_path.replace(wav_path)
            converted = True
    except FileNotFoundError:
        log.warning(f" ffmpeg not found at: {ffmpeg}")
    except subprocess.TimeoutExpired:
        log.error(f" ffmpeg conversion timeout for {audio_path}")
    finally:
        if not converted:
            try:
                tmp_path.unlink(missing_ok=True)
            except OSError:
                # A leftover scratch file is harmless: it is never treated as
                # a cache hit and the next attempt overwrites it.
                pass

    if not converted:
        return audio_path

    log.info(f" WAV ready: {wav_path.name} ({wav_path.stat().st_size / 1_048_576:.0f} MB)")
    return str(wav_path)
|
||||||
|
|
||||||
|
|
||||||
def load_manifest() -> dict:
    """Read ``MANIFEST_PATH`` as UTF-8 and return the parsed JSON."""
    with open(MANIFEST_PATH, encoding="utf-8") as manifest_file:
        raw_json = manifest_file.read()
    return json.loads(raw_json)
|
||||||
|
|
||||||
|
|
||||||
def save_manifest(manifest: dict):
    """Write the manifest to ``MANIFEST_PATH`` as indented UTF-8 JSON.

    The data is written to a sibling ``.tmp`` file first and then moved over
    the real manifest in a single step. This function is used as a checkpoint
    after every transcribed file, so an interrupt or crash in the middle of a
    write must not leave a truncated manifest behind.

    Args:
        manifest: JSON-serializable manifest data.
    """
    tmp_path = f"{MANIFEST_PATH}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)
    os.replace(tmp_path, MANIFEST_PATH)
|
||||||
|
|
||||||
|
|
||||||
def transcribe_file(audio_path: str, output_base: str) -> bool:
    """Transcribe one audio file with the whisper.cpp command-line tool.

    Runs ``WHISPER_BIN`` with the Romanian language setting, asking for
    ``.txt`` and ``.srt`` outputs at ``output_base``. The child process shares
    this process's stdout and stderr and is limited to two hours.

    Args:
        audio_path: Audio file handed to whisper.cpp via ``--file``.
        output_base: Output path without extension (``--output-file``).

    Returns:
        True when the process exits with status 0 and ``<output_base>.txt``
        exists and is non-empty; False on any failure, which is logged.
    """
    whisper_args = [
        WHISPER_BIN,
        "--model", WHISPER_MODEL,
        "--language", "ro",
        "--no-gpu",
        "--threads", str(os.cpu_count() or 4),
        "--beam-size", "1",
        "--best-of", "1",
        "--output-txt",
        "--output-srt",
        "--output-file", output_base,
        "--file", audio_path,
    ]

    log.info(f" CMD: {' '.join(whisper_args)}")

    try:
        # Put the whisper binary's own directory first on PATH so that
        # Windows can locate the DLLs shipped next to it.
        child_env = dict(os.environ)
        binary_dir = str(Path(WHISPER_BIN).resolve().parent)
        child_env["PATH"] = os.pathsep.join([binary_dir, child_env.get("PATH", "")])

        completed = subprocess.run(
            whisper_args,
            stdout=sys.stdout,
            stderr=sys.stderr,
            timeout=7200,  # 2 hour timeout per file
            env=child_env,
        )

        if completed.returncode != 0:
            log.error(f" whisper.cpp failed (exit {completed.returncode})")
            return False

        # A zero exit status is not enough: the transcript must exist and
        # must contain something.
        transcript_file = Path(f"{output_base}.txt")
        subtitle_file = Path(f"{output_base}.srt")

        has_transcript = transcript_file.exists() and transcript_file.stat().st_size > 0
        if not has_transcript:
            log.error(f" Empty or missing transcript: {transcript_file}")
            return False

        log.info(f" Output: {transcript_file.name} ({transcript_file.stat().st_size} bytes)")
        if subtitle_file.exists():
            log.info(f" Output: {subtitle_file.name} ({subtitle_file.stat().st_size} bytes)")

        return True

    except FileNotFoundError:
        log.error(f" whisper.cpp not found at: {WHISPER_BIN}")
        log.error(" Set WHISPER_BIN env var or put whisper-cli.exe in PATH")
        return False
    except subprocess.TimeoutExpired:
        log.error(f" Timeout (>2h) for {audio_path}")
        return False
    except Exception as exc:
        log.error(f" Error: {exc}")
        return False
|
||||||
|
|
||||||
|
|
||||||
def parse_module_filter(arg: str) -> set[int]:
    """Parse a module filter such as '1-3', '4,5' or '1-3,5'.

    Ranges are inclusive and may be given in either order ('3-1' equals
    '1-3'). Whitespace around parts and stray or trailing commas are ignored.

    Args:
        arg: Comma-separated list of 1-based module numbers and ranges.

    Returns:
        The set of selected 1-based module indices (never empty).

    Raises:
        ValueError: If a part is not a number or range, or if the filter
            selects no modules at all.
    """
    result: set[int] = set()
    for part in arg.split(","):
        part = part.strip()
        if not part:
            continue  # tolerate "1,,3" and "1,2,"
        try:
            if "-" in part:
                a, b = part.split("-", 1)
                # Normalise the bounds: a reversed range would otherwise
                # silently select nothing.
                low, high = sorted((int(a), int(b)))
                result.update(range(low, high + 1))
            else:
                result.add(int(part))
        except ValueError:
            raise ValueError(f"invalid module filter part: {part!r}") from None

    if not result:
        # An empty set is falsy and could be mistaken for "no filter".
        raise ValueError(f"module filter selects no modules: {arg!r}")
    return result
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Transcribe every downloaded lecture listed in the manifest.

    Lectures are processed in manifest order. A lecture is skipped when it is
    not marked as downloaded or when a non-empty transcript already exists.
    The manifest is saved after each transcription attempt. After the first
    module, the run pauses for a manual quality check if anything was
    transcribed.

    Command line:
        --modules SPEC   Restrict processing to the given 1-based modules,
                         e.g. ``1-3,5``.

    Exits with status 1 when the manifest is missing, the ``--modules`` value
    is missing or invalid, or at least one transcription failed.
    """
    if not MANIFEST_PATH.exists():
        log.error("manifest.json not found. Run download.py first.")
        sys.exit(1)

    # Parse --modules filter
    module_filter = None
    if "--modules" in sys.argv:
        idx = sys.argv.index("--modules")
        if idx + 1 >= len(sys.argv):
            # Ignoring the flag here would silently transcribe everything.
            log.error("--modules requires a value, e.g. --modules 1-3,5")
            sys.exit(1)
        try:
            module_filter = parse_module_filter(sys.argv[idx + 1])
        except ValueError as exc:
            log.error(f"Invalid --modules value {sys.argv[idx + 1]!r}: {exc}")
            sys.exit(1)
        if not module_filter:
            log.error(f"--modules value {sys.argv[idx + 1]!r} selects no modules")
            sys.exit(1)
        log.info(f"Module filter: {sorted(module_filter)}")

    manifest = load_manifest()
    TRANSCRIPTS_DIR.mkdir(exist_ok=True)

    total = 0
    transcribed = 0
    skipped = 0
    failed = 0

    for mod_idx, mod in enumerate(manifest["modules"], 1):
        # Compare against None: an empty set must not mean "no filter".
        if module_filter is not None and mod_idx not in module_filter:
            log.info(f"\nSkipping module {mod_idx}: {mod['name']}")
            continue
        log.info(f"\n{'='*60}")
        log.info(f"Module: {mod['name']}")
        log.info(f"{'='*60}")

        for lec in mod["lectures"]:
            total += 1

            if lec.get("download_status") != "complete":
                log.warning(f" Skipping (not downloaded): {lec['title']}")
                continue

            audio_path = lec["audio_path"]
            stem = Path(lec["original_filename"]).stem.replace(" [Audio]", "")
            output_base = str(TRANSCRIPTS_DIR / stem)

            # Check if already transcribed
            txt_path = Path(f"{output_base}.txt")
            if txt_path.exists() and txt_path.stat().st_size > 0:
                lec["transcribe_status"] = "complete"
                skipped += 1
                log.info(f" Skipping (exists): {stem}.txt")
                continue

            log.info(f" Transcribing: {lec['title']}")
            log.info(f" File: {audio_path}")

            # Convert MP3 -> WAV 16kHz mono for reliable whisper.cpp input
            wav_path = convert_to_wav(audio_path)

            if transcribe_file(wav_path, output_base):
                lec["transcribe_status"] = "complete"
                transcribed += 1
            else:
                lec["transcribe_status"] = "failed"
                failed += 1

            # Save manifest after each file (checkpoint)
            save_manifest(manifest)

        # Quality gate: pause after the first module. It is identified by
        # position rather than by comparing module dicts for equality.
        if mod_idx == 1 and transcribed > 0:
            log.info("\n" + "!" * 60)
            log.info("QUALITY GATE: First module complete.")
            log.info("Spot-check 2-3 transcripts in transcripts/ before continuing.")
            log.info("Press Enter to continue, or Ctrl+C to abort...")
            log.info("!" * 60)
            try:
                input()
            except EOFError:
                pass  # Non-interactive mode, continue

    # Validation: every lecture marked complete must have its transcript file.
    empty_outputs = [
        lec["title"]
        for mod in manifest["modules"]
        for lec in mod["lectures"]
        if lec.get("transcribe_status") == "complete"
        and not Path(lec["transcript_path"]).exists()
    ]

    log.info("\n" + "=" * 60)
    log.info(f"Transcribed {transcribed}/{total} files, {skipped} skipped, {failed} failures.")
    log.info(f"No empty outputs: {'PASS' if not empty_outputs else 'FAIL'}")
    if empty_outputs:
        for t in empty_outputs:
            log.error(f" Missing transcript: {t}")
    log.info("=" * 60)

    save_manifest(manifest)

    if failed:
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||||
|
|||||||
Reference in New Issue
Block a user