Files
nlp-master/run.bat
Marius Mutu 763999f3a9 feat: anti-hallucination params + retranscribe script for fixing broken transcripts
- transcribe.py: add --max-context 0, --entropy-thold 2.4, --max-len 60,
  --suppress-nst, --no-fallback to whisper.cpp to prevent hallucination loops
- transcribe.py: remove interactive quality gate (runs unattended now)
- run.bat: remove pause prompts for unattended operation
- retranscribe_tail.py: new script that detects hallucination bursts in SRT
  files, extracts and re-transcribes only the affected audio segments, then
  splices the result back together. Drops segments that re-hallucinate
  (silence/music). Backs up originals to transcripts/backup/.
- fix_hallucinations.bat: Windows wrapper for retranscribe_tail.py

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-24 21:17:14 +02:00

314 lines
9.5 KiB
Batchfile

@echo off
setlocal enabledelayedexpansion
rem Work from the folder that holds this script so relative paths resolve.
cd /d "%~dp0"

echo ============================================================
echo NLP Master - Download + Transcribe Pipeline
echo ============================================================
echo.

rem ------------------------------------------------------------
rem Prerequisites check
rem ------------------------------------------------------------
echo Checking prerequisites...
echo.

rem PREREQ_OK starts out set and is cleared by checks that cannot be repaired
rem automatically. The NEED_ flags start out empty and are set by checks whose
rem component is downloaded later in this script.
set "NEED_MODEL="
set "NEED_WHISPER="
set "PREREQ_OK=1"
:: --- Python ---
:: Never run a bare python.exe here: the Microsoft Store stub terminates cmd.exe.
:: The py launcher is tried first because it is safe to execute, then the first
:: python.exe on PATH that does not live under WindowsApps.
:: Result: PYTHON_CMD holds either "py" or a full path, or the script exits 1.
set "PYTHON_CMD="
where py >nul 2>&1
if not errorlevel 1 (
    rem The launcher can be installed with no interpreter registered. It then
    rem exits non-zero, and the PATH search below gets its chance instead.
    py --version >nul 2>&1
    if not errorlevel 1 (
        set "PYTHON_CMD=py"
        for /f "tokens=2" %%v in ('py --version 2^>^&1') do echo [OK] Python %%v (py launcher^)
    )
)
if not defined PYTHON_CMD (
    for /f "delims=" %%p in ('where python 2^>nul ^| findstr /v /i "WindowsApps"') do (
        if not defined PYTHON_CMD (
            set "PYTHON_CMD=%%p"
            rem The doubled outer quotes keep cmd /c from dropping the quotes around
            rem the path when it holds spaces together with special characters.
            for /f "tokens=2" %%v in ('""%%p" --version 2>&1"') do echo [OK] Python %%v
        )
    )
)
if not defined PYTHON_CMD (
    echo [X] Python NOT FOUND
    echo The Microsoft Store stub does not count as a real Python install.
    echo Install from: https://www.python.org/downloads/
    echo Make sure to check "Add Python to PATH" during install.
    echo.
    echo Cannot continue without Python. Install it and re-run.
    pause
    exit /b 1
)
:: --- .env credentials ---
:: Both a missing .env and one without a non-empty COURSE_USERNAME clear
:: PREREQ_OK, which stops the script after all checks have been reported.
if not exist ".env" (
    echo [X] .env NOT FOUND
    echo Create .env with:
    echo COURSE_USERNAME=your_email
    echo COURSE_PASSWORD=your_password
    set "PREREQ_OK="
) else (
    rem findstr succeeds only when at least one character follows COURSE_USERNAME=
    findstr /m "COURSE_USERNAME=." ".env" >nul 2>&1
    if not errorlevel 1 (
        echo [OK] .env Credentials configured
    ) else (
        echo [X] .env File exists but COURSE_USERNAME is empty
        echo Edit .env and fill in your credentials.
        set "PREREQ_OK="
    )
)
:: --- ffmpeg ---
:: Search order: PATH, then ffmpeg.exe next to this script, then
:: ffmpeg-bin\ffmpeg.exe. The last location is the folder named by the
:: manual-install hint and prepended to PATH further down, so a copy placed
:: there has to count as installed instead of triggering another download.
:: Sets FFMPEG_FOUND on success, NEED_FFMPEG otherwise.
set "FFMPEG_FOUND="
set "NEED_FFMPEG="
set "FFMPEG_LOCATION="
where ffmpeg >nul 2>&1
if not errorlevel 1 (
    set "FFMPEG_FOUND=1"
    rem Report only the first hit, which is the one cmd resolves for a bare ffmpeg.
    for /f "delims=" %%p in ('where ffmpeg 2^>nul') do if not defined FFMPEG_LOCATION set "FFMPEG_LOCATION=%%p"
    echo [OK] ffmpeg !FFMPEG_LOCATION!
)
if not defined FFMPEG_FOUND if exist "ffmpeg.exe" (
    set "FFMPEG_FOUND=1"
    echo [OK] ffmpeg .\ffmpeg.exe (local^)
)
if not defined FFMPEG_FOUND if exist "ffmpeg-bin\ffmpeg.exe" (
    set "FFMPEG_FOUND=1"
    echo [OK] ffmpeg ffmpeg-bin\ffmpeg.exe (auto-installed^)
)
if not defined FFMPEG_FOUND (
    echo [--] ffmpeg Not found - will auto-install
    set "NEED_FFMPEG=1"
)
:: --- whisper-cli.exe ---
:: Search order: WHISPER_BIN from the environment, PATH, the script folder,
:: whisper-bin\ and a local whisper.cpp Release build. Every branch that finds
:: the binary outside the environment variable stores its path in WHISPER_BIN,
:: so the value echoed before transcription always names the binary that was
:: found. NOTE(review): presumably transcribe.py reads WHISPER_BIN from the
:: environment - confirm against that script.
:: Sets WHISPER_FOUND on success, NEED_WHISPER otherwise.
set "WHISPER_FOUND="
set "WHISPER_LOCATION="
if defined WHISPER_BIN (
    if exist "!WHISPER_BIN!" (
        set "WHISPER_FOUND=1"
        set "WHISPER_LOCATION=!WHISPER_BIN! (env var)"
    )
)
if not defined WHISPER_FOUND (
    where whisper-cli.exe >nul 2>&1
    if not errorlevel 1 (
        rem Keep the first hit only and publish it through WHISPER_BIN, which also
        rem replaces a stale value that pointed at a file that does not exist.
        for /f "delims=" %%p in ('where whisper-cli.exe 2^>nul') do (
            if not defined WHISPER_FOUND (
                set "WHISPER_FOUND=1"
                set "WHISPER_BIN=%%p"
                set "WHISPER_LOCATION=%%p (PATH)"
            )
        )
    )
)
if not defined WHISPER_FOUND (
    if exist "whisper-cli.exe" (
        set "WHISPER_FOUND=1"
        set "WHISPER_BIN=whisper-cli.exe"
        set "WHISPER_LOCATION=.\whisper-cli.exe (local)"
    )
)
if not defined WHISPER_FOUND (
    if exist "whisper-bin\whisper-cli.exe" (
        set "WHISPER_FOUND=1"
        set "WHISPER_BIN=whisper-bin\whisper-cli.exe"
        set "WHISPER_LOCATION=whisper-bin\whisper-cli.exe (auto-installed)"
    )
)
if not defined WHISPER_FOUND (
    if exist "whisper.cpp\build\bin\Release\whisper-cli.exe" (
        set "WHISPER_FOUND=1"
        set "WHISPER_BIN=whisper.cpp\build\bin\Release\whisper-cli.exe"
        set "WHISPER_LOCATION=whisper.cpp\build\... (local build)"
    )
)
if defined WHISPER_FOUND (
    echo [OK] whisper-cli !WHISPER_LOCATION!
) else (
    echo [--] whisper-cli Not found - will auto-download
    set "NEED_WHISPER=1"
)
:: --- Whisper model ---
:: WHISPER_MODEL may be preset in the environment; otherwise it defaults to
:: models\ggml-medium-q5_0.bin. Sets NEED_MODEL when the file is missing.
if not defined WHISPER_MODEL set "WHISPER_MODEL=models\ggml-medium-q5_0.bin"
set "MODEL_MB=0"
set "MODEL_KB="
if exist "%WHISPER_MODEL%" (
    rem set /a is limited to signed 32-bit values, so a file of 2 GiB or more
    rem cannot be divided directly. The last three digits of the byte count are
    rem dropped first, and 1049 approximates the 1048.576 thousands per MiB.
    for %%F in ("%WHISPER_MODEL%") do set "MODEL_KB=%%~zF"
    if defined MODEL_KB set "MODEL_KB=!MODEL_KB:~0,-3!"
    if defined MODEL_KB set /a "MODEL_MB=MODEL_KB / 1049"
    rem Delayed expansion keeps a closing parenthesis in the path from ending this block.
    echo [OK] Whisper model !WHISPER_MODEL! (!MODEL_MB! MB^)
) else (
    echo [--] Whisper model Not found - will auto-download (~500 MB^)
    set "NEED_MODEL=1"
)
:: --- Disk space ---
:: Reads the free byte count from the "bytes free" line that dir prints for
:: the script folder and warns below 50 GB. set /a only handles signed 32-bit
:: integers, so the raw byte count cannot be divided once more than 2 GiB is
:: free. The last five digits are dropped first and the remainder is divided
:: by 10737, the number of 100000-byte units in one GiB.
:: The check is skipped silently when no "bytes free" line is found.
echo.
set "FREE_GB="
set "FREE_BYTES="
set "FREE_UNITS="
for /f "tokens=3" %%a in ('dir /-c "%~dp0." 2^>nul ^| findstr /c:"bytes free"') do set "FREE_BYTES=%%a"
if defined FREE_BYTES (
    rem Fewer than six digits leaves an empty string, which counts as zero.
    set "FREE_UNITS=!FREE_BYTES:~0,-5!"
    if not defined FREE_UNITS set "FREE_UNITS=0"
    set /a "FREE_GB=FREE_UNITS / 10737" 2>nul
)
if defined FREE_GB (
    if !FREE_GB! LSS 50 (
        rem The carets keep delayed expansion from swallowing the literal marks.
        echo [^^!^^!] Disk space ~!FREE_GB! GB free (need ~50 GB for all audio + transcripts^)
    ) else (
        echo [OK] Disk space ~!FREE_GB! GB free
    )
)
echo.
:: --- Abort when a prerequisite that cannot be auto-installed is missing ---
:: PREREQ_OK is empty at this point only if one of the checks above cleared it.
if "%PREREQ_OK%"=="" (
    echo ============================================================
    echo MISSING PREREQUISITES - fix the [X] items above and re-run.
    echo ============================================================
    pause
    exit /b 1
)
:: ============================================================
:: AUTO-INSTALL MISSING COMPONENTS
:: ============================================================
:: ffmpeg: hand the download to setup_whisper.py and stop if it fails.
if defined NEED_FFMPEG (
    echo ============================================================
    echo Auto-downloading ffmpeg...
    echo ============================================================
    "!PYTHON_CMD!" setup_whisper.py ffmpeg
    if not errorlevel 1 (
        rem Remove the marker file if one is present. It is not read here.
        if exist ".ffmpeg_bin_path" del .ffmpeg_bin_path
        echo.
    ) else (
        echo.
        echo ERROR: Could not install ffmpeg.
        echo Download manually from: https://www.gyan.dev/ffmpeg/builds/
        echo Extract ffmpeg.exe to ffmpeg-bin\ and re-run.
        pause
        exit /b 1
    )
)
:: Put a bundled ffmpeg-bin\ffmpeg.exe in front of PATH for the steps below.
if exist "ffmpeg-bin\ffmpeg.exe" set "PATH=%~dp0ffmpeg-bin;%PATH%"
:: whisper.cpp: hand the download to setup_whisper.py and stop if it fails.
if defined NEED_WHISPER (
    echo ============================================================
    echo Auto-downloading whisper.cpp (CPU build^)...
    echo ============================================================
    "!PYTHON_CMD!" setup_whisper.py whisper
    if not errorlevel 1 (
        rem Pick up the path from .whisper_bin_path when that file exists,
        rem then delete the file.
        if exist ".whisper_bin_path" (
            set /p WHISPER_BIN=<.whisper_bin_path
            del .whisper_bin_path
            echo Using: !WHISPER_BIN!
        )
        echo.
    ) else (
        echo.
        echo ERROR: Failed to auto-download whisper.cpp.
        echo Download manually from: https://github.com/ggml-org/whisper.cpp/releases
        pause
        exit /b 1
    )
)
:: Whisper model: hand the download to setup_whisper.py and stop if it fails.
if defined NEED_MODEL (
    echo ============================================================
    echo Auto-downloading Whisper model (ggml-medium-q5_0, ~500 MB^)...
    echo This will take a few minutes depending on your connection.
    echo ============================================================
    "!PYTHON_CMD!" setup_whisper.py model
    if not errorlevel 1 (
        echo.
    ) else (
        echo.
        echo ERROR: Failed to download model.
        echo Download manually from:
        echo https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium-q5_0.bin
        echo Save to: models\ggml-medium-q5_0.bin
        pause
        exit /b 1
    )
)
:: The doubled caret keeps the exclamation mark, which delayed expansion would
:: otherwise strip from the message.
echo All prerequisites OK^^!
echo.
echo ============================================================
echo Starting pipeline...
echo ============================================================
echo.
:: ============================================================
:: STEP 1: VENV + DEPENDENCIES
:: ============================================================
:: The virtual environment is created once and reused on later runs.
if not exist ".venv\Scripts\python.exe" (
    echo [1/4] Creating Python virtual environment...
    "!PYTHON_CMD!" -m venv .venv
    if errorlevel 1 (
        echo ERROR: Failed to create venv.
        pause
        exit /b 1
    )
    echo Done.
) else (
    echo [1/4] Virtual environment already exists.
)
echo [2/4] Installing Python dependencies...
:: pip is run through the venv interpreter instead of the pip.exe launcher,
:: which stops working when the project folder is moved after the venv was made.
.venv\Scripts\python -m pip install -q -r requirements.txt
if errorlevel 1 (
    echo ERROR: Failed to install dependencies.
    pause
    exit /b 1
)
echo Done.
:: ============================================================
:: STEP 2: DOWNLOAD
:: ============================================================
:: The first script argument, when given, is forwarded to download.py as the
:: value of --modules. A failed download is reported but does not stop the run.
:: NOTE(review): the argument is passed unquoted, so a value with spaces is
:: split into several words - confirm whether download.py relies on that.
echo.
echo [3/4] Downloading audio files...
echo ============================================================
if not "%~1"=="" (
    echo Modules filter: %~1
    .venv\Scripts\python download.py --modules %~1
) else (
    .venv\Scripts\python download.py
)
if errorlevel 1 (
    echo.
    echo WARNING: Some downloads failed. Check download_errors.log
    echo Continuing to transcription automatically...
)
:: ============================================================
:: STEP 3: TRANSCRIBE
:: ============================================================
:: Prints the binary and model held in WHISPER_BIN and WHISPER_MODEL, then runs
:: transcribe.py with the same optional --modules value as the download step.
:: A failure only produces a warning.
:: NOTE(review): the argument is passed unquoted, so a value with spaces is
:: split into several words - confirm whether transcribe.py relies on that.
echo.
echo [4/4] Transcribing with whisper.cpp...
echo ============================================================
echo Using: %WHISPER_BIN%
echo Model: %WHISPER_MODEL%
echo.
if not "%~1"=="" (
    echo Modules filter: %~1
    .venv\Scripts\python transcribe.py --modules %~1
) else (
    .venv\Scripts\python transcribe.py
)
if errorlevel 1 (
    echo.
    echo WARNING: Some transcriptions failed. Check transcribe_errors.log
)
:: ============================================================
:: DONE
:: ============================================================
:: Closing summary with the output locations and the suggested next step.
:: The doubled caret keeps the exclamation mark, which delayed expansion would
:: otherwise strip from the message.
echo.
echo ============================================================
echo Pipeline complete^^!
echo - Audio files: audio\
echo - Transcripts: transcripts\
echo - Manifest: manifest.json
echo.
echo Next step: generate summaries from WSL2 with Claude Code
echo python summarize.py
echo ============================================================