Files
roa2web-service-auto/backend/TEST-OCR-WINDOWS.bat
Marius Mutu 495790411f feat(ocr): Add docTR OCR engine with metrics infrastructure
Add docTR as primary OCR engine with 2-tier sequential processing,
OCR metrics tracking, and simplified engine selection.

Features:
- docTR OCR engine with light+medium preprocessing tiers
- doctr_plus mode with early exit optimization (~65% fast path)
- OCR metrics dashboard with per-engine statistics
- User OCR preference persistence
- Parallel worker pool for OCR processing
- Cross-validation for extraction quality

Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-02 05:37:16 +02:00

169 lines
4.7 KiB
Batchfile

@echo off
REM ==================================================================
REM OCR benchmark driver for Windows.
REM Runs the backend once per requested OCR worker count and collects
REM the results of every run into one JSON file.
REM
REM Usage: TEST-OCR-WINDOWS.bat [worker_counts...]
REM Examples:
REM TEST-OCR-WINDOWS.bat -> tests 1,2,3 workers (default)
REM TEST-OCR-WINDOWS.bat 1 -> tests only 1 worker
REM TEST-OCR-WINDOWS.bat 3 6 -> tests 3 and 6 workers
REM TEST-OCR-WINDOWS.bat 1 2 3 4 5 6 -> tests all
REM ==================================================================
setlocal enabledelayedexpansion
REM Work from the directory that holds this script so the relative paths
REM used later resolve regardless of where the script was launched from.
cd /d "%~dp0"
REM Take all command-line arguments as the space-separated worker list.
set "WORKER_LIST=%*"
REM Use "if not defined" rather than a quoted string comparison: the string
REM comparison is a syntax error as soon as an argument contains a double quote.
if not defined WORKER_LIST set "WORKER_LIST=1 2 3"
echo.
echo ==========================================
echo OCR Benchmark - Windows (Workers: %WORKER_LIST%)
echo ==========================================
echo.
REM Locate Poppler: pdftoppm must be reachable through PATH, otherwise
REM search the E:\poppler tree and prepend the directory that contains it.
where pdftoppm >nul 2>&1
if not errorlevel 1 goto :found_poppler
echo Checking for Poppler...
if not exist "E:\poppler" goto :poppler_missing
REM for /r with a wildcard-free name produces dir\pdftoppm.exe for every
REM directory in the tree whether or not the file is really there, so each
REM candidate has to be confirmed with "if exist" before it is accepted.
for /r "E:\poppler" %%i in (pdftoppm.exe) do (
    if exist "%%i" (
        set "POPPLER_BIN=%%~dpi"
        goto :found_poppler
    )
)
:poppler_missing
echo.
echo ERROR: Poppler not found!
pause
exit /b 1
:found_poppler
REM POPPLER_BIN is only set when the fallback search above located the binary.
if defined POPPLER_BIN (
    echo Found Poppler at: %POPPLER_BIN%
    set "PATH=%POPPLER_BIN%;%PATH%"
)
REM The backend and the test client are both run with the interpreter from
REM the venv-win virtual environment; stop with instructions when it is missing.
if exist "venv-win\Scripts\python.exe" goto :venv_ready
echo ERROR: venv-win not found!
echo Run: python -m venv venv-win
echo Then: venv-win\Scripts\pip install -r requirements.txt
pause
exit /b 1
:venv_ready
REM Environment shared by every benchmark run. setlocal at the top of the
REM script keeps these values from leaking into the calling shell.

REM Backend authentication and database connection.
REM NOTE(review): the JWT key and the Oracle credentials are hard-coded in
REM this file; consider supplying them from outside the repository.
set "JWT_SECRET_KEY=generate_with_secrets_token_urlsafe_32"
set "ORACLE_HOST=10.0.20.121"
set "ORACLE_PORT=1521"
set "ORACLE_USER=CONTAFIN_ORACLE"
set "ORACLE_PASSWORD=ROMFASTSOFT"
set "ORACLE_SERVICE_NAME=ROA"

REM OCR engine selection and worker behaviour.
set "OCR_ENABLE_PADDLEOCR=false"
set "OCR_ENABLE_TESSERACT=false"
set "OCR_DEFAULT_ENGINE=hybrid-doctr"
set "OCR_MAX_TASKS_PER_CHILD=0"

REM Logging verbosity.
set "LOG_LEVEL=WARNING"
REM Results file with timestamp: ocr_benchmark_YYYYMMDD_HHMM.json
REM wmic is deprecated and missing on recent Windows builds. When it yields
REM nothing, ask PowerShell for the same digit layout; without a fallback the
REM substring expansions below would produce a garbage file name.
set "datetime="
for /f "tokens=2 delims==" %%I in ('wmic os get localdatetime /value 2^>nul') do set datetime=%%I
if not defined datetime for /f "delims=" %%I in ('powershell -NoProfile -Command "Get-Date -Format yyyyMMddHHmmss" 2^>nul') do set "datetime=%%I"
if not defined datetime (
    echo WARNING: could not read the current date and time, using a fixed results file name.
    set "datetime=000000000000"
)
REM First 8 digits are the date, the next 4 are hours and minutes.
set "RESULTS_FILE=ocr_benchmark_%datetime:~0,8%_%datetime:~8,4%.json"
echo Results will be saved to: %RESULTS_FILE%
echo.
REM Delete old results file if exists
if exist "%RESULTS_FILE%" del "%RESULTS_FILE%"
REM Run one benchmark pass per requested worker count, then print the summary.
for %%W in (%WORKER_LIST%) do (
    call :run_test %%W
)
goto :show_summary
:run_test
REM ------------------------------------------------------------------
REM Subroutine: benchmark a single worker count.
REM   Argument 1: number of OCR workers to use for this run.
REM Steps: stop any backend left on port 8006, start a fresh backend in a
REM minimized window, poll its health endpoint until it answers, run the
REM parallel receipt test against it, then stop the backend again.
REM Returns errorlevel 1 when the backend never answers the health check.
REM ------------------------------------------------------------------
set "WORKERS=%~1"
echo.
echo ############################################################
echo STARTING TEST WITH %WORKERS% WORKER(S)
echo ############################################################
echo.
REM Kill existing processes on port 8006
echo Cleaning up old processes...
call :stop_backend
timeout /t 3 >nul
REM Set workers count
set "OCR_WORKERS=%WORKERS%"
echo Starting backend with %WORKERS% OCR worker(s)...
REM Start backend in a new minimized window with all OCR env vars
start /min "ROA2WEB Backend %WORKERS% workers" cmd /c "set OCR_WORKERS=%WORKERS%&& set OCR_ENABLE_PADDLEOCR=false&& set OCR_ENABLE_TESSERACT=false&& set OCR_DEFAULT_ENGINE=hybrid-doctr&& set LOG_LEVEL=WARNING&& venv-win\Scripts\python.exe -m uvicorn main:app --host 0.0.0.0 --port 8006 --workers 1 2>&1"
REM Wait for backend to be ready: up to 40 polls, 3 seconds apart.
echo Waiting for backend to start...
set attempts=0
:wait_loop
timeout /t 3 >nul
set /a attempts+=1
curl -s http://localhost:8006/health >nul 2>&1
if not errorlevel 1 goto :backend_ready
if !attempts! lss 40 (
    echo Waiting... !attempts!/40
    goto :wait_loop
)
echo ERROR: Backend failed to start!
REM The backend window opened above may still be alive even though it never
REM became healthy; stop it so it does not outlive the benchmark.
call :stop_backend
exit /b 1
:backend_ready
echo Backend is ready!
REM Wait for OCR warmup
echo Waiting for OCR worker warmup (30s)...
timeout /t 30 >nul
echo.
echo Running OCR test with %WORKERS% worker(s)...
echo.
venv-win\Scripts\python.exe ..\tests\ocr-validation\test_receipts_parallel_windows.py --port 8006 --workers %WORKERS% --output "%RESULTS_FILE%"
REM Stop backend
echo.
echo Stopping backend...
call :stop_backend
REM Wait for memory to be released
echo Releasing memory (10s)...
timeout /t 10 >nul
goto :eof

:stop_backend
REM Helper: close the backend window started by run_test and force-kill any
REM process that is still listening on port 8006. Failures are ignored
REM because there may be nothing to stop.
taskkill /F /FI "WINDOWTITLE eq ROA2WEB Backend*" >nul 2>&1
for /f "tokens=5" %%a in ('netstat -ano ^| findstr :8006 ^| findstr LISTENING 2^>nul') do (
    taskkill /F /PID %%a >nul 2>&1
)
goto :eof
:show_summary
REM Final report: print where the results were written and, when the file
REM exists, one summary line per benchmark run read from that JSON file.
echo.
echo ############################################################
echo ALL TESTS COMPLETE
echo ############################################################
echo.
echo Results saved to: %RESULTS_FILE%
echo.
REM Show summary from results file.
REM A guard jump is used instead of a parenthesized "if" block on purpose:
REM the backslash-escaped quotes in the Python one-liner flip the quote state
REM of cmd, leaving the closing parenthesis of "worker(s)" unquoted, and inside
REM a block that parenthesis would end the block early and abort the script.
if not exist "%RESULTS_FILE%" goto :summary_done
echo BENCHMARK SUMMARY:
echo ------------------
venv-win\Scripts\python.exe -c "import json; data=json.load(open('%RESULTS_FILE%')); print(); [print(f\" {r['workers']} worker(s): {r['total_time']:.1f}s total, {r['avg_time']:.1f}s avg, {r.get('peak_memory_mb', 0):.0f}MB peak, {r['successful']}/{r['submitted']} success\") for r in data]"
echo.
:summary_done
echo Press any key to exit...
pause >nul
endlocal