Add docTR as primary OCR engine with 2-tier sequential processing, OCR metrics tracking, and simplified engine selection.

Features:
- docTR OCR engine with light+medium preprocessing tiers
- doctr_plus mode with early exit optimization (~65% fast path)
- OCR metrics dashboard with per-engine statistics
- User OCR preference persistence
- Parallel worker pool for OCR processing
- Cross-validation for extraction quality

Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
169 lines
4.7 KiB
Batchfile
169 lines
4.7 KiB
Batchfile
@echo off
setlocal enabledelayedexpansion

REM Every relative path below (venv-win\, ..\tests\) is resolved from the folder
REM that contains this script, so refuse to continue if we cannot switch to it.
cd /d "%~dp0"
if errorlevel 1 (
    echo ERROR: Cannot change to script directory.
    pause
    exit /b 1
)

REM Parse command line arguments for worker counts
REM Usage: TEST-OCR-WINDOWS.bat [worker_counts...]
REM Examples:
REM   TEST-OCR-WINDOWS.bat              -> tests 1,2,3 workers (default)
REM   TEST-OCR-WINDOWS.bat 1            -> tests only 1 worker
REM   TEST-OCR-WINDOWS.bat 3 6          -> tests 3 and 6 workers
REM   TEST-OCR-WINDOWS.bat 1 2 3 4 5 6  -> tests all

set "WORKER_LIST=%*"
if not defined WORKER_LIST set "WORKER_LIST=1 2 3"

REM Reject anything that is not a positive integer before any backend is
REM started; a typo would otherwise only surface minutes into the benchmark.
for %%W in (%WORKER_LIST%) do (
    echo(%%W| findstr /r /x "[1-9][0-9]*" >nul || (
        echo ERROR: Invalid worker count "%%W" - expected a positive integer.
        pause
        exit /b 1
    )
)

echo.
echo ==========================================
echo OCR Benchmark - Windows (Workers: %WORKER_LIST%)
echo ==========================================
echo.

REM Check if Poppler is installed.
REM If pdftoppm is already on PATH nothing else is needed. Otherwise search a
REM local Poppler folder recursively and prepend the folder that holds
REM pdftoppm.exe to PATH. The folder defaults to E:\poppler and can be
REM overridden by setting POPPLER_ROOT before calling this script.
where pdftoppm >nul 2>&1
if not errorlevel 1 goto :found_poppler

echo Checking for Poppler...
if not defined POPPLER_ROOT set "POPPLER_ROOT=E:\poppler"
if exist "%POPPLER_ROOT%" (
    for /r "%POPPLER_ROOT%" %%i in (pdftoppm.exe) do (
        REM "for /r" with a name that has no wildcard yields <dir>\pdftoppm.exe
        REM for EVERY directory in the tree, whether or not the file exists,
        REM so each candidate must be checked before it is accepted.
        if exist "%%i" (
            set "POPPLER_BIN=%%~dpi"
            goto :found_poppler
        )
    )
)
echo.
echo ERROR: Poppler not found!
pause
exit /b 1

:found_poppler
if defined POPPLER_BIN (
    REM Delayed expansion keeps a ")" inside the path from closing this block.
    echo Found Poppler at: !POPPLER_BIN!
    set "PATH=!POPPLER_BIN!;!PATH!"
)

REM Check venv
REM The backend and the test driver are both launched through the interpreter
REM in venv-win, so stop early with setup instructions when it is missing.
if exist "venv-win\Scripts\python.exe" goto :venv_ready
echo ERROR: venv-win not found!
echo Run: python -m venv venv-win
echo Then: venv-win\Scripts\pip install -r requirements.txt
pause
exit /b 1

:venv_ready

REM Set common environment
REM
REM Connection settings and credentials keep their historical defaults, but a
REM value that is already present in the caller's environment now wins, so real
REM secrets can be supplied without editing this file.
REM NOTE(review): a database password and a placeholder JWT key are hard-coded
REM below; consider moving them out of source control.
if not defined JWT_SECRET_KEY set "JWT_SECRET_KEY=generate_with_secrets_token_urlsafe_32"
if not defined ORACLE_HOST set "ORACLE_HOST=10.0.20.121"
if not defined ORACLE_PORT set "ORACLE_PORT=1521"
if not defined ORACLE_USER set "ORACLE_USER=CONTAFIN_ORACLE"
if not defined ORACLE_PASSWORD set "ORACLE_PASSWORD=ROMFASTSOFT"
if not defined ORACLE_SERVICE_NAME set "ORACLE_SERVICE_NAME=ROA"

REM Benchmark settings are always forced so that every run is comparable.
REM The quoted form of "set" keeps stray trailing blanks out of the values.
set "OCR_ENABLE_PADDLEOCR=false"
set "OCR_ENABLE_TESSERACT=false"
REM NOTE(review): confirm "hybrid-doctr" is an engine name the backend accepts.
set "OCR_DEFAULT_ENGINE=hybrid-doctr"
set "OCR_MAX_TASKS_PER_CHILD=0"
set "LOG_LEVEL=WARNING"

REM Results file with timestamp (ocr_benchmark_YYYYMMDD_HHMM.json).
REM wmic is tried first; it is deprecated and missing on recent Windows
REM installs, so PowerShell is used as a fallback. Both produce a string that
REM starts with yyyyMMddHHmmss, which is all the substrings below rely on.
set "datetime="
for /f "tokens=2 delims==" %%I in ('wmic os get localdatetime /value 2^>nul') do set "datetime=%%I"
if not defined datetime for /f "usebackq delims=" %%I in (`powershell -NoProfile -Command "Get-Date -Format yyyyMMddHHmmss" 2^>nul`) do set "datetime=%%I"
if not defined datetime (
    echo ERROR: Could not determine the current date and time.
    pause
    exit /b 1
)
set "RESULTS_FILE=ocr_benchmark_%datetime:~0,8%_%datetime:~8,4%.json"

echo Results will be saved to: %RESULTS_FILE%
echo.

REM Delete old results file if exists
if exist "%RESULTS_FILE%" del "%RESULTS_FILE%"

REM Run tests with specified workers: one :run_test call per requested count.
for %%W in (%WORKER_LIST%) do call :run_test %%W

REM Jump over the subroutine definitions straight to the final report.
goto :show_summary

:run_test
REM ---------------------------------------------------------------------------
REM run_test <workers>
REM   Runs one benchmark pass: stops any backend left on port 8006, starts a
REM   fresh backend in a minimized window with OCR_WORKERS=<workers>, polls
REM   http://localhost:8006/health until it answers (40 tries, 3s apart), waits
REM   30s for warmup, runs the Python test driver writing to %RESULTS_FILE%,
REM   then stops the backend again.
REM   Returns 1 if the backend never became ready, otherwise the exit code of
REM   the test driver.
REM ---------------------------------------------------------------------------
set "WORKERS=%~1"
echo.
echo ############################################################
echo STARTING TEST WITH %WORKERS% WORKER(S)
echo ############################################################
echo.

REM Kill existing processes on port 8006
echo Cleaning up old processes...
call :stop_backend
timeout /t 3 >nul

REM Set workers count
set "OCR_WORKERS=%WORKERS%"

echo Starting backend with %WORKERS% OCR worker(s)...

REM Start backend in a new minimized window with all OCR env vars
start /min "ROA2WEB Backend %WORKERS% workers" cmd /c "set OCR_WORKERS=%WORKERS%&& set OCR_ENABLE_PADDLEOCR=false&& set OCR_ENABLE_TESSERACT=false&& set OCR_DEFAULT_ENGINE=hybrid-doctr&& set LOG_LEVEL=WARNING&& venv-win\Scripts\python.exe -m uvicorn main:app --host 0.0.0.0 --port 8006 --workers 1 2>&1"

REM Wait for backend to be ready
echo Waiting for backend to start...
set attempts=0
:wait_loop
timeout /t 3 >nul
set /a attempts+=1
curl -s http://localhost:8006/health >nul 2>&1
if not errorlevel 1 goto :backend_ready
if !attempts! lss 40 (
    echo Waiting... !attempts!/40
    goto :wait_loop
)
echo ERROR: Backend failed to start!
REM Do not leave a half-started backend window or process behind.
call :stop_backend
exit /b 1

:backend_ready
echo Backend is ready!

REM Wait for OCR warmup
echo Waiting for OCR worker warmup (30s)...
timeout /t 30 >nul

echo.
echo Running OCR test with %WORKERS% worker(s)...
echo.

venv-win\Scripts\python.exe ..\tests\ocr-validation\test_receipts_parallel_windows.py --port 8006 --workers %WORKERS% --output "%RESULTS_FILE%"
REM Remember the driver's exit code; the cleanup commands below overwrite it.
set "TEST_RC=%errorlevel%"
if not "%TEST_RC%"=="0" echo WARNING: OCR test with %WORKERS% workers exited with code %TEST_RC%.

REM Stop backend
echo.
echo Stopping backend...
call :stop_backend

REM Wait for memory to be released
echo Releasing memory (10s)...
timeout /t 10 >nul
exit /b %TEST_RC%

:stop_backend
REM Helper: closes every "ROA2WEB Backend*" window and force-kills whatever is
REM still listening on port 8006. The trailing blank in ":8006 " keeps ports
REM such as 80060 from matching.
taskkill /F /FI "WINDOWTITLE eq ROA2WEB Backend*" >nul 2>&1
for /f "tokens=5" %%a in ('netstat -ano ^| findstr /C:":8006 " ^| findstr LISTENING 2^>nul') do (
    taskkill /F /PID %%a >nul 2>&1
)
exit /b 0

:show_summary
REM Final report: prints the results file name and, when the file exists, one
REM summary line per benchmark entry read from it.
echo.
echo ############################################################
echo ALL TESTS COMPLETE
echo ############################################################
echo.
echo Results saved to: %RESULTS_FILE%
echo.

REM Show summary from results file
if not exist "%RESULTS_FILE%" goto :summary_done
echo BENCHMARK SUMMARY:
echo ------------------
REM The Python snippet contains no inner double quotes on purpose: cmd.exe
REM treats \" as the END of a quoted string, which would leave the ")" in
REM "worker(s)" unquoted and break the batch parser.
venv-win\Scripts\python.exe -c "import json; data=json.load(open('%RESULTS_FILE%')); print(); [print(' {} worker(s): {:.1f}s total, {:.1f}s avg, {:.0f}MB peak, {}/{} success'.format(r['workers'], r['total_time'], r['avg_time'], r.get('peak_memory_mb', 0), r['successful'], r['submitted'])) for r in data]"
echo.

:summary_done
echo Press any key to exit...
pause >nul

endlocal