feat(ocr): Add docTR OCR engine with metrics infrastructure
Add docTR as primary OCR engine with 2-tier sequential processing, OCR metrics tracking, and simplified engine selection. Features: - docTR OCR engine with light+medium preprocessing tiers - doctr_plus mode with early exit optimization (~65% fast path) - OCR metrics dashboard with per-engine statistics - User OCR preference persistence - Parallel worker pool for OCR processing - Cross-validation for extraction quality Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
168
backend/TEST-OCR-WINDOWS.bat
Normal file
168
backend/TEST-OCR-WINDOWS.bat
Normal file
@@ -0,0 +1,168 @@
|
||||
@echo off
REM ==================================================================
REM OCR benchmark driver for Windows.
REM
REM For every requested worker count this script starts the backend
REM on port 8006, runs the parallel receipt test against it, stops
REM the backend again and finally prints a summary of the results.
REM ==================================================================
setlocal enabledelayedexpansion

REM Work relative to the directory that contains this script.
cd /d "%~dp0"

REM Parse command line arguments for worker counts
REM Usage: TEST-OCR-WINDOWS.bat [worker_counts...]
REM Examples:
REM TEST-OCR-WINDOWS.bat -> tests 1,2,3 workers (default)
REM TEST-OCR-WINDOWS.bat 1 -> tests only 1 worker
REM TEST-OCR-WINDOWS.bat 3 6 -> tests 3 and 6 workers
REM TEST-OCR-WINDOWS.bat 1 2 3 4 5 6 -> tests all

set "WORKER_LIST=%*"
REM "if not defined" is used instead of a quoted string comparison so
REM that arguments containing quotes cannot break the IF syntax.
if not defined WORKER_LIST set "WORKER_LIST=1 2 3"

echo.
echo ==========================================
echo OCR Benchmark - Windows (Workers: %WORKER_LIST%)
echo ==========================================
echo.
|
||||
|
||||
REM Check if Poppler is installed
REM pdftoppm is looked up on PATH first; if it is not there, the
REM E:\poppler tree is searched and the directory holding the
REM executable is prepended to PATH.
where pdftoppm >nul 2>&1
if errorlevel 1 (
    echo Checking for Poppler...
    if exist "E:\poppler" (
        REM FOR /R with a set that has no wildcard yields
        REM "<dir>\pdftoppm.exe" for EVERY directory in the tree,
        REM whether or not the file exists there, so each candidate
        REM must be verified before it is accepted.
        for /r "E:\poppler" %%i in (pdftoppm.exe) do (
            if exist "%%i" (
                set "POPPLER_BIN=%%~dpi"
                goto :found_poppler
            )
        )
    )
    echo.
    echo ERROR: Poppler not found!
    pause
    exit /b 1
)
:found_poppler
if defined POPPLER_BIN (
    echo Found Poppler at: %POPPLER_BIN%
    set "PATH=%POPPLER_BIN%;%PATH%"
)
|
||||
|
||||
REM The benchmark needs the Windows virtual environment's interpreter;
REM stop with setup instructions when it is missing.
if exist "venv-win\Scripts\python.exe" goto :venv_ok
echo ERROR: venv-win not found!
echo Run: python -m venv venv-win
echo Then: venv-win\Scripts\pip install -r requirements.txt
pause
exit /b 1

:venv_ok
|
||||
|
||||
REM Set common environment
REM
REM Secrets and connection settings are only DEFAULTS: a value that is
REM already present in the caller's environment wins, so credentials
REM can be supplied from outside instead of being edited into this file.
REM NOTE(review): the fallback values below are hard-coded credentials
REM committed to version control - consider removing them entirely.
if not defined JWT_SECRET_KEY set "JWT_SECRET_KEY=generate_with_secrets_token_urlsafe_32"
if not defined ORACLE_HOST set "ORACLE_HOST=10.0.20.121"
if not defined ORACLE_PORT set "ORACLE_PORT=1521"
if not defined ORACLE_USER set "ORACLE_USER=CONTAFIN_ORACLE"
if not defined ORACLE_PASSWORD set "ORACLE_PASSWORD=ROMFASTSOFT"
if not defined ORACLE_SERVICE_NAME set "ORACLE_SERVICE_NAME=ROA"

REM Benchmark settings are always forced so every run is comparable.
set "OCR_ENABLE_PADDLEOCR=false"
set "OCR_ENABLE_TESSERACT=false"
REM NOTE(review): the commit message lists the engines tesseract,
REM doctr, doctr_plus and paddleocr - confirm "hybrid-doctr" is still
REM a value the backend accepts.
set "OCR_DEFAULT_ENGINE=hybrid-doctr"
set "OCR_MAX_TASKS_PER_CHILD=0"
set "LOG_LEVEL=WARNING"
|
||||
|
||||
REM Results file with timestamp (ocr_benchmark_YYYYMMDD_HHMM.json)
REM
REM WMIC is tried first; it is deprecated and may be missing on recent
REM Windows installations, so PowerShell is used as a fallback. Any
REM inherited "datetime" value is cleared so a stale one is never used.
set "datetime="
for /f "tokens=2 delims==" %%I in ('wmic os get localdatetime /value 2^>nul') do set "datetime=%%I"
if not defined datetime for /f "delims=" %%I in ('powershell -NoProfile -Command "Get-Date -Format yyyyMMddHHmmss" 2^>nul') do set "datetime=%%I"
REM Last resort: keep the file name well-formed even without a clock source.
if not defined datetime set "datetime=00000000000000"
set "RESULTS_FILE=ocr_benchmark_%datetime:~0,8%_%datetime:~8,4%.json"

echo Results will be saved to: %RESULTS_FILE%
echo.

REM Delete old results file if exists
if exist "%RESULTS_FILE%" del "%RESULTS_FILE%"
|
||||
|
||||
REM Execute one benchmark pass per requested worker count, then jump
REM over the subroutine definitions to the final summary.
for %%W in (%WORKER_LIST%) do call :run_test %%W

goto :show_summary
|
||||
|
||||
:run_test
REM ------------------------------------------------------------------
REM Subroutine: run one benchmark pass.
REM   %1  number of OCR workers to benchmark in this pass
REM Uses RESULTS_FILE as the output path handed to the test script.
REM Returns 1 when the backend never answers on /health, otherwise
REM falls through to the end of the subroutine.
REM ------------------------------------------------------------------
set "WORKERS=%~1"
echo.
echo ############################################################
echo STARTING TEST WITH %WORKERS% WORKER(S)
echo ############################################################
echo.

REM Kill existing processes on port 8006
echo Cleaning up old processes...
call :kill_backend
timeout /t 3 >nul

REM Set workers count
set "OCR_WORKERS=%WORKERS%"

echo Starting backend with %WORKERS% OCR worker(s)...

REM Start backend in a new minimized window with all OCR env vars
start /min "ROA2WEB Backend %WORKERS% workers" cmd /c "set OCR_WORKERS=%WORKERS%&& set OCR_ENABLE_PADDLEOCR=false&& set OCR_ENABLE_TESSERACT=false&& set OCR_DEFAULT_ENGINE=hybrid-doctr&& set LOG_LEVEL=WARNING&& venv-win\Scripts\python.exe -m uvicorn main:app --host 0.0.0.0 --port 8006 --workers 1 2>&1"

REM Wait for backend to be ready: poll /health every 3 seconds,
REM giving up after 40 attempts.
echo Waiting for backend to start...
set attempts=0
:wait_loop
timeout /t 3 >nul
set /a attempts+=1
curl -s http://localhost:8006/health >nul 2>&1
if not errorlevel 1 goto :backend_ready
if %attempts% lss 40 (
    echo Waiting... %attempts%/40
    goto :wait_loop
)
echo ERROR: Backend failed to start!
REM Do not leave a half-started backend behind for the next pass.
call :kill_backend
exit /b 1

:backend_ready
echo Backend is ready!

REM Wait for OCR warmup
echo Waiting for OCR worker warmup (30s)...
timeout /t 30 >nul

echo.
echo Running OCR test with %WORKERS% worker(s)...
echo.

venv-win\Scripts\python.exe ..\tests\ocr-validation\test_receipts_parallel_windows.py --port 8006 --workers %WORKERS% --output "%RESULTS_FILE%"
if errorlevel 1 echo WARNING: OCR test with %WORKERS% worker(s) exited with an error.

REM Stop backend
echo.
echo Stopping backend...
call :kill_backend

REM Wait for memory to be released
echo Releasing memory (10s)...
timeout /t 10 >nul
goto :eof

:kill_backend
REM Helper: terminate the backend window and whatever is still
REM listening on port 8006. The pattern ":8006 " (with the trailing
REM space) keeps ports such as 80060 from matching.
taskkill /F /FI "WINDOWTITLE eq ROA2WEB Backend*" >nul 2>&1
for /f "tokens=5" %%a in ('netstat -ano ^| findstr /r /c:":8006 .*LISTENING" 2^>nul') do (
    taskkill /F /PID %%a >nul 2>&1
)
goto :eof
|
||||
|
||||
:show_summary
REM ------------------------------------------------------------------
REM Final report: announce completion and, when the results file
REM exists, print one summary line per benchmark entry it contains.
REM ------------------------------------------------------------------
echo.
echo ############################################################
echo ALL TESTS COMPLETE
echo ############################################################
echo.
echo Results saved to: %RESULTS_FILE%
echo.

REM Show summary from results file
REM
REM The Python one-liner is deliberately kept OUTSIDE a parenthesised
REM block and free of embedded double quotes: cmd does not understand
REM \" as an escape, so the text after it would be unquoted and the
REM ")" in "worker(s)" would terminate an enclosing IF block.
if not exist "%RESULTS_FILE%" goto :summary_done
echo BENCHMARK SUMMARY:
echo ------------------
venv-win\Scripts\python.exe -c "import json; data=json.load(open('%RESULTS_FILE%')); print(); [print(' {workers} worker(s): {total_time:.1f}s total, {avg_time:.1f}s avg, {peak_memory_mb:.0f}MB peak, {successful}/{submitted} success'.format_map({**r, 'peak_memory_mb': r.get('peak_memory_mb', 0)})) for r in data]"
echo.

:summary_done
echo Press any key to exit...
pause >nul

endlocal
|
||||
Reference in New Issue
Block a user