feat(ocr): Add memory management and auto-restart for OCR stability

OCR Service improvements:
- Single worker ThreadPoolExecutor to prevent parallel OCR memory accumulation
- Semaphore to ensure only one OCR operation at a time
- Explicit numpy array cleanup after each OCR step
- Forced garbage collection after every OCR request
- Memory threshold check (2500MB) with pre-processing GC
- Memory usage logging before/after processing

Backend auto-restart:
- Add run-with-restart.sh wrapper script for uvicorn
- Auto-restart on crash (OOM, etc.) with max 10 restarts
- Reset restart counter if process runs >60s (stable)
- Graceful exit on SIGINT/SIGTERM
- Update start-prod.sh and start-test.sh to use wrapper

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-12-30 22:03:51 +02:00
parent c5682ead42
commit 51d736addf
4 changed files with 159 additions and 7 deletions

View File

@@ -2,7 +2,9 @@
import os
import re
import gc
import logging
import threading
# Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
@@ -23,10 +25,26 @@ from backend.modules.data_entry.services.ocr.validation import OCRValidationEngi
logger = logging.getLogger(__name__)
def get_memory_usage_mb() -> float:
    """Return the current resident memory of this process in MB.

    The value is read from ``/proc/self/statm`` (Linux), so it goes down
    again once memory is released. ``resource.ru_maxrss`` is only the
    *peak* resident set size: it never decreases and therefore cannot show
    the effect of a garbage collection. It is used solely as a fallback
    when ``/proc`` is not available.

    Returns:
        Memory usage in MB, or 0.0 if it cannot be determined.
    """
    try:
        import os
        with open("/proc/self/statm") as statm:
            # Second field of statm is the resident set size in pages.
            resident_pages = int(statm.read().split()[1])
        return resident_pages * os.sysconf("SC_PAGE_SIZE") / (1024 * 1024)
    except (OSError, ValueError, IndexError, AttributeError):
        pass  # No usable /proc (non-Linux): fall back to the peak value.

    try:
        import resource
        import sys
        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        # ru_maxrss is reported in bytes on macOS and in kilobytes on Linux.
        divisor = 1024 * 1024 if sys.platform == "darwin" else 1024
        return peak / divisor
    except Exception:
        # Best effort only: memory logging must never break OCR processing.
        return 0.0
class OCRService:
"""Service for OCR processing of receipt images."""
_executor = ThreadPoolExecutor(max_workers=2)
# Single worker to prevent memory accumulation from parallel OCR
_executor = ThreadPoolExecutor(max_workers=1)
# Semaphore to ensure only one OCR operation at a time (memory protection)
_ocr_semaphore = threading.Semaphore(1)
# Memory threshold in MB - if exceeded, force GC before processing
_memory_threshold_mb = 2500
def __init__(self):
self.preprocessor = ImagePreprocessor()
@@ -60,6 +78,16 @@ class OCRService:
except Exception as e:
return False, f"OCR processing failed: {str(e)}", None
def _cleanup_memory(self, *arrays):
"""Explicitly delete numpy arrays and force garbage collection."""
for arr in arrays:
if arr is not None:
try:
del arr
except:
pass
gc.collect()
def _process_sync(
self,
image_path: Path,
@@ -67,16 +95,54 @@ class OCRService:
) -> Tuple[bool, str, Optional[ExtractionResult]]:
"""Synchronous processing with ADAPTIVE OCR pipeline."""
# Acquire semaphore to ensure only one OCR at a time
acquired = self._ocr_semaphore.acquire(timeout=120) # 2 min timeout
if not acquired:
return False, "OCR service busy - please try again", None
try:
return self._process_sync_internal(image_path, mime_type)
finally:
# Always release semaphore and cleanup
self._ocr_semaphore.release()
# Force garbage collection after EVERY OCR request
gc.collect()
mem_after = get_memory_usage_mb()
print(f"[OCR Service] Memory after cleanup: {mem_after:.0f}MB", flush=True)
def _process_sync_internal(
self,
image_path: Path,
mime_type: str
) -> Tuple[bool, str, Optional[ExtractionResult]]:
"""Internal processing - called with semaphore held."""
start_time = time.time()
mem_before = get_memory_usage_mb()
print(f"[OCR Service] Starting processing: {image_path}, mime: {mime_type}", flush=True)
print(f"[OCR Service] Memory before: {mem_before:.0f}MB", flush=True)
# Check if memory is high - force GC before processing
if mem_before > self._memory_threshold_mb:
print(f"[OCR Service] ⚠️ Memory high ({mem_before:.0f}MB > {self._memory_threshold_mb}MB), forcing GC...", flush=True)
gc.collect()
mem_after_gc = get_memory_usage_mb()
print(f"[OCR Service] Memory after pre-GC: {mem_after_gc:.0f}MB", flush=True)
# Load image
images = None # For cleanup
image = None
if mime_type == 'application/pdf':
try:
images = self.preprocessor.pdf_to_images(image_path)
if not images:
return False, "Failed to extract images from PDF", None
image = images[0]
# Delete other pages immediately to save memory
if len(images) > 1:
for i in range(1, len(images)):
del images[i]
images = [image]
except RuntimeError as e:
return False, str(e), None
else:
@@ -98,6 +164,10 @@ class OCRService:
try:
paddle_light = self.ocr_engine._paddle_recognize(light_img)
# Cleanup light_img immediately after OCR
del light_img
light_img = None
if paddle_light and paddle_light.text:
extraction = self.extractor.extract(paddle_light.text)
extraction.ocr_engine = "paddle-light"
@@ -119,12 +189,19 @@ class OCRService:
elapsed_ms = int((time.time() - start_time) * 1000)
extraction.processing_time_ms = elapsed_ms
print(f"[OCR] ✓✓✓ EARLY EXIT at Step 1 - All fields found! ({elapsed_ms}ms) ✓✓✓", flush=True)
# Cleanup before return
del image
if images:
del images
return True, "OCR complete (fast mode)", extraction
else:
print("[OCR] → Step 1 incomplete, continuing to Step 2...", flush=True)
except Exception as e:
print(f"[OCR] PaddleOCR light failed: {e}", flush=True)
extraction = ExtractionResult()
# Cleanup on error
if light_img is not None:
del light_img
# ══════════════════════════════════════════════════════════════
# STEP 2: PaddleOCR + Medium (balanced preprocessing)
@@ -136,6 +213,10 @@ class OCRService:
try:
paddle_medium = self.ocr_engine._paddle_recognize(medium_img)
# Cleanup medium_img immediately after OCR
del medium_img
medium_img = None
if paddle_medium and paddle_medium.text:
extraction_medium = self.extractor.extract(paddle_medium.text)
extraction_medium.ocr_engine = "paddle-medium"
@@ -164,11 +245,18 @@ class OCRService:
elapsed_ms = int((time.time() - start_time) * 1000)
extraction.processing_time_ms = elapsed_ms
print(f"[OCR] ✓✓✓ EARLY EXIT at Step 2 - All fields found after merge! ({elapsed_ms}ms) ✓✓✓", flush=True)
# Cleanup before return
del image
if images:
del images
return True, "OCR complete (paddle dual)", extraction
else:
print("[OCR] → Step 2 incomplete, continuing to Step 3 (Tesseract)...", flush=True)
except Exception as e:
print(f"[OCR] PaddleOCR medium failed: {e}", flush=True)
# Cleanup on error
if medium_img is not None:
del medium_img
# ══════════════════════════════════════════════════════════════
# STEP 3: Tesseract - ONLY to complete missing fields
@@ -178,10 +266,15 @@ class OCRService:
print("[OCR] STEP 3: Tesseract (complement only, not override)", flush=True)
print("=" * 60, flush=True)
tesseract_img = None
try:
# Use Tesseract-specific preprocessing (Otsu binarization)
tesseract_img = self.preprocessor.preprocess_for_tesseract(image)
tesseract_result = self.ocr_engine._tesseract_recognize(tesseract_img)
# Cleanup tesseract_img immediately after OCR
del tesseract_img
tesseract_img = None
if tesseract_result and tesseract_result.text:
extraction_tess = self.extractor.extract(tesseract_result.text)
extraction_tess.ocr_engine = "tesseract"
@@ -197,6 +290,14 @@ class OCRService:
extraction = self._complement_extraction(extraction, extraction_tess)
except Exception as e:
print(f"[OCR] Tesseract failed: {e}", flush=True)
# Cleanup on error
if tesseract_img is not None:
del tesseract_img
# Cleanup original image - no longer needed
del image
if images:
del images
# ══════════════════════════════════════════════════════════════
# FINAL VALIDATION: Fix impossible values

View File

@@ -0,0 +1,51 @@
#!/bin/bash
# Wrapper script that auto-restarts uvicorn on crash.
# Usage: ./run-with-restart.sh [port] [log_file]
#
# Behaviour:
#   - Restarts uvicorn after it exits, giving up after MAX_RESTARTS quick
#     crashes in a row.
#   - A run longer than STABLE_SECONDS counts as stable and resets the counter.
#   - Stops without restarting when uvicorn was terminated by
#     SIGINT (exit code 130) or SIGTERM (exit code 143).

PORT=${1:-8000}
LOG_FILE=${2:-/tmp/unified_backend.log}
MAX_RESTARTS=10
RESTART_COUNT=0
RESTART_DELAY=3
STABLE_SECONDS=60

# Print a message to stdout and append it to the log file.
log() {
    echo "$*" | tee -a "$LOG_FILE"
}

log "[Backend Runner] Starting uvicorn with auto-restart (max $MAX_RESTARTS restarts)"
log "[Backend Runner] Port: $PORT, Log: $LOG_FILE"

while [ "$RESTART_COUNT" -lt "$MAX_RESTARTS" ]; do
    START_TIME=$(date +%s)

    log ""
    log "[Backend Runner] $(date '+%Y-%m-%d %H:%M:%S') - Starting uvicorn (attempt $((RESTART_COUNT + 1))/$MAX_RESTARTS)..."

    # Run uvicorn - it will exit on crash (OOM, etc.)
    uvicorn main:app --host 0.0.0.0 --port "$PORT" 2>&1 | tee -a "$LOG_FILE"
    # $? would be the status of tee (the last pipeline command), which hides
    # uvicorn's real exit code; PIPESTATUS[0] is uvicorn's own status.
    EXIT_CODE=${PIPESTATUS[0]}

    END_TIME=$(date +%s)
    RUNTIME=$((END_TIME - START_TIME))

    log "[Backend Runner] $(date '+%Y-%m-%d %H:%M:%S') - uvicorn exited with code $EXIT_CODE after ${RUNTIME}s"

    # Exit if user sent SIGINT (Ctrl+C) or SIGTERM. Checked before the
    # restart bookkeeping so an intentional stop is not logged as a crash.
    if [ "$EXIT_CODE" -eq 130 ] || [ "$EXIT_CODE" -eq 143 ]; then
        log "[Backend Runner] Received termination signal, exiting..."
        exit 0
    fi

    # If it ran for more than STABLE_SECONDS, reset restart counter (was stable)
    if [ "$RUNTIME" -gt "$STABLE_SECONDS" ]; then
        RESTART_COUNT=0
        log "[Backend Runner] Process was stable (>${STABLE_SECONDS}s), resetting restart counter"
    else
        RESTART_COUNT=$((RESTART_COUNT + 1))
        log "[Backend Runner] Quick crash detected, restart count: $RESTART_COUNT/$MAX_RESTARTS"
    fi

    # Wait before restart
    if [ "$RESTART_COUNT" -lt "$MAX_RESTARTS" ]; then
        log "[Backend Runner] Restarting in ${RESTART_DELAY}s..."
        sleep "$RESTART_DELAY"
    fi
done

log "[Backend Runner] Max restarts ($MAX_RESTARTS) reached. Giving up."
exit 1