feat(ocr): Add memory management and auto-restart for OCR stability
OCR Service improvements:
- Single worker ThreadPoolExecutor to prevent parallel OCR memory accumulation
- Semaphore to ensure only one OCR operation at a time
- Explicit numpy array cleanup after each OCR step
- Forced garbage collection after every OCR request
- Memory threshold check (2500MB) with pre-processing GC
- Memory usage logging before/after processing

Backend auto-restart:
- Add run-with-restart.sh wrapper script for uvicorn
- Auto-restart on crash (OOM, etc.) with max 10 restarts
- Reset restart counter if process runs >60s (stable)
- Graceful exit on SIGINT/SIGTERM
- Update start-prod.sh and start-test.sh to use wrapper

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -2,7 +2,9 @@
|
||||
|
||||
import os
|
||||
import re
|
||||
import gc
|
||||
import logging
|
||||
import threading
|
||||
|
||||
# Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import
|
||||
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
|
||||
@@ -23,10 +25,26 @@ from backend.modules.data_entry.services.ocr.validation import OCRValidationEngi
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_memory_usage_mb() -> float:
    """Return this process's resident memory usage in MB (best effort).

    Prefers the *current* resident set size read from ``/proc/self/statm`` so
    that readings taken before and after a garbage-collection pass can
    actually differ. Where procfs is unavailable it falls back to
    ``ru_maxrss``, which is the *peak* RSS (a high-water mark that never
    decreases).

    Returns:
        Memory usage in megabytes, or ``0.0`` if it cannot be determined.
    """
    try:
        # The second field of statm is the resident set size, in pages.
        with open("/proc/self/statm", encoding="ascii") as statm:
            resident_pages = int(statm.read().split()[1])
        return resident_pages * os.sysconf("SC_PAGE_SIZE") / (1024 * 1024)
    except (OSError, ValueError, IndexError, AttributeError):
        pass  # No usable procfs (macOS, Windows, ...): fall back below.

    try:
        import resource
        import sys

        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    except Exception:
        # Deliberately best-effort: memory logging must never break OCR.
        return 0.0

    # ru_maxrss is reported in kilobytes on Linux but in bytes on macOS.
    divisor = 1024 * 1024 if sys.platform == "darwin" else 1024
    return peak / divisor
|
||||
|
||||
|
||||
class OCRService:
|
||||
"""Service for OCR processing of receipt images."""
|
||||
|
||||
_executor = ThreadPoolExecutor(max_workers=2)
|
||||
# Single worker to prevent memory accumulation from parallel OCR
|
||||
_executor = ThreadPoolExecutor(max_workers=1)
|
||||
# Semaphore to ensure only one OCR operation at a time (memory protection)
|
||||
_ocr_semaphore = threading.Semaphore(1)
|
||||
# Memory threshold in MB - if exceeded, force GC before processing
|
||||
_memory_threshold_mb = 2500
|
||||
|
||||
def __init__(self):
|
||||
self.preprocessor = ImagePreprocessor()
|
||||
@@ -60,6 +78,16 @@ class OCRService:
|
||||
except Exception as e:
|
||||
return False, f"OCR processing failed: {str(e)}", None
|
||||
|
||||
def _cleanup_memory(self, *arrays):
|
||||
"""Explicitly delete numpy arrays and force garbage collection."""
|
||||
for arr in arrays:
|
||||
if arr is not None:
|
||||
try:
|
||||
del arr
|
||||
except:
|
||||
pass
|
||||
gc.collect()
|
||||
|
||||
def _process_sync(
|
||||
self,
|
||||
image_path: Path,
|
||||
@@ -67,16 +95,54 @@ class OCRService:
|
||||
) -> Tuple[bool, str, Optional[ExtractionResult]]:
|
||||
"""Synchronous processing with ADAPTIVE OCR pipeline."""
|
||||
|
||||
# Acquire semaphore to ensure only one OCR at a time
|
||||
acquired = self._ocr_semaphore.acquire(timeout=120) # 2 min timeout
|
||||
if not acquired:
|
||||
return False, "OCR service busy - please try again", None
|
||||
|
||||
try:
|
||||
return self._process_sync_internal(image_path, mime_type)
|
||||
finally:
|
||||
# Always release semaphore and cleanup
|
||||
self._ocr_semaphore.release()
|
||||
# Force garbage collection after EVERY OCR request
|
||||
gc.collect()
|
||||
mem_after = get_memory_usage_mb()
|
||||
print(f"[OCR Service] Memory after cleanup: {mem_after:.0f}MB", flush=True)
|
||||
|
||||
def _process_sync_internal(
|
||||
self,
|
||||
image_path: Path,
|
||||
mime_type: str
|
||||
) -> Tuple[bool, str, Optional[ExtractionResult]]:
|
||||
"""Internal processing - called with semaphore held."""
|
||||
|
||||
start_time = time.time()
|
||||
mem_before = get_memory_usage_mb()
|
||||
print(f"[OCR Service] Starting processing: {image_path}, mime: {mime_type}", flush=True)
|
||||
print(f"[OCR Service] Memory before: {mem_before:.0f}MB", flush=True)
|
||||
|
||||
# Check if memory is high - force GC before processing
|
||||
if mem_before > self._memory_threshold_mb:
|
||||
print(f"[OCR Service] ⚠️ Memory high ({mem_before:.0f}MB > {self._memory_threshold_mb}MB), forcing GC...", flush=True)
|
||||
gc.collect()
|
||||
mem_after_gc = get_memory_usage_mb()
|
||||
print(f"[OCR Service] Memory after pre-GC: {mem_after_gc:.0f}MB", flush=True)
|
||||
|
||||
# Load image
|
||||
images = None # For cleanup
|
||||
image = None
|
||||
if mime_type == 'application/pdf':
|
||||
try:
|
||||
images = self.preprocessor.pdf_to_images(image_path)
|
||||
if not images:
|
||||
return False, "Failed to extract images from PDF", None
|
||||
image = images[0]
|
||||
# Delete other pages immediately to save memory
|
||||
if len(images) > 1:
|
||||
for i in range(1, len(images)):
|
||||
del images[i]
|
||||
images = [image]
|
||||
except RuntimeError as e:
|
||||
return False, str(e), None
|
||||
else:
|
||||
@@ -98,6 +164,10 @@ class OCRService:
|
||||
|
||||
try:
|
||||
paddle_light = self.ocr_engine._paddle_recognize(light_img)
|
||||
# Cleanup light_img immediately after OCR
|
||||
del light_img
|
||||
light_img = None
|
||||
|
||||
if paddle_light and paddle_light.text:
|
||||
extraction = self.extractor.extract(paddle_light.text)
|
||||
extraction.ocr_engine = "paddle-light"
|
||||
@@ -119,12 +189,19 @@ class OCRService:
|
||||
elapsed_ms = int((time.time() - start_time) * 1000)
|
||||
extraction.processing_time_ms = elapsed_ms
|
||||
print(f"[OCR] ✓✓✓ EARLY EXIT at Step 1 - All fields found! ({elapsed_ms}ms) ✓✓✓", flush=True)
|
||||
# Cleanup before return
|
||||
del image
|
||||
if images:
|
||||
del images
|
||||
return True, "OCR complete (fast mode)", extraction
|
||||
else:
|
||||
print("[OCR] → Step 1 incomplete, continuing to Step 2...", flush=True)
|
||||
except Exception as e:
|
||||
print(f"[OCR] PaddleOCR light failed: {e}", flush=True)
|
||||
extraction = ExtractionResult()
|
||||
# Cleanup on error
|
||||
if light_img is not None:
|
||||
del light_img
|
||||
|
||||
# ══════════════════════════════════════════════════════════════
|
||||
# STEP 2: PaddleOCR + Medium (balanced preprocessing)
|
||||
@@ -136,6 +213,10 @@ class OCRService:
|
||||
|
||||
try:
|
||||
paddle_medium = self.ocr_engine._paddle_recognize(medium_img)
|
||||
# Cleanup medium_img immediately after OCR
|
||||
del medium_img
|
||||
medium_img = None
|
||||
|
||||
if paddle_medium and paddle_medium.text:
|
||||
extraction_medium = self.extractor.extract(paddle_medium.text)
|
||||
extraction_medium.ocr_engine = "paddle-medium"
|
||||
@@ -164,11 +245,18 @@ class OCRService:
|
||||
elapsed_ms = int((time.time() - start_time) * 1000)
|
||||
extraction.processing_time_ms = elapsed_ms
|
||||
print(f"[OCR] ✓✓✓ EARLY EXIT at Step 2 - All fields found after merge! ({elapsed_ms}ms) ✓✓✓", flush=True)
|
||||
# Cleanup before return
|
||||
del image
|
||||
if images:
|
||||
del images
|
||||
return True, "OCR complete (paddle dual)", extraction
|
||||
else:
|
||||
print("[OCR] → Step 2 incomplete, continuing to Step 3 (Tesseract)...", flush=True)
|
||||
except Exception as e:
|
||||
print(f"[OCR] PaddleOCR medium failed: {e}", flush=True)
|
||||
# Cleanup on error
|
||||
if medium_img is not None:
|
||||
del medium_img
|
||||
|
||||
# ══════════════════════════════════════════════════════════════
|
||||
# STEP 3: Tesseract - ONLY to complete missing fields
|
||||
@@ -178,10 +266,15 @@ class OCRService:
|
||||
print("[OCR] STEP 3: Tesseract (complement only, not override)", flush=True)
|
||||
print("=" * 60, flush=True)
|
||||
|
||||
tesseract_img = None
|
||||
try:
|
||||
# Use Tesseract-specific preprocessing (Otsu binarization)
|
||||
tesseract_img = self.preprocessor.preprocess_for_tesseract(image)
|
||||
tesseract_result = self.ocr_engine._tesseract_recognize(tesseract_img)
|
||||
# Cleanup tesseract_img immediately after OCR
|
||||
del tesseract_img
|
||||
tesseract_img = None
|
||||
|
||||
if tesseract_result and tesseract_result.text:
|
||||
extraction_tess = self.extractor.extract(tesseract_result.text)
|
||||
extraction_tess.ocr_engine = "tesseract"
|
||||
@@ -197,6 +290,14 @@ class OCRService:
|
||||
extraction = self._complement_extraction(extraction, extraction_tess)
|
||||
except Exception as e:
|
||||
print(f"[OCR] Tesseract failed: {e}", flush=True)
|
||||
# Cleanup on error
|
||||
if tesseract_img is not None:
|
||||
del tesseract_img
|
||||
|
||||
# Cleanup original image - no longer needed
|
||||
del image
|
||||
if images:
|
||||
del images
|
||||
|
||||
# ══════════════════════════════════════════════════════════════
|
||||
# FINAL VALIDATION: Fix impossible values
|
||||
|
||||
51
backend/run-with-restart.sh
Normal file
51
backend/run-with-restart.sh
Normal file
@@ -0,0 +1,51 @@
|
||||
#!/bin/bash
# Wrapper script that auto-restarts uvicorn on crash
# Usage: ./run-with-restart.sh [port] [log_file]
#
# Restarts uvicorn whenever it exits, giving up after MAX_RESTARTS
# consecutive quick failures. A run longer than STABLE_SECONDS counts as
# stable and resets the failure counter. Exit codes 130/143 (SIGINT/SIGTERM)
# stop the loop with status 0.

PORT=${1:-8000}
LOG_FILE=${2:-/tmp/unified_backend.log}
MAX_RESTARTS=10
RESTART_COUNT=0
RESTART_DELAY=3
STABLE_SECONDS=60

# Print a message to stdout and append it to the log file.
log() {
    echo "$*" | tee -a "$LOG_FILE"
}

log "[Backend Runner] Starting uvicorn with auto-restart (max $MAX_RESTARTS restarts)"
log "[Backend Runner] Port: $PORT, Log: $LOG_FILE"

while [ "$RESTART_COUNT" -lt "$MAX_RESTARTS" ]; do
    START_TIME=$(date +%s)
    log ""
    log "[Backend Runner] $(date '+%Y-%m-%d %H:%M:%S') - Starting uvicorn (attempt $((RESTART_COUNT + 1))/$MAX_RESTARTS)..."

    # Run uvicorn - it will exit on crash (OOM, etc.)
    uvicorn main:app --host 0.0.0.0 --port "$PORT" 2>&1 | tee -a "$LOG_FILE"
    # $? after a pipeline is the status of the LAST command (tee), which is
    # practically always 0. PIPESTATUS[0] is uvicorn's own exit status and
    # must be read immediately, before any other command overwrites it.
    EXIT_CODE=${PIPESTATUS[0]}

    END_TIME=$(date +%s)
    RUNTIME=$((END_TIME - START_TIME))

    log "[Backend Runner] $(date '+%Y-%m-%d %H:%M:%S') - uvicorn exited with code $EXIT_CODE after ${RUNTIME}s"

    # Exit if user sent SIGINT (Ctrl+C) or SIGTERM (128 + signal number).
    # Checked before the crash accounting so an intentional stop is not
    # logged as a quick crash.
    if [ "$EXIT_CODE" -eq 130 ] || [ "$EXIT_CODE" -eq 143 ]; then
        log "[Backend Runner] Received termination signal, exiting..."
        exit 0
    fi

    # If it ran for more than STABLE_SECONDS, reset restart counter (was stable)
    if [ "$RUNTIME" -gt "$STABLE_SECONDS" ]; then
        RESTART_COUNT=0
        log "[Backend Runner] Process was stable (ran ${RUNTIME}s > ${STABLE_SECONDS}s), resetting restart counter"
    else
        RESTART_COUNT=$((RESTART_COUNT + 1))
        log "[Backend Runner] Quick crash detected, restart count: $RESTART_COUNT/$MAX_RESTARTS"
    fi

    # Wait before restart
    if [ "$RESTART_COUNT" -lt "$MAX_RESTARTS" ]; then
        log "[Backend Runner] Restarting in ${RESTART_DELAY}s..."
        sleep "$RESTART_DELAY"
    fi
done

log "[Backend Runner] Max restarts ($MAX_RESTARTS) reached. Giving up."
exit 1
|
||||
Reference in New Issue
Block a user