feat(ocr): Add memory management and auto-restart for OCR stability
OCR Service improvements: - Single worker ThreadPoolExecutor to prevent parallel OCR memory accumulation - Semaphore to ensure only one OCR operation at a time - Explicit numpy array cleanup after each OCR step - Forced garbage collection after every OCR request - Memory threshold check (2500MB) with pre-processing GC - Memory usage logging before/after processing Backend auto-restart: - Add run-with-restart.sh wrapper script for uvicorn - Auto-restart on crash (OOM, etc.) with max 10 restarts - Reset restart counter if process runs >60s (stable) - Graceful exit on SIGINT/SIGTERM - Update start-prod.sh and start-test.sh to use wrapper 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -2,7 +2,9 @@
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import gc
|
||||||
import logging
|
import logging
|
||||||
|
import threading
|
||||||
|
|
||||||
# Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import
|
# Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import
|
||||||
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
|
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
|
||||||
@@ -23,10 +25,26 @@ from backend.modules.data_entry.services.ocr.validation import OCRValidationEngi
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def get_memory_usage_mb() -> float:
    """Return the resident memory of this process in MB.

    Prefers the *current* resident set size read from ``/proc/self/statm``
    so that before/after comparisons around ``gc.collect()`` can actually
    show a decrease. Where that file is unavailable, falls back to
    ``resource.getrusage(...).ru_maxrss``, which is the *peak* RSS of the
    process and therefore never goes down.

    Returns:
        Memory usage in megabytes, or ``0.0`` when it cannot be determined.
    """
    try:
        # Second field of statm is the resident set size, counted in pages.
        with open("/proc/self/statm", encoding="ascii") as statm:
            resident_pages = int(statm.read().split()[1])
        return resident_pages * os.sysconf("SC_PAGE_SIZE") / (1024 * 1024)
    except (OSError, ValueError, IndexError, AttributeError):
        # No procfs (or no os.sysconf) on this platform; use the fallback.
        pass

    try:
        import resource
        import sys

        peak_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        # ru_maxrss is reported in kilobytes on Linux but in bytes on macOS.
        divisor = 1024 * 1024 if sys.platform == "darwin" else 1024
        return peak_rss / divisor
    except Exception:
        # Best-effort metric used only for logging; never fail the caller.
        return 0.0
|
||||||
|
|
||||||
|
|
||||||
class OCRService:
|
class OCRService:
|
||||||
"""Service for OCR processing of receipt images."""
|
"""Service for OCR processing of receipt images."""
|
||||||
|
|
||||||
_executor = ThreadPoolExecutor(max_workers=2)
|
# Single worker to prevent memory accumulation from parallel OCR
|
||||||
|
_executor = ThreadPoolExecutor(max_workers=1)
|
||||||
|
# Semaphore to ensure only one OCR operation at a time (memory protection)
|
||||||
|
_ocr_semaphore = threading.Semaphore(1)
|
||||||
|
# Memory threshold in MB - if exceeded, force GC before processing
|
||||||
|
_memory_threshold_mb = 2500
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.preprocessor = ImagePreprocessor()
|
self.preprocessor = ImagePreprocessor()
|
||||||
@@ -60,6 +78,16 @@ class OCRService:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
return False, f"OCR processing failed: {str(e)}", None
|
return False, f"OCR processing failed: {str(e)}", None
|
||||||
|
|
||||||
|
def _cleanup_memory(self, *arrays):
|
||||||
|
"""Explicitly delete numpy arrays and force garbage collection."""
|
||||||
|
for arr in arrays:
|
||||||
|
if arr is not None:
|
||||||
|
try:
|
||||||
|
del arr
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
gc.collect()
|
||||||
|
|
||||||
def _process_sync(
|
def _process_sync(
|
||||||
self,
|
self,
|
||||||
image_path: Path,
|
image_path: Path,
|
||||||
@@ -67,16 +95,54 @@ class OCRService:
|
|||||||
) -> Tuple[bool, str, Optional[ExtractionResult]]:
|
) -> Tuple[bool, str, Optional[ExtractionResult]]:
|
||||||
"""Synchronous processing with ADAPTIVE OCR pipeline."""
|
"""Synchronous processing with ADAPTIVE OCR pipeline."""
|
||||||
|
|
||||||
|
# Acquire semaphore to ensure only one OCR at a time
|
||||||
|
acquired = self._ocr_semaphore.acquire(timeout=120) # 2 min timeout
|
||||||
|
if not acquired:
|
||||||
|
return False, "OCR service busy - please try again", None
|
||||||
|
|
||||||
|
try:
|
||||||
|
return self._process_sync_internal(image_path, mime_type)
|
||||||
|
finally:
|
||||||
|
# Always release semaphore and cleanup
|
||||||
|
self._ocr_semaphore.release()
|
||||||
|
# Force garbage collection after EVERY OCR request
|
||||||
|
gc.collect()
|
||||||
|
mem_after = get_memory_usage_mb()
|
||||||
|
print(f"[OCR Service] Memory after cleanup: {mem_after:.0f}MB", flush=True)
|
||||||
|
|
||||||
|
def _process_sync_internal(
|
||||||
|
self,
|
||||||
|
image_path: Path,
|
||||||
|
mime_type: str
|
||||||
|
) -> Tuple[bool, str, Optional[ExtractionResult]]:
|
||||||
|
"""Internal processing - called with semaphore held."""
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
mem_before = get_memory_usage_mb()
|
||||||
print(f"[OCR Service] Starting processing: {image_path}, mime: {mime_type}", flush=True)
|
print(f"[OCR Service] Starting processing: {image_path}, mime: {mime_type}", flush=True)
|
||||||
|
print(f"[OCR Service] Memory before: {mem_before:.0f}MB", flush=True)
|
||||||
|
|
||||||
|
# Check if memory is high - force GC before processing
|
||||||
|
if mem_before > self._memory_threshold_mb:
|
||||||
|
print(f"[OCR Service] ⚠️ Memory high ({mem_before:.0f}MB > {self._memory_threshold_mb}MB), forcing GC...", flush=True)
|
||||||
|
gc.collect()
|
||||||
|
mem_after_gc = get_memory_usage_mb()
|
||||||
|
print(f"[OCR Service] Memory after pre-GC: {mem_after_gc:.0f}MB", flush=True)
|
||||||
|
|
||||||
# Load image
|
# Load image
|
||||||
|
images = None # For cleanup
|
||||||
|
image = None
|
||||||
if mime_type == 'application/pdf':
|
if mime_type == 'application/pdf':
|
||||||
try:
|
try:
|
||||||
images = self.preprocessor.pdf_to_images(image_path)
|
images = self.preprocessor.pdf_to_images(image_path)
|
||||||
if not images:
|
if not images:
|
||||||
return False, "Failed to extract images from PDF", None
|
return False, "Failed to extract images from PDF", None
|
||||||
image = images[0]
|
image = images[0]
|
||||||
|
# Delete other pages immediately to save memory
|
||||||
|
if len(images) > 1:
|
||||||
|
for i in range(1, len(images)):
|
||||||
|
del images[i]
|
||||||
|
images = [image]
|
||||||
except RuntimeError as e:
|
except RuntimeError as e:
|
||||||
return False, str(e), None
|
return False, str(e), None
|
||||||
else:
|
else:
|
||||||
@@ -98,6 +164,10 @@ class OCRService:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
paddle_light = self.ocr_engine._paddle_recognize(light_img)
|
paddle_light = self.ocr_engine._paddle_recognize(light_img)
|
||||||
|
# Cleanup light_img immediately after OCR
|
||||||
|
del light_img
|
||||||
|
light_img = None
|
||||||
|
|
||||||
if paddle_light and paddle_light.text:
|
if paddle_light and paddle_light.text:
|
||||||
extraction = self.extractor.extract(paddle_light.text)
|
extraction = self.extractor.extract(paddle_light.text)
|
||||||
extraction.ocr_engine = "paddle-light"
|
extraction.ocr_engine = "paddle-light"
|
||||||
@@ -119,12 +189,19 @@ class OCRService:
|
|||||||
elapsed_ms = int((time.time() - start_time) * 1000)
|
elapsed_ms = int((time.time() - start_time) * 1000)
|
||||||
extraction.processing_time_ms = elapsed_ms
|
extraction.processing_time_ms = elapsed_ms
|
||||||
print(f"[OCR] ✓✓✓ EARLY EXIT at Step 1 - All fields found! ({elapsed_ms}ms) ✓✓✓", flush=True)
|
print(f"[OCR] ✓✓✓ EARLY EXIT at Step 1 - All fields found! ({elapsed_ms}ms) ✓✓✓", flush=True)
|
||||||
|
# Cleanup before return
|
||||||
|
del image
|
||||||
|
if images:
|
||||||
|
del images
|
||||||
return True, "OCR complete (fast mode)", extraction
|
return True, "OCR complete (fast mode)", extraction
|
||||||
else:
|
else:
|
||||||
print("[OCR] → Step 1 incomplete, continuing to Step 2...", flush=True)
|
print("[OCR] → Step 1 incomplete, continuing to Step 2...", flush=True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[OCR] PaddleOCR light failed: {e}", flush=True)
|
print(f"[OCR] PaddleOCR light failed: {e}", flush=True)
|
||||||
extraction = ExtractionResult()
|
extraction = ExtractionResult()
|
||||||
|
# Cleanup on error
|
||||||
|
if light_img is not None:
|
||||||
|
del light_img
|
||||||
|
|
||||||
# ══════════════════════════════════════════════════════════════
|
# ══════════════════════════════════════════════════════════════
|
||||||
# STEP 2: PaddleOCR + Medium (balanced preprocessing)
|
# STEP 2: PaddleOCR + Medium (balanced preprocessing)
|
||||||
@@ -136,6 +213,10 @@ class OCRService:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
paddle_medium = self.ocr_engine._paddle_recognize(medium_img)
|
paddle_medium = self.ocr_engine._paddle_recognize(medium_img)
|
||||||
|
# Cleanup medium_img immediately after OCR
|
||||||
|
del medium_img
|
||||||
|
medium_img = None
|
||||||
|
|
||||||
if paddle_medium and paddle_medium.text:
|
if paddle_medium and paddle_medium.text:
|
||||||
extraction_medium = self.extractor.extract(paddle_medium.text)
|
extraction_medium = self.extractor.extract(paddle_medium.text)
|
||||||
extraction_medium.ocr_engine = "paddle-medium"
|
extraction_medium.ocr_engine = "paddle-medium"
|
||||||
@@ -164,11 +245,18 @@ class OCRService:
|
|||||||
elapsed_ms = int((time.time() - start_time) * 1000)
|
elapsed_ms = int((time.time() - start_time) * 1000)
|
||||||
extraction.processing_time_ms = elapsed_ms
|
extraction.processing_time_ms = elapsed_ms
|
||||||
print(f"[OCR] ✓✓✓ EARLY EXIT at Step 2 - All fields found after merge! ({elapsed_ms}ms) ✓✓✓", flush=True)
|
print(f"[OCR] ✓✓✓ EARLY EXIT at Step 2 - All fields found after merge! ({elapsed_ms}ms) ✓✓✓", flush=True)
|
||||||
|
# Cleanup before return
|
||||||
|
del image
|
||||||
|
if images:
|
||||||
|
del images
|
||||||
return True, "OCR complete (paddle dual)", extraction
|
return True, "OCR complete (paddle dual)", extraction
|
||||||
else:
|
else:
|
||||||
print("[OCR] → Step 2 incomplete, continuing to Step 3 (Tesseract)...", flush=True)
|
print("[OCR] → Step 2 incomplete, continuing to Step 3 (Tesseract)...", flush=True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[OCR] PaddleOCR medium failed: {e}", flush=True)
|
print(f"[OCR] PaddleOCR medium failed: {e}", flush=True)
|
||||||
|
# Cleanup on error
|
||||||
|
if medium_img is not None:
|
||||||
|
del medium_img
|
||||||
|
|
||||||
# ══════════════════════════════════════════════════════════════
|
# ══════════════════════════════════════════════════════════════
|
||||||
# STEP 3: Tesseract - ONLY to complete missing fields
|
# STEP 3: Tesseract - ONLY to complete missing fields
|
||||||
@@ -178,10 +266,15 @@ class OCRService:
|
|||||||
print("[OCR] STEP 3: Tesseract (complement only, not override)", flush=True)
|
print("[OCR] STEP 3: Tesseract (complement only, not override)", flush=True)
|
||||||
print("=" * 60, flush=True)
|
print("=" * 60, flush=True)
|
||||||
|
|
||||||
|
tesseract_img = None
|
||||||
try:
|
try:
|
||||||
# Use Tesseract-specific preprocessing (Otsu binarization)
|
# Use Tesseract-specific preprocessing (Otsu binarization)
|
||||||
tesseract_img = self.preprocessor.preprocess_for_tesseract(image)
|
tesseract_img = self.preprocessor.preprocess_for_tesseract(image)
|
||||||
tesseract_result = self.ocr_engine._tesseract_recognize(tesseract_img)
|
tesseract_result = self.ocr_engine._tesseract_recognize(tesseract_img)
|
||||||
|
# Cleanup tesseract_img immediately after OCR
|
||||||
|
del tesseract_img
|
||||||
|
tesseract_img = None
|
||||||
|
|
||||||
if tesseract_result and tesseract_result.text:
|
if tesseract_result and tesseract_result.text:
|
||||||
extraction_tess = self.extractor.extract(tesseract_result.text)
|
extraction_tess = self.extractor.extract(tesseract_result.text)
|
||||||
extraction_tess.ocr_engine = "tesseract"
|
extraction_tess.ocr_engine = "tesseract"
|
||||||
@@ -197,6 +290,14 @@ class OCRService:
|
|||||||
extraction = self._complement_extraction(extraction, extraction_tess)
|
extraction = self._complement_extraction(extraction, extraction_tess)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[OCR] Tesseract failed: {e}", flush=True)
|
print(f"[OCR] Tesseract failed: {e}", flush=True)
|
||||||
|
# Cleanup on error
|
||||||
|
if tesseract_img is not None:
|
||||||
|
del tesseract_img
|
||||||
|
|
||||||
|
# Cleanup original image - no longer needed
|
||||||
|
del image
|
||||||
|
if images:
|
||||||
|
del images
|
||||||
|
|
||||||
# ══════════════════════════════════════════════════════════════
|
# ══════════════════════════════════════════════════════════════
|
||||||
# FINAL VALIDATION: Fix impossible values
|
# FINAL VALIDATION: Fix impossible values
|
||||||
|
|||||||
51
backend/run-with-restart.sh
Normal file
51
backend/run-with-restart.sh
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
#!/bin/bash
# Wrapper script that auto-restarts uvicorn on crash.
# Usage: ./run-with-restart.sh [port] [log_file]
#
# Behaviour:
#   - Restarts uvicorn after it exits, giving up after MAX_RESTARTS
#     consecutive quick crashes.
#   - A run longer than STABLE_SECONDS resets the restart counter.
#   - A uvicorn exit status of 130 (SIGINT) or 143 (SIGTERM) ends the loop.
#
# NOTE(review): a signal delivered only to this wrapper (e.g. `kill <wrapper pid>`)
# is not forwarded to uvicorn by this script - confirm how the stop scripts
# terminate the backend.

PORT=${1:-8000}
LOG_FILE=${2:-/tmp/unified_backend.log}
MAX_RESTARTS=10
RESTART_COUNT=0
RESTART_DELAY=3
STABLE_SECONDS=60

# Print a message to stdout and append it to the log file.
log() {
    echo "$1" | tee -a "$LOG_FILE"
}

log "[Backend Runner] Starting uvicorn with auto-restart (max $MAX_RESTARTS restarts)"
log "[Backend Runner] Port: $PORT, Log: $LOG_FILE"

while [ "$RESTART_COUNT" -lt "$MAX_RESTARTS" ]; do
    START_TIME=$(date +%s)
    log ""
    log "[Backend Runner] $(date '+%Y-%m-%d %H:%M:%S') - Starting uvicorn (attempt $((RESTART_COUNT + 1))/$MAX_RESTARTS)..."

    # Run uvicorn - it will exit on crash (OOM, etc.)
    uvicorn main:app --host 0.0.0.0 --port "$PORT" 2>&1 | tee -a "$LOG_FILE"
    # $? would be tee's status here; PIPESTATUS[0] is uvicorn's own exit code.
    # It must be read immediately, before any other command overwrites it.
    EXIT_CODE=${PIPESTATUS[0]}

    END_TIME=$(date +%s)
    RUNTIME=$((END_TIME - START_TIME))

    log "[Backend Runner] $(date '+%Y-%m-%d %H:%M:%S') - uvicorn exited with code $EXIT_CODE after ${RUNTIME}s"

    # Exit if uvicorn was stopped by SIGINT (Ctrl+C) or SIGTERM. Checked before
    # the crash accounting so a deliberate stop is not logged as a quick crash.
    if [ "$EXIT_CODE" -eq 130 ] || [ "$EXIT_CODE" -eq 143 ]; then
        log "[Backend Runner] Received termination signal, exiting..."
        exit 0
    fi

    # If it ran for more than STABLE_SECONDS, reset restart counter (was stable)
    if [ "$RUNTIME" -gt "$STABLE_SECONDS" ]; then
        RESTART_COUNT=0
        log "[Backend Runner] Process was stable (ran ${RUNTIME}s > ${STABLE_SECONDS}s), resetting restart counter"
    else
        RESTART_COUNT=$((RESTART_COUNT + 1))
        log "[Backend Runner] Quick crash detected, restart count: $RESTART_COUNT/$MAX_RESTARTS"
    fi

    # Wait before restart
    if [ "$RESTART_COUNT" -lt "$MAX_RESTARTS" ]; then
        log "[Backend Runner] Restarting in ${RESTART_DELAY}s..."
        sleep "$RESTART_DELAY"
    fi
done

log "[Backend Runner] Max restarts ($MAX_RESTARTS) reached. Giving up."
exit 1
|
||||||
@@ -145,9 +145,9 @@ else
|
|||||||
source .env
|
source .env
|
||||||
set +a
|
set +a
|
||||||
|
|
||||||
# Start backend with reload
|
# Start backend with auto-restart on crash (OOM protection)
|
||||||
print_message "Starting unified backend (includes Reports, Data Entry, and Telegram bot)..."
|
print_message "Starting unified backend with auto-restart (includes Reports, Data Entry, and Telegram bot)..."
|
||||||
nohup uvicorn main:app --host 0.0.0.0 --port 8000 --reload > /tmp/unified_backend_prod.log 2>&1 &
|
nohup ./run-with-restart.sh 8000 /tmp/unified_backend_prod.log > /dev/null 2>&1 &
|
||||||
|
|
||||||
cd - > /dev/null
|
cd - > /dev/null
|
||||||
|
|
||||||
|
|||||||
@@ -145,9 +145,9 @@ else
|
|||||||
source .env
|
source .env
|
||||||
set +a
|
set +a
|
||||||
|
|
||||||
# Start backend (without reload for test stability)
|
# Start backend with auto-restart on crash (OOM protection)
|
||||||
print_message "Starting unified backend (includes Reports, Data Entry, and Telegram bot)..."
|
print_message "Starting unified backend with auto-restart (includes Reports, Data Entry, and Telegram bot)..."
|
||||||
nohup uvicorn main:app --host 0.0.0.0 --port 8000 > /tmp/unified_backend_test.log 2>&1 &
|
nohup ./run-with-restart.sh 8000 /tmp/unified_backend_test.log > /dev/null 2>&1 &
|
||||||
|
|
||||||
cd - > /dev/null
|
cd - > /dev/null
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user