feat(ocr): Add memory management and auto-restart for OCR stability

OCR Service improvements:
- Single worker ThreadPoolExecutor to prevent parallel OCR memory accumulation
- Semaphore to ensure only one OCR operation at a time
- Explicit numpy array cleanup after each OCR step
- Forced garbage collection after every OCR request
- Memory threshold check (2500MB) with pre-processing GC
- Memory usage logging before/after processing

Backend auto-restart:
- Add run-with-restart.sh wrapper script for uvicorn
- Auto-restart on crash (OOM, etc.) with max 10 restarts
- Reset restart counter if process runs >60s (stable)
- Graceful exit on SIGINT/SIGTERM
- Update start-prod.sh and start-test.sh to use wrapper

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-30 22:03:51 +02:00
parent c5682ead42
commit 51d736addf
4 changed files with 159 additions and 7 deletions
--- a/backend/modules/data_entry/services/ocr_service.py
+++ b/backend/modules/data_entry/services/ocr_service.py
@@ -2,7 +2,9 @@

 import os
 import re
+import gc
 import logging
+import threading

 # Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import
 os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
@@ -23,10 +25,26 @@ from backend.modules.data_entry.services.ocr.validation import OCRValidationEngi
 logger = logging.getLogger(__name__)


+def get_memory_usage_mb() -> float:
+    """Return the current resident memory (RSS) of this process in MB.
+
+    The resident page count is read from ``/proc/self/statm`` so the value
+    can go down again after garbage collection. ``ru_maxrss`` is used only
+    as a fallback where procfs is unavailable: it reports the *peak* RSS,
+    which never decreases and would make before/after comparisons and
+    threshold checks meaningless.
+
+    Returns:
+        Memory usage in MB, or 0.0 if it cannot be determined.
+    """
+    try:
+        # Second field of statm is the resident set size, counted in pages.
+        with open("/proc/self/statm", encoding="ascii") as statm:
+            resident_pages = int(statm.read().split()[1])
+        return resident_pages * os.sysconf("SC_PAGE_SIZE") / (1024 * 1024)
+    except (OSError, ValueError, IndexError, AttributeError):
+        pass  # No usable procfs (e.g. macOS) - fall back to getrusage below
+
+    try:
+        import resource
+        import sys
+
+        peak_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+        # ru_maxrss is reported in KB on Linux but in bytes on macOS.
+        divisor = 1024 * 1024 if sys.platform == "darwin" else 1024
+        return peak_rss / divisor
+    except Exception:
+        # Best-effort metric: memory reporting must never break OCR.
+        return 0.0
+
+
 class OCRService:
    """Service for OCR processing of receipt images."""

-    _executor = ThreadPoolExecutor(max_workers=2)
+    # Single worker to prevent memory accumulation from parallel OCR
+    _executor = ThreadPoolExecutor(max_workers=1)
+    # Semaphore to ensure only one OCR operation at a time (memory protection)
+    _ocr_semaphore = threading.Semaphore(1)
+    # Memory threshold in MB - if exceeded, force GC before processing
+    _memory_threshold_mb = 2500

    def __init__(self):
        self.preprocessor = ImagePreprocessor()
@@ -60,6 +78,16 @@ class OCRService:
        except Exception as e:
            return False, f"OCR processing failed: {str(e)}", None

+    def _cleanup_memory(self, *arrays):
+        """Run a garbage-collection pass after an OCR step.
+
+        ``arrays`` is accepted for backward compatibility only. ``del`` on a
+        parameter or loop variable merely unbinds that local name; the
+        caller's own references keep the arrays alive. Callers must drop
+        those references themselves (``del img`` / ``img = None``) before
+        calling this method for the memory to actually be reclaimed.
+
+        Args:
+            *arrays: Objects the caller is done with (may include ``None``).
+        """
+        # Release this frame's handle on the arguments before collecting.
+        del arrays
+        gc.collect()
+
    def _process_sync(
        self,
        image_path: Path,
@@ -67,16 +95,54 @@ class OCRService:
    ) -> Tuple[bool, str, Optional[ExtractionResult]]:
        """Synchronous processing with ADAPTIVE OCR pipeline."""

+        # Acquire semaphore to ensure only one OCR at a time
+        acquired = self._ocr_semaphore.acquire(timeout=120)  # 2 min timeout
+        if not acquired:
+            return False, "OCR service busy - please try again", None
+
+        try:
+            return self._process_sync_internal(image_path, mime_type)
+        finally:
+            # Always release semaphore and cleanup
+            self._ocr_semaphore.release()
+            # Force garbage collection after EVERY OCR request
+            gc.collect()
+            mem_after = get_memory_usage_mb()
+            print(f"[OCR Service] Memory after cleanup: {mem_after:.0f}MB", flush=True)
+
+    def _process_sync_internal(
+        self,
+        image_path: Path,
+        mime_type: str
+    ) -> Tuple[bool, str, Optional[ExtractionResult]]:
+        """Internal processing - called with semaphore held."""
+
        start_time = time.time()
+        mem_before = get_memory_usage_mb()
        print(f"[OCR Service] Starting processing: {image_path}, mime: {mime_type}", flush=True)
+        print(f"[OCR Service] Memory before: {mem_before:.0f}MB", flush=True)
+
+        # Check if memory is high - force GC before processing
+        if mem_before > self._memory_threshold_mb:
+            print(f"[OCR Service] ⚠️ Memory high ({mem_before:.0f}MB > {self._memory_threshold_mb}MB), forcing GC...", flush=True)
+            gc.collect()
+            mem_after_gc = get_memory_usage_mb()
+            print(f"[OCR Service] Memory after pre-GC: {mem_after_gc:.0f}MB", flush=True)

        # Load image
+        images = None  # For cleanup
+        image = None
        if mime_type == 'application/pdf':
            try:
                images = self.preprocessor.pdf_to_images(image_path)
                if not images:
                    return False, "Failed to extract images from PDF", None
                image = images[0]
+                # Keep only the first page to save memory. Slice deletion
+                # drops every other page in one step; deleting index by index
+                # while the list shrinks raises IndexError for PDFs with
+                # three or more pages (not caught by `except RuntimeError`).
+                del images[1:]
            except RuntimeError as e:
                return False, str(e), None
        else:
@@ -98,6 +164,10 @@ class OCRService:

        try:
            paddle_light = self.ocr_engine._paddle_recognize(light_img)
+            # Cleanup light_img immediately after OCR
+            del light_img
+            light_img = None
+
            if paddle_light and paddle_light.text:
                extraction = self.extractor.extract(paddle_light.text)
                extraction.ocr_engine = "paddle-light"
@@ -119,12 +189,19 @@ class OCRService:
                    elapsed_ms = int((time.time() - start_time) * 1000)
                    extraction.processing_time_ms = elapsed_ms
                    print(f"[OCR] ✓✓✓ EARLY EXIT at Step 1 - All fields found! ({elapsed_ms}ms) ✓✓✓", flush=True)
+                    # Cleanup before return
+                    del image
+                    if images:
+                        del images
                    return True, "OCR complete (fast mode)", extraction
                else:
                    print("[OCR] → Step 1 incomplete, continuing to Step 2...", flush=True)
        except Exception as e:
            print(f"[OCR] PaddleOCR light failed: {e}", flush=True)
            extraction = ExtractionResult()
+            # Cleanup on error
+            if light_img is not None:
+                del light_img

        # ══════════════════════════════════════════════════════════════
        # STEP 2: PaddleOCR + Medium (balanced preprocessing)
@@ -136,6 +213,10 @@ class OCRService:

        try:
            paddle_medium = self.ocr_engine._paddle_recognize(medium_img)
+            # Cleanup medium_img immediately after OCR
+            del medium_img
+            medium_img = None
+
            if paddle_medium and paddle_medium.text:
                extraction_medium = self.extractor.extract(paddle_medium.text)
                extraction_medium.ocr_engine = "paddle-medium"
@@ -164,11 +245,18 @@ class OCRService:
                    elapsed_ms = int((time.time() - start_time) * 1000)
                    extraction.processing_time_ms = elapsed_ms
                    print(f"[OCR] ✓✓✓ EARLY EXIT at Step 2 - All fields found after merge! ({elapsed_ms}ms) ✓✓✓", flush=True)
+                    # Cleanup before return
+                    del image
+                    if images:
+                        del images
                    return True, "OCR complete (paddle dual)", extraction
                else:
                    print("[OCR] → Step 2 incomplete, continuing to Step 3 (Tesseract)...", flush=True)
        except Exception as e:
            print(f"[OCR] PaddleOCR medium failed: {e}", flush=True)
+            # Cleanup on error
+            if medium_img is not None:
+                del medium_img

        # ══════════════════════════════════════════════════════════════
        # STEP 3: Tesseract - ONLY to complete missing fields
@@ -178,10 +266,15 @@ class OCRService:
        print("[OCR] STEP 3: Tesseract (complement only, not override)", flush=True)
        print("=" * 60, flush=True)

+        tesseract_img = None
        try:
            # Use Tesseract-specific preprocessing (Otsu binarization)
            tesseract_img = self.preprocessor.preprocess_for_tesseract(image)
            tesseract_result = self.ocr_engine._tesseract_recognize(tesseract_img)
+            # Cleanup tesseract_img immediately after OCR
+            del tesseract_img
+            tesseract_img = None
+
            if tesseract_result and tesseract_result.text:
                extraction_tess = self.extractor.extract(tesseract_result.text)
                extraction_tess.ocr_engine = "tesseract"
@@ -197,6 +290,14 @@ class OCRService:
                extraction = self._complement_extraction(extraction, extraction_tess)
        except Exception as e:
            print(f"[OCR] Tesseract failed: {e}", flush=True)
+            # Cleanup on error
+            if tesseract_img is not None:
+                del tesseract_img
+
+        # Cleanup original image - no longer needed
+        del image
+        if images:
+            del images

        # ══════════════════════════════════════════════════════════════
        # FINAL VALIDATION: Fix impossible values
--- a/backend/run-with-restart.sh
+++ b/backend/run-with-restart.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# Wrapper script that auto-restarts uvicorn on crash.
+# Usage: ./run-with-restart.sh [port] [log_file]
+#
+# uvicorn is restarted after every exit that was not caused by SIGINT or
+# SIGTERM. The wrapper gives up after MAX_RESTARTS consecutive quick crashes;
+# a run longer than STABLE_SECONDS counts as stable and resets the counter.
+
+PORT=${1:-8000}
+LOG_FILE=${2:-/tmp/unified_backend.log}
+MAX_RESTARTS=10
+RESTART_COUNT=0
+RESTART_DELAY=3
+STABLE_SECONDS=60
+
+# Print a message to stdout and append it to the log file.
+log() {
+    echo "$*" | tee -a "$LOG_FILE"
+}
+
+log "[Backend Runner] Starting uvicorn with auto-restart (max $MAX_RESTARTS restarts)"
+log "[Backend Runner] Port: $PORT, Log: $LOG_FILE"
+
+while [ "$RESTART_COUNT" -lt "$MAX_RESTARTS" ]; do
+    START_TIME=$(date +%s)
+    log ""
+    log "[Backend Runner] $(date '+%Y-%m-%d %H:%M:%S') - Starting uvicorn (attempt $((RESTART_COUNT + 1))/$MAX_RESTARTS)..."
+
+    # Run uvicorn - it will exit on crash (OOM, etc.)
+    uvicorn main:app --host 0.0.0.0 --port "$PORT" 2>&1 | tee -a "$LOG_FILE"
+    # $? after a pipeline is the status of tee (the last command), which is
+    # practically always 0. PIPESTATUS[0] holds uvicorn's own exit status and
+    # must be read before any other command overwrites it.
+    EXIT_CODE=${PIPESTATUS[0]}
+
+    END_TIME=$(date +%s)
+    RUNTIME=$((END_TIME - START_TIME))
+
+    log "[Backend Runner] $(date '+%Y-%m-%d %H:%M:%S') - uvicorn exited with code $EXIT_CODE after ${RUNTIME}s"
+
+    # Exit if uvicorn was stopped by SIGINT (128+2) or SIGTERM (128+15).
+    # Checked first so a deliberate stop is not counted or logged as a crash.
+    if [ "$EXIT_CODE" -eq 130 ] || [ "$EXIT_CODE" -eq 143 ]; then
+        log "[Backend Runner] Received termination signal, exiting..."
+        exit 0
+    fi
+
+    # If it ran for more than STABLE_SECONDS, reset restart counter (was stable)
+    if [ "$RUNTIME" -gt "$STABLE_SECONDS" ]; then
+        RESTART_COUNT=0
+        log "[Backend Runner] Process was stable (${RUNTIME}s > ${STABLE_SECONDS}s), resetting restart counter"
+    else
+        RESTART_COUNT=$((RESTART_COUNT + 1))
+        log "[Backend Runner] Quick crash detected, restart count: $RESTART_COUNT/$MAX_RESTARTS"
+    fi
+
+    # Wait before restart
+    if [ "$RESTART_COUNT" -lt "$MAX_RESTARTS" ]; then
+        log "[Backend Runner] Restarting in ${RESTART_DELAY}s..."
+        sleep "$RESTART_DELAY"
+    fi
+done
+
+log "[Backend Runner] Max restarts ($MAX_RESTARTS) reached. Giving up."
+exit 1