feat(ocr): Add memory management and auto-restart for OCR stability
OCR Service improvements:
- Single-worker ThreadPoolExecutor to prevent memory accumulation from parallel OCR
- Semaphore to ensure only one OCR operation runs at a time
- Explicit numpy array cleanup after each OCR step
- Forced garbage collection after every OCR request
- Memory threshold check (2500MB) with pre-processing GC
- Memory usage logging before/after processing

Backend auto-restart:
- Add run-with-restart.sh wrapper script for uvicorn
- Auto-restart on crash (OOM, etc.) with a maximum of 10 restarts
- Reset the restart counter if the process runs >60s (stable)
- Graceful exit on SIGINT/SIGTERM
- Update start-prod.sh and start-test.sh to use the wrapper

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -2,7 +2,9 @@
|
||||
|
||||
import os
|
||||
import re
|
||||
import gc
|
||||
import logging
|
||||
import threading
|
||||
|
||||
# Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import
|
||||
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
|
||||
@@ -23,10 +25,26 @@ from backend.modules.data_entry.services.ocr.validation import OCRValidationEngi
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_memory_usage_mb() -> float:
    """Return this process's resident memory usage in MB (best effort).

    On Linux the *current* resident set size is read from
    ``/proc/self/statm`` so that the value can go down again after a
    garbage-collection pass. ``resource.getrusage().ru_maxrss`` is only a
    high-water mark (it never decreases), so it is used purely as a
    fallback on platforms without ``/proc``.

    Returns:
        Memory usage in megabytes, or ``0.0`` if it cannot be determined.
    """
    import os
    import sys

    try:
        # Second field of statm is the number of resident pages.
        with open("/proc/self/statm", encoding="ascii") as statm:
            resident_pages = int(statm.read().split()[1])
        return resident_pages * os.sysconf("SC_PAGE_SIZE") / (1024 * 1024)
    except (OSError, ValueError, IndexError, AttributeError):
        # No procfs (macOS, Windows, restricted container): fall through.
        pass

    try:
        import resource

        peak_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    except (ImportError, OSError, ValueError):
        return 0.0

    # ru_maxrss is reported in bytes on macOS and in kilobytes on Linux/BSD.
    if sys.platform == "darwin":
        return peak_rss / (1024 * 1024)
    return peak_rss / 1024
|
||||
|
||||
|
||||
class OCRService:
|
||||
"""Service for OCR processing of receipt images."""
|
||||
|
||||
_executor = ThreadPoolExecutor(max_workers=2)
|
||||
# Single worker to prevent memory accumulation from parallel OCR
|
||||
_executor = ThreadPoolExecutor(max_workers=1)
|
||||
# Semaphore to ensure only one OCR operation at a time (memory protection)
|
||||
_ocr_semaphore = threading.Semaphore(1)
|
||||
# Memory threshold in MB - if exceeded, force GC before processing
|
||||
_memory_threshold_mb = 2500
|
||||
|
||||
def __init__(self):
|
||||
self.preprocessor = ImagePreprocessor()
|
||||
@@ -60,6 +78,16 @@ class OCRService:
|
||||
except Exception as e:
|
||||
return False, f"OCR processing failed: {str(e)}", None
|
||||
|
||||
def _cleanup_memory(self, *arrays):
|
||||
"""Explicitly delete numpy arrays and force garbage collection."""
|
||||
for arr in arrays:
|
||||
if arr is not None:
|
||||
try:
|
||||
del arr
|
||||
except:
|
||||
pass
|
||||
gc.collect()
|
||||
|
||||
def _process_sync(
|
||||
self,
|
||||
image_path: Path,
|
||||
@@ -67,16 +95,54 @@ class OCRService:
|
||||
) -> Tuple[bool, str, Optional[ExtractionResult]]:
|
||||
"""Synchronous processing with ADAPTIVE OCR pipeline."""
|
||||
|
||||
# Acquire semaphore to ensure only one OCR at a time
|
||||
acquired = self._ocr_semaphore.acquire(timeout=120) # 2 min timeout
|
||||
if not acquired:
|
||||
return False, "OCR service busy - please try again", None
|
||||
|
||||
try:
|
||||
return self._process_sync_internal(image_path, mime_type)
|
||||
finally:
|
||||
# Always release semaphore and cleanup
|
||||
self._ocr_semaphore.release()
|
||||
# Force garbage collection after EVERY OCR request
|
||||
gc.collect()
|
||||
mem_after = get_memory_usage_mb()
|
||||
print(f"[OCR Service] Memory after cleanup: {mem_after:.0f}MB", flush=True)
|
||||
|
||||
def _process_sync_internal(
|
||||
self,
|
||||
image_path: Path,
|
||||
mime_type: str
|
||||
) -> Tuple[bool, str, Optional[ExtractionResult]]:
|
||||
"""Internal processing - called with semaphore held."""
|
||||
|
||||
start_time = time.time()
|
||||
mem_before = get_memory_usage_mb()
|
||||
print(f"[OCR Service] Starting processing: {image_path}, mime: {mime_type}", flush=True)
|
||||
print(f"[OCR Service] Memory before: {mem_before:.0f}MB", flush=True)
|
||||
|
||||
# Check if memory is high - force GC before processing
|
||||
if mem_before > self._memory_threshold_mb:
|
||||
print(f"[OCR Service] ⚠️ Memory high ({mem_before:.0f}MB > {self._memory_threshold_mb}MB), forcing GC...", flush=True)
|
||||
gc.collect()
|
||||
mem_after_gc = get_memory_usage_mb()
|
||||
print(f"[OCR Service] Memory after pre-GC: {mem_after_gc:.0f}MB", flush=True)
|
||||
|
||||
# Load image
|
||||
images = None # For cleanup
|
||||
image = None
|
||||
if mime_type == 'application/pdf':
|
||||
try:
|
||||
images = self.preprocessor.pdf_to_images(image_path)
|
||||
if not images:
|
||||
return False, "Failed to extract images from PDF", None
|
||||
image = images[0]
|
||||
# Keep only the first page and release the rest immediately to save memory.
# A single slice deletion is used because deleting by ascending index while
# the list shrinks raises IndexError for PDFs with three or more pages.
del images[1:]
|
||||
except RuntimeError as e:
|
||||
return False, str(e), None
|
||||
else:
|
||||
@@ -98,6 +164,10 @@ class OCRService:
|
||||
|
||||
try:
|
||||
paddle_light = self.ocr_engine._paddle_recognize(light_img)
|
||||
# Cleanup light_img immediately after OCR
|
||||
del light_img
|
||||
light_img = None
|
||||
|
||||
if paddle_light and paddle_light.text:
|
||||
extraction = self.extractor.extract(paddle_light.text)
|
||||
extraction.ocr_engine = "paddle-light"
|
||||
@@ -119,12 +189,19 @@ class OCRService:
|
||||
elapsed_ms = int((time.time() - start_time) * 1000)
|
||||
extraction.processing_time_ms = elapsed_ms
|
||||
print(f"[OCR] ✓✓✓ EARLY EXIT at Step 1 - All fields found! ({elapsed_ms}ms) ✓✓✓", flush=True)
|
||||
# Cleanup before return
|
||||
del image
|
||||
if images:
|
||||
del images
|
||||
return True, "OCR complete (fast mode)", extraction
|
||||
else:
|
||||
print("[OCR] → Step 1 incomplete, continuing to Step 2...", flush=True)
|
||||
except Exception as e:
|
||||
print(f"[OCR] PaddleOCR light failed: {e}", flush=True)
|
||||
extraction = ExtractionResult()
|
||||
# Cleanup on error
|
||||
if light_img is not None:
|
||||
del light_img
|
||||
|
||||
# ══════════════════════════════════════════════════════════════
|
||||
# STEP 2: PaddleOCR + Medium (balanced preprocessing)
|
||||
@@ -136,6 +213,10 @@ class OCRService:
|
||||
|
||||
try:
|
||||
paddle_medium = self.ocr_engine._paddle_recognize(medium_img)
|
||||
# Cleanup medium_img immediately after OCR
|
||||
del medium_img
|
||||
medium_img = None
|
||||
|
||||
if paddle_medium and paddle_medium.text:
|
||||
extraction_medium = self.extractor.extract(paddle_medium.text)
|
||||
extraction_medium.ocr_engine = "paddle-medium"
|
||||
@@ -164,11 +245,18 @@ class OCRService:
|
||||
elapsed_ms = int((time.time() - start_time) * 1000)
|
||||
extraction.processing_time_ms = elapsed_ms
|
||||
print(f"[OCR] ✓✓✓ EARLY EXIT at Step 2 - All fields found after merge! ({elapsed_ms}ms) ✓✓✓", flush=True)
|
||||
# Cleanup before return
|
||||
del image
|
||||
if images:
|
||||
del images
|
||||
return True, "OCR complete (paddle dual)", extraction
|
||||
else:
|
||||
print("[OCR] → Step 2 incomplete, continuing to Step 3 (Tesseract)...", flush=True)
|
||||
except Exception as e:
|
||||
print(f"[OCR] PaddleOCR medium failed: {e}", flush=True)
|
||||
# Cleanup on error
|
||||
if medium_img is not None:
|
||||
del medium_img
|
||||
|
||||
# ══════════════════════════════════════════════════════════════
|
||||
# STEP 3: Tesseract - ONLY to complete missing fields
|
||||
@@ -178,10 +266,15 @@ class OCRService:
|
||||
print("[OCR] STEP 3: Tesseract (complement only, not override)", flush=True)
|
||||
print("=" * 60, flush=True)
|
||||
|
||||
tesseract_img = None
|
||||
try:
|
||||
# Use Tesseract-specific preprocessing (Otsu binarization)
|
||||
tesseract_img = self.preprocessor.preprocess_for_tesseract(image)
|
||||
tesseract_result = self.ocr_engine._tesseract_recognize(tesseract_img)
|
||||
# Cleanup tesseract_img immediately after OCR
|
||||
del tesseract_img
|
||||
tesseract_img = None
|
||||
|
||||
if tesseract_result and tesseract_result.text:
|
||||
extraction_tess = self.extractor.extract(tesseract_result.text)
|
||||
extraction_tess.ocr_engine = "tesseract"
|
||||
@@ -197,6 +290,14 @@ class OCRService:
|
||||
extraction = self._complement_extraction(extraction, extraction_tess)
|
||||
except Exception as e:
|
||||
print(f"[OCR] Tesseract failed: {e}", flush=True)
|
||||
# Cleanup on error
|
||||
if tesseract_img is not None:
|
||||
del tesseract_img
|
||||
|
||||
# Cleanup original image - no longer needed
|
||||
del image
|
||||
if images:
|
||||
del images
|
||||
|
||||
# ══════════════════════════════════════════════════════════════
|
||||
# FINAL VALIDATION: Fix impossible values
|
||||
|
||||
Reference in New Issue
Block a user