diff --git a/backend/modules/data_entry/services/ocr_service.py b/backend/modules/data_entry/services/ocr_service.py
index 21bb382..d18d38e 100644
--- a/backend/modules/data_entry/services/ocr_service.py
+++ b/backend/modules/data_entry/services/ocr_service.py
@@ -2,7 +2,9 @@
 
 import os
 import re
+import gc
 import logging
+import threading
 
 # Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import
 os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
@@ -23,10 +25,26 @@ from backend.modules.data_entry.services.ocr.validation import OCRValidationEngi
 logger = logging.getLogger(__name__)
 
 
+def get_memory_usage_mb() -> float:
+    """Get the current resident memory (RSS) of this process in MB, or 0.0 if unknown."""
+    try:
+        # ru_maxrss is the lifetime *peak*, so it can never show memory being
+        # freed; read the current resident page count from /proc (Linux) instead.
+        with open("/proc/self/statm") as statm:
+            return int(statm.read().split()[1]) * os.sysconf("SC_PAGE_SIZE") / (1024 * 1024)
+    except (OSError, ValueError, IndexError, AttributeError):
+        return 0.0
+
+
 class OCRService:
     """Service for OCR processing of receipt images."""
 
-    _executor = ThreadPoolExecutor(max_workers=2)
+    # Single worker to prevent memory accumulation from parallel OCR
+    _executor = ThreadPoolExecutor(max_workers=1)
+    # Semaphore to ensure only one OCR operation at a time (memory protection)
+    _ocr_semaphore = threading.Semaphore(1)
+    # Memory threshold in MB - if exceeded, force GC before processing
+    _memory_threshold_mb = 2500
 
     def __init__(self):
         self.preprocessor = ImagePreprocessor()
@@ -60,6 +78,16 @@ class OCRService:
         except Exception as e:
             return False, f"OCR processing failed: {str(e)}", None
 
+    def _cleanup_memory(self, *arrays):
+        """Drop this call's references to ``arrays`` and force a GC pass.
+
+        ``del`` on a local name cannot free objects the caller still
+        references, so callers must also release their own variables
+        (``del x`` or ``x = None``) for the memory to be reclaimed.
+        """
+        del arrays
+        gc.collect()
+
     def _process_sync(
         self,
         image_path: Path,
@@ -67,16 +95,54 @@ class OCRService:
     ) -> Tuple[bool, str, Optional[ExtractionResult]]:
         """Synchronous processing with ADAPTIVE OCR pipeline."""
 
+        # Acquire semaphore to ensure only one OCR at a time
+        acquired = self._ocr_semaphore.acquire(timeout=120)  # 2 min timeout
+        if not acquired:
+            return False, "OCR service busy - please try again", None
+
+        try:
+            return self._process_sync_internal(image_path, mime_type)
+        finally:
+            # Always release semaphore and cleanup
+            self._ocr_semaphore.release()
+            # Force garbage collection after EVERY OCR request
+            gc.collect()
+            mem_after = get_memory_usage_mb()
+            print(f"[OCR Service] Memory after cleanup: {mem_after:.0f}MB", flush=True)
+
+    def _process_sync_internal(
+        self,
+        image_path: Path,
+        mime_type: str
+    ) -> Tuple[bool, str, Optional[ExtractionResult]]:
+        """Internal processing - called with semaphore held."""
+
         start_time = time.time()
+        mem_before = get_memory_usage_mb()
         print(f"[OCR Service] Starting processing: {image_path}, mime: {mime_type}", flush=True)
+        print(f"[OCR Service] Memory before: {mem_before:.0f}MB", flush=True)
+
+        # Check if memory is high - force GC before processing
+        if mem_before > self._memory_threshold_mb:
+            print(f"[OCR Service] ⚠️ Memory high ({mem_before:.0f}MB > {self._memory_threshold_mb}MB), forcing GC...", flush=True)
+            gc.collect()
+            mem_after_gc = get_memory_usage_mb()
+            print(f"[OCR Service] Memory after pre-GC: {mem_after_gc:.0f}MB", flush=True)
 
         # Load image
+        images = None  # For cleanup
+        image = None
         if mime_type == 'application/pdf':
             try:
                 images = self.preprocessor.pdf_to_images(image_path)
                 if not images:
                     return False, "Failed to extract images from PDF", None
                 image = images[0]
+                # Delete other pages immediately to save memory. A slice
+                # delete is used because removing items one index at a time
+                # shrinks the list mid-loop and raises IndexError for 3+ pages.
+                if len(images) > 1:
+                    del images[1:]
             except RuntimeError as e:
                 return False, str(e), None
         else:
@@ -98,6 +164,10 @@ class OCRService:
 
         try:
             paddle_light = self.ocr_engine._paddle_recognize(light_img)
+            # Cleanup light_img immediately after OCR
+            del light_img
+            light_img = None
+
             if paddle_light and paddle_light.text:
                 extraction = self.extractor.extract(paddle_light.text)
                 extraction.ocr_engine = "paddle-light"
@@ -119,12 +189,19 @@ class OCRService:
                     elapsed_ms = int((time.time() - start_time) * 1000)
                     extraction.processing_time_ms = elapsed_ms
                     print(f"[OCR] ✓✓✓ EARLY EXIT at Step 1 - All fields found! ({elapsed_ms}ms) ✓✓✓", flush=True)
+                    # Cleanup before return
+                    del image
+                    if images:
+                        del images
                     return True, "OCR complete (fast mode)", extraction
                 else:
                     print("[OCR] → Step 1 incomplete, continuing to Step 2...", flush=True)
         except Exception as e:
             print(f"[OCR] PaddleOCR light failed: {e}", flush=True)
             extraction = ExtractionResult()
+            # Cleanup on error
+            if light_img is not None:
+                del light_img
 
         # ══════════════════════════════════════════════════════════════
         # STEP 2: PaddleOCR + Medium (balanced preprocessing)
@@ -136,6 +213,10 @@ class OCRService:
 
         try:
             paddle_medium = self.ocr_engine._paddle_recognize(medium_img)
+            # Cleanup medium_img immediately after OCR
+            del medium_img
+            medium_img = None
+
             if paddle_medium and paddle_medium.text:
                 extraction_medium = self.extractor.extract(paddle_medium.text)
                 extraction_medium.ocr_engine = "paddle-medium"
@@ -164,11 +245,18 @@ class OCRService:
                     elapsed_ms = int((time.time() - start_time) * 1000)
                     extraction.processing_time_ms = elapsed_ms
                     print(f"[OCR] ✓✓✓ EARLY EXIT at Step 2 - All fields found after merge! ({elapsed_ms}ms) ✓✓✓", flush=True)
+                    # Cleanup before return
+                    del image
+                    if images:
+                        del images
                     return True, "OCR complete (paddle dual)", extraction
                 else:
                     print("[OCR] → Step 2 incomplete, continuing to Step 3 (Tesseract)...", flush=True)
         except Exception as e:
             print(f"[OCR] PaddleOCR medium failed: {e}", flush=True)
+            # Cleanup on error
+            if medium_img is not None:
+                del medium_img
 
         # ══════════════════════════════════════════════════════════════
         # STEP 3: Tesseract - ONLY to complete missing fields
@@ -178,10 +266,15 @@ class OCRService:
         print("[OCR] STEP 3: Tesseract (complement only, not override)", flush=True)
         print("=" * 60, flush=True)
 
+        tesseract_img = None
         try:
             # Use Tesseract-specific preprocessing (Otsu binarization)
             tesseract_img = self.preprocessor.preprocess_for_tesseract(image)
             tesseract_result = self.ocr_engine._tesseract_recognize(tesseract_img)
+            # Cleanup tesseract_img immediately after OCR
+            del tesseract_img
+            tesseract_img = None
+
             if tesseract_result and tesseract_result.text:
                 extraction_tess = self.extractor.extract(tesseract_result.text)
                 extraction_tess.ocr_engine = "tesseract"
@@ -197,6 +290,14 @@ class OCRService:
                 extraction = self._complement_extraction(extraction, extraction_tess)
         except Exception as e:
             print(f"[OCR] Tesseract failed: {e}", flush=True)
+            # Cleanup on error
+            if tesseract_img is not None:
+                del tesseract_img
+
+        # Cleanup original image - no longer needed
+        del image
+        if images:
+            del images
 
         # ══════════════════════════════════════════════════════════════
         # FINAL VALIDATION: Fix impossible values
diff --git a/backend/run-with-restart.sh b/backend/run-with-restart.sh
new file mode 100755
index 0000000..c20de3e
--- /dev/null
+++ b/backend/run-with-restart.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# Wrapper script that auto-restarts uvicorn on crash
+# Usage: ./run-with-restart.sh [port] [log_file]
+
+PORT=${1:-8000}
+LOG_FILE=${2:-/tmp/unified_backend.log}
+MAX_RESTARTS=10
+RESTART_COUNT=0
+RESTART_DELAY=3
+# Runs longer than this many seconds count as stable and reset the restart counter
+STABLE_SECONDS=60
+
+echo "[Backend Runner] Starting uvicorn with auto-restart (max $MAX_RESTARTS restarts)" | tee -a "$LOG_FILE"
+echo "[Backend Runner] Port: $PORT, Log: $LOG_FILE" | tee -a "$LOG_FILE"
+
+while [ $RESTART_COUNT -lt $MAX_RESTARTS ]; do
+    START_TIME=$(date +%s)
+    echo "" | tee -a "$LOG_FILE"
+    echo "[Backend Runner] $(date '+%Y-%m-%d %H:%M:%S') - Starting uvicorn (attempt $((RESTART_COUNT + 1))/$MAX_RESTARTS)..." | tee -a "$LOG_FILE"
+
+    # Run uvicorn - it will exit on crash (OOM, etc.)
+    uvicorn main:app --host 0.0.0.0 --port "$PORT" 2>&1 | tee -a "$LOG_FILE"
+    # $? here would be tee's status (always 0); PIPESTATUS[0] is uvicorn's own exit code
+    EXIT_CODE=${PIPESTATUS[0]}
+
+    END_TIME=$(date +%s)
+    RUNTIME=$((END_TIME - START_TIME))
+
+    echo "[Backend Runner] $(date '+%Y-%m-%d %H:%M:%S') - uvicorn exited with code $EXIT_CODE after ${RUNTIME}s" | tee -a "$LOG_FILE"
+
+    # Exit if user sent SIGINT (Ctrl+C) or SIGTERM - checked first so a deliberate stop is not counted as a crash
+    if [ $EXIT_CODE -eq 130 ] || [ $EXIT_CODE -eq 143 ]; then
+        echo "[Backend Runner] Received termination signal, exiting..." | tee -a "$LOG_FILE"
+        exit 0
+    fi
+
+    # If it ran for more than $STABLE_SECONDS seconds, reset restart counter (was stable)
+    if [ $RUNTIME -gt $STABLE_SECONDS ]; then
+        RESTART_COUNT=0
+        echo "[Backend Runner] Process was stable (${RUNTIME}s > ${STABLE_SECONDS}s), resetting restart counter" | tee -a "$LOG_FILE"
+    else
+        RESTART_COUNT=$((RESTART_COUNT + 1))
+        echo "[Backend Runner] Quick crash detected, restart count: $RESTART_COUNT/$MAX_RESTARTS" | tee -a "$LOG_FILE"
+    fi
+
+    # Wait before restart
+    if [ $RESTART_COUNT -lt $MAX_RESTARTS ]; then
+        echo "[Backend Runner] Restarting in ${RESTART_DELAY}s..." | tee -a "$LOG_FILE"
+        sleep $RESTART_DELAY
+    fi
+done
+
+echo "[Backend Runner] Max restarts ($MAX_RESTARTS) reached. Giving up." | tee -a "$LOG_FILE"
+exit 1
diff --git a/start-prod.sh b/start-prod.sh
index a09a6a4..ce08ee1 100644
--- a/start-prod.sh
+++ b/start-prod.sh
@@ -145,9 +145,9 @@ else
     source .env
     set +a
 
-    # Start backend with reload
-    print_message "Starting unified backend (includes Reports, Data Entry, and Telegram bot)..."
-    nohup uvicorn main:app --host 0.0.0.0 --port 8000 --reload > /tmp/unified_backend_prod.log 2>&1 &
+    # Start backend with auto-restart on crash (OOM protection)
+    print_message "Starting unified backend with auto-restart (includes Reports, Data Entry, and Telegram bot)..."
+    nohup ./run-with-restart.sh 8000 /tmp/unified_backend_prod.log > /dev/null 2>&1 &
 
     cd - > /dev/null
 
diff --git a/start-test.sh b/start-test.sh
index 5b03447..d9700c3 100644
--- a/start-test.sh
+++ b/start-test.sh
@@ -145,9 +145,9 @@ else
     source .env
     set +a
 
-    # Start backend (without reload for test stability)
-    print_message "Starting unified backend (includes Reports, Data Entry, and Telegram bot)..."
-    nohup uvicorn main:app --host 0.0.0.0 --port 8000 > /tmp/unified_backend_test.log 2>&1 &
+    # Start backend with auto-restart on crash (OOM protection)
+    print_message "Starting unified backend with auto-restart (includes Reports, Data Entry, and Telegram bot)..."
+    nohup ./run-with-restart.sh 8000 /tmp/unified_backend_test.log > /dev/null 2>&1 &
 
     cd - > /dev/null
 