"""OCR engine wrapper for PaddleOCR and Tesseract.""" import os from dataclasses import dataclass from typing import List, Optional import numpy as np # Disable PaddleOCR model source check for faster startup (PaddleX 3.x) os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True' # Lazy imports - these will be imported on first use PaddleOCR = None # Will be imported lazily pytesseract = None # Will be imported lazily # Check availability without importing heavy libraries def _check_paddle_available() -> bool: """Check if paddleocr is installed without importing it.""" try: import importlib.util return importlib.util.find_spec("paddleocr") is not None except Exception: return False def _check_tesseract_available() -> bool: """Check if pytesseract is installed without importing it.""" try: import importlib.util return importlib.util.find_spec("pytesseract") is not None except Exception: return False PADDLE_AVAILABLE = _check_paddle_available() TESSERACT_AVAILABLE = _check_tesseract_available() @dataclass class OCRResult: """Raw OCR result.""" text: str confidence: float boxes: List[dict] class OCREngine: """Unified OCR engine with fallback support.""" def __init__(self): self._paddle = None self._paddle_initialized = False def _init_paddle_lazy(self): """Lazy initialize PaddleOCR on first use (avoids slow startup).""" global PaddleOCR if self._paddle_initialized: return self._paddle_initialized = True if PADDLE_AVAILABLE: try: print("Importing PaddleOCR (first use, may take ~15-20 seconds)...") from paddleocr import PaddleOCR as _PaddleOCR PaddleOCR = _PaddleOCR print("Initializing PaddleOCR engine...") # PaddleOCR 3.x API - simplified parameters self._paddle = PaddleOCR( lang='en', # Better for mixed text with numbers ) print("PaddleOCR initialized successfully") except Exception as e: print(f"Warning: Failed to initialize PaddleOCR: {e}") self._paddle = None def recognize(self, image: np.ndarray) -> OCRResult: """Perform OCR on preprocessed image.""" # Lazy init PaddleOCR on first call self._init_paddle_lazy() if PADDLE_AVAILABLE and self._paddle: return self._paddle_recognize(image) elif TESSERACT_AVAILABLE: return self._tesseract_recognize(image) else: raise RuntimeError( "No OCR engine available. Install PaddleOCR or Tesseract." ) def _paddle_recognize(self, image: np.ndarray) -> OCRResult: """Recognize text using PaddleOCR 3.x API.""" try: # PaddleOCR 3.x requires 3-channel images if len(image.shape) == 2: # Convert grayscale to 3-channel BGR import cv2 image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR) # PaddleOCR 3.x uses predict() with new parameter names result = self._paddle.predict(image, use_textline_orientation=True) if not result or len(result) == 0: return OCRResult(text="", confidence=0.0, boxes=[]) # PaddleOCR 3.x returns OCRResult objects with different structure ocr_result = result[0] # Extract texts and scores from the new format rec_texts = ocr_result.get('rec_texts', []) rec_scores = ocr_result.get('rec_scores', []) dt_polys = ocr_result.get('dt_polys', []) if not rec_texts: return OCRResult(text="", confidence=0.0, boxes=[]) boxes = [] for i, text in enumerate(rec_texts): conf = rec_scores[i] if i < len(rec_scores) else 0.0 box = dt_polys[i].tolist() if i < len(dt_polys) else [] boxes.append({ 'text': text, 'confidence': float(conf), 'box': box }) avg_conf = sum(rec_scores) / len(rec_scores) if rec_scores else 0.0 return OCRResult( text='\n'.join(rec_texts), confidence=float(avg_conf), boxes=boxes ) except Exception as e: print(f"PaddleOCR error: {e}, falling back to Tesseract") if TESSERACT_AVAILABLE: return self._tesseract_recognize(image) raise def _tesseract_recognize(self, image: np.ndarray) -> OCRResult: """Recognize text using Tesseract.""" global pytesseract # Lazy import pytesseract if pytesseract is None: print("Importing pytesseract...") import pytesseract as _pytesseract pytesseract = _pytesseract config = '--psm 6 -l ron+eng' text = pytesseract.image_to_string(image, config=config) data = pytesseract.image_to_data( image, config=config, output_type=pytesseract.Output.DICT ) confidences = [int(c) for c in data['conf'] if int(c) > 0] avg_conf = sum(confidences) / len(confidences) / 100 if confidences else 0.0 return OCRResult(text=text, confidence=avg_conf, boxes=[]) @staticmethod def get_available_engines() -> List[str]: """Return list of available OCR engines.""" engines = [] if PADDLE_AVAILABLE: engines.append('paddleocr') if TESSERACT_AVAILABLE: engines.append('tesseract') return engines