"""OCR engine wrapper for PaddleOCR, docTR, and Tesseract.""" import os import logging import threading import time from dataclasses import dataclass from typing import List, Optional, Tuple import numpy as np # Setup logging (respects LOG_LEVEL env var set in main.py) logger = logging.getLogger(__name__) # Disable PaddleOCR model source check for faster startup (PaddleX 3.x) os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True' # Lazy imports - these will be imported on first use PaddleOCR = None # Will be imported lazily pytesseract = None # Will be imported lazily doctr_ocr_predictor = None # Will be imported lazily # Check availability without importing heavy libraries def _check_paddle_available() -> bool: """Check if paddleocr is installed without importing it.""" try: import importlib.util return importlib.util.find_spec("paddleocr") is not None except Exception: return False def _check_tesseract_available() -> bool: """Check if pytesseract is installed without importing it.""" try: import importlib.util return importlib.util.find_spec("pytesseract") is not None except Exception: return False def _check_doctr_available() -> bool: """Check if doctr is installed without importing it.""" try: import importlib.util return importlib.util.find_spec("doctr") is not None except Exception: return False PADDLE_AVAILABLE = _check_paddle_available() TESSERACT_AVAILABLE = _check_tesseract_available() DOCTR_AVAILABLE = _check_doctr_available() @dataclass class OCRResult: """Raw OCR result.""" text: str confidence: float boxes: List[dict] engine: str = "" # OCR engine used: paddleocr or tesseract class OCREngine: """Unified OCR engine with fallback support.""" def __init__(self): self._paddle = None self._paddle_init_started = False self._paddle_ready = threading.Event() # Signals when PaddleOCR is FULLY ready self._paddle_init_lock = threading.Lock() self._doctr = None self._doctr_init_started = False self._doctr_ready = threading.Event() # Signals when docTR is FULLY ready self._doctr_init_lock = threading.Lock() def _init_paddle_lazy(self): """Lazy initialize PaddleOCR on first use (avoids slow startup).""" global PaddleOCR with self._paddle_init_lock: if self._paddle_init_started: return # Already initializing or done self._paddle_init_started = True if PADDLE_AVAILABLE: try: print("Importing PaddleOCR (first use, may take ~15-20 seconds)...", flush=True) from paddleocr import PaddleOCR as _PaddleOCR PaddleOCR = _PaddleOCR print("Initializing PaddleOCR engine...", flush=True) # PaddleOCR 3.x API - optimized for Romanian receipts # Note: 'latin' not available in PaddleOCR 3.x, 'en' works well for receipts self._paddle = PaddleOCR( lang='en', # 'en' handles Latin alphabet well for receipts # High quality settings for better accuracy det_db_thresh=0.3, # Lower threshold = detect more text (default 0.3) det_db_box_thresh=0.5, # Box confidence threshold (default 0.5) det_db_unclip_ratio=1.8, # Expand detected boxes slightly (default 1.5) rec_batch_num=6, # Batch size for recognition use_angle_cls=True, # Enable text angle classification ) print("PaddleOCR initialized successfully with high-quality settings", flush=True) except Exception as e: print(f"Warning: Failed to initialize PaddleOCR: {e}", flush=True) self._paddle = None # Signal that initialization is complete (success or failure) self._paddle_ready.set() def _init_doctr_lazy(self): """Lazy initialize docTR on first use (avoids slow startup).""" global doctr_ocr_predictor with self._doctr_init_lock: if self._doctr_init_started: return # Already initializing or done self._doctr_init_started = True if DOCTR_AVAILABLE: try: print("Importing docTR (first use, may take ~10-15 seconds)...", flush=True) from doctr.io import DocumentFile from doctr.models import ocr_predictor print("Initializing docTR engine (PyTorch backend)...", flush=True) # Initialize docTR predictor with pretrained models # Uses db_resnet50 for detection and crnn_vgg16_bn for recognition self._doctr = ocr_predictor( det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True, assume_straight_pages=True, straighten_pages=False, preserve_aspect_ratio=True, ) doctr_ocr_predictor = self._doctr print("docTR initialized successfully with PyTorch backend", flush=True) except Exception as e: print(f"Warning: Failed to initialize docTR: {e}", flush=True) self._doctr = None # Signal that initialization is complete (success or failure) self._doctr_ready.set() def wait_for_doctr(self, timeout: float = 30.0) -> bool: """ Wait for docTR to be fully initialized. Args: timeout: Max seconds to wait (default 30s) Returns: True if docTR is ready, False if timeout or unavailable """ if not DOCTR_AVAILABLE: return False if self._doctr is not None: return True # Already ready if not self._doctr_init_started: # Start initialization if not already started self._init_doctr_lazy() # Wait for initialization to complete print(f"[OCR] Waiting for docTR to be ready (max {timeout}s)...", flush=True) start = time.time() ready = self._doctr_ready.wait(timeout=timeout) elapsed = time.time() - start if ready and self._doctr is not None: print(f"[OCR] docTR ready after {elapsed:.1f}s", flush=True) return True else: print(f"[OCR] docTR not ready after {elapsed:.1f}s (timeout or failed)", flush=True) return False def is_doctr_ready(self) -> bool: """Check if docTR is ready without waiting.""" return self._doctr is not None def wait_for_paddle(self, timeout: float = 30.0) -> bool: """ Wait for PaddleOCR to be fully initialized. Args: timeout: Max seconds to wait (default 30s) Returns: True if PaddleOCR is ready, False if timeout or unavailable """ if not PADDLE_AVAILABLE: return False if self._paddle is not None: return True # Already ready if not self._paddle_init_started: # Start initialization if not already started self._init_paddle_lazy() # Wait for initialization to complete print(f"[OCR] Waiting for PaddleOCR to be ready (max {timeout}s)...", flush=True) start = time.time() ready = self._paddle_ready.wait(timeout=timeout) elapsed = time.time() - start if ready and self._paddle is not None: print(f"[OCR] PaddleOCR ready after {elapsed:.1f}s", flush=True) return True else: print(f"[OCR] PaddleOCR not ready after {elapsed:.1f}s (timeout or failed)", flush=True) return False def is_paddle_ready(self) -> bool: """Check if PaddleOCR is ready without waiting.""" return self._paddle is not None def recognize(self, image: np.ndarray) -> OCRResult: """Perform OCR on preprocessed image.""" logger.info(f"[OCR] Starting recognition, image shape: {image.shape}, dtype: {image.dtype}") # Lazy init PaddleOCR on first call self._init_paddle_lazy() if PADDLE_AVAILABLE and self._paddle: logger.info("[OCR] Using PaddleOCR engine") return self._paddle_recognize(image) elif TESSERACT_AVAILABLE: logger.info("[OCR] Using Tesseract engine (PaddleOCR not available)") return self._tesseract_recognize(image) else: logger.error("[OCR] No OCR engine available!") raise RuntimeError( "No OCR engine available. Install PaddleOCR or Tesseract." ) def _paddle_recognize(self, image: np.ndarray) -> OCRResult: """Recognize text using PaddleOCR 3.x API.""" # Wait for PaddleOCR to be fully ready (handles background init) if not self.wait_for_paddle(timeout=30.0): logger.warning("[PaddleOCR] Not ready, falling back to Tesseract") if TESSERACT_AVAILABLE: return self._tesseract_recognize(image) raise RuntimeError("PaddleOCR not ready and Tesseract not available") try: logger.info(f"[PaddleOCR] Processing image, shape: {image.shape}") # PaddleOCR 3.x requires 3-channel images if len(image.shape) == 2: # Convert grayscale to 3-channel BGR import cv2 image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR) logger.info(f"[PaddleOCR] Converted to BGR, new shape: {image.shape}") # PaddleOCR 3.x uses predict() with new parameter names logger.info("[PaddleOCR] Calling predict()...") result = self._paddle.predict(image, use_textline_orientation=True) logger.info(f"[PaddleOCR] predict() returned, result type: {type(result)}") if not result or len(result) == 0: logger.warning("[PaddleOCR] No results returned") return OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr") # PaddleOCR 3.x returns OCRResult objects with different structure ocr_result = result[0] # Extract texts and scores from the new format rec_texts = ocr_result.get('rec_texts', []) rec_scores = ocr_result.get('rec_scores', []) dt_polys = ocr_result.get('dt_polys', []) if not rec_texts: return OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr") boxes = [] for i, text in enumerate(rec_texts): conf = rec_scores[i] if i < len(rec_scores) else 0.0 box = dt_polys[i].tolist() if i < len(dt_polys) else [] boxes.append({ 'text': text, 'confidence': float(conf), 'box': box }) avg_conf = sum(rec_scores) / len(rec_scores) if rec_scores else 0.0 text_result = '\n'.join(rec_texts) logger.info(f"[PaddleOCR] SUCCESS - Found {len(rec_texts)} text lines, avg confidence: {avg_conf:.2%}") logger.debug(f"[PaddleOCR] Raw text preview: {text_result[:200]}...") return OCRResult( text=text_result, confidence=float(avg_conf), boxes=boxes, engine="paddleocr" ) except Exception as e: logger.error(f"[PaddleOCR] ERROR: {e}, falling back to Tesseract") if TESSERACT_AVAILABLE: return self._tesseract_recognize(image) raise def _tesseract_recognize(self, image: np.ndarray) -> OCRResult: """Recognize text using Tesseract.""" global pytesseract logger.info(f"[Tesseract] Processing image, shape: {image.shape}") # Lazy import pytesseract if pytesseract is None: logger.info("[Tesseract] Importing pytesseract...") import pytesseract as _pytesseract pytesseract = _pytesseract # PSM 4: Single column (best for receipts) config = '--psm 4 -l ron+eng' text = pytesseract.image_to_string(image, config=config) # Quick confidence estimate data = pytesseract.image_to_data(image, config=config, output_type=pytesseract.Output.DICT) confidences = [int(c) for c in data['conf'] if int(c) > 0] avg_conf = sum(confidences) / len(confidences) / 100 if confidences else 0.0 logger.info(f"[Tesseract] Done: {len(text)} chars, conf: {avg_conf:.2%}") return OCRResult(text=text, confidence=avg_conf, boxes=[], engine="tesseract") def _doctr_recognize(self, image: np.ndarray) -> OCRResult: """Recognize text using docTR.""" # Wait for docTR to be fully ready if not self.wait_for_doctr(timeout=30.0): logger.warning("[docTR] Not ready, falling back to Tesseract") if TESSERACT_AVAILABLE: return self._tesseract_recognize(image) raise RuntimeError("docTR not ready and Tesseract not available") try: logger.info(f"[docTR] Processing image, shape: {image.shape}") # docTR requires RGB images import cv2 if len(image.shape) == 2: # Convert grayscale to RGB image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB) logger.info(f"[docTR] Converted grayscale to RGB, new shape: {image.shape}") elif image.shape[2] == 4: # Convert RGBA to RGB image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB) logger.info(f"[docTR] Converted RGBA to RGB, new shape: {image.shape}") elif image.shape[2] == 3: # Check if BGR (from OpenCV) and convert to RGB image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) logger.info(f"[docTR] Converted BGR to RGB, shape: {image.shape}") # Process image with docTR logger.info("[docTR] Running prediction...") from doctr.io import DocumentFile # docTR expects a document (list of pages as numpy arrays) result = self._doctr([image]) if not result or not result.pages: logger.warning("[docTR] No results returned") return OCRResult(text="", confidence=0.0, boxes=[], engine="doctr") # Extract text from all pages all_texts = [] all_confidences = [] boxes = [] for page in result.pages: for block in page.blocks: for line in block.lines: line_text = ' '.join(word.value for word in line.words) line_confidence = sum(w.confidence for w in line.words) / len(line.words) if line.words else 0.0 all_texts.append(line_text) all_confidences.append(line_confidence) # Store word-level boxes for word in line.words: boxes.append({ 'text': word.value, 'confidence': float(word.confidence), 'box': word.geometry # (xmin, ymin), (xmax, ymax) }) text_result = '\n'.join(all_texts) avg_conf = sum(all_confidences) / len(all_confidences) if all_confidences else 0.0 logger.info(f"[docTR] SUCCESS - Found {len(all_texts)} text lines, avg confidence: {avg_conf:.2%}") logger.debug(f"[docTR] Raw text preview: {text_result[:200]}...") return OCRResult( text=text_result, confidence=float(avg_conf), boxes=boxes, engine="doctr" ) except Exception as e: logger.error(f"[docTR] ERROR: {e}, falling back to Tesseract") if TESSERACT_AVAILABLE: return self._tesseract_recognize(image) raise def recognize_dual(self, image: np.ndarray) -> Tuple[OCRResult, Optional[OCRResult]]: """ Run both OCR engines and return both results. Returns: Tuple of (paddle_result, tesseract_result) tesseract_result may be None if Tesseract is not available """ logger.info(f"[OCR Dual] Starting dual recognition, image shape: {image.shape}") # Lazy init PaddleOCR self._init_paddle_lazy() paddle_result = None tesseract_result = None # Run PaddleOCR if PADDLE_AVAILABLE and self._paddle: try: logger.info("[OCR Dual] Running PaddleOCR...") paddle_result = self._paddle_recognize(image) logger.info(f"[OCR Dual] PaddleOCR: {len(paddle_result.text)} chars, conf: {paddle_result.confidence:.2%}") except Exception as e: logger.error(f"[OCR Dual] PaddleOCR failed: {e}") paddle_result = OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr") # Run Tesseract if TESSERACT_AVAILABLE: try: logger.info("[OCR Dual] Running Tesseract...") tesseract_result = self._tesseract_recognize(image) logger.info(f"[OCR Dual] Tesseract: {len(tesseract_result.text)} chars, conf: {tesseract_result.confidence:.2%}") except Exception as e: logger.error(f"[OCR Dual] Tesseract failed: {e}") tesseract_result = OCRResult(text="", confidence=0.0, boxes=[], engine="tesseract") # Fallback if PaddleOCR not available if paddle_result is None: if tesseract_result: paddle_result = tesseract_result else: raise RuntimeError("No OCR engine available") return paddle_result, tesseract_result @staticmethod def get_available_engines() -> List[str]: """ Return list of available OCR engines. Respects OCR_ENABLE_PADDLEOCR and OCR_ENABLE_TESSERACT from .env. Engines that are disabled via .env are not returned even if installed. Available engines: tesseract, doctr, doctr_plus, paddleocr """ # Check .env settings paddle_enabled = os.getenv("OCR_ENABLE_PADDLEOCR", "true").lower() == "true" tesseract_enabled = os.getenv("OCR_ENABLE_TESSERACT", "true").lower() == "true" engines = [] # Base engines (only if installed AND enabled) if TESSERACT_AVAILABLE and tesseract_enabled: engines.append('tesseract') if DOCTR_AVAILABLE: engines.append('doctr') engines.append('doctr_plus') # docTR with 2-tier sequential + early exit if PADDLE_AVAILABLE and paddle_enabled: engines.append('paddleocr') return engines