# roa2web-service-auto/backend/modules/data_entry/services/ocr_engine.py

"""OCR engine wrapper for PaddleOCR, docTR, and Tesseract."""

import os
import logging
import threading
import time
from dataclasses import dataclass
from typing import List, Optional, Tuple

import numpy as np

# Module-level logger, named after this module
# (original note: respects LOG_LEVEL env var set in main.py)
logger = logging.getLogger(__name__)

# Disable PaddleOCR model source check for faster startup (PaddleX 3.x).
# This runs when the module is imported, i.e. before the lazy paddleocr
# import performed in OCREngine._init_paddle_lazy.
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'

# Placeholders for heavy third-party objects; each is filled in on first use.
PaddleOCR = None  # Bound to the paddleocr.PaddleOCR class by OCREngine._init_paddle_lazy
pytesseract = None  # Bound to the pytesseract module by OCREngine._tesseract_recognize
doctr_ocr_predictor = None  # Bound to the built docTR predictor by OCREngine._init_doctr_lazy

# Check availability without importing heavy libraries
def _check_paddle_available() -> bool:
    """Check if paddleocr is installed without importing it."""
    try:
        import importlib.util
        return importlib.util.find_spec("paddleocr") is not None
    except Exception:
        return False

def _check_tesseract_available() -> bool:
    """Check if pytesseract is installed without importing it."""
    try:
        import importlib.util
        return importlib.util.find_spec("pytesseract") is not None
    except Exception:
        return False

def _check_doctr_available() -> bool:
    """Check if doctr is installed without importing it."""
    try:
        import importlib.util
        return importlib.util.find_spec("doctr") is not None
    except Exception:
        return False

# Availability flags, evaluated once when this module is imported. Each flag
# only records whether the package could be located; nothing is imported or
# loaded here, so a True value does not guarantee the engine initializes.
PADDLE_AVAILABLE = _check_paddle_available()
TESSERACT_AVAILABLE = _check_tesseract_available()
DOCTR_AVAILABLE = _check_doctr_available()


@dataclass
class OCRResult:
    """Raw result of one OCR pass, as produced by the OCREngine recognizers."""
    # Recognized text. The PaddleOCR and docTR paths join lines with '\n';
    # the Tesseract path stores pytesseract's string output as-is.
    text: str
    # Mean confidence computed by the producing recognizer (Tesseract values
    # are divided by 100); 0.0 when nothing was recognized.
    confidence: float
    # Per-item dicts with keys 'text', 'confidence' and 'box' (PaddleOCR and
    # docTR paths); the Tesseract path leaves this list empty.
    boxes: List[dict]
    engine: str = ""  # OCR engine used: "paddleocr", "tesseract" or "doctr"


class OCREngine:
    """Unified OCR engine with fallback support."""

    def __init__(self):
        self._paddle = None
        self._paddle_init_started = False
        self._paddle_ready = threading.Event()  # Signals when PaddleOCR is FULLY ready
        self._paddle_init_lock = threading.Lock()

        self._doctr = None
        self._doctr_init_started = False
        self._doctr_ready = threading.Event()  # Signals when docTR is FULLY ready
        self._doctr_init_lock = threading.Lock()

    def _init_paddle_lazy(self):
        """Import and construct PaddleOCR once, on first use (avoids slow startup).

        Only the first caller performs the import and model load. Every later
        call returns immediately, even when that first initialization is still
        running on another thread, so callers that need the engine must check
        ``self._paddle`` or wait on ``self._paddle_ready``.

        When the initializing call returns, ``self._paddle`` holds the engine,
        or None if paddleocr is not installed or failed to load.
        ``self._paddle_ready`` is set on every exit path of that call.
        """
        global PaddleOCR

        with self._paddle_init_lock:
            if self._paddle_init_started:
                return  # Already initializing or done
            self._paddle_init_started = True

        try:
            if not PADDLE_AVAILABLE:
                return

            print("Importing PaddleOCR (first use, may take ~15-20 seconds)...", flush=True)
            from paddleocr import PaddleOCR as _PaddleOCR
            PaddleOCR = _PaddleOCR

            print("Initializing PaddleOCR engine...", flush=True)
            # PaddleOCR 3.x API - optimized for Romanian receipts
            # Note: 'latin' not available in PaddleOCR 3.x, 'en' works well for receipts
            self._paddle = PaddleOCR(
                lang='en',  # 'en' handles Latin alphabet well for receipts
                # High quality settings for better accuracy
                det_db_thresh=0.3,      # Lower threshold = detect more text (default 0.3)
                det_db_box_thresh=0.5,  # Box confidence threshold (default 0.5)
                det_db_unclip_ratio=1.8,  # Expand detected boxes slightly (default 1.5)
                rec_batch_num=6,        # Batch size for recognition
                use_angle_cls=True,     # Enable text angle classification
            )
            print("PaddleOCR initialized successfully with high-quality settings", flush=True)
        except Exception as e:
            print(f"Warning: Failed to initialize PaddleOCR: {e}", flush=True)
            self._paddle = None
        finally:
            # Signal completion on every exit path (success, failure, or an
            # exception that is not caught above). Because the "started" flag
            # is never reset, leaving the event unset would make every later
            # wait_for_paddle() call block for its full timeout.
            self._paddle_ready.set()

    def _init_doctr_lazy(self):
        """Import and construct the docTR predictor once, on first use (avoids slow startup).

        Only the first caller performs the import and model load. Every later
        call returns immediately, even when that first initialization is still
        running on another thread, so callers that need the predictor must
        check ``self._doctr`` or wait on ``self._doctr_ready``.

        When the initializing call returns, ``self._doctr`` holds the
        predictor, or None if doctr is not installed or failed to load. The
        module-level ``doctr_ocr_predictor`` is bound to the same predictor on
        success. ``self._doctr_ready`` is set on every exit path of that call.
        """
        global doctr_ocr_predictor

        with self._doctr_init_lock:
            if self._doctr_init_started:
                return  # Already initializing or done
            self._doctr_init_started = True

        try:
            if not DOCTR_AVAILABLE:
                return

            print("Importing docTR (first use, may take ~10-15 seconds)...", flush=True)
            from doctr.models import ocr_predictor

            print("Initializing docTR engine (PyTorch backend)...", flush=True)
            # Initialize docTR predictor with pretrained models
            # Uses db_resnet50 for detection and crnn_vgg16_bn for recognition
            self._doctr = ocr_predictor(
                det_arch='db_resnet50',
                reco_arch='crnn_vgg16_bn',
                pretrained=True,
                assume_straight_pages=True,
                straighten_pages=False,
                preserve_aspect_ratio=True,
            )
            doctr_ocr_predictor = self._doctr
            print("docTR initialized successfully with PyTorch backend", flush=True)
        except Exception as e:
            print(f"Warning: Failed to initialize docTR: {e}", flush=True)
            self._doctr = None
        finally:
            # Signal completion on every exit path (success, failure, or an
            # exception that is not caught above). Because the "started" flag
            # is never reset, leaving the event unset would make every later
            # wait_for_doctr() call block for its full timeout.
            self._doctr_ready.set()

    def wait_for_doctr(self, timeout: float = 30.0) -> bool:
        """
        Block until docTR has finished initializing, or give up.

        If no initialization has been started yet, it is run from this call
        before waiting on the ready event.

        Args:
            timeout: Max seconds to wait on the ready event (default 30s)

        Returns:
            True if a docTR predictor is loaded; False if docTR is not
            installed, failed to initialize, or was not ready within timeout
        """
        if not DOCTR_AVAILABLE:
            return False
        if self._doctr is not None:
            return True  # Nothing to wait for

        if not self._doctr_init_started:
            # Nobody has kicked off initialization yet, so do it here
            self._init_doctr_lazy()

        print(f"[OCR] Waiting for docTR to be ready (max {timeout}s)...", flush=True)
        began = time.time()
        signalled = self._doctr_ready.wait(timeout=timeout)
        waited = time.time() - began

        # The event also fires when initialization failed, so the predictor
        # itself has to be checked as well.
        if not signalled or self._doctr is None:
            print(f"[OCR] docTR not ready after {waited:.1f}s (timeout or failed)", flush=True)
            return False

        print(f"[OCR] docTR ready after {waited:.1f}s", flush=True)
        return True

    def is_doctr_ready(self) -> bool:
        """Check if docTR is ready without waiting."""
        return self._doctr is not None

    def wait_for_paddle(self, timeout: float = 30.0) -> bool:
        """
        Block until PaddleOCR has finished initializing, or give up.

        If no initialization has been started yet, it is run from this call
        before waiting on the ready event.

        Args:
            timeout: Max seconds to wait on the ready event (default 30s)

        Returns:
            True if a PaddleOCR engine is loaded; False if PaddleOCR is not
            installed, failed to initialize, or was not ready within timeout
        """
        if not PADDLE_AVAILABLE:
            return False
        if self._paddle is not None:
            return True  # Nothing to wait for

        if not self._paddle_init_started:
            # Nobody has kicked off initialization yet, so do it here
            self._init_paddle_lazy()

        print(f"[OCR] Waiting for PaddleOCR to be ready (max {timeout}s)...", flush=True)
        began = time.time()
        signalled = self._paddle_ready.wait(timeout=timeout)
        waited = time.time() - began

        # The event also fires when initialization failed, so the engine
        # itself has to be checked as well.
        if not signalled or self._paddle is None:
            print(f"[OCR] PaddleOCR not ready after {waited:.1f}s (timeout or failed)", flush=True)
            return False

        print(f"[OCR] PaddleOCR ready after {waited:.1f}s", flush=True)
        return True

    def is_paddle_ready(self) -> bool:
        """Check if PaddleOCR is ready without waiting."""
        return self._paddle is not None

    def recognize(self, image: np.ndarray) -> OCRResult:
        """Run OCR on a preprocessed image with the preferred available engine.

        PaddleOCR is tried first (initializing it on the first call). Tesseract
        is used when paddleocr is not installed or no PaddleOCR engine is
        loaded at the time of the check.

        NOTE(review): if another thread is still inside PaddleOCR
        initialization, ``self._paddle`` is still None here, so this call goes
        to Tesseract (or raises) instead of waiting. Confirm that is intended.

        Args:
            image: Image array; its ``shape`` and ``dtype`` are logged.

        Returns:
            The OCRResult produced by whichever recognizer ran.

        Raises:
            RuntimeError: If neither PaddleOCR nor Tesseract can be used.
        """
        logger.info(f"[OCR] Starting recognition, image shape: {image.shape}, dtype: {image.dtype}")

        # First call triggers the PaddleOCR import and model load
        self._init_paddle_lazy()

        paddle_usable = PADDLE_AVAILABLE and self._paddle
        if paddle_usable:
            logger.info("[OCR] Using PaddleOCR engine")
            return self._paddle_recognize(image)

        if not TESSERACT_AVAILABLE:
            logger.error("[OCR] No OCR engine available!")
            raise RuntimeError(
                "No OCR engine available. Install PaddleOCR or Tesseract."
            )

        logger.info("[OCR] Using Tesseract engine (PaddleOCR not available)")
        return self._tesseract_recognize(image)

    def _paddle_recognize(self, image: np.ndarray) -> OCRResult:
        """Run PaddleOCR (3.x ``predict`` API) on an image.

        Waits up to 30 seconds for the engine to be ready. If it is not ready,
        or if anything in the PaddleOCR path raises, the image is handed to
        Tesseract instead when Tesseract is available.

        Args:
            image: Image array; a 2-D (grayscale) array is expanded to
                3-channel BGR before prediction.

        Returns:
            An OCRResult tagged "paddleocr" (empty when nothing was found), or
            the Tesseract result when a fallback happened.

        Raises:
            RuntimeError: PaddleOCR is not ready and Tesseract is unavailable.
            Exception: Re-raised from the PaddleOCR path when it fails and
                Tesseract is unavailable.
        """
        # Covers the case where initialization is still running elsewhere
        if not self.wait_for_paddle(timeout=30.0):
            logger.warning("[PaddleOCR] Not ready, falling back to Tesseract")
            if not TESSERACT_AVAILABLE:
                raise RuntimeError("PaddleOCR not ready and Tesseract not available")
            return self._tesseract_recognize(image)

        try:
            logger.info(f"[PaddleOCR] Processing image, shape: {image.shape}")

            # PaddleOCR 3.x requires 3-channel images
            if image.ndim == 2:
                import cv2
                image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
                logger.info(f"[PaddleOCR] Converted to BGR, new shape: {image.shape}")

            # PaddleOCR 3.x uses predict() with new parameter names
            logger.info("[PaddleOCR] Calling predict()...")
            result = self._paddle.predict(image, use_textline_orientation=True)
            logger.info(f"[PaddleOCR] predict() returned, result type: {type(result)}")

            if not result or len(result) == 0:
                logger.warning("[PaddleOCR] No results returned")
                return OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")

            # Only the first element of the prediction list is consumed
            page = result[0]
            rec_texts = page.get('rec_texts', [])
            rec_scores = page.get('rec_scores', [])
            dt_polys = page.get('dt_polys', [])

            if not rec_texts:
                return OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")

            def box_entry(idx, line):
                # Scores/polygons may be shorter than the text list; pad with defaults
                score = rec_scores[idx] if idx < len(rec_scores) else 0.0
                poly = dt_polys[idx].tolist() if idx < len(dt_polys) else []
                return {
                    'text': line,
                    'confidence': float(score),
                    'box': poly
                }

            boxes = [box_entry(idx, line) for idx, line in enumerate(rec_texts)]

            if rec_scores:
                mean_score = sum(rec_scores) / len(rec_scores)
            else:
                mean_score = 0.0
            joined = '\n'.join(rec_texts)
            logger.info(f"[PaddleOCR] SUCCESS - Found {len(rec_texts)} text lines, avg confidence: {mean_score:.2%}")
            logger.debug(f"[PaddleOCR] Raw text preview: {joined[:200]}...")
            return OCRResult(
                text=joined,
                confidence=float(mean_score),
                boxes=boxes,
                engine="paddleocr"
            )
        except Exception as e:
            logger.error(f"[PaddleOCR] ERROR: {e}, falling back to Tesseract")
            if not TESSERACT_AVAILABLE:
                raise
            # `image` may already be the BGR-converted array at this point
            return self._tesseract_recognize(image)

    def _tesseract_recognize(self, image: np.ndarray) -> OCRResult:
        """Recognize text using Tesseract (through pytesseract).

        pytesseract is imported on the first call and cached in the
        module-level ``pytesseract`` name. Tesseract is invoked twice with the
        same config: once for the plain text and once for per-item data, from
        which the confidence is averaged.

        Args:
            image: Image array passed to pytesseract as-is.

        Returns:
            An OCRResult tagged "tesseract" with an empty ``boxes`` list and a
            confidence equal to the mean of the positive ``conf`` values
            divided by 100, or 0.0 when there are none.
        """
        global pytesseract

        logger.info(f"[Tesseract] Processing image, shape: {image.shape}")

        # Lazy import pytesseract
        if pytesseract is None:
            logger.info("[Tesseract] Importing pytesseract...")
            import pytesseract as _pytesseract
            pytesseract = _pytesseract

        # PSM 4: Single column (best for receipts)
        config = '--psm 4 -l ron+eng'
        text = pytesseract.image_to_string(image, config=config)

        # Quick confidence estimate
        data = pytesseract.image_to_data(image, config=config, output_type=pytesseract.Output.DICT)
        confidences = []
        for raw_conf in data['conf']:
            # Depending on the pytesseract/Tesseract versions, entries may be
            # ints, floats, or strings such as '96.06' or '-1'. int('96.06')
            # would raise, so parse as float and skip unparsable entries.
            try:
                value = float(raw_conf)
            except (TypeError, ValueError):
                continue
            if value > 0:  # drops the -1 entries and zero-confidence items
                confidences.append(value)
        avg_conf = sum(confidences) / len(confidences) / 100 if confidences else 0.0

        logger.info(f"[Tesseract] Done: {len(text)} chars, conf: {avg_conf:.2%}")
        return OCRResult(text=text, confidence=avg_conf, boxes=[], engine="tesseract")

    def _doctr_recognize(self, image: np.ndarray) -> OCRResult:
        """Recognize text using docTR.

        Waits up to 30 seconds for the predictor to be ready. If it is not
        ready, or if anything in the docTR path raises, the image is handed to
        Tesseract instead when Tesseract is available.

        Args:
            image: Image array. 2-D input is treated as grayscale, 4-channel
                input as RGBA and 3-channel input as BGR; each is converted
                to RGB before prediction.

        Returns:
            An OCRResult tagged "doctr" with one text line per docTR line and
            one ``boxes`` entry per word (empty when nothing was found), or
            the Tesseract result when a fallback happened.

        Raises:
            RuntimeError: docTR is not ready and Tesseract is unavailable.
            Exception: Re-raised from the docTR path when it fails and
                Tesseract is unavailable.
        """
        # Wait for docTR to be fully ready
        if not self.wait_for_doctr(timeout=30.0):
            logger.warning("[docTR] Not ready, falling back to Tesseract")
            if TESSERACT_AVAILABLE:
                return self._tesseract_recognize(image)
            raise RuntimeError("docTR not ready and Tesseract not available")

        try:
            logger.info(f"[docTR] Processing image, shape: {image.shape}")

            # docTR requires RGB images
            import cv2
            if len(image.shape) == 2:
                # Convert grayscale to RGB
                image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
                logger.info(f"[docTR] Converted grayscale to RGB, new shape: {image.shape}")
            elif image.shape[2] == 4:
                # Convert RGBA to RGB
                # NOTE(review): the 3-channel branch assumes OpenCV (BGR)
                # order while this one assumes RGBA, not BGRA. Confirm which
                # channel order callers actually pass.
                image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
                logger.info(f"[docTR] Converted RGBA to RGB, new shape: {image.shape}")
            elif image.shape[2] == 3:
                # 3-channel input is assumed to be BGR (OpenCV order) and is
                # always swapped to RGB; no detection of the order is done.
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                logger.info(f"[docTR] Converted BGR to RGB, shape: {image.shape}")

            # Process image with docTR
            logger.info("[docTR] Running prediction...")

            # docTR expects a document (list of pages as numpy arrays)
            result = self._doctr([image])

            if not result or not result.pages:
                logger.warning("[docTR] No results returned")
                return OCRResult(text="", confidence=0.0, boxes=[], engine="doctr")

            # Extract text from all pages
            all_texts = []
            all_confidences = []
            boxes = []

            for page in result.pages:
                for block in page.blocks:
                    for line in block.lines:
                        line_text = ' '.join(word.value for word in line.words)
                        # Line confidence is the mean of its word confidences
                        line_confidence = sum(w.confidence for w in line.words) / len(line.words) if line.words else 0.0
                        all_texts.append(line_text)
                        all_confidences.append(line_confidence)

                        # Store word-level boxes
                        for word in line.words:
                            boxes.append({
                                'text': word.value,
                                'confidence': float(word.confidence),
                                'box': word.geometry  # (xmin, ymin), (xmax, ymax)
                            })

            text_result = '\n'.join(all_texts)
            avg_conf = sum(all_confidences) / len(all_confidences) if all_confidences else 0.0

            logger.info(f"[docTR] SUCCESS - Found {len(all_texts)} text lines, avg confidence: {avg_conf:.2%}")
            logger.debug(f"[docTR] Raw text preview: {text_result[:200]}...")

            return OCRResult(
                text=text_result,
                confidence=float(avg_conf),
                boxes=boxes,
                engine="doctr"
            )

        except Exception as e:
            logger.error(f"[docTR] ERROR: {e}, falling back to Tesseract")
            if TESSERACT_AVAILABLE:
                # `image` may already be the RGB-converted array at this point
                return self._tesseract_recognize(image)
            raise

    def recognize_dual(self, image: np.ndarray) -> Tuple[OCRResult, Optional[OCRResult]]:
        """
        Run PaddleOCR and Tesseract on the same image and return both outputs.

        An engine call that raises contributes an empty OCRResult tagged with
        that engine's name instead of aborting the whole call.

        Returns:
            Tuple of (paddle_result, tesseract_result).
            tesseract_result is None if Tesseract is not available.
            If PaddleOCR did not run, the Tesseract result is also returned in
            the first slot, so both elements are the same object.

        Raises:
            RuntimeError: If neither engine produced a result.
        """
        logger.info(f"[OCR Dual] Starting dual recognition, image shape: {image.shape}")

        # First call triggers the PaddleOCR import and model load
        self._init_paddle_lazy()

        primary = secondary = None

        # --- PaddleOCR pass ---
        if PADDLE_AVAILABLE and self._paddle:
            try:
                logger.info("[OCR Dual] Running PaddleOCR...")
                primary = self._paddle_recognize(image)
                logger.info(f"[OCR Dual] PaddleOCR: {len(primary.text)} chars, conf: {primary.confidence:.2%}")
            except Exception as e:
                logger.error(f"[OCR Dual] PaddleOCR failed: {e}")
                primary = OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")

        # --- Tesseract pass ---
        if TESSERACT_AVAILABLE:
            try:
                logger.info("[OCR Dual] Running Tesseract...")
                secondary = self._tesseract_recognize(image)
                logger.info(f"[OCR Dual] Tesseract: {len(secondary.text)} chars, conf: {secondary.confidence:.2%}")
            except Exception as e:
                logger.error(f"[OCR Dual] Tesseract failed: {e}")
                secondary = OCRResult(text="", confidence=0.0, boxes=[], engine="tesseract")

        # No PaddleOCR output: promote the Tesseract result, or give up
        if primary is None:
            if not secondary:
                raise RuntimeError("No OCR engine available")
            primary = secondary

        return primary, secondary

    @staticmethod
    def get_available_engines() -> List[str]:
        """
        List the OCR engines that are installed and not switched off.

        The environment variables OCR_ENABLE_PADDLEOCR and OCR_ENABLE_TESSERACT
        (original note: set from .env) each default to "true"; any value other
        than "true", compared case-insensitively, hides that engine even if it
        is installed. docTR has no such switch here and is listed whenever it
        is installed.

        Possible entries, in this order: tesseract, doctr, doctr_plus, paddleocr
        """
        def switched_on(var_name):
            # Unset variables count as enabled
            return os.getenv(var_name, "true").lower() == "true"

        paddle_on = switched_on("OCR_ENABLE_PADDLEOCR")
        tesseract_on = switched_on("OCR_ENABLE_TESSERACT")

        found = []
        if TESSERACT_AVAILABLE and tesseract_on:
            found.append('tesseract')
        if DOCTR_AVAILABLE:
            # 'doctr_plus' is docTR with 2-tier sequential + early exit
            found.extend(['doctr', 'doctr_plus'])
        if PADDLE_AVAILABLE and paddle_on:
            found.append('paddleocr')

        return found