# roa2web-service-auto/data-entry-app/backend/app/services/ocr_engine.py

"""OCR engine wrapper for PaddleOCR and Tesseract."""

import os
from dataclasses import dataclass
from typing import List, Optional

import numpy as np

# Disable PaddleOCR model source check for faster startup (PaddleX 3.x).
# This is set at module import time, i.e. before paddleocr itself is imported
# (that import is deferred until OCREngine first needs it).
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'

# Placeholders for the lazily imported OCR backends. They stay None until
# first use and are then rebound via ``global`` inside OCREngine:
PaddleOCR = None  # set to the paddleocr.PaddleOCR class by OCREngine._init_paddle_lazy
pytesseract = None  # set to the pytesseract module by OCREngine._tesseract_recognize

# Check availability without importing heavy libraries
def _check_paddle_available() -> bool:
    """Check if paddleocr is installed without importing it."""
    try:
        import importlib.util
        return importlib.util.find_spec("paddleocr") is not None
    except Exception:
        return False

def _check_tesseract_available() -> bool:
    """Check if pytesseract is installed without importing it."""
    try:
        import importlib.util
        return importlib.util.find_spec("pytesseract") is not None
    except Exception:
        return False

# Computed once at import time. These flags only reflect that the Python
# package could be located; they do not prove that it imports or works.
PADDLE_AVAILABLE = _check_paddle_available()
TESSERACT_AVAILABLE = _check_tesseract_available()


@dataclass
class OCRResult:
    """Raw OCR result returned by ``OCREngine``, independent of the backend used."""
    # Full recognized text. The PaddleOCR path joins the recognized lines with
    # '\n'; the Tesseract path stores the image_to_string output unchanged.
    text: str
    # Average recognition confidence; 0.0 when nothing was recognized. The
    # Tesseract path divides its scores by 100; the PaddleOCR path averages
    # the scores as returned (presumably already in the 0-1 range - TODO confirm).
    confidence: float
    # Per-text entries as dicts with 'text', 'confidence' and 'box' keys.
    # Filled only by the PaddleOCR path; the Tesseract path leaves it empty.
    boxes: List[dict]


class OCREngine:
    """Unified OCR engine with fallback support.

    PaddleOCR is used when the package is installed and its engine
    initializes successfully; otherwise recognition falls back to Tesseract.
    Both backends are imported lazily on first use, so constructing an
    ``OCREngine`` does not load any OCR library.
    """

    def __init__(self):
        """Create an engine without loading any OCR backend yet."""
        self._paddle = None  # PaddleOCR instance, or None if unavailable/failed
        self._paddle_initialized = False  # True once initialization was attempted

    def _init_paddle_lazy(self):
        """Lazy initialize PaddleOCR on first use (avoids slow startup).

        Initialization is attempted at most once per instance. On failure a
        warning is printed and ``self._paddle`` stays ``None`` so that
        ``recognize`` can fall back to Tesseract.
        """
        global PaddleOCR

        if self._paddle_initialized:
            return

        # Mark as attempted before doing any work so that a failing import or
        # constructor is not retried on every recognize() call.
        self._paddle_initialized = True
        if not PADDLE_AVAILABLE:
            return

        try:
            print("Importing PaddleOCR (first use, may take ~15-20 seconds)...")
            from paddleocr import PaddleOCR as _PaddleOCR
            PaddleOCR = _PaddleOCR

            print("Initializing PaddleOCR engine...")
            # PaddleOCR 3.x API - optimized for Romanian receipts
            self._paddle = PaddleOCR(
                lang='en',  # 'en' works better than 'ro' for mixed alphanumeric
                # High quality settings for better accuracy
                det_db_thresh=0.3,      # Lower threshold = detect more text (default 0.3)
                det_db_box_thresh=0.5,  # Box confidence threshold (default 0.5)
                det_db_unclip_ratio=1.8,  # Expand detected boxes slightly (default 1.5)
                rec_batch_num=6,        # Batch size for recognition
                use_angle_cls=True,     # Enable text angle classification
            )
            print("PaddleOCR initialized successfully with high-quality settings")
        except Exception as e:
            print(f"Warning: Failed to initialize PaddleOCR: {e}")
            self._paddle = None

    def recognize(self, image: np.ndarray) -> "OCRResult":
        """Perform OCR on preprocessed image.

        Args:
            image: Image array to recognize. 2-D (grayscale) input is
                converted to 3 channels before being passed to PaddleOCR.

        Returns:
            The recognized text, average confidence and (PaddleOCR only)
            per-text boxes.

        Raises:
            RuntimeError: If neither PaddleOCR nor Tesseract is usable.
        """
        # Lazy init PaddleOCR on first call
        self._init_paddle_lazy()

        if PADDLE_AVAILABLE and self._paddle:
            return self._paddle_recognize(image)
        if TESSERACT_AVAILABLE:
            return self._tesseract_recognize(image)
        raise RuntimeError(
            "No OCR engine available. Install PaddleOCR or Tesseract."
        )

    @staticmethod
    def _collect_paddle_boxes(texts: list, scores: list, polys: list) -> List[dict]:
        """Pair each recognized text with its score and polygon.

        Missing scores default to 0.0 and missing polygons to an empty list.
        Polygons are converted through ``np.asarray`` so that both NumPy
        arrays and plain nested sequences end up as nested Python lists.
        """
        boxes = []
        for i, text in enumerate(texts):
            conf = scores[i] if i < len(scores) else 0.0
            box = np.asarray(polys[i]).tolist() if i < len(polys) else []
            boxes.append({
                'text': text,
                'confidence': float(conf),
                'box': box
            })
        return boxes

    @staticmethod
    def _mean_tesseract_confidence(raw_confidences: list) -> float:
        """Average Tesseract confidences and scale them from 0-100 to 0-1.

        Values may arrive as ints, floats or strings (including
        float-formatted strings such as ``'96.5'``). Entries that cannot be
        parsed, and entries that are not strictly positive (Tesseract reports
        -1 for rows that are not words), are ignored.

        Returns:
            The mean confidence divided by 100, or 0.0 if no usable value
            remains.
        """
        values = []
        for raw in raw_confidences:
            try:
                value = float(raw)
            except (TypeError, ValueError):
                continue
            if value > 0:
                values.append(value)
        return sum(values) / len(values) / 100 if values else 0.0

    def _paddle_recognize(self, image: np.ndarray) -> "OCRResult":
        """Recognize text using PaddleOCR 3.x API.

        If anything in the PaddleOCR path raises, the error is printed and
        the image is retried with Tesseract when it is installed; otherwise
        the original exception propagates.
        """
        try:
            # PaddleOCR 3.x requires 3-channel images
            if len(image.shape) == 2:
                # Convert grayscale to 3-channel BGR
                import cv2
                image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)

            # PaddleOCR 3.x uses predict() with new parameter names
            result = self._paddle.predict(image, use_textline_orientation=True)
            if not result:
                return OCRResult(text="", confidence=0.0, boxes=[])

            # PaddleOCR 3.x returns one dict-like result object per input image
            ocr_result = result[0]

            texts = list(ocr_result.get('rec_texts', []))
            if not texts:
                return OCRResult(text="", confidence=0.0, boxes=[])
            scores = list(ocr_result.get('rec_scores', []))

            # 'rec_polys' holds the polygons of the texts that were kept and
            # is index-aligned with rec_texts/rec_scores; 'dt_polys' holds
            # every detection and is used only when 'rec_polys' is absent.
            polys = ocr_result.get('rec_polys')
            if polys is None:
                polys = ocr_result.get('dt_polys', [])

            boxes = self._collect_paddle_boxes(texts, scores, polys)
            avg_conf = sum(scores) / len(scores) if scores else 0.0
            return OCRResult(
                text='\n'.join(texts),
                confidence=float(avg_conf),
                boxes=boxes
            )
        except Exception as e:
            print(f"PaddleOCR error: {e}, falling back to Tesseract")
            if TESSERACT_AVAILABLE:
                return self._tesseract_recognize(image)
            raise

    def _tesseract_recognize(self, image: np.ndarray) -> "OCRResult":
        """Recognize text using Tesseract.

        The text comes from ``image_to_string``; the confidence is the mean
        of the positive word confidences reported by ``image_to_data``,
        scaled to 0-1. No boxes are returned on this path.
        """
        global pytesseract

        # Lazy import pytesseract
        if pytesseract is None:
            print("Importing pytesseract...")
            import pytesseract as _pytesseract
            pytesseract = _pytesseract

        # Page segmentation mode 6, Romanian + English language data
        config = '--psm 6 -l ron+eng'
        text = pytesseract.image_to_string(image, config=config)
        data = pytesseract.image_to_data(
            image, config=config,
            output_type=pytesseract.Output.DICT
        )

        avg_conf = self._mean_tesseract_confidence(data['conf'])
        return OCRResult(text=text, confidence=avg_conf, boxes=[])

    @staticmethod
    def get_available_engines() -> List[str]:
        """Return list of available OCR engines.

        The names are ``'paddleocr'`` and ``'tesseract'``, in that order,
        each included only if the corresponding package was found.
        """
        engines = []
        if PADDLE_AVAILABLE:
            engines.append('paddleocr')
        if TESSERACT_AVAILABLE:
            engines.append('tesseract')
        return engines