# roa2web-service-auto/data-entry-app/backend/app/services/ocr_service.py

"""Main OCR service coordinating preprocessing, recognition, and extraction."""

import os
# Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'

import asyncio
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Optional, Tuple

from app.services.ocr_engine import OCREngine
from app.services.ocr_extractor import ReceiptExtractor, ExtractionResult
from app.services.image_preprocessor import ImagePreprocessor


class OCRService:
    """Service for OCR processing of receipt images.

    Chains three collaborators: an image preprocessor (loading, PDF page
    rendering, cleanup), an OCR engine (text recognition) and a receipt
    extractor (structured fields from the recognized text).

    Failures are reported through the returned ``(success, message, result)``
    tuple rather than by raising.
    """

    # Shared by every instance: the blocking pipeline is submitted to this
    # pool so that awaiting ``process_image`` does not block the event loop.
    _executor = ThreadPoolExecutor(max_workers=2)

    def __init__(self):
        self.preprocessor = ImagePreprocessor()
        self.ocr_engine = OCREngine()
        self.extractor = ReceiptExtractor()

    async def process_image(
        self,
        image_path: Path,
        mime_type: str
    ) -> Tuple[bool, str, Optional["ExtractionResult"]]:
        """
        Process receipt image and extract structured data.

        The synchronous pipeline is run on the class-level thread pool and
        awaited from the calling coroutine.

        Args:
            image_path: Path to the image file
            mime_type: MIME type of the file

        Returns:
            Tuple of (success, message, extraction_result). On failure
            ``success`` is False, ``message`` describes the problem and
            ``extraction_result`` is None. This method does not raise for
            errors derived from ``Exception``; they are converted into a
            failure tuple.
        """
        try:
            # We are inside a coroutine, so a loop is guaranteed to be running;
            # get_running_loop() is the recommended way to obtain it.
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(
                self._executor,
                self._process_sync,
                image_path,
                mime_type,
            )
        except Exception as e:
            # Boundary handler: anything the pipeline did not translate itself
            # is reported to the caller as a failed result.
            return False, f"OCR processing failed: {e}", None

    @staticmethod
    def _is_pdf(mime_type: Optional[str]) -> bool:
        """Return True when *mime_type* denotes a PDF.

        Media types are case-insensitive and may carry parameters after a
        ``;`` (for example ``application/pdf; name=x.pdf``), so only the
        normalized ``type/subtype`` part is compared.
        """
        essence = (mime_type or '').split(';', 1)[0].strip().lower()
        return essence == 'application/pdf'

    @staticmethod
    def _describe_fields(extraction) -> str:
        """Build the success message listing which fields were extracted.

        A field counts as found when its value is truthy, so ``None``, an
        empty string or a zero value is reported as missing.
        """
        labelled_values = (
            ("amount", extraction.amount),
            ("date", extraction.receipt_date),
            ("vendor", extraction.partner_name),
            ("CUI", extraction.cui),
            ("number", extraction.receipt_number),
        )
        fields_found = [label for label, value in labelled_values if value]
        return f"OCR processing successful. Found: {', '.join(fields_found) or 'no fields'}"

    def _process_sync(
        self,
        image_path: Path,
        mime_type: str
    ) -> Tuple[bool, str, Optional["ExtractionResult"]]:
        """Synchronous processing (runs in thread pool).

        Steps: load the image (first page only for PDFs), preprocess it, run
        OCR, then extract structured fields from the recognized text.

        Returns:
            Tuple of (success, message, extraction_result); see
            ``process_image``. ``RuntimeError`` from PDF conversion or OCR and
            ``ValueError`` from image loading are turned into failure tuples
            here; any other exception propagates to the caller.
        """
        # Load the source image
        if self._is_pdf(mime_type):
            try:
                images = self.preprocessor.pdf_to_images(image_path)
            except RuntimeError as e:
                return False, str(e), None
            if not images:
                return False, "Failed to extract images from PDF", None
            image = images[0]  # Process first page only
        else:
            try:
                image = self.preprocessor.load_image(image_path)
            except ValueError as e:
                return False, str(e), None

        # Preprocess image
        processed = self.preprocessor.preprocess(image)

        # Perform OCR
        try:
            ocr_result = self.ocr_engine.recognize(processed)
        except RuntimeError as e:
            return False, str(e), None

        if not ocr_result.text:
            return False, "No text detected in image", None

        # Extract structured fields
        extraction = self.extractor.extract(ocr_result.text)

        return True, self._describe_fields(extraction), extraction


# Module-level singleton instance, created at import time. Constructing it
# instantiates the preprocessor, OCR engine and extractor, so importing this
# module performs that setup once and callers share this one object.
ocr_service = OCRService()