# roa2web-service-auto/data-entry-app/backend/app/services/image_preprocessor.py

"""Image preprocessing for optimal OCR results."""

from pathlib import Path
from typing import List

import numpy as np
import cv2

try:
    import pdf2image
    PDF_AVAILABLE = True
except ImportError:
    PDF_AVAILABLE = False


class ImagePreprocessor:
    """Preprocess receipt images for OCR.

    The class keeps no state; every method works only on its arguments.
    """

    def load_image(self, path: Path) -> np.ndarray:
        """Load an image from disk with OpenCV.

        Args:
            path: Location of the image file.

        Returns:
            The decoded image exactly as returned by ``cv2.imread``.

        Raises:
            ValueError: If OpenCV could not read or decode the file.
        """
        image = cv2.imread(str(path))
        # cv2.imread reports failure by returning None rather than raising.
        if image is None:
            raise ValueError(f"Could not load image: {path}")
        return image

    def pdf_to_images(self, path: Path, dpi: int = 400) -> List[np.ndarray]:
        """
        Convert PDF to images with high DPI for better OCR.

        Args:
            path: Path to PDF file
            dpi: Resolution (400 recommended for receipts, higher = better quality but slower)

        Returns:
            One numpy array per page, in the order pdf2image yields them.

        Raises:
            RuntimeError: If the optional ``pdf2image`` dependency is missing.
        """
        if not PDF_AVAILABLE:
            raise RuntimeError("pdf2image not available. Install with: pip install pdf2image")
        # The default of 400 DPI targets small print on thermal receipts;
        # callers may trade quality for speed through ``dpi``.
        images = pdf2image.convert_from_path(str(path), dpi=dpi)
        # NOTE(review): pdf2image yields PIL images, so these arrays are
        # presumably RGB-ordered, while preprocess() documents BGR input.
        # Confirm whether callers need a channel swap before preprocessing.
        return [np.array(img) for img in images]

    def preprocess(self, image: np.ndarray, high_quality: bool = True) -> np.ndarray:
        """
        Apply preprocessing pipeline for thermal receipt images.

        Pipeline:
        1. Convert to grayscale
        2. Resize if too small (min width 1500px, or 1000px when not high quality)
        3. Deskew (straighten rotated text)
        4. Contrast enhancement (CLAHE)
        5. Denoise (Non-local means)
        6. Sharpening (high quality only)
        7. Adaptive thresholding (binarization)
        8. Morphological operations

        Args:
            image: Input image (BGR or grayscale)
            high_quality: If True, apply more aggressive preprocessing

        Returns:
            The binarized single-channel image.

        Raises:
            ValueError: If ``image`` is not a non-empty 2-D or 3-D numpy array.
        """
        # 1. Grayscale (also rejects None / empty / wrongly shaped input so the
        #    scale computation below can never divide by a zero width).
        gray = self._to_grayscale(image)

        # 2. Resize if too small (larger = better OCR)
        width = gray.shape[1]
        min_width = 1500 if high_quality else 1000
        if width < min_width:
            scale = min_width / width
            gray = cv2.resize(
                gray, None, fx=scale, fy=scale,
                interpolation=cv2.INTER_CUBIC
            )

        # 3. Deskew
        gray = self._deskew(gray)

        # 4. Contrast enhancement with CLAHE (Contrast Limited Adaptive Histogram Equalization)
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)

        # 5. Denoise (slightly less aggressive to preserve text details)
        denoised = cv2.fastNlMeansDenoising(
            enhanced, h=8,  # Lower h = preserve more details
            templateWindowSize=7,
            searchWindowSize=21
        )

        # 6. Sharpening to enhance text edges
        if high_quality:
            # Unsharp mask: 1.5 * image - 0.5 * blurred image.
            gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0)
            sharpened = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0)
        else:
            sharpened = denoised

        # 7. Adaptive thresholding with optimized parameters
        binary = cv2.adaptiveThreshold(
            sharpened, 255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            blockSize=11,  # Smaller block = better for small text
            C=5  # Lower C = darker result, better for faded receipts
        )

        # 8. Morphological operations
        # NOTE(review): THRESH_BINARY leaves dark ink as 0 on a 255 background,
        # and MORPH_CLOSE acts on the bright regions, so this presumably fills
        # tiny dark specks rather than joining broken strokes - confirm the
        # intended polarity.
        kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
        result = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel_close)

        if high_quality:
            # NOTE(review): a 1x1 structuring element makes erosion and dilation
            # identity operations, so this opening leaves the image unchanged.
            # Kept as-is to preserve current output; a larger kernel is needed
            # if noise removal is really wanted.
            kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
            result = cv2.morphologyEx(result, cv2.MORPH_OPEN, kernel_open)

        return result

    def _to_grayscale(self, image: np.ndarray) -> np.ndarray:
        """Validate ``image`` and return a new single-channel version of it."""
        if not isinstance(image, np.ndarray):
            raise ValueError(
                f"Expected image as numpy.ndarray, got {type(image).__name__}"
            )
        if image.ndim not in (2, 3):
            raise ValueError(
                f"Expected a 2-D or 3-D image array, got shape {image.shape}"
            )
        if image.size == 0:
            raise ValueError(f"Cannot preprocess an empty image (shape {image.shape})")

        if image.ndim == 2:
            # Already single-channel; copy so the caller's array is never modified.
            return image.copy()
        if image.shape[2] == 1:
            # (h, w, 1) carries a single plane; there is nothing to convert.
            return image[:, :, 0].copy()
        return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    def _deskew(self, image: np.ndarray) -> np.ndarray:
        """Correct image rotation/skew using Hough lines.

        Detects line segments on the Canny edge map, takes the median angle
        of those within 45 degrees of horizontal, and rotates the image by
        that angle about its centre. The input is returned unchanged when no
        usable segment is found or the median skew is below 0.5 degrees.
        """
        edges = cv2.Canny(image, 50, 150, apertureSize=3)
        lines = cv2.HoughLinesP(
            edges, 1, np.pi / 180,
            threshold=100, minLineLength=100, maxLineGap=10
        )

        if lines is None:
            return image

        # Each detected segment is (x1, y1, x2, y2).
        segments = lines.reshape(-1, 4)
        dx = segments[:, 2] - segments[:, 0]
        dy = segments[:, 3] - segments[:, 1]
        angles = np.arctan2(dy, dx) * 180 / np.pi

        # Ignore steep segments; only near-horizontal ones are used for skew.
        near_horizontal = angles[np.abs(angles) < 45]
        if near_horizontal.size == 0:
            return image

        median_angle = float(np.median(near_horizontal))
        if abs(median_angle) < 0.5:
            # Not worth an interpolation pass for a negligible skew.
            return image

        h, w = image.shape[:2]
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
        return cv2.warpAffine(
            image, M, (w, h),
            flags=cv2.INTER_CUBIC,
            borderMode=cv2.BORDER_REPLICATE
        )