# roa2web-service-auto/deploy-package-20260223-151231/backend/modules/data_entry/services/image_preprocessor.py

"""Image preprocessing for optimal OCR results."""

from pathlib import Path
from typing import List

import numpy as np
import cv2

try:
    import pdf2image
    PDF_AVAILABLE = True
except ImportError:
    PDF_AVAILABLE = False


class ImagePreprocessor:
    """Preprocess receipt images for OCR."""

    def _add_safety_padding(self, image: np.ndarray, padding: int = 50) -> np.ndarray:
        """Surround ``image`` with a constant white border.

        The border keeps text that sits close to the original edges away from
        the image boundary, so it is not cut off when the image is rotated
        during deskew.

        Args:
            image: 2-D array (grayscale) or multi-dimensional array (handled
                as 3-channel colour).
            padding: Border width in pixels, applied to all four sides.

        Returns:
            The bordered image produced by ``cv2.copyMakeBorder``.
        """
        is_grayscale = len(image.shape) == 2
        # Grayscale takes a scalar fill; colour needs one white value per channel.
        white = 255 if is_grayscale else (255, 255, 255)
        return cv2.copyMakeBorder(
            image, padding, padding, padding, padding,
            cv2.BORDER_CONSTANT, value=white
        )

    def load_image(self, path: Path) -> np.ndarray:
        """Read an image file from disk with ``cv2.imread``.

        Args:
            path: Location of the image file.

        Returns:
            The decoded image array.

        Raises:
            ValueError: If ``cv2.imread`` returns ``None`` for ``path``.
        """
        loaded = cv2.imread(str(path))
        if loaded is not None:
            return loaded
        raise ValueError(f"Could not load image: {path}")

    def pdf_to_images(self, path: Path, dpi: int = 300) -> List[np.ndarray]:
        """
        Render every page of a PDF into a numpy array.

        Args:
            path: Path to PDF file
            dpi: Resolution (300 = fast & good quality, 400 = better but slower)

        Returns:
            One array per page, in the order returned by pdf2image.

        Raises:
            RuntimeError: If the optional pdf2image dependency is missing.
        """
        if PDF_AVAILABLE:
            pages = pdf2image.convert_from_path(str(path), dpi=dpi)
            return list(map(np.array, pages))
        raise RuntimeError("pdf2image not available. Install with: pip install pdf2image")

    def preprocess(self, image: np.ndarray, high_quality: bool = True) -> np.ndarray:
        """
        Default preprocessing entry point: delegates to ``preprocess_light``.

        LIGHT preprocessing is used because heavy binarization can destroy
        text on clear images.

        Args:
            image: Image to preprocess.
            high_quality: Accepted for interface compatibility; currently
                ignored (the light pipeline is always used).

        Returns:
            Whatever ``preprocess_light`` returns for ``image``.
        """
        light_result = self.preprocess_light(image)
        return light_result

    def preprocess_light(self, image: np.ndarray) -> np.ndarray:
        """
        Light preprocessing for CLEAR images (PDFs, good scans).
        Preserves original quality, only enhances contrast.

        Pipeline: white safety padding -> grayscale -> size normalisation
        (longest side capped at 4000px, width raised towards 1500px) ->
        deskew -> CLAHE (clipLimit=1.5). No binarization, no morphology.

        Args:
            image: 2-D grayscale array, or a 3-D array that is converted with
                ``cv2.COLOR_BGR2GRAY``.

        Returns:
            Contrast-enhanced single-channel image. Its size can differ from
            the input because of padding, rescaling and the expanded deskew
            canvas.
        """
        # 0. Add safety padding to protect edge content during deskew rotation
        image = self._add_safety_padding(image)

        # 1. Grayscale
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image.copy()

        # 2a. Scale DOWN if any side exceeds 4000px (PaddleOCR limit)
        height, width = gray.shape
        max_side = max(height, width)
        if max_side > 4000:
            scale = 4000 / max_side
            gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
            height, width = gray.shape

        # 2b. Scale UP if too small
        if width < 1500:
            # Cap the factor so the longest side stays within 4000px. The cap
            # must be relative to the CURRENT size: deriving it from the
            # already-upscaled size gives a factor below 1, which would shrink
            # tall, narrow receipts instead of enlarging them.
            scale = min(1500 / width, 4000 / max(height, width))
            if scale > 1.0:
                gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)

        # 3. Deskew
        gray = self._deskew(gray)

        # 4. Light contrast enhancement only
        clahe = cv2.createCLAHE(clipLimit=1.5, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)

        # NO binarization, NO morphological ops - preserve original quality
        return enhanced

    def preprocess_medium(self, image: np.ndarray) -> np.ndarray:
        """
        Medium preprocessing for MIXED-QUALITY images.
        Balance between Light (too gentle) and Heavy (too aggressive).

        Use cases:
        - Moderately faded receipts
        - Photos with uneven lighting
        - Scans with slight blur

        Preprocessing steps:
        - White safety padding, grayscale, size normalisation, deskew
        - Moderate contrast enhancement (CLAHE clipLimit=2.0)
        - Light denoising (fastNlMeansDenoising h=6)
        - Gentle sharpening (unsharp mask)
        - NO binarization (preserves text boundaries)
        - NO morphological operations (avoids digit concatenation)

        This method was created to replace preprocess_heavy() which caused
        digit concatenation errors on high-quality PDFs (85.99 → 859,762.16).

        Args:
            image: 2-D grayscale array, or a 3-D array that is converted with
                ``cv2.COLOR_BGR2GRAY``.

        Returns:
            Enhanced, denoised and sharpened single-channel image. Its size
            can differ from the input because of padding, rescaling and the
            expanded deskew canvas.
        """
        # 0. Add safety padding to protect edge content during deskew rotation
        image = self._add_safety_padding(image)

        # 1. Grayscale
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image.copy()

        # 2a. Scale DOWN if any side exceeds 4000px (PaddleOCR limit)
        height, width = gray.shape
        max_side = max(height, width)
        if max_side > 4000:
            scale = 4000 / max_side
            gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
            height, width = gray.shape

        # 2b. Scale UP if too small
        if width < 1500:
            # Cap the factor so the longest side stays within 4000px. The cap
            # must be relative to the CURRENT size: deriving it from the
            # already-upscaled size gives a factor below 1, which would shrink
            # tall, narrow receipts instead of enlarging them.
            scale = min(1500 / width, 4000 / max(height, width))
            if scale > 1.0:
                gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)

        # 3. Deskew
        gray = self._deskew(gray)

        # 4. Moderate contrast enhancement (CLAHE clipLimit=2.0)
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)

        # 5. Light denoising (less aggressive than Heavy)
        denoised = cv2.fastNlMeansDenoising(enhanced, h=6, templateWindowSize=7, searchWindowSize=15)

        # 6. Gentle sharpening: unsharp mask (1.3 * image - 0.3 * blurred copy)
        gaussian = cv2.GaussianBlur(denoised, (0, 0), 1.0)
        sharpened = cv2.addWeighted(denoised, 1.3, gaussian, -0.3, 0)

        # NO binarization, NO morphological operations
        # This preserves text boundaries and avoids digit concatenation
        return sharpened

    def preprocess_heavy(self, image: np.ndarray) -> np.ndarray:
        """
        Heavy preprocessing for FADED thermal receipts.
        Aggressive binarization to recover faded text.

        ⚠️ DEPRECATED: Use preprocess_medium() instead.
        Heavy preprocessing causes digit concatenation on clear PDFs
        (e.g., 85.99 → 859,762.16 due to binarization + morphological operations).
        Kept for backward compatibility only.

        Pipeline: white safety padding -> grayscale -> size normalisation
        (longest side capped at 4000px, width raised towards 1500px) ->
        deskew -> CLAHE (clipLimit=2.0) -> denoise -> sharpen ->
        adaptive threshold -> morphological close.

        Args:
            image: 2-D grayscale array, or a 3-D array that is converted with
                ``cv2.COLOR_BGR2GRAY``.

        Returns:
            Binarized single-channel image (values 0 or 255). Its size can
            differ from the input because of padding, rescaling and the
            expanded deskew canvas.
        """
        # 0. Add safety padding to protect edge content during deskew rotation
        image = self._add_safety_padding(image)

        # 1. Grayscale
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image.copy()

        # 2a. Scale DOWN if any side exceeds 4000px (PaddleOCR limit)
        height, width = gray.shape
        max_side = max(height, width)
        if max_side > 4000:
            scale = 4000 / max_side
            gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
            height, width = gray.shape

        # 2b. Scale UP if too small (larger = better OCR)
        if width < 1500:
            # Cap the factor so the longest side stays within 4000px. The cap
            # must be relative to the CURRENT size: deriving it from the
            # already-upscaled size gives a factor below 1, which would shrink
            # tall, narrow receipts instead of enlarging them.
            scale = min(1500 / width, 4000 / max(height, width))
            if scale > 1.0:
                gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)

        # 3. Deskew
        gray = self._deskew(gray)

        # 4. Contrast enhancement with CLAHE
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)

        # 5. Denoise
        denoised = cv2.fastNlMeansDenoising(enhanced, h=8, templateWindowSize=7, searchWindowSize=21)

        # 6. Sharpening: unsharp mask (1.5 * image - 0.5 * blurred copy)
        gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0)
        sharpened = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0)

        # 7. Adaptive thresholding (binarization)
        binary = cv2.adaptiveThreshold(
            sharpened, 255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            blockSize=11, C=5
        )

        # 8. Morphological close with a 2x2 rectangular kernel
        kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
        result = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel_close)

        return result

    def preprocess_for_tesseract(self, image: np.ndarray, binarize: bool = False,
                                   padding: int = 0, clahe_clip: float = 1.5) -> np.ndarray:
        """
        Tesseract-oriented preprocessing: grayscale, optional white padding,
        CLAHE, and optional Otsu binarization.

        BENCHMARK FINDINGS (as recorded by the original author):
        - DPI 200 is optimal (not 300!)
        - Padding 40px fixes left margin truncation issues
        - CLAHE 1.5 for most receipts, 2.0 for difficult ones
        - NO deskew, NO denoising for clear PDFs

        Recommended usage:
        - Simple receipts: padding=0, clahe_clip=1.5
        - Complex receipts: padding=40, clahe_clip=1.5
        - Difficult/faded: padding=40, clahe_clip=2.0, binarize=True

        Args:
            image: Input image. 3-D input is always converted with
                ``cv2.COLOR_RGB2GRAY``, whatever its actual channel order;
                2-D input is copied as-is.
            binarize: Apply Otsu binarization (for faded receipts)
            padding: White padding in pixels; values <= 0 add no border
            clahe_clip: CLAHE clip limit (1.5 normal, 2.0 for difficult)

        Returns:
            CLAHE-enhanced grayscale image, or its binarized version when
            ``binarize`` is true.
        """
        # 1. Grayscale
        is_multichannel = len(image.shape) == 3
        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) if is_multichannel else image.copy()

        # 2. Optional white border (protects against left margin truncation)
        if padding > 0:
            gray = cv2.copyMakeBorder(
                gray, padding, padding, padding, padding,
                cv2.BORDER_CONSTANT, value=255
            )

        # 3. CLAHE contrast enhancement.
        # Deliberately NO deskew and NO denoising - these DEGRADE quality on clear PDFs!
        equalizer = cv2.createCLAHE(clipLimit=clahe_clip, tileGridSize=(8, 8))
        enhanced = equalizer.apply(gray)

        if binarize:
            # 4. Otsu binarization, only for faded receipts
            _, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

            # A mostly-dark result is inverted so the background ends up white
            mostly_dark = np.mean(binary) < 127
            return 255 - binary if mostly_dark else binary

        return enhanced

    def preprocess_for_tesseract_padded(self, image: np.ndarray) -> np.ndarray:
        """
        Run ``preprocess_for_tesseract`` with a 40px white border.

        Intended for complex receipts where the left margin gets truncated.
        All other options keep their defaults.
        """
        padded_result = self.preprocess_for_tesseract(image, padding=40)
        return padded_result

    def preprocess_for_tesseract_faded(self, image: np.ndarray) -> np.ndarray:
        """
        Run ``preprocess_for_tesseract`` with binarization enabled.

        Intended for FADED thermal receipts, where binarization helps recover
        faded text. Padding and CLAHE clip limit keep their defaults.
        """
        binarized_result = self.preprocess_for_tesseract(image, binarize=True)
        return binarized_result

    def get_all_variants(self, image: np.ndarray) -> List[np.ndarray]:
        """
        Build the two preprocessing variants used for OCR (fast mode).

        Returns:
            ``[light_processed, heavy_processed]``: the outputs of
            ``preprocess_light`` and ``preprocess_heavy``, in that order.
        """
        pipelines = (self.preprocess_light, self.preprocess_heavy)
        return [run(image) for run in pipelines]

    def _deskew(self, image: np.ndarray) -> np.ndarray:
        """Straighten a skewed image using the median angle of Hough line segments.

        Segments are detected on a Canny edge map. Only segments within 45
        degrees of horizontal are used. The image is returned unchanged when
        no segments are found, none qualify, or the median angle is under
        0.5 degrees. Otherwise it is rotated by the median angle onto an
        enlarged white canvas so nothing is cut off at the left/right margins.

        Args:
            image: Image to straighten. The white fill value is a scalar, so
                a grayscale image is expected.

        Returns:
            The input itself, or a rotated copy on a larger canvas.
        """
        edge_map = cv2.Canny(image, 50, 150, apertureSize=3)
        segments = cv2.HoughLinesP(
            edge_map, 1, np.pi / 180,
            threshold=100, minLineLength=100, maxLineGap=10
        )
        if segments is None:
            return image

        # Angle of every segment in degrees; keep only the near-horizontal ones.
        tilts = (
            np.arctan2(y_end - y_start, x_end - x_start) * 180 / np.pi
            for x_start, y_start, x_end, y_end in (segment[0] for segment in segments)
        )
        near_horizontal = [tilt for tilt in tilts if abs(tilt) < 45]
        if not near_horizontal:
            return image

        skew = np.median(near_horizontal)
        if abs(skew) < 0.5:
            # Too small to be worth resampling the image.
            return image

        height, width = image.shape[:2]
        rotation = cv2.getRotationMatrix2D((width // 2, height // 2), skew, 1.0)

        # Bounding box of the rotated image, so the canvas can hold all of it.
        theta = np.radians(skew)
        cos_abs = abs(np.cos(theta))
        sin_abs = abs(np.sin(theta))
        canvas_w = int(height * sin_abs + width * cos_abs)
        canvas_h = int(height * cos_abs + width * sin_abs)

        # Shift the transform so the content is centred on the larger canvas.
        rotation[0, 2] += (canvas_w - width) / 2
        rotation[1, 2] += (canvas_h - height) / 2

        return cv2.warpAffine(
            image, rotation, (canvas_w, canvas_h),
            flags=cv2.INTER_CUBIC,
            borderMode=cv2.BORDER_CONSTANT,
            borderValue=255  # White background (grayscale)
        )