"""Image preprocessing for optimal OCR results.""" from pathlib import Path from typing import List import numpy as np import cv2 try: import pdf2image PDF_AVAILABLE = True except ImportError: PDF_AVAILABLE = False class ImagePreprocessor: """Preprocess receipt images for OCR.""" def load_image(self, path: Path) -> np.ndarray: """Load image from file.""" image = cv2.imread(str(path)) if image is None: raise ValueError(f"Could not load image: {path}") return image def pdf_to_images(self, path: Path, dpi: int = 400) -> List[np.ndarray]: """ Convert PDF to images with high DPI for better OCR. Args: path: Path to PDF file dpi: Resolution (400 recommended for receipts, higher = better quality but slower) """ if not PDF_AVAILABLE: raise RuntimeError("pdf2image not available. Install with: pip install pdf2image") # Use 400 DPI for better text recognition on thermal receipts images = pdf2image.convert_from_path(str(path), dpi=dpi) return [np.array(img) for img in images] def preprocess(self, image: np.ndarray, high_quality: bool = True) -> np.ndarray: """ Apply preprocessing pipeline for thermal receipt images. Pipeline: 1. Convert to grayscale 2. Resize if too small (min 1500px width for high quality) 3. Deskew (straighten rotated text) 4. Contrast enhancement (CLAHE) 5. Denoise (Non-local means) 6. Sharpening (for clearer text edges) 7. Adaptive thresholding (binarization) 8. Morphological operations (connect broken chars) Args: image: Input image (BGR or grayscale) high_quality: If True, apply more aggressive preprocessing """ # 1. Grayscale if len(image.shape) == 3: gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) else: gray = image.copy() # 2. Resize if too small (larger = better OCR) height, width = gray.shape min_width = 1500 if high_quality else 1000 if width < min_width: scale = min_width / width gray = cv2.resize( gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC ) # 3. Deskew gray = self._deskew(gray) # 4. Contrast enhancement with CLAHE (Contrast Limited Adaptive Histogram Equalization) clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)) enhanced = clahe.apply(gray) # 5. Denoise (slightly less aggressive to preserve text details) denoised = cv2.fastNlMeansDenoising( enhanced, h=8, # Lower h = preserve more details templateWindowSize=7, searchWindowSize=21 ) # 6. Sharpening to enhance text edges if high_quality: # Unsharp mask for better text clarity gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0) sharpened = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0) else: sharpened = denoised # 7. Adaptive thresholding with optimized parameters binary = cv2.adaptiveThreshold( sharpened, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blockSize=11, # Smaller block = better for small text C=5 # Lower C = darker result, better for faded receipts ) # 8. Morphological operations # Close small gaps in characters kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2)) result = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel_close) # Optional: Remove small noise spots if high_quality: kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1)) result = cv2.morphologyEx(result, cv2.MORPH_OPEN, kernel_open) return result def _deskew(self, image: np.ndarray) -> np.ndarray: """Correct image rotation/skew using Hough lines.""" edges = cv2.Canny(image, 50, 150, apertureSize=3) lines = cv2.HoughLinesP( edges, 1, np.pi / 180, threshold=100, minLineLength=100, maxLineGap=10 ) if lines is None: return image angles = [] for line in lines: x1, y1, x2, y2 = line[0] angle = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi if abs(angle) < 45: angles.append(angle) if not angles: return image median_angle = np.median(angles) if abs(median_angle) < 0.5: return image h, w = image.shape[:2] center = (w // 2, h // 2) M = cv2.getRotationMatrix2D(center, median_angle, 1.0) return cv2.warpAffine( image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE )