"""Image preprocessing for optimal OCR results.""" from pathlib import Path from typing import List import numpy as np import cv2 try: import pdf2image PDF_AVAILABLE = True except ImportError: PDF_AVAILABLE = False class ImagePreprocessor: """Preprocess receipt images for OCR.""" def _add_safety_padding(self, image: np.ndarray, padding: int = 50) -> np.ndarray: """Add white padding around image to protect edge content during rotation. This prevents left/right margin truncation in OCR by ensuring text near edges isn't lost during deskew rotation. """ if len(image.shape) == 2: # Grayscale return cv2.copyMakeBorder( image, padding, padding, padding, padding, cv2.BORDER_CONSTANT, value=255 ) else: # Color (BGR) return cv2.copyMakeBorder( image, padding, padding, padding, padding, cv2.BORDER_CONSTANT, value=(255, 255, 255) ) def load_image(self, path: Path) -> np.ndarray: """Load image from file.""" image = cv2.imread(str(path)) if image is None: raise ValueError(f"Could not load image: {path}") return image def pdf_to_images(self, path: Path, dpi: int = 300) -> List[np.ndarray]: """ Convert PDF to images. Args: path: Path to PDF file dpi: Resolution (300 = fast & good quality, 400 = better but slower) """ if not PDF_AVAILABLE: raise RuntimeError("pdf2image not available. Install with: pip install pdf2image") images = pdf2image.convert_from_path(str(path), dpi=dpi) return [np.array(img) for img in images] def preprocess(self, image: np.ndarray, high_quality: bool = True) -> np.ndarray: """ Apply LIGHT preprocessing - better for clear PDFs. Heavy binarization can destroy text on clear images. """ return self.preprocess_light(image) def preprocess_light(self, image: np.ndarray) -> np.ndarray: """ Light preprocessing for CLEAR images (PDFs, good scans). Preserves original quality, only enhances contrast. """ # 0. Add safety padding to protect edge content during deskew rotation image = self._add_safety_padding(image) # 1. Grayscale if len(image.shape) == 3: gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) else: gray = image.copy() # 2a. Scale DOWN if any side exceeds 4000px (PaddleOCR limit) height, width = gray.shape max_side = max(height, width) if max_side > 4000: scale = 4000 / max_side gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA) height, width = gray.shape # 2b. Scale UP if too small if width < 1500: scale = 1500 / width # Ensure we don't exceed 4000px after upscaling new_width = int(width * scale) new_height = int(height * scale) if max(new_width, new_height) > 4000: scale = 4000 / max(new_width, new_height) gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC) # 3. Deskew gray = self._deskew(gray) # 4. Light contrast enhancement only clahe = cv2.createCLAHE(clipLimit=1.5, tileGridSize=(8, 8)) enhanced = clahe.apply(gray) # NO binarization, NO morphological ops - preserve original quality return enhanced def preprocess_medium(self, image: np.ndarray) -> np.ndarray: """ Medium preprocessing for MIXED-QUALITY images. Balance between Light (too gentle) and Heavy (too aggressive). Use cases: - Moderately faded receipts - Photos with uneven lighting - Scans with slight blur Preprocessing steps: - Moderate contrast enhancement (CLAHE clipLimit=2.0) - Light denoising (fastNlMeansDenoising h=6) - Gentle sharpening - NO binarization (preserves text boundaries) - NO morphological operations (avoids digit concatenation) This method was created to replace preprocess_heavy() which caused digit concatenation errors on high-quality PDFs (85.99 → 859,762.16). """ # 0. Add safety padding to protect edge content during deskew rotation image = self._add_safety_padding(image) # 1. Grayscale if len(image.shape) == 3: gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) else: gray = image.copy() # 2a. Scale DOWN if any side exceeds 4000px (PaddleOCR limit) height, width = gray.shape max_side = max(height, width) if max_side > 4000: scale = 4000 / max_side gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA) height, width = gray.shape # 2b. Scale UP if too small if width < 1500: scale = 1500 / width # Ensure we don't exceed 4000px after upscaling new_width = int(width * scale) new_height = int(height * scale) if max(new_width, new_height) > 4000: scale = 4000 / max(new_width, new_height) gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC) # 3. Deskew gray = self._deskew(gray) # 4. Moderate contrast enhancement (CLAHE clipLimit=2.0) clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)) enhanced = clahe.apply(gray) # 5. Light denoising (less aggressive than Heavy) denoised = cv2.fastNlMeansDenoising(enhanced, h=6, templateWindowSize=7, searchWindowSize=15) # 6. Gentle sharpening gaussian = cv2.GaussianBlur(denoised, (0, 0), 1.0) sharpened = cv2.addWeighted(denoised, 1.3, gaussian, -0.3, 0) # NO binarization, NO morphological operations # This preserves text boundaries and avoids digit concatenation return sharpened def preprocess_heavy(self, image: np.ndarray) -> np.ndarray: """ Heavy preprocessing for FADED thermal receipts. Aggressive binarization to recover faded text. ⚠️ DEPRECATED: Use preprocess_medium() instead. Heavy preprocessing causes digit concatenation on clear PDFs (e.g., 85.99 → 859,762.16 due to binarization + morphological operations). Kept for backward compatibility only. """ # 0. Add safety padding to protect edge content during deskew rotation image = self._add_safety_padding(image) # 1. Grayscale if len(image.shape) == 3: gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) else: gray = image.copy() # 2a. Scale DOWN if any side exceeds 4000px (PaddleOCR limit) height, width = gray.shape max_side = max(height, width) if max_side > 4000: scale = 4000 / max_side gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA) height, width = gray.shape # 2b. Scale UP if too small (larger = better OCR) if width < 1500: scale = 1500 / width # Ensure we don't exceed 4000px after upscaling new_width = int(width * scale) new_height = int(height * scale) if max(new_width, new_height) > 4000: scale = 4000 / max(new_width, new_height) gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC) # 3. Deskew gray = self._deskew(gray) # 4. Contrast enhancement with CLAHE clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)) enhanced = clahe.apply(gray) # 5. Denoise denoised = cv2.fastNlMeansDenoising(enhanced, h=8, templateWindowSize=7, searchWindowSize=21) # 6. Sharpening gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0) sharpened = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0) # 7. Adaptive thresholding (binarization) binary = cv2.adaptiveThreshold( sharpened, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blockSize=11, C=5 ) # 8. Morphological operations kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2)) result = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel_close) return result def preprocess_for_tesseract(self, image: np.ndarray, binarize: bool = False, padding: int = 0, clahe_clip: float = 1.5) -> np.ndarray: """ Tesseract-optimized preprocessing (based on comprehensive benchmark). BENCHMARK FINDINGS: - DPI 200 is optimal (not 300!) - Padding 40px fixes left margin truncation issues - CLAHE 1.5 for most receipts, 2.0 for difficult ones - NO deskew, NO denoising for clear PDFs Recommended usage: - Simple receipts: padding=0, clahe_clip=1.5 - Complex receipts: padding=40, clahe_clip=1.5 - Difficult/faded: padding=40, clahe_clip=2.0, binarize=True Args: image: Input image (RGB from pdf2image or BGR from OpenCV) binarize: Apply Otsu binarization (for faded receipts) padding: White padding in pixels (40px recommended for edge protection) clahe_clip: CLAHE clip limit (1.5 normal, 2.0 for difficult) Returns: Preprocessed grayscale image """ # 1. Grayscale (handle both RGB and BGR) if len(image.shape) == 3: gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) else: gray = image.copy() # 2. Add padding if specified (protects against left margin truncation) if padding > 0: gray = cv2.copyMakeBorder( gray, padding, padding, padding, padding, cv2.BORDER_CONSTANT, value=255 ) # 3. CLAHE contrast enhancement clahe = cv2.createCLAHE(clipLimit=clahe_clip, tileGridSize=(8, 8)) enhanced = clahe.apply(gray) # NO deskew, NO denoising - these DEGRADE quality on clear PDFs! if not binarize: return enhanced # Binarization only for faded receipts _, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) # Ensure correct polarity if np.mean(binary) < 127: binary = 255 - binary return binary def preprocess_for_tesseract_padded(self, image: np.ndarray) -> np.ndarray: """ Tesseract preprocessing with optimal padding (40px). Best for complex receipts where left margin gets truncated. """ return self.preprocess_for_tesseract(image, padding=40) def preprocess_for_tesseract_faded(self, image: np.ndarray) -> np.ndarray: """ Tesseract preprocessing for FADED thermal receipts. Uses binarization to recover faded text. """ return self.preprocess_for_tesseract(image, binarize=True) def get_all_variants(self, image: np.ndarray) -> List[np.ndarray]: """ Generate 2 preprocessing variants for OCR (fast mode). Returns: [light_processed, heavy_processed] """ return [ self.preprocess_light(image), self.preprocess_heavy(image), ] def _deskew(self, image: np.ndarray) -> np.ndarray: """Correct image rotation/skew using Hough lines. Uses expanded canvas to preserve all content during rotation, preventing left/right margin truncation. """ edges = cv2.Canny(image, 50, 150, apertureSize=3) lines = cv2.HoughLinesP( edges, 1, np.pi / 180, threshold=100, minLineLength=100, maxLineGap=10 ) if lines is None: return image angles = [] for line in lines: x1, y1, x2, y2 = line[0] angle = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi if abs(angle) < 45: angles.append(angle) if not angles: return image median_angle = np.median(angles) if abs(median_angle) < 0.5: return image h, w = image.shape[:2] center = (w // 2, h // 2) M = cv2.getRotationMatrix2D(center, median_angle, 1.0) # Calculate new canvas size to fit entire rotated image (prevents edge truncation) cos_angle = abs(np.cos(np.radians(median_angle))) sin_angle = abs(np.sin(np.radians(median_angle))) new_w = int(h * sin_angle + w * cos_angle) new_h = int(h * cos_angle + w * sin_angle) # Adjust rotation matrix for new canvas center M[0, 2] += (new_w - w) / 2 M[1, 2] += (new_h - h) / 2 return cv2.warpAffine( image, M, (new_w, new_h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_CONSTANT, borderValue=255 # White background (grayscale) )