Files
roa2web-service-auto/backend/modules/data_entry/services/image_preprocessor.py
Marius Mutu ab160b628d feat(ocr): Add validation system and CLIENT CUI extraction
OCR Data Extraction Validation System:
- Add 7 validation rules (amount range, TVA ratio, payment sum, etc.)
- Add Medium preprocessing to replace Heavy (fixes digit concatenation)
- Add validation warnings to API responses
- Flag receipts needing manual review (needs_manual_review field)
- Add database migration for needs_manual_review column

CLIENT CUI Extraction Improvements:
- Support all format variations: CIF CLIENT:, CLIENT C.U.I/C.I.F., etc.
- Handle OCR errors (R0 vs RO, C1F vs CIF)
- Add client_name, client_cui, client_address to API response
- Add validation fields to API response (was missing)

QA Review: 12 issues found, 9 fixed (5 errors + 4 warnings)
- Fixed type safety in validation rules
- Fixed ZeroDivisionError risk
- Fixed schema mismatch (Optional[bool] for needs_manual_review)
- All 37 unit tests passing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-30 19:12:52 +02:00

341 lines
12 KiB
Python

"""Image preprocessing for optimal OCR results."""
from pathlib import Path
from typing import List
import numpy as np
import cv2
try:
import pdf2image
PDF_AVAILABLE = True
except ImportError:
PDF_AVAILABLE = False
class ImagePreprocessor:
"""Preprocess receipt images for OCR."""
def _add_safety_padding(self, image: np.ndarray, padding: int = 50) -> np.ndarray:
"""Add white padding around image to protect edge content during rotation.
This prevents left/right margin truncation in OCR by ensuring text near
edges isn't lost during deskew rotation.
"""
if len(image.shape) == 2:
# Grayscale
return cv2.copyMakeBorder(
image, padding, padding, padding, padding,
cv2.BORDER_CONSTANT, value=255
)
else:
# Color (BGR)
return cv2.copyMakeBorder(
image, padding, padding, padding, padding,
cv2.BORDER_CONSTANT, value=(255, 255, 255)
)
def load_image(self, path: Path) -> np.ndarray:
"""Load image from file."""
image = cv2.imread(str(path))
if image is None:
raise ValueError(f"Could not load image: {path}")
return image
def pdf_to_images(self, path: Path, dpi: int = 300) -> List[np.ndarray]:
"""
Convert PDF to images.
Args:
path: Path to PDF file
dpi: Resolution (300 = fast & good quality, 400 = better but slower)
"""
if not PDF_AVAILABLE:
raise RuntimeError("pdf2image not available. Install with: pip install pdf2image")
images = pdf2image.convert_from_path(str(path), dpi=dpi)
return [np.array(img) for img in images]
def preprocess(self, image: np.ndarray, high_quality: bool = True) -> np.ndarray:
"""
Apply LIGHT preprocessing - better for clear PDFs.
Heavy binarization can destroy text on clear images.
"""
return self.preprocess_light(image)
def preprocess_light(self, image: np.ndarray) -> np.ndarray:
"""
Light preprocessing for CLEAR images (PDFs, good scans).
Preserves original quality, only enhances contrast.
"""
# 0. Add safety padding to protect edge content during deskew rotation
image = self._add_safety_padding(image)
# 1. Grayscale
if len(image.shape) == 3:
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
else:
gray = image.copy()
# 2a. Scale DOWN if any side exceeds 4000px (PaddleOCR limit)
height, width = gray.shape
max_side = max(height, width)
if max_side > 4000:
scale = 4000 / max_side
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
height, width = gray.shape
# 2b. Scale UP if too small
if width < 1500:
scale = 1500 / width
# Ensure we don't exceed 4000px after upscaling
new_width = int(width * scale)
new_height = int(height * scale)
if max(new_width, new_height) > 4000:
scale = 4000 / max(new_width, new_height)
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
# 3. Deskew
gray = self._deskew(gray)
# 4. Light contrast enhancement only
clahe = cv2.createCLAHE(clipLimit=1.5, tileGridSize=(8, 8))
enhanced = clahe.apply(gray)
# NO binarization, NO morphological ops - preserve original quality
return enhanced
def preprocess_medium(self, image: np.ndarray) -> np.ndarray:
"""
Medium preprocessing for MIXED-QUALITY images.
Balance between Light (too gentle) and Heavy (too aggressive).
Use cases:
- Moderately faded receipts
- Photos with uneven lighting
- Scans with slight blur
Preprocessing steps:
- Moderate contrast enhancement (CLAHE clipLimit=2.0)
- Light denoising (fastNlMeansDenoising h=6)
- Gentle sharpening
- NO binarization (preserves text boundaries)
- NO morphological operations (avoids digit concatenation)
This method was created to replace preprocess_heavy() which caused
digit concatenation errors on high-quality PDFs (85.99 → 859,762.16).
"""
# 0. Add safety padding to protect edge content during deskew rotation
image = self._add_safety_padding(image)
# 1. Grayscale
if len(image.shape) == 3:
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
else:
gray = image.copy()
# 2a. Scale DOWN if any side exceeds 4000px (PaddleOCR limit)
height, width = gray.shape
max_side = max(height, width)
if max_side > 4000:
scale = 4000 / max_side
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
height, width = gray.shape
# 2b. Scale UP if too small
if width < 1500:
scale = 1500 / width
# Ensure we don't exceed 4000px after upscaling
new_width = int(width * scale)
new_height = int(height * scale)
if max(new_width, new_height) > 4000:
scale = 4000 / max(new_width, new_height)
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
# 3. Deskew
gray = self._deskew(gray)
# 4. Moderate contrast enhancement (CLAHE clipLimit=2.0)
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
enhanced = clahe.apply(gray)
# 5. Light denoising (less aggressive than Heavy)
denoised = cv2.fastNlMeansDenoising(enhanced, h=6, templateWindowSize=7, searchWindowSize=15)
# 6. Gentle sharpening
gaussian = cv2.GaussianBlur(denoised, (0, 0), 1.0)
sharpened = cv2.addWeighted(denoised, 1.3, gaussian, -0.3, 0)
# NO binarization, NO morphological operations
# This preserves text boundaries and avoids digit concatenation
return sharpened
def preprocess_heavy(self, image: np.ndarray) -> np.ndarray:
"""
Heavy preprocessing for FADED thermal receipts.
Aggressive binarization to recover faded text.
⚠️ DEPRECATED: Use preprocess_medium() instead.
Heavy preprocessing causes digit concatenation on clear PDFs
(e.g., 85.99 → 859,762.16 due to binarization + morphological operations).
Kept for backward compatibility only.
"""
# 0. Add safety padding to protect edge content during deskew rotation
image = self._add_safety_padding(image)
# 1. Grayscale
if len(image.shape) == 3:
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
else:
gray = image.copy()
# 2a. Scale DOWN if any side exceeds 4000px (PaddleOCR limit)
height, width = gray.shape
max_side = max(height, width)
if max_side > 4000:
scale = 4000 / max_side
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
height, width = gray.shape
# 2b. Scale UP if too small (larger = better OCR)
if width < 1500:
scale = 1500 / width
# Ensure we don't exceed 4000px after upscaling
new_width = int(width * scale)
new_height = int(height * scale)
if max(new_width, new_height) > 4000:
scale = 4000 / max(new_width, new_height)
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
# 3. Deskew
gray = self._deskew(gray)
# 4. Contrast enhancement with CLAHE
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
enhanced = clahe.apply(gray)
# 5. Denoise
denoised = cv2.fastNlMeansDenoising(enhanced, h=8, templateWindowSize=7, searchWindowSize=21)
# 6. Sharpening
gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0)
sharpened = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0)
# 7. Adaptive thresholding (binarization)
binary = cv2.adaptiveThreshold(
sharpened, 255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY,
blockSize=11, C=5
)
# 8. Morphological operations
kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
result = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel_close)
return result
def preprocess_for_tesseract(self, image: np.ndarray) -> np.ndarray:
"""
Tesseract-optimized preprocessing.
Tesseract works best with:
- Clean black text on white background (binarized)
- High DPI (scale up small images)
- Otsu thresholding (better than adaptive for clean documents)
"""
# 0. Add safety padding to protect edge content during deskew rotation
image = self._add_safety_padding(image)
# 1. Grayscale
if len(image.shape) == 3:
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
else:
gray = image.copy()
# 2. Scale for optimal Tesseract (target ~2000px width for receipts)
height, width = gray.shape
if width < 2000:
scale = 2000 / width
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
elif width > 3000:
scale = 3000 / width
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
# 3. Deskew
gray = self._deskew(gray)
# 4. Strong contrast enhancement
clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
enhanced = clahe.apply(gray)
# 5. Denoise before binarization
denoised = cv2.fastNlMeansDenoising(enhanced, h=10, templateWindowSize=7, searchWindowSize=21)
# 6. Otsu binarization (better than adaptive for clean PDFs)
_, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
# 7. Light morphological cleanup
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
cleaned = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
return cleaned
def get_all_variants(self, image: np.ndarray) -> List[np.ndarray]:
"""
Generate 2 preprocessing variants for OCR (fast mode).
Returns: [light_processed, heavy_processed]
"""
return [
self.preprocess_light(image),
self.preprocess_heavy(image),
]
def _deskew(self, image: np.ndarray) -> np.ndarray:
"""Correct image rotation/skew using Hough lines.
Uses expanded canvas to preserve all content during rotation,
preventing left/right margin truncation.
"""
edges = cv2.Canny(image, 50, 150, apertureSize=3)
lines = cv2.HoughLinesP(
edges, 1, np.pi / 180,
threshold=100, minLineLength=100, maxLineGap=10
)
if lines is None:
return image
angles = []
for line in lines:
x1, y1, x2, y2 = line[0]
angle = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi
if abs(angle) < 45:
angles.append(angle)
if not angles:
return image
median_angle = np.median(angles)
if abs(median_angle) < 0.5:
return image
h, w = image.shape[:2]
center = (w // 2, h // 2)
M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
# Calculate new canvas size to fit entire rotated image (prevents edge truncation)
cos_angle = abs(np.cos(np.radians(median_angle)))
sin_angle = abs(np.sin(np.radians(median_angle)))
new_w = int(h * sin_angle + w * cos_angle)
new_h = int(h * cos_angle + w * sin_angle)
# Adjust rotation matrix for new canvas center
M[0, 2] += (new_w - w) / 2
M[1, 2] += (new_h - h) / 2
return cv2.warpAffine(
image, M, (new_w, new_h),
flags=cv2.INTER_CUBIC,
borderMode=cv2.BORDER_CONSTANT,
borderValue=255 # White background (grayscale)
)