feat(ocr): Implement persistent worker pool with SQLite job queue
Major OCR infrastructure improvements: - Add persistent SQLite-based job queue for OCR tasks - Implement worker pool with process isolation and auto-restart - Add OCR engine selector dropdown (Tesseract/PaddleOCR) in upload zone - Optimize Tesseract preprocessing based on benchmark results (8x faster) - Add recognize_cif_optimized() with multi-strategy CIF extraction - Add Romanian CIF checksum validation - Increase Telegram long polling timeout from 10s to 30s Squashed commits: - feat(ocr): Implement persistent worker pool with SQLite job queue - feat(ocr): Add OCR engine selector dropdown to upload zone - perf(telegram): Increase long polling timeout from 10s to 30s - perf(ocr): Optimize Tesseract preprocessing based on benchmark results 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -234,50 +234,76 @@ class ImagePreprocessor:
|
||||
|
||||
return result
|
||||
|
||||
def preprocess_for_tesseract(self, image: np.ndarray) -> np.ndarray:
|
||||
def preprocess_for_tesseract(self, image: np.ndarray, binarize: bool = False,
|
||||
padding: int = 0, clahe_clip: float = 1.5) -> np.ndarray:
|
||||
"""
|
||||
Tesseract-optimized preprocessing.
|
||||
Tesseract works best with:
|
||||
- Clean black text on white background (binarized)
|
||||
- High DPI (scale up small images)
|
||||
- Otsu thresholding (better than adaptive for clean documents)
|
||||
"""
|
||||
# 0. Add safety padding to protect edge content during deskew rotation
|
||||
image = self._add_safety_padding(image)
|
||||
Tesseract-optimized preprocessing (based on comprehensive benchmark).
|
||||
|
||||
# 1. Grayscale
|
||||
BENCHMARK FINDINGS:
|
||||
- DPI 200 is optimal (not 300!)
|
||||
- Padding 40px fixes left margin truncation issues
|
||||
- CLAHE 1.5 for most receipts, 2.0 for difficult ones
|
||||
- NO deskew, NO denoising for clear PDFs
|
||||
|
||||
Recommended usage:
|
||||
- Simple receipts: padding=0, clahe_clip=1.5
|
||||
- Complex receipts: padding=40, clahe_clip=1.5
|
||||
- Difficult/faded: padding=40, clahe_clip=2.0, binarize=True
|
||||
|
||||
Args:
|
||||
image: Input image (RGB from pdf2image or BGR from OpenCV)
|
||||
binarize: Apply Otsu binarization (for faded receipts)
|
||||
padding: White padding in pixels (40px recommended for edge protection)
|
||||
clahe_clip: CLAHE clip limit (1.5 normal, 2.0 for difficult)
|
||||
|
||||
Returns:
|
||||
Preprocessed grayscale image
|
||||
"""
|
||||
# 1. Grayscale (handle both RGB and BGR)
|
||||
if len(image.shape) == 3:
|
||||
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
|
||||
gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
|
||||
else:
|
||||
gray = image.copy()
|
||||
|
||||
# 2. Scale for optimal Tesseract (target ~2000px width for receipts)
|
||||
height, width = gray.shape
|
||||
if width < 2000:
|
||||
scale = 2000 / width
|
||||
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
|
||||
elif width > 3000:
|
||||
scale = 3000 / width
|
||||
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
|
||||
# 2. Add padding if specified (protects against left margin truncation)
|
||||
if padding > 0:
|
||||
gray = cv2.copyMakeBorder(
|
||||
gray, padding, padding, padding, padding,
|
||||
cv2.BORDER_CONSTANT, value=255
|
||||
)
|
||||
|
||||
# 3. Deskew
|
||||
gray = self._deskew(gray)
|
||||
|
||||
# 4. Strong contrast enhancement
|
||||
clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
|
||||
# 3. CLAHE contrast enhancement
|
||||
clahe = cv2.createCLAHE(clipLimit=clahe_clip, tileGridSize=(8, 8))
|
||||
enhanced = clahe.apply(gray)
|
||||
|
||||
# 5. Denoise before binarization
|
||||
denoised = cv2.fastNlMeansDenoising(enhanced, h=10, templateWindowSize=7, searchWindowSize=21)
|
||||
# NO deskew, NO denoising - these DEGRADE quality on clear PDFs!
|
||||
|
||||
# 6. Otsu binarization (better than adaptive for clean PDFs)
|
||||
_, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
||||
if not binarize:
|
||||
return enhanced
|
||||
|
||||
# 7. Light morphological cleanup
|
||||
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
|
||||
cleaned = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
|
||||
# Binarization only for faded receipts
|
||||
_, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
||||
|
||||
return cleaned
|
||||
# Ensure correct polarity
|
||||
if np.mean(binary) < 127:
|
||||
binary = 255 - binary
|
||||
|
||||
return binary
|
||||
|
||||
def preprocess_for_tesseract_padded(self, image: np.ndarray) -> np.ndarray:
|
||||
"""
|
||||
Tesseract preprocessing with optimal padding (40px).
|
||||
|
||||
Best for complex receipts where left margin gets truncated.
|
||||
"""
|
||||
return self.preprocess_for_tesseract(image, padding=40)
|
||||
|
||||
def preprocess_for_tesseract_faded(self, image: np.ndarray) -> np.ndarray:
|
||||
"""
|
||||
Tesseract preprocessing for FADED thermal receipts.
|
||||
Uses binarization to recover faded text.
|
||||
"""
|
||||
return self.preprocess_for_tesseract(image, binarize=True)
|
||||
|
||||
def get_all_variants(self, image: np.ndarray) -> List[np.ndarray]:
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user