feat(ocr): Implement persistent worker pool with SQLite job queue

Major OCR infrastructure improvements:
- Add persistent SQLite-based job queue for OCR tasks
- Implement worker pool with process isolation and auto-restart
- Add OCR engine selector dropdown (Tesseract/PaddleOCR) in upload zone
- Optimize Tesseract preprocessing based on benchmark results (8x faster)
- Add recognize_cif_optimized() with multi-strategy CIF extraction
- Add Romanian CIF checksum validation
- Increase Telegram long polling timeout from 10s to 30s

Squashed commits:
- feat(ocr): Implement persistent worker pool with SQLite job queue
- feat(ocr): Add OCR engine selector dropdown to upload zone
- perf(telegram): Increase long polling timeout from 10s to 30s
- perf(ocr): Optimize Tesseract preprocessing based on benchmark results

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-31 12:32:12 +02:00
parent 00f826f7ed
commit 74f7aefc26
23 changed files with 3616 additions and 209 deletions

View File

@@ -234,50 +234,76 @@ class ImagePreprocessor:
return result
def preprocess_for_tesseract(self, image: np.ndarray) -> np.ndarray:
def preprocess_for_tesseract(self, image: np.ndarray, binarize: bool = False,
padding: int = 0, clahe_clip: float = 1.5) -> np.ndarray:
"""
Tesseract-optimized preprocessing.
Tesseract works best with:
- Clean black text on white background (binarized)
- High DPI (scale up small images)
- Otsu thresholding (better than adaptive for clean documents)
"""
# 0. Add safety padding to protect edge content during deskew rotation
image = self._add_safety_padding(image)
Tesseract-optimized preprocessing (based on comprehensive benchmark).
# 1. Grayscale
BENCHMARK FINDINGS:
- DPI 200 is optimal (not 300!)
- Padding 40px fixes left margin truncation issues
- CLAHE 1.5 for most receipts, 2.0 for difficult ones
- NO deskew, NO denoising for clear PDFs
Recommended usage:
- Simple receipts: padding=0, clahe_clip=1.5
- Complex receipts: padding=40, clahe_clip=1.5
- Difficult/faded: padding=40, clahe_clip=2.0, binarize=True
Args:
image: Input image (RGB from pdf2image or BGR from OpenCV)
binarize: Apply Otsu binarization (for faded receipts)
padding: White padding in pixels (40px recommended for edge protection)
clahe_clip: CLAHE clip limit (1.5 normal, 2.0 for difficult)
Returns:
Preprocessed grayscale image
"""
# 1. Grayscale (handle both RGB and BGR)
if len(image.shape) == 3:
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
else:
gray = image.copy()
# 2. Scale for optimal Tesseract (target ~2000px width for receipts)
height, width = gray.shape
if width < 2000:
scale = 2000 / width
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
elif width > 3000:
scale = 3000 / width
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
# 2. Add padding if specified (protects against left margin truncation)
if padding > 0:
gray = cv2.copyMakeBorder(
gray, padding, padding, padding, padding,
cv2.BORDER_CONSTANT, value=255
)
# 3. Deskew
gray = self._deskew(gray)
# 4. Strong contrast enhancement
clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
# 3. CLAHE contrast enhancement
clahe = cv2.createCLAHE(clipLimit=clahe_clip, tileGridSize=(8, 8))
enhanced = clahe.apply(gray)
# 5. Denoise before binarization
denoised = cv2.fastNlMeansDenoising(enhanced, h=10, templateWindowSize=7, searchWindowSize=21)
# NO deskew, NO denoising - these DEGRADE quality on clear PDFs!
# 6. Otsu binarization (better than adaptive for clean PDFs)
_, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
if not binarize:
return enhanced
# 7. Light morphological cleanup
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
cleaned = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
# Binarization only for faded receipts
_, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
return cleaned
# Ensure correct polarity
if np.mean(binary) < 127:
binary = 255 - binary
return binary
def preprocess_for_tesseract_padded(self, image: np.ndarray) -> np.ndarray:
"""
Tesseract preprocessing with optimal padding (40px).
Best for complex receipts where left margin gets truncated.
"""
return self.preprocess_for_tesseract(image, padding=40)
def preprocess_for_tesseract_faded(self, image: np.ndarray) -> np.ndarray:
"""
Tesseract preprocessing for FADED thermal receipts.
Uses binarization to recover faded text.
"""
return self.preprocess_for_tesseract(image, binarize=True)
def get_all_variants(self, image: np.ndarray) -> List[np.ndarray]:
"""