Add docTR as primary OCR engine with 2-tier sequential processing, OCR metrics tracking, and simplified engine selection.

Features:
- docTR OCR engine with light+medium preprocessing tiers
- doctr_plus mode with early exit optimization (~65% fast path)
- OCR metrics dashboard with per-engine statistics
- User OCR preference persistence
- Parallel worker pool for OCR processing
- Cross-validation for extraction quality

Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
477 lines
19 KiB
Python
477 lines
19 KiB
Python
"""OCR engine wrapper for PaddleOCR, docTR, and Tesseract."""
|
|
|
|
import os
|
|
import logging
|
|
import threading
|
|
import time
|
|
from dataclasses import dataclass
|
|
from typing import List, Optional, Tuple
|
|
|
|
import numpy as np
|
|
|
|
# Placeholders for the heavy OCR libraries. They stay None until the code
# that needs them performs the real import and rebinds the global.
PaddleOCR = pytesseract = doctr_ocr_predictor = None

# Turn off the PaddleOCR model source check for faster startup (PaddleX 3.x).
# This has to be in the environment before paddleocr is imported.
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'

# Module-level logger. Level and handlers are configured elsewhere
# (the original note says main.py applies the LOG_LEVEL env var).
logger = logging.getLogger(__name__)
|
|
|
|
# Check availability without importing heavy libraries
def _module_installed(module_name: str) -> bool:
    """Return True if *module_name* can be located without importing it.

    Args:
        module_name: Top-level module name to look up.

    Returns:
        True when ``importlib.util.find_spec`` finds a spec for the module,
        False when it finds none or the lookup itself raises.
    """
    try:
        import importlib.util

        return importlib.util.find_spec(module_name) is not None
    except Exception:
        # Deliberately broad (best effort): a broken finder or a half-removed
        # package must read as "not installed" rather than crash module import.
        return False


def _check_paddle_available() -> bool:
    """Check if paddleocr is installed without importing it."""
    return _module_installed("paddleocr")


def _check_tesseract_available() -> bool:
    """Check if pytesseract is installed without importing it."""
    return _module_installed("pytesseract")


def _check_doctr_available() -> bool:
    """Check if doctr is installed without importing it."""
    return _module_installed("doctr")


# Evaluated once at import time. These flags only say the Python package can
# be found, not that the engine will initialize successfully.
PADDLE_AVAILABLE = _check_paddle_available()
TESSERACT_AVAILABLE = _check_tesseract_available()
DOCTR_AVAILABLE = _check_doctr_available()
|
|
|
|
|
|
@dataclass
class OCRResult:
    """Outcome of one OCR pass: the text, an overall confidence and item boxes."""

    # Full recognized text for the image.
    text: str
    # Average confidence over the recognized items; 0.0 when nothing was found.
    confidence: float
    # Per-item dicts with 'text', 'confidence' and 'box' keys
    # (the Tesseract path in this module leaves the list empty).
    boxes: List[dict]
    # Name of the engine that produced the result: "paddleocr", "doctr" or "tesseract".
    engine: str = ""
|
|
|
|
|
|
class OCREngine:
    """Unified OCR engine wrapper with Tesseract fallback.

    PaddleOCR and docTR are imported and constructed lazily on first use.
    For each of them a ``threading.Event`` records that initialization has
    finished, whether it succeeded or failed.
    """

    def __init__(self):
        """Create an engine wrapper with no OCR backend loaded yet."""
        self._paddle = None
        self._paddle_init_started = False
        self._paddle_ready = threading.Event()  # Set once PaddleOCR init has finished
        self._paddle_init_lock = threading.Lock()

        self._doctr = None
        self._doctr_init_started = False
        self._doctr_ready = threading.Event()  # Set once docTR init has finished
        self._doctr_init_lock = threading.Lock()

    def _init_paddle_lazy(self):
        """Import and construct PaddleOCR on the first call (avoids slow startup).

        The lock only guards the "started" flag, so a caller arriving while
        another thread is still initializing returns immediately. Use
        ``wait_for_paddle`` to block until initialization has finished.
        """
        global PaddleOCR

        with self._paddle_init_lock:
            if self._paddle_init_started:
                return  # Already initializing or done
            self._paddle_init_started = True

        if PADDLE_AVAILABLE:
            try:
                print("Importing PaddleOCR (first use, may take ~15-20 seconds)...", flush=True)
                from paddleocr import PaddleOCR as _PaddleOCR
                PaddleOCR = _PaddleOCR

                print("Initializing PaddleOCR engine...", flush=True)
                # PaddleOCR 3.x API - tuned for Romanian receipts.
                # Note: 'latin' is not available in PaddleOCR 3.x; 'en' works well for receipts.
                self._paddle = PaddleOCR(
                    lang='en',  # 'en' handles the Latin alphabet well for receipts
                    det_db_thresh=0.3,  # Detection threshold (lower = detect more text)
                    det_db_box_thresh=0.5,  # Box confidence threshold
                    det_db_unclip_ratio=1.8,  # Expand detected boxes slightly
                    rec_batch_num=6,  # Batch size for recognition
                    use_angle_cls=True,  # Enable text angle classification
                )
                print("PaddleOCR initialized successfully with high-quality settings", flush=True)
            except Exception as e:
                # Best effort: a failed init leaves the engine unset so callers
                # can fall back to Tesseract.
                print(f"Warning: Failed to initialize PaddleOCR: {e}", flush=True)
                self._paddle = None

        # Signal that initialization is complete (success or failure)
        self._paddle_ready.set()

    def _init_doctr_lazy(self):
        """Import and construct the docTR predictor on the first call.

        The lock only guards the "started" flag, so a caller arriving while
        another thread is still initializing returns immediately. Use
        ``wait_for_doctr`` to block until initialization has finished.
        """
        global doctr_ocr_predictor

        with self._doctr_init_lock:
            if self._doctr_init_started:
                return  # Already initializing or done
            self._doctr_init_started = True

        if DOCTR_AVAILABLE:
            try:
                print("Importing docTR (first use, may take ~10-15 seconds)...", flush=True)
                from doctr.models import ocr_predictor

                print("Initializing docTR engine (PyTorch backend)...", flush=True)
                # Pretrained predictor: db_resnet50 for detection and
                # crnn_vgg16_bn for recognition.
                self._doctr = ocr_predictor(
                    det_arch='db_resnet50',
                    reco_arch='crnn_vgg16_bn',
                    pretrained=True,
                    assume_straight_pages=True,
                    straighten_pages=False,
                    preserve_aspect_ratio=True,
                )
                doctr_ocr_predictor = self._doctr
                print("docTR initialized successfully with PyTorch backend", flush=True)
            except Exception as e:
                # Best effort: a failed init leaves the engine unset so callers
                # can fall back to Tesseract.
                print(f"Warning: Failed to initialize docTR: {e}", flush=True)
                self._doctr = None

        # Signal that initialization is complete (success or failure)
        self._doctr_ready.set()

    def wait_for_doctr(self, timeout: float = 30.0) -> bool:
        """
        Wait for docTR to be fully initialized.

        Starts initialization if it has not been started yet.

        Args:
            timeout: Max seconds to wait (default 30s)

        Returns:
            True if docTR is ready, False if timeout, failure or unavailable
        """
        if not DOCTR_AVAILABLE:
            return False

        if self._doctr is not None:
            return True  # Already ready

        if not self._doctr_init_started:
            # Start initialization if not already started
            self._init_doctr_lazy()

        # Wait for initialization to complete
        print(f"[OCR] Waiting for docTR to be ready (max {timeout}s)...", flush=True)
        start = time.monotonic()  # monotonic: elapsed time is immune to clock changes
        ready = self._doctr_ready.wait(timeout=timeout)
        elapsed = time.monotonic() - start

        if ready and self._doctr is not None:
            print(f"[OCR] docTR ready after {elapsed:.1f}s", flush=True)
            return True

        print(f"[OCR] docTR not ready after {elapsed:.1f}s (timeout or failed)", flush=True)
        return False

    def is_doctr_ready(self) -> bool:
        """Check if docTR is ready without waiting."""
        return self._doctr is not None

    def wait_for_paddle(self, timeout: float = 30.0) -> bool:
        """
        Wait for PaddleOCR to be fully initialized.

        Starts initialization if it has not been started yet.

        Args:
            timeout: Max seconds to wait (default 30s)

        Returns:
            True if PaddleOCR is ready, False if timeout, failure or unavailable
        """
        if not PADDLE_AVAILABLE:
            return False

        if self._paddle is not None:
            return True  # Already ready

        if not self._paddle_init_started:
            # Start initialization if not already started
            self._init_paddle_lazy()

        # Wait for initialization to complete
        print(f"[OCR] Waiting for PaddleOCR to be ready (max {timeout}s)...", flush=True)
        start = time.monotonic()  # monotonic: elapsed time is immune to clock changes
        ready = self._paddle_ready.wait(timeout=timeout)
        elapsed = time.monotonic() - start

        if ready and self._paddle is not None:
            print(f"[OCR] PaddleOCR ready after {elapsed:.1f}s", flush=True)
            return True

        print(f"[OCR] PaddleOCR not ready after {elapsed:.1f}s (timeout or failed)", flush=True)
        return False

    def is_paddle_ready(self) -> bool:
        """Check if PaddleOCR is ready without waiting."""
        return self._paddle is not None

    def recognize(self, image: np.ndarray) -> OCRResult:
        """Perform OCR on a preprocessed image.

        Uses PaddleOCR when it is installed and already initialized,
        otherwise Tesseract.

        Args:
            image: Image array to recognize.

        Returns:
            The OCRResult produced by the engine that was used.

        Raises:
            RuntimeError: If neither PaddleOCR nor Tesseract can be used.
        """
        logger.info(f"[OCR] Starting recognition, image shape: {image.shape}, dtype: {image.dtype}")

        # Lazy init PaddleOCR on first call
        self._init_paddle_lazy()

        if PADDLE_AVAILABLE and self._paddle:
            logger.info("[OCR] Using PaddleOCR engine")
            return self._paddle_recognize(image)

        if TESSERACT_AVAILABLE:
            logger.info("[OCR] Using Tesseract engine (PaddleOCR not available)")
            return self._tesseract_recognize(image)

        logger.error("[OCR] No OCR engine available!")
        raise RuntimeError(
            "No OCR engine available. Install PaddleOCR or Tesseract."
        )

    def _paddle_recognize(self, image: np.ndarray) -> OCRResult:
        """Recognize text using the PaddleOCR 3.x API.

        Falls back to Tesseract when PaddleOCR is not ready or raises. If
        Tesseract is not installed either, the error propagates.
        """
        # Wait for PaddleOCR to be fully ready (handles background init)
        if not self.wait_for_paddle(timeout=30.0):
            logger.warning("[PaddleOCR] Not ready, falling back to Tesseract")
            if TESSERACT_AVAILABLE:
                return self._tesseract_recognize(image)
            raise RuntimeError("PaddleOCR not ready and Tesseract not available")

        try:
            logger.info(f"[PaddleOCR] Processing image, shape: {image.shape}")

            # PaddleOCR 3.x requires 3-channel images
            if len(image.shape) == 2:
                # Convert grayscale to 3-channel BGR
                import cv2
                image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
                logger.info(f"[PaddleOCR] Converted to BGR, new shape: {image.shape}")

            # PaddleOCR 3.x uses predict() with new parameter names
            logger.info("[PaddleOCR] Calling predict()...")
            result = self._paddle.predict(image, use_textline_orientation=True)
            logger.info(f"[PaddleOCR] predict() returned, result type: {type(result)}")

            if not result:
                logger.warning("[PaddleOCR] No results returned")
                return OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")

            # Only the first result is used (a single image was passed in).
            ocr_result = result[0]

            # Extract texts, scores and polygons from the 3.x result mapping
            rec_texts = ocr_result.get('rec_texts', [])
            rec_scores = ocr_result.get('rec_scores', [])
            dt_polys = ocr_result.get('dt_polys', [])

            if not rec_texts:
                return OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")

            boxes = []
            for i, text in enumerate(rec_texts):
                # Scores/polygons may be shorter than the text list; pad with defaults.
                conf = rec_scores[i] if i < len(rec_scores) else 0.0
                box = dt_polys[i].tolist() if i < len(dt_polys) else []
                boxes.append({
                    'text': text,
                    'confidence': float(conf),
                    'box': box,
                })

            avg_conf = sum(rec_scores) / len(rec_scores) if rec_scores else 0.0
            text_result = '\n'.join(rec_texts)
            logger.info(f"[PaddleOCR] SUCCESS - Found {len(rec_texts)} text lines, avg confidence: {avg_conf:.2%}")
            logger.debug(f"[PaddleOCR] Raw text preview: {text_result[:200]}...")
            return OCRResult(
                text=text_result,
                confidence=float(avg_conf),
                boxes=boxes,
                engine="paddleocr",
            )
        except Exception as e:
            logger.error(f"[PaddleOCR] ERROR: {e}, falling back to Tesseract")
            if TESSERACT_AVAILABLE:
                return self._tesseract_recognize(image)
            raise

    @staticmethod
    def _mean_word_confidence(raw_confidences) -> float:
        """Average Tesseract word confidences and scale them to 0..1.

        The values may arrive as ints, floats or strings, including
        float-formatted strings such as ``'96.06'``, depending on the
        Tesseract/pytesseract versions. Each value is parsed tolerantly and
        truncated to an int. Non-positive values (Tesseract reports -1 for
        non-word rows) and unparsable values are ignored.

        Args:
            raw_confidences: Iterable of raw ``conf`` values.

        Returns:
            Mean of the positive confidences divided by 100, or 0.0 if none.
        """
        scores = []
        for raw in raw_confidences:
            try:
                score = int(float(raw))
            except (TypeError, ValueError, OverflowError):
                continue  # Not a usable number (e.g. '', None, 'nan', 'inf')
            if score > 0:
                scores.append(score)
        return sum(scores) / len(scores) / 100 if scores else 0.0

    def _tesseract_recognize(self, image: np.ndarray) -> OCRResult:
        """Recognize text using Tesseract.

        Runs Tesseract twice: once for the text and once for per-word data
        from which the average confidence is estimated. The result carries
        no boxes.
        """
        global pytesseract

        logger.info(f"[Tesseract] Processing image, shape: {image.shape}")

        # Lazy import pytesseract
        if pytesseract is None:
            logger.info("[Tesseract] Importing pytesseract...")
            import pytesseract as _pytesseract
            pytesseract = _pytesseract

        # PSM 4: Single column (best for receipts)
        config = '--psm 4 -l ron+eng'
        text = pytesseract.image_to_string(image, config=config)

        # Confidence estimate from the per-word data
        data = pytesseract.image_to_data(image, config=config, output_type=pytesseract.Output.DICT)
        avg_conf = self._mean_word_confidence(data['conf'])

        logger.info(f"[Tesseract] Done: {len(text)} chars, conf: {avg_conf:.2%}")
        return OCRResult(text=text, confidence=avg_conf, boxes=[], engine="tesseract")

    def _doctr_recognize(self, image: np.ndarray) -> OCRResult:
        """Recognize text using docTR.

        Falls back to Tesseract when docTR is not ready or raises. If
        Tesseract is not installed either, the error propagates.
        """
        # Wait for docTR to be fully ready
        if not self.wait_for_doctr(timeout=30.0):
            logger.warning("[docTR] Not ready, falling back to Tesseract")
            if TESSERACT_AVAILABLE:
                return self._tesseract_recognize(image)
            raise RuntimeError("docTR not ready and Tesseract not available")

        try:
            logger.info(f"[docTR] Processing image, shape: {image.shape}")

            # docTR requires RGB images
            import cv2
            if len(image.shape) == 2:
                # Convert grayscale to RGB
                image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
                logger.info(f"[docTR] Converted grayscale to RGB, new shape: {image.shape}")
            elif image.shape[2] == 4:
                # Convert RGBA to RGB
                image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
                logger.info(f"[docTR] Converted RGBA to RGB, new shape: {image.shape}")
            elif image.shape[2] == 3:
                # 3-channel input is assumed to be BGR (OpenCV order); the
                # channel order is not detected, it is always swapped.
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                logger.info(f"[docTR] Converted BGR to RGB, shape: {image.shape}")

            # Process image with docTR
            logger.info("[docTR] Running prediction...")

            # docTR expects a document (list of pages as numpy arrays)
            result = self._doctr([image])

            if not result or not result.pages:
                logger.warning("[docTR] No results returned")
                return OCRResult(text="", confidence=0.0, boxes=[], engine="doctr")

            # Collect line-level text/confidence and word-level boxes from all pages
            all_texts = []
            all_confidences = []
            boxes = []

            for page in result.pages:
                for block in page.blocks:
                    for line in block.lines:
                        line_text = ' '.join(word.value for word in line.words)
                        line_confidence = (
                            sum(w.confidence for w in line.words) / len(line.words)
                            if line.words else 0.0
                        )
                        all_texts.append(line_text)
                        all_confidences.append(line_confidence)

                        # Store word-level boxes
                        for word in line.words:
                            boxes.append({
                                'text': word.value,
                                'confidence': float(word.confidence),
                                'box': word.geometry,  # (xmin, ymin), (xmax, ymax)
                            })

            text_result = '\n'.join(all_texts)
            avg_conf = sum(all_confidences) / len(all_confidences) if all_confidences else 0.0

            logger.info(f"[docTR] SUCCESS - Found {len(all_texts)} text lines, avg confidence: {avg_conf:.2%}")
            logger.debug(f"[docTR] Raw text preview: {text_result[:200]}...")

            return OCRResult(
                text=text_result,
                confidence=float(avg_conf),
                boxes=boxes,
                engine="doctr",
            )

        except Exception as e:
            logger.error(f"[docTR] ERROR: {e}, falling back to Tesseract")
            if TESSERACT_AVAILABLE:
                return self._tesseract_recognize(image)
            raise

    def recognize_dual(self, image: np.ndarray) -> Tuple[OCRResult, Optional[OCRResult]]:
        """
        Run both PaddleOCR and Tesseract and return both results.

        An engine that raises contributes an empty result instead of
        aborting the call.

        Returns:
            Tuple of (paddle_result, tesseract_result)
            tesseract_result may be None if Tesseract is not available.
            When PaddleOCR is not usable, the Tesseract result is returned
            in both positions.

        Raises:
            RuntimeError: If neither engine is available.
        """
        logger.info(f"[OCR Dual] Starting dual recognition, image shape: {image.shape}")

        # Lazy init PaddleOCR
        self._init_paddle_lazy()

        paddle_result = None
        tesseract_result = None

        # Run PaddleOCR
        if PADDLE_AVAILABLE and self._paddle:
            try:
                logger.info("[OCR Dual] Running PaddleOCR...")
                paddle_result = self._paddle_recognize(image)
                logger.info(f"[OCR Dual] PaddleOCR: {len(paddle_result.text)} chars, conf: {paddle_result.confidence:.2%}")
            except Exception as e:
                logger.error(f"[OCR Dual] PaddleOCR failed: {e}")
                paddle_result = OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")

        # Run Tesseract
        if TESSERACT_AVAILABLE:
            try:
                logger.info("[OCR Dual] Running Tesseract...")
                tesseract_result = self._tesseract_recognize(image)
                logger.info(f"[OCR Dual] Tesseract: {len(tesseract_result.text)} chars, conf: {tesseract_result.confidence:.2%}")
            except Exception as e:
                logger.error(f"[OCR Dual] Tesseract failed: {e}")
                tesseract_result = OCRResult(text="", confidence=0.0, boxes=[], engine="tesseract")

        # Fallback if PaddleOCR not available
        if paddle_result is None:
            if tesseract_result:
                paddle_result = tesseract_result
            else:
                raise RuntimeError("No OCR engine available")

        return paddle_result, tesseract_result

    @staticmethod
    def get_available_engines() -> List[str]:
        """
        Return list of available OCR engines.

        Respects the OCR_ENABLE_PADDLEOCR and OCR_ENABLE_TESSERACT environment
        variables (default "true"; only the case-insensitive value "true"
        enables an engine). Engines disabled this way are not returned even
        if installed. docTR has no such switch here: it is listed whenever
        it is installed.

        Available engines: tesseract, doctr, doctr_plus, paddleocr
        """
        # Check environment settings
        paddle_enabled = os.getenv("OCR_ENABLE_PADDLEOCR", "true").lower() == "true"
        tesseract_enabled = os.getenv("OCR_ENABLE_TESSERACT", "true").lower() == "true"

        engines = []

        # Base engines (only if installed AND enabled)
        if TESSERACT_AVAILABLE and tesseract_enabled:
            engines.append('tesseract')
        if DOCTR_AVAILABLE:
            engines.append('doctr')
            engines.append('doctr_plus')  # docTR with 2-tier sequential + early exit
        if PADDLE_AVAILABLE and paddle_enabled:
            engines.append('paddleocr')

        return engines
|