feat(ocr): Add docTR OCR engine with metrics infrastructure
Add docTR as primary OCR engine with 2-tier sequential processing, OCR metrics tracking, and simplified engine selection.

Features:
- docTR OCR engine with light+medium preprocessing tiers
- doctr_plus mode with early exit optimization (~65% fast path)
- OCR metrics dashboard with per-engine statistics
- User OCR preference persistence
- Parallel worker pool for OCR processing
- Cross-validation for extraction quality

Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
"""OCR engine wrapper for PaddleOCR and Tesseract."""
|
||||
"""OCR engine wrapper for PaddleOCR, docTR, and Tesseract."""
|
||||
|
||||
import os
|
||||
import logging
|
||||
@@ -9,9 +9,8 @@ from typing import List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
# Setup logging
|
||||
# Setup logging (respects LOG_LEVEL env var set in main.py)
|
||||
logger = logging.getLogger(__name__)
|
||||
logging.basicConfig(level=logging.INFO) # Ensure logs are visible
|
||||
|
||||
# Disable PaddleOCR model source check for faster startup (PaddleX 3.x)
|
||||
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
|
||||
@@ -19,6 +18,7 @@ os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
|
||||
# Lazy imports - these will be imported on first use
|
||||
PaddleOCR = None # Will be imported lazily
|
||||
pytesseract = None # Will be imported lazily
|
||||
doctr_ocr_predictor = None # Will be imported lazily
|
||||
|
||||
# Check availability without importing heavy libraries
|
||||
def _check_paddle_available() -> bool:
|
||||
@@ -37,8 +37,17 @@ def _check_tesseract_available() -> bool:
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _check_doctr_available() -> bool:
    """Report whether the ``doctr`` package can be located.

    Only the import machinery is queried for a module spec; the package
    itself is not imported here.

    Returns:
        True if a spec for ``doctr`` is found, False if none is found or
        the lookup raises.
    """
    try:
        from importlib import util as _import_util

        doctr_spec = _import_util.find_spec("doctr")
    except Exception:
        # Any failure during lookup is treated as "not installed".
        return False
    return doctr_spec is not None
|
||||
|
||||
PADDLE_AVAILABLE = _check_paddle_available()
|
||||
TESSERACT_AVAILABLE = _check_tesseract_available()
|
||||
DOCTR_AVAILABLE = _check_doctr_available()
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -59,6 +68,11 @@ class OCREngine:
|
||||
self._paddle_ready = threading.Event() # Signals when PaddleOCR is FULLY ready
|
||||
self._paddle_init_lock = threading.Lock()
|
||||
|
||||
self._doctr = None
|
||||
self._doctr_init_started = False
|
||||
self._doctr_ready = threading.Event() # Signals when docTR is FULLY ready
|
||||
self._doctr_init_lock = threading.Lock()
|
||||
|
||||
def _init_paddle_lazy(self):
|
||||
"""Lazy initialize PaddleOCR on first use (avoids slow startup)."""
|
||||
global PaddleOCR
|
||||
@@ -94,6 +108,78 @@ class OCREngine:
|
||||
# Signal that initialization is complete (success or failure)
|
||||
self._paddle_ready.set()
|
||||
|
||||
def _init_doctr_lazy(self):
    """Lazy initialize docTR on first use (avoids slow startup).

    Only the first caller performs the initialization; any later caller
    returns immediately and should wait on ``self._doctr_ready`` instead.

    ``self._doctr_ready`` is always set when the initializing call ends,
    whether the predictor was built, docTR is not installed, or
    construction failed. Callers must therefore check ``self._doctr``
    afterwards to tell success (predictor object) from failure (``None``).
    """
    global doctr_ocr_predictor

    # The lock guards only the "started" flag, so concurrent callers are
    # not blocked on the lock while the models load.
    with self._doctr_init_lock:
        if self._doctr_init_started:
            return  # Already initializing or done
        self._doctr_init_started = True

    try:
        if DOCTR_AVAILABLE:
            print("Importing docTR (first use, may take ~10-15 seconds)...", flush=True)
            from doctr.models import ocr_predictor

            print("Initializing docTR engine (PyTorch backend)...", flush=True)
            # Initialize docTR predictor with pretrained models
            # Uses db_resnet50 for detection and crnn_vgg16_bn for recognition
            self._doctr = ocr_predictor(
                det_arch='db_resnet50',
                reco_arch='crnn_vgg16_bn',
                pretrained=True,
                assume_straight_pages=True,
                straighten_pages=False,
                preserve_aspect_ratio=True,
            )
            doctr_ocr_predictor = self._doctr
            print("docTR initialized successfully with PyTorch backend", flush=True)
    except Exception as e:
        print(f"Warning: Failed to initialize docTR: {e}", flush=True)
        self._doctr = None
    finally:
        # Signal that initialization is complete (success or failure).
        # Done in ``finally`` so waiters are released even if something
        # outside ``Exception`` (e.g. KeyboardInterrupt) aborts the load.
        self._doctr_ready.set()
|
||||
|
||||
def wait_for_doctr(self, timeout: float = 30.0) -> bool:
    """
    Block until docTR has finished initializing, or give up.

    Kicks off initialization itself when nobody has started it yet.

    Args:
        timeout: Max seconds to wait on the ready event (default 30s)

    Returns:
        True if a docTR predictor is available, False if docTR is not
        installed, the wait timed out, or initialization failed
    """
    # Guard clauses: nothing to wait for in these two cases.
    if not DOCTR_AVAILABLE:
        return False
    if self._doctr is not None:
        return True

    if not self._doctr_init_started:
        self._init_doctr_lazy()

    print(f"[OCR] Waiting for docTR to be ready (max {timeout}s)...", flush=True)
    began = time.time()
    signalled = self._doctr_ready.wait(timeout=timeout)
    waited = time.time() - began

    # The event is also set on failure, so the predictor must be checked too.
    usable = signalled and self._doctr is not None
    if not usable:
        print(f"[OCR] docTR not ready after {waited:.1f}s (timeout or failed)", flush=True)
        return False

    print(f"[OCR] docTR ready after {waited:.1f}s", flush=True)
    return True
|
||||
|
||||
def is_doctr_ready(self) -> bool:
    """Report, without blocking, whether a docTR predictor is currently set."""
    predictor = self._doctr
    return predictor is not None
|
||||
|
||||
def wait_for_paddle(self, timeout: float = 30.0) -> bool:
|
||||
"""
|
||||
Wait for PaddleOCR to be fully initialized.
|
||||
@@ -239,6 +325,84 @@ class OCREngine:
|
||||
logger.info(f"[Tesseract] Done: {len(text)} chars, conf: {avg_conf:.2%}")
|
||||
return OCRResult(text=text, confidence=avg_conf, boxes=[], engine="tesseract")
|
||||
|
||||
def _doctr_recognize(self, image: np.ndarray) -> OCRResult:
    """Recognize text using docTR, falling back to Tesseract on failure.

    Args:
        image: Image array. 2-D and single-channel arrays are treated as
            grayscale, 4-channel as RGBA, and 3-channel as BGR.
            NOTE(review): 3-channel input is always channel-swapped; no
            detection of the actual channel order is performed, so callers
            passing RGB will have it reversed. Confirm against callers.

    Returns:
        OCRResult with newline-joined line texts, the mean of per-line
        mean word confidences, word-level boxes, and ``engine="doctr"``.
        If docTR is not ready or raises, the Tesseract result is returned
        instead (when Tesseract is available).

    Raises:
        RuntimeError: docTR is not ready and Tesseract is not available.
        Exception: re-raises the docTR error when Tesseract is not
            available to fall back to.
    """
    # Wait for docTR to be fully ready
    if not self.wait_for_doctr(timeout=30.0):
        logger.warning("[docTR] Not ready, falling back to Tesseract")
        if TESSERACT_AVAILABLE:
            return self._tesseract_recognize(image)
        raise RuntimeError("docTR not ready and Tesseract not available")

    # Keep the caller's array for the fallback path: the colour conversion
    # below must not leak into the image handed to Tesseract.
    original_image = image

    try:
        logger.info(f"[docTR] Processing image, shape: {image.shape}")

        # docTR requires RGB images
        import cv2

        rgb_image = image
        if image.ndim == 2 or (image.ndim == 3 and image.shape[2] == 1):
            # Convert grayscale (including H x W x 1) to RGB
            rgb_image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
            logger.info(f"[docTR] Converted grayscale to RGB, new shape: {rgb_image.shape}")
        elif image.ndim == 3 and image.shape[2] == 4:
            # Convert RGBA to RGB
            rgb_image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
            logger.info(f"[docTR] Converted RGBA to RGB, new shape: {rgb_image.shape}")
        elif image.ndim == 3 and image.shape[2] == 3:
            # Assume BGR (OpenCV convention) and convert to RGB
            rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            logger.info(f"[docTR] Converted BGR to RGB, shape: {rgb_image.shape}")

        # Process image with docTR
        logger.info("[docTR] Running prediction...")

        # docTR expects a document (list of pages as numpy arrays)
        result = self._doctr([rgb_image])

        if not result or not result.pages:
            logger.warning("[docTR] No results returned")
            return OCRResult(text="", confidence=0.0, boxes=[], engine="doctr")

        # Extract text from all pages
        all_texts = []
        all_confidences = []
        boxes = []

        for page in result.pages:
            for block in page.blocks:
                for line in block.lines:
                    words = line.words
                    all_texts.append(' '.join(word.value for word in words))
                    # A line without words still counts, with confidence 0.0
                    all_confidences.append(
                        sum(w.confidence for w in words) / len(words) if words else 0.0
                    )

                    # Store word-level boxes
                    for word in words:
                        boxes.append({
                            'text': word.value,
                            'confidence': float(word.confidence),
                            'box': word.geometry  # (xmin, ymin), (xmax, ymax)
                        })

        text_result = '\n'.join(all_texts)
        avg_conf = sum(all_confidences) / len(all_confidences) if all_confidences else 0.0

        logger.info(f"[docTR] SUCCESS - Found {len(all_texts)} text lines, avg confidence: {avg_conf:.2%}")
        logger.debug(f"[docTR] Raw text preview: {text_result[:200]}...")

        return OCRResult(
            text=text_result,
            confidence=float(avg_conf),
            boxes=boxes,
            engine="doctr"
        )

    except Exception as e:
        logger.error(f"[docTR] ERROR: {e}, falling back to Tesseract")
        if TESSERACT_AVAILABLE:
            # Hand Tesseract the untouched input, not the converted copy
            return self._tesseract_recognize(original_image)
        raise
|
||||
|
||||
def recognize_dual(self, image: np.ndarray) -> Tuple[OCRResult, Optional[OCRResult]]:
|
||||
"""
|
||||
Run both OCR engines and return both results.
|
||||
@@ -286,10 +450,27 @@ class OCREngine:
|
||||
|
||||
@staticmethod
def get_available_engines() -> List[str]:
    """
    Return list of available OCR engines.

    Respects the OCR_ENABLE_PADDLEOCR and OCR_ENABLE_TESSERACT environment
    variables (presumably populated from .env - confirm where it is loaded).
    Each defaults to enabled and counts as enabled only when its value is
    "true" (case-insensitive). A disabled engine is not returned even if
    it is installed. The docTR entries have no enable switch here.

    Returns:
        Engine names in this order, each included only when usable:
        tesseract, doctr, doctr_plus, paddleocr
    """
    # Check environment settings
    paddle_enabled = os.getenv("OCR_ENABLE_PADDLEOCR", "true").lower() == "true"
    tesseract_enabled = os.getenv("OCR_ENABLE_TESSERACT", "true").lower() == "true"

    engines = []

    # Base engines (only if installed AND enabled)
    if TESSERACT_AVAILABLE and tesseract_enabled:
        engines.append('tesseract')
    if DOCTR_AVAILABLE:
        engines.append('doctr')
        engines.append('doctr_plus')  # docTR with 2-tier sequential + early exit
    if PADDLE_AVAILABLE and paddle_enabled:
        engines.append('paddleocr')

    return engines
|
||||
|
||||
Reference in New Issue
Block a user