Files
roa2web-service-auto/backend/modules/data_entry/services/ocr_engine.py
Marius Mutu 495790411f feat(ocr): Add docTR OCR engine with metrics infrastructure
Add docTR as primary OCR engine with 2-tier sequential processing,
OCR metrics tracking, and simplified engine selection.

Features:
- docTR OCR engine with light+medium preprocessing tiers
- doctr_plus mode with early exit optimization (~65% fast path)
- OCR metrics dashboard with per-engine statistics
- User OCR preference persistence
- Parallel worker pool for OCR processing
- Cross-validation for extraction quality

Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-02 05:37:16 +02:00

477 lines
19 KiB
Python

"""OCR engine wrapper for PaddleOCR, docTR, and Tesseract."""
import os
import logging
import threading
import time
from dataclasses import dataclass
from typing import List, Optional, Tuple
import numpy as np
# Setup logging (respects LOG_LEVEL env var set in main.py)
logger = logging.getLogger(__name__)
# Disable PaddleOCR model source check for faster startup (PaddleX 3.x)
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
# Lazy imports - these will be imported on first use
PaddleOCR = None # Will be imported lazily
pytesseract = None # Will be imported lazily
doctr_ocr_predictor = None # Will be imported lazily
# Check availability without importing heavy libraries
def _check_paddle_available() -> bool:
"""Check if paddleocr is installed without importing it."""
try:
import importlib.util
return importlib.util.find_spec("paddleocr") is not None
except Exception:
return False
def _check_tesseract_available() -> bool:
"""Check if pytesseract is installed without importing it."""
try:
import importlib.util
return importlib.util.find_spec("pytesseract") is not None
except Exception:
return False
def _check_doctr_available() -> bool:
"""Check if doctr is installed without importing it."""
try:
import importlib.util
return importlib.util.find_spec("doctr") is not None
except Exception:
return False
PADDLE_AVAILABLE = _check_paddle_available()
TESSERACT_AVAILABLE = _check_tesseract_available()
DOCTR_AVAILABLE = _check_doctr_available()
@dataclass
class OCRResult:
"""Raw OCR result."""
text: str
confidence: float
boxes: List[dict]
engine: str = "" # OCR engine used: paddleocr or tesseract
class OCREngine:
"""Unified OCR engine with fallback support."""
def __init__(self):
self._paddle = None
self._paddle_init_started = False
self._paddle_ready = threading.Event() # Signals when PaddleOCR is FULLY ready
self._paddle_init_lock = threading.Lock()
self._doctr = None
self._doctr_init_started = False
self._doctr_ready = threading.Event() # Signals when docTR is FULLY ready
self._doctr_init_lock = threading.Lock()
def _init_paddle_lazy(self):
"""Lazy initialize PaddleOCR on first use (avoids slow startup)."""
global PaddleOCR
with self._paddle_init_lock:
if self._paddle_init_started:
return # Already initializing or done
self._paddle_init_started = True
if PADDLE_AVAILABLE:
try:
print("Importing PaddleOCR (first use, may take ~15-20 seconds)...", flush=True)
from paddleocr import PaddleOCR as _PaddleOCR
PaddleOCR = _PaddleOCR
print("Initializing PaddleOCR engine...", flush=True)
# PaddleOCR 3.x API - optimized for Romanian receipts
# Note: 'latin' not available in PaddleOCR 3.x, 'en' works well for receipts
self._paddle = PaddleOCR(
lang='en', # 'en' handles Latin alphabet well for receipts
# High quality settings for better accuracy
det_db_thresh=0.3, # Lower threshold = detect more text (default 0.3)
det_db_box_thresh=0.5, # Box confidence threshold (default 0.5)
det_db_unclip_ratio=1.8, # Expand detected boxes slightly (default 1.5)
rec_batch_num=6, # Batch size for recognition
use_angle_cls=True, # Enable text angle classification
)
print("PaddleOCR initialized successfully with high-quality settings", flush=True)
except Exception as e:
print(f"Warning: Failed to initialize PaddleOCR: {e}", flush=True)
self._paddle = None
# Signal that initialization is complete (success or failure)
self._paddle_ready.set()
def _init_doctr_lazy(self):
"""Lazy initialize docTR on first use (avoids slow startup)."""
global doctr_ocr_predictor
with self._doctr_init_lock:
if self._doctr_init_started:
return # Already initializing or done
self._doctr_init_started = True
if DOCTR_AVAILABLE:
try:
print("Importing docTR (first use, may take ~10-15 seconds)...", flush=True)
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
print("Initializing docTR engine (PyTorch backend)...", flush=True)
# Initialize docTR predictor with pretrained models
# Uses db_resnet50 for detection and crnn_vgg16_bn for recognition
self._doctr = ocr_predictor(
det_arch='db_resnet50',
reco_arch='crnn_vgg16_bn',
pretrained=True,
assume_straight_pages=True,
straighten_pages=False,
preserve_aspect_ratio=True,
)
doctr_ocr_predictor = self._doctr
print("docTR initialized successfully with PyTorch backend", flush=True)
except Exception as e:
print(f"Warning: Failed to initialize docTR: {e}", flush=True)
self._doctr = None
# Signal that initialization is complete (success or failure)
self._doctr_ready.set()
def wait_for_doctr(self, timeout: float = 30.0) -> bool:
"""
Wait for docTR to be fully initialized.
Args:
timeout: Max seconds to wait (default 30s)
Returns:
True if docTR is ready, False if timeout or unavailable
"""
if not DOCTR_AVAILABLE:
return False
if self._doctr is not None:
return True # Already ready
if not self._doctr_init_started:
# Start initialization if not already started
self._init_doctr_lazy()
# Wait for initialization to complete
print(f"[OCR] Waiting for docTR to be ready (max {timeout}s)...", flush=True)
start = time.time()
ready = self._doctr_ready.wait(timeout=timeout)
elapsed = time.time() - start
if ready and self._doctr is not None:
print(f"[OCR] docTR ready after {elapsed:.1f}s", flush=True)
return True
else:
print(f"[OCR] docTR not ready after {elapsed:.1f}s (timeout or failed)", flush=True)
return False
def is_doctr_ready(self) -> bool:
"""Check if docTR is ready without waiting."""
return self._doctr is not None
def wait_for_paddle(self, timeout: float = 30.0) -> bool:
"""
Wait for PaddleOCR to be fully initialized.
Args:
timeout: Max seconds to wait (default 30s)
Returns:
True if PaddleOCR is ready, False if timeout or unavailable
"""
if not PADDLE_AVAILABLE:
return False
if self._paddle is not None:
return True # Already ready
if not self._paddle_init_started:
# Start initialization if not already started
self._init_paddle_lazy()
# Wait for initialization to complete
print(f"[OCR] Waiting for PaddleOCR to be ready (max {timeout}s)...", flush=True)
start = time.time()
ready = self._paddle_ready.wait(timeout=timeout)
elapsed = time.time() - start
if ready and self._paddle is not None:
print(f"[OCR] PaddleOCR ready after {elapsed:.1f}s", flush=True)
return True
else:
print(f"[OCR] PaddleOCR not ready after {elapsed:.1f}s (timeout or failed)", flush=True)
return False
def is_paddle_ready(self) -> bool:
"""Check if PaddleOCR is ready without waiting."""
return self._paddle is not None
def recognize(self, image: np.ndarray) -> OCRResult:
"""Perform OCR on preprocessed image."""
logger.info(f"[OCR] Starting recognition, image shape: {image.shape}, dtype: {image.dtype}")
# Lazy init PaddleOCR on first call
self._init_paddle_lazy()
if PADDLE_AVAILABLE and self._paddle:
logger.info("[OCR] Using PaddleOCR engine")
return self._paddle_recognize(image)
elif TESSERACT_AVAILABLE:
logger.info("[OCR] Using Tesseract engine (PaddleOCR not available)")
return self._tesseract_recognize(image)
else:
logger.error("[OCR] No OCR engine available!")
raise RuntimeError(
"No OCR engine available. Install PaddleOCR or Tesseract."
)
def _paddle_recognize(self, image: np.ndarray) -> OCRResult:
"""Recognize text using PaddleOCR 3.x API."""
# Wait for PaddleOCR to be fully ready (handles background init)
if not self.wait_for_paddle(timeout=30.0):
logger.warning("[PaddleOCR] Not ready, falling back to Tesseract")
if TESSERACT_AVAILABLE:
return self._tesseract_recognize(image)
raise RuntimeError("PaddleOCR not ready and Tesseract not available")
try:
logger.info(f"[PaddleOCR] Processing image, shape: {image.shape}")
# PaddleOCR 3.x requires 3-channel images
if len(image.shape) == 2:
# Convert grayscale to 3-channel BGR
import cv2
image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
logger.info(f"[PaddleOCR] Converted to BGR, new shape: {image.shape}")
# PaddleOCR 3.x uses predict() with new parameter names
logger.info("[PaddleOCR] Calling predict()...")
result = self._paddle.predict(image, use_textline_orientation=True)
logger.info(f"[PaddleOCR] predict() returned, result type: {type(result)}")
if not result or len(result) == 0:
logger.warning("[PaddleOCR] No results returned")
return OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")
# PaddleOCR 3.x returns OCRResult objects with different structure
ocr_result = result[0]
# Extract texts and scores from the new format
rec_texts = ocr_result.get('rec_texts', [])
rec_scores = ocr_result.get('rec_scores', [])
dt_polys = ocr_result.get('dt_polys', [])
if not rec_texts:
return OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")
boxes = []
for i, text in enumerate(rec_texts):
conf = rec_scores[i] if i < len(rec_scores) else 0.0
box = dt_polys[i].tolist() if i < len(dt_polys) else []
boxes.append({
'text': text,
'confidence': float(conf),
'box': box
})
avg_conf = sum(rec_scores) / len(rec_scores) if rec_scores else 0.0
text_result = '\n'.join(rec_texts)
logger.info(f"[PaddleOCR] SUCCESS - Found {len(rec_texts)} text lines, avg confidence: {avg_conf:.2%}")
logger.debug(f"[PaddleOCR] Raw text preview: {text_result[:200]}...")
return OCRResult(
text=text_result,
confidence=float(avg_conf),
boxes=boxes,
engine="paddleocr"
)
except Exception as e:
logger.error(f"[PaddleOCR] ERROR: {e}, falling back to Tesseract")
if TESSERACT_AVAILABLE:
return self._tesseract_recognize(image)
raise
def _tesseract_recognize(self, image: np.ndarray) -> OCRResult:
"""Recognize text using Tesseract."""
global pytesseract
logger.info(f"[Tesseract] Processing image, shape: {image.shape}")
# Lazy import pytesseract
if pytesseract is None:
logger.info("[Tesseract] Importing pytesseract...")
import pytesseract as _pytesseract
pytesseract = _pytesseract
# PSM 4: Single column (best for receipts)
config = '--psm 4 -l ron+eng'
text = pytesseract.image_to_string(image, config=config)
# Quick confidence estimate
data = pytesseract.image_to_data(image, config=config, output_type=pytesseract.Output.DICT)
confidences = [int(c) for c in data['conf'] if int(c) > 0]
avg_conf = sum(confidences) / len(confidences) / 100 if confidences else 0.0
logger.info(f"[Tesseract] Done: {len(text)} chars, conf: {avg_conf:.2%}")
return OCRResult(text=text, confidence=avg_conf, boxes=[], engine="tesseract")
def _doctr_recognize(self, image: np.ndarray) -> OCRResult:
"""Recognize text using docTR."""
# Wait for docTR to be fully ready
if not self.wait_for_doctr(timeout=30.0):
logger.warning("[docTR] Not ready, falling back to Tesseract")
if TESSERACT_AVAILABLE:
return self._tesseract_recognize(image)
raise RuntimeError("docTR not ready and Tesseract not available")
try:
logger.info(f"[docTR] Processing image, shape: {image.shape}")
# docTR requires RGB images
import cv2
if len(image.shape) == 2:
# Convert grayscale to RGB
image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
logger.info(f"[docTR] Converted grayscale to RGB, new shape: {image.shape}")
elif image.shape[2] == 4:
# Convert RGBA to RGB
image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
logger.info(f"[docTR] Converted RGBA to RGB, new shape: {image.shape}")
elif image.shape[2] == 3:
# Check if BGR (from OpenCV) and convert to RGB
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
logger.info(f"[docTR] Converted BGR to RGB, shape: {image.shape}")
# Process image with docTR
logger.info("[docTR] Running prediction...")
from doctr.io import DocumentFile
# docTR expects a document (list of pages as numpy arrays)
result = self._doctr([image])
if not result or not result.pages:
logger.warning("[docTR] No results returned")
return OCRResult(text="", confidence=0.0, boxes=[], engine="doctr")
# Extract text from all pages
all_texts = []
all_confidences = []
boxes = []
for page in result.pages:
for block in page.blocks:
for line in block.lines:
line_text = ' '.join(word.value for word in line.words)
line_confidence = sum(w.confidence for w in line.words) / len(line.words) if line.words else 0.0
all_texts.append(line_text)
all_confidences.append(line_confidence)
# Store word-level boxes
for word in line.words:
boxes.append({
'text': word.value,
'confidence': float(word.confidence),
'box': word.geometry # (xmin, ymin), (xmax, ymax)
})
text_result = '\n'.join(all_texts)
avg_conf = sum(all_confidences) / len(all_confidences) if all_confidences else 0.0
logger.info(f"[docTR] SUCCESS - Found {len(all_texts)} text lines, avg confidence: {avg_conf:.2%}")
logger.debug(f"[docTR] Raw text preview: {text_result[:200]}...")
return OCRResult(
text=text_result,
confidence=float(avg_conf),
boxes=boxes,
engine="doctr"
)
except Exception as e:
logger.error(f"[docTR] ERROR: {e}, falling back to Tesseract")
if TESSERACT_AVAILABLE:
return self._tesseract_recognize(image)
raise
def recognize_dual(self, image: np.ndarray) -> Tuple[OCRResult, Optional[OCRResult]]:
"""
Run both OCR engines and return both results.
Returns:
Tuple of (paddle_result, tesseract_result)
tesseract_result may be None if Tesseract is not available
"""
logger.info(f"[OCR Dual] Starting dual recognition, image shape: {image.shape}")
# Lazy init PaddleOCR
self._init_paddle_lazy()
paddle_result = None
tesseract_result = None
# Run PaddleOCR
if PADDLE_AVAILABLE and self._paddle:
try:
logger.info("[OCR Dual] Running PaddleOCR...")
paddle_result = self._paddle_recognize(image)
logger.info(f"[OCR Dual] PaddleOCR: {len(paddle_result.text)} chars, conf: {paddle_result.confidence:.2%}")
except Exception as e:
logger.error(f"[OCR Dual] PaddleOCR failed: {e}")
paddle_result = OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")
# Run Tesseract
if TESSERACT_AVAILABLE:
try:
logger.info("[OCR Dual] Running Tesseract...")
tesseract_result = self._tesseract_recognize(image)
logger.info(f"[OCR Dual] Tesseract: {len(tesseract_result.text)} chars, conf: {tesseract_result.confidence:.2%}")
except Exception as e:
logger.error(f"[OCR Dual] Tesseract failed: {e}")
tesseract_result = OCRResult(text="", confidence=0.0, boxes=[], engine="tesseract")
# Fallback if PaddleOCR not available
if paddle_result is None:
if tesseract_result:
paddle_result = tesseract_result
else:
raise RuntimeError("No OCR engine available")
return paddle_result, tesseract_result
@staticmethod
def get_available_engines() -> List[str]:
"""
Return list of available OCR engines.
Respects OCR_ENABLE_PADDLEOCR and OCR_ENABLE_TESSERACT from .env.
Engines that are disabled via .env are not returned even if installed.
Available engines: tesseract, doctr, doctr_plus, paddleocr
"""
# Check .env settings
paddle_enabled = os.getenv("OCR_ENABLE_PADDLEOCR", "true").lower() == "true"
tesseract_enabled = os.getenv("OCR_ENABLE_TESSERACT", "true").lower() == "true"
engines = []
# Base engines (only if installed AND enabled)
if TESSERACT_AVAILABLE and tesseract_enabled:
engines.append('tesseract')
if DOCTR_AVAILABLE:
engines.append('doctr')
engines.append('doctr_plus') # docTR with 2-tier sequential + early exit
if PADDLE_AVAILABLE and paddle_enabled:
engines.append('paddleocr')
return engines