feat: Improve OCR adaptive pipeline with early exit and better pattern matching
- Add adaptive 3-step OCR pipeline with early exit when all 5 fields are found
- Add pattern for "C. I. F." with spaces (OCR artifact from PaddleOCR)
- Add pattern for the "YYYY. MM. DD" date format with spaces (OMV/Petrom receipts)
- Add pattern for "OTAL TAXE" (leading T cut off) with the amount position reversed
- Make the TVA rate pattern more flexible (code letter optional, handle "-21%")
- Replace logger.info with print(flush=True) for better debugging visibility
- Improve OCRPreview.vue to show extraction progress and raw OCR text

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,11 +1,16 @@
|
||||
"""OCR engine wrapper for PaddleOCR and Tesseract."""
|
||||
|
||||
import os
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
# Setup logging
|
||||
logger = logging.getLogger(__name__)
|
||||
logging.basicConfig(level=logging.INFO) # Ensure logs are visible
|
||||
|
||||
# Disable PaddleOCR model source check for faster startup (PaddleX 3.x)
|
||||
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
|
||||
|
||||
@@ -40,6 +45,7 @@ class OCRResult:
|
||||
text: str
|
||||
confidence: float
|
||||
boxes: List[dict]
|
||||
engine: str = "" # OCR engine used: paddleocr or tesseract
|
||||
|
||||
|
||||
class OCREngine:
|
||||
@@ -65,8 +71,9 @@ class OCREngine:
|
||||
|
||||
print("Initializing PaddleOCR engine...")
|
||||
# PaddleOCR 3.x API - optimized for Romanian receipts
|
||||
# Note: 'latin' not available in PaddleOCR 3.x, 'en' works well for receipts
|
||||
self._paddle = PaddleOCR(
|
||||
lang='en', # 'en' works better than 'ro' for mixed alphanumeric
|
||||
lang='en', # 'en' handles Latin alphabet well for receipts
|
||||
# High quality settings for better accuracy
|
||||
det_db_thresh=0.3, # Lower threshold = detect more text (default 0.3)
|
||||
det_db_box_thresh=0.5, # Box confidence threshold (default 0.5)
|
||||
@@ -81,14 +88,19 @@ class OCREngine:
|
||||
|
||||
def recognize(self, image: np.ndarray) -> OCRResult:
    """Run OCR on a preprocessed image with the first usable engine.

    PaddleOCR is used when ``PADDLE_AVAILABLE`` is set and the engine
    instance exists after the lazy initialisation step; otherwise
    Tesseract is used when ``TESSERACT_AVAILABLE`` is set.

    Args:
        image: Image array; its ``shape`` and ``dtype`` are logged.

    Returns:
        The ``OCRResult`` produced by the selected engine.

    Raises:
        RuntimeError: If neither engine is available.
    """
    logger.info(f"[OCR] Starting recognition, image shape: {image.shape}, dtype: {image.dtype}")

    # PaddleOCR is initialised lazily, on the first recognition request.
    self._init_paddle_lazy()

    # Preferred engine: PaddleOCR.
    if PADDLE_AVAILABLE and self._paddle:
        logger.info("[OCR] Using PaddleOCR engine")
        return self._paddle_recognize(image)

    # Secondary engine: Tesseract.
    if TESSERACT_AVAILABLE:
        logger.info("[OCR] Using Tesseract engine (PaddleOCR not available)")
        return self._tesseract_recognize(image)

    # Nothing usable is installed.
    logger.error("[OCR] No OCR engine available!")
    raise RuntimeError("No OCR engine available. Install PaddleOCR or Tesseract.")
|
||||
@@ -96,17 +108,23 @@ class OCREngine:
|
||||
def _paddle_recognize(self, image: np.ndarray) -> OCRResult:
|
||||
"""Recognize text using PaddleOCR 3.x API."""
|
||||
try:
|
||||
logger.info(f"[PaddleOCR] Processing image, shape: {image.shape}")
|
||||
|
||||
# PaddleOCR 3.x requires 3-channel images
|
||||
if len(image.shape) == 2:
|
||||
# Convert grayscale to 3-channel BGR
|
||||
import cv2
|
||||
image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
|
||||
logger.info(f"[PaddleOCR] Converted to BGR, new shape: {image.shape}")
|
||||
|
||||
# PaddleOCR 3.x uses predict() with new parameter names
|
||||
logger.info("[PaddleOCR] Calling predict()...")
|
||||
result = self._paddle.predict(image, use_textline_orientation=True)
|
||||
logger.info(f"[PaddleOCR] predict() returned, result type: {type(result)}")
|
||||
|
||||
if not result or len(result) == 0:
|
||||
return OCRResult(text="", confidence=0.0, boxes=[])
|
||||
logger.warning("[PaddleOCR] No results returned")
|
||||
return OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")
|
||||
|
||||
# PaddleOCR 3.x returns OCRResult objects with different structure
|
||||
ocr_result = result[0]
|
||||
@@ -117,7 +135,7 @@ class OCREngine:
|
||||
dt_polys = ocr_result.get('dt_polys', [])
|
||||
|
||||
if not rec_texts:
|
||||
return OCRResult(text="", confidence=0.0, boxes=[])
|
||||
return OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")
|
||||
|
||||
boxes = []
|
||||
for i, text in enumerate(rec_texts):
|
||||
@@ -130,13 +148,17 @@ class OCREngine:
|
||||
})
|
||||
|
||||
avg_conf = sum(rec_scores) / len(rec_scores) if rec_scores else 0.0
|
||||
text_result = '\n'.join(rec_texts)
|
||||
logger.info(f"[PaddleOCR] SUCCESS - Found {len(rec_texts)} text lines, avg confidence: {avg_conf:.2%}")
|
||||
logger.debug(f"[PaddleOCR] Raw text preview: {text_result[:200]}...")
|
||||
return OCRResult(
|
||||
text='\n'.join(rec_texts),
|
||||
text=text_result,
|
||||
confidence=float(avg_conf),
|
||||
boxes=boxes
|
||||
boxes=boxes,
|
||||
engine="paddleocr"
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"PaddleOCR error: {e}, falling back to Tesseract")
|
||||
logger.error(f"[PaddleOCR] ERROR: {e}, falling back to Tesseract")
|
||||
if TESSERACT_AVAILABLE:
|
||||
return self._tesseract_recognize(image)
|
||||
raise
|
||||
@@ -145,23 +167,70 @@ class OCREngine:
|
||||
"""Recognize text using Tesseract."""
|
||||
global pytesseract
|
||||
|
||||
logger.info(f"[Tesseract] Processing image, shape: {image.shape}")
|
||||
|
||||
# Lazy import pytesseract
|
||||
if pytesseract is None:
|
||||
print("Importing pytesseract...")
|
||||
logger.info("[Tesseract] Importing pytesseract...")
|
||||
import pytesseract as _pytesseract
|
||||
pytesseract = _pytesseract
|
||||
|
||||
config = '--psm 6 -l ron+eng'
|
||||
# PSM 4: Single column (best for receipts)
|
||||
config = '--psm 4 -l ron+eng'
|
||||
text = pytesseract.image_to_string(image, config=config)
|
||||
data = pytesseract.image_to_data(
|
||||
image, config=config,
|
||||
output_type=pytesseract.Output.DICT
|
||||
)
|
||||
|
||||
# Quick confidence estimate
|
||||
data = pytesseract.image_to_data(image, config=config, output_type=pytesseract.Output.DICT)
|
||||
confidences = [int(c) for c in data['conf'] if int(c) > 0]
|
||||
avg_conf = sum(confidences) / len(confidences) / 100 if confidences else 0.0
|
||||
|
||||
return OCRResult(text=text, confidence=avg_conf, boxes=[])
|
||||
logger.info(f"[Tesseract] Done: {len(text)} chars, conf: {avg_conf:.2%}")
|
||||
return OCRResult(text=text, confidence=avg_conf, boxes=[], engine="tesseract")
|
||||
|
||||
def recognize_dual(self, image: np.ndarray) -> Tuple[OCRResult, Optional[OCRResult]]:
    """Run PaddleOCR and Tesseract on the same image and return both results.

    Each engine is attempted only when it is available. An engine that
    raises is logged and represented by an empty ``OCRResult`` tagged with
    that engine's name, so one failing engine does not abort the other.

    Args:
        image: Image array passed unchanged to both engines.

    Returns:
        ``(paddle_result, tesseract_result)``. The second element is
        ``None`` when Tesseract is not available. When PaddleOCR was not
        run, the Tesseract result is returned in both positions.

    Raises:
        RuntimeError: If PaddleOCR was not run and there is no Tesseract
            result to fall back on.
    """
    logger.info(f"[OCR Dual] Starting dual recognition, image shape: {image.shape}")

    # Make sure the lazy PaddleOCR initialisation has been attempted.
    self._init_paddle_lazy()

    paddle_out = None
    tess_out = None

    # --- PaddleOCR pass ---
    if PADDLE_AVAILABLE and self._paddle:
        try:
            logger.info("[OCR Dual] Running PaddleOCR...")
            paddle_out = self._paddle_recognize(image)
            logger.info(f"[OCR Dual] PaddleOCR: {len(paddle_out.text)} chars, conf: {paddle_out.confidence:.2%}")
        except Exception as exc:
            logger.error(f"[OCR Dual] PaddleOCR failed: {exc}")
            paddle_out = OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")

    # --- Tesseract pass ---
    if TESSERACT_AVAILABLE:
        try:
            logger.info("[OCR Dual] Running Tesseract...")
            tess_out = self._tesseract_recognize(image)
            logger.info(f"[OCR Dual] Tesseract: {len(tess_out.text)} chars, conf: {tess_out.confidence:.2%}")
        except Exception as exc:
            logger.error(f"[OCR Dual] Tesseract failed: {exc}")
            tess_out = OCRResult(text="", confidence=0.0, boxes=[], engine="tesseract")

    # Normal case: PaddleOCR produced a result object (possibly empty).
    if paddle_out is not None:
        return paddle_out, tess_out

    # PaddleOCR was skipped: the Tesseract result stands in for it.
    if not tess_out:
        raise RuntimeError("No OCR engine available")
    return tess_out, tess_out
|
||||
|
||||
@staticmethod
|
||||
def get_available_engines() -> List[str]:
|
||||
|
||||
Reference in New Issue
Block a user