- Add TvaEntry schema supporting multiple TVA rates (A, B, C, D codes)
- Update OCR extractor to extract multiple TVA entries from receipts
- Support both old (19%, 9%, 5%) and new Romanian rates (21%, 11% from Aug 2025)
- Add tva_breakdown, tva_total, items_count, vendor_address to Receipt model
- Update OCRPreview.vue to display TVA entries with rate badges
- Add "Detalii Suplimentare" section in ReceiptCreateView with editable TVA table
- Add TVA breakdown display in ReceiptDetailView
- Create database migration for new TVA columns

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
175 lines
6.2 KiB
Python
175 lines
6.2 KiB
Python
"""OCR engine wrapper for PaddleOCR and Tesseract."""
|
|
|
|
import os
|
|
from dataclasses import dataclass
|
|
from typing import List, Optional
|
|
|
|
import numpy as np
|
|
|
|
# PaddleX 3.x: turn off the model source check so startup is faster.
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'

# Placeholders for the heavy OCR backends. Both stay None until the engine
# imports the real objects on first use.
PaddleOCR = pytesseract = None
|
|
|
|
# Check availability without importing heavy libraries
|
|
def _check_paddle_available() -> bool:
    """Report whether the ``paddleocr`` package can be located.

    Only the import system is asked for a module spec, so the (slow)
    package itself is not imported here.

    Returns:
        True when a spec for ``paddleocr`` is found; False when none is
        found or when the lookup itself raises.
    """
    try:
        from importlib.util import find_spec

        spec = find_spec("paddleocr")
    except Exception:
        # Any failure during the lookup is treated as "not installed".
        return False
    return spec is not None
|
|
|
|
def _check_tesseract_available() -> bool:
    """Report whether the ``pytesseract`` package can be located.

    Only the import system is asked for a module spec, so the package
    itself is not imported here.

    Returns:
        True when a spec for ``pytesseract`` is found; False when none is
        found or when the lookup itself raises.
    """
    try:
        from importlib.util import find_spec

        spec = find_spec("pytesseract")
    except Exception:
        # Any failure during the lookup is treated as "not installed".
        return False
    return spec is not None
|
|
|
|
# Backend availability, probed once when this module is imported. These flags
# only say that the package can be found, not that it initializes correctly.
PADDLE_AVAILABLE: bool = _check_paddle_available()
TESSERACT_AVAILABLE: bool = _check_tesseract_available()
|
|
|
|
|
|
@dataclass
class OCRResult:
    """Raw OCR result produced by :class:`OCREngine`."""

    # Recognized text. The PaddleOCR path joins the recognized lines with a
    # newline; the Tesseract path stores pytesseract's string output as-is.
    text: str
    # Mean confidence of the recognized items, 0.0 when nothing was scored.
    # Tesseract percentages are divided by 100 before being stored here.
    confidence: float
    # One dict per recognized line with 'text', 'confidence' and 'box' keys
    # (PaddleOCR path); always an empty list on the Tesseract path.
    boxes: List[dict]


class OCREngine:
    """Unified OCR engine with fallback support.

    PaddleOCR is used when it is installed and initializes successfully.
    Tesseract (via pytesseract) is used when PaddleOCR is unavailable, and
    also as a fallback when a PaddleOCR call raises.
    """

    def __init__(self):
        """Create the engine without importing any OCR backend yet."""
        self._paddle = None
        # Set on the first initialization attempt, successful or not, so a
        # failing PaddleOCR setup is not retried on every call.
        self._paddle_initialized = False

    def _init_paddle_lazy(self):
        """Lazy initialize PaddleOCR on first use (avoids slow startup).

        Initialization is attempted at most once per instance. Any failure
        is reported on stdout and leaves ``self._paddle`` as None.
        """
        global PaddleOCR

        if self._paddle_initialized:
            return
        self._paddle_initialized = True

        if not PADDLE_AVAILABLE:
            return

        try:
            print("Importing PaddleOCR (first use, may take ~15-20 seconds)...")
            from paddleocr import PaddleOCR as _PaddleOCR
            PaddleOCR = _PaddleOCR

            print("Initializing PaddleOCR engine...")
            # PaddleOCR 3.x API - optimized for Romanian receipts
            # NOTE(review): these keyword names follow the PaddleOCR 2.x
            # spelling; confirm the installed release still accepts them.
            # A rejection is caught below and simply disables PaddleOCR.
            self._paddle = PaddleOCR(
                lang='en',  # 'en' works better than 'ro' for mixed alphanumeric
                # High quality settings for better accuracy
                det_db_thresh=0.3,  # Lower threshold = detect more text (default 0.3)
                det_db_box_thresh=0.5,  # Box confidence threshold (default 0.5)
                det_db_unclip_ratio=1.8,  # Expand detected boxes slightly (default 1.5)
                rec_batch_num=6,  # Batch size for recognition
                use_angle_cls=True,  # Enable text angle classification
            )
            print("PaddleOCR initialized successfully with high-quality settings")
        except Exception as e:
            print(f"Warning: Failed to initialize PaddleOCR: {e}")
            self._paddle = None

    def recognize(self, image: np.ndarray) -> OCRResult:
        """Perform OCR on preprocessed image.

        Args:
            image: Image array handed to the selected backend.

        Returns:
            The OCR result from PaddleOCR when it is usable, otherwise
            from Tesseract.

        Raises:
            RuntimeError: If neither backend is available.
        """
        # Lazy init PaddleOCR on first call
        self._init_paddle_lazy()

        if PADDLE_AVAILABLE and self._paddle:
            return self._paddle_recognize(image)
        elif TESSERACT_AVAILABLE:
            return self._tesseract_recognize(image)
        else:
            raise RuntimeError(
                "No OCR engine available. Install PaddleOCR or Tesseract."
            )

    @staticmethod
    def _as_list(values) -> list:
        """Return *values* as a plain list, mapping None to an empty list.

        Normalizing first keeps later truthiness and length checks safe for
        NumPy arrays, whose truth value is ambiguous.
        """
        return [] if values is None else list(values)

    @staticmethod
    def _build_paddle_result(rec_texts, rec_scores, dt_polys) -> OCRResult:
        """Assemble an OCRResult from PaddleOCR's parallel output sequences.

        Args:
            rec_texts: Recognized strings, one per detected line.
            rec_scores: Recognition scores aligned with ``rec_texts``; may
                be shorter, in which case missing scores count as 0.0.
            dt_polys: Detection polygons aligned with ``rec_texts``; may be
                shorter, in which case the box is an empty list.

        Returns:
            An empty result when there are no texts; otherwise the texts
            joined by newlines, the mean of ``rec_scores`` and one box dict
            per text.
        """
        texts = OCREngine._as_list(rec_texts)
        scores = OCREngine._as_list(rec_scores)
        polys = OCREngine._as_list(dt_polys)

        if not texts:
            return OCRResult(text="", confidence=0.0, boxes=[])

        boxes = []
        for i, text in enumerate(texts):
            conf = scores[i] if i < len(scores) else 0.0
            # np.asarray accepts both ndarray and nested-list polygons.
            box = np.asarray(polys[i]).tolist() if i < len(polys) else []
            boxes.append({
                'text': text,
                'confidence': float(conf),
                'box': box
            })

        avg_conf = sum(scores) / len(scores) if scores else 0.0
        return OCRResult(
            text='\n'.join(texts),
            confidence=float(avg_conf),
            boxes=boxes
        )

    def _paddle_recognize(self, image: np.ndarray) -> OCRResult:
        """Recognize text using PaddleOCR 3.x API.

        Falls back to Tesseract when PaddleOCR raises and pytesseract is
        installed; otherwise the original exception is re-raised.
        """
        try:
            # PaddleOCR 3.x requires 3-channel images
            if len(image.shape) == 2:
                # Convert grayscale to 3-channel BGR
                import cv2
                image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)

            # PaddleOCR 3.x uses predict() with new parameter names
            result = self._paddle.predict(image, use_textline_orientation=True)

            if result is None or len(result) == 0:
                return OCRResult(text="", confidence=0.0, boxes=[])

            # PaddleOCR 3.x returns OCRResult objects with different structure;
            # only the first entry is used.
            ocr_result = result[0]

            return self._build_paddle_result(
                ocr_result.get('rec_texts', []),
                ocr_result.get('rec_scores', []),
                ocr_result.get('dt_polys', []),
            )
        except Exception as e:
            print(f"PaddleOCR error: {e}, falling back to Tesseract")
            if TESSERACT_AVAILABLE:
                return self._tesseract_recognize(image)
            raise

    @staticmethod
    def _mean_tesseract_confidence(raw_confidences) -> float:
        """Average Tesseract confidences and scale them by 1/100.

        Cells are parsed with ``float`` because confidences may arrive as
        ints, floats or numeric strings such as ``'96.06'``, which ``int``
        rejects. Cells that cannot be parsed, and values that are not
        strictly positive (including the ``-1`` marker), are ignored.

        Returns:
            The mean of the kept values divided by 100, or 0.0 when no
            value is kept.
        """
        confidences = []
        for raw in raw_confidences:
            try:
                value = float(raw)
            except (TypeError, ValueError):
                continue
            if value > 0:
                confidences.append(value)
        return sum(confidences) / len(confidences) / 100 if confidences else 0.0

    def _tesseract_recognize(self, image: np.ndarray) -> OCRResult:
        """Recognize text using Tesseract.

        pytesseract is imported on the first call. The returned result
        never carries boxes.
        """
        global pytesseract

        # Lazy import pytesseract
        if pytesseract is None:
            print("Importing pytesseract...")
            import pytesseract as _pytesseract
            pytesseract = _pytesseract

        config = '--psm 6 -l ron+eng'
        text = pytesseract.image_to_string(image, config=config)
        data = pytesseract.image_to_data(
            image, config=config,
            output_type=pytesseract.Output.DICT
        )

        # .get: tolerate a result dict without a 'conf' column.
        avg_conf = self._mean_tesseract_confidence(data.get('conf', []))

        return OCRResult(text=text, confidence=avg_conf, boxes=[])

    @staticmethod
    def get_available_engines() -> List[str]:
        """Return list of available OCR engines.

        The names are 'paddleocr' and/or 'tesseract', in that order, based
        on the module-level availability flags.
        """
        engines = []
        if PADDLE_AVAILABLE:
            engines.append('paddleocr')
        if TESSERACT_AVAILABLE:
            engines.append('tesseract')
        return engines
|