feat: Add OCR integration for automatic receipt data extraction

Implement Tesseract-based OCR to automatically extract vendor name,
date, total amount, and VAT from uploaded receipt images/PDFs,
reducing manual data entry and improving accuracy.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-12 11:48:29 +02:00
parent 5960154094
commit 41ae97180e
16 changed files with 2773 additions and 32 deletions

View File

@@ -0,0 +1,116 @@
"""Image preprocessing for optimal OCR results."""
from pathlib import Path
from typing import List
import numpy as np
import cv2
# pdf2image is an optional dependency: record whether it could be imported so
# PDF conversion can fail with a clear message instead of a NameError.
try:
    import pdf2image
except ImportError:
    PDF_AVAILABLE = False
else:
    PDF_AVAILABLE = True
class ImagePreprocessor:
    """Preprocess receipt images for OCR.

    Colour images are expected in OpenCV's BGR channel order, which is what
    ``load_image`` and ``pdf_to_images`` both return.
    """

    def load_image(self, path: Path) -> np.ndarray:
        """Load an image from disk with OpenCV.

        Args:
            path: Location of the image file.

        Returns:
            The image array returned by ``cv2.imread``.

        Raises:
            ValueError: If ``cv2.imread`` returns ``None`` for the path.
        """
        image = cv2.imread(str(path))
        if image is None:
            raise ValueError(f"Could not load image: {path}")
        return image

    def pdf_to_images(self, path: Path, dpi: int = 300) -> List[np.ndarray]:
        """Render every page of a PDF to an image array.

        Args:
            path: Location of the PDF file.
            dpi: Rendering resolution passed to pdf2image.

        Returns:
            One array per page. Colour pages are returned in BGR order so
            they can be fed to ``preprocess`` exactly like images loaded
            with ``load_image``.

        Raises:
            RuntimeError: If pdf2image is not installed.
        """
        if not PDF_AVAILABLE:
            raise RuntimeError("pdf2image not available. Install with: pip install pdf2image")
        pages = pdf2image.convert_from_path(str(path), dpi=dpi)
        return [self._pil_to_bgr(page) for page in pages]

    @staticmethod
    def _pil_to_bgr(page) -> np.ndarray:
        """Convert a rendered page (RGB/RGBA order) to a contiguous BGR array.

        Arrays that do not have 3 or 4 channels are returned unchanged.
        """
        array = np.array(page)
        if array.ndim == 3 and array.shape[2] in (3, 4):
            # Take channels 2, 1, 0 (dropping alpha if present). The copy
            # makes the result contiguous; a reversed view has negative
            # strides, which OpenCV functions do not accept.
            return np.ascontiguousarray(array[:, :, 2::-1])
        return array

    def preprocess(self, image: np.ndarray, min_width: int = 1000) -> np.ndarray:
        """Apply the preprocessing pipeline for thermal receipt images.

        Pipeline:
        1. Convert to grayscale
        2. Upscale if narrower than ``min_width``
        3. Deskew (straighten rotated text)
        4. Denoise (non-local means)
        5. Adaptive thresholding (binarization)
        6. Morphological close (connect broken chars)

        Args:
            image: BGR colour image or single-channel grayscale image.
            min_width: Minimum width in pixels; narrower images are scaled
                up (keeping the aspect ratio) to this width.

        Returns:
            The binarized image after the morphological close.
        """
        # 1. Grayscale
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image.copy()

        # 2. Resize if too small
        width = gray.shape[1]
        if width < min_width:
            scale = min_width / width
            gray = cv2.resize(
                gray, None, fx=scale, fy=scale,
                interpolation=cv2.INTER_CUBIC
            )

        # 3. Deskew
        gray = self._deskew(gray)

        # 4. Denoise
        denoised = cv2.fastNlMeansDenoising(
            gray, h=10,
            templateWindowSize=7,
            searchWindowSize=21
        )

        # 5. Adaptive thresholding
        binary = cv2.adaptiveThreshold(
            denoised, 255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            blockSize=15, C=8
        )

        # 6. Morphological close
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
        return cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)

    def _deskew(self, image: np.ndarray) -> np.ndarray:
        """Correct image rotation/skew using Hough lines.

        The skew is estimated as the median angle of the detected line
        segments that are within 45 degrees of horizontal. The image is
        returned unchanged when no such segments are found or when the
        estimated skew is below 0.5 degrees.
        """
        edges = cv2.Canny(image, 50, 150, apertureSize=3)
        lines = cv2.HoughLinesP(
            edges, 1, np.pi / 180,
            threshold=100, minLineLength=100, maxLineGap=10
        )
        if lines is None:
            return image

        angles = []
        for line in lines:
            x1, y1, x2, y2 = line[0]
            angle = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi
            # Ignore near-vertical segments; only roughly horizontal ones
            # contribute to the skew estimate.
            if abs(angle) < 45:
                angles.append(angle)
        if not angles:
            return image

        median_angle = np.median(angles)
        if abs(median_angle) < 0.5:
            return image

        h, w = image.shape[:2]
        center = (w // 2, h // 2)
        rotation = cv2.getRotationMatrix2D(center, median_angle, 1.0)
        return cv2.warpAffine(
            image, rotation, (w, h),
            flags=cv2.INTER_CUBIC,
            borderMode=cv2.BORDER_REPLICATE
        )

View File

@@ -0,0 +1,168 @@
"""OCR engine wrapper for PaddleOCR and Tesseract."""
import os
from dataclasses import dataclass
from typing import List, Optional
import numpy as np
# Disable PaddleOCR model source check for faster startup (PaddleX 3.x).
# Set at module import time, before paddleocr itself is ever imported.
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
# Placeholders for the heavy OCR libraries. They stay None until the engine
# first needs them, at which point the real objects are stored here.
PaddleOCR = None # Will be imported lazily
pytesseract = None # Will be imported lazily
# Check availability without importing heavy libraries
def _check_paddle_available() -> bool:
"""Check if paddleocr is installed without importing it."""
try:
import importlib.util
return importlib.util.find_spec("paddleocr") is not None
except Exception:
return False
def _check_tesseract_available() -> bool:
"""Check if pytesseract is installed without importing it."""
try:
import importlib.util
return importlib.util.find_spec("pytesseract") is not None
except Exception:
return False
# Evaluated once at import time. These flags only say that the Python
# packages can be located, not that the engines initialize successfully.
PADDLE_AVAILABLE = _check_paddle_available()
TESSERACT_AVAILABLE = _check_tesseract_available()
@dataclass
class OCRResult:
    """Container for the raw output of a single OCR pass."""

    # Full recognized text.
    text: str
    # Aggregate confidence reported for the recognized text.
    confidence: float
    # Per-item detail dictionaries; may be empty.
    boxes: List[dict]
class OCREngine:
    """Unified OCR engine: PaddleOCR when usable, Tesseract as fallback."""

    def __init__(self):
        # Local import, matching this module's habit of importing on demand.
        import threading

        self._paddle = None
        self._paddle_initialized = False
        # Serializes the one-time PaddleOCR setup so that callers on other
        # threads wait for it instead of seeing a half-initialized engine.
        self._paddle_init_lock = threading.Lock()

    def _init_paddle_lazy(self):
        """Initialize PaddleOCR on first use (avoids slow startup).

        Safe to call repeatedly and from several threads: the setup runs at
        most once, and ``_paddle_initialized`` is only set after the attempt
        has finished. On failure ``_paddle`` stays ``None``.
        """
        global PaddleOCR
        if self._paddle_initialized:
            return
        with self._paddle_init_lock:
            # Another thread may have completed the setup while we waited.
            if self._paddle_initialized:
                return
            if PADDLE_AVAILABLE:
                try:
                    print("Importing PaddleOCR (first use, may take ~15-20 seconds)...")
                    from paddleocr import PaddleOCR as _PaddleOCR
                    PaddleOCR = _PaddleOCR
                    print("Initializing PaddleOCR engine...")
                    # PaddleOCR 3.x API - simplified parameters
                    self._paddle = PaddleOCR(
                        lang='en',  # Better for mixed text with numbers
                    )
                    print("PaddleOCR initialized successfully")
                except Exception as e:
                    # Best effort: a broken PaddleOCR install must not stop
                    # the Tesseract fallback from being used.
                    print(f"Warning: Failed to initialize PaddleOCR: {e}")
                    self._paddle = None
            self._paddle_initialized = True

    def recognize(self, image: np.ndarray) -> OCRResult:
        """Perform OCR on a preprocessed image.

        Args:
            image: Image array to recognize.

        Returns:
            The recognition result from PaddleOCR if it initialized,
            otherwise from Tesseract.

        Raises:
            RuntimeError: If neither engine is available.
        """
        # Lazy init PaddleOCR on first call
        self._init_paddle_lazy()
        if PADDLE_AVAILABLE and self._paddle:
            return self._paddle_recognize(image)
        elif TESSERACT_AVAILABLE:
            return self._tesseract_recognize(image)
        else:
            raise RuntimeError(
                "No OCR engine available. Install PaddleOCR or Tesseract."
            )

    def _paddle_recognize(self, image: np.ndarray) -> OCRResult:
        """Recognize text using the PaddleOCR 3.x API.

        Falls back to Tesseract if PaddleOCR raises and Tesseract is
        available; otherwise the PaddleOCR exception is re-raised.
        """
        try:
            # PaddleOCR 3.x requires 3-channel images
            if len(image.shape) == 2:
                # Convert grayscale to 3-channel BGR
                import cv2
                image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)

            # PaddleOCR 3.x uses predict() with new parameter names
            result = self._paddle.predict(image, use_textline_orientation=True)
            if not result:
                return OCRResult(text="", confidence=0.0, boxes=[])

            # Only the first entry of the prediction result is used.
            ocr_result = result[0]
            rec_texts = ocr_result.get('rec_texts', [])
            rec_scores = ocr_result.get('rec_scores', [])
            dt_polys = ocr_result.get('dt_polys', [])
            if not rec_texts:
                return OCRResult(text="", confidence=0.0, boxes=[])

            boxes = []
            for i, text in enumerate(rec_texts):
                # Scores and polygons are looked up defensively in case the
                # three sequences differ in length.
                conf = rec_scores[i] if i < len(rec_scores) else 0.0
                box = dt_polys[i].tolist() if i < len(dt_polys) else []
                boxes.append({
                    'text': text,
                    'confidence': float(conf),
                    'box': box
                })

            # len() rather than truthiness so array-like scores are handled.
            score_count = len(rec_scores)
            avg_conf = sum(rec_scores) / score_count if score_count else 0.0
            return OCRResult(
                text='\n'.join(rec_texts),
                confidence=float(avg_conf),
                boxes=boxes
            )
        except Exception as e:
            print(f"PaddleOCR error: {e}, falling back to Tesseract")
            if TESSERACT_AVAILABLE:
                return self._tesseract_recognize(image)
            raise

    def _tesseract_recognize(self, image: np.ndarray) -> OCRResult:
        """Recognize text using Tesseract.

        The returned confidence is the mean of the positive per-entry
        confidences divided by 100, or 0.0 when there are none. ``boxes``
        is always empty for this engine.
        """
        global pytesseract
        # Lazy import pytesseract
        if pytesseract is None:
            print("Importing pytesseract...")
            import pytesseract as _pytesseract
            pytesseract = _pytesseract

        config = '--psm 6 -l ron+eng'
        text = pytesseract.image_to_string(image, config=config)
        data = pytesseract.image_to_data(
            image, config=config,
            output_type=pytesseract.Output.DICT
        )

        confidences = []
        for raw_conf in data['conf']:
            # Parse as float: confidences may arrive as ints, floats or
            # strings such as '96.53', which int() would reject.
            try:
                value = float(raw_conf)
            except (TypeError, ValueError):
                continue
            if value > 0:
                confidences.append(value)
        avg_conf = sum(confidences) / len(confidences) / 100 if confidences else 0.0
        return OCRResult(text=text, confidence=avg_conf, boxes=[])

    @staticmethod
    def get_available_engines() -> List[str]:
        """Return the names of the OCR engines whose packages were found."""
        engines = []
        if PADDLE_AVAILABLE:
            engines.append('paddleocr')
        if TESSERACT_AVAILABLE:
            engines.append('tesseract')
        return engines

View File

@@ -0,0 +1,231 @@
"""Extract structured fields from OCR text (Romanian receipts)."""
import re
from datetime import date, datetime
from decimal import Decimal, InvalidOperation
from typing import Optional, Tuple
from dataclasses import dataclass, field
@dataclass
class ExtractionResult:
    """Fields pulled out of a receipt, with per-field confidence scores."""

    # Document identification
    receipt_type: str = 'bon_fiscal'
    receipt_number: Optional[str] = None
    receipt_series: Optional[str] = None
    receipt_date: Optional[date] = None

    # Amount and vendor details
    amount: Optional[Decimal] = None
    partner_name: Optional[str] = None
    cui: Optional[str] = None
    description: Optional[str] = None

    # Confidence of the three fields that feed overall_confidence
    confidence_amount: float = 0.0
    confidence_date: float = 0.0
    confidence_vendor: float = 0.0

    # OCR text the fields were extracted from
    raw_text: str = ""

    @property
    def overall_confidence(self) -> float:
        """Weighted mean of the amount, date and vendor confidences.

        Weights are 0.4 (amount), 0.3 (date) and 0.3 (vendor); the result is
        rounded to two decimals.
        """
        amount_weight, date_weight, vendor_weight = 0.4, 0.3, 0.3
        weighted_sum = (
            self.confidence_amount * amount_weight +
            self.confidence_date * date_weight +
            self.confidence_vendor * vendor_weight
        )
        return round(weighted_sum, 2)
class ReceiptExtractor:
    """Extract receipt fields using pattern matching for Romanian receipts.

    Each ``*_PATTERNS`` table is a list of ``(regex, confidence)`` pairs that
    is tried in order, so the most specific patterns come first.
    """

    # Earliest year accepted as a plausible receipt date.
    MIN_RECEIPT_YEAR = 2020

    # Number of leading lines inspected when looking for the vendor name.
    VENDOR_SEARCH_LINES = 7

    # Total amount patterns (most specific first).
    # - ``\b`` keeps TOTAL from matching inside SUBTOTAL.
    # - The captured amount must start with a digit and cannot span lines,
    #   so digits from the following line are never glued onto it.
    TOTAL_PATTERNS = [
        (r'\bTOTAL\s*:?\s*(\d[\d .,]*)\s*(?:RON|LEI)?', 0.95),
        (r'\bTOTAL\s+(?:RON|LEI)\s*(\d[\d .,]*)', 0.95),
        (r'\bDE\s+PLATA\s*:?\s*(\d[\d .,]*)', 0.90),
        (r'\bSUMA\s*:?\s*(\d[\d .,]*)', 0.85),
        (r'\bPLATA\s+CARD\s*:?\s*(\d[\d .,]*)', 0.85),
        (r'\bNUMERAR\s*:?\s*(\d[\d .,]*)', 0.80),
    ]

    # Date patterns
    DATE_PATTERNS = [
        (r'DATA\s*:?\s*(\d{2}[./]\d{2}[./]\d{4})', 0.95),
        (r'(\d{2}[./]\d{2}[./]\d{4})\s+\d{2}:\d{2}', 0.90),
        (r'(\d{2}[./]\d{2}[./]\d{4})', 0.80),
        (r'(\d{4}[./]\d{2}[./]\d{2})', 0.75),  # YYYY.MM.DD format
    ]

    # Receipt number patterns
    NUMBER_PATTERNS = [
        (r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95),
        (r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95),
        (r'CHITANTA\s+NR\.?\s*:?\s*(\d+)', 0.95),
        (r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90),
        (r'NR\.?\s*:?\s*(\d{4,})', 0.70),
    ]

    # CUI (fiscal code) patterns
    CUI_PATTERNS = [
        (r'C\.?U\.?I\.?\s*:?\s*(?:RO)?(\d{6,10})', 0.95),
        (r'C\.?I\.?F\.?\s*:?\s*(?:RO)?(\d{6,10})', 0.95),
        (r'COD\s+FISCAL\s*:?\s*(?:RO)?(\d{6,10})', 0.90),
        (r'(?:RO)?(\d{6,10})\s*-?\s*(?:J|CUI)', 0.80),
    ]

    # Series patterns
    SERIES_PATTERNS = [
        (r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90),
        (r'([A-Z]{2,4})\s+NR\.?\s*\d+', 0.80),
    ]

    # Words that mark a header line as "not the vendor name".
    VENDOR_SKIP_KEYWORDS = (
        'BON', 'FISCAL', 'TOTAL', 'DATA', 'NR', 'ORA',
        'SUBTOTAL', 'TVA', 'PLATA', 'CARD', 'NUMERAR',
        'RON', 'LEI', 'CHITANTA', 'REST',
    )

    # A keyword only counts when it is not embedded in a longer alphabetic
    # word, so "REST" does not reject "RESTAURANT" and "ORA" does not reject
    # "CORA". ``[^\W\d_]`` matches a letter; adjacent digits or punctuation
    # (e.g. "NR.123", "NR123") still count as a keyword hit.
    _VENDOR_SKIP_RE = re.compile(
        r'(?<![^\W\d_])(?:' + '|'.join(VENDOR_SKIP_KEYWORDS) + r')(?![^\W\d_])',
        re.IGNORECASE,
    )

    def extract(self, text: str) -> ExtractionResult:
        """Extract all fields from OCR text.

        Args:
            text: Raw OCR text of the receipt.

        Returns:
            An ``ExtractionResult`` with every field that could be found;
            fields that were not found keep their defaults.
        """
        result = ExtractionResult()
        result.raw_text = text
        text_upper = text.upper()

        result.amount, result.confidence_amount = self._extract_amount(text_upper)
        result.receipt_date, result.confidence_date = self._extract_date(text_upper)
        result.receipt_number, _ = self._extract_number(text_upper)
        result.receipt_series, _ = self._extract_series(text_upper)
        # The vendor name is taken from the original text to keep its casing.
        result.partner_name, result.confidence_vendor = self._extract_vendor(text)
        result.cui, _ = self._extract_cui(text_upper)
        result.receipt_type = self._detect_receipt_type(text_upper)
        return result

    def _extract_amount(self, text: str) -> Tuple[Optional[Decimal], float]:
        """Extract the total amount from text.

        Every occurrence of every pattern is tried in order; the first one
        that parses to a positive amount wins.

        Returns:
            ``(amount, confidence)``, or ``(None, 0.0)`` if nothing matched.
        """
        flags = re.IGNORECASE | re.MULTILINE
        for pattern, confidence in self.TOTAL_PATTERNS:
            for match in re.finditer(pattern, text, flags):
                amount = self._parse_amount(match.group(1))
                if amount is not None and amount > 0:
                    return amount, confidence
        return None, 0.0

    def _parse_amount(self, raw: str) -> Optional[Decimal]:
        """Parse a captured amount string, returning None if it is invalid."""
        cleaned = re.sub(r'[^\d.,]', '', raw)
        try:
            return Decimal(self._normalize_number(cleaned))
        except (InvalidOperation, ValueError):
            return None

    def _normalize_number(self, num_str: str) -> str:
        """Normalize a printed number to a plain decimal string.

        Handles Romanian (``1.234,56``) and English (``1,234.56``) grouping.
        When both separators are present, the one that appears last is the
        decimal separator.

        Args:
            num_str: Number text containing digits, spaces, '.' and ','.

        Returns:
            The number with grouping separators removed and '.' as the
            decimal separator.
        """
        # Drop spaces and any trailing separator ("12,50." -> "12,50").
        num_str = num_str.replace(' ', '').rstrip('.,')
        has_comma = ',' in num_str
        has_dot = '.' in num_str

        if has_comma and has_dot:
            if num_str.rfind(',') > num_str.rfind('.'):
                # Romanian format: 1.234,56
                return num_str.replace('.', '').replace(',', '.')
            # English format: 1,234.56
            return num_str.replace(',', '')

        if has_comma:
            parts = num_str.split(',')
            if len(parts) == 2 and len(parts[1]) <= 2:
                # Decimal comma: 1,50
                return num_str.replace(',', '.')
            # Thousands comma: 1,234
            return num_str.replace(',', '')

        if has_dot:
            parts = num_str.split('.')
            if len(parts) > 2:
                if len(parts[-1]) == 3:
                    # Every dot is a thousands separator: 1.234.567 -> 1234567
                    return ''.join(parts)
                # Last dot is the decimal point: 1.234.56 -> 1234.56
                return ''.join(parts[:-1]) + '.' + parts[-1]
        return num_str

    def _extract_date(self, text: str) -> Tuple[Optional[date], float]:
        """Extract the receipt date from text.

        A candidate is accepted only if it parses as a real date, is not in
        the future and is not earlier than ``MIN_RECEIPT_YEAR``.

        Returns:
            ``(date, confidence)``, or ``(None, 0.0)`` if nothing matched.
        """
        today = date.today()
        for pattern, confidence in self.DATE_PATTERNS:
            for match in re.finditer(pattern, text):
                parsed = self._parse_date(match.group(1).replace('/', '.'))
                if parsed is None:
                    continue
                if parsed <= today and parsed.year >= self.MIN_RECEIPT_YEAR:
                    return parsed, confidence
        return None, 0.0

    @staticmethod
    def _parse_date(date_str: str) -> Optional[date]:
        """Parse DD.MM.YYYY or YYYY.MM.DD, returning None if neither fits."""
        for fmt in ('%d.%m.%Y', '%Y.%m.%d'):
            try:
                return datetime.strptime(date_str, fmt).date()
            except ValueError:
                continue
        return None

    def _extract_number(self, text: str) -> Tuple[Optional[str], float]:
        """Extract the receipt number; returns ``(None, 0.0)`` if absent."""
        for pattern, confidence in self.NUMBER_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1), confidence
        return None, 0.0

    def _extract_series(self, text: str) -> Tuple[Optional[str], float]:
        """Extract the receipt series; returns ``(None, 0.0)`` if absent."""
        for pattern, confidence in self.SERIES_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1).upper(), confidence
        return None, 0.0

    def _extract_vendor(self, text: str) -> Tuple[Optional[str], float]:
        """Extract the vendor/partner name from the top of the receipt.

        The first of the leading ``VENDOR_SEARCH_LINES`` lines that is not
        blank, not purely numeric and free of skip keywords is used.

        Returns:
            ``(vendor, confidence)``, or ``(None, 0.0)`` if no line
            qualified. Confidence starts at 0.8 for the first line and
            drops by 0.1 per line, never below 0.3.
        """
        lines = text.split('\n')
        for i, line in enumerate(lines[:self.VENDOR_SEARCH_LINES]):
            line = line.strip()
            # Skip empty lines
            if not line:
                continue
            # Skip lines that are just numbers
            if re.match(r'^[\d.,\s]+$', line):
                continue
            # Skip lines containing a receipt keyword as a whole word
            if self._VENDOR_SKIP_RE.search(line):
                continue
            # Remove characters that are unlikely to belong to a name
            vendor = re.sub(r'[^\w\s.,&-]', '', line).strip()
            if len(vendor) >= 3:
                confidence = max(0.3, 0.8 - (i * 0.1))
                return vendor, confidence
        return None, 0.0

    def _extract_cui(self, text: str) -> Tuple[Optional[str], float]:
        """Extract the CUI (fiscal identification code) from text.

        Returns:
            ``(cui, confidence)``, or ``(None, 0.0)`` if nothing matched.
        """
        for pattern, confidence in self.CUI_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                cui = match.group(1)
                # Guard kept in case a pattern is edited to allow other
                # lengths than the 6-10 digits the current ones capture.
                if 6 <= len(cui) <= 10:
                    return cui, confidence
        return None, 0.0

    def _detect_receipt_type(self, text: str) -> str:
        """Return 'chitanta' if the text mentions it, else 'bon_fiscal'."""
        if 'CHITANTA' in text or 'CHITANȚĂ' in text:
            return 'chitanta'
        return 'bon_fiscal'

View File

@@ -0,0 +1,110 @@
"""Main OCR service coordinating preprocessing, recognition, and extraction."""
import os
# Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
import asyncio
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Optional, Tuple
from app.services.ocr_engine import OCREngine
from app.services.ocr_extractor import ReceiptExtractor, ExtractionResult
from app.services.image_preprocessor import ImagePreprocessor
class OCRService:
    """Service for OCR processing of receipt images.

    Coordinates image loading/preprocessing, text recognition and field
    extraction. The blocking work runs on a small thread pool shared by all
    instances so it does not block the event loop.
    """

    # Shared by all instances; at most two OCR jobs run at the same time.
    _executor = ThreadPoolExecutor(max_workers=2)

    def __init__(self):
        self.preprocessor = ImagePreprocessor()
        self.ocr_engine = OCREngine()
        self.extractor = ReceiptExtractor()

    async def process_image(
        self,
        image_path: Path,
        mime_type: str
    ) -> Tuple[bool, str, Optional[ExtractionResult]]:
        """Process a receipt image and extract structured data.

        Args:
            image_path: Path to the image file.
            mime_type: MIME type of the file.

        Returns:
            Tuple of ``(success, message, extraction_result)``. Failures are
            reported through the tuple instead of being raised; in that case
            ``extraction_result`` is ``None``.
        """
        try:
            # get_running_loop() is the documented way to obtain the loop
            # from inside a coroutine.
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(
                self._executor,
                self._process_sync,
                image_path,
                mime_type
            )
        except Exception as e:
            # Boundary of the service: turn any unexpected error into a
            # failure result for the caller.
            return False, f"OCR processing failed: {str(e)}", None

    def _process_sync(
        self,
        image_path: Path,
        mime_type: str
    ) -> Tuple[bool, str, Optional[ExtractionResult]]:
        """Run the full pipeline synchronously (executed in the thread pool).

        Returns:
            The same ``(success, message, extraction_result)`` tuple as
            ``process_image``.
        """
        # Load the input: first page of a PDF, or the image itself.
        if mime_type == 'application/pdf':
            try:
                images = self.preprocessor.pdf_to_images(image_path)
                if not images:
                    return False, "Failed to extract images from PDF", None
                image = images[0]  # Process first page only
            except RuntimeError as e:
                return False, str(e), None
        else:
            try:
                image = self.preprocessor.load_image(image_path)
            except ValueError as e:
                return False, str(e), None

        # Preprocess image
        processed = self.preprocessor.preprocess(image)

        # Perform OCR
        try:
            ocr_result = self.ocr_engine.recognize(processed)
        except RuntimeError as e:
            return False, str(e), None
        if not ocr_result.text:
            return False, "No text detected in image", None

        # Extract structured fields
        extraction = self.extractor.extract(ocr_result.text)

        # Build result message listing the fields that were found
        fields_found = []
        if extraction.amount:
            fields_found.append("amount")
        if extraction.receipt_date:
            fields_found.append("date")
        if extraction.partner_name:
            fields_found.append("vendor")
        if extraction.cui:
            fields_found.append("CUI")
        if extraction.receipt_number:
            fields_found.append("number")
        message = f"OCR processing successful. Found: {', '.join(fields_found) or 'no fields'}"
        return True, message, extraction
# Module-level shared instance, created when this module is imported.
ocr_service = OCRService()