feat: Add OCR integration for automatic receipt data extraction

Implement Tesseract-based OCR to automatically extract vendor name,
date, total amount, and VAT from uploaded receipt images/PDFs,
reducing manual data entry and improving accuracy.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-12 11:48:29 +02:00
parent 5960154094
commit 41ae97180e
16 changed files with 2773 additions and 32 deletions

View File

@@ -0,0 +1,110 @@
"""Main OCR service coordinating preprocessing, recognition, and extraction."""
import os
# Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
import asyncio
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Optional, Tuple
from app.services.ocr_engine import OCREngine
from app.services.ocr_extractor import ReceiptExtractor, ExtractionResult
from app.services.image_preprocessor import ImagePreprocessor
class OCRService:
"""Service for OCR processing of receipt images."""
_executor = ThreadPoolExecutor(max_workers=2)
def __init__(self):
self.preprocessor = ImagePreprocessor()
self.ocr_engine = OCREngine()
self.extractor = ReceiptExtractor()
async def process_image(
self,
image_path: Path,
mime_type: str
) -> Tuple[bool, str, Optional[ExtractionResult]]:
"""
Process receipt image and extract structured data.
Args:
image_path: Path to the image file
mime_type: MIME type of the file
Returns:
Tuple of (success, message, extraction_result)
"""
try:
loop = asyncio.get_event_loop()
result = await loop.run_in_executor(
self._executor,
self._process_sync,
image_path,
mime_type
)
return result
except Exception as e:
return False, f"OCR processing failed: {str(e)}", None
def _process_sync(
self,
image_path: Path,
mime_type: str
) -> Tuple[bool, str, Optional[ExtractionResult]]:
"""Synchronous processing (runs in thread pool)."""
# Handle PDF
if mime_type == 'application/pdf':
try:
images = self.preprocessor.pdf_to_images(image_path)
if not images:
return False, "Failed to extract images from PDF", None
image = images[0] # Process first page only
except RuntimeError as e:
return False, str(e), None
else:
try:
image = self.preprocessor.load_image(image_path)
except ValueError as e:
return False, str(e), None
# Preprocess image
processed = self.preprocessor.preprocess(image)
# Perform OCR
try:
ocr_result = self.ocr_engine.recognize(processed)
except RuntimeError as e:
return False, str(e), None
if not ocr_result.text:
return False, "No text detected in image", None
# Extract structured fields
extraction = self.extractor.extract(ocr_result.text)
# Build result message
fields_found = []
if extraction.amount:
fields_found.append("amount")
if extraction.receipt_date:
fields_found.append("date")
if extraction.partner_name:
fields_found.append("vendor")
if extraction.cui:
fields_found.append("CUI")
if extraction.receipt_number:
fields_found.append("number")
message = f"OCR processing successful. Found: {', '.join(fields_found) or 'no fields'}"
return True, message, extraction
# Singleton instance
ocr_service = OCRService()