Implement Tesseract-based OCR to automatically extract vendor name, date, total amount, and VAT from uploaded receipt images/PDFs, reducing manual data entry and improving accuracy. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
111 lines
3.4 KiB
Python
111 lines
3.4 KiB
Python
"""Main OCR service coordinating preprocessing, recognition, and extraction."""
|
|
|
|
import os
|
|
# Turn off the PaddleOCR model source check (PaddleX 3.x) for faster startup.
# Order matters: the variable must be set before import.
os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"
|
|
|
|
import asyncio
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
from pathlib import Path
|
|
from typing import Optional, Tuple
|
|
|
|
from app.services.ocr_engine import OCREngine
|
|
from app.services.ocr_extractor import ReceiptExtractor, ExtractionResult
|
|
from app.services.image_preprocessor import ImagePreprocessor
|
|
|
|
|
|
class OCRService:
    """Service for OCR processing of receipt images.

    Chains three collaborators: an image preprocessor, an OCR engine, and a
    receipt field extractor. The blocking pipeline runs on a shared thread
    pool so that ``process_image`` can be awaited from async code.
    """

    # Class-level pool shared by every instance; at most two OCR jobs run
    # concurrently.
    _executor = ThreadPoolExecutor(max_workers=2)

    def __init__(self):
        """Instantiate the preprocessor, OCR engine, and field extractor."""
        self.preprocessor = ImagePreprocessor()
        self.ocr_engine = OCREngine()
        self.extractor = ReceiptExtractor()

    async def process_image(
        self,
        image_path: Path,
        mime_type: str
    ) -> Tuple[bool, str, Optional[ExtractionResult]]:
        """
        Process receipt image and extract structured data.

        Runs ``_process_sync`` on the shared thread pool. This coroutine does
        not propagate ordinary exceptions: any ``Exception`` raised by the
        pipeline is converted into a failure tuple.

        Args:
            image_path: Path to the image file
            mime_type: MIME type of the file

        Returns:
            Tuple of (success, message, extraction_result). On failure the
            extraction result is ``None`` and the message describes the error.
        """
        try:
            # get_running_loop() is the recommended call inside a coroutine;
            # it always returns the loop that is executing this task.
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(
                self._executor,
                self._process_sync,
                image_path,
                mime_type,
            )
        except Exception as e:
            # Boundary handler: callers receive a failure tuple, never a raise.
            return False, f"OCR processing failed: {e}", None

    def _process_sync(
        self,
        image_path: Path,
        mime_type: str
    ) -> Tuple[bool, str, Optional[ExtractionResult]]:
        """Synchronous processing (runs in thread pool).

        Loads the input (first page only for PDFs), preprocesses it, runs OCR,
        and extracts structured fields from the recognized text.

        Args:
            image_path: Path to the image or PDF file
            mime_type: MIME type of the file; ``'application/pdf'`` selects
                the PDF branch, anything else is loaded as an image

        Returns:
            Tuple of (success, message, extraction_result). The extraction
            result is ``None`` whenever success is ``False``.
        """
        # Load the source image.
        if mime_type == 'application/pdf':
            try:
                images = self.preprocessor.pdf_to_images(image_path)
            except RuntimeError as e:
                return False, str(e), None
            if not images:
                return False, "Failed to extract images from PDF", None
            image = images[0]  # Process first page only
        else:
            try:
                image = self.preprocessor.load_image(image_path)
            except ValueError as e:
                return False, str(e), None

        # Preprocess image. Errors raised here are not handled locally; they
        # propagate to the caller (process_image turns them into a failure).
        processed = self.preprocessor.preprocess(image)

        # Perform OCR
        try:
            ocr_result = self.ocr_engine.recognize(processed)
        except RuntimeError as e:
            return False, str(e), None

        if not ocr_result.text:
            return False, "No text detected in image", None

        # Extract structured fields
        extraction = self.extractor.extract(ocr_result.text)

        # Build result message: list the label of every field with a truthy
        # value, in a fixed order.
        # NOTE(review): the truthiness test means a zero amount would be
        # reported as not found - confirm that is intended.
        labelled_values = (
            ("amount", extraction.amount),
            ("date", extraction.receipt_date),
            ("vendor", extraction.partner_name),
            ("CUI", extraction.cui),
            ("number", extraction.receipt_number),
        )
        fields_found = [label for label, value in labelled_values if value]

        message = f"OCR processing successful. Found: {', '.join(fields_found) or 'no fields'}"

        return True, message, extraction
|
|
|
|
|
|
# Shared module-level instance, constructed once when this module is imported.
ocr_service = OCRService()
|