feat: Add OCR integration for automatic receipt data extraction
Implement Tesseract-based OCR to automatically extract vendor name, date, total amount, and VAT from uploaded receipt images/PDFs, reducing manual data entry and improving accuracy.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
110
data-entry-app/backend/app/services/ocr_service.py
Normal file
110
data-entry-app/backend/app/services/ocr_service.py
Normal file
@@ -0,0 +1,110 @@
|
||||
"""Main OCR service coordinating preprocessing, recognition, and extraction."""
|
||||
|
||||
import os
|
||||
# Disable the PaddleOCR model source check for faster startup (PaddleX 3.x). This must be set before PaddleOCR/PaddleX is first imported, so it precedes the remaining imports.
|
||||
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
|
||||
|
||||
import asyncio
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from pathlib import Path
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from app.services.ocr_engine import OCREngine
|
||||
from app.services.ocr_extractor import ReceiptExtractor, ExtractionResult
|
||||
from app.services.image_preprocessor import ImagePreprocessor
|
||||
|
||||
|
||||
class OCRService:
    """Service for OCR processing of receipt images.

    Coordinates three collaborators: an ``ImagePreprocessor`` (loading,
    PDF rasterisation, preprocessing), an ``OCREngine`` (text recognition)
    and a ``ReceiptExtractor`` (structured field extraction from the
    recognised text).
    """

    # Class attribute, so the pool is shared by every OCRService instance.
    # With two workers, at most two submitted jobs run at the same time.
    _executor = ThreadPoolExecutor(max_workers=2)

    def __init__(self):
        """Instantiate the preprocessor, OCR engine and field extractor."""
        self.preprocessor = ImagePreprocessor()
        self.ocr_engine = OCREngine()
        self.extractor = ReceiptExtractor()

    async def process_image(
        self,
        image_path: Path,
        mime_type: str
    ) -> Tuple[bool, str, Optional[ExtractionResult]]:
        """
        Process receipt image and extract structured data.

        The blocking work is delegated to ``_process_sync`` on the shared
        thread pool so the event loop is not blocked while it runs.

        Args:
            image_path: Path to the image file
            mime_type: MIME type of the file

        Returns:
            Tuple of (success, message, extraction_result). Any ``Exception``
            raised while processing is converted into
            ``(False, "OCR processing failed: ...", None)`` rather than
            propagated.
        """
        # Inside a coroutine a loop is always running; get_running_loop() is
        # the documented replacement for get_event_loop() in this context.
        loop = asyncio.get_running_loop()
        try:
            return await loop.run_in_executor(
                self._executor,
                self._process_sync,
                image_path,
                mime_type,
            )
        except Exception as e:
            # Deliberate catch-all at the service boundary: callers always
            # receive a result tuple instead of an exception.
            return False, f"OCR processing failed: {e}", None

    def _process_sync(
        self,
        image_path: Path,
        mime_type: str
    ) -> Tuple[bool, str, Optional[ExtractionResult]]:
        """Synchronous processing (runs in thread pool).

        Steps: load the image (first page only for PDFs), preprocess it,
        run OCR, then extract structured fields from the recognised text.

        Args:
            image_path: Path to the image or PDF file
            mime_type: MIME type of the file; ``'application/pdf'`` selects
                the PDF path, anything else is loaded as an image

        Returns:
            Tuple of (success, message, extraction_result). On failure the
            result is ``None`` and the message describes the problem.
            ``RuntimeError`` from PDF conversion or recognition and
            ``ValueError`` from image loading are reported this way; other
            exceptions propagate to the caller.
        """
        # Handle PDF
        if mime_type == 'application/pdf':
            try:
                images = self.preprocessor.pdf_to_images(image_path)
            except RuntimeError as e:
                return False, str(e), None
            if not images:
                return False, "Failed to extract images from PDF", None
            image = images[0]  # Process first page only
        else:
            try:
                image = self.preprocessor.load_image(image_path)
            except ValueError as e:
                return False, str(e), None

        # Preprocess image
        processed = self.preprocessor.preprocess(image)

        # Perform OCR
        try:
            ocr_result = self.ocr_engine.recognize(processed)
        except RuntimeError as e:
            return False, str(e), None

        if not ocr_result.text:
            return False, "No text detected in image", None

        # Extract structured fields
        extraction = self.extractor.extract(ocr_result.text)

        # Build result message: list the label of every field with a truthy
        # value, in a fixed order.
        labelled_values = (
            ("amount", extraction.amount),
            ("date", extraction.receipt_date),
            ("vendor", extraction.partner_name),
            ("CUI", extraction.cui),
            ("number", extraction.receipt_number),
        )
        fields_found = [label for label, value in labelled_values if value]

        message = f"OCR processing successful. Found: {', '.join(fields_found) or 'no fields'}"

        return True, message, extraction
|
||||
|
||||
|
||||
# Singleton instance
|
||||
# Built at import time: importing this module runs OCRService.__init__, which
# instantiates the preprocessor, OCR engine and extractor. Every importer of
# this name shares the same object.
ocr_service = OCRService()
|
||||
Reference in New Issue
Block a user