feat: Add OCR integration for automatic receipt data extraction

Implement Tesseract-based OCR to automatically extract vendor name, date, total amount, and VAT from uploaded receipt images/PDFs, reducing manual data entry and improving accuracy.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-12 11:48:29 +02:00
parent 5960154094
commit 41ae97180e
16 changed files with 2773 additions and 32 deletions
--- a/data-entry-app/backend/app/routers/ocr.py
+++ b/data-entry-app/backend/app/routers/ocr.py
@@ -0,0 +1,156 @@
+"""OCR API endpoints."""
+
+import os
+import tempfile
+from pathlib import Path
+
+from fastapi import APIRouter, HTTPException, UploadFile, File, Depends
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db.database import get_session
+from app.db.crud.attachment import AttachmentCRUD
+from app.services.ocr_service import ocr_service
+from app.services.ocr_engine import OCREngine
+from app.schemas.ocr import OCRResponse, OCRStatusResponse, ExtractionData
+
+# Router on which the OCR endpoint handlers in this module are registered.
+router = APIRouter()
+
+
+@router.get("/status", response_model=OCRStatusResponse)
+async def get_ocr_status():
+    """Report whether any OCR engine is usable and list the engines found."""
+    engine_names = OCREngine.get_available_engines()
+    has_engine = len(engine_names) > 0
+
+    # Choose the human-readable status line before building the response.
+    status_text = (
+        f"OCR service ready with engines: {', '.join(engine_names)}"
+        if has_engine
+        else "No OCR engines available. Install PaddleOCR or Tesseract."
+    )
+
+    return OCRStatusResponse(
+        available=has_engine,
+        engines=engine_names,
+        message=status_text,
+    )
+
+
+@router.post("/extract", response_model=OCRResponse)
+async def extract_from_image(file: UploadFile = File(...)):
+    """
+    Extract receipt data from uploaded image.
+
+    Accepts JPG, PNG, or PDF files (max 10MB).
+    Returns extracted fields with confidence scores.
+
+    Raises:
+        HTTPException: 400 if the content type is not supported or the
+            upload exceeds 10MB; 422 if the OCR service reports a failure.
+    """
+    allowed_types = ['image/jpeg', 'image/png', 'application/pdf']
+    max_size = 10 * 1024 * 1024  # 10MB upload limit
+
+    if file.content_type not in allowed_types:
+        raise HTTPException(
+            status_code=400,
+            detail=f"File type not supported: {file.content_type}. Allowed: JPG, PNG, PDF"
+        )
+
+    # Derive the temp-file suffix from the uploaded filename, defaulting to
+    # .jpg when the name is missing or has an unexpected extension.
+    suffix = Path(file.filename).suffix.lower() if file.filename else '.jpg'
+    if suffix not in ['.jpg', '.jpeg', '.png', '.pdf']:
+        suffix = '.jpg'
+
+    # Validate the size BEFORE creating the temp file, so a rejected upload
+    # never leaves an orphaned file on disk. Reading one byte past the limit
+    # is enough to detect an oversized upload without loading all of it.
+    content = await file.read(max_size + 1)
+    if len(content) > max_size:
+        raise HTTPException(
+            status_code=400,
+            detail="File too large. Maximum size is 10MB."
+        )
+
+    tmp_path = None
+    try:
+        # delete=False keeps the file on disk after the `with` block closes
+        # it; removal is handled by the `finally` clause below.
+        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+            tmp_path = Path(tmp.name)
+            tmp.write(content)
+
+        success, message, result = await ocr_service.process_image(
+            tmp_path, file.content_type
+        )
+
+        if not success:
+            raise HTTPException(status_code=422, detail=message)
+
+        # Convert ExtractionResult to ExtractionData schema
+        data = ExtractionData(
+            receipt_type=result.receipt_type,
+            receipt_number=result.receipt_number,
+            receipt_series=result.receipt_series,
+            receipt_date=result.receipt_date,
+            amount=result.amount,
+            partner_name=result.partner_name,
+            cui=result.cui,
+            description=result.description,
+            confidence_amount=result.confidence_amount,
+            confidence_date=result.confidence_date,
+            confidence_vendor=result.confidence_vendor,
+            overall_confidence=result.overall_confidence,
+            raw_text=result.raw_text,
+        )
+
+        return OCRResponse(success=True, message=message, data=data)
+
+    finally:
+        # Clean up the temp file on every exit path, including write errors.
+        if tmp_path is not None and tmp_path.exists():
+            os.unlink(tmp_path)
+
+
+@router.post("/extract-attachment/{attachment_id}", response_model=OCRResponse)
+async def extract_from_attachment(
+    attachment_id: int,
+    session: AsyncSession = Depends(get_session),
+):
+    """
+    Run OCR on a previously stored attachment and return the extracted data.
+
+    The attachment is looked up by id, its file is located on disk, and the
+    file is passed to the OCR service together with the stored MIME type.
+
+    Raises:
+        HTTPException: 404 if the attachment record or its file is missing;
+            400 if the MIME type is not JPEG, PNG, or PDF; 422 if the OCR
+            service reports a failure.
+    """
+    supported_mime_types = ('image/jpeg', 'image/png', 'application/pdf')
+    # Attributes copied one-to-one from the OCR result into the response schema.
+    copied_fields = (
+        "receipt_type",
+        "receipt_number",
+        "receipt_series",
+        "receipt_date",
+        "amount",
+        "partner_name",
+        "cui",
+        "description",
+        "confidence_amount",
+        "confidence_date",
+        "confidence_vendor",
+        "overall_confidence",
+        "raw_text",
+    )
+
+    attachment = await AttachmentCRUD.get_by_id(session, attachment_id)
+    if not attachment:
+        raise HTTPException(status_code=404, detail="Attachment not found")
+
+    stored_path = AttachmentCRUD.get_file_path(attachment)
+    if not stored_path.exists():
+        raise HTTPException(status_code=404, detail="File not found on disk")
+
+    # Reject stored files the OCR pipeline is not meant to handle.
+    if attachment.mime_type not in supported_mime_types:
+        raise HTTPException(
+            status_code=400,
+            detail=f"File type not supported for OCR: {attachment.mime_type}"
+        )
+
+    ok, status_message, extraction = await ocr_service.process_image(
+        stored_path, attachment.mime_type
+    )
+    if not ok:
+        raise HTTPException(status_code=422, detail=status_message)
+
+    # Build the ExtractionData schema from the ExtractionResult attributes.
+    payload = ExtractionData(
+        **{name: getattr(extraction, name) for name in copied_fields}
+    )
+
+    return OCRResponse(success=True, message=status_message, data=payload)