feat: Add OCR integration for automatic receipt data extraction
Implement Tesseract-based OCR to automatically extract vendor name, date, total amount, and VAT from uploaded receipt images/PDFs, reducing manual data entry and improving accuracy. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
84
data-entry-app/backend/app/schemas/ocr.py
Normal file
84
data-entry-app/backend/app/schemas/ocr.py
Normal file
@@ -0,0 +1,84 @@
|
||||
"""Pydantic schemas for OCR API."""
|
||||
|
||||
from datetime import date
|
||||
from decimal import Decimal
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class ExtractionData(BaseModel):
    """Receipt fields extracted by OCR, with per-field confidence scores.

    Every extracted value is optional and defaults to ``None``; only
    ``receipt_type`` (default ``'bon_fiscal'``) and ``raw_text`` (default
    ``""``) always carry a value. All confidence scores are floats
    constrained to the closed interval [0, 1] and default to ``0.0``.
    """

    # --- Extracted receipt fields ---
    receipt_type: str = Field(default='bon_fiscal', description="Receipt type: bon_fiscal or chitanta")
    receipt_number: Optional[str] = Field(default=None, description="Receipt number")
    receipt_series: Optional[str] = Field(default=None, description="Receipt series")
    receipt_date: Optional[date] = Field(default=None, description="Receipt date")
    amount: Optional[Decimal] = Field(default=None, description="Total amount")
    partner_name: Optional[str] = Field(default=None, description="Vendor/partner name")
    cui: Optional[str] = Field(default=None, description="CUI (fiscal identification code)")
    description: Optional[str] = Field(default=None, description="Optional description")

    # --- Extraction confidence, each validated to lie in [0, 1] ---
    confidence_amount: float = Field(default=0.0, ge=0, le=1, description="Amount extraction confidence")
    confidence_date: float = Field(default=0.0, ge=0, le=1, description="Date extraction confidence")
    confidence_vendor: float = Field(default=0.0, ge=0, le=1, description="Vendor extraction confidence")
    overall_confidence: float = Field(default=0.0, ge=0, le=1, description="Overall confidence score")
    raw_text: str = Field(default="", description="Raw OCR text")

    # Pydantic v2 configuration. The nested ``class Config`` form is deprecated
    # in v2; ``json_schema_extra`` is a v2 key, so the model already targets v2.
    # A plain dict is used so no extra import is required.
    model_config = {
        "json_schema_extra": {
            "example": {
                "receipt_type": "bon_fiscal",
                "receipt_number": "12345",
                "receipt_series": None,
                "receipt_date": "2024-01-15",
                "amount": 125.50,
                "partner_name": "MEGA IMAGE SRL",
                "cui": "12345678",
                "description": None,
                "confidence_amount": 0.95,
                "confidence_date": 0.90,
                "confidence_vendor": 0.75,
                "overall_confidence": 0.87,
                "raw_text": "BON FISCAL\nMEGA IMAGE SRL\n...",
            }
        }
    }
|
||||
|
||||
|
||||
class OCRResponse(BaseModel):
    """Envelope returned by the OCR API.

    ``success`` and ``message`` are required. ``data`` holds the extracted
    receipt fields and defaults to ``None`` when no extraction is attached.
    """

    success: bool = Field(description="Whether OCR processing was successful")
    message: str = Field(description="Status message")
    data: Optional[ExtractionData] = Field(default=None, description="Extracted data")

    # Pydantic v2 configuration. The nested ``class Config`` form is deprecated
    # in v2; ``json_schema_extra`` is a v2 key, so the model already targets v2.
    # A plain dict is used so no extra import is required.
    model_config = {
        "json_schema_extra": {
            "example": {
                "success": True,
                "message": "OCR processing successful. Found: amount, date, vendor",
                "data": {
                    "receipt_type": "bon_fiscal",
                    "receipt_number": "12345",
                    "receipt_date": "2024-01-15",
                    "amount": 125.50,
                    "partner_name": "MEGA IMAGE SRL",
                    "cui": "12345678",
                    "confidence_amount": 0.95,
                    "confidence_date": 0.90,
                    "confidence_vendor": 0.75,
                    "overall_confidence": 0.87,
                    "raw_text": "BON FISCAL\nMEGA IMAGE SRL\n...",
                },
            }
        }
    }
|
||||
|
||||
|
||||
class OCRStatusResponse(BaseModel):
    """Health/status payload describing the OCR service.

    All three fields are required: an availability flag, the list of OCR
    engine names, and a status message.
    """

    # ``...`` is the explicit "required, no default" marker for ``Field``.
    available: bool = Field(
        ...,
        description="Whether OCR service is available",
    )
    engines: list[str] = Field(
        ...,
        description="Available OCR engines",
    )
    message: str = Field(
        ...,
        description="Status message",
    )
|
||||
Reference in New Issue
Block a user