feat: Add OCR integration for automatic receipt data extraction

Implement OCR (PaddleOCR, with Tesseract as a fallback) to automatically extract vendor name, date, total amount, receipt number/series, and CUI (fiscal/VAT identification code) from uploaded receipt images/PDFs, reducing manual data entry and improving accuracy. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-12 11:48:29 +02:00
parent 5960154094
commit 41ae97180e
16 changed files with 2773 additions and 32 deletions
--- a/data-entry-app/backend/app/main.py
+++ b/data-entry-app/backend/app/main.py
@@ -71,9 +71,10 @@ async def health_check():


 # Import and include routers
-from app.routers import receipts
+from app.routers import receipts, ocr

 app.include_router(receipts.router, prefix="/api/receipts", tags=["receipts"])
+app.include_router(ocr.router, prefix="/api/ocr", tags=["ocr"])


 # Root endpoint
--- a/data-entry-app/backend/app/routers/ocr.py
+++ b/data-entry-app/backend/app/routers/ocr.py
@@ -0,0 +1,156 @@
+"""OCR API endpoints."""
+
+import os
+import tempfile
+from pathlib import Path
+
+from fastapi import APIRouter, HTTPException, UploadFile, File, Depends
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db.database import get_session
+from app.db.crud.attachment import AttachmentCRUD
+from app.services.ocr_service import ocr_service
+from app.services.ocr_engine import OCREngine
+from app.schemas.ocr import OCRResponse, OCRStatusResponse, ExtractionData
+
+# Router collecting the OCR endpoints defined in this module.
+router = APIRouter()
+
+
+@router.get("/status", response_model=OCRStatusResponse)
+async def get_ocr_status():
+    """Report which OCR engines are installed and whether OCR can be used."""
+    detected = OCREngine.get_available_engines()
+
+    if len(detected) > 0:
+        status_text = f"OCR service ready with engines: {', '.join(detected)}"
+    else:
+        status_text = "No OCR engines available. Install PaddleOCR or Tesseract."
+
+    return OCRStatusResponse(
+        available=len(detected) > 0,
+        engines=detected,
+        message=status_text,
+    )
+
+
+@router.post("/extract", response_model=OCRResponse)
+async def extract_from_image(file: UploadFile = File(...)):
+    """
+    Extract receipt data from an uploaded image or PDF.
+
+    Accepts JPG, PNG, or PDF files (max 10MB).
+    Returns extracted fields with confidence scores.
+
+    Raises:
+        HTTPException: 400 when the content type is not supported or the
+            upload exceeds 10MB; 422 when OCR processing reports a failure.
+    """
+    allowed_types = ['image/jpeg', 'image/png', 'application/pdf']
+    max_size = 10 * 1024 * 1024  # 10MB limit
+
+    if file.content_type not in allowed_types:
+        raise HTTPException(
+            status_code=400,
+            detail=f"File type not supported: {file.content_type}. Allowed: JPG, PNG, PDF"
+        )
+
+    # Read at most one byte past the limit: enough to detect an oversized
+    # upload without buffering an arbitrarily large body in memory.
+    content = await file.read(max_size + 1)
+    if len(content) > max_size:
+        # Rejected before any temp file exists, so nothing is left on disk.
+        raise HTTPException(
+            status_code=400,
+            detail="File too large. Maximum size is 10MB."
+        )
+
+    # Keep a known extension on the temp file; fall back to .jpg otherwise.
+    suffix = Path(file.filename).suffix.lower() if file.filename else '.jpg'
+    if suffix not in ['.jpg', '.jpeg', '.png', '.pdf']:
+        suffix = '.jpg'
+
+    tmp_path = None
+    try:
+        # delete=False because the OCR service re-opens the file by path;
+        # the finally block below is responsible for removing it.
+        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+            tmp_path = Path(tmp.name)
+            tmp.write(content)
+
+        success, message, result = await ocr_service.process_image(
+            tmp_path, file.content_type
+        )
+
+        if not success:
+            raise HTTPException(status_code=422, detail=message)
+
+        # Convert ExtractionResult to ExtractionData schema
+        data = ExtractionData(
+            receipt_type=result.receipt_type,
+            receipt_number=result.receipt_number,
+            receipt_series=result.receipt_series,
+            receipt_date=result.receipt_date,
+            amount=result.amount,
+            partner_name=result.partner_name,
+            cui=result.cui,
+            description=result.description,
+            confidence_amount=result.confidence_amount,
+            confidence_date=result.confidence_date,
+            confidence_vendor=result.confidence_vendor,
+            overall_confidence=result.overall_confidence,
+            raw_text=result.raw_text,
+        )
+
+        return OCRResponse(success=True, message=message, data=data)
+
+    finally:
+        # Clean up the temp file on every exit path, including write errors.
+        if tmp_path is not None and tmp_path.exists():
+            os.unlink(tmp_path)
+
+
+@router.post("/extract-attachment/{attachment_id}", response_model=OCRResponse)
+async def extract_from_attachment(
+    attachment_id: int,
+    session: AsyncSession = Depends(get_session),
+):
+    """
+    Run OCR on a file that is already stored as an attachment.
+
+    Responds with 404 when the attachment record or its file is missing,
+    400 when the stored MIME type is not supported, and 422 when OCR
+    processing reports a failure.
+    """
+    ocr_mime_types = ('image/jpeg', 'image/png', 'application/pdf')
+
+    record = await AttachmentCRUD.get_by_id(session, attachment_id)
+    if not record:
+        raise HTTPException(status_code=404, detail="Attachment not found")
+
+    stored_file = AttachmentCRUD.get_file_path(record)
+    if not stored_file.exists():
+        raise HTTPException(status_code=404, detail="File not found on disk")
+
+    if record.mime_type not in ocr_mime_types:
+        raise HTTPException(
+            status_code=400,
+            detail=f"File type not supported for OCR: {record.mime_type}"
+        )
+
+    ok, status_text, extracted = await ocr_service.process_image(
+        stored_file, record.mime_type
+    )
+    if not ok:
+        raise HTTPException(status_code=422, detail=status_text)
+
+    # Copy the ExtractionResult attributes into the response schema.
+    copied_fields = (
+        'receipt_type', 'receipt_number', 'receipt_series', 'receipt_date',
+        'amount', 'partner_name', 'cui', 'description',
+        'confidence_amount', 'confidence_date', 'confidence_vendor',
+        'overall_confidence', 'raw_text',
+    )
+    payload = ExtractionData(
+        **{name: getattr(extracted, name) for name in copied_fields}
+    )
+
+    return OCRResponse(success=True, message=status_text, data=payload)
--- a/data-entry-app/backend/app/schemas/ocr.py
+++ b/data-entry-app/backend/app/schemas/ocr.py
@@ -0,0 +1,84 @@
+"""Pydantic schemas for OCR API."""
+
+from datetime import date
+from decimal import Decimal
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+
+class ExtractionData(BaseModel):
+    """
+    Extracted receipt data from OCR.
+
+    Every extracted field except receipt_type is optional and defaults to
+    None; the confidence values are validated to lie between 0 and 1.
+    """
+
+    # Fields read from the receipt text
+    receipt_type: str = Field(default='bon_fiscal', description="Receipt type: bon_fiscal or chitanta")
+    receipt_number: Optional[str] = Field(default=None, description="Receipt number")
+    receipt_series: Optional[str] = Field(default=None, description="Receipt series")
+    receipt_date: Optional[date] = Field(default=None, description="Receipt date")
+    amount: Optional[Decimal] = Field(default=None, description="Total amount")
+    partner_name: Optional[str] = Field(default=None, description="Vendor/partner name")
+    cui: Optional[str] = Field(default=None, description="CUI (fiscal identification code)")
+    description: Optional[str] = Field(default=None, description="Optional description")
+
+    # Confidence scores (0..1) and the unprocessed OCR output
+    confidence_amount: float = Field(default=0.0, ge=0, le=1, description="Amount extraction confidence")
+    confidence_date: float = Field(default=0.0, ge=0, le=1, description="Date extraction confidence")
+    confidence_vendor: float = Field(default=0.0, ge=0, le=1, description="Vendor extraction confidence")
+    overall_confidence: float = Field(default=0.0, ge=0, le=1, description="Overall confidence score")
+    raw_text: str = Field(default="", description="Raw OCR text")
+
+    class Config:
+        """Pydantic config: supplies the example payload for the JSON schema."""
+        json_schema_extra = {
+            "example": {
+                "receipt_type": "bon_fiscal",
+                "receipt_number": "12345",
+                "receipt_series": None,
+                "receipt_date": "2024-01-15",
+                "amount": 125.50,
+                "partner_name": "MEGA IMAGE SRL",
+                "cui": "12345678",
+                "description": None,
+                "confidence_amount": 0.95,
+                "confidence_date": 0.90,
+                "confidence_vendor": 0.75,
+                "overall_confidence": 0.87,
+                "raw_text": "BON FISCAL\nMEGA IMAGE SRL\n..."
+            }
+        }
+
+
+class OCRResponse(BaseModel):
+    """
+    OCR API response.
+
+    Wraps a success flag and status message around the optional
+    extracted data (None by default).
+    """
+
+    success: bool = Field(description="Whether OCR processing was successful")
+    message: str = Field(description="Status message")
+    data: Optional[ExtractionData] = Field(default=None, description="Extracted data")
+
+    class Config:
+        """Pydantic config: supplies the example payload for the JSON schema."""
+        json_schema_extra = {
+            "example": {
+                "success": True,
+                "message": "OCR processing successful. Found: amount, date, vendor",
+                "data": {
+                    "receipt_type": "bon_fiscal",
+                    "receipt_number": "12345",
+                    "receipt_date": "2024-01-15",
+                    "amount": 125.50,
+                    "partner_name": "MEGA IMAGE SRL",
+                    "cui": "12345678",
+                    "confidence_amount": 0.95,
+                    "confidence_date": 0.90,
+                    "confidence_vendor": 0.75,
+                    "overall_confidence": 0.87,
+                    "raw_text": "BON FISCAL\nMEGA IMAGE SRL\n..."
+                }
+            }
+        }
+
+
+class OCRStatusResponse(BaseModel):
+    """
+    OCR service status response.
+
+    Carries an availability flag, the list of engine names, and a
+    human-readable status message.
+    """
+
+    available: bool = Field(description="Whether OCR service is available")
+    engines: list[str] = Field(description="Available OCR engines")
+    message: str = Field(description="Status message")
--- a/data-entry-app/backend/app/services/image_preprocessor.py
+++ b/data-entry-app/backend/app/services/image_preprocessor.py
@@ -0,0 +1,116 @@
+"""Image preprocessing for optimal OCR results."""
+
+from pathlib import Path
+from typing import List
+
+import numpy as np
+import cv2
+
+try:
+    import pdf2image
+    PDF_AVAILABLE = True
+except ImportError:
+    PDF_AVAILABLE = False
+
+
+class ImagePreprocessor:
+    """Load receipt images / PDF pages and preprocess them for OCR."""
+
+    def load_image(self, path: Path) -> np.ndarray:
+        """
+        Load an image file with OpenCV.
+
+        Raises:
+            ValueError: If OpenCV could not read the file.
+        """
+        image = cv2.imread(str(path))
+        if image is None:
+            raise ValueError(f"Could not load image: {path}")
+        return image
+
+    def pdf_to_images(self, path: Path, dpi: int = 300) -> List[np.ndarray]:
+        """
+        Render every page of a PDF to an image array.
+
+        Args:
+            path: Path of the PDF file.
+            dpi: Rendering resolution passed to pdf2image.
+
+        Returns:
+            One BGR array per page, matching what load_image() produces.
+
+        Raises:
+            RuntimeError: If pdf2image is not installed.
+        """
+        if not PDF_AVAILABLE:
+            raise RuntimeError("pdf2image not available. Install with: pip install pdf2image")
+        pages = pdf2image.convert_from_path(str(path), dpi=dpi)
+        # pdf2image yields PIL images (RGB channel order), but preprocess()
+        # converts with COLOR_BGR2GRAY like it does for cv2.imread() output.
+        # Swap the channels so both sources are weighted the same way.
+        return [
+            cv2.cvtColor(np.array(page.convert('RGB')), cv2.COLOR_RGB2BGR)
+            for page in pages
+        ]
+
+    def preprocess(self, image: np.ndarray) -> np.ndarray:
+        """
+        Apply preprocessing pipeline for thermal receipt images.
+
+        Pipeline:
+        1. Convert to grayscale
+        2. Resize if too small (min 1000px width)
+        3. Deskew (straighten rotated text)
+        4. Denoise (Non-local means)
+        5. Adaptive thresholding (binarization)
+        6. Morphological close (connect broken chars)
+
+        Args:
+            image: BGR colour image or an already single-channel image.
+
+        Returns:
+            The binarized single-channel image.
+        """
+        # 1. Grayscale (3-dimensional input is treated as BGR)
+        if len(image.shape) == 3:
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        else:
+            gray = image.copy()
+
+        # 2. Resize if too small, keeping the aspect ratio
+        height, width = gray.shape
+        if width < 1000:
+            scale = 1000 / width
+            gray = cv2.resize(
+                gray, None, fx=scale, fy=scale,
+                interpolation=cv2.INTER_CUBIC
+            )
+
+        # 3. Deskew
+        gray = self._deskew(gray)
+
+        # 4. Denoise
+        denoised = cv2.fastNlMeansDenoising(
+            gray, h=10,
+            templateWindowSize=7,
+            searchWindowSize=21
+        )
+
+        # 5. Adaptive thresholding
+        binary = cv2.adaptiveThreshold(
+            denoised, 255,
+            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY,
+            blockSize=15, C=8
+        )
+
+        # 6. Morphological close
+        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
+        result = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
+
+        return result
+
+    def _deskew(self, image: np.ndarray) -> np.ndarray:
+        """
+        Correct image rotation/skew using Hough lines.
+
+        The image is returned unchanged when no lines are detected, when no
+        line is within 45 degrees of horizontal, or when the median angle is
+        below 0.5 degrees.
+        """
+        edges = cv2.Canny(image, 50, 150, apertureSize=3)
+        lines = cv2.HoughLinesP(
+            edges, 1, np.pi / 180,
+            threshold=100, minLineLength=100, maxLineGap=10
+        )
+
+        if lines is None:
+            return image
+
+        # Keep only the angles of roughly horizontal segments.
+        angles = []
+        for line in lines:
+            x1, y1, x2, y2 = line[0]
+            angle = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi
+            if abs(angle) < 45:
+                angles.append(angle)
+
+        if not angles:
+            return image
+
+        # The median keeps a few stray segments from dominating the estimate.
+        median_angle = np.median(angles)
+        if abs(median_angle) < 0.5:
+            return image
+
+        h, w = image.shape[:2]
+        center = (w // 2, h // 2)
+        M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
+        return cv2.warpAffine(
+            image, M, (w, h),
+            flags=cv2.INTER_CUBIC,
+            borderMode=cv2.BORDER_REPLICATE
+        )
--- a/data-entry-app/backend/app/services/ocr_engine.py
+++ b/data-entry-app/backend/app/services/ocr_engine.py
@@ -0,0 +1,168 @@
+"""OCR engine wrapper for PaddleOCR and Tesseract."""
+
+import os
+from dataclasses import dataclass
+from typing import List, Optional
+
+import numpy as np
+
+# Disable PaddleOCR model source check for faster startup (PaddleX 3.x).
+# Set at module import, i.e. before the lazy paddleocr import further down.
+os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
+
+# Lazy imports - placeholders that are rebound to the real objects on first use
+PaddleOCR = None  # Will be imported lazily
+pytesseract = None  # Will be imported lazily
+
+# Check availability without importing heavy libraries
+def _check_paddle_available() -> bool:
+    """Tell whether the paddleocr package can be located, without importing it."""
+    try:
+        from importlib.util import find_spec
+        spec = find_spec("paddleocr")
+    except Exception:
+        return False
+    return spec is not None
+
+def _check_tesseract_available() -> bool:
+    """Tell whether the pytesseract package can be located, without importing it."""
+    try:
+        from importlib.util import find_spec
+        spec = find_spec("pytesseract")
+    except Exception:
+        return False
+    return spec is not None
+
+# Evaluated once at import time; these only say the Python packages were found.
+PADDLE_AVAILABLE = _check_paddle_available()
+TESSERACT_AVAILABLE = _check_tesseract_available()
+
+
+@dataclass
+class OCRResult:
+    """Raw OCR result as produced by one of the engines."""
+    # Recognized text
+    text: str
+    # Average recognition confidence reported by the engine
+    confidence: float
+    # Per-line dicts with 'text', 'confidence' and 'box' keys (PaddleOCR);
+    # the Tesseract path returns an empty list
+    boxes: List[dict]
+
+
+class OCREngine:
+    """Unified OCR engine with fallback support (PaddleOCR, then Tesseract)."""
+
+    def __init__(self):
+        # Function-scope import: only the init lock below needs it.
+        import threading
+
+        self._paddle = None
+        self._paddle_initialized = False
+        # recognize() may be called from several worker threads; the lock
+        # makes them wait for the one-time PaddleOCR setup instead of
+        # observing a half-initialized engine.
+        self._paddle_init_lock = threading.Lock()
+
+    def _init_paddle_lazy(self):
+        """
+        Lazy initialize PaddleOCR on first use (avoids slow startup).
+
+        Initialization is attempted once; on failure a warning is printed and
+        the engine keeps running without PaddleOCR.
+        """
+        global PaddleOCR
+
+        if self._paddle_initialized:
+            return
+
+        with self._paddle_init_lock:
+            # Another thread may have finished while this one was waiting.
+            if self._paddle_initialized:
+                return
+
+            try:
+                if PADDLE_AVAILABLE:
+                    print("Importing PaddleOCR (first use, may take ~15-20 seconds)...")
+                    from paddleocr import PaddleOCR as _PaddleOCR
+                    PaddleOCR = _PaddleOCR
+
+                    print("Initializing PaddleOCR engine...")
+                    # PaddleOCR 3.x API - simplified parameters
+                    self._paddle = PaddleOCR(
+                        lang='en',  # Better for mixed text with numbers
+                    )
+                    print("PaddleOCR initialized successfully")
+            except Exception as e:
+                print(f"Warning: Failed to initialize PaddleOCR: {e}")
+                self._paddle = None
+            finally:
+                # Flag is set only once the attempt is over, so concurrent
+                # callers never see "initialized" with the engine still loading.
+                self._paddle_initialized = True
+
+    def recognize(self, image: np.ndarray) -> OCRResult:
+        """
+        Perform OCR on preprocessed image.
+
+        Raises:
+            RuntimeError: If neither PaddleOCR nor Tesseract can be used.
+        """
+        # Lazy init PaddleOCR on first call
+        self._init_paddle_lazy()
+
+        if PADDLE_AVAILABLE and self._paddle:
+            return self._paddle_recognize(image)
+        elif TESSERACT_AVAILABLE:
+            return self._tesseract_recognize(image)
+        else:
+            raise RuntimeError(
+                "No OCR engine available. Install PaddleOCR or Tesseract."
+            )
+
+    def _paddle_recognize(self, image: np.ndarray) -> OCRResult:
+        """
+        Recognize text using PaddleOCR 3.x API.
+
+        Any PaddleOCR error triggers a Tesseract attempt when Tesseract is
+        available; otherwise the original exception is re-raised.
+        """
+        try:
+            # PaddleOCR 3.x requires 3-channel images
+            if len(image.shape) == 2:
+                # Convert grayscale to 3-channel BGR
+                import cv2
+                image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
+
+            # PaddleOCR 3.x uses predict() with new parameter names
+            result = self._paddle.predict(image, use_textline_orientation=True)
+
+            if not result or len(result) == 0:
+                return OCRResult(text="", confidence=0.0, boxes=[])
+
+            # PaddleOCR 3.x returns OCRResult objects with different structure
+            ocr_result = result[0]
+
+            # Extract texts and scores from the new format
+            rec_texts = ocr_result.get('rec_texts', [])
+            rec_scores = ocr_result.get('rec_scores', [])
+            dt_polys = ocr_result.get('dt_polys', [])
+
+            # len() checks instead of truthiness, so array-like values work too
+            if len(rec_texts) == 0:
+                return OCRResult(text="", confidence=0.0, boxes=[])
+
+            boxes = []
+            for i, text in enumerate(rec_texts):
+                conf = rec_scores[i] if i < len(rec_scores) else 0.0
+                box = dt_polys[i].tolist() if i < len(dt_polys) else []
+                boxes.append({
+                    'text': text,
+                    'confidence': float(conf),
+                    'box': box
+                })
+
+            avg_conf = sum(rec_scores) / len(rec_scores) if len(rec_scores) > 0 else 0.0
+            return OCRResult(
+                text='\n'.join(rec_texts),
+                confidence=float(avg_conf),
+                boxes=boxes
+            )
+        except Exception as e:
+            print(f"PaddleOCR error: {e}, falling back to Tesseract")
+            if TESSERACT_AVAILABLE:
+                return self._tesseract_recognize(image)
+            raise
+
+    def _tesseract_recognize(self, image: np.ndarray) -> OCRResult:
+        """
+        Recognize text using Tesseract.
+
+        The returned confidence is the mean of the positive per-entry
+        confidences, scaled from 0-100 to 0-1; boxes are not populated.
+        """
+        global pytesseract
+
+        # Lazy import pytesseract
+        if pytesseract is None:
+            print("Importing pytesseract...")
+            import pytesseract as _pytesseract
+            pytesseract = _pytesseract
+
+        config = '--psm 6 -l ron+eng'
+        text = pytesseract.image_to_string(image, config=config)
+        data = pytesseract.image_to_data(
+            image, config=config,
+            output_type=pytesseract.Output.DICT
+        )
+
+        # Confidences can arrive as ints, floats or strings such as '95.5'
+        # depending on the Tesseract/pytesseract version; int('95.5') would
+        # raise, so parse as float and ignore anything unparseable.
+        confidences = []
+        for raw_conf in data['conf']:
+            try:
+                value = float(raw_conf)
+            except (TypeError, ValueError):
+                continue
+            # Non-positive entries (e.g. -1) carry no usable confidence
+            if value > 0:
+                confidences.append(value)
+        avg_conf = sum(confidences) / len(confidences) / 100 if confidences else 0.0
+
+        return OCRResult(text=text, confidence=avg_conf, boxes=[])
+
+    @staticmethod
+    def get_available_engines() -> List[str]:
+        """Return list of available OCR engines (by installed Python package)."""
+        engines = []
+        if PADDLE_AVAILABLE:
+            engines.append('paddleocr')
+        if TESSERACT_AVAILABLE:
+            engines.append('tesseract')
+        return engines
--- a/data-entry-app/backend/app/services/ocr_extractor.py
+++ b/data-entry-app/backend/app/services/ocr_extractor.py
@@ -0,0 +1,231 @@
+"""Extract structured fields from OCR text (Romanian receipts)."""
+
+import re
+from datetime import date, datetime
+from decimal import Decimal, InvalidOperation
+from typing import Optional, Tuple
+from dataclasses import dataclass, field
+
+
+@dataclass
+class ExtractionResult:
+    """Fields pulled out of a receipt, plus per-field confidence scores."""
+    receipt_type: str = 'bon_fiscal'
+    receipt_number: Optional[str] = None
+    receipt_series: Optional[str] = None
+    receipt_date: Optional[date] = None
+    amount: Optional[Decimal] = None
+    partner_name: Optional[str] = None
+    cui: Optional[str] = None
+    description: Optional[str] = None
+
+    confidence_amount: float = 0.0
+    confidence_date: float = 0.0
+    confidence_vendor: float = 0.0
+    raw_text: str = ""
+
+    @property
+    def overall_confidence(self) -> float:
+        """Weighted mean of the field confidences, rounded to two decimals."""
+        # Weights: amount 0.4, date 0.3, vendor 0.3
+        weighted = (
+            self.confidence_amount * 0.4 +
+            self.confidence_date * 0.3 +
+            self.confidence_vendor * 0.3
+        )
+        return round(weighted, 2)
+
+
+class ReceiptExtractor:
+    """Extract receipt fields using pattern matching for Romanian receipts."""
+
+    # Receipts dated before this year are rejected by _extract_date
+    MIN_YEAR = 2020
+
+    # Total amount patterns (most specific first).
+    # - \b keeps TOTAL from matching inside SUBTOTAL.
+    # - The captured number must start with a digit and may only contain
+    #   digits, spaces/tabs and separators, so it never runs on to the next
+    #   line and never consists of whitespace alone.
+    TOTAL_PATTERNS = [
+        (r'\bTOTAL\s*:?\s*(\d[\d \t.,]*)\s*(?:RON|LEI)?', 0.95),
+        (r'\bTOTAL\s+(?:RON|LEI)\s*(\d[\d \t.,]*)', 0.95),
+        (r'\bDE\s+PLATA\s*:?\s*(\d[\d \t.,]*)', 0.90),
+        (r'\bSUMA\s*:?\s*(\d[\d \t.,]*)', 0.85),
+        (r'\bPLATA\s+CARD\s*:?\s*(\d[\d \t.,]*)', 0.85),
+        (r'\bNUMERAR\s*:?\s*(\d[\d \t.,]*)', 0.80),
+    ]
+
+    # Date patterns
+    DATE_PATTERNS = [
+        (r'DATA\s*:?\s*(\d{2}[./]\d{2}[./]\d{4})', 0.95),
+        (r'(\d{2}[./]\d{2}[./]\d{4})\s+\d{2}:\d{2}', 0.90),
+        (r'(\d{2}[./]\d{2}[./]\d{4})', 0.80),
+        (r'(\d{4}[./]\d{2}[./]\d{2})', 0.75),  # YYYY.MM.DD format
+    ]
+
+    # Receipt number patterns
+    NUMBER_PATTERNS = [
+        (r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95),
+        (r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95),
+        (r'CHITANTA\s+NR\.?\s*:?\s*(\d+)', 0.95),
+        (r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90),
+        (r'NR\.?\s*:?\s*(\d{4,})', 0.70),
+    ]
+
+    # CUI (fiscal code) patterns
+    CUI_PATTERNS = [
+        (r'C\.?U\.?I\.?\s*:?\s*(?:RO)?(\d{6,10})', 0.95),
+        (r'C\.?I\.?F\.?\s*:?\s*(?:RO)?(\d{6,10})', 0.95),
+        (r'COD\s+FISCAL\s*:?\s*(?:RO)?(\d{6,10})', 0.90),
+        (r'(?:RO)?(\d{6,10})\s*-?\s*(?:J|CUI)', 0.80),
+    ]
+
+    # Series patterns
+    SERIES_PATTERNS = [
+        (r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90),
+        (r'([A-Z]{2,4})\s+NR\.?\s*\d+', 0.80),
+    ]
+
+    # Words that mark a header line as "not the vendor name"
+    VENDOR_SKIP_KEYWORDS = (
+        'BON', 'FISCAL', 'TOTAL', 'DATA', 'NR', 'ORA',
+        'SUBTOTAL', 'TVA', 'PLATA', 'CARD', 'NUMERAR',
+        'RON', 'LEI', 'CHITANTA', 'CHITANȚĂ', 'REST'
+    )
+
+    # Keywords must not be glued to another letter ([^\W\d_] is "any letter"),
+    # so REST no longer rejects RESTAURANT, while NR12345 is still skipped.
+    _VENDOR_SKIP_RE = re.compile(
+        r'(?<![^\W\d_])(?:' + '|'.join(VENDOR_SKIP_KEYWORDS) + r')(?![^\W\d_])',
+        re.IGNORECASE
+    )
+
+    def extract(self, text: str) -> ExtractionResult:
+        """
+        Extract all fields from OCR text.
+
+        Args:
+            text: Raw OCR text; it is stored unchanged in raw_text.
+
+        Returns:
+            ExtractionResult with every field that could be recognized.
+        """
+        result = ExtractionResult()
+        result.raw_text = text
+        text_upper = text.upper()
+
+        # Extract fields (the vendor keeps the original letter case)
+        result.amount, result.confidence_amount = self._extract_amount(text_upper)
+        result.receipt_date, result.confidence_date = self._extract_date(text_upper)
+        result.receipt_number, _ = self._extract_number(text_upper)
+        result.receipt_series, _ = self._extract_series(text_upper)
+        result.partner_name, result.confidence_vendor = self._extract_vendor(text)
+        result.cui, _ = self._extract_cui(text_upper)
+
+        # Detect receipt type
+        result.receipt_type = self._detect_receipt_type(text_upper)
+
+        return result
+
+    def _extract_amount(self, text: str) -> Tuple[Optional[Decimal], float]:
+        """
+        Extract total amount from text.
+
+        Patterns are tried in order; within a pattern every match is tried
+        until one yields a positive amount.
+
+        Returns:
+            (amount, confidence), or (None, 0.0) when nothing usable is found.
+        """
+        for pattern, confidence in self.TOTAL_PATTERNS:
+            for match in re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE):
+                amount_str = re.sub(r'[^\d.,]', '', match.group(1))
+                # Handle Romanian number format (1.234,56)
+                amount_str = self._normalize_number(amount_str)
+                try:
+                    amount = Decimal(amount_str)
+                except (InvalidOperation, ValueError):
+                    continue
+                if amount > 0:
+                    return amount, confidence
+        return None, 0.0
+
+    def _normalize_number(self, num_str: str) -> str:
+        """
+        Normalize a receipt number string to standard decimal notation.
+
+        Examples: '1.234,56' -> '1234.56', '1,234.56' -> '1234.56',
+        '1,50' -> '1.50', '1,234' -> '1234', '1.234.567' -> '1234567'.
+        """
+        # Remove spaces and separators left dangling at the end ("12.50,")
+        num_str = num_str.replace(' ', '').rstrip('.,')
+
+        last_comma = num_str.rfind(',')
+        last_dot = num_str.rfind('.')
+
+        if last_comma != -1 and last_dot != -1:
+            # Both separators present: whichever comes last is the decimal mark
+            if last_comma > last_dot:
+                # Romanian format: 1.234,56
+                return num_str.replace('.', '').replace(',', '.')
+            # English format: 1,234.56
+            return num_str.replace(',', '')
+
+        if last_comma != -1:
+            # Could be 1,50 or 1,234
+            parts = num_str.split(',')
+            if len(parts) == 2 and len(parts[1]) <= 2:
+                # Decimal comma: 1,50
+                return num_str.replace(',', '.')
+            # Thousands comma: 1,234
+            return num_str.replace(',', '')
+
+        if last_dot != -1:
+            parts = num_str.split('.')
+            if len(parts) > 2:
+                if len(parts[-1]) == 3:
+                    # Only thousands separators: 1.234.567 -> 1234567
+                    return ''.join(parts)
+                # Last group is the decimal part: 1.234.56 -> 1234.56
+                return ''.join(parts[:-1]) + '.' + parts[-1]
+
+        return num_str
+
+    @staticmethod
+    def _parse_date(date_str: str) -> Optional[date]:
+        """Parse DD.MM.YYYY or YYYY.MM.DD ('/' also accepted); None if invalid."""
+        date_str = date_str.replace('/', '.')
+        for fmt in ('%d.%m.%Y', '%Y.%m.%d'):
+            try:
+                return datetime.strptime(date_str, fmt).date()
+            except ValueError:
+                continue
+        return None
+
+    def _extract_date(self, text: str) -> Tuple[Optional[date], float]:
+        """
+        Extract receipt date from text.
+
+        Every match of every pattern is considered, so an implausible date
+        does not hide a valid one later in the text. Dates in the future or
+        before MIN_YEAR are rejected.
+
+        Returns:
+            (date, confidence), or (None, 0.0) when no plausible date is found.
+        """
+        today = date.today()
+        for pattern, confidence in self.DATE_PATTERNS:
+            for match in re.finditer(pattern, text):
+                parsed = self._parse_date(match.group(1))
+                if parsed is None:
+                    continue
+                # Validate date range
+                if parsed <= today and parsed.year >= self.MIN_YEAR:
+                    return parsed, confidence
+        return None, 0.0
+
+    def _extract_number(self, text: str) -> Tuple[Optional[str], float]:
+        """Extract receipt number from text; (None, 0.0) when not found."""
+        for pattern, confidence in self.NUMBER_PATTERNS:
+            match = re.search(pattern, text, re.IGNORECASE)
+            if match:
+                return match.group(1), confidence
+        return None, 0.0
+
+    def _extract_series(self, text: str) -> Tuple[Optional[str], float]:
+        """Extract receipt series (upper-cased) from text; (None, 0.0) when not found."""
+        for pattern, confidence in self.SERIES_PATTERNS:
+            match = re.search(pattern, text, re.IGNORECASE)
+            if match:
+                return match.group(1).upper(), confidence
+        return None, 0.0
+
+    def _extract_vendor(self, text: str) -> Tuple[Optional[str], float]:
+        """
+        Extract vendor/partner name from text.
+
+        Takes the first of the top 7 lines that is not empty, not purely
+        numeric and contains none of VENDOR_SKIP_KEYWORDS as a whole word.
+
+        Returns:
+            (vendor, confidence), or (None, 0.0) when no line qualifies.
+        """
+        lines = text.split('\n')
+
+        for i, line in enumerate(lines[:7]):  # Check first 7 lines
+            line = line.strip()
+
+            # Skip empty lines
+            if not line:
+                continue
+
+            # Skip lines that are just numbers
+            if re.match(r'^[\d.,\s]+$', line):
+                continue
+
+            # Skip lines with keywords (whole words only)
+            if self._VENDOR_SKIP_RE.search(line):
+                continue
+
+            # Clean the line
+            vendor = re.sub(r'[^\w\s.,&-]', '', line).strip()
+
+            if len(vendor) >= 3:
+                # Confidence decreases for lines further down
+                confidence = max(0.3, 0.8 - (i * 0.1))
+                return vendor, confidence
+
+        return None, 0.0
+
+    def _extract_cui(self, text: str) -> Tuple[Optional[str], float]:
+        """Extract CUI (fiscal identification code) from text; (None, 0.0) when not found."""
+        for pattern, confidence in self.CUI_PATTERNS:
+            match = re.search(pattern, text, re.IGNORECASE)
+            if match:
+                cui = match.group(1)
+                if 6 <= len(cui) <= 10:
+                    return cui, confidence
+        return None, 0.0
+
+    def _detect_receipt_type(self, text: str) -> str:
+        """Return 'chitanta' when the (upper-cased) text mentions it, else 'bon_fiscal'."""
+        if 'CHITANTA' in text or 'CHITANȚĂ' in text:
+            return 'chitanta'
+        return 'bon_fiscal'
--- a/data-entry-app/backend/app/services/ocr_service.py
+++ b/data-entry-app/backend/app/services/ocr_service.py
@@ -0,0 +1,110 @@
+"""Main OCR service coordinating preprocessing, recognition, and extraction."""
+
+import os
+# Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import
+os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
+
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+from typing import Optional, Tuple
+
+from app.services.ocr_engine import OCREngine
+from app.services.ocr_extractor import ReceiptExtractor, ExtractionResult
+from app.services.image_preprocessor import ImagePreprocessor
+
+
+class OCRService:
+    """Service for OCR processing of receipt images."""
+
+    # Shared by all instances; OCR work runs here, off the event loop thread.
+    _executor = ThreadPoolExecutor(max_workers=2)
+
+    def __init__(self):
+        self.preprocessor = ImagePreprocessor()
+        self.ocr_engine = OCREngine()
+        self.extractor = ReceiptExtractor()
+
+    async def process_image(
+        self,
+        image_path: Path,
+        mime_type: str
+    ) -> Tuple[bool, str, Optional[ExtractionResult]]:
+        """
+        Process receipt image and extract structured data.
+
+        Never raises for processing errors: any exception is reported as an
+        unsuccessful result instead.
+
+        Args:
+            image_path: Path to the image file
+            mime_type: MIME type of the file
+
+        Returns:
+            Tuple of (success, message, extraction_result)
+        """
+        try:
+            # get_running_loop() is the documented way to obtain the loop
+            # from inside a coroutine.
+            loop = asyncio.get_running_loop()
+            return await loop.run_in_executor(
+                self._executor,
+                self._process_sync,
+                image_path,
+                mime_type
+            )
+        except Exception as e:
+            return False, f"OCR processing failed: {str(e)}", None
+
+    def _process_sync(
+        self,
+        image_path: Path,
+        mime_type: str
+    ) -> Tuple[bool, str, Optional[ExtractionResult]]:
+        """
+        Synchronous processing (runs in thread pool).
+
+        Loads the file (first page only for PDFs), preprocesses it, runs OCR
+        and extracts the receipt fields.
+
+        Returns:
+            Tuple of (success, message, extraction_result); the result is
+            None whenever success is False.
+        """
+
+        # Handle PDF
+        if mime_type == 'application/pdf':
+            try:
+                images = self.preprocessor.pdf_to_images(image_path)
+                if not images:
+                    return False, "Failed to extract images from PDF", None
+                image = images[0]  # Process first page only
+            except RuntimeError as e:
+                return False, str(e), None
+        else:
+            try:
+                image = self.preprocessor.load_image(image_path)
+            except ValueError as e:
+                return False, str(e), None
+
+        # Preprocess image
+        processed = self.preprocessor.preprocess(image)
+
+        # Perform OCR
+        try:
+            ocr_result = self.ocr_engine.recognize(processed)
+        except RuntimeError as e:
+            return False, str(e), None
+
+        if not ocr_result.text:
+            return False, "No text detected in image", None
+
+        # Extract structured fields
+        extraction = self.extractor.extract(ocr_result.text)
+
+        # Build result message listing the fields that were recognized
+        fields_found = []
+        if extraction.amount:
+            fields_found.append("amount")
+        if extraction.receipt_date:
+            fields_found.append("date")
+        if extraction.partner_name:
+            fields_found.append("vendor")
+        if extraction.cui:
+            fields_found.append("CUI")
+        if extraction.receipt_number:
+            fields_found.append("number")
+
+        message = f"OCR processing successful. Found: {', '.join(fields_found) or 'no fields'}"
+
+        return True, message, extraction
+
+
+# Singleton instance, created when this module is first imported
+ocr_service = OCRService()
--- a/data-entry-app/backend/requirements.txt
+++ b/data-entry-app/backend/requirements.txt
@@ -30,3 +30,11 @@ httpx>=0.26.0
 # Testing
 pytest>=8.0.0
 pytest-asyncio>=0.23.3
+
+# OCR Dependencies
+# The OCR engine wrapper calls the PaddleOCR 3.x API (predict() with
+# use_textline_orientation), so 2.x releases must not satisfy the pin.
+paddleocr>=3.0.0
+paddlepaddle>=3.0.0
+opencv-python>=4.8.0
+pytesseract>=0.3.10
+pdf2image>=1.16.0
+numpy>=1.24.0