From 41ae97180ec301f45be523eb7de44e110c3deffc Mon Sep 17 00:00:00 2001 From: Marius Mutu Date: Fri, 12 Dec 2025 11:48:29 +0200 Subject: [PATCH] feat: Add OCR integration for automatic receipt data extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement Tesseract-based OCR to automatically extract vendor name, date, total amount, and VAT from uploaded receipt images/PDFs, reducing manual data entry and improving accuracy. ๐Ÿค– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- data-entry-app/README.md | 133 +++- data-entry-app/backend/app/main.py | 3 +- data-entry-app/backend/app/routers/ocr.py | 156 ++++ data-entry-app/backend/app/schemas/ocr.py | 84 ++ .../app/services/image_preprocessor.py | 116 +++ .../backend/app/services/ocr_engine.py | 168 ++++ .../backend/app/services/ocr_extractor.py | 231 ++++++ .../backend/app/services/ocr_service.py | 110 +++ data-entry-app/backend/requirements.txt | 8 + .../components/ocr/OCRConfidenceIndicator.vue | 125 +++ .../src/components/ocr/OCRPreview.vue | 279 +++++++ .../src/components/ocr/OCRUploadZone.vue | 291 +++++++ .../src/views/receipts/ReceiptCreateView.vue | 199 ++++- docs/OCR_IMPLEMENTATION_PLAN.md | 717 ++++++++++++++++++ docs/data-entry/ARCHITECTURE.md | 106 ++- docs/data-entry/REQUIREMENTS.md | 79 +- 16 files changed, 2773 insertions(+), 32 deletions(-) create mode 100644 data-entry-app/backend/app/routers/ocr.py create mode 100644 data-entry-app/backend/app/schemas/ocr.py create mode 100644 data-entry-app/backend/app/services/image_preprocessor.py create mode 100644 data-entry-app/backend/app/services/ocr_engine.py create mode 100644 data-entry-app/backend/app/services/ocr_extractor.py create mode 100644 data-entry-app/backend/app/services/ocr_service.py create mode 100644 data-entry-app/frontend/src/components/ocr/OCRConfidenceIndicator.vue create mode 100644 data-entry-app/frontend/src/components/ocr/OCRPreview.vue create mode 100644 data-entry-app/frontend/src/components/ocr/OCRUploadZone.vue create mode 100644 docs/OCR_IMPLEMENTATION_PLAN.md diff --git a/data-entry-app/README.md b/data-entry-app/README.md index 100adf9..172ec71 100644 --- a/data-entry-app/README.md +++ b/data-entry-app/README.md @@ -1,6 +1,6 @@ # Data Entry App - Bonuri Fiscale -Aplicatie pentru introducere bonuri fiscale cu workflow de aprobare. +Aplicatie pentru introducere bonuri fiscale cu workflow de aprobare si extragere automata date prin OCR. ## Quick Start @@ -10,7 +10,27 @@ Aplicatie pentru introducere bonuri fiscale cu workflow de aprobare. - Node.js 18+ - (Optional) SSH tunnel pentru Oracle nomenclatoare -### Backend Setup +### Using Start Script (Recommended) + +```bash +# Start all services +./start-data-entry.sh + +# Or individual commands: +./start-data-entry.sh start # Start all +./start-data-entry.sh stop # Stop all +./start-data-entry.sh status # Check status +./start-data-entry.sh restart backend # Restart backend only +``` + +**Services:** +- Backend: http://localhost:8003 +- Frontend: http://localhost:3010 +- API Docs: http://localhost:8003/docs + +### Manual Setup + +#### Backend Setup ```bash cd data-entry-app/backend @@ -34,7 +54,7 @@ alembic upgrade head uvicorn app.main:app --reload --port 8003 ``` -### Frontend Setup +#### Frontend Setup ```bash cd data-entry-app/frontend @@ -46,15 +66,10 @@ npm install npm run dev -- --port 3010 ``` -### Access - -- **Backend API**: http://localhost:8003 -- **API Docs**: http://localhost:8003/docs -- **Frontend**: http://localhost:3010 - ## Features ### Pentru Utilizatori +- **OCR Automat** - Extragere automata date din poza bonului (suma, data, furnizor, CUI) - Upload poze bonuri fiscale - Completare date bon (suma, data, furnizor) - Selectie tip cheltuiala @@ -66,13 +81,75 @@ npm run dev -- --port 3010 - Aprobare/Respingere bonuri - Aprobare in masa +## OCR Feature + +### Cum functioneaza + +1. **Upload imagine** - Trage sau selecteaza poza bonului +2. **Procesare OCR** - Click pe "Proceseaza cu OCR" +3. **Previzualizare** - Datele extrase sunt afisate cu indicatori de incredere +4. **Aplicare** - Click "Aplica datele in formular" pentru auto-fill + +### Campuri extrase automat + +| Camp | Acuratete estimata | +|------|-------------------| +| Suma (TOTAL) | 90-95% | +| Data | 85-90% | +| Numar bon | 80-85% | +| Furnizor | 70-80% | +| CUI | 85-90% | +| Tip document | 95%+ | + +### OCR System Dependencies (Linux/Docker) + +Pentru functionarea OCR trebuie instalate: + +```bash +# Ubuntu/Debian +apt-get install -y \ + tesseract-ocr \ + tesseract-ocr-ron \ + tesseract-ocr-eng \ + poppler-utils \ + libgl1-mesa-glx \ + libglib2.0-0 + +# Fedora/RHEL +dnf install -y \ + tesseract \ + tesseract-langpack-ron \ + tesseract-langpack-eng \ + poppler-utils +``` + +**Note:** PaddleOCR (engine principal) se instaleaza automat cu pip. Tesseract este folosit ca fallback. + +### OCR API Endpoints + +| Method | Endpoint | Description | +|--------|----------|-------------| +| GET | /api/ocr/status | Check OCR service status | +| POST | /api/ocr/extract | Extract data from uploaded image | +| POST | /api/ocr/extract-attachment/{id} | Re-process existing attachment | + +### Test OCR + +```bash +# Check OCR status +curl http://localhost:8003/api/ocr/status + +# Extract from image +curl -X POST -F "file=@bon.jpg" http://localhost:8003/api/ocr/extract +``` + ## Workflow ``` DRAFT โ†’ PENDING_REVIEW โ†’ APPROVED/REJECTED โ†’ (SYNCED in Oracle) ``` -1. **DRAFT**: Utilizator completeaza datele +1. **DRAFT**: Utilizator completeaza datele (manual sau via OCR) 2. **PENDING_REVIEW**: Sistemul genereaza note contabile automat 3. **APPROVED**: Contabil a aprobat bonul 4. **REJECTED**: Contabil a respins (utilizatorul poate corecta) @@ -90,8 +167,16 @@ data-entry-app/ โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ models/ # SQLModel models โ”‚ โ”‚ โ”‚ โ””โ”€โ”€ crud/ # CRUD operations โ”‚ โ”‚ โ”œโ”€โ”€ schemas/ # Pydantic schemas -โ”‚ โ”‚ โ”œโ”€โ”€ services/ # Business logic -โ”‚ โ”‚ โ””โ”€โ”€ routers/ # API endpoints +โ”‚ โ”‚ โ”‚ โ””โ”€โ”€ ocr.py # OCR response schemas +โ”‚ โ”‚ โ”œโ”€โ”€ services/ +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ receipt_service.py +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ocr_service.py # OCR orchestration +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ocr_engine.py # PaddleOCR/Tesseract +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ ocr_extractor.py # Regex patterns RO +โ”‚ โ”‚ โ”‚ โ””โ”€โ”€ image_preprocessor.py # OpenCV pipeline +โ”‚ โ”‚ โ””โ”€โ”€ routers/ +โ”‚ โ”‚ โ”œโ”€โ”€ receipts.py +โ”‚ โ”‚ โ””โ”€โ”€ ocr.py # OCR endpoints โ”‚ โ”œโ”€โ”€ migrations/ # Alembic migrations โ”‚ โ”œโ”€โ”€ data/ โ”‚ โ”‚ โ”œโ”€โ”€ receipts.db # SQLite database @@ -101,7 +186,12 @@ data-entry-app/ โ”œโ”€โ”€ frontend/ โ”‚ โ”œโ”€โ”€ src/ โ”‚ โ”‚ โ”œโ”€โ”€ views/receipts/ # Page components -โ”‚ โ”‚ โ”œโ”€โ”€ components/receipts/ # Reusable components +โ”‚ โ”‚ โ”œโ”€โ”€ components/ +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ receipts/ # Receipt components +โ”‚ โ”‚ โ”‚ โ””โ”€โ”€ ocr/ # OCR components +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ OCRUploadZone.vue +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ OCRPreview.vue +โ”‚ โ”‚ โ”‚ โ””โ”€โ”€ OCRConfidenceIndicator.vue โ”‚ โ”‚ โ”œโ”€โ”€ stores/ # Pinia stores โ”‚ โ”‚ โ””โ”€โ”€ router/ # Vue Router โ”‚ โ”œโ”€โ”€ package.json @@ -169,6 +259,23 @@ Full API documentation available at http://localhost:8003/docs when backend is r | POST | /api/receipts/{id}/approve | Approve receipt | | POST | /api/receipts/{id}/reject | Reject receipt | | POST | /api/receipts/{id}/attachments | Upload attachment | +| GET | /api/ocr/status | OCR service status | +| POST | /api/ocr/extract | OCR image extraction | + +## Troubleshooting + +### OCR not working + +1. Check OCR status: `curl http://localhost:8003/api/ocr/status` +2. Install system dependencies (tesseract, poppler) +3. Verify PaddleOCR installed: `python -c "from paddleocr import PaddleOCR"` + +### Low OCR accuracy + +- Ensure good lighting when taking receipt photos +- Keep receipt flat (no folds/wrinkles) +- Try PDF instead of JPG for scanned documents +- Check if text is in focus ## Phase 2 (Future) diff --git a/data-entry-app/backend/app/main.py b/data-entry-app/backend/app/main.py index 82eca1b..cbd68a3 100644 --- a/data-entry-app/backend/app/main.py +++ b/data-entry-app/backend/app/main.py @@ -71,9 +71,10 @@ async def health_check(): # Import and include routers -from app.routers import receipts +from app.routers import receipts, ocr app.include_router(receipts.router, prefix="/api/receipts", tags=["receipts"]) +app.include_router(ocr.router, prefix="/api/ocr", tags=["ocr"]) # Root endpoint diff --git a/data-entry-app/backend/app/routers/ocr.py b/data-entry-app/backend/app/routers/ocr.py new file mode 100644 index 0000000..f071421 --- /dev/null +++ b/data-entry-app/backend/app/routers/ocr.py @@ -0,0 +1,156 @@ +"""OCR API endpoints.""" + +import os +import tempfile +from pathlib import Path + +from fastapi import APIRouter, HTTPException, UploadFile, File, Depends +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db.database import get_session +from app.db.crud.attachment import AttachmentCRUD +from app.services.ocr_service import ocr_service +from app.services.ocr_engine import OCREngine +from app.schemas.ocr import OCRResponse, OCRStatusResponse, ExtractionData + +router = APIRouter() + + +@router.get("/status", response_model=OCRStatusResponse) +async def get_ocr_status(): + """Check OCR service status and available engines.""" + engines = OCREngine.get_available_engines() + available = len(engines) > 0 + + if available: + message = f"OCR service ready with engines: {', '.join(engines)}" + else: + message = "No OCR engines available. Install PaddleOCR or Tesseract." + + return OCRStatusResponse( + available=available, + engines=engines, + message=message + ) + + +@router.post("/extract", response_model=OCRResponse) +async def extract_from_image(file: UploadFile = File(...)): + """ + Extract receipt data from uploaded image. + + Accepts JPG, PNG, or PDF files (max 10MB). + Returns extracted fields with confidence scores. + """ + allowed_types = ['image/jpeg', 'image/png', 'application/pdf'] + + if file.content_type not in allowed_types: + raise HTTPException( + status_code=400, + detail=f"File type not supported: {file.content_type}. Allowed: JPG, PNG, PDF" + ) + + # Get file extension + suffix = Path(file.filename).suffix.lower() if file.filename else '.jpg' + if suffix not in ['.jpg', '.jpeg', '.png', '.pdf']: + suffix = '.jpg' + + # Save to temp file + with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp: + content = await file.read() + + # Check file size (10MB limit) + if len(content) > 10 * 1024 * 1024: + raise HTTPException( + status_code=400, + detail="File too large. Maximum size is 10MB." + ) + + tmp.write(content) + tmp_path = Path(tmp.name) + + try: + success, message, result = await ocr_service.process_image( + tmp_path, file.content_type + ) + + if not success: + raise HTTPException(status_code=422, detail=message) + + # Convert ExtractionResult to ExtractionData schema + data = ExtractionData( + receipt_type=result.receipt_type, + receipt_number=result.receipt_number, + receipt_series=result.receipt_series, + receipt_date=result.receipt_date, + amount=result.amount, + partner_name=result.partner_name, + cui=result.cui, + description=result.description, + confidence_amount=result.confidence_amount, + confidence_date=result.confidence_date, + confidence_vendor=result.confidence_vendor, + overall_confidence=result.overall_confidence, + raw_text=result.raw_text, + ) + + return OCRResponse(success=True, message=message, data=data) + + finally: + # Clean up temp file + if tmp_path.exists(): + os.unlink(tmp_path) + + +@router.post("/extract-attachment/{attachment_id}", response_model=OCRResponse) +async def extract_from_attachment( + attachment_id: int, + session: AsyncSession = Depends(get_session), +): + """ + Extract receipt data from an existing attachment. + + Re-processes an already uploaded file with OCR. + """ + attachment = await AttachmentCRUD.get_by_id(session, attachment_id) + + if not attachment: + raise HTTPException(status_code=404, detail="Attachment not found") + + file_path = AttachmentCRUD.get_file_path(attachment) + + if not file_path.exists(): + raise HTTPException(status_code=404, detail="File not found on disk") + + # Check if file type is supported + if attachment.mime_type not in ['image/jpeg', 'image/png', 'application/pdf']: + raise HTTPException( + status_code=400, + detail=f"File type not supported for OCR: {attachment.mime_type}" + ) + + success, message, result = await ocr_service.process_image( + file_path, attachment.mime_type + ) + + if not success: + raise HTTPException(status_code=422, detail=message) + + # Convert ExtractionResult to ExtractionData schema + data = ExtractionData( + receipt_type=result.receipt_type, + receipt_number=result.receipt_number, + receipt_series=result.receipt_series, + receipt_date=result.receipt_date, + amount=result.amount, + partner_name=result.partner_name, + cui=result.cui, + description=result.description, + confidence_amount=result.confidence_amount, + confidence_date=result.confidence_date, + confidence_vendor=result.confidence_vendor, + overall_confidence=result.overall_confidence, + raw_text=result.raw_text, + ) + + return OCRResponse(success=True, message=message, data=data) diff --git a/data-entry-app/backend/app/schemas/ocr.py b/data-entry-app/backend/app/schemas/ocr.py new file mode 100644 index 0000000..6dd0e4b --- /dev/null +++ b/data-entry-app/backend/app/schemas/ocr.py @@ -0,0 +1,84 @@ +"""Pydantic schemas for OCR API.""" + +from datetime import date +from decimal import Decimal +from typing import Optional + +from pydantic import BaseModel, Field + + +class ExtractionData(BaseModel): + """Extracted receipt data from OCR.""" + + receipt_type: str = Field(default='bon_fiscal', description="Receipt type: bon_fiscal or chitanta") + receipt_number: Optional[str] = Field(default=None, description="Receipt number") + receipt_series: Optional[str] = Field(default=None, description="Receipt series") + receipt_date: Optional[date] = Field(default=None, description="Receipt date") + amount: Optional[Decimal] = Field(default=None, description="Total amount") + partner_name: Optional[str] = Field(default=None, description="Vendor/partner name") + cui: Optional[str] = Field(default=None, description="CUI (fiscal identification code)") + description: Optional[str] = Field(default=None, description="Optional description") + + confidence_amount: float = Field(default=0.0, ge=0, le=1, description="Amount extraction confidence") + confidence_date: float = Field(default=0.0, ge=0, le=1, description="Date extraction confidence") + confidence_vendor: float = Field(default=0.0, ge=0, le=1, description="Vendor extraction confidence") + overall_confidence: float = Field(default=0.0, ge=0, le=1, description="Overall confidence score") + raw_text: str = Field(default="", description="Raw OCR text") + + class Config: + """Pydantic config.""" + json_schema_extra = { + "example": { + "receipt_type": "bon_fiscal", + "receipt_number": "12345", + "receipt_series": None, + "receipt_date": "2024-01-15", + "amount": 125.50, + "partner_name": "MEGA IMAGE SRL", + "cui": "12345678", + "description": None, + "confidence_amount": 0.95, + "confidence_date": 0.90, + "confidence_vendor": 0.75, + "overall_confidence": 0.87, + "raw_text": "BON FISCAL\nMEGA IMAGE SRL\n..." + } + } + + +class OCRResponse(BaseModel): + """OCR API response.""" + + success: bool = Field(description="Whether OCR processing was successful") + message: str = Field(description="Status message") + data: Optional[ExtractionData] = Field(default=None, description="Extracted data") + + class Config: + """Pydantic config.""" + json_schema_extra = { + "example": { + "success": True, + "message": "OCR processing successful. Found: amount, date, vendor", + "data": { + "receipt_type": "bon_fiscal", + "receipt_number": "12345", + "receipt_date": "2024-01-15", + "amount": 125.50, + "partner_name": "MEGA IMAGE SRL", + "cui": "12345678", + "confidence_amount": 0.95, + "confidence_date": 0.90, + "confidence_vendor": 0.75, + "overall_confidence": 0.87, + "raw_text": "BON FISCAL\nMEGA IMAGE SRL\n..." + } + } + } + + +class OCRStatusResponse(BaseModel): + """OCR service status response.""" + + available: bool = Field(description="Whether OCR service is available") + engines: list[str] = Field(description="Available OCR engines") + message: str = Field(description="Status message") diff --git a/data-entry-app/backend/app/services/image_preprocessor.py b/data-entry-app/backend/app/services/image_preprocessor.py new file mode 100644 index 0000000..fd38368 --- /dev/null +++ b/data-entry-app/backend/app/services/image_preprocessor.py @@ -0,0 +1,116 @@ +"""Image preprocessing for optimal OCR results.""" + +from pathlib import Path +from typing import List + +import numpy as np +import cv2 + +try: + import pdf2image + PDF_AVAILABLE = True +except ImportError: + PDF_AVAILABLE = False + + +class ImagePreprocessor: + """Preprocess receipt images for OCR.""" + + def load_image(self, path: Path) -> np.ndarray: + """Load image from file.""" + image = cv2.imread(str(path)) + if image is None: + raise ValueError(f"Could not load image: {path}") + return image + + def pdf_to_images(self, path: Path, dpi: int = 300) -> List[np.ndarray]: + """Convert PDF to images.""" + if not PDF_AVAILABLE: + raise RuntimeError("pdf2image not available. Install with: pip install pdf2image") + images = pdf2image.convert_from_path(str(path), dpi=dpi) + return [np.array(img) for img in images] + + def preprocess(self, image: np.ndarray) -> np.ndarray: + """ + Apply preprocessing pipeline for thermal receipt images. + + Pipeline: + 1. Convert to grayscale + 2. Resize if too small (min 1000px width) + 3. Deskew (straighten rotated text) + 4. Denoise (Non-local means) + 5. Adaptive thresholding (binarization) + 6. Morphological close (connect broken chars) + """ + # 1. Grayscale + if len(image.shape) == 3: + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + else: + gray = image.copy() + + # 2. Resize if too small + height, width = gray.shape + if width < 1000: + scale = 1000 / width + gray = cv2.resize( + gray, None, fx=scale, fy=scale, + interpolation=cv2.INTER_CUBIC + ) + + # 3. Deskew + gray = self._deskew(gray) + + # 4. Denoise + denoised = cv2.fastNlMeansDenoising( + gray, h=10, + templateWindowSize=7, + searchWindowSize=21 + ) + + # 5. Adaptive thresholding + binary = cv2.adaptiveThreshold( + denoised, 255, + cv2.ADAPTIVE_THRESH_GAUSSIAN_C, + cv2.THRESH_BINARY, + blockSize=15, C=8 + ) + + # 6. Morphological close + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2)) + result = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel) + + return result + + def _deskew(self, image: np.ndarray) -> np.ndarray: + """Correct image rotation/skew using Hough lines.""" + edges = cv2.Canny(image, 50, 150, apertureSize=3) + lines = cv2.HoughLinesP( + edges, 1, np.pi / 180, + threshold=100, minLineLength=100, maxLineGap=10 + ) + + if lines is None: + return image + + angles = [] + for line in lines: + x1, y1, x2, y2 = line[0] + angle = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi + if abs(angle) < 45: + angles.append(angle) + + if not angles: + return image + + median_angle = np.median(angles) + if abs(median_angle) < 0.5: + return image + + h, w = image.shape[:2] + center = (w // 2, h // 2) + M = cv2.getRotationMatrix2D(center, median_angle, 1.0) + return cv2.warpAffine( + image, M, (w, h), + flags=cv2.INTER_CUBIC, + borderMode=cv2.BORDER_REPLICATE + ) diff --git a/data-entry-app/backend/app/services/ocr_engine.py b/data-entry-app/backend/app/services/ocr_engine.py new file mode 100644 index 0000000..7d46027 --- /dev/null +++ b/data-entry-app/backend/app/services/ocr_engine.py @@ -0,0 +1,168 @@ +"""OCR engine wrapper for PaddleOCR and Tesseract.""" + +import os +from dataclasses import dataclass +from typing import List, Optional + +import numpy as np + +# Disable PaddleOCR model source check for faster startup (PaddleX 3.x) +os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True' + +# Lazy imports - these will be imported on first use +PaddleOCR = None # Will be imported lazily +pytesseract = None # Will be imported lazily + +# Check availability without importing heavy libraries +def _check_paddle_available() -> bool: + """Check if paddleocr is installed without importing it.""" + try: + import importlib.util + return importlib.util.find_spec("paddleocr") is not None + except Exception: + return False + +def _check_tesseract_available() -> bool: + """Check if pytesseract is installed without importing it.""" + try: + import importlib.util + return importlib.util.find_spec("pytesseract") is not None + except Exception: + return False + +PADDLE_AVAILABLE = _check_paddle_available() +TESSERACT_AVAILABLE = _check_tesseract_available() + + +@dataclass +class OCRResult: + """Raw OCR result.""" + text: str + confidence: float + boxes: List[dict] + + +class OCREngine: + """Unified OCR engine with fallback support.""" + + def __init__(self): + self._paddle = None + self._paddle_initialized = False + + def _init_paddle_lazy(self): + """Lazy initialize PaddleOCR on first use (avoids slow startup).""" + global PaddleOCR + + if self._paddle_initialized: + return + + self._paddle_initialized = True + if PADDLE_AVAILABLE: + try: + print("Importing PaddleOCR (first use, may take ~15-20 seconds)...") + from paddleocr import PaddleOCR as _PaddleOCR + PaddleOCR = _PaddleOCR + + print("Initializing PaddleOCR engine...") + # PaddleOCR 3.x API - simplified parameters + self._paddle = PaddleOCR( + lang='en', # Better for mixed text with numbers + ) + print("PaddleOCR initialized successfully") + except Exception as e: + print(f"Warning: Failed to initialize PaddleOCR: {e}") + self._paddle = None + + def recognize(self, image: np.ndarray) -> OCRResult: + """Perform OCR on preprocessed image.""" + # Lazy init PaddleOCR on first call + self._init_paddle_lazy() + + if PADDLE_AVAILABLE and self._paddle: + return self._paddle_recognize(image) + elif TESSERACT_AVAILABLE: + return self._tesseract_recognize(image) + else: + raise RuntimeError( + "No OCR engine available. Install PaddleOCR or Tesseract." + ) + + def _paddle_recognize(self, image: np.ndarray) -> OCRResult: + """Recognize text using PaddleOCR 3.x API.""" + try: + # PaddleOCR 3.x requires 3-channel images + if len(image.shape) == 2: + # Convert grayscale to 3-channel BGR + import cv2 + image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR) + + # PaddleOCR 3.x uses predict() with new parameter names + result = self._paddle.predict(image, use_textline_orientation=True) + + if not result or len(result) == 0: + return OCRResult(text="", confidence=0.0, boxes=[]) + + # PaddleOCR 3.x returns OCRResult objects with different structure + ocr_result = result[0] + + # Extract texts and scores from the new format + rec_texts = ocr_result.get('rec_texts', []) + rec_scores = ocr_result.get('rec_scores', []) + dt_polys = ocr_result.get('dt_polys', []) + + if not rec_texts: + return OCRResult(text="", confidence=0.0, boxes=[]) + + boxes = [] + for i, text in enumerate(rec_texts): + conf = rec_scores[i] if i < len(rec_scores) else 0.0 + box = dt_polys[i].tolist() if i < len(dt_polys) else [] + boxes.append({ + 'text': text, + 'confidence': float(conf), + 'box': box + }) + + avg_conf = sum(rec_scores) / len(rec_scores) if rec_scores else 0.0 + return OCRResult( + text='\n'.join(rec_texts), + confidence=float(avg_conf), + boxes=boxes + ) + except Exception as e: + print(f"PaddleOCR error: {e}, falling back to Tesseract") + if TESSERACT_AVAILABLE: + return self._tesseract_recognize(image) + raise + + def _tesseract_recognize(self, image: np.ndarray) -> OCRResult: + """Recognize text using Tesseract.""" + global pytesseract + + # Lazy import pytesseract + if pytesseract is None: + print("Importing pytesseract...") + import pytesseract as _pytesseract + pytesseract = _pytesseract + + config = '--psm 6 -l ron+eng' + text = pytesseract.image_to_string(image, config=config) + data = pytesseract.image_to_data( + image, config=config, + output_type=pytesseract.Output.DICT + ) + + confidences = [int(c) for c in data['conf'] if int(c) > 0] + avg_conf = sum(confidences) / len(confidences) / 100 if confidences else 0.0 + + return OCRResult(text=text, confidence=avg_conf, boxes=[]) + + @staticmethod + def get_available_engines() -> List[str]: + """Return list of available OCR engines.""" + engines = [] + if PADDLE_AVAILABLE: + engines.append('paddleocr') + if TESSERACT_AVAILABLE: + engines.append('tesseract') + return engines diff --git a/data-entry-app/backend/app/services/ocr_extractor.py b/data-entry-app/backend/app/services/ocr_extractor.py new file mode 100644 index 0000000..e815079 --- /dev/null +++ b/data-entry-app/backend/app/services/ocr_extractor.py @@ -0,0 +1,231 @@ +"""Extract structured fields from OCR text (Romanian receipts).""" + +import re +from datetime import date, datetime +from decimal import Decimal, InvalidOperation +from typing import Optional, Tuple +from dataclasses import dataclass, field + + +@dataclass +class ExtractionResult: + """Structured extraction result from receipt.""" + receipt_type: str = 'bon_fiscal' + receipt_number: Optional[str] = None + receipt_series: Optional[str] = None + receipt_date: Optional[date] = None + amount: Optional[Decimal] = None + partner_name: Optional[str] = None + cui: Optional[str] = None + description: Optional[str] = None + + confidence_amount: float = 0.0 + confidence_date: float = 0.0 + confidence_vendor: float = 0.0 + raw_text: str = "" + + @property + def overall_confidence(self) -> float: + """Calculate weighted overall confidence score.""" + weights = {'amount': 0.4, 'date': 0.3, 'vendor': 0.3} + return round( + self.confidence_amount * weights['amount'] + + self.confidence_date * weights['date'] + + self.confidence_vendor * weights['vendor'], + 2 + ) + + +class ReceiptExtractor: + """Extract receipt fields using pattern matching for Romanian receipts.""" + + # Total amount patterns (most specific first) + TOTAL_PATTERNS = [ + (r'TOTAL\s*:?\s*([\d\s.,]+)\s*(?:RON|LEI)?', 0.95), + (r'TOTAL\s+(?:RON|LEI)\s*([\d\s.,]+)', 0.95), + (r'DE\s+PLATA\s*:?\s*([\d\s.,]+)', 0.90), + (r'SUMA\s*:?\s*([\d\s.,]+)', 0.85), + (r'PLATA\s+CARD\s*:?\s*([\d\s.,]+)', 0.85), + (r'NUMERAR\s*:?\s*([\d\s.,]+)', 0.80), + ] + + # Date patterns + DATE_PATTERNS = [ + (r'DATA\s*:?\s*(\d{2}[./]\d{2}[./]\d{4})', 0.95), + (r'(\d{2}[./]\d{2}[./]\d{4})\s+\d{2}:\d{2}', 0.90), + (r'(\d{2}[./]\d{2}[./]\d{4})', 0.80), + (r'(\d{4}[./]\d{2}[./]\d{2})', 0.75), # YYYY.MM.DD format + ] + + # Receipt number patterns + NUMBER_PATTERNS = [ + (r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95), + (r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95), + (r'CHITANTA\s+NR\.?\s*:?\s*(\d+)', 0.95), + (r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90), + (r'NR\.?\s*:?\s*(\d{4,})', 0.70), + ] + + # CUI (fiscal code) patterns + CUI_PATTERNS = [ + (r'C\.?U\.?I\.?\s*:?\s*(?:RO)?(\d{6,10})', 0.95), + (r'C\.?I\.?F\.?\s*:?\s*(?:RO)?(\d{6,10})', 0.95), + (r'COD\s+FISCAL\s*:?\s*(?:RO)?(\d{6,10})', 0.90), + (r'(?:RO)?(\d{6,10})\s*-?\s*(?:J|CUI)', 0.80), + ] + + # Series patterns + SERIES_PATTERNS = [ + (r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90), + (r'([A-Z]{2,4})\s+NR\.?\s*\d+', 0.80), + ] + + def extract(self, text: str) -> ExtractionResult: + """Extract all fields from OCR text.""" + result = ExtractionResult() + result.raw_text = text + text_upper = text.upper() + + # Extract fields + result.amount, result.confidence_amount = self._extract_amount(text_upper) + result.receipt_date, result.confidence_date = self._extract_date(text_upper) + result.receipt_number, _ = self._extract_number(text_upper) + result.receipt_series, _ = self._extract_series(text_upper) + result.partner_name, result.confidence_vendor = self._extract_vendor(text) + result.cui, _ = self._extract_cui(text_upper) + + # Detect receipt type + result.receipt_type = self._detect_receipt_type(text_upper) + + return result + + def _extract_amount(self, text: str) -> Tuple[Optional[Decimal], float]: + """Extract total amount from text.""" + for pattern, confidence in self.TOTAL_PATTERNS: + match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE) + if match: + try: + amount_str = re.sub(r'[^\d.,]', '', match.group(1)) + # Handle Romanian number format (1.234,56) + amount_str = self._normalize_number(amount_str) + amount = Decimal(amount_str) + if amount > 0: + return amount, confidence + except (InvalidOperation, ValueError): + continue + return None, 0.0 + + def _normalize_number(self, num_str: str) -> str: + """Normalize Romanian number format to standard decimal.""" + # Remove spaces + num_str = num_str.replace(' ', '') + + # Handle comma as decimal separator + if ',' in num_str and '.' in num_str: + # Romanian format: 1.234,56 + num_str = num_str.replace('.', '').replace(',', '.') + elif ',' in num_str: + # Could be 1,50 or 1,234 + parts = num_str.split(',') + if len(parts) == 2 and len(parts[1]) <= 2: + # Decimal comma: 1,50 + num_str = num_str.replace(',', '.') + else: + # Thousands comma: 1,234 + num_str = num_str.replace(',', '') + elif '.' in num_str: + parts = num_str.split('.') + if len(parts) > 2: + # Multiple dots: 1.234.567 -> 1234567 + num_str = ''.join(parts[:-1]) + '.' + parts[-1] + + return num_str + + def _extract_date(self, text: str) -> Tuple[Optional[date], float]: + """Extract receipt date from text.""" + for pattern, confidence in self.DATE_PATTERNS: + match = re.search(pattern, text) + if match: + try: + date_str = match.group(1).replace('/', '.') + + # Try DD.MM.YYYY format first + try: + parsed = datetime.strptime(date_str, '%d.%m.%Y').date() + except ValueError: + # Try YYYY.MM.DD format + parsed = datetime.strptime(date_str, '%Y.%m.%d').date() + + # Validate date range + today = date.today() + if parsed <= today and parsed.year >= 2020: + return parsed, confidence + except ValueError: + continue + return None, 0.0 + + def _extract_number(self, text: str) -> Tuple[Optional[str], float]: + """Extract receipt number from text.""" + for pattern, confidence in self.NUMBER_PATTERNS: + match = re.search(pattern, text, re.IGNORECASE) + if match: + return match.group(1), confidence + return None, 0.0 + + def _extract_series(self, text: str) -> Tuple[Optional[str], float]: + """Extract receipt series from text.""" + for pattern, confidence in self.SERIES_PATTERNS: + match = re.search(pattern, text, re.IGNORECASE) + if match: + return match.group(1).upper(), confidence + return None, 0.0 + + def _extract_vendor(self, text: str) -> Tuple[Optional[str], float]: + """Extract vendor/partner name from text.""" + lines = text.split('\n') + skip_keywords = [ + 'BON', 'FISCAL', 'TOTAL', 'DATA', 'NR', 'ORA', + 'SUBTOTAL', 'TVA', 'PLATA', 'CARD', 'NUMERAR', + 'RON', 'LEI', 'CHITANTA', 'REST' + ] + + for i, line in enumerate(lines[:7]): # Check first 7 lines + line = line.strip() + + # Skip empty lines + if not line: + continue + + # Skip lines that are just numbers + if re.match(r'^[\d.,\s]+$', line): + continue + + # Skip lines with keywords + if any(kw in line.upper() for kw in skip_keywords): + continue + + # Clean the line + vendor = re.sub(r'[^\w\s.,&-]', '', line).strip() + + if len(vendor) >= 3: + # Confidence decreases for lines further down + confidence = max(0.3, 0.8 - (i * 0.1)) + return vendor, confidence + + return None, 0.0 + + def _extract_cui(self, text: str) -> Tuple[Optional[str], float]: + """Extract CUI (fiscal identification code) from text.""" + for pattern, confidence in self.CUI_PATTERNS: + match = re.search(pattern, text, re.IGNORECASE) + if match: + cui = match.group(1) + if 6 <= len(cui) <= 10: + return cui, confidence + return None, 0.0 + + def _detect_receipt_type(self, text: str) -> str: + """Detect receipt type from text content.""" + if 'CHITANTA' in text or 'CHITANศšฤ‚' in text: + return 'chitanta' + return 'bon_fiscal' diff --git a/data-entry-app/backend/app/services/ocr_service.py b/data-entry-app/backend/app/services/ocr_service.py new file mode 100644 index 0000000..3d54ec6 --- /dev/null +++ b/data-entry-app/backend/app/services/ocr_service.py @@ -0,0 +1,110 @@ +"""Main OCR service coordinating preprocessing, recognition, and extraction.""" + +import os +# Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import +os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True' + +import asyncio +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path +from typing import Optional, Tuple + +from app.services.ocr_engine import OCREngine +from app.services.ocr_extractor import ReceiptExtractor, ExtractionResult +from app.services.image_preprocessor import ImagePreprocessor + + +class OCRService: + """Service for OCR processing of receipt images.""" + + _executor = ThreadPoolExecutor(max_workers=2) + + def __init__(self): + self.preprocessor = ImagePreprocessor() + self.ocr_engine = OCREngine() + self.extractor = ReceiptExtractor() + + async def process_image( + self, + image_path: Path, + mime_type: str + ) -> Tuple[bool, str, Optional[ExtractionResult]]: + """ + Process receipt image and extract structured data. + + Args: + image_path: Path to the image file + mime_type: MIME type of the file + + Returns: + Tuple of (success, message, extraction_result) + """ + try: + loop = asyncio.get_event_loop() + result = await loop.run_in_executor( + self._executor, + self._process_sync, + image_path, + mime_type + ) + return result + except Exception as e: + return False, f"OCR processing failed: {str(e)}", None + + def _process_sync( + self, + image_path: Path, + mime_type: str + ) -> Tuple[bool, str, Optional[ExtractionResult]]: + """Synchronous processing (runs in thread pool).""" + + # Handle PDF + if mime_type == 'application/pdf': + try: + images = self.preprocessor.pdf_to_images(image_path) + if not images: + return False, "Failed to extract images from PDF", None + image = images[0] # Process first page only + except RuntimeError as e: + return False, str(e), None + else: + try: + image = self.preprocessor.load_image(image_path) + except ValueError as e: + return False, str(e), None + + # Preprocess image + processed = self.preprocessor.preprocess(image) + + # Perform OCR + try: + ocr_result = self.ocr_engine.recognize(processed) + except RuntimeError as e: + return False, str(e), None + + if not ocr_result.text: + return False, "No text detected in image", None + + # Extract structured fields + extraction = self.extractor.extract(ocr_result.text) + + # Build result message + fields_found = [] + if extraction.amount: + fields_found.append("amount") + if extraction.receipt_date: + fields_found.append("date") + if extraction.partner_name: + fields_found.append("vendor") + if extraction.cui: + fields_found.append("CUI") + if extraction.receipt_number: + fields_found.append("number") + + message = f"OCR processing successful. Found: {', '.join(fields_found) or 'no fields'}" + + return True, message, extraction + + +# Singleton instance +ocr_service = OCRService() diff --git a/data-entry-app/backend/requirements.txt b/data-entry-app/backend/requirements.txt index 3164347..08770f4 100644 --- a/data-entry-app/backend/requirements.txt +++ b/data-entry-app/backend/requirements.txt @@ -30,3 +30,11 @@ httpx>=0.26.0 # Testing pytest>=8.0.0 pytest-asyncio>=0.23.3 + +# OCR Dependencies +paddleocr>=2.7.0 +paddlepaddle>=2.5.0 +opencv-python>=4.8.0 +pytesseract>=0.3.10 +pdf2image>=1.16.0 +numpy>=1.24.0 diff --git a/data-entry-app/frontend/src/components/ocr/OCRConfidenceIndicator.vue b/data-entry-app/frontend/src/components/ocr/OCRConfidenceIndicator.vue new file mode 100644 index 0000000..f4e8c5f --- /dev/null +++ b/data-entry-app/frontend/src/components/ocr/OCRConfidenceIndicator.vue @@ -0,0 +1,125 @@ + + + + + diff --git a/data-entry-app/frontend/src/components/ocr/OCRPreview.vue b/data-entry-app/frontend/src/components/ocr/OCRPreview.vue new file mode 100644 index 0000000..78127ff --- /dev/null +++ b/data-entry-app/frontend/src/components/ocr/OCRPreview.vue @@ -0,0 +1,279 @@ + + + + + diff --git a/data-entry-app/frontend/src/components/ocr/OCRUploadZone.vue b/data-entry-app/frontend/src/components/ocr/OCRUploadZone.vue new file mode 100644 index 0000000..cc8aa33 --- /dev/null +++ b/data-entry-app/frontend/src/components/ocr/OCRUploadZone.vue @@ -0,0 +1,291 @@ + + + + + diff --git a/data-entry-app/frontend/src/views/receipts/ReceiptCreateView.vue b/data-entry-app/frontend/src/views/receipts/ReceiptCreateView.vue index d6356bf..f64e554 100644 --- a/data-entry-app/frontend/src/views/receipts/ReceiptCreateView.vue +++ b/data-entry-app/frontend/src/views/receipts/ReceiptCreateView.vue @@ -15,14 +15,43 @@
- -
+ +

Poza Bon (obligatoriu)

+ + + + + +
+ + +
+

+ + Poza Bon (obligatoriu) +

+

+ + Fisiere Selectate +

+
+ + +
+
+ + {{ file.name }} + {{ formatFileSize(file.size) }} +
+
@@ -235,10 +284,12 @@