feat: Improve OCR adaptive pipeline with early exit and better pattern matching
- Add adaptive 3-step OCR pipeline with early exit when all 5 fields found
- Add pattern for "C. I. F." with spaces (OCR artifact from PaddleOCR)
- Add pattern for YYYY. MM. DD date format with spaces (OMV/Petrom receipts)
- Add pattern for "OTAL TAXE" with T cut off and reversed amount position
- Make TVA rate pattern more flexible (code letter optional, handle "-21%")
- Replace logger.info with print(flush=True) for better debugging visibility
- Improve OCRPreview.vue to show extraction progress and raw OCR text

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,10 +1,19 @@
|
|||||||
"""FastAPI application entry point for Data Entry App."""
|
"""FastAPI application entry point for Data Entry App."""
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
|
import logging
|
||||||
|
import threading
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
|
|
||||||
from fastapi import FastAPI
|
from fastapi import FastAPI
|
||||||
|
|
||||||
|
# Configure logging to show INFO level messages
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||||
|
datefmt='%H:%M:%S'
|
||||||
|
)
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
from fastapi.staticfiles import StaticFiles
|
from fastapi.staticfiles import StaticFiles
|
||||||
|
|
||||||
@@ -30,6 +39,18 @@ async def lifespan(app: FastAPI):
|
|||||||
settings.upload_path_resolved
|
settings.upload_path_resolved
|
||||||
print(f"Upload path: {settings.upload_path_resolved}")
|
print(f"Upload path: {settings.upload_path_resolved}")
|
||||||
|
|
||||||
|
# Pre-initialize OCR engine in background (PaddleOCR takes 15-20s)
|
||||||
|
def init_ocr_background():
    """Warm up the OCR engine ahead of the first request.

    Intended as a thread target: it imports the shared ``ocr_service`` and
    asks its engine to run the lazy PaddleOCR initialisation. Progress is
    reported on stdout. Any exception is reported as a warning and
    swallowed, so a failed warm-up never propagates out of this function.
    """
    try:
        # Imported inside the function so that an import failure is also
        # covered by the handler below.
        from app.services.ocr_service import ocr_service

        engine = ocr_service.ocr_engine
        engine._init_paddle_lazy()
        print("OCR engine ready")
    except Exception as e:
        # Best-effort pre-load: report the problem and carry on.
        print(f"Warning: OCR engine pre-load failed: {e}")
|
||||||
|
|
||||||
|
print("Starting OCR engine pre-load (background)...")
|
||||||
|
threading.Thread(target=init_ocr_background, daemon=True).start()
|
||||||
|
|
||||||
yield
|
yield
|
||||||
|
|
||||||
# Shutdown
|
# Shutdown
|
||||||
|
|||||||
@@ -102,6 +102,8 @@ async def extract_from_image(file: UploadFile = File(...)):
|
|||||||
confidence_vendor=result.confidence_vendor,
|
confidence_vendor=result.confidence_vendor,
|
||||||
overall_confidence=result.overall_confidence,
|
overall_confidence=result.overall_confidence,
|
||||||
raw_text=result.raw_text,
|
raw_text=result.raw_text,
|
||||||
|
ocr_engine=result.ocr_engine,
|
||||||
|
processing_time_ms=result.processing_time_ms,
|
||||||
)
|
)
|
||||||
|
|
||||||
return OCRResponse(success=True, message=message, data=data)
|
return OCRResponse(success=True, message=message, data=data)
|
||||||
@@ -171,6 +173,8 @@ async def extract_from_attachment(
|
|||||||
confidence_vendor=result.confidence_vendor,
|
confidence_vendor=result.confidence_vendor,
|
||||||
overall_confidence=result.overall_confidence,
|
overall_confidence=result.overall_confidence,
|
||||||
raw_text=result.raw_text,
|
raw_text=result.raw_text,
|
||||||
|
ocr_engine=result.ocr_engine,
|
||||||
|
processing_time_ms=result.processing_time_ms,
|
||||||
)
|
)
|
||||||
|
|
||||||
return OCRResponse(success=True, message=message, data=data)
|
return OCRResponse(success=True, message=message, data=data)
|
||||||
|
|||||||
@@ -37,6 +37,8 @@ class ExtractionData(BaseModel):
|
|||||||
confidence_vendor: float = Field(default=0.0, ge=0, le=1, description="Vendor extraction confidence")
|
confidence_vendor: float = Field(default=0.0, ge=0, le=1, description="Vendor extraction confidence")
|
||||||
overall_confidence: float = Field(default=0.0, ge=0, le=1, description="Overall confidence score")
|
overall_confidence: float = Field(default=0.0, ge=0, le=1, description="Overall confidence score")
|
||||||
raw_text: str = Field(default="", description="Raw OCR text")
|
raw_text: str = Field(default="", description="Raw OCR text")
|
||||||
|
ocr_engine: str = Field(default="", description="OCR engine used: paddleocr or tesseract")
|
||||||
|
processing_time_ms: int = Field(default=0, ge=0, description="Processing time in milliseconds")
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
"""Pydantic config."""
|
"""Pydantic config."""
|
||||||
|
|||||||
@@ -23,37 +23,57 @@ class ImagePreprocessor:
|
|||||||
raise ValueError(f"Could not load image: {path}")
|
raise ValueError(f"Could not load image: {path}")
|
||||||
return image
|
return image
|
||||||
|
|
||||||
def pdf_to_images(self, path: Path, dpi: int = 400) -> List[np.ndarray]:
|
def pdf_to_images(self, path: Path, dpi: int = 300) -> List[np.ndarray]:
|
||||||
"""
|
"""
|
||||||
Convert PDF to images with high DPI for better OCR.
|
Convert PDF to images.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
path: Path to PDF file
|
path: Path to PDF file
|
||||||
dpi: Resolution (400 recommended for receipts, higher = better quality but slower)
|
dpi: Resolution (300 = fast & good quality, 400 = better but slower)
|
||||||
"""
|
"""
|
||||||
if not PDF_AVAILABLE:
|
if not PDF_AVAILABLE:
|
||||||
raise RuntimeError("pdf2image not available. Install with: pip install pdf2image")
|
raise RuntimeError("pdf2image not available. Install with: pip install pdf2image")
|
||||||
# Use 400 DPI for better text recognition on thermal receipts
|
|
||||||
images = pdf2image.convert_from_path(str(path), dpi=dpi)
|
images = pdf2image.convert_from_path(str(path), dpi=dpi)
|
||||||
return [np.array(img) for img in images]
|
return [np.array(img) for img in images]
|
||||||
|
|
||||||
def preprocess(self, image: np.ndarray, high_quality: bool = True) -> np.ndarray:
|
def preprocess(self, image: np.ndarray, high_quality: bool = True) -> np.ndarray:
|
||||||
"""
|
"""
|
||||||
Apply preprocessing pipeline for thermal receipt images.
|
Apply LIGHT preprocessing - better for clear PDFs.
|
||||||
|
Heavy binarization can destroy text on clear images.
|
||||||
|
"""
|
||||||
|
return self.preprocess_light(image)
|
||||||
|
|
||||||
Pipeline:
|
def preprocess_light(self, image: np.ndarray) -> np.ndarray:
|
||||||
1. Convert to grayscale
|
"""
|
||||||
2. Resize if too small (min 1500px width for high quality)
|
Light preprocessing for CLEAR images (PDFs, good scans).
|
||||||
3. Deskew (straighten rotated text)
|
Preserves original quality, only enhances contrast.
|
||||||
4. Contrast enhancement (CLAHE)
|
"""
|
||||||
5. Denoise (Non-local means)
|
# 1. Grayscale
|
||||||
6. Sharpening (for clearer text edges)
|
if len(image.shape) == 3:
|
||||||
7. Adaptive thresholding (binarization)
|
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
|
||||||
8. Morphological operations (connect broken chars)
|
else:
|
||||||
|
gray = image.copy()
|
||||||
|
|
||||||
Args:
|
# 2. Resize if too small
|
||||||
image: Input image (BGR or grayscale)
|
height, width = gray.shape
|
||||||
high_quality: If True, apply more aggressive preprocessing
|
if width < 1500:
|
||||||
|
scale = 1500 / width
|
||||||
|
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
|
||||||
|
|
||||||
|
# 3. Deskew
|
||||||
|
gray = self._deskew(gray)
|
||||||
|
|
||||||
|
# 4. Light contrast enhancement only
|
||||||
|
clahe = cv2.createCLAHE(clipLimit=1.5, tileGridSize=(8, 8))
|
||||||
|
enhanced = clahe.apply(gray)
|
||||||
|
|
||||||
|
# NO binarization, NO morphological ops - preserve original quality
|
||||||
|
return enhanced
|
||||||
|
|
||||||
|
def preprocess_heavy(self, image: np.ndarray) -> np.ndarray:
|
||||||
|
"""
|
||||||
|
Heavy preprocessing for FADED thermal receipts.
|
||||||
|
Aggressive binarization to recover faded text.
|
||||||
"""
|
"""
|
||||||
# 1. Grayscale
|
# 1. Grayscale
|
||||||
if len(image.shape) == 3:
|
if len(image.shape) == 3:
|
||||||
@@ -63,57 +83,48 @@ class ImagePreprocessor:
|
|||||||
|
|
||||||
# 2. Resize if too small (larger = better OCR)
|
# 2. Resize if too small (larger = better OCR)
|
||||||
height, width = gray.shape
|
height, width = gray.shape
|
||||||
min_width = 1500 if high_quality else 1000
|
if width < 1500:
|
||||||
if width < min_width:
|
scale = 1500 / width
|
||||||
scale = min_width / width
|
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
|
||||||
gray = cv2.resize(
|
|
||||||
gray, None, fx=scale, fy=scale,
|
|
||||||
interpolation=cv2.INTER_CUBIC
|
|
||||||
)
|
|
||||||
|
|
||||||
# 3. Deskew
|
# 3. Deskew
|
||||||
gray = self._deskew(gray)
|
gray = self._deskew(gray)
|
||||||
|
|
||||||
# 4. Contrast enhancement with CLAHE (Contrast Limited Adaptive Histogram Equalization)
|
# 4. Contrast enhancement with CLAHE
|
||||||
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
|
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
|
||||||
enhanced = clahe.apply(gray)
|
enhanced = clahe.apply(gray)
|
||||||
|
|
||||||
# 5. Denoise (slightly less aggressive to preserve text details)
|
# 5. Denoise
|
||||||
denoised = cv2.fastNlMeansDenoising(
|
denoised = cv2.fastNlMeansDenoising(enhanced, h=8, templateWindowSize=7, searchWindowSize=21)
|
||||||
enhanced, h=8, # Lower h = preserve more details
|
|
||||||
templateWindowSize=7,
|
|
||||||
searchWindowSize=21
|
|
||||||
)
|
|
||||||
|
|
||||||
# 6. Sharpening to enhance text edges
|
# 6. Sharpening
|
||||||
if high_quality:
|
gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0)
|
||||||
# Unsharp mask for better text clarity
|
sharpened = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0)
|
||||||
gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0)
|
|
||||||
sharpened = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0)
|
|
||||||
else:
|
|
||||||
sharpened = denoised
|
|
||||||
|
|
||||||
# 7. Adaptive thresholding with optimized parameters
|
# 7. Adaptive thresholding (binarization)
|
||||||
binary = cv2.adaptiveThreshold(
|
binary = cv2.adaptiveThreshold(
|
||||||
sharpened, 255,
|
sharpened, 255,
|
||||||
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
||||||
cv2.THRESH_BINARY,
|
cv2.THRESH_BINARY,
|
||||||
blockSize=11, # Smaller block = better for small text
|
blockSize=11, C=5
|
||||||
C=5 # Lower C = darker result, better for faded receipts
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# 8. Morphological operations
|
# 8. Morphological operations
|
||||||
# Close small gaps in characters
|
|
||||||
kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
|
kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
|
||||||
result = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel_close)
|
result = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel_close)
|
||||||
|
|
||||||
# Optional: Remove small noise spots
|
|
||||||
if high_quality:
|
|
||||||
kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
|
|
||||||
result = cv2.morphologyEx(result, cv2.MORPH_OPEN, kernel_open)
|
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def get_all_variants(self, image: np.ndarray) -> List[np.ndarray]:
    """Build both preprocessing variants of ``image`` for OCR.

    Args:
        image: Input image, handed unchanged to each preprocessing method.

    Returns:
        A two-element list ``[light, heavy]``: the output of
        ``preprocess_light`` followed by the output of ``preprocess_heavy``.
    """
    light_variant = self.preprocess_light(image)
    heavy_variant = self.preprocess_heavy(image)
    # Order matters to callers that index the list: light first, heavy second.
    return [light_variant, heavy_variant]
|
||||||
|
|
||||||
def _deskew(self, image: np.ndarray) -> np.ndarray:
|
def _deskew(self, image: np.ndarray) -> np.ndarray:
|
||||||
"""Correct image rotation/skew using Hough lines."""
|
"""Correct image rotation/skew using Hough lines."""
|
||||||
edges = cv2.Canny(image, 50, 150, apertureSize=3)
|
edges = cv2.Canny(image, 50, 150, apertureSize=3)
|
||||||
|
|||||||
@@ -1,11 +1,16 @@
|
|||||||
"""OCR engine wrapper for PaddleOCR and Tesseract."""
|
"""OCR engine wrapper for PaddleOCR and Tesseract."""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import logging
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import List, Optional
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
# Setup logging
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
logging.basicConfig(level=logging.INFO) # Ensure logs are visible
|
||||||
|
|
||||||
# Disable PaddleOCR model source check for faster startup (PaddleX 3.x)
|
# Disable PaddleOCR model source check for faster startup (PaddleX 3.x)
|
||||||
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
|
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
|
||||||
|
|
||||||
@@ -40,6 +45,7 @@ class OCRResult:
|
|||||||
text: str
|
text: str
|
||||||
confidence: float
|
confidence: float
|
||||||
boxes: List[dict]
|
boxes: List[dict]
|
||||||
|
engine: str = "" # OCR engine used: paddleocr or tesseract
|
||||||
|
|
||||||
|
|
||||||
class OCREngine:
|
class OCREngine:
|
||||||
@@ -65,8 +71,9 @@ class OCREngine:
|
|||||||
|
|
||||||
print("Initializing PaddleOCR engine...")
|
print("Initializing PaddleOCR engine...")
|
||||||
# PaddleOCR 3.x API - optimized for Romanian receipts
|
# PaddleOCR 3.x API - optimized for Romanian receipts
|
||||||
|
# Note: 'latin' not available in PaddleOCR 3.x, 'en' works well for receipts
|
||||||
self._paddle = PaddleOCR(
|
self._paddle = PaddleOCR(
|
||||||
lang='en', # 'en' works better than 'ro' for mixed alphanumeric
|
lang='en', # 'en' handles Latin alphabet well for receipts
|
||||||
# High quality settings for better accuracy
|
# High quality settings for better accuracy
|
||||||
det_db_thresh=0.3, # Lower threshold = detect more text (default 0.3)
|
det_db_thresh=0.3, # Lower threshold = detect more text (default 0.3)
|
||||||
det_db_box_thresh=0.5, # Box confidence threshold (default 0.5)
|
det_db_box_thresh=0.5, # Box confidence threshold (default 0.5)
|
||||||
@@ -81,14 +88,19 @@ class OCREngine:
|
|||||||
|
|
||||||
def recognize(self, image: np.ndarray) -> OCRResult:
|
def recognize(self, image: np.ndarray) -> OCRResult:
|
||||||
"""Perform OCR on preprocessed image."""
|
"""Perform OCR on preprocessed image."""
|
||||||
|
logger.info(f"[OCR] Starting recognition, image shape: {image.shape}, dtype: {image.dtype}")
|
||||||
|
|
||||||
# Lazy init PaddleOCR on first call
|
# Lazy init PaddleOCR on first call
|
||||||
self._init_paddle_lazy()
|
self._init_paddle_lazy()
|
||||||
|
|
||||||
if PADDLE_AVAILABLE and self._paddle:
|
if PADDLE_AVAILABLE and self._paddle:
|
||||||
|
logger.info("[OCR] Using PaddleOCR engine")
|
||||||
return self._paddle_recognize(image)
|
return self._paddle_recognize(image)
|
||||||
elif TESSERACT_AVAILABLE:
|
elif TESSERACT_AVAILABLE:
|
||||||
|
logger.info("[OCR] Using Tesseract engine (PaddleOCR not available)")
|
||||||
return self._tesseract_recognize(image)
|
return self._tesseract_recognize(image)
|
||||||
else:
|
else:
|
||||||
|
logger.error("[OCR] No OCR engine available!")
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
"No OCR engine available. Install PaddleOCR or Tesseract."
|
"No OCR engine available. Install PaddleOCR or Tesseract."
|
||||||
)
|
)
|
||||||
@@ -96,17 +108,23 @@ class OCREngine:
|
|||||||
def _paddle_recognize(self, image: np.ndarray) -> OCRResult:
|
def _paddle_recognize(self, image: np.ndarray) -> OCRResult:
|
||||||
"""Recognize text using PaddleOCR 3.x API."""
|
"""Recognize text using PaddleOCR 3.x API."""
|
||||||
try:
|
try:
|
||||||
|
logger.info(f"[PaddleOCR] Processing image, shape: {image.shape}")
|
||||||
|
|
||||||
# PaddleOCR 3.x requires 3-channel images
|
# PaddleOCR 3.x requires 3-channel images
|
||||||
if len(image.shape) == 2:
|
if len(image.shape) == 2:
|
||||||
# Convert grayscale to 3-channel BGR
|
# Convert grayscale to 3-channel BGR
|
||||||
import cv2
|
import cv2
|
||||||
image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
|
image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
|
||||||
|
logger.info(f"[PaddleOCR] Converted to BGR, new shape: {image.shape}")
|
||||||
|
|
||||||
# PaddleOCR 3.x uses predict() with new parameter names
|
# PaddleOCR 3.x uses predict() with new parameter names
|
||||||
|
logger.info("[PaddleOCR] Calling predict()...")
|
||||||
result = self._paddle.predict(image, use_textline_orientation=True)
|
result = self._paddle.predict(image, use_textline_orientation=True)
|
||||||
|
logger.info(f"[PaddleOCR] predict() returned, result type: {type(result)}")
|
||||||
|
|
||||||
if not result or len(result) == 0:
|
if not result or len(result) == 0:
|
||||||
return OCRResult(text="", confidence=0.0, boxes=[])
|
logger.warning("[PaddleOCR] No results returned")
|
||||||
|
return OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")
|
||||||
|
|
||||||
# PaddleOCR 3.x returns OCRResult objects with different structure
|
# PaddleOCR 3.x returns OCRResult objects with different structure
|
||||||
ocr_result = result[0]
|
ocr_result = result[0]
|
||||||
@@ -117,7 +135,7 @@ class OCREngine:
|
|||||||
dt_polys = ocr_result.get('dt_polys', [])
|
dt_polys = ocr_result.get('dt_polys', [])
|
||||||
|
|
||||||
if not rec_texts:
|
if not rec_texts:
|
||||||
return OCRResult(text="", confidence=0.0, boxes=[])
|
return OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")
|
||||||
|
|
||||||
boxes = []
|
boxes = []
|
||||||
for i, text in enumerate(rec_texts):
|
for i, text in enumerate(rec_texts):
|
||||||
@@ -130,13 +148,17 @@ class OCREngine:
|
|||||||
})
|
})
|
||||||
|
|
||||||
avg_conf = sum(rec_scores) / len(rec_scores) if rec_scores else 0.0
|
avg_conf = sum(rec_scores) / len(rec_scores) if rec_scores else 0.0
|
||||||
|
text_result = '\n'.join(rec_texts)
|
||||||
|
logger.info(f"[PaddleOCR] SUCCESS - Found {len(rec_texts)} text lines, avg confidence: {avg_conf:.2%}")
|
||||||
|
logger.debug(f"[PaddleOCR] Raw text preview: {text_result[:200]}...")
|
||||||
return OCRResult(
|
return OCRResult(
|
||||||
text='\n'.join(rec_texts),
|
text=text_result,
|
||||||
confidence=float(avg_conf),
|
confidence=float(avg_conf),
|
||||||
boxes=boxes
|
boxes=boxes,
|
||||||
|
engine="paddleocr"
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"PaddleOCR error: {e}, falling back to Tesseract")
|
logger.error(f"[PaddleOCR] ERROR: {e}, falling back to Tesseract")
|
||||||
if TESSERACT_AVAILABLE:
|
if TESSERACT_AVAILABLE:
|
||||||
return self._tesseract_recognize(image)
|
return self._tesseract_recognize(image)
|
||||||
raise
|
raise
|
||||||
@@ -145,23 +167,70 @@ class OCREngine:
|
|||||||
"""Recognize text using Tesseract."""
|
"""Recognize text using Tesseract."""
|
||||||
global pytesseract
|
global pytesseract
|
||||||
|
|
||||||
|
logger.info(f"[Tesseract] Processing image, shape: {image.shape}")
|
||||||
|
|
||||||
# Lazy import pytesseract
|
# Lazy import pytesseract
|
||||||
if pytesseract is None:
|
if pytesseract is None:
|
||||||
print("Importing pytesseract...")
|
logger.info("[Tesseract] Importing pytesseract...")
|
||||||
import pytesseract as _pytesseract
|
import pytesseract as _pytesseract
|
||||||
pytesseract = _pytesseract
|
pytesseract = _pytesseract
|
||||||
|
|
||||||
config = '--psm 6 -l ron+eng'
|
# PSM 4: Single column (best for receipts)
|
||||||
|
config = '--psm 4 -l ron+eng'
|
||||||
text = pytesseract.image_to_string(image, config=config)
|
text = pytesseract.image_to_string(image, config=config)
|
||||||
data = pytesseract.image_to_data(
|
|
||||||
image, config=config,
|
|
||||||
output_type=pytesseract.Output.DICT
|
|
||||||
)
|
|
||||||
|
|
||||||
|
# Quick confidence estimate
|
||||||
|
data = pytesseract.image_to_data(image, config=config, output_type=pytesseract.Output.DICT)
|
||||||
confidences = [int(c) for c in data['conf'] if int(c) > 0]
|
confidences = [int(c) for c in data['conf'] if int(c) > 0]
|
||||||
avg_conf = sum(confidences) / len(confidences) / 100 if confidences else 0.0
|
avg_conf = sum(confidences) / len(confidences) / 100 if confidences else 0.0
|
||||||
|
|
||||||
return OCRResult(text=text, confidence=avg_conf, boxes=[])
|
logger.info(f"[Tesseract] Done: {len(text)} chars, conf: {avg_conf:.2%}")
|
||||||
|
return OCRResult(text=text, confidence=avg_conf, boxes=[], engine="tesseract")
|
||||||
|
|
||||||
|
def recognize_dual(self, image: np.ndarray) -> Tuple[OCRResult, Optional[OCRResult]]:
    """Run PaddleOCR and Tesseract on the same image and return both results.

    Each engine is run only if it is available. If an engine raises, the
    failure is logged with its traceback and replaced by an empty
    ``OCRResult`` tagged with that engine's name, so one engine failing
    does not stop the other from running.

    Args:
        image: Image to recognise; its ``shape`` is logged.

    Returns:
        ``(primary_result, tesseract_result)``.

        ``primary_result`` is whatever ``_paddle_recognize`` returned when
        PaddleOCR ran. When PaddleOCR did not run, it is the *same object*
        as ``tesseract_result``. Inspect ``primary_result.engine`` rather
        than assuming it came from PaddleOCR.

        ``tesseract_result`` is ``None`` when Tesseract is not available.

    Raises:
        RuntimeError: If neither engine produced a result.
    """
    logger.info(f"[OCR Dual] Starting dual recognition, image shape: {image.shape}")

    # Lazy init PaddleOCR
    self._init_paddle_lazy()

    paddle_result: Optional[OCRResult] = None
    tesseract_result: Optional[OCRResult] = None

    # Run PaddleOCR
    if PADDLE_AVAILABLE and self._paddle:
        try:
            logger.info("[OCR Dual] Running PaddleOCR...")
            paddle_result = self._paddle_recognize(image)
            logger.info(f"[OCR Dual] PaddleOCR: {len(paddle_result.text)} chars, conf: {paddle_result.confidence:.2%}")
        except Exception as e:
            # logger.exception keeps the traceback, which a bare message loses.
            logger.exception(f"[OCR Dual] PaddleOCR failed: {e}")
            paddle_result = OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")

    # Run Tesseract
    if TESSERACT_AVAILABLE:
        try:
            logger.info("[OCR Dual] Running Tesseract...")
            tesseract_result = self._tesseract_recognize(image)
            logger.info(f"[OCR Dual] Tesseract: {len(tesseract_result.text)} chars, conf: {tesseract_result.confidence:.2%}")
        except Exception as e:
            logger.exception(f"[OCR Dual] Tesseract failed: {e}")
            tesseract_result = OCRResult(text="", confidence=0.0, boxes=[], engine="tesseract")

    # Fallback if PaddleOCR not available: promote the Tesseract result to
    # the primary slot. Identity check, not truthiness: an empty result
    # object is still a valid result.
    if paddle_result is None:
        if tesseract_result is None:
            raise RuntimeError("No OCR engine available")
        paddle_result = tesseract_result

    return paddle_result, tesseract_result
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_available_engines() -> List[str]:
|
def get_available_engines() -> List[str]:
|
||||||
|
|||||||
@@ -28,6 +28,8 @@ class ExtractionResult:
|
|||||||
confidence_date: float = 0.0
|
confidence_date: float = 0.0
|
||||||
confidence_vendor: float = 0.0
|
confidence_vendor: float = 0.0
|
||||||
raw_text: str = ""
|
raw_text: str = ""
|
||||||
|
ocr_engine: str = "" # OCR engine used: paddleocr or tesseract
|
||||||
|
processing_time_ms: int = 0 # Processing time in milliseconds
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def overall_confidence(self) -> float:
|
def overall_confidence(self) -> float:
|
||||||
@@ -70,6 +72,7 @@ class ReceiptExtractor:
|
|||||||
|
|
||||||
# Date patterns - support dash, dot, and slash separators
|
# Date patterns - support dash, dot, and slash separators
|
||||||
# OCR may produce DRTA instead of DATA, DAIA, etc.
|
# OCR may produce DRTA instead of DATA, DAIA, etc.
|
||||||
|
# OCR may also add spaces/commas in dates: "27. 10, 2025" instead of "27.10.2025"
|
||||||
DATE_PATTERNS = [
|
DATE_PATTERNS = [
|
||||||
# DATA/DRTA/DAIA: DD-MM-YYYY (OCR tolerant)
|
# DATA/DRTA/DAIA: DD-MM-YYYY (OCR tolerant)
|
||||||
(r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
|
(r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
|
||||||
@@ -84,6 +87,19 @@ class ReceiptExtractor:
|
|||||||
(r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75),
|
(r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# OCR-corrupted date patterns with spaces/commas
|
||||||
|
# Handles: "27. 10, 2025", "27, 10. 2025", "2025. 08. 14", etc.
|
||||||
|
DATE_PATTERNS_OCR_SPACES = [
|
||||||
|
# YYYY. MM. DD format with spaces (OMV/Petrom receipts) - with time
|
||||||
|
(r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.92, 'ymd'),
|
||||||
|
# YYYY. MM. DD format with spaces (standalone)
|
||||||
|
(r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.85, 'ymd'),
|
||||||
|
# DD. MM, YYYY or DD, MM. YYYY (with time following)
|
||||||
|
(r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
|
||||||
|
# DD. MM, YYYY or DD, MM. YYYY (standalone)
|
||||||
|
(r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
|
||||||
|
]
|
||||||
|
|
||||||
# Receipt number patterns - Romanian fiscal receipt formats
|
# Receipt number patterns - Romanian fiscal receipt formats
|
||||||
# OCR may produce N instead of : or other errors
|
# OCR may produce N instead of : or other errors
|
||||||
NUMBER_PATTERNS = [
|
NUMBER_PATTERNS = [
|
||||||
@@ -127,12 +143,23 @@ class ReceiptExtractor:
|
|||||||
(r'\bC[I1]F\s*:?\s*(?:R[O0])?(\d{6,10})\b', 0.90),
|
(r'\bC[I1]F\s*:?\s*(?:R[O0])?(\d{6,10})\b', 0.90),
|
||||||
# COD FISCAL (vendor)
|
# COD FISCAL (vendor)
|
||||||
(r'COD\s+FISCAL\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
|
(r'COD\s+FISCAL\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
|
||||||
# C.I.F. format (with dots)
|
# C. I. F. format with SPACES (OCR artifact) - "C. I. F. : R011201891"
|
||||||
|
(r'C\.\s*I\.\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.92),
|
||||||
|
# C.I.F. format (with dots, no spaces)
|
||||||
(r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.88),
|
(r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.88),
|
||||||
# CUI format (less specific, use with caution)
|
# CUI format (less specific, use with caution)
|
||||||
(r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.85),
|
(r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.85),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Pattern for CIF NUMBER appearing BEFORE "C.I.F." label (reversed format)
|
||||||
|
# Common in some receipts: "R011201891\nC. I. F." - number on line before label
|
||||||
|
CUI_REVERSED_PATTERNS = [
|
||||||
|
# RO + 8-10 digits on line immediately before C.I.F./CIF label
|
||||||
|
(r'(?:R[O0])(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.98),
|
||||||
|
# Just digits before C.I.F. label
|
||||||
|
(r'(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.95),
|
||||||
|
]
|
||||||
|
|
||||||
# Series patterns - be strict to avoid false matches
|
# Series patterns - be strict to avoid false matches
|
||||||
SERIES_PATTERNS = [
|
SERIES_PATTERNS = [
|
||||||
(r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90),
|
(r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90),
|
||||||
@@ -158,6 +185,7 @@ class ReceiptExtractor:
|
|||||||
|
|
||||||
# Items count patterns - OCR may produce OZ instead of POZ, etc.
|
# Items count patterns - OCR may produce OZ instead of POZ, etc.
|
||||||
# Number may be on separate line before or after the label
|
# Number may be on separate line before or after the label
|
||||||
|
# IMPORTANT: Must be specific to avoid matching product quantities like "50BUC"
|
||||||
ITEMS_COUNT_PATTERNS = [
|
ITEMS_COUNT_PATTERNS = [
|
||||||
# NR. POZ. ART. IN BON: 17 (Romanian format with dots and spaces)
|
# NR. POZ. ART. IN BON: 17 (Romanian format with dots and spaces)
|
||||||
# OCR tolerant: OZ instead of POZ, ARI instead of ART
|
# OCR tolerant: OZ instead of POZ, ARI instead of ART
|
||||||
@@ -167,11 +195,10 @@ class ReceiptExtractor:
|
|||||||
# Number may be on next line after label
|
# Number may be on next line after label
|
||||||
(r'[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.93),
|
(r'[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.93),
|
||||||
(r'NR\.?\s*(?:P?[O0]Z\.?)?\s*ART(?:ICOLE)?\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.90),
|
(r'NR\.?\s*(?:P?[O0]Z\.?)?\s*ART(?:ICOLE)?\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.90),
|
||||||
# Simpler patterns
|
# Simpler patterns - but more specific
|
||||||
(r'ARTIC[O0]LE\s*:?\s*(\d+)', 0.88),
|
(r'ARTIC[O0]LE\s*:?\s*(\d+)', 0.88),
|
||||||
(r'P?[O0]Z\s*:?\s*(\d+)', 0.85),
|
# POZ at start of line or after colon (not in product descriptions)
|
||||||
# X articole/pozitii
|
(r'(?:^|\s|:)P?[O0]Z\.?\s*(?:ART)?\.?\s*(?:IN\s+BON)?\s*:?\s*(\d{1,3})(?:\s|$)', 0.85),
|
||||||
(r'(\d+)\s*(?:ARTIC[O0]LE|P[O0]ZITII|BUC)', 0.80),
|
|
||||||
]
|
]
|
||||||
|
|
||||||
# Address patterns (Romanian format)
|
# Address patterns (Romanian format)
|
||||||
@@ -183,20 +210,21 @@ class ReceiptExtractor:
|
|||||||
]
|
]
|
||||||
|
|
||||||
# Vendor name indicators (lines containing these are likely vendor names)
|
# Vendor name indicators (lines containing these are likely vendor names)
|
||||||
|
# These should be company type suffixes, not generic words
|
||||||
|
# Patterns must handle OCR spaces: "S. R. L." as well as "S.R.L."
|
||||||
VENDOR_INDICATORS = [
|
VENDOR_INDICATORS = [
|
||||||
r'\bS\.?R\.?L\.?\b', # S.R.L.
|
r'\bS\.?\s*R\.?\s*L\.?\b', # S.R.L. or S. R. L.
|
||||||
r'\bS\.?A\.?\b', # S.A.
|
r'\bS\.?\s*A\.?\b', # S.A. or S. A.
|
||||||
r'\bS\.?N\.?C\.?\b', # S.N.C.
|
r'\bS\.?\s*N\.?\s*C\.?\b', # S.N.C. or S. N. C.
|
||||||
r'\bS\.?C\.?S\.?\b', # S.C.S.
|
r'\bS\.?\s*C\.?\s*S\.?\b', # S.C.S. or S. C. S.
|
||||||
r'\bI\.?I\.?\b', # I.I. (Individual)
|
r'\bI\.?\s*I\.?\b', # I.I. or I. I.
|
||||||
r'\bP\.?F\.?A\.?\b', # P.F.A.
|
r'\bP\.?\s*F\.?\s*A\.?\b', # P.F.A. or P. F. A.
|
||||||
r'\bS\.?C\.?\b', # S.C.
|
# S.C. alone is too short and generic - only match if followed by company name
|
||||||
|
r'\bS\.?\s*C\.?\s+[A-Z]', # S.C. followed by company name
|
||||||
r'HOLDING',
|
r'HOLDING',
|
||||||
r'COMPANY',
|
r'COMPANY',
|
||||||
r'GROUP',
|
r'GROUP',
|
||||||
r'MAGAZIN',
|
# Removed: MAGAZIN, MARKET, SHOP - too generic, match store welcome messages
|
||||||
r'MARKET',
|
|
||||||
r'SHOP',
|
|
||||||
]
|
]
|
||||||
|
|
||||||
def extract(self, text: str) -> ExtractionResult:
|
def extract(self, text: str) -> ExtractionResult:
|
||||||
@@ -215,6 +243,14 @@ class ReceiptExtractor:
|
|||||||
|
|
||||||
# Extract additional fields - Multiple TVA entries
|
# Extract additional fields - Multiple TVA entries
|
||||||
result.tva_entries, result.tva_total = self._extract_tva_entries(text_upper)
|
result.tva_entries, result.tva_total = self._extract_tva_entries(text_upper)
|
||||||
|
if not result.tva_entries:
|
||||||
|
print(f"[TVA Debug] No TVA found. Checking patterns...", flush=True)
|
||||||
|
# Debug: show what patterns see
|
||||||
|
import re
|
||||||
|
normalized = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text_upper)
|
||||||
|
taxe_match = re.search(r'T?OTAL\s+TAXE', normalized, re.IGNORECASE)
|
||||||
|
rev_match = re.search(r'([\d.,]+)\s*T?OTAL\s+TAXE', normalized, re.IGNORECASE)
|
||||||
|
print(f"[TVA Debug] 'OTAL TAXE' found: {bool(taxe_match)}, reversed: {rev_match.group(1) if rev_match else None}", flush=True)
|
||||||
result.items_count = self._extract_items_count(text_upper)
|
result.items_count = self._extract_items_count(text_upper)
|
||||||
result.address = self._extract_address(text_upper)
|
result.address = self._extract_address(text_upper)
|
||||||
|
|
||||||
@@ -334,6 +370,7 @@ class ReceiptExtractor:
|
|||||||
|
|
||||||
def _extract_date(self, text: str) -> Tuple[Optional[date], float]:
|
def _extract_date(self, text: str) -> Tuple[Optional[date], float]:
|
||||||
"""Extract receipt date from text."""
|
"""Extract receipt date from text."""
|
||||||
|
# First try standard patterns (clean dates)
|
||||||
for pattern, confidence in self.DATE_PATTERNS:
|
for pattern, confidence in self.DATE_PATTERNS:
|
||||||
match = re.search(pattern, text)
|
match = re.search(pattern, text)
|
||||||
if match:
|
if match:
|
||||||
@@ -354,6 +391,34 @@ class ReceiptExtractor:
|
|||||||
return parsed, confidence
|
return parsed, confidence
|
||||||
except ValueError:
|
except ValueError:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Then try OCR-corrupted patterns (dates with spaces/commas)
|
||||||
|
# Handles: "27. 10, 2025", "27, 10. 2025", "2025. 08. 14", etc.
|
||||||
|
for pattern, confidence, fmt in self.DATE_PATTERNS_OCR_SPACES:
|
||||||
|
match = re.search(pattern, text)
|
||||||
|
if match:
|
||||||
|
try:
|
||||||
|
if fmt == 'ymd':
|
||||||
|
# YYYY. MM. DD format (OMV/Petrom)
|
||||||
|
year = match.group(1)
|
||||||
|
month = match.group(2)
|
||||||
|
day = match.group(3)
|
||||||
|
else:
|
||||||
|
# DD. MM. YYYY format (default)
|
||||||
|
day = match.group(1)
|
||||||
|
month = match.group(2)
|
||||||
|
year = match.group(3)
|
||||||
|
|
||||||
|
date_str = f"{day}.{month}.{year}"
|
||||||
|
parsed = datetime.strptime(date_str, '%d.%m.%Y').date()
|
||||||
|
|
||||||
|
# Validate date range
|
||||||
|
today = date.today()
|
||||||
|
if parsed <= today and parsed.year >= 2020:
|
||||||
|
return parsed, confidence
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
|
||||||
return None, 0.0
|
return None, 0.0
|
||||||
|
|
||||||
def _extract_number(self, text: str) -> Tuple[Optional[str], float]:
|
def _extract_number(self, text: str) -> Tuple[Optional[str], float]:
|
||||||
@@ -377,8 +442,9 @@ class ReceiptExtractor:
|
|||||||
Extract vendor/partner name from text.
|
Extract vendor/partner name from text.
|
||||||
Uses multiple strategies:
|
Uses multiple strategies:
|
||||||
1. Look for lines with company type indicators (S.R.L., S.A., etc.)
|
1. Look for lines with company type indicators (S.R.L., S.A., etc.)
|
||||||
2. Look for lines near CIF
|
2. Look for company name + SRL on separate lines
|
||||||
3. Use first valid line as fallback
|
3. Look for lines near CIF
|
||||||
|
4. Use first valid line as fallback
|
||||||
"""
|
"""
|
||||||
lines = text.split('\n')
|
lines = text.split('\n')
|
||||||
skip_keywords = [
|
skip_keywords = [
|
||||||
@@ -388,9 +454,37 @@ class ReceiptExtractor:
|
|||||||
'OPERATOR', 'CASIER', 'POS', 'AMEF', 'BINE ATI VENIT',
|
'OPERATOR', 'CASIER', 'POS', 'AMEF', 'BINE ATI VENIT',
|
||||||
'VA RUGAM', 'PASTRATI', 'VOCEA', 'TIPARIT',
|
'VA RUGAM', 'PASTRATI', 'VOCEA', 'TIPARIT',
|
||||||
'DETERGENT', 'PROSOP', 'HARTIE', 'SACI', 'SPRAY',
|
'DETERGENT', 'PROSOP', 'HARTIE', 'SACI', 'SPRAY',
|
||||||
'BUC', 'ROLA', 'CUMPARATOR'
|
'BUC', 'ROLA', 'CUMPARATOR', 'MAGAZIN', 'BRICK',
|
||||||
|
'NIVS', 'BENZINA', 'PETROM', 'OMV'
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Strategy 0: Look for company name followed by SRL/SA on next line
|
||||||
|
# Pattern: "COMPANY NAME\nSRL" or "COMPANY NAME\nS.R.L."
|
||||||
|
for i, line in enumerate(lines[:15]):
|
||||||
|
line = line.strip()
|
||||||
|
if not line or len(line) < 3:
|
||||||
|
continue
|
||||||
|
|
||||||
|
line_upper = line.upper()
|
||||||
|
|
||||||
|
# Skip lines with skip keywords
|
||||||
|
if any(kw in line_upper for kw in skip_keywords):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Check if next line is standalone SRL, S.R.L., SA, S.A., etc.
|
||||||
|
if i + 1 < len(lines):
|
||||||
|
next_line = lines[i + 1].strip().upper()
|
||||||
|
# Match standalone company type suffix
|
||||||
|
if re.match(r'^S\.?\s*R\.?\s*L\.?$', next_line) or \
|
||||||
|
re.match(r'^S\.?\s*A\.?$', next_line) or \
|
||||||
|
re.match(r'^S\.?\s*N\.?\s*C\.?$', next_line) or \
|
||||||
|
re.match(r'^P\.?\s*F\.?\s*A\.?$', next_line) or \
|
||||||
|
re.match(r'^I\.?\s*I\.?$', next_line):
|
||||||
|
# Combine: "COMPANY NAME" + " " + "SRL"
|
||||||
|
vendor = self._clean_vendor_name(f"{line} {next_line}")
|
||||||
|
if vendor and len(vendor) >= 5:
|
||||||
|
return vendor, 0.95
|
||||||
|
|
||||||
# Strategy 1: Look for lines with vendor indicators (S.R.L., S.A., HOLDING, etc.)
|
# Strategy 1: Look for lines with vendor indicators (S.R.L., S.A., HOLDING, etc.)
|
||||||
for i, line in enumerate(lines[:15]): # Check first 15 lines
|
for i, line in enumerate(lines[:15]): # Check first 15 lines
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
@@ -476,7 +570,22 @@ class ReceiptExtractor:
|
|||||||
Extract vendor CUI (fiscal identification code) from text.
|
Extract vendor CUI (fiscal identification code) from text.
|
||||||
Excludes CLIENT CUI which appears as 'CLIENT C.U.I./C.I.F.:...'
|
Excludes CLIENT CUI which appears as 'CLIENT C.U.I./C.I.F.:...'
|
||||||
"""
|
"""
|
||||||
# First, try to find CIF on a line that doesn't contain CLIENT
|
# Strategy 0: Check for reversed format (CIF NUMBER on line BEFORE "C.I.F." label)
|
||||||
|
# This is common in some receipts: "R011201891\nC. I. F."
|
||||||
|
for pattern, confidence in self.CUI_REVERSED_PATTERNS:
|
||||||
|
match = re.search(pattern, text_upper, re.IGNORECASE | re.MULTILINE)
|
||||||
|
if match:
|
||||||
|
cui = match.group(1)
|
||||||
|
if 6 <= len(cui) <= 10:
|
||||||
|
# Verify this is not the CLIENT CUI by checking context
|
||||||
|
start = match.start()
|
||||||
|
# Check 50 chars before the match for CLIENT keyword
|
||||||
|
context_start = max(0, start - 50)
|
||||||
|
context = text_upper[context_start:start]
|
||||||
|
if 'CLIENT' not in context and 'LIENT' not in context:
|
||||||
|
return cui, confidence
|
||||||
|
|
||||||
|
# Strategy 1: Try to find CIF on a line that doesn't contain CLIENT
|
||||||
lines = text_upper.split('\n')
|
lines = text_upper.split('\n')
|
||||||
for line in lines:
|
for line in lines:
|
||||||
# Skip lines that contain CLIENT (these are buyer's CUI, not vendor's)
|
# Skip lines that contain CLIENT (these are buyer's CUI, not vendor's)
|
||||||
@@ -491,7 +600,7 @@ class ReceiptExtractor:
|
|||||||
if 6 <= len(cui) <= 10:
|
if 6 <= len(cui) <= 10:
|
||||||
return cui, confidence
|
return cui, confidence
|
||||||
|
|
||||||
# Fallback: search entire text but exclude CLIENT patterns
|
# Strategy 2: Fallback - search entire text but exclude CLIENT patterns
|
||||||
for pattern, confidence in self.CUI_PATTERNS:
|
for pattern, confidence in self.CUI_PATTERNS:
|
||||||
# Find all matches
|
# Find all matches
|
||||||
for match in re.finditer(pattern, text_upper, re.IGNORECASE | re.MULTILINE):
|
for match in re.finditer(pattern, text_upper, re.IGNORECASE | re.MULTILINE):
|
||||||
@@ -523,8 +632,94 @@ class ReceiptExtractor:
|
|||||||
tva_entries = []
|
tva_entries = []
|
||||||
seen_entries = set() # To avoid duplicates
|
seen_entries = set() # To avoid duplicates
|
||||||
|
|
||||||
# Normalize spaces in numbers first (OCR may produce "32. 31")
|
# Check for non-VAT payer (NEPLATITOR DE TVA) - TVA = 0
|
||||||
|
# OCR variants: NEPLATTOR, NEPLATITOR, NEPLATOR, NEPLATTOR, ANEPLATHTOR, MEPLATITOR, etc.
|
||||||
|
# Also handles: "TOTAL NEPLATITOR TVA", "(NEPLATITOR DE TVA)"
|
||||||
|
non_vat_patterns = [
|
||||||
|
# Main pattern - flexible for OCR errors: NEPLAT + any chars + OR/R
|
||||||
|
r'NEPLAT\w*OR', # NEPLATITOR, NEPLATTOR, NEPLATOR
|
||||||
|
r'[ANM]EPLAT\w*O?R', # OCR errors: ANEPLATHTOR, MEPLATITOR
|
||||||
|
r'TOTAL\s+NEPLAT', # TOTAL NEPLATITOR...
|
||||||
|
r'TOTAL\s+[ANM]EPLAT', # TOTAL ANEPLAT... (OCR error)
|
||||||
|
r'SCUTIT\s*(?:DE\s+)?T[VU]A', # SCUTIT DE TVA
|
||||||
|
r'NEPLAT\w*\s+T[VU]A', # NEPLATITOR TVA
|
||||||
|
r'NEPLAT\w*\s+DE\s+T', # NEPLATITOR DE T... (truncated)
|
||||||
|
]
|
||||||
|
for pattern in non_vat_patterns:
|
||||||
|
if re.search(pattern, text, re.IGNORECASE):
|
||||||
|
# Non-VAT payer - return TVA = 0
|
||||||
|
return [{'code': 'D', 'percent': 0, 'amount': Decimal('0.00')}], Decimal('0.00')
|
||||||
|
|
||||||
|
# Normalize spaces in numbers first (OCR may produce "32. 31" or "49, 58")
|
||||||
normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text)
|
normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text)
|
||||||
|
# Also normalize comma followed by space to comma (for "21, 00%" -> "21,00%")
|
||||||
|
normalized_text = re.sub(r'(\d+),\s+(\d{2})\s*%', r'\1.\2%', normalized_text)
|
||||||
|
|
||||||
|
# Pattern 0a: First try to get TVA from "TOTAL TAXE:" which is most reliable
|
||||||
|
# Format: "TOTAL TAXE: 55,22" - this is always the TVA amount
|
||||||
|
# OCR may cut "T" producing "OTAL TAXE:" instead of "TOTAL TAXE:"
|
||||||
|
# OCR may also put amount BEFORE "OTAL TAXE": "55,22OTAL TAXE:"
|
||||||
|
total_taxe_pattern = r'T?OTAL\s+TAXE\s*:?\s*([\d\s.,]+)'
|
||||||
|
taxe_match = re.search(total_taxe_pattern, normalized_text, re.IGNORECASE)
|
||||||
|
|
||||||
|
# Also try pattern where amount comes BEFORE "OTAL TAXE" (OCR line break issue)
|
||||||
|
if not taxe_match:
|
||||||
|
reversed_taxe_pattern = r'([\d.,]+)\s*T?OTAL\s+TAXE'
|
||||||
|
taxe_match = re.search(reversed_taxe_pattern, normalized_text, re.IGNORECASE)
|
||||||
|
|
||||||
|
if taxe_match:
|
||||||
|
# Also need to find the TVA rate from the table
|
||||||
|
# Pattern handles: "A-21%", "-21,00%", "21%" etc.
|
||||||
|
rate_pattern = r'([A-D])?\s*[-:]?\s*(\d{1,2})[.,]?\s*\d{0,2}\s*%'
|
||||||
|
rate_match = re.search(rate_pattern, normalized_text, re.IGNORECASE)
|
||||||
|
if rate_match:
|
||||||
|
try:
|
||||||
|
code = rate_match.group(1).upper() if rate_match.group(1) else 'A' # Default to A if missing
|
||||||
|
percent = int(rate_match.group(2))
|
||||||
|
amount_str = taxe_match.group(1).replace(' ', '')
|
||||||
|
amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
|
||||||
|
amount = Decimal(amount_str)
|
||||||
|
if amount > 0:
|
||||||
|
entry_key = (code, percent)
|
||||||
|
if entry_key not in seen_entries:
|
||||||
|
tva_entries.append({
|
||||||
|
'code': code,
|
||||||
|
'percent': percent,
|
||||||
|
'amount': amount
|
||||||
|
})
|
||||||
|
seen_entries.add(entry_key)
|
||||||
|
except (ValueError, InvalidOperation):
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Pattern 0b: Table format "A-21,00% 285,66 49,58" (code-percent base tva_amount)
|
||||||
|
# This format appears after a TVA header line like "TVA TOTAL VALDARE"
|
||||||
|
# The TVA amount position depends on header: VALDARE last = TVA last, VALOARE middle = TVA middle
|
||||||
|
if not tva_entries:
|
||||||
|
table_pattern = r'([A-D])\s*[-:]\s*(\d{1,2})[.,]\s*\d{2}\s*%\s*([\d\s.,]+)\s+([\d\s.,]+)'
|
||||||
|
for match in re.finditer(table_pattern, normalized_text, re.IGNORECASE):
|
||||||
|
try:
|
||||||
|
code = match.group(1).upper()
|
||||||
|
percent = int(match.group(2))
|
||||||
|
amount1_str = match.group(3).replace(' ', '')
|
||||||
|
amount2_str = match.group(4).replace(' ', '')
|
||||||
|
amount1 = Decimal(self._normalize_number(re.sub(r'[^\d.,]', '', amount1_str)))
|
||||||
|
amount2 = Decimal(self._normalize_number(re.sub(r'[^\d.,]', '', amount2_str)))
|
||||||
|
|
||||||
|
# Determine which is TVA: the smaller amount is usually TVA
|
||||||
|
# (TVA is a fraction of the total, so it's always smaller)
|
||||||
|
tva_amount = min(amount1, amount2)
|
||||||
|
|
||||||
|
if tva_amount > 0:
|
||||||
|
entry_key = (code, percent)
|
||||||
|
if entry_key not in seen_entries:
|
||||||
|
tva_entries.append({
|
||||||
|
'code': code,
|
||||||
|
'percent': percent,
|
||||||
|
'amount': tva_amount
|
||||||
|
})
|
||||||
|
seen_entries.add(entry_key)
|
||||||
|
except (ValueError, InvalidOperation):
|
||||||
|
continue
|
||||||
|
|
||||||
# Pattern 1: "TVA A - 19%: 15.20" or "TVAA - 21% 32.31" (with code)
|
# Pattern 1: "TVA A - 19%: 15.20" or "TVAA - 21% 32.31" (with code)
|
||||||
# OCR tolerant: TUA, TVR, etc.
|
# OCR tolerant: TUA, TVR, etc.
|
||||||
@@ -571,7 +766,75 @@ class ReceiptExtractor:
|
|||||||
except (ValueError, InvalidOperation):
|
except (ValueError, InvalidOperation):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Pattern 3: "TVAA - 21%" on one line, amount on next line
|
# Pattern 3: "TOTAL TVA A - 21%" with amount on same line or "TOTAL TVA BON" with amount
|
||||||
|
if not tva_entries:
|
||||||
|
# First try: "TOTAL TVA A - 21% 32.31" (amount on same line)
|
||||||
|
tva_with_amount = r'TOTAL\s+T[VU][AR]\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*([\d.,]+)'
|
||||||
|
for match in re.finditer(tva_with_amount, normalized_text, re.IGNORECASE):
|
||||||
|
try:
|
||||||
|
code = match.group(1).upper()
|
||||||
|
percent = int(match.group(2))
|
||||||
|
amount_str = self._normalize_number(match.group(3))
|
||||||
|
amount = Decimal(amount_str)
|
||||||
|
if amount > 0:
|
||||||
|
entry_key = (code, percent)
|
||||||
|
if entry_key not in seen_entries:
|
||||||
|
tva_entries.append({
|
||||||
|
'code': code,
|
||||||
|
'percent': percent,
|
||||||
|
'amount': amount
|
||||||
|
})
|
||||||
|
seen_entries.add(entry_key)
|
||||||
|
except (ValueError, InvalidOperation):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Pattern 3b: "TOTAL TVA A - 21%" on one line, look for "TOTAL TVA BON" amount
|
||||||
|
if not tva_entries:
|
||||||
|
tva_total_pattern = r'TOTAL\s+T[VU][AR]\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%'
|
||||||
|
for match in re.finditer(tva_total_pattern, normalized_text, re.IGNORECASE):
|
||||||
|
try:
|
||||||
|
code = match.group(1).upper()
|
||||||
|
percent = int(match.group(2))
|
||||||
|
|
||||||
|
# Look for "TOTAL TVA BON" followed by amount
|
||||||
|
tva_bon_pattern = r'TOTAL\s+T[VU][AR]\s+BON[:\s]*([\d.,]+)'
|
||||||
|
tva_bon_match = re.search(tva_bon_pattern, normalized_text, re.IGNORECASE)
|
||||||
|
if tva_bon_match:
|
||||||
|
amount_str = self._normalize_number(tva_bon_match.group(1))
|
||||||
|
amount = Decimal(amount_str)
|
||||||
|
if amount > 0:
|
||||||
|
entry_key = (code, percent)
|
||||||
|
if entry_key not in seen_entries:
|
||||||
|
tva_entries.append({
|
||||||
|
'code': code,
|
||||||
|
'percent': percent,
|
||||||
|
'amount': amount
|
||||||
|
})
|
||||||
|
seen_entries.add(entry_key)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Fallback: Amount after TOTAL TVA BON on next line
|
||||||
|
tva_bon_pos = re.search(r'TOTAL\s+T[VU][AR]\s+BON', normalized_text, re.IGNORECASE)
|
||||||
|
if tva_bon_pos:
|
||||||
|
after_bon = normalized_text[tva_bon_pos.end():]
|
||||||
|
# Find first standalone number (likely TVA amount)
|
||||||
|
amount_match = re.search(r'[\s\n]*([\d]+[.,]\d{2})\s*\n', after_bon)
|
||||||
|
if amount_match:
|
||||||
|
amount_str = self._normalize_number(amount_match.group(1))
|
||||||
|
amount = Decimal(amount_str)
|
||||||
|
if amount > 0:
|
||||||
|
entry_key = (code, percent)
|
||||||
|
if entry_key not in seen_entries:
|
||||||
|
tva_entries.append({
|
||||||
|
'code': code,
|
||||||
|
'percent': percent,
|
||||||
|
'amount': amount
|
||||||
|
})
|
||||||
|
seen_entries.add(entry_key)
|
||||||
|
except (ValueError, InvalidOperation):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Pattern 3b: "TVAA - 21%" on one line, amount on next line (simpler format)
|
||||||
if not tva_entries:
|
if not tva_entries:
|
||||||
tva_line_pattern = r'T[VU][AR]\s*([A-D])?\s*[-:]\s*(\d{1,2})\s*%'
|
tva_line_pattern = r'T[VU][AR]\s*([A-D])?\s*[-:]\s*(\d{1,2})\s*%'
|
||||||
for match in re.finditer(tva_line_pattern, normalized_text, re.IGNORECASE):
|
for match in re.finditer(tva_line_pattern, normalized_text, re.IGNORECASE):
|
||||||
|
|||||||
@@ -1,11 +1,16 @@
|
|||||||
"""Main OCR service coordinating preprocessing, recognition, and extraction."""
|
"""Main OCR service coordinating preprocessing, recognition, and extraction."""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
|
import logging
|
||||||
|
|
||||||
# Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import
|
# Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import
|
||||||
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
|
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
|
||||||
|
|
||||||
|
import time
|
||||||
import asyncio
|
import asyncio
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from decimal import Decimal
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
@@ -13,6 +18,9 @@ from app.services.ocr_engine import OCREngine
|
|||||||
from app.services.ocr_extractor import ReceiptExtractor, ExtractionResult
|
from app.services.ocr_extractor import ReceiptExtractor, ExtractionResult
|
||||||
from app.services.image_preprocessor import ImagePreprocessor
|
from app.services.image_preprocessor import ImagePreprocessor
|
||||||
|
|
||||||
|
# Setup logging
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class OCRService:
|
class OCRService:
|
||||||
"""Service for OCR processing of receipt images."""
|
"""Service for OCR processing of receipt images."""
|
||||||
@@ -56,15 +64,18 @@ class OCRService:
|
|||||||
image_path: Path,
|
image_path: Path,
|
||||||
mime_type: str
|
mime_type: str
|
||||||
) -> Tuple[bool, str, Optional[ExtractionResult]]:
|
) -> Tuple[bool, str, Optional[ExtractionResult]]:
|
||||||
"""Synchronous processing (runs in thread pool)."""
|
"""Synchronous processing with ADAPTIVE OCR pipeline."""
|
||||||
|
|
||||||
# Handle PDF
|
start_time = time.time()
|
||||||
|
print(f"[OCR Service] Starting processing: {image_path}, mime: {mime_type}", flush=True)
|
||||||
|
|
||||||
|
# Load image
|
||||||
if mime_type == 'application/pdf':
|
if mime_type == 'application/pdf':
|
||||||
try:
|
try:
|
||||||
images = self.preprocessor.pdf_to_images(image_path)
|
images = self.preprocessor.pdf_to_images(image_path)
|
||||||
if not images:
|
if not images:
|
||||||
return False, "Failed to extract images from PDF", None
|
return False, "Failed to extract images from PDF", None
|
||||||
image = images[0] # Process first page only
|
image = images[0]
|
||||||
except RuntimeError as e:
|
except RuntimeError as e:
|
||||||
return False, str(e), None
|
return False, str(e), None
|
||||||
else:
|
else:
|
||||||
@@ -73,38 +84,360 @@ class OCRService:
|
|||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
return False, str(e), None
|
return False, str(e), None
|
||||||
|
|
||||||
# Preprocess image
|
raw_texts = []
|
||||||
processed = self.preprocessor.preprocess(image)
|
extraction = None
|
||||||
|
|
||||||
|
# ══════════════════════════════════════════════════════════════
|
||||||
|
# STEP 1: PaddleOCR + Light (fastest, best for clear PDFs)
|
||||||
|
# ══════════════════════════════════════════════════════════════
|
||||||
|
print("=" * 60, flush=True)
|
||||||
|
print("[OCR] STEP 1: PaddleOCR + Light preprocessing", flush=True)
|
||||||
|
print("=" * 60, flush=True)
|
||||||
|
light_img = self.preprocessor.preprocess_light(image)
|
||||||
|
|
||||||
# Perform OCR
|
|
||||||
try:
|
try:
|
||||||
ocr_result = self.ocr_engine.recognize(processed)
|
paddle_light = self.ocr_engine._paddle_recognize(light_img)
|
||||||
except RuntimeError as e:
|
if paddle_light and paddle_light.text:
|
||||||
return False, str(e), None
|
extraction = self.extractor.extract(paddle_light.text)
|
||||||
|
extraction.ocr_engine = "paddle-light"
|
||||||
|
raw_texts.append(f"═══ PaddleOCR (light, conf: {paddle_light.confidence:.0%}) ═══\n{paddle_light.text}")
|
||||||
|
|
||||||
if not ocr_result.text:
|
# Log extraction results
|
||||||
return False, "No text detected in image", None
|
print(f"[OCR] Step 1 Results:", flush=True)
|
||||||
|
print(f" - OCR Confidence: {paddle_light.confidence:.0%}", flush=True)
|
||||||
|
print(f" - Amount: {extraction.amount}", flush=True)
|
||||||
|
print(f" - Date: {extraction.receipt_date}", flush=True)
|
||||||
|
print(f" - Number: {extraction.receipt_number}", flush=True)
|
||||||
|
print(f" - CUI: {extraction.cui}", flush=True)
|
||||||
|
print(f" - TVA: {extraction.tva_total} (entries: {len(extraction.tva_entries) if extraction.tva_entries else 0})", flush=True)
|
||||||
|
print(f" - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True)
|
||||||
|
|
||||||
# Extract structured fields
|
# Early exit if complete
|
||||||
extraction = self.extractor.extract(ocr_result.text)
|
if self._is_extraction_complete(extraction):
|
||||||
|
extraction.raw_text = "\n\n".join(raw_texts)
|
||||||
|
elapsed_ms = int((time.time() - start_time) * 1000)
|
||||||
|
extraction.processing_time_ms = elapsed_ms
|
||||||
|
print(f"[OCR] ✓✓✓ EARLY EXIT at Step 1 - All fields found! ({elapsed_ms}ms) ✓✓✓", flush=True)
|
||||||
|
return True, "OCR complete (fast mode)", extraction
|
||||||
|
else:
|
||||||
|
print("[OCR] → Step 1 incomplete, continuing to Step 2...", flush=True)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[OCR] PaddleOCR light failed: {e}", flush=True)
|
||||||
|
extraction = ExtractionResult()
|
||||||
|
|
||||||
|
# ══════════════════════════════════════════════════════════════
|
||||||
|
# STEP 2: PaddleOCR + Heavy (for faded thermal receipts)
|
||||||
|
# ══════════════════════════════════════════════════════════════
|
||||||
|
print("=" * 60, flush=True)
|
||||||
|
print("[OCR] STEP 2: PaddleOCR + Heavy preprocessing", flush=True)
|
||||||
|
print("=" * 60, flush=True)
|
||||||
|
heavy_img = self.preprocessor.preprocess_heavy(image)
|
||||||
|
|
||||||
|
try:
|
||||||
|
paddle_heavy = self.ocr_engine._paddle_recognize(heavy_img)
|
||||||
|
if paddle_heavy and paddle_heavy.text:
|
||||||
|
extraction_heavy = self.extractor.extract(paddle_heavy.text)
|
||||||
|
extraction_heavy.ocr_engine = "paddle-heavy"
|
||||||
|
raw_texts.append(f"═══ PaddleOCR (heavy, conf: {paddle_heavy.confidence:.0%}) ═══\n{paddle_heavy.text}")
|
||||||
|
|
||||||
|
print(f"[OCR] Step 2 (Heavy) Results:", flush=True)
|
||||||
|
print(f" - OCR Confidence: {paddle_heavy.confidence:.0%}", flush=True)
|
||||||
|
print(f" - Amount: {extraction_heavy.amount}", flush=True)
|
||||||
|
print(f" - Date: {extraction_heavy.receipt_date}", flush=True)
|
||||||
|
print(f" - CUI: {extraction_heavy.cui}", flush=True)
|
||||||
|
|
||||||
|
# Merge with previous
|
||||||
|
extraction = self._merge_extractions(extraction, extraction_heavy)
|
||||||
|
|
||||||
|
print(f"[OCR] After merge:", flush=True)
|
||||||
|
print(f" - Amount: {extraction.amount}", flush=True)
|
||||||
|
print(f" - Date: {extraction.receipt_date}", flush=True)
|
||||||
|
print(f" - Number: {extraction.receipt_number}", flush=True)
|
||||||
|
print(f" - CUI: {extraction.cui}", flush=True)
|
||||||
|
print(f" - TVA: {extraction.tva_total}", flush=True)
|
||||||
|
print(f" - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True)
|
||||||
|
|
||||||
|
if self._is_extraction_complete(extraction):
|
||||||
|
extraction.raw_text = "\n\n".join(raw_texts)
|
||||||
|
extraction.ocr_engine = "paddle-adaptive"
|
||||||
|
elapsed_ms = int((time.time() - start_time) * 1000)
|
||||||
|
extraction.processing_time_ms = elapsed_ms
|
||||||
|
print(f"[OCR] ✓✓✓ EARLY EXIT at Step 2 - All fields found after merge! ({elapsed_ms}ms) ✓✓✓", flush=True)
|
||||||
|
return True, "OCR complete (paddle dual)", extraction
|
||||||
|
else:
|
||||||
|
print("[OCR] → Step 2 incomplete, continuing to Step 3 (Tesseract)...", flush=True)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[OCR] PaddleOCR heavy failed: {e}", flush=True)
|
||||||
|
|
||||||
|
# ══════════════════════════════════════════════════════════════
|
||||||
|
# STEP 3: Tesseract fallback
|
||||||
|
# ══════════════════════════════════════════════════════════════
|
||||||
|
print("=" * 60, flush=True)
|
||||||
|
print("[OCR] STEP 3: Tesseract fallback", flush=True)
|
||||||
|
print("=" * 60, flush=True)
|
||||||
|
|
||||||
|
try:
|
||||||
|
tesseract_result = self.ocr_engine._tesseract_recognize(light_img)
|
||||||
|
if tesseract_result and tesseract_result.text:
|
||||||
|
extraction_tess = self.extractor.extract(tesseract_result.text)
|
||||||
|
extraction_tess.ocr_engine = "tesseract"
|
||||||
|
raw_texts.append(f"═══ Tesseract (conf: {tesseract_result.confidence:.0%}) ═══\n{tesseract_result.text}")
|
||||||
|
|
||||||
|
print(f"[OCR] Step 3 (Tesseract) Results:", flush=True)
|
||||||
|
print(f" - OCR Confidence: {tesseract_result.confidence:.0%}", flush=True)
|
||||||
|
print(f" - Amount: {extraction_tess.amount}", flush=True)
|
||||||
|
print(f" - Date: {extraction_tess.receipt_date}", flush=True)
|
||||||
|
print(f" - CUI: {extraction_tess.cui}", flush=True)
|
||||||
|
|
||||||
|
extraction = self._merge_extractions(extraction, extraction_tess)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[OCR] Tesseract failed: {e}", flush=True)
|
||||||
|
|
||||||
|
# Final result
|
||||||
|
if extraction is None:
|
||||||
|
return False, "No text detected", None
|
||||||
|
|
||||||
|
extraction.raw_text = "\n\n".join(raw_texts)
|
||||||
|
extraction.ocr_engine = "adaptive-full"
|
||||||
|
|
||||||
# Build result message
|
# Build result message
|
||||||
fields_found = []
|
fields_found = []
|
||||||
if extraction.amount:
|
if extraction.amount: fields_found.append("amount")
|
||||||
fields_found.append("amount")
|
if extraction.receipt_date: fields_found.append("date")
|
||||||
if extraction.receipt_date:
|
if extraction.receipt_number: fields_found.append("number")
|
||||||
fields_found.append("date")
|
if extraction.cui: fields_found.append("CUI")
|
||||||
if extraction.partner_name:
|
if extraction.tva_total or extraction.tva_entries: fields_found.append("TVA")
|
||||||
fields_found.append("vendor")
|
|
||||||
if extraction.cui:
|
|
||||||
fields_found.append("CUI")
|
|
||||||
if extraction.receipt_number:
|
|
||||||
fields_found.append("number")
|
|
||||||
|
|
||||||
message = f"OCR processing successful. Found: {', '.join(fields_found) or 'no fields'}"
|
message = f"OCR complete (full pipeline). Found: {', '.join(fields_found) or 'no fields'}"
|
||||||
|
|
||||||
|
elapsed_ms = int((time.time() - start_time) * 1000)
|
||||||
|
extraction.processing_time_ms = elapsed_ms
|
||||||
|
|
||||||
|
print("=" * 60, flush=True)
|
||||||
|
print(f"[OCR] FINAL RESULT (full pipeline) - {elapsed_ms}ms", flush=True)
|
||||||
|
print("=" * 60, flush=True)
|
||||||
|
print(f" - Amount: {extraction.amount}", flush=True)
|
||||||
|
print(f" - Date: {extraction.receipt_date}", flush=True)
|
||||||
|
print(f" - Number: {extraction.receipt_number}", flush=True)
|
||||||
|
print(f" - CUI: {extraction.cui}", flush=True)
|
||||||
|
print(f" - TVA: {extraction.tva_total}", flush=True)
|
||||||
|
print(f" - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True)
|
||||||
|
print(f" - Processing Time: {elapsed_ms}ms", flush=True)
|
||||||
|
print(f" - Message: {message}", flush=True)
|
||||||
|
|
||||||
return True, message, extraction
|
return True, message, extraction
|
||||||
|
|
||||||
|
def _merge_extractions(
    self,
    paddle: Optional[ExtractionResult],
    tesseract: Optional[ExtractionResult]
) -> ExtractionResult:
    """
    Build a new result by choosing, field by field, between two extractions.

    Selection rules:
    - amount / date: the side with the higher field confidence wins;
      ties go to ``paddle``
    - vendor: a name with a company indicator beats one without; if both
      or neither have one, the higher vendor confidence wins
    - CUI: a well-formed code beats a malformed one; otherwise ``paddle``
    - TVA: the entry list with the larger summed amount is taken as a
      whole (entries of the two sides are not mixed)
    - all remaining fields: ``paddle`` value, else ``tesseract`` value

    When only one side has a value for a field, that value is used.

    If exactly one argument is None the other one is returned as-is (the
    same object); if both are None an empty ExtractionResult is returned.
    """
    merged = ExtractionResult()

    # With at most one input there is nothing to compare
    if paddle is None:
        return merged if tesseract is None else tesseract
    if tesseract is None:
        return paddle

    def adopt(source, field, confidence_field):
        # Copy a value and its confidence score onto the merged result
        setattr(merged, field, getattr(source, field))
        setattr(merged, confidence_field, getattr(source, confidence_field))

    def adopt_first_available(field, confidence_field):
        # Used when the two sides do not both have the field: take whichever has it
        for source in (paddle, tesseract):
            if getattr(source, field):
                adopt(source, field, confidence_field)
                return

    print("[Merge] Comparing PaddleOCR vs Tesseract extractions...", flush=True)

    # --- Amount: higher confidence wins ---
    if paddle.amount and tesseract.amount:
        if paddle.confidence_amount >= tesseract.confidence_amount:
            adopt(paddle, 'amount', 'confidence_amount')
            print(f"[Merge] Amount: PaddleOCR {paddle.amount} (conf: {paddle.confidence_amount:.0%})", flush=True)
        else:
            adopt(tesseract, 'amount', 'confidence_amount')
            print(f"[Merge] Amount: Tesseract {tesseract.amount} (conf: {tesseract.confidence_amount:.0%})", flush=True)
    else:
        adopt_first_available('amount', 'confidence_amount')

    # --- Date: higher confidence wins ---
    if paddle.receipt_date and tesseract.receipt_date:
        if paddle.confidence_date >= tesseract.confidence_date:
            adopt(paddle, 'receipt_date', 'confidence_date')
            print(f"[Merge] Date: PaddleOCR {paddle.receipt_date}", flush=True)
        else:
            adopt(tesseract, 'receipt_date', 'confidence_date')
            print(f"[Merge] Date: Tesseract {tesseract.receipt_date}", flush=True)
    else:
        adopt_first_available('receipt_date', 'confidence_date')

    # --- Vendor: company indicator first, confidence as tie-breaker ---
    paddle_marked = self._has_company_indicator(paddle.partner_name)
    tesseract_marked = self._has_company_indicator(tesseract.partner_name)

    if paddle.partner_name and tesseract.partner_name:
        if paddle_marked != tesseract_marked:
            # Exactly one of the names carries a company indicator
            if paddle_marked:
                adopt(paddle, 'partner_name', 'confidence_vendor')
                print(f"[Merge] Vendor: PaddleOCR '{paddle.partner_name}' (has company indicator)", flush=True)
            else:
                adopt(tesseract, 'partner_name', 'confidence_vendor')
                print(f"[Merge] Vendor: Tesseract '{tesseract.partner_name}' (has company indicator)", flush=True)
        elif paddle.confidence_vendor >= tesseract.confidence_vendor:
            adopt(paddle, 'partner_name', 'confidence_vendor')
            print(f"[Merge] Vendor: PaddleOCR '{paddle.partner_name}' (higher conf)", flush=True)
        else:
            adopt(tesseract, 'partner_name', 'confidence_vendor')
            print(f"[Merge] Vendor: Tesseract '{tesseract.partner_name}' (higher conf)", flush=True)
    else:
        adopt_first_available('partner_name', 'confidence_vendor')

    # --- CUI: well-formed beats malformed, otherwise keep paddle's ---
    paddle_cui_ok = self._is_valid_cui(paddle.cui)
    tesseract_cui_ok = self._is_valid_cui(tesseract.cui)

    if paddle.cui and tesseract.cui:
        if tesseract_cui_ok and not paddle_cui_ok:
            merged.cui = tesseract.cui
            print(f"[Merge] CUI: Tesseract {tesseract.cui} (valid format)", flush=True)
        elif paddle_cui_ok and not tesseract_cui_ok:
            merged.cui = paddle.cui
            print(f"[Merge] CUI: PaddleOCR {paddle.cui} (valid format)", flush=True)
        else:
            # Both valid or both invalid
            merged.cui = paddle.cui
            print(f"[Merge] CUI: PaddleOCR {paddle.cui}", flush=True)
    elif paddle.cui:
        merged.cui = paddle.cui
    elif tesseract.cui:
        merged.cui = tesseract.cui

    # --- TVA: take the list whose amounts add up to more ---
    if paddle.tva_entries and tesseract.tva_entries:
        paddle_sum = sum(entry.get('amount', Decimal('0')) for entry in paddle.tva_entries)
        tesseract_sum = sum(entry.get('amount', Decimal('0')) for entry in tesseract.tva_entries)

        if paddle_sum >= tesseract_sum:
            merged.tva_entries = paddle.tva_entries
            merged.tva_total = paddle.tva_total
            print(f"[Merge] TVA: PaddleOCR (total: {paddle_sum})", flush=True)
        else:
            merged.tva_entries = tesseract.tva_entries
            merged.tva_total = tesseract.tva_total
            print(f"[Merge] TVA: Tesseract (total: {tesseract_sum})", flush=True)
    elif paddle.tva_entries:
        merged.tva_entries = paddle.tva_entries
        merged.tva_total = paddle.tva_total
    elif tesseract.tva_entries:
        merged.tva_entries = tesseract.tva_entries
        merged.tva_total = tesseract.tva_total

    # --- Everything else: paddle's value, falling back to tesseract's ---
    for field in (
        'receipt_number',
        'receipt_series',
        'receipt_type',
        'items_count',
        'address',
        'description',
    ):
        setattr(merged, field, getattr(paddle, field) or getattr(tesseract, field))

    return merged
|
||||||
|
|
||||||
|
def _has_company_indicator(self, name: Optional[str]) -> bool:
|
||||||
|
"""Check if vendor name has company type indicator (S.R.L., S.A., etc.)"""
|
||||||
|
if not name:
|
||||||
|
return False
|
||||||
|
name_upper = name.upper()
|
||||||
|
indicators = [
|
||||||
|
r'\bS\.?\s*R\.?\s*L\.?\b',
|
||||||
|
r'\bS\.?\s*A\.?\b',
|
||||||
|
r'\bS\.?\s*N\.?\s*C\.?\b',
|
||||||
|
r'\bP\.?\s*F\.?\s*A\.?\b',
|
||||||
|
r'\bI\.?\s*I\.?\b',
|
||||||
|
r'\bHOLDING\b',
|
||||||
|
r'\bGROUP\b',
|
||||||
|
r'\bCOMPANY\b',
|
||||||
|
]
|
||||||
|
for indicator in indicators:
|
||||||
|
if re.search(indicator, name_upper):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def _is_valid_cui(self, cui: Optional[str]) -> bool:
|
||||||
|
"""Validate CUI format: 6-10 digits."""
|
||||||
|
if not cui:
|
||||||
|
return False
|
||||||
|
# Remove any RO prefix
|
||||||
|
cui_clean = re.sub(r'^RO', '', cui.upper())
|
||||||
|
# Must be 6-10 digits
|
||||||
|
return bool(re.match(r'^\d{6,10}$', cui_clean))
|
||||||
|
|
||||||
|
def _is_extraction_complete(self, ext: ExtractionResult, min_confidence: float = 0.85) -> bool:
    """
    Decide whether an extraction is good enough to stop the OCR pipeline early.

    Early exit requires BOTH of the following:
    - ``ext.overall_confidence`` is at least ``min_confidence`` (default 85%)
    - all 5 critical fields are present: number, date, amount, TVA, CUI
      (TVA counts as present when either ``tva_total`` or ``tva_entries``
      is truthy)

    A progress line is printed for every outcome.

    Returns:
        True when both conditions hold, False otherwise.
    """
    # Confidence gate comes first: a low score makes the field check moot.
    if ext.overall_confidence < min_confidence:
        print(f"[OCR] Confidence {ext.overall_confidence:.0%} < {min_confidence:.0%} - continuing", flush=True)
        return False

    # (label, is_present) pairs, listed in the order they are reported.
    field_presence = (
        ("number", bool(ext.receipt_number)),
        ("date", bool(ext.receipt_date)),
        ("amount", bool(ext.amount)),
        ("TVA", bool(ext.tva_total) or bool(ext.tva_entries)),
        ("CUI", bool(ext.cui)),
    )
    missing = [label for label, present in field_presence if not present]

    if missing:
        print(f"[OCR] Missing: {', '.join(missing)} - continuing", flush=True)
        return False

    print(f"[OCR] ✓ All 5 fields found with {ext.overall_confidence:.0%} confidence", flush=True)
    return True
|
||||||
|
|
||||||
|
|
||||||
# Singleton instance
|
# Singleton instance
|
||||||
ocr_service = OCRService()
|
ocr_service = OCRService()
|
||||||
|
|||||||
@@ -106,14 +106,27 @@
|
|||||||
|
|
||||||
<!-- Raw Text Toggle -->
|
<!-- Raw Text Toggle -->
|
||||||
<div class="raw-text-section" v-if="data.raw_text">
|
<div class="raw-text-section" v-if="data.raw_text">
|
||||||
<Button
|
<div class="raw-text-header">
|
||||||
:label="showRawText ? 'Ascunde text OCR' : 'Arata text OCR'"
|
<Button
|
||||||
:icon="showRawText ? 'pi pi-eye-slash' : 'pi pi-eye'"
|
:label="showRawText ? 'Ascunde text OCR' : 'Arata text OCR'"
|
||||||
severity="secondary"
|
:icon="showRawText ? 'pi pi-eye-slash' : 'pi pi-eye'"
|
||||||
size="small"
|
severity="secondary"
|
||||||
text
|
size="small"
|
||||||
@click="showRawText = !showRawText"
|
text
|
||||||
/>
|
@click="showRawText = !showRawText"
|
||||||
|
/>
|
||||||
|
<span v-if="data.ocr_engine" class="ocr-engine-badge" :class="getEngineClass(data.ocr_engine)">
|
||||||
|
<i :class="getEngineIcon(data.ocr_engine)"></i>
|
||||||
|
{{ getEngineLabel(data.ocr_engine) }}
|
||||||
|
</span>
|
||||||
|
<span v-if="data._ocr_message" class="ocr-message-badge" :class="getMessageClass(data._ocr_message)">
|
||||||
|
{{ data._ocr_message }}
|
||||||
|
</span>
|
||||||
|
<span v-if="data.processing_time_ms" class="ocr-time-badge">
|
||||||
|
<i class="pi pi-clock"></i>
|
||||||
|
{{ formatProcessingTime(data.processing_time_ms) }}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
<div v-if="showRawText" class="raw-text">
|
<div v-if="showRawText" class="raw-text">
|
||||||
<pre>{{ data.raw_text }}</pre>
|
<pre>{{ data.raw_text }}</pre>
|
||||||
</div>
|
</div>
|
||||||
@@ -168,6 +181,45 @@ const formatDate = (dateStr) => {
|
|||||||
year: 'numeric'
|
year: 'numeric'
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Map an OCR engine identifier to the CSS modifier class of its badge.
// Exact pipeline names are checked first; anything else falls back to a
// substring match on the engine family, and unknown engines get no class.
const getEngineClass = (engine) => {
  if (!engine) return ''
  switch (engine) {
    case 'paddle-light':
      return 'fast'
    case 'paddle-adaptive':
      return 'adaptive'
    case 'adaptive-full':
      return 'full'
    default:
      if (engine.includes('paddle')) return 'paddleocr'
      return engine.includes('tesseract') ? 'tesseract' : ''
  }
}
|
||||||
|
|
||||||
|
// Pick the PrimeIcons class for the engine badge: a lightning bolt for the
// fast path ('paddle-light'), the generic cog for every other engine,
// including a missing one.
const getEngineIcon = (engine) => {
  const isFastMode = engine === 'paddle-light'
  return isFastMode ? 'pi pi-bolt' : 'pi pi-cog'
}
|
||||||
|
|
||||||
|
// Human-readable label for the engine badge. Known pipeline names get a
// dedicated label, other engines are labelled by family, and anything
// unrecognised is shown verbatim.
const getEngineLabel = (engine) => {
  if (!engine) return ''
  switch (engine) {
    case 'paddle-light':
      return 'Fast Mode (PaddleOCR)'
    case 'paddle-adaptive':
      return 'Adaptive (Paddle dual)'
    case 'adaptive-full':
      return 'Full Pipeline'
    default:
      if (engine.includes('paddle')) return 'PaddleOCR'
      return engine.includes('tesseract') ? 'Tesseract' : engine
  }
}
|
||||||
|
|
||||||
|
// Choose the CSS modifier for the OCR message badge from the wording of the
// message. The first matching rule wins; the match is case-sensitive.
const getMessageClass = (message) => {
  if (!message) return ''
  const rules = [
    ['fast mode', 'fast-mode'],
    ['full pipeline', 'full-pipeline']
  ]
  const hit = rules.find(([needle]) => message.includes(needle))
  return hit ? hit[1] : ''
}
|
||||||
|
|
||||||
|
// Render a duration given in milliseconds: values below one second are shown
// as-is with an "ms" suffix, longer ones as seconds with one decimal place.
const formatProcessingTime = (ms) => {
  const isSubSecond = ms < 1000
  return isSubSecond ? `${ms}ms` : `${(ms / 1000).toFixed(1)}s`
}
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
<style scoped>
|
<style scoped>
|
||||||
@@ -305,6 +357,82 @@ const formatDate = (dateStr) => {
|
|||||||
border-top: 1px dashed #86efac;
|
border-top: 1px dashed #86efac;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Row holding the raw-text toggle button and the OCR status badges;
   items wrap onto further lines when the row is too narrow. */
.raw-text-header {
  display: flex;
  flex-wrap: wrap;
  align-items: center;
  gap: 0.75rem;
}
|
||||||
|
|
||||||
|
/* Engine badge: base pill shape; colours come from the modifier classes below. */
.ocr-engine-badge {
  display: inline-flex;
  align-items: center;
  gap: 0.25rem;
  padding: 0.25rem 0.5rem;
  border-radius: 4px;
  font-weight: 500;
  font-size: 0.75rem;
}

/* Blue: generic PaddleOCR engine. */
.ocr-engine-badge.paddleocr {
  color: #1e40af;
  background: #dbeafe;
}

/* Amber: Tesseract engine. */
.ocr-engine-badge.tesseract {
  color: #92400e;
  background: #fef3c7;
}

/* Green: fast mode. */
.ocr-engine-badge.fast {
  color: #166534;
  background: #dcfce7;
}

/* Blue: adaptive mode. */
.ocr-engine-badge.adaptive {
  color: #1e40af;
  background: #dbeafe;
}

/* Amber: full pipeline. */
.ocr-engine-badge.full {
  color: #92400e;
  background: #fef3c7;
}
|
||||||
|
|
||||||
|
/* OCR message badge: neutral grey pill by default. */
.ocr-message-badge {
  display: inline-flex;
  align-items: center;
  gap: 0.25rem;
  padding: 0.25rem 0.5rem;
  border-radius: 4px;
  font-weight: 500;
  font-size: 0.75rem;
  color: #475569;
  background: #f1f5f9;
}

/* Green variant for fast-mode messages. */
.ocr-message-badge.fast-mode {
  color: #166534;
  background: #dcfce7;
}

/* Amber variant for full-pipeline messages. */
.ocr-message-badge.full-pipeline {
  color: #92400e;
  background: #fef3c7;
}
|
||||||
|
|
||||||
|
/* Processing-time badge: indigo pill, slightly bolder than the other badges. */
.ocr-time-badge {
  display: inline-flex;
  align-items: center;
  gap: 0.25rem;
  padding: 0.25rem 0.5rem;
  border-radius: 4px;
  font-weight: 600;
  font-size: 0.75rem;
  color: #3730a3;
  background: #e0e7ff;
}
|
||||||
|
|
||||||
.raw-text {
|
.raw-text {
|
||||||
margin-top: 0.5rem;
|
margin-top: 0.5rem;
|
||||||
padding: 0.75rem;
|
padding: 0.75rem;
|
||||||
|
|||||||
@@ -143,7 +143,12 @@ const processOCR = async () => {
|
|||||||
})
|
})
|
||||||
|
|
||||||
if (response.data.success) {
|
if (response.data.success) {
|
||||||
emit('ocr-result', response.data.data)
|
// Include the OCR message in the data for debugging
|
||||||
|
const resultData = {
|
||||||
|
...response.data.data,
|
||||||
|
_ocr_message: response.data.message
|
||||||
|
}
|
||||||
|
emit('ocr-result', resultData)
|
||||||
} else {
|
} else {
|
||||||
error.value = response.data.message || 'OCR processing failed'
|
error.value = response.data.message || 'OCR processing failed'
|
||||||
emit('error', error.value)
|
emit('error', error.value)
|
||||||
|
|||||||
Reference in New Issue
Block a user