feat: Improve OCR adaptive pipeline with early exit and better pattern matching

- Add adaptive 3-step OCR pipeline with early exit when all 5 fields are found
- Add pattern for "C. I. F." with spaces (OCR artifact from PaddleOCR)
- Add pattern for YYYY. MM. DD date format with spaces (OMV/Petrom receipts)
- Add pattern for "OTAL TAXE" (leading "T" cut off by OCR) and for the amount appearing before the label
- Make TVA rate pattern more flexible (code letter optional, handle "-21%")
- Replace logger.info with print(flush=True) for better debugging visibility
- Improve OCRPreview.vue to show extraction progress and raw OCR text

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-13 01:54:52 +02:00
parent 6c3dd89f6d
commit 9f06482681
9 changed files with 952 additions and 116 deletions

View File

@@ -1,10 +1,19 @@
"""FastAPI application entry point for Data Entry App.""" """FastAPI application entry point for Data Entry App."""
import sys import sys
import logging
import threading
from pathlib import Path from pathlib import Path
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from fastapi import FastAPI from fastapi import FastAPI
# Configure logging to show INFO level messages
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%H:%M:%S'
)
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles
@@ -30,6 +39,18 @@ async def lifespan(app: FastAPI):
settings.upload_path_resolved settings.upload_path_resolved
print(f"Upload path: {settings.upload_path_resolved}") print(f"Upload path: {settings.upload_path_resolved}")
# Pre-initialize OCR engine in background (PaddleOCR takes 15-20s)
def init_ocr_background():
    """Pre-initialize the OCR engine; report the outcome on stdout.

    Intended to run off the main thread. Any failure is reported as a
    warning and otherwise ignored, so a broken OCR install does not stop
    application startup.
    """
    try:
        from app.services.ocr_service import ocr_service as service
        engine = service.ocr_engine
        engine._init_paddle_lazy()
        print("OCR engine ready")
    except Exception as exc:
        # Best-effort pre-load: surface the problem and carry on.
        print(f"Warning: OCR engine pre-load failed: {exc}")
print("Starting OCR engine pre-load (background)...")
threading.Thread(target=init_ocr_background, daemon=True).start()
yield yield
# Shutdown # Shutdown

View File

@@ -102,6 +102,8 @@ async def extract_from_image(file: UploadFile = File(...)):
confidence_vendor=result.confidence_vendor, confidence_vendor=result.confidence_vendor,
overall_confidence=result.overall_confidence, overall_confidence=result.overall_confidence,
raw_text=result.raw_text, raw_text=result.raw_text,
ocr_engine=result.ocr_engine,
processing_time_ms=result.processing_time_ms,
) )
return OCRResponse(success=True, message=message, data=data) return OCRResponse(success=True, message=message, data=data)
@@ -171,6 +173,8 @@ async def extract_from_attachment(
confidence_vendor=result.confidence_vendor, confidence_vendor=result.confidence_vendor,
overall_confidence=result.overall_confidence, overall_confidence=result.overall_confidence,
raw_text=result.raw_text, raw_text=result.raw_text,
ocr_engine=result.ocr_engine,
processing_time_ms=result.processing_time_ms,
) )
return OCRResponse(success=True, message=message, data=data) return OCRResponse(success=True, message=message, data=data)

View File

@@ -37,6 +37,8 @@ class ExtractionData(BaseModel):
confidence_vendor: float = Field(default=0.0, ge=0, le=1, description="Vendor extraction confidence") confidence_vendor: float = Field(default=0.0, ge=0, le=1, description="Vendor extraction confidence")
overall_confidence: float = Field(default=0.0, ge=0, le=1, description="Overall confidence score") overall_confidence: float = Field(default=0.0, ge=0, le=1, description="Overall confidence score")
raw_text: str = Field(default="", description="Raw OCR text") raw_text: str = Field(default="", description="Raw OCR text")
ocr_engine: str = Field(default="", description="OCR engine used: paddleocr or tesseract")
processing_time_ms: int = Field(default=0, ge=0, description="Processing time in milliseconds")
class Config: class Config:
"""Pydantic config.""" """Pydantic config."""

View File

@@ -23,37 +23,57 @@ class ImagePreprocessor:
raise ValueError(f"Could not load image: {path}") raise ValueError(f"Could not load image: {path}")
return image return image
def pdf_to_images(self, path: Path, dpi: int = 400) -> List[np.ndarray]: def pdf_to_images(self, path: Path, dpi: int = 300) -> List[np.ndarray]:
""" """
Convert PDF to images with high DPI for better OCR. Convert PDF to images.
Args: Args:
path: Path to PDF file path: Path to PDF file
dpi: Resolution (400 recommended for receipts, higher = better quality but slower) dpi: Resolution (300 = fast & good quality, 400 = better but slower)
""" """
if not PDF_AVAILABLE: if not PDF_AVAILABLE:
raise RuntimeError("pdf2image not available. Install with: pip install pdf2image") raise RuntimeError("pdf2image not available. Install with: pip install pdf2image")
# Use 400 DPI for better text recognition on thermal receipts
images = pdf2image.convert_from_path(str(path), dpi=dpi) images = pdf2image.convert_from_path(str(path), dpi=dpi)
return [np.array(img) for img in images] return [np.array(img) for img in images]
def preprocess(self, image: np.ndarray, high_quality: bool = True) -> np.ndarray: def preprocess(self, image: np.ndarray, high_quality: bool = True) -> np.ndarray:
""" """
Apply preprocessing pipeline for thermal receipt images. Apply LIGHT preprocessing - better for clear PDFs.
Heavy binarization can destroy text on clear images.
"""
return self.preprocess_light(image)
Pipeline: def preprocess_light(self, image: np.ndarray) -> np.ndarray:
1. Convert to grayscale """
2. Resize if too small (min 1500px width for high quality) Light preprocessing for CLEAR images (PDFs, good scans).
3. Deskew (straighten rotated text) Preserves original quality, only enhances contrast.
4. Contrast enhancement (CLAHE) """
5. Denoise (Non-local means) # 1. Grayscale
6. Sharpening (for clearer text edges) if len(image.shape) == 3:
7. Adaptive thresholding (binarization) gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
8. Morphological operations (connect broken chars) else:
gray = image.copy()
Args: # 2. Resize if too small
image: Input image (BGR or grayscale) height, width = gray.shape
high_quality: If True, apply more aggressive preprocessing if width < 1500:
scale = 1500 / width
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
# 3. Deskew
gray = self._deskew(gray)
# 4. Light contrast enhancement only
clahe = cv2.createCLAHE(clipLimit=1.5, tileGridSize=(8, 8))
enhanced = clahe.apply(gray)
# NO binarization, NO morphological ops - preserve original quality
return enhanced
def preprocess_heavy(self, image: np.ndarray) -> np.ndarray:
"""
Heavy preprocessing for FADED thermal receipts.
Aggressive binarization to recover faded text.
""" """
# 1. Grayscale # 1. Grayscale
if len(image.shape) == 3: if len(image.shape) == 3:
@@ -63,57 +83,48 @@ class ImagePreprocessor:
# 2. Resize if too small (larger = better OCR) # 2. Resize if too small (larger = better OCR)
height, width = gray.shape height, width = gray.shape
min_width = 1500 if high_quality else 1000 if width < 1500:
if width < min_width: scale = 1500 / width
scale = min_width / width gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
gray = cv2.resize(
gray, None, fx=scale, fy=scale,
interpolation=cv2.INTER_CUBIC
)
# 3. Deskew # 3. Deskew
gray = self._deskew(gray) gray = self._deskew(gray)
# 4. Contrast enhancement with CLAHE (Contrast Limited Adaptive Histogram Equalization) # 4. Contrast enhancement with CLAHE
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)) clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
enhanced = clahe.apply(gray) enhanced = clahe.apply(gray)
# 5. Denoise (slightly less aggressive to preserve text details) # 5. Denoise
denoised = cv2.fastNlMeansDenoising( denoised = cv2.fastNlMeansDenoising(enhanced, h=8, templateWindowSize=7, searchWindowSize=21)
enhanced, h=8, # Lower h = preserve more details
templateWindowSize=7,
searchWindowSize=21
)
# 6. Sharpening to enhance text edges # 6. Sharpening
if high_quality: gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0)
# Unsharp mask for better text clarity sharpened = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0)
gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0)
sharpened = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0)
else:
sharpened = denoised
# 7. Adaptive thresholding with optimized parameters # 7. Adaptive thresholding (binarization)
binary = cv2.adaptiveThreshold( binary = cv2.adaptiveThreshold(
sharpened, 255, sharpened, 255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY, cv2.THRESH_BINARY,
blockSize=11, # Smaller block = better for small text blockSize=11, C=5
C=5 # Lower C = darker result, better for faded receipts
) )
# 8. Morphological operations # 8. Morphological operations
# Close small gaps in characters
kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2)) kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
result = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel_close) result = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel_close)
# Optional: Remove small noise spots
if high_quality:
kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
result = cv2.morphologyEx(result, cv2.MORPH_OPEN, kernel_open)
return result return result
def get_all_variants(self, image: np.ndarray) -> List[np.ndarray]:
    """
    Build both preprocessing variants of ``image`` for OCR (fast mode).

    Args:
        image: Input image, passed unchanged to each preprocessing method.

    Returns:
        Two-element list: the ``preprocess_light`` output followed by the
        ``preprocess_heavy`` output.
    """
    light_variant = self.preprocess_light(image)
    heavy_variant = self.preprocess_heavy(image)
    return [light_variant, heavy_variant]
def _deskew(self, image: np.ndarray) -> np.ndarray: def _deskew(self, image: np.ndarray) -> np.ndarray:
"""Correct image rotation/skew using Hough lines.""" """Correct image rotation/skew using Hough lines."""
edges = cv2.Canny(image, 50, 150, apertureSize=3) edges = cv2.Canny(image, 50, 150, apertureSize=3)

View File

@@ -1,11 +1,16 @@
"""OCR engine wrapper for PaddleOCR and Tesseract.""" """OCR engine wrapper for PaddleOCR and Tesseract."""
import os import os
import logging
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, Optional from typing import List, Optional, Tuple
import numpy as np import numpy as np
# Setup logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO) # Ensure logs are visible
# Disable PaddleOCR model source check for faster startup (PaddleX 3.x) # Disable PaddleOCR model source check for faster startup (PaddleX 3.x)
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True' os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
@@ -40,6 +45,7 @@ class OCRResult:
text: str text: str
confidence: float confidence: float
boxes: List[dict] boxes: List[dict]
engine: str = "" # OCR engine used: paddleocr or tesseract
class OCREngine: class OCREngine:
@@ -65,8 +71,9 @@ class OCREngine:
print("Initializing PaddleOCR engine...") print("Initializing PaddleOCR engine...")
# PaddleOCR 3.x API - optimized for Romanian receipts # PaddleOCR 3.x API - optimized for Romanian receipts
# Note: 'latin' not available in PaddleOCR 3.x, 'en' works well for receipts
self._paddle = PaddleOCR( self._paddle = PaddleOCR(
lang='en', # 'en' works better than 'ro' for mixed alphanumeric lang='en', # 'en' handles Latin alphabet well for receipts
# High quality settings for better accuracy # High quality settings for better accuracy
det_db_thresh=0.3, # Lower threshold = detect more text (default 0.3) det_db_thresh=0.3, # Lower threshold = detect more text (default 0.3)
det_db_box_thresh=0.5, # Box confidence threshold (default 0.5) det_db_box_thresh=0.5, # Box confidence threshold (default 0.5)
@@ -81,14 +88,19 @@ class OCREngine:
def recognize(self, image: np.ndarray) -> OCRResult: def recognize(self, image: np.ndarray) -> OCRResult:
"""Perform OCR on preprocessed image.""" """Perform OCR on preprocessed image."""
logger.info(f"[OCR] Starting recognition, image shape: {image.shape}, dtype: {image.dtype}")
# Lazy init PaddleOCR on first call # Lazy init PaddleOCR on first call
self._init_paddle_lazy() self._init_paddle_lazy()
if PADDLE_AVAILABLE and self._paddle: if PADDLE_AVAILABLE and self._paddle:
logger.info("[OCR] Using PaddleOCR engine")
return self._paddle_recognize(image) return self._paddle_recognize(image)
elif TESSERACT_AVAILABLE: elif TESSERACT_AVAILABLE:
logger.info("[OCR] Using Tesseract engine (PaddleOCR not available)")
return self._tesseract_recognize(image) return self._tesseract_recognize(image)
else: else:
logger.error("[OCR] No OCR engine available!")
raise RuntimeError( raise RuntimeError(
"No OCR engine available. Install PaddleOCR or Tesseract." "No OCR engine available. Install PaddleOCR or Tesseract."
) )
@@ -96,17 +108,23 @@ class OCREngine:
def _paddle_recognize(self, image: np.ndarray) -> OCRResult: def _paddle_recognize(self, image: np.ndarray) -> OCRResult:
"""Recognize text using PaddleOCR 3.x API.""" """Recognize text using PaddleOCR 3.x API."""
try: try:
logger.info(f"[PaddleOCR] Processing image, shape: {image.shape}")
# PaddleOCR 3.x requires 3-channel images # PaddleOCR 3.x requires 3-channel images
if len(image.shape) == 2: if len(image.shape) == 2:
# Convert grayscale to 3-channel BGR # Convert grayscale to 3-channel BGR
import cv2 import cv2
image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR) image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
logger.info(f"[PaddleOCR] Converted to BGR, new shape: {image.shape}")
# PaddleOCR 3.x uses predict() with new parameter names # PaddleOCR 3.x uses predict() with new parameter names
logger.info("[PaddleOCR] Calling predict()...")
result = self._paddle.predict(image, use_textline_orientation=True) result = self._paddle.predict(image, use_textline_orientation=True)
logger.info(f"[PaddleOCR] predict() returned, result type: {type(result)}")
if not result or len(result) == 0: if not result or len(result) == 0:
return OCRResult(text="", confidence=0.0, boxes=[]) logger.warning("[PaddleOCR] No results returned")
return OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")
# PaddleOCR 3.x returns OCRResult objects with different structure # PaddleOCR 3.x returns OCRResult objects with different structure
ocr_result = result[0] ocr_result = result[0]
@@ -117,7 +135,7 @@ class OCREngine:
dt_polys = ocr_result.get('dt_polys', []) dt_polys = ocr_result.get('dt_polys', [])
if not rec_texts: if not rec_texts:
return OCRResult(text="", confidence=0.0, boxes=[]) return OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")
boxes = [] boxes = []
for i, text in enumerate(rec_texts): for i, text in enumerate(rec_texts):
@@ -130,13 +148,17 @@ class OCREngine:
}) })
avg_conf = sum(rec_scores) / len(rec_scores) if rec_scores else 0.0 avg_conf = sum(rec_scores) / len(rec_scores) if rec_scores else 0.0
text_result = '\n'.join(rec_texts)
logger.info(f"[PaddleOCR] SUCCESS - Found {len(rec_texts)} text lines, avg confidence: {avg_conf:.2%}")
logger.debug(f"[PaddleOCR] Raw text preview: {text_result[:200]}...")
return OCRResult( return OCRResult(
text='\n'.join(rec_texts), text=text_result,
confidence=float(avg_conf), confidence=float(avg_conf),
boxes=boxes boxes=boxes,
engine="paddleocr"
) )
except Exception as e: except Exception as e:
print(f"PaddleOCR error: {e}, falling back to Tesseract") logger.error(f"[PaddleOCR] ERROR: {e}, falling back to Tesseract")
if TESSERACT_AVAILABLE: if TESSERACT_AVAILABLE:
return self._tesseract_recognize(image) return self._tesseract_recognize(image)
raise raise
@@ -145,23 +167,70 @@ class OCREngine:
"""Recognize text using Tesseract.""" """Recognize text using Tesseract."""
global pytesseract global pytesseract
logger.info(f"[Tesseract] Processing image, shape: {image.shape}")
# Lazy import pytesseract # Lazy import pytesseract
if pytesseract is None: if pytesseract is None:
print("Importing pytesseract...") logger.info("[Tesseract] Importing pytesseract...")
import pytesseract as _pytesseract import pytesseract as _pytesseract
pytesseract = _pytesseract pytesseract = _pytesseract
config = '--psm 6 -l ron+eng' # PSM 4: Single column (best for receipts)
config = '--psm 4 -l ron+eng'
text = pytesseract.image_to_string(image, config=config) text = pytesseract.image_to_string(image, config=config)
data = pytesseract.image_to_data(
image, config=config,
output_type=pytesseract.Output.DICT
)
# Quick confidence estimate
data = pytesseract.image_to_data(image, config=config, output_type=pytesseract.Output.DICT)
confidences = [int(c) for c in data['conf'] if int(c) > 0] confidences = [int(c) for c in data['conf'] if int(c) > 0]
avg_conf = sum(confidences) / len(confidences) / 100 if confidences else 0.0 avg_conf = sum(confidences) / len(confidences) / 100 if confidences else 0.0
return OCRResult(text=text, confidence=avg_conf, boxes=[]) logger.info(f"[Tesseract] Done: {len(text)} chars, conf: {avg_conf:.2%}")
return OCRResult(text=text, confidence=avg_conf, boxes=[], engine="tesseract")
def recognize_dual(self, image: np.ndarray) -> Tuple[OCRResult, Optional[OCRResult]]:
    """
    Run both OCR engines on the same image and return both results.

    Args:
        image: Image array handed unchanged to each engine.

    Returns:
        Tuple of (paddle_result, tesseract_result).
        - paddle_result falls back to the Tesseract result when PaddleOCR
          is not available, so both elements may be the same object.
        - tesseract_result is None when Tesseract is not available.
        An engine that raises contributes an empty OCRResult
        (text "", confidence 0.0) tagged with that engine's name.

    Raises:
        RuntimeError: If neither engine is available.
    """
    logger.info(f"[OCR Dual] Starting dual recognition, image shape: {image.shape}")

    # Lazy init PaddleOCR
    self._init_paddle_lazy()

    paddle_result: Optional[OCRResult] = None
    tesseract_result: Optional[OCRResult] = None

    # Run PaddleOCR
    if PADDLE_AVAILABLE and self._paddle:
        try:
            logger.info("[OCR Dual] Running PaddleOCR...")
            paddle_result = self._paddle_recognize(image)
            logger.info(f"[OCR Dual] PaddleOCR: {len(paddle_result.text)} chars, conf: {paddle_result.confidence:.2%}")
        except Exception as e:
            # logger.exception keeps the traceback; the failure is then
            # downgraded to an empty result so Tesseract still gets a turn.
            logger.exception(f"[OCR Dual] PaddleOCR failed: {e}")
            paddle_result = OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")

    # Run Tesseract
    if TESSERACT_AVAILABLE:
        try:
            logger.info("[OCR Dual] Running Tesseract...")
            tesseract_result = self._tesseract_recognize(image)
            logger.info(f"[OCR Dual] Tesseract: {len(tesseract_result.text)} chars, conf: {tesseract_result.confidence:.2%}")
        except Exception as e:
            logger.exception(f"[OCR Dual] Tesseract failed: {e}")
            tesseract_result = OCRResult(text="", confidence=0.0, boxes=[], engine="tesseract")

    # Fallback if PaddleOCR not available
    if paddle_result is None:
        # Explicit None check: an empty-but-present result is still a result.
        if tesseract_result is None:
            raise RuntimeError("No OCR engine available")
        paddle_result = tesseract_result

    return paddle_result, tesseract_result
@staticmethod @staticmethod
def get_available_engines() -> List[str]: def get_available_engines() -> List[str]:

View File

@@ -28,6 +28,8 @@ class ExtractionResult:
confidence_date: float = 0.0 confidence_date: float = 0.0
confidence_vendor: float = 0.0 confidence_vendor: float = 0.0
raw_text: str = "" raw_text: str = ""
ocr_engine: str = "" # OCR engine used: paddleocr or tesseract
processing_time_ms: int = 0 # Processing time in milliseconds
@property @property
def overall_confidence(self) -> float: def overall_confidence(self) -> float:
@@ -70,6 +72,7 @@ class ReceiptExtractor:
# Date patterns - support dash, dot, and slash separators # Date patterns - support dash, dot, and slash separators
# OCR may produce DRTA instead of DATA, DAIA, etc. # OCR may produce DRTA instead of DATA, DAIA, etc.
# OCR may also add spaces/commas in dates: "27. 10, 2025" instead of "27.10.2025"
DATE_PATTERNS = [ DATE_PATTERNS = [
# DATA/DRTA/DAIA: DD-MM-YYYY (OCR tolerant) # DATA/DRTA/DAIA: DD-MM-YYYY (OCR tolerant)
(r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98), (r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
@@ -84,6 +87,19 @@ class ReceiptExtractor:
(r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75), (r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75),
] ]
# OCR-corrupted date patterns with spaces/commas between the date parts.
# Handles: "27. 10, 2025", "27, 10. 2025", "2025. 08. 14", etc.
# Each entry is (regex, confidence, order). The regex captures the three
# numeric parts in groups 1-3; the order tag tells the consumer how to read
# them: 'ymd' = (year, month, day), 'dmy' = (day, month, year).
# Either '.' or ',' is accepted as separator, optionally followed by whitespace.
DATE_PATTERNS_OCR_SPACES = [
# YYYY. MM. DD format with spaces (OMV/Petrom receipts) - followed by HH:MM time
(r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.92, 'ymd'),
# YYYY. MM. DD format with spaces (standalone, lower confidence)
(r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.85, 'ymd'),
# DD. MM, YYYY or DD, MM. YYYY - followed by HH:MM time
(r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
# DD. MM, YYYY or DD, MM. YYYY (standalone, lower confidence)
(r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
]
# Receipt number patterns - Romanian fiscal receipt formats # Receipt number patterns - Romanian fiscal receipt formats
# OCR may produce N instead of : or other errors # OCR may produce N instead of : or other errors
NUMBER_PATTERNS = [ NUMBER_PATTERNS = [
@@ -127,12 +143,23 @@ class ReceiptExtractor:
(r'\bC[I1]F\s*:?\s*(?:R[O0])?(\d{6,10})\b', 0.90), (r'\bC[I1]F\s*:?\s*(?:R[O0])?(\d{6,10})\b', 0.90),
# COD FISCAL (vendor) # COD FISCAL (vendor)
(r'COD\s+FISCAL\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90), (r'COD\s+FISCAL\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
# C.I.F. format (with dots) # C. I. F. format with SPACES (OCR artifact) - "C. I. F. : R011201891"
(r'C\.\s*I\.\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.92),
# C.I.F. format (with dots, no spaces)
(r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.88), (r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.88),
# CUI format (less specific, use with caution) # CUI format (less specific, use with caution)
(r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.85), (r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.85),
] ]
# Patterns for a CIF NUMBER appearing BEFORE the "C.I.F." label (reversed format).
# Seen on some receipts: "R011201891\nC. I. F." - number on the line before the label.
# Each entry is (regex, confidence); group 1 captures the digits only,
# without the RO/R0 prefix.
CUI_REVERSED_PATTERNS = [
# "RO" (or OCR-misread "R0") + 6-10 digits, then a newline, then the C.I.F./CIF label
(r'(?:R[O0])(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.98),
# Bare 6-10 digits (no RO prefix required) on the line before the C.I.F./CIF label
(r'(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.95),
]
# Series patterns - be strict to avoid false matches # Series patterns - be strict to avoid false matches
SERIES_PATTERNS = [ SERIES_PATTERNS = [
(r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90), (r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90),
@@ -158,6 +185,7 @@ class ReceiptExtractor:
# Items count patterns - OCR may produce OZ instead of POZ, etc. # Items count patterns - OCR may produce OZ instead of POZ, etc.
# Number may be on separate line before or after the label # Number may be on separate line before or after the label
# IMPORTANT: Must be specific to avoid matching product quantities like "50BUC"
ITEMS_COUNT_PATTERNS = [ ITEMS_COUNT_PATTERNS = [
# NR. POZ. ART. IN BON: 17 (Romanian format with dots and spaces) # NR. POZ. ART. IN BON: 17 (Romanian format with dots and spaces)
# OCR tolerant: OZ instead of POZ, ARI instead of ART # OCR tolerant: OZ instead of POZ, ARI instead of ART
@@ -167,11 +195,10 @@ class ReceiptExtractor:
# Number may be on next line after label # Number may be on next line after label
(r'[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.93), (r'[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.93),
(r'NR\.?\s*(?:P?[O0]Z\.?)?\s*ART(?:ICOLE)?\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.90), (r'NR\.?\s*(?:P?[O0]Z\.?)?\s*ART(?:ICOLE)?\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.90),
# Simpler patterns # Simpler patterns - but more specific
(r'ARTIC[O0]LE\s*:?\s*(\d+)', 0.88), (r'ARTIC[O0]LE\s*:?\s*(\d+)', 0.88),
(r'P?[O0]Z\s*:?\s*(\d+)', 0.85), # POZ at start of line or after colon (not in product descriptions)
# X articole/pozitii (r'(?:^|\s|:)P?[O0]Z\.?\s*(?:ART)?\.?\s*(?:IN\s+BON)?\s*:?\s*(\d{1,3})(?:\s|$)', 0.85),
(r'(\d+)\s*(?:ARTIC[O0]LE|P[O0]ZITII|BUC)', 0.80),
] ]
# Address patterns (Romanian format) # Address patterns (Romanian format)
@@ -183,20 +210,21 @@ class ReceiptExtractor:
] ]
# Vendor name indicators (lines containing these are likely vendor names) # Vendor name indicators (lines containing these are likely vendor names)
# These should be company type suffixes, not generic words
# Patterns must handle OCR spaces: "S. R. L." as well as "S.R.L."
VENDOR_INDICATORS = [ VENDOR_INDICATORS = [
r'\bS\.?R\.?L\.?\b', # S.R.L. r'\bS\.?\s*R\.?\s*L\.?\b', # S.R.L. or S. R. L.
r'\bS\.?A\.?\b', # S.A. r'\bS\.?\s*A\.?\b', # S.A. or S. A.
r'\bS\.?N\.?C\.?\b', # S.N.C. r'\bS\.?\s*N\.?\s*C\.?\b', # S.N.C. or S. N. C.
r'\bS\.?C\.?S\.?\b', # S.C.S. r'\bS\.?\s*C\.?\s*S\.?\b', # S.C.S. or S. C. S.
r'\bI\.?I\.?\b', # I.I. (Individual) r'\bI\.?\s*I\.?\b', # I.I. or I. I.
r'\bP\.?F\.?A\.?\b', # P.F.A. r'\bP\.?\s*F\.?\s*A\.?\b', # P.F.A. or P. F. A.
r'\bS\.?C\.?\b', # S.C. # S.C. alone is too short and generic - only match if followed by company name
r'\bS\.?\s*C\.?\s+[A-Z]', # S.C. followed by company name
r'HOLDING', r'HOLDING',
r'COMPANY', r'COMPANY',
r'GROUP', r'GROUP',
r'MAGAZIN', # Removed: MAGAZIN, MARKET, SHOP - too generic, match store welcome messages
r'MARKET',
r'SHOP',
] ]
def extract(self, text: str) -> ExtractionResult: def extract(self, text: str) -> ExtractionResult:
@@ -215,6 +243,14 @@ class ReceiptExtractor:
# Extract additional fields - Multiple TVA entries # Extract additional fields - Multiple TVA entries
result.tva_entries, result.tva_total = self._extract_tva_entries(text_upper) result.tva_entries, result.tva_total = self._extract_tva_entries(text_upper)
if not result.tva_entries:
print(f"[TVA Debug] No TVA found. Checking patterns...", flush=True)
# Debug: show what patterns see
import re
normalized = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text_upper)
taxe_match = re.search(r'T?OTAL\s+TAXE', normalized, re.IGNORECASE)
rev_match = re.search(r'([\d.,]+)\s*T?OTAL\s+TAXE', normalized, re.IGNORECASE)
print(f"[TVA Debug] 'OTAL TAXE' found: {bool(taxe_match)}, reversed: {rev_match.group(1) if rev_match else None}", flush=True)
result.items_count = self._extract_items_count(text_upper) result.items_count = self._extract_items_count(text_upper)
result.address = self._extract_address(text_upper) result.address = self._extract_address(text_upper)
@@ -334,6 +370,7 @@ class ReceiptExtractor:
def _extract_date(self, text: str) -> Tuple[Optional[date], float]: def _extract_date(self, text: str) -> Tuple[Optional[date], float]:
"""Extract receipt date from text.""" """Extract receipt date from text."""
# First try standard patterns (clean dates)
for pattern, confidence in self.DATE_PATTERNS: for pattern, confidence in self.DATE_PATTERNS:
match = re.search(pattern, text) match = re.search(pattern, text)
if match: if match:
@@ -354,6 +391,34 @@ class ReceiptExtractor:
return parsed, confidence return parsed, confidence
except ValueError: except ValueError:
continue continue
# Then try OCR-corrupted patterns (dates with spaces/commas)
# Handles: "27. 10, 2025", "27, 10. 2025", "2025. 08. 14", etc.
for pattern, confidence, fmt in self.DATE_PATTERNS_OCR_SPACES:
match = re.search(pattern, text)
if match:
try:
if fmt == 'ymd':
# YYYY. MM. DD format (OMV/Petrom)
year = match.group(1)
month = match.group(2)
day = match.group(3)
else:
# DD. MM. YYYY format (default)
day = match.group(1)
month = match.group(2)
year = match.group(3)
date_str = f"{day}.{month}.{year}"
parsed = datetime.strptime(date_str, '%d.%m.%Y').date()
# Validate date range
today = date.today()
if parsed <= today and parsed.year >= 2020:
return parsed, confidence
except ValueError:
continue
return None, 0.0 return None, 0.0
def _extract_number(self, text: str) -> Tuple[Optional[str], float]: def _extract_number(self, text: str) -> Tuple[Optional[str], float]:
@@ -377,8 +442,9 @@ class ReceiptExtractor:
Extract vendor/partner name from text. Extract vendor/partner name from text.
Uses multiple strategies: Uses multiple strategies:
1. Look for lines with company type indicators (S.R.L., S.A., etc.) 1. Look for lines with company type indicators (S.R.L., S.A., etc.)
2. Look for lines near CIF 2. Look for company name + SRL on separate lines
3. Use first valid line as fallback 3. Look for lines near CIF
4. Use first valid line as fallback
""" """
lines = text.split('\n') lines = text.split('\n')
skip_keywords = [ skip_keywords = [
@@ -388,9 +454,37 @@ class ReceiptExtractor:
'OPERATOR', 'CASIER', 'POS', 'AMEF', 'BINE ATI VENIT', 'OPERATOR', 'CASIER', 'POS', 'AMEF', 'BINE ATI VENIT',
'VA RUGAM', 'PASTRATI', 'VOCEA', 'TIPARIT', 'VA RUGAM', 'PASTRATI', 'VOCEA', 'TIPARIT',
'DETERGENT', 'PROSOP', 'HARTIE', 'SACI', 'SPRAY', 'DETERGENT', 'PROSOP', 'HARTIE', 'SACI', 'SPRAY',
'BUC', 'ROLA', 'CUMPARATOR' 'BUC', 'ROLA', 'CUMPARATOR', 'MAGAZIN', 'BRICK',
'NIVS', 'BENZINA', 'PETROM', 'OMV'
] ]
# Strategy 0: Look for company name followed by SRL/SA on next line
# Pattern: "COMPANY NAME\nSRL" or "COMPANY NAME\nS.R.L."
for i, line in enumerate(lines[:15]):
line = line.strip()
if not line or len(line) < 3:
continue
line_upper = line.upper()
# Skip lines with skip keywords
if any(kw in line_upper for kw in skip_keywords):
continue
# Check if next line is standalone SRL, S.R.L., SA, S.A., etc.
if i + 1 < len(lines):
next_line = lines[i + 1].strip().upper()
# Match standalone company type suffix
if re.match(r'^S\.?\s*R\.?\s*L\.?$', next_line) or \
re.match(r'^S\.?\s*A\.?$', next_line) or \
re.match(r'^S\.?\s*N\.?\s*C\.?$', next_line) or \
re.match(r'^P\.?\s*F\.?\s*A\.?$', next_line) or \
re.match(r'^I\.?\s*I\.?$', next_line):
# Combine: "COMPANY NAME" + " " + "SRL"
vendor = self._clean_vendor_name(f"{line} {next_line}")
if vendor and len(vendor) >= 5:
return vendor, 0.95
# Strategy 1: Look for lines with vendor indicators (S.R.L., S.A., HOLDING, etc.) # Strategy 1: Look for lines with vendor indicators (S.R.L., S.A., HOLDING, etc.)
for i, line in enumerate(lines[:15]): # Check first 15 lines for i, line in enumerate(lines[:15]): # Check first 15 lines
line = line.strip() line = line.strip()
@@ -476,7 +570,22 @@ class ReceiptExtractor:
Extract vendor CUI (fiscal identification code) from text. Extract vendor CUI (fiscal identification code) from text.
Excludes CLIENT CUI which appears as 'CLIENT C.U.I./C.I.F.:...' Excludes CLIENT CUI which appears as 'CLIENT C.U.I./C.I.F.:...'
""" """
# First, try to find CIF on a line that doesn't contain CLIENT # Strategy 0: Check for reversed format (CIF NUMBER on line BEFORE "C.I.F." label)
# This is common in some receipts: "R011201891\nC. I. F."
for pattern, confidence in self.CUI_REVERSED_PATTERNS:
match = re.search(pattern, text_upper, re.IGNORECASE | re.MULTILINE)
if match:
cui = match.group(1)
if 6 <= len(cui) <= 10:
# Verify this is not the CLIENT CUI by checking context
start = match.start()
# Check 50 chars before the match for CLIENT keyword
context_start = max(0, start - 50)
context = text_upper[context_start:start]
if 'CLIENT' not in context and 'LIENT' not in context:
return cui, confidence
# Strategy 1: Try to find CIF on a line that doesn't contain CLIENT
lines = text_upper.split('\n') lines = text_upper.split('\n')
for line in lines: for line in lines:
# Skip lines that contain CLIENT (these are buyer's CUI, not vendor's) # Skip lines that contain CLIENT (these are buyer's CUI, not vendor's)
@@ -491,7 +600,7 @@ class ReceiptExtractor:
if 6 <= len(cui) <= 10: if 6 <= len(cui) <= 10:
return cui, confidence return cui, confidence
# Fallback: search entire text but exclude CLIENT patterns # Strategy 2: Fallback - search entire text but exclude CLIENT patterns
for pattern, confidence in self.CUI_PATTERNS: for pattern, confidence in self.CUI_PATTERNS:
# Find all matches # Find all matches
for match in re.finditer(pattern, text_upper, re.IGNORECASE | re.MULTILINE): for match in re.finditer(pattern, text_upper, re.IGNORECASE | re.MULTILINE):
@@ -523,8 +632,94 @@ class ReceiptExtractor:
tva_entries = [] tva_entries = []
seen_entries = set() # To avoid duplicates seen_entries = set() # To avoid duplicates
# Normalize spaces in numbers first (OCR may produce "32. 31") # Check for non-VAT payer (NEPLATITOR DE TVA) - TVA = 0
# OCR variants seen so far: NEPLATITOR, NEPLATTOR, NEPLATOR, ANEPLATHTOR, MEPLATITOR, etc.
# Also handles: "TOTAL NEPLATITOR TVA", "(NEPLATITOR DE TVA)"
non_vat_patterns = [
# Main pattern - flexible for OCR errors: NEPLAT + any chars + OR/R
r'NEPLAT\w*OR', # NEPLATITOR, NEPLATTOR, NEPLATOR
r'[ANM]EPLAT\w*O?R', # OCR errors: ANEPLATHTOR, MEPLATITOR
r'TOTAL\s+NEPLAT', # TOTAL NEPLATITOR...
r'TOTAL\s+[ANM]EPLAT', # TOTAL ANEPLAT... (OCR error)
r'SCUTIT\s*(?:DE\s+)?T[VU]A', # SCUTIT DE TVA
r'NEPLAT\w*\s+T[VU]A', # NEPLATITOR TVA
r'NEPLAT\w*\s+DE\s+T', # NEPLATITOR DE T... (truncated)
]
for pattern in non_vat_patterns:
if re.search(pattern, text, re.IGNORECASE):
# Non-VAT payer - return TVA = 0
return [{'code': 'D', 'percent': 0, 'amount': Decimal('0.00')}], Decimal('0.00')
# Normalize spaces in numbers first (OCR may produce "32. 31" or "49, 58")
normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text) normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text)
# Also normalize comma followed by space to comma (for "21, 00%" -> "21,00%")
normalized_text = re.sub(r'(\d+),\s+(\d{2})\s*%', r'\1.\2%', normalized_text)
# Pattern 0a: First try to get TVA from "TOTAL TAXE:" which is most reliable
# Format: "TOTAL TAXE: 55,22" - this is always the TVA amount
# OCR may cut "T" producing "OTAL TAXE:" instead of "TOTAL TAXE:"
# OCR may also put amount BEFORE "OTAL TAXE": "55,22OTAL TAXE:"
total_taxe_pattern = r'T?OTAL\s+TAXE\s*:?\s*([\d\s.,]+)'
taxe_match = re.search(total_taxe_pattern, normalized_text, re.IGNORECASE)
# Also try pattern where amount comes BEFORE "OTAL TAXE" (OCR line break issue)
if not taxe_match:
reversed_taxe_pattern = r'([\d.,]+)\s*T?OTAL\s+TAXE'
taxe_match = re.search(reversed_taxe_pattern, normalized_text, re.IGNORECASE)
if taxe_match:
# Also need to find the TVA rate from the table
# Pattern handles: "A-21%", "-21,00%", "21%" etc.
rate_pattern = r'([A-D])?\s*[-:]?\s*(\d{1,2})[.,]?\s*\d{0,2}\s*%'
rate_match = re.search(rate_pattern, normalized_text, re.IGNORECASE)
if rate_match:
try:
code = rate_match.group(1).upper() if rate_match.group(1) else 'A' # Default to A if missing
percent = int(rate_match.group(2))
amount_str = taxe_match.group(1).replace(' ', '')
amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
amount = Decimal(amount_str)
if amount > 0:
entry_key = (code, percent)
if entry_key not in seen_entries:
tva_entries.append({
'code': code,
'percent': percent,
'amount': amount
})
seen_entries.add(entry_key)
except (ValueError, InvalidOperation):
pass
# Pattern 0b: Table format "A-21,00% 285,66 49,58" (code-percent base tva_amount)
# This format appears after a TVA header line like "TVA TOTAL VALDARE"
# The TVA amount position depends on header: VALDARE last = TVA last, VALOARE middle = TVA middle
if not tva_entries:
table_pattern = r'([A-D])\s*[-:]\s*(\d{1,2})[.,]\s*\d{2}\s*%\s*([\d\s.,]+)\s+([\d\s.,]+)'
for match in re.finditer(table_pattern, normalized_text, re.IGNORECASE):
try:
code = match.group(1).upper()
percent = int(match.group(2))
amount1_str = match.group(3).replace(' ', '')
amount2_str = match.group(4).replace(' ', '')
amount1 = Decimal(self._normalize_number(re.sub(r'[^\d.,]', '', amount1_str)))
amount2 = Decimal(self._normalize_number(re.sub(r'[^\d.,]', '', amount2_str)))
# Determine which is TVA: the smaller amount is usually TVA
# (TVA is a fraction of the total, so it's always smaller)
tva_amount = min(amount1, amount2)
if tva_amount > 0:
entry_key = (code, percent)
if entry_key not in seen_entries:
tva_entries.append({
'code': code,
'percent': percent,
'amount': tva_amount
})
seen_entries.add(entry_key)
except (ValueError, InvalidOperation):
continue
# Pattern 1: "TVA A - 19%: 15.20" or "TVAA - 21% 32.31" (with code) # Pattern 1: "TVA A - 19%: 15.20" or "TVAA - 21% 32.31" (with code)
# OCR tolerant: TUA, TVR, etc. # OCR tolerant: TUA, TVR, etc.
@@ -571,7 +766,75 @@ class ReceiptExtractor:
except (ValueError, InvalidOperation): except (ValueError, InvalidOperation):
continue continue
# Pattern 3: "TVAA - 21%" on one line, amount on next line # Pattern 3: "TOTAL TVA A - 21%" with amount on same line or "TOTAL TVA BON" with amount
if not tva_entries:
# First try: "TOTAL TVA A - 21% 32.31" (amount on same line)
tva_with_amount = r'TOTAL\s+T[VU][AR]\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*([\d.,]+)'
for match in re.finditer(tva_with_amount, normalized_text, re.IGNORECASE):
try:
code = match.group(1).upper()
percent = int(match.group(2))
amount_str = self._normalize_number(match.group(3))
amount = Decimal(amount_str)
if amount > 0:
entry_key = (code, percent)
if entry_key not in seen_entries:
tva_entries.append({
'code': code,
'percent': percent,
'amount': amount
})
seen_entries.add(entry_key)
except (ValueError, InvalidOperation):
continue
# Pattern 3b: "TOTAL TVA A - 21%" on one line, look for "TOTAL TVA BON" amount
if not tva_entries:
tva_total_pattern = r'TOTAL\s+T[VU][AR]\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%'
for match in re.finditer(tva_total_pattern, normalized_text, re.IGNORECASE):
try:
code = match.group(1).upper()
percent = int(match.group(2))
# Look for "TOTAL TVA BON" followed by amount
tva_bon_pattern = r'TOTAL\s+T[VU][AR]\s+BON[:\s]*([\d.,]+)'
tva_bon_match = re.search(tva_bon_pattern, normalized_text, re.IGNORECASE)
if tva_bon_match:
amount_str = self._normalize_number(tva_bon_match.group(1))
amount = Decimal(amount_str)
if amount > 0:
entry_key = (code, percent)
if entry_key not in seen_entries:
tva_entries.append({
'code': code,
'percent': percent,
'amount': amount
})
seen_entries.add(entry_key)
continue
# Fallback: Amount after TOTAL TVA BON on next line
tva_bon_pos = re.search(r'TOTAL\s+T[VU][AR]\s+BON', normalized_text, re.IGNORECASE)
if tva_bon_pos:
after_bon = normalized_text[tva_bon_pos.end():]
# Find first standalone number (likely TVA amount)
amount_match = re.search(r'[\s\n]*([\d]+[.,]\d{2})\s*\n', after_bon)
if amount_match:
amount_str = self._normalize_number(amount_match.group(1))
amount = Decimal(amount_str)
if amount > 0:
entry_key = (code, percent)
if entry_key not in seen_entries:
tva_entries.append({
'code': code,
'percent': percent,
'amount': amount
})
seen_entries.add(entry_key)
except (ValueError, InvalidOperation):
continue
# Pattern 3c: "TVAA - 21%" on one line, amount on next line (simpler format)
if not tva_entries: if not tva_entries:
tva_line_pattern = r'T[VU][AR]\s*([A-D])?\s*[-:]\s*(\d{1,2})\s*%' tva_line_pattern = r'T[VU][AR]\s*([A-D])?\s*[-:]\s*(\d{1,2})\s*%'
for match in re.finditer(tva_line_pattern, normalized_text, re.IGNORECASE): for match in re.finditer(tva_line_pattern, normalized_text, re.IGNORECASE):

View File

@@ -1,11 +1,16 @@
"""Main OCR service coordinating preprocessing, recognition, and extraction.""" """Main OCR service coordinating preprocessing, recognition, and extraction."""
import os import os
import re
import logging
# Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import # Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True' os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
import time
import asyncio import asyncio
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from decimal import Decimal
from pathlib import Path from pathlib import Path
from typing import Optional, Tuple from typing import Optional, Tuple
@@ -13,6 +18,9 @@ from app.services.ocr_engine import OCREngine
from app.services.ocr_extractor import ReceiptExtractor, ExtractionResult from app.services.ocr_extractor import ReceiptExtractor, ExtractionResult
from app.services.image_preprocessor import ImagePreprocessor from app.services.image_preprocessor import ImagePreprocessor
# Setup logging
logger = logging.getLogger(__name__)
class OCRService: class OCRService:
"""Service for OCR processing of receipt images.""" """Service for OCR processing of receipt images."""
@@ -56,15 +64,18 @@ class OCRService:
image_path: Path, image_path: Path,
mime_type: str mime_type: str
) -> Tuple[bool, str, Optional[ExtractionResult]]: ) -> Tuple[bool, str, Optional[ExtractionResult]]:
"""Synchronous processing (runs in thread pool).""" """Synchronous processing with ADAPTIVE OCR pipeline."""
# Handle PDF start_time = time.time()
print(f"[OCR Service] Starting processing: {image_path}, mime: {mime_type}", flush=True)
# Load image
if mime_type == 'application/pdf': if mime_type == 'application/pdf':
try: try:
images = self.preprocessor.pdf_to_images(image_path) images = self.preprocessor.pdf_to_images(image_path)
if not images: if not images:
return False, "Failed to extract images from PDF", None return False, "Failed to extract images from PDF", None
image = images[0] # Process first page only image = images[0]
except RuntimeError as e: except RuntimeError as e:
return False, str(e), None return False, str(e), None
else: else:
@@ -73,38 +84,360 @@ class OCRService:
except ValueError as e: except ValueError as e:
return False, str(e), None return False, str(e), None
# Preprocess image raw_texts = []
processed = self.preprocessor.preprocess(image) extraction = None
# ══════════════════════════════════════════════════════════════
# STEP 1: PaddleOCR + Light (fastest, best for clear PDFs)
# ══════════════════════════════════════════════════════════════
print("=" * 60, flush=True)
print("[OCR] STEP 1: PaddleOCR + Light preprocessing", flush=True)
print("=" * 60, flush=True)
light_img = self.preprocessor.preprocess_light(image)
# Perform OCR
try: try:
ocr_result = self.ocr_engine.recognize(processed) paddle_light = self.ocr_engine._paddle_recognize(light_img)
except RuntimeError as e: if paddle_light and paddle_light.text:
return False, str(e), None extraction = self.extractor.extract(paddle_light.text)
extraction.ocr_engine = "paddle-light"
raw_texts.append(f"═══ PaddleOCR (light, conf: {paddle_light.confidence:.0%}) ═══\n{paddle_light.text}")
if not ocr_result.text: # Log extraction results
return False, "No text detected in image", None print(f"[OCR] Step 1 Results:", flush=True)
print(f" - OCR Confidence: {paddle_light.confidence:.0%}", flush=True)
print(f" - Amount: {extraction.amount}", flush=True)
print(f" - Date: {extraction.receipt_date}", flush=True)
print(f" - Number: {extraction.receipt_number}", flush=True)
print(f" - CUI: {extraction.cui}", flush=True)
print(f" - TVA: {extraction.tva_total} (entries: {len(extraction.tva_entries) if extraction.tva_entries else 0})", flush=True)
print(f" - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True)
# Extract structured fields # Early exit if complete
extraction = self.extractor.extract(ocr_result.text) if self._is_extraction_complete(extraction):
extraction.raw_text = "\n\n".join(raw_texts)
elapsed_ms = int((time.time() - start_time) * 1000)
extraction.processing_time_ms = elapsed_ms
print(f"[OCR] ✓✓✓ EARLY EXIT at Step 1 - All fields found! ({elapsed_ms}ms) ✓✓✓", flush=True)
return True, "OCR complete (fast mode)", extraction
else:
print("[OCR] → Step 1 incomplete, continuing to Step 2...", flush=True)
except Exception as e:
print(f"[OCR] PaddleOCR light failed: {e}", flush=True)
extraction = ExtractionResult()
# ══════════════════════════════════════════════════════════════
# STEP 2: PaddleOCR + Heavy (for faded thermal receipts)
# ══════════════════════════════════════════════════════════════
print("=" * 60, flush=True)
print("[OCR] STEP 2: PaddleOCR + Heavy preprocessing", flush=True)
print("=" * 60, flush=True)
heavy_img = self.preprocessor.preprocess_heavy(image)
try:
paddle_heavy = self.ocr_engine._paddle_recognize(heavy_img)
if paddle_heavy and paddle_heavy.text:
extraction_heavy = self.extractor.extract(paddle_heavy.text)
extraction_heavy.ocr_engine = "paddle-heavy"
raw_texts.append(f"═══ PaddleOCR (heavy, conf: {paddle_heavy.confidence:.0%}) ═══\n{paddle_heavy.text}")
print(f"[OCR] Step 2 (Heavy) Results:", flush=True)
print(f" - OCR Confidence: {paddle_heavy.confidence:.0%}", flush=True)
print(f" - Amount: {extraction_heavy.amount}", flush=True)
print(f" - Date: {extraction_heavy.receipt_date}", flush=True)
print(f" - CUI: {extraction_heavy.cui}", flush=True)
# Merge with previous
extraction = self._merge_extractions(extraction, extraction_heavy)
print(f"[OCR] After merge:", flush=True)
print(f" - Amount: {extraction.amount}", flush=True)
print(f" - Date: {extraction.receipt_date}", flush=True)
print(f" - Number: {extraction.receipt_number}", flush=True)
print(f" - CUI: {extraction.cui}", flush=True)
print(f" - TVA: {extraction.tva_total}", flush=True)
print(f" - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True)
if self._is_extraction_complete(extraction):
extraction.raw_text = "\n\n".join(raw_texts)
extraction.ocr_engine = "paddle-adaptive"
elapsed_ms = int((time.time() - start_time) * 1000)
extraction.processing_time_ms = elapsed_ms
print(f"[OCR] ✓✓✓ EARLY EXIT at Step 2 - All fields found after merge! ({elapsed_ms}ms) ✓✓✓", flush=True)
return True, "OCR complete (paddle dual)", extraction
else:
print("[OCR] → Step 2 incomplete, continuing to Step 3 (Tesseract)...", flush=True)
except Exception as e:
print(f"[OCR] PaddleOCR heavy failed: {e}", flush=True)
# ══════════════════════════════════════════════════════════════
# STEP 3: Tesseract fallback
# ══════════════════════════════════════════════════════════════
print("=" * 60, flush=True)
print("[OCR] STEP 3: Tesseract fallback", flush=True)
print("=" * 60, flush=True)
try:
tesseract_result = self.ocr_engine._tesseract_recognize(light_img)
if tesseract_result and tesseract_result.text:
extraction_tess = self.extractor.extract(tesseract_result.text)
extraction_tess.ocr_engine = "tesseract"
raw_texts.append(f"═══ Tesseract (conf: {tesseract_result.confidence:.0%}) ═══\n{tesseract_result.text}")
print(f"[OCR] Step 3 (Tesseract) Results:", flush=True)
print(f" - OCR Confidence: {tesseract_result.confidence:.0%}", flush=True)
print(f" - Amount: {extraction_tess.amount}", flush=True)
print(f" - Date: {extraction_tess.receipt_date}", flush=True)
print(f" - CUI: {extraction_tess.cui}", flush=True)
extraction = self._merge_extractions(extraction, extraction_tess)
except Exception as e:
print(f"[OCR] Tesseract failed: {e}", flush=True)
# Final result
if extraction is None:
return False, "No text detected", None
extraction.raw_text = "\n\n".join(raw_texts)
extraction.ocr_engine = "adaptive-full"
# Build result message # Build result message
fields_found = [] fields_found = []
if extraction.amount: if extraction.amount: fields_found.append("amount")
fields_found.append("amount") if extraction.receipt_date: fields_found.append("date")
if extraction.receipt_date: if extraction.receipt_number: fields_found.append("number")
fields_found.append("date") if extraction.cui: fields_found.append("CUI")
if extraction.partner_name: if extraction.tva_total or extraction.tva_entries: fields_found.append("TVA")
fields_found.append("vendor")
if extraction.cui:
fields_found.append("CUI")
if extraction.receipt_number:
fields_found.append("number")
message = f"OCR processing successful. Found: {', '.join(fields_found) or 'no fields'}" message = f"OCR complete (full pipeline). Found: {', '.join(fields_found) or 'no fields'}"
elapsed_ms = int((time.time() - start_time) * 1000)
extraction.processing_time_ms = elapsed_ms
print("=" * 60, flush=True)
print(f"[OCR] FINAL RESULT (full pipeline) - {elapsed_ms}ms", flush=True)
print("=" * 60, flush=True)
print(f" - Amount: {extraction.amount}", flush=True)
print(f" - Date: {extraction.receipt_date}", flush=True)
print(f" - Number: {extraction.receipt_number}", flush=True)
print(f" - CUI: {extraction.cui}", flush=True)
print(f" - TVA: {extraction.tva_total}", flush=True)
print(f" - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True)
print(f" - Processing Time: {elapsed_ms}ms", flush=True)
print(f" - Message: {message}", flush=True)
return True, message, extraction return True, message, extraction
def _merge_extractions(
self,
paddle: Optional[ExtractionResult],
tesseract: Optional[ExtractionResult]
) -> ExtractionResult:
"""
Merge two extractions, picking best fields from each engine.
Strategy:
- For each field, prefer the one with higher confidence
- Use validation rules (CUI format, date validity, company indicators)
- Combine TVA entries if different
"""
result = ExtractionResult()
# Handle case where one is None
if paddle is None and tesseract is None:
return result
if paddle is None:
return tesseract
if tesseract is None:
return paddle
print("[Merge] Comparing PaddleOCR vs Tesseract extractions...", flush=True)
# === AMOUNT ===
# Pick higher confidence, both must be positive
if paddle.amount and tesseract.amount:
if paddle.confidence_amount >= tesseract.confidence_amount:
result.amount = paddle.amount
result.confidence_amount = paddle.confidence_amount
print(f"[Merge] Amount: PaddleOCR {paddle.amount} (conf: {paddle.confidence_amount:.0%})", flush=True)
else:
result.amount = tesseract.amount
result.confidence_amount = tesseract.confidence_amount
print(f"[Merge] Amount: Tesseract {tesseract.amount} (conf: {tesseract.confidence_amount:.0%})", flush=True)
elif paddle.amount:
result.amount = paddle.amount
result.confidence_amount = paddle.confidence_amount
elif tesseract.amount:
result.amount = tesseract.amount
result.confidence_amount = tesseract.confidence_amount
# === DATE ===
# Pick higher confidence, validate date reasonableness
if paddle.receipt_date and tesseract.receipt_date:
if paddle.confidence_date >= tesseract.confidence_date:
result.receipt_date = paddle.receipt_date
result.confidence_date = paddle.confidence_date
print(f"[Merge] Date: PaddleOCR {paddle.receipt_date}", flush=True)
else:
result.receipt_date = tesseract.receipt_date
result.confidence_date = tesseract.confidence_date
print(f"[Merge] Date: Tesseract {tesseract.receipt_date}", flush=True)
elif paddle.receipt_date:
result.receipt_date = paddle.receipt_date
result.confidence_date = paddle.confidence_date
elif tesseract.receipt_date:
result.receipt_date = tesseract.receipt_date
result.confidence_date = tesseract.confidence_date
# === VENDOR NAME ===
# Prefer one with company indicators (S.R.L., S.A., etc.)
paddle_has_indicator = self._has_company_indicator(paddle.partner_name)
tesseract_has_indicator = self._has_company_indicator(tesseract.partner_name)
if paddle.partner_name and tesseract.partner_name:
if paddle_has_indicator and not tesseract_has_indicator:
result.partner_name = paddle.partner_name
result.confidence_vendor = paddle.confidence_vendor
print(f"[Merge] Vendor: PaddleOCR '{paddle.partner_name}' (has company indicator)", flush=True)
elif tesseract_has_indicator and not paddle_has_indicator:
result.partner_name = tesseract.partner_name
result.confidence_vendor = tesseract.confidence_vendor
print(f"[Merge] Vendor: Tesseract '{tesseract.partner_name}' (has company indicator)", flush=True)
elif paddle.confidence_vendor >= tesseract.confidence_vendor:
result.partner_name = paddle.partner_name
result.confidence_vendor = paddle.confidence_vendor
print(f"[Merge] Vendor: PaddleOCR '{paddle.partner_name}' (higher conf)", flush=True)
else:
result.partner_name = tesseract.partner_name
result.confidence_vendor = tesseract.confidence_vendor
print(f"[Merge] Vendor: Tesseract '{tesseract.partner_name}' (higher conf)", flush=True)
elif paddle.partner_name:
result.partner_name = paddle.partner_name
result.confidence_vendor = paddle.confidence_vendor
elif tesseract.partner_name:
result.partner_name = tesseract.partner_name
result.confidence_vendor = tesseract.confidence_vendor
# === CUI (Fiscal Code) ===
# Validate format: 6-10 digits, prefer valid one
paddle_cui_valid = self._is_valid_cui(paddle.cui)
tesseract_cui_valid = self._is_valid_cui(tesseract.cui)
if paddle.cui and tesseract.cui:
if paddle_cui_valid and not tesseract_cui_valid:
result.cui = paddle.cui
print(f"[Merge] CUI: PaddleOCR {paddle.cui} (valid format)", flush=True)
elif tesseract_cui_valid and not paddle_cui_valid:
result.cui = tesseract.cui
print(f"[Merge] CUI: Tesseract {tesseract.cui} (valid format)", flush=True)
else:
# Both valid or both invalid - prefer PaddleOCR
result.cui = paddle.cui
print(f"[Merge] CUI: PaddleOCR {paddle.cui}", flush=True)
elif paddle.cui and paddle_cui_valid:
result.cui = paddle.cui
elif tesseract.cui and tesseract_cui_valid:
result.cui = tesseract.cui
elif paddle.cui:
result.cui = paddle.cui
elif tesseract.cui:
result.cui = tesseract.cui
# === TVA ENTRIES ===
# Prefer non-empty, use the one with more entries or higher amounts
if paddle.tva_entries and tesseract.tva_entries:
# Compare: prefer the one with actual amounts (not just 0)
paddle_total = sum(e.get('amount', Decimal('0')) for e in paddle.tva_entries)
tesseract_total = sum(e.get('amount', Decimal('0')) for e in tesseract.tva_entries)
if paddle_total >= tesseract_total:
result.tva_entries = paddle.tva_entries
result.tva_total = paddle.tva_total
print(f"[Merge] TVA: PaddleOCR (total: {paddle_total})", flush=True)
else:
result.tva_entries = tesseract.tva_entries
result.tva_total = tesseract.tva_total
print(f"[Merge] TVA: Tesseract (total: {tesseract_total})", flush=True)
elif paddle.tva_entries:
result.tva_entries = paddle.tva_entries
result.tva_total = paddle.tva_total
elif tesseract.tva_entries:
result.tva_entries = tesseract.tva_entries
result.tva_total = tesseract.tva_total
# === OTHER FIELDS ===
# Simple preference: paddle > tesseract
result.receipt_number = paddle.receipt_number or tesseract.receipt_number
result.receipt_series = paddle.receipt_series or tesseract.receipt_series
result.receipt_type = paddle.receipt_type or tesseract.receipt_type
result.items_count = paddle.items_count or tesseract.items_count
result.address = paddle.address or tesseract.address
result.description = paddle.description or tesseract.description
return result
def _has_company_indicator(self, name: Optional[str]) -> bool:
"""Check if vendor name has company type indicator (S.R.L., S.A., etc.)"""
if not name:
return False
name_upper = name.upper()
indicators = [
r'\bS\.?\s*R\.?\s*L\.?\b',
r'\bS\.?\s*A\.?\b',
r'\bS\.?\s*N\.?\s*C\.?\b',
r'\bP\.?\s*F\.?\s*A\.?\b',
r'\bI\.?\s*I\.?\b',
r'\bHOLDING\b',
r'\bGROUP\b',
r'\bCOMPANY\b',
]
for indicator in indicators:
if re.search(indicator, name_upper):
return True
return False
def _is_valid_cui(self, cui: Optional[str]) -> bool:
"""Validate CUI format: 6-10 digits."""
if not cui:
return False
# Remove any RO prefix
cui_clean = re.sub(r'^RO', '', cui.upper())
# Must be 6-10 digits
return bool(re.match(r'^\d{6,10}$', cui_clean))
def _is_extraction_complete(self, ext: ExtractionResult, min_confidence: float = 0.85) -> bool:
"""
Check if extraction has ALL required fields to skip further processing.
Required for early exit (ALL must be true):
- Overall confidence >= 85%
- ALL 5 critical fields present: number, date, amount, TVA, CUI
"""
# Must have high confidence
if ext.overall_confidence < min_confidence:
print(f"[OCR] Confidence {ext.overall_confidence:.0%} < {min_confidence:.0%} - continuing", flush=True)
return False
# Check all required fields
has_number = bool(ext.receipt_number)
has_date = bool(ext.receipt_date)
has_amount = bool(ext.amount)
has_tva = bool(ext.tva_total) or bool(ext.tva_entries)
has_cui = bool(ext.cui)
missing = []
if not has_number: missing.append("number")
if not has_date: missing.append("date")
if not has_amount: missing.append("amount")
if not has_tva: missing.append("TVA")
if not has_cui: missing.append("CUI")
if missing:
print(f"[OCR] Missing: {', '.join(missing)} - continuing", flush=True)
return False
print(f"[OCR] ✓ All 5 fields found with {ext.overall_confidence:.0%} confidence", flush=True)
return True
# Singleton instance # Singleton instance
ocr_service = OCRService() ocr_service = OCRService()

View File

@@ -106,14 +106,27 @@
<!-- Raw Text Toggle --> <!-- Raw Text Toggle -->
<div class="raw-text-section" v-if="data.raw_text"> <div class="raw-text-section" v-if="data.raw_text">
<Button <div class="raw-text-header">
:label="showRawText ? 'Ascunde text OCR' : 'Arata text OCR'" <Button
:icon="showRawText ? 'pi pi-eye-slash' : 'pi pi-eye'" :label="showRawText ? 'Ascunde text OCR' : 'Arata text OCR'"
severity="secondary" :icon="showRawText ? 'pi pi-eye-slash' : 'pi pi-eye'"
size="small" severity="secondary"
text size="small"
@click="showRawText = !showRawText" text
/> @click="showRawText = !showRawText"
/>
<span v-if="data.ocr_engine" class="ocr-engine-badge" :class="getEngineClass(data.ocr_engine)">
<i :class="getEngineIcon(data.ocr_engine)"></i>
{{ getEngineLabel(data.ocr_engine) }}
</span>
<span v-if="data._ocr_message" class="ocr-message-badge" :class="getMessageClass(data._ocr_message)">
{{ data._ocr_message }}
</span>
<span v-if="data.processing_time_ms" class="ocr-time-badge">
<i class="pi pi-clock"></i>
{{ formatProcessingTime(data.processing_time_ms) }}
</span>
</div>
<div v-if="showRawText" class="raw-text"> <div v-if="showRawText" class="raw-text">
<pre>{{ data.raw_text }}</pre> <pre>{{ data.raw_text }}</pre>
</div> </div>
@@ -168,6 +181,45 @@ const formatDate = (dateStr) => {
year: 'numeric' year: 'numeric'
}) })
} }
/**
 * Map the `ocr_engine` identifier to the CSS modifier class of the engine badge.
 * Exact pipeline names are checked first; anything else falls back to a
 * substring match on the engine family. Unknown or empty values give ''.
 */
const getEngineClass = (engine) => {
  if (!engine) return ''

  switch (engine) {
    case 'paddle-light':
      return 'fast'
    case 'paddle-adaptive':
      return 'adaptive'
    case 'adaptive-full':
      return 'full'
  }

  if (engine.includes('paddle')) return 'paddleocr'
  return engine.includes('tesseract') ? 'tesseract' : ''
}
/**
 * Pick the icon class for the engine badge: a lightning bolt for the
 * single-pass 'paddle-light' result, a cog for everything else
 * (including a missing engine).
 */
const getEngineIcon = (engine) => {
  return engine === 'paddle-light' ? 'pi pi-bolt' : 'pi pi-cog'
}
/**
 * Human-readable label for the engine badge. Exact pipeline names get a
 * dedicated label, other values are matched by engine family, and an
 * unrecognised identifier is shown verbatim. Empty input gives ''.
 */
const getEngineLabel = (engine) => {
  if (!engine) return ''

  switch (engine) {
    case 'paddle-light':
      return 'Fast Mode (PaddleOCR)'
    case 'paddle-adaptive':
      return 'Adaptive (Paddle dual)'
    case 'adaptive-full':
      return 'Full Pipeline'
  }

  if (engine.includes('paddle')) return 'PaddleOCR'
  return engine.includes('tesseract') ? 'Tesseract' : engine
}
/**
 * Derive the CSS modifier class of the message badge from the OCR status
 * message: the first marker phrase found in the message decides the class.
 * Messages without a known marker (or no message at all) give ''.
 */
const getMessageClass = (message) => {
  if (!message) return ''

  // Order matters: 'fast mode' is tested before 'full pipeline'.
  const markers = [
    ['fast mode', 'fast-mode'],
    ['full pipeline', 'full-pipeline']
  ]
  const hit = markers.find(([phrase]) => message.includes(phrase))
  return hit ? hit[1] : ''
}
/**
 * Format a duration given in milliseconds: values below one second are shown
 * as whole milliseconds ("850ms"), longer ones as seconds with one decimal
 * ("2.4s").
 */
const formatProcessingTime = (ms) => {
  return ms < 1000 ? `${ms}ms` : `${(ms / 1000).toFixed(1)}s`
}
</script> </script>
<style scoped> <style scoped>
@@ -305,6 +357,82 @@ const formatDate = (dateStr) => {
border-top: 1px dashed #86efac; border-top: 1px dashed #86efac;
} }
.raw-text-header {
display: flex;
align-items: center;
flex-wrap: wrap;
gap: 0.75rem;
}
.ocr-engine-badge {
display: inline-flex;
align-items: center;
gap: 0.25rem;
padding: 0.25rem 0.5rem;
border-radius: 4px;
font-size: 0.75rem;
font-weight: 500;
}
.ocr-engine-badge.paddleocr {
background: #dbeafe;
color: #1e40af;
}
.ocr-engine-badge.tesseract {
background: #fef3c7;
color: #92400e;
}
.ocr-engine-badge.fast {
background: #dcfce7;
color: #166534;
}
.ocr-engine-badge.adaptive {
background: #dbeafe;
color: #1e40af;
}
.ocr-engine-badge.full {
background: #fef3c7;
color: #92400e;
}
.ocr-message-badge {
display: inline-flex;
align-items: center;
gap: 0.25rem;
padding: 0.25rem 0.5rem;
border-radius: 4px;
font-size: 0.75rem;
font-weight: 500;
background: #f1f5f9;
color: #475569;
}
.ocr-message-badge.fast-mode {
background: #dcfce7;
color: #166534;
}
.ocr-message-badge.full-pipeline {
background: #fef3c7;
color: #92400e;
}
.ocr-time-badge {
display: inline-flex;
align-items: center;
gap: 0.25rem;
padding: 0.25rem 0.5rem;
border-radius: 4px;
font-size: 0.75rem;
font-weight: 600;
background: #e0e7ff;
color: #3730a3;
}
.raw-text { .raw-text {
margin-top: 0.5rem; margin-top: 0.5rem;
padding: 0.75rem; padding: 0.75rem;

View File

@@ -143,7 +143,12 @@ const processOCR = async () => {
}) })
if (response.data.success) { if (response.data.success) {
emit('ocr-result', response.data.data) // Include the OCR message in the data for debugging
const resultData = {
...response.data.data,
_ocr_message: response.data.message
}
emit('ocr-result', resultData)
} else { } else {
error.value = response.data.message || 'OCR processing failed' error.value = response.data.message || 'OCR processing failed'
emit('error', error.value) emit('error', error.value)