feat: Improve OCR adaptive pipeline with early exit and better pattern matching
- Add adaptive 3-step OCR pipeline with early exit when all 5 fields found
- Add pattern for "C. I. F." with spaces (OCR artifact from PaddleOCR)
- Add pattern for YYYY. MM. DD date format with spaces (OMV/Petrom receipts)
- Add pattern for "OTAL TAXE" with T cut off and reversed amount position
- Make TVA rate pattern more flexible (code letter optional, handle "-21%")
- Replace logger.info with print(flush=True) for better debugging visibility
- Improve OCRPreview.vue to show extraction progress and raw OCR text

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,11 +1,16 @@
|
||||
"""Main OCR service coordinating preprocessing, recognition, and extraction."""
|
||||
|
||||
import os
|
||||
import re
|
||||
import logging
|
||||
|
||||
# Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import
|
||||
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
|
||||
|
||||
import time
|
||||
import asyncio
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from decimal import Decimal
|
||||
from pathlib import Path
|
||||
from typing import Optional, Tuple
|
||||
|
||||
@@ -13,6 +18,9 @@ from app.services.ocr_engine import OCREngine
|
||||
from app.services.ocr_extractor import ReceiptExtractor, ExtractionResult
|
||||
from app.services.image_preprocessor import ImagePreprocessor
|
||||
|
||||
# Setup logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OCRService:
|
||||
"""Service for OCR processing of receipt images."""
|
||||
@@ -56,15 +64,18 @@ class OCRService:
|
||||
image_path: Path,
|
||||
mime_type: str
|
||||
) -> Tuple[bool, str, Optional[ExtractionResult]]:
|
||||
"""Synchronous processing (runs in thread pool)."""
|
||||
"""Synchronous processing with ADAPTIVE OCR pipeline."""
|
||||
|
||||
# Handle PDF
|
||||
start_time = time.time()
|
||||
print(f"[OCR Service] Starting processing: {image_path}, mime: {mime_type}", flush=True)
|
||||
|
||||
# Load image
|
||||
if mime_type == 'application/pdf':
|
||||
try:
|
||||
images = self.preprocessor.pdf_to_images(image_path)
|
||||
if not images:
|
||||
return False, "Failed to extract images from PDF", None
|
||||
image = images[0] # Process first page only
|
||||
image = images[0]
|
||||
except RuntimeError as e:
|
||||
return False, str(e), None
|
||||
else:
|
||||
@@ -73,38 +84,360 @@ class OCRService:
|
||||
except ValueError as e:
|
||||
return False, str(e), None
|
||||
|
||||
# Preprocess image
|
||||
processed = self.preprocessor.preprocess(image)
|
||||
raw_texts = []
|
||||
extraction = None
|
||||
|
||||
# ══════════════════════════════════════════════════════════════
|
||||
# STEP 1: PaddleOCR + Light (fastest, best for clear PDFs)
|
||||
# ══════════════════════════════════════════════════════════════
|
||||
print("=" * 60, flush=True)
|
||||
print("[OCR] STEP 1: PaddleOCR + Light preprocessing", flush=True)
|
||||
print("=" * 60, flush=True)
|
||||
light_img = self.preprocessor.preprocess_light(image)
|
||||
|
||||
# Perform OCR
|
||||
try:
|
||||
ocr_result = self.ocr_engine.recognize(processed)
|
||||
except RuntimeError as e:
|
||||
return False, str(e), None
|
||||
paddle_light = self.ocr_engine._paddle_recognize(light_img)
|
||||
if paddle_light and paddle_light.text:
|
||||
extraction = self.extractor.extract(paddle_light.text)
|
||||
extraction.ocr_engine = "paddle-light"
|
||||
raw_texts.append(f"═══ PaddleOCR (light, conf: {paddle_light.confidence:.0%}) ═══\n{paddle_light.text}")
|
||||
|
||||
if not ocr_result.text:
|
||||
return False, "No text detected in image", None
|
||||
# Log extraction results
|
||||
print(f"[OCR] Step 1 Results:", flush=True)
|
||||
print(f" - OCR Confidence: {paddle_light.confidence:.0%}", flush=True)
|
||||
print(f" - Amount: {extraction.amount}", flush=True)
|
||||
print(f" - Date: {extraction.receipt_date}", flush=True)
|
||||
print(f" - Number: {extraction.receipt_number}", flush=True)
|
||||
print(f" - CUI: {extraction.cui}", flush=True)
|
||||
print(f" - TVA: {extraction.tva_total} (entries: {len(extraction.tva_entries) if extraction.tva_entries else 0})", flush=True)
|
||||
print(f" - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True)
|
||||
|
||||
# Extract structured fields
|
||||
extraction = self.extractor.extract(ocr_result.text)
|
||||
# Early exit if complete
|
||||
if self._is_extraction_complete(extraction):
|
||||
extraction.raw_text = "\n\n".join(raw_texts)
|
||||
elapsed_ms = int((time.time() - start_time) * 1000)
|
||||
extraction.processing_time_ms = elapsed_ms
|
||||
print(f"[OCR] ✓✓✓ EARLY EXIT at Step 1 - All fields found! ({elapsed_ms}ms) ✓✓✓", flush=True)
|
||||
return True, "OCR complete (fast mode)", extraction
|
||||
else:
|
||||
print("[OCR] → Step 1 incomplete, continuing to Step 2...", flush=True)
|
||||
except Exception as e:
|
||||
print(f"[OCR] PaddleOCR light failed: {e}", flush=True)
|
||||
extraction = ExtractionResult()
|
||||
|
||||
# ══════════════════════════════════════════════════════════════
|
||||
# STEP 2: PaddleOCR + Heavy (for faded thermal receipts)
|
||||
# ══════════════════════════════════════════════════════════════
|
||||
print("=" * 60, flush=True)
|
||||
print("[OCR] STEP 2: PaddleOCR + Heavy preprocessing", flush=True)
|
||||
print("=" * 60, flush=True)
|
||||
heavy_img = self.preprocessor.preprocess_heavy(image)
|
||||
|
||||
try:
|
||||
paddle_heavy = self.ocr_engine._paddle_recognize(heavy_img)
|
||||
if paddle_heavy and paddle_heavy.text:
|
||||
extraction_heavy = self.extractor.extract(paddle_heavy.text)
|
||||
extraction_heavy.ocr_engine = "paddle-heavy"
|
||||
raw_texts.append(f"═══ PaddleOCR (heavy, conf: {paddle_heavy.confidence:.0%}) ═══\n{paddle_heavy.text}")
|
||||
|
||||
print(f"[OCR] Step 2 (Heavy) Results:", flush=True)
|
||||
print(f" - OCR Confidence: {paddle_heavy.confidence:.0%}", flush=True)
|
||||
print(f" - Amount: {extraction_heavy.amount}", flush=True)
|
||||
print(f" - Date: {extraction_heavy.receipt_date}", flush=True)
|
||||
print(f" - CUI: {extraction_heavy.cui}", flush=True)
|
||||
|
||||
# Merge with previous
|
||||
extraction = self._merge_extractions(extraction, extraction_heavy)
|
||||
|
||||
print(f"[OCR] After merge:", flush=True)
|
||||
print(f" - Amount: {extraction.amount}", flush=True)
|
||||
print(f" - Date: {extraction.receipt_date}", flush=True)
|
||||
print(f" - Number: {extraction.receipt_number}", flush=True)
|
||||
print(f" - CUI: {extraction.cui}", flush=True)
|
||||
print(f" - TVA: {extraction.tva_total}", flush=True)
|
||||
print(f" - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True)
|
||||
|
||||
if self._is_extraction_complete(extraction):
|
||||
extraction.raw_text = "\n\n".join(raw_texts)
|
||||
extraction.ocr_engine = "paddle-adaptive"
|
||||
elapsed_ms = int((time.time() - start_time) * 1000)
|
||||
extraction.processing_time_ms = elapsed_ms
|
||||
print(f"[OCR] ✓✓✓ EARLY EXIT at Step 2 - All fields found after merge! ({elapsed_ms}ms) ✓✓✓", flush=True)
|
||||
return True, "OCR complete (paddle dual)", extraction
|
||||
else:
|
||||
print("[OCR] → Step 2 incomplete, continuing to Step 3 (Tesseract)...", flush=True)
|
||||
except Exception as e:
|
||||
print(f"[OCR] PaddleOCR heavy failed: {e}", flush=True)
|
||||
|
||||
# ══════════════════════════════════════════════════════════════
|
||||
# STEP 3: Tesseract fallback
|
||||
# ══════════════════════════════════════════════════════════════
|
||||
print("=" * 60, flush=True)
|
||||
print("[OCR] STEP 3: Tesseract fallback", flush=True)
|
||||
print("=" * 60, flush=True)
|
||||
|
||||
try:
|
||||
tesseract_result = self.ocr_engine._tesseract_recognize(light_img)
|
||||
if tesseract_result and tesseract_result.text:
|
||||
extraction_tess = self.extractor.extract(tesseract_result.text)
|
||||
extraction_tess.ocr_engine = "tesseract"
|
||||
raw_texts.append(f"═══ Tesseract (conf: {tesseract_result.confidence:.0%}) ═══\n{tesseract_result.text}")
|
||||
|
||||
print(f"[OCR] Step 3 (Tesseract) Results:", flush=True)
|
||||
print(f" - OCR Confidence: {tesseract_result.confidence:.0%}", flush=True)
|
||||
print(f" - Amount: {extraction_tess.amount}", flush=True)
|
||||
print(f" - Date: {extraction_tess.receipt_date}", flush=True)
|
||||
print(f" - CUI: {extraction_tess.cui}", flush=True)
|
||||
|
||||
extraction = self._merge_extractions(extraction, extraction_tess)
|
||||
except Exception as e:
|
||||
print(f"[OCR] Tesseract failed: {e}", flush=True)
|
||||
|
||||
# Final result
|
||||
if extraction is None:
|
||||
return False, "No text detected", None
|
||||
|
||||
extraction.raw_text = "\n\n".join(raw_texts)
|
||||
extraction.ocr_engine = "adaptive-full"
|
||||
|
||||
# Build result message
|
||||
fields_found = []
|
||||
if extraction.amount:
|
||||
fields_found.append("amount")
|
||||
if extraction.receipt_date:
|
||||
fields_found.append("date")
|
||||
if extraction.partner_name:
|
||||
fields_found.append("vendor")
|
||||
if extraction.cui:
|
||||
fields_found.append("CUI")
|
||||
if extraction.receipt_number:
|
||||
fields_found.append("number")
|
||||
if extraction.amount: fields_found.append("amount")
|
||||
if extraction.receipt_date: fields_found.append("date")
|
||||
if extraction.receipt_number: fields_found.append("number")
|
||||
if extraction.cui: fields_found.append("CUI")
|
||||
if extraction.tva_total or extraction.tva_entries: fields_found.append("TVA")
|
||||
|
||||
message = f"OCR processing successful. Found: {', '.join(fields_found) or 'no fields'}"
|
||||
message = f"OCR complete (full pipeline). Found: {', '.join(fields_found) or 'no fields'}"
|
||||
|
||||
elapsed_ms = int((time.time() - start_time) * 1000)
|
||||
extraction.processing_time_ms = elapsed_ms
|
||||
|
||||
print("=" * 60, flush=True)
|
||||
print(f"[OCR] FINAL RESULT (full pipeline) - {elapsed_ms}ms", flush=True)
|
||||
print("=" * 60, flush=True)
|
||||
print(f" - Amount: {extraction.amount}", flush=True)
|
||||
print(f" - Date: {extraction.receipt_date}", flush=True)
|
||||
print(f" - Number: {extraction.receipt_number}", flush=True)
|
||||
print(f" - CUI: {extraction.cui}", flush=True)
|
||||
print(f" - TVA: {extraction.tva_total}", flush=True)
|
||||
print(f" - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True)
|
||||
print(f" - Processing Time: {elapsed_ms}ms", flush=True)
|
||||
print(f" - Message: {message}", flush=True)
|
||||
|
||||
return True, message, extraction
|
||||
|
||||
def _merge_extractions(
    self,
    paddle: Optional[ExtractionResult],
    tesseract: Optional[ExtractionResult]
) -> ExtractionResult:
    """
    Build one extraction out of two, choosing every field independently.

    The first argument (``paddle``) is the preferred side: it wins ties and
    plain fallbacks. The log messages always label the first argument
    "PaddleOCR" and the second "Tesseract".

    Selection rules:
    - amount / date: if both sides have a value, the higher per-field
      confidence wins; otherwise whichever side has a value is used
    - vendor: a name with a company indicator beats one without; if that
      does not decide it, the higher vendor confidence wins
    - CUI: a value accepted by ``_is_valid_cui`` beats one that is not;
      otherwise the first non-empty value is used
    - TVA: if both sides have entries, the side whose entry amounts sum
      higher supplies both the entries and the total
    - all remaining fields: first non-empty value

    When one argument is None the other one is returned as-is (same
    object); when both are None a fresh, empty ExtractionResult is returned.
    """
    merged = ExtractionResult()

    # With fewer than two candidates there is nothing to compare.
    if paddle is None:
        return merged if tesseract is None else tesseract
    if tesseract is None:
        return paddle

    # Small copy helpers: each moves one field group from a source to the result.
    def take_amount(src):
        merged.amount = src.amount
        merged.confidence_amount = src.confidence_amount

    def take_date(src):
        merged.receipt_date = src.receipt_date
        merged.confidence_date = src.confidence_date

    def take_vendor(src):
        merged.partner_name = src.partner_name
        merged.confidence_vendor = src.confidence_vendor

    def take_tva(src):
        merged.tva_entries = src.tva_entries
        merged.tva_total = src.tva_total

    print("[Merge] Comparing PaddleOCR vs Tesseract extractions...", flush=True)

    # ----- amount: higher confidence wins, first side wins ties -----
    if paddle.amount and tesseract.amount:
        if paddle.confidence_amount >= tesseract.confidence_amount:
            take_amount(paddle)
            print(f"[Merge] Amount: PaddleOCR {paddle.amount} (conf: {paddle.confidence_amount:.0%})", flush=True)
        else:
            take_amount(tesseract)
            print(f"[Merge] Amount: Tesseract {tesseract.amount} (conf: {tesseract.confidence_amount:.0%})", flush=True)
    elif paddle.amount:
        take_amount(paddle)
    elif tesseract.amount:
        take_amount(tesseract)

    # ----- date: same rule as amount -----
    if paddle.receipt_date and tesseract.receipt_date:
        if paddle.confidence_date >= tesseract.confidence_date:
            take_date(paddle)
            print(f"[Merge] Date: PaddleOCR {paddle.receipt_date}", flush=True)
        else:
            take_date(tesseract)
            print(f"[Merge] Date: Tesseract {tesseract.receipt_date}", flush=True)
    elif paddle.receipt_date:
        take_date(paddle)
    elif tesseract.receipt_date:
        take_date(tesseract)

    # ----- vendor: company indicator first, then confidence -----
    paddle_has_indicator = self._has_company_indicator(paddle.partner_name)
    tesseract_has_indicator = self._has_company_indicator(tesseract.partner_name)

    if paddle.partner_name and tesseract.partner_name:
        if paddle_has_indicator != tesseract_has_indicator:
            # Exactly one of the two names looks like a registered company.
            if paddle_has_indicator:
                take_vendor(paddle)
                print(f"[Merge] Vendor: PaddleOCR '{paddle.partner_name}' (has company indicator)", flush=True)
            else:
                take_vendor(tesseract)
                print(f"[Merge] Vendor: Tesseract '{tesseract.partner_name}' (has company indicator)", flush=True)
        elif paddle.confidence_vendor >= tesseract.confidence_vendor:
            take_vendor(paddle)
            print(f"[Merge] Vendor: PaddleOCR '{paddle.partner_name}' (higher conf)", flush=True)
        else:
            take_vendor(tesseract)
            print(f"[Merge] Vendor: Tesseract '{tesseract.partner_name}' (higher conf)", flush=True)
    elif paddle.partner_name:
        take_vendor(paddle)
    elif tesseract.partner_name:
        take_vendor(tesseract)

    # ----- CUI (fiscal code): a well-formed value beats a malformed one -----
    paddle_cui_valid = self._is_valid_cui(paddle.cui)
    tesseract_cui_valid = self._is_valid_cui(tesseract.cui)

    if paddle.cui and tesseract.cui:
        if paddle_cui_valid != tesseract_cui_valid:
            if paddle_cui_valid:
                merged.cui = paddle.cui
                print(f"[Merge] CUI: PaddleOCR {paddle.cui} (valid format)", flush=True)
            else:
                merged.cui = tesseract.cui
                print(f"[Merge] CUI: Tesseract {tesseract.cui} (valid format)", flush=True)
        else:
            # Both well-formed or both malformed: keep the first side.
            merged.cui = paddle.cui
            print(f"[Merge] CUI: PaddleOCR {paddle.cui}", flush=True)
    elif paddle.cui:
        # Only one side has a CUI here, so it is used whether or not it is well-formed.
        merged.cui = paddle.cui
    elif tesseract.cui:
        merged.cui = tesseract.cui

    # ----- TVA: the side with the larger summed entry amount wins -----
    if paddle.tva_entries and tesseract.tva_entries:
        paddle_total = sum(entry.get('amount', Decimal('0')) for entry in paddle.tva_entries)
        tesseract_total = sum(entry.get('amount', Decimal('0')) for entry in tesseract.tva_entries)

        if paddle_total >= tesseract_total:
            take_tva(paddle)
            print(f"[Merge] TVA: PaddleOCR (total: {paddle_total})", flush=True)
        else:
            take_tva(tesseract)
            print(f"[Merge] TVA: Tesseract (total: {tesseract_total})", flush=True)
    elif paddle.tva_entries:
        take_tva(paddle)
    elif tesseract.tva_entries:
        take_tva(tesseract)

    # ----- everything else: first non-empty value -----
    for field_name in (
        "receipt_number",
        "receipt_series",
        "receipt_type",
        "items_count",
        "address",
        "description",
    ):
        setattr(
            merged,
            field_name,
            getattr(paddle, field_name) or getattr(tesseract, field_name),
        )

    return merged
|
||||
|
||||
def _has_company_indicator(self, name: Optional[str]) -> bool:
|
||||
"""Check if vendor name has company type indicator (S.R.L., S.A., etc.)"""
|
||||
if not name:
|
||||
return False
|
||||
name_upper = name.upper()
|
||||
indicators = [
|
||||
r'\bS\.?\s*R\.?\s*L\.?\b',
|
||||
r'\bS\.?\s*A\.?\b',
|
||||
r'\bS\.?\s*N\.?\s*C\.?\b',
|
||||
r'\bP\.?\s*F\.?\s*A\.?\b',
|
||||
r'\bI\.?\s*I\.?\b',
|
||||
r'\bHOLDING\b',
|
||||
r'\bGROUP\b',
|
||||
r'\bCOMPANY\b',
|
||||
]
|
||||
for indicator in indicators:
|
||||
if re.search(indicator, name_upper):
|
||||
return True
|
||||
return False
|
||||
|
||||
def _is_valid_cui(self, cui: Optional[str]) -> bool:
|
||||
"""Validate CUI format: 6-10 digits."""
|
||||
if not cui:
|
||||
return False
|
||||
# Remove any RO prefix
|
||||
cui_clean = re.sub(r'^RO', '', cui.upper())
|
||||
# Must be 6-10 digits
|
||||
return bool(re.match(r'^\d{6,10}$', cui_clean))
|
||||
|
||||
def _is_extraction_complete(self, ext: ExtractionResult, min_confidence: float = 0.85) -> bool:
|
||||
"""
|
||||
Check if extraction has ALL required fields to skip further processing.
|
||||
|
||||
Required for early exit (ALL must be true):
|
||||
- Overall confidence >= 85%
|
||||
- ALL 5 critical fields present: number, date, amount, TVA, CUI
|
||||
"""
|
||||
# Must have high confidence
|
||||
if ext.overall_confidence < min_confidence:
|
||||
print(f"[OCR] Confidence {ext.overall_confidence:.0%} < {min_confidence:.0%} - continuing", flush=True)
|
||||
return False
|
||||
|
||||
# Check all required fields
|
||||
has_number = bool(ext.receipt_number)
|
||||
has_date = bool(ext.receipt_date)
|
||||
has_amount = bool(ext.amount)
|
||||
has_tva = bool(ext.tva_total) or bool(ext.tva_entries)
|
||||
has_cui = bool(ext.cui)
|
||||
|
||||
missing = []
|
||||
if not has_number: missing.append("number")
|
||||
if not has_date: missing.append("date")
|
||||
if not has_amount: missing.append("amount")
|
||||
if not has_tva: missing.append("TVA")
|
||||
if not has_cui: missing.append("CUI")
|
||||
|
||||
if missing:
|
||||
print(f"[OCR] Missing: {', '.join(missing)} - continuing", flush=True)
|
||||
return False
|
||||
|
||||
print(f"[OCR] ✓ All 5 fields found with {ext.overall_confidence:.0%} confidence", flush=True)
|
||||
return True
|
||||
|
||||
|
||||
# Singleton instance
|
||||
ocr_service = OCRService()
|
||||
|
||||
Reference in New Issue
Block a user