OCR Service improvements: - Single worker ThreadPoolExecutor to prevent parallel OCR memory accumulation - Semaphore to ensure only one OCR operation at a time - Explicit numpy array cleanup after each OCR step - Forced garbage collection after every OCR request - Memory threshold check (2500MB) with pre-processing GC - Memory usage logging before/after processing Backend auto-restart: - Add run-with-restart.sh wrapper script for uvicorn - Auto-restart on crash (OOM, etc.) with max 10 restarts - Reset restart counter if process runs >60s (stable) - Graceful exit on SIGINT/SIGTERM - Update start-prod.sh and start-test.sh to use wrapper 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
736 lines
34 KiB
Python
736 lines
34 KiB
Python
"""Main OCR service coordinating preprocessing, recognition, and extraction."""
|
|
|
|
import os
|
|
import re
|
|
import gc
|
|
import logging
|
|
import threading
|
|
|
|
# Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import
|
|
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
|
|
|
|
import time
|
|
import asyncio
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
from decimal import Decimal
|
|
from pathlib import Path
|
|
from typing import Optional, Tuple
|
|
|
|
from backend.modules.data_entry.services.ocr_engine import OCREngine
|
|
from backend.modules.data_entry.services.ocr_extractor import ReceiptExtractor, ExtractionResult
|
|
from backend.modules.data_entry.services.image_preprocessor import ImagePreprocessor
|
|
from backend.modules.data_entry.services.ocr.validation import OCRValidationEngine
|
|
|
|
# Setup logging
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def get_memory_usage_mb() -> float:
    """Return the resident memory of this process in MB.

    Prefers the *current* resident set size read from ``/proc/self/statm``
    (available on Linux), so the value can go down again after memory is
    released. When that file is unavailable, falls back to
    ``resource.getrusage``, whose ``ru_maxrss`` is the *peak* resident set
    size and therefore never decreases.

    Returns:
        Memory usage in megabytes, or ``0.0`` when it cannot be determined.
    """
    try:
        with open('/proc/self/statm') as statm:
            # Second field of statm is the resident set size, in pages.
            resident_pages = int(statm.read().split()[1])
        return resident_pages * os.sysconf('SC_PAGE_SIZE') / (1024 * 1024)
    except (OSError, ValueError, IndexError, AttributeError):
        pass  # No usable /proc entry - use the portable fallback below.

    try:
        import resource
        import sys

        peak_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        # ru_maxrss is reported in kilobytes on Linux but in bytes on macOS.
        divisor = 1024 * 1024 if sys.platform == 'darwin' else 1024
        return peak_rss / divisor
    except Exception:
        # Best effort only: memory reporting must never break OCR processing.
        return 0.0
|
|
|
|
|
|
class OCRService:
    """Service for OCR processing of receipt images.

    Runs an adaptive pipeline that stops as soon as every critical field has
    been extracted with enough confidence:

    1. PaddleOCR on a lightly preprocessed image.
    2. PaddleOCR on a medium-preprocessed image, merged with step 1.
    3. Tesseract, used only to fill in fields that are still missing.

    Only results that go through all three steps are passed to
    ``_final_validation`` and ``OCRValidationEngine``; the early exits after
    step 1 and step 2 return before either of them runs.
    """

    # Single worker to prevent memory accumulation from parallel OCR
    _executor = ThreadPoolExecutor(max_workers=1)
    # Semaphore to ensure only one OCR operation at a time (memory protection)
    _ocr_semaphore = threading.Semaphore(1)
    # Seconds to wait for the semaphore before reporting the service as busy
    _ocr_semaphore_timeout_s = 120
    # Memory threshold in MB - if exceeded, force GC before processing
    _memory_threshold_mb = 2500

    # Company-type markers (S.R.L., S.A., ...) matched against the upper-cased
    # vendor name. Compiled once instead of on every call.
    _COMPANY_INDICATOR_PATTERNS = tuple(
        re.compile(pattern)
        for pattern in (
            r'\bS\.?\s*R\.?\s*L\.?\b',
            r'\bS\.?\s*A\.?\b',
            r'\bS\.?\s*N\.?\s*C\.?\b',
            r'\bP\.?\s*F\.?\s*A\.?\b',
            r'\bI\.?\s*I\.?\b',
            r'\bHOLDING\b',
            r'\bGROUP\b',
            r'\bCOMPANY\b',
        )
    )

    def __init__(self):
        self.preprocessor = ImagePreprocessor()
        self.ocr_engine = OCREngine()
        self.extractor = ReceiptExtractor()

    async def process_image(
        self,
        image_path: Path,
        mime_type: str
    ) -> Tuple[bool, str, Optional[ExtractionResult]]:
        """
        Process receipt image and extract structured data.

        The blocking pipeline runs on the class-level single-worker executor
        so the event loop is not blocked while OCR is in progress.

        Args:
            image_path: Path to the image file
            mime_type: MIME type of the file

        Returns:
            Tuple of (success, message, extraction_result). Any exception
            raised by the pipeline is logged and reported as a failed result
            instead of propagating.
        """
        try:
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(
                self._executor,
                self._process_sync,
                image_path,
                mime_type,
            )
        except Exception as e:
            # Keep the traceback in the logs; the caller only gets the message.
            logger.exception("OCR processing failed for %s", image_path)
            return False, f"OCR processing failed: {str(e)}", None

    def _cleanup_memory(self, *arrays):
        """Drop this frame's references to ``arrays`` and force a GC pass.

        ``del`` can only unbind names in the current frame, so this cannot
        free objects the caller still references: callers must clear their
        own variables for the memory to actually be released.
        """
        del arrays
        gc.collect()

    def _process_sync(
        self,
        image_path: Path,
        mime_type: str
    ) -> Tuple[bool, str, Optional[ExtractionResult]]:
        """Synchronous processing with ADAPTIVE OCR pipeline.

        Serialises OCR runs through the class-level semaphore and always
        forces a garbage collection (and logs memory usage) afterwards.

        Returns:
            Tuple of (success, message, extraction_result); a failed result
            with a "busy" message when the semaphore cannot be acquired
            within the timeout.
        """
        # Acquire semaphore to ensure only one OCR at a time
        if not self._ocr_semaphore.acquire(timeout=self._ocr_semaphore_timeout_s):
            return False, "OCR service busy - please try again", None

        try:
            return self._process_sync_internal(image_path, mime_type)
        finally:
            # Always release semaphore and cleanup
            self._ocr_semaphore.release()
            # Force garbage collection after EVERY OCR request
            gc.collect()
            mem_after = get_memory_usage_mb()
            print(f"[OCR Service] Memory after cleanup: {mem_after:.0f}MB", flush=True)

    def _load_first_image(self, image_path: Path, mime_type: str):
        """Load the image to OCR: the first page of a PDF, or the image file.

        Returns:
            ``(image, None)`` on success, or ``(None, error_message)`` when
            the preprocessor could not produce an image.
        """
        if mime_type == 'application/pdf':
            try:
                pages = self.preprocessor.pdf_to_images(image_path)
            except RuntimeError as e:
                return None, str(e)
            if not pages:
                return None, "Failed to extract images from PDF"
            # Only the first page is OCR'd. The other pages are released when
            # `pages` goes out of scope on return; deleting them by index
            # while the list shrinks raised IndexError for PDFs with 3+ pages.
            return pages[0], None

        try:
            return self.preprocessor.load_image(image_path), None
        except ValueError as e:
            return None, str(e)

    def _process_sync_internal(
        self,
        image_path: Path,
        mime_type: str
    ) -> Tuple[bool, str, Optional[ExtractionResult]]:
        """Internal processing - called with semaphore held.

        Runs the three OCR steps in order and returns as soon as
        ``_is_extraction_complete`` accepts the current result.

        Returns:
            Tuple of (success, message, extraction_result).
        """
        start_time = time.time()
        mem_before = get_memory_usage_mb()
        print(f"[OCR Service] Starting processing: {image_path}, mime: {mime_type}", flush=True)
        print(f"[OCR Service] Memory before: {mem_before:.0f}MB", flush=True)

        # Check if memory is high - force GC before processing
        if mem_before > self._memory_threshold_mb:
            print(f"[OCR Service] ⚠️ Memory high ({mem_before:.0f}MB > {self._memory_threshold_mb}MB), forcing GC...", flush=True)
            gc.collect()
            mem_after_gc = get_memory_usage_mb()
            print(f"[OCR Service] Memory after pre-GC: {mem_after_gc:.0f}MB", flush=True)

        # Load image
        image, load_error = self._load_first_image(image_path, mime_type)
        if load_error is not None:
            return False, load_error, None

        raw_texts = []
        extraction = None

        # ══════════════════════════════════════════════════════════════
        # STEP 1: PaddleOCR + Light (fastest, best for clear PDFs)
        # ══════════════════════════════════════════════════════════════
        print("=" * 60, flush=True)
        print("[OCR] STEP 1: PaddleOCR + Light preprocessing", flush=True)
        print("=" * 60, flush=True)
        light_img = self.preprocessor.preprocess_light(image)

        try:
            paddle_light = self.ocr_engine._paddle_recognize(light_img)
            # Release the preprocessed copy immediately after OCR
            light_img = None

            if paddle_light and paddle_light.text:
                extraction = self.extractor.extract(paddle_light.text)
                extraction.ocr_engine = "paddle-light"
                raw_texts.append(f"═══ PaddleOCR (light, conf: {paddle_light.confidence:.0%}) ═══\n{paddle_light.text}")

                # Log extraction results
                print("[OCR] Step 1 Results:", flush=True)
                print(f" - OCR Confidence: {paddle_light.confidence:.0%}", flush=True)
                print(f" - Amount: {extraction.amount}", flush=True)
                print(f" - Date: {extraction.receipt_date}", flush=True)
                print(f" - Number: {extraction.receipt_number}", flush=True)
                print(f" - CUI: {extraction.cui}", flush=True)
                print(f" - TVA: {extraction.tva_total} (entries: {len(extraction.tva_entries) if extraction.tva_entries else 0})", flush=True)
                print(f" - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True)

                # Early exit if complete (locals such as `image` are released
                # by the return itself)
                if self._is_extraction_complete(extraction):
                    extraction.raw_text = "\n\n".join(raw_texts)
                    elapsed_ms = int((time.time() - start_time) * 1000)
                    extraction.processing_time_ms = elapsed_ms
                    print(f"[OCR] ✓✓✓ EARLY EXIT at Step 1 - All fields found! ({elapsed_ms}ms) ✓✓✓", flush=True)
                    return True, "OCR complete (fast mode)", extraction
                else:
                    print("[OCR] → Step 1 incomplete, continuing to Step 2...", flush=True)
        except Exception as e:
            print(f"[OCR] PaddleOCR light failed: {e}", flush=True)
            # Continue with an empty result so later steps can still fill it
            extraction = ExtractionResult()
            # Cleanup on error
            light_img = None

        # ══════════════════════════════════════════════════════════════
        # STEP 2: PaddleOCR + Medium (balanced preprocessing)
        # ══════════════════════════════════════════════════════════════
        print("=" * 60, flush=True)
        print("[OCR] STEP 2: PaddleOCR + Medium preprocessing", flush=True)
        print("=" * 60, flush=True)
        medium_img = self.preprocessor.preprocess_medium(image)

        try:
            paddle_medium = self.ocr_engine._paddle_recognize(medium_img)
            # Release the preprocessed copy immediately after OCR
            medium_img = None

            if paddle_medium and paddle_medium.text:
                extraction_medium = self.extractor.extract(paddle_medium.text)
                extraction_medium.ocr_engine = "paddle-medium"
                raw_texts.append(f"═══ PaddleOCR (medium, conf: {paddle_medium.confidence:.0%}) ═══\n{paddle_medium.text}")

                print("[OCR] Step 2 (Medium) Results:", flush=True)
                print(f" - OCR Confidence: {paddle_medium.confidence:.0%}", flush=True)
                print(f" - Amount: {extraction_medium.amount}", flush=True)
                print(f" - Date: {extraction_medium.receipt_date}", flush=True)
                print(f" - CUI: {extraction_medium.cui}", flush=True)

                # Merge with previous
                extraction = self._merge_extractions(extraction, extraction_medium)

                print("[OCR] After merge:", flush=True)
                print(f" - Amount: {extraction.amount}", flush=True)
                print(f" - Date: {extraction.receipt_date}", flush=True)
                print(f" - Number: {extraction.receipt_number}", flush=True)
                print(f" - CUI: {extraction.cui}", flush=True)
                print(f" - TVA: {extraction.tva_total}", flush=True)
                print(f" - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True)

                if self._is_extraction_complete(extraction):
                    extraction.raw_text = "\n\n".join(raw_texts)
                    extraction.ocr_engine = "paddle-adaptive"
                    elapsed_ms = int((time.time() - start_time) * 1000)
                    extraction.processing_time_ms = elapsed_ms
                    print(f"[OCR] ✓✓✓ EARLY EXIT at Step 2 - All fields found after merge! ({elapsed_ms}ms) ✓✓✓", flush=True)
                    return True, "OCR complete (paddle dual)", extraction
                else:
                    print("[OCR] → Step 2 incomplete, continuing to Step 3 (Tesseract)...", flush=True)
        except Exception as e:
            print(f"[OCR] PaddleOCR medium failed: {e}", flush=True)
            # Cleanup on error
            medium_img = None

        # ══════════════════════════════════════════════════════════════
        # STEP 3: Tesseract - ONLY to complete missing fields
        # Uses Tesseract-optimized preprocessing (binarized, high contrast)
        # ══════════════════════════════════════════════════════════════
        print("=" * 60, flush=True)
        print("[OCR] STEP 3: Tesseract (complement only, not override)", flush=True)
        print("=" * 60, flush=True)

        tesseract_img = None
        try:
            # Use Tesseract-specific preprocessing (Otsu binarization)
            tesseract_img = self.preprocessor.preprocess_for_tesseract(image)
            tesseract_result = self.ocr_engine._tesseract_recognize(tesseract_img)
            # Release the preprocessed copy immediately after OCR
            tesseract_img = None

            if tesseract_result and tesseract_result.text:
                extraction_tess = self.extractor.extract(tesseract_result.text)
                extraction_tess.ocr_engine = "tesseract"
                raw_texts.append(f"═══ Tesseract (conf: {tesseract_result.confidence:.0%}) ═══\n{tesseract_result.text}")

                print("[OCR] Step 3 (Tesseract) Results:", flush=True)
                print(f" - OCR Confidence: {tesseract_result.confidence:.0%}", flush=True)
                print(f" - Amount: {extraction_tess.amount}", flush=True)
                print(f" - Date: {extraction_tess.receipt_date}", flush=True)
                print(f" - CUI: {extraction_tess.cui}", flush=True)

                # IMPORTANT: Tesseract only COMPLETES missing fields, never overrides!
                extraction = self._complement_extraction(extraction, extraction_tess)
        except Exception as e:
            print(f"[OCR] Tesseract failed: {e}", flush=True)
            # Cleanup on error
            tesseract_img = None

        # Cleanup original image - no longer needed
        del image

        # ══════════════════════════════════════════════════════════════
        # FINAL VALIDATION: Fix impossible values
        # ══════════════════════════════════════════════════════════════
        if extraction:
            extraction = self._final_validation(extraction)

        # Final result
        if extraction is None:
            return False, "No text detected", None

        extraction.raw_text = "\n\n".join(raw_texts)
        extraction.ocr_engine = "adaptive-full"

        # Build result message
        field_values = (
            ("amount", extraction.amount),
            ("date", extraction.receipt_date),
            ("number", extraction.receipt_number),
            ("CUI", extraction.cui),
            ("TVA", extraction.tva_total or extraction.tva_entries),
        )
        fields_found = [label for label, value in field_values if value]
        message = f"OCR complete (full pipeline). Found: {', '.join(fields_found) or 'no fields'}"

        elapsed_ms = int((time.time() - start_time) * 1000)
        extraction.processing_time_ms = elapsed_ms

        print("=" * 60, flush=True)
        print(f"[OCR] FINAL RESULT (full pipeline) - {elapsed_ms}ms", flush=True)
        print("=" * 60, flush=True)
        print(f" - Amount: {extraction.amount}", flush=True)
        print(f" - Date: {extraction.receipt_date}", flush=True)
        print(f" - Number: {extraction.receipt_number}", flush=True)
        print(f" - CUI: {extraction.cui}", flush=True)
        print(f" - TVA: {extraction.tva_total}", flush=True)
        print(f" - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True)
        print(f" - Processing Time: {elapsed_ms}ms", flush=True)
        print(f" - Message: {message}", flush=True)

        self._apply_validation_rules(extraction)

        return True, message, extraction

    @staticmethod
    def _safe_float(value) -> Optional[float]:
        """Safely convert Decimal or number to float; None if not convertible."""
        if value is None:
            return None
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _safe_payment_sum(methods: list, method_type: str) -> Optional[float]:
        """Safely sum payment amounts for a given method type.

        Returns:
            The positive total for ``method_type``, or None when there are no
            methods, the total is not positive, or an amount is not numeric.
        """
        if not methods:
            return None
        try:
            total = sum(
                float(pm.get('amount', 0) or 0)
                for pm in methods
                if pm.get('method') == method_type
            )
        except (TypeError, ValueError):
            return None
        return total if total > 0 else None

    @staticmethod
    def _sum_tva_amounts(entries):
        """Sum the 'amount' of each TVA entry, counting a missing or None amount as zero."""
        return sum((entry.get('amount') or Decimal('0')) for entry in entries)

    def _apply_validation_rules(self, extraction: ExtractionResult) -> None:
        """Run ``OCRValidationEngine`` on the final extraction.

        Copies the review flag, warnings, errors, confidence adjustments and
        inter-OCR ratios from the validator's result onto ``extraction``
        (in place) and logs a summary.
        """
        # ══════════════════════════════════════════════════════════════
        # VALIDATION: Apply validation rules to final extraction
        # ══════════════════════════════════════════════════════════════
        print("\n" + "=" * 60, flush=True)
        print("[Validation] Applying validation rules...", flush=True)
        print("=" * 60, flush=True)

        validator = OCRValidationEngine()

        # Prepare data for validation with safe type conversions.
        # Entries without a code or a numeric amount are left out.
        tva_by_code = {}
        for entry in (extraction.tva_entries or []):
            code = entry.get('code')
            entry_amount = self._safe_float(entry.get('amount'))
            if code and entry_amount is not None:
                tva_by_code[code] = entry_amount

        validation_data = {
            'amount': self._safe_float(extraction.amount),
            'tva': self._safe_float(extraction.tva_total),
            'cui': extraction.cui,
            'card_amount': self._safe_payment_sum(extraction.payment_methods, 'CARD'),
            'cash_amount': self._safe_payment_sum(extraction.payment_methods, 'NUMERAR'),
            'tva_entries': tva_by_code,
        }

        # Run validation (no light/medium comparison for final result)
        validated_result = validator.validate_extraction(validation_data)

        # Apply validation results to extraction
        extraction.needs_manual_review = validated_result.needs_manual_review
        extraction.validation_warnings = validated_result.validation_warnings
        extraction.validation_errors = validated_result.validation_errors
        extraction.confidence_adjustments = validated_result.confidence_adjustments
        extraction.inter_ocr_ratios = validated_result.inter_ocr_ratios

        print("[Validation] Complete:", flush=True)
        print(f" - Warnings: {len(extraction.validation_warnings)}", flush=True)
        print(f" - Errors: {len(extraction.validation_errors)}", flush=True)
        print(f" - Needs Manual Review: {extraction.needs_manual_review}", flush=True)
        if extraction.validation_warnings:
            for warning in extraction.validation_warnings:
                print(f" ⚠️ {warning}", flush=True)

    def _merge_extractions(
        self,
        paddle: Optional[ExtractionResult],
        tesseract: Optional[ExtractionResult]
    ) -> ExtractionResult:
        """
        Merge two extractions, picking best fields from each engine.

        In this class it is called with the light and medium PaddleOCR
        results; the parameter names and log labels say "Tesseract" for the
        second argument regardless.

        Strategy:
        - Amount, date: prefer the one with higher field confidence
        - Vendor: prefer a name with a company indicator, then confidence
        - CUI: prefer a validly formatted value, then the first argument
        - TVA: prefer the entry list with the larger total
        - Remaining fields: first argument, falling back to the second

        Returns:
            A new ExtractionResult when both inputs are given; otherwise the
            non-None input itself (or an empty result when both are None).
        """
        result = ExtractionResult()

        # Handle case where one is None
        if paddle is None and tesseract is None:
            return result
        if paddle is None:
            return tesseract
        if tesseract is None:
            return paddle

        print("[Merge] Comparing PaddleOCR vs Tesseract extractions...", flush=True)

        # NOTE(review): overall_confidence is not copied onto the merged
        # result. If it is a stored field rather than derived from the
        # per-field confidences, the Step 2 early exit can never trigger -
        # confirm against ExtractionResult.

        # === AMOUNT ===
        # Pick higher confidence, both must be positive
        if paddle.amount and tesseract.amount:
            if paddle.confidence_amount >= tesseract.confidence_amount:
                result.amount = paddle.amount
                result.confidence_amount = paddle.confidence_amount
                print(f"[Merge] Amount: PaddleOCR {paddle.amount} (conf: {paddle.confidence_amount:.0%})", flush=True)
            else:
                result.amount = tesseract.amount
                result.confidence_amount = tesseract.confidence_amount
                print(f"[Merge] Amount: Tesseract {tesseract.amount} (conf: {tesseract.confidence_amount:.0%})", flush=True)
        elif paddle.amount:
            result.amount = paddle.amount
            result.confidence_amount = paddle.confidence_amount
        elif tesseract.amount:
            result.amount = tesseract.amount
            result.confidence_amount = tesseract.confidence_amount

        # === DATE ===
        # Pick higher confidence
        if paddle.receipt_date and tesseract.receipt_date:
            if paddle.confidence_date >= tesseract.confidence_date:
                result.receipt_date = paddle.receipt_date
                result.confidence_date = paddle.confidence_date
                print(f"[Merge] Date: PaddleOCR {paddle.receipt_date}", flush=True)
            else:
                result.receipt_date = tesseract.receipt_date
                result.confidence_date = tesseract.confidence_date
                print(f"[Merge] Date: Tesseract {tesseract.receipt_date}", flush=True)
        elif paddle.receipt_date:
            result.receipt_date = paddle.receipt_date
            result.confidence_date = paddle.confidence_date
        elif tesseract.receipt_date:
            result.receipt_date = tesseract.receipt_date
            result.confidence_date = tesseract.confidence_date

        # === VENDOR NAME ===
        # Prefer one with company indicators (S.R.L., S.A., etc.)
        paddle_has_indicator = self._has_company_indicator(paddle.partner_name)
        tesseract_has_indicator = self._has_company_indicator(tesseract.partner_name)

        if paddle.partner_name and tesseract.partner_name:
            if paddle_has_indicator and not tesseract_has_indicator:
                result.partner_name = paddle.partner_name
                result.confidence_vendor = paddle.confidence_vendor
                print(f"[Merge] Vendor: PaddleOCR '{paddle.partner_name}' (has company indicator)", flush=True)
            elif tesseract_has_indicator and not paddle_has_indicator:
                result.partner_name = tesseract.partner_name
                result.confidence_vendor = tesseract.confidence_vendor
                print(f"[Merge] Vendor: Tesseract '{tesseract.partner_name}' (has company indicator)", flush=True)
            elif paddle.confidence_vendor >= tesseract.confidence_vendor:
                result.partner_name = paddle.partner_name
                result.confidence_vendor = paddle.confidence_vendor
                print(f"[Merge] Vendor: PaddleOCR '{paddle.partner_name}' (higher conf)", flush=True)
            else:
                result.partner_name = tesseract.partner_name
                result.confidence_vendor = tesseract.confidence_vendor
                print(f"[Merge] Vendor: Tesseract '{tesseract.partner_name}' (higher conf)", flush=True)
        elif paddle.partner_name:
            result.partner_name = paddle.partner_name
            result.confidence_vendor = paddle.confidence_vendor
        elif tesseract.partner_name:
            result.partner_name = tesseract.partner_name
            result.confidence_vendor = tesseract.confidence_vendor

        # === CUI (Fiscal Code) ===
        # Validate format: 6-10 digits, prefer valid one
        paddle_cui_valid = self._is_valid_cui(paddle.cui)
        tesseract_cui_valid = self._is_valid_cui(tesseract.cui)

        if paddle.cui and tesseract.cui:
            if paddle_cui_valid and not tesseract_cui_valid:
                result.cui = paddle.cui
                print(f"[Merge] CUI: PaddleOCR {paddle.cui} (valid format)", flush=True)
            elif tesseract_cui_valid and not paddle_cui_valid:
                result.cui = tesseract.cui
                print(f"[Merge] CUI: Tesseract {tesseract.cui} (valid format)", flush=True)
            else:
                # Both valid or both invalid - prefer PaddleOCR
                result.cui = paddle.cui
                print(f"[Merge] CUI: PaddleOCR {paddle.cui}", flush=True)
        elif paddle.cui and paddle_cui_valid:
            result.cui = paddle.cui
        elif tesseract.cui and tesseract_cui_valid:
            result.cui = tesseract.cui
        elif paddle.cui:
            result.cui = paddle.cui
        elif tesseract.cui:
            result.cui = tesseract.cui

        # === TVA ENTRIES ===
        # Prefer non-empty; when both have entries use the larger total
        if paddle.tva_entries and tesseract.tva_entries:
            paddle_total = self._sum_tva_amounts(paddle.tva_entries)
            tesseract_total = self._sum_tva_amounts(tesseract.tva_entries)

            if paddle_total >= tesseract_total:
                result.tva_entries = paddle.tva_entries
                result.tva_total = paddle.tva_total
                print(f"[Merge] TVA: PaddleOCR (total: {paddle_total})", flush=True)
            else:
                result.tva_entries = tesseract.tva_entries
                result.tva_total = tesseract.tva_total
                print(f"[Merge] TVA: Tesseract (total: {tesseract_total})", flush=True)
        elif paddle.tva_entries:
            result.tva_entries = paddle.tva_entries
            result.tva_total = paddle.tva_total
        elif tesseract.tva_entries:
            result.tva_entries = tesseract.tva_entries
            result.tva_total = tesseract.tva_total

        # === OTHER FIELDS ===
        # Simple preference: paddle > tesseract
        result.receipt_number = paddle.receipt_number or tesseract.receipt_number
        result.receipt_series = paddle.receipt_series or tesseract.receipt_series
        result.receipt_type = paddle.receipt_type or tesseract.receipt_type
        result.items_count = paddle.items_count or tesseract.items_count
        result.address = paddle.address or tesseract.address
        result.description = paddle.description or tesseract.description
        # Payment methods are read by the validation stage; carry them over
        # so a merged result does not silently lose them.
        result.payment_methods = paddle.payment_methods or tesseract.payment_methods

        return result

    def _has_company_indicator(self, name: Optional[str]) -> bool:
        """Check if vendor name has company type indicator (S.R.L., S.A., etc.)"""
        if not name:
            return False
        name_upper = name.upper()
        return any(
            pattern.search(name_upper)
            for pattern in self._COMPANY_INDICATOR_PATTERNS
        )

    def _is_valid_cui(self, cui: Optional[str]) -> bool:
        """Validate CUI format: 6-10 digits, with an optional leading RO prefix."""
        if not cui:
            return False
        # Remove any RO prefix
        cui_clean = re.sub(r'^RO', '', cui.upper())
        # Must be 6-10 digits
        return bool(re.match(r'^\d{6,10}$', cui_clean))

    def _is_extraction_complete(self, ext: ExtractionResult, min_confidence: float = 0.85) -> bool:
        """
        Check if extraction has ALL required fields to skip further processing.

        Required for early exit (ALL must be true):
        - Overall confidence >= ``min_confidence`` (85% by default)
        - ALL 5 critical fields present: number, date, amount, TVA, CUI
          (TVA counts as present when either the total or any entry exists)
        """
        # Must have high confidence
        if ext.overall_confidence < min_confidence:
            print(f"[OCR] Confidence {ext.overall_confidence:.0%} < {min_confidence:.0%} - continuing", flush=True)
            return False

        # Check all required fields
        required = (
            ("number", ext.receipt_number),
            ("date", ext.receipt_date),
            ("amount", ext.amount),
            ("TVA", ext.tva_total or ext.tva_entries),
            ("CUI", ext.cui),
        )
        missing = [label for label, value in required if not value]

        if missing:
            print(f"[OCR] Missing: {', '.join(missing)} - continuing", flush=True)
            return False

        print(f"[OCR] ✓ All 5 fields found with {ext.overall_confidence:.0%} confidence", flush=True)
        return True

    def _complement_extraction(
        self,
        primary: Optional[ExtractionResult],
        secondary: Optional[ExtractionResult]
    ) -> ExtractionResult:
        """
        Complement primary extraction with missing fields from secondary.
        NEVER overrides existing values - only fills in gaps.

        This is different from _merge_extractions which can override values.

        Returns:
            ``primary`` itself, updated in place (or ``secondary`` / an empty
            result when ``primary`` is None).
        """
        if primary is None and secondary is None:
            return ExtractionResult()
        if primary is None:
            return secondary
        if secondary is None:
            return primary

        print("[Complement] Adding missing fields from Tesseract...", flush=True)

        # Only fill missing amount
        if not primary.amount and secondary.amount:
            primary.amount = secondary.amount
            primary.confidence_amount = secondary.confidence_amount
            print(f"[Complement] Added amount: {secondary.amount}", flush=True)

        # Only fill missing date
        if not primary.receipt_date and secondary.receipt_date:
            primary.receipt_date = secondary.receipt_date
            primary.confidence_date = secondary.confidence_date
            print(f"[Complement] Added date: {secondary.receipt_date}", flush=True)

        # Only fill missing vendor
        if not primary.partner_name and secondary.partner_name:
            primary.partner_name = secondary.partner_name
            primary.confidence_vendor = secondary.confidence_vendor
            print(f"[Complement] Added vendor: {secondary.partner_name}", flush=True)

        # Only fill missing CUI (and only with a validly formatted one)
        if not primary.cui and secondary.cui and self._is_valid_cui(secondary.cui):
            primary.cui = secondary.cui
            print(f"[Complement] Added CUI: {secondary.cui}", flush=True)

        # Only fill missing TVA
        if not primary.tva_entries and secondary.tva_entries:
            primary.tva_entries = secondary.tva_entries
            primary.tva_total = secondary.tva_total
            print(f"[Complement] Added TVA: {secondary.tva_total}", flush=True)

        # Only fill missing receipt number
        if not primary.receipt_number and secondary.receipt_number:
            primary.receipt_number = secondary.receipt_number
            print(f"[Complement] Added number: {secondary.receipt_number}", flush=True)

        # Only fill missing address
        if not primary.address and secondary.address:
            primary.address = secondary.address
            print(f"[Complement] Added address: {secondary.address}", flush=True)

        return primary

    def _final_validation(self, extraction: ExtractionResult) -> ExtractionResult:
        """
        Final validation and correction of impossible values.

        Key rules:
        1. TVA cannot be greater than TOTAL (it's always a fraction);
           if it is, TOTAL is recalculated from TVA using the first entry's
           rate (19% when there is none) and its confidence lowered to 0.70
        2. TVA above 25% of TOTAL is logged as suspicious (not corrected)
        3. If the TVA entries sum differs from the TVA total by more than
           0.05, the total is replaced with the sum

        Returns:
            The same ``extraction`` object, corrected in place.
        """
        print("[Final Validation] Checking extracted values...", flush=True)

        # Rule 1: TVA cannot be greater than TOTAL
        if extraction.tva_total and extraction.amount:
            if extraction.tva_total > extraction.amount:
                print(f"[Final Validation] TVA ({extraction.tva_total}) > TOTAL ({extraction.amount}) - IMPOSSIBLE!", flush=True)

                # Calculate TOTAL from TVA using reverse formula:
                # total = base + tva = tva * (100/rate + 1) = tva * (100 + rate) / rate
                # For 9% TVA: total = tva * 109 / 9 = tva * 12.11
                # For 19% TVA: total = tva * 119 / 19 = tva * 6.26
                # For 21% TVA: total = tva * 121 / 21 = tva * 5.76

                rate = 19  # Default rate assumption
                if extraction.tva_entries:
                    # Use the rate from the first entry
                    rate = extraction.tva_entries[0].get('percent', 19)
                    if rate is None:
                        # An explicit None would break the comparison below
                        rate = 19

                if rate > 0:
                    # Formula: total = tva * (100 + rate) / rate
                    rate_dec = Decimal(str(rate))
                    calculated_total = extraction.tva_total * (Decimal('100') + rate_dec) / rate_dec
                    calculated_total = calculated_total.quantize(Decimal('0.01'))

                    print(f"[Final Validation] Calculated TOTAL from TVA: {calculated_total} (using {rate}% rate)", flush=True)

                    extraction.amount = calculated_total
                    extraction.confidence_amount = 0.70  # Lower confidence for calculated value

        # Rule 2: TVA cannot be more than ~25% of total (max Romanian rate is 21%)
        if extraction.tva_total and extraction.amount:
            tva_percent = extraction.tva_total / extraction.amount * Decimal('100')
            if tva_percent > Decimal('25'):
                print(f"[Final Validation] Warning: TVA is {tva_percent:.1f}% of total - suspicious", flush=True)

        # Rule 3: Validate TVA entries sum
        if extraction.tva_entries and extraction.tva_total:
            entries_sum = self._sum_tva_amounts(extraction.tva_entries)
            tolerance = Decimal('0.05')
            if abs(entries_sum - extraction.tva_total) > tolerance:
                print(f"[Final Validation] TVA entries sum ({entries_sum}) != tva_total ({extraction.tva_total})", flush=True)
                # Use the sum as it's more reliable
                extraction.tva_total = entries_sum

        print(f"[Final Validation] Done. Amount={extraction.amount}, TVA={extraction.tva_total}", flush=True)
        return extraction
|
|
|
|
|
|
# Singleton instance
|
|
# Shared module-level instance, created once when this module is imported.
# Construction runs OCRService.__init__, which instantiates the preprocessor,
# OCR engine and extractor.
ocr_service = OCRService()
|