"""Main OCR service coordinating preprocessing, recognition, and extraction."""
import os
import re
import logging

# Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'

import time
import asyncio
from concurrent.futures import ThreadPoolExecutor
from decimal import Decimal, InvalidOperation
from pathlib import Path
from typing import Optional, Tuple

from backend.modules.data_entry.services.ocr_engine import OCREngine
from backend.modules.data_entry.services.ocr_extractor import ReceiptExtractor, ExtractionResult
from backend.modules.data_entry.services.image_preprocessor import ImagePreprocessor

# Setup logging
logger = logging.getLogger(__name__)


class OCRService:
    """Service for OCR processing of receipt images.

    The pipeline is adaptive: it tries PaddleOCR on a lightly preprocessed
    image first, then PaddleOCR on a heavily preprocessed image, and finally
    Tesseract, stopping as soon as all required fields have been extracted
    with sufficient confidence.
    """

    # Shared worker pool used to keep the blocking OCR work off the event loop.
    _executor = ThreadPoolExecutor(max_workers=2)

    # TVA rate (percent) assumed when no usable rate is found on the receipt.
    _DEFAULT_TVA_RATE = 19

    # Company-type indicators, matched against the upper-cased vendor name.
    # Compiled once at class creation instead of on every call.
    _COMPANY_INDICATOR_PATTERNS = tuple(
        re.compile(pattern)
        for pattern in (
            r'\bS\.?\s*R\.?\s*L\.?\b',
            r'\bS\.?\s*A\.?\b',
            r'\bS\.?\s*N\.?\s*C\.?\b',
            r'\bP\.?\s*F\.?\s*A\.?\b',
            r'\bI\.?\s*I\.?\b',
            r'\bHOLDING\b',
            r'\bGROUP\b',
            r'\bCOMPANY\b',
        )
    )

    def __init__(self):
        """Create the preprocessor, OCR engine and field extractor."""
        self.preprocessor = ImagePreprocessor()
        self.ocr_engine = OCREngine()
        self.extractor = ReceiptExtractor()

    async def process_image(
        self,
        image_path: Path,
        mime_type: str
    ) -> Tuple[bool, str, Optional[ExtractionResult]]:
        """
        Process receipt image and extract structured data.

        The synchronous pipeline runs in the class-level thread pool so the
        event loop is not blocked.

        Args:
            image_path: Path to the image file
            mime_type: MIME type of the file

        Returns:
            Tuple of (success, message, extraction_result). Any exception
            raised by the pipeline is logged and reported as
            ``(False, "OCR processing failed: ...", None)``.
        """
        try:
            # get_running_loop() is the supported way to obtain the loop from
            # inside a coroutine; get_event_loop() is deprecated for this use.
            loop = asyncio.get_running_loop()
            result = await loop.run_in_executor(
                self._executor,
                self._process_sync,
                image_path,
                mime_type
            )
            return result
        except Exception as e:
            # Keep the traceback in the logs; the caller only gets the message.
            logger.exception("OCR processing failed for %s", image_path)
            return False, f"OCR processing failed: {str(e)}", None

    def _process_sync(
        self,
        image_path: Path,
        mime_type: str
    ) -> Tuple[bool, str, Optional[ExtractionResult]]:
        """Synchronous processing with ADAPTIVE OCR pipeline.

        Steps:
            1. PaddleOCR on a lightly preprocessed image.
            2. PaddleOCR on a heavily preprocessed image, merged with step 1.
            3. Tesseract, used only to fill fields that are still missing.

        Steps 1 and 2 return early when ``_is_extraction_complete`` is
        satisfied. Only the full pipeline (after step 3) runs
        ``_final_validation``. For PDFs only the first extracted page is used.

        Args:
            image_path: Path to the image or PDF file.
            mime_type: MIME type; ``'application/pdf'`` selects the PDF loader.

        Returns:
            Tuple of (success, message, extraction_result).
        """
        start_time = time.time()
        print(f"[OCR Service] Starting processing: {image_path}, mime: {mime_type}", flush=True)

        # Load image
        if mime_type == 'application/pdf':
            try:
                images = self.preprocessor.pdf_to_images(image_path)
                if not images:
                    return False, "Failed to extract images from PDF", None
                image = images[0]
            except RuntimeError as e:
                return False, str(e), None
        else:
            try:
                image = self.preprocessor.load_image(image_path)
            except ValueError as e:
                return False, str(e), None

        raw_texts = []
        extraction = None

        # ══════════════════════════════════════════════════════════════
        # STEP 1: PaddleOCR + Light (fastest, best for clear PDFs)
        # ══════════════════════════════════════════════════════════════
        print("=" * 60, flush=True)
        print("[OCR] STEP 1: PaddleOCR + Light preprocessing", flush=True)
        print("=" * 60, flush=True)

        light_img = self.preprocessor.preprocess_light(image)
        try:
            paddle_light = self.ocr_engine._paddle_recognize(light_img)
            if paddle_light and paddle_light.text:
                extraction = self.extractor.extract(paddle_light.text)
                extraction.ocr_engine = "paddle-light"
                raw_texts.append(f"═══ PaddleOCR (light, conf: {paddle_light.confidence:.0%}) ═══\n{paddle_light.text}")

                # Log extraction results
                print("[OCR] Step 1 Results:", flush=True)
                print(f" - OCR Confidence: {paddle_light.confidence:.0%}", flush=True)
                print(f" - Amount: {extraction.amount}", flush=True)
                print(f" - Date: {extraction.receipt_date}", flush=True)
                print(f" - Number: {extraction.receipt_number}", flush=True)
                print(f" - CUI: {extraction.cui}", flush=True)
                print(f" - TVA: {extraction.tva_total} (entries: {len(extraction.tva_entries) if extraction.tva_entries else 0})", flush=True)
                print(f" - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True)

                # Early exit if complete
                if self._is_extraction_complete(extraction):
                    extraction.raw_text = "\n\n".join(raw_texts)
                    elapsed_ms = int((time.time() - start_time) * 1000)
                    extraction.processing_time_ms = elapsed_ms
                    print(f"[OCR] ✓✓✓ EARLY EXIT at Step 1 - All fields found! ({elapsed_ms}ms) ✓✓✓", flush=True)
                    return True, "OCR complete (fast mode)", extraction
                else:
                    print("[OCR] → Step 1 incomplete, continuing to Step 2...", flush=True)
        except Exception as e:
            print(f"[OCR] PaddleOCR light failed: {e}", flush=True)
            # NOTE(review): after a step-1 exception the pipeline continues
            # with an empty result rather than None, so the final
            # "No text detected" branch is not reached on this path.
            extraction = ExtractionResult()

        # ══════════════════════════════════════════════════════════════
        # STEP 2: PaddleOCR + Heavy (for faded thermal receipts)
        # ══════════════════════════════════════════════════════════════
        print("=" * 60, flush=True)
        print("[OCR] STEP 2: PaddleOCR + Heavy preprocessing", flush=True)
        print("=" * 60, flush=True)

        heavy_img = self.preprocessor.preprocess_heavy(image)
        try:
            paddle_heavy = self.ocr_engine._paddle_recognize(heavy_img)
            if paddle_heavy and paddle_heavy.text:
                extraction_heavy = self.extractor.extract(paddle_heavy.text)
                extraction_heavy.ocr_engine = "paddle-heavy"
                raw_texts.append(f"═══ PaddleOCR (heavy, conf: {paddle_heavy.confidence:.0%}) ═══\n{paddle_heavy.text}")

                print("[OCR] Step 2 (Heavy) Results:", flush=True)
                print(f" - OCR Confidence: {paddle_heavy.confidence:.0%}", flush=True)
                print(f" - Amount: {extraction_heavy.amount}", flush=True)
                print(f" - Date: {extraction_heavy.receipt_date}", flush=True)
                print(f" - CUI: {extraction_heavy.cui}", flush=True)

                # Merge with previous
                extraction = self._merge_extractions(extraction, extraction_heavy)

                print("[OCR] After merge:", flush=True)
                print(f" - Amount: {extraction.amount}", flush=True)
                print(f" - Date: {extraction.receipt_date}", flush=True)
                print(f" - Number: {extraction.receipt_number}", flush=True)
                print(f" - CUI: {extraction.cui}", flush=True)
                print(f" - TVA: {extraction.tva_total}", flush=True)
                print(f" - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True)

                if self._is_extraction_complete(extraction):
                    extraction.raw_text = "\n\n".join(raw_texts)
                    extraction.ocr_engine = "paddle-adaptive"
                    elapsed_ms = int((time.time() - start_time) * 1000)
                    extraction.processing_time_ms = elapsed_ms
                    print(f"[OCR] ✓✓✓ EARLY EXIT at Step 2 - All fields found after merge! ({elapsed_ms}ms) ✓✓✓", flush=True)
                    return True, "OCR complete (paddle dual)", extraction
                else:
                    print("[OCR] → Step 2 incomplete, continuing to Step 3 (Tesseract)...", flush=True)
        except Exception as e:
            print(f"[OCR] PaddleOCR heavy failed: {e}", flush=True)

        # ══════════════════════════════════════════════════════════════
        # STEP 3: Tesseract - ONLY to complete missing fields
        # Uses Tesseract-optimized preprocessing (binarized, high contrast)
        # ══════════════════════════════════════════════════════════════
        print("=" * 60, flush=True)
        print("[OCR] STEP 3: Tesseract (complement only, not override)", flush=True)
        print("=" * 60, flush=True)

        try:
            # Use Tesseract-specific preprocessing (Otsu binarization)
            tesseract_img = self.preprocessor.preprocess_for_tesseract(image)
            tesseract_result = self.ocr_engine._tesseract_recognize(tesseract_img)
            if tesseract_result and tesseract_result.text:
                extraction_tess = self.extractor.extract(tesseract_result.text)
                extraction_tess.ocr_engine = "tesseract"
                raw_texts.append(f"═══ Tesseract (conf: {tesseract_result.confidence:.0%}) ═══\n{tesseract_result.text}")

                print("[OCR] Step 3 (Tesseract) Results:", flush=True)
                print(f" - OCR Confidence: {tesseract_result.confidence:.0%}", flush=True)
                print(f" - Amount: {extraction_tess.amount}", flush=True)
                print(f" - Date: {extraction_tess.receipt_date}", flush=True)
                print(f" - CUI: {extraction_tess.cui}", flush=True)

                # IMPORTANT: Tesseract only COMPLETES missing fields, never overrides!
                extraction = self._complement_extraction(extraction, extraction_tess)
        except Exception as e:
            print(f"[OCR] Tesseract failed: {e}", flush=True)

        # ══════════════════════════════════════════════════════════════
        # FINAL VALIDATION: Fix impossible values
        # ══════════════════════════════════════════════════════════════
        if extraction:
            extraction = self._final_validation(extraction)

        # Final result
        if extraction is None:
            return False, "No text detected", None

        extraction.raw_text = "\n\n".join(raw_texts)
        extraction.ocr_engine = "adaptive-full"

        # Build result message
        fields_found = []
        if extraction.amount:
            fields_found.append("amount")
        if extraction.receipt_date:
            fields_found.append("date")
        if extraction.receipt_number:
            fields_found.append("number")
        if extraction.cui:
            fields_found.append("CUI")
        if extraction.tva_total or extraction.tva_entries:
            fields_found.append("TVA")

        message = f"OCR complete (full pipeline). Found: {', '.join(fields_found) or 'no fields'}"

        elapsed_ms = int((time.time() - start_time) * 1000)
        extraction.processing_time_ms = elapsed_ms

        print("=" * 60, flush=True)
        print(f"[OCR] FINAL RESULT (full pipeline) - {elapsed_ms}ms", flush=True)
        print("=" * 60, flush=True)
        print(f" - Amount: {extraction.amount}", flush=True)
        print(f" - Date: {extraction.receipt_date}", flush=True)
        print(f" - Number: {extraction.receipt_number}", flush=True)
        print(f" - CUI: {extraction.cui}", flush=True)
        print(f" - TVA: {extraction.tva_total}", flush=True)
        print(f" - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True)
        print(f" - Processing Time: {elapsed_ms}ms", flush=True)
        print(f" - Message: {message}", flush=True)

        return True, message, extraction

    def _merge_extractions(
        self,
        paddle: Optional[ExtractionResult],
        tesseract: Optional[ExtractionResult]
    ) -> ExtractionResult:
        """
        Merge two extractions, picking best fields from each engine.

        The parameter names (and the log labels) say PaddleOCR/Tesseract, but
        ``_process_sync`` calls this with the light-preprocessing result as
        ``paddle`` and the heavy-preprocessing result as ``tesseract``.

        Strategy:
        - Amount, date: the side with the higher field confidence (ties go
          to ``paddle``)
        - Vendor: the side with a company indicator, else higher confidence
        - CUI: the side with a valid format, else ``paddle``
        - TVA: the side whose entry amounts sum higher (one side is chosen;
          entries are not combined)
        - Remaining fields: ``paddle`` value if truthy, else ``tesseract``

        Args:
            paddle: First (preferred) extraction, or None.
            tesseract: Second extraction, or None.

        Returns:
            A new ExtractionResult when both inputs are given; otherwise the
            non-None input itself, or an empty result when both are None.
        """
        result = ExtractionResult()

        # Handle case where one is None
        if paddle is None and tesseract is None:
            return result
        if paddle is None:
            return tesseract
        if tesseract is None:
            return paddle

        print("[Merge] Comparing PaddleOCR vs Tesseract extractions...", flush=True)

        # === AMOUNT ===
        # Pick higher confidence, both must be positive
        if paddle.amount and tesseract.amount:
            if paddle.confidence_amount >= tesseract.confidence_amount:
                result.amount = paddle.amount
                result.confidence_amount = paddle.confidence_amount
                print(f"[Merge] Amount: PaddleOCR {paddle.amount} (conf: {paddle.confidence_amount:.0%})", flush=True)
            else:
                result.amount = tesseract.amount
                result.confidence_amount = tesseract.confidence_amount
                print(f"[Merge] Amount: Tesseract {tesseract.amount} (conf: {tesseract.confidence_amount:.0%})", flush=True)
        elif paddle.amount:
            result.amount = paddle.amount
            result.confidence_amount = paddle.confidence_amount
        elif tesseract.amount:
            result.amount = tesseract.amount
            result.confidence_amount = tesseract.confidence_amount

        # === DATE ===
        # Pick higher confidence, validate date reasonableness
        if paddle.receipt_date and tesseract.receipt_date:
            if paddle.confidence_date >= tesseract.confidence_date:
                result.receipt_date = paddle.receipt_date
                result.confidence_date = paddle.confidence_date
                print(f"[Merge] Date: PaddleOCR {paddle.receipt_date}", flush=True)
            else:
                result.receipt_date = tesseract.receipt_date
                result.confidence_date = tesseract.confidence_date
                print(f"[Merge] Date: Tesseract {tesseract.receipt_date}", flush=True)
        elif paddle.receipt_date:
            result.receipt_date = paddle.receipt_date
            result.confidence_date = paddle.confidence_date
        elif tesseract.receipt_date:
            result.receipt_date = tesseract.receipt_date
            result.confidence_date = tesseract.confidence_date

        # === VENDOR NAME ===
        # Prefer one with company indicators (S.R.L., S.A., etc.)
        paddle_has_indicator = self._has_company_indicator(paddle.partner_name)
        tesseract_has_indicator = self._has_company_indicator(tesseract.partner_name)

        if paddle.partner_name and tesseract.partner_name:
            if paddle_has_indicator and not tesseract_has_indicator:
                result.partner_name = paddle.partner_name
                result.confidence_vendor = paddle.confidence_vendor
                print(f"[Merge] Vendor: PaddleOCR '{paddle.partner_name}' (has company indicator)", flush=True)
            elif tesseract_has_indicator and not paddle_has_indicator:
                result.partner_name = tesseract.partner_name
                result.confidence_vendor = tesseract.confidence_vendor
                print(f"[Merge] Vendor: Tesseract '{tesseract.partner_name}' (has company indicator)", flush=True)
            elif paddle.confidence_vendor >= tesseract.confidence_vendor:
                result.partner_name = paddle.partner_name
                result.confidence_vendor = paddle.confidence_vendor
                print(f"[Merge] Vendor: PaddleOCR '{paddle.partner_name}' (higher conf)", flush=True)
            else:
                result.partner_name = tesseract.partner_name
                result.confidence_vendor = tesseract.confidence_vendor
                print(f"[Merge] Vendor: Tesseract '{tesseract.partner_name}' (higher conf)", flush=True)
        elif paddle.partner_name:
            result.partner_name = paddle.partner_name
            result.confidence_vendor = paddle.confidence_vendor
        elif tesseract.partner_name:
            result.partner_name = tesseract.partner_name
            result.confidence_vendor = tesseract.confidence_vendor

        # === CUI (Fiscal Code) ===
        # Validate format: 6-10 digits, prefer valid one
        paddle_cui_valid = self._is_valid_cui(paddle.cui)
        tesseract_cui_valid = self._is_valid_cui(tesseract.cui)

        if paddle.cui and tesseract.cui:
            if paddle_cui_valid and not tesseract_cui_valid:
                result.cui = paddle.cui
                print(f"[Merge] CUI: PaddleOCR {paddle.cui} (valid format)", flush=True)
            elif tesseract_cui_valid and not paddle_cui_valid:
                result.cui = tesseract.cui
                print(f"[Merge] CUI: Tesseract {tesseract.cui} (valid format)", flush=True)
            else:
                # Both valid or both invalid - prefer PaddleOCR
                result.cui = paddle.cui
                print(f"[Merge] CUI: PaddleOCR {paddle.cui}", flush=True)
        elif paddle.cui and paddle_cui_valid:
            result.cui = paddle.cui
        elif tesseract.cui and tesseract_cui_valid:
            result.cui = tesseract.cui
        elif paddle.cui:
            result.cui = paddle.cui
        elif tesseract.cui:
            result.cui = tesseract.cui

        # === TVA ENTRIES ===
        # Prefer non-empty, use the one with more entries or higher amounts
        if paddle.tva_entries and tesseract.tva_entries:
            # Compare: prefer the one with actual amounts (not just 0)
            paddle_total = sum(e.get('amount', Decimal('0')) for e in paddle.tva_entries)
            tesseract_total = sum(e.get('amount', Decimal('0')) for e in tesseract.tva_entries)

            if paddle_total >= tesseract_total:
                result.tva_entries = paddle.tva_entries
                result.tva_total = paddle.tva_total
                print(f"[Merge] TVA: PaddleOCR (total: {paddle_total})", flush=True)
            else:
                result.tva_entries = tesseract.tva_entries
                result.tva_total = tesseract.tva_total
                print(f"[Merge] TVA: Tesseract (total: {tesseract_total})", flush=True)
        elif paddle.tva_entries:
            result.tva_entries = paddle.tva_entries
            result.tva_total = paddle.tva_total
        elif tesseract.tva_entries:
            result.tva_entries = tesseract.tva_entries
            result.tva_total = tesseract.tva_total

        # === OTHER FIELDS ===
        # Simple preference: paddle > tesseract
        result.receipt_number = paddle.receipt_number or tesseract.receipt_number
        result.receipt_series = paddle.receipt_series or tesseract.receipt_series
        result.receipt_type = paddle.receipt_type or tesseract.receipt_type
        result.items_count = paddle.items_count or tesseract.items_count
        result.address = paddle.address or tesseract.address
        result.description = paddle.description or tesseract.description

        return result

    def _has_company_indicator(self, name: Optional[str]) -> bool:
        """Check if vendor name has company type indicator (S.R.L., S.A., etc.)

        The name is upper-cased before matching, so the check is
        case-insensitive. Returns False for None or an empty name.
        """
        if not name:
            return False

        name_upper = name.upper()
        return any(
            pattern.search(name_upper)
            for pattern in self._COMPANY_INDICATOR_PATTERNS
        )

    def _is_valid_cui(self, cui: Optional[str]) -> bool:
        """Validate CUI format: 6-10 digits.

        A leading ``RO`` prefix (any case) is ignored. Returns False for
        None or an empty string.
        """
        if not cui:
            return False

        # Remove any RO prefix
        cui_clean = re.sub(r'^RO', '', cui.upper())

        # Must be 6-10 digits
        return bool(re.match(r'^\d{6,10}$', cui_clean))

    def _is_extraction_complete(self, ext: ExtractionResult, min_confidence: float = 0.85) -> bool:
        """
        Check if extraction has ALL required fields to skip further processing.

        Required for early exit (ALL must be true):
        - Overall confidence >= ``min_confidence`` (85% by default)
        - ALL 5 critical fields present: number, date, amount, TVA, CUI
          (TVA counts as present when either the total or the entries exist)

        Args:
            ext: Extraction to inspect.
            min_confidence: Minimum overall confidence, as a fraction.

        Returns:
            True when the pipeline may stop early, False otherwise.
        """
        # Must have high confidence
        if ext.overall_confidence < min_confidence:
            print(f"[OCR] Confidence {ext.overall_confidence:.0%} < {min_confidence:.0%} - continuing", flush=True)
            return False

        # Check all required fields
        has_number = bool(ext.receipt_number)
        has_date = bool(ext.receipt_date)
        has_amount = bool(ext.amount)
        has_tva = bool(ext.tva_total) or bool(ext.tva_entries)
        has_cui = bool(ext.cui)

        missing = []
        if not has_number:
            missing.append("number")
        if not has_date:
            missing.append("date")
        if not has_amount:
            missing.append("amount")
        if not has_tva:
            missing.append("TVA")
        if not has_cui:
            missing.append("CUI")

        if missing:
            print(f"[OCR] Missing: {', '.join(missing)} - continuing", flush=True)
            return False

        print(f"[OCR] ✓ All 5 fields found with {ext.overall_confidence:.0%} confidence", flush=True)
        return True

    def _complement_extraction(
        self,
        primary: Optional[ExtractionResult],
        secondary: Optional[ExtractionResult]
    ) -> ExtractionResult:
        """
        Complement primary extraction with missing fields from secondary.

        NEVER overrides existing values - only fills in gaps.
        This is different from _merge_extractions which can override values.

        ``primary`` is modified in place and returned. A CUI is only taken
        from ``secondary`` when it passes ``_is_valid_cui``.

        Args:
            primary: Extraction whose existing values are kept, or None.
            secondary: Extraction used to fill gaps, or None.

        Returns:
            ``primary`` with gaps filled; the non-None argument when the
            other is None; an empty result when both are None.
        """
        if primary is None and secondary is None:
            return ExtractionResult()
        if primary is None:
            return secondary
        if secondary is None:
            return primary

        print("[Complement] Adding missing fields from Tesseract...", flush=True)

        # Only fill missing amount
        if not primary.amount and secondary.amount:
            primary.amount = secondary.amount
            primary.confidence_amount = secondary.confidence_amount
            print(f"[Complement] Added amount: {secondary.amount}", flush=True)

        # Only fill missing date
        if not primary.receipt_date and secondary.receipt_date:
            primary.receipt_date = secondary.receipt_date
            primary.confidence_date = secondary.confidence_date
            print(f"[Complement] Added date: {secondary.receipt_date}", flush=True)

        # Only fill missing vendor
        if not primary.partner_name and secondary.partner_name:
            primary.partner_name = secondary.partner_name
            primary.confidence_vendor = secondary.confidence_vendor
            print(f"[Complement] Added vendor: {secondary.partner_name}", flush=True)

        # Only fill missing CUI
        if not primary.cui and secondary.cui and self._is_valid_cui(secondary.cui):
            primary.cui = secondary.cui
            print(f"[Complement] Added CUI: {secondary.cui}", flush=True)

        # Only fill missing TVA
        if not primary.tva_entries and secondary.tva_entries:
            primary.tva_entries = secondary.tva_entries
            primary.tva_total = secondary.tva_total
            print(f"[Complement] Added TVA: {secondary.tva_total}", flush=True)

        # Only fill missing receipt number
        if not primary.receipt_number and secondary.receipt_number:
            primary.receipt_number = secondary.receipt_number
            print(f"[Complement] Added number: {secondary.receipt_number}", flush=True)

        # Only fill missing address
        if not primary.address and secondary.address:
            primary.address = secondary.address
            print(f"[Complement] Added address: {secondary.address}", flush=True)

        return primary

    def _final_validation(self, extraction: ExtractionResult) -> ExtractionResult:
        """
        Final validation and correction of impossible values.

        Key rules:
        1. TVA cannot be greater than TOTAL (it's always a fraction).
           If TVA > TOTAL, recalculate TOTAL from TVA using the rate of the
           first TVA entry (19% when no usable rate is available) and lower
           the amount confidence to 0.70.
        2. TVA above 25% of TOTAL is only reported as suspicious.
        3. If the TVA entries do not sum to the TVA total (tolerance 0.05),
           the total is replaced by the sum of the entries.

        Args:
            extraction: Result to validate; modified in place.

        Returns:
            The same ``extraction`` object.
        """
        print("[Final Validation] Checking extracted values...", flush=True)

        # Rule 1: TVA cannot be greater than TOTAL
        if extraction.tva_total and extraction.amount:
            if extraction.tva_total > extraction.amount:
                print(f"[Final Validation] TVA ({extraction.tva_total}) > TOTAL ({extraction.amount}) - IMPOSSIBLE!", flush=True)

                # Calculate TOTAL from TVA using reverse formula:
                # total = base + tva = tva * (100/rate + 1) = tva * (100 + rate) / rate
                # For 9% TVA: total = tva * 109 / 9 = tva * 12.11
                # For 19% TVA: total = tva * 119 / 19 = tva * 6.26
                # For 21% TVA: total = tva * 121 / 21 = tva * 5.76
                rate = self._DEFAULT_TVA_RATE  # Default rate assumption
                if extraction.tva_entries:
                    # Use the rate from the first entry
                    rate = extraction.tva_entries[0].get('percent', self._DEFAULT_TVA_RATE)

                # A present-but-None or non-numeric 'percent' must not abort
                # the whole pipeline: fall back to the default rate instead.
                try:
                    rate_value = Decimal(str(rate))
                except InvalidOperation:
                    rate_value = None
                if rate_value is None or not rate_value.is_finite():
                    rate = self._DEFAULT_TVA_RATE
                    rate_value = Decimal(str(rate))

                if rate_value > 0:
                    # Formula: total = tva * (100 + rate) / rate
                    calculated_total = extraction.tva_total * (Decimal('100') + rate_value) / rate_value
                    calculated_total = calculated_total.quantize(Decimal('0.01'))
                    print(f"[Final Validation] Calculated TOTAL from TVA: {calculated_total} (using {rate}% rate)", flush=True)
                    extraction.amount = calculated_total
                    extraction.confidence_amount = 0.70  # Lower confidence for calculated value

        # Rule 2: TVA cannot be more than ~25% of total (max Romanian rate is 21%)
        if extraction.tva_total and extraction.amount:
            tva_percent = extraction.tva_total / extraction.amount * Decimal('100')
            if tva_percent > Decimal('25'):
                print(f"[Final Validation] Warning: TVA is {tva_percent:.1f}% of total - suspicious", flush=True)

        # Rule 3: Validate TVA entries sum
        if extraction.tva_entries and extraction.tva_total:
            entries_sum = sum(e.get('amount', Decimal('0')) for e in extraction.tva_entries)
            tolerance = Decimal('0.05')
            if abs(entries_sum - extraction.tva_total) > tolerance:
                print(f"[Final Validation] TVA entries sum ({entries_sum}) != tva_total ({extraction.tva_total})", flush=True)
                # Use the sum as it's more reliable
                extraction.tva_total = entries_sum

        print(f"[Final Validation] Done. Amount={extraction.amount}, TVA={extraction.tva_total}", flush=True)
        return extraction


# Singleton instance
ocr_service = OCRService()