diff --git a/data-entry-app/backend/app/main.py b/data-entry-app/backend/app/main.py index cbd68a3..36c8776 100644 --- a/data-entry-app/backend/app/main.py +++ b/data-entry-app/backend/app/main.py @@ -1,10 +1,19 @@ """FastAPI application entry point for Data Entry App.""" import sys +import logging +import threading from pathlib import Path from contextlib import asynccontextmanager from fastapi import FastAPI + +# Configure logging to show INFO level messages +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt='%H:%M:%S' +) from fastapi.middleware.cors import CORSMiddleware from fastapi.staticfiles import StaticFiles @@ -30,6 +39,18 @@ async def lifespan(app: FastAPI): settings.upload_path_resolved print(f"Upload path: {settings.upload_path_resolved}") + # Pre-initialize OCR engine in background (PaddleOCR takes 15-20s) + def init_ocr_background(): + try: + from app.services.ocr_service import ocr_service + ocr_service.ocr_engine._init_paddle_lazy() + print("OCR engine ready") + except Exception as e: + print(f"Warning: OCR engine pre-load failed: {e}") + + print("Starting OCR engine pre-load (background)...") + threading.Thread(target=init_ocr_background, daemon=True).start() + yield # Shutdown diff --git a/data-entry-app/backend/app/routers/ocr.py b/data-entry-app/backend/app/routers/ocr.py index 65c6ad7..791014e 100644 --- a/data-entry-app/backend/app/routers/ocr.py +++ b/data-entry-app/backend/app/routers/ocr.py @@ -102,6 +102,8 @@ async def extract_from_image(file: UploadFile = File(...)): confidence_vendor=result.confidence_vendor, overall_confidence=result.overall_confidence, raw_text=result.raw_text, + ocr_engine=result.ocr_engine, + processing_time_ms=result.processing_time_ms, ) return OCRResponse(success=True, message=message, data=data) @@ -171,6 +173,8 @@ async def extract_from_attachment( confidence_vendor=result.confidence_vendor, overall_confidence=result.overall_confidence, 
raw_text=result.raw_text, + ocr_engine=result.ocr_engine, + processing_time_ms=result.processing_time_ms, ) return OCRResponse(success=True, message=message, data=data) diff --git a/data-entry-app/backend/app/schemas/ocr.py b/data-entry-app/backend/app/schemas/ocr.py index 0b79a4f..ee2bf3b 100644 --- a/data-entry-app/backend/app/schemas/ocr.py +++ b/data-entry-app/backend/app/schemas/ocr.py @@ -37,6 +37,8 @@ class ExtractionData(BaseModel): confidence_vendor: float = Field(default=0.0, ge=0, le=1, description="Vendor extraction confidence") overall_confidence: float = Field(default=0.0, ge=0, le=1, description="Overall confidence score") raw_text: str = Field(default="", description="Raw OCR text") + ocr_engine: str = Field(default="", description="OCR engine used: paddleocr or tesseract") + processing_time_ms: int = Field(default=0, ge=0, description="Processing time in milliseconds") class Config: """Pydantic config.""" diff --git a/data-entry-app/backend/app/services/image_preprocessor.py b/data-entry-app/backend/app/services/image_preprocessor.py index 3ee28f9..edd97e0 100644 --- a/data-entry-app/backend/app/services/image_preprocessor.py +++ b/data-entry-app/backend/app/services/image_preprocessor.py @@ -23,37 +23,57 @@ class ImagePreprocessor: raise ValueError(f"Could not load image: {path}") return image - def pdf_to_images(self, path: Path, dpi: int = 400) -> List[np.ndarray]: + def pdf_to_images(self, path: Path, dpi: int = 300) -> List[np.ndarray]: """ - Convert PDF to images with high DPI for better OCR. + Convert PDF to images. Args: path: Path to PDF file - dpi: Resolution (400 recommended for receipts, higher = better quality but slower) + dpi: Resolution (300 = fast & good quality, 400 = better but slower) """ if not PDF_AVAILABLE: raise RuntimeError("pdf2image not available. 
Install with: pip install pdf2image") - # Use 400 DPI for better text recognition on thermal receipts images = pdf2image.convert_from_path(str(path), dpi=dpi) return [np.array(img) for img in images] def preprocess(self, image: np.ndarray, high_quality: bool = True) -> np.ndarray: """ - Apply preprocessing pipeline for thermal receipt images. + Apply LIGHT preprocessing - better for clear PDFs. + Heavy binarization can destroy text on clear images; high_quality is ignored. + """ + return self.preprocess_light(image) - Pipeline: - 1. Convert to grayscale - 2. Resize if too small (min 1500px width for high quality) - 3. Deskew (straighten rotated text) - 4. Contrast enhancement (CLAHE) - 5. Denoise (Non-local means) - 6. Sharpening (for clearer text edges) - 7. Adaptive thresholding (binarization) - 8. Morphological operations (connect broken chars) + def preprocess_light(self, image: np.ndarray) -> np.ndarray: + """ + Light preprocessing for CLEAR images (PDFs, good scans). + Preserves original quality, only enhances contrast. + """ + # 1. Grayscale + if len(image.shape) == 3: + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + else: + gray = image.copy() - Args: - image: Input image (BGR or grayscale) - high_quality: If True, apply more aggressive preprocessing + # 2. Resize if too small + height, width = gray.shape + if width < 1500: + scale = 1500 / width + gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC) + + # 3. Deskew + gray = self._deskew(gray) + + # 4. Light contrast enhancement only + clahe = cv2.createCLAHE(clipLimit=1.5, tileGridSize=(8, 8)) + enhanced = clahe.apply(gray) + + # NO binarization, NO morphological ops - preserve original quality + return enhanced + + def preprocess_heavy(self, image: np.ndarray) -> np.ndarray: + """ + Heavy preprocessing for FADED thermal receipts. + Aggressive binarization to recover faded text. + """ # 1. Grayscale if len(image.shape) == 3: @@ -63,57 +83,48 @@ class ImagePreprocessor: # 2. 
Resize if too small (larger = better OCR) height, width = gray.shape - min_width = 1500 if high_quality else 1000 - if width < min_width: - scale = min_width / width - gray = cv2.resize( - gray, None, fx=scale, fy=scale, - interpolation=cv2.INTER_CUBIC - ) + if width < 1500: + scale = 1500 / width + gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC) # 3. Deskew gray = self._deskew(gray) - # 4. Contrast enhancement with CLAHE (Contrast Limited Adaptive Histogram Equalization) + # 4. Contrast enhancement with CLAHE clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)) enhanced = clahe.apply(gray) - # 5. Denoise (slightly less aggressive to preserve text details) - denoised = cv2.fastNlMeansDenoising( - enhanced, h=8, # Lower h = preserve more details - templateWindowSize=7, - searchWindowSize=21 - ) + # 5. Denoise + denoised = cv2.fastNlMeansDenoising(enhanced, h=8, templateWindowSize=7, searchWindowSize=21) - # 6. Sharpening to enhance text edges - if high_quality: - # Unsharp mask for better text clarity - gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0) - sharpened = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0) - else: - sharpened = denoised + # 6. Sharpening + gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0) + sharpened = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0) - # 7. Adaptive thresholding with optimized parameters + # 7. Adaptive thresholding (binarization) binary = cv2.adaptiveThreshold( sharpened, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, - blockSize=11, # Smaller block = better for small text - C=5 # Lower C = darker result, better for faded receipts + blockSize=11, C=5 ) # 8. 
Morphological operations - # Close small gaps in characters kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2)) result = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel_close) - # Optional: Remove small noise spots - if high_quality: - kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1)) - result = cv2.morphologyEx(result, cv2.MORPH_OPEN, kernel_open) - return result + def get_all_variants(self, image: np.ndarray) -> List[np.ndarray]: + """ + Generate 2 preprocessing variants for OCR (fast mode). + Returns: [light_processed, heavy_processed] + """ + return [ + self.preprocess_light(image), + self.preprocess_heavy(image), + ] + def _deskew(self, image: np.ndarray) -> np.ndarray: """Correct image rotation/skew using Hough lines.""" edges = cv2.Canny(image, 50, 150, apertureSize=3) diff --git a/data-entry-app/backend/app/services/ocr_engine.py b/data-entry-app/backend/app/services/ocr_engine.py index f086729..2af189c 100644 --- a/data-entry-app/backend/app/services/ocr_engine.py +++ b/data-entry-app/backend/app/services/ocr_engine.py @@ -1,11 +1,16 @@ """OCR engine wrapper for PaddleOCR and Tesseract.""" import os +import logging from dataclasses import dataclass -from typing import List, Optional +from typing import List, Optional, Tuple import numpy as np +# Setup logging +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) # Ensure logs are visible + # Disable PaddleOCR model source check for faster startup (PaddleX 3.x) os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True' @@ -40,6 +45,7 @@ class OCRResult: text: str confidence: float boxes: List[dict] + engine: str = "" # OCR engine used: paddleocr or tesseract class OCREngine: @@ -65,8 +71,9 @@ class OCREngine: print("Initializing PaddleOCR engine...") # PaddleOCR 3.x API - optimized for Romanian receipts + # Note: 'latin' not available in PaddleOCR 3.x, 'en' works well for receipts self._paddle = PaddleOCR( - lang='en', # 'en' works better than 'ro' for 
mixed alphanumeric + lang='en', # 'en' handles Latin alphabet well for receipts # High quality settings for better accuracy det_db_thresh=0.3, # Lower threshold = detect more text (default 0.3) det_db_box_thresh=0.5, # Box confidence threshold (default 0.5) @@ -81,14 +88,19 @@ class OCREngine: def recognize(self, image: np.ndarray) -> OCRResult: """Perform OCR on preprocessed image.""" + logger.info(f"[OCR] Starting recognition, image shape: {image.shape}, dtype: {image.dtype}") + # Lazy init PaddleOCR on first call self._init_paddle_lazy() if PADDLE_AVAILABLE and self._paddle: + logger.info("[OCR] Using PaddleOCR engine") return self._paddle_recognize(image) elif TESSERACT_AVAILABLE: + logger.info("[OCR] Using Tesseract engine (PaddleOCR not available)") return self._tesseract_recognize(image) else: + logger.error("[OCR] No OCR engine available!") raise RuntimeError( "No OCR engine available. Install PaddleOCR or Tesseract." ) @@ -96,17 +108,23 @@ class OCREngine: def _paddle_recognize(self, image: np.ndarray) -> OCRResult: """Recognize text using PaddleOCR 3.x API.""" try: + logger.info(f"[PaddleOCR] Processing image, shape: {image.shape}") + # PaddleOCR 3.x requires 3-channel images if len(image.shape) == 2: # Convert grayscale to 3-channel BGR import cv2 image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR) + logger.info(f"[PaddleOCR] Converted to BGR, new shape: {image.shape}") # PaddleOCR 3.x uses predict() with new parameter names + logger.info("[PaddleOCR] Calling predict()...") result = self._paddle.predict(image, use_textline_orientation=True) + logger.info(f"[PaddleOCR] predict() returned, result type: {type(result)}") if not result or len(result) == 0: - return OCRResult(text="", confidence=0.0, boxes=[]) + logger.warning("[PaddleOCR] No results returned") + return OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr") # PaddleOCR 3.x returns OCRResult objects with different structure ocr_result = result[0] @@ -117,7 +135,7 @@ class OCREngine: 
dt_polys = ocr_result.get('dt_polys', []) if not rec_texts: - return OCRResult(text="", confidence=0.0, boxes=[]) + return OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr") boxes = [] for i, text in enumerate(rec_texts): @@ -130,13 +148,17 @@ class OCREngine: }) avg_conf = sum(rec_scores) / len(rec_scores) if rec_scores else 0.0 + text_result = '\n'.join(rec_texts) + logger.info(f"[PaddleOCR] SUCCESS - Found {len(rec_texts)} text lines, avg confidence: {avg_conf:.2%}") + logger.debug(f"[PaddleOCR] Raw text preview: {text_result[:200]}...") return OCRResult( - text='\n'.join(rec_texts), + text=text_result, confidence=float(avg_conf), - boxes=boxes + boxes=boxes, + engine="paddleocr" ) except Exception as e: - print(f"PaddleOCR error: {e}, falling back to Tesseract") + logger.error(f"[PaddleOCR] ERROR: {e}, falling back to Tesseract") if TESSERACT_AVAILABLE: return self._tesseract_recognize(image) raise @@ -145,23 +167,70 @@ class OCREngine: """Recognize text using Tesseract.""" global pytesseract + logger.info(f"[Tesseract] Processing image, shape: {image.shape}") + # Lazy import pytesseract if pytesseract is None: - print("Importing pytesseract...") + logger.info("[Tesseract] Importing pytesseract...") import pytesseract as _pytesseract pytesseract = _pytesseract - config = '--psm 6 -l ron+eng' + # PSM 4: Single column (best for receipts) + config = '--psm 4 -l ron+eng' text = pytesseract.image_to_string(image, config=config) - data = pytesseract.image_to_data( - image, config=config, - output_type=pytesseract.Output.DICT - ) + # Quick confidence estimate + data = pytesseract.image_to_data(image, config=config, output_type=pytesseract.Output.DICT) confidences = [int(c) for c in data['conf'] if int(c) > 0] avg_conf = sum(confidences) / len(confidences) / 100 if confidences else 0.0 - return OCRResult(text=text, confidence=avg_conf, boxes=[]) + logger.info(f"[Tesseract] Done: {len(text)} chars, conf: {avg_conf:.2%}") + return OCRResult(text=text, 
confidence=avg_conf, boxes=[], engine="tesseract") + + def recognize_dual(self, image: np.ndarray) -> Tuple[OCRResult, Optional[OCRResult]]: + """ + Run both OCR engines and return both results. + + Returns: + Tuple of (paddle_result, tesseract_result) + tesseract_result may be None if Tesseract is not available; when PaddleOCR is unavailable, paddle_result is the Tesseract result + """ + logger.info(f"[OCR Dual] Starting dual recognition, image shape: {image.shape}") + + # Lazy init PaddleOCR + self._init_paddle_lazy() + + paddle_result = None + tesseract_result = None + + # Run PaddleOCR + if PADDLE_AVAILABLE and self._paddle: + try: + logger.info("[OCR Dual] Running PaddleOCR...") + paddle_result = self._paddle_recognize(image) + logger.info(f"[OCR Dual] PaddleOCR: {len(paddle_result.text)} chars, conf: {paddle_result.confidence:.2%}") + except Exception as e: + logger.error(f"[OCR Dual] PaddleOCR failed: {e}") + paddle_result = OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr") + + # Run Tesseract + if TESSERACT_AVAILABLE: + try: + logger.info("[OCR Dual] Running Tesseract...") + tesseract_result = self._tesseract_recognize(image) + logger.info(f"[OCR Dual] Tesseract: {len(tesseract_result.text)} chars, conf: {tesseract_result.confidence:.2%}") + except Exception as e: + logger.error(f"[OCR Dual] Tesseract failed: {e}") + tesseract_result = OCRResult(text="", confidence=0.0, boxes=[], engine="tesseract") + + # Fallback if PaddleOCR not available + if paddle_result is None: + if tesseract_result: + paddle_result = tesseract_result + else: + raise RuntimeError("No OCR engine available") + + return paddle_result, tesseract_result @staticmethod def get_available_engines() -> List[str]: diff --git a/data-entry-app/backend/app/services/ocr_extractor.py b/data-entry-app/backend/app/services/ocr_extractor.py index 8b60a7d..a37c73f 100644 --- a/data-entry-app/backend/app/services/ocr_extractor.py +++ b/data-entry-app/backend/app/services/ocr_extractor.py @@ -28,6 +28,8 @@ class ExtractionResult: confidence_date: float = 0.0 
confidence_vendor: float = 0.0 raw_text: str = "" + ocr_engine: str = "" # OCR engine used: paddleocr or tesseract + processing_time_ms: int = 0 # Processing time in milliseconds @property def overall_confidence(self) -> float: @@ -70,6 +72,7 @@ class ReceiptExtractor: # Date patterns - support dash, dot, and slash separators # OCR may produce DRTA instead of DATA, DAIA, etc. + # OCR may also add spaces/commas in dates: "27. 10, 2025" instead of "27.10.2025" DATE_PATTERNS = [ # DATA/DRTA/DAIA: DD-MM-YYYY (OCR tolerant) (r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98), @@ -84,6 +87,19 @@ class ReceiptExtractor: (r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75), ] + # OCR-corrupted date patterns with spaces/commas + # Handles: "27. 10, 2025", "27, 10. 2025", "2025. 08. 14", etc. + DATE_PATTERNS_OCR_SPACES = [ + # YYYY. MM. DD format with spaces (OMV/Petrom receipts) - with time + (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.92, 'ymd'), + # YYYY. MM. DD format with spaces (standalone) + (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.85, 'ymd'), + # DD. MM, YYYY or DD, MM. YYYY (with time following) + (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'), + # DD. MM, YYYY or DD, MM. YYYY (standalone) + (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'), + ] + # Receipt number patterns - Romanian fiscal receipt formats # OCR may produce N instead of : or other errors NUMBER_PATTERNS = [ @@ -127,12 +143,23 @@ class ReceiptExtractor: (r'\bC[I1]F\s*:?\s*(?:R[O0])?(\d{6,10})\b', 0.90), # COD FISCAL (vendor) (r'COD\s+FISCAL\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90), - # C.I.F. format (with dots) + # C. I. F. format with SPACES (OCR artifact) - "C. I. F. : R011201891" + (r'C\.\s*I\.\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.92), + # C.I.F. format (with dots, no spaces) (r'(? 
ExtractionResult: @@ -215,6 +243,14 @@ class ReceiptExtractor: # Extract additional fields - Multiple TVA entries result.tva_entries, result.tva_total = self._extract_tva_entries(text_upper) + if not result.tva_entries: + print(f"[TVA Debug] No TVA found. Checking patterns...", flush=True) + # Debug: show what patterns see + import re + normalized = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text_upper) + taxe_match = re.search(r'T?OTAL\s+TAXE', normalized, re.IGNORECASE) + rev_match = re.search(r'([\d.,]+)\s*T?OTAL\s+TAXE', normalized, re.IGNORECASE) + print(f"[TVA Debug] 'OTAL TAXE' found: {bool(taxe_match)}, reversed: {rev_match.group(1) if rev_match else None}", flush=True) result.items_count = self._extract_items_count(text_upper) result.address = self._extract_address(text_upper) @@ -334,6 +370,7 @@ class ReceiptExtractor: def _extract_date(self, text: str) -> Tuple[Optional[date], float]: """Extract receipt date from text.""" + # First try standard patterns (clean dates) for pattern, confidence in self.DATE_PATTERNS: match = re.search(pattern, text) if match: @@ -354,6 +391,34 @@ class ReceiptExtractor: return parsed, confidence except ValueError: continue + + # Then try OCR-corrupted patterns (dates with spaces/commas) + # Handles: "27. 10, 2025", "27, 10. 2025", "2025. 08. 14", etc. + for pattern, confidence, fmt in self.DATE_PATTERNS_OCR_SPACES: + match = re.search(pattern, text) + if match: + try: + if fmt == 'ymd': + # YYYY. MM. DD format (OMV/Petrom) + year = match.group(1) + month = match.group(2) + day = match.group(3) + else: + # DD. MM. 
YYYY format (default) + day = match.group(1) + month = match.group(2) + year = match.group(3) + + date_str = f"{day}.{month}.{year}" + parsed = datetime.strptime(date_str, '%d.%m.%Y').date() + + # Validate date range + today = date.today() + if parsed <= today and parsed.year >= 2020: + return parsed, confidence + except ValueError: + continue + return None, 0.0 def _extract_number(self, text: str) -> Tuple[Optional[str], float]: @@ -377,8 +442,9 @@ class ReceiptExtractor: Extract vendor/partner name from text. Uses multiple strategies: 1. Look for lines with company type indicators (S.R.L., S.A., etc.) - 2. Look for lines near CIF - 3. Use first valid line as fallback + 2. Look for company name + SRL on separate lines + 3. Look for lines near CIF + 4. Use first valid line as fallback """ lines = text.split('\n') skip_keywords = [ @@ -388,9 +454,37 @@ class ReceiptExtractor: 'OPERATOR', 'CASIER', 'POS', 'AMEF', 'BINE ATI VENIT', 'VA RUGAM', 'PASTRATI', 'VOCEA', 'TIPARIT', 'DETERGENT', 'PROSOP', 'HARTIE', 'SACI', 'SPRAY', - 'BUC', 'ROLA', 'CUMPARATOR' + 'BUC', 'ROLA', 'CUMPARATOR', 'MAGAZIN', 'BRICK', + 'NIVS', 'BENZINA', 'PETROM', 'OMV' ] + # Strategy 0: Look for company name followed by SRL/SA on next line + # Pattern: "COMPANY NAME\nSRL" or "COMPANY NAME\nS.R.L." + for i, line in enumerate(lines[:15]): + line = line.strip() + if not line or len(line) < 3: + continue + + line_upper = line.upper() + + # Skip lines with skip keywords + if any(kw in line_upper for kw in skip_keywords): + continue + + # Check if next line is standalone SRL, S.R.L., SA, S.A., etc. 
+ if i + 1 < len(lines): + next_line = lines[i + 1].strip().upper() + # Match standalone company type suffix + if re.match(r'^S\.?\s*R\.?\s*L\.?$', next_line) or \ + re.match(r'^S\.?\s*A\.?$', next_line) or \ + re.match(r'^S\.?\s*N\.?\s*C\.?$', next_line) or \ + re.match(r'^P\.?\s*F\.?\s*A\.?$', next_line) or \ + re.match(r'^I\.?\s*I\.?$', next_line): + # Combine: "COMPANY NAME" + " " + "SRL" + vendor = self._clean_vendor_name(f"{line} {next_line}") + if vendor and len(vendor) >= 5: + return vendor, 0.95 + # Strategy 1: Look for lines with vendor indicators (S.R.L., S.A., HOLDING, etc.) for i, line in enumerate(lines[:15]): # Check first 15 lines line = line.strip() @@ -476,7 +570,22 @@ class ReceiptExtractor: Extract vendor CUI (fiscal identification code) from text. Excludes CLIENT CUI which appears as 'CLIENT C.U.I./C.I.F.:...' """ - # First, try to find CIF on a line that doesn't contain CLIENT + # Strategy 0: Check for reversed format (CIF NUMBER on line BEFORE "C.I.F." label) + # This is common in some receipts: "R011201891\nC. I. F." 
+ for pattern, confidence in self.CUI_REVERSED_PATTERNS: + match = re.search(pattern, text_upper, re.IGNORECASE | re.MULTILINE) + if match: + cui = match.group(1) + if 6 <= len(cui) <= 10: + # Verify this is not the CLIENT CUI by checking context + start = match.start() + # Check 50 chars before the match for CLIENT keyword + context_start = max(0, start - 50) + context = text_upper[context_start:start] + if 'CLIENT' not in context and 'LIENT' not in context: + return cui, confidence + + # Strategy 1: Try to find CIF on a line that doesn't contain CLIENT lines = text_upper.split('\n') for line in lines: # Skip lines that contain CLIENT (these are buyer's CUI, not vendor's) @@ -491,7 +600,7 @@ class ReceiptExtractor: if 6 <= len(cui) <= 10: return cui, confidence - # Fallback: search entire text but exclude CLIENT patterns + # Strategy 2: Fallback - search entire text but exclude CLIENT patterns for pattern, confidence in self.CUI_PATTERNS: # Find all matches for match in re.finditer(pattern, text_upper, re.IGNORECASE | re.MULTILINE): @@ -523,8 +632,94 @@ class ReceiptExtractor: tva_entries = [] seen_entries = set() # To avoid duplicates - # Normalize spaces in numbers first (OCR may produce "32. 31") + # Check for non-VAT payer (NEPLATITOR DE TVA) - TVA = 0 + # OCR variants: NEPLATTOR, NEPLATITOR, NEPLATOR, NEPLATTOR, ANEPLATHTOR, MEPLATITOR, etc. + # Also handles: "TOTAL NEPLATITOR TVA", "(NEPLATITOR DE TVA)" + non_vat_patterns = [ + # Main pattern - flexible for OCR errors: NEPLAT + any chars + OR/R + r'NEPLAT\w*OR', # NEPLATITOR, NEPLATTOR, NEPLATOR + r'[ANM]EPLAT\w*O?R', # OCR errors: ANEPLATHTOR, MEPLATITOR + r'TOTAL\s+NEPLAT', # TOTAL NEPLATITOR... + r'TOTAL\s+[ANM]EPLAT', # TOTAL ANEPLAT... (OCR error) + r'SCUTIT\s*(?:DE\s+)?T[VU]A', # SCUTIT DE TVA + r'NEPLAT\w*\s+T[VU]A', # NEPLATITOR TVA + r'NEPLAT\w*\s+DE\s+T', # NEPLATITOR DE T... 
(truncated) + ] + for pattern in non_vat_patterns: + if re.search(pattern, text, re.IGNORECASE): + # Non-VAT payer - return TVA = 0 + return [{'code': 'D', 'percent': 0, 'amount': Decimal('0.00')}], Decimal('0.00') + + # Normalize spaces in numbers first (OCR may produce "32. 31" or "49, 58") normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text) + # Also normalize comma followed by space to comma (for "21, 00%" -> "21,00%") + normalized_text = re.sub(r'(\d+),\s+(\d{2})\s*%', r'\1.\2%', normalized_text) + + # Pattern 0a: First try to get TVA from "TOTAL TAXE:" which is most reliable + # Format: "TOTAL TAXE: 55,22" - this is always the TVA amount + # OCR may cut "T" producing "OTAL TAXE:" instead of "TOTAL TAXE:" + # OCR may also put amount BEFORE "OTAL TAXE": "55,22OTAL TAXE:" + total_taxe_pattern = r'T?OTAL\s+TAXE\s*:?\s*([\d\s.,]+)' + taxe_match = re.search(total_taxe_pattern, normalized_text, re.IGNORECASE) + + # Also try pattern where amount comes BEFORE "OTAL TAXE" (OCR line break issue) + if not taxe_match: + reversed_taxe_pattern = r'([\d.,]+)\s*T?OTAL\s+TAXE' + taxe_match = re.search(reversed_taxe_pattern, normalized_text, re.IGNORECASE) + + if taxe_match: + # Also need to find the TVA rate from the table + # Pattern handles: "A-21%", "-21,00%", "21%" etc. 
+ rate_pattern = r'([A-D])?\s*[-:]?\s*(\d{1,2})[.,]?\s*\d{0,2}\s*%' + rate_match = re.search(rate_pattern, normalized_text, re.IGNORECASE) + if rate_match: + try: + code = rate_match.group(1).upper() if rate_match.group(1) else 'A' # Default to A if missing + percent = int(rate_match.group(2)) + amount_str = taxe_match.group(1).replace(' ', '') + amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str)) + amount = Decimal(amount_str) + if amount > 0: + entry_key = (code, percent) + if entry_key not in seen_entries: + tva_entries.append({ + 'code': code, + 'percent': percent, + 'amount': amount + }) + seen_entries.add(entry_key) + except (ValueError, InvalidOperation): + pass + + # Pattern 0b: Table format "A-21,00% 285,66 49,58" (code-percent base tva_amount) + # This format appears after a TVA header line like "TVA TOTAL VALDARE" + # The TVA amount position depends on header: VALDARE last = TVA last, VALOARE middle = TVA middle + if not tva_entries: + table_pattern = r'([A-D])\s*[-:]\s*(\d{1,2})[.,]\s*\d{2}\s*%\s*([\d\s.,]+)\s+([\d\s.,]+)' + for match in re.finditer(table_pattern, normalized_text, re.IGNORECASE): + try: + code = match.group(1).upper() + percent = int(match.group(2)) + amount1_str = match.group(3).replace(' ', '') + amount2_str = match.group(4).replace(' ', '') + amount1 = Decimal(self._normalize_number(re.sub(r'[^\d.,]', '', amount1_str))) + amount2 = Decimal(self._normalize_number(re.sub(r'[^\d.,]', '', amount2_str))) + + # Determine which is TVA: the smaller amount is usually TVA + # (TVA is a fraction of the total, so it's always smaller) + tva_amount = min(amount1, amount2) + + if tva_amount > 0: + entry_key = (code, percent) + if entry_key not in seen_entries: + tva_entries.append({ + 'code': code, + 'percent': percent, + 'amount': tva_amount + }) + seen_entries.add(entry_key) + except (ValueError, InvalidOperation): + continue # Pattern 1: "TVA A - 19%: 15.20" or "TVAA - 21% 32.31" (with code) # OCR tolerant: TUA, TVR, etc. 
@@ -571,7 +766,75 @@ class ReceiptExtractor: except (ValueError, InvalidOperation): continue - # Pattern 3: "TVAA - 21%" on one line, amount on next line + # Pattern 3: "TOTAL TVA A - 21%" with amount on same line or "TOTAL TVA BON" with amount + if not tva_entries: + # First try: "TOTAL TVA A - 21% 32.31" (amount on same line) + tva_with_amount = r'TOTAL\s+T[VU][AR]\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*([\d.,]+)' + for match in re.finditer(tva_with_amount, normalized_text, re.IGNORECASE): + try: + code = match.group(1).upper() + percent = int(match.group(2)) + amount_str = self._normalize_number(match.group(3)) + amount = Decimal(amount_str) + if amount > 0: + entry_key = (code, percent) + if entry_key not in seen_entries: + tva_entries.append({ + 'code': code, + 'percent': percent, + 'amount': amount + }) + seen_entries.add(entry_key) + except (ValueError, InvalidOperation): + continue + + # Pattern 3b: "TOTAL TVA A - 21%" on one line, look for "TOTAL TVA BON" amount + if not tva_entries: + tva_total_pattern = r'TOTAL\s+T[VU][AR]\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%' + for match in re.finditer(tva_total_pattern, normalized_text, re.IGNORECASE): + try: + code = match.group(1).upper() + percent = int(match.group(2)) + + # Look for "TOTAL TVA BON" followed by amount + tva_bon_pattern = r'TOTAL\s+T[VU][AR]\s+BON[:\s]*([\d.,]+)' + tva_bon_match = re.search(tva_bon_pattern, normalized_text, re.IGNORECASE) + if tva_bon_match: + amount_str = self._normalize_number(tva_bon_match.group(1)) + amount = Decimal(amount_str) + if amount > 0: + entry_key = (code, percent) + if entry_key not in seen_entries: + tva_entries.append({ + 'code': code, + 'percent': percent, + 'amount': amount + }) + seen_entries.add(entry_key) + continue + + # Fallback: Amount after TOTAL TVA BON on next line + tva_bon_pos = re.search(r'TOTAL\s+T[VU][AR]\s+BON', normalized_text, re.IGNORECASE) + if tva_bon_pos: + after_bon = normalized_text[tva_bon_pos.end():] + # Find first standalone number (likely TVA 
amount) + amount_match = re.search(r'[\s\n]*([\d]+[.,]\d{2})\s*\n', after_bon) + if amount_match: + amount_str = self._normalize_number(amount_match.group(1)) + amount = Decimal(amount_str) + if amount > 0: + entry_key = (code, percent) + if entry_key not in seen_entries: + tva_entries.append({ + 'code': code, + 'percent': percent, + 'amount': amount + }) + seen_entries.add(entry_key) + except (ValueError, InvalidOperation): + continue + + # Pattern 3b: "TVAA - 21%" on one line, amount on next line (simpler format) if not tva_entries: tva_line_pattern = r'T[VU][AR]\s*([A-D])?\s*[-:]\s*(\d{1,2})\s*%' for match in re.finditer(tva_line_pattern, normalized_text, re.IGNORECASE): diff --git a/data-entry-app/backend/app/services/ocr_service.py b/data-entry-app/backend/app/services/ocr_service.py index 3d54ec6..354bbb6 100644 --- a/data-entry-app/backend/app/services/ocr_service.py +++ b/data-entry-app/backend/app/services/ocr_service.py @@ -1,11 +1,16 @@ """Main OCR service coordinating preprocessing, recognition, and extraction.""" import os +import re +import logging + # Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True' +import time import asyncio from concurrent.futures import ThreadPoolExecutor +from decimal import Decimal from pathlib import Path from typing import Optional, Tuple @@ -13,6 +18,9 @@ from app.services.ocr_engine import OCREngine from app.services.ocr_extractor import ReceiptExtractor, ExtractionResult from app.services.image_preprocessor import ImagePreprocessor +# Setup logging +logger = logging.getLogger(__name__) + class OCRService: """Service for OCR processing of receipt images.""" @@ -56,15 +64,18 @@ class OCRService: image_path: Path, mime_type: str ) -> Tuple[bool, str, Optional[ExtractionResult]]: - """Synchronous processing (runs in thread pool).""" + """Synchronous processing with ADAPTIVE OCR pipeline.""" - # Handle PDF + start_time 
= time.time() + print(f"[OCR Service] Starting processing: {image_path}, mime: {mime_type}", flush=True) + + # Load image if mime_type == 'application/pdf': try: images = self.preprocessor.pdf_to_images(image_path) if not images: return False, "Failed to extract images from PDF", None - image = images[0] # Process first page only + image = images[0] except RuntimeError as e: return False, str(e), None else: @@ -73,38 +84,360 @@ class OCRService: except ValueError as e: return False, str(e), None - # Preprocess image - processed = self.preprocessor.preprocess(image) + raw_texts = [] + extraction = None + + # ══════════════════════════════════════════════════════════════ + # STEP 1: PaddleOCR + Light (fastest, best for clear PDFs) + # ══════════════════════════════════════════════════════════════ + print("=" * 60, flush=True) + print("[OCR] STEP 1: PaddleOCR + Light preprocessing", flush=True) + print("=" * 60, flush=True) + light_img = self.preprocessor.preprocess_light(image) - # Perform OCR try: - ocr_result = self.ocr_engine.recognize(processed) - except RuntimeError as e: - return False, str(e), None + paddle_light = self.ocr_engine._paddle_recognize(light_img) + if paddle_light and paddle_light.text: + extraction = self.extractor.extract(paddle_light.text) + extraction.ocr_engine = "paddle-light" + raw_texts.append(f"═══ PaddleOCR (light, conf: {paddle_light.confidence:.0%}) ═══\n{paddle_light.text}") - if not ocr_result.text: - return False, "No text detected in image", None + # Log extraction results + print(f"[OCR] Step 1 Results:", flush=True) + print(f" - OCR Confidence: {paddle_light.confidence:.0%}", flush=True) + print(f" - Amount: {extraction.amount}", flush=True) + print(f" - Date: {extraction.receipt_date}", flush=True) + print(f" - Number: {extraction.receipt_number}", flush=True) + print(f" - CUI: {extraction.cui}", flush=True) + print(f" - TVA: {extraction.tva_total} (entries: {len(extraction.tva_entries) if extraction.tva_entries else 0})", 
flush=True) + print(f" - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True) - # Extract structured fields - extraction = self.extractor.extract(ocr_result.text) + # Early exit if complete + if self._is_extraction_complete(extraction): + extraction.raw_text = "\n\n".join(raw_texts) + elapsed_ms = int((time.time() - start_time) * 1000) + extraction.processing_time_ms = elapsed_ms + print(f"[OCR] ✓✓✓ EARLY EXIT at Step 1 - All fields found! ({elapsed_ms}ms) ✓✓✓", flush=True) + return True, "OCR complete (fast mode)", extraction + else: + print("[OCR] → Step 1 incomplete, continuing to Step 2...", flush=True) + except Exception as e: + print(f"[OCR] PaddleOCR light failed: {e}", flush=True) + extraction = ExtractionResult() + + # ══════════════════════════════════════════════════════════════ + # STEP 2: PaddleOCR + Heavy (for faded thermal receipts) + # ══════════════════════════════════════════════════════════════ + print("=" * 60, flush=True) + print("[OCR] STEP 2: PaddleOCR + Heavy preprocessing", flush=True) + print("=" * 60, flush=True) + heavy_img = self.preprocessor.preprocess_heavy(image) + + try: + paddle_heavy = self.ocr_engine._paddle_recognize(heavy_img) + if paddle_heavy and paddle_heavy.text: + extraction_heavy = self.extractor.extract(paddle_heavy.text) + extraction_heavy.ocr_engine = "paddle-heavy" + raw_texts.append(f"═══ PaddleOCR (heavy, conf: {paddle_heavy.confidence:.0%}) ═══\n{paddle_heavy.text}") + + print(f"[OCR] Step 2 (Heavy) Results:", flush=True) + print(f" - OCR Confidence: {paddle_heavy.confidence:.0%}", flush=True) + print(f" - Amount: {extraction_heavy.amount}", flush=True) + print(f" - Date: {extraction_heavy.receipt_date}", flush=True) + print(f" - CUI: {extraction_heavy.cui}", flush=True) + + # Merge with previous + extraction = self._merge_extractions(extraction, extraction_heavy) + + print(f"[OCR] After merge:", flush=True) + print(f" - Amount: {extraction.amount}", flush=True) + print(f" - Date: 
{extraction.receipt_date}", flush=True) + print(f" - Number: {extraction.receipt_number}", flush=True) + print(f" - CUI: {extraction.cui}", flush=True) + print(f" - TVA: {extraction.tva_total}", flush=True) + print(f" - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True) + + if self._is_extraction_complete(extraction): + extraction.raw_text = "\n\n".join(raw_texts) + extraction.ocr_engine = "paddle-adaptive" + elapsed_ms = int((time.time() - start_time) * 1000) + extraction.processing_time_ms = elapsed_ms + print(f"[OCR] ✓✓✓ EARLY EXIT at Step 2 - All fields found after merge! ({elapsed_ms}ms) ✓✓✓", flush=True) + return True, "OCR complete (paddle dual)", extraction + else: + print("[OCR] → Step 2 incomplete, continuing to Step 3 (Tesseract)...", flush=True) + except Exception as e: + print(f"[OCR] PaddleOCR heavy failed: {e}", flush=True) + + # ══════════════════════════════════════════════════════════════ + # STEP 3: Tesseract fallback + # ══════════════════════════════════════════════════════════════ + print("=" * 60, flush=True) + print("[OCR] STEP 3: Tesseract fallback", flush=True) + print("=" * 60, flush=True) + + try: + tesseract_result = self.ocr_engine._tesseract_recognize(light_img) + if tesseract_result and tesseract_result.text: + extraction_tess = self.extractor.extract(tesseract_result.text) + extraction_tess.ocr_engine = "tesseract" + raw_texts.append(f"═══ Tesseract (conf: {tesseract_result.confidence:.0%}) ═══\n{tesseract_result.text}") + + print(f"[OCR] Step 3 (Tesseract) Results:", flush=True) + print(f" - OCR Confidence: {tesseract_result.confidence:.0%}", flush=True) + print(f" - Amount: {extraction_tess.amount}", flush=True) + print(f" - Date: {extraction_tess.receipt_date}", flush=True) + print(f" - CUI: {extraction_tess.cui}", flush=True) + + extraction = self._merge_extractions(extraction, extraction_tess) + except Exception as e: + print(f"[OCR] Tesseract failed: {e}", flush=True) + + # Final result + if extraction is 
def _merge_extractions(
    self,
    paddle: Optional[ExtractionResult],
    tesseract: Optional[ExtractionResult]
) -> ExtractionResult:
    """
    Combine two extraction results into one, choosing field by field.

    The parameters are named after the OCR engines, but the only asymmetry
    is that ``paddle`` wins every tie. If either argument is ``None`` the
    other one is returned as-is (a fresh, empty result when both are).

    Selection rules:
    - amount / date: when both sides have a value, the higher field
      confidence wins; otherwise whichever side has a value is used.
    - vendor: a name with a company indicator beats one without; if that
      does not decide, the higher vendor confidence wins.
    - CUI: a value accepted by ``_is_valid_cui`` beats one that is not;
      otherwise ``paddle`` is preferred.
    - TVA: when both sides have entries, the side whose entry amounts sum
      higher is taken whole; entries are never mixed between sides.
    - number, series, type, items count, address, description: first
      truthy value, ``paddle`` first.

    NOTE(review): ``overall_confidence``, ``raw_text`` and ``ocr_engine``
    are not assigned here - confirm that ``ExtractionResult`` derives the
    overall confidence itself, otherwise merged results keep the default.
    """
    merged = ExtractionResult()

    # Nothing to reconcile unless both sides produced an extraction.
    if paddle is None:
        return merged if tesseract is None else tesseract
    if tesseract is None:
        return paddle

    print("[Merge] Comparing PaddleOCR vs Tesseract extractions...", flush=True)

    # --- Amount: confidence decides only when both sides found one ---
    amount_source = None
    if paddle.amount and tesseract.amount:
        if paddle.confidence_amount >= tesseract.confidence_amount:
            amount_source = paddle
            print(f"[Merge] Amount: PaddleOCR {paddle.amount} (conf: {paddle.confidence_amount:.0%})", flush=True)
        else:
            amount_source = tesseract
            print(f"[Merge] Amount: Tesseract {tesseract.amount} (conf: {tesseract.confidence_amount:.0%})", flush=True)
    elif paddle.amount:
        amount_source = paddle
    elif tesseract.amount:
        amount_source = tesseract
    if amount_source is not None:
        merged.amount = amount_source.amount
        merged.confidence_amount = amount_source.confidence_amount

    # --- Date: same scheme as the amount ---
    date_source = None
    if paddle.receipt_date and tesseract.receipt_date:
        if paddle.confidence_date >= tesseract.confidence_date:
            date_source = paddle
            print(f"[Merge] Date: PaddleOCR {paddle.receipt_date}", flush=True)
        else:
            date_source = tesseract
            print(f"[Merge] Date: Tesseract {tesseract.receipt_date}", flush=True)
    elif paddle.receipt_date:
        date_source = paddle
    elif tesseract.receipt_date:
        date_source = tesseract
    if date_source is not None:
        merged.receipt_date = date_source.receipt_date
        merged.confidence_date = date_source.confidence_date

    # --- Vendor: a company indicator (S.R.L., S.A., ...) outranks confidence ---
    paddle_has_indicator = self._has_company_indicator(paddle.partner_name)
    tesseract_has_indicator = self._has_company_indicator(tesseract.partner_name)

    vendor_source = None
    if paddle.partner_name and tesseract.partner_name:
        if paddle_has_indicator and not tesseract_has_indicator:
            vendor_source = paddle
            print(f"[Merge] Vendor: PaddleOCR '{paddle.partner_name}' (has company indicator)", flush=True)
        elif tesseract_has_indicator and not paddle_has_indicator:
            vendor_source = tesseract
            print(f"[Merge] Vendor: Tesseract '{tesseract.partner_name}' (has company indicator)", flush=True)
        elif paddle.confidence_vendor >= tesseract.confidence_vendor:
            vendor_source = paddle
            print(f"[Merge] Vendor: PaddleOCR '{paddle.partner_name}' (higher conf)", flush=True)
        else:
            vendor_source = tesseract
            print(f"[Merge] Vendor: Tesseract '{tesseract.partner_name}' (higher conf)", flush=True)
    elif paddle.partner_name:
        vendor_source = paddle
    elif tesseract.partner_name:
        vendor_source = tesseract
    if vendor_source is not None:
        merged.partner_name = vendor_source.partner_name
        merged.confidence_vendor = vendor_source.confidence_vendor

    # --- CUI (fiscal code): format validity decides, paddle wins ties ---
    paddle_cui_valid = self._is_valid_cui(paddle.cui)
    tesseract_cui_valid = self._is_valid_cui(tesseract.cui)

    if paddle.cui and tesseract.cui:
        if paddle_cui_valid and not tesseract_cui_valid:
            merged.cui = paddle.cui
            print(f"[Merge] CUI: PaddleOCR {paddle.cui} (valid format)", flush=True)
        elif tesseract_cui_valid and not paddle_cui_valid:
            merged.cui = tesseract.cui
            print(f"[Merge] CUI: Tesseract {tesseract.cui} (valid format)", flush=True)
        else:
            # Both valid or both invalid.
            merged.cui = paddle.cui
            print(f"[Merge] CUI: PaddleOCR {paddle.cui}", flush=True)
    elif paddle.cui:
        # Only one side has a CUI here, so validity cannot change the pick.
        merged.cui = paddle.cui
    elif tesseract.cui:
        merged.cui = tesseract.cui

    # --- TVA: take one side whole, the one with the larger summed amount ---
    tva_source = None
    if paddle.tva_entries and tesseract.tva_entries:
        paddle_total = sum(e.get('amount', Decimal('0')) for e in paddle.tva_entries)
        tesseract_total = sum(e.get('amount', Decimal('0')) for e in tesseract.tva_entries)

        if paddle_total >= tesseract_total:
            tva_source = paddle
            print(f"[Merge] TVA: PaddleOCR (total: {paddle_total})", flush=True)
        else:
            tva_source = tesseract
            print(f"[Merge] TVA: Tesseract (total: {tesseract_total})", flush=True)
    elif paddle.tva_entries:
        tva_source = paddle
    elif tesseract.tva_entries:
        tva_source = tesseract
    if tva_source is not None:
        merged.tva_entries = tva_source.tva_entries
        merged.tva_total = tva_source.tva_total

    # --- Remaining descriptive fields: first truthy value, paddle first ---
    for field_name in (
        'receipt_number',
        'receipt_series',
        'receipt_type',
        'items_count',
        'address',
        'description',
    ):
        setattr(
            merged,
            field_name,
            getattr(paddle, field_name) or getattr(tesseract, field_name),
        )

    return merged
_has_company_indicator(self, name: Optional[str]) -> bool: + """Check if vendor name has company type indicator (S.R.L., S.A., etc.)""" + if not name: + return False + name_upper = name.upper() + indicators = [ + r'\bS\.?\s*R\.?\s*L\.?\b', + r'\bS\.?\s*A\.?\b', + r'\bS\.?\s*N\.?\s*C\.?\b', + r'\bP\.?\s*F\.?\s*A\.?\b', + r'\bI\.?\s*I\.?\b', + r'\bHOLDING\b', + r'\bGROUP\b', + r'\bCOMPANY\b', + ] + for indicator in indicators: + if re.search(indicator, name_upper): + return True + return False + + def _is_valid_cui(self, cui: Optional[str]) -> bool: + """Validate CUI format: 6-10 digits.""" + if not cui: + return False + # Remove any RO prefix + cui_clean = re.sub(r'^RO', '', cui.upper()) + # Must be 6-10 digits + return bool(re.match(r'^\d{6,10}$', cui_clean)) + + def _is_extraction_complete(self, ext: ExtractionResult, min_confidence: float = 0.85) -> bool: + """ + Check if extraction has ALL required fields to skip further processing. + + Required for early exit (ALL must be true): + - Overall confidence >= 85% + - ALL 5 critical fields present: number, date, amount, TVA, CUI + """ + # Must have high confidence + if ext.overall_confidence < min_confidence: + print(f"[OCR] Confidence {ext.overall_confidence:.0%} < {min_confidence:.0%} - continuing", flush=True) + return False + + # Check all required fields + has_number = bool(ext.receipt_number) + has_date = bool(ext.receipt_date) + has_amount = bool(ext.amount) + has_tva = bool(ext.tva_total) or bool(ext.tva_entries) + has_cui = bool(ext.cui) + + missing = [] + if not has_number: missing.append("number") + if not has_date: missing.append("date") + if not has_amount: missing.append("amount") + if not has_tva: missing.append("TVA") + if not has_cui: missing.append("CUI") + + if missing: + print(f"[OCR] Missing: {', '.join(missing)} - continuing", flush=True) + return False + + print(f"[OCR] ✓ All 5 fields found with {ext.overall_confidence:.0%} confidence", flush=True) + return True + # Singleton instance 
ocr_service = OCRService() diff --git a/data-entry-app/frontend/src/components/ocr/OCRPreview.vue b/data-entry-app/frontend/src/components/ocr/OCRPreview.vue index c4087d0..fb058e4 100644 --- a/data-entry-app/frontend/src/components/ocr/OCRPreview.vue +++ b/data-entry-app/frontend/src/components/ocr/OCRPreview.vue @@ -106,14 +106,27 @@
-
{{ data.raw_text }}
/**
 * Map an OCR engine identifier to the CSS modifier class of its badge.
 * Exact pipeline identifiers are checked first, then generic engine names.
 * Returns an empty string for a missing or unrecognised engine.
 */
const getEngineClass = (engine) => {
  if (!engine) return ''
  switch (engine) {
    case 'paddle-light':
      return 'fast'
    case 'paddle-adaptive':
      return 'adaptive'
    case 'adaptive-full':
      return 'full'
    default:
      break
  }
  if (engine.includes('paddle')) return 'paddleocr'
  if (engine.includes('tesseract')) return 'tesseract'
  return ''
}

/**
 * Icon class for the engine badge: a bolt for the fast single-pass mode,
 * a cog for every other engine (and when no engine is reported).
 */
const getEngineIcon = (engine) => {
  return engine === 'paddle-light' ? 'pi pi-bolt' : 'pi pi-cog'
}

/**
 * Human-readable label for an OCR engine identifier.
 * Unknown identifiers are shown verbatim; a missing one yields ''.
 */
const getEngineLabel = (engine) => {
  if (!engine) return ''
  switch (engine) {
    case 'paddle-light':
      return 'Fast Mode (PaddleOCR)'
    case 'paddle-adaptive':
      return 'Adaptive (Paddle dual)'
    case 'adaptive-full':
      return 'Full Pipeline'
    default:
      break
  }
  if (engine.includes('paddle')) return 'PaddleOCR'
  if (engine.includes('tesseract')) return 'Tesseract'
  return engine
}

/**
 * CSS class for the result message, based on the mode named in its text.
 * Returns '' when the message is missing or names neither mode.
 */
const getMessageClass = (message) => {
  if (!message) return ''
  if (message.includes('fast mode')) return 'fast-mode'
  if (message.includes('full pipeline')) return 'full-pipeline'
  return ''
}

/**
 * Format a duration in milliseconds: "<n>ms" below one second,
 * otherwise seconds with one decimal ("1.5s").
 * Missing or non-numeric input yields '' instead of "NaNs" / "nullms".
 */
const formatProcessingTime = (ms) => {
  if (ms === null || ms === undefined) return ''
  const value = Number(ms)
  if (!Number.isFinite(value)) return ''
  if (value < 1000) return `${value}ms`
  return `${(value / 1000).toFixed(1)}s`
}