feat: Improve OCR adaptive pipeline with early exit and better pattern matching

- Add adaptive 3-step OCR pipeline with early exit when all 5 fields are found
- Add pattern for "C. I. F." with spaces (OCR artifact from PaddleOCR)
- Add pattern for the YYYY. MM. DD date format with spaces (OMV/Petrom receipts)
- Add pattern for "OTAL TAXE" (leading T cut off) and for the amount appearing before the label
- Make TVA rate pattern more flexible (code letter optional, handle "-21%")
- Replace logger.info with print(flush=True) for better debugging visibility
- Improve OCRPreview.vue to show extraction progress and raw OCR text

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-13 01:54:52 +02:00
parent 6c3dd89f6d
commit 9f06482681
9 changed files with 952 additions and 116 deletions
--- a/data-entry-app/backend/app/main.py
+++ b/data-entry-app/backend/app/main.py
@@ -1,10 +1,19 @@
 """FastAPI application entry point for Data Entry App."""
 import sys
 import logging
 import threading
 from pathlib import Path
 from contextlib import asynccontextmanager
 from fastapi import FastAPI
 # Configure logging to show INFO level messages
 logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%H:%M:%S'
 )
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
@@ -30,6 +39,18 @@ async def lifespan(app: FastAPI):
    settings.upload_path_resolved
    print(f"Upload path: {settings.upload_path_resolved}")
    # Pre-initialize OCR engine in background (PaddleOCR takes 15-20s)
    def init_ocr_background():
        """Warm up the OCR engine and report the outcome on stdout.

        Any failure (including a failed import) is reported as a warning
        instead of being propagated to the caller.
        """
        try:
            # Imported inside the function, so an import failure is also
            # handled by the except clause below.
            from app.services.ocr_service import ocr_service

            engine = ocr_service.ocr_engine
            engine._init_paddle_lazy()
        except Exception as e:
            print(f"Warning: OCR engine pre-load failed: {e}")
            return
        print("OCR engine ready")
    print("Starting OCR engine pre-load (background)...")
    threading.Thread(target=init_ocr_background, daemon=True).start()
    yield
    # Shutdown
--- a/data-entry-app/backend/app/routers/ocr.py
+++ b/data-entry-app/backend/app/routers/ocr.py
@@ -102,6 +102,8 @@ async def extract_from_image(file: UploadFile = File(...)):
            confidence_vendor=result.confidence_vendor,
            overall_confidence=result.overall_confidence,
            raw_text=result.raw_text,
            ocr_engine=result.ocr_engine,
            processing_time_ms=result.processing_time_ms,
        )
        return OCRResponse(success=True, message=message, data=data)
@@ -171,6 +173,8 @@ async def extract_from_attachment(
        confidence_vendor=result.confidence_vendor,
        overall_confidence=result.overall_confidence,
        raw_text=result.raw_text,
        ocr_engine=result.ocr_engine,
        processing_time_ms=result.processing_time_ms,
    )
    return OCRResponse(success=True, message=message, data=data)
--- a/data-entry-app/backend/app/schemas/ocr.py
+++ b/data-entry-app/backend/app/schemas/ocr.py
@@ -37,6 +37,8 @@ class ExtractionData(BaseModel):
    confidence_vendor: float = Field(default=0.0, ge=0, le=1, description="Vendor extraction confidence")
    overall_confidence: float = Field(default=0.0, ge=0, le=1, description="Overall confidence score")
    raw_text: str = Field(default="", description="Raw OCR text")
    ocr_engine: str = Field(default="", description="OCR engine used: paddleocr or tesseract")
    processing_time_ms: int = Field(default=0, ge=0, description="Processing time in milliseconds")
    class Config:
        """Pydantic config."""
--- a/data-entry-app/backend/app/services/image_preprocessor.py
+++ b/data-entry-app/backend/app/services/image_preprocessor.py
@@ -23,37 +23,57 @@ class ImagePreprocessor:
            raise ValueError(f"Could not load image: {path}")
        return image
-    def pdf_to_images(self, path: Path, dpi: int = 400) -> List[np.ndarray]:
+    def pdf_to_images(self, path: Path, dpi: int = 300) -> List[np.ndarray]:
        """
-        Convert PDF to images with high DPI for better OCR.
+        Convert PDF to images.
        Args:
            path: Path to PDF file
-            dpi: Resolution (400 recommended for receipts, higher = better quality but slower)
+            dpi: Resolution (300 = fast & good quality, 400 = better but slower)
        """
        if not PDF_AVAILABLE:
            raise RuntimeError("pdf2image not available. Install with: pip install pdf2image")
        # Render at the requested DPI (default 300; pass 400 for better text recognition on thermal receipts)
        images = pdf2image.convert_from_path(str(path), dpi=dpi)
        return [np.array(img) for img in images]
    def preprocess(self, image: np.ndarray, high_quality: bool = True) -> np.ndarray:
        """
-        Apply preprocessing pipeline for thermal receipt images.
+        Apply LIGHT preprocessing - better for clear PDFs.
        Heavy binarization can destroy text on clear images.
        """
        return self.preprocess_light(image)
-        Pipeline:
+    def preprocess_light(self, image: np.ndarray) -> np.ndarray:
-        1. Convert to grayscale
+        """
-        2. Resize if too small (min 1500px width for high quality)
+        Light preprocessing for CLEAR images (PDFs, good scans).
-        3. Deskew (straighten rotated text)
+        Preserves original quality, only enhances contrast.
-        4. Contrast enhancement (CLAHE)
+        """
-        5. Denoise (Non-local means)
+        # 1. Grayscale
-        6. Sharpening (for clearer text edges)
+        if len(image.shape) == 3:
-        7. Adaptive thresholding (binarization)
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-        8. Morphological operations (connect broken chars)
+        else:
            gray = image.copy()
-        Args:
+        # 2. Resize if too small
-            image: Input image (BGR or grayscale)
+        height, width = gray.shape
-            high_quality: If True, apply more aggressive preprocessing
+        if width < 1500:
            scale = 1500 / width
            gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
        # 3. Deskew
        gray = self._deskew(gray)
        # 4. Light contrast enhancement only
        clahe = cv2.createCLAHE(clipLimit=1.5, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)
        # NO binarization, NO morphological ops - preserve original quality
        return enhanced
    def preprocess_heavy(self, image: np.ndarray) -> np.ndarray:
        """
        Heavy preprocessing for FADED thermal receipts.
        Aggressive binarization to recover faded text.
        """
        # 1. Grayscale
        if len(image.shape) == 3:
@@ -63,57 +83,48 @@ class ImagePreprocessor:
        # 2. Resize if too small (larger = better OCR)
        height, width = gray.shape
-        min_width = 1500 if high_quality else 1000
+        if width < 1500:
-        if width < min_width:
+            scale = 1500 / width
-            scale = min_width / width
+            gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
            gray = cv2.resize(
                gray, None, fx=scale, fy=scale,
                interpolation=cv2.INTER_CUBIC
            )
        # 3. Deskew
        gray = self._deskew(gray)
-        # 4. Contrast enhancement with CLAHE (Contrast Limited Adaptive Histogram Equalization)
+        # 4. Contrast enhancement with CLAHE
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)
-        # 5. Denoise (slightly less aggressive to preserve text details)
+        # 5. Denoise
-        denoised = cv2.fastNlMeansDenoising(
+        denoised = cv2.fastNlMeansDenoising(enhanced, h=8, templateWindowSize=7, searchWindowSize=21)
            enhanced, h=8,  # Lower h = preserve more details
            templateWindowSize=7,
            searchWindowSize=21
        )
-        # 6. Sharpening to enhance text edges
+        # 6. Sharpening
-        if high_quality:
+        gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0)
-            # Unsharp mask for better text clarity
+        sharpened = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0)
            gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0)
            sharpened = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0)
        else:
            sharpened = denoised
-        # 7. Adaptive thresholding with optimized parameters
+        # 7. Adaptive thresholding (binarization)
        binary = cv2.adaptiveThreshold(
            sharpened, 255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
-            blockSize=11,  # Smaller block = better for small text
+            blockSize=11, C=5
            C=5  # Lower C = darker result, better for faded receipts
        )
        # 8. Morphological operations
        # Close small gaps in characters
        kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
        result = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel_close)
        # Optional: Remove small noise spots
        if high_quality:
            kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
            result = cv2.morphologyEx(result, cv2.MORPH_OPEN, kernel_open)
        return result
    def get_all_variants(self, image: np.ndarray) -> List[np.ndarray]:
        """
        Build the two preprocessing variants used for OCR (fast mode).

        Args:
            image: Input image to preprocess.

        Returns:
            A two-element list: the output of ``preprocess_light`` followed by
            the output of ``preprocess_heavy``, both computed from ``image``.
        """
        # Light variant is produced first, then the heavy one.
        light_processed = self.preprocess_light(image)
        heavy_processed = self.preprocess_heavy(image)
        return [light_processed, heavy_processed]
    def _deskew(self, image: np.ndarray) -> np.ndarray:
        """Correct image rotation/skew using Hough lines."""
        edges = cv2.Canny(image, 50, 150, apertureSize=3)
--- a/data-entry-app/backend/app/services/ocr_engine.py
+++ b/data-entry-app/backend/app/services/ocr_engine.py
@@ -1,11 +1,16 @@
 """OCR engine wrapper for PaddleOCR and Tesseract."""
 import os
 import logging
 from dataclasses import dataclass
-from typing import List, Optional
+from typing import List, Optional, Tuple
 import numpy as np
 # Setup logging
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)  # Ensure logs are visible
 # Disable PaddleOCR model source check for faster startup (PaddleX 3.x)
 os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
@@ -40,6 +45,7 @@ class OCRResult:
    text: str
    confidence: float
    boxes: List[dict]
    engine: str = ""  # OCR engine used: paddleocr or tesseract
 class OCREngine:
@@ -65,8 +71,9 @@ class OCREngine:
                print("Initializing PaddleOCR engine...")
                # PaddleOCR 3.x API - optimized for Romanian receipts
                # Note: 'latin' not available in PaddleOCR 3.x, 'en' works well for receipts
                self._paddle = PaddleOCR(
-                    lang='en',  # 'en' works better than 'ro' for mixed alphanumeric
+                    lang='en',  # 'en' handles Latin alphabet well for receipts
                    # High quality settings for better accuracy
                    det_db_thresh=0.3,      # Lower threshold = detect more text (default 0.3)
                    det_db_box_thresh=0.5,  # Box confidence threshold (default 0.5)
@@ -81,14 +88,19 @@ class OCREngine:
    def recognize(self, image: np.ndarray) -> OCRResult:
        """
        Run OCR on a preprocessed image with the best engine available.

        PaddleOCR is used when it is available and initialized; otherwise
        Tesseract is used when available.

        Args:
            image: Preprocessed image array.

        Returns:
            The OCRResult produced by the selected engine.

        Raises:
            RuntimeError: If neither PaddleOCR nor Tesseract can be used.
        """
        logger.info(f"[OCR] Starting recognition, image shape: {image.shape}, dtype: {image.dtype}")

        # PaddleOCR is initialized lazily, on the first recognition request
        self._init_paddle_lazy()

        # Preferred engine
        if PADDLE_AVAILABLE and self._paddle:
            logger.info("[OCR] Using PaddleOCR engine")
            return self._paddle_recognize(image)

        # Secondary engine
        if TESSERACT_AVAILABLE:
            logger.info("[OCR] Using Tesseract engine (PaddleOCR not available)")
            return self._tesseract_recognize(image)

        # Nothing usable is installed
        logger.error("[OCR] No OCR engine available!")
        raise RuntimeError(
            "No OCR engine available. Install PaddleOCR or Tesseract."
        )
@@ -96,17 +108,23 @@ class OCREngine:
    def _paddle_recognize(self, image: np.ndarray) -> OCRResult:
        """Recognize text using PaddleOCR 3.x API."""
        try:
            logger.info(f"[PaddleOCR] Processing image, shape: {image.shape}")
            # PaddleOCR 3.x requires 3-channel images
            if len(image.shape) == 2:
                # Convert grayscale to 3-channel BGR
                import cv2
                image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
                logger.info(f"[PaddleOCR] Converted to BGR, new shape: {image.shape}")
            # PaddleOCR 3.x uses predict() with new parameter names
            logger.info("[PaddleOCR] Calling predict()...")
            result = self._paddle.predict(image, use_textline_orientation=True)
            logger.info(f"[PaddleOCR] predict() returned, result type: {type(result)}")
            if not result or len(result) == 0:
-                return OCRResult(text="", confidence=0.0, boxes=[])
+                logger.warning("[PaddleOCR] No results returned")
                return OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")
            # PaddleOCR 3.x returns OCRResult objects with different structure
            ocr_result = result[0]
@@ -117,7 +135,7 @@ class OCREngine:
            dt_polys = ocr_result.get('dt_polys', [])
            if not rec_texts:
-                return OCRResult(text="", confidence=0.0, boxes=[])
+                return OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")
            boxes = []
            for i, text in enumerate(rec_texts):
@@ -130,13 +148,17 @@ class OCREngine:
                })
            avg_conf = sum(rec_scores) / len(rec_scores) if rec_scores else 0.0
            text_result = '\n'.join(rec_texts)
            logger.info(f"[PaddleOCR] SUCCESS - Found {len(rec_texts)} text lines, avg confidence: {avg_conf:.2%}")
            logger.debug(f"[PaddleOCR] Raw text preview: {text_result[:200]}...")
            return OCRResult(
-                text='\n'.join(rec_texts),
+                text=text_result,
                confidence=float(avg_conf),
-                boxes=boxes
+                boxes=boxes,
                engine="paddleocr"
            )
        except Exception as e:
-            print(f"PaddleOCR error: {e}, falling back to Tesseract")
+            logger.error(f"[PaddleOCR] ERROR: {e}, falling back to Tesseract")
            if TESSERACT_AVAILABLE:
                return self._tesseract_recognize(image)
            raise
@@ -145,23 +167,70 @@ class OCREngine:
        """Recognize text using Tesseract."""
        global pytesseract
        logger.info(f"[Tesseract] Processing image, shape: {image.shape}")
        # Lazy import pytesseract
        if pytesseract is None:
-            print("Importing pytesseract...")
+            logger.info("[Tesseract] Importing pytesseract...")
            import pytesseract as _pytesseract
            pytesseract = _pytesseract
-        config = '--psm 6 -l ron+eng'
+        # PSM 4: Single column (best for receipts)
        config = '--psm 4 -l ron+eng'
        text = pytesseract.image_to_string(image, config=config)
        data = pytesseract.image_to_data(
            image, config=config,
            output_type=pytesseract.Output.DICT
        )
        # Quick confidence estimate
        data = pytesseract.image_to_data(image, config=config, output_type=pytesseract.Output.DICT)
        confidences = [int(c) for c in data['conf'] if int(c) > 0]
        avg_conf = sum(confidences) / len(confidences) / 100 if confidences else 0.0
-        return OCRResult(text=text, confidence=avg_conf, boxes=[])
+        logger.info(f"[Tesseract] Done: {len(text)} chars, conf: {avg_conf:.2%}")
        return OCRResult(text=text, confidence=avg_conf, boxes=[], engine="tesseract")
    def recognize_dual(self, image: np.ndarray) -> Tuple[OCRResult, Optional[OCRResult]]:
        """
        Run every available OCR engine on the same image.

        Each engine is attempted independently: an engine that raises is
        logged and contributes an empty result instead of aborting the call.

        Args:
            image: Image array to recognize.

        Returns:
            Tuple of (paddle_result, tesseract_result).
            When PaddleOCR is not usable, the Tesseract result is returned in
            both positions. tesseract_result is None if Tesseract is not
            available.

        Raises:
            RuntimeError: If no engine produced a result object.
        """
        logger.info(f"[OCR Dual] Starting dual recognition, image shape: {image.shape}")

        # PaddleOCR is initialized lazily
        self._init_paddle_lazy()

        paddle_result = None
        tesseract_result = None

        # --- PaddleOCR pass ---
        if PADDLE_AVAILABLE and self._paddle:
            try:
                logger.info("[OCR Dual] Running PaddleOCR...")
                paddle_result = self._paddle_recognize(image)
                logger.info(f"[OCR Dual] PaddleOCR: {len(paddle_result.text)} chars, conf: {paddle_result.confidence:.2%}")
            except Exception as e:
                # Keep going: substitute an empty result for the failed engine
                logger.error(f"[OCR Dual] PaddleOCR failed: {e}")
                paddle_result = OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")

        # --- Tesseract pass ---
        if TESSERACT_AVAILABLE:
            try:
                logger.info("[OCR Dual] Running Tesseract...")
                tesseract_result = self._tesseract_recognize(image)
                logger.info(f"[OCR Dual] Tesseract: {len(tesseract_result.text)} chars, conf: {tesseract_result.confidence:.2%}")
            except Exception as e:
                # Same policy as above: an empty result stands in for the failure
                logger.error(f"[OCR Dual] Tesseract failed: {e}")
                tesseract_result = OCRResult(text="", confidence=0.0, boxes=[], engine="tesseract")

        # PaddleOCR did not run: promote the Tesseract result, or give up
        if paddle_result is None:
            if not tesseract_result:
                raise RuntimeError("No OCR engine available")
            paddle_result = tesseract_result

        return paddle_result, tesseract_result
    @staticmethod
    def get_available_engines() -> List[str]:
--- a/data-entry-app/backend/app/services/ocr_extractor.py
+++ b/data-entry-app/backend/app/services/ocr_extractor.py
@@ -28,6 +28,8 @@ class ExtractionResult:
    confidence_date: float = 0.0
    confidence_vendor: float = 0.0
    raw_text: str = ""
    ocr_engine: str = ""  # OCR engine used: paddleocr or tesseract
    processing_time_ms: int = 0  # Processing time in milliseconds
    @property
    def overall_confidence(self) -> float:
@@ -70,6 +72,7 @@ class ReceiptExtractor:
    # Date patterns - support dash, dot, and slash separators
    # OCR may produce DRTA instead of DATA, DAIA, etc.
    # OCR may also add spaces/commas in dates: "27. 10, 2025" instead of "27.10.2025"
    DATE_PATTERNS = [
        # DATA/DRTA/DAIA: DD-MM-YYYY (OCR tolerant)
        (r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
@@ -84,6 +87,19 @@ class ReceiptExtractor:
        (r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75),
    ]
    # OCR-corrupted date patterns with spaces/commas
    # Handles: "27. 10, 2025", "27, 10. 2025", "2025. 08. 14", etc.
    DATE_PATTERNS_OCR_SPACES = [
        # YYYY. MM. DD format with spaces (OMV/Petrom receipts) - with time
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})\s+\d{2}:\d{2}', 0.92, 'ymd'),
        # YYYY. MM. DD format with spaces (standalone)
        (r'(\d{4})[.,]\s*(\d{2})[.,]\s*(\d{2})', 0.85, 'ymd'),
        # DD. MM, YYYY or DD, MM. YYYY (with time following)
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})\s+\d{2}:\d{2}', 0.92, 'dmy'),
        # DD. MM, YYYY or DD, MM. YYYY (standalone)
        (r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
    ]
    # Receipt number patterns - Romanian fiscal receipt formats
    # OCR may produce N instead of : or other errors
    NUMBER_PATTERNS = [
@@ -127,12 +143,23 @@ class ReceiptExtractor:
        (r'\bC[I1]F\s*:?\s*(?:R[O0])?(\d{6,10})\b', 0.90),
        # COD FISCAL (vendor)
        (r'COD\s+FISCAL\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
-        # C.I.F. format (with dots)
+        # C. I. F. format with SPACES (OCR artifact) - "C. I. F. : R011201891"
        (r'C\.\s*I\.\s*F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.92),
        # C.I.F. format (with dots, no spaces)
        (r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.88),
        # CUI format (less specific, use with caution)
        (r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.85),
    ]
    # Pattern for CIF NUMBER appearing BEFORE "C.I.F." label (reversed format)
    # Common in some receipts: "R011201891\nC. I. F." - number on line before label
    CUI_REVERSED_PATTERNS = [
        # RO + 6-10 digits on line immediately before C.I.F./CIF label
        (r'(?:R[O0])(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.98),
        # Just digits before C.I.F. label
        (r'(\d{6,10})\s*\n\s*C\.?\s*I\.?\s*F\.?', 0.95),
    ]
    # Series patterns - be strict to avoid false matches
    SERIES_PATTERNS = [
        (r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90),
@@ -158,6 +185,7 @@ class ReceiptExtractor:
    # Items count patterns - OCR may produce OZ instead of POZ, etc.
    # Number may be on separate line before or after the label
    # IMPORTANT: Must be specific to avoid matching product quantities like "50BUC"
    ITEMS_COUNT_PATTERNS = [
        # NR. POZ. ART. IN BON: 17 (Romanian format with dots and spaces)
        # OCR tolerant: OZ instead of POZ, ARI instead of ART
@@ -167,11 +195,10 @@ class ReceiptExtractor:
        # Number may be on next line after label
        (r'[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.93),
        (r'NR\.?\s*(?:P?[O0]Z\.?)?\s*ART(?:ICOLE)?\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.90),
-        # Simpler patterns
+        # Simpler patterns - but more specific
        (r'ARTIC[O0]LE\s*:?\s*(\d+)', 0.88),
-        (r'P?[O0]Z\s*:?\s*(\d+)', 0.85),
+        # POZ at start of line or after colon (not in product descriptions)
-        # X articole/pozitii
+        (r'(?:^|\s|:)P?[O0]Z\.?\s*(?:ART)?\.?\s*(?:IN\s+BON)?\s*:?\s*(\d{1,3})(?:\s|$)', 0.85),
        (r'(\d+)\s*(?:ARTIC[O0]LE|P[O0]ZITII|BUC)', 0.80),
    ]
    # Address patterns (Romanian format)
@@ -183,20 +210,21 @@ class ReceiptExtractor:
    ]
    # Vendor name indicators (lines containing these are likely vendor names)
    # These should be company type suffixes, not generic words
    # Patterns must handle OCR spaces: "S. R. L." as well as "S.R.L."
    VENDOR_INDICATORS = [
-        r'\bS\.?R\.?L\.?\b',      # S.R.L.
+        r'\bS\.?\s*R\.?\s*L\.?\b',      # S.R.L. or S. R. L.
-        r'\bS\.?A\.?\b',          # S.A.
+        r'\bS\.?\s*A\.?\b',              # S.A. or S. A.
-        r'\bS\.?N\.?C\.?\b',      # S.N.C.
+        r'\bS\.?\s*N\.?\s*C\.?\b',      # S.N.C. or S. N. C.
-        r'\bS\.?C\.?S\.?\b',      # S.C.S.
+        r'\bS\.?\s*C\.?\s*S\.?\b',      # S.C.S. or S. C. S.
-        r'\bI\.?I\.?\b',          # I.I. (Individual)
+        r'\bI\.?\s*I\.?\b',              # I.I. or I. I.
-        r'\bP\.?F\.?A\.?\b',      # P.F.A.
+        r'\bP\.?\s*F\.?\s*A\.?\b',      # P.F.A. or P. F. A.
-        r'\bS\.?C\.?\b',          # S.C.
+        # S.C. alone is too short and generic - only match if followed by company name
        r'\bS\.?\s*C\.?\s+[A-Z]',       # S.C. followed by company name
        r'HOLDING',
        r'COMPANY',
        r'GROUP',
-        r'MAGAZIN',
+        # Removed: MAGAZIN, MARKET, SHOP - too generic, match store welcome messages
        r'MARKET',
        r'SHOP',
    ]
    def extract(self, text: str) -> ExtractionResult:
@@ -215,6 +243,14 @@ class ReceiptExtractor:
        # Extract additional fields - Multiple TVA entries
        result.tva_entries, result.tva_total = self._extract_tva_entries(text_upper)
        if not result.tva_entries:
            print(f"[TVA Debug] No TVA found. Checking patterns...", flush=True)
            # Debug: show what patterns see
            import re
            normalized = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text_upper)
            taxe_match = re.search(r'T?OTAL\s+TAXE', normalized, re.IGNORECASE)
            rev_match = re.search(r'([\d.,]+)\s*T?OTAL\s+TAXE', normalized, re.IGNORECASE)
            print(f"[TVA Debug] 'OTAL TAXE' found: {bool(taxe_match)}, reversed: {rev_match.group(1) if rev_match else None}", flush=True)
        result.items_count = self._extract_items_count(text_upper)
        result.address = self._extract_address(text_upper)
@@ -334,6 +370,7 @@ class ReceiptExtractor:
    def _extract_date(self, text: str) -> Tuple[Optional[date], float]:
        """Extract receipt date from text."""
        # First try standard patterns (clean dates)
        for pattern, confidence in self.DATE_PATTERNS:
            match = re.search(pattern, text)
            if match:
@@ -354,6 +391,34 @@ class ReceiptExtractor:
                        return parsed, confidence
                except ValueError:
                    continue
        # Then try OCR-corrupted patterns (dates with spaces/commas)
        # Handles: "27. 10, 2025", "27, 10. 2025", "2025. 08. 14", etc.
        for pattern, confidence, fmt in self.DATE_PATTERNS_OCR_SPACES:
            match = re.search(pattern, text)
            if match:
                try:
                    if fmt == 'ymd':
                        # YYYY. MM. DD format (OMV/Petrom)
                        year = match.group(1)
                        month = match.group(2)
                        day = match.group(3)
                    else:
                        # DD. MM. YYYY format (default)
                        day = match.group(1)
                        month = match.group(2)
                        year = match.group(3)
                    date_str = f"{day}.{month}.{year}"
                    parsed = datetime.strptime(date_str, '%d.%m.%Y').date()
                    # Validate date range
                    today = date.today()
                    if parsed <= today and parsed.year >= 2020:
                        return parsed, confidence
                except ValueError:
                    continue
        return None, 0.0
    def _extract_number(self, text: str) -> Tuple[Optional[str], float]:
@@ -377,8 +442,9 @@ class ReceiptExtractor:
        Extract vendor/partner name from text.
        Uses multiple strategies:
        1. Look for lines with company type indicators (S.R.L., S.A., etc.)
-        2. Look for lines near CIF
+        2. Look for company name + SRL on separate lines
-        3. Use first valid line as fallback
+        3. Look for lines near CIF
        4. Use first valid line as fallback
        """
        lines = text.split('\n')
        skip_keywords = [
@@ -388,9 +454,37 @@ class ReceiptExtractor:
            'OPERATOR', 'CASIER', 'POS', 'AMEF', 'BINE ATI VENIT',
            'VA RUGAM', 'PASTRATI', 'VOCEA', 'TIPARIT',
            'DETERGENT', 'PROSOP', 'HARTIE', 'SACI', 'SPRAY',
-            'BUC', 'ROLA', 'CUMPARATOR'
+            'BUC', 'ROLA', 'CUMPARATOR', 'MAGAZIN', 'BRICK',
            'NIVS', 'BENZINA', 'PETROM', 'OMV'
        ]
        # Strategy 0: Look for company name followed by SRL/SA on next line
        # Pattern: "COMPANY NAME\nSRL" or "COMPANY NAME\nS.R.L."
        for i, line in enumerate(lines[:15]):
            line = line.strip()
            if not line or len(line) < 3:
                continue
            line_upper = line.upper()
            # Skip lines with skip keywords
            if any(kw in line_upper for kw in skip_keywords):
                continue
            # Check if next line is standalone SRL, S.R.L., SA, S.A., etc.
            if i + 1 < len(lines):
                next_line = lines[i + 1].strip().upper()
                # Match standalone company type suffix
                if re.match(r'^S\.?\s*R\.?\s*L\.?$', next_line) or \
                   re.match(r'^S\.?\s*A\.?$', next_line) or \
                   re.match(r'^S\.?\s*N\.?\s*C\.?$', next_line) or \
                   re.match(r'^P\.?\s*F\.?\s*A\.?$', next_line) or \
                   re.match(r'^I\.?\s*I\.?$', next_line):
                    # Combine: "COMPANY NAME" + " " + "SRL"
                    vendor = self._clean_vendor_name(f"{line} {next_line}")
                    if vendor and len(vendor) >= 5:
                        return vendor, 0.95
        # Strategy 1: Look for lines with vendor indicators (S.R.L., S.A., HOLDING, etc.)
        for i, line in enumerate(lines[:15]):  # Check first 15 lines
            line = line.strip()
@@ -476,7 +570,22 @@ class ReceiptExtractor:
        Extract vendor CUI (fiscal identification code) from text.
        Excludes CLIENT CUI which appears as 'CLIENT C.U.I./C.I.F.:...'
        """
-        # First, try to find CIF on a line that doesn't contain CLIENT
+        # Strategy 0: Check for reversed format (CIF NUMBER on line BEFORE "C.I.F." label)
        # This is common in some receipts: "R011201891\nC. I. F."
        for pattern, confidence in self.CUI_REVERSED_PATTERNS:
            match = re.search(pattern, text_upper, re.IGNORECASE | re.MULTILINE)
            if match:
                cui = match.group(1)
                if 6 <= len(cui) <= 10:
                    # Verify this is not the CLIENT CUI by checking context
                    start = match.start()
                    # Check 50 chars before the match for CLIENT keyword
                    context_start = max(0, start - 50)
                    context = text_upper[context_start:start]
                    if 'CLIENT' not in context and 'LIENT' not in context:
                        return cui, confidence
        # Strategy 1: Try to find CIF on a line that doesn't contain CLIENT
        lines = text_upper.split('\n')
        for line in lines:
            # Skip lines that contain CLIENT (these are buyer's CUI, not vendor's)
@@ -491,7 +600,7 @@ class ReceiptExtractor:
                    if 6 <= len(cui) <= 10:
                        return cui, confidence
-        # Fallback: search entire text but exclude CLIENT patterns
+        # Strategy 2: Fallback - search entire text but exclude CLIENT patterns
        for pattern, confidence in self.CUI_PATTERNS:
            # Find all matches
            for match in re.finditer(pattern, text_upper, re.IGNORECASE | re.MULTILINE):
@@ -523,8 +632,94 @@ class ReceiptExtractor:
        tva_entries = []
        seen_entries = set()  # To avoid duplicates
-        # Normalize spaces in numbers first (OCR may produce "32. 31")
+        # Check for non-VAT payer (NEPLATITOR DE TVA) - TVA = 0
        # OCR variants: NEPLATITOR, NEPLATTOR, NEPLATOR, ANEPLATHTOR, MEPLATITOR, etc.
        # Also handles: "TOTAL NEPLATITOR TVA", "(NEPLATITOR DE TVA)"
        non_vat_patterns = [
            # Main pattern - flexible for OCR errors: NEPLAT + any chars + OR/R
            r'NEPLAT\w*OR',           # NEPLATITOR, NEPLATTOR, NEPLATOR
            r'[ANM]EPLAT\w*O?R',      # OCR errors: ANEPLATHTOR, MEPLATITOR
            r'TOTAL\s+NEPLAT',        # TOTAL NEPLATITOR...
            r'TOTAL\s+[ANM]EPLAT',    # TOTAL ANEPLAT... (OCR error)
            r'SCUTIT\s*(?:DE\s+)?T[VU]A',  # SCUTIT DE TVA
            r'NEPLAT\w*\s+T[VU]A',    # NEPLATITOR TVA
            r'NEPLAT\w*\s+DE\s+T',    # NEPLATITOR DE T... (truncated)
        ]
        for pattern in non_vat_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                # Non-VAT payer - return TVA = 0
                return [{'code': 'D', 'percent': 0, 'amount': Decimal('0.00')}], Decimal('0.00')
        # Normalize spaces in numbers first (OCR may produce "32. 31" or "49, 58")
        normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text)
        # Also normalize comma followed by space to comma (for "21, 00%" -> "21,00%")
        normalized_text = re.sub(r'(\d+),\s+(\d{2})\s*%', r'\1.\2%', normalized_text)
        # Pattern 0a: First try to get TVA from "TOTAL TAXE:" which is most reliable
        # Format: "TOTAL TAXE: 55,22" - this is always the TVA amount
        # OCR may cut "T" producing "OTAL TAXE:" instead of "TOTAL TAXE:"
        # OCR may also put amount BEFORE "OTAL TAXE": "55,22OTAL TAXE:"
        total_taxe_pattern = r'T?OTAL\s+TAXE\s*:?\s*([\d\s.,]+)'
        taxe_match = re.search(total_taxe_pattern, normalized_text, re.IGNORECASE)
        # Also try pattern where amount comes BEFORE "OTAL TAXE" (OCR line break issue)
        if not taxe_match:
            reversed_taxe_pattern = r'([\d.,]+)\s*T?OTAL\s+TAXE'
            taxe_match = re.search(reversed_taxe_pattern, normalized_text, re.IGNORECASE)
        if taxe_match:
            # Also need to find the TVA rate from the table
            # Pattern handles: "A-21%", "-21,00%", "21%" etc.
            rate_pattern = r'([A-D])?\s*[-:]?\s*(\d{1,2})[.,]?\s*\d{0,2}\s*%'
            rate_match = re.search(rate_pattern, normalized_text, re.IGNORECASE)
            if rate_match:
                try:
                    code = rate_match.group(1).upper() if rate_match.group(1) else 'A'  # Default to A if missing
                    percent = int(rate_match.group(2))
                    amount_str = taxe_match.group(1).replace(' ', '')
                    amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
                    amount = Decimal(amount_str)
                    if amount > 0:
                        entry_key = (code, percent)
                        if entry_key not in seen_entries:
                            tva_entries.append({
                                'code': code,
                                'percent': percent,
                                'amount': amount
                            })
                            seen_entries.add(entry_key)
                except (ValueError, InvalidOperation):
                    pass
        # Pattern 0b: Table format "A-21,00%  285,66  49,58" (code-percent  base  tva_amount)
        # This format appears after a TVA header line like "TVA  TOTAL  VALDARE"
        # The TVA amount position depends on header: VALDARE last = TVA last, VALOARE middle = TVA middle
        if not tva_entries:
            table_pattern = r'([A-D])\s*[-:]\s*(\d{1,2})[.,]\s*\d{2}\s*%\s*([\d\s.,]+)\s+([\d\s.,]+)'
            for match in re.finditer(table_pattern, normalized_text, re.IGNORECASE):
                try:
                    code = match.group(1).upper()
                    percent = int(match.group(2))
                    amount1_str = match.group(3).replace(' ', '')
                    amount2_str = match.group(4).replace(' ', '')
                    amount1 = Decimal(self._normalize_number(re.sub(r'[^\d.,]', '', amount1_str)))
                    amount2 = Decimal(self._normalize_number(re.sub(r'[^\d.,]', '', amount2_str)))
                    # Determine which is TVA: the smaller amount is usually TVA
                    # (TVA is a fraction of the total, so it's always smaller)
                    tva_amount = min(amount1, amount2)
                    if tva_amount > 0:
                        entry_key = (code, percent)
                        if entry_key not in seen_entries:
                            tva_entries.append({
                                'code': code,
                                'percent': percent,
                                'amount': tva_amount
                            })
                            seen_entries.add(entry_key)
                except (ValueError, InvalidOperation):
                    continue
        # Pattern 1: "TVA A - 19%: 15.20" or "TVAA - 21% 32.31" (with code)
        # OCR tolerant: TUA, TVR, etc.
@@ -571,7 +766,75 @@ class ReceiptExtractor:
                except (ValueError, InvalidOperation):
                    continue
-        # Pattern 3: "TVAA - 21%" on one line, amount on next line
+        # Pattern 3: "TOTAL TVA A - 21%" with amount on same line or "TOTAL TVA BON" with amount
        if not tva_entries:
            # First try: "TOTAL TVA A - 21%  32.31" (amount on same line)
            tva_with_amount = r'TOTAL\s+T[VU][AR]\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*([\d.,]+)'
            for match in re.finditer(tva_with_amount, normalized_text, re.IGNORECASE):
                try:
                    code = match.group(1).upper()
                    percent = int(match.group(2))
                    amount_str = self._normalize_number(match.group(3))
                    amount = Decimal(amount_str)
                    if amount > 0:
                        entry_key = (code, percent)
                        if entry_key not in seen_entries:
                            tva_entries.append({
                                'code': code,
                                'percent': percent,
                                'amount': amount
                            })
                            seen_entries.add(entry_key)
                except (ValueError, InvalidOperation):
                    continue
        # Pattern 3b: "TOTAL TVA A - 21%" on one line, look for "TOTAL TVA BON" amount
        if not tva_entries:
            tva_total_pattern = r'TOTAL\s+T[VU][AR]\s+([A-D])\s*[-:]\s*(\d{1,2})\s*%'
            for match in re.finditer(tva_total_pattern, normalized_text, re.IGNORECASE):
                try:
                    code = match.group(1).upper()
                    percent = int(match.group(2))
                    # Look for "TOTAL TVA BON" followed by amount
                    tva_bon_pattern = r'TOTAL\s+T[VU][AR]\s+BON[:\s]*([\d.,]+)'
                    tva_bon_match = re.search(tva_bon_pattern, normalized_text, re.IGNORECASE)
                    if tva_bon_match:
                        amount_str = self._normalize_number(tva_bon_match.group(1))
                        amount = Decimal(amount_str)
                        if amount > 0:
                            entry_key = (code, percent)
                            if entry_key not in seen_entries:
                                tva_entries.append({
                                    'code': code,
                                    'percent': percent,
                                    'amount': amount
                                })
                                seen_entries.add(entry_key)
                            continue
                    # Fallback: Amount after TOTAL TVA BON on next line
                    tva_bon_pos = re.search(r'TOTAL\s+T[VU][AR]\s+BON', normalized_text, re.IGNORECASE)
                    if tva_bon_pos:
                        after_bon = normalized_text[tva_bon_pos.end():]
                        # Find first standalone number (likely TVA amount)
                        amount_match = re.search(r'[\s\n]*([\d]+[.,]\d{2})\s*\n', after_bon)
                        if amount_match:
                            amount_str = self._normalize_number(amount_match.group(1))
                            amount = Decimal(amount_str)
                            if amount > 0:
                                entry_key = (code, percent)
                                if entry_key not in seen_entries:
                                    tva_entries.append({
                                        'code': code,
                                        'percent': percent,
                                        'amount': amount
                                    })
                                    seen_entries.add(entry_key)
                except (ValueError, InvalidOperation):
                    continue
        # Pattern 3c: "TVAA - 21%" on one line, amount on next line (simpler format)
        if not tva_entries:
            tva_line_pattern = r'T[VU][AR]\s*([A-D])?\s*[-:]\s*(\d{1,2})\s*%'
            for match in re.finditer(tva_line_pattern, normalized_text, re.IGNORECASE):
--- a/data-entry-app/backend/app/services/ocr_service.py
+++ b/data-entry-app/backend/app/services/ocr_service.py
@@ -1,11 +1,16 @@
 """Main OCR service coordinating preprocessing, recognition, and extraction."""
 import os
 import re
 import logging
 # Disable PaddleOCR model source check for faster startup (PaddleX 3.x) - must be set before import
 os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
 import time
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
 from decimal import Decimal
 from pathlib import Path
 from typing import Optional, Tuple
@@ -13,6 +18,9 @@ from app.services.ocr_engine import OCREngine
 from app.services.ocr_extractor import ReceiptExtractor, ExtractionResult
 from app.services.image_preprocessor import ImagePreprocessor
 # Setup logging
 logger = logging.getLogger(__name__)
 class OCRService:
    """Service for OCR processing of receipt images."""
@@ -56,15 +64,18 @@ class OCRService:
        image_path: Path,
        mime_type: str
    ) -> Tuple[bool, str, Optional[ExtractionResult]]:
-        """Synchronous processing (runs in thread pool)."""
+        """Synchronous processing with ADAPTIVE OCR pipeline."""
-        # Handle PDF
+        start_time = time.time()
        print(f"[OCR Service] Starting processing: {image_path}, mime: {mime_type}", flush=True)
        # Load image
        if mime_type == 'application/pdf':
            try:
                images = self.preprocessor.pdf_to_images(image_path)
                if not images:
                    return False, "Failed to extract images from PDF", None
-                image = images[0]  # Process first page only
+                image = images[0]
            except RuntimeError as e:
                return False, str(e), None
        else:
@@ -73,38 +84,360 @@ class OCRService:
            except ValueError as e:
                return False, str(e), None
-        # Preprocess image
+        raw_texts = []
-        processed = self.preprocessor.preprocess(image)
+        extraction = None
        # ══════════════════════════════════════════════════════════════
        # STEP 1: PaddleOCR + Light (fastest, best for clear PDFs)
        # ══════════════════════════════════════════════════════════════
        print("=" * 60, flush=True)
        print("[OCR] STEP 1: PaddleOCR + Light preprocessing", flush=True)
        print("=" * 60, flush=True)
        light_img = self.preprocessor.preprocess_light(image)
        # Perform OCR
        try:
-            ocr_result = self.ocr_engine.recognize(processed)
+            paddle_light = self.ocr_engine._paddle_recognize(light_img)
-        except RuntimeError as e:
+            if paddle_light and paddle_light.text:
-            return False, str(e), None
+                extraction = self.extractor.extract(paddle_light.text)
                extraction.ocr_engine = "paddle-light"
                raw_texts.append(f"═══ PaddleOCR (light, conf: {paddle_light.confidence:.0%}) ═══\n{paddle_light.text}")
-        if not ocr_result.text:
+                # Log extraction results
-            return False, "No text detected in image", None
+                print(f"[OCR] Step 1 Results:", flush=True)
                print(f"  - OCR Confidence: {paddle_light.confidence:.0%}", flush=True)
                print(f"  - Amount: {extraction.amount}", flush=True)
                print(f"  - Date: {extraction.receipt_date}", flush=True)
                print(f"  - Number: {extraction.receipt_number}", flush=True)
                print(f"  - CUI: {extraction.cui}", flush=True)
                print(f"  - TVA: {extraction.tva_total} (entries: {len(extraction.tva_entries) if extraction.tva_entries else 0})", flush=True)
                print(f"  - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True)
-        # Extract structured fields
+                # Early exit if complete
-        extraction = self.extractor.extract(ocr_result.text)
+                if self._is_extraction_complete(extraction):
                    extraction.raw_text = "\n\n".join(raw_texts)
                    elapsed_ms = int((time.time() - start_time) * 1000)
                    extraction.processing_time_ms = elapsed_ms
                    print(f"[OCR] ✓✓✓ EARLY EXIT at Step 1 - All fields found! ({elapsed_ms}ms) ✓✓✓", flush=True)
                    return True, "OCR complete (fast mode)", extraction
                else:
                    print("[OCR] → Step 1 incomplete, continuing to Step 2...", flush=True)
        except Exception as e:
            print(f"[OCR] PaddleOCR light failed: {e}", flush=True)
            extraction = ExtractionResult()
        # ══════════════════════════════════════════════════════════════
        # STEP 2: PaddleOCR + Heavy (for faded thermal receipts)
        # ══════════════════════════════════════════════════════════════
        print("=" * 60, flush=True)
        print("[OCR] STEP 2: PaddleOCR + Heavy preprocessing", flush=True)
        print("=" * 60, flush=True)
        heavy_img = self.preprocessor.preprocess_heavy(image)
        try:
            paddle_heavy = self.ocr_engine._paddle_recognize(heavy_img)
            if paddle_heavy and paddle_heavy.text:
                extraction_heavy = self.extractor.extract(paddle_heavy.text)
                extraction_heavy.ocr_engine = "paddle-heavy"
                raw_texts.append(f"═══ PaddleOCR (heavy, conf: {paddle_heavy.confidence:.0%}) ═══\n{paddle_heavy.text}")
                print(f"[OCR] Step 2 (Heavy) Results:", flush=True)
                print(f"  - OCR Confidence: {paddle_heavy.confidence:.0%}", flush=True)
                print(f"  - Amount: {extraction_heavy.amount}", flush=True)
                print(f"  - Date: {extraction_heavy.receipt_date}", flush=True)
                print(f"  - CUI: {extraction_heavy.cui}", flush=True)
                # Merge with previous
                extraction = self._merge_extractions(extraction, extraction_heavy)
                print(f"[OCR] After merge:", flush=True)
                print(f"  - Amount: {extraction.amount}", flush=True)
                print(f"  - Date: {extraction.receipt_date}", flush=True)
                print(f"  - Number: {extraction.receipt_number}", flush=True)
                print(f"  - CUI: {extraction.cui}", flush=True)
                print(f"  - TVA: {extraction.tva_total}", flush=True)
                print(f"  - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True)
                if self._is_extraction_complete(extraction):
                    extraction.raw_text = "\n\n".join(raw_texts)
                    extraction.ocr_engine = "paddle-adaptive"
                    elapsed_ms = int((time.time() - start_time) * 1000)
                    extraction.processing_time_ms = elapsed_ms
                    print(f"[OCR] ✓✓✓ EARLY EXIT at Step 2 - All fields found after merge! ({elapsed_ms}ms) ✓✓✓", flush=True)
                    return True, "OCR complete (paddle dual)", extraction
                else:
                    print("[OCR] → Step 2 incomplete, continuing to Step 3 (Tesseract)...", flush=True)
        except Exception as e:
            print(f"[OCR] PaddleOCR heavy failed: {e}", flush=True)
        # ══════════════════════════════════════════════════════════════
        # STEP 3: Tesseract fallback
        # ══════════════════════════════════════════════════════════════
        print("=" * 60, flush=True)
        print("[OCR] STEP 3: Tesseract fallback", flush=True)
        print("=" * 60, flush=True)
        try:
            tesseract_result = self.ocr_engine._tesseract_recognize(light_img)
            if tesseract_result and tesseract_result.text:
                extraction_tess = self.extractor.extract(tesseract_result.text)
                extraction_tess.ocr_engine = "tesseract"
                raw_texts.append(f"═══ Tesseract (conf: {tesseract_result.confidence:.0%}) ═══\n{tesseract_result.text}")
                print(f"[OCR] Step 3 (Tesseract) Results:", flush=True)
                print(f"  - OCR Confidence: {tesseract_result.confidence:.0%}", flush=True)
                print(f"  - Amount: {extraction_tess.amount}", flush=True)
                print(f"  - Date: {extraction_tess.receipt_date}", flush=True)
                print(f"  - CUI: {extraction_tess.cui}", flush=True)
                extraction = self._merge_extractions(extraction, extraction_tess)
        except Exception as e:
            print(f"[OCR] Tesseract failed: {e}", flush=True)
        # Final result
        if extraction is None:
            return False, "No text detected", None
        extraction.raw_text = "\n\n".join(raw_texts)
        extraction.ocr_engine = "adaptive-full"
        # Build result message
        fields_found = []
-        if extraction.amount:
+        if extraction.amount: fields_found.append("amount")
-            fields_found.append("amount")
+        if extraction.receipt_date: fields_found.append("date")
-        if extraction.receipt_date:
+        if extraction.receipt_number: fields_found.append("number")
-            fields_found.append("date")
+        if extraction.cui: fields_found.append("CUI")
-        if extraction.partner_name:
+        if extraction.tva_total or extraction.tva_entries: fields_found.append("TVA")
            fields_found.append("vendor")
        if extraction.cui:
            fields_found.append("CUI")
        if extraction.receipt_number:
            fields_found.append("number")
-        message = f"OCR processing successful. Found: {', '.join(fields_found) or 'no fields'}"
+        message = f"OCR complete (full pipeline). Found: {', '.join(fields_found) or 'no fields'}"
        elapsed_ms = int((time.time() - start_time) * 1000)
        extraction.processing_time_ms = elapsed_ms
        print("=" * 60, flush=True)
        print(f"[OCR] FINAL RESULT (full pipeline) - {elapsed_ms}ms", flush=True)
        print("=" * 60, flush=True)
        print(f"  - Amount: {extraction.amount}", flush=True)
        print(f"  - Date: {extraction.receipt_date}", flush=True)
        print(f"  - Number: {extraction.receipt_number}", flush=True)
        print(f"  - CUI: {extraction.cui}", flush=True)
        print(f"  - TVA: {extraction.tva_total}", flush=True)
        print(f"  - Overall Confidence: {extraction.overall_confidence:.0%}", flush=True)
        print(f"  - Processing Time: {elapsed_ms}ms", flush=True)
        print(f"  - Message: {message}", flush=True)
        return True, message, extraction
    def _merge_extractions(
        self,
        paddle: Optional[ExtractionResult],
        tesseract: Optional[ExtractionResult]
    ) -> ExtractionResult:
        """
        Combine two extraction results field by field into a new result.

        The first argument wins every tie. Selection rules:
        - amount / date: when both sides have a value, the side with the
          higher field confidence wins; otherwise whichever side has one
        - vendor name: a name carrying a company indicator (S.R.L., S.A., ...)
          beats one without; otherwise the higher vendor confidence wins
        - CUI: a well-formed code beats a malformed one; otherwise the first
          argument's code is kept
        - TVA: entries and total are taken together from the side whose entry
          amounts sum higher (entries from both sides are never mixed)
        - remaining fields: first truthy value, first argument preferred

        The log lines label the first argument "PaddleOCR" and the second
        "Tesseract" regardless of which engines actually produced them.

        Args:
            paddle: Preferred extraction, or None when there is nothing to merge.
            tesseract: Secondary extraction, or None.

        Returns:
            An empty ExtractionResult when both arguments are None, the other
            argument unchanged when exactly one is None, otherwise a newly
            built merged result.
        """
        merged = ExtractionResult()

        # Nothing to compare when either side is absent
        if paddle is None:
            return merged if tesseract is None else tesseract
        if tesseract is None:
            return paddle

        print("[Merge] Comparing PaddleOCR vs Tesseract extractions...", flush=True)

        # --- Amount: higher confidence wins when both sides found one ---
        amount_from = None
        if paddle.amount and tesseract.amount:
            if paddle.confidence_amount >= tesseract.confidence_amount:
                amount_from = paddle
                print(f"[Merge] Amount: PaddleOCR {paddle.amount} (conf: {paddle.confidence_amount:.0%})", flush=True)
            else:
                amount_from = tesseract
                print(f"[Merge] Amount: Tesseract {tesseract.amount} (conf: {tesseract.confidence_amount:.0%})", flush=True)
        elif paddle.amount:
            amount_from = paddle
        elif tesseract.amount:
            amount_from = tesseract
        if amount_from is not None:
            merged.amount = amount_from.amount
            merged.confidence_amount = amount_from.confidence_amount

        # --- Date: same rule as amount ---
        date_from = None
        if paddle.receipt_date and tesseract.receipt_date:
            if paddle.confidence_date >= tesseract.confidence_date:
                date_from = paddle
                print(f"[Merge] Date: PaddleOCR {paddle.receipt_date}", flush=True)
            else:
                date_from = tesseract
                print(f"[Merge] Date: Tesseract {tesseract.receipt_date}", flush=True)
        elif paddle.receipt_date:
            date_from = paddle
        elif tesseract.receipt_date:
            date_from = tesseract
        if date_from is not None:
            merged.receipt_date = date_from.receipt_date
            merged.confidence_date = date_from.confidence_date

        # --- Vendor name: a company indicator outranks raw confidence ---
        paddle_has_indicator = self._has_company_indicator(paddle.partner_name)
        tesseract_has_indicator = self._has_company_indicator(tesseract.partner_name)
        vendor_from = None
        if paddle.partner_name and tesseract.partner_name:
            if paddle_has_indicator != tesseract_has_indicator:
                # Exactly one of the two names looks like a registered company
                if paddle_has_indicator:
                    vendor_from = paddle
                    print(f"[Merge] Vendor: PaddleOCR '{paddle.partner_name}' (has company indicator)", flush=True)
                else:
                    vendor_from = tesseract
                    print(f"[Merge] Vendor: Tesseract '{tesseract.partner_name}' (has company indicator)", flush=True)
            elif paddle.confidence_vendor >= tesseract.confidence_vendor:
                vendor_from = paddle
                print(f"[Merge] Vendor: PaddleOCR '{paddle.partner_name}' (higher conf)", flush=True)
            else:
                vendor_from = tesseract
                print(f"[Merge] Vendor: Tesseract '{tesseract.partner_name}' (higher conf)", flush=True)
        elif paddle.partner_name:
            vendor_from = paddle
        elif tesseract.partner_name:
            vendor_from = tesseract
        if vendor_from is not None:
            merged.partner_name = vendor_from.partner_name
            merged.confidence_vendor = vendor_from.confidence_vendor

        # --- CUI (fiscal code): a well-formed code beats a malformed one ---
        paddle_cui_valid = self._is_valid_cui(paddle.cui)
        tesseract_cui_valid = self._is_valid_cui(tesseract.cui)
        if paddle.cui and tesseract.cui:
            if paddle_cui_valid and not tesseract_cui_valid:
                merged.cui = paddle.cui
                print(f"[Merge] CUI: PaddleOCR {paddle.cui} (valid format)", flush=True)
            elif tesseract_cui_valid and not paddle_cui_valid:
                merged.cui = tesseract.cui
                print(f"[Merge] CUI: Tesseract {tesseract.cui} (valid format)", flush=True)
            else:
                # Equally valid (or equally invalid): keep the first argument's code
                merged.cui = paddle.cui
                print(f"[Merge] CUI: PaddleOCR {paddle.cui}", flush=True)
        elif paddle.cui:
            # Only one side has a code, so it is used whether or not it validates
            merged.cui = paddle.cui
        elif tesseract.cui:
            merged.cui = tesseract.cui

        # --- TVA: take entries and total together from the larger-sum side ---
        tva_from = None
        if paddle.tva_entries and tesseract.tva_entries:
            paddle_total = sum(entry.get('amount', Decimal('0')) for entry in paddle.tva_entries)
            tesseract_total = sum(entry.get('amount', Decimal('0')) for entry in tesseract.tva_entries)
            if paddle_total >= tesseract_total:
                tva_from = paddle
                print(f"[Merge] TVA: PaddleOCR (total: {paddle_total})", flush=True)
            else:
                tva_from = tesseract
                print(f"[Merge] TVA: Tesseract (total: {tesseract_total})", flush=True)
        elif paddle.tva_entries:
            tva_from = paddle
        elif tesseract.tva_entries:
            tva_from = tesseract
        if tva_from is not None:
            merged.tva_entries = tva_from.tva_entries
            merged.tva_total = tva_from.tva_total

        # --- Remaining fields: first truthy value, first argument preferred ---
        for field_name in (
            'receipt_number',
            'receipt_series',
            'receipt_type',
            'items_count',
            'address',
            'description',
        ):
            setattr(
                merged,
                field_name,
                getattr(paddle, field_name) or getattr(tesseract, field_name),
            )

        return merged
    def _has_company_indicator(self, name: Optional[str]) -> bool:
        """Check if vendor name has company type indicator (S.R.L., S.A., etc.)"""
        if not name:
            return False
        name_upper = name.upper()
        indicators = [
            r'\bS\.?\s*R\.?\s*L\.?\b',
            r'\bS\.?\s*A\.?\b',
            r'\bS\.?\s*N\.?\s*C\.?\b',
            r'\bP\.?\s*F\.?\s*A\.?\b',
            r'\bI\.?\s*I\.?\b',
            r'\bHOLDING\b',
            r'\bGROUP\b',
            r'\bCOMPANY\b',
        ]
        for indicator in indicators:
            if re.search(indicator, name_upper):
                return True
        return False
    def _is_valid_cui(self, cui: Optional[str]) -> bool:
        """Validate CUI format: 6-10 digits."""
        if not cui:
            return False
        # Remove any RO prefix
        cui_clean = re.sub(r'^RO', '', cui.upper())
        # Must be 6-10 digits
        return bool(re.match(r'^\d{6,10}$', cui_clean))
    def _is_extraction_complete(self, ext: ExtractionResult, min_confidence: float = 0.85) -> bool:
        """
        Check if extraction has ALL required fields to skip further processing.
        Required for early exit (ALL must be true):
        - Overall confidence >= 85%
        - ALL 5 critical fields present: number, date, amount, TVA, CUI
        """
        # Must have high confidence
        if ext.overall_confidence < min_confidence:
            print(f"[OCR] Confidence {ext.overall_confidence:.0%} < {min_confidence:.0%} - continuing", flush=True)
            return False
        # Check all required fields
        has_number = bool(ext.receipt_number)
        has_date = bool(ext.receipt_date)
        has_amount = bool(ext.amount)
        has_tva = bool(ext.tva_total) or bool(ext.tva_entries)
        has_cui = bool(ext.cui)
        missing = []
        if not has_number: missing.append("number")
        if not has_date: missing.append("date")
        if not has_amount: missing.append("amount")
        if not has_tva: missing.append("TVA")
        if not has_cui: missing.append("CUI")
        if missing:
            print(f"[OCR] Missing: {', '.join(missing)} - continuing", flush=True)
            return False
        print(f"[OCR] ✓ All 5 fields found with {ext.overall_confidence:.0%} confidence", flush=True)
        return True
 # Singleton instance
 ocr_service = OCRService()
--- a/data-entry-app/frontend/src/components/ocr/OCRPreview.vue
+++ b/data-entry-app/frontend/src/components/ocr/OCRPreview.vue
@@ -106,14 +106,27 @@
      <!-- Raw Text Toggle -->
      <div class="raw-text-section" v-if="data.raw_text">
-        <Button
+        <div class="raw-text-header">
-          :label="showRawText ? 'Ascunde text OCR' : 'Arata text OCR'"
+          <Button
-          :icon="showRawText ? 'pi pi-eye-slash' : 'pi pi-eye'"
+            :label="showRawText ? 'Ascunde text OCR' : 'Arata text OCR'"
-          severity="secondary"
+            :icon="showRawText ? 'pi pi-eye-slash' : 'pi pi-eye'"
-          size="small"
+            severity="secondary"
-          text
+            size="small"
-          @click="showRawText = !showRawText"
+            text
-        />
+            @click="showRawText = !showRawText"
          />
          <span v-if="data.ocr_engine" class="ocr-engine-badge" :class="getEngineClass(data.ocr_engine)">
            <i :class="getEngineIcon(data.ocr_engine)"></i>
            {{ getEngineLabel(data.ocr_engine) }}
          </span>
          <span v-if="data._ocr_message" class="ocr-message-badge" :class="getMessageClass(data._ocr_message)">
            {{ data._ocr_message }}
          </span>
          <span v-if="data.processing_time_ms" class="ocr-time-badge">
            <i class="pi pi-clock"></i>
            {{ formatProcessingTime(data.processing_time_ms) }}
          </span>
        </div>
        <div v-if="showRawText" class="raw-text">
          <pre>{{ data.raw_text }}</pre>
        </div>
@@ -168,6 +181,45 @@ const formatDate = (dateStr) => {
    year: 'numeric'
  })
 }
 // Resolve the CSS modifier class for the OCR engine badge.
 // Exact engine ids win; otherwise the id is classified by engine family
 // substring. Missing or unrecognised ids get no modifier ('').
 const getEngineClass = (engine) => {
  if (!engine) {
    return ''
  }
  const exactClasses = new Map([
    ['paddle-light', 'fast'],
    ['paddle-adaptive', 'adaptive'],
    ['adaptive-full', 'full'],
  ])
  if (exactClasses.has(engine)) {
    return exactClasses.get(engine)
  }
  // Order matters: 'paddle' is checked before 'tesseract'.
  const familyClasses = [
    ['paddle', 'paddleocr'],
    ['tesseract', 'tesseract'],
  ]
  for (const [fragment, cssClass] of familyClasses) {
    if (engine.includes(fragment)) {
      return cssClass
    }
  }
  return ''
 }
 // Pick the PrimeIcons class for the OCR engine badge: a lightning bolt for
 // the fast 'paddle-light' engine, a cog for everything else (including the
 // full pipeline and a missing engine id).
 const getEngineIcon = (engine) => {
  const isFastEngine = engine === 'paddle-light'
  return isFastEngine ? 'pi pi-bolt' : 'pi pi-cog'
 }
 // Human-readable label for the OCR engine badge.
 // Known engine ids map to fixed labels, other ids are labelled by engine
 // family, and anything unrecognised is displayed verbatim.
 const getEngineLabel = (engine) => {
  if (!engine) {
    return ''
  }
  const exactLabels = new Map([
    ['paddle-light', 'Fast Mode (PaddleOCR)'],
    ['paddle-adaptive', 'Adaptive (Paddle dual)'],
    ['adaptive-full', 'Full Pipeline'],
  ])
  if (exactLabels.has(engine)) {
    return exactLabels.get(engine)
  }
  // Order matters: 'paddle' is checked before 'tesseract'.
  const familyLabels = [
    ['paddle', 'PaddleOCR'],
    ['tesseract', 'Tesseract'],
  ]
  for (const [fragment, label] of familyLabels) {
    if (engine.includes(fragment)) {
      return label
    }
  }
  return engine
 }
 // Choose the CSS modifier class for the OCR message badge from the phrase
 // the message contains (case-sensitive); '' when nothing matches or the
 // message is missing.
 const getMessageClass = (message) => {
  if (message) {
    // First matching phrase wins, so 'fast mode' takes precedence.
    const phraseClasses = [
      ['fast mode', 'fast-mode'],
      ['full pipeline', 'full-pipeline'],
    ]
    const match = phraseClasses.find(([phrase]) => message.includes(phrase))
    if (match) {
      return match[1]
    }
  }
  return ''
 }
 // Format an OCR processing duration for the time badge.
 //
 // ms: duration in milliseconds (number, or a numeric string).
 // Returns whole milliseconds ("123ms") below one second, seconds with one
 // decimal ("1.5s") otherwise, and '' when the value is missing or not a
 // finite number.
 const formatProcessingTime = (ms) => {
  const duration = Number(ms)
  // Avoid rendering "nullms" / "NaNs" for missing or malformed timings.
  if (ms == null || !Number.isFinite(duration)) {
    return ''
  }
  // Round before choosing the unit so fractional timings never print long
  // decimals, and 999.6 reads as "1.0s" rather than "1000ms".
  const wholeMs = Math.round(duration)
  if (wholeMs < 1000) {
    return `${wholeMs}ms`
  }
  return `${(duration / 1000).toFixed(1)}s`
 }
 </script>
 <style scoped>
@@ -305,6 +357,82 @@ const formatDate = (dateStr) => {
  border-top: 1px dashed #86efac;
 }
 /* Header row of the raw-text section: a vertically centred flex row that
    wraps onto extra lines, with a fixed gap between its children. */
 .raw-text-header {
  display: flex;
  align-items: center;
  flex-wrap: wrap;
  gap: 0.75rem;
 }
 /* Base pill for the OCR engine badge (icon + label laid out inline). */
 .ocr-engine-badge {
  display: inline-flex;
  align-items: center;
  gap: 0.25rem;
  padding: 0.25rem 0.5rem;
  border-radius: 4px;
  font-size: 0.75rem;
  font-weight: 500;
 }
 /* Colour variants; the modifier class is chosen by getEngineClass(). */
 /* Blue: generic PaddleOCR engine. */
 .ocr-engine-badge.paddleocr {
  background: #dbeafe;
  color: #1e40af;
 }
 /* Amber: Tesseract engine. */
 .ocr-engine-badge.tesseract {
  background: #fef3c7;
  color: #92400e;
 }
 /* Green: fast mode. */
 .ocr-engine-badge.fast {
  background: #dcfce7;
  color: #166534;
 }
 /* Blue (same palette as .paddleocr): adaptive mode. */
 .ocr-engine-badge.adaptive {
  background: #dbeafe;
  color: #1e40af;
 }
 /* Amber (same palette as .tesseract): full pipeline. */
 .ocr-engine-badge.full {
  background: #fef3c7;
  color: #92400e;
 }
 /* Base pill for the OCR message badge; neutral grey unless a variant
    class overrides the colours. */
 .ocr-message-badge {
  display: inline-flex;
  align-items: center;
  gap: 0.25rem;
  padding: 0.25rem 0.5rem;
  border-radius: 4px;
  font-size: 0.75rem;
  font-weight: 500;
  background: #f1f5f9;
  color: #475569;
 }
 /* Colour variants; the modifier class is chosen by getMessageClass(). */
 /* Green: message mentions fast mode. */
 .ocr-message-badge.fast-mode {
  background: #dcfce7;
  color: #166534;
 }
 /* Amber: message mentions the full pipeline. */
 .ocr-message-badge.full-pipeline {
  background: #fef3c7;
  color: #92400e;
 }
 /* Pill for the processing-time badge (clock icon + formatted duration);
    indigo palette with a heavier font weight than the other badges. */
 .ocr-time-badge {
  display: inline-flex;
  align-items: center;
  gap: 0.25rem;
  padding: 0.25rem 0.5rem;
  border-radius: 4px;
  font-size: 0.75rem;
  font-weight: 600;
  background: #e0e7ff;
  color: #3730a3;
 }
 .raw-text {
  margin-top: 0.5rem;
  padding: 0.75rem;
--- a/data-entry-app/frontend/src/components/ocr/OCRUploadZone.vue
+++ b/data-entry-app/frontend/src/components/ocr/OCRUploadZone.vue
@@ -143,7 +143,12 @@ const processOCR = async () => {
    })
    if (response.data.success) {
-      emit('ocr-result', response.data.data)
+      // Include the OCR message in the data for debugging
      const resultData = {
        ...response.data.data,
        _ocr_message: response.data.message
      }
      emit('ocr-result', resultData)
    } else {
      error.value = response.data.message || 'OCR processing failed'
      emit('error', error.value)