roa2web-service-auto/backend/modules/data_entry/services/ocr/tesseract_engine.py

"""
Optimized Tesseract Engine for OCR - SPEED + QUALITY OPTIMIZED

Performance optimizations (vs previous version):
- Single PSM mode (PSM 4) instead of multi-PSM (4 modes × 2 calls = 8x faster)
- Single Tesseract call per image (skip image_to_data for speed)
- Lighter preprocessing (no over-binarization)
- --dpi 300 flag for proper scaling
- OEM 3 (Tesseract's default engine mode: picks the engine based on what is available) for balanced speed/accuracy

Quality optimizations for Romanian receipts:
- PSM 4: Single column layout (optimal for receipts)
- Polarity correction: ensures black text on white background
- Language: Romanian only (-l ron) for faster recognition
- Fallback to PSM 6 if PSM 4 produces poor results

Previous issues fixed:
- Was 8x slower than PaddleOCR due to multi-PSM + dual calls
- Produced gibberish on clear PDFs due to over-binarization
"""

import logging
import os
from dataclasses import dataclass, field
from typing import List, Optional, Tuple

import cv2
import numpy as np

# Check Tesseract availability
try:
    import pytesseract
    TESSERACT_AVAILABLE = True
except ImportError:
    TESSERACT_AVAILABLE = False
    pytesseract = None

logger = logging.getLogger(__name__)


@dataclass
class OCRResult:
    """Raw OCR result from Tesseract.

    Plain value container for the output of one recognition call.
    """
    # Recognized text exactly as returned by Tesseract.
    text: str
    # Confidence score; the engine code in this file produces values in 0.0-1.0
    # (either Tesseract's per-word average divided by 100 or a text heuristic).
    confidence: float
    # Per-word entries with 'text', 'confidence' and 'box' keys. Only
    # recognize_with_boxes() fills this; every other path leaves it empty.
    boxes: List[dict] = field(default_factory=list)
    # Label of the engine/strategy that produced the result.
    engine: str = "tesseract"


class TesseractEngine:
    """
    Optimized Tesseract engine for receipt OCR.

    Wraps pytesseract and exposes several recognition entry points:
    single-pass text, multi-pass text, text with word boxes, numeric-only
    text and CIF (fiscal code) extraction.

    TESTED OPTIMAL SETTINGS (from comprehensive benchmark):
    - DPI 200 for PDF loading (not 300!)
    - Padding 40px for edge protection
    - PSM 6 for complex receipts, PSM 4 for simple ones
    - Multi-pass strategy when quality is critical

    SPEED vs QUALITY tradeoff:
    - Fast mode (single pass): ~0.9s, ~6-7 keywords
    - Quality mode (multi-pass): ~1.7s, ~8-9 keywords (+2 more keywords)

    BENCHMARK RESULTS:
    - padded_psm6_40: Best for complex receipts (igiena, five-holding)
    - baseline_psm4: Best for simple receipts (rechizite, benzina)
    - multi-pass: Best overall quality but slower
    """

    # Tesseract page segmentation modes (--psm values) used for receipts
    PSM_SINGLE_COLUMN = 4  # Single column of text; best for simple vertical receipts
    PSM_UNIFORM_BLOCK = 6  # Single uniform block of text; best for complex layouts
    PSM_SPARSE_TEXT = 11   # Sparse text; fallback for difficult receipts

    # White border width in pixels added around the image (optimal value from benchmark)
    DEFAULT_PADDING = 40

    def __init__(self):
        """Initialize Tesseract engine."""
        if not TESSERACT_AVAILABLE:
            raise RuntimeError("pytesseract not available. Install with: pip install pytesseract")

        # Verify Tesseract installation
        try:
            self._version = pytesseract.get_tesseract_version()
        except Exception as e:
            raise RuntimeError(f"Tesseract not installed or not in PATH: {e}")

        logger.info(f"[TesseractEngine] Initialized (v{self._version})")

    def recognize(self, image: np.ndarray, fast_mode: bool = True) -> OCRResult:
        """
        Perform OCR recognition on image (OPTIMIZED).

        SPEED: Uses single PSM mode + single Tesseract call.
        Previously used 4 PSM modes × 2 calls = 8 Tesseract invocations.
        Now uses 1-2 calls maximum (with fallback).

        Args:
            image: Preprocessed grayscale image (DO NOT binarize for clear PDFs!)
            fast_mode: If True, skip confidence calculation for maximum speed

        Returns:
            OCRResult with text and confidence
        """
        if not TESSERACT_AVAILABLE:
            return OCRResult(text="", confidence=0.0, boxes=[], engine="tesseract")

        # Ensure grayscale
        if len(image.shape) == 3:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Fix polarity (black text on white background)
        image = self._ensure_correct_polarity(image)

        # Try PSM 4 first (single column - best for receipts)
        result = self._recognize_fast(image, self.PSM_SINGLE_COLUMN, fast_mode)

        # If poor result, try PSM 6 as fallback
        if not result.text.strip() or result.confidence < 0.3:
            logger.debug(f"[Tesseract] PSM {self.PSM_SINGLE_COLUMN} poor result, trying PSM {self.PSM_UNIFORM_BLOCK}")
            fallback = self._recognize_fast(image, self.PSM_UNIFORM_BLOCK, fast_mode)
            if len(fallback.text) > len(result.text):
                result = fallback

        if result.text.strip():
            logger.info(f"[TesseractEngine] Result: {len(result.text)} chars, conf={result.confidence:.0%}")

        return result

    def _recognize_fast(self, image: np.ndarray, psm: int, fast_mode: bool = True) -> OCRResult:
        """
        Fast single-call Tesseract recognition.

        Optimizations:
        - Single call (image_to_string only in fast mode)
        - OEM 3 (LSTM+Legacy) - faster than OEM 1
        - --dpi 300 for proper scaling
        - Romanian only (-l ron)

        Args:
            image: Grayscale image
            psm: Page segmentation mode
            fast_mode: Skip confidence calculation for speed

        Returns:
            OCRResult
        """
        # Build optimized config:
        # OEM 3 = LSTM + Legacy (faster than pure LSTM)
        # --dpi 300 = proper scaling hint
        # -l ron = Romanian only (faster, avoids eng confusion)
        config = f'--psm {psm} --oem 3 --dpi 300 -l ron'

        try:
            if fast_mode:
                # Fast path: just get text, estimate confidence
                text = pytesseract.image_to_string(image, config=config)
                # Estimate confidence based on text quality
                confidence = self._estimate_confidence(text)
            else:
                # Accurate path: get text + real confidence
                text = pytesseract.image_to_string(image, config=config)
                data = pytesseract.image_to_data(
                    image, config=config, output_type=pytesseract.Output.DICT
                )
                confidences = [int(c) for c in data['conf'] if int(c) > 0]
                confidence = sum(confidences) / len(confidences) / 100 if confidences else 0.0

            return OCRResult(
                text=text,
                confidence=confidence,
                boxes=[],
                engine="tesseract"
            )

        except Exception as e:
            logger.warning(f"[Tesseract] PSM {psm} error: {e}")
            return OCRResult(text="", confidence=0.0, boxes=[], engine="tesseract")

    def _estimate_confidence(self, text: str) -> float:
        """
        Estimate OCR confidence based on text quality.

        Heuristics:
        - More alphanumeric chars = higher confidence
        - Less garbage chars = higher confidence
        - Romanian-specific patterns boost confidence
        """
        if not text.strip():
            return 0.0

        # Count valid vs garbage chars
        valid_chars = sum(1 for c in text if c.isalnum() or c in '.,;:-/\n ')
        total_chars = len(text)

        if total_chars == 0:
            return 0.0

        # Base confidence from char ratio
        confidence = valid_chars / total_chars

        # Boost for Romanian receipt patterns
        text_lower = text.lower()
        if any(word in text_lower for word in ['total', 'lei', 'ron', 'buc', 'tva', 'cif', 'bon']):
            confidence = min(confidence + 0.1, 1.0)

        return confidence

    def recognize_multipass(self, image: np.ndarray) -> OCRResult:
        """
        Multi-pass OCR for maximum quality (slower but more accurate).

        Strategy (from benchmark testing):
        - Pass 1: PSM 4 (single column) - no padding, fast baseline
        - Pass 2: PSM 6 (uniform block) - with 40px padding, better for complex layouts
        - Pass 3: PSM 11 (sparse text) - with 40px padding + stronger CLAHE, for difficult receipts

        Merges results: picks the pass with highest keyword count.
        On average finds +2.1 more keywords than single-pass (~8.7 vs 6.6).

        Time: ~1.7s (vs ~0.9s for single pass)

        Args:
            image: Input image (RGB or grayscale)

        Returns:
            OCRResult from the best pass
        """
        if not TESSERACT_AVAILABLE:
            return OCRResult(text="", confidence=0.0, boxes=[], engine="tesseract")

        # Ensure grayscale
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image.copy()

        # Define passes with different settings
        passes = [
            # Pass 1: Fast baseline (no padding) - good for simple receipts
            {"name": "pass1_psm4", "psm": 4, "padding": 0, "clahe_clip": 1.5},
            # Pass 2: Padded PSM 6 - good for complex receipts
            {"name": "pass2_psm6_padded", "psm": 6, "padding": 40, "clahe_clip": 1.5},
            # Pass 3: Sparse text with stronger enhancement - for difficult cases
            {"name": "pass3_psm11", "psm": 11, "padding": 40, "clahe_clip": 2.0},
        ]

        best_result = None
        best_score = -1
        all_keywords = set()

        for p in passes:
            # Apply preprocessing for this pass
            processed = gray.copy()

            # Add padding if specified
            if p["padding"] > 0:
                processed = cv2.copyMakeBorder(
                    processed, p["padding"], p["padding"], p["padding"], p["padding"],
                    cv2.BORDER_CONSTANT, value=255
                )

            # Apply CLAHE
            clahe = cv2.createCLAHE(clipLimit=p["clahe_clip"], tileGridSize=(8, 8))
            processed = clahe.apply(processed)

            # Ensure correct polarity
            processed = self._ensure_correct_polarity(processed)

            # Run OCR
            config = f'--psm {p["psm"]} --oem 3 -l ron'
            try:
                text = pytesseract.image_to_string(processed, config=config)
                confidence = self._estimate_confidence(text)

                # Score based on Romanian receipt keywords
                text_lower = text.lower()
                keywords = ['cif', 'total', 'tva', 'lei', 'ron', 'buc', 'fiscal', 'bon',
                           'hartie', 'prosop', 'saci', 'creion', 'constanta', 'bucuresti']
                found_keywords = [kw for kw in keywords if kw in text_lower]
                all_keywords.update(found_keywords)

                # Score: keywords + CIF bonus + TOTAL bonus
                score = len(found_keywords) * 10
                if self._has_cif_pattern(text):
                    score += 15
                if self._has_total_pattern(text):
                    score += 10

                logger.debug(f"[Tesseract] {p['name']}: {len(found_keywords)} keywords, score={score}")

                if score > best_score:
                    best_score = score
                    best_result = OCRResult(
                        text=text,
                        confidence=confidence,
                        boxes=[],
                        engine=f"tesseract-multipass-{p['name']}"
                    )

            except Exception as e:
                logger.warning(f"[Tesseract] {p['name']} failed: {e}")
                continue

        if best_result:
            logger.info(f"[TesseractEngine] Multi-pass best: {best_result.engine}, "
                       f"{len(all_keywords)} total keywords found")
            return best_result

        return OCRResult(text="", confidence=0.0, boxes=[], engine="tesseract-multipass")

    def _has_cif_pattern(self, text: str) -> bool:
        """Check if text contains a valid CIF/CUI pattern."""
        import re
        text_upper = text.upper()
        patterns = [
            r'CIF[:\s]*RO?\d{6,10}',
            r'CUI[:\s]*RO?\d{6,10}',
            r'C\.?I\.?F\.?[:\s]*RO?\d{6,10}',
        ]
        for pattern in patterns:
            if re.search(pattern, text_upper):
                return True
        return bool(re.search(r'RO\d{7,10}', text_upper))

    def _has_total_pattern(self, text: str) -> bool:
        """Check if TOTAL is properly recognized (not truncated to BTOTAL/OTAL)."""
        import re
        text_upper = text.upper()
        return bool(re.search(r'(^|\s)TOTAL\s', text_upper, re.MULTILINE))

    def recognize_with_boxes(self, image: np.ndarray, psm: int = 4) -> OCRResult:
        """
        Recognition with bounding boxes (slower, for debugging/visualization).

        Use this only when you need box coordinates.
        For normal OCR, use recognize() which is faster.

        Args:
            image: Grayscale image
            psm: Page segmentation mode (default: 4 for receipts)

        Returns:
            OCRResult with text, confidence, and boxes
        """
        if not TESSERACT_AVAILABLE:
            return OCRResult(text="", confidence=0.0, boxes=[], engine="tesseract")

        # Ensure grayscale
        if len(image.shape) == 3:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        image = self._ensure_correct_polarity(image)
        config = f'--psm {psm} --oem 3 --dpi 300 -l ron'

        try:
            text = pytesseract.image_to_string(image, config=config)
            data = pytesseract.image_to_data(
                image, config=config, output_type=pytesseract.Output.DICT
            )

            confidences = [int(c) for c in data['conf'] if int(c) > 0]
            avg_conf = sum(confidences) / len(confidences) / 100 if confidences else 0.0

            boxes = []
            for i in range(len(data['text'])):
                if data['text'][i].strip() and int(data['conf'][i]) > 0:
                    boxes.append({
                        'text': data['text'][i],
                        'confidence': int(data['conf'][i]) / 100,
                        'box': [data['left'][i], data['top'][i], data['width'][i], data['height'][i]]
                    })

            return OCRResult(text=text, confidence=avg_conf, boxes=boxes, engine="tesseract")

        except Exception as e:
            logger.warning(f"[Tesseract] recognize_with_boxes error: {e}")
            return OCRResult(text="", confidence=0.0, boxes=[], engine="tesseract")

    def _ensure_correct_polarity(self, image: np.ndarray) -> np.ndarray:
        """
        Ensure image has black text on white background.

        Receipts should have dark text on light background.
        If image is inverted (light text on dark), invert it.

        Detection method:
        - Calculate mean pixel value
        - If mean < 127, image is mostly dark (inverted)
        - Invert to correct polarity

        Args:
            image: Grayscale image

        Returns:
            Polarity-corrected image
        """
        mean_value = np.mean(image)

        if mean_value < 127:
            # Image is mostly dark = inverted (white text on black)
            logger.debug(f"[TesseractEngine] Detected inverted polarity (mean={mean_value:.1f}), correcting...")
            return 255 - image

        return image

    def recognize_numbers_only(self, image: np.ndarray) -> OCRResult:
        """
        OCR optimized for numeric content (amounts, totals).

        Uses character whitelist to reduce errors on numbers.

        Args:
            image: Preprocessed grayscale image

        Returns:
            OCRResult with numeric text
        """
        if not TESSERACT_AVAILABLE:
            return OCRResult(text="", confidence=0.0, boxes=[], engine="tesseract")

        # Ensure grayscale
        if len(image.shape) == 3:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Fix polarity
        image = self._ensure_correct_polarity(image)

        # Config for numbers only
        # Whitelist: digits, comma, period, space, RON, LEI
        config = '--psm 6 --oem 1 -c tessedit_char_whitelist=0123456789.,- '

        try:
            text = pytesseract.image_to_string(image, config=config)

            data = pytesseract.image_to_data(
                image,
                config=config,
                output_type=pytesseract.Output.DICT
            )

            confidences = [int(c) for c in data['conf'] if int(c) > 0]
            avg_conf = sum(confidences) / len(confidences) / 100 if confidences else 0.0

            return OCRResult(
                text=text.strip(),
                confidence=avg_conf,
                boxes=[],
                engine="tesseract-numeric"
            )

        except Exception as e:
            logger.error(f"[TesseractEngine] Numeric OCR error: {e}")
            return OCRResult(text="", confidence=0.0, boxes=[], engine="tesseract")

    def recognize_cif_optimized(self, image: np.ndarray) -> Optional[str]:
        """
        Optimized CIF extraction using multi-strategy approach.

        BENCHMARK RESULTS (from test_critical_fields.py):
        - digit_opt_dpi200: 33% accuracy (best)
        - digit_whitelist: Works well on specific receipts
        - basic_ron_eng: Good backup

        Strategy:
        1. Try digit-optimized preprocessing (2x scale + Otsu)
        2. Try character whitelist (RO + digits only)
        3. Try standard ron+eng config
        4. Return best match based on CIF pattern validation

        Args:
            image: Input image (RGB from pdf2image or BGR from OpenCV)

        Returns:
            Extracted CIF string (e.g., "RO10562600") or None
        """
        import re

        if not TESSERACT_AVAILABLE:
            return None

        # Ensure grayscale
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        else:
            gray = image.copy()

        # Extract top 35% of image (where CIF is typically found)
        height = gray.shape[0]
        top_region = gray[:int(height * 0.35), :]

        candidates = []

        # Strategy 1: Digit-optimized preprocessing (best performer: 33% accuracy)
        try:
            # Scale up 2x + Otsu binarization
            scaled = cv2.resize(top_region, None, fx=2.0, fy=2.0, interpolation=cv2.INTER_CUBIC)
            clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
            enhanced = clahe.apply(scaled)
            _, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
            if np.mean(binary) < 127:
                binary = 255 - binary

            text = pytesseract.image_to_string(binary, config='--psm 6 --oem 3 -l ron')
            cif = self._extract_cif_from_text(text)
            if cif:
                candidates.append(('digit_opt', cif))
        except Exception as e:
            logger.debug(f"[TesseractEngine] digit_opt strategy failed: {e}")

        # Strategy 2: Character whitelist (RO + digits only)
        try:
            # Add padding
            padded = cv2.copyMakeBorder(top_region, 40, 40, 40, 40, cv2.BORDER_CONSTANT, value=255)
            scaled = cv2.resize(padded, None, fx=2.0, fy=2.0, interpolation=cv2.INTER_CUBIC)

            config = '--psm 6 --oem 1 -c tessedit_char_whitelist=0123456789ROro'
            text = pytesseract.image_to_string(scaled, config=config)
            cif = self._extract_cif_from_text(text)
            if cif:
                candidates.append(('whitelist', cif))
        except Exception as e:
            logger.debug(f"[TesseractEngine] whitelist strategy failed: {e}")

        # Strategy 3: Standard ron+eng config (good backup)
        try:
            padded = cv2.copyMakeBorder(top_region, 40, 40, 40, 40, cv2.BORDER_CONSTANT, value=255)
            clahe = cv2.createCLAHE(clipLimit=1.5, tileGridSize=(8, 8))
            enhanced = clahe.apply(padded)

            text = pytesseract.image_to_string(enhanced, config='--psm 6 --oem 3 -l ron+eng')
            cif = self._extract_cif_from_text(text)
            if cif:
                candidates.append(('ron_eng', cif))
        except Exception as e:
            logger.debug(f"[TesseractEngine] ron_eng strategy failed: {e}")

        if not candidates:
            return None

        # Log all candidates
        for strategy, cif in candidates:
            logger.debug(f"[TesseractEngine] CIF candidate from {strategy}: {cif}")

        # Use majority voting if multiple strategies agree
        from collections import Counter
        cif_counts = Counter(cif for _, cif in candidates)
        most_common_cif, count = cif_counts.most_common(1)[0]

        if count > 1:
            # Multiple strategies agree
            logger.info(f"[TesseractEngine] CIF extracted (majority {count} strategies): {most_common_cif}")
            return most_common_cif

        # No agreement - prefer digit_opt strategy (33% accuracy in benchmarks)
        for strategy, cif in candidates:
            if strategy == 'digit_opt':
                logger.info(f"[TesseractEngine] CIF extracted via digit_opt (preferred): {cif}")
                return cif

        # Fallback to first candidate
        strategy, cif = candidates[0]
        logger.info(f"[TesseractEngine] CIF extracted via {strategy}: {cif}")
        return cif

    def _extract_cif_from_text(self, text: str) -> Optional[str]:
        """Extract CIF/CUI from OCR text."""
        import re
        text_upper = text.upper().replace(' ', '')

        patterns = [
            r'CIF[:\s]*R?O?(\d{6,10})',
            r'CUI[:\s]*R?O?(\d{6,10})',
            r'C\.?I\.?F\.?[:\s]*R?O?(\d{6,10})',
            r'RO(\d{7,10})',
            r'R\.?O\.?[\s:]*(\d{6,10})',
        ]

        for pattern in patterns:
            match = re.search(pattern, text_upper)
            if match:
                digits = match.group(1).lstrip('0') or '0'
                return f"RO{digits}"

        return None

    @staticmethod
    def validate_romanian_cif(cif: str) -> bool:
        """
        Validate Romanian CIF/CUI using checksum algorithm.

        Romanian CIF format: RO + 2-10 digits
        The last digit is a control digit calculated using modulo 11.

        Algorithm:
        1. Multiply each digit by corresponding weight (from right to left: 2,3,4,5,6,7,2,3,4,5)
        2. Sum all products
        3. Remainder of sum / 11 is the control digit
        4. If remainder is 10, control digit is 0

        Args:
            cif: CIF string (e.g., "RO10562600", "10562600")

        Returns:
            True if CIF is valid, False otherwise
        """
        # Remove RO prefix and spaces
        cif = cif.upper().replace(' ', '').replace('RO', '')

        # Must be 2-10 digits
        if not cif.isdigit() or len(cif) < 2 or len(cif) > 10:
            return False

        # Weights for checksum calculation (right to left)
        weights = [2, 3, 4, 5, 6, 7, 2, 3, 4, 5]

        # Pad with zeros on the left to make it 10 digits
        cif_padded = cif.zfill(10)

        # Calculate checksum (excluding last digit which is control)
        total = 0
        for i in range(9):
            total += int(cif_padded[i]) * weights[i]

        # Control digit
        control = total % 11
        if control == 10:
            control = 0

        # Compare with last digit
        return int(cif_padded[9]) == control

    @staticmethod
    def is_available() -> bool:
        """Report whether Tesseract can be used.

        Returns:
            True only if pytesseract was imported and asking for the
            Tesseract version succeeds; False in every other case.
        """
        if TESSERACT_AVAILABLE:
            # The version query fails when Tesseract itself cannot be run.
            try:
                pytesseract.get_tesseract_version()
            except Exception:
                return False
            return True

        return False

    @staticmethod
    def get_version() -> Optional[str]:
        """Return the Tesseract version as a string.

        Returns:
            The version reported by pytesseract converted with ``str()``, or
            None if pytesseract is not importable or the query fails.
        """
        version = None
        if TESSERACT_AVAILABLE:
            try:
                version = str(pytesseract.get_tesseract_version())
            except Exception:
                version = None
        return version