feat: Migrate to ultrathin monolith architecture

Consolidate 3 separate applications (reports-app, data-entry-app, telegram-bot) into a unified architecture with a single backend and frontend.

Backend Changes:
- Unified FastAPI backend at backend/ with modular structure
- Modules: reports, data_entry, telegram in backend/modules/
- Centralized config.py and main.py with all routers registered
- Single worker mode (--workers 1) for Telegram bot compatibility
- Shared Oracle connection pool and JWT authentication
- Unified requirements.txt and environment configuration

Frontend Changes:
- Single Vue.js SPA with module-based routing
- Unified frontend at src/ with modules in src/modules/{reports,data-entry}/
- Shared components and stores in src/shared/
- Error boundaries for module isolation
- Dual API proxy in Vite for module communication

Infrastructure:
- New unified startup scripts: start-prod.sh, start-test.sh, start-backend.sh
- Environment templates: .env.dev.example, .env.test.example, .env.prod.example
- Updated deployment scripts for Windows IIS
- Simplified SSH tunnel management

Documentation:
- Comprehensive CLAUDE.md with architecture overview
- Module-specific docs in docs/{data-entry,telegram}/
- Architecture decision records in docs/ARCHITECTURE-DECISIONS.md
- Deployment guides consolidated in deployment/windows/docs/

This migration reduces complexity, improves maintainability, and enables easier deployment while maintaining all existing functionality.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-29 23:48:14 +02:00
parent 2a101f1ef5
commit c5e051ad80
378 changed files with 7566 additions and 73730 deletions
--- a/backend/modules/data_entry/services/ocr_engine.py
+++ b/backend/modules/data_entry/services/ocr_engine.py
@@ -0,0 +1,295 @@
+"""OCR engine wrapper for PaddleOCR and Tesseract."""
+
+import os
+import logging
+import threading
+import time
+from dataclasses import dataclass
+from typing import List, Optional, Tuple
+
+import numpy as np
+
+# Module-level logger used by every OCR code path in this file.
+logger = logging.getLogger(__name__)
+# NOTE(review): basicConfig() touches the *root* logger at import time; it is
+# kept so INFO-level OCR logs are visible, but the application may prefer to
+# own logging configuration itself.
+logging.basicConfig(level=logging.INFO)  # Ensure logs are visible
+
+# Disable PaddleOCR model source check for faster startup (PaddleX 3.x).
+# Set at import time, before paddleocr itself is imported lazily further down.
+os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
+
+# Lazy imports - placeholders that are rebound (via `global`) on first use,
+# so importing this module does not pay for the heavy OCR libraries.
+PaddleOCR = None  # Rebound to paddleocr.PaddleOCR on first PaddleOCR init
+pytesseract = None  # Rebound to the pytesseract module on first Tesseract call
+
+# Check availability without importing heavy libraries
+def _is_module_installed(module_name: str) -> bool:
+    """Return True if *module_name* can be located without importing it.
+
+    Only the import machinery's spec lookup is used, so the (potentially
+    slow) target package is never executed. The probe is deliberately
+    best-effort: any error raised while searching is treated as
+    "not installed" rather than propagated at import time.
+
+    Args:
+        module_name: Top-level importable name, e.g. ``"paddleocr"``.
+
+    Returns:
+        True when a module spec was found, False otherwise.
+    """
+    try:
+        import importlib.util
+        return importlib.util.find_spec(module_name) is not None
+    except Exception:
+        return False
+
+
+def _check_paddle_available() -> bool:
+    """Check if paddleocr is installed without importing it."""
+    return _is_module_installed("paddleocr")
+
+
+def _check_tesseract_available() -> bool:
+    """Check if pytesseract is installed without importing it."""
+    return _is_module_installed("pytesseract")
+
+
+# Evaluated once at import time; the engine code consults these flags instead
+# of probing the environment on every call.
+PADDLE_AVAILABLE = _check_paddle_available()
+TESSERACT_AVAILABLE = _check_tesseract_available()
+
+
+@dataclass
+class OCRResult:
+    """Raw OCR result produced by a single engine run.
+
+    Instances are built by OCREngine for both the PaddleOCR and the
+    Tesseract code paths.
+    """
+    # Full recognized text (the PaddleOCR path joins recognized lines with '\n').
+    text: str
+    # Mean recognition confidence; 0.0 is stored when nothing was recognized.
+    confidence: float
+    # Per-line dicts with 'text', 'confidence' and 'box' keys on the PaddleOCR
+    # path; the Tesseract path stores an empty list.
+    boxes: List[dict]
+    engine: str = ""  # OCR engine used: paddleocr or tesseract
+
+
+class OCREngine:
+    """Unified OCR engine with fallback support.
+
+    PaddleOCR is the preferred engine and is imported and constructed lazily
+    on first use. Tesseract (through pytesseract) is used when PaddleOCR is
+    not installed, failed to initialize, is not ready in time, or raises
+    while processing an image.
+    """
+
+    def __init__(self):
+        self._paddle = None  # PaddleOCR instance; stays None until/unless init succeeds
+        self._paddle_init_started = False  # True once some caller has claimed the init
+        self._paddle_ready = threading.Event()  # Signals when PaddleOCR init has FINISHED
+        self._paddle_init_lock = threading.Lock()  # Guards _paddle_init_started
+
+    def _init_paddle_lazy(self):
+        """Lazy initialize PaddleOCR on first use (avoids slow startup).
+
+        Only the first caller performs the import and construction. Callers
+        that arrive later return immediately, even if that first
+        initialization is still running on another thread; use
+        ``_paddle_ready`` (or ``wait_for_paddle``) to wait for completion.
+        """
+        global PaddleOCR
+
+        with self._paddle_init_lock:
+            if self._paddle_init_started:
+                return  # Already initializing or done
+            self._paddle_init_started = True
+
+        try:
+            if PADDLE_AVAILABLE:
+                print("Importing PaddleOCR (first use, may take ~15-20 seconds)...", flush=True)
+                from paddleocr import PaddleOCR as _PaddleOCR
+                PaddleOCR = _PaddleOCR
+
+                print("Initializing PaddleOCR engine...", flush=True)
+                # PaddleOCR 3.x API - optimized for Romanian receipts
+                # Note: 'latin' not available in PaddleOCR 3.x, 'en' works well for receipts
+                self._paddle = PaddleOCR(
+                    lang='en',  # 'en' handles Latin alphabet well for receipts
+                    # High quality settings for better accuracy
+                    det_db_thresh=0.3,      # Lower threshold = detect more text (default 0.3)
+                    det_db_box_thresh=0.5,  # Box confidence threshold (default 0.5)
+                    det_db_unclip_ratio=1.8,  # Expand detected boxes slightly (default 1.5)
+                    rec_batch_num=6,        # Batch size for recognition
+                    use_angle_cls=True,     # Enable text angle classification
+                )
+                print("PaddleOCR initialized successfully with high-quality settings", flush=True)
+        except Exception as e:
+            print(f"Warning: Failed to initialize PaddleOCR: {e}", flush=True)
+            self._paddle = None
+        finally:
+            # Always signal completion (success or failure) so that waiters
+            # are released even if initialization is aborted unexpectedly.
+            self._paddle_ready.set()
+
+    def _paddle_usable(self, timeout: float = 30.0) -> bool:
+        """Start PaddleOCR init if needed and report whether it can be used.
+
+        The first caller runs the initialization synchronously. A caller that
+        arrives while another thread is still initializing blocks here for up
+        to *timeout* seconds instead of wrongly concluding that PaddleOCR is
+        unavailable.
+
+        Args:
+            timeout: Max seconds to wait for an in-progress initialization.
+
+        Returns:
+            True if a PaddleOCR instance is ready, False otherwise.
+        """
+        self._init_paddle_lazy()
+        if not PADDLE_AVAILABLE:
+            return False
+        if not self._paddle_ready.wait(timeout=timeout):
+            logger.warning(f"[OCR] PaddleOCR still initializing after {timeout}s")
+            return False
+        return self._paddle is not None
+
+    def wait_for_paddle(self, timeout: float = 30.0) -> bool:
+        """
+        Wait for PaddleOCR to be fully initialized.
+
+        Args:
+            timeout: Max seconds to wait (default 30s)
+
+        Returns:
+            True if PaddleOCR is ready, False if it is not installed, its
+            initialization failed, or it did not finish within the timeout
+        """
+        if not PADDLE_AVAILABLE:
+            return False
+
+        if self._paddle is not None:
+            return True  # Already ready
+
+        if not self._paddle_init_started:
+            # Start initialization if not already started
+            self._init_paddle_lazy()
+
+        # Wait for initialization to complete
+        print(f"[OCR] Waiting for PaddleOCR to be ready (max {timeout}s)...", flush=True)
+        # monotonic() is immune to wall-clock adjustments while we wait
+        start = time.monotonic()
+        ready = self._paddle_ready.wait(timeout=timeout)
+        elapsed = time.monotonic() - start
+
+        if ready and self._paddle is not None:
+            print(f"[OCR] PaddleOCR ready after {elapsed:.1f}s", flush=True)
+            return True
+
+        print(f"[OCR] PaddleOCR not ready after {elapsed:.1f}s (timeout or failed)", flush=True)
+        return False
+
+    def is_paddle_ready(self) -> bool:
+        """Check if PaddleOCR is ready without waiting."""
+        return self._paddle is not None
+
+    def recognize(self, image: np.ndarray) -> OCRResult:
+        """Perform OCR on preprocessed image.
+
+        Args:
+            image: Image array; a 2-D (grayscale) array is converted to
+                3 channels before being handed to PaddleOCR.
+
+        Returns:
+            The result of PaddleOCR when it is usable, otherwise of Tesseract.
+
+        Raises:
+            RuntimeError: If neither engine is available.
+        """
+        logger.info(f"[OCR] Starting recognition, image shape: {image.shape}, dtype: {image.dtype}")
+
+        # Lazy init PaddleOCR on first call (and wait for an in-progress init)
+        if self._paddle_usable():
+            logger.info("[OCR] Using PaddleOCR engine")
+            return self._paddle_recognize(image)
+
+        if TESSERACT_AVAILABLE:
+            logger.info("[OCR] Using Tesseract engine (PaddleOCR not available)")
+            return self._tesseract_recognize(image)
+
+        logger.error("[OCR] No OCR engine available!")
+        raise RuntimeError(
+            "No OCR engine available. Install PaddleOCR or Tesseract."
+        )
+
+    def _paddle_recognize(self, image: np.ndarray) -> OCRResult:
+        """Recognize text using PaddleOCR 3.x API.
+
+        Falls back to Tesseract (when installed) if PaddleOCR is not ready in
+        time or raises while processing; otherwise the error propagates.
+
+        Raises:
+            RuntimeError: If PaddleOCR is not ready and Tesseract is missing.
+            Exception: Whatever PaddleOCR raised, when Tesseract is missing.
+        """
+        # Wait for PaddleOCR to be fully ready (handles background init)
+        if not self.wait_for_paddle(timeout=30.0):
+            logger.warning("[PaddleOCR] Not ready, falling back to Tesseract")
+            if TESSERACT_AVAILABLE:
+                return self._tesseract_recognize(image)
+            raise RuntimeError("PaddleOCR not ready and Tesseract not available")
+
+        try:
+            logger.info(f"[PaddleOCR] Processing image, shape: {image.shape}")
+
+            # PaddleOCR 3.x requires 3-channel images. The converted copy is
+            # kept separate so a Tesseract fallback still gets the caller's image.
+            paddle_input = image
+            if len(image.shape) == 2:
+                # Convert grayscale to 3-channel BGR
+                import cv2
+                paddle_input = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
+                logger.info(f"[PaddleOCR] Converted to BGR, new shape: {paddle_input.shape}")
+
+            # PaddleOCR 3.x uses predict() with new parameter names
+            logger.info("[PaddleOCR] Calling predict()...")
+            result = self._paddle.predict(paddle_input, use_textline_orientation=True)
+            logger.info(f"[PaddleOCR] predict() returned, result type: {type(result)}")
+
+            if not result:
+                logger.warning("[PaddleOCR] No results returned")
+                return OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")
+
+            # PaddleOCR 3.x returns OCRResult objects with different structure
+            page = result[0]
+
+            rec_texts = page.get('rec_texts')
+            if rec_texts is None or len(rec_texts) == 0:
+                return OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")
+
+            # Normalize scores to a plain list of floats: truth-testing a
+            # numpy array would raise, and float() also unwraps numpy scalars.
+            raw_scores = page.get('rec_scores')
+            scores = [float(s) for s in raw_scores] if raw_scores is not None else []
+
+            # Prefer 'rec_polys' (polygons aligned one-to-one with rec_texts);
+            # fall back to 'dt_polys' (all detections) when it is absent.
+            polys = page.get('rec_polys')
+            if polys is None:
+                polys = page.get('dt_polys')
+            if polys is None:
+                polys = []
+
+            boxes = [
+                {
+                    'text': text,
+                    'confidence': scores[i] if i < len(scores) else 0.0,
+                    # asarray() accepts both ndarray and nested-list polygons
+                    'box': np.asarray(polys[i]).tolist() if i < len(polys) else [],
+                }
+                for i, text in enumerate(rec_texts)
+            ]
+
+            avg_conf = sum(scores) / len(scores) if scores else 0.0
+            text_result = '\n'.join(rec_texts)
+            logger.info(f"[PaddleOCR] SUCCESS - Found {len(rec_texts)} text lines, avg confidence: {avg_conf:.2%}")
+            logger.debug(f"[PaddleOCR] Raw text preview: {text_result[:200]}...")
+            return OCRResult(
+                text=text_result,
+                confidence=float(avg_conf),
+                boxes=boxes,
+                engine="paddleocr"
+            )
+        except Exception as e:
+            logger.error(f"[PaddleOCR] ERROR: {e}, falling back to Tesseract")
+            if TESSERACT_AVAILABLE:
+                return self._tesseract_recognize(image)
+            raise
+
+    @staticmethod
+    def _mean_tesseract_confidence(raw_confidences) -> float:
+        """Average Tesseract confidences, scaled from 0-100 down to 0.0-1.0.
+
+        Values are parsed with float() because they may arrive as ints,
+        floats or numeric strings (including fractional ones such as
+        '96.53', which int() would reject). Entries that cannot be parsed or
+        are not positive (e.g. the -1 placeholder rows) are ignored.
+
+        Returns:
+            The mean of the usable confidences divided by 100, or 0.0 when
+            there are none.
+        """
+        usable = []
+        for raw in raw_confidences:
+            try:
+                value = float(raw)
+            except (TypeError, ValueError):
+                continue
+            if value > 0:
+                usable.append(value)
+        return sum(usable) / len(usable) / 100 if usable else 0.0
+
+    def _tesseract_recognize(self, image: np.ndarray) -> OCRResult:
+        """Recognize text using Tesseract.
+
+        pytesseract is imported on the first call. The returned result has
+        no boxes; its confidence is the mean of the positive confidences
+        reported by ``image_to_data``.
+        """
+        global pytesseract
+
+        logger.info(f"[Tesseract] Processing image, shape: {image.shape}")
+
+        # Lazy import pytesseract
+        if pytesseract is None:
+            logger.info("[Tesseract] Importing pytesseract...")
+            import pytesseract as _pytesseract
+            pytesseract = _pytesseract
+
+        # PSM 4: Single column (best for receipts)
+        config = '--psm 4 -l ron+eng'
+        text = pytesseract.image_to_string(image, config=config)
+
+        # Quick confidence estimate
+        data = pytesseract.image_to_data(image, config=config, output_type=pytesseract.Output.DICT)
+        avg_conf = self._mean_tesseract_confidence(data['conf'])
+
+        logger.info(f"[Tesseract] Done: {len(text)} chars, conf: {avg_conf:.2%}")
+        return OCRResult(text=text, confidence=avg_conf, boxes=[], engine="tesseract")
+
+    def recognize_dual(self, image: np.ndarray) -> Tuple[OCRResult, Optional[OCRResult]]:
+        """
+        Run both OCR engines and return both results.
+
+        An engine that raises contributes an empty result (text "",
+        confidence 0.0) instead of aborting the call.
+
+        Returns:
+            Tuple of (paddle_result, tesseract_result)
+            tesseract_result may be None if Tesseract is not available.
+            When PaddleOCR is not usable, paddle_result is the Tesseract result.
+
+        Raises:
+            RuntimeError: If neither engine is available.
+        """
+        logger.info(f"[OCR Dual] Starting dual recognition, image shape: {image.shape}")
+
+        paddle_result = None
+        tesseract_result = None
+
+        # Run PaddleOCR (lazy init; waits for an in-progress init)
+        if self._paddle_usable():
+            try:
+                logger.info("[OCR Dual] Running PaddleOCR...")
+                paddle_result = self._paddle_recognize(image)
+                logger.info(f"[OCR Dual] PaddleOCR: {len(paddle_result.text)} chars, conf: {paddle_result.confidence:.2%}")
+            except Exception as e:
+                logger.error(f"[OCR Dual] PaddleOCR failed: {e}")
+                paddle_result = OCRResult(text="", confidence=0.0, boxes=[], engine="paddleocr")
+
+        # Run Tesseract
+        if TESSERACT_AVAILABLE:
+            try:
+                logger.info("[OCR Dual] Running Tesseract...")
+                tesseract_result = self._tesseract_recognize(image)
+                logger.info(f"[OCR Dual] Tesseract: {len(tesseract_result.text)} chars, conf: {tesseract_result.confidence:.2%}")
+            except Exception as e:
+                logger.error(f"[OCR Dual] Tesseract failed: {e}")
+                tesseract_result = OCRResult(text="", confidence=0.0, boxes=[], engine="tesseract")
+
+        # Fallback if PaddleOCR not available
+        if paddle_result is None:
+            if tesseract_result is None:
+                raise RuntimeError("No OCR engine available")
+            paddle_result = tesseract_result
+
+        return paddle_result, tesseract_result
+
+    @staticmethod
+    def get_available_engines() -> List[str]:
+        """Return list of available OCR engines (installed, not necessarily initialized)."""
+        engines = []
+        if PADDLE_AVAILABLE:
+            engines.append('paddleocr')
+        if TESSERACT_AVAILABLE:
+            engines.append('tesseract')
+        return engines