feat: Migrate to ultrathin monolith architecture

Consolidate 3 separate applications (reports-app, data-entry-app, telegram-bot) into a unified architecture with a single backend and frontend.

Backend Changes:
- Unified FastAPI backend at backend/ with modular structure
- Modules: reports, data_entry, telegram in backend/modules/
- Centralized config.py and main.py with all routers registered
- Single worker mode (--workers 1) for Telegram bot compatibility
- Shared Oracle connection pool and JWT authentication
- Unified requirements.txt and environment configuration

Frontend Changes:
- Single Vue.js SPA with module-based routing
- Unified frontend at src/ with modules in src/modules/{reports,data-entry}/
- Shared components and stores in src/shared/
- Error boundaries for module isolation
- Dual API proxy in Vite for module communication

Infrastructure:
- New unified startup scripts: start-prod.sh, start-test.sh, start-backend.sh
- Environment templates: .env.dev.example, .env.test.example, .env.prod.example
- Updated deployment scripts for Windows IIS
- Simplified SSH tunnel management

Documentation:
- Comprehensive CLAUDE.md with architecture overview
- Module-specific docs in docs/{data-entry,telegram}/
- Architecture decision records in docs/ARCHITECTURE-DECISIONS.md
- Deployment guides consolidated in deployment/windows/docs/

This migration reduces complexity, improves maintainability, and enables easier deployment while maintaining all existing functionality.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-29 23:48:14 +02:00
parent 2a101f1ef5
commit c5e051ad80
378 changed files with 7566 additions and 73730 deletions
--- a/backend/modules/data_entry/services/image_preprocessor.py
+++ b/backend/modules/data_entry/services/image_preprocessor.py
@@ -0,0 +1,270 @@
+"""Image preprocessing for optimal OCR results."""
+
+from pathlib import Path
+from typing import List
+
+import numpy as np
+import cv2
+
+try:
+    import pdf2image
+    PDF_AVAILABLE = True
+except ImportError:
+    PDF_AVAILABLE = False
+
+
+class ImagePreprocessor:
+    """Preprocess receipt images for OCR."""
+
+    def _add_safety_padding(self, image: np.ndarray, padding: int = 50) -> np.ndarray:
+        """Add white padding around image to protect edge content during rotation.
+
+        This prevents left/right margin truncation in OCR by ensuring text near
+        edges isn't lost during deskew rotation.
+        """
+        if len(image.shape) == 2:
+            # Grayscale
+            return cv2.copyMakeBorder(
+                image, padding, padding, padding, padding,
+                cv2.BORDER_CONSTANT, value=255
+            )
+        else:
+            # Color (BGR)
+            return cv2.copyMakeBorder(
+                image, padding, padding, padding, padding,
+                cv2.BORDER_CONSTANT, value=(255, 255, 255)
+            )
+
+    def load_image(self, path: Path) -> np.ndarray:
+        """Load image from file."""
+        image = cv2.imread(str(path))
+        if image is None:
+            raise ValueError(f"Could not load image: {path}")
+        return image
+
+    def pdf_to_images(self, path: Path, dpi: int = 300) -> List[np.ndarray]:
+        """
+        Convert PDF to images.
+
+        Args:
+            path: Path to PDF file
+            dpi: Resolution (300 = fast & good quality, 400 = better but slower)
+        """
+        if not PDF_AVAILABLE:
+            raise RuntimeError("pdf2image not available. Install with: pip install pdf2image")
+        images = pdf2image.convert_from_path(str(path), dpi=dpi)
+        return [np.array(img) for img in images]
+
+    def preprocess(self, image: np.ndarray, high_quality: bool = True) -> np.ndarray:
+        """
+        Apply LIGHT preprocessing - better for clear PDFs.
+        Heavy binarization can destroy text on clear images.
+        """
+        return self.preprocess_light(image)
+
+    def preprocess_light(self, image: np.ndarray) -> np.ndarray:
+        """
+        Light preprocessing for CLEAR images (PDFs, good scans).
+        Preserves original quality, only enhances contrast.
+        """
+        # 0. Add safety padding to protect edge content during deskew rotation
+        image = self._add_safety_padding(image)
+
+        # 1. Grayscale
+        if len(image.shape) == 3:
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        else:
+            gray = image.copy()
+
+        # 2a. Scale DOWN if any side exceeds 4000px (PaddleOCR limit)
+        height, width = gray.shape
+        max_side = max(height, width)
+        if max_side > 4000:
+            scale = 4000 / max_side
+            gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
+            height, width = gray.shape
+
+        # 2b. Scale UP if too small
+        if width < 1500:
+            scale = 1500 / width
+            # Ensure we don't exceed 4000px after upscaling
+            new_width = int(width * scale)
+            new_height = int(height * scale)
+            if max(new_width, new_height) > 4000:
+                scale = 4000 / max(new_width, new_height)
+            gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
+
+        # 3. Deskew
+        gray = self._deskew(gray)
+
+        # 4. Light contrast enhancement only
+        clahe = cv2.createCLAHE(clipLimit=1.5, tileGridSize=(8, 8))
+        enhanced = clahe.apply(gray)
+
+        # NO binarization, NO morphological ops - preserve original quality
+        return enhanced
+
+    def preprocess_heavy(self, image: np.ndarray) -> np.ndarray:
+        """
+        Heavy preprocessing for FADED thermal receipts.
+        Aggressive binarization to recover faded text.
+        """
+        # 0. Add safety padding to protect edge content during deskew rotation
+        image = self._add_safety_padding(image)
+
+        # 1. Grayscale
+        if len(image.shape) == 3:
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        else:
+            gray = image.copy()
+
+        # 2a. Scale DOWN if any side exceeds 4000px (PaddleOCR limit)
+        height, width = gray.shape
+        max_side = max(height, width)
+        if max_side > 4000:
+            scale = 4000 / max_side
+            gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
+            height, width = gray.shape
+
+        # 2b. Scale UP if too small (larger = better OCR)
+        if width < 1500:
+            scale = 1500 / width
+            # Ensure we don't exceed 4000px after upscaling
+            new_width = int(width * scale)
+            new_height = int(height * scale)
+            if max(new_width, new_height) > 4000:
+                scale = 4000 / max(new_width, new_height)
+            gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
+
+        # 3. Deskew
+        gray = self._deskew(gray)
+
+        # 4. Contrast enhancement with CLAHE
+        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+        enhanced = clahe.apply(gray)
+
+        # 5. Denoise
+        denoised = cv2.fastNlMeansDenoising(enhanced, h=8, templateWindowSize=7, searchWindowSize=21)
+
+        # 6. Sharpening
+        gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0)
+        sharpened = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0)
+
+        # 7. Adaptive thresholding (binarization)
+        binary = cv2.adaptiveThreshold(
+            sharpened, 255,
+            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY,
+            blockSize=11, C=5
+        )
+
+        # 8. Morphological operations
+        kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
+        result = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel_close)
+
+        return result
+
+    def preprocess_for_tesseract(self, image: np.ndarray) -> np.ndarray:
+        """
+        Tesseract-optimized preprocessing.
+        Tesseract works best with:
+        - Clean black text on white background (binarized)
+        - High DPI (scale up small images)
+        - Otsu thresholding (better than adaptive for clean documents)
+        """
+        # 0. Add safety padding to protect edge content during deskew rotation
+        image = self._add_safety_padding(image)
+
+        # 1. Grayscale
+        if len(image.shape) == 3:
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        else:
+            gray = image.copy()
+
+        # 2. Scale for optimal Tesseract (target ~2000px width for receipts)
+        height, width = gray.shape
+        if width < 2000:
+            scale = 2000 / width
+            gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
+        elif width > 3000:
+            scale = 3000 / width
+            gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
+
+        # 3. Deskew
+        gray = self._deskew(gray)
+
+        # 4. Strong contrast enhancement
+        clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
+        enhanced = clahe.apply(gray)
+
+        # 5. Denoise before binarization
+        denoised = cv2.fastNlMeansDenoising(enhanced, h=10, templateWindowSize=7, searchWindowSize=21)
+
+        # 6. Otsu binarization (better than adaptive for clean PDFs)
+        _, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+
+        # 7. Light morphological cleanup
+        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
+        cleaned = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
+
+        return cleaned
+
+    def get_all_variants(self, image: np.ndarray) -> List[np.ndarray]:
+        """
+        Generate 2 preprocessing variants for OCR (fast mode).
+        Returns: [light_processed, heavy_processed]
+        """
+        return [
+            self.preprocess_light(image),
+            self.preprocess_heavy(image),
+        ]
+
+    def _deskew(self, image: np.ndarray) -> np.ndarray:
+        """Correct image rotation/skew using Hough lines.
+
+        Uses expanded canvas to preserve all content during rotation,
+        preventing left/right margin truncation.
+        """
+        edges = cv2.Canny(image, 50, 150, apertureSize=3)
+        lines = cv2.HoughLinesP(
+            edges, 1, np.pi / 180,
+            threshold=100, minLineLength=100, maxLineGap=10
+        )
+
+        if lines is None:
+            return image
+
+        angles = []
+        for line in lines:
+            x1, y1, x2, y2 = line[0]
+            angle = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi
+            if abs(angle) < 45:
+                angles.append(angle)
+
+        if not angles:
+            return image
+
+        median_angle = np.median(angles)
+        if abs(median_angle) < 0.5:
+            return image
+
+        h, w = image.shape[:2]
+        center = (w // 2, h // 2)
+        M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
+
+        # Calculate new canvas size to fit entire rotated image (prevents edge truncation)
+        cos_angle = abs(np.cos(np.radians(median_angle)))
+        sin_angle = abs(np.sin(np.radians(median_angle)))
+        new_w = int(h * sin_angle + w * cos_angle)
+        new_h = int(h * cos_angle + w * sin_angle)
+
+        # Adjust rotation matrix for new canvas center
+        M[0, 2] += (new_w - w) / 2
+        M[1, 2] += (new_h - h) / 2
+
+        return cv2.warpAffine(
+            image, M, (new_w, new_h),
+            flags=cv2.INTER_CUBIC,
+            borderMode=cv2.BORDER_CONSTANT,
+            borderValue=255  # White background (grayscale)
+        )