def _add_safety_padding(self, image: np.ndarray, padding: int = 50) -> np.ndarray:
    """Return ``image`` surrounded by a constant white border.

    The border is ``padding`` pixels wide on all four sides. It gives
    content near the edges some slack so that a later deskew rotation
    does not push it off the canvas.

    Args:
        image: A 2-D (grayscale) array, or an array with a channel axis.
        padding: Border width in pixels, applied to every side.

    Returns:
        A new, larger array produced by ``cv2.copyMakeBorder``.
    """
    # A 2-D array is filled with a scalar; anything else gets a
    # three-component white (the colour path is treated as BGR).
    is_grayscale = len(image.shape) == 2
    white = 255 if is_grayscale else (255, 255, 255)
    return cv2.copyMakeBorder(
        image,
        padding, padding, padding, padding,
        cv2.BORDER_CONSTANT,
        value=white,
    )
def preprocess_for_tesseract(self, image: np.ndarray) -> np.ndarray:
    """Produce a binarized, deskewed variant of ``image`` aimed at Tesseract.

    Pipeline: pad -> grayscale -> rescale so the width lands in
    [2000, 3000] px -> deskew -> CLAHE -> non-local-means denoise ->
    Otsu threshold -> morphological close.

    Args:
        image: Input image. A 3-axis array is converted with
            ``COLOR_BGR2GRAY``; anything else is used as grayscale as-is.

    Returns:
        The thresholded single-channel image.
    """
    # White border first, so the deskew rotation cannot clip edge text.
    padded = self._add_safety_padding(image)

    if len(padded.shape) == 3:
        gray = cv2.cvtColor(padded, cv2.COLOR_BGR2GRAY)
    else:
        gray = padded.copy()

    # Bring the width into the 2000-3000 px band. Cubic interpolation is
    # used when enlarging, area interpolation when shrinking.
    _, width = gray.shape
    if width < 2000:
        target_width, interpolation = 2000, cv2.INTER_CUBIC
    elif width > 3000:
        target_width, interpolation = 3000, cv2.INTER_AREA
    else:
        target_width, interpolation = width, None
    if interpolation is not None:
        factor = target_width / width
        gray = cv2.resize(gray, None, fx=factor, fy=factor, interpolation=interpolation)

    straightened = self._deskew(gray)

    # Local contrast boost before denoising and thresholding.
    equalizer = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    contrasted = equalizer.apply(straightened)

    smoothed = cv2.fastNlMeansDenoising(
        contrasted, h=10, templateWindowSize=7, searchWindowSize=21
    )

    # Otsu picks the global threshold itself; the explicit 0 is ignored.
    _, binary = cv2.threshold(smoothed, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # NOTE(review): with a 1x1 structuring element this close looks like
    # an effective no-op - confirm whether a larger kernel was intended.
    structuring = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
    return cv2.morphologyEx(binary, cv2.MORPH_CLOSE, structuring)
# Payment method patterns - appears after TOTAL LEI, before TOTAL TVA
# Format: "CARD: 50.00" or "NUMERAR 100.00" or "PLATA CARD: 50.00"
#
# Each entry is a (regex, canonical_method, confidence) tuple. Capture
# group 1 of every regex holds the raw amount text (digits, whitespace,
# '.' and ','). Order matters: _extract_payment_methods keeps only the
# first positive amount found per canonical method, so earlier (stricter)
# patterns win over the truncation-recovery ones below.
# NOTE(review): the confidence value is unpacked by
# _extract_payment_methods but not used there - confirm whether another
# consumer relies on it.
PAYMENT_METHOD_PATTERNS = [
    # CARD with amount (high confidence); optional "PLATA " prefix
    (r'(?:PLATA\s+)?CARD\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
    # NUMERAR (cash) with amount
    (r'NUMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.95),
    # CASH alternative spelling, reported as NUMERAR
    (r'CASH\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.90),
    # Truncation recovery patterns (for OCR left-margin truncation issues).
    # The leading \b keeps them from matching inside an intact "CARD" /
    # "NUMERAR", which the patterns above already cover.
    # "RD" = truncated "CARD" (only 2 chars visible)
    (r'\bRD\s*:?\s*([\d\s.,]+)', 'CARD', 0.70),
    # "ARD" = truncated "CARD" (3 chars visible)
    (r'\bARD\s*:?\s*([\d\s.,]+)', 'CARD', 0.75),
    # "MERAR" = truncated "NUMERAR"
    (r'\bMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.70),
]
self._validate_tva_reverse(result.tva_entries, result.amount) + if not is_valid: + print(f"[TVA Reverse Validation] {msg}", flush=True) + return result def _extract_amount(self, text: str) -> Tuple[Optional[Decimal], float]: @@ -892,10 +926,18 @@ class ReceiptExtractor: except (ValueError, InvalidOperation): continue - # Calculate total - tva_total = None + # Extract TOTAL TVA BON as reference (separate from individual entries) + tva_bon_total = self._extract_total_tva_bon(normalized_text) + + # Calculate sum from entries + entries_sum = None if tva_entries: - tva_total = sum(entry['amount'] for entry in tva_entries) + entries_sum = sum(entry['amount'] for entry in tva_entries) + + # Validate and correct TVA values + tva_entries, tva_total = self._validate_and_correct_tva( + tva_entries, entries_sum, tva_bon_total + ) # Sort by code (A, B, C, D) tva_entries.sort(key=lambda x: x.get('code', 'Z')) @@ -929,6 +971,123 @@ class ReceiptExtractor: else: return 'A' # Default to standard rate + def _extract_total_tva_bon(self, text: str) -> Optional[Decimal]: + """ + Extract TOTAL TVA BON value separately as the reference. + This is the authoritative total TVA on the receipt. + + Handles OCR variations: TOTAL TVA BON, OTAL TUA BON, etc. 
+ """ + # Pattern for TOTAL TVA BON with amount after + patterns = [ + # Standard: TOTAL TVA BON: 14.92 + r'T?OTAL\s+T[VU][AR]\s+BON\s*:?\s*([\d]+[.,]\d{2})\b', + # Amount before: 14.92 OTAL TUA BON (OCR line break) + r'([\d]+[.,]\d{2})\s*\n?\s*T?OTAL\s+T[VU][AR]\s+BON', + # Amount on next line after TOTAL TVA BON + r'T?OTAL\s+T[VU][AR]\s+BON\s*\n\s*([\d]+[.,]\d{2})\b', + ] + + for pattern in patterns: + match = re.search(pattern, text, re.IGNORECASE) + if match: + try: + amount_str = self._normalize_number(match.group(1)) + amount = Decimal(amount_str) + if amount > 0: + return amount + except (InvalidOperation, ValueError): + continue + + return None + + def _validate_and_correct_tva( + self, + tva_entries: List[dict], + entries_sum: Optional[Decimal], + tva_bon_total: Optional[Decimal] + ) -> Tuple[List[dict], Optional[Decimal]]: + """ + Validate and correct TVA values. + + Rules: + 1. TVA cannot be greater than TOTAL amount (will be validated at higher level) + 2. Sum of TVA A + TVA B + ... should equal TOTAL TVA BON + 3. If single entry and sum != tva_bon_total, use tva_bon_total + 4. Detect and fix OCR concatenation errors (e.g., 14.921492 from 14.92 + 14.92) + """ + if not tva_entries: + return tva_entries, tva_bon_total + + # Check for OCR concatenation errors in individual entries + # Pattern: X.XX followed by another decimal (e.g., 14.921492 from 14.92 + 14.92) + corrected_entries = [] + for entry in tva_entries: + amount = entry['amount'] + amount_str = str(amount) + + # Check if amount looks like concatenated decimals + # e.g., 14.921492 could be 14.92 + 14.92 incorrectly joined + # or 32.3132.31 from 32.31 + 32.31 + if len(amount_str) > 6 and '.' 
in amount_str: + int_part, dec_part = amount_str.split('.') + + # If decimal part > 2 digits, it's likely concatenation + if len(dec_part) > 2: + # Try to extract the first valid decimal amount + # e.g., from 14.921492, extract 14.92 + try: + corrected_amount = Decimal(f"{int_part}.{dec_part[:2]}") + print(f"[TVA Validation] Corrected concatenation error: {amount} → {corrected_amount}", flush=True) + entry['amount'] = corrected_amount + except InvalidOperation: + pass + + corrected_entries.append(entry) + + tva_entries = corrected_entries + + # Recalculate sum after corrections + entries_sum = sum(entry['amount'] for entry in tva_entries) if tva_entries else None + + # Validate sum against TOTAL TVA BON + if tva_bon_total and entries_sum: + # Allow small tolerance for rounding (0.02) + tolerance = Decimal('0.02') + difference = abs(entries_sum - tva_bon_total) + + if difference > tolerance: + print(f"[TVA Validation] Sum mismatch: entries_sum={entries_sum}, tva_bon_total={tva_bon_total}", flush=True) + + # If single entry and sum doesn't match, use TOTAL TVA BON as reference + if len(tva_entries) == 1: + print(f"[TVA Validation] Single entry - using TOTAL TVA BON as reference: {tva_bon_total}", flush=True) + tva_entries[0]['amount'] = tva_bon_total + entries_sum = tva_bon_total + # If multiple entries and sum > tva_bon_total, likely double counting + elif entries_sum > tva_bon_total: + # Check if one entry is the duplicate of another + amounts = [e['amount'] for e in tva_entries] + unique_amounts = set(amounts) + if len(unique_amounts) < len(amounts): + # Duplicate detected - likely TOTAL TVA BON counted as separate entry + print(f"[TVA Validation] Duplicate TVA detected, removing duplicates", flush=True) + # Keep only unique entries + seen = set() + unique_entries = [] + for entry in tva_entries: + key = (entry.get('code'), entry['amount']) + if key not in seen: + seen.add(key) + unique_entries.append(entry) + tva_entries = unique_entries + entries_sum = 
sum(e['amount'] for e in tva_entries) + + # Final total + tva_total = entries_sum if entries_sum else tva_bon_total + + return tva_entries, tva_total + def _detect_tva_percent(self, text: str) -> Optional[int]: """Detect TVA percentage from text content.""" # Look for common Romanian TVA percentages @@ -944,6 +1103,48 @@ class ReceiptExtractor: return 5 return None + def _validate_tva_reverse( + self, + tva_entries: List[dict], + total_amount: Optional[Decimal] + ) -> Tuple[bool, Optional[Decimal], str]: + """ + Reverse TVA validation: from TVA amount and rate, calculate expected total. + + Formula: + base = tva_amount / (rate/100) + expected_total = sum(base + tva_amount) for all entries + + Returns (is_valid, expected_total, message) + """ + if not tva_entries or not total_amount: + return True, None, "Insufficient data for reverse validation" + + expected_total = Decimal('0') + for entry in tva_entries: + tva_amount = entry['amount'] + rate = Decimal(str(entry['percent'])) + + if rate > 0: + # Calculate base from TVA: base = tva / (rate/100) + base = tva_amount / (rate / Decimal('100')) + expected_total += base + tva_amount + else: + # 0% TVA - can't calculate base, skip + pass + + if expected_total == 0: + return True, None, "Cannot calculate expected total (0% TVA only)" + + # Tolerance: max(0.50 RON, 1% of total) + tolerance = max(Decimal('0.50'), total_amount * Decimal('0.01')) + difference = abs(expected_total - total_amount) + + if difference <= tolerance: + return True, expected_total, f"TVA reverse validation passed (expected: {expected_total}, actual: {total_amount}, diff: {difference})" + else: + return False, expected_total, f"TVA reverse validation WARNING: expected {expected_total}, actual {total_amount}, diff {difference}" + def _extract_items_count(self, text: str) -> Optional[int]: """Extract number of items/articles from receipt.""" for pattern, _ in self.ITEMS_COUNT_PATTERNS: @@ -994,3 +1195,45 @@ class ReceiptExtractor: return address if 
def _extract_payment_methods(self, text: str) -> List[dict]:
    """Extract payment methods (CARD/NUMERAR) and their amounts.

    Payment lines are looked for in the region after "TOTAL LEI" and
    before the next "TOTAL TVA" heading. If "TOTAL LEI" is absent, the
    whole text is searched.

    Args:
        text: Receipt text (matching is case-insensitive).

    Returns:
        A list of ``{'method': 'CARD' | 'NUMERAR', 'amount': Decimal}``,
        with at most one entry per method: the first strictly positive
        amount found, trying ``PAYMENT_METHOD_PATTERNS`` in order.
    """
    payment_methods: List[dict] = []
    seen_methods = set()

    # Re-join decimals that OCR split with whitespace ("50. 00" -> "50.00").
    normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text)

    total_lei_match = re.search(r'TOTAL\s+LEI\s*([\d\s.,]+)', normalized_text, re.IGNORECASE)

    if total_lei_match:
        start_pos = total_lei_match.end()
        # Look for the TVA heading only AFTER "TOTAL LEI". Searching from
        # the start of the text could pick an earlier occurrence, giving
        # end < start and an empty region that drops every payment line.
        tva_heading = re.compile(r'TOTAL\s+T[VU][AR]', re.IGNORECASE)
        total_tva_match = tva_heading.search(normalized_text, start_pos)
        end_pos = total_tva_match.start() if total_tva_match else len(normalized_text)
        search_region = normalized_text[start_pos:end_pos]
    else:
        search_region = normalized_text  # Fallback to full text

    # The third tuple element (confidence) is not needed here.
    for pattern, method, _ in self.PAYMENT_METHOD_PATTERNS:
        for match in re.finditer(pattern, search_region, re.IGNORECASE):
            if method in seen_methods:
                # An earlier (higher-priority) pattern already supplied
                # this method; later matches are ignored.
                break
            try:
                amount_str = match.group(1).replace(' ', '')
                amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
                amount = Decimal(amount_str)
            except (InvalidOperation, ValueError):
                # Capture held no parseable number (e.g. only whitespace).
                continue
            if amount > 0:
                payment_methods.append({
                    'method': method,
                    'amount': amount
                })
                seen_methods.add(method)

    return payment_methods