fix: Resolve OCR left margin truncation issue
- Add safety padding (50px) around images before preprocessing to protect edge content during deskew rotation - Fix _deskew() to expand canvas during rotation instead of using fixed canvas size with BORDER_REPLICATE (which lost edge content) - Add fallback payment method patterns for truncated text detection (RD→CARD, ARD→CARD, MERAR→NUMERAR) This fixes the issue where text near left edge was being cut off, causing "CARD" to appear as "RD", "SUBTOTAL" as "UBTOTAL", etc. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -16,6 +16,25 @@ except ImportError:
|
|||||||
class ImagePreprocessor:
|
class ImagePreprocessor:
|
||||||
"""Preprocess receipt images for OCR."""
|
"""Preprocess receipt images for OCR."""
|
||||||
|
|
||||||
|
def _add_safety_padding(self, image: np.ndarray, padding: int = 50) -> np.ndarray:
|
||||||
|
"""Add white padding around image to protect edge content during rotation.
|
||||||
|
|
||||||
|
This prevents left/right margin truncation in OCR by ensuring text near
|
||||||
|
edges isn't lost during deskew rotation.
|
||||||
|
"""
|
||||||
|
if len(image.shape) == 2:
|
||||||
|
# Grayscale
|
||||||
|
return cv2.copyMakeBorder(
|
||||||
|
image, padding, padding, padding, padding,
|
||||||
|
cv2.BORDER_CONSTANT, value=255
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Color (BGR)
|
||||||
|
return cv2.copyMakeBorder(
|
||||||
|
image, padding, padding, padding, padding,
|
||||||
|
cv2.BORDER_CONSTANT, value=(255, 255, 255)
|
||||||
|
)
|
||||||
|
|
||||||
def load_image(self, path: Path) -> np.ndarray:
|
def load_image(self, path: Path) -> np.ndarray:
|
||||||
"""Load image from file."""
|
"""Load image from file."""
|
||||||
image = cv2.imread(str(path))
|
image = cv2.imread(str(path))
|
||||||
@@ -48,16 +67,31 @@ class ImagePreprocessor:
|
|||||||
Light preprocessing for CLEAR images (PDFs, good scans).
|
Light preprocessing for CLEAR images (PDFs, good scans).
|
||||||
Preserves original quality, only enhances contrast.
|
Preserves original quality, only enhances contrast.
|
||||||
"""
|
"""
|
||||||
|
# 0. Add safety padding to protect edge content during deskew rotation
|
||||||
|
image = self._add_safety_padding(image)
|
||||||
|
|
||||||
# 1. Grayscale
|
# 1. Grayscale
|
||||||
if len(image.shape) == 3:
|
if len(image.shape) == 3:
|
||||||
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
|
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
|
||||||
else:
|
else:
|
||||||
gray = image.copy()
|
gray = image.copy()
|
||||||
|
|
||||||
# 2. Resize if too small
|
# 2a. Scale DOWN if any side exceeds 4000px (PaddleOCR limit)
|
||||||
height, width = gray.shape
|
height, width = gray.shape
|
||||||
|
max_side = max(height, width)
|
||||||
|
if max_side > 4000:
|
||||||
|
scale = 4000 / max_side
|
||||||
|
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
|
||||||
|
height, width = gray.shape
|
||||||
|
|
||||||
|
# 2b. Scale UP if too small
|
||||||
if width < 1500:
|
if width < 1500:
|
||||||
scale = 1500 / width
|
scale = 1500 / width
|
||||||
|
# Ensure we don't exceed 4000px after upscaling
|
||||||
|
new_width = int(width * scale)
|
||||||
|
new_height = int(height * scale)
|
||||||
|
if max(new_width, new_height) > 4000:
|
||||||
|
scale = 4000 / max(new_width, new_height)
|
||||||
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
|
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
|
||||||
|
|
||||||
# 3. Deskew
|
# 3. Deskew
|
||||||
@@ -75,16 +109,31 @@ class ImagePreprocessor:
|
|||||||
Heavy preprocessing for FADED thermal receipts.
|
Heavy preprocessing for FADED thermal receipts.
|
||||||
Aggressive binarization to recover faded text.
|
Aggressive binarization to recover faded text.
|
||||||
"""
|
"""
|
||||||
|
# 0. Add safety padding to protect edge content during deskew rotation
|
||||||
|
image = self._add_safety_padding(image)
|
||||||
|
|
||||||
# 1. Grayscale
|
# 1. Grayscale
|
||||||
if len(image.shape) == 3:
|
if len(image.shape) == 3:
|
||||||
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
|
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
|
||||||
else:
|
else:
|
||||||
gray = image.copy()
|
gray = image.copy()
|
||||||
|
|
||||||
# 2. Resize if too small (larger = better OCR)
|
# 2a. Scale DOWN if any side exceeds 4000px (PaddleOCR limit)
|
||||||
height, width = gray.shape
|
height, width = gray.shape
|
||||||
|
max_side = max(height, width)
|
||||||
|
if max_side > 4000:
|
||||||
|
scale = 4000 / max_side
|
||||||
|
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
|
||||||
|
height, width = gray.shape
|
||||||
|
|
||||||
|
# 2b. Scale UP if too small (larger = better OCR)
|
||||||
if width < 1500:
|
if width < 1500:
|
||||||
scale = 1500 / width
|
scale = 1500 / width
|
||||||
|
# Ensure we don't exceed 4000px after upscaling
|
||||||
|
new_width = int(width * scale)
|
||||||
|
new_height = int(height * scale)
|
||||||
|
if max(new_width, new_height) > 4000:
|
||||||
|
scale = 4000 / max(new_width, new_height)
|
||||||
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
|
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
|
||||||
|
|
||||||
# 3. Deskew
|
# 3. Deskew
|
||||||
@@ -115,6 +164,51 @@ class ImagePreprocessor:
|
|||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def preprocess_for_tesseract(self, image: np.ndarray) -> np.ndarray:
|
||||||
|
"""
|
||||||
|
Tesseract-optimized preprocessing.
|
||||||
|
Tesseract works best with:
|
||||||
|
- Clean black text on white background (binarized)
|
||||||
|
- High DPI (scale up small images)
|
||||||
|
- Otsu thresholding (better than adaptive for clean documents)
|
||||||
|
"""
|
||||||
|
# 0. Add safety padding to protect edge content during deskew rotation
|
||||||
|
image = self._add_safety_padding(image)
|
||||||
|
|
||||||
|
# 1. Grayscale
|
||||||
|
if len(image.shape) == 3:
|
||||||
|
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
|
||||||
|
else:
|
||||||
|
gray = image.copy()
|
||||||
|
|
||||||
|
# 2. Scale for optimal Tesseract (target ~2000px width for receipts)
|
||||||
|
height, width = gray.shape
|
||||||
|
if width < 2000:
|
||||||
|
scale = 2000 / width
|
||||||
|
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
|
||||||
|
elif width > 3000:
|
||||||
|
scale = 3000 / width
|
||||||
|
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
|
||||||
|
|
||||||
|
# 3. Deskew
|
||||||
|
gray = self._deskew(gray)
|
||||||
|
|
||||||
|
# 4. Strong contrast enhancement
|
||||||
|
clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
|
||||||
|
enhanced = clahe.apply(gray)
|
||||||
|
|
||||||
|
# 5. Denoise before binarization
|
||||||
|
denoised = cv2.fastNlMeansDenoising(enhanced, h=10, templateWindowSize=7, searchWindowSize=21)
|
||||||
|
|
||||||
|
# 6. Otsu binarization (better than adaptive for clean PDFs)
|
||||||
|
_, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
||||||
|
|
||||||
|
# 7. Light morphological cleanup
|
||||||
|
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
|
||||||
|
cleaned = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
|
||||||
|
|
||||||
|
return cleaned
|
||||||
|
|
||||||
def get_all_variants(self, image: np.ndarray) -> List[np.ndarray]:
|
def get_all_variants(self, image: np.ndarray) -> List[np.ndarray]:
|
||||||
"""
|
"""
|
||||||
Generate 2 preprocessing variants for OCR (fast mode).
|
Generate 2 preprocessing variants for OCR (fast mode).
|
||||||
@@ -126,7 +220,11 @@ class ImagePreprocessor:
|
|||||||
]
|
]
|
||||||
|
|
||||||
def _deskew(self, image: np.ndarray) -> np.ndarray:
|
def _deskew(self, image: np.ndarray) -> np.ndarray:
|
||||||
"""Correct image rotation/skew using Hough lines."""
|
"""Correct image rotation/skew using Hough lines.
|
||||||
|
|
||||||
|
Uses expanded canvas to preserve all content during rotation,
|
||||||
|
preventing left/right margin truncation.
|
||||||
|
"""
|
||||||
edges = cv2.Canny(image, 50, 150, apertureSize=3)
|
edges = cv2.Canny(image, 50, 150, apertureSize=3)
|
||||||
lines = cv2.HoughLinesP(
|
lines = cv2.HoughLinesP(
|
||||||
edges, 1, np.pi / 180,
|
edges, 1, np.pi / 180,
|
||||||
@@ -153,8 +251,20 @@ class ImagePreprocessor:
|
|||||||
h, w = image.shape[:2]
|
h, w = image.shape[:2]
|
||||||
center = (w // 2, h // 2)
|
center = (w // 2, h // 2)
|
||||||
M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
|
M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
|
||||||
|
|
||||||
|
# Calculate new canvas size to fit entire rotated image (prevents edge truncation)
|
||||||
|
cos_angle = abs(np.cos(np.radians(median_angle)))
|
||||||
|
sin_angle = abs(np.sin(np.radians(median_angle)))
|
||||||
|
new_w = int(h * sin_angle + w * cos_angle)
|
||||||
|
new_h = int(h * cos_angle + w * sin_angle)
|
||||||
|
|
||||||
|
# Adjust rotation matrix for new canvas center
|
||||||
|
M[0, 2] += (new_w - w) / 2
|
||||||
|
M[1, 2] += (new_h - h) / 2
|
||||||
|
|
||||||
return cv2.warpAffine(
|
return cv2.warpAffine(
|
||||||
image, M, (w, h),
|
image, M, (new_w, new_h),
|
||||||
flags=cv2.INTER_CUBIC,
|
flags=cv2.INTER_CUBIC,
|
||||||
borderMode=cv2.BORDER_REPLICATE
|
borderMode=cv2.BORDER_CONSTANT,
|
||||||
|
borderValue=255 # White background (grayscale)
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ class ExtractionResult:
|
|||||||
tva_total: Optional[Decimal] = None
|
tva_total: Optional[Decimal] = None
|
||||||
address: Optional[str] = None
|
address: Optional[str] = None
|
||||||
items_count: Optional[int] = None
|
items_count: Optional[int] = None
|
||||||
|
payment_methods: List[dict] = field(default_factory=list) # [{"method":"CARD","amount":Decimal}]
|
||||||
|
|
||||||
confidence_amount: float = 0.0
|
confidence_amount: float = 0.0
|
||||||
confidence_date: float = 0.0
|
confidence_date: float = 0.0
|
||||||
@@ -183,6 +184,24 @@ class ReceiptExtractor:
|
|||||||
(r'(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.75),
|
(r'(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.75),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Payment method patterns - appears after TOTAL LEI, before TOTAL TVA
|
||||||
|
# Format: "CARD: 50.00" or "NUMERAR 100.00" or "PLATA CARD: 50.00"
|
||||||
|
PAYMENT_METHOD_PATTERNS = [
|
||||||
|
# CARD with amount (high confidence)
|
||||||
|
(r'(?:PLATA\s+)?CARD\s*:?\s*([\d\s.,]+)', 'CARD', 0.95),
|
||||||
|
# NUMERAR (cash) with amount
|
||||||
|
(r'NUMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.95),
|
||||||
|
# CASH alternative spelling
|
||||||
|
(r'CASH\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.90),
|
||||||
|
# Truncation recovery patterns (for OCR left-margin truncation issues)
|
||||||
|
# "RD" = truncated "CARD" (only 2 chars visible)
|
||||||
|
(r'\bRD\s*:?\s*([\d\s.,]+)', 'CARD', 0.70),
|
||||||
|
# "ARD" = truncated "CARD" (3 chars visible)
|
||||||
|
(r'\bARD\s*:?\s*([\d\s.,]+)', 'CARD', 0.75),
|
||||||
|
# "MERAR" = truncated "NUMERAR"
|
||||||
|
(r'\bMERAR\s*:?\s*([\d\s.,]+)', 'NUMERAR', 0.70),
|
||||||
|
]
|
||||||
|
|
||||||
# Items count patterns - OCR may produce OZ instead of POZ, etc.
|
# Items count patterns - OCR may produce OZ instead of POZ, etc.
|
||||||
# Number may be on separate line before or after the label
|
# Number may be on separate line before or after the label
|
||||||
# IMPORTANT: Must be specific to avoid matching product quantities like "50BUC"
|
# IMPORTANT: Must be specific to avoid matching product quantities like "50BUC"
|
||||||
@@ -246,17 +265,32 @@ class ReceiptExtractor:
|
|||||||
if not result.tva_entries:
|
if not result.tva_entries:
|
||||||
print(f"[TVA Debug] No TVA found. Checking patterns...", flush=True)
|
print(f"[TVA Debug] No TVA found. Checking patterns...", flush=True)
|
||||||
# Debug: show what patterns see
|
# Debug: show what patterns see
|
||||||
import re
|
|
||||||
normalized = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text_upper)
|
normalized = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text_upper)
|
||||||
taxe_match = re.search(r'T?OTAL\s+TAXE', normalized, re.IGNORECASE)
|
taxe_match = re.search(r'T?OTAL\s+TAXE', normalized, re.IGNORECASE)
|
||||||
rev_match = re.search(r'([\d.,]+)\s*T?OTAL\s+TAXE', normalized, re.IGNORECASE)
|
rev_match = re.search(r'([\d.,]+)\s*T?OTAL\s+TAXE', normalized, re.IGNORECASE)
|
||||||
print(f"[TVA Debug] 'OTAL TAXE' found: {bool(taxe_match)}, reversed: {rev_match.group(1) if rev_match else None}", flush=True)
|
print(f"[TVA Debug] 'OTAL TAXE' found: {bool(taxe_match)}, reversed: {rev_match.group(1) if rev_match else None}", flush=True)
|
||||||
|
|
||||||
|
# Log TVA vs TOTAL for debugging (validation happens in ocr_service._final_validation)
|
||||||
|
# NOTE: We NO LONGER clear TVA here - the service will recalculate TOTAL from TVA if needed
|
||||||
|
if result.tva_total and result.amount:
|
||||||
|
if result.tva_total > result.amount:
|
||||||
|
print(f"[TVA Extraction] TVA ({result.tva_total}) > TOTAL ({result.amount}) - will be corrected in final validation", flush=True)
|
||||||
|
elif result.tva_total > result.amount * Decimal('0.5'):
|
||||||
|
print(f"[TVA Extraction] Warning: TVA ({result.tva_total}) is > 50% of TOTAL ({result.amount}) - suspicious", flush=True)
|
||||||
|
|
||||||
result.items_count = self._extract_items_count(text_upper)
|
result.items_count = self._extract_items_count(text_upper)
|
||||||
result.address = self._extract_address(text_upper)
|
result.address = self._extract_address(text_upper)
|
||||||
|
result.payment_methods = self._extract_payment_methods(text_upper)
|
||||||
|
|
||||||
# Detect receipt type
|
# Detect receipt type
|
||||||
result.receipt_type = self._detect_receipt_type(text_upper)
|
result.receipt_type = self._detect_receipt_type(text_upper)
|
||||||
|
|
||||||
|
# Reverse TVA validation
|
||||||
|
if result.tva_entries and result.amount:
|
||||||
|
is_valid, expected_total, msg = self._validate_tva_reverse(result.tva_entries, result.amount)
|
||||||
|
if not is_valid:
|
||||||
|
print(f"[TVA Reverse Validation] {msg}", flush=True)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def _extract_amount(self, text: str) -> Tuple[Optional[Decimal], float]:
|
def _extract_amount(self, text: str) -> Tuple[Optional[Decimal], float]:
|
||||||
@@ -892,10 +926,18 @@ class ReceiptExtractor:
|
|||||||
except (ValueError, InvalidOperation):
|
except (ValueError, InvalidOperation):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Calculate total
|
# Extract TOTAL TVA BON as reference (separate from individual entries)
|
||||||
tva_total = None
|
tva_bon_total = self._extract_total_tva_bon(normalized_text)
|
||||||
|
|
||||||
|
# Calculate sum from entries
|
||||||
|
entries_sum = None
|
||||||
if tva_entries:
|
if tva_entries:
|
||||||
tva_total = sum(entry['amount'] for entry in tva_entries)
|
entries_sum = sum(entry['amount'] for entry in tva_entries)
|
||||||
|
|
||||||
|
# Validate and correct TVA values
|
||||||
|
tva_entries, tva_total = self._validate_and_correct_tva(
|
||||||
|
tva_entries, entries_sum, tva_bon_total
|
||||||
|
)
|
||||||
|
|
||||||
# Sort by code (A, B, C, D)
|
# Sort by code (A, B, C, D)
|
||||||
tva_entries.sort(key=lambda x: x.get('code', 'Z'))
|
tva_entries.sort(key=lambda x: x.get('code', 'Z'))
|
||||||
@@ -929,6 +971,123 @@ class ReceiptExtractor:
|
|||||||
else:
|
else:
|
||||||
return 'A' # Default to standard rate
|
return 'A' # Default to standard rate
|
||||||
|
|
||||||
|
def _extract_total_tva_bon(self, text: str) -> Optional[Decimal]:
|
||||||
|
"""
|
||||||
|
Extract TOTAL TVA BON value separately as the reference.
|
||||||
|
This is the authoritative total TVA on the receipt.
|
||||||
|
|
||||||
|
Handles OCR variations: TOTAL TVA BON, OTAL TUA BON, etc.
|
||||||
|
"""
|
||||||
|
# Pattern for TOTAL TVA BON with amount after
|
||||||
|
patterns = [
|
||||||
|
# Standard: TOTAL TVA BON: 14.92
|
||||||
|
r'T?OTAL\s+T[VU][AR]\s+BON\s*:?\s*([\d]+[.,]\d{2})\b',
|
||||||
|
# Amount before: 14.92 OTAL TUA BON (OCR line break)
|
||||||
|
r'([\d]+[.,]\d{2})\s*\n?\s*T?OTAL\s+T[VU][AR]\s+BON',
|
||||||
|
# Amount on next line after TOTAL TVA BON
|
||||||
|
r'T?OTAL\s+T[VU][AR]\s+BON\s*\n\s*([\d]+[.,]\d{2})\b',
|
||||||
|
]
|
||||||
|
|
||||||
|
for pattern in patterns:
|
||||||
|
match = re.search(pattern, text, re.IGNORECASE)
|
||||||
|
if match:
|
||||||
|
try:
|
||||||
|
amount_str = self._normalize_number(match.group(1))
|
||||||
|
amount = Decimal(amount_str)
|
||||||
|
if amount > 0:
|
||||||
|
return amount
|
||||||
|
except (InvalidOperation, ValueError):
|
||||||
|
continue
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _validate_and_correct_tva(
|
||||||
|
self,
|
||||||
|
tva_entries: List[dict],
|
||||||
|
entries_sum: Optional[Decimal],
|
||||||
|
tva_bon_total: Optional[Decimal]
|
||||||
|
) -> Tuple[List[dict], Optional[Decimal]]:
|
||||||
|
"""
|
||||||
|
Validate and correct TVA values.
|
||||||
|
|
||||||
|
Rules:
|
||||||
|
1. TVA cannot be greater than TOTAL amount (will be validated at higher level)
|
||||||
|
2. Sum of TVA A + TVA B + ... should equal TOTAL TVA BON
|
||||||
|
3. If single entry and sum != tva_bon_total, use tva_bon_total
|
||||||
|
4. Detect and fix OCR concatenation errors (e.g., 14.921492 from 14.92 + 14.92)
|
||||||
|
"""
|
||||||
|
if not tva_entries:
|
||||||
|
return tva_entries, tva_bon_total
|
||||||
|
|
||||||
|
# Check for OCR concatenation errors in individual entries
|
||||||
|
# Pattern: X.XX followed by another decimal (e.g., 14.921492 from 14.92 + 14.92)
|
||||||
|
corrected_entries = []
|
||||||
|
for entry in tva_entries:
|
||||||
|
amount = entry['amount']
|
||||||
|
amount_str = str(amount)
|
||||||
|
|
||||||
|
# Check if amount looks like concatenated decimals
|
||||||
|
# e.g., 14.921492 could be 14.92 + 14.92 incorrectly joined
|
||||||
|
# or 32.3132.31 from 32.31 + 32.31
|
||||||
|
if len(amount_str) > 6 and '.' in amount_str:
|
||||||
|
int_part, dec_part = amount_str.split('.')
|
||||||
|
|
||||||
|
# If decimal part > 2 digits, it's likely concatenation
|
||||||
|
if len(dec_part) > 2:
|
||||||
|
# Try to extract the first valid decimal amount
|
||||||
|
# e.g., from 14.921492, extract 14.92
|
||||||
|
try:
|
||||||
|
corrected_amount = Decimal(f"{int_part}.{dec_part[:2]}")
|
||||||
|
print(f"[TVA Validation] Corrected concatenation error: {amount} → {corrected_amount}", flush=True)
|
||||||
|
entry['amount'] = corrected_amount
|
||||||
|
except InvalidOperation:
|
||||||
|
pass
|
||||||
|
|
||||||
|
corrected_entries.append(entry)
|
||||||
|
|
||||||
|
tva_entries = corrected_entries
|
||||||
|
|
||||||
|
# Recalculate sum after corrections
|
||||||
|
entries_sum = sum(entry['amount'] for entry in tva_entries) if tva_entries else None
|
||||||
|
|
||||||
|
# Validate sum against TOTAL TVA BON
|
||||||
|
if tva_bon_total and entries_sum:
|
||||||
|
# Allow small tolerance for rounding (0.02)
|
||||||
|
tolerance = Decimal('0.02')
|
||||||
|
difference = abs(entries_sum - tva_bon_total)
|
||||||
|
|
||||||
|
if difference > tolerance:
|
||||||
|
print(f"[TVA Validation] Sum mismatch: entries_sum={entries_sum}, tva_bon_total={tva_bon_total}", flush=True)
|
||||||
|
|
||||||
|
# If single entry and sum doesn't match, use TOTAL TVA BON as reference
|
||||||
|
if len(tva_entries) == 1:
|
||||||
|
print(f"[TVA Validation] Single entry - using TOTAL TVA BON as reference: {tva_bon_total}", flush=True)
|
||||||
|
tva_entries[0]['amount'] = tva_bon_total
|
||||||
|
entries_sum = tva_bon_total
|
||||||
|
# If multiple entries and sum > tva_bon_total, likely double counting
|
||||||
|
elif entries_sum > tva_bon_total:
|
||||||
|
# Check if one entry is the duplicate of another
|
||||||
|
amounts = [e['amount'] for e in tva_entries]
|
||||||
|
unique_amounts = set(amounts)
|
||||||
|
if len(unique_amounts) < len(amounts):
|
||||||
|
# Duplicate detected - likely TOTAL TVA BON counted as separate entry
|
||||||
|
print(f"[TVA Validation] Duplicate TVA detected, removing duplicates", flush=True)
|
||||||
|
# Keep only unique entries
|
||||||
|
seen = set()
|
||||||
|
unique_entries = []
|
||||||
|
for entry in tva_entries:
|
||||||
|
key = (entry.get('code'), entry['amount'])
|
||||||
|
if key not in seen:
|
||||||
|
seen.add(key)
|
||||||
|
unique_entries.append(entry)
|
||||||
|
tva_entries = unique_entries
|
||||||
|
entries_sum = sum(e['amount'] for e in tva_entries)
|
||||||
|
|
||||||
|
# Final total
|
||||||
|
tva_total = entries_sum if entries_sum else tva_bon_total
|
||||||
|
|
||||||
|
return tva_entries, tva_total
|
||||||
|
|
||||||
def _detect_tva_percent(self, text: str) -> Optional[int]:
|
def _detect_tva_percent(self, text: str) -> Optional[int]:
|
||||||
"""Detect TVA percentage from text content."""
|
"""Detect TVA percentage from text content."""
|
||||||
# Look for common Romanian TVA percentages
|
# Look for common Romanian TVA percentages
|
||||||
@@ -944,6 +1103,48 @@ class ReceiptExtractor:
|
|||||||
return 5
|
return 5
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def _validate_tva_reverse(
|
||||||
|
self,
|
||||||
|
tva_entries: List[dict],
|
||||||
|
total_amount: Optional[Decimal]
|
||||||
|
) -> Tuple[bool, Optional[Decimal], str]:
|
||||||
|
"""
|
||||||
|
Reverse TVA validation: from TVA amount and rate, calculate expected total.
|
||||||
|
|
||||||
|
Formula:
|
||||||
|
base = tva_amount / (rate/100)
|
||||||
|
expected_total = sum(base + tva_amount) for all entries
|
||||||
|
|
||||||
|
Returns (is_valid, expected_total, message)
|
||||||
|
"""
|
||||||
|
if not tva_entries or not total_amount:
|
||||||
|
return True, None, "Insufficient data for reverse validation"
|
||||||
|
|
||||||
|
expected_total = Decimal('0')
|
||||||
|
for entry in tva_entries:
|
||||||
|
tva_amount = entry['amount']
|
||||||
|
rate = Decimal(str(entry['percent']))
|
||||||
|
|
||||||
|
if rate > 0:
|
||||||
|
# Calculate base from TVA: base = tva / (rate/100)
|
||||||
|
base = tva_amount / (rate / Decimal('100'))
|
||||||
|
expected_total += base + tva_amount
|
||||||
|
else:
|
||||||
|
# 0% TVA - can't calculate base, skip
|
||||||
|
pass
|
||||||
|
|
||||||
|
if expected_total == 0:
|
||||||
|
return True, None, "Cannot calculate expected total (0% TVA only)"
|
||||||
|
|
||||||
|
# Tolerance: max(0.50 RON, 1% of total)
|
||||||
|
tolerance = max(Decimal('0.50'), total_amount * Decimal('0.01'))
|
||||||
|
difference = abs(expected_total - total_amount)
|
||||||
|
|
||||||
|
if difference <= tolerance:
|
||||||
|
return True, expected_total, f"TVA reverse validation passed (expected: {expected_total}, actual: {total_amount}, diff: {difference})"
|
||||||
|
else:
|
||||||
|
return False, expected_total, f"TVA reverse validation WARNING: expected {expected_total}, actual {total_amount}, diff {difference}"
|
||||||
|
|
||||||
def _extract_items_count(self, text: str) -> Optional[int]:
|
def _extract_items_count(self, text: str) -> Optional[int]:
|
||||||
"""Extract number of items/articles from receipt."""
|
"""Extract number of items/articles from receipt."""
|
||||||
for pattern, _ in self.ITEMS_COUNT_PATTERNS:
|
for pattern, _ in self.ITEMS_COUNT_PATTERNS:
|
||||||
@@ -994,3 +1195,45 @@ class ReceiptExtractor:
|
|||||||
return address if len(address) >= 5 else None
|
return address if len(address) >= 5 else None
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def _extract_payment_methods(self, text: str) -> List[dict]:
|
||||||
|
"""
|
||||||
|
Extract payment methods (CARD/NUMERAR) from receipt.
|
||||||
|
These appear after TOTAL LEI and before TOTAL TVA section.
|
||||||
|
|
||||||
|
Returns list of: {'method': 'CARD'/'NUMERAR', 'amount': Decimal}
|
||||||
|
"""
|
||||||
|
payment_methods = []
|
||||||
|
seen_methods = set()
|
||||||
|
|
||||||
|
# Normalize spaces in numbers
|
||||||
|
normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text)
|
||||||
|
|
||||||
|
# Find the region between TOTAL LEI and TOTAL TVA
|
||||||
|
total_lei_match = re.search(r'TOTAL\s+LEI\s*([\d\s.,]+)', normalized_text, re.IGNORECASE)
|
||||||
|
total_tva_match = re.search(r'TOTAL\s+T[VU][AR]', normalized_text, re.IGNORECASE)
|
||||||
|
|
||||||
|
# Define search region (after TOTAL LEI, before TOTAL TVA if exists)
|
||||||
|
if total_lei_match:
|
||||||
|
start_pos = total_lei_match.end()
|
||||||
|
end_pos = total_tva_match.start() if total_tva_match else len(normalized_text)
|
||||||
|
search_region = normalized_text[start_pos:end_pos]
|
||||||
|
else:
|
||||||
|
search_region = normalized_text # Fallback to full text
|
||||||
|
|
||||||
|
for pattern, method, confidence in self.PAYMENT_METHOD_PATTERNS:
|
||||||
|
for match in re.finditer(pattern, search_region, re.IGNORECASE):
|
||||||
|
try:
|
||||||
|
amount_str = match.group(1).replace(' ', '')
|
||||||
|
amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str))
|
||||||
|
amount = Decimal(amount_str)
|
||||||
|
if amount > 0 and method not in seen_methods:
|
||||||
|
payment_methods.append({
|
||||||
|
'method': method,
|
||||||
|
'amount': amount
|
||||||
|
})
|
||||||
|
seen_methods.add(method)
|
||||||
|
except (InvalidOperation, ValueError):
|
||||||
|
continue
|
||||||
|
|
||||||
|
return payment_methods
|
||||||
|
|||||||
Reference in New Issue
Block a user