feat(data-entry): Add unified receipt form with OCR confidence tracking
New unified receipt creation system with:
- UnifiedReceiptForm component with inline OCR preview and confidence indicators
- Compact upload zone with drag-drop and camera support
- TVA and Payment fields with dynamic add/remove
- Supplier dual-field with autocomplete and OCR hint
- Receipt form sections with collapsible auxiliary data

Backend OCR improvements:
- Add confidence_tva and confidence_payment to extraction results
- Update TVA extraction to return confidence scores
- Include TVA (15%) and payment (10%) in overall_confidence calculation

Also includes:
- CSS design system rules documentation
- Port check helper function for service scripts
- Expanded design tokens documentation in CLAUDE.md

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -901,12 +901,15 @@ def _merge_extractions(primary, secondary):
|
||||
# Always use primary (docTR) - higher quality OCR
|
||||
result.tva_entries = primary.tva_entries
|
||||
result.tva_total = primary.tva_total
|
||||
result.confidence_tva = getattr(primary, 'confidence_tva', 0.0)
|
||||
elif primary.tva_entries:
|
||||
result.tva_entries = primary.tva_entries
|
||||
result.tva_total = primary.tva_total
|
||||
result.confidence_tva = getattr(primary, 'confidence_tva', 0.0)
|
||||
elif secondary.tva_entries:
|
||||
result.tva_entries = secondary.tva_entries
|
||||
result.tva_total = secondary.tva_total
|
||||
result.confidence_tva = getattr(secondary, 'confidence_tva', 0.0)
|
||||
|
||||
# Other fields - prefer primary
|
||||
result.receipt_number = primary.receipt_number or secondary.receipt_number
|
||||
@@ -915,7 +918,13 @@ def _merge_extractions(primary, secondary):
|
||||
result.partner_name = primary.partner_name or secondary.partner_name
|
||||
result.address = primary.address or secondary.address
|
||||
result.items_count = primary.items_count or secondary.items_count
|
||||
result.payment_methods = primary.payment_methods or secondary.payment_methods
|
||||
# Payment methods with confidence
|
||||
if primary.payment_methods:
|
||||
result.payment_methods = primary.payment_methods
|
||||
result.confidence_payment = getattr(primary, 'confidence_payment', 0.0)
|
||||
elif secondary.payment_methods:
|
||||
result.payment_methods = secondary.payment_methods
|
||||
result.confidence_payment = getattr(secondary, 'confidence_payment', 0.0)
|
||||
result.suggested_payment_mode = getattr(primary, 'suggested_payment_mode', None) or getattr(secondary, 'suggested_payment_mode', None)
|
||||
|
||||
# Client fields
|
||||
@@ -970,6 +979,11 @@ def _complement_extraction(primary, secondary):
|
||||
if not primary.tva_entries and secondary.tva_entries:
|
||||
primary.tva_entries = secondary.tva_entries
|
||||
primary.tva_total = secondary.tva_total
|
||||
primary.confidence_tva = getattr(secondary, 'confidence_tva', 0.0)
|
||||
|
||||
if not getattr(primary, 'payment_methods', None) and getattr(secondary, 'payment_methods', None):
|
||||
primary.payment_methods = secondary.payment_methods
|
||||
primary.confidence_payment = getattr(secondary, 'confidence_payment', 0.0)
|
||||
|
||||
if not primary.receipt_number and secondary.receipt_number:
|
||||
primary.receipt_number = secondary.receipt_number
|
||||
@@ -1024,6 +1038,8 @@ def _extraction_to_dict(extraction) -> dict:
|
||||
"confidence_date": extraction.confidence_date,
|
||||
"confidence_vendor": extraction.confidence_vendor,
|
||||
"confidence_client": getattr(extraction, 'confidence_client', 0.0),
|
||||
"confidence_tva": getattr(extraction, 'confidence_tva', 0.0),
|
||||
"confidence_payment": getattr(extraction, 'confidence_payment', 0.0),
|
||||
"overall_confidence": extraction.overall_confidence,
|
||||
"raw_text": extraction.raw_text,
|
||||
"ocr_engine": extraction.ocr_engine,
|
||||
|
||||
@@ -13,8 +13,9 @@ Usage:
|
||||
CUI_LIST = ["22891860"]
|
||||
NAME_PATTERNS = ["LIDL", "LDL"]
|
||||
|
||||
def extract_tva_entries(self, text: str) -> List[dict]:
|
||||
def extract_tva_entries(self, text: str) -> Tuple[List[dict], float]:
|
||||
# Custom Lidl TVA extraction logic
|
||||
# Returns (entries_list, confidence_score)
|
||||
...
|
||||
"""
|
||||
|
||||
@@ -331,7 +332,7 @@ class BaseStoreProfile(ABC):
|
||||
# Extraction methods - override in subclasses as needed
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
def extract_tva_entries(self, text: str) -> List[dict]:
|
||||
def extract_tva_entries(self, text: str) -> Tuple[List[dict], float]:
|
||||
"""
|
||||
Extract TVA entries from receipt text - GENERIC implementation.
|
||||
|
||||
@@ -346,37 +347,53 @@ class BaseStoreProfile(ABC):
|
||||
text: Raw OCR text from receipt
|
||||
|
||||
Returns:
|
||||
List of dicts with keys: code, percent, amount
|
||||
Tuple of (entries, confidence): entries is a list of dicts with keys code, percent, amount; confidence is a float
|
||||
"""
|
||||
entries = []
|
||||
max_confidence = 0.0
|
||||
text_upper = text.upper()
|
||||
|
||||
# Step 1: Check for known non-VAT payer (by class flag or text detection)
|
||||
if self.IS_NON_VAT_PAYER or self._is_non_vat_payer(text_upper):
|
||||
return [] # No TVA entries for non-VAT payers
|
||||
return ([], 0.0) # No TVA entries for non-VAT payers
|
||||
|
||||
# Step 2: Normalize OCR spaces in numbers
|
||||
normalized = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text_upper)
|
||||
lines = normalized.split('\n')
|
||||
|
||||
# Step 3: Try all formats, collect candidates
|
||||
# Step 3: Try all formats, collect candidates with confidence
|
||||
candidates = []
|
||||
|
||||
# Try inline multi-rate (Lidl-style)
|
||||
candidates.extend(self._try_tva_inline(normalized))
|
||||
inline_entries, inline_conf = self._try_tva_inline(normalized)
|
||||
candidates.extend(inline_entries)
|
||||
if inline_conf > max_confidence:
|
||||
max_confidence = inline_conf
|
||||
|
||||
# Try reversed format (Stepout-style)
|
||||
candidates.extend(self._try_tva_reversed(normalized, lines))
|
||||
reversed_entries, reversed_conf = self._try_tva_reversed(normalized, lines)
|
||||
candidates.extend(reversed_entries)
|
||||
if reversed_conf > max_confidence:
|
||||
max_confidence = reversed_conf
|
||||
|
||||
# Try multiline format (Brick/Electrobering)
|
||||
candidates.extend(self._try_tva_multiline(normalized, lines))
|
||||
multiline_entries, multiline_conf = self._try_tva_multiline(normalized, lines)
|
||||
candidates.extend(multiline_entries)
|
||||
if multiline_conf > max_confidence:
|
||||
max_confidence = multiline_conf
|
||||
|
||||
# Try table format (OMV-style)
|
||||
candidates.extend(self._try_tva_table(normalized))
|
||||
table_entries, table_conf = self._try_tva_table(normalized)
|
||||
candidates.extend(table_entries)
|
||||
if table_conf > max_confidence:
|
||||
max_confidence = table_conf
|
||||
|
||||
# Try standard/fallback patterns
|
||||
if not candidates:
|
||||
candidates.extend(self._try_tva_standard(normalized))
|
||||
standard_entries, standard_conf = self._try_tva_standard(normalized)
|
||||
candidates.extend(standard_entries)
|
||||
if standard_conf > max_confidence:
|
||||
max_confidence = standard_conf
|
||||
|
||||
# Step 4: Deduplicate and return
|
||||
seen = set()
|
||||
@@ -386,7 +403,7 @@ class BaseStoreProfile(ABC):
|
||||
entries.append(entry)
|
||||
seen.add(key)
|
||||
|
||||
return entries
|
||||
return (entries, max_confidence if entries else 0.0)
|
||||
|
||||
def _is_non_vat_payer(self, text: str) -> bool:
|
||||
"""Check if receipt is from non-VAT payer."""
|
||||
@@ -395,9 +412,10 @@ class BaseStoreProfile(ABC):
|
||||
return True
|
||||
return False
|
||||
|
||||
def _try_tva_inline(self, text: str) -> List[dict]:
|
||||
def _try_tva_inline(self, text: str) -> Tuple[List[dict], float]:
|
||||
"""Try Lidl-style inline format: 'TVA A 21,00% 7.71'"""
|
||||
entries = []
|
||||
max_confidence = 0.0
|
||||
# Pattern: "TVA A 21,00% 7.71" or "TVA B 11,00% 2.13"
|
||||
for pattern, confidence, fmt in self.TVA_PATTERNS:
|
||||
if fmt != 'inline':
|
||||
@@ -415,13 +433,16 @@ class BaseStoreProfile(ABC):
|
||||
'percent': percent,
|
||||
'amount': amount
|
||||
})
|
||||
if confidence > max_confidence:
|
||||
max_confidence = confidence
|
||||
except (ValueError, InvalidOperation, IndexError):
|
||||
continue
|
||||
return entries
|
||||
return (entries, max_confidence)
|
||||
|
||||
def _try_tva_reversed(self, text: str, lines: List[str]) -> List[dict]:
|
||||
def _try_tva_reversed(self, text: str, lines: List[str]) -> Tuple[List[dict], float]:
|
||||
"""Try Stepout-style reversed format: '5.00% TUA*B 2.00' (rate BEFORE TVA marker)"""
|
||||
entries = []
|
||||
confidence = 0.97 # Default confidence for reversed format
|
||||
# Pattern: "5.00% TUA*B 2.00" - percent BEFORE TVA, amount same line or next
|
||||
for i, line in enumerate(lines):
|
||||
# Try pattern with amount on SAME line: "5.00% TUA*B 2.00"
|
||||
@@ -462,11 +483,12 @@ class BaseStoreProfile(ABC):
|
||||
})
|
||||
except (ValueError, InvalidOperation, IndexError):
|
||||
continue
|
||||
return entries
|
||||
return (entries, confidence if entries else 0.0)
|
||||
|
||||
def _try_tva_multiline(self, text: str, lines: List[str]) -> List[dict]:
|
||||
def _try_tva_multiline(self, text: str, lines: List[str]) -> Tuple[List[dict], float]:
|
||||
"""Try multiline format: 'TOTAL TVA A - 19%' + amount on next line"""
|
||||
entries = []
|
||||
confidence = 0.95 # Default confidence for multiline format
|
||||
# Pattern: "TOTAL TVA A - 19%" or "TOTAL TVA A 19%" on one line, amount on next
|
||||
multiline_patterns = [
|
||||
r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%',
|
||||
@@ -491,14 +513,15 @@ class BaseStoreProfile(ABC):
|
||||
'percent': percent,
|
||||
'amount': amount
|
||||
})
|
||||
return entries
|
||||
return (entries, confidence)
|
||||
except (ValueError, InvalidOperation, IndexError):
|
||||
continue
|
||||
return entries
|
||||
return (entries, 0.0)
|
||||
|
||||
def _try_tva_table(self, text: str) -> List[dict]:
|
||||
def _try_tva_table(self, text: str) -> Tuple[List[dict], float]:
|
||||
"""Try OMV-style table format: 'A-21,00% 285,66 49,58'"""
|
||||
entries = []
|
||||
confidence = 0.96 # Default confidence for table format
|
||||
# Pattern: "A-21,00% 285,66 49,58" (code-percent base_amount tva_amount)
|
||||
table_pattern = r'([A-D])\s*[-:]\s*(\d{1,2})[.,\s]*\d{0,2}\s*%\s+([\d.,\s]+)\s+([\d.,\s]+)'
|
||||
for match in re.finditer(table_pattern, text, re.IGNORECASE):
|
||||
@@ -530,13 +553,15 @@ class BaseStoreProfile(ABC):
|
||||
'percent': 19, # Default rate
|
||||
'amount': amount
|
||||
})
|
||||
confidence = 0.90 # Lower confidence for fallback
|
||||
except (ValueError, InvalidOperation):
|
||||
pass
|
||||
return entries
|
||||
return (entries, confidence if entries else 0.0)
|
||||
|
||||
def _try_tva_standard(self, text: str) -> List[dict]:
|
||||
def _try_tva_standard(self, text: str) -> Tuple[List[dict], float]:
|
||||
"""Try standard TVA patterns as fallback"""
|
||||
entries = []
|
||||
matched_confidence = 0.0
|
||||
standard_fmts = ['standard', 'bon', 'percent', 'coded', 'fallback', 'books']
|
||||
for pattern, confidence, fmt in self.TVA_PATTERNS:
|
||||
if fmt not in standard_fmts:
|
||||
@@ -563,7 +588,7 @@ class BaseStoreProfile(ABC):
|
||||
'percent': percent,
|
||||
'amount': amount
|
||||
})
|
||||
return entries
|
||||
return (entries, confidence)
|
||||
elif len(groups) == 1:
|
||||
# Just amount
|
||||
amount = self._parse_decimal(self._clean_ocr_number(groups[0]))
|
||||
@@ -573,10 +598,10 @@ class BaseStoreProfile(ABC):
|
||||
'percent': 19,
|
||||
'amount': amount
|
||||
})
|
||||
return entries
|
||||
return (entries, confidence)
|
||||
except (ValueError, InvalidOperation, IndexError):
|
||||
continue
|
||||
return entries
|
||||
return (entries, matched_confidence)
|
||||
|
||||
def _clean_ocr_number(self, value: str) -> str:
|
||||
"""Remove OCR spaces from numbers (e.g., '55, 22' -> '55,22')."""
|
||||
|
||||
Reference in New Issue
Block a user