fix(ocr): Fix store profile extraction patterns and module loading
Major fixes to OCR store profiles for Romanian receipt extraction:

- Fix ProfileRegistry module path resolution (was loading 0 profiles)
- Add multiline TVA extraction for Brick, Electrobering, Gama Ink
- Add "CARTE CREDIT" payment detection for OMV/SOCAR gas stations
- Handle OCR artifacts: TVA→TUA, "-"→"4", I→L in CUI markers
- Add client CUI patterns for Brick receipts
- Add profile selection logging to ocr_extractor.py
- Create test script for all 29 PDFs (test_all_profiles.py)

Test results: 13/29 passing (improved from 9/29). Remaining failures are primarily OCR quality issues.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -2,11 +2,16 @@
|
||||
BRICK (Five-Holding) store profile for OCR extraction.
|
||||
|
||||
Five-Holding S.A. operates BRICK stores with standard receipt format.
|
||||
|
||||
Receipt structure:
|
||||
- TVA format: "TOTAL TVA A - 21%" with amount on next line
|
||||
- Payment: "CARD" on separate line (amount from TOTAL LEI)
|
||||
- Client CUI: "CLIENT C.U.L./C.IF. :ROXXXXXXX" (OCR reads I as L)
|
||||
"""
|
||||
|
||||
import re
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from typing import List, Dict, Any
|
||||
from typing import List, Dict, Any, Tuple, Optional
|
||||
|
||||
from .base import BaseStoreProfile
|
||||
from . import ProfileRegistry
|
||||
@@ -15,32 +20,60 @@ from . import ProfileRegistry
|
||||
@ProfileRegistry.register
|
||||
class BrickProfile(BaseStoreProfile):
|
||||
"""
|
||||
FIVE-HOLDING S.A. (BRICK) - standard TVA format.
|
||||
FIVE-HOLDING S.A. (BRICK) - standard TVA format with client CUI.
|
||||
|
||||
Key characteristics:
|
||||
- Standard TVA format
|
||||
- Single TVA rate typically
|
||||
- No client CUI on receipts
|
||||
- Standard TVA format with rate code (A, B, etc.)
|
||||
- TVA amount on separate line after percentage
|
||||
- CARD payment indicated by keyword (amount derived from total)
|
||||
- Client CUI in format: CLIENT C.U.L./C.IF.
|
||||
- OCR often reads "I" as "L" in CUI markers
|
||||
"""
|
||||
|
||||
CUI_LIST = ["10562600"]
|
||||
NAME_PATTERNS = ["BRICK", "FIVE-HOLDING", "FIVE HOLDING", "BR1CK"] # OCR variants
|
||||
NAME_PATTERNS = ["BRICK", "FIVE-HOLDING", "FIVE HOLDING", "BR1CK", "F1VE"]
|
||||
STORE_NAME = "FIVE-HOLDING S.A."
|
||||
|
||||
# Standard TVA patterns (flexible - accepts any rate)
|
||||
# BRICK TVA patterns (amount often on separate line)
|
||||
TVA_PATTERNS = [
|
||||
# "TVA A: XX% = YY,YY" or "TVA-A XX% YY,YY"
|
||||
r'TVA\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
|
||||
# "A - XX,XX% = YY,YY"
|
||||
r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,]+)',
|
||||
# Simple: "TVA XX% YY,YY"
|
||||
r'TVA\s+(\d{1,2})\s*%\s*([\d.,]+)',
|
||||
# "TOTAL TVA A - 21%" with amount on next line (captured as multiline)
|
||||
r'TOTAL\s+TVA\s*([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*\n?\s*([\d.,]+)',
|
||||
# "OTAL IVAA 21%" - OCR error variant
|
||||
r'O?TAL\s+[IT]VA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%\s*\n?\s*([\d.,]+)',
|
||||
# "TOTAL TVA A 21%" without separator
|
||||
r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%\s*\n?\s*([\d.,]+)',
|
||||
# "TVA A: XX% = YY,YY" - inline format
|
||||
r'TVA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
|
||||
]
|
||||
|
||||
# TOTAL TVA BON pattern (fallback)
|
||||
TOTAL_TVA_BON_PATTERN = r'TOTAL\s+TVA\s*BON\s*\n?\s*([\d.,]+)'
|
||||
|
||||
# Client CUI patterns - specific to Brick (handles OCR L/I confusion)
|
||||
CLIENT_CUI_PATTERNS = [
|
||||
# "CLIENT C.U.L./C.IF. :R01879855" - exact OCR format (I->L)
|
||||
(r'CLIENT\s+C\.?U\.?[LI1]\.?\s*/?\s*C\.?[LI1]\.?F\.?\s*:?\s*(R?O?\d{6,10})', 0.99),
|
||||
# "CLIENT C.U.I./C.I.F.: RO1879855" - standard format
|
||||
(r'CLIENT\s+C\.?U\.?I\.?\s*/?\s*C\.?I\.?F\.?\s*:?\s*(R?O?\s*\d{6,10})', 0.98),
|
||||
# "CIF CLIENT: XXXXXXX" - alternative format
|
||||
(r'CIF\s+CLIENT\s*:?\s*(R?O?\s*\d{6,10})', 0.95),
|
||||
]
|
||||
|
||||
# Client markers for Brick
|
||||
CLIENT_MARKERS = [
|
||||
r'CLIENT\s+C\.?U\.?[LI1]',
|
||||
r'CLIENT\s+C\.?I\.?F',
|
||||
r'CIF\s+CLIENT',
|
||||
]
|
||||
|
||||
def extract_tva_entries(self, text: str) -> List[dict]:
|
||||
"""
|
||||
Extract BRICK-specific TVA entries.
|
||||
|
||||
BRICK receipts show TVA in multi-line format:
|
||||
"TOTAL TVA A - 21%"
|
||||
"32.31"
|
||||
|
||||
Args:
|
||||
text: Raw OCR text from receipt
|
||||
|
||||
@@ -48,11 +81,12 @@ class BrickProfile(BaseStoreProfile):
|
||||
List of TVA entries with code, percent, and amount
|
||||
"""
|
||||
entries = []
|
||||
text_upper = text.upper()
|
||||
seen = set()
|
||||
|
||||
# Try coded patterns first
|
||||
for pattern in self.TVA_PATTERNS[:2]:
|
||||
for match in re.finditer(pattern, text, re.IGNORECASE):
|
||||
# Try coded patterns first (with multiline support)
|
||||
for pattern in self.TVA_PATTERNS:
|
||||
for match in re.finditer(pattern, text_upper, re.IGNORECASE | re.MULTILINE):
|
||||
try:
|
||||
code = match.group(1).upper()
|
||||
percent = int(match.group(2))
|
||||
@@ -67,35 +101,182 @@ class BrickProfile(BaseStoreProfile):
|
||||
'amount': amount
|
||||
})
|
||||
seen.add(entry_key)
|
||||
return entries # Brick usually has single TVA rate
|
||||
except (ValueError, InvalidOperation, IndexError):
|
||||
continue
|
||||
|
||||
# Fallback to simple format
|
||||
if not entries:
|
||||
simple_pattern = self.TVA_PATTERNS[2]
|
||||
for match in re.finditer(simple_pattern, text, re.IGNORECASE):
|
||||
try:
|
||||
percent = int(match.group(1))
|
||||
amount = self._parse_decimal(match.group(2))
|
||||
|
||||
if amount and amount > 0:
|
||||
entries.append({
|
||||
'code': 'A',
|
||||
'percent': percent,
|
||||
'amount': amount
|
||||
})
|
||||
break
|
||||
except (ValueError, InvalidOperation):
|
||||
continue
|
||||
# Fallback: "TOTAL TVA BON" with amount on next line
|
||||
match = re.search(self.TOTAL_TVA_BON_PATTERN, text_upper, re.IGNORECASE | re.MULTILINE)
|
||||
if match:
|
||||
try:
|
||||
amount = self._parse_decimal(match.group(1))
|
||||
if amount and amount > 0:
|
||||
entries.append({
|
||||
'code': 'A',
|
||||
'percent': 19, # Default rate
|
||||
'amount': amount
|
||||
})
|
||||
except (ValueError, InvalidOperation):
|
||||
pass
|
||||
|
||||
return entries
|
||||
|
||||
def extract_payment_methods(self, text: str) -> List[dict]:
    """
    Extract BRICK-specific payment methods.

    BRICK receipts show the payment method on a separate line:
        "TOTAL LEI"
        "21.18"
        "CARD"
        "0.00"  <- REST (change)

    The total is read from the first line containing both "TOTAL" and
    "LEI": an inline amount ("TOTAL LEI 21.18") is preferred, otherwise
    the following line is parsed. If that yields nothing, the generic
    ``self.extract_total`` is used. The card amount is the total minus
    any REST (change) amount.

    Args:
        text: Raw OCR text from receipt

    Returns:
        List of payment dicts with 'method', 'amount' and 'confidence'
        keys; empty when no total amount could be determined.
    """
    payments = []
    text_upper = text.upper()
    lines = text_upper.split('\n')

    # Find the TOTAL LEI amount. Only the first matching line is examined.
    total_amount = None
    for i, line in enumerate(lines):
        if 'TOTAL' in line and 'LEI' in line:
            # Inline form first ("TOTAL LEI 21.18"): if the next line were
            # tried first, a trailing "CARD" line would shadow this amount.
            inline = re.search(r'TOTAL\s+LEI\s*([\d.,]+)', line)
            if inline:
                total_amount = self._parse_decimal(inline.group(1))
            # Multi-line form: amount printed on the line after the label.
            if not total_amount and i + 1 < len(lines):
                total_amount = self._parse_decimal(lines[i + 1].strip())
            break

    if not total_amount:
        # Fallback to generic total extraction
        total_amount, _ = self.extract_total(text)

    if not total_amount:
        return []

    # Check for CARD or NUMERAR keywords anywhere in the receipt
    has_card = any('CARD' in line for line in lines)
    has_numerar = any('NUMERAR' in line for line in lines)

    # Find REST (change) amount: same line as the label, else the next line
    rest_amount = Decimal('0')
    for i, line in enumerate(lines):
        if 'REST' in line:
            match = re.search(r'REST\s*([\d.,]+)', line)
            if match:
                rest_amount = self._parse_decimal(match.group(1)) or Decimal('0')
            elif i + 1 < len(lines):
                rest_amount = self._parse_decimal(lines[i + 1].strip()) or Decimal('0')
            break

    if has_card:
        # Card payment = total - rest
        card_amount = total_amount - rest_amount
        if card_amount > 0:
            payments.append({
                'method': 'CARD',
                'amount': card_amount,
                'confidence': 0.95
            })

    if has_numerar:
        # Mixed card + cash is not modelled precisely: when a card payment
        # is also present, cash is assumed to be the REST amount.
        if not has_card:
            payments.append({
                'method': 'NUMERAR',
                'amount': total_amount,
                'confidence': 0.95
            })
        elif rest_amount > 0:
            payments.append({
                'method': 'NUMERAR',
                'amount': rest_amount,
                'confidence': 0.90
            })

    # If nothing was recorded but REST=0, accept weaker card indicators
    if not payments and rest_amount == 0:
        for line in lines:
            if 'CARD' in line or 'DEBIT' in line or 'CREDIT' in line:
                payments.append({
                    'method': 'CARD',
                    'amount': total_amount,
                    'confidence': 0.90
                })
                break

    # FALLBACK: OCR may have lost the payment line entirely. For a fiscal
    # receipt with a known total, infer CARD at reduced confidence.
    # ('FISCAL' also covers 'BON FISCAL'.)
    if not payments and total_amount and total_amount > 0:
        if 'FISCAL' in text_upper:
            payments.append({
                'method': 'CARD',
                'amount': total_amount,
                'confidence': 0.70  # Lower confidence for inferred payment
            })

    return payments
|
||||
|
||||
def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
    """
    Extract client CUI from BRICK receipt.

    BRICK uses format: "CLIENT C.U.L./C.IF. :R01879855"
    Note: OCR often reads "I" as "L" in these markers, and the "RO"
    country prefix as "R0".

    The text is first checked against ``self.CLIENT_MARKERS``; only when a
    marker is present are ``self.CLIENT_CUI_PATTERNS`` tried in order. The
    first pattern whose capture cleans up to 6-10 digits wins.

    Args:
        text: Raw OCR text from receipt

    Returns:
        Tuple of (cui_digits, confidence) or (None, 0.0)
    """
    text_upper = text.upper()

    # Cheap gate: skip the detailed patterns when no client marker exists
    has_client = any(
        re.search(marker, text_upper, re.IGNORECASE)
        for marker in self.CLIENT_MARKERS
    )
    if not has_client:
        return (None, 0.0)

    for pattern, confidence in self.CLIENT_CUI_PATTERNS:
        match = re.search(pattern, text_upper, re.IGNORECASE)
        if not match:
            continue

        raw = match.group(1)
        # Clean up: remove RO prefix, spaces
        cui_digits = re.sub(r'[^0-9]', '', raw)

        # OCR reads "RO" as "R0"; a zero right after "R" is part of the
        # prefix, not of the CUI. Only drop it when at least 6 digits
        # remain, so the length check below is not made stricter.
        # NOTE(review): assumes real CUIs never start with 0 - confirm.
        if re.match(r'R\s*0', raw) and len(cui_digits) > 6:
            cui_digits = cui_digits[1:]

        if 6 <= len(cui_digits) <= 10:
            return (cui_digits, confidence)

    return (None, 0.0)
|
||||
|
||||
def get_validation_hints(self) -> Dict[str, Any]:
    """
    Return BRICK-specific validation hints.

    Each key appears exactly once. Repeating a key in a dict literal
    silently keeps only the last value, which hides stale settings.

    Returns:
        Mapping of hint name to boolean flag.
    """
    return {
        "has_multi_rate_tva": False,
        "card_equals_total": True,  # Card amount equals total when REST=0
        "has_client_cui": True,  # Brick receipts CAN have client CUI
        "has_efactura": False,
        "is_non_vat_payer": False,
        "tva_on_separate_line": True,  # TVA amount on next line
    }
|
||||
|
||||
Reference in New Issue
Block a user