fix(ocr): Fix store profile extraction patterns and module loading

Major fixes to OCR store profiles for Romanian receipt extraction:

- Fix ProfileRegistry module path resolution (was loading 0 profiles)
- Add multiline TVA extraction for Brick, Electrobering, Gama Ink
- Add "CARTE CREDIT" payment detection for OMV/SOCAR gas stations
- Handle OCR artifacts: TVA→TUA, "-"→"4", I→L in CUI markers
- Add client CUI patterns for Brick receipts
- Add profile selection logging to ocr_extractor.py
- Create test script for all 29 PDFs (test_all_profiles.py)

Test results: 13/29 passing (improved from 9/29)
Remaining failures are primarily OCR quality issues.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Claude Agent
2026-01-07 09:40:58 +00:00
parent 099556213d
commit 28f259cd05
13 changed files with 1531 additions and 257 deletions

View File

@@ -251,9 +251,12 @@ class ProfileRegistry:
# Get list of profile modules (exclude __init__, base)
module_names = cls._get_profile_module_names()
# Determine the module prefix based on how THIS module was imported
base_package = cls.__module__
count = 0
for module_name in module_names:
full_name = f"backend.modules.data_entry.services.ocr.profiles.{module_name}"
full_name = f"{base_package}.{module_name}"
try:
if full_name in sys.modules:
@@ -349,8 +352,15 @@ class ProfileRegistry:
module_names = cls._get_profile_module_names()
# Determine the module prefix based on how THIS module was imported
# This handles both:
# - Running from backend dir: "modules.data_entry.services.ocr.profiles"
# - Running from project root: "backend.modules.data_entry.services.ocr.profiles"
this_module = cls.__module__ # e.g. "backend.modules..." or "modules..."
base_package = this_module # Use the same prefix for child modules
for module_name in module_names:
full_name = f"backend.modules.data_entry.services.ocr.profiles.{module_name}"
full_name = f"{base_package}.{module_name}"
try:
importlib.import_module(full_name)
logger.debug(f"Loaded module: {module_name}")

View File

@@ -111,25 +111,34 @@ class BaseStoreProfile(ABC):
(r'(?:^|\n|\s)MERAR\s*:?\s*(\d{1,6}[.,]\d{2})\b', 'NUMERAR', 0.70),
]
# Client section markers (for B2B receipts)
# Client section markers (for B2B receipts) - More flexible patterns
CLIENT_MARKERS = [
r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:',
r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:',
r'CLIENT\s+C\.?\s*[UI1]\.?\s*[IF1]\.?\s*:',
r'CLIENT\s*:',
r'CUMPARATOR\s*:',
r'BENEFICIAR\s*:',
r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT', # "CIF CLIENT" (with or without colon)
r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT', # "CUI CLIENT"
r'CLIENT\s+C\.?\s*[UI1]\.?\s*[IF1]', # "CLIENT CIF" / "CLIENT CUI"
r'CLIENT\s*:', # "CLIENT:"
r'CUMPARATOR\s*:', # "CUMPARATOR:"
r'BENEFICIAR\s*:', # "BENEFICIAR:"
r'CUMP[AĂ]R[AĂ]TOR', # "CUMPARATOR" without colon
r'COD\s+FISCAL\s+CLIENT', # "COD FISCAL CLIENT"
]
# Client CUI patterns (pattern, confidence)
# Client CUI patterns (pattern, confidence) - More flexible
CLIENT_CUI_PATTERNS = [
(r'(R[O0]\d{6,10})\s*\n\s*CLIENT\s+C\.?\s*U\.?\s*[I1]\.?', 0.99),
(r'(R[O0]\d{6,10})\s*:?\s*\n\s*CLIENT', 0.98),
(r'C[I1]F\s+[A-Z]*\s*CLIENT\s*:?\s*(R[O0]\d{6,10})', 0.98),
(r'C\.?\s*[I1]\.?\s*F\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
(r'C\.?\s*U\.?\s*[I1]\.?\s+CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.98),
(r'CLIENT\s+C\.?\s*U\.?\s*[I1]\.?\s*:?\s*(R[O0]?\d{6,10})', 0.95),
(r'CLIENT\s*:?\s*(R[O0]?\d{6,10})', 0.90),
# "CIF CLIENT:XXXXXXX" or "CIF CLIENT: ROXXXXXXX" - most common format
(r'C\.?[I1]\.?F\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.99),
# "CLIENT CIF: XXXXXXX"
(r'CLIENT\s+C\.?[I1]\.?F\.?\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
# "CUI CLIENT: XXXXXXX"
(r'C\.?U\.?[I1]\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
# "ROXXXXXXX" followed by CLIENT marker
(r'(R[O0]\d{6,10})\s*\n\s*CLIENT', 0.97),
# "C.I.F. CLIENT: XXXXXXX"
(r'C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.96),
# "CLIENT: ROXXXXXXX" or "CLIENT: XXXXXXX"
(r'CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.90),
# "COD FISCAL CLIENT: XXXXXXX"
(r'COD\s+FISCAL\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.95),
]
# Company type indicators (for identifying company names)

View File

@@ -2,11 +2,16 @@
BRICK (Five-Holding) store profile for OCR extraction.
Five-Holding S.A. operates BRICK stores with standard receipt format.
Receipt structure:
- TVA format: "TOTAL TVA A - 21%" with amount on next line
- Payment: "CARD" on separate line (amount from TOTAL LEI)
- Client CUI: "CLIENT C.U.L./C.IF. :ROXXXXXXX" (OCR reads I as L)
"""
import re
from decimal import Decimal, InvalidOperation
from typing import List, Dict, Any
from typing import List, Dict, Any, Tuple, Optional
from .base import BaseStoreProfile
from . import ProfileRegistry
@@ -15,32 +20,60 @@ from . import ProfileRegistry
@ProfileRegistry.register
class BrickProfile(BaseStoreProfile):
"""
FIVE-HOLDING S.A. (BRICK) - standard TVA format.
FIVE-HOLDING S.A. (BRICK) - standard TVA format with client CUI.
Key characteristics:
- Standard TVA format
- Single TVA rate typically
- No client CUI on receipts
- Standard TVA format with rate code (A, B, etc.)
- TVA amount on separate line after percentage
- CARD payment indicated by keyword (amount derived from total)
- Client CUI in format: CLIENT C.U.L./C.IF.
- OCR often reads "I" as "L" in CUI markers
"""
CUI_LIST = ["10562600"]
NAME_PATTERNS = ["BRICK", "FIVE-HOLDING", "FIVE HOLDING", "BR1CK"] # OCR variants
NAME_PATTERNS = ["BRICK", "FIVE-HOLDING", "FIVE HOLDING", "BR1CK", "F1VE"]
STORE_NAME = "FIVE-HOLDING S.A."
# Standard TVA patterns (flexible - accepts any rate)
# BRICK TVA patterns (amount often on separate line)
TVA_PATTERNS = [
# "TVA A: XX% = YY,YY" or "TVA-A XX% YY,YY"
r'TVA\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
# "A - XX,XX% = YY,YY"
r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,]+)',
# Simple: "TVA XX% YY,YY"
r'TVA\s+(\d{1,2})\s*%\s*([\d.,]+)',
# "TOTAL TVA A - 21%" with amount on next line (captured as multiline)
r'TOTAL\s+TVA\s*([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*\n?\s*([\d.,]+)',
# "OTAL IVAA 21%" - OCR error variant
r'O?TAL\s+[IT]VA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%\s*\n?\s*([\d.,]+)',
# "TOTAL TVA A 21%" without separator
r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%\s*\n?\s*([\d.,]+)',
# "TVA A: XX% = YY,YY" - inline format
r'TVA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
]
# TOTAL TVA BON pattern (fallback)
TOTAL_TVA_BON_PATTERN = r'TOTAL\s+TVA\s*BON\s*\n?\s*([\d.,]+)'
# Client CUI patterns - specific to Brick (handles OCR L/I confusion)
CLIENT_CUI_PATTERNS = [
# "CLIENT C.U.L./C.IF. :R01879855" - exact OCR format (I->L)
(r'CLIENT\s+C\.?U\.?[LI1]\.?\s*/?\s*C\.?[LI1]\.?F\.?\s*:?\s*(R?O?\d{6,10})', 0.99),
# "CLIENT C.U.I./C.I.F.: RO1879855" - standard format
(r'CLIENT\s+C\.?U\.?I\.?\s*/?\s*C\.?I\.?F\.?\s*:?\s*(R?O?\s*\d{6,10})', 0.98),
# "CIF CLIENT: XXXXXXX" - alternative format
(r'CIF\s+CLIENT\s*:?\s*(R?O?\s*\d{6,10})', 0.95),
]
# Client markers for Brick
CLIENT_MARKERS = [
r'CLIENT\s+C\.?U\.?[LI1]',
r'CLIENT\s+C\.?I\.?F',
r'CIF\s+CLIENT',
]
def extract_tva_entries(self, text: str) -> List[dict]:
"""
Extract BRICK-specific TVA entries.
BRICK receipts show TVA in multi-line format:
"TOTAL TVA A - 21%"
"32.31"
Args:
text: Raw OCR text from receipt
@@ -48,11 +81,12 @@ class BrickProfile(BaseStoreProfile):
List of TVA entries with code, percent, and amount
"""
entries = []
text_upper = text.upper()
seen = set()
# Try coded patterns first
for pattern in self.TVA_PATTERNS[:2]:
for match in re.finditer(pattern, text, re.IGNORECASE):
# Try coded patterns first (with multiline support)
for pattern in self.TVA_PATTERNS:
for match in re.finditer(pattern, text_upper, re.IGNORECASE | re.MULTILINE):
try:
code = match.group(1).upper()
percent = int(match.group(2))
@@ -67,35 +101,182 @@ class BrickProfile(BaseStoreProfile):
'amount': amount
})
seen.add(entry_key)
return entries # Brick usually has single TVA rate
except (ValueError, InvalidOperation, IndexError):
continue
# Fallback to simple format
if not entries:
simple_pattern = self.TVA_PATTERNS[2]
for match in re.finditer(simple_pattern, text, re.IGNORECASE):
try:
percent = int(match.group(1))
amount = self._parse_decimal(match.group(2))
if amount and amount > 0:
entries.append({
'code': 'A',
'percent': percent,
'amount': amount
})
break
except (ValueError, InvalidOperation):
continue
# Fallback: "TOTAL TVA BON" with amount on next line
match = re.search(self.TOTAL_TVA_BON_PATTERN, text_upper, re.IGNORECASE | re.MULTILINE)
if match:
try:
amount = self._parse_decimal(match.group(1))
if amount and amount > 0:
entries.append({
'code': 'A',
'percent': 19, # Default rate
'amount': amount
})
except (ValueError, InvalidOperation):
pass
return entries
def extract_payment_methods(self, text: str) -> List[dict]:
    """
    Extract BRICK-specific payment methods.

    BRICK receipts show payment method on separate line:
    "TOTAL LEI"
    "21.18"
    "CARD"
    "0.00" <- REST (change)

    The total is taken from the first line containing both "TOTAL" and
    "LEI": the inline amount ("TOTAL LEI 21.18") is tried first, and only
    if none parses is the following line used. If no total is found this
    way, the generic ``extract_total`` is consulted.

    When CARD appears with REST=0, full amount was paid by card.

    Args:
        text: Raw OCR text from receipt

    Returns:
        List of payment methods with method, amount, and confidence
    """
    payments = []
    text_upper = text.upper()
    lines = text_upper.split('\n')

    # Find TOTAL LEI amount
    total_amount = None
    for i, line in enumerate(lines):
        if 'TOTAL' in line and 'LEI' in line:
            # Inline form first ("TOTAL LEI 21.18"); checking the next line
            # first would read the payment keyword as the amount.
            inline = re.search(r'TOTAL\s+LEI\s*([\d.,]+)', line)
            if inline:
                total_amount = self._parse_decimal(inline.group(1))
            # Multi-line form: amount printed on the line after the label
            if not total_amount and i + 1 < len(lines):
                total_amount = self._parse_decimal(lines[i + 1].strip())
            break

    if not total_amount:
        # Fallback to generic total extraction
        total_amount, _ = self.extract_total(text)
    if not total_amount:
        return []

    # Check for CARD or NUMERAR keywords
    has_card = any('CARD' in line for line in lines)
    has_numerar = any('NUMERAR' in line for line in lines)

    # Find REST amount to determine actual card amount
    rest_amount = Decimal('0')
    for i, line in enumerate(lines):
        if 'REST' in line:
            # REST amount is on next line or same line
            match = re.search(r'REST\s*([\d.,]+)', line)
            if match:
                rest_amount = self._parse_decimal(match.group(1)) or Decimal('0')
            elif i + 1 < len(lines):
                rest_amount = self._parse_decimal(lines[i + 1].strip()) or Decimal('0')
            break

    if has_card:
        # Card payment = total - rest
        card_amount = total_amount - rest_amount
        if card_amount > 0:
            payments.append({
                'method': 'CARD',
                'amount': card_amount,
                'confidence': 0.95
            })

    if has_numerar:
        # If both card and cash, need more complex logic
        # For now, assume numerar is the rest if card is present
        if not has_card:
            payments.append({
                'method': 'NUMERAR',
                'amount': total_amount,
                'confidence': 0.95
            })
        elif rest_amount > 0:
            payments.append({
                'method': 'NUMERAR',
                'amount': rest_amount,
                'confidence': 0.90
            })

    # If no explicit payment keyword but REST=0, assume card
    if not payments and rest_amount == 0:
        # Check for any payment indicators
        for line in lines:
            if 'CARD' in line or 'DEBIT' in line or 'CREDIT' in line:
                payments.append({
                    'method': 'CARD',
                    'amount': total_amount,
                    'confidence': 0.90
                })
                break

    # FALLBACK: If still no payment found but we have total amount,
    # assume CARD for fiscal receipts. This handles cases where OCR
    # fails to capture the payment method.
    if not payments and total_amount and total_amount > 0:
        # 'FISCAL' also covers 'BON FISCAL'
        if 'FISCAL' in text_upper:
            payments.append({
                'method': 'CARD',
                'amount': total_amount,
                'confidence': 0.70  # Lower confidence for inferred payment
            })

    return payments
def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
    """
    Extract client CUI from BRICK receipt.

    BRICK uses format: "CLIENT C.U.L./C.IF. :R01879855"
    Note: OCR often reads "I" as "L" in these markers, and the "O" of the
    "RO" country prefix as "0". The prefix (R + O/0) is removed before the
    digits are collected, so "R01879855" yields "1879855" rather than
    "01879855".

    Args:
        text: Raw OCR text from receipt

    Returns:
        Tuple of (cui, confidence) or (None, 0.0)
    """
    text_upper = text.upper()

    # Check for Brick client markers
    has_client = any(
        re.search(marker, text_upper, re.IGNORECASE)
        for marker in self.CLIENT_MARKERS
    )
    if not has_client:
        return (None, 0.0)

    # Try Brick-specific patterns
    for pattern, confidence in self.CLIENT_CUI_PATTERNS:
        match = re.search(pattern, text_upper, re.IGNORECASE)
        if not match:
            continue
        # Strip the country prefix first: a "0" right after "R" is the
        # misread letter "O", not the first digit of the fiscal code.
        without_prefix = re.sub(r'^R\s*[O0]?\s*', '', match.group(1))
        # Keep digits only (drops a stray "O" and any spaces)
        cui_digits = re.sub(r'[^0-9]', '', without_prefix)
        if 6 <= len(cui_digits) <= 10:
            return (cui_digits, confidence)

    return (None, 0.0)
def get_validation_hints(self) -> Dict[str, Any]:
"""Return BRICK-specific validation hints."""
return {
"has_multi_rate_tva": False,
"card_equals_total": False,
"has_client_cui": False,
"card_equals_total": True, # Card amount equals total when REST=0
"has_client_cui": True, # Brick receipts CAN have client CUI
"has_efactura": False,
"is_non_vat_payer": False,
"tva_on_separate_line": True, # TVA amount on next line
}

View File

@@ -2,11 +2,16 @@
ELECTROBERING S.R.L. store profile for OCR extraction.
Electronics and home supplies store.
Receipt structure:
- TVA format: "TOTAL TVA A - - 19%" with amount on next line
- "TOTAL TVA BON" with total TVA amount
- Client CUI: "CIF CLIENT: XXXXXXX"
"""
import re
from decimal import Decimal, InvalidOperation
from typing import List, Dict, Any
from typing import List, Dict, Any, Tuple, Optional
from .base import BaseStoreProfile
from . import ProfileRegistry
@@ -15,11 +20,11 @@ from . import ProfileRegistry
@ProfileRegistry.register
class ElectroberingProfile(BaseStoreProfile):
"""
ELECTROBERING S.R.L. - standard TVA profile.
ELECTROBERING S.R.L. - standard TVA profile with multiline support.
Key characteristics:
- Standard TVA format (single rate, any percentage)
- Electronics and home supplies
- TVA format with rate on one line, amount on next
- Double-dash separators common (OCR artifact)
- May have client CUI for B2B purchases
- CARD payment typical
"""
@@ -28,19 +33,28 @@ class ElectroberingProfile(BaseStoreProfile):
NAME_PATTERNS = ["ELECTROBERING", "ELECTR0BERING", "ELECTROBERING SRL"]
STORE_NAME = "ELECTROBERING S.R.L."
# Standard TVA patterns (flexible - accepts any rate)
# ELECTROBERING TVA patterns (handles double-dash and multiline)
TVA_PATTERNS = [
# "TVA A: XX% = YY,YY" or "TVA-A XX% YY,YY"
r'TVA\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
# "A - XX,XX% = YY,YY"
r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,]+)',
# "TVA XX% YY,YY" (simple format without code)
r'TVA\s+(\d{1,2})\s*%\s*([\d.,]+)',
# "TOTAL TVA A - - 19%" with amount on next line
r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%',
# "TOTAL TVA A 19%" without separator
r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%',
# Standard: "TVA A: XX% = YY,YY"
r'TVA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
]
# TOTAL TVA BON pattern (fallback)
TOTAL_TVA_BON_PATTERN = r'TOTAL\s+TVA\s+BON'
def extract_tva_entries(self, text: str) -> List[dict]:
"""
Extract TVA entries from receipt text.
Extract ELECTROBERING-specific TVA entries.
ELECTROBERING receipts show TVA in multi-line format:
"TOTAL TVA A - - 19%"
"5.59"
"TOTAL TVA BON"
"5.59"
Args:
text: Raw OCR text from receipt
@@ -49,45 +63,61 @@ class ElectroberingProfile(BaseStoreProfile):
List of TVA entries with code, percent, and amount
"""
entries = []
seen = set()
text_upper = text.upper()
lines = text_upper.split('\n')
# Try coded patterns first
for pattern in self.TVA_PATTERNS[:2]:
for match in re.finditer(pattern, text, re.IGNORECASE):
try:
code = match.group(1).upper()
percent = int(match.group(2))
amount = self._parse_decimal(match.group(3))
if amount and amount > 0:
entry_key = (code, percent)
if entry_key not in seen:
entries.append({
'code': code,
'percent': percent,
'amount': amount
})
seen.add(entry_key)
except (ValueError, InvalidOperation, IndexError):
continue
# Fallback to simple format
if not entries:
simple_pattern = self.TVA_PATTERNS[2]
for match in re.finditer(simple_pattern, text, re.IGNORECASE):
try:
percent = int(match.group(1))
amount = self._parse_decimal(match.group(2))
# Find TVA rate line and get amount from next line
for i, line in enumerate(lines):
# Match "TOTAL TVA A - - 19%" or "TOTAL TVA A 19%"
match = re.search(r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%', line)
if match:
code = match.group(1)
percent = int(match.group(2))
# Amount should be on next line
if i + 1 < len(lines):
amount_str = lines[i + 1].strip()
amount = self._parse_decimal(amount_str)
if amount and amount > 0:
entries.append({
'code': 'A',
'code': code,
'percent': percent,
'amount': amount
})
break
return entries
# Fallback: Find TOTAL TVA BON and get amount
for i, line in enumerate(lines):
if re.search(self.TOTAL_TVA_BON_PATTERN, line):
# Amount should be on next line
if i + 1 < len(lines):
amount_str = lines[i + 1].strip()
amount = self._parse_decimal(amount_str)
if amount and amount > 0:
entries.append({
'code': 'A',
'percent': 19, # Default Romanian TVA rate
'amount': amount
})
return entries
# Last fallback: inline format "TVA A: XX% = YY,YY"
for pattern in [self.TVA_PATTERNS[2]]:
match = re.search(pattern, text_upper, re.IGNORECASE)
if match and len(match.groups()) >= 3:
try:
code = match.group(1)
percent = int(match.group(2))
amount = self._parse_decimal(match.group(3))
if amount and amount > 0:
entries.append({
'code': code,
'percent': percent,
'amount': amount
})
return entries
except (ValueError, InvalidOperation):
continue
pass
return entries
@@ -99,4 +129,5 @@ class ElectroberingProfile(BaseStoreProfile):
"has_client_cui": True, # May have client CUI for B2B
"has_efactura": False,
"is_non_vat_payer": False,
"tva_on_separate_line": True,
}

View File

@@ -2,6 +2,10 @@
GAMA INK SERVICE SRL store profile for OCR extraction.
Toner refill and printer supplies store.
Receipt structure:
- TVA format: "TOTAL TVA A 4 19%" with amount on next line (4 is OCR for -)
- "TOTAL TVA BON" with total TVA amount
"""
import re
@@ -15,11 +19,11 @@ from . import ProfileRegistry
@ProfileRegistry.register
class GamaInkProfile(BaseStoreProfile):
"""
GAMA INK SERVICE SRL - standard TVA profile.
GAMA INK SERVICE SRL - standard TVA profile with multiline support.
Key characteristics:
- Standard TVA format (single rate, any percentage)
- Service-based (toner refill, printer supplies)
- TVA format with rate on one line, amount on next
- OCR often reads "-" as "4" (e.g., "A 4 19%" instead of "A - 19%")
- CARD payment typical
"""
@@ -27,21 +31,23 @@ class GamaInkProfile(BaseStoreProfile):
NAME_PATTERNS = ["GAMA INK", "GAMA", "GAMAINK", "GAMA INK SERVICE"]
STORE_NAME = "GAMA INK SERVICE SRL"
# Standard TVA patterns (flexible - accepts any rate)
# GAMA INK TVA patterns (handles OCR errors)
TVA_PATTERNS = [
# "TVA A: XX% = YY,YY" or "TVA-A XX% YY,YY"
r'TVA\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
# "A - XX,XX% = YY,YY"
r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,]+)',
# "TVA XX% YY,YY" (simple format without code)
r'TVA\s+(\d{1,2})\s*%\s*([\d.,]+)',
# "TVA: YY,YY" (amount only, percent inferred)
r'TVA\s*:?\s*([\d.,]+)\s*(?:LEI|RON)?',
# "TOTAL TVA A 4 19%" (4 is OCR for -)
r'TOTAL\s+TVA\s*([A-D])\s*[4\-\s]+(\d{1,2})\s*%',
# "TOTAL TVA A - 19%"
r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%',
]
# TOTAL TVA BON pattern (fallback)
TOTAL_TVA_BON_PATTERN = r'TOTAL\s+TVA\s+BON'
def extract_tva_entries(self, text: str) -> List[dict]:
"""
Extract TVA entries from receipt text.
Extract GAMA INK-specific TVA entries.
Format: "TOTAL TVA A 4 19%" on one line, amount on next line.
Note: OCR reads "-" as "4" sometimes.
Args:
text: Raw OCR text from receipt
@@ -50,45 +56,43 @@ class GamaInkProfile(BaseStoreProfile):
List of TVA entries with code, percent, and amount
"""
entries = []
seen = set()
text_upper = text.upper()
lines = text_upper.split('\n')
# Try coded patterns first (have both code and percent)
for pattern in self.TVA_PATTERNS[:2]:
for match in re.finditer(pattern, text, re.IGNORECASE):
try:
code = match.group(1).upper()
percent = int(match.group(2))
amount = self._parse_decimal(match.group(3))
if amount and amount > 0:
entry_key = (code, percent)
if entry_key not in seen:
entries.append({
'code': code,
'percent': percent,
'amount': amount
})
seen.add(entry_key)
except (ValueError, InvalidOperation, IndexError):
continue
# Fallback to simple format (percent + amount without code)
if not entries:
simple_pattern = self.TVA_PATTERNS[2]
for match in re.finditer(simple_pattern, text, re.IGNORECASE):
try:
percent = int(match.group(1))
amount = self._parse_decimal(match.group(2))
# Find TVA rate line and get amount from next line
for i, line in enumerate(lines):
# Match "TOTAL TVA A 4 19%" or "TOTAL TVA A - 19%"
match = re.search(r'TOTAL\s+TVA\s*([A-D])\s*[4\-\s]+(\d{1,2})\s*%', line)
if match:
code = match.group(1)
percent = int(match.group(2))
# Amount should be on next line
if i + 1 < len(lines):
amount_str = lines[i + 1].strip()
amount = self._parse_decimal(amount_str)
if amount and amount > 0:
entries.append({
'code': 'A',
'code': code,
'percent': percent,
'amount': amount
})
break
except (ValueError, InvalidOperation):
continue
return entries
# Fallback: Find TOTAL TVA BON and get amount
for i, line in enumerate(lines):
if re.search(self.TOTAL_TVA_BON_PATTERN, line):
# Amount should be on next line
if i + 1 < len(lines):
amount_str = lines[i + 1].strip()
amount = self._parse_decimal(amount_str)
if amount and amount > 0:
entries.append({
'code': 'A',
'percent': 19, # Default Romanian TVA rate
'amount': amount
})
return entries
return entries
@@ -97,7 +101,8 @@ class GamaInkProfile(BaseStoreProfile):
return {
"has_multi_rate_tva": False,
"card_equals_total": True,
"has_client_cui": False,
"has_client_cui": True, # May have client CUI for business
"has_efactura": False,
"is_non_vat_payer": False,
"tva_on_separate_line": True,
}

View File

@@ -5,6 +5,7 @@ OMV receipts typically include client CUI and use standard TVA format.
Common at gas stations with fuel purchases.
Date format: YYYY. MM. DD with spaces (e.g., "2025. 08. 14")
OCR quirk: Numbers often have spaces before decimals (e.g., "55, 22" instead of "55,22")
"""
import re
@@ -24,17 +25,24 @@ class OMVProfile(BaseStoreProfile):
Key characteristics:
- Standard TVA format (usually single rate, any percentage)
- Includes client CUI on receipt (for business purchases)
- TVA table format: "A-XX,XX% base_amount tva_amount"
- TVA table format: "A-XX, XX% base_amount tva_amount" (with OCR spaces)
- Supports historical rates (19%) and current rates (21%)
- Date format: YYYY. MM. DD (with spaces)
- Client CUI format: "CLIENT C.U. I./C.I.F.: ROXXXXXXX"
"""
CUI_LIST = ["11201891"]
NAME_PATTERNS = ["OMV", "PETROM", "OMV PETROM", "0MV"] # OCR variants
STORE_NAME = "OMV PETROM MARKETING S.R.L."
# OMV TVA table pattern: "A-19,00% 285,66 49,58" (code-percent base tva)
TVA_TABLE_PATTERN = r'([A-D])\s*[-:]\s*(\d{1,2})[.,]\d{2}\s*%\s+([\d.,]+)\s+([\d.,]+)'
# OMV TVA table patterns (handles OCR spaces in numbers)
# Format: "A-21, 00% 55, 22 318, 16" (rate, TVA amount, total)
TVA_TABLE_PATTERNS = [
# "A-21, 00% 55, 22 318, 16" - with spaces in numbers
r'([A-D])\s*[-:]\s*(\d{1,2})[.,\s]*\d{0,2}\s*%\s+([\d.,\s]+)\s+([\d.,\s]+)',
# "TOTAL TAXE: 55, 22" - fallback to TOTAL TAXE
r'TOTAL\s+TAXE\s*:?\s*([\d.,\s]+)',
]
# Standard TVA pattern fallback
TVA_STANDARD_PATTERN = r'TVA\s*:?\s*([\d.,]+)'
@@ -49,12 +57,38 @@ class OMVProfile(BaseStoreProfile):
(r'(\d{2})[.,]\s*(\d{2})[.,]\s*(\d{4})', 0.85, 'dmy'),
]
# Client CUI patterns for OMV (unique format)
CLIENT_CUI_PATTERNS = [
# "CLIENT C.U. I./C.I.F.: RO1879855"
(r'CLIENT\s+C\.?\s*U\.?\s*I\.?\s*/?\s*C\.?\s*I\.?\s*F\.?\s*:?\s*(R?O?\s*\d{6,10})', 0.99),
# "C.U.I./C.I.F. CLIENT: XXXXXXX"
(r'C\.?\s*U\.?\s*I\.?\s*/?\s*C\.?\s*I\.?\s*F\.?\s+CLIENT\s*:?\s*(R?O?\s*\d{6,10})', 0.98),
# Fallback to simpler pattern
(r'CLIENT\s*:?\s*(R?O?\s*\d{6,10})', 0.90),
]
# Client markers for OMV
CLIENT_MARKERS = [
r'CLIENT\s+C\.?\s*U\.?\s*I',
r'CLIENT\s+C\.?\s*I\.?\s*F',
r'NUME\s+CLIENT',
r'CLIENT\s*:',
]
def _clean_ocr_number(self, value: str) -> str:
"""Remove OCR spaces from numbers (e.g., '55, 22' -> '55,22')."""
# Remove spaces around commas and periods
value = re.sub(r'\s*([.,])\s*', r'\1', value)
# Remove any remaining spaces
value = value.replace(' ', '')
return value
def extract_tva_entries(self, text: str) -> List[dict]:
"""
Extract OMV-specific TVA entries.
OMV receipts often show TVA in table format with base and TVA amounts.
Falls back to standard extraction if table format not found.
OMV receipts show TVA in table format with spaces in numbers.
Format: "A-21, 00% 55, 22 318, 16" (rate, TVA amount, total)
Args:
text: Raw OCR text from receipt
@@ -63,35 +97,138 @@ class OMVProfile(BaseStoreProfile):
List of TVA entries with code, percent, and amount
"""
entries = []
seen = set()
text_upper = text.upper()
# Try table format first (more accurate)
for match in re.finditer(self.TVA_TABLE_PATTERN, text, re.IGNORECASE):
# Try table format first: "A-21, 00% 55, 22 318, 16"
table_pattern = self.TVA_TABLE_PATTERNS[0]
for match in re.finditer(table_pattern, text_upper):
try:
code = match.group(1).upper()
percent = int(match.group(2))
# TVA amount is the second number (smaller one)
tva_amount = self._parse_decimal(match.group(4))
# Clean OCR spaces from amounts
tva_amount_str = self._clean_ocr_number(match.group(3))
tva_amount = self._parse_decimal(tva_amount_str)
if tva_amount and tva_amount > 0:
entry_key = (code, percent)
if entry_key not in seen:
entries.append({
'code': code,
'percent': percent,
'amount': tva_amount
})
seen.add(entry_key)
except (ValueError, InvalidOperation):
entries.append({
'code': code,
'percent': percent,
'amount': tva_amount
})
return entries # OMV usually has single TVA rate
except (ValueError, InvalidOperation, IndexError):
continue
# Fallback: "TOTAL TAXE: 55, 22"
fallback_pattern = self.TVA_TABLE_PATTERNS[1]
match = re.search(fallback_pattern, text_upper)
if match:
try:
tva_amount_str = self._clean_ocr_number(match.group(1))
tva_amount = self._parse_decimal(tva_amount_str)
if tva_amount and tva_amount > 0:
entries.append({
'code': 'A',
'percent': 19, # Standard rate, will be corrected by validation
'amount': tva_amount
})
except (ValueError, InvalidOperation):
pass
return entries
def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
    """
    Extract client CUI from OMV receipt.

    OMV uses format: "CLIENT C.U. I./C.I.F.: RO1879855"
    The "RO" country prefix is removed before the digits are collected,
    including the OCR variant "R0" (letter O read as zero), so that the
    misread "O" is not kept as a leading digit of the CUI.

    Args:
        text: Raw OCR text from receipt

    Returns:
        Tuple of (cui, confidence) or (None, 0.0)
    """
    text_upper = text.upper()

    # Check for OMV client markers
    has_client = any(
        re.search(marker, text_upper, re.IGNORECASE)
        for marker in self.CLIENT_MARKERS
    )
    if not has_client:
        return (None, 0.0)

    # Try OMV-specific patterns
    for pattern, confidence in self.CLIENT_CUI_PATTERNS:
        match = re.search(pattern, text_upper, re.IGNORECASE)
        if not match:
            continue
        # Remove "RO"/"R0" before filtering digits; otherwise "R01879855"
        # would come out as "01879855".
        without_prefix = re.sub(r'^R\s*[O0]?\s*', '', match.group(1))
        # Keep digits only (drops a stray "O" and any spaces)
        cui_digits = re.sub(r'[^0-9]', '', without_prefix)
        if 6 <= len(cui_digits) <= 10:
            return (cui_digits, confidence)

    return (None, 0.0)
def extract_payment_methods(self, text: str) -> List[dict]:
    """
    Extract OMV-specific payment methods.

    OMV receipts use "CARTE CREDIT" instead of "CARD". The first keyword
    found (in priority order) decides the method, and the whole receipt
    total is attributed to it.

    Args:
        text: Raw OCR text from receipt

    Returns:
        List with at most one payment entry (method, amount, confidence);
        empty when no total can be extracted or nothing indicates a payment
    """
    haystack = text.upper()

    # Without a total there is nothing to attribute to a payment method
    receipt_total, _ = self.extract_total(text)
    if not receipt_total:
        return []

    # (keyword, normalized method, confidence) - earlier rows win
    keyword_table = (
        ('CARTE CREDIT', 'CARD', 0.98),
        ('CARTE DE CREDIT', 'CARD', 0.98),
        ('CARD', 'CARD', 0.95),
        ('VISA', 'CARD', 0.95),
        ('MASTERCARD', 'CARD', 0.95),
        ('CONTACTLESS', 'CARD', 0.90),
        ('NUMERAR', 'NUMERAR', 0.95),
        ('CASH', 'NUMERAR', 0.90),
    )
    first_hit = next(
        ((method, score) for keyword, method, score in keyword_table
         if keyword in haystack),
        None,
    )
    if first_hit is not None:
        method, score = first_hit
        return [{
            'method': method,
            'amount': receipt_total,
            'confidence': score
        }]

    # No keyword at all: a fiscal receipt is assumed to be a card payment
    if 'BON FISCAL' in haystack:
        return [{
            'method': 'CARD',
            'amount': receipt_total,
            'confidence': 0.70
        }]

    return []
def get_validation_hints(self) -> Dict[str, Any]:
"""Return OMV-specific validation hints."""
return {
"has_multi_rate_tva": False,
"card_equals_total": False,
"card_equals_total": True, # Gas station: card equals total
"has_client_cui": True,
"has_efactura": False,
"is_non_vat_payer": False,

View File

@@ -100,11 +100,62 @@ class SocarProfile(BaseStoreProfile):
return entries
def extract_payment_methods(self, text: str) -> List[dict]:
    """
    Extract SOCAR-specific payment methods.

    Gas stations use "CARTE CREDIT" or "CARD" for card payments. Keywords
    are checked in priority order; the first one present in the text
    determines the method, and the receipt total is used as its amount.

    Args:
        text: Raw OCR text from receipt

    Returns:
        List with at most one payment entry (method, amount, confidence);
        empty when no total can be extracted or nothing indicates a payment
    """
    text_upper = text.upper()

    # The payment amount is always the receipt total
    total_amount, _ = self.extract_total(text)
    if not total_amount:
        return []

    def _payment(method, confidence):
        return {
            'method': method,
            'amount': total_amount,
            'confidence': confidence
        }

    # Ordered from most to least specific keyword
    ranked_keywords = [
        ('CARTE CREDIT', 'CARD', 0.98),
        ('CARTE DE CREDIT', 'CARD', 0.98),
        ('CARD', 'CARD', 0.95),
        ('VISA', 'CARD', 0.95),
        ('MASTERCARD', 'CARD', 0.95),
        ('CONTACTLESS', 'CARD', 0.90),
        ('NUMERAR', 'NUMERAR', 0.95),
        ('CASH', 'NUMERAR', 0.90),
    ]
    for keyword, method, confidence in ranked_keywords:
        if keyword in text_upper:
            return [_payment(method, confidence)]

    # Nothing explicit: assume CARD when the text is a fiscal receipt
    return [_payment('CARD', 0.70)] if 'BON FISCAL' in text_upper else []
def get_validation_hints(self) -> Dict[str, Any]:
"""Return SOCAR-specific validation hints."""
return {
"has_multi_rate_tva": False,
"card_equals_total": False,
"card_equals_total": True, # Gas station: card equals total
"has_client_cui": True,
"has_efactura": False,
"is_non_vat_payer": False,

View File

@@ -2,11 +2,17 @@
STEPOUT MARKET SRL store profile for OCR extraction.
Bookstore with reduced TVA rate (5% for books in Romania).
Receipt structure:
- TVA format: "5.00% TUA*B" with amount on next line
- Total format: "SUMA TOTALA:" with amount on next line
- Payment: "CARD" with amount on next line
- Client CUI: "CIF CLIENT:XXXXXXX"
"""
import re
from decimal import Decimal, InvalidOperation
from typing import List, Dict, Any
from typing import List, Dict, Any, Tuple, Optional
from .base import BaseStoreProfile
from . import ProfileRegistry
@@ -19,33 +25,66 @@ class StepoutMarketProfile(BaseStoreProfile):
Key characteristics:
- Reduced TVA rate: 5% for books (cărți qualification in Romania)
- May also have standard rates for non-book items
- Patterns are flexible to accept ANY TVA rate
- TVA format: "X.XX% TUA*B" (OCR reads TVA as TUA)
- Multiline format for amounts
- CARD payment typical
"""
CUI_LIST = ["35532655"]
NAME_PATTERNS = ["STEPOUT", "STEPOUT MARKET", "STEP0UT", "STEPOUT MARKET SRL"]
NAME_PATTERNS = ["STEPOUT", "STEPOUT MARKET", "STEP0UT", "STEPUUT", "STEPOUT MARKET SRL"]
STORE_NAME = "STEPOUT MARKET SRL"
# TVA patterns (flexible - accepts any rate including 5%)
# TVA patterns for Stepout (handles TUA OCR error and multiline)
TVA_PATTERNS = [
# "TVA A: 5% = YY,YY" or "TVA-A 5% YY,YY" (coded format)
# "5.00% TUA*B" - OCR format with TUA
r'(\d{1,2})[.,]\d{0,2}\s*%\s*T[UV]A\*?([A-D])',
# "TVA A: 5% = YY,YY" or "TVA-A 5% YY,YY" (inline format)
r'TVA\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
# "A - 5,00% = YY,YY" (table format)
r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,]+)',
# "TVA 5% YY,YY" (simple format - common for single rate)
r'TVA\s+(\d{1,2})\s*%\s*([\d.,]+)',
# "TVA 5,00%: YY,YY" (percent with colon)
r'TVA\s+(\d{1,2})[.,]\d{2}\s*%\s*:?\s*([\d.,]+)',
# "TOTAL TUA:" with amount on next line
r'TOTAL\s+T[UV]A\s*:',
]
# Total patterns for Stepout
TOTAL_PATTERNS = [
# "SUMA TOTALA:" with amount on next line
(r'SUMA\s+TOTALA\s*:', 0.98),
# "TOTAL:" fallback
(r'TOTAL\s*:', 0.90),
]
def extract_total(self, text: str) -> Tuple[Optional[Decimal], float]:
    """
    Extract total amount from Stepout Market receipt.

    Format: a total label (e.g. "SUMA TOTALA:") on one line, amount on the
    next line. Patterns in TOTAL_PATTERNS are tried in order; for each one
    every labelled line is inspected until a positive amount is found.

    Args:
        text: Raw OCR text from receipt

    Returns:
        Tuple of (total_amount, confidence). When no labelled line is
        followed by a positive amount, the base class result is returned.
    """
    lines = text.upper().split('\n')

    for pattern, confidence in self.TOTAL_PATTERNS:
        label = re.compile(pattern, re.IGNORECASE)
        # Pair every line with its successor; a label on the very last line
        # has no amount line and is skipped automatically.
        for line, following in zip(lines, lines[1:]):
            if not label.search(line):
                continue
            try:
                amount = self._parse_decimal(following.strip())
            except (ValueError, InvalidOperation):
                # The following line is arbitrary OCR text; an unparseable
                # one must not abort extraction (the sibling profiles guard
                # this call the same way).
                continue
            if amount and amount > 0:
                return (amount, confidence)

    # Fallback to base class
    return super().extract_total(text)
def extract_tva_entries(self, text: str) -> List[dict]:
    """
    Extract TVA entries from Stepout Market receipt.

    Two multiline layouts are tried in order, and the first positive amount
    found is returned as the single entry:

    1. "5.00% TUA*B" on one line, amount on next line - percent and code are
       taken from the rate line.
    2. "TOTAL TUA:" on one line, amount on next line - reported as code 'B'
       at 5% (books are usually code B).

    "TUA" is accepted as an OCR misreading of "TVA".

    Args:
        text: Raw OCR text from receipt

    Returns:
        List of TVA entries with code, percent, and amount (at most one
        entry; empty when neither layout yields a positive amount)
    """
    lines = text.upper().split('\n')
    rate_line = re.compile(r'(\d{1,2})[.,]\d{0,2}\s*%\s*T[UV]A\*?([A-D])')
    total_line = re.compile(r'TOTAL\s+T[UV]A\s*:')

    def amount_after(index):
        """Parse the line after `index`; None unless it is a positive amount."""
        if index + 1 >= len(lines):
            return None
        try:
            amount = self._parse_decimal(lines[index + 1].strip())
        except (ValueError, InvalidOperation):
            # The next line is arbitrary OCR text; treat garbage as "no amount".
            return None
        return amount if amount and amount > 0 else None

    # Try "X.XX% TUA*B" format first
    for i, line in enumerate(lines):
        match = rate_line.search(line)
        if not match:
            continue
        amount = amount_after(i)
        if amount is not None:
            # Single rate store
            return [{
                'code': match.group(2),
                'percent': int(match.group(1)),
                'amount': amount
            }]

    # Try "TOTAL TUA:" format
    for i, line in enumerate(lines):
        if not total_line.search(line):
            continue
        amount = amount_after(i)
        if amount is not None:
            return [{
                'code': 'B',  # Books are usually code B (5%)
                'percent': 5,
                'amount': amount
            }]

    return []
def extract_payment_methods(self, text: str) -> List[dict]:
    """
    Extract payment methods from Stepout Market receipt.

    Format: "CARD" on one line, amount on next line. A line that is exactly
    "NUMERAR", or that contains "CASH", is handled the same way and reported
    as 'NUMERAR'. If no keyword line is followed by a positive amount, an
    inline "CARD: 12.30" form is accepted with lower confidence.

    Args:
        text: Raw OCR text from receipt

    Returns:
        List of payment methods with method, amount, and confidence (at most
        one entry; empty when nothing is found)
    """
    lines = [line.strip() for line in text.upper().split('\n')]

    def positive_amount(raw):
        """Parse `raw`; None unless it is a positive amount."""
        try:
            amount = self._parse_decimal(raw)
        except (ValueError, InvalidOperation):
            # Arbitrary OCR text (or a lone "." / ",") is simply not an amount.
            return None
        return amount if amount and amount > 0 else None

    # Keyword on its own line, amount on the following line.
    for keyword_line, following in zip(lines, lines[1:]):
        if keyword_line == 'CARD':
            method = 'CARD'
        elif keyword_line == 'NUMERAR' or 'CASH' in keyword_line:
            method = 'NUMERAR'
        else:
            continue
        amount = positive_amount(following)
        if amount is not None:
            return [{'method': method, 'amount': amount, 'confidence': 0.95}]

    # Fallback: check for inline CARD amount
    inline_card = re.compile(r'CARD\s*:?\s*([\d.,]+)')
    for line in lines:
        match = inline_card.search(line)
        if not match:
            continue
        amount = positive_amount(match.group(1))
        if amount is not None:
            return [{'method': 'CARD', 'amount': amount, 'confidence': 0.90}]

    return []
def get_validation_hints(self) -> Dict[str, Any]:
    """
    Return STEPOUT MARKET-specific validation hints.

    Returns:
        A fresh dict of static hint flags/values for this store. Each key
        appears exactly once.
    """
    return {
        "has_multi_rate_tva": False,
        "card_equals_total": True,
        "has_client_cui": True,
        "has_efactura": False,
        "is_non_vat_payer": False,
        "typical_tva_rate": 5,  # Books have 5% TVA in Romania
        "product_category": "books",
        "tva_on_separate_line": True,
    }

View File

@@ -6,7 +6,7 @@ Key duplication service. Notable for CASH (NUMERAR) payments.
import re
from decimal import Decimal, InvalidOperation
from typing import List, Dict, Any
from typing import List, Dict, Any, Optional, Tuple
from .base import BaseStoreProfile
from . import ProfileRegistry
@@ -22,26 +22,101 @@ class UnlimitedKeysProfile(BaseStoreProfile):
- Key duplication service
- NUMERAR (cash) payment common - different from most stores!
- May also accept CARD
- OCR often reads "TVA" as "TUA" - need OCR error variants
"""
# CUI value(s) identifying this store; presumably what ProfileRegistry matches
# against when selecting a profile - TODO confirm against the registry.
CUI_LIST = ["18993187"]
# Store-name spellings, including the OCR misread "UNL1MITED".
NAME_PATTERNS = ["UNLIMITED KEYS", "UNLIMITED", "UNL1MITED", "UNLIMITED KEYS SRL"]
STORE_NAME = "UNLIMITED KEYS S.R.L."

# Standard TVA patterns - including OCR error variants (TVA -> TUA).
# NOTE: extract_tva_entries addresses these by position ([3], [4], [:3]),
# so the order and count of entries matter.
TVA_PATTERNS = [
    # "TVA A: XX% = YY,YY" or "TVA-A XX% YY,YY" (including TUA OCR error)
    r'T[UV]A\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,\s]+)',
    # "A - XX,XX% = YY,YY"
    r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,\s]+)',
    # "TVA XX% YY,YY" (simple format, includes TUA)
    r'T[UV]A\s+(\d{1,2})\s*%\s*([\d.,\s]+)',
    # "XX.XX% TUA*A YY.YY" (OCR format with TUA*A or TUA)
    r'(\d{1,2})[.,]\d{0,2}\s*%\s*T[UV]A\*?[A-D]?\s*([\d.,\s]+)',
    # "TOTAL TUA: YY.YY" (total TVA amount only)
    r'TOTAL\s+T[UV]A\s*:?\s*([\d.,\s]+)',
]

# TOTAL patterns for UNLIMITED KEYS (handles "80 .00" format):
# (regex with the amount in group 1, confidence)
TOTAL_PATTERNS = [
    # "SUMA TOTALA: 80 .00" (with space before decimal)
    (r'SUMA\s+TOTALA\s*:?\s*([\d\s.,]+)', 0.98),
    # "TOTALA: 80,00"
    (r'TOTALA\s*:?\s*([\d.,]+)', 0.95),
    # Standard TOTAL patterns from base class
    (r'TOTAL\s+(?:DE\s+PLATA|ACHITAT|LEI)\s*:?\s*([\d.,]+)', 0.95),
    (r'TOTAL\s*:?\s*([\d.,]+)', 0.90),
]

# Payment patterns - NUMERAR is primary for this store:
# (regex with the amount in group 1, method name, confidence)
PAYMENT_PATTERNS = [
    # "NUMERAR 80.00" or "NUMERAR: 80.00"
    (r'NUMERAR\s*:?\s*([\d.,\s]+)', 'NUMERAR', 0.98),
    # "CARD 80.00" or "CARD: 80.00"
    (r'CARD\s*:?\s*([\d.,\s]+)', 'CARD', 0.95),
]

# Client CUI patterns - specific to this receipt format:
# (regex with the CUI in group 1, confidence). "R?[O0]?" tolerates a missing
# or OCR-mangled "RO" country prefix.
CLIENT_CUI_PATTERNS = [
    # "CIF CLIENT:1879855" (exact format from OCR)
    (r'CIF\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.99),
    # "CLIENT CIF: ROXXXXXXX"
    (r'CLIENT\s+CIF\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
    # "C.I.F. CLIENT: XXXXXXX"
    (r'C\.?I\.?F\.?\s+CLIENT\s*:?\s*(R?[O0]?\s*\d{6,10})', 0.98),
]

# Override client markers to be less strict
CLIENT_MARKERS = [
    r'CIF\s+CLIENT',
    r'CLIENT\s+CIF',
    r'C\.?I\.?F\.?\s+CLIENT',
    r'CLIENT\s*:',
]
def extract_total(self, text: str) -> Tuple[Optional[Decimal], float]:
    """
    Extract total amount from receipt text.

    Handles UNLIMITED KEYS format with space before decimal (e.g., "80 .00").
    Patterns in TOTAL_PATTERNS are tried in order; the first one whose
    captured amount parses to a positive value wins.

    Args:
        text: Raw OCR text from receipt

    Returns:
        Tuple of (total_amount, confidence) or (None, 0.0)
    """
    text_upper = text.upper()

    for pattern, confidence in self.TOTAL_PATTERNS:
        match = re.search(pattern, text_upper, re.IGNORECASE)
        if not match:
            continue

        captured = match.group(1).strip()
        if not captured:
            continue

        # Capture classes containing \s also swallow newlines, so the group
        # can run on into digits of the following line(s). Keep only the
        # first line before closing the "80 .00" gap; otherwise
        # "80 .00\n1 BUC" would be glued together into "80.001".
        amount_str = re.sub(r'\s+', '', captured.splitlines()[0])

        try:
            amount = self._parse_decimal(amount_str)
        except (ValueError, InvalidOperation):
            continue
        if amount and amount > 0:
            return (amount, confidence)

    return (None, 0.0)
def extract_tva_entries(self, text: str) -> List[dict]:
    """
    Extract TVA entries from receipt text.

    Handles OCR errors where TVA is read as TUA. Layouts are tried in this
    order and the first positive amount is returned as the single entry:

    1. TVA_PATTERNS[3] - "XX.XX% TUA*A YY.YY"; reported with code 'A'.
    2. TVA_PATTERNS[4] - "TOTAL TUA: YY.YY"; no rate is printed on that
       line, so the standard 19% rate is assumed and code 'A' is used.
    3. TVA_PATTERNS[:3] - coded / simple inline formats; code and percent
       come from the match (code defaults to 'A' for two-group patterns).

    Args:
        text: Raw OCR text from receipt

    Returns:
        List of TVA entries with code, percent, and amount (at most one
        entry; empty when nothing matches)
    """
    text_upper = text.upper()

    def first_line_amount(raw):
        """Parse the first line of a captured amount; None unless positive."""
        stripped = raw.strip()
        if not stripped:
            return None
        # The capture classes include \s and therefore newlines; dropping
        # everything after the first line keeps digits from the next line
        # from being merged into the amount when spaces are removed.
        compact = re.sub(r'\s+', '', stripped.splitlines()[0])
        try:
            amount = self._parse_decimal(compact)
        except (ValueError, InvalidOperation):
            return None
        return amount if amount and amount > 0 else None

    # Pattern 4: "XX.XX% TUA*A YY.YY" - common OCR format
    match = re.search(self.TVA_PATTERNS[3], text_upper)
    if match:
        amount = first_line_amount(match.group(2))
        if amount is not None:
            return [{
                'code': 'A',
                'percent': int(match.group(1)),
                'amount': amount
            }]

    # Pattern 5: "TOTAL TUA: YY.YY" - fallback to total TVA
    match = re.search(self.TVA_PATTERNS[4], text_upper)
    if match:
        amount = first_line_amount(match.group(1))
        if amount is not None:
            return [{
                'code': 'A',
                'percent': 19,  # Standard Romanian TVA rate (assumed, not read)
                'amount': amount
            }]

    # Try coded patterns
    for pattern in self.TVA_PATTERNS[:3]:
        for match in re.finditer(pattern, text_upper, re.IGNORECASE):
            groups = match.groups()
            try:
                if len(groups) == 3:
                    code = groups[0].upper()
                    percent = int(groups[1])
                    raw_amount = groups[2]
                else:
                    code = 'A'
                    percent = int(groups[0])
                    raw_amount = groups[1]
            except (ValueError, IndexError):
                continue
            amount = first_line_amount(raw_amount)
            if amount is not None:
                return [{
                    'code': code,
                    'percent': percent,
                    'amount': amount
                }]

    return []
def extract_payment_methods(self, text: str) -> List[dict]:
    """
    Extract payment methods from receipt text.

    Handles NUMERAR (cash) as primary payment for this store. Every entry of
    PAYMENT_PATTERNS is tried once, so a receipt showing both NUMERAR and
    CARD yields two entries.

    Args:
        text: Raw OCR text from receipt

    Returns:
        List of payment methods with method, amount, and confidence
    """
    payments = []
    text_upper = text.upper()

    for pattern, method, confidence in self.PAYMENT_PATTERNS:
        match = re.search(pattern, text_upper, re.IGNORECASE)
        if not match:
            continue

        captured = match.group(1).strip()
        if not captured:
            continue

        # The capture class includes \s and therefore newlines; use only the
        # first captured line so that "NUMERAR 100.00\n20.00" is read as
        # 100.00 instead of the unparseable "100.0020.00".
        amount_str = re.sub(r'\s+', '', captured.splitlines()[0])

        try:
            amount = self._parse_decimal(amount_str)
        except (ValueError, InvalidOperation):
            continue
        if amount and amount > 0:
            payments.append({
                'method': method,
                'amount': amount,
                'confidence': confidence
            })

    return payments
def extract_client_cui(self, text: str) -> Tuple[Optional[str], float]:
    """
    Extract client CUI from receipt text.

    Handles "CIF CLIENT:1879855" format specific to this store. Nothing is
    extracted unless one of CLIENT_MARKERS occurs in the text; after that
    CLIENT_CUI_PATTERNS are tried in order.

    Args:
        text: Raw OCR text from receipt

    Returns:
        Tuple of (cui, confidence) or (None, 0.0). The CUI is returned as
        digits only, without the "RO" prefix, and is 6 to 10 digits long.
    """
    text_upper = text.upper()

    # Check for client markers
    if not any(
        re.search(marker, text_upper, re.IGNORECASE)
        for marker in self.CLIENT_MARKERS
    ):
        return (None, 0.0)

    # Try client CUI patterns
    for pattern, confidence in self.CLIENT_CUI_PATTERNS:
        match = re.search(pattern, text_upper, re.IGNORECASE)
        if not match:
            continue

        # Remove the country prefix explicitly before filtering digits: the
        # patterns accept "R0" as an OCR rendering of "RO", and a plain
        # digit filter would keep that zero as a bogus leading digit.
        without_prefix = re.sub(r'^R[O0]?', '', match.group(1))
        cui_digits = re.sub(r'[^0-9]', '', without_prefix)
        if 6 <= len(cui_digits) <= 10:
            return (cui_digits, confidence)

    return (None, 0.0)
def get_validation_hints(self) -> Dict[str, Any]:
"""Return UNLIMITED KEYS-specific validation hints."""
return {

View File

@@ -456,7 +456,9 @@ class ReceiptExtractor:
# Lookup store-specific profile for enhanced extraction accuracy
store_profile = ProfileRegistry.get_profile(result.cui) if result.cui else None
if store_profile:
print(f"[Profile] Using {store_profile.__class__.__name__} for CUI {result.cui}", flush=True)
print(f"[Profile] Using {store_profile.STORE_NAME} ({store_profile.__class__.__name__}) for CUI {result.cui}", flush=True)
else:
print(f"[Profile] ⚠️ No profile found for CUI '{result.cui}' - using GENERIC extraction", flush=True)
# =========================================================================
# STEP 2: Extract ALL fields using profile (if available) or generic
@@ -490,8 +492,11 @@ class ReceiptExtractor:
result.client_address = client_address
result.confidence_client = confidence
# Log extraction results for debugging
tva_summary = ", ".join([f"{e.get('percent', '?')}%={e.get('amount', '?')}" for e in result.tva_entries]) if result.tva_entries else "none"
payment_summary = ", ".join([f"{p.get('method', '?')}={p.get('amount', '?')}" for p in result.payment_methods]) if result.payment_methods else "none"
print(f"[Profile] Extracted: total={result.amount}, date={result.receipt_date}, "
f"TVA entries={len(result.tva_entries)}, payments={len(result.payment_methods)}", flush=True)
f"TVA=[{tva_summary}], payments=[{payment_summary}], client_cui={result.client_cui}", flush=True)
else:
# Generic extraction for unknown stores
result.amount, result.confidence_amount = self._extract_amount(text_upper)
@@ -507,6 +512,12 @@ class ReceiptExtractor:
result.client_address = client_address
result.confidence_client = confidence
# Log generic extraction results for debugging
tva_summary = ", ".join([f"{e.get('percent', '?')}%={e.get('amount', '?')}" for e in result.tva_entries]) if result.tva_entries else "none"
payment_summary = ", ".join([f"{p.get('method', '?')}={p.get('amount', '?')}" for p in result.payment_methods]) if result.payment_methods else "none"
print(f"[Generic] Extracted: total={result.amount}, date={result.receipt_date}, "
f"TVA=[{tva_summary}], payments=[{payment_summary}], client_cui={result.client_cui}", flush=True)
# Series extraction (no profile method, always generic)
result.receipt_series, _ = self._extract_series(text_upper)