fix(ocr): Fix store profile extraction patterns and module loading
Major fixes to OCR store profiles for Romanian receipt extraction:

- Fix ProfileRegistry module path resolution (was loading 0 profiles)
- Add multiline TVA extraction for Brick, Electrobering, Gama Ink
- Add "CARTE CREDIT" payment detection for OMV/SOCAR gas stations
- Handle OCR artifacts: TVA→TUA, "-"→"4", I→L in CUI markers
- Add client CUI patterns for Brick receipts
- Add profile selection logging to ocr_extractor.py
- Create test script for all 29 PDFs (test_all_profiles.py)

Test results: 13/29 passing (improved from 9/29).
Remaining failures are primarily OCR quality issues.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -2,11 +2,16 @@
|
||||
ELECTROBERING S.R.L. store profile for OCR extraction.
|
||||
|
||||
Electronics and home supplies store.
|
||||
|
||||
Receipt structure:
|
||||
- TVA format: "TOTAL TVA A - - 19%" with amount on next line
|
||||
- "TOTAL TVA BON" with total TVA amount
|
||||
- Client CUI: "CIF CLIENT: XXXXXXX"
|
||||
"""
|
||||
|
||||
import re
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from typing import List, Dict, Any
|
||||
from typing import List, Dict, Any, Tuple, Optional
|
||||
|
||||
from .base import BaseStoreProfile
|
||||
from . import ProfileRegistry
|
||||
@@ -15,11 +20,11 @@ from . import ProfileRegistry
|
||||
@ProfileRegistry.register
|
||||
class ElectroberingProfile(BaseStoreProfile):
|
||||
"""
|
||||
ELECTROBERING S.R.L. - standard TVA profile.
|
||||
ELECTROBERING S.R.L. - standard TVA profile with multiline support.
|
||||
|
||||
Key characteristics:
|
||||
- Standard TVA format (single rate, any percentage)
|
||||
- Electronics and home supplies
|
||||
- TVA format with rate on one line, amount on next
|
||||
- Double-dash separators common (OCR artifact)
|
||||
- May have client CUI for B2B purchases
|
||||
- CARD payment typical
|
||||
"""
|
||||
@@ -28,19 +33,28 @@ class ElectroberingProfile(BaseStoreProfile):
|
||||
NAME_PATTERNS = ["ELECTROBERING", "ELECTR0BERING", "ELECTROBERING SRL"]
|
||||
STORE_NAME = "ELECTROBERING S.R.L."
|
||||
|
||||
# Standard TVA patterns (flexible - accepts any rate)
|
||||
# ELECTROBERING TVA patterns (handles double-dash and multiline)
|
||||
TVA_PATTERNS = [
|
||||
# "TVA A: XX% = YY,YY" or "TVA-A XX% YY,YY"
|
||||
r'TVA\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
|
||||
# "A - XX,XX% = YY,YY"
|
||||
r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,]+)',
|
||||
# "TVA XX% YY,YY" (simple format without code)
|
||||
r'TVA\s+(\d{1,2})\s*%\s*([\d.,]+)',
|
||||
# "TOTAL TVA A - - 19%" with amount on next line
|
||||
r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%',
|
||||
# "TOTAL TVA A 19%" without separator
|
||||
r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%',
|
||||
# Standard: "TVA A: XX% = YY,YY"
|
||||
r'TVA\s*([A-D])\s*[-:]?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
|
||||
]
|
||||
|
||||
# TOTAL TVA BON pattern (fallback)
|
||||
TOTAL_TVA_BON_PATTERN = r'TOTAL\s+TVA\s+BON'
|
||||
|
||||
def extract_tva_entries(self, text: str) -> List[dict]:
|
||||
"""
|
||||
Extract TVA entries from receipt text.
|
||||
Extract ELECTROBERING-specific TVA entries.
|
||||
|
||||
ELECTROBERING receipts show TVA in multi-line format:
|
||||
"TOTAL TVA A - - 19%"
|
||||
"5.59"
|
||||
"TOTAL TVA BON"
|
||||
"5.59"
|
||||
|
||||
Args:
|
||||
text: Raw OCR text from receipt
|
||||
@@ -49,45 +63,61 @@ class ElectroberingProfile(BaseStoreProfile):
|
||||
List of TVA entries with code, percent, and amount
|
||||
"""
|
||||
entries = []
|
||||
seen = set()
|
||||
text_upper = text.upper()
|
||||
lines = text_upper.split('\n')
|
||||
|
||||
# Try coded patterns first
|
||||
for pattern in self.TVA_PATTERNS[:2]:
|
||||
for match in re.finditer(pattern, text, re.IGNORECASE):
|
||||
try:
|
||||
code = match.group(1).upper()
|
||||
percent = int(match.group(2))
|
||||
amount = self._parse_decimal(match.group(3))
|
||||
|
||||
if amount and amount > 0:
|
||||
entry_key = (code, percent)
|
||||
if entry_key not in seen:
|
||||
entries.append({
|
||||
'code': code,
|
||||
'percent': percent,
|
||||
'amount': amount
|
||||
})
|
||||
seen.add(entry_key)
|
||||
except (ValueError, InvalidOperation, IndexError):
|
||||
continue
|
||||
|
||||
# Fallback to simple format
|
||||
if not entries:
|
||||
simple_pattern = self.TVA_PATTERNS[2]
|
||||
for match in re.finditer(simple_pattern, text, re.IGNORECASE):
|
||||
try:
|
||||
percent = int(match.group(1))
|
||||
amount = self._parse_decimal(match.group(2))
|
||||
# Find TVA rate line and get amount from next line
|
||||
for i, line in enumerate(lines):
|
||||
# Match "TOTAL TVA A - - 19%" or "TOTAL TVA A 19%"
|
||||
match = re.search(r'TOTAL\s+TVA\s*([A-D])\s*[-\s]+(\d{1,2})\s*%', line)
|
||||
if match:
|
||||
code = match.group(1)
|
||||
percent = int(match.group(2))
|
||||
|
||||
# Amount should be on next line
|
||||
if i + 1 < len(lines):
|
||||
amount_str = lines[i + 1].strip()
|
||||
amount = self._parse_decimal(amount_str)
|
||||
if amount and amount > 0:
|
||||
entries.append({
|
||||
'code': 'A',
|
||||
'code': code,
|
||||
'percent': percent,
|
||||
'amount': amount
|
||||
})
|
||||
break
|
||||
return entries
|
||||
|
||||
# Fallback: Find TOTAL TVA BON and get amount
|
||||
for i, line in enumerate(lines):
|
||||
if re.search(self.TOTAL_TVA_BON_PATTERN, line):
|
||||
# Amount should be on next line
|
||||
if i + 1 < len(lines):
|
||||
amount_str = lines[i + 1].strip()
|
||||
amount = self._parse_decimal(amount_str)
|
||||
if amount and amount > 0:
|
||||
entries.append({
|
||||
'code': 'A',
|
||||
'percent': 19, # Default Romanian TVA rate
|
||||
'amount': amount
|
||||
})
|
||||
return entries
|
||||
|
||||
# Last fallback: inline format "TVA A: XX% = YY,YY"
|
||||
for pattern in [self.TVA_PATTERNS[2]]:
|
||||
match = re.search(pattern, text_upper, re.IGNORECASE)
|
||||
if match and len(match.groups()) >= 3:
|
||||
try:
|
||||
code = match.group(1)
|
||||
percent = int(match.group(2))
|
||||
amount = self._parse_decimal(match.group(3))
|
||||
if amount and amount > 0:
|
||||
entries.append({
|
||||
'code': code,
|
||||
'percent': percent,
|
||||
'amount': amount
|
||||
})
|
||||
return entries
|
||||
except (ValueError, InvalidOperation):
|
||||
continue
|
||||
pass
|
||||
|
||||
return entries
|
||||
|
||||
@@ -99,4 +129,5 @@ class ElectroberingProfile(BaseStoreProfile):
|
||||
"has_client_cui": True, # May have client CUI for B2B
|
||||
"has_efactura": False,
|
||||
"is_non_vat_payer": False,
|
||||
"tva_on_separate_line": True,
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user