fix(ocr): Fix store profile extraction patterns and module loading

Major fixes to OCR store profiles for Romanian receipt extraction:

- Fix ProfileRegistry module path resolution (was loading 0 profiles)
- Add multiline TVA extraction for Brick, Electrobering, Gama Ink
- Add "CARTE CREDIT" payment detection for OMV/SOCAR gas stations
- Handle OCR artifacts: TVA→TUA, "-"→"4", I→L in CUI markers
- Add client CUI patterns for Brick receipts
- Add profile selection logging to ocr_extractor.py
- Create test script for all 29 PDFs (test_all_profiles.py)

Test results: 13/29 passing (improved from 9/29)
Remaining failures are primarily OCR quality issues.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Claude Agent
2026-01-07 09:40:58 +00:00
parent 099556213d
commit 28f259cd05
13 changed files with 1531 additions and 257 deletions

View File

@@ -2,6 +2,10 @@
GAMA INK SERVICE SRL store profile for OCR extraction.
Toner refill and printer supplies store.
Receipt structure:
- TVA format: "TOTAL TVA A 4 19%" with the amount on the next line (the "4" is an OCR misread of "-")
- "TOTAL TVA BON" with total TVA amount
"""
import re
@@ -15,11 +19,11 @@ from . import ProfileRegistry
@ProfileRegistry.register
class GamaInkProfile(BaseStoreProfile):
"""
GAMA INK SERVICE SRL - standard TVA profile.
GAMA INK SERVICE SRL - standard TVA profile with multiline support.
Key characteristics:
- Standard TVA format (single rate, any percentage)
- Service-based (toner refill, printer supplies)
- TVA format with rate on one line, amount on next
- OCR often reads "-" as "4" (e.g., "A 4 19%" instead of "A - 19%")
- CARD payment typical
"""
@@ -27,21 +31,23 @@ class GamaInkProfile(BaseStoreProfile):
NAME_PATTERNS = ["GAMA INK", "GAMA", "GAMAINK", "GAMA INK SERVICE"]
STORE_NAME = "GAMA INK SERVICE SRL"
# Standard TVA patterns (flexible - accepts any rate)
# GAMA INK TVA patterns (handles OCR errors)
TVA_PATTERNS = [
# "TVA A: XX% = YY,YY" or "TVA-A XX% YY,YY"
r'TVA\s*[-:]?\s*([A-D])\s*:?\s*(\d{1,2})\s*%\s*[=:]?\s*([\d.,]+)',
# "A - XX,XX% = YY,YY"
r'([A-D])\s*[-:]\s*(\d{1,2})[.,]?\d{0,2}\s*%\s*[=:]?\s*([\d.,]+)',
# "TVA XX% YY,YY" (simple format without code)
r'TVA\s+(\d{1,2})\s*%\s*([\d.,]+)',
# "TVA: YY,YY" (amount only, percent inferred)
r'TVA\s*:?\s*([\d.,]+)\s*(?:LEI|RON)?',
# "TOTAL TVA A 4 19%" (4 is OCR for -)
r'TOTAL\s+TVA\s*([A-D])\s*[4\-\s]+(\d{1,2})\s*%',
# "TOTAL TVA A 19%" (no separator between code and rate; the "-"/"4" variants are matched by the pattern above)
r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%',
]
# TOTAL TVA BON pattern (fallback)
TOTAL_TVA_BON_PATTERN = r'TOTAL\s+TVA\s+BON'
def extract_tva_entries(self, text: str) -> List[dict]:
"""
Extract TVA entries from receipt text.
Extract GAMA INK-specific TVA entries.
Format: "TOTAL TVA A 4 19%" on one line, amount on next line.
Note: OCR sometimes misreads "-" as "4".
Args:
text: Raw OCR text from receipt
@@ -50,45 +56,43 @@ class GamaInkProfile(BaseStoreProfile):
List of TVA entries with code, percent, and amount
"""
entries = []
seen = set()
text_upper = text.upper()
lines = text_upper.split('\n')
# Try coded patterns first (have both code and percent)
for pattern in self.TVA_PATTERNS[:2]:
for match in re.finditer(pattern, text, re.IGNORECASE):
try:
code = match.group(1).upper()
percent = int(match.group(2))
amount = self._parse_decimal(match.group(3))
if amount and amount > 0:
entry_key = (code, percent)
if entry_key not in seen:
entries.append({
'code': code,
'percent': percent,
'amount': amount
})
seen.add(entry_key)
except (ValueError, InvalidOperation, IndexError):
continue
# Fallback to simple format (percent + amount without code)
if not entries:
simple_pattern = self.TVA_PATTERNS[2]
for match in re.finditer(simple_pattern, text, re.IGNORECASE):
try:
percent = int(match.group(1))
amount = self._parse_decimal(match.group(2))
# Find TVA rate line and get amount from next line
for i, line in enumerate(lines):
# Match "TOTAL TVA A 4 19%" or "TOTAL TVA A - 19%"
match = re.search(r'TOTAL\s+TVA\s*([A-D])\s*[4\-\s]+(\d{1,2})\s*%', line)
if match:
code = match.group(1)
percent = int(match.group(2))
# Amount should be on next line
if i + 1 < len(lines):
amount_str = lines[i + 1].strip()
amount = self._parse_decimal(amount_str)
if amount and amount > 0:
entries.append({
'code': 'A',
'code': code,
'percent': percent,
'amount': amount
})
break
except (ValueError, InvalidOperation):
continue
return entries
# Fallback: Find TOTAL TVA BON and get amount
for i, line in enumerate(lines):
if re.search(self.TOTAL_TVA_BON_PATTERN, line):
# Amount should be on next line
if i + 1 < len(lines):
amount_str = lines[i + 1].strip()
amount = self._parse_decimal(amount_str)
if amount and amount > 0:
entries.append({
'code': 'A',
'percent': 19, # Default Romanian TVA rate
'amount': amount
})
return entries
return entries
@@ -97,7 +101,8 @@ class GamaInkProfile(BaseStoreProfile):
return {
"has_multi_rate_tva": False,
"card_equals_total": True,
"has_client_cui": False,
"has_client_cui": True, # May have client CUI for business
"has_efactura": False,
"is_non_vat_payer": False,
"tva_on_separate_line": True,
}