Major fixes to OCR store profiles for Romanian receipt extraction: - Fix ProfileRegistry module path resolution (was loading 0 profiles) - Add multiline TVA extraction for Brick, Electrobering, Gama Ink - Add "CARTE CREDIT" payment detection for OMV/SOCAR gas stations - Handle OCR artifacts: TVA→TUA, "-"→"4", I→L in CUI markers - Add client CUI patterns for Brick receipts - Add profile selection logging to ocr_extractor.py - Create test script for all 29 PDFs (test_all_profiles.py) Test results: 13/29 passing (improved from 9/29) Remaining failures are primarily OCR quality issues. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
109 lines
3.6 KiB
Python
109 lines
3.6 KiB
Python
"""
GAMA INK SERVICE SRL store profile for OCR extraction.

Toner refill and printer supplies store.

Receipt structure:
- TVA format: "TOTAL TVA A 4 19%" with amount on next line (4 is OCR for -)
- "TOTAL TVA BON" with total TVA amount
"""
|
|
|
|
import re
|
|
from decimal import Decimal, InvalidOperation
|
|
from typing import List, Dict, Any
|
|
|
|
from .base import BaseStoreProfile
|
|
from . import ProfileRegistry
|
|
|
|
|
|
@ProfileRegistry.register
class GamaInkProfile(BaseStoreProfile):
    """
    GAMA INK SERVICE SRL - standard TVA profile with multiline support.

    Key characteristics:
    - TVA format with the rate on one line and the amount on a following line
    - OCR often reads "-" as "4" (e.g., "A 4 19%" instead of "A - 19%")
    - CARD payment typical
    """

    CUI_LIST = ["17741882"]
    NAME_PATTERNS = ["GAMA INK", "GAMA", "GAMAINK", "GAMA INK SERVICE"]
    STORE_NAME = "GAMA INK SERVICE SRL"

    # GAMA INK TVA rate-line patterns (handle OCR errors).
    # In every pattern group 1 is the TVA code letter and group 2 the percent.
    TVA_PATTERNS = [
        # "TOTAL TVA A - 19%" and its OCR variant "TOTAL TVA A 4 19%"
        # (the "-" separator is frequently read as "4").
        r'TOTAL\s+TVA\s*([A-D])\s*[4\-\s]+(\d{1,2})\s*%',
        # "TOTAL TVA A 19%" (no separator between code and rate)
        r'TOTAL\s+TVA\s+([A-D])\s+(\d{1,2})\s*%',
    ]

    # TOTAL TVA BON pattern (fallback)
    TOTAL_TVA_BON_PATTERN = r'TOTAL\s+TVA\s+BON'

    def _match_rate_line(self, line: str):
        """Return the first TVA_PATTERNS match found in ``line``, or None."""
        for pattern in self.TVA_PATTERNS:
            match = re.search(pattern, line)
            if match:
                return match
        return None

    def _amount_after(self, lines: List[str], index: int):
        """
        Parse the amount from the first non-blank line after ``index``.

        Blank lines are skipped because OCR output may separate the label
        line from its amount with empty lines.

        Args:
            lines: Receipt text split into lines
            index: Position of the label line whose amount is wanted

        Returns:
            The value produced by ``self._parse_decimal`` when it is truthy
            and greater than zero, otherwise None.
        """
        for raw in lines[index + 1:]:
            candidate = raw.strip()
            if not candidate:
                continue
            # NOTE(review): _parse_decimal is not defined in this class;
            # presumably inherited from BaseStoreProfile - confirm.
            amount = self._parse_decimal(candidate)
            if amount and amount > 0:
                return amount
            # Only the first non-blank line is considered; never scan further
            # down the receipt for an unrelated number.
            return None
        return None

    def extract_tva_entries(self, text: str) -> List[dict]:
        """
        Extract GAMA INK-specific TVA entries.

        Format: "TOTAL TVA A 4 19%" on one line, amount on a following line.
        Note: OCR reads "-" as "4" sometimes.

        The first rate line that is followed by a positive amount wins. If no
        rate line yields an amount, the amount after "TOTAL TVA BON" is used
        with code 'A' and a 19% rate.

        Args:
            text: Raw OCR text from receipt

        Returns:
            List with at most one TVA entry (keys: code, percent, amount);
            empty when nothing could be extracted.
        """
        if not text:
            return []

        lines = text.upper().split('\n')

        # Primary path: TVA rate line with the amount below it.
        for i, line in enumerate(lines):
            match = self._match_rate_line(line)
            if not match:
                continue
            amount = self._amount_after(lines, i)
            if amount is None:
                continue
            return [{
                'code': match.group(1),
                'percent': int(match.group(2)),
                'amount': amount,
            }]

        # Fallback: TOTAL TVA BON with the amount below it.
        for i, line in enumerate(lines):
            if not re.search(self.TOTAL_TVA_BON_PATTERN, line):
                continue
            amount = self._amount_after(lines, i)
            if amount is None:
                continue
            return [{
                'code': 'A',
                'percent': 19,  # Default Romanian TVA rate
                'amount': amount,
            }]

        return []

    def get_validation_hints(self) -> Dict[str, Any]:
        """Return GAMA INK-specific validation hints."""
        return {
            "has_multi_rate_tva": False,
            "card_equals_total": True,
            "has_client_cui": True,  # May have client CUI for business
            "has_efactura": False,
            "is_non_vat_payer": False,
            "tva_on_separate_line": True,
        }
|