feat: Add multiple TVA entries support for Romanian receipts

- Add TvaEntry schema supporting multiple TVA rates (A, B, C, D codes)
- Update OCR extractor to extract multiple TVA entries from receipts
- Support both old (19%, 9%, 5%) and new Romanian rates (21%, 11% from Aug 2025)
- Add tva_breakdown, tva_total, items_count, vendor_address to Receipt model
- Update OCRPreview.vue to display TVA entries with rate badges
- Add "Detalii Suplimentare" section in ReceiptCreateView with editable TVA table
- Add TVA breakdown display in ReceiptDetailView
- Create database migration for new TVA columns

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-12 16:23:53 +02:00
parent 41ae97180e
commit 20448f7aa0
11 changed files with 1021 additions and 68 deletions

View File

@@ -3,7 +3,7 @@
import re
from datetime import date, datetime
from decimal import Decimal, InvalidOperation
from typing import Optional, Tuple
from typing import Optional, Tuple, List
from dataclasses import dataclass, field
@@ -18,6 +18,11 @@ class ExtractionResult:
partner_name: Optional[str] = None
cui: Optional[str] = None
description: Optional[str] = None
# Additional extracted fields - Multiple TVA entries support
tva_entries: List[dict] = field(default_factory=list) # [{code, percent, amount}]
tva_total: Optional[Decimal] = None
address: Optional[str] = None
items_count: Optional[int] = None
confidence_amount: float = 0.0
confidence_date: float = 0.0
@@ -40,44 +45,158 @@ class ReceiptExtractor:
"""Extract receipt fields using pattern matching for Romanian receipts."""
# Total amount patterns (most specific first)
# Romanian receipts use various formats: TOTAL LEI, TOTAL:, TOTAL RON, etc.
# OCR often produces errors, so patterns must be tolerant
#
# Each entry is (regex, confidence); group 1 captures the raw amount text.
# The TOTAL patterns carry a (?<![A-Z]) guard so they cannot match the tail of
# "SUBTOTAL": without it, a SUBTOTAL line printed before the real TOTAL would
# be returned with TOTAL-level confidence and the SUBTOTAL fallbacks below
# would rarely be reached.
TOTAL_PATTERNS = [
    # Most common: TOTAL LEI followed by amount
    (r'(?<![A-Z])TOTAL\s+LEI\s*([\d\s.,]+)', 0.98),
    (r'(?<![A-Z])[OT]?OTAL\s+LEI\s*([\d\s.,]+)', 0.95),  # OCR may miss first letter
    # Standard patterns
    (r'(?<![A-Z])TOTAL\s*:?\s*([\d\s.,]+)\s*(?:RON|LEI)?', 0.95),
    (r'(?<![A-Z])TOTAL\s+(?:RON|LEI)\s*([\d\s.,]+)', 0.95),
    # SUBTOTAL when TOTAL not found (optional currency word and colon, so the
    # "SUBTOTAL LEI 12,50" / "SUBTOTAL: 12,50" layouts are still recognised)
    (r'SUBTOTAL\s*(?:LEI|RON)?\s*:?\s*([\d\s.,]+)', 0.90),
    (r'[SB]?UBTOTAL\s*(?:LEI|RON)?\s*:?\s*([\d\s.,]+)', 0.88),  # OCR variations
    # Payment methods
    (r'DE\s+PLATA\s*:?\s*([\d\s.,]+)', 0.90),
    (r'SUMA\s*:?\s*([\d\s.,]+)', 0.85),
    (r'PLATA\s+CARD\s*:?\s*([\d\s.,]+)', 0.85),
    (r'NUMERAR\s*:?\s*([\d\s.,]+)', 0.80),
    (r'REST\s*:?\s*([\d\s.,]+)', 0.70),  # Sometimes total is near REST
]
# Date patterns
# Fallback: Find the largest repeated amount (likely the total)
# This handles cases where OCR doesn't capture "TOTAL" keyword
# Date patterns - support dash, dot, and slash separators
# OCR may produce DRTA instead of DATA, DAIA, etc.
DATE_PATTERNS = [
(r'DATA\s*:?\s*(\d{2}[./]\d{2}[./]\d{4})', 0.95),
(r'(\d{2}[./]\d{2}[./]\d{4})\s+\d{2}:\d{2}', 0.90),
(r'(\d{2}[./]\d{2}[./]\d{4})', 0.80),
(r'(\d{4}[./]\d{2}[./]\d{2})', 0.75), # YYYY.MM.DD format
# DATA/DRTA/DAIA: DD-MM-YYYY (OCR tolerant)
(r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
(r'DATA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98),
# Date followed by ORA (time) - OCR may produce 0RA
(r'(\d{2}[-./]\d{2}[-./]\d{4})\s+[O0]RA\s*:?\s*\d{2}:\d{2}', 0.95),
# Date followed by time without ORA keyword
(r'(\d{2}[-./]\d{2}[-./]\d{4})\s+\d{2}:\d{2}', 0.90),
# Standalone date
(r'(\d{2}[-./]\d{2}[-./]\d{4})', 0.80),
# YYYY-MM-DD format (less common)
(r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75),
]
# Receipt number patterns
# Receipt number patterns - Romanian fiscal receipt formats
# OCR may produce N instead of : or other errors
NUMBER_PATTERNS = [
# NDS format (common in Romanian POS)
(r'NDS\s*:?\s*(\d+)', 0.98),
# C3POS terminal format - OCR may have N instead of : (C3POS-CT2N1360760)
(r'C3POS[-A-Z0-9]*[N:](\d{6,7})', 0.98), # CT2N1360760 format
(r'C3POS.*?(\d{6,7})\b', 0.95), # Any C3POS followed by 6-7 digit number
(r'CT2[N:]\s*(\d{6,})', 0.95), # CT2N prefix
# BF (Bon Fiscal) number
(r'BF\s*:?\s*(\d+)', 0.93),
# NIVS format
(r'NIVS\s*:?\s*(\d+)', 0.95),
# Standard NR BON formats
(r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95),
(r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95),
(r'CHITANTA\s+NR\.?\s*:?\s*(\d+)', 0.95),
# Document number
(r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90),
(r'NR\.?\s*:?\s*(\d{4,})', 0.70),
# ID BF format
(r'ID\s*BF\s*:?\s*(\d+)', 0.90),
# TD format (transaction ID)
(r'TD\s*:?\s*(\d+)', 0.85),
# 6-8 digit number (typical receipt number length)
(r'\b(\d{6,8})\b', 0.70),
# Generic long number at end (fallback)
(r'NR\.?\s*:?\s*(\d{4,})', 0.65),
]
# CUI (fiscal code) patterns
# CUI (fiscal code) patterns - IMPORTANT: exclude CLIENT CUI
# CIF = Cod de Identificare Fiscală (vendor's tax ID)
# CLIENT C.U.I. = client's tax ID (should be ignored)
# OCR errors: R0 instead of RO, C1F instead of CIF
CUI_PATTERNS = [
(r'C\.?U\.?I\.?\s*:?\s*(?:RO)?(\d{6,10})', 0.95),
(r'C\.?I\.?F\.?\s*:?\s*(?:RO)?(\d{6,10})', 0.95),
(r'COD\s+FISCAL\s*:?\s*(?:RO)?(\d{6,10})', 0.90),
(r'(?:RO)?(\d{6,10})\s*-?\s*(?:J|CUI)', 0.80),
# CIF at start of line (definitely vendor) - tolerant to OCR errors
(r'^CIF\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98),
(r'^C[I1]F\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95), # C1F OCR error
# CIF not preceded by CLIENT (negative lookbehind)
(r'(?<!CLIENT\s)(?<!LIENT\s)CIF\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95),
# Standalone CIF: format with OCR tolerance
(r'\bC[I1]F\s*:?\s*(?:R[O0])?(\d{6,10})\b', 0.90),
# COD FISCAL (vendor)
(r'COD\s+FISCAL\s*:?\s*(?:R[O0])?(\d{6,10})', 0.90),
# C.I.F. format (with dots)
(r'(?<!CLIENT\s)C\.[I1]\.F\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.88),
# CUI format (less specific, use with caution)
(r'(?<!CLIENT\s)C\.?U\.?[I1]\.?\s*:?\s*(?:R[O0])?(\d{6,10})', 0.85),
]
# Series patterns
# Series patterns - be strict to avoid false matches
SERIES_PATTERNS = [
(r'SERIE\s*:?\s*([A-Z]{1,4})', 0.90),
(r'([A-Z]{2,4})\s+NR\.?\s*\d+', 0.80),
# Z: format from Romanian fiscal receipts (must be at start of line or after space)
(r'(?:^|\s)Z\s*:\s*(\d{4})', 0.85),
# BF series with explicit marker
(r'(?:^|\s)BF\s*:\s*(\d{4})', 0.85),
]
# TVA (VAT) patterns - OCR may produce TUA, TVR, etc.
# Each entry is (regex, confidence). Entries come in two shapes:
#   - one capture group: the amount only
#   - two capture groups: (percent, amount)
# _extract_tva_entries tells them apart via match.lastindex and ignores the
# confidence value.
TVA_PATTERNS = [
    # TOTAL TVA BON format (OCR tolerant: TUA, TVR) - amount only
    (r'TOTAL\s+T[VU][AR]\s+BON\s*:?\s*([\d\s.,]+)', 0.98),
    (r'T[O0]TAL\s+T[VU][AR]\s*:?\s*([\d\s.,]+)', 0.95),
    # TVA with percentage (OCR tolerant) - (percent, amount)
    (r'T[VU][AR]\s+(?:A\s*[-:]?\s*)?(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.95),
    (r'T[VU][AR]\s+[A-Z]\s*[-:]\s*(\d{1,2})\s*%\s*([\d\s.,]+)', 0.93),
    # Simple TVA pattern - amount only
    (r'T[VU][AR]\s*:?\s*([\d\s.,]+)', 0.85),
    # Standalone percentage line near TVA - (percent, amount).
    # NOTE: the regex itself does not require a TVA keyword nearby.
    (r'(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)', 0.75),
]
# Items count patterns - OCR may produce OZ instead of POZ, etc.
# Number may be on separate line before or after the label
# Each entry is (regex, confidence); group 1 is the item count.
# _extract_items_count tries them in order and ignores the confidence value.
ITEMS_COUNT_PATTERNS = [
    # NR. POZ. ART. IN BON: 17 (Romanian format with dots and spaces)
    # OCR tolerant: OZ instead of POZ, 0 instead of O ("ART" itself must be
    # read correctly - the pattern has no alternative spelling for it)
    (r'NR\.?\s*P?[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*(\d+)', 0.98),
    # Number on line BEFORE "OZ. ART. IN BON:" - OCR sometimes reorders
    (r'(\d{1,2})\s*\n\s*[O0]Z\.?\s*ART', 0.95),
    # Number may be on next line after label
    (r'[O0]Z\.?\s*ART\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.93),
    (r'NR\.?\s*(?:P?[O0]Z\.?)?\s*ART(?:ICOLE)?\.?\s*(?:IN\s+BON)?\s*:?\s*[\n\s]*(\d+)', 0.90),
    # Simpler patterns
    (r'ARTIC[O0]LE\s*:?\s*(\d+)', 0.88),
    (r'P?[O0]Z\s*:?\s*(\d+)', 0.85),
    # X articole/pozitii (also matches "<n> BUC")
    (r'(\d+)\s*(?:ARTIC[O0]LE|P[O0]ZITII|BUC)', 0.80),
]
# Address patterns (Romanian format)
# Each entry is (regex, confidence); group 1 is the matched address fragment.
# NOTE(review): _extract_address, as visible in this file, uses its own inline
# regexes and does not read this table - confirm whether it is used elsewhere.
ADDRESS_PATTERNS = [
    # Street patterns: "STR." followed by name text and an optional "NR. <n>"
    (r'(STR\.?\s+[A-Z0-9\s.,]+(?:NR\.?\s*\d+)?)', 0.90),
    # Full address with JUD (county), optionally followed by MUN./OR./COM.
    (r'(JUD\.?\s+[A-Z]+,?\s*(?:MUN\.?|OR\.?|COM\.?)?\s*[A-Z]+)', 0.85),
]
# Vendor name indicators (lines containing these are likely vendor names)
# Plain regex strings (no confidence value). _extract_vendor applies each one
# with re.search to the upper-cased line, so the dotted and undotted spellings
# of a legal form (e.g. "S.R.L." / "SRL") both match.
VENDOR_INDICATORS = [
    r'\bS\.?R\.?L\.?\b',  # S.R.L.
    r'\bS\.?A\.?\b',  # S.A.
    r'\bS\.?N\.?C\.?\b',  # S.N.C.
    r'\bS\.?C\.?S\.?\b',  # S.C.S.
    r'\bI\.?I\.?\b',  # I.I. (Individual)
    r'\bP\.?F\.?A\.?\b',  # P.F.A.
    r'\bS\.?C\.?\b',  # S.C.
    # Generic business words, matched anywhere in the line (no word boundary)
    r'HOLDING',
    r'COMPANY',
    r'GROUP',
    r'MAGAZIN',
    r'MARKET',
    r'SHOP',
]
def extract(self, text: str) -> ExtractionResult:
@@ -86,13 +205,18 @@ class ReceiptExtractor:
result.raw_text = text
text_upper = text.upper()
# Extract fields
# Extract core fields
result.amount, result.confidence_amount = self._extract_amount(text_upper)
result.receipt_date, result.confidence_date = self._extract_date(text_upper)
result.receipt_number, _ = self._extract_number(text_upper)
result.receipt_series, _ = self._extract_series(text_upper)
result.partner_name, result.confidence_vendor = self._extract_vendor(text)
result.cui, _ = self._extract_cui(text_upper)
result.cui, _ = self._extract_cui(text_upper, text)
# Extract additional fields - Multiple TVA entries
result.tva_entries, result.tva_total = self._extract_tva_entries(text_upper)
result.items_count = self._extract_items_count(text_upper)
result.address = self._extract_address(text_upper)
# Detect receipt type
result.receipt_type = self._detect_receipt_type(text_upper)
@@ -101,18 +225,85 @@ class ReceiptExtractor:
def _extract_amount(self, text: str) -> Tuple[Optional[Decimal], float]:
    """Extract the receipt total from OCR text.

    Strategies are tried in order and the first one that yields an amount wins:

    1. Keyword patterns from ``TOTAL_PATTERNS`` (confidence comes from the table).
    2. The largest stand-alone amount (> 10) printed after the last product
       line; confidence 0.85 if that amount occurs at least twice, else 0.75.
    3. The largest amount (> 20) that occurs at least twice anywhere; 0.65.
    4. The first stand-alone amount greater than 50; 0.50.

    Args:
        text: OCR text of the receipt (``extract`` passes it upper-cased).

    Returns:
        ``(amount, confidence)``, or ``(None, 0.0)`` when nothing usable is found.
    """
    from collections import Counter

    def to_decimal(number_text):
        """Parse a cleaned number string; return None when it is not numeric."""
        try:
            return Decimal(self._normalize_number(number_text))
        except (InvalidOperation, ValueError):
            return None

    def standalone_amounts(block):
        """Amounts that sit alone on a line of *block*, in order of appearance."""
        found = []
        for line_match in re.finditer(
            r'^[\s]*(\d{2,4}[.,]\s*\d{2})[\s]*$', block, re.MULTILINE
        ):
            value = to_decimal(line_match.group(1).replace(' ', ''))
            if value is not None:
                found.append(value)
        return found

    # Strategy 1: keyword patterns (TOTAL, SUBTOTAL, etc.)
    for pattern, confidence in self.TOTAL_PATTERNS:
        match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
        if not match:
            continue
        # The capture class contains \s, which also matches newlines, so the
        # group can run on into digits printed on the following lines. Keep
        # only the line the amount starts on; if that line ends in a decimal
        # separator the OCR split the number, so take the next line as well.
        captured_lines = match.group(1).strip().splitlines()
        if not captured_lines:
            continue
        amount_text = captured_lines[0]
        if amount_text.rstrip().endswith(('.', ',')) and len(captured_lines) > 1:
            amount_text += captured_lines[1]
        amount = to_decimal(re.sub(r'[^\d.,]', '', amount_text))
        if amount is not None and amount > 0:
            return amount, confidence

    # Strategy 2: amounts AFTER the product lines end.
    # Products look like "X BUC/ROLA X price = price"; the total follows them.
    product_matches = list(
        re.finditer(r'\d\s+(?:BUC|ROLA|ROLN|ROL)\s+X', text, re.IGNORECASE)
    )
    if product_matches:
        after_products = text[product_matches[-1].end():]
        # Small values are unlikely to be the receipt total.
        large_amounts = [a for a in standalone_amounts(after_products) if a > 10]
        if large_amounts:
            max_amount = max(large_amounts)
            # Higher confidence if the same amount is printed more than once.
            confidence = 0.85 if large_amounts.count(max_amount) >= 2 else 0.75
            return max_amount, confidence

    # Strategy 3: the largest amount that is repeated.
    # Normalize spaces in numbers first (OCR may produce "186. 16").
    normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text)
    parsed = (
        to_decimal(raw)
        for raw in re.findall(r'(\d{2,4}[.,]\d{2})\b', normalized_text)
    )
    # Count by numeric value so "186.16" and "186,16" are the same amount.
    value_counts = Counter(value for value in parsed if value is not None)
    repeated = [
        value for value, count in value_counts.items()
        if count >= 2 and value > 20
    ]
    if repeated:
        return max(repeated), 0.65

    # Last resort: first stand-alone amount above a higher threshold.
    for amount in standalone_amounts(text):
        if amount > 50:
            return amount, 0.50

    return None, 0.0
def _normalize_number(self, num_str: str) -> str:
@@ -147,7 +338,8 @@ class ReceiptExtractor:
match = re.search(pattern, text)
if match:
try:
date_str = match.group(1).replace('/', '.')
# Normalize separators to dots
date_str = match.group(1).replace('/', '.').replace('-', '.')
# Try DD.MM.YYYY format first
try:
@@ -181,23 +373,68 @@ class ReceiptExtractor:
return None, 0.0
def _extract_vendor(self, text: str) -> Tuple[Optional[str], float]:
"""Extract vendor/partner name from text."""
"""
Extract vendor/partner name from text.
Uses multiple strategies:
1. Look for lines with company type indicators (S.R.L., S.A., etc.)
2. Look for lines near CIF
3. Use first valid line as fallback
"""
lines = text.split('\n')
skip_keywords = [
'BON', 'FISCAL', 'TOTAL', 'DATA', 'NR', 'ORA',
'SUBTOTAL', 'TVA', 'PLATA', 'CARD', 'NUMERAR',
'RON', 'LEI', 'CHITANTA', 'REST'
'RON', 'LEI', 'CHITANTA', 'REST', 'CLIENT',
'OPERATOR', 'CASIER', 'POS', 'AMEF', 'BINE ATI VENIT',
'VA RUGAM', 'PASTRATI', 'VOCEA', 'TIPARIT',
'DETERGENT', 'PROSOP', 'HARTIE', 'SACI', 'SPRAY',
'BUC', 'ROLA', 'CUMPARATOR'
]
for i, line in enumerate(lines[:7]): # Check first 7 lines
# Strategy 1: Look for lines with vendor indicators (S.R.L., S.A., HOLDING, etc.)
for i, line in enumerate(lines[:15]): # Check first 15 lines
line = line.strip()
if not line or len(line) < 3:
continue
line_upper = line.upper()
# Check for vendor indicators
for indicator in self.VENDOR_INDICATORS:
if re.search(indicator, line_upper):
# Found a company name indicator
vendor = self._clean_vendor_name(line)
if vendor and len(vendor) >= 3:
# High confidence for lines with company indicators
return vendor, 0.95
# Strategy 2: Look for lines right before or after CIF
for i, line in enumerate(lines[:15]):
line_upper = line.upper()
if 'CIF' in line_upper and 'CLIENT' not in line_upper:
# Check line before
if i > 0:
prev_line = lines[i-1].strip()
if prev_line and len(prev_line) >= 3:
if not any(kw in prev_line.upper() for kw in skip_keywords):
vendor = self._clean_vendor_name(prev_line)
if vendor:
return vendor, 0.85
# Strategy 3: First valid line as fallback
for i, line in enumerate(lines[:10]):
line = line.strip()
# Skip empty lines
if not line:
if not line or len(line) < 3:
continue
# Skip lines that are just numbers
if re.match(r'^[\d.,\s]+$', line):
# Skip lines that are just numbers or codes
if re.match(r'^[\d.,\s:]+$', line):
continue
# Skip lines with barcodes/product codes
if re.match(r'^[A-Z]*\d{6,}', line):
continue
# Skip lines with keywords
@@ -205,23 +442,68 @@ class ReceiptExtractor:
continue
# Clean the line
vendor = re.sub(r'[^\w\s.,&-]', '', line).strip()
vendor = self._clean_vendor_name(line)
if len(vendor) >= 3:
if vendor and len(vendor) >= 3:
# Confidence decreases for lines further down
confidence = max(0.3, 0.8 - (i * 0.1))
confidence = max(0.3, 0.7 - (i * 0.05))
return vendor, confidence
return None, 0.0
def _extract_cui(self, text: str) -> Tuple[Optional[str], float]:
"""Extract CUI (fiscal identification code) from text."""
def _clean_vendor_name(self, name: str) -> Optional[str]:
"""Clean and normalize vendor name."""
if not name:
return None
# Remove common OCR artifacts
name = re.sub(r'[^\w\s.,&\-()]', ' ', name)
# Normalize whitespace
name = re.sub(r'\s+', ' ', name).strip()
# Skip if it looks like an address line only
if re.match(r'^(STR|JUD|MUN|NR|BL|SC|ET|AP)\.?\s', name.upper()):
return None
# Skip if too short after cleaning
if len(name) < 3:
return None
return name
def _extract_cui(self, text_upper: str, original_text: str) -> Tuple[Optional[str], float]:
"""
Extract vendor CUI (fiscal identification code) from text.
Excludes CLIENT CUI which appears as 'CLIENT C.U.I./C.I.F.:...'
"""
# First, try to find CIF on a line that doesn't contain CLIENT
lines = text_upper.split('\n')
for line in lines:
# Skip lines that contain CLIENT (these are buyer's CUI, not vendor's)
if 'CLIENT' in line or 'CUMPARATOR' in line or 'LIENT' in line:
continue
# Look for CIF in this line
for pattern, confidence in self.CUI_PATTERNS:
match = re.search(pattern, line, re.IGNORECASE | re.MULTILINE)
if match:
cui = match.group(1)
if 6 <= len(cui) <= 10:
return cui, confidence
# Fallback: search entire text but exclude CLIENT patterns
for pattern, confidence in self.CUI_PATTERNS:
match = re.search(pattern, text, re.IGNORECASE)
if match:
# Find all matches
for match in re.finditer(pattern, text_upper, re.IGNORECASE | re.MULTILINE):
cui = match.group(1)
if 6 <= len(cui) <= 10:
return cui, confidence
# Check if this match is preceded by CLIENT in the same line
start = match.start()
line_start = text_upper.rfind('\n', 0, start) + 1
line_text = text_upper[line_start:start]
if 'CLIENT' not in line_text and 'LIENT' not in line_text:
return cui, confidence
return None, 0.0
def _detect_receipt_type(self, text: str) -> str:
@@ -229,3 +511,223 @@ class ReceiptExtractor:
if 'CHITANTA' in text or 'CHITANȚĂ' in text:
return 'chitanta'
return 'bon_fiscal'
def _extract_tva_entries(self, text: str) -> Tuple[List[dict], Optional[Decimal]]:
    """Extract multiple TVA (VAT) entries from text.

    Romanian receipts can list several TVA rates, each tagged with a letter
    code (e.g. A=19%, B=9%, C=5%, D=0%). Four layouts are tried in order and
    a later one runs only while no entry has been found:

    1. ``TVA A - 19%: 15.20`` (explicit code).
    2. ``TVA - 21%: 32.31`` (no code; derived from the percentage).
    3. ``TVAA - 21%`` with the amount at the start of the following text.
    4. The generic ``TVA_PATTERNS`` table; only the first usable match is kept.

    Args:
        text: OCR text of the receipt (``extract`` passes it upper-cased).

    Returns:
        ``(tva_entries, tva_total)``. ``tva_entries`` is a list of dicts such
        as ``{'code': 'A', 'percent': 19, 'amount': Decimal('15.20')}`` sorted
        by code; ``tva_total`` is the sum of their amounts, or ``None`` when
        the list is empty.
    """
    tva_entries: List[dict] = []
    seen_entries = set()  # (code, percent) pairs already recorded

    def record(code, percent, raw_amount):
        """Parse *raw_amount* and append a new entry; return True if added."""
        # The amount capture classes contain \s, which also matches newlines,
        # so the group can run on into digits from the following lines. Only
        # the line the amount starts on belongs to this entry.
        amount_lines = raw_amount.strip().splitlines()
        amount_text = amount_lines[0] if amount_lines else ''
        try:
            amount = Decimal(
                self._normalize_number(re.sub(r'[^\d.,]', '', amount_text))
            )
        except (ValueError, InvalidOperation):
            return False
        entry_key = (code, percent)
        if amount <= 0 or entry_key in seen_entries:
            return False
        tva_entries.append({
            'code': code,
            'percent': percent,
            'amount': amount
        })
        seen_entries.add(entry_key)
        return True

    # Normalize spaces in numbers first (OCR may produce "32. 31")
    normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text)

    # Pattern 1: "TVA A - 19%: 15.20" or "TVAA - 21% 32.31" (with code)
    # OCR tolerant: TUA, TVR, etc.
    pattern_with_code = r'T[VU][AR]\s*([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)'
    for match in re.finditer(pattern_with_code, normalized_text, re.IGNORECASE):
        record(match.group(1).upper(), int(match.group(2)), match.group(3))

    # Pattern 2: "TVA - 21%: 32.31" (no explicit code; derive it from the rate)
    if not tva_entries:
        pattern_no_code = r'T[VU][AR]\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)'
        for match in re.finditer(pattern_no_code, normalized_text, re.IGNORECASE):
            percent = int(match.group(1))
            record(self._get_tva_code_from_percent(percent), percent, match.group(2))

    # Pattern 3: "TVAA - 21%" on one line, amount on next line
    if not tva_entries:
        tva_line_pattern = r'T[VU][AR]\s*([A-D])?\s*[-:]\s*(\d{1,2})\s*%'
        for match in re.finditer(tva_line_pattern, normalized_text, re.IGNORECASE):
            code = (match.group(1) or 'A').upper()
            percent = int(match.group(2))
            # Look for amount on the next line or immediately after
            after_tva = normalized_text[match.end():]
            amount_match = re.search(r'^[\s\n]*([\d.,]+)', after_tva)
            if amount_match:
                record(code, percent, amount_match.group(1))

    # Pattern 4: Use TVA_PATTERNS for fallback
    if not tva_entries:
        for pattern, _confidence in self.TVA_PATTERNS:
            match = re.search(pattern, normalized_text, re.IGNORECASE)
            if not match:
                continue
            # Some patterns have 2 groups (percent, amount), others just amount
            if (match.lastindex or 0) >= 2:
                percent = int(match.group(1))
                raw_amount = match.group(2)
            else:
                raw_amount = match.group(1)
                # Try to detect percent from text
                percent = self._detect_tva_percent(text)
            if not percent:
                continue
            code = self._get_tva_code_from_percent(percent)
            if record(code, percent, raw_amount):
                break  # Only use first match from fallback

    # Calculate total
    tva_total = None
    if tva_entries:
        tva_total = sum(entry['amount'] for entry in tva_entries)

    # Sort by code (A, B, C, D)
    tva_entries.sort(key=lambda x: x.get('code', 'Z'))
    return tva_entries, tva_total
def _get_tva_code_from_percent(self, percent: int) -> str:
"""Map TVA percentage to standard Romanian code.
Romanian TVA rates changed in August 2025:
- Standard rate: 19% → 21%
- Reduced rate: 9% → 11%
- Other rates (5%, 0%) remain unchanged
Old rates (before Aug 2025): New rates (from Aug 2025):
- A = 19% (standard) - A = 21% (standard)
- B = 9% (reduced) - B = 11% (reduced)
- C = 5% (reduced) - C = 5% (reduced)
- D = 0% (exempt) - D = 0% (exempt)
Both old and new rates are supported for historical receipts.
"""
if percent in (19, 21):
return 'A' # Standard rate (19% old, 21% new from Aug 2025)
elif percent in (9, 11):
return 'B' # Reduced rate (9% old, 11% new from Aug 2025)
elif percent == 5:
return 'C' # Reduced rate (unchanged)
elif percent == 0:
return 'D' # Exempt (unchanged)
else:
return 'A' # Default to standard rate
def _detect_tva_percent(self, text: str) -> Optional[int]:
"""Detect TVA percentage from text content."""
# Look for common Romanian TVA percentages
if '19%' in text or '19 %' in text:
return 19
elif '21%' in text or '21 %' in text:
return 21
elif '11%' in text or '11 %' in text:
return 11
elif '9%' in text or '9 %' in text:
return 9
elif '5%' in text or '5 %' in text:
return 5
return None
def _extract_items_count(self, text: str) -> Optional[int]:
"""Extract number of items/articles from receipt."""
for pattern, _ in self.ITEMS_COUNT_PATTERNS:
match = re.search(pattern, text, re.IGNORECASE)
if match:
try:
count = int(match.group(1))
if 0 < count < 1000: # Reasonable range
return count
except ValueError:
continue
return None
def _extract_address(self, text: str) -> Optional[str]:
"""Extract vendor address from text."""
lines = text.split('\n')
address_parts = []
for line in lines[:15]: # Check first 15 lines
line = line.strip()
if not line:
continue
# Check for address patterns
line_upper = line.upper()
# JUD. (county) pattern
if re.search(r'\bJUD\.?\s+', line_upper):
address_parts.append(line)
continue
# STR. (street) pattern
if re.search(r'\bSTR\.?\s+', line_upper):
address_parts.append(line)
continue
# MUN./OR./COM. (city/town) pattern
if re.search(r'\b(MUN|OR|COM)\.?\s+', line_upper):
address_parts.append(line)
continue
if address_parts:
# Join and clean address parts
address = ', '.join(address_parts)
# Clean up
address = re.sub(r'\s+', ' ', address).strip()
address = re.sub(r',\s*,', ',', address)
return address if len(address) >= 5 else None
return None