From 20448f7aa027f1bf2f207ea004d39074b166fe81 Mon Sep 17 00:00:00 2001 From: Marius Mutu Date: Fri, 12 Dec 2025 16:23:53 +0200 Subject: [PATCH] feat: Add multiple TVA entries support for Romanian receipts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add TvaEntry schema supporting multiple TVA rates (A, B, C, D codes) - Update OCR extractor to extract multiple TVA entries from receipts - Support both old (19%, 9%, 5%) and new Romanian rates (21%, 11% from Aug 2025) - Add tva_breakdown, tva_total, items_count, vendor_address to Receipt model - Update OCRPreview.vue to display TVA entries with rate badges - Add "Detalii Suplimentare" section in ReceiptCreateView with editable TVA table - Add TVA breakdown display in ReceiptDetailView - Create database migration for new TVA columns 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .../backend/app/db/models/receipt.py | 6 + data-entry-app/backend/app/routers/ocr.py | 22 +- data-entry-app/backend/app/schemas/ocr.py | 44 +- data-entry-app/backend/app/schemas/receipt.py | 21 + .../app/services/image_preprocessor.py | 69 ++- .../backend/app/services/ocr_engine.py | 12 +- .../backend/app/services/ocr_extractor.py | 568 +++++++++++++++++- ...212_140422_add_tva_breakdown_to_receipt.py | 37 ++ .../src/components/ocr/OCRPreview.vue | 75 +++ .../src/views/receipts/ReceiptCreateView.vue | 130 +++- .../src/views/receipts/ReceiptDetailView.vue | 105 ++++ 11 files changed, 1021 insertions(+), 68 deletions(-) create mode 100644 data-entry-app/backend/migrations/versions/20251212_140422_add_tva_breakdown_to_receipt.py diff --git a/data-entry-app/backend/app/db/models/receipt.py b/data-entry-app/backend/app/db/models/receipt.py index bc10e54..12f4d0b 100644 --- a/data-entry-app/backend/app/db/models/receipt.py +++ b/data-entry-app/backend/app/db/models/receipt.py @@ -51,6 +51,12 @@ class Receipt(SQLModel, table=True): amount: Decimal = 
Field(decimal_places=2, max_digits=15) description: Optional[str] = Field(default=None, max_length=500) + # TVA info (extracted from OCR) - stored as JSON for multiple entries + tva_breakdown: Optional[str] = Field(default=None, max_length=1000) # JSON: [{"code":"A","percent":19,"amount":"15.20"}] + tva_total: Optional[Decimal] = Field(default=None, decimal_places=2, max_digits=15) + items_count: Optional[int] = Field(default=None) + vendor_address: Optional[str] = Field(default=None, max_length=500) + # Expense type (for auto-generating accounting entries) expense_type_code: Optional[str] = Field(default=None, max_length=20) diff --git a/data-entry-app/backend/app/routers/ocr.py b/data-entry-app/backend/app/routers/ocr.py index f071421..65c6ad7 100644 --- a/data-entry-app/backend/app/routers/ocr.py +++ b/data-entry-app/backend/app/routers/ocr.py @@ -11,7 +11,7 @@ from app.db.database import get_session from app.db.crud.attachment import AttachmentCRUD from app.services.ocr_service import ocr_service from app.services.ocr_engine import OCREngine -from app.schemas.ocr import OCRResponse, OCRStatusResponse, ExtractionData +from app.schemas.ocr import OCRResponse, OCRStatusResponse, ExtractionData, TvaEntry router = APIRouter() @@ -78,6 +78,12 @@ async def extract_from_image(file: UploadFile = File(...)): raise HTTPException(status_code=422, detail=message) # Convert ExtractionResult to ExtractionData schema + # Convert tva_entries from dict to TvaEntry objects + tva_entries_schema = [ + TvaEntry(code=e.get('code'), percent=e['percent'], amount=e['amount']) + for e in result.tva_entries + ] if result.tva_entries else [] + data = ExtractionData( receipt_type=result.receipt_type, receipt_number=result.receipt_number, @@ -87,6 +93,10 @@ async def extract_from_image(file: UploadFile = File(...)): partner_name=result.partner_name, cui=result.cui, description=result.description, + tva_entries=tva_entries_schema, + tva_total=result.tva_total, + address=result.address, + 
items_count=result.items_count, confidence_amount=result.confidence_amount, confidence_date=result.confidence_date, confidence_vendor=result.confidence_vendor, @@ -137,6 +147,12 @@ async def extract_from_attachment( raise HTTPException(status_code=422, detail=message) # Convert ExtractionResult to ExtractionData schema + # Convert tva_entries from dict to TvaEntry objects + tva_entries_schema = [ + TvaEntry(code=e.get('code'), percent=e['percent'], amount=e['amount']) + for e in result.tva_entries + ] if result.tva_entries else [] + data = ExtractionData( receipt_type=result.receipt_type, receipt_number=result.receipt_number, @@ -146,6 +162,10 @@ async def extract_from_attachment( partner_name=result.partner_name, cui=result.cui, description=result.description, + tva_entries=tva_entries_schema, + tva_total=result.tva_total, + address=result.address, + items_count=result.items_count, confidence_amount=result.confidence_amount, confidence_date=result.confidence_date, confidence_vendor=result.confidence_vendor, diff --git a/data-entry-app/backend/app/schemas/ocr.py b/data-entry-app/backend/app/schemas/ocr.py index 6dd0e4b..0b79a4f 100644 --- a/data-entry-app/backend/app/schemas/ocr.py +++ b/data-entry-app/backend/app/schemas/ocr.py @@ -2,11 +2,18 @@ from datetime import date from decimal import Decimal -from typing import Optional +from typing import Optional, List from pydantic import BaseModel, Field +class TvaEntry(BaseModel): + """Single TVA entry with code, percentage and amount.""" + code: Optional[str] = Field(default=None, description="TVA code: A, B, C, D") + percent: int = Field(description="TVA percentage: 0, 5, 9, 19, 21") + amount: Decimal = Field(description="TVA amount for this rate") + + class ExtractionData(BaseModel): """Extracted receipt data from OCR.""" @@ -19,6 +26,12 @@ class ExtractionData(BaseModel): cui: Optional[str] = Field(default=None, description="CUI (fiscal identification code)") description: Optional[str] = Field(default=None, 
description="Optional description") + # Additional extracted fields - Multiple TVA entries support + tva_entries: List[TvaEntry] = Field(default=[], description="List of TVA entries by rate (A, B, C, D)") + tva_total: Optional[Decimal] = Field(default=None, description="Total TVA amount") + address: Optional[str] = Field(default=None, description="Vendor address") + items_count: Optional[int] = Field(default=None, description="Number of items/articles") + confidence_amount: float = Field(default=0.0, ge=0, le=1, description="Amount extraction confidence") confidence_date: float = Field(default=0.0, ge=0, le=1, description="Date extraction confidence") confidence_vendor: float = Field(default=0.0, ge=0, le=1, description="Vendor extraction confidence") @@ -30,18 +43,25 @@ class ExtractionData(BaseModel): json_schema_extra = { "example": { "receipt_type": "bon_fiscal", - "receipt_number": "12345", - "receipt_series": None, - "receipt_date": "2024-01-15", - "amount": 125.50, - "partner_name": "MEGA IMAGE SRL", - "cui": "12345678", + "receipt_number": "1360760", + "receipt_series": "0146", + "receipt_date": "2025-10-11", + "amount": 186.16, + "partner_name": "FIVE-HOLDING S.A.", + "cui": "10562600", "description": None, - "confidence_amount": 0.95, - "confidence_date": 0.90, - "confidence_vendor": 0.75, - "overall_confidence": 0.87, - "raw_text": "BON FISCAL\nMEGA IMAGE SRL\n..." + "tva_entries": [ + {"code": "A", "percent": 19, "amount": 25.00}, + {"code": "B", "percent": 9, "amount": 7.31} + ], + "tva_total": 32.31, + "address": "JUD. CONSTANTA, MUN. CONSTANTA, STR. ION ROATA NR. 3", + "items_count": 17, + "confidence_amount": 0.98, + "confidence_date": 0.98, + "confidence_vendor": 0.95, + "overall_confidence": 0.97, + "raw_text": "FIVE-HOLDING S.A.\nCIF: RO10562600\n..." 
} } diff --git a/data-entry-app/backend/app/schemas/receipt.py b/data-entry-app/backend/app/schemas/receipt.py index b9326e5..e45f0a6 100644 --- a/data-entry-app/backend/app/schemas/receipt.py +++ b/data-entry-app/backend/app/schemas/receipt.py @@ -64,6 +64,15 @@ class AttachmentResponse(BaseModel): uploaded_at: datetime +# ============ TVA Schema ============ + +class TvaEntrySchema(BaseModel): + """Single TVA entry with code, percentage and amount.""" + code: Optional[str] = Field(default=None, description="TVA code: A, B, C, D") + percent: int = Field(description="TVA percentage: 0, 5, 9, 19, 21") + amount: Decimal = Field(description="TVA amount for this rate") + + # ============ Receipt Schemas ============ class ReceiptBase(BaseModel): @@ -75,6 +84,12 @@ class ReceiptBase(BaseModel): receipt_date: date amount: Decimal = Field(gt=0) description: Optional[str] = Field(default=None, max_length=500) + # TVA info (multiple entries support) + tva_breakdown: Optional[List[TvaEntrySchema]] = Field(default=None, description="List of TVA entries") + tva_total: Optional[Decimal] = Field(default=None, description="Total TVA amount") + items_count: Optional[int] = Field(default=None, description="Number of items") + vendor_address: Optional[str] = Field(default=None, max_length=500, description="Vendor address") + # Other fields expense_type_code: Optional[str] = Field(default=None, max_length=20) company_id: int partner_id: Optional[int] = None @@ -98,6 +113,12 @@ class ReceiptUpdate(BaseModel): receipt_date: Optional[date] = None amount: Optional[Decimal] = Field(default=None, gt=0) description: Optional[str] = Field(default=None, max_length=500) + # TVA info (multiple entries support) + tva_breakdown: Optional[List[TvaEntrySchema]] = Field(default=None, description="List of TVA entries") + tva_total: Optional[Decimal] = Field(default=None, description="Total TVA amount") + items_count: Optional[int] = Field(default=None, description="Number of items") + vendor_address: 
Optional[str] = Field(default=None, max_length=500, description="Vendor address") + # Other fields expense_type_code: Optional[str] = Field(default=None, max_length=20) partner_id: Optional[int] = None partner_name: Optional[str] = Field(default=None, max_length=200) diff --git a/data-entry-app/backend/app/services/image_preprocessor.py b/data-entry-app/backend/app/services/image_preprocessor.py index fd38368..3ee28f9 100644 --- a/data-entry-app/backend/app/services/image_preprocessor.py +++ b/data-entry-app/backend/app/services/image_preprocessor.py @@ -23,24 +23,37 @@ class ImagePreprocessor: raise ValueError(f"Could not load image: {path}") return image - def pdf_to_images(self, path: Path, dpi: int = 300) -> List[np.ndarray]: - """Convert PDF to images.""" + def pdf_to_images(self, path: Path, dpi: int = 400) -> List[np.ndarray]: + """ + Convert PDF to images with high DPI for better OCR. + + Args: + path: Path to PDF file + dpi: Resolution (400 recommended for receipts, higher = better quality but slower) + """ if not PDF_AVAILABLE: raise RuntimeError("pdf2image not available. Install with: pip install pdf2image") + # Use 400 DPI for better text recognition on thermal receipts images = pdf2image.convert_from_path(str(path), dpi=dpi) return [np.array(img) for img in images] - def preprocess(self, image: np.ndarray) -> np.ndarray: + def preprocess(self, image: np.ndarray, high_quality: bool = True) -> np.ndarray: """ Apply preprocessing pipeline for thermal receipt images. Pipeline: 1. Convert to grayscale - 2. Resize if too small (min 1000px width) + 2. Resize if too small (min 1500px width for high quality) 3. Deskew (straighten rotated text) - 4. Denoise (Non-local means) - 5. Adaptive thresholding (binarization) - 6. Morphological close (connect broken chars) + 4. Contrast enhancement (CLAHE) + 5. Denoise (Non-local means) + 6. Sharpening (for clearer text edges) + 7. Adaptive thresholding (binarization) + 8. 
Morphological operations (connect broken chars) + + Args: + image: Input image (BGR or grayscale) + high_quality: If True, apply more aggressive preprocessing """ # 1. Grayscale if len(image.shape) == 3: @@ -48,10 +61,11 @@ class ImagePreprocessor: else: gray = image.copy() - # 2. Resize if too small + # 2. Resize if too small (larger = better OCR) height, width = gray.shape - if width < 1000: - scale = 1000 / width + min_width = 1500 if high_quality else 1000 + if width < min_width: + scale = min_width / width gray = cv2.resize( gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC @@ -60,24 +74,43 @@ class ImagePreprocessor: # 3. Deskew gray = self._deskew(gray) - # 4. Denoise + # 4. Contrast enhancement with CLAHE (Contrast Limited Adaptive Histogram Equalization) + clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)) + enhanced = clahe.apply(gray) + + # 5. Denoise (slightly less aggressive to preserve text details) denoised = cv2.fastNlMeansDenoising( - gray, h=10, + enhanced, h=8, # Lower h = preserve more details templateWindowSize=7, searchWindowSize=21 ) - # 5. Adaptive thresholding + # 6. Sharpening to enhance text edges + if high_quality: + # Unsharp mask for better text clarity + gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0) + sharpened = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0) + else: + sharpened = denoised + + # 7. Adaptive thresholding with optimized parameters binary = cv2.adaptiveThreshold( - denoised, 255, + sharpened, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, - blockSize=15, C=8 + blockSize=11, # Smaller block = better for small text + C=5 # Lower C = darker result, better for faded receipts ) - # 6. Morphological close - kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2)) - result = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel) + # 8. 
Morphological operations + # Close small gaps in characters + kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2)) + result = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel_close) + + # Optional: Remove small noise spots + if high_quality: + kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1)) + result = cv2.morphologyEx(result, cv2.MORPH_OPEN, kernel_open) return result diff --git a/data-entry-app/backend/app/services/ocr_engine.py b/data-entry-app/backend/app/services/ocr_engine.py index 7d46027..f086729 100644 --- a/data-entry-app/backend/app/services/ocr_engine.py +++ b/data-entry-app/backend/app/services/ocr_engine.py @@ -64,11 +64,17 @@ class OCREngine: PaddleOCR = _PaddleOCR print("Initializing PaddleOCR engine...") - # PaddleOCR 3.x API - simplified parameters + # PaddleOCR 3.x API - optimized for Romanian receipts self._paddle = PaddleOCR( - lang='en', # Better for mixed text with numbers + lang='en', # 'en' works better than 'ro' for mixed alphanumeric + # High quality settings for better accuracy + det_db_thresh=0.3, # Lower threshold = detect more text (default 0.3) + det_db_box_thresh=0.5, # Box confidence threshold (default 0.5) + det_db_unclip_ratio=1.8, # Expand detected boxes slightly (default 1.5) + rec_batch_num=6, # Batch size for recognition + use_angle_cls=True, # Enable text angle classification ) - print("PaddleOCR initialized successfully") + print("PaddleOCR initialized successfully with high-quality settings") except Exception as e: print(f"Warning: Failed to initialize PaddleOCR: {e}") self._paddle = None diff --git a/data-entry-app/backend/app/services/ocr_extractor.py b/data-entry-app/backend/app/services/ocr_extractor.py index e815079..8b60a7d 100644 --- a/data-entry-app/backend/app/services/ocr_extractor.py +++ b/data-entry-app/backend/app/services/ocr_extractor.py @@ -3,7 +3,7 @@ import re from datetime import date, datetime from decimal import Decimal, InvalidOperation -from typing import Optional, Tuple 
+from typing import Optional, Tuple, List from dataclasses import dataclass, field @@ -18,6 +18,11 @@ class ExtractionResult: partner_name: Optional[str] = None cui: Optional[str] = None description: Optional[str] = None + # Additional extracted fields - Multiple TVA entries support + tva_entries: List[dict] = field(default_factory=list) # [{code, percent, amount}] + tva_total: Optional[Decimal] = None + address: Optional[str] = None + items_count: Optional[int] = None confidence_amount: float = 0.0 confidence_date: float = 0.0 @@ -40,44 +45,158 @@ class ReceiptExtractor: """Extract receipt fields using pattern matching for Romanian receipts.""" # Total amount patterns (most specific first) + # Romanian receipts use various formats: TOTAL LEI, TOTAL:, TOTAL RON, etc. + # OCR often produces errors, so patterns must be tolerant TOTAL_PATTERNS = [ + # Most common: TOTAL LEI followed by amount + (r'TOTAL\s+LEI\s*([\d\s.,]+)', 0.98), + (r'[OT]?OTAL\s+LEI\s*([\d\s.,]+)', 0.95), # OCR may miss first letter + # Standard patterns (r'TOTAL\s*:?\s*([\d\s.,]+)\s*(?:RON|LEI)?', 0.95), (r'TOTAL\s+(?:RON|LEI)\s*([\d\s.,]+)', 0.95), + # SUBTOTAL when TOTAL not found + (r'SUBTOTAL\s*([\d\s.,]+)', 0.90), + (r'[SB]?UBTOTAL\s*([\d\s.,]+)', 0.88), # OCR variations + # Payment methods (r'DE\s+PLATA\s*:?\s*([\d\s.,]+)', 0.90), (r'SUMA\s*:?\s*([\d\s.,]+)', 0.85), (r'PLATA\s+CARD\s*:?\s*([\d\s.,]+)', 0.85), (r'NUMERAR\s*:?\s*([\d\s.,]+)', 0.80), + (r'REST\s*:?\s*([\d\s.,]+)', 0.70), # Sometimes total is near REST ] - # Date patterns + # Fallback: Find the largest repeated amount (likely the total) + # This handles cases where OCR doesn't capture "TOTAL" keyword + + # Date patterns - support dash, dot, and slash separators + # OCR may produce DRTA instead of DATA, DAIA, etc. 
DATE_PATTERNS = [ - (r'DATA\s*:?\s*(\d{2}[./]\d{2}[./]\d{4})', 0.95), - (r'(\d{2}[./]\d{2}[./]\d{4})\s+\d{2}:\d{2}', 0.90), - (r'(\d{2}[./]\d{2}[./]\d{4})', 0.80), - (r'(\d{4}[./]\d{2}[./]\d{2})', 0.75), # YYYY.MM.DD format + # DATA/DRTA/DAIA: DD-MM-YYYY (OCR tolerant) + (r'D[AR]TA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98), + (r'DATA\s*:?\s*(\d{2}[-./]\d{2}[-./]\d{4})', 0.98), + # Date followed by ORA (time) - OCR may produce 0RA + (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+[O0]RA\s*:?\s*\d{2}:\d{2}', 0.95), + # Date followed by time without ORA keyword + (r'(\d{2}[-./]\d{2}[-./]\d{4})\s+\d{2}:\d{2}', 0.90), + # Standalone date + (r'(\d{2}[-./]\d{2}[-./]\d{4})', 0.80), + # YYYY-MM-DD format (less common) + (r'(\d{4}[-./]\d{2}[-./]\d{2})', 0.75), ] - # Receipt number patterns + # Receipt number patterns - Romanian fiscal receipt formats + # OCR may produce N instead of : or other errors NUMBER_PATTERNS = [ + # NDS format (common in Romanian POS) + (r'NDS\s*:?\s*(\d+)', 0.98), + # C3POS terminal format - OCR may have N instead of : (C3POS-CT2N1360760) + (r'C3POS[-A-Z0-9]*[N:](\d{6,7})', 0.98), # CT2N1360760 format + (r'C3POS.*?(\d{6,7})\b', 0.95), # Any C3POS followed by 6-7 digit number + (r'CT2[N:]\s*(\d{6,})', 0.95), # CT2N prefix + # BF (Bon Fiscal) number + (r'BF\s*:?\s*(\d+)', 0.93), + # NIVS format + (r'NIVS\s*:?\s*(\d+)', 0.95), + # Standard NR BON formats (r'NR\.?\s*BON\s*:?\s*(\d+)', 0.95), (r'BON\s+(?:FISCAL\s+)?NR\.?\s*:?\s*(\d+)', 0.95), (r'CHITANTA\s+NR\.?\s*:?\s*(\d+)', 0.95), + # Document number (r'NR\.?\s+DOCUMENT\s*:?\s*(\d+)', 0.90), - (r'NR\.?\s*:?\s*(\d{4,})', 0.70), + # ID BF format + (r'ID\s*BF\s*:?\s*(\d+)', 0.90), + # TD format (transaction ID) + (r'TD\s*:?\s*(\d+)', 0.85), + # 6-8 digit number (typical receipt number length) + (r'\b(\d{6,8})\b', 0.70), + # Generic long number at end (fallback) + (r'NR\.?\s*:?\s*(\d{4,})', 0.65), ] - # CUI (fiscal code) patterns + # CUI (fiscal code) patterns - IMPORTANT: exclude CLIENT CUI + # CIF = Cod de 
Identificare Fiscală (vendor's tax ID) + # CLIENT C.U.I. = client's tax ID (should be ignored) + # OCR errors: R0 instead of RO, C1F instead of CIF CUI_PATTERNS = [ - (r'C\.?U\.?I\.?\s*:?\s*(?:RO)?(\d{6,10})', 0.95), - (r'C\.?I\.?F\.?\s*:?\s*(?:RO)?(\d{6,10})', 0.95), - (r'COD\s+FISCAL\s*:?\s*(?:RO)?(\d{6,10})', 0.90), - (r'(?:RO)?(\d{6,10})\s*-?\s*(?:J|CUI)', 0.80), + # CIF at start of line (definitely vendor) - tolerant to OCR errors + (r'^CIF\s*:?\s*(?:R[O0])?(\d{6,10})', 0.98), + (r'^C[I1]F\s*:?\s*(?:R[O0])?(\d{6,10})', 0.95), # C1F OCR error + # CIF not preceded by CLIENT (negative lookbehind) + (r'(? ExtractionResult: @@ -86,13 +205,18 @@ class ReceiptExtractor: result.raw_text = text text_upper = text.upper() - # Extract fields + # Extract core fields result.amount, result.confidence_amount = self._extract_amount(text_upper) result.receipt_date, result.confidence_date = self._extract_date(text_upper) result.receipt_number, _ = self._extract_number(text_upper) result.receipt_series, _ = self._extract_series(text_upper) result.partner_name, result.confidence_vendor = self._extract_vendor(text) - result.cui, _ = self._extract_cui(text_upper) + result.cui, _ = self._extract_cui(text_upper, text) + + # Extract additional fields - Multiple TVA entries + result.tva_entries, result.tva_total = self._extract_tva_entries(text_upper) + result.items_count = self._extract_items_count(text_upper) + result.address = self._extract_address(text_upper) # Detect receipt type result.receipt_type = self._detect_receipt_type(text_upper) @@ -101,18 +225,85 @@ class ReceiptExtractor: def _extract_amount(self, text: str) -> Tuple[Optional[Decimal], float]: """Extract total amount from text.""" + # First try standard patterns (TOTAL, SUBTOTAL, etc.) 
for pattern, confidence in self.TOTAL_PATTERNS: match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE) if match: try: amount_str = re.sub(r'[^\d.,]', '', match.group(1)) - # Handle Romanian number format (1.234,56) amount_str = self._normalize_number(amount_str) amount = Decimal(amount_str) if amount > 0: return amount, confidence except (InvalidOperation, ValueError): continue + + # Strategy 2: Find amounts AFTER product lines end + # Products have pattern: "X BUC/ROLA X price = price" + # Total appears after all products + product_pattern = r'\d\s+(?:BUC|ROLA|ROLN|ROL)\s+X' + product_matches = list(re.finditer(product_pattern, text, re.IGNORECASE)) + if product_matches: + # Get text after the last product line + last_product_pos = product_matches[-1].end() + after_products = text[last_product_pos:] + + # Find standalone amounts on their own line after products + line_amount_pattern = r'^[\s]*(\d{2,4}[.,]\s*\d{2})[\s]*$' + standalone_amounts = [] + for match in re.finditer(line_amount_pattern, after_products, re.MULTILINE): + try: + amount_str = match.group(1).replace(' ', '') + amount_str = self._normalize_number(amount_str) + amount = Decimal(amount_str) + if amount > 10: # Filter out small values + standalone_amounts.append(amount) + except (InvalidOperation, ValueError): + continue + + if standalone_amounts: + # The largest standalone amount after products is likely the total + max_amount = max(standalone_amounts) + # Higher confidence if it appears multiple times + count = standalone_amounts.count(max_amount) + confidence = 0.85 if count >= 2 else 0.75 + return max_amount, confidence + + # Strategy 3: Find the most repeated large amount + # Normalize spaces in numbers (OCR may produce "186. 
16") + normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text) + amount_pattern = r'(\d{2,4}[.,]\d{2})\b' + amounts = re.findall(amount_pattern, normalized_text) + if amounts: + from collections import Counter + amount_counts = Counter(amounts) + # Filter amounts that appear 2+ times and are > 20 + candidates = [] + for amt_str, count in amount_counts.items(): + try: + amt = Decimal(self._normalize_number(amt_str)) + if count >= 2 and amt > 20: + candidates.append((amt, count)) + except (InvalidOperation, ValueError): + continue + + if candidates: + # Return the LARGEST amount that appears multiple times + candidates.sort(key=lambda x: x[0], reverse=True) + return candidates[0][0], 0.65 + + # Last resort: Find any standalone large amount + line_amount_pattern = r'^[\s]*(\d{2,4}[.,]\s*\d{2})[\s]*$' + for match in re.finditer(line_amount_pattern, text, re.MULTILINE): + try: + amount_str = match.group(1).replace(' ', '') + amount_str = self._normalize_number(amount_str) + amount = Decimal(amount_str) + if amount > 50: # Higher threshold for fallback + return amount, 0.50 + except (InvalidOperation, ValueError): + continue + return None, 0.0 def _normalize_number(self, num_str: str) -> str: @@ -147,7 +338,8 @@ class ReceiptExtractor: match = re.search(pattern, text) if match: try: - date_str = match.group(1).replace('/', '.') + # Normalize separators to dots + date_str = match.group(1).replace('/', '.').replace('-', '.') # Try DD.MM.YYYY format first try: @@ -181,23 +373,68 @@ class ReceiptExtractor: return None, 0.0 def _extract_vendor(self, text: str) -> Tuple[Optional[str], float]: - """Extract vendor/partner name from text.""" + """ + Extract vendor/partner name from text. + Uses multiple strategies: + 1. Look for lines with company type indicators (S.R.L., S.A., etc.) + 2. Look for lines near CIF + 3. 
Use first valid line as fallback + """ lines = text.split('\n') skip_keywords = [ 'BON', 'FISCAL', 'TOTAL', 'DATA', 'NR', 'ORA', 'SUBTOTAL', 'TVA', 'PLATA', 'CARD', 'NUMERAR', - 'RON', 'LEI', 'CHITANTA', 'REST' + 'RON', 'LEI', 'CHITANTA', 'REST', 'CLIENT', + 'OPERATOR', 'CASIER', 'POS', 'AMEF', 'BINE ATI VENIT', + 'VA RUGAM', 'PASTRATI', 'VOCEA', 'TIPARIT', + 'DETERGENT', 'PROSOP', 'HARTIE', 'SACI', 'SPRAY', + 'BUC', 'ROLA', 'CUMPARATOR' ] - for i, line in enumerate(lines[:7]): # Check first 7 lines + # Strategy 1: Look for lines with vendor indicators (S.R.L., S.A., HOLDING, etc.) + for i, line in enumerate(lines[:15]): # Check first 15 lines + line = line.strip() + if not line or len(line) < 3: + continue + + line_upper = line.upper() + + # Check for vendor indicators + for indicator in self.VENDOR_INDICATORS: + if re.search(indicator, line_upper): + # Found a company name indicator + vendor = self._clean_vendor_name(line) + if vendor and len(vendor) >= 3: + # High confidence for lines with company indicators + return vendor, 0.95 + + # Strategy 2: Look for lines right before or after CIF + for i, line in enumerate(lines[:15]): + line_upper = line.upper() + if 'CIF' in line_upper and 'CLIENT' not in line_upper: + # Check line before + if i > 0: + prev_line = lines[i-1].strip() + if prev_line and len(prev_line) >= 3: + if not any(kw in prev_line.upper() for kw in skip_keywords): + vendor = self._clean_vendor_name(prev_line) + if vendor: + return vendor, 0.85 + + # Strategy 3: First valid line as fallback + for i, line in enumerate(lines[:10]): line = line.strip() # Skip empty lines - if not line: + if not line or len(line) < 3: continue - # Skip lines that are just numbers - if re.match(r'^[\d.,\s]+$', line): + # Skip lines that are just numbers or codes + if re.match(r'^[\d.,\s:]+$', line): + continue + + # Skip lines with barcodes/product codes + if re.match(r'^[A-Z]*\d{6,}', line): continue # Skip lines with keywords @@ -205,23 +442,68 @@ class 
ReceiptExtractor: continue # Clean the line - vendor = re.sub(r'[^\w\s.,&-]', '', line).strip() + vendor = self._clean_vendor_name(line) - if len(vendor) >= 3: + if vendor and len(vendor) >= 3: # Confidence decreases for lines further down - confidence = max(0.3, 0.8 - (i * 0.1)) + confidence = max(0.3, 0.7 - (i * 0.05)) return vendor, confidence return None, 0.0 - def _extract_cui(self, text: str) -> Tuple[Optional[str], float]: - """Extract CUI (fiscal identification code) from text.""" + def _clean_vendor_name(self, name: str) -> Optional[str]: + """Clean and normalize vendor name.""" + if not name: + return None + + # Remove common OCR artifacts + name = re.sub(r'[^\w\s.,&\-()]', ' ', name) + # Normalize whitespace + name = re.sub(r'\s+', ' ', name).strip() + + # Skip if it looks like an address line only + if re.match(r'^(STR|JUD|MUN|NR|BL|SC|ET|AP)\.?\s', name.upper()): + return None + + # Skip if too short after cleaning + if len(name) < 3: + return None + + return name + + def _extract_cui(self, text_upper: str, original_text: str) -> Tuple[Optional[str], float]: + """ + Extract vendor CUI (fiscal identification code) from text. + Excludes CLIENT CUI which appears as 'CLIENT C.U.I./C.I.F.:...' 
+ """ + # First, try to find CIF on a line that doesn't contain CLIENT + lines = text_upper.split('\n') + for line in lines: + # Skip lines that contain CLIENT (these are buyer's CUI, not vendor's) + if 'CLIENT' in line or 'CUMPARATOR' in line or 'LIENT' in line: + continue + + # Look for CIF in this line + for pattern, confidence in self.CUI_PATTERNS: + match = re.search(pattern, line, re.IGNORECASE | re.MULTILINE) + if match: + cui = match.group(1) + if 6 <= len(cui) <= 10: + return cui, confidence + + # Fallback: search entire text but exclude CLIENT patterns for pattern, confidence in self.CUI_PATTERNS: - match = re.search(pattern, text, re.IGNORECASE) - if match: + # Find all matches + for match in re.finditer(pattern, text_upper, re.IGNORECASE | re.MULTILINE): cui = match.group(1) if 6 <= len(cui) <= 10: - return cui, confidence + # Check if this match is preceded by CLIENT in the same line + start = match.start() + line_start = text_upper.rfind('\n', 0, start) + 1 + line_text = text_upper[line_start:start] + if 'CLIENT' not in line_text and 'LIENT' not in line_text: + return cui, confidence + return None, 0.0 def _detect_receipt_type(self, text: str) -> str: @@ -229,3 +511,223 @@ class ReceiptExtractor: if 'CHITANTA' in text or 'CHITANȚĂ' in text: return 'chitanta' return 'bon_fiscal' + + def _extract_tva_entries(self, text: str) -> Tuple[List[dict], Optional[Decimal]]: + """ + Extract multiple TVA (VAT) entries from text. + Romanian receipts can have multiple TVA rates (A=19%, B=9%, C=5%, D=0%). + + Returns (tva_entries, tva_total) where tva_entries is a list of: + {'code': 'A', 'percent': 19, 'amount': Decimal('15.20')} + """ + tva_entries = [] + seen_entries = set() # To avoid duplicates + + # Normalize spaces in numbers first (OCR may produce "32. 31") + normalized_text = re.sub(r'(\d+)[.,]\s+(\d{2})', r'\1.\2', text) + + # Pattern 1: "TVA A - 19%: 15.20" or "TVAA - 21% 32.31" (with code) + # OCR tolerant: TUA, TVR, etc. 
+ pattern_with_code = r'T[VU][AR]\s*([A-D])\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)' + for match in re.finditer(pattern_with_code, normalized_text, re.IGNORECASE): + try: + code = match.group(1).upper() + percent = int(match.group(2)) + amount_str = match.group(3).replace(' ', '') + amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str)) + amount = Decimal(amount_str) + if amount > 0: + entry_key = (code, percent) + if entry_key not in seen_entries: + tva_entries.append({ + 'code': code, + 'percent': percent, + 'amount': amount + }) + seen_entries.add(entry_key) + except (ValueError, InvalidOperation): + continue + + # Pattern 2: "TVA - 21%: 32.31" (without explicit code, assume 'A') + if not tva_entries: + pattern_no_code = r'T[VU][AR]\s*[-:]\s*(\d{1,2})\s*%\s*:?\s*([\d\s.,]+)' + for match in re.finditer(pattern_no_code, normalized_text, re.IGNORECASE): + try: + percent = int(match.group(1)) + amount_str = match.group(2).replace(' ', '') + amount_str = self._normalize_number(re.sub(r'[^\d.,]', '', amount_str)) + amount = Decimal(amount_str) + if amount > 0: + # Determine code based on percent + code = self._get_tva_code_from_percent(percent) + entry_key = (code, percent) + if entry_key not in seen_entries: + tva_entries.append({ + 'code': code, + 'percent': percent, + 'amount': amount + }) + seen_entries.add(entry_key) + except (ValueError, InvalidOperation): + continue + + # Pattern 3: "TVAA - 21%" on one line, amount on next line + if not tva_entries: + tva_line_pattern = r'T[VU][AR]\s*([A-D])?\s*[-:]\s*(\d{1,2})\s*%' + for match in re.finditer(tva_line_pattern, normalized_text, re.IGNORECASE): + try: + code = (match.group(1) or 'A').upper() + percent = int(match.group(2)) + + # Look for amount on the next line or immediately after + after_tva = normalized_text[match.end():] + amount_match = re.search(r'^[\s\n]*([\d.,]+)', after_tva) + if amount_match: + amount_str = self._normalize_number(amount_match.group(1)) + amount = Decimal(amount_str) + 
def _get_tva_code_from_percent(self, percent: int) -> str:
    """Map a TVA percentage to the letter code printed on Romanian receipts.

    Romanian TVA rates changed in August 2025 (standard 19% -> 21%,
    reduced 9% -> 11%; 5% and 0% unchanged). Both generations are accepted
    so that historical receipts still classify correctly:

    - ``A``: 19 or 21 (standard rate)
    - ``B``: 9 or 11 (reduced rate)
    - ``C``: 5 (reduced rate)
    - ``D``: 0 (exempt)

    Args:
        percent: TVA rate as a whole-number percentage.

    Returns:
        The letter code; any unrecognised rate falls back to ``'A'``.
    """
    code_by_rate = {19: 'A', 21: 'A', 9: 'B', 11: 'B', 5: 'C', 0: 'D'}
    return code_by_rate.get(percent, 'A')


def _detect_tva_percent(self, text: str) -> Optional[int]:
    """Detect a known TVA rate mentioned anywhere in ``text``.

    Rates are tried in a fixed priority order (19, 21, 11, 9, 5) and the
    first one present wins, regardless of its position in the text. At most
    one space is tolerated between the number and the ``%`` sign.

    A rate only counts when it is a standalone number: it must not be
    preceded by another digit or by a decimal separator. Without that guard
    ``15%`` or ``0,5%`` would be reported as 5 and ``29%`` as 9.

    Args:
        text: Receipt text to scan.

    Returns:
        The detected rate, or ``None`` when no known rate is present.
    """
    for rate in (19, 21, 11, 9, 5):
        # Lookbehind rejects matches that are the tail of a longer number.
        if re.search(rf'(?<![\d.,]){rate} ?%', text):
            return rate
    return None


def _extract_items_count(self, text: str) -> Optional[int]:
    """Extract the number of items/articles from receipt text.

    Each ``(pattern, _)`` pair in ``self.ITEMS_COUNT_PATTERNS`` is tried in
    order (case-insensitively); only the pattern is used here. The first
    pattern whose group 1 parses to an integer strictly between 0 and 1000
    supplies the result.

    Args:
        text: Receipt text to scan.

    Returns:
        The item count, or ``None`` when no pattern yields a plausible one.
    """
    for pattern, _ in self.ITEMS_COUNT_PATTERNS:
        match = re.search(pattern, text, re.IGNORECASE)
        if not match:
            continue
        try:
            count = int(match.group(1))
        except (ValueError, TypeError, IndexError):
            # ValueError: captured text is not an integer.
            # TypeError: group 1 is optional and did not participate.
            # IndexError: the pattern defines no capture group.
            # In every case, fall through to the next pattern.
            continue
        if 0 < count < 1000:  # Reasonable range
            return count
    return None


def _extract_address(self, text: str) -> Optional[str]:
    """Extract the vendor address from the top of the receipt text.

    Only the first 15 lines are inspected. A non-empty line is kept when,
    upper-cased, it contains one of the abbreviations JUD (county), STR
    (street), MUN/OR/COM (city/town) at a word boundary, optionally followed
    by a dot, then whitespace. Kept lines are joined with ``', '``.

    Cleanup collapses whitespace runs to a single space, collapses any run
    of commas to one comma, and strips leading/trailing commas and spaces
    (lines that already end in a comma would otherwise leave ``,,`` or a
    dangling comma in the result).

    Args:
        text: Receipt text to scan.

    Returns:
        The cleaned address, or ``None`` when no line qualifies or the
        result is shorter than 5 characters.
    """
    marker = re.compile(r'\b(?:JUD|STR|MUN|OR|COM)\.?\s+')

    address_parts = []
    for raw_line in text.split('\n')[:15]:  # Check first 15 lines
        line = raw_line.strip()
        if line and marker.search(line.upper()):
            address_parts.append(line)

    if not address_parts:
        return None

    address = ', '.join(address_parts)
    address = re.sub(r'\s+', ' ', address)
    address = re.sub(r',(?:\s*,)+', ',', address)
    address = address.strip(' ,')
    return address if len(address) >= 5 else None
revision: str = '1cfb423c6953'
down_revision: Union[str, None] = '001_initial'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Add the nullable TVA-related columns to the ``receipts`` table."""
    new_columns = (
        sa.Column('tva_breakdown', sqlmodel.sql.sqltypes.AutoString(length=1000), nullable=True),
        sa.Column('tva_total', sa.Numeric(precision=15, scale=2), nullable=True),
        sa.Column('items_count', sa.Integer(), nullable=True),
        sa.Column('vendor_address', sqlmodel.sql.sqltypes.AutoString(length=500), nullable=True),
    )
    with op.batch_alter_table('receipts', schema=None) as batch_op:
        for column in new_columns:
            batch_op.add_column(column)


def downgrade() -> None:
    """Drop the TVA-related columns from ``receipts``, in reverse order of creation."""
    dropped_names = ('vendor_address', 'items_count', 'tva_total', 'tva_breakdown')
    with op.batch_alter_table('receipts', schema=None) as batch_op:
        for column_name in dropped_names:
            batch_op.drop_column(column_name)
+ +
+
+ {{ entry.code }} + {{ entry.percent }}% + {{ formatAmount(entry.amount) }} RON +
+
+ Total TVA: {{ formatAmount(data.tva_total) }} RON +
+
+
+ + +
+ +
+ {{ data.items_count }} articole +
+
+ + +
+ +
+ {{ data.address }} +
+
@@ -224,6 +255,50 @@ const formatDate = (dateStr) => { color: #475569; } +.tva-breakdown { + display: flex; + flex-direction: column; + gap: 0.25rem; +} + +.tva-entry { + display: flex; + align-items: center; + gap: 0.5rem; +} + +.tva-code { + font-weight: 600; + color: #475569; + min-width: 1rem; +} + +.tva-percent-badge { + display: inline-block; + padding: 0.15rem 0.5rem; + background: #dbeafe; + border-radius: 4px; + font-size: 0.8rem; + color: #1e40af; + min-width: 2.5rem; + text-align: center; +} + +.tva-amount { + font-weight: 500; +} + +.tva-total { + margin-top: 0.25rem; + padding-top: 0.25rem; + border-top: 1px dashed #cbd5e1; +} + +.address-text { + font-size: 0.9rem; + color: #475569; +} + .raw-text-section { margin-top: 1rem; padding-top: 1rem; diff --git a/data-entry-app/frontend/src/views/receipts/ReceiptCreateView.vue b/data-entry-app/frontend/src/views/receipts/ReceiptCreateView.vue index f64e554..b14cfc6 100644 --- a/data-entry-app/frontend/src/views/receipts/ReceiptCreateView.vue +++ b/data-entry-app/frontend/src/views/receipts/ReceiptCreateView.vue @@ -246,11 +246,59 @@