feat: Improve OCR adaptive pipeline with early exit and better pattern matching

- Add adaptive 3-step OCR pipeline with early exit when all 5 fields found
- Add pattern for "C. I. F." with spaces (OCR artifact from PaddleOCR)
- Add pattern for YYYY. MM. DD date format with spaces (OMV/Petrom receipts)
- Add pattern for "OTAL TAXE" with T cut off and reversed amount position
- Make TVA rate pattern more flexible (code letter optional, handle "-21%")
- Replace logger.info with print(flush=True) for better debugging visibility
- Improve OCRPreview.vue to show extraction progress and raw OCR text

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-13 01:54:52 +02:00
parent 6c3dd89f6d
commit 9f06482681
9 changed files with 952 additions and 116 deletions

View File

@@ -1,10 +1,19 @@
"""FastAPI application entry point for Data Entry App."""
import sys
import logging
import threading
from pathlib import Path
from contextlib import asynccontextmanager
from fastapi import FastAPI
# Set up the root logger: records at INFO level and above are emitted,
# each prefixed with a time-of-day stamp, the logger name and the level.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%H:%M:%S',
    level=logging.INFO,
)
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
@@ -30,6 +39,18 @@ async def lifespan(app: FastAPI):
settings.upload_path_resolved
print(f"Upload path: {settings.upload_path_resolved}")
# Pre-initialize OCR engine in background (PaddleOCR takes 15-20s)
def init_ocr_background():
    """Warm up the OCR engine ahead of the first OCR request.

    Imports the shared ``ocr_service`` and triggers its engine's lazy
    initialisation. This is best-effort: any failure (including a failed
    import) is reported on stdout and swallowed, never raised to the caller.

    Both status messages are flushed immediately so they are visible even
    when stdout is block-buffered (for example when redirected to a file or
    pipe).
    """
    try:
        # Imported inside the ``try`` so that an import failure is reported
        # as a pre-load warning instead of propagating.
        from app.services.ocr_service import ocr_service

        ocr_service.ocr_engine._init_paddle_lazy()
        print("OCR engine ready", flush=True)
    except Exception as exc:
        # Deliberately broad: whatever goes wrong here is only reported.
        print(f"Warning: OCR engine pre-load failed: {exc}", flush=True)
print("Starting OCR engine pre-load (background)...")
threading.Thread(target=init_ocr_background, daemon=True).start()
yield
# Shutdown