feat: Improve OCR adaptive pipeline with early exit and better pattern matching

- Add adaptive 3-step OCR pipeline with early exit when all 5 fields found
- Add pattern for "C. I. F." with spaces (OCR artifact from PaddleOCR)
- Add pattern for YYYY. MM. DD date format with spaces (OMV/Petrom receipts)
- Add pattern for "OTAL TAXE" with T cut off and reversed amount position
- Make TVA rate pattern more flexible (code letter optional, handle "-21%")
- Replace logger.info with print(flush=True) for better debugging visibility
- Improve OCRPreview.vue to show extraction progress and raw OCR text

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-13 01:54:52 +02:00
parent 6c3dd89f6d
commit 9f06482681
9 changed files with 952 additions and 116 deletions
--- a/data-entry-app/backend/app/schemas/ocr.py
+++ b/data-entry-app/backend/app/schemas/ocr.py
@@ -37,6 +37,8 @@ class ExtractionData(BaseModel):
    confidence_vendor: float = Field(default=0.0, ge=0, le=1, description="Vendor extraction confidence")
    overall_confidence: float = Field(default=0.0, ge=0, le=1, description="Overall confidence score")
    raw_text: str = Field(default="", description="Raw OCR text")
+    ocr_engine: str = Field(default="", description="OCR engine used: paddleocr or tesseract")
+    processing_time_ms: int = Field(default=0, ge=0, description="Processing time in milliseconds")

    class Config:
        """Pydantic config."""