feat(ocr): Add docTR OCR engine with metrics infrastructure

Add docTR as primary OCR engine with 2-tier sequential processing,
OCR metrics tracking, and simplified engine selection.

Features:
- docTR OCR engine with light+medium preprocessing tiers
- doctr_plus mode with early exit optimization (~65% fast path)
- OCR metrics dashboard with per-engine statistics
- User OCR preference persistence
- Parallel worker pool for OCR processing
- Cross-validation for extraction quality

Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-02 05:37:16 +02:00
parent 74f7aefc26
commit 495790411f
75 changed files with 23349 additions and 1311 deletions

View File

@@ -263,6 +263,12 @@ const formatDate = (dateStr) => {
const getEngineClass = (engine) => {
if (!engine) return ''
// docTR engines
if (engine === 'doctr-light') return 'doctr-fast'
if (engine === 'doctr-medium') return 'doctr'
if (engine === 'doctr-adaptive') return 'doctr-adaptive'
if (engine.includes('doctr')) return 'doctr'
// PaddleOCR engines
if (engine === 'paddle-light') return 'fast'
if (engine === 'paddle-adaptive') return 'adaptive'
if (engine === 'adaptive-full') return 'full'
@@ -273,13 +279,23 @@ const getEngineClass = (engine) => {
/**
 * Pick the icon CSS class shown for an OCR engine identifier.
 *
 * @param {string} engine - Engine identifier such as 'doctr-light' or 'paddle-light'.
 * @returns {string} 'pi pi-bolt' for 'paddle-light' and for any identifier
 *   containing 'doctr'; 'pi pi-cog' for every other value, including a
 *   missing/empty identifier.
 */
const getEngineIcon = (engine) => {
  // No engine recorded: fall back to the generic cog.
  if (!engine) return 'pi pi-cog'
  // Fast engines (PaddleOCR light mode and every docTR variant) get the bolt.
  // The earlier per-variant checks were duplicates of these two conditions.
  if (engine === 'paddle-light' || engine.includes('doctr')) return 'pi pi-bolt'
  // Everything else, 'adaptive-full' included, uses the cog.
  return 'pi pi-cog'
}
const getEngineLabel = (engine) => {
if (!engine) return ''
// docTR engines
if (engine === 'doctr-light') return 'docTR Fast'
if (engine === 'doctr-medium') return 'docTR Medium'
if (engine === 'doctr-adaptive') return 'docTR Adaptive'
if (engine.includes('doctr')) return 'docTR'
// PaddleOCR engines
if (engine === 'paddle-light') return 'Fast Mode (PaddleOCR)'
if (engine === 'paddle-adaptive') return 'Adaptive (Paddle dual)'
if (engine === 'adaptive-full') return 'Full Pipeline'
@@ -615,6 +631,22 @@ const formatProcessingTime = (ms) => {
color: #92400e;
}
/* docTR engine styles.
   The modifier names below match the strings getEngineClass() returns for
   docTR engines ('doctr', 'doctr-fast', 'doctr-adaptive').
   NOTE(review): presumably the template adds that return value next to
   .ocr-engine-badge — confirm in the markup. */
/* Generic docTR badge; getEngineClass() also maps 'doctr-medium' here. Violet tones. */
.ocr-engine-badge.doctr {
background: #ede9fe;
color: #5b21b6;
}
/* 'doctr-light' engine (class 'doctr-fast'). Green tones. */
.ocr-engine-badge.doctr-fast {
background: #d1fae5;
color: #047857;
}
/* 'doctr-adaptive' engine. Indigo tones. */
.ocr-engine-badge.doctr-adaptive {
background: #e0e7ff;
color: #3730a3;
}
.ocr-message-badge {
display: inline-flex;
align-items: center;

View File

@@ -60,7 +60,14 @@
optionValue="value"
placeholder="Motor OCR"
class="engine-selector dropdown-borderless"
/>
>
<template #option="{ option }">
<div class="engine-option">
<span class="engine-label">{{ option.label }}</span>
<span class="engine-desc">{{ option.desc }}</span>
</div>
</template>
</Dropdown>
<Button
label="Proceseaza OCR"
icon="pi pi-cog"
@@ -77,9 +84,10 @@
</template>
<script setup>
import { ref, computed } from 'vue'
import { ref, computed, onMounted, watch } from 'vue'
import Dropdown from 'primevue/dropdown'
import api from '@data-entry/services/api'
import { useOCRSettingsStore } from '@data-entry/stores/ocrSettingsStore'
const emit = defineEmits(['ocr-result', 'file-selected', 'error'])
@@ -89,20 +97,73 @@ const isDragging = ref(false)
const processing = ref(false)
const error = ref(null)
// OCR Engine selection
// OCR Settings Store - manages user preferences
const ocrStore = useOCRSettingsStore()
// OCR Engine selection (synced with store)
const selectedEngine = ref('auto')
const engineOptions = [
{ label: 'Auto (Recomandat)', value: 'auto' },
{ label: 'PaddleOCR', value: 'paddleocr' },
{ label: 'Tesseract', value: 'tesseract' }
]
/**
 * Display metadata for the OCR engine dropdown, keyed by engine identifier.
 * Each entry carries the short `label` and the secondary `desc` line that
 * the dropdown option template renders.
 */
const engineConfig = {
  auto: { label: 'Auto', desc: 'docTR→Paddle→Tess · General' },
  doctr: { label: 'docTR', desc: 'Rapid, bună acuratețe' },
  paddleocr: { label: 'PaddleOCR', desc: 'Cea mai bună calitate' },
  tesseract: { label: 'Tesseract', desc: 'Cel mai rapid, calitate redusă' },
  hybrid: { label: 'Hybrid', desc: 'docTR+Tess paralel · Recomandat' },
  // Quoted because the identifier contains a hyphen.
  'hybrid-quality': { label: 'Hybrid Calitate', desc: 'Paddle→docTR→Tess · Acuratețe max' },
}
// Dropdown options derived from the engines the store reports as available.
// Engines without an engineConfig entry fall back to their raw identifier as
// the label and an empty description.
const engineOptions = computed(() =>
  ocrStore.availableEngines.map((engineId) => {
    const meta = engineConfig[engineId]
    return {
      label: meta?.label || engineId,
      desc: meta?.desc || '',
      value: engineId
    }
  })
)
// On mount, fetch the stored preference through the store and mirror it into
// the local dropdown selection.
onMounted(async () => {
  await ocrStore.loadPreference()
  const storedEngine = ocrStore.preferredEngine
  selectedEngine.value = storedEngine
  console.log('[OCRUploadZone] Loaded user preference:', selectedEngine.value)
})
// Persist the engine choice whenever the selection changes.
// Nothing is saved when there was no previous value, when the value is
// unchanged, or while the store does not report itself as initialized.
// NOTE(review): the assignment made in onMounted may also satisfy these checks
// and re-save the value that was just loaded — confirm against the store.
watch(selectedEngine, async (newEngine, oldEngine) => {
  const engineChanged = oldEngine && newEngine !== oldEngine
  if (!engineChanged || !ocrStore.initialized) return

  try {
    await ocrStore.setPreference(newEngine)
    console.log('[OCRUploadZone] Saved user preference:', newEngine)
  } catch (saveError) {
    // Best effort: a failed save is only logged; the local selection is kept.
    console.error('[OCRUploadZone] Failed to save preference:', saveError)
  }
})
// Job queue state
const jobId = ref(null)
const queuePosition = ref(null)
const estimatedWait = ref(null)
const jobStatus = ref(null)
let pollInterval = null
// Dynamic processing messages
const processingMessage = computed(() => {
@@ -223,26 +284,36 @@ const processOCR = async () => {
}
const pollJobStatus = async (id) => {
const maxAttempts = 120 // 2 minutes max (120 * 1s)
let attempts = 0
const LONG_POLL_TIMEOUT = 30 // seconds
const MAX_TOTAL_TIME = 120 // 2 minutes max
const startTime = Date.now()
const poll = async () => {
try {
const response = await api.get(`/ocr/jobs/${id}`)
const job = response.data
// Check if exceeded max total time
const elapsed = (Date.now() - startTime) / 1000
if (elapsed >= MAX_TOTAL_TIME) {
processing.value = false
error.value = 'Timeout - procesarea a durat prea mult'
emit('error', error.value)
return
}
try {
// Long-poll with 30s server timeout, 35s axios timeout
const response = await api.get(`/ocr/jobs/${id}/wait`, {
params: { timeout: LONG_POLL_TIMEOUT },
timeout: (LONG_POLL_TIMEOUT + 5) * 1000
})
const job = response.data
jobStatus.value = job.status
queuePosition.value = job.queue_position
estimatedWait.value = job.estimated_wait_seconds
console.log('📊 OCR Poll:', { status: job.status, position: job.queue_position })
console.log('📊 OCR Long-Poll:', { status: job.status, position: job.queue_position })
if (job.status === 'completed') {
// Success! Emit result
clearInterval(pollInterval)
pollInterval = null
processing.value = false
if (job.result) {
console.log('✅ OCR Complete:', job.result)
emit('ocr-result', {
@@ -257,47 +328,36 @@ const pollJobStatus = async (id) => {
}
if (job.status === 'failed') {
// Failed
clearInterval(pollInterval)
pollInterval = null
processing.value = false
error.value = job.error || 'OCR processing failed'
emit('error', error.value)
return
}
// Still pending/processing - continue polling
attempts++
if (attempts >= maxAttempts) {
clearInterval(pollInterval)
pollInterval = null
processing.value = false
error.value = 'Timeout - procesarea a durat prea mult'
emit('error', error.value)
// Still pending/processing - long-poll again
if (processing.value) {
await poll()
}
} catch (err) {
console.error('🔴 Poll Error:', err.message)
attempts++
// Don't stop on poll errors - network might be flaky
if (attempts >= maxAttempts) {
clearInterval(pollInterval)
pollInterval = null
processing.value = false
error.value = 'Eroare la verificarea starii job-ului'
emit('error', error.value)
// Handle timeout (normal for long-poll)
if (err.code === 'ECONNABORTED' || err.message?.includes('timeout')) {
console.log('⏱️ Long-poll timeout, retrying...')
if (processing.value) {
await poll()
}
return
}
// Real error
console.error('🔴 Poll Error:', err.message)
processing.value = false
error.value = 'Eroare la verificarea starii job-ului'
emit('error', error.value)
}
}
// Initial poll immediately
await poll()
// Continue polling every 1 second if still processing
if (processing.value) {
pollInterval = setInterval(poll, 1000)
}
}
const formatFileSize = (bytes) => {
@@ -313,10 +373,7 @@ const reset = () => {
queuePosition.value = null
estimatedWait.value = null
jobStatus.value = null
if (pollInterval) {
clearInterval(pollInterval)
pollInterval = null
}
processing.value = false // Stop any ongoing long-poll
if (fileInput.value) {
fileInput.value.value = ''
}
@@ -415,7 +472,7 @@ defineExpose({ reset, processOCR })
/* Engine selector dropdown */
.engine-selector {
min-width: 150px;
min-width: 180px;
}
.engine-selector:deep(.p-dropdown-label) {
@@ -428,6 +485,25 @@ defineExpose({ reset, processOCR })
width: 2rem !important;
}
/* Engine dropdown option with description.
   Used by the #option slot template: .engine-label and .engine-desc are
   stacked vertically inside .engine-option. */
/* Container: column layout so the description sits under the label. */
.engine-option {
display: flex;
flex-direction: column;
gap: 2px;
padding: 4px 0;
}
/* Primary line: engine name, medium weight, dark text. */
.engine-label {
font-weight: 500;
font-size: 0.875rem;
color: #1e293b;
}
/* Secondary line: smaller, muted description text. */
.engine-desc {
font-size: 0.75rem;
color: #64748b;
}
/* Processing state */
.processing-state {
display: flex;