feat(ocr): Implement persistent worker pool with SQLite job queue

Major OCR infrastructure improvements:
- Add persistent SQLite-based job queue for OCR tasks
- Implement worker pool with process isolation and auto-restart
- Add OCR engine selector dropdown (Tesseract/PaddleOCR) in upload zone
- Optimize Tesseract preprocessing based on benchmark results (8x faster)
- Add recognize_cif_optimized() with multi-strategy CIF extraction
- Add Romanian CIF checksum validation
- Increase Telegram long polling timeout from 10s to 30s

Squashed commits:
- feat(ocr): Implement persistent worker pool with SQLite job queue
- feat(ocr): Add OCR engine selector dropdown to upload zone
- perf(telegram): Increase long polling timeout from 10s to 30s
- perf(ocr): Optimize Tesseract preprocessing based on benchmark results

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-31 12:32:12 +02:00
parent 00f826f7ed
commit 74f7aefc26
23 changed files with 3616 additions and 209 deletions

View File

@@ -21,8 +21,8 @@
style="width: 50px; height: 50px"
strokeWidth="4"
/>
<p class="processing-text">Se proceseaza imaginea...</p>
<p class="processing-subtext">Acest proces poate dura cateva secunde</p>
<p class="processing-text">{{ processingMessage }}</p>
<p class="processing-subtext">{{ processingSubtext }}</p>
</div>
<div v-else-if="selectedFile" class="file-selected-state">
@@ -52,6 +52,15 @@
size="small"
@click="triggerFileInput"
/>
<!-- OCR Engine Selector -->
<Dropdown
v-model="selectedEngine"
:options="engineOptions"
optionLabel="label"
optionValue="value"
placeholder="Motor OCR"
class="engine-selector dropdown-borderless"
/>
<Button
label="Proceseaza OCR"
icon="pi pi-cog"
@@ -68,7 +77,8 @@
</template>
<script setup>
import { ref } from 'vue'
import { ref, computed } from 'vue'
import Dropdown from 'primevue/dropdown'
import api from '@data-entry/services/api'
const emit = defineEmits(['ocr-result', 'file-selected', 'error'])
@@ -79,6 +89,42 @@ const isDragging = ref(false)
const processing = ref(false)
const error = ref(null)
// Choices offered by the OCR engine dropdown, listed in display order
const engineOptions = [
  ['Auto (Recomandat)', 'auto'],
  ['PaddleOCR', 'paddleocr'],
  ['Tesseract', 'tesseract']
].map(([label, value]) => ({ label, value }))
// Currently selected engine value; starts on the 'auto' option
const selectedEngine = ref('auto')
// Reactive mirror of the queued OCR job, all empty until a job is submitted
const jobStatus = ref(null)
const jobId = ref(null)
const queuePosition = ref(null)
const estimatedWait = ref(null)
// Handle of the active polling timer, or null when no timer is armed
let pollInterval = null
// Dynamic processing messages
// Headline shown in the processing state, derived from the job status
const processingMessage = computed(() => {
  const status = jobStatus.value
  if (status === 'processing') {
    return 'Se proceseaza imaginea...'
  }
  // A queue position is only announced while the job is pending with a positive position
  const isQueued = status === 'pending' && queuePosition.value > 0
  return isQueued ? `In coada... Pozitia ${queuePosition.value}` : 'Se trimite...'
})
// Secondary line shown under the headline in the processing state
const processingSubtext = computed(() => {
  const status = jobStatus.value
  if (status === 'processing') {
    return 'Extragere date cu OCR'
  }
  // The estimate is only shown while the job is pending and the wait is positive
  const hasEstimate = status === 'pending' && estimatedWait.value > 0
  return hasEstimate ? `Timp estimat: ~${estimatedWait.value} secunde` : 'Asteptati...'
})
const onDragOver = () => {
isDragging.value = true
}
@@ -130,37 +176,130 @@ const processOCR = async () => {
processing.value = true
error.value = null
jobId.value = null
queuePosition.value = null
estimatedWait.value = null
jobStatus.value = 'submitting'
try {
const formData = new FormData()
formData.append('file', selectedFile.value)
// Don't set Content-Type header - let browser set it with boundary for multipart/form-data
// The API interceptor will add Authorization header automatically
const response = await api.post('/ocr/extract', formData, {
timeout: 60000, // 60 second timeout for OCR
console.log('🔍 OCR Submit:', {
fileName: selectedFile.value?.name,
fileSize: selectedFile.value?.size,
fileType: selectedFile.value?.type,
engine: selectedEngine.value
})
if (response.data.success) {
// Include the OCR message in the data for debugging
const resultData = {
...response.data.data,
_ocr_message: response.data.message
}
emit('ocr-result', resultData)
} else {
error.value = response.data.message || 'OCR processing failed'
emit('error', error.value)
}
// Step 1: Submit job to queue (returns immediately)
// Include engine parameter in query string
const submitResponse = await api.post(`/ocr/extract?engine=${selectedEngine.value}`, formData, {
timeout: 30000, // 30s timeout for submission
})
const job = submitResponse.data
jobId.value = job.job_id
queuePosition.value = job.queue_position
estimatedWait.value = job.estimated_wait_seconds
jobStatus.value = job.status
console.log('📋 OCR Job Created:', job)
// Step 2: Start polling for result
await pollJobStatus(job.job_id)
} catch (err) {
console.error('🔴 OCR Error:', {
message: err.message,
code: err.code,
response: err.response?.data
})
const message = err.response?.data?.detail || err.message || 'Eroare la procesarea OCR'
error.value = message
emit('error', message)
} finally {
processing.value = false
}
}
/**
 * Poll the OCR job endpoint until the job reaches a terminal state.
 *
 * Requests `/ocr/jobs/{id}` immediately and then once per second, copying the
 * reported status, queue position and estimated wait into the reactive state.
 * Emits 'ocr-result' when the job completes with a result and 'error' when it
 * completes without one, fails, or the attempt budget is exhausted.
 *
 * The returned promise settles only once polling has finished. processOCR
 * awaits this function and clears `processing` in its `finally`, so settling
 * while the job is still queued would hide the progress UI too early.
 *
 * Requests run strictly one after another: a slow response can never overlap
 * the next poll and emit the result twice.
 *
 * Cancellation: polling stops silently as soon as `jobId.value` no longer
 * equals `id`. reset() sets jobId to null and processOCR assigns the id of the
 * newly submitted job, so a mismatch means this loop is stale. No timer handle
 * is stored in `pollInterval`.
 *
 * @param {*} id - Job identifier, interpolated into the request URL.
 * @returns {Promise<void>} Settles when polling ended or was cancelled.
 */
const pollJobStatus = async (id) => {
  const maxAttempts = 120 // 2 minutes max (120 * 1s)
  const pollDelayMs = 1000

  const isCancelled = () => jobId.value !== id
  const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms))

  // Shared exit path for every error outcome
  const fail = (message) => {
    processing.value = false
    error.value = message
    emit('error', message)
  }

  // Message reported when the budget runs out; depends on how the last attempt ended
  let exhaustedMessage = 'Timeout - procesarea a durat prea mult'

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    // The first poll is immediate; later polls are spaced by the delay
    if (attempt > 1) {
      await wait(pollDelayMs)
    }
    if (isCancelled()) return

    let job
    try {
      const response = await api.get(`/ocr/jobs/${id}`)
      job = response.data
      if (!job) {
        throw new Error('Empty job status response')
      }
    } catch (err) {
      // Don't stop on poll errors - network might be flaky
      console.error('🔴 Poll Error:', err.message)
      exhaustedMessage = 'Eroare la verificarea starii job-ului'
      continue
    }

    // reset() may have run while the request was in flight; drop the stale answer
    if (isCancelled()) return

    exhaustedMessage = 'Timeout - procesarea a durat prea mult'
    jobStatus.value = job.status
    queuePosition.value = job.queue_position
    estimatedWait.value = job.estimated_wait_seconds

    console.log('📊 OCR Poll:', { status: job.status, position: job.queue_position })

    if (job.status === 'completed') {
      processing.value = false
      if (job.result) {
        console.log('✅ OCR Complete:', job.result)
        emit('ocr-result', {
          ...job.result,
          _processing_time_ms: job.processing_time_ms
        })
      } else {
        error.value = 'OCR completed but no result returned'
        emit('error', error.value)
      }
      return
    }

    if (job.status === 'failed') {
      fail(job.error || 'OCR processing failed')
      return
    }
    // Still pending/processing - continue polling
  }

  fail(exhaustedMessage)
}
const formatFileSize = (bytes) => {
if (bytes < 1024) return bytes + ' B'
if (bytes < 1024 * 1024) return (bytes / 1024).toFixed(1) + ' KB'
@@ -170,6 +309,14 @@ const formatFileSize = (bytes) => {
const reset = () => {
selectedFile.value = null
error.value = null
jobId.value = null
queuePosition.value = null
estimatedWait.value = null
jobStatus.value = null
if (pollInterval) {
clearInterval(pollInterval)
pollInterval = null
}
if (fileInput.value) {
fileInput.value.value = ''
}
@@ -262,6 +409,23 @@ defineExpose({ reset, processOCR })
gap: 0.5rem;
margin-top: 0.75rem;
justify-content: center;
align-items: center;
flex-wrap: wrap;
}
/* Engine selector dropdown */
/* Keeps the dropdown at least 150px wide inside the wrapping action row */
.engine-selector {
min-width: 150px;
}
/* Compact label text for the dropdown; :deep() targets markup rendered by the child component.
   NOTE(review): !important presumably overrides the PrimeVue theme padding - confirm */
.engine-selector:deep(.p-dropdown-label) {
padding: 0.5rem 0.75rem !important;
font-size: 0.875rem;
color: #475569;
}
/* Fixed 2rem width for the dropdown trigger element */
.engine-selector:deep(.p-dropdown-trigger) {
width: 2rem !important;
}
/* Processing state */
@@ -276,6 +440,7 @@ defineExpose({ reset, processOCR })
font-size: 1rem;
color: #475569;
margin: 0.5rem 0 0 0;
font-weight: 500;
}
.processing-subtext {