feat(ocr): Implement persistent worker pool with SQLite job queue

Major OCR infrastructure improvements:
- Add persistent SQLite-based job queue for OCR tasks
- Implement worker pool with process isolation and auto-restart
- Add OCR engine selector dropdown (Tesseract/PaddleOCR) in upload zone
- Optimize Tesseract preprocessing based on benchmark results (8x faster)
- Add recognize_cif_optimized() with multi-strategy CIF extraction
- Add Romanian CIF checksum validation
- Increase Telegram long polling timeout from 10s to 30s

Squashed commits:
- feat(ocr): Implement persistent worker pool with SQLite job queue
- feat(ocr): Add OCR engine selector dropdown to upload zone
- perf(telegram): Increase long polling timeout from 10s to 30s
- perf(ocr): Optimize Tesseract preprocessing based on benchmark results

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-31 12:32:12 +02:00
parent 00f826f7ed
commit 74f7aefc26
23 changed files with 3616 additions and 209 deletions

View File

@@ -21,8 +21,8 @@
style="width: 50px; height: 50px"
strokeWidth="4"
/>
<p class="processing-text">Se proceseaza imaginea...</p>
<p class="processing-subtext">Acest proces poate dura cateva secunde</p>
<p class="processing-text">{{ processingMessage }}</p>
<p class="processing-subtext">{{ processingSubtext }}</p>
</div>
<div v-else-if="selectedFile" class="file-selected-state">
@@ -52,6 +52,15 @@
size="small"
@click="triggerFileInput"
/>
<!-- OCR Engine Selector -->
<Dropdown
v-model="selectedEngine"
:options="engineOptions"
optionLabel="label"
optionValue="value"
placeholder="Motor OCR"
class="engine-selector dropdown-borderless"
/>
<Button
label="Proceseaza OCR"
icon="pi pi-cog"
@@ -68,7 +77,8 @@
</template>
<script setup>
import { ref } from 'vue'
import { ref, computed } from 'vue'
import Dropdown from 'primevue/dropdown'
import api from '@data-entry/services/api'
const emit = defineEmits(['ocr-result', 'file-selected', 'error'])
@@ -79,6 +89,42 @@ const isDragging = ref(false)
const processing = ref(false)
const error = ref(null)
// OCR Engine selection
// Value bound to the engine dropdown; processOCR sends it as the `engine` query parameter.
const selectedEngine = ref('auto')
// Choices offered by the dropdown (`label` is displayed, `value` is what gets submitted).
const engineOptions = [
{ label: 'Auto (Recomandat)', value: 'auto' },
{ label: 'PaddleOCR', value: 'paddleocr' },
{ label: 'Tesseract', value: 'tesseract' }
]
// Job queue state
// Mirrors the fields of the job object returned by the submit and status endpoints.
const jobId = ref(null)
const queuePosition = ref(null)
// Shown to the user as a number of seconds.
const estimatedWait = ref(null)
const jobStatus = ref(null)
// Handle of the status-poll timer; null when no timer is scheduled. reset() clears it.
let pollInterval = null
// Dynamic processing messages
// Headline for the processing state: queue position while waiting in the queue,
// a fixed text while the job is running, and a generic "sending" text otherwise.
const processingMessage = computed(() => {
  switch (jobStatus.value) {
    case 'pending':
      if (queuePosition.value > 0) {
        return `In coada... Pozitia ${queuePosition.value}`
      }
      break
    case 'processing':
      return 'Se proceseaza imaginea...'
  }
  return 'Se trimite...'
})
// Secondary line under the headline: the wait estimate while the job is queued,
// a short description while it is running, and a generic "please wait" otherwise.
const processingSubtext = computed(() => {
  const status = jobStatus.value
  if (status === 'processing') {
    return 'Extragere date cu OCR'
  }
  const hasWaitEstimate = status === 'pending' && estimatedWait.value > 0
  return hasWaitEstimate
    ? `Timp estimat: ~${estimatedWait.value} secunde`
    : 'Asteptati...'
})
// Drag-over handler: flags that something is currently being dragged over the component.
const onDragOver = () => {
isDragging.value = true
}
@@ -130,37 +176,130 @@ const processOCR = async () => {
processing.value = true
error.value = null
jobId.value = null
queuePosition.value = null
estimatedWait.value = null
jobStatus.value = 'submitting'
try {
const formData = new FormData()
formData.append('file', selectedFile.value)
// Don't set Content-Type header - let browser set it with boundary for multipart/form-data
// The API interceptor will add Authorization header automatically
const response = await api.post('/ocr/extract', formData, {
timeout: 60000, // 60 second timeout for OCR
console.log('🔍 OCR Submit:', {
fileName: selectedFile.value?.name,
fileSize: selectedFile.value?.size,
fileType: selectedFile.value?.type,
engine: selectedEngine.value
})
if (response.data.success) {
// Include the OCR message in the data for debugging
const resultData = {
...response.data.data,
_ocr_message: response.data.message
}
emit('ocr-result', resultData)
} else {
error.value = response.data.message || 'OCR processing failed'
emit('error', error.value)
}
// Step 1: Submit job to queue (returns immediately)
// Include engine parameter in query string
const submitResponse = await api.post(`/ocr/extract?engine=${selectedEngine.value}`, formData, {
timeout: 30000, // 30s timeout for submission
})
const job = submitResponse.data
jobId.value = job.job_id
queuePosition.value = job.queue_position
estimatedWait.value = job.estimated_wait_seconds
jobStatus.value = job.status
console.log('📋 OCR Job Created:', job)
// Step 2: Start polling for result
await pollJobStatus(job.job_id)
} catch (err) {
console.error('🔴 OCR Error:', {
message: err.message,
code: err.code,
response: err.response?.data
})
const message = err.response?.data?.detail || err.message || 'Eroare la procesarea OCR'
error.value = message
emit('error', message)
} finally {
processing.value = false
}
}
/**
 * Poll `/ocr/jobs/{id}` until the job completes, fails, or the attempt budget
 * is exhausted, mirroring every status update into the reactive queue state.
 *
 * The returned promise settles only after polling has ended. `processOCR`
 * awaits this call and clears `processing` in its `finally` block, so settling
 * after the first poll would hide the progress UI while the job is still queued.
 *
 * Requests are issued strictly one after another instead of from a repeating
 * timer, so a slow response can never overlap the next poll, arrive out of
 * order, or cause the result to be emitted twice.
 *
 * Polling stops silently (no event emitted) when `jobId` no longer holds `id`,
 * which is what `reset()` and a new submission do to it.
 *
 * @param id - `job_id` value returned by the submit request.
 * @returns {Promise<void>} Settles when polling has ended for any reason.
 */
const pollJobStatus = async (id) => {
  const maxAttempts = 120 // 2 minutes max (120 * 1s)
  const pollDelayMs = 1000

  // Drop any timer left behind by an earlier poller so two never run side by side.
  if (pollInterval) {
    clearInterval(pollInterval)
    pollInterval = null
  }

  // True once reset() or a newer submission has replaced the job being tracked.
  const isSuperseded = () => jobId.value !== id

  // Shared failure path: leave the processing state and report the message.
  const fail = (message) => {
    processing.value = false
    error.value = message
    emit('error', message)
  }

  let lastPollFailed = false

  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    // First poll happens immediately; later polls wait one delay each.
    if (attempt > 0) {
      await new Promise((resolve) => setTimeout(resolve, pollDelayMs))
    }
    if (isSuperseded()) return

    let job
    try {
      const response = await api.get(`/ocr/jobs/${id}`)
      if (isSuperseded()) return

      job = response.data
      jobStatus.value = job.status
      queuePosition.value = job.queue_position
      estimatedWait.value = job.estimated_wait_seconds
      lastPollFailed = false

      console.log('📊 OCR Poll:', { status: job.status, position: job.queue_position })
    } catch (err) {
      // Don't stop on poll errors - network might be flaky
      console.error('🔴 Poll Error:', err.message)
      lastPollFailed = true
      continue
    }

    if (job.status === 'completed') {
      processing.value = false
      if (job.result) {
        console.log('✅ OCR Complete:', job.result)
        emit('ocr-result', {
          ...job.result,
          _processing_time_ms: job.processing_time_ms
        })
      } else {
        error.value = 'OCR completed but no result returned'
        emit('error', error.value)
      }
      return
    }

    if (job.status === 'failed') {
      fail(job.error || 'OCR processing failed')
      return
    }
    // Still pending/processing - continue polling
  }

  // Attempt budget exhausted: report why the last attempt did not finish the job.
  fail(
    lastPollFailed
      ? 'Eroare la verificarea starii job-ului'
      : 'Timeout - procesarea a durat prea mult'
  )
}
const formatFileSize = (bytes) => {
if (bytes < 1024) return bytes + ' B'
if (bytes < 1024 * 1024) return (bytes / 1024).toFixed(1) + ' KB'
@@ -170,6 +309,14 @@ const formatFileSize = (bytes) => {
const reset = () => {
selectedFile.value = null
error.value = null
jobId.value = null
queuePosition.value = null
estimatedWait.value = null
jobStatus.value = null
if (pollInterval) {
clearInterval(pollInterval)
pollInterval = null
}
if (fileInput.value) {
fileInput.value.value = ''
}
@@ -262,6 +409,23 @@ defineExpose({ reset, processOCR })
gap: 0.5rem;
margin-top: 0.75rem;
justify-content: center;
align-items: center;
flex-wrap: wrap;
}
/* Engine selector dropdown */
.engine-selector {
min-width: 150px;
}
.engine-selector:deep(.p-dropdown-label) {
padding: 0.5rem 0.75rem !important;
font-size: 0.875rem;
color: #475569;
}
.engine-selector:deep(.p-dropdown-trigger) {
width: 2rem !important;
}
/* Processing state */
@@ -276,6 +440,7 @@ defineExpose({ reset, processOCR })
font-size: 1rem;
color: #475569;
margin: 0.5rem 0 0 0;
font-weight: 500;
}
.processing-subtext {

View File

@@ -1,7 +1,18 @@
import axios from 'axios'
// Detect if we're accessing from remote (not localhost)
const isRemoteAccess = !['localhost', '127.0.0.1'].includes(window.location.hostname)
// For remote access, use direct backend URL (same host, port 8000).
// The page's own protocol is reused: a hard-coded `http://` would be blocked
// as mixed content whenever the frontend itself is served over HTTPS.
// For local access, use proxy through Vite
const baseURL = isRemoteAccess
  ? `${window.location.protocol}//${window.location.hostname}:8000/api/data-entry`
  : import.meta.env.BASE_URL + 'api/data-entry'
console.log('📡 API Config:', { isRemoteAccess, baseURL, hostname: window.location.hostname })
const api = axios.create({
baseURL: import.meta.env.BASE_URL + 'api/data-entry',
baseURL,
headers: { 'Content-Type': 'application/json' }
})
@@ -49,8 +60,17 @@ api.interceptors.request.use((config) => {
// Response interceptor for error handling
api.interceptors.response.use(
(response) => response,
(response) => {
console.log('✅ API Response:', response.config.url, response.status)
return response
},
(error) => {
console.error('❌ API Error:', {
url: error.config?.url,
method: error.config?.method,
code: error.code,
message: error.message
})
if (error.response?.status === 401) {
// Token expired or invalid - redirect to login
localStorage.removeItem('access_token')

View File

@@ -987,33 +987,63 @@ const rescanAttachmentOCR = async (attachment) => {
// Create a File object from the blob
const file = new File([response.data], attachment.filename, { type: attachment.mime_type })
// Send to OCR
// Send to OCR job queue
const formData = new FormData()
formData.append('file', file)
const ocrResponse = await apiService.post('/ocr/extract', formData, {
headers: { 'Content-Type': 'multipart/form-data' },
timeout: 60000,
// Submit job
const submitResponse = await apiService.post('/ocr/extract', formData, {
timeout: 30000,
})
if (ocrResponse.data.success) {
const resultData = {
...ocrResponse.data.data,
_ocr_message: ocrResponse.data.message
const jobId = submitResponse.data.job_id
console.log('📋 OCR Rescan Job:', submitResponse.data)
// Poll for result
const maxAttempts = 120
let attempts = 0
while (attempts < maxAttempts) {
await new Promise(resolve => setTimeout(resolve, 1000)) // Wait 1 second
const pollResponse = await apiService.get(`/ocr/jobs/${jobId}`)
const job = pollResponse.data
if (job.status === 'completed') {
if (job.result) {
ocrData.value = {
...job.result,
_processing_time_ms: job.processing_time_ms
}
ocrCollapsed.value = false
toast.add({
severity: 'success',
summary: 'OCR Procesare',
detail: 'Datele au fost re-extrase din atasament',
life: 3000,
})
}
break
}
ocrData.value = resultData
ocrCollapsed.value = false
toast.add({
severity: 'success',
summary: 'OCR Procesare',
detail: 'Datele au fost re-extrase din atasament',
life: 3000,
})
} else {
if (job.status === 'failed') {
toast.add({
severity: 'error',
summary: 'Eroare OCR',
detail: job.error || 'Procesare OCR esuata',
life: 5000,
})
break
}
attempts++
}
if (attempts >= maxAttempts) {
toast.add({
severity: 'error',
summary: 'Eroare OCR',
detail: ocrResponse.data.message || 'Procesare OCR esuata',
detail: 'Timeout - procesarea a durat prea mult',
life: 5000,
})
}