feat(ocr): Implement persistent worker pool with SQLite job queue
Major OCR infrastructure improvements:
- Add persistent SQLite-based job queue for OCR tasks
- Implement worker pool with process isolation and auto-restart
- Add OCR engine selector dropdown (Tesseract/PaddleOCR) in upload zone
- Optimize Tesseract preprocessing based on benchmark results (8x faster)
- Add recognize_cif_optimized() with multi-strategy CIF extraction
- Add Romanian CIF checksum validation
- Increase Telegram long polling timeout from 10s to 30s

Squashed commits:
- feat(ocr): Implement persistent worker pool with SQLite job queue
- feat(ocr): Add OCR engine selector dropdown to upload zone
- perf(telegram): Increase long polling timeout from 10s to 30s
- perf(ocr): Optimize Tesseract preprocessing based on benchmark results

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -21,8 +21,8 @@
|
||||
style="width: 50px; height: 50px"
|
||||
strokeWidth="4"
|
||||
/>
|
||||
<p class="processing-text">Se proceseaza imaginea...</p>
|
||||
<p class="processing-subtext">Acest proces poate dura cateva secunde</p>
|
||||
<p class="processing-text">{{ processingMessage }}</p>
|
||||
<p class="processing-subtext">{{ processingSubtext }}</p>
|
||||
</div>
|
||||
|
||||
<div v-else-if="selectedFile" class="file-selected-state">
|
||||
@@ -52,6 +52,15 @@
|
||||
size="small"
|
||||
@click="triggerFileInput"
|
||||
/>
|
||||
<!-- OCR Engine Selector -->
|
||||
<Dropdown
|
||||
v-model="selectedEngine"
|
||||
:options="engineOptions"
|
||||
optionLabel="label"
|
||||
optionValue="value"
|
||||
placeholder="Motor OCR"
|
||||
class="engine-selector dropdown-borderless"
|
||||
/>
|
||||
<Button
|
||||
label="Proceseaza OCR"
|
||||
icon="pi pi-cog"
|
||||
@@ -68,7 +77,8 @@
|
||||
</template>
|
||||
|
||||
<script setup>
|
||||
import { ref } from 'vue'
|
||||
import { ref, computed } from 'vue'
|
||||
import Dropdown from 'primevue/dropdown'
|
||||
import api from '@data-entry/services/api'
|
||||
|
||||
const emit = defineEmits(['ocr-result', 'file-selected', 'error'])
|
||||
@@ -79,6 +89,42 @@ const isDragging = ref(false)
|
||||
const processing = ref(false)
|
||||
const error = ref(null)
|
||||
|
||||
// --- OCR engine selection -------------------------------------------------
// Value bound to the engine dropdown; starts on 'auto'.
const selectedEngine = ref('auto')

// Choices offered by the dropdown: `label` is displayed, `value` is stored in selectedEngine.
const engineOptions = [
  { label: 'Auto (Recomandat)', value: 'auto' },
  { label: 'PaddleOCR', value: 'paddleocr' },
  { label: 'Tesseract', value: 'tesseract' },
]

// --- Job queue state --------------------------------------------------------
// All four refs start out null, i.e. no job is being tracked yet.
const jobId = ref(null)
const queuePosition = ref(null)
const estimatedWait = ref(null)
const jobStatus = ref(null)

// Timer handle for status polling; null while no timer is active.
let pollInterval = null
|
||||
|
||||
// Dynamic processing messages
|
||||
/**
 * Headline shown while a job is in flight.
 * - 'pending' with a positive queue position -> queue position message
 * - 'processing'                             -> image-processing message
 * - anything else                            -> "sending" message
 */
const processingMessage = computed(() => {
  switch (jobStatus.value) {
    case 'pending': {
      // The queue position is only read while the job is pending.
      const position = queuePosition.value
      if (position > 0) {
        return `In coada... Pozitia ${position}`
      }
      break
    }
    case 'processing':
      return 'Se proceseaza imaginea...'
  }
  return 'Se trimite...'
})
|
||||
|
||||
/**
 * Secondary line shown under the processing headline.
 * - 'pending' with a positive wait estimate -> estimated wait in seconds
 * - 'processing'                            -> extraction message
 * - anything else                           -> generic "please wait" message
 */
const processingSubtext = computed(() => {
  switch (jobStatus.value) {
    case 'pending': {
      // The wait estimate is only read while the job is pending.
      const waitSeconds = estimatedWait.value
      if (waitSeconds > 0) {
        return `Timp estimat: ~${waitSeconds} secunde`
      }
      break
    }
    case 'processing':
      return 'Extragere date cu OCR'
  }
  return 'Asteptati...'
})
|
||||
|
||||
/** Drag-over handler: raises the `isDragging` flag. */
const onDragOver = () => void (isDragging.value = true)
|
||||
@@ -130,37 +176,130 @@ const processOCR = async () => {
|
||||
|
||||
processing.value = true
|
||||
error.value = null
|
||||
jobId.value = null
|
||||
queuePosition.value = null
|
||||
estimatedWait.value = null
|
||||
jobStatus.value = 'submitting'
|
||||
|
||||
try {
|
||||
const formData = new FormData()
|
||||
formData.append('file', selectedFile.value)
|
||||
|
||||
// Don't set Content-Type header - let browser set it with boundary for multipart/form-data
|
||||
// The API interceptor will add Authorization header automatically
|
||||
const response = await api.post('/ocr/extract', formData, {
|
||||
timeout: 60000, // 60 second timeout for OCR
|
||||
console.log('🔍 OCR Submit:', {
|
||||
fileName: selectedFile.value?.name,
|
||||
fileSize: selectedFile.value?.size,
|
||||
fileType: selectedFile.value?.type,
|
||||
engine: selectedEngine.value
|
||||
})
|
||||
|
||||
if (response.data.success) {
|
||||
// Include the OCR message in the data for debugging
|
||||
const resultData = {
|
||||
...response.data.data,
|
||||
_ocr_message: response.data.message
|
||||
}
|
||||
emit('ocr-result', resultData)
|
||||
} else {
|
||||
error.value = response.data.message || 'OCR processing failed'
|
||||
emit('error', error.value)
|
||||
}
|
||||
// Step 1: Submit job to queue (returns immediately)
|
||||
// Include engine parameter in query string
|
||||
const submitResponse = await api.post(`/ocr/extract?engine=${selectedEngine.value}`, formData, {
|
||||
timeout: 30000, // 30s timeout for submission
|
||||
})
|
||||
|
||||
const job = submitResponse.data
|
||||
jobId.value = job.job_id
|
||||
queuePosition.value = job.queue_position
|
||||
estimatedWait.value = job.estimated_wait_seconds
|
||||
jobStatus.value = job.status
|
||||
|
||||
console.log('📋 OCR Job Created:', job)
|
||||
|
||||
// Step 2: Start polling for result
|
||||
await pollJobStatus(job.job_id)
|
||||
|
||||
} catch (err) {
|
||||
console.error('🔴 OCR Error:', {
|
||||
message: err.message,
|
||||
code: err.code,
|
||||
response: err.response?.data
|
||||
})
|
||||
const message = err.response?.data?.detail || err.message || 'Eroare la procesarea OCR'
|
||||
error.value = message
|
||||
emit('error', message)
|
||||
} finally {
|
||||
processing.value = false
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Polls `/ocr/jobs/{id}` about once per second until the job reaches a terminal state.
 *
 * The returned promise settles only after polling has ended (job completed, job
 * failed, attempt budget exhausted, or polling cancelled). A caller that awaits it
 * and clears its busy flag afterwards therefore keeps the progress UI visible for
 * the whole lifetime of the job, instead of dropping it right after the first poll.
 *
 * Requests are issued strictly one after another (await + delay) rather than from
 * a `setInterval` callback, so a slow response cannot overlap with the next poll
 * and a result cannot be emitted twice.
 *
 * Outcomes are reported through the `ocr-result` / `error` events and the `error`
 * ref; request failures are logged and retried, not thrown.
 *
 * Cancellation: when `jobId` held `id` on entry and later holds something else
 * (for example because `reset()` set it back to null), polling stops silently.
 *
 * @param id - job identifier; interpolated into the request path
 * @returns {Promise<void>} resolves once polling has stopped
 */
const pollJobStatus = async (id) => {
  const maxAttempts = 120 // 2 minutes max (120 polls, 1s apart)
  const pollDelayMs = 1000

  // Shared exit path for every error outcome.
  const failWith = (message) => {
    processing.value = false
    error.value = message
    emit('error', message)
  }

  // Without an id every request would hit `/ocr/jobs/undefined`; fail immediately
  // instead of burning through the whole attempt budget.
  if (id == null) {
    failWith('Eroare la verificarea starii job-ului')
    return
  }

  // Only treat a changed jobId as cancellation when this job was the tracked one on entry.
  const tracked = jobId.value === id
  const isCancelled = () => tracked && jobId.value !== id

  // Remembers whether the most recent poll failed, to pick the final error message.
  let lastPollFailed = false

  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    // The first poll is immediate; every later one waits a full delay first.
    if (attempt > 0) {
      await new Promise((resolve) => setTimeout(resolve, pollDelayMs))
    }
    if (isCancelled()) return

    let job
    try {
      const response = await api.get(`/ocr/jobs/${id}`)
      // The job may have been dropped while the request was in flight.
      if (isCancelled()) return

      job = response.data
      // A missing/invalid payload throws here and is handled like a failed poll.
      jobStatus.value = job.status
      queuePosition.value = job.queue_position
      estimatedWait.value = job.estimated_wait_seconds

      console.log('📊 OCR Poll:', { status: job.status, position: job.queue_position })
      lastPollFailed = false
    } catch (err) {
      // Don't stop on poll errors - network might be flaky
      console.error('🔴 Poll Error:', err.message)
      lastPollFailed = true
      continue
    }

    if (job.status === 'completed') {
      if (job.result) {
        processing.value = false
        console.log('✅ OCR Complete:', job.result)
        emit('ocr-result', {
          ...job.result,
          _processing_time_ms: job.processing_time_ms
        })
      } else {
        failWith('OCR completed but no result returned')
      }
      return
    }

    if (job.status === 'failed') {
      failWith(job.error || 'OCR processing failed')
      return
    }

    // Still pending/processing - go around again.
  }

  // Attempt budget exhausted without reaching a terminal state.
  failWith(
    lastPollFailed
      ? 'Eroare la verificarea starii job-ului'
      : 'Timeout - procesarea a durat prea mult'
  )
}
|
||||
|
||||
const formatFileSize = (bytes) => {
|
||||
if (bytes < 1024) return bytes + ' B'
|
||||
if (bytes < 1024 * 1024) return (bytes / 1024).toFixed(1) + ' KB'
|
||||
@@ -170,6 +309,14 @@ const formatFileSize = (bytes) => {
|
||||
const reset = () => {
|
||||
selectedFile.value = null
|
||||
error.value = null
|
||||
jobId.value = null
|
||||
queuePosition.value = null
|
||||
estimatedWait.value = null
|
||||
jobStatus.value = null
|
||||
if (pollInterval) {
|
||||
clearInterval(pollInterval)
|
||||
pollInterval = null
|
||||
}
|
||||
if (fileInput.value) {
|
||||
fileInput.value.value = ''
|
||||
}
|
||||
@@ -262,6 +409,23 @@ defineExpose({ reset, processOCR })
|
||||
gap: 0.5rem;
|
||||
margin-top: 0.75rem;
|
||||
justify-content: center;
|
||||
align-items: center;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
/* Engine selector dropdown */
|
||||
.engine-selector {
|
||||
min-width: 150px;
|
||||
}
|
||||
|
||||
.engine-selector:deep(.p-dropdown-label) {
|
||||
padding: 0.5rem 0.75rem !important;
|
||||
font-size: 0.875rem;
|
||||
color: #475569;
|
||||
}
|
||||
|
||||
.engine-selector:deep(.p-dropdown-trigger) {
|
||||
width: 2rem !important;
|
||||
}
|
||||
|
||||
/* Processing state */
|
||||
@@ -276,6 +440,7 @@ defineExpose({ reset, processOCR })
|
||||
font-size: 1rem;
|
||||
color: #475569;
|
||||
margin: 0.5rem 0 0 0;
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
.processing-subtext {
|
||||
|
||||
@@ -1,7 +1,18 @@
|
||||
import axios from 'axios'
|
||||
|
||||
// Detect if we're accessing from remote (not a loopback host).
// `location.hostname` reports the IPv6 loopback with brackets, hence '[::1]'.
const isRemoteAccess = !['localhost', '127.0.0.1', '[::1]'].includes(window.location.hostname)

// For remote access, use direct backend URL (same host, port 8000).
// The page's own scheme is reused: a hard-coded `http://` would be blocked as
// mixed content whenever the frontend itself is served over HTTPS.
// For local access, use proxy through Vite.
const baseURL = isRemoteAccess
  ? `${window.location.protocol}//${window.location.hostname}:8000/api/data-entry`
  : import.meta.env.BASE_URL + 'api/data-entry'

console.log('📡 API Config:', { isRemoteAccess, baseURL, hostname: window.location.hostname })
|
||||
|
||||
const api = axios.create({
|
||||
baseURL: import.meta.env.BASE_URL + 'api/data-entry',
|
||||
baseURL,
|
||||
headers: { 'Content-Type': 'application/json' }
|
||||
})
|
||||
|
||||
@@ -49,8 +60,17 @@ api.interceptors.request.use((config) => {
|
||||
|
||||
// Response interceptor for error handling
|
||||
api.interceptors.response.use(
|
||||
(response) => response,
|
||||
(response) => {
|
||||
console.log('✅ API Response:', response.config.url, response.status)
|
||||
return response
|
||||
},
|
||||
(error) => {
|
||||
console.error('❌ API Error:', {
|
||||
url: error.config?.url,
|
||||
method: error.config?.method,
|
||||
code: error.code,
|
||||
message: error.message
|
||||
})
|
||||
if (error.response?.status === 401) {
|
||||
// Token expired or invalid - redirect to login
|
||||
localStorage.removeItem('access_token')
|
||||
|
||||
@@ -987,33 +987,63 @@ const rescanAttachmentOCR = async (attachment) => {
|
||||
// Create a File object from the blob
|
||||
const file = new File([response.data], attachment.filename, { type: attachment.mime_type })
|
||||
|
||||
// Send to OCR
|
||||
// Send to OCR job queue
|
||||
const formData = new FormData()
|
||||
formData.append('file', file)
|
||||
|
||||
const ocrResponse = await apiService.post('/ocr/extract', formData, {
|
||||
headers: { 'Content-Type': 'multipart/form-data' },
|
||||
timeout: 60000,
|
||||
// Submit job
|
||||
const submitResponse = await apiService.post('/ocr/extract', formData, {
|
||||
timeout: 30000,
|
||||
})
|
||||
|
||||
if (ocrResponse.data.success) {
|
||||
const resultData = {
|
||||
...ocrResponse.data.data,
|
||||
_ocr_message: ocrResponse.data.message
|
||||
const jobId = submitResponse.data.job_id
|
||||
console.log('📋 OCR Rescan Job:', submitResponse.data)
|
||||
|
||||
// Poll for result
|
||||
const maxAttempts = 120
|
||||
let attempts = 0
|
||||
|
||||
while (attempts < maxAttempts) {
|
||||
await new Promise(resolve => setTimeout(resolve, 1000)) // Wait 1 second
|
||||
|
||||
const pollResponse = await apiService.get(`/ocr/jobs/${jobId}`)
|
||||
const job = pollResponse.data
|
||||
|
||||
if (job.status === 'completed') {
|
||||
if (job.result) {
|
||||
ocrData.value = {
|
||||
...job.result,
|
||||
_processing_time_ms: job.processing_time_ms
|
||||
}
|
||||
ocrCollapsed.value = false
|
||||
toast.add({
|
||||
severity: 'success',
|
||||
summary: 'OCR Procesare',
|
||||
detail: 'Datele au fost re-extrase din atasament',
|
||||
life: 3000,
|
||||
})
|
||||
}
|
||||
break
|
||||
}
|
||||
ocrData.value = resultData
|
||||
ocrCollapsed.value = false
|
||||
toast.add({
|
||||
severity: 'success',
|
||||
summary: 'OCR Procesare',
|
||||
detail: 'Datele au fost re-extrase din atasament',
|
||||
life: 3000,
|
||||
})
|
||||
} else {
|
||||
|
||||
if (job.status === 'failed') {
|
||||
toast.add({
|
||||
severity: 'error',
|
||||
summary: 'Eroare OCR',
|
||||
detail: job.error || 'Procesare OCR esuata',
|
||||
life: 5000,
|
||||
})
|
||||
break
|
||||
}
|
||||
|
||||
attempts++
|
||||
}
|
||||
|
||||
if (attempts >= maxAttempts) {
|
||||
toast.add({
|
||||
severity: 'error',
|
||||
summary: 'Eroare OCR',
|
||||
detail: ocrResponse.data.message || 'Procesare OCR esuata',
|
||||
detail: 'Timeout - procesarea a durat prea mult',
|
||||
life: 5000,
|
||||
})
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user