feat(ocr): Implement persistent worker pool with SQLite job queue
Major OCR infrastructure improvements:
- Add persistent SQLite-based job queue for OCR tasks
- Implement worker pool with process isolation and auto-restart
- Add OCR engine selector dropdown (Tesseract/PaddleOCR) in upload zone
- Optimize Tesseract preprocessing based on benchmark results (8x faster)
- Add recognize_cif_optimized() with multi-strategy CIF extraction
- Add Romanian CIF checksum validation
- Increase Telegram long polling timeout from 10s to 30s

Squashed commits:
- feat(ocr): Implement persistent worker pool with SQLite job queue
- feat(ocr): Add OCR engine selector dropdown to upload zone
- perf(telegram): Increase long polling timeout from 10s to 30s
- perf(ocr): Optimize Tesseract preprocessing based on benchmark results

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -136,3 +136,101 @@ class OCRStatusResponse(BaseModel):
|
||||
available: bool = Field(description="Whether OCR service is available")
|
||||
engines: list[str] = Field(description="Available OCR engines")
|
||||
message: str = Field(description="Status message")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Job Queue Schemas (for async OCR processing)
|
||||
# ============================================================================
|
||||
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class OCREngineChoice(str, Enum):
    """OCR engine selection options.

    Members also subclass ``str``, so each member compares equal to its
    plain string value (for example ``OCREngineChoice.auto == "auto"``)
    and can be looked up by value with ``OCREngineChoice("auto")``.
    """

    auto = "auto"
    paddleocr = "paddleocr"
    tesseract = "tesseract"
|
||||
|
||||
|
||||
class OCRJobStatus(str, Enum):
    """OCR job status.

    Members also subclass ``str``, so each member compares equal to its
    plain string value (for example ``OCRJobStatus.pending == "pending"``)
    and can be looked up by value with ``OCRJobStatus("pending")``.
    """

    pending = "pending"
    processing = "processing"
    completed = "completed"
    failed = "failed"
|
||||
|
||||
|
||||
class OCRJobSubmitResponse(BaseModel):
    """Response when submitting an OCR job.

    None of the fields declares a default value. The nested ``Config``
    attaches an example payload to the generated JSON schema.
    """

    job_id: str = Field(description="Unique job identifier (UUID)")
    status: OCRJobStatus = Field(description="Initial job status (pending)")
    queue_position: int = Field(description="Position in queue (1 = next to process)")
    estimated_wait_seconds: int = Field(description="Estimated wait time in seconds")
    created_at: datetime = Field(description="Job creation timestamp")

    class Config:
        """Pydantic config."""

        # NOTE(review): the example job_id is illustrative only; it is not
        # formatted like a UUID even though the field description says UUID.
        json_schema_extra = {
            "example": {
                "job_id": "abc123-def456-ghi789",
                "status": "pending",
                "queue_position": 3,
                "estimated_wait_seconds": 21,
                "created_at": "2024-01-15T12:00:00",
            }
        }
|
||||
|
||||
|
||||
class OCRJobResponse(BaseModel):
    """Full OCR job status response.

    ``job_id``, ``status`` and ``created_at`` declare no default; every
    other field is ``Optional`` and defaults to ``None``. The nested
    ``Config`` attaches an example payload to the generated JSON schema.
    """

    job_id: str = Field(description="Unique job identifier")
    status: OCRJobStatus = Field(description="Current job status")
    queue_position: Optional[int] = Field(default=None, description="Queue position (None if processing/completed)")
    estimated_wait_seconds: Optional[int] = Field(default=None, description="Estimated wait time")
    created_at: datetime = Field(description="Job creation timestamp")
    started_at: Optional[datetime] = Field(default=None, description="Processing start timestamp")
    completed_at: Optional[datetime] = Field(default=None, description="Completion timestamp")
    processing_time_ms: Optional[int] = Field(default=None, description="Actual processing time in ms")
    result: Optional[ExtractionData] = Field(default=None, description="Extraction result (only if completed)")
    error: Optional[str] = Field(default=None, description="Error message (only if failed)")

    class Config:
        """Pydantic config."""

        # Example of a completed job: queue_position is None and a result
        # payload is present.
        json_schema_extra = {
            "example": {
                "job_id": "abc123-def456-ghi789",
                "status": "completed",
                "queue_position": None,
                "estimated_wait_seconds": 0,
                "created_at": "2024-01-15T12:00:00",
                "started_at": "2024-01-15T12:00:21",
                "completed_at": "2024-01-15T12:00:28",
                "processing_time_ms": 6543,
                "result": {
                    "receipt_number": "123",
                    "amount": 85.99,
                    "ocr_engine": "paddleocr-light",
                },
            }
        }
|
||||
|
||||
|
||||
class OCRQueueStatusResponse(BaseModel):
    """Queue statistics response.

    None of the fields declares a default value. The nested ``Config``
    attaches an example payload to the generated JSON schema.
    """

    pending_jobs: int = Field(description="Number of jobs waiting in queue")
    processing_jobs: int = Field(description="Number of jobs currently processing")
    average_time_seconds: float = Field(description="Average processing time in seconds")

    class Config:
        """Pydantic config."""

        json_schema_extra = {
            "example": {
                "pending_jobs": 5,
                "processing_jobs": 1,
                "average_time_seconds": 7.2,
            }
        }
|
||||
|
||||
Reference in New Issue
Block a user