feat(ocr): Implement persistent worker pool with SQLite job queue

Major OCR infrastructure improvements:
- Add persistent SQLite-based job queue for OCR tasks
- Implement worker pool with process isolation and auto-restart
- Add OCR engine selector dropdown (Tesseract/PaddleOCR) in upload zone
- Optimize Tesseract preprocessing based on benchmark results (8x faster)
- Add recognize_cif_optimized() with multi-strategy CIF extraction
- Add Romanian CIF checksum validation
- Increase Telegram long polling timeout from 10s to 30s

Squashed commits:
- feat(ocr): Implement persistent worker pool with SQLite job queue
- feat(ocr): Add OCR engine selector dropdown to upload zone
- perf(telegram): Increase long polling timeout from 10s to 30s
- perf(ocr): Optimize Tesseract preprocessing based on benchmark results

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-31 12:32:12 +02:00
parent 00f826f7ed
commit 74f7aefc26
23 changed files with 3616 additions and 209 deletions

View File

@@ -136,3 +136,101 @@ class OCRStatusResponse(BaseModel):
available: bool = Field(description="Whether OCR service is available")
engines: list[str] = Field(description="Available OCR engines")
message: str = Field(description="Status message")
# ============================================================================
# Job Queue Schemas (for async OCR processing)
# ============================================================================
from datetime import datetime
from enum import Enum
class OCREngineChoice(str, Enum):
    """OCR engine selection options.

    The ``str`` mixin makes every member a real string, so members compare
    equal to their plain-text value (``OCREngineChoice.auto == "auto"``).
    """

    # No specific engine requested.
    auto = "auto"
    # Explicit request for the PaddleOCR engine.
    paddleocr = "paddleocr"
    # Explicit request for the Tesseract engine.
    tesseract = "tesseract"
class OCRJobStatus(str, Enum):
    """OCR job status.

    The ``str`` mixin makes every member a real string, so members compare
    equal to their plain-text value (``OCRJobStatus.pending == "pending"``).
    """

    pending = "pending"
    processing = "processing"
    completed = "completed"
    failed = "failed"
class OCRJobSubmitResponse(BaseModel):
    """Response when submitting an OCR job.

    Carries the job identifier, its initial status, its queue position with an
    estimated wait, and the creation timestamp.
    """

    job_id: str = Field(description="Unique job identifier (UUID)")
    status: OCRJobStatus = Field(description="Initial job status (pending)")
    queue_position: int = Field(description="Position in queue (1 = next to process)")
    estimated_wait_seconds: int = Field(description="Estimated wait time in seconds")
    created_at: datetime = Field(description="Job creation timestamp")

    # Pydantic v2 configuration. ``json_schema_extra`` is a v2-only key, so the
    # v2-native ``model_config`` is used instead of a nested ``class Config``,
    # which v2 deprecates and warns about when the model class is created.
    # A plain dict is accepted here, so no additional import is required.
    model_config = {
        "json_schema_extra": {
            "example": {
                "job_id": "abc123-def456-ghi789",
                "status": "pending",
                "queue_position": 3,
                "estimated_wait_seconds": 21,
                "created_at": "2024-01-15T12:00:00",
            }
        }
    }
class OCRJobResponse(BaseModel):
    """Full OCR job status response.

    Only ``job_id``, ``status`` and ``created_at`` are required; every other
    field defaults to ``None``. Per the field descriptions, ``result`` is only
    populated for completed jobs and ``error`` only for failed ones.
    """

    job_id: str = Field(description="Unique job identifier")
    status: OCRJobStatus = Field(description="Current job status")
    queue_position: Optional[int] = Field(default=None, description="Queue position (None if processing/completed)")
    estimated_wait_seconds: Optional[int] = Field(default=None, description="Estimated wait time")
    created_at: datetime = Field(description="Job creation timestamp")
    started_at: Optional[datetime] = Field(default=None, description="Processing start timestamp")
    completed_at: Optional[datetime] = Field(default=None, description="Completion timestamp")
    processing_time_ms: Optional[int] = Field(default=None, description="Actual processing time in ms")
    result: Optional[ExtractionData] = Field(default=None, description="Extraction result (only if completed)")
    error: Optional[str] = Field(default=None, description="Error message (only if failed)")

    # Pydantic v2 configuration. ``json_schema_extra`` is a v2-only key, so the
    # v2-native ``model_config`` is used instead of a nested ``class Config``,
    # which v2 deprecates and warns about when the model class is created.
    # A plain dict is accepted here, so no additional import is required.
    model_config = {
        "json_schema_extra": {
            "example": {
                "job_id": "abc123-def456-ghi789",
                "status": "completed",
                "queue_position": None,
                "estimated_wait_seconds": 0,
                "created_at": "2024-01-15T12:00:00",
                "started_at": "2024-01-15T12:00:21",
                "completed_at": "2024-01-15T12:00:28",
                "processing_time_ms": 6543,
                "result": {
                    "receipt_number": "123",
                    "amount": 85.99,
                    "ocr_engine": "paddleocr-light",
                },
            }
        }
    }
class OCRQueueStatusResponse(BaseModel):
    """Queue statistics response.

    Reports how many jobs are waiting, how many are being processed, and the
    average processing time in seconds.
    """

    pending_jobs: int = Field(description="Number of jobs waiting in queue")
    processing_jobs: int = Field(description="Number of jobs currently processing")
    average_time_seconds: float = Field(description="Average processing time in seconds")

    # Pydantic v2 configuration. ``json_schema_extra`` is a v2-only key, so the
    # v2-native ``model_config`` is used instead of a nested ``class Config``,
    # which v2 deprecates and warns about when the model class is created.
    # A plain dict is accepted here, so no additional import is required.
    model_config = {
        "json_schema_extra": {
            "example": {
                "pending_jobs": 5,
                "processing_jobs": 1,
                "average_time_seconds": 7.2,
            }
        }
    }