# roa2web-service-auto/backend/modules/data_entry/services/ocr/job_queue.py

"""
SQLite Job Queue Manager for OCR Processing

Provides async job queue for OCR requests:
- Jobs are stored in SQLite for persistence
- Queue position and time estimation
- Automatic expiration after 24 hours
- Statistics for monitoring

Schema:
    ocr_jobs (
        id TEXT PRIMARY KEY,       -- UUID
        status TEXT NOT NULL,      -- pending, processing, completed, failed, cancelled
        file_path TEXT NOT NULL,   -- Path to uploaded file
        mime_type TEXT NOT NULL,
        engine TEXT DEFAULT 'doctr_plus',
        created_at TIMESTAMP,
        started_at TIMESTAMP,
        completed_at TIMESTAMP,
        result_json TEXT,          -- JSON extraction result
        error_message TEXT,
        processing_time_ms INTEGER,  -- Total job time (started_at to completed_at)
        ocr_time_ms INTEGER,         -- Actual OCR engine processing time
        created_by TEXT,           -- Username
        original_filename TEXT,
        expires_at TIMESTAMP,
        batch_id INTEGER,          -- Foreign key to batch_uploads (for bulk processing)
        file_hash TEXT             -- SHA-256 hash for duplicate detection (US-007)
    )
"""

import asyncio
import json
from decimal import Decimal


class DecimalEncoder(json.JSONEncoder):
    """``json.JSONEncoder`` subclass that can serialise ``Decimal`` values.

    Decimals are emitted as JSON numbers by converting them with ``float()``.
    Any other type the stock encoder cannot handle is passed on to the base
    class implementation.
    """

    def default(self, obj):
        # Everything that is not a Decimal is left to the base class.
        if not isinstance(obj, Decimal):
            return super().default(obj)
        return float(obj)
import logging
import os
import uuid
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional

import aiosqlite

logger = logging.getLogger(__name__)

# On-disk layout: the queue lives in data/ocr_queue, resolved by walking five
# `.parent` levels up from this file.
DEFAULT_QUEUE_DIR = Path(__file__).parent.parent.parent.parent.parent.joinpath("data", "ocr_queue")
DEFAULT_DB_PATH = DEFAULT_QUEUE_DIR.joinpath("ocr_jobs.db")  # SQLite database file
DEFAULT_FILES_DIR = DEFAULT_QUEUE_DIR.joinpath("files")  # uploaded files saved per job

# Hours added to a job's creation time to compute its expires_at timestamp.
JOB_EXPIRY_HOURS = 24

# Value (milliseconds) applied via "PRAGMA busy_timeout" on every connection;
# prevents "database is locked" errors.
SQLITE_BUSY_TIMEOUT_MS = 5000


class OCRJobStatus(str, Enum):
    """Job status enum.

    Mixes in ``str``, so every member compares equal to its plain string
    value; the ``.value`` strings are what this module writes to and reads
    from the ``status`` column.
    """
    pending = "pending"        # initial status written by create_job
    processing = "processing"  # set when a job is claimed by get_next_pending
    completed = "completed"    # update_status stores completed_at and the result
    failed = "failed"          # update_status stores completed_at and the error message
    # NOTE(review): nothing in this module sets `cancelled`; presumably callers
    # pass it to update_status - confirm against the API layer.
    cancelled = "cancelled"


@dataclass
class OCRJob:
    """In-memory representation of a single ``ocr_jobs`` record."""

    # --- identity and input -------------------------------------------------
    id: str
    status: OCRJobStatus
    file_path: str
    mime_type: str
    engine: str = "doctr_plus"

    # --- lifecycle timestamps -----------------------------------------------
    created_at: Optional[datetime] = None
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None

    # --- outcome --------------------------------------------------------------
    result_json: Optional[str] = None  # JSON text of the extraction result
    error_message: Optional[str] = None
    processing_time_ms: Optional[int] = None  # total job time (started_at to completed_at)
    ocr_time_ms: Optional[int] = None  # time spent in the OCR engine itself

    # --- bookkeeping ----------------------------------------------------------
    created_by: Optional[str] = None
    original_filename: Optional[str] = None
    expires_at: Optional[datetime] = None
    batch_id: Optional[int] = None  # link to batch_uploads for bulk processing
    file_hash: Optional[str] = None  # SHA-256 hash for duplicate detection (US-007)

    @property
    def queue_wait_ms(self) -> Optional[int]:
        """Milliseconds between created_at and started_at, or None if either is unset."""
        if not (self.created_at and self.started_at):
            return None
        waited = self.started_at - self.created_at
        return int(waited.total_seconds() * 1000)

    @property
    def result(self) -> Optional[Dict]:
        """Decoded ``result_json``; None when it is empty or not valid JSON."""
        if not self.result_json:
            return None
        try:
            parsed = json.loads(self.result_json)
        except json.JSONDecodeError:
            parsed = None
        return parsed


class OCRJobQueue:
    """
    SQLite-based job queue for OCR processing.

    Provides async methods for job management with position
    tracking and time estimation.
    """

    def __init__(
        self,
        db_path: Optional[Path] = None,
        files_dir: Optional[Path] = None
    ):
        """
        Set up an (uninitialised) job queue.

        Only stores configuration and creates the lock; directories and the
        database are created later by ``initialize()``.

        Args:
            db_path: Path to SQLite database (default: data/ocr_queue/ocr_jobs.db)
            files_dir: Path to files directory (default: data/ocr_queue/files/)
        """
        # A falsy argument (None, empty string) selects the module default.
        self.db_path = DEFAULT_DB_PATH if not db_path else Path(db_path)
        self.files_dir = DEFAULT_FILES_DIR if not files_dir else Path(files_dir)

        self._initialized = False  # flipped to True at the end of initialize()
        self._lock = asyncio.Lock()  # serialises job claiming in get_next_pending()

    async def initialize(self) -> None:
        """
        Initialize database and directories.

        Returns immediately once a previous call has completed. Otherwise
        creates the database and files directories, switches the database to
        WAL journaling, creates the ``ocr_jobs`` table and its indexes if they
        are missing, and adds columns that databases created by older versions
        lack.

        Raises:
            aiosqlite.OperationalError: If adding a missing column fails for a
                reason other than the column already existing. (Other
                statements and the directory creation can raise as well.)
        """
        if self._initialized:
            return

        # Create directories
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self.files_dir.mkdir(parents=True, exist_ok=True)

        # Columns that were added after the table was first introduced.
        # Older databases get them through ALTER TABLE below.
        column_migrations = (
            ("ocr_time_ms", "INTEGER"),
            ("batch_id", "INTEGER"),
            ("file_hash", "TEXT"),  # US-007
        )

        # Create database and tables
        async with aiosqlite.connect(str(self.db_path)) as db:
            # Enable WAL mode for better concurrency and set busy timeout
            await db.execute("PRAGMA journal_mode=WAL")
            await db.execute(f"PRAGMA busy_timeout={SQLITE_BUSY_TIMEOUT_MS}")

            await db.execute('''
                CREATE TABLE IF NOT EXISTS ocr_jobs (
                    id TEXT PRIMARY KEY,
                    status TEXT NOT NULL DEFAULT 'pending',
                    file_path TEXT NOT NULL,
                    mime_type TEXT NOT NULL,
                    engine TEXT DEFAULT 'doctr_plus',
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    started_at TIMESTAMP,
                    completed_at TIMESTAMP,
                    result_json TEXT,
                    error_message TEXT,
                    processing_time_ms INTEGER,
                    ocr_time_ms INTEGER,
                    created_by TEXT,
                    original_filename TEXT,
                    expires_at TIMESTAMP,
                    batch_id INTEGER,
                    file_hash TEXT
                )
            ''')

            # Inspect the live schema so only genuinely missing columns are
            # added (table_info rows are: cid, name, type, notnull, dflt, pk).
            async with db.execute("PRAGMA table_info(ocr_jobs)") as cursor:
                existing_columns = {info[1] for info in await cursor.fetchall()}

            for column, sql_type in column_migrations:
                if column in existing_columns:
                    continue
                try:
                    await db.execute(f'ALTER TABLE ocr_jobs ADD COLUMN {column} {sql_type}')
                except aiosqlite.OperationalError as exc:
                    # Another initializer may have added the column between
                    # the table_info check and this statement; that is fine.
                    # Anything else (locked database, I/O error, ...) is a
                    # real failure and must not be swallowed.
                    if "duplicate column name" not in str(exc).lower():
                        raise
                else:
                    logger.info(f"[OCRJobQueue] Added {column} column to existing table")

            # Index for efficient queue queries
            await db.execute('''
                CREATE INDEX IF NOT EXISTS idx_ocr_jobs_status
                ON ocr_jobs(status, created_at)
            ''')

            # Index for expiration cleanup
            await db.execute('''
                CREATE INDEX IF NOT EXISTS idx_ocr_jobs_expires
                ON ocr_jobs(expires_at)
            ''')

            await db.commit()

        self._initialized = True
        logger.info(f"[OCRJobQueue] Initialized: db={self.db_path}, files={self.files_dir}")

    async def create_job(
        self,
        file_bytes: bytes,
        mime_type: str,
        engine: str = "doctr_plus",
        username: Optional[str] = None,
        original_filename: Optional[str] = None,
        batch_id: Optional[int] = None,
        file_hash: Optional[str] = None
    ) -> OCRJob:
        """
        Create a new OCR job.

        Saves file to disk and creates database record. If writing the file
        or inserting the record fails, the (possibly partial) file is removed
        again so no orphaned upload is left behind, and the original error is
        re-raised.

        Args:
            file_bytes: Raw file bytes
            mime_type: MIME type of file
            engine: OCR engine ('tesseract', 'doctr', 'doctr_plus', 'paddleocr')
            username: Username of requester
            original_filename: Original filename from upload
            batch_id: Optional batch ID for bulk upload processing
            file_hash: Optional SHA-256 hash for duplicate detection (US-007)

        Returns:
            Created OCRJob instance (status ``pending``)
        """
        await self.initialize()

        # Generate job ID
        job_id = str(uuid.uuid4())

        # Determine file extension; unknown MIME types are stored as '.bin'
        ext_map = {
            'image/jpeg': '.jpg',
            'image/png': '.png',
            'application/pdf': '.pdf',
        }
        ext = ext_map.get(mime_type, '.bin')
        file_path = self.files_dir / f"{job_id}{ext}"

        # Calculate expiration
        now = datetime.utcnow()
        expires_at = now + timedelta(hours=JOB_EXPIRY_HOURS)

        try:
            # Save file
            file_path.write_bytes(file_bytes)

            # Insert job record
            async with aiosqlite.connect(str(self.db_path)) as db:
                await db.execute(f"PRAGMA busy_timeout={SQLITE_BUSY_TIMEOUT_MS}")
                await db.execute('''
                    INSERT INTO ocr_jobs (
                        id, status, file_path, mime_type, engine,
                        created_at, created_by, original_filename, expires_at, batch_id, file_hash
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    job_id, OCRJobStatus.pending.value, str(file_path), mime_type, engine,
                    now.isoformat(), username, original_filename, expires_at.isoformat(), batch_id, file_hash
                ))
                await db.commit()
        except Exception:
            # Without a database record nothing would ever clean this file up
            # (cleanup_expired only looks at rows), so remove it here.
            try:
                file_path.unlink()
            except FileNotFoundError:
                pass  # the write itself failed before creating the file
            except OSError as cleanup_error:
                logger.warning(f"[OCRJobQueue] Failed to delete file {file_path}: {cleanup_error}")
            raise

        logger.info(f"[OCRJobQueue] Created job {job_id}: engine={engine}, file={file_path.name}, batch_id={batch_id}")

        return OCRJob(
            id=job_id,
            status=OCRJobStatus.pending,
            file_path=str(file_path),
            mime_type=mime_type,
            engine=engine,
            created_at=now,
            created_by=username,
            original_filename=original_filename,
            expires_at=expires_at,
            batch_id=batch_id,
            file_hash=file_hash
        )

    async def get_job(self, job_id: str) -> Optional[OCRJob]:
        """
        Look up a single job.

        Args:
            job_id: Job UUID

        Returns:
            The matching OCRJob, or None when no row has that id
        """
        await self.initialize()

        lookup_sql = 'SELECT * FROM ocr_jobs WHERE id = ?'

        async with aiosqlite.connect(str(self.db_path)) as conn:
            await conn.execute(f"PRAGMA busy_timeout={SQLITE_BUSY_TIMEOUT_MS}")
            # Named column access is required by _row_to_job.
            conn.row_factory = aiosqlite.Row
            async with conn.execute(lookup_sql, (job_id,)) as cur:
                record = await cur.fetchone()
                if not record:
                    return None
                return self._row_to_job(record)

    async def get_queue_position(self, job_id: str) -> Optional[int]:
        """
        Report where a pending job stands in the queue.

        The position is one more than the number of pending jobs whose
        created_at is earlier than this job's.

        Args:
            job_id: Job UUID

        Returns:
            Queue position (1 = next to process) or None if the job does not
            exist or is not pending
        """
        await self.initialize()

        async with aiosqlite.connect(str(self.db_path)) as conn:
            await conn.execute(f"PRAGMA busy_timeout={SQLITE_BUSY_TIMEOUT_MS}")

            # Step 1: the job must exist and still be pending.
            async with conn.execute(
                'SELECT status, created_at FROM ocr_jobs WHERE id = ?',
                (job_id,)
            ) as cur:
                found = await cur.fetchone()

            if not found:
                return None
            if found[0] != OCRJobStatus.pending.value:
                return None
            enqueued_at = found[1]

            # Step 2: count the pending jobs that were created earlier.
            async with conn.execute('''
                SELECT COUNT(*) FROM ocr_jobs
                WHERE status = 'pending' AND created_at < ?
            ''', (enqueued_at,)) as cur:
                ahead = await cur.fetchone()

            if not ahead:
                return 1
            return ahead[0] + 1

    async def get_next_pending(self) -> Optional[OCRJob]:
        """
        Claim the next pending job (oldest first) by marking it as processing.

        Claiming is a SELECT of the oldest pending id followed by an UPDATE
        guarded with ``status = 'pending'``. ``self._lock`` serialises callers
        that share this queue instance; the UPDATE's row count guards against
        everything the lock cannot see (another process on the same database,
        or a status change made through ``update_status``). A job is only
        returned when this call's UPDATE actually changed its row; otherwise
        the next pending candidate is tried.

        Returns:
            The claimed OCRJob (status ``processing``), or None if no pending
            job could be claimed
        """
        await self.initialize()

        async with self._lock:  # Serialize access to prevent race conditions
            async with aiosqlite.connect(str(self.db_path)) as db:
                await db.execute(f"PRAGMA busy_timeout={SQLITE_BUSY_TIMEOUT_MS}")
                db.row_factory = aiosqlite.Row

                while True:
                    # Pick the oldest pending candidate
                    async with db.execute('''
                        SELECT id FROM ocr_jobs
                        WHERE status = 'pending'
                        ORDER BY created_at ASC
                        LIMIT 1
                    ''') as cursor:
                        candidate = await cursor.fetchone()

                    if not candidate:
                        return None
                    job_id = candidate['id']

                    # Try to claim it. The timestamp is taken here (not before
                    # waiting for the lock) so started_at reflects the claim.
                    claim = await db.execute('''
                        UPDATE ocr_jobs
                        SET status = 'processing', started_at = ?
                        WHERE id = ? AND status = 'pending'
                    ''', (datetime.utcnow().isoformat(), job_id))
                    claimed = claim.rowcount > 0
                    await db.commit()

                    if not claimed:
                        # The row stopped being pending between SELECT and
                        # UPDATE (claimed elsewhere, cancelled, deleted).
                        # Returning it would process it twice or against the
                        # caller's wishes, so look for another candidate.
                        continue

                    # Fetch the updated job
                    async with db.execute(
                        'SELECT * FROM ocr_jobs WHERE id = ?',
                        (job_id,)
                    ) as cursor:
                        updated_row = await cursor.fetchone()

                    if updated_row:
                        return self._row_to_job(updated_row)
                    return None

    async def update_status(
        self,
        job_id: str,
        status: OCRJobStatus,
        result: Optional[Dict] = None,
        error: Optional[str] = None,
        processing_time_ms: Optional[int] = None,
        ocr_time_ms: Optional[int] = None
    ) -> bool:
        """
        Update job status.

        Which columns are written depends on the new status:

        - ``processing``: status and started_at
        - ``completed``: status, completed_at, result_json and both timings
        - ``failed``: status, completed_at, error_message and both timings
        - anything else: status only

        Args:
            job_id: Job UUID
            status: New status
            result: Extraction result dict (for completed). An empty dict is
                stored as ``{}``; only ``None`` is stored as NULL.
            error: Error message (for failed)
            processing_time_ms: Total job processing time (started_at to completed_at)
            ocr_time_ms: Actual OCR engine processing time

        Returns:
            True if a row was updated, False if no job has that id
        """
        await self.initialize()

        timestamp = datetime.utcnow().isoformat()
        # Compare against None explicitly: a truthiness test would silently
        # drop a legitimate empty result dict.
        result_json = json.dumps(result, cls=DecimalEncoder) if result is not None else None

        # Build update query based on status
        if status == OCRJobStatus.processing:
            query = '''
                UPDATE ocr_jobs
                SET status = ?, started_at = ?
                WHERE id = ?
            '''
            params = (status.value, timestamp, job_id)

        elif status == OCRJobStatus.completed:
            query = '''
                UPDATE ocr_jobs
                SET status = ?, completed_at = ?, result_json = ?, processing_time_ms = ?, ocr_time_ms = ?
                WHERE id = ?
            '''
            params = (status.value, timestamp, result_json, processing_time_ms, ocr_time_ms, job_id)

        elif status == OCRJobStatus.failed:
            query = '''
                UPDATE ocr_jobs
                SET status = ?, completed_at = ?, error_message = ?, processing_time_ms = ?, ocr_time_ms = ?
                WHERE id = ?
            '''
            params = (status.value, timestamp, error, processing_time_ms, ocr_time_ms, job_id)

        else:
            query = 'UPDATE ocr_jobs SET status = ? WHERE id = ?'
            params = (status.value, job_id)

        async with aiosqlite.connect(str(self.db_path)) as db:
            await db.execute(f"PRAGMA busy_timeout={SQLITE_BUSY_TIMEOUT_MS}")
            cursor = await db.execute(query, params)
            await db.commit()
            return cursor.rowcount > 0

    async def get_average_processing_time(self) -> float:
        """
        Average processing time of the most recent completed jobs.

        Averages processing_time_ms over up to 50 completed jobs (newest
        completed_at first) that have a recorded processing time.

        Returns:
            Average time in seconds; 7.0 when the query yields no usable
            average (no data, or an average of zero)
        """
        await self.initialize()

        fallback_seconds = 7.0  # Default estimate

        async with aiosqlite.connect(str(self.db_path)) as conn:
            await conn.execute(f"PRAGMA busy_timeout={SQLITE_BUSY_TIMEOUT_MS}")
            async with conn.execute('''
                SELECT AVG(processing_time_ms)
                FROM (
                    SELECT processing_time_ms FROM ocr_jobs
                    WHERE status = 'completed' AND processing_time_ms IS NOT NULL
                    ORDER BY completed_at DESC
                    LIMIT 50
                )
            ''') as cur:
                record = await cur.fetchone()

        average_ms = record[0] if record else None
        if not average_ms:
            return fallback_seconds
        return average_ms / 1000.0  # ms -> seconds

    async def count_pending(self) -> int:
        """Return how many jobs currently have status ``pending``."""
        await self.initialize()

        count_sql = 'SELECT COUNT(*) FROM ocr_jobs WHERE status = ?'
        count_args = (OCRJobStatus.pending.value,)

        async with aiosqlite.connect(str(self.db_path)) as conn:
            await conn.execute(f"PRAGMA busy_timeout={SQLITE_BUSY_TIMEOUT_MS}")
            async with conn.execute(count_sql, count_args) as cur:
                record = await cur.fetchone()

        if not record:
            return 0
        return record[0]

    async def count_processing(self) -> int:
        """Return how many jobs currently have status ``processing``."""
        await self.initialize()

        count_sql = 'SELECT COUNT(*) FROM ocr_jobs WHERE status = ?'
        count_args = (OCRJobStatus.processing.value,)

        async with aiosqlite.connect(str(self.db_path)) as conn:
            await conn.execute(f"PRAGMA busy_timeout={SQLITE_BUSY_TIMEOUT_MS}")
            async with conn.execute(count_sql, count_args) as cur:
                record = await cur.fetchone()

        if not record:
            return 0
        return record[0]

    async def cleanup_expired(self) -> int:
        """
        Remove every job whose expires_at lies in the past, plus its file.

        A file that cannot be deleted is logged as a warning; the job record
        is removed regardless.

        Returns:
            Number of jobs deleted
        """
        await self.initialize()

        cutoff = datetime.utcnow().isoformat()

        async with aiosqlite.connect(str(self.db_path)) as conn:
            await conn.execute(f"PRAGMA busy_timeout={SQLITE_BUSY_TIMEOUT_MS}")
            conn.row_factory = aiosqlite.Row

            # Collect the expired jobs first, then remove them one by one.
            async with conn.execute('''
                SELECT id, file_path FROM ocr_jobs
                WHERE expires_at < ?
            ''', (cutoff,)) as cur:
                expired = await cur.fetchall()

            for record in expired:
                upload = Path(record['file_path'])
                if upload.exists():
                    try:
                        upload.unlink()
                    except Exception as e:
                        logger.warning(f"[OCRJobQueue] Failed to delete file {upload}: {e}")

                await conn.execute('DELETE FROM ocr_jobs WHERE id = ?', (record['id'],))

            # All deletions are committed together.
            await conn.commit()

        removed = len(expired)
        if removed > 0:
            logger.info(f"[OCRJobQueue] Cleaned up {removed} expired job(s)")

        return removed

    async def cleanup_job_file(self, job_id: str) -> bool:
        """
        Remove the file stored for a job, leaving the job record in place.

        Intended to be called after processing to free disk space.

        Args:
            job_id: Job UUID

        Returns:
            True if a file was deleted; False when the job is unknown, the
            file is already gone, or deletion failed (failure is logged)
        """
        job = await self.get_job(job_id)
        if not job:
            return False

        target = Path(job.file_path)
        if not target.exists():
            return False

        try:
            target.unlink()
        except Exception as e:
            logger.warning(f"[OCRJobQueue] Failed to delete file {target}: {e}")
            return False
        return True

    async def get_queue_stats(self) -> Dict[str, Any]:
        """
        Summarise the queue.

        Returns:
            Dict with ``pending``, ``processing``, ``completed`` and ``failed``
            job counts plus ``average_time_seconds``. Jobs in any other status
            are not counted.
        """
        await self.initialize()

        # Start every reported status at zero so absent statuses still appear.
        stats: Dict[str, Any] = dict.fromkeys(
            ("pending", "processing", "completed", "failed"), 0
        )
        stats["average_time_seconds"] = 0.0

        async with aiosqlite.connect(str(self.db_path)) as conn:
            await conn.execute(f"PRAGMA busy_timeout={SQLITE_BUSY_TIMEOUT_MS}")
            async with conn.execute('''
                SELECT status, COUNT(*) as count
                FROM ocr_jobs
                GROUP BY status
            ''') as cur:
                for record in await cur.fetchall():
                    status_name = record[0]
                    if status_name not in stats:
                        continue  # status that is not part of the report
                    stats[status_name] = record[1]

        stats["average_time_seconds"] = await self.get_average_processing_time()
        return stats

    def _row_to_job(self, row: aiosqlite.Row) -> OCRJob:
        """Build an OCRJob from a database row with named column access.

        Timestamp columns are parsed from ISO-8601 text; empty or unparseable
        values become None. The ocr_time_ms, batch_id and file_hash columns
        are treated as optional and default to None when the row lacks them.
        """
        available = row.keys()

        def as_datetime(raw):
            # Empty/NULL and malformed values both map to None.
            if not raw:
                return None
            try:
                return datetime.fromisoformat(raw)
            except (ValueError, TypeError):
                return None

        def optional_column(name):
            # Columns added by later migrations may be absent from the row.
            return row[name] if name in available else None

        return OCRJob(
            id=row['id'],
            status=OCRJobStatus(row['status']),
            file_path=row['file_path'],
            mime_type=row['mime_type'],
            engine=row['engine'] or 'doctr_plus',
            created_at=as_datetime(row['created_at']),
            started_at=as_datetime(row['started_at']),
            completed_at=as_datetime(row['completed_at']),
            result_json=row['result_json'],
            error_message=row['error_message'],
            processing_time_ms=row['processing_time_ms'],
            ocr_time_ms=optional_column('ocr_time_ms'),
            created_by=row['created_by'],
            original_filename=row['original_filename'],
            expires_at=as_datetime(row['expires_at']),
            batch_id=optional_column('batch_id'),
            file_hash=optional_column('file_hash'),
        )


# Singleton instance
# Built with the default database and files paths. Constructing it performs no
# I/O: directories and the database are only created on the first call to
# initialize(), which the public query methods invoke before touching the database.
job_queue = OCRJobQueue()