- OCR client for SQLite queue - WhatsApp flow: PDF -> OCR -> SQLite -> Oracle - PACK_CONTAFIN integration for Oracle save - README with flux documentation
109 lines
3.7 KiB
Python
109 lines
3.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Client pentru OCR API roa2web - adaugă job direct în SQLite queue.
|
|
Folosește aceeași coadă ca backend-ul, fără HTTP auth.
|
|
"""
|
|
import asyncio
|
|
import json
|
|
import shutil
|
|
import sys
|
|
import uuid
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
|
|
# Paths
# Filesystem layout of the OCR job queue: job rows live in a SQLite database
# and the submitted documents are copied into a sibling directory.
QUEUE_DIR = Path("/workspace/roa2web/backend/data/ocr_queue")
DB_PATH = QUEUE_DIR / "ocr_jobs.db"  # SQLite file containing the ocr_jobs table
FILES_DIR = QUEUE_DIR / "files"  # copies of the documents submitted for OCR
|
|
|
|
async def submit_ocr_job(file_path: Path, engine: str = "doctr_plus") -> str:
    """Submit a file to the OCR queue and return the new job id.

    The file is copied into ``FILES_DIR`` under a name prefixed with the job
    id, then a ``pending`` row is inserted into the ``ocr_jobs`` table of the
    SQLite database at ``DB_PATH``. The row's ``expires_at`` is set to exactly
    24 hours after its ``created_at``.

    Args:
        file_path: Document to submit. A ``.pdf`` suffix (case-insensitive) is
            recorded as ``application/pdf``; any other suffix is recorded as
            ``image/jpeg``.
        engine: Value stored in the ``engine`` column of the job row.

    Returns:
        The generated job id (a UUID4 string).

    Raises:
        OSError: If the file cannot be copied into the queue directory.
        Exception: Any error raised while writing the job row propagates
            unchanged; the copied file is removed first so that no orphan is
            left behind in ``FILES_DIR``.
    """
    import aiosqlite

    job_id = str(uuid.uuid4())

    # Copy file to queue
    FILES_DIR.mkdir(parents=True, exist_ok=True)
    dest_path = FILES_DIR / f"{job_id}_{file_path.name}"
    shutil.copy(file_path, dest_path)

    # Determine mime type
    mime_type = "application/pdf" if file_path.suffix.lower() == ".pdf" else "image/jpeg"

    # Take a single timestamp so expires_at is exactly created_at + 24h.
    created_at = datetime.now()
    expires_at = created_at + timedelta(hours=24)

    committed = False
    try:
        async with aiosqlite.connect(str(DB_PATH), timeout=5.0) as db:
            await db.execute(
                """
                INSERT INTO ocr_jobs (
                    id, status, file_path, mime_type, engine,
                    created_at, original_filename, expires_at
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    job_id, "pending", str(dest_path), mime_type, engine,
                    created_at.isoformat(), file_path.name,
                    expires_at.isoformat(),
                ),
            )
            await db.commit()
            committed = True
    finally:
        # Without a committed row nothing references the copy, so remove it.
        # Once the commit succeeded the file must stay, even if closing the
        # connection raises afterwards.
        if not committed:
            try:
                dest_path.unlink()
            except FileNotFoundError:
                pass

    return job_id
|
|
|
|
async def wait_for_result(job_id: str, timeout: int = 120) -> dict:
    """Poll the queue database until the job finishes or the timeout elapses.

    The ``ocr_jobs`` row for ``job_id`` is read every 0.5 seconds through a
    fresh connection to ``DB_PATH``.

    Args:
        job_id: Id of the job to wait for.
        timeout: Maximum number of seconds to keep polling.

    Returns:
        ``{"success": True, "result": ..., "time_ms": ...}`` when the job
        status is ``completed`` (``result`` is the decoded ``result_json``, or
        ``None`` when that column is empty);
        ``{"success": False, "error": <error_message>}`` when the status is
        ``failed``; ``{"success": False, "error": "Timeout"}`` when the
        deadline passes first.
    """
    import aiosqlite

    # Measure the deadline on the event loop's monotonic clock. The previous
    # ``(datetime.now() - start).seconds`` only looked at the seconds
    # component of the delta and followed wall-clock jumps (DST, NTP steps).
    loop = asyncio.get_running_loop()
    deadline = loop.time() + timeout

    while loop.time() < deadline:
        async with aiosqlite.connect(str(DB_PATH), timeout=5.0) as db:
            db.row_factory = aiosqlite.Row
            async with db.execute(
                "SELECT status, result_json, error_message, processing_time_ms FROM ocr_jobs WHERE id = ?",
                (job_id,)
            ) as cursor:
                row = await cursor.fetchone()
                if row is not None:
                    status = row["status"]
                    if status == "completed":
                        raw_result = row["result_json"]
                        return {
                            "success": True,
                            "result": json.loads(raw_result) if raw_result else None,
                            "time_ms": row["processing_time_ms"],
                        }
                    if status == "failed":
                        return {
                            "success": False,
                            "error": row["error_message"],
                        }
        # Job is missing or still in progress: wait before the next poll.
        await asyncio.sleep(0.5)

    return {"success": False, "error": "Timeout"}
|
|
|
|
async def process_file(file_path: Path):
    """Run a file through the OCR queue and print a summary of the outcome.

    Submits the file with ``submit_ocr_job``, waits for it with
    ``wait_for_result`` and prints progress and the extracted fields to
    stdout.

    Args:
        file_path: Document to submit.

    Returns:
        The decoded OCR result dict on success (an empty dict when the job
        completed without a stored result), or ``None`` when the job failed
        or timed out.
    """
    print(f"[OCR Queue] Submitting: {file_path.name}")
    job_id = await submit_ocr_job(file_path)
    print(f"[OCR Queue] Job ID: {job_id}")
    print("[OCR Queue] Waiting for result...")

    result = await wait_for_result(job_id)

    if not result["success"]:
        print(f"\n❌ Error: {result['error']}")
        return None

    # A completed job may carry no result payload (None); fall back to an
    # empty dict so the summary prints instead of raising AttributeError.
    r = result["result"] or {}
    print(f"\n✅ OCR Complete ({result['time_ms']}ms)")
    print(f"   CUI: {r.get('cui')}")
    print(f"   Data: {r.get('receipt_date')}")
    print(f"   Total: {r.get('amount')}")
    print(f"   TVA: {r.get('tva_total')}")
    return r
|
|
|
|
# Command-line entry point: submit the file named by the first argument and
# print the OCR summary.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python roa2web_api_client.py <file_path>")
        sys.exit(1)

    file_path = Path(sys.argv[1])
    # is_file() rather than exists(): a directory would pass exists() and
    # only fail later, with a traceback, when it is copied into the queue.
    if not file_path.is_file():
        print(f"File not found: {file_path}")
        sys.exit(1)

    asyncio.run(process_file(file_path))
|