feat(telegram): bot bonuri fiscale — OCR → preview → Oracle write
- US-001: mută queue_client.py în data_entry/services/ocr/ - US-002/003/004: oracle_receipt_writer + oracle_server_id în DB - US-005: receipt_handlers.py (PDF/photo/callback flow) - US-006: wire handlers în main.py, per-schema connect, seq_cod.nextval - US-007: .gitignore secrets/*.oracle_pass - US-008/009/010: teste unit + integration + E2E - setup-secrets.sh helper + template - docs/telegram/README.md actualizat cu arhitectura nouă Testat E2E pe DB live (MARIUSM_AUTO). COD din seq_cod.nextval. pypdfium2 fallback pentru PDF decode (fără poppler). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -257,18 +257,31 @@ def _decode_image(image_bytes: bytes) -> Optional[np.ndarray]:
|
||||
# Try as PDF - use 200 DPI for faster processing (sufficient for receipts)
|
||||
try:
|
||||
import pdf2image
|
||||
from PIL import Image
|
||||
|
||||
# 200 DPI is sufficient for receipt text recognition
|
||||
# 300 DPI was overkill and slowed down processing
|
||||
images = pdf2image.convert_from_bytes(image_bytes, dpi=200)
|
||||
if images:
|
||||
# Convert first page to numpy array
|
||||
pil_img = images[0]
|
||||
print(f"[Worker {os.getpid()}] PDF decoded: {pil_img.width}x{pil_img.height} @ 200 DPI", flush=True)
|
||||
print(f"[Worker {os.getpid()}] PDF decoded (poppler): {pil_img.width}x{pil_img.height} @ 200 DPI", flush=True)
|
||||
return np.array(pil_img)
|
||||
except Exception as e:
|
||||
print(f"[Worker {os.getpid()}] PDF decode error: {e}", flush=True)
|
||||
# pdf2image needs poppler (pdftoppm/pdfinfo) on PATH; fall back to pypdfium2.
|
||||
print(f"[Worker {os.getpid()}] pdf2image unavailable ({e}); trying pypdfium2 fallback...", flush=True)
|
||||
try:
|
||||
import pypdfium2 as pdfium
|
||||
|
||||
pdf = pdfium.PdfDocument(image_bytes)
|
||||
if len(pdf) > 0:
|
||||
page = pdf[0]
|
||||
pil_img = page.render(scale=200 / 72).to_pil() # scale = DPI / 72
|
||||
arr = np.array(pil_img)
|
||||
if arr.ndim == 3 and arr.shape[2] == 4:
|
||||
arr = arr[:, :, :3] # drop alpha
|
||||
print(f"[Worker {os.getpid()}] PDF decoded (pypdfium2): {pil_img.width}x{pil_img.height} @ 200 DPI", flush=True)
|
||||
return arr
|
||||
except Exception as e2:
|
||||
print(f"[Worker {os.getpid()}] pypdfium2 also failed: {e2}", flush=True)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
104
backend/modules/data_entry/services/ocr/queue_client.py
Normal file
104
backend/modules/data_entry/services/ocr/queue_client.py
Normal file
@@ -0,0 +1,104 @@
|
||||
"""
|
||||
Client pentru OCR API roa2web - adaugă job direct în SQLite queue.
|
||||
Folosește aceeași coadă ca backend-ul, fără HTTP auth.
|
||||
"""
|
||||
import asyncio
|
||||
import json
|
||||
import shutil
|
||||
import sys
|
||||
import uuid
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
QUEUE_DIR = Path(__file__).parents[4] / "data" / "ocr_queue"
|
||||
DB_PATH = QUEUE_DIR / "ocr_jobs.db"
|
||||
FILES_DIR = QUEUE_DIR / "files"
|
||||
|
||||
|
||||
async def submit_ocr_job(file_path: Path, engine: str = "doctr_plus") -> str:
|
||||
"""Submit OCR job to queue, return job_id."""
|
||||
import aiosqlite
|
||||
|
||||
job_id = str(uuid.uuid4())
|
||||
|
||||
FILES_DIR.mkdir(parents=True, exist_ok=True)
|
||||
dest_path = FILES_DIR / f"{job_id}_{file_path.name}"
|
||||
shutil.copy(file_path, dest_path)
|
||||
|
||||
mime_type = "application/pdf" if file_path.suffix.lower() == ".pdf" else "image/jpeg"
|
||||
|
||||
async with aiosqlite.connect(str(DB_PATH), timeout=5.0) as db:
|
||||
await db.execute("""
|
||||
INSERT INTO ocr_jobs (
|
||||
id, status, file_path, mime_type, engine,
|
||||
created_at, original_filename, expires_at
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""", (
|
||||
job_id, "pending", str(dest_path), mime_type, engine,
|
||||
datetime.now().isoformat(), file_path.name,
|
||||
(datetime.now() + timedelta(hours=24)).isoformat()
|
||||
))
|
||||
await db.commit()
|
||||
|
||||
return job_id
|
||||
|
||||
|
||||
async def wait_for_result(job_id: str, timeout: int = 120) -> dict:
|
||||
"""Wait for job completion and return result."""
|
||||
import aiosqlite
|
||||
|
||||
start = datetime.now()
|
||||
while (datetime.now() - start).seconds < timeout:
|
||||
async with aiosqlite.connect(str(DB_PATH), timeout=5.0) as db:
|
||||
db.row_factory = aiosqlite.Row
|
||||
async with db.execute(
|
||||
"SELECT status, result_json, error_message, processing_time_ms FROM ocr_jobs WHERE id = ?",
|
||||
(job_id,)
|
||||
) as cursor:
|
||||
row = await cursor.fetchone()
|
||||
if row and row["status"] == "completed":
|
||||
return {
|
||||
"success": True,
|
||||
"result": json.loads(row["result_json"]) if row["result_json"] else None,
|
||||
"time_ms": row["processing_time_ms"]
|
||||
}
|
||||
if row and row["status"] == "failed":
|
||||
return {"success": False, "error": row["error_message"]}
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
return {"success": False, "error": "Timeout"}
|
||||
|
||||
|
||||
async def process_file(file_path: Path):
|
||||
"""Process file through OCR queue."""
|
||||
print(f"[OCR Queue] Submitting: {file_path.name}")
|
||||
job_id = await submit_ocr_job(file_path)
|
||||
print(f"[OCR Queue] Job ID: {job_id}")
|
||||
print(f"[OCR Queue] Waiting for result...")
|
||||
|
||||
result = await wait_for_result(job_id)
|
||||
|
||||
if result["success"]:
|
||||
r = result["result"]
|
||||
print(f"\n✅ OCR Complete ({result['time_ms']}ms)")
|
||||
print(f" CUI: {r.get('cui')}")
|
||||
print(f" Data: {r.get('receipt_date')}")
|
||||
print(f" Total: {r.get('amount')}")
|
||||
print(f" TVA: {r.get('tva_total')}")
|
||||
return r
|
||||
else:
|
||||
print(f"\n❌ Error: {result['error']}")
|
||||
return None
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) < 2:
|
||||
print("Usage: python queue_client.py <file_path>")
|
||||
sys.exit(1)
|
||||
|
||||
file_path = Path(sys.argv[1])
|
||||
if not file_path.exists():
|
||||
print(f"File not found: {file_path}")
|
||||
sys.exit(1)
|
||||
|
||||
asyncio.run(process_file(file_path))
|
||||
Reference in New Issue
Block a user