chore: working-tree state — anaf snapshots, cron state, KB notes, tools

Pre-existing uncommitted changes swept in with the STT work: anaf-monitor snapshots/versions, cron job and newsletter state, 9 YouTube KB notes, tools/ocr_bon.py, and tools/tts.py. Note: the tts.py change breaks 2 truncation tests in test_voice_normalize.py (sanitize word-count); this is flagged for a separate follow-up.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-27 18:16:31 +00:00
parent ce273d14db
commit d175d5ba5a
17 changed files with 840 additions and 41 deletions
--- a/tools/ocr_bon.py
+++ b/tools/ocr_bon.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+"""
+Extract data from a fiscal receipt / invoice / bank statement via an Ollama vision LLM.
+
+Usage:
+    python tools/ocr_bon.py <pdf_or_image> [--model minicpm-v] [--host http://10.0.20.161:11434] [--pretty]
+
+Recommended models (pull with: ollama pull <model>):
+    minicpm-v          ~5GB, fast, excellent for documents
+    llava:7b           ~4GB, classic, good on receipts
+    llama3.2-vision    ~8GB, most accurate
+"""
+
+import sys
+import json
+import base64
+import argparse
+import tempfile
+from pathlib import Path
+
+import httpx
+import fitz  # pymupdf
+
+# Default base URL of the Ollama server; can be overridden with --host.
+OLLAMA_HOST = "http://10.0.20.161:11434"
+# Default vision model name; can be overridden with --model.
+DEFAULT_MODEL = "minicpm-v"
+
+# Instruction prompt sent with every request by ask_ollama(). It is written in
+# Romanian and asks the model to return a single JSON object that follows the
+# schema below, omitting fields that are absent from the document. This is
+# runtime text: do not translate or reformat it.
+PROMPT = """Ești un sistem OCR specializat pe documente financiare românești.
+Extrage TOATE datele vizibile și returnează EXCLUSIV un JSON valid, fără text în afara JSON-ului.
+
+Schema JSON:
+{
+  "document_type": "bon_fiscal|factura|extras_cont|necunoscut",
+  "vendor": {
+    "name": "...",
+    "cui": "...",
+    "address": "..."
+  },
+  "client": {
+    "name": "...",
+    "cui": "..."
+  },
+  "document": {
+    "numar": "...",
+    "serie": "...",
+    "data": "YYYY-MM-DD",
+    "ora": "HH:MM:SS"
+  },
+  "items": [
+    {
+      "descriere": "...",
+      "cantitate": 0.0,
+      "unitate": "buc|l|kg|...",
+      "pret_unitar": 0.0,
+      "valoare": 0.0
+    }
+  ],
+  "tva": [
+    {
+      "cota_litera": "A|B|C",
+      "procent": 21.0,
+      "baza": 0.0,
+      "valoare_tva": 0.0
+    }
+  ],
+  "total_fara_tva": 0.0,
+  "total_tva": 0.0,
+  "total": 0.0,
+  "plata": {
+    "metoda": "card|numerar|transfer",
+    "tip_card": "...",
+    "pan_masked": "...",
+    "suma": 0.0
+  },
+  "note": "orice info suplimentar relevant"
+}
+
+Omite câmpurile care nu există în document. Returnează DOAR JSON."""
+
+
+def pdf_to_images_b64(pdf_path: Path, dpi: int = 150) -> list[str]:
+    """Render each page of a PDF to a PNG and return the pages base64-encoded.
+
+    Args:
+        pdf_path: Path of the PDF file to open.
+        dpi: Target rendering resolution. Each page is scaled by
+            ``dpi / 72`` on both axes (PyMuPDF renders at 72 dpi when no
+            matrix is given).
+
+    Returns:
+        One base64 string per page, in page order.
+    """
+    doc = fitz.open(str(pdf_path))
+    try:
+        zoom = fitz.Matrix(dpi / 72, dpi / 72)
+        return [
+            base64.b64encode(page.get_pixmap(matrix=zoom).tobytes("png")).decode()
+            for page in doc
+        ]
+    finally:
+        # Close the document even when rendering a page raises, so the
+        # underlying file handle is not leaked.
+        doc.close()
+
+
+def image_to_b64(img_path: Path) -> str:
+    """Return the raw bytes of the file at *img_path* as a base64 string."""
+    raw = img_path.read_bytes()
+    encoded = base64.b64encode(raw)
+    return encoded.decode()
+
+
+def ask_ollama(model: str, images_b64: list[str], host: str) -> str:
+    """Send the images together with PROMPT to Ollama and return its reply.
+
+    Args:
+        model: Name of the Ollama model to run.
+        images_b64: Base64-encoded images attached to the request.
+        host: Base URL of the Ollama server; trailing slashes are tolerated.
+
+    Returns:
+        The ``response`` field of the JSON body returned by ``/api/generate``.
+
+    Raises:
+        httpx.HTTPStatusError: If the server answers with an error status.
+    """
+    payload = {
+        "model": model,
+        "prompt": PROMPT,
+        "images": images_b64,
+        "stream": False,
+        "options": {
+            "temperature": 0.1,
+            "num_predict": 2048,
+        },
+    }
+
+    # Strip trailing slashes so a host such as "http://host:11434/" does not
+    # produce a "//api/generate" path.
+    url = f"{host.rstrip('/')}/api/generate"
+
+    # Long timeout (600 s) kept from the original; presumably because vision
+    # inference on multi-page documents can be slow.
+    with httpx.Client(timeout=600) as client:
+        r = client.post(url, json=payload)
+        r.raise_for_status()
+        return r.json()["response"]
+
+
+def extract_json(text: str) -> dict:
+    """Parse the JSON object contained in a model reply.
+
+    The reply may be bare JSON, JSON wrapped in a markdown code fence, or
+    JSON surrounded by explanatory prose.
+
+    Args:
+        text: Raw text returned by the model.
+
+    Returns:
+        The decoded JSON value.
+
+    Raises:
+        json.JSONDecodeError: If no parseable JSON object is found.
+    """
+    text = text.strip()
+
+    # Fast path: the reply is already valid JSON.
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        pass
+
+    # Otherwise take the span from the first "{" (after the opening code
+    # fence, when there is one) up to the last "}".
+    fence = text.find("```")
+    start = text.find("{", fence if fence != -1 else 0)
+    end = text.rfind("}")
+    if start == -1 or end < start:
+        raise json.JSONDecodeError("No JSON object found in model response", text, 0)
+    return json.loads(text[start:end + 1])
+
+
+def process(file_path: Path, model: str, host: str) -> dict:
+    """Run one document through the vision model and return the parsed result.
+
+    The file is treated as a PDF when its extension is ``.pdf``, or when the
+    extension is not a known image type and the content starts with the
+    ``%PDF`` magic number. Anything else is sent as a single image.
+
+    Args:
+        file_path: Document to process (PDF or image).
+        model: Name of the Ollama model to run.
+        host: Base URL of the Ollama server.
+
+    Returns:
+        The JSON value extracted from the model reply, or
+        ``{"error": "json_parse_failed", "raw": <reply>}`` when the reply
+        cannot be parsed as JSON.
+    """
+    suffix = file_path.suffix.lower()
+
+    if suffix == ".pdf":
+        is_pdf = True
+    elif suffix in (".jpg", ".jpeg", ".png", ".webp"):
+        is_pdf = False
+    else:
+        # Unknown extension: detect the type from the content. Only the first
+        # four bytes are needed, so the whole file is not loaded for this.
+        with file_path.open("rb") as fh:
+            is_pdf = fh.read(4) == b"%PDF"
+
+    if is_pdf:
+        print("  Conversie PDF → imagini...", file=sys.stderr)
+        images = pdf_to_images_b64(file_path)
+        print(f"  {len(images)} pagini extrase", file=sys.stderr)
+    else:
+        images = [image_to_b64(file_path)]
+
+    print(f"  Trimit la Ollama ({model})...", file=sys.stderr)
+    raw = ask_ollama(model, images, host)
+
+    try:
+        return extract_json(raw)
+    except json.JSONDecodeError:
+        # Keep the raw reply so the caller can inspect what the model said.
+        print(f"  Răspuns brut (nu e JSON valid):\n{raw}", file=sys.stderr)
+        return {"error": "json_parse_failed", "raw": raw}
+
+
+def main():
+    """Command-line entry point: parse arguments, process the file, print JSON.
+
+    Progress and error messages go to stderr; the JSON result goes to
+    stdout. Exits with status 1 when the input path does not exist.
+    """
+    cli = argparse.ArgumentParser(description="OCR bon fiscal via Ollama vision")
+    cli.add_argument("file", help="PDF sau imagine (jpg/png)")
+    cli.add_argument(
+        "--model",
+        default=DEFAULT_MODEL,
+        help=f"Model Ollama (default: {DEFAULT_MODEL})",
+    )
+    cli.add_argument(
+        "--host",
+        default=OLLAMA_HOST,
+        help=f"Ollama host (default: {OLLAMA_HOST})",
+    )
+    cli.add_argument("--pretty", action="store_true", help="JSON indentat")
+    opts = cli.parse_args()
+
+    source = Path(opts.file)
+    if not source.exists():
+        print(f"Fișierul nu există: {source}", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Procesez: {source.name}", file=sys.stderr)
+    extracted = process(source, opts.model, opts.host)
+
+    # Indent the output only when --pretty was given.
+    if opts.pretty:
+        indent = 2
+    else:
+        indent = None
+    print(json.dumps(extracted, ensure_ascii=False, indent=indent))
+
+
+# Run the CLI only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()