chore: working-tree state — anaf snapshots, cron state, KB notes, tools

Pre-existing uncommitted changes swept in with the STT work: anaf-monitor snapshots/versions, cron job + newsletter state, 9 youtube KB notes, tools/ocr_bon.py, and tools/tts.py.

Note: the tts.py change breaks 2 truncation tests in test_voice_normalize.py (sanitize word-count) — flagged for a separate follow-up.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-27 18:16:31 +00:00
parent ce273d14db
commit d175d5ba5a
17 changed files with 840 additions and 41 deletions
--- a/tools/anaf-monitor/hashes.json
+++ b/tools/anaf-monitor/hashes.json
@@ -1,12 +1,12 @@
 {
-  "D100": "27cf97a4d10c8529669d95b2d96ca3c9b41f7e4e50091dce19cf8af117f0ac4a",
+  "D100": "ce27f72bc3fd5e3241480fcda3a3e14572cfbed1b26e43896037bb265d82e4f5",
  "D101": "f72fc1c29657ea11e0238806a28f6abccf5b00e45904e1e0c9385cc64491fcaf",
  "D300": "cb7b55b568ab893024884971eac0367fb6fe487c297e355d64258dae437f6ddd",
  "D394": "c4c4e62bda30032f12c17edf9a5087b6173a350ccb1fd750158978b3bd0acb7d",
  "D406": "ca6103448d663ab16fcaef0f29f8933ef526cbf5aad12c7ff5dbd61b22ca9fc6",
  "SIT_FIN_SEM_2025": "8164843431e6b703a38fbdedc7898ec6ae83559fe10f88663ba0b55f3091d5fe",
  "SIT_FIN_AN_2025": "accceef5b6585a3e901d83d23fc2e60f6562eac4a2ce00f943856232bed929d6",
-  "DESCARCARE_DECLARATII": "8cc082021edb0ae97686d73f8179369be33a68ef03ec791757460bb7fff99e34",
+  "DESCARCARE_DECLARATII": "b2a9534d4f64b828abdb97459b92be27ba26a0d9ba1a0f947ef4a37c968ef293",
  "D205": "d3c20a7ae70f4c18bbb7add42af035e3746d323b2e6df37a4e31ed625ddb86d9",
  "D390": "4726938ed5858ec735caefd947a7d182b6dc64009478332c4feabdb36412a84e",
  "BILANT_2024": "fbb8d66c2e530d8798362992c6983e07e1250188228c758cb6da4cde4f955950",
--- a/tools/anaf-monitor/snapshots/D100.txt
+++ b/tools/anaf-monitor/snapshots/D100.txt
@@ -62,14 +62,14 @@ valabil începand cu
 		01/2024 - publicat în data de 09.02.2024
 soft A
 actualizat în data de
-29.05.2026
+17.06.2026
 soft J*
 actualizat în data de
 25.05.2026
 Anexa
 validări
 actualizat în data de
-20.05.2026
+17.06.2026
 Schema
 XSD
 100
--- a/tools/anaf-monitor/snapshots/DESCARCARE_DECLARATII.txt
+++ b/tools/anaf-monitor/snapshots/DESCARCARE_DECLARATII.txt
@@ -51,7 +51,7 @@ Se transmit prin SPV
 F1129
 -  Ordinul de plată multiplu electronic (OPME) V.2.0.45 dată 
 			actualizare
-25.11.2025
+22.06.2026
 Formularul se depune on-line prin Sistemul naţional de raportare
 FOREXEBUG
 de către instituţiile publice şi, respectiv, prin portalul
@@ -273,6 +273,10 @@ D182
 ,
 408
 ,
+409
+,
+410
+,
 700
 ,
 710,
--- a/tools/anaf-monitor/versions.json
+++ b/tools/anaf-monitor/versions.json
@@ -1,7 +1,7 @@
 {
  "D100": {
-    "soft_a_url": "http://static.anaf.ro/static/10/Anaf/Declaratii_R/AplicatiiDec/D100_710_XML_0126_290526.pdf",
-    "soft_a_date": "29.05.2026",
+    "soft_a_url": "http://static.anaf.ro/static/10/Anaf/Declaratii_R/AplicatiiDec/D100_710_XML_0126_170626.pdf",
+    "soft_a_date": "17.06.2026",
    "soft_j_url": "http://static.anaf.ro/static/10/Anaf/Declaratii_R/AplicatiiDec/D100_22052026.zip",
    "soft_j_date": "22.05.2026"
  },
--- a/tools/ocr_bon.py
+++ b/tools/ocr_bon.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+"""
+Extrage date din bon fiscal / factură / extras de cont via Ollama vision LLM.
+
+Usage:
+    python tools/ocr_bon.py <pdf_sau_imagine> [--model minicpm-v] [--host http://10.0.20.161:11434]
+
+Modele recomandate (trage cu: ollama pull <model>):
+    minicpm-v          ~5GB, rapid, excelent pentru documente
+    llava:7b           ~4GB, clasic, bun pe bonuri
+    llama3.2-vision    ~8GB, cel mai precis
+"""
+
+import sys
+import json
+import base64
+import argparse
+import tempfile
+from pathlib import Path
+
+import httpx
+import fitz  # pymupdf
+
+OLLAMA_HOST = "http://10.0.20.161:11434"
+DEFAULT_MODEL = "minicpm-v"
+
+PROMPT = """Ești un sistem OCR specializat pe documente financiare românești.
+Extrage TOATE datele vizibile și returnează EXCLUSIV un JSON valid, fără text în afara JSON-ului.
+
+Schema JSON:
+{
+  "document_type": "bon_fiscal|factura|extras_cont|necunoscut",
+  "vendor": {
+    "name": "...",
+    "cui": "...",
+    "address": "..."
+  },
+  "client": {
+    "name": "...",
+    "cui": "..."
+  },
+  "document": {
+    "numar": "...",
+    "serie": "...",
+    "data": "YYYY-MM-DD",
+    "ora": "HH:MM:SS"
+  },
+  "items": [
+    {
+      "descriere": "...",
+      "cantitate": 0.0,
+      "unitate": "buc|l|kg|...",
+      "pret_unitar": 0.0,
+      "valoare": 0.0
+    }
+  ],
+  "tva": [
+    {
+      "cota_litera": "A|B|C",
+      "procent": 21.0,
+      "baza": 0.0,
+      "valoare_tva": 0.0
+    }
+  ],
+  "total_fara_tva": 0.0,
+  "total_tva": 0.0,
+  "total": 0.0,
+  "plata": {
+    "metoda": "card|numerar|transfer",
+    "tip_card": "...",
+    "pan_masked": "...",
+    "suma": 0.0
+  },
+  "note": "orice info suplimentar relevant"
+}
+
+Omite câmpurile care nu există în document. Returnează DOAR JSON."""
+
+
+def pdf_to_images_b64(pdf_path: Path, dpi: int = 150) -> list[str]:
+    """Convertește fiecare pagină PDF la PNG base64."""
+    doc = fitz.open(str(pdf_path))
+    images = []
+    mat = fitz.Matrix(dpi / 72, dpi / 72)
+    for page in doc:
+        pix = page.get_pixmap(matrix=mat)
+        images.append(base64.b64encode(pix.tobytes("png")).decode())
+    doc.close()
+    return images
+
+
+def image_to_b64(img_path: Path) -> str:
+    """Citește imagine și returnează base64."""
+    return base64.b64encode(img_path.read_bytes()).decode()
+
+
+def ask_ollama(model: str, images_b64: list[str], host: str) -> str:
+    """Trimite imaginile la Ollama și returnează răspunsul."""
+    payload = {
+        "model": model,
+        "prompt": PROMPT,
+        "images": images_b64,
+        "stream": False,
+        "options": {
+            "temperature": 0.1,
+            "num_predict": 2048,
+        }
+    }
+
+    with httpx.Client(timeout=600) as client:
+        r = client.post(f"{host}/api/generate", json=payload)
+        r.raise_for_status()
+        return r.json()["response"]
+
+
+def extract_json(text: str) -> dict:
+    """Extrage JSON din răspuns (poate fi înconjurat de markdown)."""
+    text = text.strip()
+    # Strip markdown code blocks
+    if "```" in text:
+        start = text.find("{", text.find("```"))
+        end = text.rfind("}") + 1
+        text = text[start:end]
+    return json.loads(text)
+
+
+def process(file_path: Path, model: str, host: str) -> dict:
+    suffix = file_path.suffix.lower()
+
+    if suffix == ".pdf":
+        print(f"  Conversie PDF → imagini...", file=sys.stderr)
+        images = pdf_to_images_b64(file_path)
+        print(f"  {len(images)} pagini extrase", file=sys.stderr)
+    elif suffix in (".jpg", ".jpeg", ".png", ".webp"):
+        images = [image_to_b64(file_path)]
+    else:
+        # Încearcă să detecteze tipul din conținut
+        header = file_path.read_bytes()[:8]
+        if header[:4] == b'%PDF':
+            images = pdf_to_images_b64(file_path)
+        else:
+            images = [image_to_b64(file_path)]
+
+    print(f"  Trimit la Ollama ({model})...", file=sys.stderr)
+    raw = ask_ollama(model, images, host)
+
+    try:
+        return extract_json(raw)
+    except json.JSONDecodeError:
+        print(f"  Răspuns brut (nu e JSON valid):\n{raw}", file=sys.stderr)
+        return {"error": "json_parse_failed", "raw": raw}
+
+
+def main():
+    parser = argparse.ArgumentParser(description="OCR bon fiscal via Ollama vision")
+    parser.add_argument("file", help="PDF sau imagine (jpg/png)")
+    parser.add_argument("--model", default=DEFAULT_MODEL, help=f"Model Ollama (default: {DEFAULT_MODEL})")
+    parser.add_argument("--host", default=OLLAMA_HOST, help=f"Ollama host (default: {OLLAMA_HOST})")
+    parser.add_argument("--pretty", action="store_true", help="JSON indentat")
+    args = parser.parse_args()
+
+    file_path = Path(args.file)
+    if not file_path.exists():
+        print(f"Fișierul nu există: {file_path}", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Procesez: {file_path.name}", file=sys.stderr)
+    result = process(file_path, args.model, args.host)
+
+    indent = 2 if args.pretty else None
+    print(json.dumps(result, ensure_ascii=False, indent=indent))
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/tts.py
+++ b/tools/tts.py
@@ -35,10 +35,26 @@ _TTS_PUNCT_MAP = {
 }


+# Supertonic ONNX model hard limit: inputs longer than this trigger
+# Mul node dimension mismatches in attention layers.
+_MAX_TTS_CHARS = 400
+
+
 def sanitize_for_supertonic(text: str) -> str:
-    """Replace Unicode punctuation Supertonic rejects with ASCII equivalents."""
+    """Replace Unicode punctuation and strip chars that crash Supertonic's ONNX model."""
    for src, dst in _TTS_PUNCT_MAP.items():
        text = text.replace(src, dst)
+    # Strip emoji and high-codepoint chars (keep ASCII printable + Latin/Romanian diacritice)
+    cleaned = []
+    for ch in text:
+        cp = ord(ch)
+        if (32 <= cp <= 126) or (128 <= cp <= 591):
+            cleaned.append(ch)
+        else:
+            cleaned.append(' ')
+    text = ' '.join(''.join(cleaned).split())
+    if len(text) > _MAX_TTS_CHARS:
+        text = text[:_MAX_TTS_CHARS]
    return text