chore: working-tree state — anaf snapshots, cron state, KB notes, tools

Pre-existing uncommitted changes swept in with the STT work:
anaf-monitor snapshots/versions, cron job + newsletter state, 9 youtube KB
notes, tools/ocr_bon.py, and tools/tts.py.

Note: the tts.py change breaks 2 truncation tests in test_voice_normalize.py
(sanitize word-count) — flagged for a separate follow-up.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
2026-06-27 18:16:31 +00:00
parent ce273d14db
commit d175d5ba5a
17 changed files with 840 additions and 41 deletions

View File

@@ -1,12 +1,12 @@
{
"D100": "27cf97a4d10c8529669d95b2d96ca3c9b41f7e4e50091dce19cf8af117f0ac4a",
"D100": "ce27f72bc3fd5e3241480fcda3a3e14572cfbed1b26e43896037bb265d82e4f5",
"D101": "f72fc1c29657ea11e0238806a28f6abccf5b00e45904e1e0c9385cc64491fcaf",
"D300": "cb7b55b568ab893024884971eac0367fb6fe487c297e355d64258dae437f6ddd",
"D394": "c4c4e62bda30032f12c17edf9a5087b6173a350ccb1fd750158978b3bd0acb7d",
"D406": "ca6103448d663ab16fcaef0f29f8933ef526cbf5aad12c7ff5dbd61b22ca9fc6",
"SIT_FIN_SEM_2025": "8164843431e6b703a38fbdedc7898ec6ae83559fe10f88663ba0b55f3091d5fe",
"SIT_FIN_AN_2025": "accceef5b6585a3e901d83d23fc2e60f6562eac4a2ce00f943856232bed929d6",
"DESCARCARE_DECLARATII": "8cc082021edb0ae97686d73f8179369be33a68ef03ec791757460bb7fff99e34",
"DESCARCARE_DECLARATII": "b2a9534d4f64b828abdb97459b92be27ba26a0d9ba1a0f947ef4a37c968ef293",
"D205": "d3c20a7ae70f4c18bbb7add42af035e3746d323b2e6df37a4e31ed625ddb86d9",
"D390": "4726938ed5858ec735caefd947a7d182b6dc64009478332c4feabdb36412a84e",
"BILANT_2024": "fbb8d66c2e530d8798362992c6983e07e1250188228c758cb6da4cde4f955950",

View File

@@ -62,14 +62,14 @@ valabil începand cu
01/2024 - publicat în data de 09.02.2024
soft A
actualizat în data de
29.05.2026
17.06.2026
soft J*
actualizat în data de
25.05.2026
Anexa
validări
actualizat în data de
20.05.2026
17.06.2026
Schema
XSD
100

View File

@@ -51,7 +51,7 @@ Se transmit prin SPV
F1129
-  Ordinul de plată multiplu electronic (OPME) V.2.0.45 dată
actualizare
25.11.2025
22.06.2026
Formularul se depune on-line prin Sistemul naţional de raportare
FOREXEBUG
de către instituţiile publice şi, respectiv, prin portalul
@@ -273,6 +273,10 @@ D182
,
408
,
409
,
410
,
700
,
710,

View File

@@ -1,7 +1,7 @@
{
"D100": {
"soft_a_url": "http://static.anaf.ro/static/10/Anaf/Declaratii_R/AplicatiiDec/D100_710_XML_0126_290526.pdf",
"soft_a_date": "29.05.2026",
"soft_a_url": "http://static.anaf.ro/static/10/Anaf/Declaratii_R/AplicatiiDec/D100_710_XML_0126_170626.pdf",
"soft_a_date": "17.06.2026",
"soft_j_url": "http://static.anaf.ro/static/10/Anaf/Declaratii_R/AplicatiiDec/D100_22052026.zip",
"soft_j_date": "22.05.2026"
},

175
tools/ocr_bon.py Normal file
View File

@@ -0,0 +1,175 @@
#!/usr/bin/env python3
"""
Extrage date din bon fiscal / factură / extras de cont via Ollama vision LLM.
Usage:
python tools/ocr_bon.py <pdf_sau_imagine> [--model minicpm-v] [--host http://10.0.20.161:11434]
Modele recomandate (trage cu: ollama pull <model>):
minicpm-v ~5GB, rapid, excelent pentru documente
llava:7b ~4GB, clasic, bun pe bonuri
llama3.2-vision ~8GB, cel mai precis
"""
import sys
import json
import base64
import argparse
import tempfile
from pathlib import Path
import httpx
import fitz # pymupdf
# Default Ollama endpoint; main() exposes it as the default of --host.
OLLAMA_HOST = "http://10.0.20.161:11434"
# Default vision model name; main() exposes it as the default of --model.
DEFAULT_MODEL = "minicpm-v"
# Instruction prompt (written in Romanian) sent verbatim as the "prompt" field
# of the Ollama request. It asks the model to answer with JSON only, following
# the schema embedded below. This text is runtime data: edit it deliberately,
# since any change alters what the model is asked to produce.
PROMPT = """Ești un sistem OCR specializat pe documente financiare românești.
Extrage TOATE datele vizibile și returnează EXCLUSIV un JSON valid, fără text în afara JSON-ului.
Schema JSON:
{
"document_type": "bon_fiscal|factura|extras_cont|necunoscut",
"vendor": {
"name": "...",
"cui": "...",
"address": "..."
},
"client": {
"name": "...",
"cui": "..."
},
"document": {
"numar": "...",
"serie": "...",
"data": "YYYY-MM-DD",
"ora": "HH:MM:SS"
},
"items": [
{
"descriere": "...",
"cantitate": 0.0,
"unitate": "buc|l|kg|...",
"pret_unitar": 0.0,
"valoare": 0.0
}
],
"tva": [
{
"cota_litera": "A|B|C",
"procent": 21.0,
"baza": 0.0,
"valoare_tva": 0.0
}
],
"total_fara_tva": 0.0,
"total_tva": 0.0,
"total": 0.0,
"plata": {
"metoda": "card|numerar|transfer",
"tip_card": "...",
"pan_masked": "...",
"suma": 0.0
},
"note": "orice info suplimentar relevant"
}
Omite câmpurile care nu există în document. Returnează DOAR JSON."""
def pdf_to_images_b64(pdf_path: Path, dpi: int = 150) -> list[str]:
    """Render every page of a PDF to a base64-encoded PNG.

    Args:
        pdf_path: Path of the PDF file to rasterize.
        dpi: Target resolution; pages are scaled by ``dpi / 72`` on both axes.

    Returns:
        One base64 string (PNG bytes) per page, in page order.
    """
    zoom = fitz.Matrix(dpi / 72, dpi / 72)
    doc = fitz.open(str(pdf_path))
    try:
        return [
            base64.b64encode(page.get_pixmap(matrix=zoom).tobytes("png")).decode()
            for page in doc
        ]
    finally:
        # Close the document even when rendering a page raises, so the
        # underlying file handle is never leaked.
        doc.close()
def image_to_b64(img_path: Path) -> str:
    """Return the raw bytes of the file at *img_path* as a base64 string."""
    raw_bytes = img_path.read_bytes()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode()
def ask_ollama(model: str, images_b64: list[str], host: str) -> str:
    """Send the images to Ollama's ``/api/generate`` endpoint and return the reply.

    Args:
        model: Name of the Ollama model to query.
        images_b64: Base64-encoded images attached to the request.
        host: Base URL of the Ollama server; a trailing slash is tolerated.

    Returns:
        The value of the ``"response"`` field of the JSON reply.

    Raises:
        httpx.HTTPStatusError: Raised by ``raise_for_status()`` on an
            unsuccessful HTTP status.
        KeyError: If the JSON reply has no ``"response"`` field.
    """
    payload = {
        "model": model,
        "prompt": PROMPT,
        "images": images_b64,
        "stream": False,
        "options": {
            "temperature": 0.1,
            "num_predict": 2048,
        },
    }
    # Drop any trailing slash so a host such as "http://box:11434/" does not
    # produce a "//api/generate" path.
    url = f"{host.rstrip('/')}/api/generate"
    with httpx.Client(timeout=600) as client:
        r = client.post(url, json=payload)
        r.raise_for_status()
        return r.json()["response"]
def extract_json(text: str) -> dict:
    """Extract the JSON payload from a model reply.

    The reply may be bare JSON, JSON wrapped in a markdown code fence, or JSON
    surrounded by free text. A direct parse is attempted first; if that fails,
    the first JSON object starting at or after the opening fence (or at the
    start of the text when there is no fence) is decoded and anything after it
    is ignored.

    Args:
        text: Raw reply text.

    Returns:
        The decoded JSON value (an object whenever the fallback path is used).

    Raises:
        json.JSONDecodeError: If no decodable JSON is found.
    """
    text = text.strip()
    try:
        # Fast path: the reply is already valid JSON (this also covers JSON
        # whose string values happen to contain backticks).
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Skip anything before the first code fence, if there is one.
    fence = text.find("```")
    start = text.find("{", max(fence, 0))
    if start == -1:
        raise json.JSONDecodeError("no JSON object found in response", text, 0)
    # raw_decode stops at the end of the first complete value, so a closing
    # fence or trailing commentary after the object is tolerated.
    obj, _ = json.JSONDecoder().raw_decode(text, start)
    return obj
def process(file_path: Path, model: str, host: str) -> dict:
    """Run one document through the vision model and return the parsed result.

    The input is turned into one or more base64 images (one per page for a
    PDF, a single image otherwise), sent to Ollama, and the reply is parsed as
    JSON. Progress messages go to stderr.

    Args:
        file_path: PDF or image file to process.
        model: Ollama model name.
        host: Base URL of the Ollama server.

    Returns:
        The parsed JSON reply, or
        ``{"error": "json_parse_failed", "raw": <reply>}`` when the reply
        cannot be parsed as JSON.
    """
    suffix = file_path.suffix.lower()
    if suffix == ".pdf":
        print(" Conversie PDF → imagini...", file=sys.stderr)
        images = pdf_to_images_b64(file_path)
        print(f" {len(images)} pagini extrase", file=sys.stderr)
    elif suffix in (".jpg", ".jpeg", ".png", ".webp"):
        images = [image_to_b64(file_path)]
    else:
        # Unknown extension: sniff the magic number. Only the first four bytes
        # are read, so a large file is not loaded just to inspect its header.
        with file_path.open("rb") as fh:
            is_pdf = fh.read(4) == b"%PDF"
        if is_pdf:
            images = pdf_to_images_b64(file_path)
        else:
            # Anything that is not a PDF is passed through as a single image.
            images = [image_to_b64(file_path)]
    print(f" Trimit la Ollama ({model})...", file=sys.stderr)
    raw = ask_ollama(model, images, host)
    try:
        return extract_json(raw)
    except json.JSONDecodeError:
        # Keep the raw reply so the caller can still see what the model said.
        print(f" Răspuns brut (nu e JSON valid):\n{raw}", file=sys.stderr)
        return {"error": "json_parse_failed", "raw": raw}
def main():
    """Command-line entry point.

    Parses the arguments, exits with status 1 when the input path does not
    exist, otherwise processes the file and prints the result as JSON on
    stdout (indented when ``--pretty`` is given). Status messages go to stderr.
    """
    cli = argparse.ArgumentParser(description="OCR bon fiscal via Ollama vision")
    cli.add_argument("file", help="PDF sau imagine (jpg/png)")
    cli.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help=f"Model Ollama (default: {DEFAULT_MODEL})",
    )
    cli.add_argument(
        "--host",
        default=OLLAMA_HOST,
        help=f"Ollama host (default: {OLLAMA_HOST})",
    )
    cli.add_argument("--pretty", action="store_true", help="JSON indentat")
    opts = cli.parse_args()

    target = Path(opts.file)
    if not target.exists():
        print(f"Fișierul nu există: {target}", file=sys.stderr)
        sys.exit(1)

    print(f"Procesez: {target.name}", file=sys.stderr)
    outcome = process(target, opts.model, opts.host)

    if opts.pretty:
        rendered = json.dumps(outcome, ensure_ascii=False, indent=2)
    else:
        rendered = json.dumps(outcome, ensure_ascii=False, indent=None)
    print(rendered)
if __name__ == "__main__":
main()

View File

@@ -35,10 +35,26 @@ _TTS_PUNCT_MAP = {
}
# Supertonic ONNX model hard limit: inputs longer than this trigger
# Mul node dimension mismatches in attention layers.
_MAX_TTS_CHARS = 400
def sanitize_for_supertonic(text: str) -> str:
    """Normalize *text* into a form that is safe to feed to Supertonic.

    Steps, in order:
      1. Apply every replacement listed in ``_TTS_PUNCT_MAP``.
      2. Replace each character whose code point is outside 32-126 and
         128-591 with a space (this removes emoji and control characters).
      3. Collapse runs of whitespace into single spaces and trim both ends.
      4. Cut the result to at most ``_MAX_TTS_CHARS`` characters.
    """
    for src, dst in _TTS_PUNCT_MAP.items():
        text = text.replace(src, dst)

    filtered = "".join(
        ch if (32 <= ord(ch) <= 126 or 128 <= ord(ch) <= 591) else " "
        for ch in text
    )
    collapsed = " ".join(filtered.split())
    # Slicing is a no-op when the text is already within the limit.
    return collapsed[:_MAX_TTS_CHARS]