Files
echo-core/tools/ocr_bon.py
Marius Mutu d175d5ba5a chore: working-tree state — anaf snapshots, cron state, KB notes, tools
Pre-existing uncommitted changes swept in with the STT work:
anaf-monitor snapshots/versions, cron job + newsletter state, 9 youtube KB
notes, tools/ocr_bon.py, and tools/tts.py.

Note: the tts.py change breaks 2 truncation tests in test_voice_normalize.py
(sanitize word-count) — flagged for a separate follow-up.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-27 18:16:31 +00:00

176 lines
4.9 KiB
Python

#!/usr/bin/env python3
"""
Extract data from a fiscal receipt / invoice / bank statement via an Ollama vision LLM.

Usage:
    python tools/ocr_bon.py <pdf_or_image> [--model minicpm-v] [--host http://10.0.20.161:11434]

Recommended models (pull with: ollama pull <model>):
    minicpm-v        ~5GB, fast, excellent for documents
    llava:7b         ~4GB, classic, good on receipts
    llama3.2-vision  ~8GB, the most accurate
"""
import sys
import json
import base64
import argparse
import tempfile
from pathlib import Path
import httpx
import fitz # pymupdf
# Default Ollama server URL; used as the default value of the --host option.
OLLAMA_HOST = "http://10.0.20.161:11434"
# Default vision model name; used as the default value of the --model option.
DEFAULT_MODEL = "minicpm-v"
# Instruction prompt (written in Romanian) sent as the "prompt" field of every
# Ollama request. It tells the model to act as an OCR system for Romanian
# financial documents and to answer with JSON only, following the schema
# spelled out in the text. The string is runtime data: do not reformat it.
PROMPT = """Ești un sistem OCR specializat pe documente financiare românești.
Extrage TOATE datele vizibile și returnează EXCLUSIV un JSON valid, fără text în afara JSON-ului.
Schema JSON:
{
"document_type": "bon_fiscal|factura|extras_cont|necunoscut",
"vendor": {
"name": "...",
"cui": "...",
"address": "..."
},
"client": {
"name": "...",
"cui": "..."
},
"document": {
"numar": "...",
"serie": "...",
"data": "YYYY-MM-DD",
"ora": "HH:MM:SS"
},
"items": [
{
"descriere": "...",
"cantitate": 0.0,
"unitate": "buc|l|kg|...",
"pret_unitar": 0.0,
"valoare": 0.0
}
],
"tva": [
{
"cota_litera": "A|B|C",
"procent": 21.0,
"baza": 0.0,
"valoare_tva": 0.0
}
],
"total_fara_tva": 0.0,
"total_tva": 0.0,
"total": 0.0,
"plata": {
"metoda": "card|numerar|transfer",
"tip_card": "...",
"pan_masked": "...",
"suma": 0.0
},
"note": "orice info suplimentar relevant"
}
Omite câmpurile care nu există în document. Returnează DOAR JSON."""
def pdf_to_images_b64(pdf_path: Path, dpi: int = 150) -> list[str]:
    """Render every page of a PDF to PNG and return the pages base64-encoded.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Requested rendering resolution; the scale factor handed to
            ``fitz.Matrix`` on both axes is ``dpi / 72``.

    Returns:
        One base64 string per page, in page order.
    """
    doc = fitz.open(str(pdf_path))
    try:
        zoom = dpi / 72
        mat = fitz.Matrix(zoom, zoom)
        return [
            base64.b64encode(page.get_pixmap(matrix=mat).tobytes("png")).decode()
            for page in doc
        ]
    finally:
        # Release the document even when rendering one of the pages raises.
        doc.close()
def image_to_b64(img_path: Path) -> str:
    """Return the raw bytes of the file at *img_path* as a base64 string."""
    raw = img_path.read_bytes()
    encoded = base64.b64encode(raw)
    return encoded.decode()
def ask_ollama(model: str, images_b64: list[str], host: str) -> str:
    """Send the images together with ``PROMPT`` to Ollama and return its reply.

    Args:
        model: Name of the Ollama model to run.
        images_b64: Base64-encoded images attached to the request.
        host: Base URL of the Ollama server; a trailing slash is tolerated.

    Returns:
        The value of the ``response`` field of the JSON reply.

    Raises:
        httpx.HTTPStatusError: If ``raise_for_status()`` rejects the reply.
        KeyError: If the JSON reply has no ``response`` field.
    """
    # Normalise the base URL so "http://host:11434/" does not yield
    # "http://host:11434//api/generate".
    base_url = host.rstrip("/")
    payload = {
        "model": model,
        "prompt": PROMPT,
        "images": images_b64,
        # The reply is read below as one JSON body, so streaming is disabled.
        "stream": False,
        "options": {
            # NOTE(review): low temperature, presumably to keep the extraction
            # as repeatable as possible; confirm before tuning.
            "temperature": 0.1,
            "num_predict": 2048,
        },
    }
    # Long timeout (600 s): presumably because vision inference can be slow.
    with httpx.Client(timeout=600) as client:
        reply = client.post(f"{base_url}/api/generate", json=payload)
        reply.raise_for_status()
        return reply.json()["response"]
def extract_json(text: str) -> dict:
    """Parse the JSON payload contained in a model reply.

    The reply may be bare JSON, JSON inside a markdown code fence, or a JSON
    object surrounded by explanatory prose.

    Args:
        text: Raw text returned by the model.

    Returns:
        The decoded JSON value (a dict for replies that follow the prompt).

    Raises:
        json.JSONDecodeError: If no parseable JSON can be located in *text*.
    """
    cleaned = text.strip()
    search_from = cleaned.find("```")
    if search_from == -1:
        # No markdown fence: the reply is most likely bare JSON.
        try:
            return json.loads(cleaned)
        except json.JSONDecodeError:
            # Fall back to looking for an object embedded in surrounding prose.
            search_from = 0
    # Take everything from the first "{" (after the fence, if any) up to the
    # last "}" in the reply.
    start = cleaned.find("{", search_from)
    end = cleaned.rfind("}")
    if start == -1 or end < start:
        raise json.JSONDecodeError("no JSON object found in model response", cleaned, 0)
    return json.loads(cleaned[start:end + 1])
def _has_pdf_signature(file_path: Path) -> bool:
    """Return True when the file starts with the ``%PDF`` magic bytes."""
    # Read only the signature instead of loading the whole file into memory.
    with file_path.open("rb") as fh:
        return fh.read(4) == b"%PDF"


def process(file_path: Path, model: str, host: str) -> dict:
    """Run the full pipeline on one file: load images, query Ollama, parse JSON.

    Files ending in ``.pdf`` are rendered page by page; known image suffixes
    are sent as-is. For any other suffix the file content is sniffed: a
    ``%PDF`` signature selects the PDF path, everything else is treated as a
    single image.

    Args:
        file_path: Document to process.
        model: Ollama model name.
        host: Ollama server URL.

    Returns:
        The parsed JSON from the model, or
        ``{"error": "json_parse_failed", "raw": <reply>}`` when the reply
        cannot be decoded as JSON.
    """
    suffix = file_path.suffix.lower()
    if suffix == ".pdf":
        print(" Conversie PDF → imagini...", file=sys.stderr)
        images = pdf_to_images_b64(file_path)
        print(f" {len(images)} pagini extrase", file=sys.stderr)
    elif suffix in (".jpg", ".jpeg", ".png", ".webp"):
        images = [image_to_b64(file_path)]
    elif _has_pdf_signature(file_path):
        # Unknown suffix, but the content identifies itself as a PDF.
        images = pdf_to_images_b64(file_path)
    else:
        # Unknown suffix and no PDF signature: treat the file as one image.
        images = [image_to_b64(file_path)]
    print(f" Trimit la Ollama ({model})...", file=sys.stderr)
    raw = ask_ollama(model, images, host)
    try:
        return extract_json(raw)
    except json.JSONDecodeError:
        # Keep the raw reply so the caller can still inspect what came back.
        print(f" Răspuns brut (nu e JSON valid):\n{raw}", file=sys.stderr)
        return {"error": "json_parse_failed", "raw": raw}
def main():
    """Command-line entry point.

    Parses the arguments, checks that the input file exists (exiting with
    status 1 otherwise), runs ``process`` on it and prints the result as JSON
    on stdout. Progress messages go to stderr.
    """
    cli = argparse.ArgumentParser(description="OCR bon fiscal via Ollama vision")
    cli.add_argument("file", help="PDF sau imagine (jpg/png)")
    cli.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help=f"Model Ollama (default: {DEFAULT_MODEL})",
    )
    cli.add_argument(
        "--host",
        default=OLLAMA_HOST,
        help=f"Ollama host (default: {OLLAMA_HOST})",
    )
    cli.add_argument("--pretty", action="store_true", help="JSON indentat")
    opts = cli.parse_args()

    target = Path(opts.file)
    if not target.exists():
        print(f"Fișierul nu există: {target}", file=sys.stderr)
        sys.exit(1)

    print(f"Procesez: {target.name}", file=sys.stderr)
    outcome = process(target, opts.model, opts.host)

    # --pretty switches from compact single-line JSON to 2-space indentation.
    if opts.pretty:
        rendered = json.dumps(outcome, ensure_ascii=False, indent=2)
    else:
        rendered = json.dumps(outcome, ensure_ascii=False, indent=None)
    print(rendered)


# Run the CLI only when the file is executed as a script.
if __name__ == "__main__":
    main()