Pre-existing uncommitted changes swept in with the STT work: anaf-monitor snapshots/versions, cron job + newsletter state, 9 youtube KB notes, tools/ocr_bon.py, and tools/tts.py. Note: the tts.py change breaks 2 truncation tests in test_voice_normalize.py (sanitize word-count) — flagged for a separate follow-up. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
176 lines
4.9 KiB
Python
176 lines
4.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Extrage date din bon fiscal / factură / extras de cont via Ollama vision LLM.
|
|
|
|
Usage:
|
|
python tools/ocr_bon.py <pdf_sau_imagine> [--model minicpm-v] [--host http://10.0.20.161:11434]
|
|
|
|
Modele recomandate (trage cu: ollama pull <model>):
|
|
minicpm-v ~5GB, rapid, excelent pentru documente
|
|
llava:7b ~4GB, clasic, bun pe bonuri
|
|
llama3.2-vision ~8GB, cel mai precis
|
|
"""
|
|
|
|
import sys
|
|
import json
|
|
import base64
|
|
import argparse
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import httpx
|
|
import fitz # pymupdf
|
|
|
|
# Default base URL of the Ollama server; used as the default of the --host option.
OLLAMA_HOST = "http://10.0.20.161:11434"
# Default vision model name; used as the default of the --model option.
DEFAULT_MODEL = "minicpm-v"
|
|
|
|
PROMPT = """Ești un sistem OCR specializat pe documente financiare românești.
|
|
Extrage TOATE datele vizibile și returnează EXCLUSIV un JSON valid, fără text în afara JSON-ului.
|
|
|
|
Schema JSON:
|
|
{
|
|
"document_type": "bon_fiscal|factura|extras_cont|necunoscut",
|
|
"vendor": {
|
|
"name": "...",
|
|
"cui": "...",
|
|
"address": "..."
|
|
},
|
|
"client": {
|
|
"name": "...",
|
|
"cui": "..."
|
|
},
|
|
"document": {
|
|
"numar": "...",
|
|
"serie": "...",
|
|
"data": "YYYY-MM-DD",
|
|
"ora": "HH:MM:SS"
|
|
},
|
|
"items": [
|
|
{
|
|
"descriere": "...",
|
|
"cantitate": 0.0,
|
|
"unitate": "buc|l|kg|...",
|
|
"pret_unitar": 0.0,
|
|
"valoare": 0.0
|
|
}
|
|
],
|
|
"tva": [
|
|
{
|
|
"cota_litera": "A|B|C",
|
|
"procent": 21.0,
|
|
"baza": 0.0,
|
|
"valoare_tva": 0.0
|
|
}
|
|
],
|
|
"total_fara_tva": 0.0,
|
|
"total_tva": 0.0,
|
|
"total": 0.0,
|
|
"plata": {
|
|
"metoda": "card|numerar|transfer",
|
|
"tip_card": "...",
|
|
"pan_masked": "...",
|
|
"suma": 0.0
|
|
},
|
|
"note": "orice info suplimentar relevant"
|
|
}
|
|
|
|
Omite câmpurile care nu există în document. Returnează DOAR JSON."""
|
|
|
|
|
|
def pdf_to_images_b64(pdf_path: Path, dpi: int = 150) -> list[str]:
    """Render every page of a PDF to PNG and return the images base64-encoded.

    Args:
        pdf_path: Path of the PDF file to rasterize.
        dpi: Target rendering resolution; each page is scaled by ``dpi / 72``
            on both axes.

    Returns:
        One base64 string per page, in page order.
    """
    zoom = dpi / 72
    scale = fitz.Matrix(zoom, zoom)
    # The context manager closes the document even when rendering a page
    # raises; the explicit close() used before was skipped on errors.
    with fitz.open(str(pdf_path)) as doc:
        return [
            base64.b64encode(
                page.get_pixmap(matrix=scale).tobytes("png")
            ).decode()
            for page in doc
        ]
|
|
|
|
|
|
def image_to_b64(img_path: Path) -> str:
    """Return the raw bytes of the file at *img_path* as base64 text."""
    with img_path.open("rb") as fh:
        raw = fh.read()
    encoded = base64.b64encode(raw)
    return encoded.decode()
|
|
|
|
|
|
def ask_ollama(model: str, images_b64: list[str], host: str) -> str:
    """Send the images together with PROMPT to Ollama and return the reply text.

    Args:
        model: Name of the Ollama model to run.
        images_b64: Base64-encoded images, passed as the "images" field.
        host: Base URL of the Ollama server; a trailing slash is tolerated.

    Returns:
        The value of the "response" field of the JSON reply.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status
            (raised by ``raise_for_status()``).
        KeyError: If the JSON reply has no "response" field.
    """
    payload = {
        "model": model,
        "prompt": PROMPT,
        "images": images_b64,
        # Ask for a single JSON reply instead of a stream of chunks.
        "stream": False,
        "options": {
            "temperature": 0.1,
            "num_predict": 2048,
        },
    }

    # Normalise the base URL so "http://host:11434/" does not produce
    # "//api/generate".
    url = f"{host.rstrip('/')}/api/generate"

    # Generous timeout (seconds): the request blocks until the whole reply
    # has been generated because streaming is disabled.
    with httpx.Client(timeout=600) as client:
        reply = client.post(url, json=payload)
        reply.raise_for_status()
        return reply.json()["response"]
|
|
|
|
|
|
def extract_json(text: str) -> dict:
    """Parse the JSON payload out of a model reply.

    The reply is first parsed as-is. If that fails, the span from the first
    ``{`` (searched after the first Markdown code fence, when one is present)
    to the last ``}`` is parsed instead, which recovers JSON wrapped in a
    fenced block or surrounded by prose.

    Args:
        text: Raw reply text.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If no parseable JSON can be found.
    """
    cleaned = text.strip()
    try:
        # Fast path: the reply is already pure JSON. Trying this first also
        # protects valid JSON whose string values happen to contain "```".
        return json.loads(cleaned)
    except json.JSONDecodeError:
        fence = cleaned.find("```")
        start = cleaned.find("{", fence if fence != -1 else 0)
        end = cleaned.rfind("}")
        if start == -1 or end < start:
            # Nothing that looks like an object: surface the original error.
            raise
        return json.loads(cleaned[start:end + 1])
|
|
|
|
|
|
def process(file_path: Path, model: str, host: str) -> dict:
    """Run one document through the vision model and return the parsed data.

    The file is turned into one or more base64 images (every page for a PDF,
    the file itself otherwise), sent to Ollama, and the reply is parsed as
    JSON.

    Args:
        file_path: PDF or image file to process.
        model: Name of the Ollama model to use.
        host: Base URL of the Ollama server.

    Returns:
        The parsed JSON reply, or
        ``{"error": "json_parse_failed", "raw": <reply text>}`` when the
        reply is not valid JSON.
    """
    suffix = file_path.suffix.lower()

    if suffix == ".pdf":
        print(" Conversie PDF → imagini...", file=sys.stderr)
        images = pdf_to_images_b64(file_path)
        print(f" {len(images)} pagini extrase", file=sys.stderr)
    elif suffix in (".jpg", ".jpeg", ".png", ".webp"):
        images = [image_to_b64(file_path)]
    else:
        # Unknown extension: sniff the PDF magic number. Only the first four
        # bytes are read instead of loading the whole file into memory.
        with file_path.open("rb") as fh:
            magic = fh.read(4)
        if magic == b"%PDF":
            images = pdf_to_images_b64(file_path)
        else:
            # Anything else is handed to the model as a single image.
            images = [image_to_b64(file_path)]

    print(f" Trimit la Ollama ({model})...", file=sys.stderr)
    raw = ask_ollama(model, images, host)

    try:
        return extract_json(raw)
    except json.JSONDecodeError:
        # Keep the raw reply so the caller can inspect what the model said.
        print(f" Răspuns brut (nu e JSON valid):\n{raw}", file=sys.stderr)
        return {"error": "json_parse_failed", "raw": raw}
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Parses the arguments, checks that the input file exists (exiting with
    status 1 otherwise), processes it, and prints the result as JSON on
    stdout. Progress and error messages go to stderr.
    """
    cli = argparse.ArgumentParser(description="OCR bon fiscal via Ollama vision")
    cli.add_argument("file", help="PDF sau imagine (jpg/png)")
    cli.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help=f"Model Ollama (default: {DEFAULT_MODEL})",
    )
    cli.add_argument(
        "--host",
        default=OLLAMA_HOST,
        help=f"Ollama host (default: {OLLAMA_HOST})",
    )
    cli.add_argument("--pretty", action="store_true", help="JSON indentat")
    opts = cli.parse_args()

    target = Path(opts.file)
    if not target.exists():
        print(f"Fișierul nu există: {target}", file=sys.stderr)
        sys.exit(1)

    print(f"Procesez: {target.name}", file=sys.stderr)
    outcome = process(target, opts.model, opts.host)

    # --pretty switches from compact single-line output to 2-space indentation.
    rendered = json.dumps(
        outcome,
        ensure_ascii=False,
        indent=2 if opts.pretty else None,
    )
    print(rendered)
|
|
|
|
|
|
# Run the CLI only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|