feat(mapare-llm): pivot PRD 5.14 + tooling etichetare OpenRouter
PRD 5.14 rescris cu pivotul arhitectural: LLM doar etichetator OFFLINE, runtime = clasificator local fara API (fuzzy + embeddings), baza de cunostinte GOLD partajata cross-account (validarea unui service ajuta toate). Decizia 8 (corpus per-cont) SUPERSEDED. Tooling nou OpenRouter (free, familia NVIDIA Nemotron): or_common.py (client + corpus pe frecventa, cheie din .env) + or_modeltest.py (comparatie modele, acord ensemble vs Groq). Masurat: super-120b + nano-9b fiabile, 3/3 unanim pe 87% volum; ultra-550b aruncat. Corpus real (4 CSV service, coloana NR=frecventa) + etichete Groq bootstrap incluse ca date de masurare. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
137
tools/mapare-llm/or_modeltest.py
Normal file
137
tools/mapare-llm/or_modeltest.py
Normal file
@@ -0,0 +1,137 @@
"""Compare FREE OpenRouter models on classifying operations -> RAR codes (RO).

Runs every candidate model on the first N operation names returned by
``or_common.corpus_by_freq()`` (the most frequent ones, i.e. the ones that
matter for volume) and prints, per model:
  - average latency per operation (ms/op),
  - the number of "?" answers (column ``?fail``: error / parse failure),
  - the number of "NUL" answers (junk entries),
  - frequency-weighted agreement with the existing Groq labels
    (labels-groq-partial.json), used as a silver reference.

It then prints the ensemble agreement per level (how many models share the
majority vote), weighted by volume; full agreement is the candidate for the
auto-send tier. The highest-volume disagreements are listed last.

The raw votes are saved to modeltest-result.json for human adjudication.

Usage: python3 tools/mapare-llm/or_modeltest.py [N] [model1 model2 ...]
  N      = how many names to test (top by frequency). Default 120.
  modelN = override for the model list. Default = DEFAULT_MODELS, a curated
           set of free models that were live when the script was written.
"""
|
||||
import sys, os, json, time
|
||||
from collections import Counter
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
import or_common as oc
|
||||
|
||||
# Input/output files live next to this script.
HERE = os.path.dirname(__file__)
GROQ_LABELS = os.path.join(HERE, "labels-groq-partial.json")
OUT = os.path.join(HERE, "modeltest-result.json")
BATCH = 40  # large batch = fewer requests (free-tier cap ~50/day without credit)
PACE = 4.0  # seconds between batches (OpenRouter free tier ~20 req/min)

# Set that is RELIABLE on the free tier (verified live 2026-06-28): only the
# NVIDIA Nemotron family routes without 429/404. llama/qwen/gemma/gpt-oss/hermes
# are rate-limited or have a blocked provider. CAVEAT F7: same family -> the
# agreement overestimates confidence; the different scales (9B/120B/550B) still
# produce divergence on hard cases.
DEFAULT_MODELS = [
    "nvidia/nemotron-3-super-120b-a12b:free",  # 120B, fast (~3s)
    "nvidia/nemotron-nano-9b-v2:free",  # 9B, small scale
    "nvidia/nemotron-3-ultra-550b-a55b:free",  # 550B, slow (~36s) but capable
]
|
||||
|
||||
|
||||
def run_model(model, sample):
    """Classify ``sample`` with ``model``, ``BATCH`` items per request.

    Args:
        model: model identifier, passed unchanged to ``oc.call``.
        sample: list of operation names to classify.

    Returns:
        A tuple ``(codes, total_ms, errs)``:
          - ``codes``: exactly one code per item of ``sample``, in the same
            order ("?" fills any position the model did not answer),
          - ``total_ms``: sum of ``meta["ms"]`` over all batches,
          - ``errs``: the truthy ``meta["err"]`` values, one per failed batch.
    """
    codes, total_ms, errs = [], 0, []
    nb = (len(sample) + BATCH - 1) // BATCH  # ceil division: number of batches
    for bi, k in enumerate(range(0, len(sample), BATCH)):
        batch = sample[k:k + BATCH]
        c, meta = oc.call(model, batch)
        # Callers index ``codes`` by position in ``sample``. A short or long
        # reply for one batch would silently shift every later code, so force
        # exactly len(batch) entries, marking the missing ones as failures.
        c = list(c or [])[:len(batch)]
        c += ["?"] * (len(batch) - len(c))
        codes += c
        total_ms += meta["ms"]
        if meta["err"]:
            errs.append(meta["err"])
        print(f" {model:<45} batch {bi+1}/{nb} {meta['ms']}ms err={meta['err']}", flush=True)
        # Pace requests; this also runs after the last batch, so back-to-back
        # run_model calls stay spaced out as well.
        time.sleep(PACE)
    return codes, total_ms, errs
|
||||
|
||||
|
||||
def _pct(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is 0."""
    return 100 * part / whole if whole else 0.0


def main():
    """Run the model comparison, save the raw votes and print the report.

    Command line (``sys.argv[1:]``): ``[N] [model1 model2 ...]``. A leading
    all-digit argument sets how many corpus entries are tested (default 120);
    any remaining arguments replace ``DEFAULT_MODELS``.

    Side effects: calls every model through ``run_model``, writes the raw
    per-operation votes to ``OUT`` as UTF-8 JSON and prints the report to
    stdout.
    """
    args = sys.argv[1:]
    n = 120
    models = DEFAULT_MODELS
    if args and args[0].isdigit():
        n = int(args[0])
        args = args[1:]
    if args:
        # Results and votes are keyed by model name; a repeated name would
        # collapse into one vote and make full agreement unreachable.
        models = list(dict.fromkeys(args))

    corpus = oc.corpus_by_freq()
    sample = [op for op, _ in corpus[:n]]
    freq = dict(corpus[:n])
    vol_total = sum(freq.values())
    corpus_total = sum(nr for _, nr in corpus)
    print(f"esantion: top {n} dupa frecventa = {vol_total} volum "
          f"({_pct(vol_total, corpus_total):.1f}% din total)\n", flush=True)

    # Groq labels are optional; without them the agreement column shows "n/a".
    groq = {}
    if os.path.exists(GROQ_LABELS):
        with open(GROQ_LABELS, encoding="utf-8") as fh:
            groq = json.load(fh)

    results = {}
    t0 = time.time()
    for m in models:
        print(f"=== {m} ===", flush=True)
        codes, total_ms, errs = run_model(m, sample)
        # Everything below indexes codes by position in ``sample``; force the
        # same length so a short reply counts as "?" instead of raising.
        codes = list(codes)[:len(sample)]
        codes += ["?"] * (len(sample) - len(codes))
        results[m] = {"codes": codes, "total_ms": total_ms, "errs": errs}
        print(f" -> {total_ms/1000:.0f}s total\n", flush=True)

    # Majority vote + agreement level per operation ("?" votes do not count).
    rows = []
    for i, op in enumerate(sample):
        votes = {m: results[m]["codes"][i] for m in models}
        tally = Counter(v for v in votes.values() if v != "?")
        top, cnt = tally.most_common(1)[0] if tally else ("?", 0)
        rows.append({"op": op, "nr": freq[op], "votes": votes,
                     "maj": top, "agree": cnt, "n_models": len(models),
                     "groq": groq.get(op)})

    # ensure_ascii=False writes non-ASCII text verbatim, so the file encoding
    # must be fixed explicitly instead of depending on the platform default.
    with open(OUT, "w", encoding="utf-8") as fh:
        json.dump({"models": models, "n": n, "rows": rows}, fh,
                  ensure_ascii=False, indent=1)

    # --- per-model report ---
    print("=" * 78)
    print(f"RAPORT ({n} ops, {time.time()-t0:.0f}s, ponderare pe frecventa NR)")
    print("=" * 78)
    print(f"{'model':<46} {'ms/op':>6} {'?fail':>6} {'NUL':>5} {'~Groq':>7}")
    for m in models:
        codes = results[m]["codes"]
        fails = sum(1 for x in codes if x == "?")
        nul = sum(1 for x in codes if x == "NUL")
        # Agreement with Groq, frequency-weighted, only over operations that
        # have a Groq label and a usable answer from this model.
        ov_w = ov_match = 0
        for op, code in zip(sample, codes):
            g = groq.get(op)
            if g and code != "?":
                ov_w += freq[op]
                if code == g:
                    ov_match += freq[op]
        agr = f"{100*ov_match/ov_w:.0f}%" if ov_w else "n/a"
        msop = results[m]["total_ms"] / max(1, len(codes))
        print(f"{m:<46} {msop:>6.0f} {fails:>6} {nul:>5} {agr:>7}")

    # --- ensemble agreement, weighted by frequency ---
    print("\n--- ACORD ENSEMBLE (ponderat pe volum) ---")
    nm = len(models)
    for lvl in range(nm, 0, -1):
        w = sum(r["nr"] for r in rows if r["agree"] == lvl)
        cnt = sum(1 for r in rows if r["agree"] == lvl)
        tag = " <- candidat auto-send" if lvl == nm else ""
        print(f" acord {lvl}/{nm}: {cnt:>4} ops, {_pct(w, vol_total):.0f}% volum{tag}")
    unan = [r for r in rows if r["agree"] == nm]
    nul_un = sum(1 for r in unan if r["maj"] == "NUL")
    print(f" din unanim {nm}/{nm}: {nul_un} NUL (gunoi), {len(unan)-nul_un} coduri reale")

    print(f"\nbrut salvat: {OUT}")
    print("--- esantion dezacord (volum mare, de adjudecat de om) ---")
    # The 20 highest-volume operations without full agreement.
    disp = sorted([r for r in rows if r["agree"] < nm], key=lambda r: -r["nr"])[:20]
    for r in disp:
        vs = "/".join(sorted(set(r["votes"].values())))
        print(f" {r['nr']:>5} {r['op']:<40} maj={r['maj']:<6} ({vs})")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user