"""Compare FREE OpenRouter models on classifying operations -> RAR codes (RO).

Runs every candidate model on the top-N names BY FREQUENCY (the ones that
matter by volume) and reports, per model:
  - latency (ms/batch), error/parse-fail rate (how many "?"),
  - how many NUL it detects (junk), the code distribution,
  - agreement with the existing Groq labels (labels-groq-partial.json) used
    as a silver reference,
  - pairwise agreement between models + majority vote (auto-send tier
    candidate).
Saves the raw votes to modeltest-result.json for human adjudication.

Usage: python3 tools/mapare-llm/or_modeltest.py [N] [model1 model2 ...]
  N      = how many names (top by frequency). Default 120.
  modelN = override the model list. Default = clean set of live free models.
"""
import sys
import os
import json
import time
from collections import Counter

# Absolute directory of this script: keeps the sibling-module import and the
# data-file paths valid even when __file__ is relative and the working
# directory differs (or changes later).
HERE = os.path.dirname(os.path.abspath(__file__))

# or_common lives next to this script; make it importable regardless of cwd.
sys.path.insert(0, HERE)
import or_common as oc

GROQ_LABELS = os.path.join(HERE, "labels-groq-partial.json")
OUT = os.path.join(HERE, "modeltest-result.json")
BATCH = 40   # large batch = fewer requests (free-tier cap ~50/day without credit)
PACE = 4.0   # seconds between batches (OpenRouter free tier ~20 req/min)

# RELIABLE set on the free tier (verified live 2026-06-28): only the NVIDIA
# Nemotron family routes without 429/404. llama/qwen/gemma/gpt-oss/hermes are
# rate-limited or have a blocked provider. CAVEAT F7: same family -> agreement
# overestimates confidence; the different scales (9B/120B/550B) still diverge
# on hard cases.
# Candidate models used when no model names are given on the command line.
DEFAULT_MODELS = [
    "nvidia/nemotron-3-super-120b-a12b:free",   # 120B, fast (~3s)
    "nvidia/nemotron-nano-9b-v2:free",          # 9B, small scale
    "nvidia/nemotron-3-ultra-550b-a55b:free",   # 550B, slow (~36s) but capable
]


def run_model(model, sample, batch_size=None, pace=None, call=None):
    """Classify ``sample`` with ``model`` in batches and collect the codes.

    Args:
        model: Model identifier passed through to ``call``.
        sample: Sequence of operation names to classify.
        batch_size: Items per request. Defaults to the module-level ``BATCH``.
        pace: Seconds to sleep after every request. Defaults to the
            module-level ``PACE``.
        call: Callable ``(model, batch) -> (codes, meta)`` where ``meta`` has
            the keys ``"ms"`` and ``"err"``. Defaults to ``oc.call``.

    Returns:
        ``(codes, total_ms, errs)``: one code per item of ``sample`` (in
        order), the summed ``meta["ms"]``, and the list of truthy
        ``meta["err"]`` values plus any length-mismatch notes.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    # Resolve defaults at call time so the module constants stay authoritative.
    batch_size = BATCH if batch_size is None else batch_size
    pace = PACE if pace is None else pace
    call = oc.call if call is None else call
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    codes, total_ms, errs = [], 0, []
    nb = (len(sample) + batch_size - 1) // batch_size  # ceil division
    for bi, k in enumerate(range(0, len(sample), batch_size), start=1):
        batch = sample[k:k + batch_size]
        batch_codes, meta = call(model, batch)
        batch_codes = list(batch_codes or [])
        if len(batch_codes) != len(batch):
            # Keep codes index-aligned with ``sample``: a short or long reply
            # would otherwise shift every later vote (or raise IndexError in
            # the caller). Missing answers count as parse failures ("?").
            errs.append(f"batch {bi}: {len(batch_codes)} coduri pentru "
                        f"{len(batch)} operatii")
            batch_codes = (batch_codes + ["?"] * len(batch))[:len(batch)]
        codes += batch_codes
        total_ms += meta["ms"]
        if meta["err"]:
            errs.append(meta["err"])
        print(f" {model:<45} batch {bi}/{nb} {meta['ms']}ms err={meta['err']}",
              flush=True)
        # Also sleeps after the last batch, which paces the next model's
        # first request as well.
        time.sleep(pace)
    return codes, total_ms, errs


def _pct(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is 0."""
    return 100 * part / whole if whole else 0.0


def _parse_args(args):
    """Parse ``[N] [model ...]`` into ``(n, models)`` with the script defaults."""
    args = list(args)
    n = 120
    models = DEFAULT_MODELS
    # isdecimal() only accepts strings int() can parse (isdigit() also accepts
    # characters such as superscripts, which int() rejects).
    if args and args[0].isdecimal():
        n = int(args[0])
        args = args[1:]
    if args:
        models = args
    # Drop repeated model names (order kept): votes are keyed by model, so a
    # duplicate would inflate the model count and make unanimity unreachable.
    return n, list(dict.fromkeys(models))


def _load_groq_labels(path):
    """Load the reference labels JSON from ``path``; ``{}`` if it is missing."""
    try:
        with open(path, encoding="utf-8") as fh:
            return json.load(fh)
    except FileNotFoundError:
        return {}


def _tally_votes(sample, freq, models, results, groq):
    """Build one row per operation with every model's vote and the majority."""
    rows = []
    for i, op in enumerate(sample):
        votes = {m: results[m]["codes"][i] for m in models}
        # "?" marks a failed answer and never takes part in the vote.
        ranked = Counter(v for v in votes.values() if v != "?").most_common(1)
        # On a tie most_common() returns the vote encountered first.
        top, cnt = ranked[0] if ranked else ("?", 0)
        rows.append({"op": op, "nr": freq[op], "votes": votes, "maj": top,
                     "agree": cnt, "n_models": len(models),
                     "groq": groq.get(op)})
    return rows


def _print_report(n, models, sample, freq, groq, results, rows, vol_total,
                  elapsed):
    """Print the per-model table, the ensemble agreement and top disagreements."""
    # --- per-model report ---
    print("=" * 78)
    print(f"RAPORT ({n} ops, {elapsed:.0f}s, ponderare pe frecventa NR)")
    print("=" * 78)
    print(f"{'model':<46} {'ms/op':>6} {'?fail':>6} {'NUL':>5} {'~Groq':>7}")
    for m in models:
        codes = results[m]["codes"]
        fails = sum(1 for x in codes if x == "?")
        nul = sum(1 for x in codes if x == "NUL")
        # Agreement with the reference labels, frequency-weighted, counted
        # only where a reference label exists and the model gave an answer.
        ov_w = ov_match = 0
        for op, code in zip(sample, codes):
            g = groq.get(op)
            if g and code != "?":
                ov_w += freq[op]
                if code == g:
                    ov_match += freq[op]
        agr = f"{100*ov_match/ov_w:.0f}%" if ov_w else "n/a"
        msop = results[m]["total_ms"] / max(1, len(codes))
        print(f"{m:<46} {msop:>6.0f} {fails:>6} {nul:>5} {agr:>7}")

    # --- ensemble agreement, weighted by volume ---
    # Rows where every model failed (agree == 0) are not listed in any level.
    print("\n--- ACORD ENSEMBLE (ponderat pe volum) ---")
    nm = len(models)
    for lvl in range(nm, 0, -1):
        at_level = [r for r in rows if r["agree"] == lvl]
        w = sum(r["nr"] for r in at_level)
        tag = " <- candidat auto-send" if lvl == nm else ""
        print(f" acord {lvl}/{nm}: {len(at_level):>4} ops, "
              f"{_pct(w, vol_total):.0f}% volum{tag}")
    unan = [r for r in rows if r["agree"] == nm]
    nul_un = sum(1 for r in unan if r["maj"] == "NUL")
    print(f" din unanim {nm}/{nm}: {nul_un} NUL (gunoi), "
          f"{len(unan)-nul_un} coduri reale")
    print(f"\nbrut salvat: {OUT}")

    # --- highest-volume disagreements, for human adjudication ---
    print("--- esantion dezacord (volum mare, de adjudecat de om) ---")
    disp = sorted([r for r in rows if r["agree"] < nm],
                  key=lambda r: -r["nr"])[:20]
    for r in disp:
        vs = "/".join(sorted(set(r["votes"].values())))
        print(f" {r['nr']:>5} {r['op']:<40} maj={r['maj']:<6} ({vs})")


def main():
    """Run every model on the top-N operations, save raw votes, print a report.

    Command line: ``[N] [model1 model2 ...]``. ``N`` defaults to 120 and the
    model list to ``DEFAULT_MODELS``. The raw votes are written to ``OUT``
    before the report is printed.
    """
    n, models = _parse_args(sys.argv[1:])

    # corpus is a sequence of (operation, count) pairs; presumably already
    # ordered by descending frequency (or_common is not visible here).
    corpus = oc.corpus_by_freq()
    top = corpus[:n]
    sample = [op for op, _ in top]
    freq = {op: nr for op, nr in top}
    vol_total = sum(freq.values())
    corpus_total = sum(nr for _, nr in corpus)
    print(f"esantion: top {n} dupa frecventa = {vol_total} volum "
          f"({_pct(vol_total, corpus_total):.1f}% din total)\n", flush=True)

    groq = _load_groq_labels(GROQ_LABELS)

    results = {}
    t0 = time.time()
    for m in models:
        print(f"=== {m} ===", flush=True)
        codes, total_ms, errs = run_model(m, sample)
        results[m] = {"codes": codes, "total_ms": total_ms, "errs": errs}
        print(f" -> {total_ms/1000:.0f}s total\n", flush=True)

    # Majority vote + agreement level, carrying the frequency weight per row.
    rows = _tally_votes(sample, freq, models, results, groq)
    # Explicit UTF-8: ensure_ascii=False writes non-ASCII characters verbatim,
    # which fails under a non-UTF-8 default locale encoding.
    with open(OUT, "w", encoding="utf-8") as fh:
        json.dump({"models": models, "n": n, "rows": rows}, fh,
                  ensure_ascii=False, indent=1)

    _print_report(n, models, sample, freq, groq, results, rows, vol_total,
                  time.time() - t0)


if __name__ == "__main__":
    main()