"""Evaluate a DETERMINISTIC classifier (no AI at runtime) built from the Groq labels.

90/10 split: "train" on 90% (exact lookup + fuzzy nearest neighbour + Naive
Bayes over tokens) and test on the unseen 10%.  Coverage and accuracy are
measured per layer and globally, against the Groq labels (the reference).
"""
import json
import math
import os
import random
import re
import time
import unicodedata
from collections import Counter, defaultdict

from rapidfuzz import fuzz, process

OUT = "/tmp/claude-1000/-workspace-autopass/4177677c-7995-4fab-bbd5-16735cb335e3/scratchpad/labels.json"
random.seed(7)  # fixed seed so the shuffle (and therefore the split) is reproducible

# Compiled once: norm() runs for every train and test item.
_DISALLOWED = re.compile(r'[^A-Z0-9/ ]')
_WHITESPACE_RUN = re.compile(r'\s+')


def norm(s):
    """Return the canonical form of *s* used as lookup / matching key.

    Upper-cases, strips combining marks (NFD decomposition, category ``Mn``),
    replaces every character other than ``A-Z``, ``0-9``, ``/`` and space with
    a space, then collapses whitespace runs and trims.
    """
    s = ''.join(
        c for c in unicodedata.normalize('NFD', s.upper())
        if unicodedata.category(c) != 'Mn'
    )
    s = _DISALLOWED.sub(' ', s)
    return _WHITESPACE_RUN.sub(' ', s).strip()


def toks(s):
    """Return the tokens of ``norm(s)`` that are longer than one character."""
    return [t for t in norm(s).split() if len(t) > 1]


# The labels file maps each input string to its reference code.
with open(OUT, encoding="utf-8") as fh:
    labels = json.load(fh)

items = list(labels.items())
random.shuffle(items)
cut = int(len(items) * 0.9)
train = items[:cut]
test = items[cut:]
print(f"{len(items)} etichetate | train {len(train)} | test {len(test)}")
if not train or not test:
    # Fewer than 2 labels leaves one side of the split empty; everything below
    # (majority class, per-case averages) would fail on that.
    raise SystemExit("Prea putine etichete pentru split 90/10 (minim 2 necesare).")

train_norm = [norm(op) for op, _ in train]
train_code = [c for _, c in train]

# --- layer 1: exact lookup on the normalised string ---
# On conflicting labels for the same normalised key the LAST one wins.
exact = dict(zip(train_norm, train_code))

# --- layer 2: fuzzy nearest neighbour (rapidfuzz) ---
# NOTE(review): here the FIRST label wins on conflict, unlike `exact` above.
# The two tables share the same keys, so they only disagree on conflicting
# duplicates; confirm whether the difference is intended.
norm2code = {}
for n, c in zip(train_norm, train_code):
    norm2code.setdefault(n, c)
choices = list(norm2code.keys())
FUZZ_THR = 88  # minimum WRatio score for a fuzzy match to be accepted

# --- layer 3: Naive Bayes over tokens (learned from the labels) ---
classes = Counter(train_code)
prior = {c: math.log(n / len(train)) for c, n in classes.items()}
# Counter lookups return 0 for unseen tokens without inserting a new entry.
tok_cnt = defaultdict(Counter)
tok_tot = defaultdict(int)
vocab = set()
for op, c in train:
    for t in toks(op):
        tok_cnt[c][t] += 1
        tok_tot[c] += 1
        vocab.add(t)
V = len(vocab)


def nb(op):
    """Return the class with the highest Naive Bayes log-score for *op*.

    The score is the class log-prior plus, for every token of *op*, the
    add-one-smoothed log-likelihood of that token in the class.  Ties go to
    the class seen first in the training data.
    """
    tokens = toks(op)  # loop-invariant: tokenise once, not once per class
    best = None
    best_score = -1e18
    for c in classes:
        counts = tok_cnt[c]
        denom = tok_tot[c] + V
        score = prior[c]
        for t in tokens:
            score += math.log((counts[t] + 1) / denom)
        if score > best_score:
            best_score = score
            best = c
    return best


MAJ = classes.most_common(1)[0][0]  # majority class, used as last resort


def predict(op):
    """Classify *op*; return ``(code, layer)``.

    Layers are tried in order: ``"exact"`` (normalised lookup), ``"fuzzy"``
    (best WRatio match scoring at least ``FUZZ_THR``), ``"nb"`` (token Naive
    Bayes) and finally ``"default"`` (majority class).
    """
    n = norm(op)
    if n in exact:
        return exact[n], "exact"
    # score_cutoff makes rapidfuzz discard matches below the threshold itself,
    # so a non-None result already satisfies score >= FUZZ_THR.
    m = process.extractOne(n, choices, scorer=fuzz.WRatio, score_cutoff=FUZZ_THR)
    if m is not None:
        return norm2code[m[0]], "fuzzy"
    # With an empty vocabulary the smoothing denominator would be zero.
    if V and toks(op):
        return nb(op), "nb"
    return MAJ, "default"


# --- evaluation on the held-out 10% ---
t0 = time.perf_counter()  # monotonic, high-resolution clock for elapsed time
ok = Counter()   # correct predictions per layer
tot = Counter()  # predictions per layer
mis = []         # sample of misclassified cases (at most 25 kept)
for op, truth in test:
    pred, lyr = predict(op)
    tot[lyr] += 1
    if pred == truth:
        ok[lyr] += 1
    elif len(mis) < 25:
        mis.append((op, pred, truth, lyr))
dt = time.perf_counter() - t0

TOTAL = len(test)
OKALL = sum(ok.values())
print(f"\nPredictie {TOTAL} cazuri in {dt:.2f}s ({1000*dt/TOTAL:.2f} ms/op) - FARA AI")
print(f"\nACURATETE GLOBALA (vs Groq): {OKALL}/{TOTAL} = {100*OKALL/TOTAL:.1f}%")
print(f"\n{'strat':<8}{'aparitii':>9}{'corecte':>9}{'acuratete':>11}{'acoperire':>11}")
for lyr in ["exact", "fuzzy", "nb", "default"]:
    if tot[lyr]:
        print(f"{lyr:<8}{tot[lyr]:>9}{ok[lyr]:>9}{100*ok[lyr]/tot[lyr]:>10.1f}%{100*tot[lyr]/TOTAL:>10.1f}%")
print("\nExemple gresite (pred != Groq):")
for op, p, t, lyr in mis[:20]:
    # !s keeps the output identical for str/int codes and avoids a TypeError
    # if a code is None.
    print(f" [{lyr}] {op:<42} pred={p!s:<6} groq={t}")