feat(mapare-llm): pivot PRD 5.14 + tooling etichetare OpenRouter
PRD 5.14 rescris cu pivotul arhitectural: LLM doar etichetator OFFLINE, runtime = clasificator local fara API (fuzzy + embeddings), baza de cunostinte GOLD partajata cross-account (validarea unui service ajuta toate). Decizia 8 (corpus per-cont) SUPERSEDED. Tooling nou OpenRouter (free, familia NVIDIA Nemotron): or_common.py (client + corpus pe frecventa, cheie din .env) + or_modeltest.py (comparatie modele, acord ensemble vs Groq). Masurat: super-120b + nano-9b fiabile, 3/3 unanim pe 87% volum; ultra-550b aruncat. Corpus real (4 CSV service, coloana NR=frecventa) + etichete Groq bootstrap incluse ca date de masurare. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
61
tools/mapare-llm/label_common.py
Normal file
61
tools/mapare-llm/label_common.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import json, urllib.request, urllib.error, time, os, csv, glob, re
|
||||
from collections import Counter
|
||||
|
||||
# Groq API settings. The key is read from the environment; a missing
# GROQ_KEY aborts the script right here with a KeyError.
KEY = os.environ["GROQ_KEY"]
MODEL = "llama-3.3-70b-versatile"
BATCH = 40  # number of operations sent in one classify() request

# JSON file holding the {operation text: code} labels; it is loaded at start
# and rewritten after every batch.
OUT = "/tmp/claude-1000/-workspace-autopass/4177677c-7995-4fab-bbd5-16735cb335e3/scratchpad/labels.json"
|
||||
|
||||
# (code, meaning) pairs offered to the model, in the order they appear in the prompt.
_CODE_TABLE = (
    ("OE-1", "REPARATIE"),
    ("OE-2", "INTRETINERE"),
    ("OE-3", "REVIZIE PERIODICA"),
    ("OE-4", "REGLARE FUNCTIONALA"),
    ("OE-5", "MODIFICARE CONSTRUCTIVA"),
    ("OE-6", "RECONSTRUCTIE"),
    ("OE-7", "ACTUALIZARE SOFTWARE"),
    ("OE-8", "INLOCUIRE SEZONIERA ANVELOPE"),
    ("OE-D", "AVARIE GRAVA DIRECTIE"),
    ("OE-F", "AVARIE GRAVA FRANARE"),
    ("OE-C", "AVARIE GRAVA CAROSERIE"),
    ("OE-S", "AVARIE GRAVA SASIU"),
    ("OE-R", "AVARIE GRAVA RETINERE/AIRBAG"),
    ("OE-A", "AVARIE GRAVA ADAS"),
    ("OE-I", "ISTORIC ODOMETRU"),
    ("AITLV", "ATELIER TAHOGRAFE"),
    ("R-ODO", "REPARATIE ODOMETRU"),
    ("I-ODO", "INLOCUIRE ODOMETRU"),
    ("NUL", "NU e operatie de service"),
)

# Comma-separated "CODE=MEANING" list embedded in the system prompt.
CODURI = ", ".join(f"{code}={meaning}" for code, meaning in _CODE_TABLE)

# System prompt: role line, the code list, the classification rules and the
# required JSON answer shape ({"rez": [{"i": <number>, "cod": "..."}]}).
SYS = "".join((
    "Esti expert RAR AUTOPASS. Clasifici fiecare operatie de service-auto in EXACT unul din coduri:\n",
    CODURI,
    "\nReguli: AVARIILE GRAVE DOAR pentru daune in urma unui accident, NU reparatii curente. ",
    "Vopsire/revopsire/retus = REPARATIE (OE-1). Inlocuire/D-R/reparare piese = REPARATIE (OE-1). ",
    "Schimb ulei motor + filtre = REVIZIE (OE-3). Aerisit/gresat/completat nivele = INTRETINERE (OE-2). ",
    "Text care nu e operatie efectiva (ITP, plata, discount, manopera generica, nr inmatriculare, doar nume piesa) -> NUL. ",
    "Raspunde DOAR JSON {\"rez\":[{\"i\":<numar>,\"cod\":\"...\"}]}.",
))
|
||||
# Plate-like token: 1-2 uppercase letters, 2-3 digits, 3 uppercase letters,
# each group optionally separated by one whitespace character.
PLATE = re.compile(r'\b[A-Z]{1,2}\s?\d{2,3}\s?[A-Z]{3}\b')
# VIN-like token: exactly 17 digits/uppercase letters, excluding I, O and Q.
VIN = re.compile(r'\b[A-HJ-NPR-Z0-9]{17}\b')
|
||||
def scrub(s):
    """Return *s* with PLATE matches replaced by '[NR]', then VIN matches by '[VIN]'."""
    without_plates = PLATE.sub('[NR]', s)
    return VIN.sub('[VIN]', without_plates)
|
||||
|
||||
def _retry_delay(headers, attempt, cap=30):
    """Return how many seconds to wait after failed attempt number *attempt* (0-based).

    A positive, finite numeric ``Retry-After`` header wins; anything else
    (missing, zero, negative, HTTP-date, garbage) falls back to exponential
    backoff capped at *cap* seconds.
    """
    raw = headers.get("retry-after") if headers is not None else None
    try:
        hinted = float(raw) if raw is not None else 0.0
    except (TypeError, ValueError):
        # Retry-After may legally be an HTTP-date; do not crash on it.
        hinted = 0.0
    if 0 < hinted < float("inf"):
        return hinted
    return min(2 ** attempt, cap)


def _parse_labels(content, size):
    """Turn the model's JSON answer into a list of *size* codes ('?' where missing)."""
    # int() tolerates models that echo the index as a string ("1") or float (1.0).
    by_index = {int(row["i"]): row["cod"] for row in json.loads(content)["rez"]}
    return [by_index.get(pos, "?") for pos in range(1, size + 1)]


def classify(batch):
    """Classify a list of operation texts with the Groq chat-completions API.

    Each text is passed through scrub() and sent as a numbered line (1-based)
    under the SYS system prompt. The request is tried up to 8 times.

    Args:
        batch: list of operation strings.

    Returns:
        A list with one code per input, in input order. Entries the model did
        not answer are "?"; if every attempt fails the whole list is "?".
        An empty *batch* returns [] without contacting the API.

    Raises:
        urllib.error.HTTPError: for HTTP statuses other than 429/500/502/503.
    """
    if not batch:
        return []

    numbered = "\n".join(f"{i}. {scrub(op)}" for i, op in enumerate(batch, 1))
    msgs = [{"role": "system", "content": SYS}, {"role": "user", "content": numbered}]
    body = {"model": MODEL, "messages": msgs, "temperature": 0,
            "response_format": {"type": "json_object"}}
    data = json.dumps(body).encode()

    attempts = 8
    for attempt in range(attempts):
        req = urllib.request.Request(
            "https://api.groq.com/openai/v1/chat/completions", data=data,
            headers={"Authorization": f"Bearer {KEY}", "Content-Type": "application/json",
                     "User-Agent": "Mozilla/5.0"})
        try:
            with urllib.request.urlopen(req, timeout=180) as resp:
                reply = json.load(resp)
            return _parse_labels(reply["choices"][0]["message"]["content"], len(batch))
        except urllib.error.HTTPError as e:
            # Only rate limiting and transient server errors are worth retrying.
            if e.code not in (429, 500, 502, 503):
                raise
            delay = _retry_delay(e.headers, attempt)
        except Exception as err:
            # Best-effort retry for network failures and malformed answers,
            # but leave a trace so repeated failures can be diagnosed.
            print(f"  classify: attempt {attempt + 1}/{attempts} failed: {err!r}", flush=True)
            delay = min(2 ** attempt, 20)
        # No point in waiting when there is no further attempt.
        if attempt + 1 < attempts:
            time.sleep(delay)
    return ["?"] * len(batch)
|
||||
|
||||
# Count, for every operation text, in how many service CSV files it appears
# (each file contributes at most 1 per distinct text).
files = sorted(glob.glob("/workspace/autopass/docs/operatii-service/*.csv"))
presence = Counter()
for f in files:
    with open(f, encoding="utf-8", errors="replace") as fh:
        rows = csv.reader(fh, delimiter=";")
        next(rows, None)  # skip the header row
        # The operation text is the second column; blank cells are ignored.
        seen = {r[1].strip() for r in rows if len(r) > 1 and r[1].strip()}
    presence.update(seen)

# Labels gathered so far; the file must already exist.
with open(OUT, encoding="utf-8") as fh:
    labels = json.load(fh)

# Operations present in at least two files that still need a label. Entries
# stored as "?" (a failed or unanswered earlier attempt) are retried instead
# of being treated as done.
todo = sorted(op for op, c in presence.items() if c >= 2 and labels.get(op, "?") == "?")
print(f"comune (>=2 service) de etichetat: {len(todo)} (peste {len(labels)} deja)",flush=True)

t0 = time.time()
nb = (len(todo) + BATCH - 1) // BATCH
for bi, k in enumerate(range(0, len(todo), BATCH)):
    b = todo[k:k + BATCH]
    labels.update(zip(b, classify(b)))
    # Checkpoint after every batch. Write to a sibling file and swap it in so
    # an interruption never leaves a truncated labels file behind.
    tmp = OUT + ".tmp"
    with open(tmp, "w", encoding="utf-8") as fh:
        json.dump(labels, fh, ensure_ascii=False)
    os.replace(tmp, OUT)
    print(f" batch {bi+1}/{nb} -> total {len(labels)} ({time.time()-t0:.0f}s)",flush=True)
    # Pause between requests; not needed after the final batch.
    if bi + 1 < nb:
        time.sleep(4)
print(f"GATA comune: {len(labels)} etichete totale ({time.time()-t0:.0f}s)",flush=True)
|
||||
Reference in New Issue
Block a user