feat(mapare-llm): pivot PRD 5.14 + tooling etichetare OpenRouter
PRD 5.14 rescris cu pivotul arhitectural: LLM doar etichetator OFFLINE, runtime = clasificator local fara API (fuzzy + embeddings), baza de cunostinte GOLD partajata cross-account (validarea unui service ajuta toate). Decizia 8 (corpus per-cont) SUPERSEDED. Tooling nou OpenRouter (free, familia NVIDIA Nemotron): or_common.py (client + corpus pe frecventa, cheie din .env) + or_modeltest.py (comparatie modele, acord ensemble vs Groq). Masurat: super-120b + nano-9b fiabile, 3/3 unanim pe 87% volum; ultra-550b aruncat. Corpus real (4 CSV service, coloana NR=frecventa) + etichete Groq bootstrap incluse ca date de masurare. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
112
tools/mapare-llm/or_common.py
Normal file
112
tools/mapare-llm/or_common.py
Normal file
@@ -0,0 +1,112 @@
|
||||
"""Comun pentru etichetarea operatii->coduri RAR prin OpenRouter (modele FREE).
|
||||
|
||||
Difera de tooling-ul Groq (label_common.py / f7_ensemble.py) prin:
|
||||
- endpoint OpenRouter (OpenAI-compatibil), cheie din .env (nu env exportat);
|
||||
- corpus ordonat pe FRECVENTA (suma NR per denumire distincta), nu alfabetic.
|
||||
|
||||
Refoloseste IDENTIC: cele 18 coduri RAR, promptul de sistem si scrub-ul PII (F3).
|
||||
"""
|
||||
import json, urllib.request, urllib.error, time, os, csv, glob, re
|
||||
from collections import Counter
|
||||
|
||||
# --- cheie din .env (gitignored). Tool-urile Groq citeau os.environ; aici din fisier,
|
||||
# fiindca shell-ul non-interactiv nu pastreaza export-urile intre apeluri. ---
|
||||
def _load_key(env_path=None):
    """Return the OpenRouter API key.

    Lookup order:
      1. the ``OPENROUTER_KEY`` environment variable, when set and non-empty;
      2. the first non-empty ``OPENROUTER_KEY=...`` line of the ``.env`` file.

    Args:
        env_path: optional path of the ``.env`` file. Defaults to the file
            two directories above this module.

    Returns:
        The key as a string, with one pair of matching surrounding quotes
        removed if present.

    Raises:
        RuntimeError: the key is in neither the environment nor the file
            (including the case where the file cannot be opened).
    """
    from_env = os.environ.get("OPENROUTER_KEY")
    if from_env:
        return from_env

    if env_path is None:
        env_path = os.path.join(os.path.dirname(__file__), "..", "..", ".env")

    prefix = "OPENROUTER_KEY="
    try:
        handle = open(env_path, encoding="utf-8", errors="replace")
    except OSError as err:
        # An unreadable .env is reported the same way as a missing key, so
        # callers only have one exception type to deal with.
        raise RuntimeError("OPENROUTER_KEY lipseste din .env si din mediu") from err

    with handle:
        for raw in handle:
            line = raw.strip()
            if not line.startswith(prefix):
                continue
            value = line[len(prefix):].strip()
            # .env files commonly quote values; the quotes are not part of the key.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1]
            if value:
                return value

    raise RuntimeError("OPENROUTER_KEY lipseste din .env si din mediu")
|
||||
|
||||
# API key, resolved once when the module is imported by calling _load_key().
KEY = _load_key()
|
||||
# OpenRouter chat-completions endpoint that call() sends its requests to.
URL = "https://openrouter.ai/api/v1/chat/completions"
|
||||
|
||||
# --- nomenclator + prompt: identic cu label_common.py / f7_ensemble.py (sursa de adevar) ---
|
||||
# Code nomenclature embedded in the system prompt: "CODE=LABEL" entries
# joined by ", " into one string.
CODURI = ", ".join((
    "OE-1=REPARATIE",
    "OE-2=INTRETINERE",
    "OE-3=REVIZIE PERIODICA",
    "OE-4=REGLARE FUNCTIONALA",
    "OE-5=MODIFICARE CONSTRUCTIVA",
    "OE-6=RECONSTRUCTIE",
    "OE-7=ACTUALIZARE SOFTWARE",
    "OE-8=INLOCUIRE SEZONIERA ANVELOPE",
    "OE-D=AVARIE GRAVA DIRECTIE",
    "OE-F=AVARIE GRAVA FRANARE",
    "OE-C=AVARIE GRAVA CAROSERIE",
    "OE-S=AVARIE GRAVA SASIU",
    "OE-R=AVARIE GRAVA RETINERE/AIRBAG",
    "OE-A=AVARIE GRAVA ADAS",
    "OE-I=ISTORIC ODOMETRU",
    "AITLV=ATELIER TAHOGRAFE",
    "R-ODO=REPARATIE ODOMETRU",
    "I-ODO=INLOCUIRE ODOMETRU",
    "NUL=NU e operatie de service",
))
|
||||
# System prompt sent with every request: role statement, the code list from
# CODURI, the classification rules, and the required JSON answer shape.
SYS = "".join([
    "Esti expert RAR AUTOPASS. Clasifici fiecare operatie de service-auto in EXACT unul din coduri:\n",
    CODURI,
    "\nReguli: AVARIILE GRAVE (OE-D/F/C/S/R/A) DOAR pentru daune in urma unui accident, NU reparatii curente. ",
    "Vopsire/revopsire/retus = REPARATIE (OE-1). Inlocuire/D-R/reparare piese = REPARATIE (OE-1). ",
    "Schimb ulei motor + filtre = REVIZIE (OE-3). Aerisit/gresat/completat nivele = INTRETINERE (OE-2). ",
    "Text care nu e operatie efectiva (ITP, plata, discount, manopera generica, nr inmatriculare, doar nume piesa) -> NUL. ",
    'Raspunde DOAR JSON {"rez":[{"i":<numar>,"cod":"..."}]}.',
])
|
||||
|
||||
# --- F3: scrub PII inainte de a trimite la LLM ---
|
||||
# Plate-like token: 1-2 capital letters, 2-3 digits, 3 capital letters, with
# at most one whitespace character between the groups.
PLATE = re.compile(r'\b[A-Z]{1,2}\s?\d{2,3}\s?[A-Z]{3}\b')
# VIN-like token: exactly 17 digits/capital letters, excluding I, O and Q.
VIN = re.compile(r'\b[A-HJ-NPR-Z0-9]{17}\b')


def scrub(s):
    """Mask identifiers in *s* before the text is sent to the LLM.

    Plate-like tokens are replaced with ``[NR]`` first, then VIN-like tokens
    with ``[VIN]``. Matching is case-sensitive. Returns the masked string.
    """
    without_plates = PLATE.sub('[NR]', s)
    return VIN.sub('[VIN]', without_plates)
|
||||
|
||||
# Set of code identifiers: the text before "=" in each comma-separated entry of CODURI.
VALID = {c.split("=")[0] for c in CODURI.replace(", ", ",").split(",")}
|
||||
|
||||
|
||||
def _retry_wait(headers, attempt):
    """Return how many seconds to pause before retrying a retryable HTTP error.

    A positive, finite numeric ``Retry-After`` header is honoured. In every
    other case (header absent, zero, or not a plain number, e.g. the
    HTTP-date form) the result is exponential backoff capped at 30 seconds.
    """
    raw = headers.get("retry-after") if headers is not None else None
    try:
        hinted = float(raw) if raw is not None else 0.0
    except (TypeError, ValueError):
        # Retry-After may legally be an HTTP-date; fall back to backoff
        # instead of letting the conversion error escape the caller.
        hinted = 0.0
    if 0 < hinted < float("inf"):
        return hinted
    return min(2 ** attempt, 30)


def call(model, batch, timeout=180, max_attempts=6):
    """Send one batch to OpenRouter and return ``(codes, meta)``.

    Args:
        model: model identifier placed in the request body.
        batch: sequence of operation texts; each is passed through scrub()
            and numbered from 1 in the user message.
        timeout: per-request timeout in seconds passed to urlopen.
        max_attempts: maximum number of requests made for this batch.

    Returns:
        codes: list parallel to *batch*; "?" at positions the response did
            not cover, or everywhere when the call failed.
        meta: dict with "ms" (elapsed wall time in milliseconds, including
            retry pauses) and "err" (None on success, otherwise "HTTP <code>",
            an exception class name, or "max_attempts").
    """
    n = len(batch)
    msgs = [{"role": "system", "content": SYS},
            {"role": "user",
             "content": "\n".join(f"{i+1}. {scrub(o)}" for i, o in enumerate(batch))}]
    body = {"model": model, "messages": msgs, "temperature": 0,
            "response_format": {"type": "json_object"}}
    data = json.dumps(body).encode()
    headers = {
        "Authorization": f"Bearer {KEY}", "Content-Type": "application/json",
        "User-Agent": "Mozilla/5.0",  # WAF: Python-urllib -> 403
        "HTTP-Referer": "https://gitea.romfast.ro/romfast/autopass",
        "X-Title": "autopass-mapare-llm"}
    t0 = time.time()

    def outcome(codes, err):
        return codes, {"ms": int((time.time() - t0) * 1000), "err": err}

    for attempt in range(max_attempts):
        is_last = attempt == max_attempts - 1
        req = urllib.request.Request(URL, data=data, headers=headers)
        try:
            with urllib.request.urlopen(req, timeout=timeout) as r:
                d = json.load(r)
            content = d["choices"][0]["message"]["content"]
            out = json.loads(content)["rez"]
            # Some models return the index as a string ("1"); normalise to
            # int so the positional lookup below still matches.
            m = {int(x["i"]): x["cod"] for x in out}
            return outcome([m.get(i + 1, "?") for i in range(n)], None)
        except urllib.error.HTTPError as e:
            if e.code not in (429, 500, 502, 503):
                return outcome(["?"] * n, f"HTTP {e.code}")
            # Retryable status: pause only if another attempt will follow.
            if not is_last:
                time.sleep(_retry_wait(e.headers, attempt))
        except Exception as e:
            # Deliberately broad: network failures and malformed/unparseable
            # responses are both retried, then reported by class name.
            if is_last:
                return outcome(["?"] * n, type(e).__name__)
            time.sleep(min(2 ** attempt, 20))
    return outcome(["?"] * n, "max_attempts")
|
||||
|
||||
|
||||
def corpus_by_freq(csv_dir=None):
    """Return every distinct operation name with its total frequency, descending.

    Reads all ``*.csv`` files (``;``-delimited, first row skipped as header)
    in *csv_dir*, in sorted filename order. For each row with more than two
    columns and a non-blank second column, the stripped second column is the
    name and the third column (NR) is added to its total. A blank or
    non-integer NR contributes 0, but the name is still recorded.

    Args:
        csv_dir: directory holding the CSV files. Defaults to
            ``docs/operatii-service`` two directories above this module.

    Returns:
        List of ``(name, total_nr)`` tuples ordered by total, highest first.
    """
    if csv_dir is None:
        csv_dir = os.path.join(os.path.dirname(__file__), "..", "..",
                               "docs", "operatii-service")
    freq = Counter()
    for path in sorted(glob.glob(os.path.join(csv_dir, "*.csv"))):
        # newline="" is what the csv module requires for correct handling of
        # quoted fields; the context manager closes each file promptly.
        with open(path, encoding="utf-8", errors="replace", newline="") as fh:
            reader = csv.reader(fh, delimiter=";")
            next(reader, None)  # skip the header row
            for row in reader:
                if len(row) <= 2:
                    continue
                name = row[1].strip()
                if not name:
                    continue
                try:
                    count = int(row[2].strip() or 0)
                except ValueError:
                    count = 0
                # Adding 0 still registers the name in the counter.
                freq[name] += count
    return freq.most_common()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Print a corpus summary: number of distinct names, total volume, the
    # share of volume covered by the top-N names, and the 15 most frequent.
    c = corpus_by_freq()
    tot = sum(n for _, n in c)
    print(f"distincte: {len(c)} volum total (suma NR): {tot}")
    for cut in (100, 500, 1000):
        cov = sum(n for _, n in c[:cut])
        # An empty corpus (or all-zero NR) would otherwise divide by zero.
        pct = 100 * cov / tot if tot else 0.0
        print(f" top {cut}: {pct:.1f}% din volum")
    print("--- top 15 dupa frecventa ---")
    for op, n in c[:15]:
        print(f" {n:>6} {op}")
|
||||
Reference in New Issue
Block a user