Files
rar-autopass/tools/mapare-llm/or_label.py
Claude Agent 3fc53534e2 feat(5.15+5.14): CLOSE — fix-uri code-review + embeddings functional
5.15 (propagare design + dashboard editare) si 5.14 (mapare LLM distilata)
inchise dupa /code-review high. 8 buguri reparate TDD:

- HIGH modal nu se deschidea pe randul slim (base.html: trimitere-slim)
- HIGH /repune trunchia prestatii (declaratie incompleta la RAR) -> iterare
  peste existing, codes pozitional
- HIGH embeddings incarca model ~230MB degeaba pe corpus gol -> poarta has_corpus()
- HIGH picker chips gol pe re-render eroare -> conn/account_id pe toate ramurile
- MED obs re-derivat dupa stergere explicita -> _merge_override pastreaza obs=''
- MED mapare salvata fara denumire polua GOLD -> _record_gold_validation guard
- MED typo nome_prestatie -> nume_prestatie in select /repune
- MED bucketare timp +3h gresita iarna -> SQLite localtime + TZ=Europe/Bucharest

Embeddings WIRE-uit functional (PRD #15, decizie user): ensure_embeddings_corpus
construieste corpus din nomenclator, gated pe AUTOPASS_EMBEDDINGS_ENABLED (default
off). Marime model corectata ~50MB->~230MB (estimare PRD gresita).

Cleanup: hoist load_* din bucla bulk-fix; import re la top.
Regresie: 1256 passed, 1 deselected (live), 0 failed.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-28 20:48:34 +00:00

301 lines
12 KiB
Python

"""Etichetator batch offline OpenRouter (Layer 1) — L14-S1.
Clasifica denumirile de operatii service in cele 18 coduri RAR + NUL.
Cerinte implementate (PRD 5.14 / Decision Audit Trail):
1. Prioritizare pe FRECVENTA (desc): corpus_by_freq() din or_common
2. Grupare pe similaritate (rapidfuzz token_sort_ratio, threshold conservator
Eng-F7): LLM eticheteaza doar reprezentantul, codul se propaga la grup
3. Ensemble NVIDIA (super-120b + nano-9b, PRD #9): acord unanim -> high;
dezacord (orice divergenta) -> needs_mapping. Vot pe coduri, nu pe
self-confidence. ultra-550b EXCLUS (4-5x mai lent, zero castig)
4. Scrub PII (F3): integrat in or_common.call() (regex nr inmatriculare/VIN)
5. Resumabil: scrie *-partial.json incremental, reia de unde a ramas;
retry/backoff pe 429 gestionat de or_common.call()
6. Output: {denumire, cod, sursa, confidence, grup_rep}
NUL = ancora negativa + supresie, NU promovat la cod RAR (#4)
CLI: python3 tools/mapare-llm/or_label.py [N] [--out path] [--partial path]
[--threshold 85] [--batch 20] [--pace 4.0]
"""
import sys
import os
import json
import time
from collections import Counter
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
import or_common as oc
from rapidfuzz import fuzz
# NVIDIA models (PRD decision #9: keep super-120b + nano-9b; drop ultra-550b).
# Every model listed here casts one vote per representative in label_groups().
MODELS = [
    "nvidia/nemotron-3-super-120b-a12b:free",
    "nvidia/nemotron-nano-9b-v2:free",
]
DEFAULT_THRESHOLD = 85  # conservative similarity radius for grouping (Eng-F7)
DEFAULT_BATCH = 20  # names per LLM call (free-tier cap ~50 requests/day)
DEFAULT_N = 500  # top N names by frequency to process
DEFAULT_PACE = 4.0  # seconds between batches (OpenRouter free tier ~20 req/min)
# Directory containing this script; the default state/output files live next to it.
HERE = os.path.dirname(os.path.abspath(__file__))
PARTIAL_PATH = os.path.join(HERE, "or-labels-partial.json")  # resumable per-representative state
FINAL_PATH = os.path.join(HERE, "or-labels-final.json")  # fully expanded label list
def group_by_similarity(corpus, threshold=DEFAULT_THRESHOLD):
    """Cluster names greedily around representatives using fuzz.token_sort_ratio.

    The corpus is walked in the order given. The first entry that has not yet
    been absorbed by an earlier group becomes a representative; every later,
    still-free entry whose score against that representative is >= threshold
    joins its group. Similarity is only ever measured against the
    representative, so membership is not transitive.

    Args:
        corpus: iterable of (name, freq) pairs. The caller is expected to pass
            it sorted by descending frequency, so that the representative is
            the most frequent name of its group.
        threshold: minimum similarity score (0-100) for joining a group.

    Returns:
        A list of dicts {"rep": str, "freq": int, "members": [(name, freq), ...]}
        in the order the representatives were encountered.
    """
    taken = set()
    clusters = []
    for rep_pos, (rep_name, rep_freq) in enumerate(corpus):
        if rep_name in taken:
            continue
        followers = []
        for pos, (name, freq) in enumerate(corpus):
            # Candidates are entries after the representative that no group owns yet.
            is_candidate = pos > rep_pos and name not in taken
            if is_candidate and fuzz.token_sort_ratio(rep_name, name) >= threshold:
                followers.append((name, freq))
                taken.add(name)
        # The representative is marked only after the scan, so a later duplicate
        # of its own name is still collected as a member above.
        taken.add(rep_name)
        clusters.append({"rep": rep_name, "freq": rep_freq, "members": followers})
    return clusters
def ensemble_vote(votes):
    """Reduce the per-model votes for one name to a single ensemble verdict.

    Args:
        votes: dict {model_id: code}. The code "?" marks a parse failure and
            is never counted as a vote.

    Rules (N = number of entries in ``votes``):
        - all N models gave the same code and it is in oc.VALID
          -> (code, "high", "ensemble-unanim")
        - all N models answered "NUL"
          -> ("NUL", "high", "ensemble-unanim-nul")
        - anything else (no usable vote, any divergence, a partial parse
          failure, or a unanimous code outside oc.VALID)
          -> ("?", "needs_mapping", "ensemble-dezacord")

    Only the codes are compared; no model self-confidence is involved.
    "NUL" is reported through its own source tag and is never checked
    against oc.VALID.

    Returns:
        Tuple (final_code, confidence, source).
    """
    disagreement = ("?", "needs_mapping", "ensemble-dezacord")

    usable = [code for code in votes.values() if code != "?"]
    if not usable:
        return disagreement

    winner, winner_count = Counter(usable).most_common(1)[0]
    # Unanimity is measured against every model that voted, including those
    # that produced "?", so a partial parse failure can never reach "high".
    if winner_count != len(votes):
        return disagreement

    if winner == "NUL":
        return "NUL", "high", "ensemble-unanim-nul"
    if winner in oc.VALID:
        return winner, "high", "ensemble-unanim"
    # Unanimous, but not a code the nomenclature knows about.
    return disagreement
def load_partial(path):
    """Load previously saved partial results, if any.

    Args:
        path: location of the resumable JSON state file.

    Returns:
        The stored dict {rep -> {cod, confidence, sursa, votes}}, or {} when
        the file is missing, unreadable, not valid UTF-8, not valid JSON, or
        does not contain a JSON object at the top level.
    """
    try:
        # The context manager closes the handle even when parsing fails.
        with open(path, encoding="utf-8") as fh:
            data = json.load(fh)
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError (corrupt content).
        return {}
    # Callers index the result by representative, so anything but a dict is
    # treated like a corrupt file.
    return data if isinstance(data, dict) else {}
def save_partial(path, results):
    """Persist the partial results, replacing any previous file at ``path``.

    The JSON is first written to a sibling ``<path>.tmp`` file and then moved
    over ``path`` with os.replace. If serialisation fails or the process dies
    mid-write, the previous state file is left untouched instead of being
    truncated.

    Args:
        path: destination of the resumable JSON state file.
        results: dict {rep -> {cod, confidence, sursa, votes}}.

    Raises:
        OSError: if the temporary file cannot be written or moved into place.
        TypeError: if ``results`` contains values json cannot serialise.
    """
    tmp_path = f"{path}.tmp"
    # The context manager flushes and closes the handle before the rename.
    with open(tmp_path, "w", encoding="utf-8") as fh:
        json.dump(results, fh, ensure_ascii=False, indent=1)
    os.replace(tmp_path, path)
def label_groups(groups, partial, batch_size=DEFAULT_BATCH, pace=DEFAULT_PACE):
    """Label the group representatives with the model ensemble.

    Representatives already present in ``partial`` are skipped (this is what
    makes a rerun resumable). For the remaining ones, every model in MODELS is
    queried in batches through oc.call(); the per-model codes are then reduced
    with ensemble_vote() and stored in ``partial``.

    NOTE: ``partial`` is only updated after *all* models have voted on *all*
    pending representatives, and this function does not write anything to
    disk. If oc.call() raises part-way through, the votes collected so far
    are lost; persisting ``partial`` is left to the caller.

    Args:
        groups: list of {rep, freq, members} as produced by group_by_similarity().
        partial: dict {rep -> label} holding earlier results; modified in place.
        batch_size: number of names sent per LLM call; must be >= 1.
        pace: seconds to sleep between batches and between models
            (0 disables the pauses, useful in tests).

    Returns:
        The same ``partial`` dict, updated.

    Raises:
        ValueError: if there is work to do and ``batch_size`` is < 1.
    """
    todo = [g["rep"] for g in groups if g["rep"] not in partial]
    if not todo:
        print("toti reprezentantii sunt deja in partial, nimic de facut", flush=True)
        return partial
    if batch_size < 1:
        # A non-positive size would either divide by zero below or produce no
        # batches at all, which would record every representative as "?".
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    print(f"de etichetat: {len(todo)} reprezentanti "
          f"(skip {len(groups) - len(todo)} din partial)", flush=True)
    # Every model starts with a "?" (parse-fail) vote for every representative.
    # zip() below silently stops at the shorter sequence, so if a model returns
    # fewer codes than names, the unanswered names keep "?" rather than having
    # no entry. Without this, ensemble_vote() would see a single vote and
    # treat it as unanimous.
    votes_per_rep = {rep: {m: "?" for m in MODELS} for rep in todo}
    nb = (len(todo) + batch_size - 1) // batch_size  # number of batches, rounded up
    for mi, m in enumerate(MODELS):
        print(f" model: {m}", flush=True)
        for bi, k in enumerate(range(0, len(todo), batch_size)):
            batch = todo[k:k + batch_size]
            codes, meta = oc.call(m, batch)
            for rep, cod in zip(batch, codes):
                votes_per_rep[rep][m] = cod
            print(f" batch {bi+1}/{nb} {meta['ms']}ms err={meta['err']}", flush=True)
            if bi < nb - 1 and pace > 0:
                time.sleep(pace)
        if pace > 0 and mi < len(MODELS) - 1:
            time.sleep(pace)  # pause between different models
    # Ensemble verdict per representative, written into partial.
    for rep in todo:
        cod, confidence, sursa = ensemble_vote(votes_per_rep[rep])
        partial[rep] = {
            "cod": cod,
            "confidence": confidence,
            "sursa": sursa,
            "votes": votes_per_rep[rep],
        }
    return partial
def expand_to_all(groups, partial):
    """Spread each representative's label over the members of its group.

    For every group one row is emitted for the representative, carrying the
    source stored in ``partial``, followed by one row per member carrying the
    same code and confidence with source "propagat". A representative missing
    from ``partial`` falls back to code "?", confidence "needs_mapping" and
    source "ensemble-dezacord". Codes are copied verbatim, so "NUL" stays
    "NUL" for the members as well.

    Args:
        groups: list of {rep, freq, members} dicts.
        partial: dict {rep -> {cod, confidence, sursa, ...}}.

    Returns:
        A list of dicts {denumire, cod, sursa, confidence, grup_rep}, group by
        group, representative first.
    """
    def _row(name, code, source, conf, leader):
        # Single place that fixes the key set and key order of an output row.
        return {
            "denumire": name,
            "cod": code,
            "sursa": source,
            "confidence": conf,
            "grup_rep": leader,
        }

    rows = []
    for group in groups:
        leader = group["rep"]
        verdict = partial.get(leader, {})
        code = verdict.get("cod", "?")
        conf = verdict.get("confidence", "needs_mapping")
        source = verdict.get("sursa", "ensemble-dezacord")
        rows.append(_row(leader, code, source, conf, leader))
        rows.extend(
            _row(member, code, "propagat", conf, leader)
            for member, _ in group["members"]
        )
    return rows
def run(n=DEFAULT_N, output_path=FINAL_PATH, partial_path=PARTIAL_PATH,
        threshold=DEFAULT_THRESHOLD, batch_size=DEFAULT_BATCH, pace=DEFAULT_PACE):
    """Entry point: read the corpus, group, label, expand and save.

    Steps: take the first ``n`` entries of oc.corpus_by_freq(), group them
    with group_by_similarity(), load the earlier state from ``partial_path``,
    label the still-missing representatives with label_groups(), save the
    state back, expand the labels to every member and write the final list.

    Resumable: representatives already present in the partial file are not
    sent to the models again. The partial file is written once, after
    label_groups() has returned.

    Args:
        n: number of top names (by frequency) to process.
        output_path: JSON file receiving the full label list.
        partial_path: JSON file holding the per-representative state.
        threshold: similarity radius for grouping (0-100).
        batch_size: names per LLM call.
        pace: seconds between batches.

    Returns:
        The list of result rows, identical to what is written to ``output_path``.
    """
    corpus = oc.corpus_by_freq()
    top = corpus[:n]
    # "or 1" keeps the percentage below well-defined for an empty corpus.
    vol_total = sum(nr for _, nr in corpus) or 1
    vol_top = sum(nr for _, nr in top)
    print(f"corpus: {len(corpus)} denumiri distincte, volum total {vol_total}")
    print(f"top {n} dupa frecventa: volum {vol_top} ({100*vol_top/vol_total:.1f}%)")
    groups = group_by_similarity(top, threshold)
    n_reps = len(groups)
    n_mems = sum(len(g["members"]) for g in groups)
    print(f"dupa grupare: {n_reps} reprezentanti, {n_mems} membri propagati din {n}")
    partial = load_partial(partial_path)
    print(f"partial incarcat: {len(partial)} reprezentanti deja etichetati")
    partial = label_groups(groups, partial, batch_size, pace)
    save_partial(partial_path, partial)
    print(f"partial salvat: {partial_path}")
    results = expand_to_all(groups, partial)
    # Context manager so the output is flushed and closed before "salvat" is reported.
    with open(output_path, "w", encoding="utf-8") as fh:
        json.dump(results, fh, ensure_ascii=False, indent=1)
    # Summary report
    nul_cnt = sum(1 for r in results if r["cod"] == "NUL")
    high_cnt = sum(1 for r in results if r["confidence"] == "high")
    needs_cnt = sum(1 for r in results if r["confidence"] == "needs_mapping")
    prop_cnt = sum(1 for r in results if r["sursa"] == "propagat")
    print(f"\nREZULTAT: {len(results)} denumiri in output")
    print(f" NUL (gunoi, ancore negative): {nul_cnt}")
    print(f" confidence high (unanim): {high_cnt}")
    print(f" needs_mapping (dezacord): {needs_cnt}")
    print(f" propagate din grup: {prop_cnt}")
    print(f"salvat: {output_path}")
    return results
if __name__ == "__main__":
    # Command-line front end: every option maps one-to-one onto a run() parameter.
    import argparse

    parser = argparse.ArgumentParser(
        description="Etichetator batch offline OpenRouter (L14-S1)",
    )
    parser.add_argument(
        "n", nargs="?", type=int, default=DEFAULT_N,
        help=f"top N denumiri dupa frecventa (default {DEFAULT_N})",
    )
    parser.add_argument(
        "--out", default=FINAL_PATH, metavar="PATH",
        help="fisier output final JSON (default: or-labels-final.json)",
    )
    parser.add_argument(
        "--partial", default=PARTIAL_PATH, metavar="PATH",
        help="fisier partial resumabil JSON (default: or-labels-partial.json)",
    )
    parser.add_argument(
        "--threshold", type=int, default=DEFAULT_THRESHOLD,
        help=f"raza similaritate grupare 0-100 (default {DEFAULT_THRESHOLD})",
    )
    parser.add_argument(
        "--batch", type=int, default=DEFAULT_BATCH,
        help=f"denumiri per apel LLM (default {DEFAULT_BATCH})",
    )
    parser.add_argument(
        "--pace", type=float, default=DEFAULT_PACE,
        help=f"sec intre batch-uri (default {DEFAULT_PACE})",
    )
    args = parser.parse_args()
    run(
        n=args.n,
        output_path=args.out,
        partial_path=args.partial,
        threshold=args.threshold,
        batch_size=args.batch,
        pace=args.pace,
    )