5.15 (propagare design + dashboard editare) si 5.14 (mapare LLM distilata) inchise dupa /code-review high. 8 buguri reparate TDD: - HIGH modal nu se deschidea pe randul slim (base.html: trimitere-slim) - HIGH /repune trunchia prestatii (declaratie incompleta la RAR) -> iterare peste existing, codes pozitional - HIGH embeddings incarca model ~230MB degeaba pe corpus gol -> poarta has_corpus() - HIGH picker chips gol pe re-render eroare -> conn/account_id pe toate ramurile - MED obs re-derivat dupa stergere explicita -> _merge_override pastreaza obs='' - MED mapare salvata fara denumire poluă GOLD -> _record_gold_validation guard - MED typo nome_prestatie -> nume_prestatie in select /repune - MED bucketare timp +3h gresita iarna -> SQLite localtime + TZ=Europe/Bucharest Embeddings WIRE-uit functional (PRD #15, decizie user): ensure_embeddings_corpus construieste corpus din nomenclator, gated pe AUTOPASS_EMBEDDINGS_ENABLED (default off). Marime model corectata ~50MB->~230MB (estimare PRD gresita). Cleanup: hoist load_* din bucla bulk-fix; import re la top. Regresie: 1256 passed, 1 deselected (live), 0 failed. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
301 lines
12 KiB
Python
301 lines
12 KiB
Python
"""Etichetator batch offline OpenRouter (Layer 1) — L14-S1.
|
|
|
|
Clasifica denumirile de operatii service in cele 18 coduri RAR + NUL.
|
|
|
|
Cerinte implementate (PRD 5.14 / Decision Audit Trail):
|
|
1. Prioritizare pe FRECVENTA (desc): corpus_by_freq() din or_common
|
|
2. Grupare pe similaritate (rapidfuzz token_sort_ratio, threshold conservator
|
|
Eng-F7): LLM eticheteaza doar reprezentantul, codul se propaga la grup
|
|
3. Ensemble NVIDIA (super-120b + nano-9b, PRD #9): acord unanim -> high;
|
|
dezacord (orice divergenta) -> needs_mapping. Vot pe coduri, nu pe
|
|
self-confidence. ultra-550b EXCLUS (4-5x mai lent, zero castig)
|
|
4. Scrub PII (F3): integrat in or_common.call() (regex nr inmatriculare/VIN)
|
|
5. Resumabil: scrie *-partial.json incremental, reia de unde a ramas;
|
|
retry/backoff pe 429 gestionat de or_common.call()
|
|
6. Output: {denumire, cod, sursa, confidence, grup_rep}
|
|
NUL = ancore negativa + supresie, NU promovat la cod RAR (#4)
|
|
|
|
CLI: python3 tools/mapare-llm/or_label.py [N] [--out path] [--partial path]
|
|
[--threshold 85] [--batch 20] [--pace 4.0]
|
|
"""
|
|
import sys
|
|
import os
|
|
import json
|
|
import time
|
|
from collections import Counter
|
|
|
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
import or_common as oc
|
|
from rapidfuzz import fuzz
|
|
|
|
# NVIDIA ensemble members (PRD decision #9: keep super-120b and nano-9b,
# drop ultra-550b).
MODELS = [
    "nvidia/nemotron-3-super-120b-a12b:free",
    "nvidia/nemotron-nano-9b-v2:free",
]

# Conservative similarity radius used when grouping (Eng-F7).
DEFAULT_THRESHOLD = 85
# Denominations sent per LLM call (free-tier cap is about 50 requests/day).
DEFAULT_BATCH = 20
# How many of the most frequent denominations get processed.
DEFAULT_N = 500
# Seconds to wait between batches (OpenRouter free tier: about 20 req/min).
DEFAULT_PACE = 4.0

# Default state and output files live next to this script.
HERE = os.path.dirname(os.path.abspath(__file__))
PARTIAL_PATH = os.path.join(HERE, "or-labels-partial.json")
FINAL_PATH = os.path.join(HERE, "or-labels-final.json")
|
|
|
|
|
|
def group_by_similarity(corpus, threshold=DEFAULT_THRESHOLD):
    """Cluster denominations around a representative using fuzzy matching.

    Greedy single pass: the first entry that has not been claimed yet becomes
    a representative, and every later unclaimed entry whose
    ``fuzz.token_sort_ratio`` against that representative is at least
    ``threshold`` joins its cluster. Candidates are only ever compared with
    the representative, never with other members, so grouping is deliberately
    non-transitive.

    Args:
        corpus: list of ``(denomination, frequency)`` pairs. Callers are
            expected to pass it sorted by descending frequency, which makes
            each representative the most frequent entry of its cluster.
        threshold: minimum similarity score (0-100) required to join a
            cluster.

    Returns:
        A list of ``{"rep": str, "freq": int, "members": [(den, freq), ...]}``
        dicts, in order of first appearance of each representative.
    """
    taken = set()
    clusters = []

    for pos, (leader, leader_freq) in enumerate(corpus):
        if leader in taken:
            continue

        followers = []
        # Only entries after the leader can still join: earlier ones are
        # either representatives themselves or already claimed.
        for candidate, candidate_freq in corpus[pos + 1:]:
            if candidate in taken:
                continue
            if fuzz.token_sort_ratio(leader, candidate) >= threshold:
                followers.append((candidate, candidate_freq))
                taken.add(candidate)

        taken.add(leader)
        clusters.append(
            {"rep": leader, "freq": leader_freq, "members": followers}
        )

    return clusters
|
|
|
|
|
|
def ensemble_vote(votes):
    """Reduce the per-model votes for one denomination to a single verdict.

    The decision is made on the codes themselves, never on a model's
    self-reported confidence. A verdict is only "high" when every model in
    ``votes`` produced the same usable answer; ``"?"`` marks a parse failure
    and therefore always breaks unanimity.

    Args:
        votes: dict ``{model_id: code}``.

    Returns:
        A ``(code, confidence, source)`` tuple:
          * all models agree on ``"NUL"`` ->
            ``("NUL", "high", "ensemble-unanim-nul")`` (NUL is kept apart
            because it is a negative anchor, not a RAR code);
          * all models agree on a code found in ``oc.VALID`` ->
            ``(code, "high", "ensemble-unanim")``;
          * anything else (no usable vote, divergence, partial parse failure,
            unanimous but unknown code) ->
            ``("?", "needs_mapping", "ensemble-dezacord")``.
    """
    disagreement = ("?", "needs_mapping", "ensemble-dezacord")

    usable = [code for code in votes.values() if code != "?"]
    if not usable:
        return disagreement

    winner, winner_count = Counter(usable).most_common(1)[0]

    # Unanimity means the winning code was cast by every model, including
    # the ones whose answer could not be parsed.
    if winner_count != len(votes):
        return disagreement

    if winner == "NUL":
        return "NUL", "high", "ensemble-unanim-nul"
    if winner in oc.VALID:
        return winner, "high", "ensemble-unanim"

    # Unanimous, but the code is not part of the RAR nomenclature.
    return disagreement
|
|
|
|
|
|
def load_partial(path):
    """Load previously saved partial results, if there are any.

    Args:
        path: location of the partial-results JSON file.

    Returns:
        The decoded ``{rep: {cod, confidence, sursa, votes}}`` dict, or ``{}``
        when the file is missing, unreadable, not valid UTF-8 JSON, or does
        not contain a JSON object.
    """
    try:
        # The context manager closes the handle even when decoding fails.
        with open(path, encoding="utf-8") as fh:
            data = json.load(fh)
    except (OSError, ValueError):
        # OSError: missing or unreadable file. ValueError is the common base
        # of json.JSONDecodeError and UnicodeDecodeError, i.e. corrupt content.
        return {}

    # Callers look up and assign entries by representative; any other JSON
    # value cannot be resumed from and is treated like a corrupt file.
    return data if isinstance(data, dict) else {}
|
|
|
|
|
|
def save_partial(path, results):
    """Persist the partial results, replacing any previous file at ``path``.

    The JSON is first written to a sibling ``<path>.tmp`` file and then moved
    over ``path`` with ``os.replace``. An interrupted write therefore never
    leaves a truncated resume file behind, which would otherwise be discarded
    as corrupt on the next run and lose all progress.

    Args:
        path: destination of the partial-results JSON file.
        results: dict ``{rep: {cod, confidence, sursa, votes}}``.

    Raises:
        OSError: if the temporary file cannot be written or moved.
        TypeError, ValueError: if ``results`` is not JSON-serialisable.
    """
    tmp_path = f"{path}.tmp"
    try:
        # The context manager flushes and closes the handle before the rename.
        with open(tmp_path, "w", encoding="utf-8") as fh:
            json.dump(results, fh, ensure_ascii=False, indent=1)
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a half-written temporary file next to the real one.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
|
|
|
|
|
|
def label_groups(groups, partial, batch_size=DEFAULT_BATCH, pace=DEFAULT_PACE):
    """Label the group representatives with the model ensemble.

    Representatives already present in ``partial`` are skipped, which is what
    makes a run resumable. For the remaining ones, every model in ``MODELS``
    is queried batch by batch through ``oc.call``; once all models have
    voted, ``ensemble_vote`` turns the votes into a verdict that is stored in
    ``partial``. Note that ``partial`` is only updated after the whole voting
    phase, so an interruption during voting leaves it unchanged.

    Args:
        groups: list of ``{rep, freq, members}`` dicts, as produced by
            ``group_by_similarity()``.
        partial: dict ``{rep: label}`` holding earlier results; modified in
            place.
        batch_size: number of denominations sent per LLM call (>= 1).
        pace: seconds to sleep between batches and between models; ``0``
            disables the pauses (useful in tests).

    Returns:
        The same ``partial`` dict, with one
        ``{cod, confidence, sursa, votes}`` entry added per newly labelled
        representative.

    Raises:
        ValueError: if there is work to do and ``batch_size`` is below 1.
    """
    todo = [g["rep"] for g in groups if g["rep"] not in partial]
    if not todo:
        print("toti reprezentantii sunt deja in partial, nimic de facut", flush=True)
        return partial

    # A zero step would crash and a negative one would make no call at all
    # while still recording every representative as a disagreement.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    print(f"de etichetat: {len(todo)} reprezentanti "
          f"(skip {len(groups) - len(todo)} din partial)", flush=True)

    # Votes are collected per model for every unresolved representative.
    votes_per_rep = {rep: {} for rep in todo}
    nb = (len(todo) + batch_size - 1) // batch_size  # number of batches (ceil)

    for mi, m in enumerate(MODELS):
        print(f" model: {m}", flush=True)
        for bi, k in enumerate(range(0, len(todo), batch_size)):
            batch = todo[k:k + batch_size]
            codes, meta = oc.call(m, batch)
            for rep, cod in zip(batch, codes):
                votes_per_rep[rep][m] = cod
            # zip() stops at the shorter input. If the model returned fewer
            # codes than denominations sent, the leftovers would have no vote
            # from this model and a single remaining vote would later pass as
            # "unanimous". Record them as explicit parse failures instead.
            for rep in batch:
                votes_per_rep[rep].setdefault(m, "?")
            print(f" batch {bi+1}/{nb} {meta['ms']}ms err={meta['err']}", flush=True)
            if bi < nb - 1 and pace > 0:
                time.sleep(pace)
        if pace > 0 and mi < len(MODELS) - 1:
            time.sleep(pace)  # pause between different models

    # Ensemble verdict per representative, written into partial.
    for rep in todo:
        cod, confidence, sursa = ensemble_vote(votes_per_rep[rep])
        partial[rep] = {
            "cod": cod,
            "confidence": confidence,
            "sursa": sursa,
            "votes": votes_per_rep[rep],
        }

    return partial
|
|
|
|
|
|
def expand_to_all(groups, partial):
    """Flatten the labelled groups into one output row per denomination.

    Each group yields its representative first, carrying the ensemble source
    stored in ``partial``, followed by its members, which inherit the
    representative's code and confidence with ``sursa="propagat"``. The code
    is copied verbatim, so ``"NUL"`` stays ``"NUL"`` and is never turned into
    a RAR code. A representative missing from ``partial`` falls back to
    ``"?"`` / ``"needs_mapping"`` / ``"ensemble-dezacord"``.

    Args:
        groups: list of ``{rep, freq, members}`` dicts.
        partial: dict ``{rep: {cod, confidence, sursa, ...}}``.

    Returns:
        A list of ``{denumire, cod, sursa, confidence, grup_rep}`` dicts.
    """
    rows = []

    for group in groups:
        leader = group["rep"]
        verdict = partial.get(leader, {})
        code = verdict.get("cod", "?")
        level = verdict.get("confidence", "needs_mapping")

        # (denomination, source) pairs: the representative, then its members.
        entries = [(leader, verdict.get("sursa", "ensemble-dezacord"))]
        entries += [(member, "propagat") for member, _freq in group["members"]]

        rows.extend(
            {
                "denumire": name,
                "cod": code,
                "sursa": source,
                "confidence": level,
                "grup_rep": leader,
            }
            for name, source in entries
        )

    return rows
|
|
|
|
|
|
def run(n=DEFAULT_N, output_path=FINAL_PATH, partial_path=PARTIAL_PATH,
        threshold=DEFAULT_THRESHOLD, batch_size=DEFAULT_BATCH, pace=DEFAULT_PACE):
    """Main entry point: read the corpus, group, label and save.

    Resumable: representatives already stored in ``partial_path`` are not
    sent to the models again. The partial file is rewritten once, after
    ``label_groups`` has finished, and the expanded per-denomination labels
    are then written to ``output_path``.

    Args:
        n: how many of the most frequent denominations to process.
        output_path: JSON file receiving all labels (final output).
        partial_path: resumable JSON file (state per representative).
        threshold: similarity radius for grouping (0-100).
        batch_size: denominations per LLM call.
        pace: seconds between batches.

    Returns:
        The list of result rows, identical to what is written to
        ``output_path``.
    """
    corpus = oc.corpus_by_freq()
    top = corpus[:n]
    # "or 1" keeps the percentage below well-defined for an empty corpus.
    vol_total = sum(nr for _, nr in corpus) or 1
    vol_top = sum(nr for _, nr in top)
    print(f"corpus: {len(corpus)} denumiri distincte, volum total {vol_total}")
    print(f"top {n} dupa frecventa: volum {vol_top} ({100*vol_top/vol_total:.1f}%)")

    groups = group_by_similarity(top, threshold)
    n_reps = len(groups)
    n_mems = sum(len(g["members"]) for g in groups)
    print(f"dupa grupare: {n_reps} reprezentanti, {n_mems} membri propagati din {n}")

    partial = load_partial(partial_path)
    print(f"partial incarcat: {len(partial)} reprezentanti deja etichetati")

    partial = label_groups(groups, partial, batch_size, pace)
    save_partial(partial_path, partial)
    print(f"partial salvat: {partial_path}")

    results = expand_to_all(groups, partial)

    # Context manager: the file is flushed and closed before it is reported
    # as saved, instead of relying on garbage collection of the handle.
    with open(output_path, "w", encoding="utf-8") as fh:
        json.dump(results, fh, ensure_ascii=False, indent=1)

    # Summary report
    nul_cnt = sum(1 for r in results if r["cod"] == "NUL")
    high_cnt = sum(1 for r in results if r["confidence"] == "high")
    needs_cnt = sum(1 for r in results if r["confidence"] == "needs_mapping")
    prop_cnt = sum(1 for r in results if r["sursa"] == "propagat")
    print(f"\nREZULTAT: {len(results)} denumiri in output")
    print(f" NUL (gunoi, ancore negative): {nul_cnt}")
    print(f" confidence high (unanim): {high_cnt}")
    print(f" needs_mapping (dezacord): {needs_cnt}")
    print(f" propagate din grup: {prop_cnt}")
    print(f"salvat: {output_path}")
    return results
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the options and delegate to run().
    import argparse

    parser = argparse.ArgumentParser(
        description="Etichetator batch offline OpenRouter (L14-S1)")
    parser.add_argument(
        "n", nargs="?", type=int, default=DEFAULT_N,
        help=f"top N denumiri dupa frecventa (default {DEFAULT_N})")
    parser.add_argument(
        "--out", default=FINAL_PATH, metavar="PATH",
        help="fisier output final JSON (default: or-labels-final.json)")
    parser.add_argument(
        "--partial", default=PARTIAL_PATH, metavar="PATH",
        help="fisier partial resumabil JSON (default: or-labels-partial.json)")
    parser.add_argument(
        "--threshold", type=int, default=DEFAULT_THRESHOLD,
        help=f"raza similaritate grupare 0-100 (default {DEFAULT_THRESHOLD})")
    parser.add_argument(
        "--batch", type=int, default=DEFAULT_BATCH,
        help=f"denumiri per apel LLM (default {DEFAULT_BATCH})")
    parser.add_argument(
        "--pace", type=float, default=DEFAULT_PACE,
        help=f"sec intre batch-uri (default {DEFAULT_PACE})")
    args = parser.parse_args()

    run(
        n=args.n,
        output_path=args.out,
        partial_path=args.partial,
        threshold=args.threshold,
        batch_size=args.batch,
        pace=args.pace,
    )
|