"""Etichetator batch offline OpenRouter (Layer 1) — L14-S1. Clasifica denumirile de operatii service in cele 18 coduri RAR + NUL. Cerinte implementate (PRD 5.14 / Decision Audit Trail): 1. Prioritizare pe FRECVENTA (desc): corpus_by_freq() din or_common 2. Grupare pe similaritate (rapidfuzz token_sort_ratio, threshold conservator Eng-F7): LLM eticheteaza doar reprezentantul, codul se propaga la grup 3. Ensemble NVIDIA (super-120b + nano-9b, PRD #9): acord unanim -> high; dezacord (orice divergenta) -> needs_mapping. Vot pe coduri, nu pe self-confidence. ultra-550b EXCLUS (4-5x mai lent, zero castig) 4. Scrub PII (F3): integrat in or_common.call() (regex nr inmatriculare/VIN) 5. Resumabil: scrie *-partial.json incremental, reia de unde a ramas; retry/backoff pe 429 gestionat de or_common.call() 6. Output: {denumire, cod, sursa, confidence, grup_rep} NUL = ancore negativa + supresie, NU promovat la cod RAR (#4) CLI: python3 tools/mapare-llm/or_label.py [N] [--out path] [--partial path] [--threshold 85] [--batch 20] [--pace 4.0] """ import sys import os import json import time from collections import Counter sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) import or_common as oc from rapidfuzz import fuzz # Modele NVIDIA (decizie PRD #9: pastram super-120b + nano-9b; aruncam ultra-550b) MODELS = [ "nvidia/nemotron-3-super-120b-a12b:free", "nvidia/nemotron-nano-9b-v2:free", ] DEFAULT_THRESHOLD = 85 # raza conservatoare pt grupare (Eng-F7) DEFAULT_BATCH = 20 # denumiri per apel LLM (cap free tier ~50 cereri/zi) DEFAULT_N = 500 # top N dupa frecventa de procesat DEFAULT_PACE = 4.0 # sec intre batch-uri (free tier OpenRouter ~20 req/min) HERE = os.path.dirname(os.path.abspath(__file__)) PARTIAL_PATH = os.path.join(HERE, "or-labels-partial.json") FINAL_PATH = os.path.join(HERE, "or-labels-final.json") def group_by_similarity(corpus, threshold=DEFAULT_THRESHOLD): """Grupeaza denumirile pe similaritate fuzz.token_sort_ratio. corpus: lista de (denumire, freq) sortata DESCRESCATOR dupa frecventa. Elementul cu frecventa maxima = reprezentantul grupului. threshold: scor minim de similaritate (0-100). Valoare conservatoare = 85. Algoritm greedy: primul item nemapat devine reprezentant; urmatoarele iteme cu scor >= threshold fata de reprezentant intra in grupul sau. Conservator: nu grupeaza tranzitiv (doar fata de reprezentant). Intoarce: lista de dict {rep: str, freq: int, members: [(den, freq), ...]} """ assigned = set() groups = [] for i, (den_i, freq_i) in enumerate(corpus): if den_i in assigned: continue members = [] for j, (den_j, freq_j) in enumerate(corpus): if j <= i or den_j in assigned: continue if fuzz.token_sort_ratio(den_i, den_j) >= threshold: members.append((den_j, freq_j)) assigned.add(den_j) assigned.add(den_i) groups.append({"rep": den_i, "freq": freq_i, "members": members}) return groups def ensemble_vote(votes): """Calculeaza verdictul ensemble din voturile modelelor. votes: dict {model_id: cod} - "?" inseamna parse-fail (se exclude). Reguli (2 modele NVIDIA, aceeasi familie): - Toate N modele cu acelasi cod valid -> (cod, "high", "ensemble-unanim") - Toate N modele cu "NUL" -> ("NUL", "high", "ensemble-unanim-nul") - Orice divergenta / parse-fail partial -> ("?", "needs_mapping", "ensemble-dezacord") Vot pe coduri, NU pe self-confidence (PRD #10, Eng-F7). NUL tratat SEPARAT: ancore negativa, nu e cod RAR (#4). Intoarce: (cod_final, confidence, sursa) cod_final: cod RAR valid | "NUL" | "?" (needs human review) confidence: "high" | "needs_mapping" sursa: "ensemble-unanim" | "ensemble-unanim-nul" | "ensemble-dezacord" """ n_models = len(votes) valid_votes = [v for v in votes.values() if v != "?"] if not valid_votes: return "?", "needs_mapping", "ensemble-dezacord" c = Counter(valid_votes) top_cod, top_cnt = c.most_common(1)[0] if top_cnt == n_models: # Unanimitate: toate cele N modele au raspuns cu acelasi cod if top_cod == "NUL": return "NUL", "high", "ensemble-unanim-nul" if top_cod in oc.VALID: return top_cod, "high", "ensemble-unanim" # Cod returnat de LLM nu e in nomenclatorul RAR -> dezacord return "?", "needs_mapping", "ensemble-dezacord" # Dezacord (inclusiv parse-fail partial: top_cnt < n_models) return "?", "needs_mapping", "ensemble-dezacord" def load_partial(path): """Incarca rezultate partiale daca fisierul exista. Intoarce dict {rep -> {cod, confidence, sursa, votes}} sau {} daca fisierul lipseste sau e corupt. """ if os.path.exists(path): try: return json.load(open(path, encoding="utf-8")) except (json.JSONDecodeError, OSError): return {} return {} def save_partial(path, results): """Salveaza rezultate partiale incrementabil (suprascrie fisierul). results: dict {rep -> {cod, confidence, sursa, votes}} """ json.dump(results, open(path, "w", encoding="utf-8"), ensure_ascii=False, indent=1) def label_groups(groups, partial, batch_size=DEFAULT_BATCH, pace=DEFAULT_PACE): """Eticheteaza reprezentantii grupurilor cu ensemble NVIDIA. Sare reprezentantii deja in partial (resumabil). Colecteaza voturi per model in batch-uri, calculeaza ensemble, actualizeaza partial la final. groups: lista de {rep, freq, members} din group_by_similarity() partial: dict {rep -> label} - stare anterioara (modificat in-place) batch_size: denumiri per apel LLM pace: sec intre batch-uri (0 = fara pauza, util in teste) Intoarce partial actualizat. """ todo = [g["rep"] for g in groups if g["rep"] not in partial] if not todo: print("toti reprezentantii sunt deja in partial, nimic de facut", flush=True) return partial print(f"de etichetat: {len(todo)} reprezentanti " f"(skip {len(groups) - len(todo)} din partial)", flush=True) # Colectam voturile per model, pentru toti reprezentantii nerezolvati votes_per_rep = {rep: {} for rep in todo} nb = (len(todo) + batch_size - 1) // batch_size for mi, m in enumerate(MODELS): print(f" model: {m}", flush=True) for bi, k in enumerate(range(0, len(todo), batch_size)): batch = todo[k:k + batch_size] codes, meta = oc.call(m, batch) for rep, cod in zip(batch, codes): votes_per_rep[rep][m] = cod print(f" batch {bi+1}/{nb} {meta['ms']}ms err={meta['err']}", flush=True) if bi < nb - 1 and pace > 0: time.sleep(pace) if pace > 0 and mi < len(MODELS) - 1: time.sleep(pace) # pauza intre modele diferite # Ensemble + scriere in partial for rep in todo: cod, confidence, sursa = ensemble_vote(votes_per_rep[rep]) partial[rep] = { "cod": cod, "confidence": confidence, "sursa": sursa, "votes": votes_per_rep[rep], } return partial def expand_to_all(groups, partial): """Propaga etichetele reprezentantilor la membrii grupului. Reprezentantul primeste sursa din ensemble ("ensemble-*"). Membrii primesc sursa="propagat" si codul/confidence al reprezentantului. NUL este pastrat ca NUL la propagare, nu e convertit la cod RAR (#4). Intoarce: lista de dict {denumire, cod, sursa, confidence, grup_rep} """ results = [] for g in groups: rep = g["rep"] label = partial.get(rep, {}) cod = label.get("cod", "?") confidence = label.get("confidence", "needs_mapping") sursa_rep = label.get("sursa", "ensemble-dezacord") # Reprezentantul results.append({ "denumire": rep, "cod": cod, "sursa": sursa_rep, "confidence": confidence, "grup_rep": rep, }) # Membrii grupului: propaga codul reprezentantului for (mem, _freq) in g["members"]: results.append({ "denumire": mem, "cod": cod, "sursa": "propagat", "confidence": confidence, "grup_rep": rep, }) return results def run(n=DEFAULT_N, output_path=FINAL_PATH, partial_path=PARTIAL_PATH, threshold=DEFAULT_THRESHOLD, batch_size=DEFAULT_BATCH, pace=DEFAULT_PACE): """Punctul principal: citeste corpus, grupeaza, eticheteaza, salveaza. Resumabil: daca partial_path exista, sare reprezentantii deja etichetati. n: top N denumiri dupa frecventa de procesat output_path: fisier JSON cu toate etichetele (final) partial_path: fisier JSON resumabil (stare intermediara per reprezentant) threshold: raza similaritate pt grupare (0-100, default 85 = conservator) batch_size: denumiri per apel LLM pace: sec intre batch-uri Intoarce: lista de rezultate (identica cu fisierul output_path). """ corpus = oc.corpus_by_freq() top = corpus[:n] vol_total = sum(nr for _, nr in corpus) or 1 vol_top = sum(nr for _, nr in top) print(f"corpus: {len(corpus)} denumiri distincte, volum total {vol_total}") print(f"top {n} dupa frecventa: volum {vol_top} ({100*vol_top/vol_total:.1f}%)") groups = group_by_similarity(top, threshold) n_reps = len(groups) n_mems = sum(len(g["members"]) for g in groups) print(f"dupa grupare: {n_reps} reprezentanti, {n_mems} membri propagati din {n}") partial = load_partial(partial_path) print(f"partial incarcat: {len(partial)} reprezentanti deja etichetati") partial = label_groups(groups, partial, batch_size, pace) save_partial(partial_path, partial) print(f"partial salvat: {partial_path}") results = expand_to_all(groups, partial) json.dump(results, open(output_path, "w", encoding="utf-8"), ensure_ascii=False, indent=1) # Raport sumar nul_cnt = sum(1 for r in results if r["cod"] == "NUL") high_cnt = sum(1 for r in results if r["confidence"] == "high") needs_cnt = sum(1 for r in results if r["confidence"] == "needs_mapping") prop_cnt = sum(1 for r in results if r["sursa"] == "propagat") print(f"\nREZULTAT: {len(results)} denumiri in output") print(f" NUL (gunoi, ancore negative): {nul_cnt}") print(f" confidence high (unanim): {high_cnt}") print(f" needs_mapping (dezacord): {needs_cnt}") print(f" propagate din grup: {prop_cnt}") print(f"salvat: {output_path}") return results if __name__ == "__main__": import argparse p = argparse.ArgumentParser(description="Etichetator batch offline OpenRouter (L14-S1)") p.add_argument("n", nargs="?", type=int, default=DEFAULT_N, help=f"top N denumiri dupa frecventa (default {DEFAULT_N})") p.add_argument("--out", default=FINAL_PATH, metavar="PATH", help="fisier output final JSON (default: or-labels-final.json)") p.add_argument("--partial", default=PARTIAL_PATH, metavar="PATH", help="fisier partial resumabil JSON (default: or-labels-partial.json)") p.add_argument("--threshold", type=int, default=DEFAULT_THRESHOLD, help=f"raza similaritate grupare 0-100 (default {DEFAULT_THRESHOLD})") p.add_argument("--batch", type=int, default=DEFAULT_BATCH, help=f"denumiri per apel LLM (default {DEFAULT_BATCH})") p.add_argument("--pace", type=float, default=DEFAULT_PACE, help=f"sec intre batch-uri (default {DEFAULT_PACE})") a = p.parse_args() run(a.n, a.out, a.partial, a.threshold, a.batch, a.pace)