"""Harness de evaluare held-out pentru sistemul de mapare operatii->coduri RAR. Scop (L14-S5, Decision #19 PRD 5.14): Masurarea ACURATETEI REALE a clasificatorului inainte de a permite orice tier auto-send peste GOLD propriu. Rationale: Masuratorile existente (100% acord vs Groq, 87% unanim NVIDIA) sunt masuri de ACORD (cross-model), nu de ACURATETE vs ground-truth. Same-family NVIDIA = eroare corelata: daca ambele modele gresesc la fel, acordul e 100% dar acuratete = 0. Un set etichetat de OM (esantion aleator stratificat) e singurul mod de a masura acuratete reala. Continut: 1. sample_stratified() — esantionare stratificata aleatorie (cap/mijloc/coada Zipf), determinista cu seed. FARA apel LLM. 2. export_for_labeling() — export CSV gol pt etichetare umana (ground-truth). Coloana cod_gold RAMANE GOALA: etichetarea umana e exclusiv responsabilitatea operatorului. 3. eval_predictions() — date (predictii, gold) -> precizie globala + per-cod + matrice confuzie + rata cod-gresit. 4. kill_criterion() — evalueaza daca sistemul indeplineste pragul de acceptanta (F-E, PRD 5.14). Ce NU face: NU eticheteaza ground-truth-ul. Etichetarea de cod ar fi exact "antrenare pe test" si ar invalida precizia raportata (Decision #19). Fisierul exportat se completeaza MANUAL de operatorul uman. CLI: python3 tools/mapare-llm/heldout_eval.py --n 250 --out esantion-heldout.csv Genereaza esantionul de 250 denumiri pt etichetare umana. python3 tools/mapare-llm/heldout_eval.py --eval predictii.csv gold.csv Evalueaza predictii vs ground-truth (ambele CSV cu camp 'denumire'). """ from __future__ import annotations import csv import os import random import sys _HERE = os.path.dirname(os.path.abspath(__file__)) _ROOT = os.path.abspath(os.path.join(_HERE, '..', '..')) if _ROOT not in sys.path: sys.path.insert(0, _ROOT) # --------------------------------------------------------------------------- # Constante # --------------------------------------------------------------------------- # Coduri RAR valide (din or_common.py / nomenclator, 18 coduri + NUL) # NUL = supresie (non-operatie); NU este cod RAR valid transmis la RAR. VALID_RAR: frozenset[str] = frozenset([ "OE-1", "OE-2", "OE-3", "OE-4", "OE-5", "OE-6", "OE-7", "OE-8", "OE-D", "OE-F", "OE-C", "OE-S", "OE-R", "OE-A", "OE-I", "AITLV", "R-ODO", "I-ODO", ]) NUL = "NUL" # eticheta speciala: supresie (nu e cod RAR) ALL_LABELS = VALID_RAR | {NUL} # toate etichetele valide ale clasificatorului UNRESOLVED = "?" # clasificatorul nu a dat raspuns -> needs_mapping # Seed implicit pentru reproductibilitate esantionare DEFAULT_SEED = 42 # Strate Zipf (proportii din numarul total de denumiri DISTINCTE): # cap = top 20% dupa frecventa (cateva denumiri, volum ridicat) # mijloc = urmatoarele 30% # coada = restul 50% (multe denumiri, volum scazut individual) _STRAT_HEAD_END_PCT = 0.20 _STRAT_MID_END_PCT = 0.50 # head+mid = 50%, deci mid = 30% # Kill-criterion (F-E, PRD 5.14): # # DEFAULT_WRONG_CODE_THRESHOLD = 0.005 (0.5%) # Justificare: un cod gresit = FINALIZATA ireversibila la RAR (Premisa 3). # La 200 operatii/zi auto-rezolvate cu 0.5% rata gresita = 1 FINALIZATA # gresita/zi, ceea ce depaseste toleranta operationala acceptabila. # Pragul poate fi RELAXAT empiric pe baza de date reale; NU inasprit post-hoc. # Recomandat: strangeti cel putin 200 esantioane inainte de a calibra. # # DEFAULT_COVERAGE_THRESHOLD = 0.50 (50%) # Justificare: sub 50% acoperire, sistemul nu aduce economie reala vs # needs_mapping uman (ar trebui sa lasi totul pe operatorul uman). DEFAULT_WRONG_CODE_THRESHOLD = 0.005 DEFAULT_COVERAGE_THRESHOLD = 0.50 # --------------------------------------------------------------------------- # Esantionare stratificata (FARA LLM) # --------------------------------------------------------------------------- def sample_stratified( rows: list[tuple[str, int]], n_sample: int = 250, seed: int = DEFAULT_SEED, ) -> list[dict]: """Esantionare stratificata aleatorie pe trei strate Zipf: cap/mijloc/coada. Determinista cu seed; NU apeleaza LLM (PRD L14-S5). rows: lista de (denumire, nr) — frecventele absolute. Nu trebuie sortata in prealabil. n_sample: marimea totala a esantionului (aproximativa, +/-3 datorita rotunjirii). Default 250 = practic pt etichetare umana in 2-3 ore. seed: seed pentru random.Random — acelasi seed produce acelasi esantion. Returneaza: list de dict: {denumire: str, nr: int, strat: str} strat in {"cap", "mijloc", "coada"} Stratificare (pe count, nu pe volum): cap = top 20% din denumirile distincte (cele cu frecventa mare) mijloc = urmatoarele 30% coada = restul 50% Alocare per strat: proportionala cu marimea stratului (egal per denumire), cu minim 1 per strat non-gol. """ if not rows: return [] # Sorteaza descrescator dupa frecventa (ca sa definim stratele corect) sorted_rows = sorted(rows, key=lambda x: -x[1]) n = len(sorted_rows) # Limite strate (pe indici) head_end = max(1, round(n * _STRAT_HEAD_END_PCT)) mid_end = max(head_end + 1, round(n * _STRAT_MID_END_PCT)) mid_end = min(mid_end, n) strata: dict[str, list[tuple[str, int]]] = { "cap": sorted_rows[:head_end], "mijloc": sorted_rows[head_end:mid_end], "coada": sorted_rows[mid_end:], } # Alocare proportionala cu marimea stratului names = ["cap", "mijloc", "coada"] sizes = {name: len(strata[name]) for name in names} total_size = sum(sizes.values()) # == n rng = random.Random(seed) # Calculeaza alocarea cu regula: max(1, round(n_sample * frac)) per strat ne-gol alloc: dict[str, int] = {} for name in names[:-1]: if sizes[name] == 0: alloc[name] = 0 else: a = max(1, round(n_sample * sizes[name] / total_size)) a = min(a, sizes[name]) # nu mai mult decat avem alloc[name] = a # Ultima strata primeste restul (pentru a ne apropia de n_sample) used = sum(alloc.get(name, 0) for name in names[:-1]) remaining = max(0, n_sample - used) alloc["coada"] = min(remaining, sizes["coada"]) if alloc["coada"] == 0 and sizes["coada"] > 0: alloc["coada"] = 1 # garantam minim 1 din coada daca exista # Esantionare per strat result: list[dict] = [] for name in names: items = strata[name] k = alloc.get(name, 0) if k > 0 and items: sampled = rng.sample(items, k) for (den, nr) in sampled: result.append({"denumire": den, "nr": nr, "strat": name}) return result # --------------------------------------------------------------------------- # Export CSV pentru etichetare umana # --------------------------------------------------------------------------- def export_for_labeling(sample: list[dict], path: str) -> None: """Exporta esantionul ca CSV pentru etichetare UMANA (ground-truth). Coloana `cod_gold` ramane GOALA in fisierul exportat. NU o completa cu etichete LLM sau automate: ar fi "antrenare pe test" si ar invalida precizia raportata (Decision #19, PRD 5.14). sample: lista de {denumire, nr, strat} returnata de sample_stratified() path: fisierul CSV de scris (suprascrie daca exista) Format CSV: UTF-8-BOM, separator ';', coloane: denumire;nr;strat;cod_gold """ with open(path, 'w', newline='', encoding='utf-8-sig') as f: writer = csv.writer(f, delimiter=';') writer.writerow(["denumire", "nr", "strat", "cod_gold"]) for item in sample: writer.writerow([ item["denumire"], item["nr"], item["strat"], "", # cod_gold GOLA — de completat de operator uman ]) # --------------------------------------------------------------------------- # Evaluare predictii vs ground-truth # --------------------------------------------------------------------------- def eval_predictions( predictions: list[dict], ground_truth: list[dict], ) -> dict: """Evalueaza predictiile clasificatorului fata de ground-truth uman. Matching pe 'denumire'. Denumirile din ground_truth fara predictie corespunzatoare sunt tratate ca UNRESOLVED (pred='?'). predictions: list de {denumire: str, cod_pred: str} cod_pred: cod RAR ("OE-1"…) | "NUL" | "?" (nerezolvat) ground_truth: list de {denumire: str, cod_gold: str} cod_gold: cod RAR | "NUL" (completat de operator uman) Returneaza dict cu: total — numarul total de intrari din ground_truth correct — predictii corecte (pred == gold) global_precision — correct / total wrong_code_count — cazuri cod-gresit (critic: FINALIZATA ireversibila) def: pred in VALID_RAR AND gold in VALID_RAR AND pred != gold wrong_code_rate — wrong_code_count / total coverage_count — predictii cu cod_pred != '?' (clasificatorul a raspuns) coverage_rate — coverage_count / total per_cod — dict {cod -> {tp, fp, fn, precision, recall}} confusion_matrix — dict {"gold->pred" -> count} Nota 'cod gresit' vs 'NUL gresit': pred=NUL si gold=OE-X -> item merge la needs_mapping, nu la FINALIZATA. Rau (operatie pierduta), dar REPARABIL. pred=OE-X si gold=NUL -> trimitem non-operatia la RAR cu un cod. Rau (inselatoare), dar RAR nu o accepta ca operatie. pred=OE-X si gold=OE-Y (X!=Y) -> FINALIZATA cu cod GRESIT. IREVERSIBIL. Doar ultimul caz e 'wrong_code' (blocant pentru auto-send dincolo de GOLD). """ if not ground_truth: return { "total": 0, "correct": 0, "global_precision": 0.0, "wrong_code_count": 0, "wrong_code_rate": 0.0, "coverage_count": 0, "coverage_rate": 0.0, "per_cod": {}, "confusion_matrix": {}, } gt_map: dict[str, str] = {item["denumire"]: item["cod_gold"] for item in ground_truth} pred_map: dict[str, str] = {item["denumire"]: item["cod_pred"] for item in predictions} total = len(gt_map) correct = 0 wrong_code_count = 0 coverage_count = 0 per_cod_tp: dict[str, int] = {} per_cod_fp: dict[str, int] = {} per_cod_fn: dict[str, int] = {} confusion: dict[str, int] = {} for den, gold in gt_map.items(): pred = pred_map.get(den, UNRESOLVED) # Matrice confuzie key = f"{gold}->{pred}" confusion[key] = confusion.get(key, 0) + 1 # Coverage: classificatorul a dat un raspuns (nu '?') if pred != UNRESOLVED: coverage_count += 1 if pred == gold: # Predictie corecta correct += 1 per_cod_tp[gold] = per_cod_tp.get(gold, 0) + 1 else: # Eroare: FN pentru gold, FP pentru pred (daca nu '?') per_cod_fn[gold] = per_cod_fn.get(gold, 0) + 1 if pred != UNRESOLVED: per_cod_fp[pred] = per_cod_fp.get(pred, 0) + 1 # COD GRESIT: ambii (pred si gold) sunt coduri RAR valide (diferite) # -> ar produce FINALIZATA cu cod eronat (ireversibil) if pred in VALID_RAR and gold in VALID_RAR: wrong_code_count += 1 # Calculeaza per_cod (union a tuturor codurilor vazute) all_codes = set(per_cod_tp) | set(per_cod_fp) | set(per_cod_fn) per_cod: dict[str, dict] = {} for code in sorted(all_codes): tp = per_cod_tp.get(code, 0) fp = per_cod_fp.get(code, 0) fn = per_cod_fn.get(code, 0) precision = tp / (tp + fp) if (tp + fp) > 0 else None recall = tp / (tp + fn) if (tp + fn) > 0 else None per_cod[code] = { "tp": tp, "fp": fp, "fn": fn, "precision": precision, "recall": recall, } return { "total": total, "correct": correct, "global_precision": correct / total, "wrong_code_count": wrong_code_count, "wrong_code_rate": wrong_code_count / total, "coverage_count": coverage_count, "coverage_rate": coverage_count / total, "per_cod": per_cod, "confusion_matrix": confusion, } # --------------------------------------------------------------------------- # Kill-criterion (F-E, PRD 5.14) # --------------------------------------------------------------------------- def kill_criterion( metrics: dict, wrong_code_threshold: float = DEFAULT_WRONG_CODE_THRESHOLD, coverage_threshold: float = DEFAULT_COVERAGE_THRESHOLD, ) -> dict: """Evalueaza daca sistemul de clasificare indeplineste pragul de acceptanta (F-E). Sistemul TRECE daca: wrong_code_rate < wrong_code_threshold (implicit 0.5%) SI coverage_rate > coverage_threshold (implicit 50%) Un sistem care nu trece kill-criterion NU trebuie folosit pentru auto-send dincolo de GOLD propriu (Decision #19, #17, PRD 5.14). metrics: dict returnat de eval_predictions() sau compatibil (must have keys: wrong_code_rate, coverage_rate). wrong_code_threshold: pragul maxim admis pentru rata cod-gresit. coverage_threshold: pragul minim admis pentru acoperire. Returneaza dict cu: passes — True daca ambele conditii sunt indeplinite reason — explicatie in limba romana wrong_code_rate — valoarea actuala coverage_rate — valoarea actuala thresholds — {"wrong_code": ..., "coverage": ...} """ wcr = metrics.get("wrong_code_rate", 1.0) cvr = metrics.get("coverage_rate", 0.0) cond_wrong_code = wcr < wrong_code_threshold cond_coverage = cvr > coverage_threshold passes = cond_wrong_code and cond_coverage if passes: reason = ( f"TRECE: rata cod-gresit {wcr:.2%} < {wrong_code_threshold:.2%} " f"si acoperire {cvr:.1%} > {coverage_threshold:.1%}." ) elif not cond_wrong_code and not cond_coverage: reason = ( f"ESUEAZA: rata cod-gresit {wcr:.2%} >= {wrong_code_threshold:.2%} " f"(FINALIZATA ireversibila) SI acoperire {cvr:.1%} <= {coverage_threshold:.1%} " f"(sistem neutilizabil). Auto-send dincolo de GOLD dezactivat." ) elif not cond_wrong_code: reason = ( f"ESUEAZA: rata cod-gresit {wcr:.2%} >= {wrong_code_threshold:.2%}. " f"Un cod gresit = FINALIZATA ireversibila la RAR (Premisa 3, PRD 5.14). " f"Auto-send dincolo de GOLD dezactivat pana la recalibrat." ) else: reason = ( f"ESUEAZA: acoperire {cvr:.1%} <= {coverage_threshold:.1%}. " f"Sub pragul minim de utilitate practica. " f"Sistemul ar lasa prea multe intrari in needs_mapping vs efort uman direct." ) return { "passes": passes, "reason": reason, "wrong_code_rate": wcr, "coverage_rate": cvr, "thresholds": { "wrong_code": wrong_code_threshold, "coverage": coverage_threshold, }, } # --------------------------------------------------------------------------- # I/O corpus real (refoloseste holdout.load_csv) # --------------------------------------------------------------------------- def _load_corpus_from_csvs(data_dir: str) -> list[tuple[str, int]]: """Incarca corpus din CSV-urile docs/operatii-service/*.csv. Refoloseste logica din holdout.load_csv + agregare cross-client. """ import glob from app.mapping import normalize_for_match agg: dict[str, list] = {} for path in sorted(glob.glob(os.path.join(data_dir, "*.csv"))): try: with open(path, encoding='utf-8-sig') as f: reader = csv.DictReader(f, delimiter=';') for row in reader: denop = (row.get('DENOP') or '').strip().strip('"') nr_raw = (row.get('NR') or '').strip().strip('"') if not denop or not nr_raw: continue try: nr = int(nr_raw) except ValueError: continue if nr <= 0: continue key = normalize_for_match(denop) if key not in agg: agg[key] = [denop, 0] agg[key][1] += nr except OSError: continue return [(v[0], v[1]) for v in agg.values()] # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- def _print_report(metrics: dict) -> None: sep = "=" * 70 print(sep) print("RAPORT EVALUARE HELD-OUT (L14-S5, PRD 5.14)") print(sep) print(f" Total intrari evaluate: {metrics['total']}") print(f" Corecte: {metrics['correct']}") print(f" Precizie globala: {metrics['global_precision']:.2%}") print(f" Acoperire (pred != '?'): {metrics['coverage_rate']:.2%}") print(f" Rata cod-gresit: {metrics['wrong_code_rate']:.2%} " f"({metrics['wrong_code_count']} cazuri)") print() print("KILL-CRITERION (F-E):") kc = kill_criterion(metrics) print(f" {kc['reason']}") print() if metrics['per_cod']: print("PRECIZIE PER COD (TP/FP/FN/prec/recall):") for cod, s in sorted(metrics['per_cod'].items()): prec = f"{s['precision']:.0%}" if s['precision'] is not None else "N/A" rec = f"{s['recall']:.0%}" if s['recall'] is not None else "N/A" print(f" {cod:<10} TP={s['tp']:3d} FP={s['fp']:3d} FN={s['fn']:3d}" f" prec={prec:>5} recall={rec:>5}") print() if metrics['confusion_matrix']: print("MATRICE CONFUZIE (gold->pred, >0):") for key, cnt in sorted(metrics['confusion_matrix'].items()): if cnt > 0 and not key.endswith(f"->{key.split('->')[0]}"): # Afiseaza doar erorile (gold != pred) gold, pred_lbl = key.split("->", 1) if gold != pred_lbl: print(f" {key:<25} {cnt}") print(sep) def main() -> None: import argparse p = argparse.ArgumentParser( description="Harness eval held-out L14-S5 (PRD 5.14).", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Moduri de utilizare: Generare esantion pt etichetare umana (FARA LLM): python3 tools/mapare-llm/heldout_eval.py --n 250 --out esantion.csv Evaluare predictii vs ground-truth (dupa etichetare umana): python3 tools/mapare-llm/heldout_eval.py \\ --eval predictii.csv gold.csv Format CSV predictii: denumire;cod_pred (separator ';') Format CSV gold: denumire;cod_gold (separator ';') """, ) p.add_argument("--n", type=int, default=250, help="Marimea esantionului de etichetat (default 250)") p.add_argument("--seed", type=int, default=DEFAULT_SEED, help=f"Seed reproductibilitate (default {DEFAULT_SEED})") p.add_argument("--out", default=None, help="Fisier output CSV pt esantion (mod generare)") p.add_argument("--eval", nargs=2, metavar=("PRED_CSV", "GOLD_CSV"), help="Fisiere predictii si ground-truth (mod evaluare)") p.add_argument("--data", default=None, help="Director CSV date (default: docs/operatii-service/)") args = p.parse_args() data_dir = args.data or os.path.join(_ROOT, "docs", "operatii-service") if args.eval: # Mod evaluare pred_path, gold_path = args.eval def read_csv_map(path, cod_col): result = [] with open(path, encoding='utf-8-sig') as f: reader = csv.DictReader(f, delimiter=';') for row in reader: den = (row.get('denumire') or '').strip() cod = (row.get(cod_col) or '').strip() if den: result.append({"denumire": den, cod_col: cod}) return result preds = read_csv_map(pred_path, "cod_pred") gold = read_csv_map(gold_path, "cod_gold") metrics = eval_predictions(preds, gold) _print_report(metrics) return # Mod generare esantion print(f"Incarcare corpus din {data_dir} ...") rows = _load_corpus_from_csvs(data_dir) print(f"Corpus: {len(rows)} denumiri distincte, " f"volum total {sum(nr for _, nr in rows):,}") sample = sample_stratified(rows, n_sample=args.n, seed=args.seed) # Statistici strate from collections import Counter strat_cnt = Counter(item["strat"] for item in sample) print(f"Esantion ({len(sample)} iteme, seed={args.seed}):") for strat in ("cap", "mijloc", "coada"): print(f" {strat:<8}: {strat_cnt.get(strat, 0):4d} iteme") out_path = args.out or os.path.join(_HERE, "heldout-esantion.csv") export_for_labeling(sample, out_path) print(f"Esantion exportat: {out_path}") print() print("INSTRUCTIUNI ETICHETARE:") print(" Deschide fisierul exportat si completeaza coloana 'cod_gold'") print(" cu codul RAR corect pentru fiecare denumire.") print(" Coduri RAR valide:", ", ".join(sorted(VALID_RAR)), ", NUL") print(" NUL = denumire care NU este operatie de service (discount, ITP, etc.)") print(" '?' = incert (clasificatorul nu poate decide)") print() print(" ATENTIE: NU folosi etichete LLM drept cod_gold!") print(" Asta ar fi 'antrenare pe test' (Decision #19, PRD 5.14) si ar") print(" invalida orice masurare de acuratete.") if __name__ == "__main__": main()