"""Teste TDD pentru tools/mapare-llm/heldout_eval.py (L14-S5). Fixture sintetic cu predictii+ground_truth cunoscute. Verifica: - precizie globala - precizie per-cod (TP/FP/FN per eticheta) - rata cod-gresit (critic: cod gresit = FINALIZATA ireversibil) - esantionare stratificata determinista (acelasi seed = aceleasi rezultate) - kill-criterion (pass/fail pe praguri definite) Rulare: python3 -m pytest tests/test_heldout_eval.py -v """ from __future__ import annotations import os import sys import csv # Adaugam tools/mapare-llm/ la sys.path (pattern din test_holdout.py) HERE = os.path.dirname(os.path.abspath(__file__)) TOOLS_DIR = os.path.abspath(os.path.join(HERE, "..", "tools", "mapare-llm")) if TOOLS_DIR not in sys.path: sys.path.insert(0, TOOLS_DIR) import pytest import heldout_eval as he # --------------------------------------------------------------------------- # Fixture sintetic pentru eval_predictions # --------------------------------------------------------------------------- # 6 intrari; 3 corecte, 1 cod-gresit (critic), 1 NUL fals-negativ, 1 nerezolvat PREDS = [ {"denumire": "REVIZIE PERIODICA", "cod_pred": "OE-3"}, # corect {"denumire": "SCHIMB ULEI MOTOR", "cod_pred": "OE-1"}, # GRESIT: gold=OE-3 (cod gresit!) {"denumire": "DISCOUNT 10%", "cod_pred": "NUL"}, # corect {"denumire": "VOPSIRE BARA FATA", "cod_pred": "OE-1"}, # corect {"denumire": "DIAGNOSTICARE OBD", "cod_pred": "?"}, # nerezolvat {"denumire": "D/R BARA FATA", "cod_pred": "OE-2"}, # GRESIT: gold=OE-1 (cod gresit!) ] GOLD = [ {"denumire": "REVIZIE PERIODICA", "cod_gold": "OE-3"}, {"denumire": "SCHIMB ULEI MOTOR", "cod_gold": "OE-3"}, # pred=OE-1, gold=OE-3 -> COD GRESIT {"denumire": "DISCOUNT 10%", "cod_gold": "NUL"}, {"denumire": "VOPSIRE BARA FATA", "cod_gold": "OE-1"}, {"denumire": "DIAGNOSTICARE OBD", "cod_gold": "OE-4"}, {"denumire": "D/R BARA FATA", "cod_gold": "OE-1"}, # pred=OE-2, gold=OE-1 -> COD GRESIT ] # total=6, correct=3 (REVIZIE, DISCOUNT, VOPSIRE) # wrong_code=2 (SCHIMB ULEI: OE-1 vs OE-3; D/R BARA: OE-2 vs OE-1) # coverage_count=5 (pred!="?"), coverage_rate=5/6 # global_precision=3/6=0.50 # wrong_code_rate=2/6 # --------------------------------------------------------------------------- # Sectiunea 1: eval_predictions — precizie globala # --------------------------------------------------------------------------- class TestEvalPrecizie: """Verifica metricile globale returnate de eval_predictions.""" def test_total_items(self): """total = numarul de intrari din ground_truth.""" m = he.eval_predictions(PREDS, GOLD) assert m["total"] == 6 def test_correct_count(self): """3 predictii corecte din 6.""" m = he.eval_predictions(PREDS, GOLD) assert m["correct"] == 3 def test_global_precision(self): """global_precision = correct / total = 3/6 = 0.50.""" m = he.eval_predictions(PREDS, GOLD) assert abs(m["global_precision"] - 0.50) < 1e-9 def test_campuri_obligatorii(self): """Rezultatul contine toate campurile definite.""" m = he.eval_predictions(PREDS, GOLD) obligatorii = [ "total", "correct", "global_precision", "wrong_code_count", "wrong_code_rate", "coverage_count", "coverage_rate", "per_cod", "confusion_matrix", ] for camp in obligatorii: assert camp in m, f"Camp lipsa: {camp}" def test_empty_inputs(self): """Input gol -> metrics cu valori zero, fara exceptie.""" m = he.eval_predictions([], []) assert m["total"] == 0 assert m["global_precision"] == 0.0 assert m["wrong_code_rate"] == 0.0 def test_all_correct(self): """Toate corecte -> precision 1.0, wrong_code_rate 0.0.""" preds = [ {"denumire": "REVIZIE", "cod_pred": "OE-3"}, {"denumire": "ITP", "cod_pred": "NUL"}, ] gold = [ {"denumire": "REVIZIE", "cod_gold": "OE-3"}, {"denumire": "ITP", "cod_gold": "NUL"}, ] m = he.eval_predictions(preds, gold) assert m["global_precision"] == 1.0 assert m["wrong_code_rate"] == 0.0 def test_predictie_lipsa_tratata_ca_nerezolvata(self): """Daca o denumire din gold nu e in predictions -> pred='?' (nerezolvat).""" preds = [ {"denumire": "REVIZIE", "cod_pred": "OE-3"}, # SCHIMB ULEI lipseste din predictii ] gold = [ {"denumire": "REVIZIE", "cod_gold": "OE-3"}, {"denumire": "SCHIMB ULEI", "cod_gold": "OE-3"}, ] m = he.eval_predictions(preds, gold) assert m["total"] == 2 assert m["correct"] == 1 # doar REVIZIE assert m["coverage_count"] == 1 # SCHIMB ULEI e "?" # --------------------------------------------------------------------------- # Sectiunea 2: eval_predictions — rata cod-gresit (CRITIC) # --------------------------------------------------------------------------- class TestWrongCodeRate: """ 'Cod gresit' = pred in VALID_RAR, gold in VALID_RAR, pred != gold. Aceasta situatie ar produce FINALIZATA ireversibil cu cod eronat. """ def test_wrong_code_count(self): """2 cod-gresit din 6 intrari.""" m = he.eval_predictions(PREDS, GOLD) assert m["wrong_code_count"] == 2 def test_wrong_code_rate(self): """wrong_code_rate = 2/6.""" m = he.eval_predictions(PREDS, GOLD) assert abs(m["wrong_code_rate"] - 2 / 6) < 1e-9 def test_nul_gresit_nu_e_cod_gresit(self): """pred=NUL si gold=OE-3 NU e 'cod gresit' (item merge la needs_mapping, nu la FINALIZATA).""" preds = [{"denumire": "REVIZIE", "cod_pred": "NUL"}] gold = [{"denumire": "REVIZIE", "cod_gold": "OE-3"}] m = he.eval_predictions(preds, gold) # pred=NUL nu genereaza FINALIZATA -> wrong_code_count=0 assert m["wrong_code_count"] == 0 def test_zero_wrong_code_pe_fixture_corect(self): """Pe fixture 'all correct', wrong_code_count = 0.""" preds = [{"denumire": "X", "cod_pred": "OE-1"}] gold = [{"denumire": "X", "cod_gold": "OE-1"}] m = he.eval_predictions(preds, gold) assert m["wrong_code_count"] == 0 assert m["wrong_code_rate"] == 0.0 # --------------------------------------------------------------------------- # Sectiunea 3: eval_predictions — acoperire (coverage) # --------------------------------------------------------------------------- class TestCoverage: """coverage = fractia de intrari cu pred != '?' (are un raspuns, fie cod fie NUL).""" def test_coverage_count(self): """5 din 6 au pred != '?'.""" m = he.eval_predictions(PREDS, GOLD) assert m["coverage_count"] == 5 def test_coverage_rate(self): """coverage_rate = 5/6.""" m = he.eval_predictions(PREDS, GOLD) assert abs(m["coverage_rate"] - 5 / 6) < 1e-9 def test_coverage_zero_pe_toate_nerezolvate(self): """Daca toate pred='?' -> coverage=0.""" preds = [{"denumire": "X", "cod_pred": "?"}] gold = [{"denumire": "X", "cod_gold": "OE-3"}] m = he.eval_predictions(preds, gold) assert m["coverage_count"] == 0 assert m["coverage_rate"] == 0.0 # --------------------------------------------------------------------------- # Sectiunea 4: eval_predictions — per_cod (TP/FP/FN + precision/recall) # --------------------------------------------------------------------------- class TestPerCod: """Verifica metricile per eticheta (precizie + recall per cod).""" def test_per_cod_returnat(self): """per_cod e un dict cu chei = etichete prezente.""" m = he.eval_predictions(PREDS, GOLD) assert isinstance(m["per_cod"], dict) assert len(m["per_cod"]) > 0 def test_per_cod_campuri(self): """Fiecare cod are tp, fp, fn, precision, recall.""" m = he.eval_predictions(PREDS, GOLD) for cod, stats in m["per_cod"].items(): assert "tp" in stats, f"tp lipsa pentru {cod}" assert "fp" in stats, f"fp lipsa pentru {cod}" assert "fn" in stats, f"fn lipsa pentru {cod}" assert "precision" in stats, f"precision lipsa pentru {cod}" assert "recall" in stats, f"recall lipsa pentru {cod}" def test_per_cod_oe1_precision(self): """OE-1: pred pt [VOPSIRE(corect), D/R BARA(gresit, gold=OE-1 dar pred=OE-2)]. Wait - pred=OE-1 pt VOPSIRE(gold=OE-1 corect) si SCHIMB ULEI(gold=OE-3 gresit). TP=1(VOPSIRE), FP=1(SCHIMB ULEI pred=OE-1 dar gold=OE-3), FN=1(D/R BARA pred=OE-2 nu OE-1). precision_OE1 = 1/(1+1) = 0.50 recall_OE1 = 1/(1+1) = 0.50 """ m = he.eval_predictions(PREDS, GOLD) oe1 = m["per_cod"].get("OE-1", {}) # TP: VOPSIRE BARA FATA (pred=OE-1, gold=OE-1) # FP: SCHIMB ULEI MOTOR (pred=OE-1, gold=OE-3) # FN: D/R BARA FATA (gold=OE-1, pred=OE-2) assert oe1.get("tp") == 1 assert oe1.get("fp") == 1 assert oe1.get("fn") == 1 assert abs(oe1.get("precision") - 0.50) < 1e-9 assert abs(oe1.get("recall") - 0.50) < 1e-9 def test_per_cod_oe3_precision(self): """OE-3: pred pt [REVIZIE(corect)]. gold=OE-3 pt [REVIZIE, SCHIMB ULEI]. TP=1(REVIZIE), FP=0, FN=1(SCHIMB ULEI pred=OE-1). precision=1.0, recall=0.50 """ m = he.eval_predictions(PREDS, GOLD) oe3 = m["per_cod"].get("OE-3", {}) assert oe3.get("tp") == 1 assert oe3.get("fp") == 0 assert oe3.get("fn") == 1 assert abs(oe3.get("precision") - 1.0) < 1e-9 assert abs(oe3.get("recall") - 0.50) < 1e-9 def test_per_cod_precision_none_pe_necunoscut(self): """Daca un cod e doar in gold (niciodata prezis) -> precision=None sau 0.""" # OE-4 e gold pt DIAGNOSTICARE, dar pred='?' -> FN=1, TP=0, FP=0 m = he.eval_predictions(PREDS, GOLD) oe4 = m["per_cod"].get("OE-4", {}) # Precision nedefinita (0/0): None sau 0.0 ambele OK assert oe4.get("tp") == 0 assert oe4.get("fp") == 0 assert oe4.get("fn") == 1 assert oe4.get("precision") is None or oe4.get("precision") == 0.0 # --------------------------------------------------------------------------- # Sectiunea 5: eval_predictions — matrice confuzie # --------------------------------------------------------------------------- class TestConfusionMatrix: """Matricea confuzie indexata ca 'gold->pred'.""" def test_confusion_matrix_returnat(self): """confusion_matrix e un dict.""" m = he.eval_predictions(PREDS, GOLD) assert isinstance(m["confusion_matrix"], dict) def test_confusion_matrix_cod_gresit_prezent(self): """Cazul 'gold=OE-3, pred=OE-1' (SCHIMB ULEI) -> cheie 'OE-3->OE-1' cu count 1.""" m = he.eval_predictions(PREDS, GOLD) assert m["confusion_matrix"].get("OE-3->OE-1") == 1 def test_confusion_matrix_corect(self): """Cazul corect 'gold=OE-3, pred=OE-3' (REVIZIE) -> cheie 'OE-3->OE-3' cu count 1.""" m = he.eval_predictions(PREDS, GOLD) assert m["confusion_matrix"].get("OE-3->OE-3") == 1 # --------------------------------------------------------------------------- # Sectiunea 6: sample_stratified — esantionare stratificata determinista # --------------------------------------------------------------------------- # Fixture: 20 iteme cu frecvente Zipf-like (suficient pt 3 strate) SAMPLE_ROWS = [(f"op_{i:02d}", max(1, 2000 - i * 100)) for i in range(20)] # Sortat descrescator: op_00=2000, op_01=1900, ..., op_19=100 # n=20, head_end = max(1, round(20*0.20)) = 4 # mid_end = max(5, round(20*0.50)) = 10 # cap = [op_00..op_03] (4 items) # mijloc = [op_04..op_09] (6 items) # coada = [op_10..op_19] (10 items) class TestSampleStratified: """Verifica proprietatile esantionarii stratificate.""" def test_determinist_acelasi_seed(self): """Acelasi seed -> acelasi rezultat (determinist).""" r1 = he.sample_stratified(SAMPLE_ROWS, n_sample=9, seed=42) r2 = he.sample_stratified(SAMPLE_ROWS, n_sample=9, seed=42) assert r1 == r2 def test_seed_diferit_rezultat_diferit(self): """Seed diferit -> (de obicei) rezultat diferit.""" r1 = he.sample_stratified(SAMPLE_ROWS, n_sample=9, seed=42) r2 = he.sample_stratified(SAMPLE_ROWS, n_sample=9, seed=999) # Nu garanteaza 100% diferenta, dar pe 20 items e practic garantat assert r1 != r2 def test_items_din_input(self): """Toate itemele returnate exista in inputul original.""" result = he.sample_stratified(SAMPLE_ROWS, n_sample=9, seed=42) input_set = {(d, n) for d, n in SAMPLE_ROWS} for item in result: assert (item["denumire"], item["nr"]) in input_set def test_campuri_obligatorii(self): """Fiecare item are: denumire, nr, strat.""" result = he.sample_stratified(SAMPLE_ROWS, n_sample=9, seed=42) for item in result: assert "denumire" in item assert "nr" in item assert "strat" in item def test_strat_valid(self): """Valorile strat sunt exclusiv din {'cap', 'mijloc', 'coada'}.""" result = he.sample_stratified(SAMPLE_ROWS, n_sample=9, seed=42) for item in result: assert item["strat"] in ("cap", "mijloc", "coada") def test_toate_stratele_reprezentate(self): """Cand n_sample e suficient de mare, toate 3 stratele apar in rezultat.""" # n_sample=15 dintr-un total de 20 -> toate stratele au cel putin 1 item result = he.sample_stratified(SAMPLE_ROWS, n_sample=15, seed=42) strate_prezente = {item["strat"] for item in result} assert "cap" in strate_prezente assert "mijloc" in strate_prezente assert "coada" in strate_prezente def test_dimensiune_aproape_de_n_sample(self): """Dimensiunea rezultatului e aproape de n_sample (+/- 3 datorita rotunjirii).""" n_sample = 9 result = he.sample_stratified(SAMPLE_ROWS, n_sample=n_sample, seed=42) assert abs(len(result) - n_sample) <= 3 def test_fara_duplicate(self): """Niciun item nu apare de doua ori in esantion.""" result = he.sample_stratified(SAMPLE_ROWS, n_sample=15, seed=42) denumiri = [item["denumire"] for item in result] assert len(denumiri) == len(set(denumiri)) def test_input_gol(self): """Input gol -> returneaza lista goala fara exceptie.""" result = he.sample_stratified([], n_sample=10, seed=42) assert result == [] def test_n_sample_mai_mare_decat_corpus(self): """Cand n_sample > len(rows), returneaza cel mult len(rows) items.""" result = he.sample_stratified(SAMPLE_ROWS, n_sample=1000, seed=42) assert len(result) <= len(SAMPLE_ROWS) # --------------------------------------------------------------------------- # Sectiunea 7: export_for_labeling — fisier CSV pt etichetare umana # --------------------------------------------------------------------------- class TestExportForLabeling: """Exportul CSV contine denumire, nr, strat si coloana cod_gold GOALA.""" def test_fisier_creat(self, tmp_path): """Fisierul este creat la calea specificata.""" sample = he.sample_stratified(SAMPLE_ROWS, n_sample=5, seed=42) path = str(tmp_path / "esantion.csv") he.export_for_labeling(sample, path) assert os.path.exists(path) def test_header_csv(self, tmp_path): """CSV-ul are header-ul corect: denumire;nr;strat;cod_gold.""" sample = he.sample_stratified(SAMPLE_ROWS, n_sample=5, seed=42) path = str(tmp_path / "esantion.csv") he.export_for_labeling(sample, path) with open(path, encoding="utf-8-sig") as f: reader = csv.DictReader(f, delimiter=";") coloane = reader.fieldnames assert "denumire" in coloane assert "nr" in coloane assert "strat" in coloane assert "cod_gold" in coloane def test_cod_gold_gol(self, tmp_path): """Coloana cod_gold e goala (de completat de operator uman).""" sample = he.sample_stratified(SAMPLE_ROWS, n_sample=5, seed=42) path = str(tmp_path / "esantion.csv") he.export_for_labeling(sample, path) with open(path, encoding="utf-8-sig") as f: reader = csv.DictReader(f, delimiter=";") for row in reader: # Coloana cod_gold trebuie sa fie vida (nu etichetata de cod!) assert row["cod_gold"] == "", ( "cod_gold nu trebuie pre-completat: ar fi 'antrenare pe test' " "(Decision #19 PRD 5.14)" ) def test_n_randuri_egal_cu_sample(self, tmp_path): """CSV-ul are exact atatea randuri cat esantionul.""" sample = he.sample_stratified(SAMPLE_ROWS, n_sample=5, seed=42) path = str(tmp_path / "esantion.csv") he.export_for_labeling(sample, path) with open(path, encoding="utf-8-sig") as f: rows = list(csv.DictReader(f, delimiter=";")) assert len(rows) == len(sample) # --------------------------------------------------------------------------- # Sectiunea 8: kill_criterion — pragul de acceptanta (F-E, PRD 5.14) # --------------------------------------------------------------------------- class TestKillCriterion: """ Kill-criterion (F-E): sistemul TRECE daca wrong_code_rate < wrong_code_threshold (default 0.5%) SI coverage_rate > coverage_threshold (default 50%). Justificare threshold 0.5% (0.005): Un service cu 200 operatii/zi auto-rezolvate = 1 FINALIZATA gresita/zi. FINALIZATA e ireversibila (cf. PRD 5.14 Premisa 3 / invariant CLAUDE.md). Pragul poate fi RELAXAT empiric; nu INASPRIT post-hoc (sesizare-in-timp). """ def test_trece_cand_sub_prag(self): """Trece cand wrong_code_rate < threshold si coverage_rate > min_coverage.""" metrics = { "wrong_code_rate": 0.003, # 0.3% < 0.5% "coverage_rate": 0.70, # 70% > 50% } r = he.kill_criterion(metrics) assert r["passes"] is True def test_esueaza_cand_wrong_code_prea_mare(self): """Esueaza cand wrong_code_rate >= threshold.""" metrics = { "wrong_code_rate": 0.02, # 2% > 0.5% -> FAIL "coverage_rate": 0.70, } r = he.kill_criterion(metrics) assert r["passes"] is False assert "wrong_code" in r["reason"].lower() or "cod gresit" in r["reason"].lower() def test_esueaza_cand_coverage_prea_mica(self): """Esueaza cand coverage_rate < min_coverage_threshold.""" metrics = { "wrong_code_rate": 0.001, "coverage_rate": 0.30, # 30% < 50% -> FAIL } r = he.kill_criterion(metrics) assert r["passes"] is False assert "acoperire" in r["reason"].lower() or "coverage" in r["reason"].lower() def test_esueaza_pe_ambele_conditii(self): """Esueaza cand ambele conditii sunt incalcate.""" metrics = { "wrong_code_rate": 0.05, "coverage_rate": 0.10, } r = he.kill_criterion(metrics) assert r["passes"] is False def test_campuri_obligatorii_in_rezultat(self): """Rezultatul are: passes, reason, wrong_code_rate, coverage_rate, thresholds.""" metrics = {"wrong_code_rate": 0.001, "coverage_rate": 0.80} r = he.kill_criterion(metrics) for camp in ("passes", "reason", "wrong_code_rate", "coverage_rate", "thresholds"): assert camp in r, f"Camp lipsa: {camp}" def test_threshold_customizabil(self): """Pragurile pot fi suprascrise.""" metrics = {"wrong_code_rate": 0.05, "coverage_rate": 0.80} # Cu threshold mai lax, trece r = he.kill_criterion(metrics, wrong_code_threshold=0.10) assert r["passes"] is True def test_exact_pe_prag_nu_trece(self): """Pe prag exact (egalitate), nu trece (< e strict).""" threshold = he.DEFAULT_WRONG_CODE_THRESHOLD metrics = {"wrong_code_rate": threshold, "coverage_rate": 0.80} r = he.kill_criterion(metrics) # wrong_code_rate = threshold -> NU < threshold -> FAIL assert r["passes"] is False def test_reason_descrie_starea(self): """reason e un string non-gol care descrie de ce trece/esueaza.""" metrics = {"wrong_code_rate": 0.001, "coverage_rate": 0.80} r = he.kill_criterion(metrics) assert isinstance(r["reason"], str) assert len(r["reason"]) > 0 # --------------------------------------------------------------------------- # Sectiunea 9: constante si metadate modul # --------------------------------------------------------------------------- class TestModulMetadata: """Verifica existenta constantelor documentate.""" def test_valid_rar_definit(self): """VALID_RAR e un set non-gol de coduri RAR.""" assert hasattr(he, "VALID_RAR") assert isinstance(he.VALID_RAR, frozenset) assert len(he.VALID_RAR) >= 18 def test_nul_in_all_labels_nu_in_valid_rar(self): """NUL e eticheta speciala (supresie), NU e cod RAR valid.""" assert "NUL" not in he.VALID_RAR # NUL trebuie sa fie accesibil totusi assert hasattr(he, "NUL") assert he.NUL == "NUL" def test_default_seed(self): """DEFAULT_SEED exista si e intreg.""" assert hasattr(he, "DEFAULT_SEED") assert isinstance(he.DEFAULT_SEED, int) def test_default_thresholds_in_range(self): """Pragurile default sunt in (0, 1).""" assert 0 < he.DEFAULT_WRONG_CODE_THRESHOLD < 1 assert 0 < he.DEFAULT_COVERAGE_THRESHOLD < 1