# rar-autopass/tests/test_heldout_eval.py

"""Teste TDD pentru tools/mapare-llm/heldout_eval.py (L14-S5).

Fixture sintetic cu predictii+ground_truth cunoscute. Verifica:
  - precizie globala
  - precizie per-cod (TP/FP/FN per eticheta)
  - rata cod-gresit (critic: cod gresit = FINALIZATA ireversibil)
  - esantionare stratificata determinista (acelasi seed = aceleasi rezultate)
  - kill-criterion (pass/fail pe praguri definite)

Rulare: python3 -m pytest tests/test_heldout_eval.py -v
"""
from __future__ import annotations

import os
import sys
import csv

# Put tools/mapare-llm/ on sys.path so the module under test can be imported
# (same pattern as test_holdout.py).
HERE = os.path.dirname(os.path.abspath(__file__))
TOOLS_DIR = os.path.abspath(
    os.path.join(HERE, os.pardir, "tools", "mapare-llm")
)
# Insert only once, at the front, so this directory wins name lookups.
if TOOLS_DIR not in sys.path:
    sys.path.insert(0, TOOLS_DIR)

import pytest
import heldout_eval as he


# ---------------------------------------------------------------------------
# Fixture sintetic pentru eval_predictions
# ---------------------------------------------------------------------------

# Six entries keyed by "denumire": 3 correct, 2 wrong-code (critical),
# 1 unresolved (pred == "?").
_FIXTURE_ROWS = (
    # (denumire, cod_pred, cod_gold)
    ("REVIZIE PERIODICA", "OE-3", "OE-3"),  # correct
    ("SCHIMB ULEI MOTOR", "OE-1", "OE-3"),  # WRONG CODE: pred=OE-1, gold=OE-3
    ("DISCOUNT 10%", "NUL", "NUL"),         # correct
    ("VOPSIRE BARA FATA", "OE-1", "OE-1"),  # correct
    ("DIAGNOSTICARE OBD", "?", "OE-4"),     # unresolved
    ("D/R BARA FATA", "OE-2", "OE-1"),      # WRONG CODE: pred=OE-2, gold=OE-1
)

PREDS = [
    {"denumire": name, "cod_pred": pred}
    for name, pred, _gold in _FIXTURE_ROWS
]

GOLD = [
    {"denumire": name, "cod_gold": gold}
    for name, _pred, gold in _FIXTURE_ROWS
]
# Expected metrics for this fixture:
#   total=6, correct=3 (REVIZIE, DISCOUNT, VOPSIRE)
#   wrong_code=2 (SCHIMB ULEI: OE-1 vs OE-3; D/R BARA: OE-2 vs OE-1)
#   coverage_count=5 (pred != "?"), coverage_rate=5/6
#   global_precision=3/6=0.50
#   wrong_code_rate=2/6


# ---------------------------------------------------------------------------
# Sectiunea 1: eval_predictions — precizie globala
# ---------------------------------------------------------------------------

class TestEvalPrecizie:
    """Global metrics returned by ``eval_predictions``."""

    def test_total_items(self):
        """``total`` equals the number of ground-truth entries."""
        metrics = he.eval_predictions(PREDS, GOLD)
        assert metrics["total"] == 6

    def test_correct_count(self):
        """Three of the six predictions match their gold label."""
        metrics = he.eval_predictions(PREDS, GOLD)
        assert metrics["correct"] == 3

    def test_global_precision(self):
        """``global_precision`` = correct / total = 3/6 = 0.50."""
        metrics = he.eval_predictions(PREDS, GOLD)
        deviation = abs(metrics["global_precision"] - 0.50)
        assert deviation < 1e-9

    def test_campuri_obligatorii(self):
        """The result contains every expected field."""
        metrics = he.eval_predictions(PREDS, GOLD)
        obligatorii = (
            "total",
            "correct",
            "global_precision",
            "wrong_code_count",
            "wrong_code_rate",
            "coverage_count",
            "coverage_rate",
            "per_cod",
            "confusion_matrix",
        )
        for camp in obligatorii:
            assert camp in metrics, f"Camp lipsa: {camp}"

    def test_empty_inputs(self):
        """Empty input yields zero-valued metrics and raises nothing."""
        metrics = he.eval_predictions([], [])
        assert metrics["total"] == 0
        assert metrics["global_precision"] == 0.0
        assert metrics["wrong_code_rate"] == 0.0

    def test_all_correct(self):
        """All predictions correct -> precision 1.0, wrong_code_rate 0.0."""
        pairs = (("REVIZIE", "OE-3"), ("ITP", "NUL"))
        predictions = [{"denumire": name, "cod_pred": cod} for name, cod in pairs]
        ground_truth = [{"denumire": name, "cod_gold": cod} for name, cod in pairs]
        metrics = he.eval_predictions(predictions, ground_truth)
        assert metrics["global_precision"] == 1.0
        assert metrics["wrong_code_rate"] == 0.0

    def test_predictie_lipsa_tratata_ca_nerezolvata(self):
        """A gold entry with no prediction is treated as pred='?' (unresolved)."""
        # "SCHIMB ULEI" is deliberately absent from the predictions.
        predictions = [{"denumire": "REVIZIE", "cod_pred": "OE-3"}]
        ground_truth = [
            {"denumire": "REVIZIE", "cod_gold": "OE-3"},
            {"denumire": "SCHIMB ULEI", "cod_gold": "OE-3"},
        ]
        metrics = he.eval_predictions(predictions, ground_truth)
        assert metrics["total"] == 2
        # Only REVIZIE is correct.
        assert metrics["correct"] == 1
        # SCHIMB ULEI counts as "?", so it is not covered.
        assert metrics["coverage_count"] == 1


# ---------------------------------------------------------------------------
# Sectiunea 2: eval_predictions — rata cod-gresit (CRITIC)
# ---------------------------------------------------------------------------

class TestWrongCodeRate:
    """
    Wrong-code rate, the critical metric.

    'Wrong code' = pred in VALID_RAR, gold in VALID_RAR, pred != gold.
    That situation would produce an irreversible FINALIZATA carrying an
    erroneous code.
    """

    def test_wrong_code_count(self):
        """Two wrong-code entries out of six."""
        metrics = he.eval_predictions(PREDS, GOLD)
        assert metrics["wrong_code_count"] == 2

    def test_wrong_code_rate(self):
        """``wrong_code_rate`` = 2/6."""
        metrics = he.eval_predictions(PREDS, GOLD)
        deviation = abs(metrics["wrong_code_rate"] - 2 / 6)
        assert deviation < 1e-9

    def test_nul_gresit_nu_e_cod_gresit(self):
        """pred=NUL with gold=OE-3 is NOT a 'wrong code'.

        The item goes to needs_mapping rather than FINALIZATA, so it must
        not be counted.
        """
        predictions = [{"denumire": "REVIZIE", "cod_pred": "NUL"}]
        ground_truth = [{"denumire": "REVIZIE", "cod_gold": "OE-3"}]
        metrics = he.eval_predictions(predictions, ground_truth)
        # pred=NUL does not generate FINALIZATA -> wrong_code_count=0
        assert metrics["wrong_code_count"] == 0

    def test_zero_wrong_code_pe_fixture_corect(self):
        """On an all-correct fixture, the wrong-code count and rate are zero."""
        predictions = [{"denumire": "X", "cod_pred": "OE-1"}]
        ground_truth = [{"denumire": "X", "cod_gold": "OE-1"}]
        metrics = he.eval_predictions(predictions, ground_truth)
        assert metrics["wrong_code_count"] == 0
        assert metrics["wrong_code_rate"] == 0.0


# ---------------------------------------------------------------------------
# Sectiunea 3: eval_predictions — acoperire (coverage)
# ---------------------------------------------------------------------------

class TestCoverage:
    """Coverage = fraction of entries with pred != '?' (an answer was given, code or NUL)."""

    def test_coverage_count(self):
        """Five of the six entries have pred != '?'."""
        metrics = he.eval_predictions(PREDS, GOLD)
        assert metrics["coverage_count"] == 5

    def test_coverage_rate(self):
        """``coverage_rate`` = 5/6."""
        metrics = he.eval_predictions(PREDS, GOLD)
        deviation = abs(metrics["coverage_rate"] - 5 / 6)
        assert deviation < 1e-9

    def test_coverage_zero_pe_toate_nerezolvate(self):
        """When every prediction is '?', coverage is zero."""
        predictions = [{"denumire": "X", "cod_pred": "?"}]
        ground_truth = [{"denumire": "X", "cod_gold": "OE-3"}]
        metrics = he.eval_predictions(predictions, ground_truth)
        assert metrics["coverage_count"] == 0
        assert metrics["coverage_rate"] == 0.0


# ---------------------------------------------------------------------------
# Sectiunea 4: eval_predictions — per_cod (TP/FP/FN + precision/recall)
# ---------------------------------------------------------------------------

class TestPerCod:
    """Per-label metrics: TP/FP/FN plus precision and recall for each code."""

    def test_per_cod_returnat(self):
        """``per_cod`` is a non-empty dict keyed by the labels present."""
        per_cod = he.eval_predictions(PREDS, GOLD)["per_cod"]
        assert isinstance(per_cod, dict)
        assert len(per_cod) > 0

    def test_per_cod_campuri(self):
        """Every code entry carries tp, fp, fn, precision and recall."""
        per_cod = he.eval_predictions(PREDS, GOLD)["per_cod"]
        for cod, stats in per_cod.items():
            assert "tp" in stats, f"tp lipsa pentru {cod}"
            assert "fp" in stats, f"fp lipsa pentru {cod}"
            assert "fn" in stats, f"fn lipsa pentru {cod}"
            assert "precision" in stats, f"precision lipsa pentru {cod}"
            assert "recall" in stats, f"recall lipsa pentru {cod}"

    def test_per_cod_oe1_precision(self):
        """OE-1 counts on the shared fixture.

        TP=1: VOPSIRE BARA FATA (pred=OE-1, gold=OE-1).
        FP=1: SCHIMB ULEI MOTOR (pred=OE-1, gold=OE-3).
        FN=1: D/R BARA FATA (gold=OE-1, pred=OE-2).
        precision = 1/(1+1) = 0.50, recall = 1/(1+1) = 0.50.
        """
        per_cod = he.eval_predictions(PREDS, GOLD)["per_cod"]
        oe1 = per_cod.get("OE-1", {})
        assert oe1.get("tp") == 1
        assert oe1.get("fp") == 1
        assert oe1.get("fn") == 1
        precision_gap = abs(oe1.get("precision") - 0.50)
        assert precision_gap < 1e-9
        recall_gap = abs(oe1.get("recall") - 0.50)
        assert recall_gap < 1e-9

    def test_per_cod_oe3_precision(self):
        """OE-3 counts on the shared fixture.

        Predicted OE-3 only for REVIZIE (correct); gold is OE-3 for REVIZIE
        and SCHIMB ULEI. TP=1 (REVIZIE), FP=0, FN=1 (SCHIMB ULEI, pred=OE-1).
        precision = 1.0, recall = 0.50.
        """
        per_cod = he.eval_predictions(PREDS, GOLD)["per_cod"]
        oe3 = per_cod.get("OE-3", {})
        assert oe3.get("tp") == 1
        assert oe3.get("fp") == 0
        assert oe3.get("fn") == 1
        precision_gap = abs(oe3.get("precision") - 1.0)
        assert precision_gap < 1e-9
        recall_gap = abs(oe3.get("recall") - 0.50)
        assert recall_gap < 1e-9

    def test_per_cod_precision_none_pe_necunoscut(self):
        """A code that appears only in gold (never predicted) has precision None or 0."""
        # OE-4 is the gold label for DIAGNOSTICARE, whose pred is '?':
        # FN=1, TP=0, FP=0.
        per_cod = he.eval_predictions(PREDS, GOLD)["per_cod"]
        oe4 = per_cod.get("OE-4", {})
        assert oe4.get("tp") == 0
        assert oe4.get("fp") == 0
        assert oe4.get("fn") == 1
        # Precision is undefined (0/0): both None and 0.0 are accepted.
        precision = oe4.get("precision")
        assert precision is None or precision == 0.0


# ---------------------------------------------------------------------------
# Sectiunea 5: eval_predictions — matrice confuzie
# ---------------------------------------------------------------------------

class TestConfusionMatrix:
    """Confusion matrix keyed by 'gold->pred' strings."""

    def test_confusion_matrix_returnat(self):
        """``confusion_matrix`` is a dict."""
        matrix = he.eval_predictions(PREDS, GOLD)["confusion_matrix"]
        assert isinstance(matrix, dict)

    def test_confusion_matrix_cod_gresit_prezent(self):
        """gold=OE-3, pred=OE-1 (SCHIMB ULEI) -> key 'OE-3->OE-1' with count 1."""
        matrix = he.eval_predictions(PREDS, GOLD)["confusion_matrix"]
        assert matrix.get("OE-3->OE-1") == 1

    def test_confusion_matrix_corect(self):
        """gold=OE-3, pred=OE-3 (REVIZIE) -> key 'OE-3->OE-3' with count 1."""
        matrix = he.eval_predictions(PREDS, GOLD)["confusion_matrix"]
        assert matrix.get("OE-3->OE-3") == 1


# ---------------------------------------------------------------------------
# Sectiunea 6: sample_stratified — esantionare stratificata determinista
# ---------------------------------------------------------------------------

# Fixture: 20 (denumire, nr) rows with strictly decreasing frequencies,
# already sorted descending: op_00=2000, op_01=1900, ..., op_19=100.
SAMPLE_ROWS = [
    (f"op_{idx:02d}", max(1, nr))
    for idx, nr in enumerate(range(2000, 0, -100))
]
# Stratum layout the original author expects from sample_stratified for n=20
# (NOTE(review): boundaries come from heldout_eval, not visible here - confirm):
#   head_end = max(1, round(20*0.20)) = 4
#   mid_end  = max(5, round(20*0.50)) = 10
#   cap    = [op_00..op_03] (4 items)
#   mijloc = [op_04..op_09] (6 items)
#   coada  = [op_10..op_19] (10 items)


class TestSampleStratified:
    """Properties of the stratified sampling routine."""

    def test_determinist_acelasi_seed(self):
        """The same seed gives the same result (deterministic)."""
        first = he.sample_stratified(SAMPLE_ROWS, n_sample=9, seed=42)
        second = he.sample_stratified(SAMPLE_ROWS, n_sample=9, seed=42)
        assert first == second

    def test_seed_diferit_rezultat_diferit(self):
        """A different seed (usually) gives a different result."""
        first = he.sample_stratified(SAMPLE_ROWS, n_sample=9, seed=42)
        second = he.sample_stratified(SAMPLE_ROWS, n_sample=9, seed=999)
        # Not guaranteed in general, but on 20 items a collision is
        # practically impossible.
        assert first != second

    def test_items_din_input(self):
        """Every returned item exists in the original input."""
        esantion = he.sample_stratified(SAMPLE_ROWS, n_sample=9, seed=42)
        cunoscute = set(SAMPLE_ROWS)
        for item in esantion:
            pereche = (item["denumire"], item["nr"])
            assert pereche in cunoscute

    def test_campuri_obligatorii(self):
        """Each item has the keys denumire, nr and strat."""
        esantion = he.sample_stratified(SAMPLE_ROWS, n_sample=9, seed=42)
        for item in esantion:
            for camp in ("denumire", "nr", "strat"):
                assert camp in item

    def test_strat_valid(self):
        """``strat`` values come exclusively from {'cap', 'mijloc', 'coada'}."""
        strate_valide = ("cap", "mijloc", "coada")
        esantion = he.sample_stratified(SAMPLE_ROWS, n_sample=9, seed=42)
        for item in esantion:
            assert item["strat"] in strate_valide

    def test_toate_stratele_reprezentate(self):
        """With a large enough n_sample, all three strata appear in the result."""
        # n_sample=15 out of 20 rows: every stratum should contribute at
        # least one item.
        esantion = he.sample_stratified(SAMPLE_ROWS, n_sample=15, seed=42)
        strate_prezente = {item["strat"] for item in esantion}
        for strat in ("cap", "mijloc", "coada"):
            assert strat in strate_prezente

    def test_dimensiune_aproape_de_n_sample(self):
        """The result size is close to n_sample (+/- 3 because of rounding)."""
        cerut = 9
        esantion = he.sample_stratified(SAMPLE_ROWS, n_sample=cerut, seed=42)
        abatere = abs(len(esantion) - cerut)
        assert abatere <= 3

    def test_fara_duplicate(self):
        """No item appears twice in the sample."""
        esantion = he.sample_stratified(SAMPLE_ROWS, n_sample=15, seed=42)
        denumiri = [item["denumire"] for item in esantion]
        unice = set(denumiri)
        assert len(denumiri) == len(unice)

    def test_input_gol(self):
        """Empty input returns an empty list without raising."""
        esantion = he.sample_stratified([], n_sample=10, seed=42)
        assert esantion == []

    def test_n_sample_mai_mare_decat_corpus(self):
        """When n_sample > len(rows), at most len(rows) items are returned."""
        esantion = he.sample_stratified(SAMPLE_ROWS, n_sample=1000, seed=42)
        assert len(esantion) <= len(SAMPLE_ROWS)


# ---------------------------------------------------------------------------
# Sectiunea 7: export_for_labeling — fisier CSV pt etichetare umana
# ---------------------------------------------------------------------------

class TestExportForLabeling:
    """The exported CSV has denumire, nr, strat and an EMPTY cod_gold column."""

    @staticmethod
    def _export(tmp_path):
        """Draw the 5-item sample, export it, and return ``(sample, path)``."""
        sample = he.sample_stratified(SAMPLE_ROWS, n_sample=5, seed=42)
        path = str(tmp_path / "esantion.csv")
        he.export_for_labeling(sample, path)
        return sample, path

    def test_fisier_creat(self, tmp_path):
        """The file is created at the requested path."""
        _, path = self._export(tmp_path)
        assert os.path.exists(path)

    def test_header_csv(self, tmp_path):
        """The CSV header (';'-delimited) includes denumire, nr, strat, cod_gold."""
        _, path = self._export(tmp_path)
        # utf-8-sig transparently strips a BOM if the exporter wrote one.
        with open(path, encoding="utf-8-sig") as f:
            coloane = csv.DictReader(f, delimiter=";").fieldnames
        assert "denumire" in coloane
        assert "nr" in coloane
        assert "strat" in coloane
        assert "cod_gold" in coloane

    def test_cod_gold_gol(self, tmp_path):
        """The cod_gold column is empty (to be filled in by a human operator)."""
        _, path = self._export(tmp_path)
        with open(path, encoding="utf-8-sig") as f:
            for row in csv.DictReader(f, delimiter=";"):
                # cod_gold must be blank: it must never be labelled by code.
                assert row["cod_gold"] == "", (
                    "cod_gold nu trebuie pre-completat: ar fi 'antrenare pe test' "
                    "(Decision #19 PRD 5.14)"
                )

    def test_n_randuri_egal_cu_sample(self, tmp_path):
        """The CSV has exactly as many data rows as the sample has items."""
        sample, path = self._export(tmp_path)
        with open(path, encoding="utf-8-sig") as f:
            randuri = list(csv.DictReader(f, delimiter=";"))
        assert len(randuri) == len(sample)


# ---------------------------------------------------------------------------
# Sectiunea 8: kill_criterion — pragul de acceptanta (F-E, PRD 5.14)
# ---------------------------------------------------------------------------

class TestKillCriterion:
    """
    Kill-criterion (F-E): the system PASSES when
      wrong_code_rate < wrong_code_threshold (default 0.5%)
      AND coverage_rate > coverage_threshold (default 50%).

    Rationale for the 0.5% (0.005) threshold:
      A workshop with 200 auto-resolved operations/day = 1 wrong FINALIZATA/day.
      FINALIZATA is irreversible (cf. PRD 5.14 Premise 3 / CLAUDE.md invariant).
      The threshold may be RELAXED on empirical grounds; it must not be
      TIGHTENED post hoc ("sesizare-in-timp").
    """

    def test_trece_cand_sub_prag(self):
        """Passes when wrong_code_rate < threshold and coverage_rate > min coverage."""
        rezultat = he.kill_criterion({
            "wrong_code_rate": 0.003,  # 0.3% < 0.5%
            "coverage_rate": 0.70,     # 70% > 50%
        })
        assert rezultat["passes"] is True

    def test_esueaza_cand_wrong_code_prea_mare(self):
        """Fails when wrong_code_rate >= threshold."""
        rezultat = he.kill_criterion({
            "wrong_code_rate": 0.02,   # 2% > 0.5% -> FAIL
            "coverage_rate": 0.70,
        })
        assert rezultat["passes"] is False
        motiv = rezultat["reason"].lower()
        assert "wrong_code" in motiv or "cod gresit" in motiv

    def test_esueaza_cand_coverage_prea_mica(self):
        """Fails when coverage_rate is below the minimum coverage threshold."""
        rezultat = he.kill_criterion({
            "wrong_code_rate": 0.001,
            "coverage_rate": 0.30,     # 30% < 50% -> FAIL
        })
        assert rezultat["passes"] is False
        motiv = rezultat["reason"].lower()
        assert "acoperire" in motiv or "coverage" in motiv

    def test_esueaza_pe_ambele_conditii(self):
        """Fails when both conditions are violated."""
        rezultat = he.kill_criterion({
            "wrong_code_rate": 0.05,
            "coverage_rate": 0.10,
        })
        assert rezultat["passes"] is False

    def test_campuri_obligatorii_in_rezultat(self):
        """The result has passes, reason, wrong_code_rate, coverage_rate, thresholds."""
        r = he.kill_criterion({"wrong_code_rate": 0.001, "coverage_rate": 0.80})
        asteptate = ("passes", "reason", "wrong_code_rate", "coverage_rate", "thresholds")
        for camp in asteptate:
            assert camp in r, f"Camp lipsa: {camp}"

    def test_threshold_customizabil(self):
        """The thresholds can be overridden by the caller."""
        masurat = {"wrong_code_rate": 0.05, "coverage_rate": 0.80}
        # With a laxer wrong-code threshold the same metrics pass.
        rezultat = he.kill_criterion(masurat, wrong_code_threshold=0.10)
        assert rezultat["passes"] is True

    def test_exact_pe_prag_nu_trece(self):
        """Exactly on the threshold (equality) does not pass: '<' is strict."""
        prag = he.DEFAULT_WRONG_CODE_THRESHOLD
        rezultat = he.kill_criterion({"wrong_code_rate": prag, "coverage_rate": 0.80})
        # wrong_code_rate == threshold is NOT < threshold -> FAIL
        assert rezultat["passes"] is False

    def test_reason_descrie_starea(self):
        """``reason`` is a non-empty string explaining the pass/fail outcome."""
        rezultat = he.kill_criterion({"wrong_code_rate": 0.001, "coverage_rate": 0.80})
        motiv = rezultat["reason"]
        assert isinstance(motiv, str)
        assert len(motiv) > 0


# ---------------------------------------------------------------------------
# Sectiunea 9: constante si metadate modul
# ---------------------------------------------------------------------------

class TestModulMetadata:
    """Checks that the documented module-level constants exist."""

    def test_valid_rar_definit(self):
        """VALID_RAR is a frozenset holding at least 18 RAR codes."""
        assert hasattr(he, "VALID_RAR")
        coduri = he.VALID_RAR
        assert isinstance(coduri, frozenset)
        assert len(coduri) >= 18

    def test_nul_in_all_labels_nu_in_valid_rar(self):
        """NUL is a special (suppression) label, NOT a valid RAR code."""
        assert "NUL" not in he.VALID_RAR
        # NUL must still be exposed by the module.
        assert hasattr(he, "NUL")
        assert he.NUL == "NUL"

    def test_default_seed(self):
        """DEFAULT_SEED exists and is an integer."""
        assert hasattr(he, "DEFAULT_SEED")
        seed = he.DEFAULT_SEED
        assert isinstance(seed, int)

    def test_default_thresholds_in_range(self):
        """Both default thresholds lie strictly inside (0, 1)."""
        prag_cod_gresit = he.DEFAULT_WRONG_CODE_THRESHOLD
        assert 0 < prag_cod_gresit < 1
        prag_acoperire = he.DEFAULT_COVERAGE_THRESHOLD
        assert 0 < prag_acoperire < 1