"""US-003 (PRD 5.18) — generare seed etichetat in faze pe frecventa.

Pipeline dedup OBLIGATORIU inainte de orice apel LLM (D5):
  brut -> normalize_for_match -> arunca chei vide -> dedup pe cheie (freq=suma NR)
  -> reuse etichete existente (labels-groq + seed comis, conflict freq-max) -> de_etichetat.

Idempotenta cross-run (F2/F7): a doua rulare consuma seedul comis ca cache -> 0 apeluri LLM.
Toate testele FARA retea: `clasifica` e injectat (mock care inregistreaza ce primeste).
"""

from __future__ import annotations

import importlib.util
import json
import os
import sys


def _load(name: str):
    """Load ``tools/mapare-llm/<name>.py`` as a module and register it in ``sys.modules``.

    The file is located relative to this file's directory (one level up, then
    ``tools/mapare-llm``). That directory name contains a hyphen, so it is not
    a valid package name for a regular ``import`` statement.

    Args:
        name: Module name; also the stem of the ``.py`` file to load.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec (or loader) can be built for the path.
        Exception: Whatever executing the module raises (for example
            ``FileNotFoundError`` when the file does not exist). In that case
            ``sys.modules`` is restored to its state before the call.
    """
    path = os.path.join(os.path.dirname(__file__), "..", "tools", "mapare-llm", f"{name}.py")
    spec = importlib.util.spec_from_file_location(name, path)
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot build an import spec for {name!r} at {path}")
    mod = importlib.util.module_from_spec(spec)

    absent = object()  # sentinel: distinguishes "no previous entry" from any real module
    previous = sys.modules.get(name, absent)
    # Register before executing so that code run during the import can already
    # find the module under its own name.
    sys.modules[name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        # Do not leave a half-initialised module behind for later lookups.
        if previous is absent:
            sys.modules.pop(name, None)
        else:
            sys.modules[name] = previous
        raise
    return mod


# Module under test: tools/mapare-llm/genereaza_seed.py, loaded once when this test file is imported.
gs = _load("genereaza_seed")


def _scrie_csv(path, randuri):
    """Write a ``;``-separated, fully quoted CSV fixture to *path* (UTF-8).

    ``randuri`` is an iterable of ``(denumire, nr)`` pairs. The file starts with
    the header ``"   ";"DENOP";"NR"`` and then holds one row per pair, numbered
    from 1; every line ends with a newline. The original note says this mirrors
    the CSV format of docs/operatii-service.
    """
    antet = '"   ";"DENOP";"NR"'
    corp = [
        f'"{idx}";"{den}";"{nr}"'
        for idx, (den, nr) in enumerate(randuri, start=1)
    ]
    continut = "".join(f"{linie}\n" for linie in [antet, *corp])
    path.write_text(continut, encoding="utf-8")


def _mock_recorder():
    """Build a fake classifier together with the log of every batch it receives.

    Returns a ``(clasifica, vazute)`` pair: ``clasifica(batch)`` stores a list
    copy of ``batch`` in ``vazute`` and answers ``"OE-1"`` for every item.
    """
    loturi_primite = []

    def clasifica(batch):
        # Snapshot the batch so later mutation by the caller cannot alter the log.
        instantaneu = list(batch)
        loturi_primite.append(instantaneu)
        raspuns = ["OE-1"] * len(batch)
        return raspuns

    return clasifica, loturi_primite


# --------------------------------------------------------------------------- #

def test_dedup_normalizat(tmp_path):
    """Names differing only by case/padding must land on one key with summed frequency."""
    primul, al_doilea = tmp_path / "a.csv", tmp_path / "b.csv"
    _scrie_csv(primul, [("REVIZIE", 10), ("D/R BARA FATA", 3)])
    # Same logical operation as REVIZIE, written lower-case and padded with spaces.
    _scrie_csv(al_doilea, [("  revizie  ", 5)])

    corpus = gs.agrega_corpus([str(primul), str(al_doilea)])

    assert "REVIZIE" in corpus
    # 10 + 5: both spellings are expected under the same key.
    assert corpus["REVIZIE"]["freq"] == 15
    # Exactly two keys expected: REVIZIE and D/R BARA FATA.
    assert len(list(corpus)) == 2


def test_skip_cheie_normalizata_vida(tmp_path):
    """A whitespace-only name must not appear in the aggregated corpus."""
    sursa = tmp_path / "a.csv"
    # First row is spaces only; the test expects it to be dropped.
    randuri = [("   ", 99), ("REVIZIE", 5)]
    _scrie_csv(sursa, randuri)

    corpus = gs.agrega_corpus([str(sursa)])

    assert "" not in corpus
    chei = list(corpus)
    assert chei == ["REVIZIE"]


def test_ordine_pe_frecventa(tmp_path):
    """The classifier must receive operations ordered by descending frequency."""
    sursa = tmp_path / "a.csv"
    _scrie_csv(sursa, [("OP MICA", 5), ("OP MARE", 50), ("OP MEDIE", 20)])
    cale_seed = tmp_path / "seed.json"
    clasifica, vazute = _mock_recorder()

    gs.genereaza(
        [str(sursa)],
        labels_path=None,
        seed_path=str(cale_seed),
        etichetare_all=True,
        clasifica=clasifica,
        batch=32,
    )

    # The order in which the (mock) LLM saw the operations: 50, 20, 5.
    asteptat = ["OP MARE", "OP MEDIE", "OP MICA"]
    assert vazute[0][:3] == asteptat


def test_reuse_in_spatiu_normalizat(tmp_path):
    """An existing label is reused when its key matches after normalization."""
    sursa = tmp_path / "a.csv"
    labels = tmp_path / "labels.json"
    seed = tmp_path / "seed.json"
    _scrie_csv(sursa, [("Revizie", 10), ("SCHIMB ULEI", 5)])
    # The label is keyed by a raw name that normalizes to the same key as "Revizie".
    etichete_existente = {"REVIZIE": "OE-3"}
    labels.write_text(json.dumps(etichete_existente), encoding="utf-8")
    clasifica, vazute = _mock_recorder()

    gs.genereaza(
        [str(sursa)],
        labels_path=str(labels),
        seed_path=str(seed),
        etichetare_all=True,
        clasifica=clasifica,
    )

    trimise = set()
    for lot in vazute:
        trimise.update(lot)
    # Already labelled, so neither spelling may reach the classifier.
    assert "Revizie" not in trimise
    assert "REVIZIE" not in trimise

    continut_seed = json.loads(seed.read_text(encoding="utf-8"))
    potrivite = [e for e in continut_seed if e["denumire_normalizata"] == "REVIZIE"]
    assert potrivite[0]["cod"] == "OE-3"


def test_reuse_conflict_determinist(tmp_path):
    """Conflicting labels for one normalized key: the higher-frequency variant wins."""
    sursa = tmp_path / "a.csv"
    labels = tmp_path / "labels.json"
    seed = tmp_path / "seed.json"
    # Two raw spellings of the same key, labelled differently; frequency decides.
    _scrie_csv(sursa, [("CURATAT CATALIZATOR", 100), ("curatat catalizator", 5)])
    etichete_in_conflict = {
        "CURATAT CATALIZATOR": "OE-1",   # freq 100
        "curatat catalizator": "OE-2",   # freq 5
    }
    labels.write_text(json.dumps(etichete_in_conflict), encoding="utf-8")
    clasifica, _ = _mock_recorder()

    gs.genereaza(
        [str(sursa)],
        labels_path=str(labels),
        seed_path=str(seed),
        etichetare_all=True,
        clasifica=clasifica,
    )

    continut_seed = json.loads(seed.read_text(encoding="utf-8"))
    potrivite = [
        e for e in continut_seed
        if e["denumire_normalizata"] == "CURATAT CATALIZATOR"
    ]
    # The label attached to the most frequent variant is expected (100 > 5).
    assert potrivite[0]["cod"] == "OE-1"


def test_zero_duplicate_trimis_la_llm(tmp_path):
    """Only missing keys reach the classifier: no duplicates, no empty keys, no labelled keys."""
    from app.mapping import normalize_for_match

    primul, al_doilea = tmp_path / "a.csv", tmp_path / "b.csv"
    labels = tmp_path / "labels.json"
    seed = tmp_path / "seed.json"
    _scrie_csv(primul, [("REVIZIE", 10), ("  revizie ", 4), ("OP NOUA", 7), ("   ", 3)])
    # Second file repeats names from the first one (cross-file duplicates).
    _scrie_csv(al_doilea, [("REVIZIE", 2), ("OP NOUA", 1)])
    # REVIZIE already has a label.
    labels.write_text(json.dumps({"REVIZIE": "OE-3"}), encoding="utf-8")
    clasifica, vazute = _mock_recorder()

    gs.genereaza(
        [str(primul), str(al_doilea)],
        labels_path=str(labels),
        seed_path=str(seed),
        etichetare_all=True,
        clasifica=clasifica,
    )

    trimise = []
    for lot in vazute:
        trimise.extend(lot)
    chei = list(map(normalize_for_match, trimise))

    # No normalized key was sent twice.
    assert len(chei) == len(set(chei))
    # No empty key was sent.
    assert "" not in chei
    # No already-labelled key was sent.
    assert "REVIZIE" not in chei
    # The one key that lacked a label was sent.
    assert "OP NOUA" in chei


def test_rerun_zero_apeluri_llm(tmp_path):
    """Idempotence criterion (F2/F7): a second run makes 0 LLM calls and leaves the seed identical."""
    sursa = tmp_path / "a.csv"
    seed = tmp_path / "seed.json"
    _scrie_csv(sursa, [("OP UNU", 10), ("OP DOI", 5)])
    fisiere = [str(sursa)]

    # First run: both operations are expected to be sent for labelling.
    clasifica_1, vazute_1 = _mock_recorder()
    gs.genereaza(fisiere, labels_path=None, seed_path=str(seed),
                 etichetare_all=True, clasifica=clasifica_1)
    total_trimise = sum(len(lot) for lot in vazute_1)
    assert total_trimise == 2
    dupa_prima_rulare = seed.read_bytes()

    # Second run: the written seed is expected to act as a cache, so no batch is sent.
    clasifica_2, vazute_2 = _mock_recorder()
    gs.genereaza(fisiere, labels_path=None, seed_path=str(seed),
                 etichetare_all=True, clasifica=clasifica_2)
    assert vazute_2 == []
    dupa_a_doua_rulare = seed.read_bytes()

    # The seed file must be byte-for-byte unchanged.
    assert dupa_prima_rulare == dupa_a_doua_rulare


def test_format_seed_valid(tmp_path):
    """Seed entries have unique non-empty keys, the required fields, and ``cod`` consistent with ``is_nul``."""
    sursa = tmp_path / "a.csv"
    seed = tmp_path / "seed.json"
    _scrie_csv(sursa, [("OP REALA", 10), ("13 X ITP", 5)])

    def clasifica(batch):
        # Anything mentioning ITP is labelled NUL; everything else OE-1.
        raspunsuri = []
        for denumire in batch:
            if "ITP" in denumire.upper():
                raspunsuri.append("NUL")
            else:
                raspunsuri.append("OE-1")
        return raspunsuri

    gs.genereaza(
        [str(sursa)],
        labels_path=None,
        seed_path=str(seed),
        etichetare_all=True,
        clasifica=clasifica,
    )

    intrari = json.loads(seed.read_text(encoding="utf-8"))
    chei = [e["denumire_normalizata"] for e in intrari]
    # Keys are unique ...
    assert len(chei) == len(set(chei))
    # ... and none of them is empty.
    assert all(chei)

    campuri_obligatorii = {"denumire", "denumire_normalizata", "cod", "is_nul", "source", "confidence"}
    for intrare in intrari:
        assert campuri_obligatorii <= set(intrare)
        if intrare["is_nul"]:
            # NUL entries carry no code (the original note says this mirrors the DB CHECK).
            assert intrare["cod"] is None
        else:
            assert intrare["cod"]

    intrari_nul = [e for e in intrari if e["is_nul"]]
    assert "ITP" in intrari_nul[0]["denumire_normalizata"]