"""US-003 (PRD 5.18) — generare seed etichetat in faze pe frecventa. Pipeline dedup OBLIGATORIU inainte de orice apel LLM (D5): brut -> normalize_for_match -> arunca chei vide -> dedup pe cheie (freq=suma NR) -> reuse etichete existente (labels-groq + seed comis, conflict freq-max) -> de_etichetat. Idempotenta cross-run (F2/F7): a doua rulare consuma seedul comis ca cache -> 0 apeluri LLM. Toate testele FARA retea: `clasifica` e injectat (mock care inregistreaza ce primeste). """ from __future__ import annotations import importlib.util import json import os import sys def _load(name: str): path = os.path.join(os.path.dirname(__file__), "..", "tools", "mapare-llm", f"{name}.py") spec = importlib.util.spec_from_file_location(name, path) mod = importlib.util.module_from_spec(spec) sys.modules[name] = mod spec.loader.exec_module(mod) return mod gs = _load("genereaza_seed") def _scrie_csv(path, randuri): """randuri = [(denumire, nr)]. Format CSV ca docs/operatii-service (`;`, header).""" linii = ['" ";"DENOP";"NR"'] for i, (den, nr) in enumerate(randuri, 1): linii.append(f'"{i}";"{den}";"{nr}"') path.write_text("\n".join(linii) + "\n", encoding="utf-8") def _mock_recorder(): """Returneaza (clasifica, vazute) — clasifica raspunde OE-1 pe tot, inregistreaza inputul.""" vazute = [] def clasifica(batch): vazute.append(list(batch)) return ["OE-1"] * len(batch) return clasifica, vazute # --------------------------------------------------------------------------- # def test_dedup_normalizat(tmp_path): f1 = tmp_path / "a.csv" f2 = tmp_path / "b.csv" _scrie_csv(f1, [("REVIZIE", 10), ("D/R BARA FATA", 3)]) _scrie_csv(f2, [(" revizie ", 5)]) # acelasi logic, case+spatii corpus = gs.agrega_corpus([str(f1), str(f2)]) assert "REVIZIE" in corpus assert corpus["REVIZIE"]["freq"] == 15 # 10 + 5, dedup pe cheie assert len([k for k in corpus]) == 2 # REVIZIE + D/R BARA FATA def test_skip_cheie_normalizata_vida(tmp_path): f = tmp_path / "a.csv" _scrie_csv(f, [(" ", 99), ("REVIZIE", 5)]) # cheie vida (doar spatii) corpus = gs.agrega_corpus([str(f)]) assert "" not in corpus assert list(corpus) == ["REVIZIE"] def test_ordine_pe_frecventa(tmp_path): f = tmp_path / "a.csv" _scrie_csv(f, [("OP MICA", 5), ("OP MARE", 50), ("OP MEDIE", 20)]) seed = tmp_path / "seed.json" clasifica, vazute = _mock_recorder() gs.genereaza([str(f)], labels_path=None, seed_path=str(seed), etichetare_all=True, clasifica=clasifica, batch=32) # Ordinea in care LLM-ul a vazut operatiile = desc pe frecventa. primul_batch = vazute[0] assert primul_batch[:3] == ["OP MARE", "OP MEDIE", "OP MICA"] def test_reuse_in_spatiu_normalizat(tmp_path): f = tmp_path / "a.csv" _scrie_csv(f, [("Revizie", 10), ("SCHIMB ULEI", 5)]) labels = tmp_path / "labels.json" labels.write_text(json.dumps({"REVIZIE": "OE-3"}), encoding="utf-8") # cheiat brut, dar normalizeaza la fel seed = tmp_path / "seed.json" clasifica, vazute = _mock_recorder() gs.genereaza([str(f)], labels_path=str(labels), seed_path=str(seed), etichetare_all=True, clasifica=clasifica) trimise = {d for b in vazute for d in b} assert "Revizie" not in trimise and "REVIZIE" not in trimise # deja etichetat -> nu se trimite seed_data = json.loads(seed.read_text(encoding="utf-8")) rev = [e for e in seed_data if e["denumire_normalizata"] == "REVIZIE"][0] assert rev["cod"] == "OE-3" def test_reuse_conflict_determinist(tmp_path): f = tmp_path / "a.csv" # Doua variante raw ale aceleiasi chei, etichetate diferit; freq decide. _scrie_csv(f, [("CURATAT CATALIZATOR", 100), ("curatat catalizator", 5)]) labels = tmp_path / "labels.json" labels.write_text(json.dumps({ "CURATAT CATALIZATOR": "OE-1", # freq 100 "curatat catalizator": "OE-2", # freq 5 }), encoding="utf-8") seed = tmp_path / "seed.json" clasifica, _ = _mock_recorder() gs.genereaza([str(f)], labels_path=str(labels), seed_path=str(seed), etichetare_all=True, clasifica=clasifica) seed_data = json.loads(seed.read_text(encoding="utf-8")) cat = [e for e in seed_data if e["denumire_normalizata"] == "CURATAT CATALIZATOR"][0] assert cat["cod"] == "OE-1" # freq-max castiga (100 > 5) def test_zero_duplicate_trimis_la_llm(tmp_path): f1 = tmp_path / "a.csv" f2 = tmp_path / "b.csv" _scrie_csv(f1, [("REVIZIE", 10), (" revizie ", 4), ("OP NOUA", 7), (" ", 3)]) _scrie_csv(f2, [("REVIZIE", 2), ("OP NOUA", 1)]) # cross-file duplicate labels = tmp_path / "labels.json" labels.write_text(json.dumps({"REVIZIE": "OE-3"}), encoding="utf-8") # REVIZIE deja etichetat seed = tmp_path / "seed.json" clasifica, vazute = _mock_recorder() from app.mapping import normalize_for_match gs.genereaza([str(f1), str(f2)], labels_path=str(labels), seed_path=str(seed), etichetare_all=True, clasifica=clasifica) trimise = [d for b in vazute for d in b] chei = [normalize_for_match(d) for d in trimise] assert len(chei) == len(set(chei)) # nicio cheie normalizata trimisa de doua ori assert "" not in chei # nicio cheie vida assert "REVIZIE" not in chei # nicio cheie deja etichetata assert "OP NOUA" in chei # doar ce lipseste def test_rerun_zero_apeluri_llm(tmp_path): """Criteriul real de idempotenta (F2/F7): a doua rulare = 0 apeluri LLM, seed identic.""" f = tmp_path / "a.csv" _scrie_csv(f, [("OP UNU", 10), ("OP DOI", 5)]) seed = tmp_path / "seed.json" clasifica1, vazute1 = _mock_recorder() gs.genereaza([str(f)], labels_path=None, seed_path=str(seed), etichetare_all=True, clasifica=clasifica1) assert sum(len(b) for b in vazute1) == 2 # prima rulare eticheteaza ambele bytes1 = seed.read_bytes() clasifica2, vazute2 = _mock_recorder() gs.genereaza([str(f)], labels_path=None, seed_path=str(seed), etichetare_all=True, clasifica=clasifica2) assert vazute2 == [] # a doua rulare: 0 apeluri LLM (seed = cache) bytes2 = seed.read_bytes() assert bytes1 == bytes2 # seed identic byte-cu-byte def test_format_seed_valid(tmp_path): f = tmp_path / "a.csv" _scrie_csv(f, [("OP REALA", 10), ("13 X ITP", 5)]) seed = tmp_path / "seed.json" def clasifica(batch): # marcheaza ITP ca NUL, restul OE-1 return ["NUL" if "ITP" in d.upper() else "OE-1" for d in batch] gs.genereaza([str(f)], labels_path=None, seed_path=str(seed), etichetare_all=True, clasifica=clasifica) data = json.loads(seed.read_text(encoding="utf-8")) chei = [e["denumire_normalizata"] for e in data] assert len(chei) == len(set(chei)) # unice assert all(e["denumire_normalizata"] for e in data) # non-vide for e in data: assert set(e) >= {"denumire", "denumire_normalizata", "cod", "is_nul", "source", "confidence"} if e["is_nul"]: assert e["cod"] is None # NUL -> cod NULL (oglindeste CHECK-ul DB) else: assert e["cod"] nul = [e for e in data if e["is_nul"]][0] assert "ITP" in nul["denumire_normalizata"]