feat(5.18): corpus k-NN exemple etichetate + seed real Haiku (17181 op)

Seed app/data/operatii-etichetate.json regenerat cu subagenti Haiku pe TOATE cele 17181 operatii distincte (ordine frecventa, 100%), inlocuind seed-ul Groq (3758). Validare Haiku vs Groq pe 157 op etichetate: la dezacorduri Haiku corect ~22/30, Groq ~0. Haiku prinde gunoiul ratat de Groq (ITP, chirie anvelope, nume piese fara actiune): NUL 2200 (12.8%) vs ~7.6% Groq; adaptare electronica OE-7 (nu OE-5), placute frana uzura OE-1 (nu OE-F avarie). US-001..006: prefiltru NUL determinist, etichetator offline, generator seed, seeder mapping_suggestions (in init_db, gated seed_operatii_enabled), embeddings indexeaza corpus etichetat, enrich NUL+kNN. Distributie seed: OE-1 80.1%, NUL 12.8%, OE-2 3.5%, restul rar (OE-4/3/7/8/R/I/5, AITLV, R-ODO). config: seed_operatii_enabled=True + embeddings_enabled=True implicit (SILVER populat + sugestii semantice; ambele suggestion-only, dezactivabile prin env). Suita: 1387 passed, 1 deselected (live). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 06:59:15 +00:00
parent c05fa00007
commit 756f77730f
17 changed files with 139308 additions and 44 deletions
--- a/tests/test_genereaza_seed.py
+++ b/tests/test_genereaza_seed.py
@@ -0,0 +1,175 @@
+"""US-003 (PRD 5.18) — generare seed etichetat in faze pe frecventa.
+
+Pipeline dedup OBLIGATORIU inainte de orice apel LLM (D5):
+  brut -> normalize_for_match -> arunca chei vide -> dedup pe cheie (freq=suma NR)
+  -> reuse etichete existente (labels-groq + seed comis, conflict freq-max) -> de_etichetat.
+
+Idempotenta cross-run (F2/F7): a doua rulare consuma seedul comis ca cache -> 0 apeluri LLM.
+Toate testele FARA retea: `clasifica` e injectat (mock care inregistreaza ce primeste).
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import json
+import os
+import sys
+
+
+def _load(name: str):
+    """Load ``tools/mapare-llm/<name>.py`` by file path and return the module.
+
+    The path is resolved relative to this test file's directory. The module is
+    registered in ``sys.modules`` under ``name`` before its body is executed.
+
+    NOTE(review): presumably loaded by path because the hyphenated directory
+    name ``mapare-llm`` cannot be used in a regular ``import`` statement —
+    confirm.
+    """
+    path = os.path.join(os.path.dirname(__file__), "..", "tools", "mapare-llm", f"{name}.py")
+    spec = importlib.util.spec_from_file_location(name, path)
+    mod = importlib.util.module_from_spec(spec)
+    sys.modules[name] = mod
+    spec.loader.exec_module(mod)
+    return mod
+
+
+# Module under test, loaded once at import time via _load().
+gs = _load("genereaza_seed")
+
+
+def _scrie_csv(path, randuri):
+    """Write a ``;``-separated, fully quoted CSV fixture to ``path``.
+
+    ``randuri`` is a sequence of ``(denumire, nr)`` pairs. The file starts with
+    the header ``"   ";"DENOP";"NR"`` and every data row is prefixed with a
+    1-based row index. The layout is meant to match the CSV files under
+    docs/operatii-service. Values are interpolated as-is, without escaping.
+    """
+    linii = ['"   ";"DENOP";"NR"']
+    for i, (den, nr) in enumerate(randuri, 1):
+        linii.append(f'"{i}";"{den}";"{nr}"')
+    path.write_text("\n".join(linii) + "\n", encoding="utf-8")
+
+
+def _mock_recorder():
+    """Return ``(clasifica, vazute)``: a fake classifier plus its call log.
+
+    ``clasifica`` answers ``"OE-1"`` for every item of the batch it receives and
+    appends a list copy of that batch to ``vazute``, so tests can inspect
+    exactly what was sent for labelling.
+    """
+    vazute = []
+
+    def clasifica(batch):
+        vazute.append(list(batch))
+        return ["OE-1"] * len(batch)
+
+    return clasifica, vazute
+
+
+# --------------------------------------------------------------------------- #
+
+def test_dedup_normalizat(tmp_path):
+    """Rows differing only in case/whitespace merge into one key with summed freq."""
+    f1 = tmp_path / "a.csv"
+    f2 = tmp_path / "b.csv"
+    _scrie_csv(f1, [("REVIZIE", 10), ("D/R BARA FATA", 3)])
+    _scrie_csv(f2, [("  revizie  ", 5)])  # same logical operation; differs in case + spaces
+    corpus = gs.agrega_corpus([str(f1), str(f2)])
+    assert "REVIZIE" in corpus
+    assert corpus["REVIZIE"]["freq"] == 15           # 10 + 5, deduplicated by key
+    assert len([k for k in corpus]) == 2             # REVIZIE + D/R BARA FATA
+
+
+def test_skip_cheie_normalizata_vida(tmp_path):
+    """A whitespace-only operation name is dropped from the aggregated corpus."""
+    f = tmp_path / "a.csv"
+    _scrie_csv(f, [("   ", 99), ("REVIZIE", 5)])     # empty key (spaces only)
+    corpus = gs.agrega_corpus([str(f)])
+    assert "" not in corpus
+    assert list(corpus) == ["REVIZIE"]
+
+
+def test_ordine_pe_frecventa(tmp_path):
+    """The classifier receives operations sorted by descending frequency."""
+    f = tmp_path / "a.csv"
+    _scrie_csv(f, [("OP MICA", 5), ("OP MARE", 50), ("OP MEDIE", 20)])
+    seed = tmp_path / "seed.json"
+    clasifica, vazute = _mock_recorder()
+    gs.genereaza([str(f)], labels_path=None, seed_path=str(seed),
+                 etichetare_all=True, clasifica=clasifica, batch=32)
+    # The order in which the (mocked) LLM saw the operations = descending by frequency.
+    primul_batch = vazute[0]
+    assert primul_batch[:3] == ["OP MARE", "OP MEDIE", "OP MICA"]
+
+
+def test_reuse_in_spatiu_normalizat(tmp_path):
+    """An existing label is reused when its key matches after normalization.
+
+    The CSV holds ``Revizie`` while the labels file holds ``REVIZIE``; the
+    operation must not be sent to the classifier and the seed must carry the
+    pre-existing code.
+    """
+    f = tmp_path / "a.csv"
+    _scrie_csv(f, [("Revizie", 10), ("SCHIMB ULEI", 5)])
+    labels = tmp_path / "labels.json"
+    labels.write_text(json.dumps({"REVIZIE": "OE-3"}), encoding="utf-8")  # keyed by raw text, but normalizes to the same key
+    seed = tmp_path / "seed.json"
+    clasifica, vazute = _mock_recorder()
+    gs.genereaza([str(f)], labels_path=str(labels), seed_path=str(seed),
+                 etichetare_all=True, clasifica=clasifica)
+    trimise = {d for b in vazute for d in b}
+    assert "Revizie" not in trimise and "REVIZIE" not in trimise  # already labelled -> not sent
+    seed_data = json.loads(seed.read_text(encoding="utf-8"))
+    rev = [e for e in seed_data if e["denumire_normalizata"] == "REVIZIE"][0]
+    assert rev["cod"] == "OE-3"
+
+
+def test_reuse_conflict_determinist(tmp_path):
+    """Conflicting labels for one normalized key resolve to the most frequent variant."""
+    f = tmp_path / "a.csv"
+    # Two raw variants of the same key, labelled differently; frequency decides.
+    _scrie_csv(f, [("CURATAT CATALIZATOR", 100), ("curatat catalizator", 5)])
+    labels = tmp_path / "labels.json"
+    labels.write_text(json.dumps({
+        "CURATAT CATALIZATOR": "OE-1",   # freq 100
+        "curatat catalizator": "OE-2",   # freq 5
+    }), encoding="utf-8")
+    seed = tmp_path / "seed.json"
+    clasifica, _ = _mock_recorder()
+    gs.genereaza([str(f)], labels_path=str(labels), seed_path=str(seed), etichetare_all=True, clasifica=clasifica)
+    seed_data = json.loads(seed.read_text(encoding="utf-8"))
+    cat = [e for e in seed_data if e["denumire_normalizata"] == "CURATAT CATALIZATOR"][0]
+    assert cat["cod"] == "OE-1"   # highest frequency wins (100 > 5)
+
+
+def test_zero_duplicate_trimis_la_llm(tmp_path):
+    """Nothing duplicated, empty, or already labelled reaches the classifier.
+
+    Uses in-file and cross-file duplicates, a whitespace-only row and a
+    pre-labelled key, then checks the normalized keys of everything the mock
+    classifier received.
+    """
+    f1 = tmp_path / "a.csv"
+    f2 = tmp_path / "b.csv"
+    _scrie_csv(f1, [("REVIZIE", 10), ("  revizie ", 4), ("OP NOUA", 7), ("   ", 3)])
+    _scrie_csv(f2, [("REVIZIE", 2), ("OP NOUA", 1)])  # cross-file duplicate
+    labels = tmp_path / "labels.json"
+    labels.write_text(json.dumps({"REVIZIE": "OE-3"}), encoding="utf-8")  # REVIZIE is already labelled
+    seed = tmp_path / "seed.json"
+    clasifica, vazute = _mock_recorder()
+    from app.mapping import normalize_for_match
+    gs.genereaza([str(f1), str(f2)], labels_path=str(labels), seed_path=str(seed),
+                 etichetare_all=True, clasifica=clasifica)
+    trimise = [d for b in vazute for d in b]
+    chei = [normalize_for_match(d) for d in trimise]
+    assert len(chei) == len(set(chei))          # no normalized key sent twice
+    assert "" not in chei                        # no empty key
+    assert "REVIZIE" not in chei                 # no already-labelled key
+    assert "OP NOUA" in chei                     # only what is missing
+
+
+def test_rerun_zero_apeluri_llm(tmp_path):
+    """The real idempotence criterion (F2/F7): a second run makes 0 LLM calls and leaves the seed identical."""
+    f = tmp_path / "a.csv"
+    _scrie_csv(f, [("OP UNU", 10), ("OP DOI", 5)])
+    seed = tmp_path / "seed.json"
+
+    clasifica1, vazute1 = _mock_recorder()
+    gs.genereaza([str(f)], labels_path=None, seed_path=str(seed), etichetare_all=True, clasifica=clasifica1)
+    assert sum(len(b) for b in vazute1) == 2     # first run labels both operations
+    bytes1 = seed.read_bytes()
+
+    clasifica2, vazute2 = _mock_recorder()
+    gs.genereaza([str(f)], labels_path=None, seed_path=str(seed), etichetare_all=True, clasifica=clasifica2)
+    assert vazute2 == []                          # second run: 0 LLM calls (seed acts as cache)
+    bytes2 = seed.read_bytes()
+    assert bytes1 == bytes2                        # seed is byte-for-byte identical
+
+
+def test_format_seed_valid(tmp_path):
+    """The written seed has unique non-empty keys, the required fields, and NUL rows without a code."""
+    f = tmp_path / "a.csv"
+    _scrie_csv(f, [("OP REALA", 10), ("13 X ITP", 5)])
+    seed = tmp_path / "seed.json"
+
+    def clasifica(batch):
+        # Label anything containing ITP as NUL, everything else as OE-1.
+        return ["NUL" if "ITP" in d.upper() else "OE-1" for d in batch]
+
+    gs.genereaza([str(f)], labels_path=None, seed_path=str(seed), etichetare_all=True, clasifica=clasifica)
+    data = json.loads(seed.read_text(encoding="utf-8"))
+    chei = [e["denumire_normalizata"] for e in data]
+    assert len(chei) == len(set(chei))            # unique
+    assert all(e["denumire_normalizata"] for e in data)  # non-empty
+    for e in data:
+        assert set(e) >= {"denumire", "denumire_normalizata", "cod", "is_nul", "source", "confidence"}
+        if e["is_nul"]:
+            assert e["cod"] is None                # NUL -> cod is null (mirrors the DB CHECK constraint)
+        else:
+            assert e["cod"]
+    nul = [e for e in data if e["is_nul"]][0]
+    assert "ITP" in nul["denumire_normalizata"]