feat(5.18): corpus k-NN exemple etichetate + seed real Haiku (17181 op)

Seed app/data/operatii-etichetate.json regenerat cu subagenti Haiku pe TOATE
cele 17181 operatii distincte (ordine frecventa, 100%), inlocuind seed-ul Groq
(3758). Validare Haiku vs Groq pe 157 op etichetate: la dezacorduri Haiku corect
~22/30, Groq ~0. Haiku prinde gunoiul ratat de Groq (ITP, chirie anvelope, nume
piese fara actiune): NUL 2200 (12.8%) vs ~7.6% Groq; adaptare electronica OE-7
(nu OE-5), placute frana uzura OE-1 (nu OE-F avarie).

US-001..006: prefiltru NUL determinist, etichetator offline, generator seed,
seeder mapping_suggestions (in init_db, gated seed_operatii_enabled), embeddings
indexeaza corpus etichetat, enrich NUL+kNN. Distributie seed: OE-1 80.1%, NUL
12.8%, OE-2 3.5%, restul rar (OE-4/3/7/8/R/I/5, AITLV, R-ODO).

config: seed_operatii_enabled=True + embeddings_enabled=True implicit (SILVER
populat + sugestii semantice; ambele suggestion-only, dezactivabile prin env).

Suita: 1387 passed, 1 deselected (live).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Claude Agent
2026-06-29 06:59:15 +00:00
parent c05fa00007
commit 756f77730f
17 changed files with 139308 additions and 44 deletions

View File

@@ -0,0 +1,175 @@
"""US-003 (PRD 5.18) — generare seed etichetat in faze pe frecventa.
Pipeline dedup OBLIGATORIU inainte de orice apel LLM (D5):
brut -> normalize_for_match -> arunca chei vide -> dedup pe cheie (freq=suma NR)
-> reuse etichete existente (labels-groq + seed comis, conflict freq-max) -> de_etichetat.
Idempotenta cross-run (F2/F7): a doua rulare consuma seedul comis ca cache -> 0 apeluri LLM.
Toate testele FARA retea: `clasifica` e injectat (mock care inregistreaza ce primeste).
"""
from __future__ import annotations
import importlib.util
import json
import os
import sys
def _load(name: str):
    """Import ``tools/mapare-llm/<name>.py`` by file path and return the module.

    The tools directory is not an importable package (its name contains a
    hyphen), so the module is loaded straight from its file location, resolved
    relative to this test file.

    The module is registered in ``sys.modules`` under *name* before it is
    executed, as the importlib recipe requires. If execution fails, the
    registration is rolled back so a half-initialised module never lingers in
    ``sys.modules`` for later imports to pick up.

    Raises:
        ImportError: if no import spec/loader can be built for the path.
        Exception: whatever executing the module raises (e.g.
            ``FileNotFoundError`` when the file does not exist).
    """
    path = os.path.join(os.path.dirname(__file__), "..", "tools", "mapare-llm", f"{name}.py")
    spec = importlib.util.spec_from_file_location(name, path)
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot build an import spec for {name!r} from {path}")
    mod = importlib.util.module_from_spec(spec)

    # Remember any earlier entry so a failed load can restore it.
    previous = sys.modules.get(name)
    sys.modules[name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        # Undo the registration: never leave a partially executed module behind.
        if previous is None:
            sys.modules.pop(name, None)
        else:
            sys.modules[name] = previous
        raise
    return mod
# Module under test: tools/mapare-llm/genereaza_seed.py, loaded by file path via _load.
gs = _load("genereaza_seed")
def _scrie_csv(path, randuri):
    """Write a `;`-separated, fully quoted CSV fixture to *path* (UTF-8).

    ``randuri`` is an iterable of ``(denumire, nr)`` pairs. The file gets a
    fixed header line followed by one row per pair, numbered from 1, and ends
    with a trailing newline. Per the original note, the layout follows the
    docs/operatii-service CSV format.
    """
    antet = '" ";"DENOP";"NR"'
    corp = [
        f'"{pozitie}";"{denumire}";"{numar}"'
        for pozitie, (denumire, numar) in enumerate(randuri, 1)
    ]
    continut = "\n".join([antet, *corp]) + "\n"
    path.write_text(continut, encoding="utf-8")
def _mock_recorder():
    """Build a fake classifier that records every batch it is given.

    Returns a ``(clasifica, vazute)`` pair: ``clasifica(batch)`` answers
    ``"OE-1"`` for each item of the batch, and ``vazute`` is the list that
    accumulates a copy of every batch received, in call order.
    """
    vazute = []

    def clasifica(batch):
        # Store a snapshot so later mutation of the caller's batch is not reflected.
        instantaneu = list(batch)
        vazute.append(instantaneu)
        raspuns = ["OE-1"] * len(batch)
        return raspuns

    return clasifica, vazute
# --------------------------------------------------------------------------- #
def test_dedup_normalizat(tmp_path):
    """Spellings differing only by case/padding share one corpus key, with summed frequency."""
    primul = tmp_path / "a.csv"
    al_doilea = tmp_path / "b.csv"
    _scrie_csv(primul, [("REVIZIE", 10), ("D/R BARA FATA", 3)])
    # Same logical operation as REVIZIE, differing only by case and surrounding spaces.
    _scrie_csv(al_doilea, [(" revizie ", 5)])

    corpus = gs.agrega_corpus([str(primul), str(al_doilea)])

    assert "REVIZIE" in corpus
    # 10 + 5: both files contribute to the same deduplicated key.
    assert corpus["REVIZIE"]["freq"] == 15
    # Exactly two keys remain: REVIZIE and D/R BARA FATA.
    assert sum(1 for _ in corpus) == 2
def test_skip_cheie_normalizata_vida(tmp_path):
    """A row whose name is only whitespace must not appear in the aggregated corpus."""
    sursa = tmp_path / "a.csv"
    # First row normalizes to an empty key (spaces only); second is a real operation.
    randuri = [(" ", 99), ("REVIZIE", 5)]
    _scrie_csv(sursa, randuri)

    corpus = gs.agrega_corpus([str(sursa)])

    assert "" not in corpus
    assert [*corpus] == ["REVIZIE"]
def test_ordine_pe_frecventa(tmp_path):
    """Operations reach the classifier ordered by descending frequency."""
    sursa = tmp_path / "a.csv"
    _scrie_csv(sursa, [("OP MICA", 5), ("OP MARE", 50), ("OP MEDIE", 20)])
    fisier_seed = tmp_path / "seed.json"
    clasifica, vazute = _mock_recorder()

    gs.genereaza(
        [str(sursa)],
        labels_path=None,
        seed_path=str(fisier_seed),
        etichetare_all=True,
        clasifica=clasifica,
        batch=32,
    )

    # The order in which the (mock) LLM saw the operations = descending frequency.
    asteptat = ["OP MARE", "OP MEDIE", "OP MICA"]
    assert vazute[0][:3] == asteptat
def test_reuse_in_spatiu_normalizat(tmp_path):
    """An existing label is reused when its key matches after normalization."""
    sursa = tmp_path / "a.csv"
    _scrie_csv(sursa, [("Revizie", 10), ("SCHIMB ULEI", 5)])
    etichete = tmp_path / "labels.json"
    # Keyed by a raw spelling that differs from the CSV one but normalizes to the same key.
    etichete.write_text(json.dumps({"REVIZIE": "OE-3"}), encoding="utf-8")
    fisier_seed = tmp_path / "seed.json"
    clasifica, vazute = _mock_recorder()

    gs.genereaza(
        [str(sursa)],
        labels_path=str(etichete),
        seed_path=str(fisier_seed),
        etichetare_all=True,
        clasifica=clasifica,
    )

    trimise = set()
    for lot in vazute:
        trimise.update(lot)
    # Already labelled -> must not be sent to the classifier in either spelling.
    assert not ("Revizie" in trimise or "REVIZIE" in trimise)

    continut_seed = json.loads(fisier_seed.read_text(encoding="utf-8"))
    potriviri = [e for e in continut_seed if e["denumire_normalizata"] == "REVIZIE"]
    rev = potriviri[0]
    assert rev["cod"] == "OE-3"
def test_reuse_conflict_determinist(tmp_path):
    """Conflicting labels for one normalized key resolve to the higher-frequency variant."""
    sursa = tmp_path / "a.csv"
    # Two raw variants of the same key, labelled differently; frequency decides.
    _scrie_csv(sursa, [("CURATAT CATALIZATOR", 100), ("curatat catalizator", 5)])
    etichete_conflict = {
        "CURATAT CATALIZATOR": "OE-1",  # freq 100
        "curatat catalizator": "OE-2",  # freq 5
    }
    etichete = tmp_path / "labels.json"
    etichete.write_text(json.dumps(etichete_conflict), encoding="utf-8")
    fisier_seed = tmp_path / "seed.json"
    clasifica, _ = _mock_recorder()

    gs.genereaza(
        [str(sursa)],
        labels_path=str(etichete),
        seed_path=str(fisier_seed),
        etichetare_all=True,
        clasifica=clasifica,
    )

    continut_seed = json.loads(fisier_seed.read_text(encoding="utf-8"))
    potriviri = [
        e for e in continut_seed if e["denumire_normalizata"] == "CURATAT CATALIZATOR"
    ]
    cat = potriviri[0]
    # The max-frequency label wins (100 > 5).
    assert cat["cod"] == "OE-1"
def test_zero_duplicate_trimis_la_llm(tmp_path):
    """Only missing keys reach the classifier: no duplicates, no empty keys, no labelled keys."""
    primul = tmp_path / "a.csv"
    al_doilea = tmp_path / "b.csv"
    _scrie_csv(primul, [("REVIZIE", 10), (" revizie ", 4), ("OP NOUA", 7), (" ", 3)])
    # Cross-file duplicates of rows already present in the first file.
    _scrie_csv(al_doilea, [("REVIZIE", 2), ("OP NOUA", 1)])
    etichete = tmp_path / "labels.json"
    # REVIZIE is already labelled.
    etichete.write_text(json.dumps({"REVIZIE": "OE-3"}), encoding="utf-8")
    fisier_seed = tmp_path / "seed.json"
    clasifica, vazute = _mock_recorder()
    from app.mapping import normalize_for_match

    gs.genereaza(
        [str(primul), str(al_doilea)],
        labels_path=str(etichete),
        seed_path=str(fisier_seed),
        etichetare_all=True,
        clasifica=clasifica,
    )

    trimise = []
    for lot in vazute:
        trimise.extend(lot)
    chei = [normalize_for_match(denumire) for denumire in trimise]

    assert len(set(chei)) == len(chei)  # no normalized key sent twice
    assert "" not in chei  # no empty key
    assert "REVIZIE" not in chei  # no key that is already labelled
    assert "OP NOUA" in chei  # only what is missing
def test_rerun_zero_apeluri_llm(tmp_path):
    """Idempotence criterion (F2/F7): a second run makes 0 LLM calls and leaves the seed identical."""
    sursa = tmp_path / "a.csv"
    _scrie_csv(sursa, [("OP UNU", 10), ("OP DOI", 5)])
    fisier_seed = tmp_path / "seed.json"

    # First run: both operations are unlabelled, so both are classified.
    clasifica1, vazute1 = _mock_recorder()
    gs.genereaza(
        [str(sursa)],
        labels_path=None,
        seed_path=str(fisier_seed),
        etichetare_all=True,
        clasifica=clasifica1,
    )
    total_trimise = sum(len(lot) for lot in vazute1)
    assert total_trimise == 2
    inainte = fisier_seed.read_bytes()

    # Second run: the seed written above acts as the cache, so nothing is classified.
    clasifica2, vazute2 = _mock_recorder()
    gs.genereaza(
        [str(sursa)],
        labels_path=None,
        seed_path=str(fisier_seed),
        etichetare_all=True,
        clasifica=clasifica2,
    )
    assert vazute2 == []
    dupa = fisier_seed.read_bytes()
    # The seed file is byte-for-byte identical after the rerun.
    assert inainte == dupa
def test_format_seed_valid(tmp_path):
    """Seed entries have unique non-empty keys, the required fields, and NUL entries carry no code."""
    sursa = tmp_path / "a.csv"
    _scrie_csv(sursa, [("OP REALA", 10), ("13 X ITP", 5)])
    fisier_seed = tmp_path / "seed.json"

    def clasifica(batch):
        # Mark anything mentioning ITP as NUL; everything else is OE-1.
        etichete = []
        for denumire in batch:
            if "ITP" in denumire.upper():
                etichete.append("NUL")
            else:
                etichete.append("OE-1")
        return etichete

    gs.genereaza(
        [str(sursa)],
        labels_path=None,
        seed_path=str(fisier_seed),
        etichetare_all=True,
        clasifica=clasifica,
    )

    data = json.loads(fisier_seed.read_text(encoding="utf-8"))
    chei = [intrare["denumire_normalizata"] for intrare in data]
    assert len(set(chei)) == len(chei)  # unique
    assert all(chei)  # non-empty

    campuri_obligatorii = {"denumire", "denumire_normalizata", "cod", "is_nul", "source", "confidence"}
    for intrare in data:
        assert campuri_obligatorii.issubset(intrare)
        if intrare["is_nul"]:
            # NUL -> cod is null (per the original note, this mirrors the DB CHECK constraint).
            assert intrare["cod"] is None
        else:
            assert intrare["cod"]

    intrari_nul = [intrare for intrare in data if intrare["is_nul"]]
    nul = intrari_nul[0]
    assert "ITP" in nul["denumire_normalizata"]