feat(5.18): corpus k-NN exemple etichetate + seed real Haiku (17181 op)

Seed app/data/operatii-etichetate.json regenerat cu subagenti Haiku pe TOATE cele 17181 operatii distincte (ordine frecventa, 100%), inlocuind seed-ul Groq (3758). Validare Haiku vs Groq pe 157 op etichetate: la dezacorduri Haiku corect ~22/30, Groq ~0. Haiku prinde gunoiul ratat de Groq (ITP, chirie anvelope, nume piese fara actiune): NUL 2200 (12.8%) vs ~7.6% Groq; adaptare electronica OE-7 (nu OE-5), placute frana uzura OE-1 (nu OE-F avarie). US-001..006: prefiltru NUL determinist, etichetator offline, generator seed, seeder mapping_suggestions (in init_db, gated seed_operatii_enabled), embeddings indexeaza corpus etichetat, enrich NUL+kNN. Distributie seed: OE-1 80.1%, NUL 12.8%, OE-2 3.5%, restul rar (OE-4/3/7/8/R/I/5, AITLV, R-ODO). config: seed_operatii_enabled=True + embeddings_enabled=True implicit (SILVER populat + sugestii semantice; ambele suggestion-only, dezactivabile prin env). Suita: 1387 passed, 1 deselected (live). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 06:59:15 +00:00
parent c05fa00007
commit 756f77730f
17 changed files with 139308 additions and 44 deletions
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -18,6 +18,14 @@ import pytest

 os.environ.setdefault("AUTOPASS_REQUIRE_API_KEY", "false")
 os.environ.setdefault("AUTOPASS_WORKER_USE_TEST_CREDS", "false")
+# Embeddings e ON implicit in app (config.py), dar in teste il lasam OFF ca sa nu
+# lazy-load-eze modelul de ~230MB la fiecare test care atinge editorul de mapari
+# (suita rapida, fara download in CI). Testele de embeddings il pornesc punctual.
+os.environ.setdefault("AUTOPASS_EMBEDDINGS_ENABLED", "false")
+# Seed-ul de operatii etichetate (SILVER, PRD 5.18) e ON in app, dar OFF in teste:
+# multe teste presupun mapping_suggestions GOL la init_db. Testele US-004/005/006 il
+# pornesc punctual (object.__setattr__ pe settings sau apel direct la seeder).
+os.environ.setdefault("AUTOPASS_SEED_OPERATII_ENABLED", "false")


@pytest.fixture(autouse=True)
--- a/tests/test_embeddings_corpus_etichetat.py
+++ b/tests/test_embeddings_corpus_etichetat.py
@@ -0,0 +1,150 @@
+"""US-005 (PRD 5.18) — embeddings indexeaza corpusul etichetat (NU nomenclatorul).
+
+k-NN peste exemple reale etichetate (denumire_normalizata -> cod, is_nul) e net mai
+precis decat peste cele 18 categorii generice. Acopera si simetria corpus/query (F1):
+corpusul e text NORMALIZAT, deci query-ul trebuie normalizat la fel inainte de embedding.
+"""
+
+from __future__ import annotations
+
+import math
+import os
+import tempfile
+
+import pytest
+
+
+# Backend mock determinist: vector = histograma de caractere (similaritate stabila).
+class MockBackend:
+    def embed(self, texts):
+        out = []
+        for t in texts:
+            v = [0.0] * 27
+            for ch in t.upper():
+                if "A" <= ch <= "Z":
+                    v[ord(ch) - 65] += 1.0
+                else:
+                    v[26] += 1.0
+            out.append(v)
+        return out
+
+
+@pytest.fixture()
+def env(monkeypatch, tmp_path):
+    """Point the app at a fresh DB under pytest's managed tmp_path (not a never-deleted mkdtemp dir)."""
+    monkeypatch.setenv("AUTOPASS_DB_PATH", os.path.join(tmp_path, "us005.db"))
+    monkeypatch.setenv("AUTOPASS_WEB_AUTH_REQUIRED", "false")
+    monkeypatch.setenv("AUTOPASS_EMBEDDINGS_ENABLED", "true")   # US-005 needs embeddings ON
+    from app.config import get_settings
+    get_settings.cache_clear()  # reset before init_db so the env overrides above are read
+    from app.db import init_db
+    init_db()
+    yield monkeypatch
+    get_settings.cache_clear()  # reset again so later tests do not reuse these settings
+
+
+@pytest.fixture()
+def conn(env):
+    from app.db import get_connection
+    c = get_connection()
+    yield c
+    c.close()
+
+
+def _inject_mock_engine(monkeypatch):
+    """Install a MockBackend engine as app.embeddings._engine; monkeypatch restores it at teardown."""
+    import app.embeddings as emb
+    monkeypatch.setattr(emb, "_engine", emb.EmbeddingEngine(backend=MockBackend()))
+    return emb
+
+
+def _seed_silver(conn, rows):
+    """Insert SILVER rows and commit; rows = [(denumire_normalizata, cod, is_nul)]."""
+    conn.executemany(
+        "INSERT OR IGNORE INTO mapping_suggestions "
+        "(denumire_normalizata, cod_prestatie, is_nul, source, confidence) VALUES (?, ?, ?, 'llm_seed', 0.7)",
+        rows,
+    )
+    conn.commit()
+
+
+def test_corpus_din_mapping_suggestions(conn, monkeypatch):
+    emb = _inject_mock_engine(monkeypatch)
+    _seed_silver(conn, [
+        ("SCHIMB ULEI MOTOR", "OE-3", 0),
+        ("INLOCUIT PLACUTE FRANA", "OE-1", 0),
+        ("13 X ITP", None, 1),
+    ])
+    from app.mapping import ensure_embeddings_corpus
+    ensure_embeddings_corpus(conn)
+    assert emb.has_corpus()
+    # The indexed corpus = the names from mapping_suggestions, NOT from nomenclator_rar.
+    texte = {it["denumire"] for it in emb._engine._corpus_items}
+    assert texte == {"SCHIMB ULEI MOTOR", "INLOCUIT PLACUTE FRANA", "13 X ITP"}
+
+
+def test_suggest_nearest_intoarce_is_nul(conn, monkeypatch):
+    emb = _inject_mock_engine(monkeypatch)
+    _seed_silver(conn, [
+        ("SCHIMB ULEI MOTOR", "OE-3", 0),
+        ("13 X ITP", None, 1),
+    ])
+    from app.mapping import ensure_embeddings_corpus
+    ensure_embeddings_corpus(conn)
+    res = emb.suggest_nearest("13 X ITP", top_k=1)
+    assert res and res[0]["is_nul"] is True   # NUL neighbour -> suppression signal
+    res2 = emb.suggest_nearest("SCHIMB ULEI MOTOR", top_k=1)
+    assert res2 and res2[0]["is_nul"] is False
+    assert res2[0]["cod"] == "OE-3"
+
+
+def test_semnatura_corpus_pe_seed(conn, monkeypatch):
+    emb = _inject_mock_engine(monkeypatch)
+    _seed_silver(conn, [("SCHIMB ULEI MOTOR", "OE-3", 0)])
+    from app.mapping import ensure_embeddings_corpus
+    ensure_embeddings_corpus(conn)
+    sig1 = emb.corpus_signature()
+    assert sig1 is not None
+    # Calling again with no change -> same signature (no re-indexing).
+    ensure_embeddings_corpus(conn)
+    assert emb.corpus_signature() == sig1
+    # Adding a row -> the signature changes.
+    _seed_silver(conn, [("INLOCUIT BATERIE", "OE-1", 0)])
+    ensure_embeddings_corpus(conn)
+    assert emb.corpus_signature() != sig1
+
+
+def test_query_normalizat_ca_si_corpusul(conn, monkeypatch):
+    """F1 (HIGH): enrich_suggestions interogheaza suggest_nearest cu textul NORMALIZAT."""
+    import app.embeddings as emb
+    captura = {}
+    monkeypatch.setattr(emb, "has_corpus", lambda: True)
+
+    def fake_suggest(text, top_k=1):
+        captura["text"] = text
+        return [{"cod": "OE-3", "is_nul": False, "similaritate": 0.99}]
+
+    monkeypatch.setattr(emb, "suggest_nearest", fake_suggest)
+    from app.mapping import enrich_suggestions
+    enrich_suggestions(conn, "Schimb  Uleiul   Motor")
+    # Corpusul e denumire_normalizata -> query-ul trebuie normalizat la fel.
+    from app.mapping import normalize_for_match
+    assert captura["text"] == normalize_for_match("Schimb  Uleiul   Motor")
+    assert captura["text"] == "SCHIMB ULEIUL MOTOR"
+
+
+def test_degradare_gratioasa_pastrata(conn, monkeypatch):
+    """A backend that raises must not make ensure_embeddings_corpus / enrich_suggestions raise."""
+    import app.embeddings as emb
+    from app.embeddings import EmbeddingEngine
+
+    class BrokenBackend:
+        def embed(self, texts):
+            raise RuntimeError("model indisponibil")
+
+    monkeypatch.setattr(emb, "_engine", EmbeddingEngine(backend=BrokenBackend()))  # undone at teardown
+    _seed_silver(conn, [("SCHIMB ULEI MOTOR", "OE-3", 0)])
+    from app.mapping import ensure_embeddings_corpus, enrich_suggestions
+    ensure_embeddings_corpus(conn)            # must not raise
+    out = enrich_suggestions(conn, "SCHIMB ULEI")  # must not raise
+    assert "sugestie_principala" in out
--- a/tests/test_enrich_corpus_etichetat.py
+++ b/tests/test_enrich_corpus_etichetat.py
@@ -0,0 +1,133 @@
+"""US-006 (PRD 5.18) — enrich_suggestions = pre-filtru NUL + k-NN pe corpus etichetat.
+
+Ordinea de precedenta: pre-filtru NUL -> (daca NUL: fara cod) altfel GOLD partajat >
+exact (SILVER) > k-NN embeddings. k-NN sub prag -> abtinere. Vecin k-NN NUL -> supresie.
+Invariant #13: nimic din asta nu intra in resolve_prestatii/load_mapping.
+"""
+
+from __future__ import annotations
+
+import os
+import tempfile
+
+import pytest
+
+
+@pytest.fixture()
+def env(monkeypatch, tmp_path):
+    """Point the app at a fresh DB under pytest's managed tmp_path (not a never-deleted mkdtemp dir)."""
+    monkeypatch.setenv("AUTOPASS_DB_PATH", os.path.join(tmp_path, "us006.db"))
+    monkeypatch.setenv("AUTOPASS_WEB_AUTH_REQUIRED", "false")
+    monkeypatch.setenv("AUTOPASS_EMBEDDINGS_ENABLED", "true")
+    from app.config import get_settings
+    get_settings.cache_clear()  # reset before init_db so the env overrides above are read
+    from app.db import init_db
+    init_db()
+    yield monkeypatch
+    get_settings.cache_clear()  # reset again so later tests do not reuse these settings
+
+
+@pytest.fixture()
+def conn(env):
+    from app.db import get_connection
+    c = get_connection()
+    yield c
+    c.close()
+
+
+def _silver(conn, denumire_norm, cod, is_nul=0):
+    conn.execute(
+        "INSERT OR IGNORE INTO mapping_suggestions "
+        "(denumire_normalizata, cod_prestatie, is_nul, source, confidence) VALUES (?, ?, ?, 'llm_seed', 0.7)",
+        (denumire_norm, cod, is_nul),
+    )
+    conn.commit()
+
+
+def _mock_embedding(monkeypatch, cod, sim, is_nul=False):
+    import app.embeddings as emb
+    monkeypatch.setattr(emb, "has_corpus", lambda: True)
+    monkeypatch.setattr(emb, "suggest_nearest",
+                        lambda text, top_k=1: [{"cod": cod, "is_nul": is_nul, "similaritate": sim}])
+
+
+def test_prefiltru_nul_supreseaza_inainte_de_knn(conn, monkeypatch):
+    # Embedding-ul AR sugera un cod, dar pre-filtrul NUL trebuie sa scurtcircuiteze.
+    chemat = {"da": False}
+    import app.embeddings as emb
+    monkeypatch.setattr(emb, "has_corpus", lambda: True)
+
+    def spion(text, top_k=1):
+        chemat["da"] = True
+        return [{"cod": "OE-1", "is_nul": False, "similaritate": 0.99}]
+
+    monkeypatch.setattr(emb, "suggest_nearest", spion)
+    from app.mapping import enrich_suggestions
+    out = enrich_suggestions(conn, "13 X ITP")
+    assert out["sugestie_principala"] is None      # non-operatie -> fara cod
+    assert out["surse"]["nul"] is True
+    assert chemat["da"] is False                    # k-NN nici macar interogat
+
+
+def test_precedenta_gold_exact_embedding(conn, monkeypatch):
+    from app.shared_store import record_human_validation
+    from app.mapping import enrich_suggestions, normalize_for_match
+    den = "OPERATIE DE TEST UNICA"
+    norm = normalize_for_match(den)
+
+    # Toate trei sursele dau coduri diferite.
+    record_human_validation(conn, den, "OE-1")     # GOLD partajat
+    _silver(conn, norm, "OE-2")                     # SILVER exact
+    _mock_embedding(monkeypatch, "OE-3", 0.99)      # embedding
+    conn.commit()
+
+    out = enrich_suggestions(conn, den)
+    assert out["sugestie_principala"] == {"cod_prestatie": "OE-1", "sursa": "gold_partajat"}
+
+    # Fara GOLD -> castiga SILVER.
+    conn.execute("DELETE FROM shared_mappings")
+    conn.commit()
+    out = enrich_suggestions(conn, den)
+    assert out["sugestie_principala"]["sursa"] == "silver"
+    assert out["sugestie_principala"]["cod_prestatie"] == "OE-2"
+
+    # Fara GOLD si fara SILVER -> castiga embedding.
+    conn.execute("DELETE FROM mapping_suggestions")
+    conn.commit()
+    out = enrich_suggestions(conn, den)
+    assert out["sugestie_principala"] == {"cod_prestatie": "OE-3", "sursa": "embedding"}
+
+
+def test_prag_similaritate(conn, monkeypatch):
+    from app.mapping import enrich_suggestions, EMB_MIN_SIMILARITATE
+    _mock_embedding(monkeypatch, "OE-3", EMB_MIN_SIMILARITATE + 0.01)
+    out = enrich_suggestions(conn, "CEVA NEVAZUT")
+    assert out["surse"]["embedding"] == "OE-3"
+
+
+def test_abtinere_sub_prag(conn, monkeypatch):
+    from app.mapping import enrich_suggestions, EMB_MIN_SIMILARITATE
+    _mock_embedding(monkeypatch, "OE-3", EMB_MIN_SIMILARITATE - 0.01)
+    out = enrich_suggestions(conn, "CEVA NEVAZUT")
+    assert out["surse"]["embedding"] is None        # sub prag -> abtinere
+    assert out["sugestie_principala"] is None
+
+
+def test_vecin_knn_nul_supreseaza(conn, monkeypatch):
+    from app.mapping import enrich_suggestions
+    _mock_embedding(monkeypatch, None, 0.99, is_nul=True)   # vecin NUL peste prag
+    out = enrich_suggestions(conn, "CEVA CARE SEAMANA CU GUNOI")
+    assert out["surse"]["embedding"] is None        # NUL -> nu produce cod
+    assert out["surse"]["nul"] is True
+    assert out["sugestie_principala"] is None
+
+
+def test_invariant_13_resolve_neatins(conn):
+    """Regresie #13: SILVER populat NU produce auto-rezolvare in resolve_prestatii."""
+    from app.mapping import resolve_prestatii, normalize_for_match
+    _silver(conn, normalize_for_match("OPERATIE X"), "OE-1")
+    resolved, unmapped = resolve_prestatii(
+        [{"cod_op_service": "OPERATIE X", "denumire": "OPERATIE X"}], mapping={}, valid_codes={"OE-1"}
+    )
+    assert resolved[0]["cod_prestatie"] is None     # ramane nemapat, NU ia codul din SILVER
+    assert unmapped and unmapped[0]["cod_op_service"] == "OPERATIE X"
--- a/tests/test_eticheteaza_tool.py
+++ b/tests/test_eticheteaza_tool.py
@@ -0,0 +1,103 @@
+"""US-002 (PRD 5.18) — etichetator offline multi-backend cu prompt procedural.
+
+Toate testele ruleaza FARA retea reala (transport injectabil / inspectie body).
+Acopera: prompt 3 pasi, envelope json_schema strict + enum, backend selectabil
+prin env, scrub PII inainte de orice request, garda de truncare.
+"""
+
+from __future__ import annotations
+
+# Numele pachetului `tools/mapare-llm` contine cratima -> nu e importabil ca modul.
+# Incarcam fisierul direct prin importlib pe cale.
+import importlib.util
+import os
+import sys
+
+_PATH = os.path.join(os.path.dirname(__file__), "..", "tools", "mapare-llm", "eticheteaza.py")
+_spec = importlib.util.spec_from_file_location("eticheteaza", _PATH)
+eticheteaza = importlib.util.module_from_spec(_spec)
+sys.modules["eticheteaza"] = eticheteaza  # necesar pt. @dataclass introspection
+_spec.loader.exec_module(eticheteaza)
+
+
+def test_construieste_prompt_3pasi():
+    """The system prompt spells out the 3-step procedure; the user message lists the operations."""
+    msgs = eticheteaza.construieste_mesaje(["INLOCUIT PLACUTE FRANA"])
+    assert isinstance(msgs, list) and msgs[0]["role"] == "system"
+    system_prompt = msgs[0]["content"].upper()  # not named `sys`: that shadows the imported module
+    # Explicit 3-step procedure.
+    assert "PAS 1" in system_prompt and "PAS 2" in system_prompt and "PAS 3" in system_prompt
+    # NUL rule + severe damage only in case of an accident.
+    assert "NUL" in system_prompt
+    assert "ACCIDENT" in system_prompt
+    # Qwen3 thinking disabled (/no_think token somewhere in the messages).
+    assert "/no_think" in " ".join(m["content"] for m in msgs)
+    # The user message enumerates the operations.
+    assert "1." in msgs[1]["content"] and "INLOCUIT PLACUTE FRANA" in msgs[1]["content"]
+
+
+def test_envelope_json_schema_strict_si_enum():
+    backend = eticheteaza.get_backend("lmstudio")
+    body = eticheteaza.construieste_body(["REVIZIE"], backend)
+    rf = body["response_format"]
+    # Envelope COMPLET, NU json_object.
+    assert rf["type"] == "json_schema"
+    js = rf["json_schema"]
+    assert js["strict"] is True
+    assert "name" in js
+    schema = js["schema"]
+    cod_schema = schema["properties"]["rez"]["items"]["properties"]["cod"]
+    # cod = enum peste cele 19 ALL_LABELS (18 coduri + NUL).
+    assert set(cod_schema["enum"]) == set(eticheteaza.ALL_LABELS)
+    assert len(eticheteaza.ALL_LABELS) == 19
+    assert "NUL" in eticheteaza.ALL_LABELS
+    # temperatura 0 (determinist) si strict items.
+    assert body["temperature"] == 0
+    assert schema["properties"]["rez"]["items"]["additionalProperties"] is False
+
+
+def test_parseaza_raspuns_si_garda_truncare():
+    batch = ["A", "B", "C"]
+    # Raspuns complet, ordine amestecata, un cod invalid.
+    content = {"rez": [{"i": 2, "cod": "OE-1"}, {"i": 1, "cod": "NUL"}, {"i": 3, "cod": "INEXISTENT"}]}
+    codes = eticheteaza.parseaza_raspuns(content, len(batch))
+    assert codes == ["NUL", "OE-1", "?"]  # cod invalid -> '?', NU ascuns
+    # Raspuns trunchiat: lipseste pozitia 3 -> '?' pe lipsa, nu eroare.
+    content_trunc = {"rez": [{"i": 1, "cod": "OE-1"}, {"i": 2, "cod": "OE-2"}]}
+    codes2 = eticheteaza.parseaza_raspuns(content_trunc, len(batch))
+    assert codes2 == ["OE-1", "OE-2", "?"]
+    assert len(codes2) == len(batch)
+
+
+def test_backend_selectabil_env(monkeypatch):
+    # Default = lmstudio (backend aprobat v1, D4).
+    monkeypatch.delenv("ETICHETARE_BACKEND", raising=False)
+    assert eticheteaza.get_backend().name == "lmstudio"
+    # Selectie prin env.
+    monkeypatch.setenv("ETICHETARE_BACKEND", "groq")
+    assert eticheteaza.get_backend().name == "groq"
+    # Endpoint + model configurabile prin env.
+    monkeypatch.setenv("ETICHETARE_BACKEND", "lmstudio")
+    monkeypatch.setenv("ETICHETARE_ENDPOINT", "http://exemplu:1234/v1/chat/completions")
+    monkeypatch.setenv("ETICHETARE_MODEL", "qwen/qwen3-custom")
+    b = eticheteaza.get_backend()
+    assert b.url == "http://exemplu:1234/v1/chat/completions"
+    assert b.model == "qwen/qwen3-custom"
+
+
+def test_scrub_pii_inainte_de_request(monkeypatch):
+    """Nicio placuta/VIN nu ajunge la transport — scrub inainte de orice apel."""
+    capturat = {}
+
+    def fake_transport(url, headers, payload, timeout):
+        capturat["payload"] = payload
+        return {"choices": [{"message": {"content": '{"rez":[{"i":1,"cod":"OE-1"}]}'}}]}
+
+    backend = eticheteaza.get_backend("lmstudio")
+    codes, meta = eticheteaza.call(["VOPSIT USA B 123 ABC"], backend, transport=fake_transport)
+    assert codes == ["OE-1"]
+    body = capturat["payload"]
+    user_content = body["messages"][1]["content"]
+    assert "B 123 ABC" not in user_content
+    assert "[NR]" in user_content
+    assert meta["err"] is None
--- a/tests/test_genereaza_seed.py
+++ b/tests/test_genereaza_seed.py
@@ -0,0 +1,175 @@
+"""US-003 (PRD 5.18) — generare seed etichetat in faze pe frecventa.
+
+Pipeline dedup OBLIGATORIU inainte de orice apel LLM (D5):
+  brut -> normalize_for_match -> arunca chei vide -> dedup pe cheie (freq=suma NR)
+  -> reuse etichete existente (labels-groq + seed comis, conflict freq-max) -> de_etichetat.
+
+Idempotenta cross-run (F2/F7): a doua rulare consuma seedul comis ca cache -> 0 apeluri LLM.
+Toate testele FARA retea: `clasifica` e injectat (mock care inregistreaza ce primeste).
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import json
+import os
+import sys
+
+
+def _load(name: str):
+    path = os.path.join(os.path.dirname(__file__), "..", "tools", "mapare-llm", f"{name}.py")
+    spec = importlib.util.spec_from_file_location(name, path)
+    mod = importlib.util.module_from_spec(spec)
+    sys.modules[name] = mod
+    spec.loader.exec_module(mod)
+    return mod
+
+
+gs = _load("genereaza_seed")
+
+
+def _scrie_csv(path, randuri):
+    """randuri = [(denumire, nr)]. Format CSV ca docs/operatii-service (`;`, header)."""
+    linii = ['"   ";"DENOP";"NR"']
+    for i, (den, nr) in enumerate(randuri, 1):
+        linii.append(f'"{i}";"{den}";"{nr}"')
+    path.write_text("\n".join(linii) + "\n", encoding="utf-8")
+
+
+def _mock_recorder():
+    """Returneaza (clasifica, vazute) — clasifica raspunde OE-1 pe tot, inregistreaza inputul."""
+    vazute = []
+
+    def clasifica(batch):
+        vazute.append(list(batch))
+        return ["OE-1"] * len(batch)
+
+    return clasifica, vazute
+
+
+# --------------------------------------------------------------------------- #
+
+def test_dedup_normalizat(tmp_path):
+    """Rows whose names normalize to the same key are merged and their NR counts summed."""
+    f1, f2 = tmp_path / "a.csv", tmp_path / "b.csv"
+    _scrie_csv(f1, [("REVIZIE", 10), ("D/R BARA FATA", 3)])
+    _scrie_csv(f2, [("  revizie  ", 5)])  # same logical operation, differs in case + spaces
+    corpus = gs.agrega_corpus([str(f1), str(f2)])
+    assert "REVIZIE" in corpus
+    assert corpus["REVIZIE"]["freq"] == 15           # 10 + 5, dedup on key
+    assert len(corpus) == 2                          # REVIZIE + D/R BARA FATA
+
+
+def test_skip_cheie_normalizata_vida(tmp_path):
+    f = tmp_path / "a.csv"
+    _scrie_csv(f, [("   ", 99), ("REVIZIE", 5)])     # cheie vida (doar spatii)
+    corpus = gs.agrega_corpus([str(f)])
+    assert "" not in corpus
+    assert list(corpus) == ["REVIZIE"]
+
+
+def test_ordine_pe_frecventa(tmp_path):
+    f = tmp_path / "a.csv"
+    _scrie_csv(f, [("OP MICA", 5), ("OP MARE", 50), ("OP MEDIE", 20)])
+    seed = tmp_path / "seed.json"
+    clasifica, vazute = _mock_recorder()
+    gs.genereaza([str(f)], labels_path=None, seed_path=str(seed),
+                 etichetare_all=True, clasifica=clasifica, batch=32)
+    # Ordinea in care LLM-ul a vazut operatiile = desc pe frecventa.
+    primul_batch = vazute[0]
+    assert primul_batch[:3] == ["OP MARE", "OP MEDIE", "OP MICA"]
+
+
+def test_reuse_in_spatiu_normalizat(tmp_path):
+    f = tmp_path / "a.csv"
+    _scrie_csv(f, [("Revizie", 10), ("SCHIMB ULEI", 5)])
+    labels = tmp_path / "labels.json"
+    labels.write_text(json.dumps({"REVIZIE": "OE-3"}), encoding="utf-8")  # cheiat brut, dar normalizeaza la fel
+    seed = tmp_path / "seed.json"
+    clasifica, vazute = _mock_recorder()
+    gs.genereaza([str(f)], labels_path=str(labels), seed_path=str(seed),
+                 etichetare_all=True, clasifica=clasifica)
+    trimise = {d for b in vazute for d in b}
+    assert "Revizie" not in trimise and "REVIZIE" not in trimise  # deja etichetat -> nu se trimite
+    seed_data = json.loads(seed.read_text(encoding="utf-8"))
+    rev = [e for e in seed_data if e["denumire_normalizata"] == "REVIZIE"][0]
+    assert rev["cod"] == "OE-3"
+
+
+def test_reuse_conflict_determinist(tmp_path):
+    f = tmp_path / "a.csv"
+    # Doua variante raw ale aceleiasi chei, etichetate diferit; freq decide.
+    _scrie_csv(f, [("CURATAT CATALIZATOR", 100), ("curatat catalizator", 5)])
+    labels = tmp_path / "labels.json"
+    labels.write_text(json.dumps({
+        "CURATAT CATALIZATOR": "OE-1",   # freq 100
+        "curatat catalizator": "OE-2",   # freq 5
+    }), encoding="utf-8")
+    seed = tmp_path / "seed.json"
+    clasifica, _ = _mock_recorder()
+    gs.genereaza([str(f)], labels_path=str(labels), seed_path=str(seed), etichetare_all=True, clasifica=clasifica)
+    seed_data = json.loads(seed.read_text(encoding="utf-8"))
+    cat = [e for e in seed_data if e["denumire_normalizata"] == "CURATAT CATALIZATOR"][0]
+    assert cat["cod"] == "OE-1"   # freq-max castiga (100 > 5)
+
+
+def test_zero_duplicate_trimis_la_llm(tmp_path):
+    f1 = tmp_path / "a.csv"
+    f2 = tmp_path / "b.csv"
+    _scrie_csv(f1, [("REVIZIE", 10), ("  revizie ", 4), ("OP NOUA", 7), ("   ", 3)])
+    _scrie_csv(f2, [("REVIZIE", 2), ("OP NOUA", 1)])  # cross-file duplicate
+    labels = tmp_path / "labels.json"
+    labels.write_text(json.dumps({"REVIZIE": "OE-3"}), encoding="utf-8")  # REVIZIE deja etichetat
+    seed = tmp_path / "seed.json"
+    clasifica, vazute = _mock_recorder()
+    from app.mapping import normalize_for_match
+    gs.genereaza([str(f1), str(f2)], labels_path=str(labels), seed_path=str(seed),
+                 etichetare_all=True, clasifica=clasifica)
+    trimise = [d for b in vazute for d in b]
+    chei = [normalize_for_match(d) for d in trimise]
+    assert len(chei) == len(set(chei))          # nicio cheie normalizata trimisa de doua ori
+    assert "" not in chei                        # nicio cheie vida
+    assert "REVIZIE" not in chei                 # nicio cheie deja etichetata
+    assert "OP NOUA" in chei                     # doar ce lipseste
+
+
+def test_rerun_zero_apeluri_llm(tmp_path):
+    """Criteriul real de idempotenta (F2/F7): a doua rulare = 0 apeluri LLM, seed identic."""
+    f = tmp_path / "a.csv"
+    _scrie_csv(f, [("OP UNU", 10), ("OP DOI", 5)])
+    seed = tmp_path / "seed.json"
+
+    clasifica1, vazute1 = _mock_recorder()
+    gs.genereaza([str(f)], labels_path=None, seed_path=str(seed), etichetare_all=True, clasifica=clasifica1)
+    assert sum(len(b) for b in vazute1) == 2     # prima rulare eticheteaza ambele
+    bytes1 = seed.read_bytes()
+
+    clasifica2, vazute2 = _mock_recorder()
+    gs.genereaza([str(f)], labels_path=None, seed_path=str(seed), etichetare_all=True, clasifica=clasifica2)
+    assert vazute2 == []                          # a doua rulare: 0 apeluri LLM (seed = cache)
+    bytes2 = seed.read_bytes()
+    assert bytes1 == bytes2                        # seed identic byte-cu-byte
+
+
+def test_format_seed_valid(tmp_path):
+    f = tmp_path / "a.csv"
+    _scrie_csv(f, [("OP REALA", 10), ("13 X ITP", 5)])
+    seed = tmp_path / "seed.json"
+
+    def clasifica(batch):
+        # marcheaza ITP ca NUL, restul OE-1
+        return ["NUL" if "ITP" in d.upper() else "OE-1" for d in batch]
+
+    gs.genereaza([str(f)], labels_path=None, seed_path=str(seed), etichetare_all=True, clasifica=clasifica)
+    data = json.loads(seed.read_text(encoding="utf-8"))
+    chei = [e["denumire_normalizata"] for e in data]
+    assert len(chei) == len(set(chei))            # unice
+    assert all(e["denumire_normalizata"] for e in data)  # non-vide
+    for e in data:
+        assert set(e) >= {"denumire", "denumire_normalizata", "cod", "is_nul", "source", "confidence"}
+        if e["is_nul"]:
+            assert e["cod"] is None                # NUL -> cod NULL (oglindeste CHECK-ul DB)
+        else:
+            assert e["cod"]
+    nul = [e for e in data if e["is_nul"]][0]
+    assert "ITP" in nul["denumire_normalizata"]
--- a/tests/test_mapare_integrare_l14.py
+++ b/tests/test_mapare_integrare_l14.py
@@ -272,14 +272,18 @@ def test_embeddings_functional_cand_flag_activ(conn, monkeypatch):
    get_settings.cache_clear()
    monkeypatch.setattr(emb_mod, "_engine", EmbeddingEngine(backend=_FakeEmbedBackend()))

-    # Nomenclatorul (din fixtura conn) are OE-1..OE-4; adaug coduri cu denumiri keyword.
+    # Corpusul sursa = mapping_suggestions (SILVER) -- PRD 5.18 US-005.
+    # (Inainte era nomenclator_rar; migrat la mapping_suggestions ca k-NN sa
+    # opereze pe exemple reale etichetate, nu pe categorii generice RAR.)
    conn.execute(
-        "INSERT OR REPLACE INTO nomenclator_rar (cod_prestatie, nume_prestatie) VALUES (?, ?)",
-        ("UL-1", "Schimb ulei"),
+        "INSERT OR REPLACE INTO mapping_suggestions "
+        "(denumire_normalizata, cod_prestatie, is_nul, source, confidence) VALUES (?, ?, ?, ?, ?)",
+        ("Schimb ulei", "UL-1", 0, "llm", 0.95),
    )
    conn.execute(
-        "INSERT OR REPLACE INTO nomenclator_rar (cod_prestatie, nume_prestatie) VALUES (?, ?)",
-        ("FR-1", "Placute frana"),
+        "INSERT OR REPLACE INTO mapping_suggestions "
+        "(denumire_normalizata, cod_prestatie, is_nul, source, confidence) VALUES (?, ?, ?, ?, ?)",
+        ("Placute frana", "FR-1", 0, "llm", 0.95),
    )
    conn.commit()

--- a/tests/test_operatii_seed.py
+++ b/tests/test_operatii_seed.py
@@ -0,0 +1,113 @@
+"""US-004 (PRD 5.18) — seeder corpus etichetat in mapping_suggestions (SILVER).
+
+INSERT OR IGNORE din artefactul comis -> SILVER nu mai e gol in productie.
+NB (F10): confirmarile UMANE stau in shared_mappings, NU aici; deci INSERT OR IGNORE
+pastreaza codul LLM existent la re-seed (v1 = ignore, nu upsert).
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import tempfile
+
+import pytest
+
+
+@pytest.fixture()
+def env(monkeypatch, tmp_path):
+    """Fresh DB under pytest's managed tmp_path (not a leaked mkdtemp dir); yields that dir as str."""
+    monkeypatch.setenv("AUTOPASS_DB_PATH", os.path.join(tmp_path, "us004.db"))
+    monkeypatch.setenv("AUTOPASS_WEB_AUTH_REQUIRED", "false")
+    from app.config import get_settings
+    get_settings.cache_clear()  # reset before init_db so the env overrides above are read
+    from app.db import init_db
+    init_db()
+    yield str(tmp_path)  # str, because tests pass it to os.path.join / open
+    get_settings.cache_clear()  # reset again so later tests do not reuse these settings
+
+
+@pytest.fixture()
+def conn(env):
+    from app.db import get_connection
+    c = get_connection()
+    yield c
+    c.close()
+
+
+def _scrie_seed(tmp, items) -> str:
+    p = os.path.join(tmp, "operatii-etichetate.json")
+    with open(p, "w", encoding="utf-8") as fh:
+        json.dump(items, fh, ensure_ascii=False)
+    return p
+
+
+SEED_OE = {"denumire": "SCHIMB ULEI MOTOR", "denumire_normalizata": "SCHIMB ULEI MOTOR",
+           "cod": "OE-3", "is_nul": False, "source": "llm_seed", "confidence": 0.7}
+SEED_NUL = {"denumire": "13 X ITP", "denumire_normalizata": "13 X ITP",
+            "cod": None, "is_nul": True, "source": "llm_seed", "confidence": 0.7}
+
+
+def test_seed_populeaza_mapping_suggestions(env, conn):
+    from app.operatii_seed import seed_operatii_etichetate
+    path = _scrie_seed(env, [SEED_OE])
+    n = seed_operatii_etichetate(conn, path)
+    conn.commit()
+    assert n == 1
+    row = conn.execute(
+        "SELECT cod_prestatie, source, confidence FROM mapping_suggestions "
+        "WHERE denumire_normalizata = 'SCHIMB ULEI MOTOR'"
+    ).fetchone()
+    assert row["cod_prestatie"] == "OE-3"
+    assert row["source"] == "llm_seed"
+    assert abs(row["confidence"] - 0.7) < 1e-9
+
+
+def test_is_nul_din_seed(env, conn):
+    from app.operatii_seed import seed_operatii_etichetate
+    path = _scrie_seed(env, [SEED_NUL])
+    seed_operatii_etichetate(conn, path)
+    conn.commit()
+    row = conn.execute(
+        "SELECT cod_prestatie, is_nul FROM mapping_suggestions WHERE denumire_normalizata = '13 X ITP'"
+    ).fetchone()
+    assert row["is_nul"] == 1
+    assert row["cod_prestatie"] is None   # respecta CHECK-ul (NUL -> cod NULL)
+
+
+def test_insert_or_ignore_nu_clobber(env, conn):
+    from app.operatii_seed import seed_operatii_etichetate
+    # Un rand pre-existent (ex. embedding) pe aceeasi cheie, cu alt cod.
+    conn.execute(
+        "INSERT INTO mapping_suggestions (denumire_normalizata, cod_prestatie, is_nul, source, confidence) "
+        "VALUES ('SCHIMB ULEI MOTOR', 'OE-1', 0, 'embedding', 0.5)"
+    )
+    conn.commit()
+    path = _scrie_seed(env, [SEED_OE])
+    n = seed_operatii_etichetate(conn, path)
+    conn.commit()
+    assert n == 0  # INSERT OR IGNORE -> nu suprascrie
+    row = conn.execute(
+        "SELECT cod_prestatie, source FROM mapping_suggestions WHERE denumire_normalizata = 'SCHIMB ULEI MOTOR'"
+    ).fetchone()
+    assert row["cod_prestatie"] == "OE-1"  # randul existent ramane neatins
+    assert row["source"] == "embedding"
+
+
+def test_idempotent_la_reinit(env, conn):
+    from app.operatii_seed import seed_operatii_etichetate
+    path = _scrie_seed(env, [SEED_OE, SEED_NUL])
+    n1 = seed_operatii_etichetate(conn, path)
+    conn.commit()
+    n2 = seed_operatii_etichetate(conn, path)
+    conn.commit()
+    assert n1 == 2
+    assert n2 == 0  # a doua rulare nu dubleaza
+    total = conn.execute("SELECT COUNT(*) AS n FROM mapping_suggestions").fetchone()["n"]
+    assert total == 2
+
+
+def test_seed_inexistent_e_noop(env, conn):
+    from app.operatii_seed import seed_operatii_etichetate
+    n = seed_operatii_etichetate(conn, os.path.join(env, "nu-exista.json"))
+    assert n == 0
--- a/tests/test_prefiltru_nul.py
+++ b/tests/test_prefiltru_nul.py
@@ -0,0 +1,72 @@
+"""US-001 (PRD 5.18) — pre-filtru determinist non-operatii (NUL).
+
+Masuratoarea k-NN (memorie test-precizie-knn-embeddings) arata recall NUL doar 64%:
+gunoiul evident (ITP, plata, discount, nr. inmatriculare, tractare) scapa ca OE-1.
+Un pre-filtru determinist il marcheaza NUL INAINTE de k-NN.
+
+Garantie non-negociabila (AC): ZERO fals-pozitiv pe operatii reale. Regulile
+text/regex au fost calibrate pe `docs/operatii-service/*.csv` (vezi sesiunea de
+implementare): triggerele ambigue (TRACTARE, NR INMATRICULARE/placuta) sunt
+ECRANATE de un context de piesa/operatie (D/R, CARLIG, CAPAC, INLOCUIT...).
+"""
+
+from __future__ import annotations
+
+from app.mapping import prefiltru_nul
+
+
+def test_itp_e_nul():
+    assert prefiltru_nul("13 X ITP") is True
+    assert prefiltru_nul("11XITP") is True          # glue fara spatii
+    assert prefiltru_nul("ITP") is True
+    assert prefiltru_nul("2 X ITP") is True
+
+
+def test_plata_discount_nul():
+    assert prefiltru_nul("DISCOUNT FIDELITATE 10%") is True
+    assert prefiltru_nul("REDUCERE COMERCIALA") is True
+    assert prefiltru_nul("ACHITAT DE CONF.URBAN") is True
+    assert prefiltru_nul("PLATA AVANS") is True
+    assert prefiltru_nul("TAXA DE MEDIU") is True
+
+
+def test_nr_inmatriculare_nul():
+    assert prefiltru_nul("NR INMATRICULARE") is True
+    assert prefiltru_nul("NUMAR INMATRICULARE") is True
+    assert prefiltru_nul("B 123 ABC") is True       # pattern placuta standalone
+    assert prefiltru_nul("CT 44 MKY") is True
+
+
+def test_tractare_serviciu_nul():
+    # Serviciul de tractare (remorca) = non-operatie de service.
+    assert prefiltru_nul("TRACTARE CTA-SLOBOZIA") is True
+    assert prefiltru_nul("TRACTARE 100 KM") is True
+
+
+def test_operatie_reala_nu_e_nul():
+    # Punctul critic: trigger ambiguu intr-un context de piesa reala -> NU e NUL.
+    assert prefiltru_nul("INLOCUIT PLACUTE FRANA") is False
+    assert prefiltru_nul("D/R CARLIG TRACTARE") is False        # carlig = piesa, nu serviciu
+    assert prefiltru_nul("D/R CAPAC TRACTARE BARA SPATE") is False
+    assert prefiltru_nul("D/R NR INMATRICULARE") is False       # suport placuta = piesa
+    assert prefiltru_nul("D/R ELECTROMOTOR CT 44 MKY") is False  # placuta lipita la o operatie reala
+
+
+def test_zero_fals_pozitiv_pe_set_operatii_reale():
+    """AC: zero fals-pozitiv pe un set de 20 operatii reale (din docs/operatii-service)."""
+    reale = [
+        "REVIZIE", "SCHIMB ULEI MOTOR", "INLOCUIT PLACUTE FRANA FATA",
+        "D/R BARA FATA", "VOPSIT USA DR FATA", "INLOCUIT FILTRU AER",
+        "AERISIT INSTALATIE FRANA", "INLOCUIT AMORTIZOR SPATE", "ABSORBANT SOC BARA SPATE",
+        "INLOCUIT CUREA DISTRIBUTIE", "REGLAT FARURI", "INLOCUIT BUJII",
+        "REPARAT ARIPA FATA DR", "INLOCUIT DISCURI FRANA", "GRESAT PLANETARA",
+        "INLOCUIT RULMENT ROATA", "MONTAT ANVELOPE", "INLOCUIT BATERIE",
+        "DIAGNOZA COMPUTERIZATA", "INLOCUIT CONTACT PORNIRE",
+    ]
+    for op in reale:
+        assert prefiltru_nul(op) is False, f"fals-pozitiv pe operatie reala: {op!r}"
+
+
+def test_input_gol_nu_e_nul():
+    assert prefiltru_nul("") is False
+    assert prefiltru_nul(None) is False  # type: ignore[arg-type]