feat(5.18): corpus k-NN exemple etichetate + seed real Haiku (17181 op)

Seed app/data/operatii-etichetate.json regenerat cu subagenti Haiku pe TOATE
cele 17181 operatii distincte (ordine frecventa, 100%), inlocuind seed-ul Groq
(3758). Validare Haiku vs Groq pe 157 op etichetate: la dezacorduri Haiku corect
~22/30, Groq ~0. Haiku prinde gunoiul ratat de Groq (ITP, chirie anvelope, nume
piese fara actiune): NUL 2200 (12.8%) vs ~7.6% Groq; adaptare electronica OE-7
(nu OE-5), placute frana uzura OE-1 (nu OE-F avarie).

US-001..006: prefiltru NUL determinist, etichetator offline, generator seed,
seeder mapping_suggestions (in init_db, gated seed_operatii_enabled), embeddings
indexeaza corpus etichetat, enrich NUL+kNN. Distributie seed: OE-1 80.1%, NUL
12.8%, OE-2 3.5%, restul rar (OE-4/3/7/8/R/I/5, AITLV, R-ODO).

config: seed_operatii_enabled=True + embeddings_enabled=True implicit (SILVER
populat + sugestii semantice; ambele suggestion-only, dezactivabile prin env).

Suita: 1387 passed, 1 deselected (live).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Claude Agent
2026-06-29 06:59:15 +00:00
parent c05fa00007
commit 756f77730f
17 changed files with 139308 additions and 44 deletions

View File

@@ -18,6 +18,14 @@ import pytest
os.environ.setdefault("AUTOPASS_REQUIRE_API_KEY", "false")
os.environ.setdefault("AUTOPASS_WORKER_USE_TEST_CREDS", "false")
# Embeddings are ON by default in the app (config.py), but we keep them OFF in tests so
# the ~230MB model is not lazy-loaded by every test that touches the mappings editor
# (fast suite, no download in CI). The embeddings tests switch it on where needed.
os.environ.setdefault("AUTOPASS_EMBEDDINGS_ENABLED", "false")
# The labelled-operations seed (SILVER, PRD 5.18) is ON in the app but OFF in tests:
# many tests assume mapping_suggestions is EMPTY after init_db. The US-004/005/006 tests
# switch it on where needed (object.__setattr__ on settings or a direct seeder call).
os.environ.setdefault("AUTOPASS_SEED_OPERATII_ENABLED", "false")
@pytest.fixture(autouse=True)

View File

@@ -0,0 +1,150 @@
"""US-005 (PRD 5.18) — embeddings indexeaza corpusul etichetat (NU nomenclatorul).
k-NN peste exemple reale etichetate (denumire_normalizata -> cod, is_nul) e net mai
precis decat peste cele 18 categorii generice. Acopera si simetria corpus/query (F1):
corpusul e text NORMALIZAT, deci query-ul trebuie normalizat la fel inainte de embedding.
"""
from __future__ import annotations
import math
import os
import tempfile
import pytest
# Backend mock determinist: vector = histograma de caractere (similaritate stabila).
class MockBackend:
    """Deterministic stand-in for an embedding backend.

    Each text becomes a 27-slot character histogram: slots 0-25 count the
    letters A-Z of the upper-cased text, slot 26 counts every other character.
    """

    def embed(self, texts):
        """Return one 27-float histogram vector per input text."""
        vectors = []
        for text in texts:
            histogram = [0.0] * 27
            for char in text.upper():
                slot = ord(char) - 65 if "A" <= char <= "Z" else 26
                histogram[slot] += 1.0
            vectors.append(histogram)
        return vectors
@pytest.fixture()
def env(monkeypatch, tmp_path):
    """Point the app at a fresh per-test database with embeddings switched on.

    Sets the AUTOPASS_* environment variables, clears the cached settings so
    they are re-read, runs init_db() and yields the monkeypatch object. The
    settings cache is cleared again on teardown.
    """
    # tmp_path is managed by pytest, so the database directory no longer leaks
    # the way an un-removed tempfile.mkdtemp() directory did.
    monkeypatch.setenv("AUTOPASS_DB_PATH", os.path.join(tmp_path, "us005.db"))
    monkeypatch.setenv("AUTOPASS_WEB_AUTH_REQUIRED", "false")
    monkeypatch.setenv("AUTOPASS_EMBEDDINGS_ENABLED", "true")  # US-005 needs embeddings ON
    from app.config import get_settings
    get_settings.cache_clear()
    try:
        from app.db import init_db
        init_db()
        yield monkeypatch
    finally:
        # Also runs when init_db() fails, so stale settings never outlive the fixture.
        get_settings.cache_clear()
@pytest.fixture()
def conn(env):
    """Yield an open database connection and close it after the test."""
    from app.db import get_connection
    connection = get_connection()
    yield connection
    connection.close()
def _inject_mock_engine():
    """Install an EmbeddingEngine backed by MockBackend as app.embeddings._engine.

    Returns the app.embeddings module. NOTE: the previous engine is overwritten
    and this helper does not restore it.
    """
    import app.embeddings as embeddings_module
    mock_engine = embeddings_module.EmbeddingEngine(backend=MockBackend())
    embeddings_module._engine = mock_engine
    return embeddings_module
def _seed_silver(conn, rows):
    """Insert (denumire_normalizata, cod, is_nul) tuples into mapping_suggestions and commit."""
    insert_sql = (
        "INSERT OR IGNORE INTO mapping_suggestions "
        "(denumire_normalizata, cod_prestatie, is_nul, source, confidence) VALUES (?, ?, ?, 'llm_seed', 0.7)"
    )
    conn.executemany(insert_sql, rows)
    conn.commit()
def test_corpus_din_mapping_suggestions(conn):
    """The indexed corpus holds the names stored in mapping_suggestions."""
    emb = _inject_mock_engine()
    silver_rows = [
        ("SCHIMB ULEI MOTOR", "OE-3", 0),
        ("INLOCUIT PLACUTE FRANA", "OE-1", 0),
        ("13 X ITP", None, 1),
    ]
    _seed_silver(conn, silver_rows)
    from app.mapping import ensure_embeddings_corpus
    ensure_embeddings_corpus(conn)
    assert emb.has_corpus()
    # Indexed corpus = the names from mapping_suggestions, NOT from nomenclator_rar.
    indexed_names = set()
    for item in emb._engine._corpus_items:
        indexed_names.add(item["denumire"])
    assert indexed_names == {"SCHIMB ULEI MOTOR", "INLOCUIT PLACUTE FRANA", "13 X ITP"}
def test_suggest_nearest_intoarce_is_nul(conn):
    """suggest_nearest results carry the is_nul flag of the matched example."""
    emb = _inject_mock_engine()
    _seed_silver(conn, [("SCHIMB ULEI MOTOR", "OE-3", 0), ("13 X ITP", None, 1)])
    from app.mapping import ensure_embeddings_corpus
    ensure_embeddings_corpus(conn)

    nul_hits = emb.suggest_nearest("13 X ITP", top_k=1)
    # A NUL neighbour is the suppression signal.
    assert nul_hits and nul_hits[0]["is_nul"] is True

    real_hits = emb.suggest_nearest("SCHIMB ULEI MOTOR", top_k=1)
    assert real_hits and real_hits[0]["is_nul"] is False
    assert real_hits[0]["cod"] == "OE-3"
def test_semnatura_corpus_pe_seed(conn):
    """The corpus signature survives a no-op re-run and changes when a row is added."""
    emb = _inject_mock_engine()
    _seed_silver(conn, [("SCHIMB ULEI MOTOR", "OE-3", 0)])
    from app.mapping import ensure_embeddings_corpus
    ensure_embeddings_corpus(conn)
    initial_signature = emb.corpus_signature()
    assert initial_signature is not None

    # Calling again with unchanged data -> same signature.
    ensure_embeddings_corpus(conn)
    assert emb.corpus_signature() == initial_signature

    # Adding a row -> the signature changes.
    _seed_silver(conn, [("INLOCUIT BATERIE", "OE-1", 0)])
    ensure_embeddings_corpus(conn)
    assert emb.corpus_signature() != initial_signature
def test_query_normalizat_ca_si_corpusul(conn, monkeypatch):
    """F1 (HIGH): enrich_suggestions queries suggest_nearest with the NORMALIZED text."""
    import app.embeddings as emb
    seen = {}

    def fake_suggest(text, top_k=1):
        # Record what the caller actually sent to the k-NN lookup.
        seen["text"] = text
        return [{"cod": "OE-3", "is_nul": False, "similaritate": 0.99}]

    monkeypatch.setattr(emb, "has_corpus", lambda: True)
    monkeypatch.setattr(emb, "suggest_nearest", fake_suggest)
    from app.mapping import enrich_suggestions
    enrich_suggestions(conn, "Schimb Uleiul Motor")
    # The corpus is denumire_normalizata, so the query must be normalized the same way.
    from app.mapping import normalize_for_match
    expected = normalize_for_match("Schimb Uleiul Motor")
    assert seen["text"] == expected
    assert seen["text"] == "SCHIMB ULEIUL MOTOR"
def test_degradare_gratioasa_pastrata(conn, monkeypatch):
    """A backend that raises -> ensure + enrich must NOT raise."""
    import app.embeddings as emb
    from app.embeddings import EmbeddingEngine

    class BrokenBackend:
        """Backend whose embed() always fails."""

        def embed(self, texts):
            raise RuntimeError("model indisponibil")

    # Install through monkeypatch so the broken engine is undone at teardown
    # instead of staying in app.embeddings for whichever test runs next.
    monkeypatch.setattr(emb, "_engine", EmbeddingEngine(backend=BrokenBackend()))
    _seed_silver(conn, [("SCHIMB ULEI MOTOR", "OE-3", 0)])
    from app.mapping import ensure_embeddings_corpus, enrich_suggestions
    ensure_embeddings_corpus(conn)  # must not raise
    out = enrich_suggestions(conn, "SCHIMB ULEI")  # must not raise
    assert "sugestie_principala" in out

View File

@@ -0,0 +1,133 @@
"""US-006 (PRD 5.18) — enrich_suggestions = pre-filtru NUL + k-NN pe corpus etichetat.
Ordinea de precedenta: pre-filtru NUL -> (daca NUL: fara cod) altfel GOLD partajat >
exact (SILVER) > k-NN embeddings. k-NN sub prag -> abtinere. Vecin k-NN NUL -> supresie.
Invariant #13: nimic din asta nu intra in resolve_prestatii/load_mapping.
"""
from __future__ import annotations
import os
import tempfile
import pytest
@pytest.fixture()
def env(monkeypatch, tmp_path):
    """Point the app at a fresh per-test database with embeddings switched on.

    Sets the AUTOPASS_* environment variables, clears the cached settings so
    they are re-read, runs init_db() and yields the monkeypatch object. The
    settings cache is cleared again on teardown.
    """
    # tmp_path is managed by pytest, so the database directory no longer leaks
    # the way an un-removed tempfile.mkdtemp() directory did.
    monkeypatch.setenv("AUTOPASS_DB_PATH", os.path.join(tmp_path, "us006.db"))
    monkeypatch.setenv("AUTOPASS_WEB_AUTH_REQUIRED", "false")
    monkeypatch.setenv("AUTOPASS_EMBEDDINGS_ENABLED", "true")
    from app.config import get_settings
    get_settings.cache_clear()
    try:
        from app.db import init_db
        init_db()
        yield monkeypatch
    finally:
        # Also runs when init_db() fails, so stale settings never outlive the fixture.
        get_settings.cache_clear()
@pytest.fixture()
def conn(env):
    """Yield an open database connection and close it after the test."""
    from app.db import get_connection
    connection = get_connection()
    yield connection
    connection.close()
def _silver(conn, denumire_norm, cod, is_nul=0):
    """Insert one SILVER row into mapping_suggestions (ignored if the key exists) and commit."""
    insert_sql = (
        "INSERT OR IGNORE INTO mapping_suggestions "
        "(denumire_normalizata, cod_prestatie, is_nul, source, confidence) VALUES (?, ?, ?, 'llm_seed', 0.7)"
    )
    params = (denumire_norm, cod, is_nul)
    conn.execute(insert_sql, params)
    conn.commit()
def _mock_embedding(monkeypatch, cod, sim, is_nul=False):
    """Stub app.embeddings: a corpus is reported present and the nearest hit is fixed."""
    import app.embeddings as emb

    def _fixed_neighbour(text, top_k=1):
        return [{"cod": cod, "is_nul": is_nul, "similaritate": sim}]

    monkeypatch.setattr(emb, "has_corpus", lambda: True)
    monkeypatch.setattr(emb, "suggest_nearest", _fixed_neighbour)
def test_prefiltru_nul_supreseaza_inainte_de_knn(conn, monkeypatch):
    """The NUL pre-filter short-circuits before the k-NN lookup is consulted."""
    import app.embeddings as emb
    # The embedding WOULD suggest a code; the spy records whether it was asked at all.
    knn_queries = []

    def spion(text, top_k=1):
        knn_queries.append(text)
        return [{"cod": "OE-1", "is_nul": False, "similaritate": 0.99}]

    monkeypatch.setattr(emb, "has_corpus", lambda: True)
    monkeypatch.setattr(emb, "suggest_nearest", spion)
    from app.mapping import enrich_suggestions
    out = enrich_suggestions(conn, "13 X ITP")
    assert out["sugestie_principala"] is None  # non-operation -> no code
    assert out["surse"]["nul"] is True
    assert knn_queries == []  # k-NN was never even queried
def test_precedenta_gold_exact_embedding(conn, monkeypatch):
    """Precedence of the main suggestion: shared GOLD > exact SILVER > embedding."""
    from app.shared_store import record_human_validation
    from app.mapping import enrich_suggestions, normalize_for_match

    den = "OPERATIE DE TEST UNICA"
    # Each of the three sources proposes a different code.
    record_human_validation(conn, den, "OE-1")  # shared GOLD
    _silver(conn, normalize_for_match(den), "OE-2")  # exact SILVER
    _mock_embedding(monkeypatch, "OE-3", 0.99)  # embedding
    conn.commit()

    with_gold = enrich_suggestions(conn, den)
    assert with_gold["sugestie_principala"] == {"cod_prestatie": "OE-1", "sursa": "gold_partajat"}

    # Without GOLD -> SILVER wins.
    conn.execute("DELETE FROM shared_mappings")
    conn.commit()
    with_silver = enrich_suggestions(conn, den)
    assert with_silver["sugestie_principala"]["sursa"] == "silver"
    assert with_silver["sugestie_principala"]["cod_prestatie"] == "OE-2"

    # Without GOLD and without SILVER -> the embedding wins.
    conn.execute("DELETE FROM mapping_suggestions")
    conn.commit()
    embedding_only = enrich_suggestions(conn, den)
    assert embedding_only["sugestie_principala"] == {"cod_prestatie": "OE-3", "sursa": "embedding"}
def test_prag_similaritate(conn, monkeypatch):
    """A neighbour just above EMB_MIN_SIMILARITATE is reported as the embedding source."""
    from app.mapping import enrich_suggestions, EMB_MIN_SIMILARITATE
    just_above = EMB_MIN_SIMILARITATE + 0.01
    _mock_embedding(monkeypatch, "OE-3", just_above)
    result = enrich_suggestions(conn, "CEVA NEVAZUT")
    assert result["surse"]["embedding"] == "OE-3"
def test_abtinere_sub_prag(conn, monkeypatch):
    """A neighbour just below EMB_MIN_SIMILARITATE yields no embedding code and no suggestion."""
    from app.mapping import enrich_suggestions, EMB_MIN_SIMILARITATE
    just_below = EMB_MIN_SIMILARITATE - 0.01
    _mock_embedding(monkeypatch, "OE-3", just_below)
    result = enrich_suggestions(conn, "CEVA NEVAZUT")
    assert result["surse"]["embedding"] is None  # below threshold -> abstain
    assert result["sugestie_principala"] is None
def test_vecin_knn_nul_supreseaza(conn, monkeypatch):
    """A NUL k-NN neighbour above the threshold suppresses the suggestion."""
    from app.mapping import enrich_suggestions
    # NUL neighbour with similarity 0.99.
    _mock_embedding(monkeypatch, None, 0.99, is_nul=True)
    result = enrich_suggestions(conn, "CEVA CARE SEAMANA CU GUNOI")
    surse = result["surse"]
    assert surse["embedding"] is None  # NUL -> produces no code
    assert surse["nul"] is True
    assert result["sugestie_principala"] is None
def test_invariant_13_resolve_neatins(conn):
    """Regression #13: a populated SILVER row does NOT auto-resolve in resolve_prestatii."""
    from app.mapping import resolve_prestatii, normalize_for_match
    _silver(conn, normalize_for_match("OPERATIE X"), "OE-1")
    operatii = [{"cod_op_service": "OPERATIE X", "denumire": "OPERATIE X"}]
    resolved, unmapped = resolve_prestatii(operatii, mapping={}, valid_codes={"OE-1"})
    # Stays unmapped; it does NOT pick up the SILVER code.
    assert resolved[0]["cod_prestatie"] is None
    assert unmapped and unmapped[0]["cod_op_service"] == "OPERATIE X"

View File

@@ -0,0 +1,103 @@
"""US-002 (PRD 5.18) — etichetator offline multi-backend cu prompt procedural.
Toate testele ruleaza FARA retea reala (transport injectabil / inspectie body).
Acopera: prompt 3 pasi, envelope json_schema strict + enum, backend selectabil
prin env, scrub PII inainte de orice request, garda de truncare.
"""
from __future__ import annotations
# Numele pachetului `tools/mapare-llm` contine cratima -> nu e importabil ca modul.
# Incarcam fisierul direct prin importlib pe cale.
import importlib.util
import os
import sys
# Path of the labeller script, built relative to this test file.
_PATH = os.path.join(os.path.dirname(__file__), "..", "tools", "mapare-llm", "eticheteaza.py")
_spec = importlib.util.spec_from_file_location("eticheteaza", _PATH)
eticheteaza = importlib.util.module_from_spec(_spec)
sys.modules["eticheteaza"] = eticheteaza # needed for @dataclass introspection
_spec.loader.exec_module(eticheteaza)
def test_construieste_prompt_3pasi():
    """The built messages contain the 3-step procedure, the NUL rule and /no_think."""
    msgs = eticheteaza.construieste_mesaje(["INLOCUIT PLACUTE FRANA"])
    assert isinstance(msgs, list) and msgs[0]["role"] == "system"
    # Named system_prompt (not `sys`) so the imported sys module is not shadowed.
    system_prompt = msgs[0]["content"].upper()
    # The 3-step procedure is spelled out explicitly.
    assert "PAS 1" in system_prompt and "PAS 2" in system_prompt and "PAS 3" in system_prompt
    # NUL rule + severe damage only for accidents.
    assert "NUL" in system_prompt
    assert "ACCIDENT" in system_prompt
    # Qwen3 thinking is disabled (/no_think token somewhere in the messages).
    joined = " ".join(m["content"] for m in msgs)
    assert "/no_think" in joined
    # The user message enumerates the operations.
    assert "1." in msgs[1]["content"] and "INLOCUIT PLACUTE FRANA" in msgs[1]["content"]
def test_envelope_json_schema_strict_si_enum():
    """The request body uses a strict json_schema envelope with an enum over ALL_LABELS."""
    body = eticheteaza.construieste_body(["REVIZIE"], eticheteaza.get_backend("lmstudio"))
    response_format = body["response_format"]
    # FULL envelope, NOT json_object.
    assert response_format["type"] == "json_schema"
    envelope = response_format["json_schema"]
    assert envelope["strict"] is True
    assert "name" in envelope
    item_schema = envelope["schema"]["properties"]["rez"]["items"]
    allowed_codes = item_schema["properties"]["cod"]["enum"]
    # cod = enum over the 19 ALL_LABELS (18 codes + NUL).
    assert set(allowed_codes) == set(eticheteaza.ALL_LABELS)
    assert len(eticheteaza.ALL_LABELS) == 19
    assert "NUL" in eticheteaza.ALL_LABELS
    # Temperature 0 (deterministic) and strict items.
    assert body["temperature"] == 0
    assert item_schema["additionalProperties"] is False
def test_parseaza_raspuns_si_garda_truncare():
    """parseaza_raspuns orders codes by position and yields '?' for invalid or missing ones."""
    batch = ["A", "B", "C"]
    expected_len = len(batch)

    # Complete answer, shuffled order, one invalid code.
    shuffled = {"rez": [
        {"i": 2, "cod": "OE-1"},
        {"i": 1, "cod": "NUL"},
        {"i": 3, "cod": "INEXISTENT"},
    ]}
    assert eticheteaza.parseaza_raspuns(shuffled, expected_len) == ["NUL", "OE-1", "?"]  # invalid -> '?', not hidden

    # Truncated answer: position 3 is missing -> '?' for the gap, not an error.
    truncated = {"rez": [{"i": 1, "cod": "OE-1"}, {"i": 2, "cod": "OE-2"}]}
    truncated_codes = eticheteaza.parseaza_raspuns(truncated, expected_len)
    assert truncated_codes == ["OE-1", "OE-2", "?"]
    assert len(truncated_codes) == expected_len
def test_backend_selectabil_env(monkeypatch):
    """get_backend honours ETICHETARE_BACKEND / ETICHETARE_ENDPOINT / ETICHETARE_MODEL."""
    # Default = lmstudio (approved v1 backend, D4).
    monkeypatch.delenv("ETICHETARE_BACKEND", raising=False)
    assert eticheteaza.get_backend().name == "lmstudio"

    # Selection through the environment.
    monkeypatch.setenv("ETICHETARE_BACKEND", "groq")
    assert eticheteaza.get_backend().name == "groq"

    # Endpoint + model are configurable through the environment.
    overrides = {
        "ETICHETARE_BACKEND": "lmstudio",
        "ETICHETARE_ENDPOINT": "http://exemplu:1234/v1/chat/completions",
        "ETICHETARE_MODEL": "qwen/qwen3-custom",
    }
    for variable, value in overrides.items():
        monkeypatch.setenv(variable, value)
    backend = eticheteaza.get_backend()
    assert backend.url == "http://exemplu:1234/v1/chat/completions"
    assert backend.model == "qwen/qwen3-custom"
def test_scrub_pii_inainte_de_request(monkeypatch):
    """No plate/VIN reaches the transport: scrubbing happens before any call."""
    sent_payloads = []

    def fake_transport(url, headers, payload, timeout):
        sent_payloads.append(payload)
        return {"choices": [{"message": {"content": '{"rez":[{"i":1,"cod":"OE-1"}]}'}}]}

    backend = eticheteaza.get_backend("lmstudio")
    codes, meta = eticheteaza.call(["VOPSIT USA B 123 ABC"], backend, transport=fake_transport)
    assert codes == ["OE-1"]
    # As in the original dict capture, only the last payload is inspected.
    user_content = sent_payloads[-1]["messages"][1]["content"]
    assert "B 123 ABC" not in user_content
    assert "[NR]" in user_content
    assert meta["err"] is None

View File

@@ -0,0 +1,175 @@
"""US-003 (PRD 5.18) — generare seed etichetat in faze pe frecventa.
Pipeline dedup OBLIGATORIU inainte de orice apel LLM (D5):
brut -> normalize_for_match -> arunca chei vide -> dedup pe cheie (freq=suma NR)
-> reuse etichete existente (labels-groq + seed comis, conflict freq-max) -> de_etichetat.
Idempotenta cross-run (F2/F7): a doua rulare consuma seedul comis ca cache -> 0 apeluri LLM.
Toate testele FARA retea: `clasifica` e injectat (mock care inregistreaza ce primeste).
"""
from __future__ import annotations
import importlib.util
import json
import os
import sys
def _load(name: str):
    """Import tools/mapare-llm/<name>.py by file path, registering it in sys.modules."""
    tools_dir = os.path.join(os.path.dirname(__file__), "..", "tools", "mapare-llm")
    module_spec = importlib.util.spec_from_file_location(name, os.path.join(tools_dir, f"{name}.py"))
    module = importlib.util.module_from_spec(module_spec)
    # Registered before execution, as in the original load order.
    sys.modules[name] = module
    module_spec.loader.exec_module(module)
    return module
# Module under test, loaded by file path through _load.
gs = _load("genereaza_seed")
def _scrie_csv(path, randuri):
    """Write (denumire, nr) pairs to `path` as a `;`-separated CSV with a header row.

    Every field is double-quoted and each data row starts with its 1-based index.
    """
    header = '" ";"DENOP";"NR"'
    data_rows = [f'"{idx}";"{den}";"{nr}"' for idx, (den, nr) in enumerate(randuri, 1)]
    path.write_text("\n".join([header, *data_rows]) + "\n", encoding="utf-8")
def _mock_recorder():
    """Return (clasifica, vazute): clasifica answers OE-1 for everything and logs each batch."""
    vazute = []

    def clasifica(batch):
        snapshot = list(batch)
        vazute.append(snapshot)
        return ["OE-1"] * len(batch)

    return clasifica, vazute
# --------------------------------------------------------------------------- #
def test_dedup_normalizat(tmp_path):
    """Rows that normalize to the same key are merged and their NR values summed."""
    f1 = tmp_path / "a.csv"
    f2 = tmp_path / "b.csv"
    _scrie_csv(f1, [("REVIZIE", 10), ("D/R BARA FATA", 3)])
    _scrie_csv(f2, [(" revizie ", 5)])  # same logical operation, differs in case + spaces
    corpus = gs.agrega_corpus([str(f1), str(f2)])
    assert "REVIZIE" in corpus
    assert corpus["REVIZIE"]["freq"] == 15  # 10 + 5, dedup on the key
    # len() on the mapping replaces a throwaway list of its keys.
    assert len(corpus) == 2  # REVIZIE + D/R BARA FATA
def test_skip_cheie_normalizata_vida(tmp_path):
    """A whitespace-only name produces no corpus entry."""
    csv_file = tmp_path / "a.csv"
    randuri = [(" ", 99), ("REVIZIE", 5)]  # first key is empty (spaces only)
    _scrie_csv(csv_file, randuri)
    corpus = gs.agrega_corpus([str(csv_file)])
    assert "" not in corpus
    assert list(corpus) == ["REVIZIE"]
def test_ordine_pe_frecventa(tmp_path):
    """Operations reach the classifier in descending frequency order."""
    csv_file = tmp_path / "a.csv"
    _scrie_csv(csv_file, [("OP MICA", 5), ("OP MARE", 50), ("OP MEDIE", 20)])
    seed_file = tmp_path / "seed.json"
    clasifica, vazute = _mock_recorder()
    gs.genereaza(
        [str(csv_file)],
        labels_path=None,
        seed_path=str(seed_file),
        etichetare_all=True,
        clasifica=clasifica,
        batch=32,
    )
    # The order in which the LLM saw the operations = descending by frequency.
    first_three = vazute[0][:3]
    assert first_three == ["OP MARE", "OP MEDIE", "OP MICA"]
def test_reuse_in_spatiu_normalizat(tmp_path):
    """An existing label is reused for a name that normalizes to the same key."""
    csv_file = tmp_path / "a.csv"
    _scrie_csv(csv_file, [("Revizie", 10), ("SCHIMB ULEI", 5)])
    labels_file = tmp_path / "labels.json"
    # Keyed by the raw name, which normalizes to the same key.
    labels_file.write_text(json.dumps({"REVIZIE": "OE-3"}), encoding="utf-8")
    seed_file = tmp_path / "seed.json"
    clasifica, vazute = _mock_recorder()
    gs.genereaza(
        [str(csv_file)],
        labels_path=str(labels_file),
        seed_path=str(seed_file),
        etichetare_all=True,
        clasifica=clasifica,
    )
    trimise = set()
    for sent_batch in vazute:
        trimise.update(sent_batch)
    # Already labelled -> never sent to the classifier.
    assert "Revizie" not in trimise and "REVIZIE" not in trimise
    seed_entries = json.loads(seed_file.read_text(encoding="utf-8"))
    revizie_entry = [e for e in seed_entries if e["denumire_normalizata"] == "REVIZIE"][0]
    assert revizie_entry["cod"] == "OE-3"
def test_reuse_conflict_determinist(tmp_path):
    """When two raw variants of one key carry different labels, the more frequent one wins."""
    csv_file = tmp_path / "a.csv"
    # Two raw variants of the same key, labelled differently; frequency decides.
    _scrie_csv(csv_file, [("CURATAT CATALIZATOR", 100), ("curatat catalizator", 5)])
    conflicting_labels = {
        "CURATAT CATALIZATOR": "OE-1",  # freq 100
        "curatat catalizator": "OE-2",  # freq 5
    }
    labels_file = tmp_path / "labels.json"
    labels_file.write_text(json.dumps(conflicting_labels), encoding="utf-8")
    seed_file = tmp_path / "seed.json"
    clasifica, _ = _mock_recorder()
    gs.genereaza(
        [str(csv_file)],
        labels_path=str(labels_file),
        seed_path=str(seed_file),
        etichetare_all=True,
        clasifica=clasifica,
    )
    seed_entries = json.loads(seed_file.read_text(encoding="utf-8"))
    catalizator = [e for e in seed_entries if e["denumire_normalizata"] == "CURATAT CATALIZATOR"][0]
    assert catalizator["cod"] == "OE-1"  # highest frequency wins (100 > 5)
def test_zero_duplicate_trimis_la_llm(tmp_path):
    """The classifier gets no duplicate, empty or already-labelled normalized key."""
    f1 = tmp_path / "a.csv"
    f2 = tmp_path / "b.csv"
    _scrie_csv(f1, [("REVIZIE", 10), (" revizie ", 4), ("OP NOUA", 7), (" ", 3)])
    _scrie_csv(f2, [("REVIZIE", 2), ("OP NOUA", 1)])  # cross-file duplicates
    labels_file = tmp_path / "labels.json"
    labels_file.write_text(json.dumps({"REVIZIE": "OE-3"}), encoding="utf-8")  # REVIZIE already labelled
    seed_file = tmp_path / "seed.json"
    clasifica, vazute = _mock_recorder()
    from app.mapping import normalize_for_match
    gs.genereaza(
        [str(f1), str(f2)],
        labels_path=str(labels_file),
        seed_path=str(seed_file),
        etichetare_all=True,
        clasifica=clasifica,
    )
    chei = []
    for sent_batch in vazute:
        for denumire in sent_batch:
            chei.append(normalize_for_match(denumire))
    assert len(chei) == len(set(chei))  # no normalized key sent twice
    assert "" not in chei  # no empty key
    assert "REVIZIE" not in chei  # no already-labelled key
    assert "OP NOUA" in chei  # only what is missing
def test_rerun_zero_apeluri_llm(tmp_path):
    """Idempotence criterion (F2/F7): the second run makes 0 LLM calls and the seed is unchanged."""
    csv_file = tmp_path / "a.csv"
    _scrie_csv(csv_file, [("OP UNU", 10), ("OP DOI", 5)])
    seed_file = tmp_path / "seed.json"
    run_kwargs = dict(labels_path=None, seed_path=str(seed_file), etichetare_all=True)

    first_clasifica, first_seen = _mock_recorder()
    gs.genereaza([str(csv_file)], clasifica=first_clasifica, **run_kwargs)
    assert sum(len(b) for b in first_seen) == 2  # the first run labels both
    seed_after_first = seed_file.read_bytes()

    second_clasifica, second_seen = _mock_recorder()
    gs.genereaza([str(csv_file)], clasifica=second_clasifica, **run_kwargs)
    assert second_seen == []  # second run: 0 LLM calls (seed acts as cache)
    assert seed_after_first == seed_file.read_bytes()  # seed identical byte for byte
def test_format_seed_valid(tmp_path):
    """Seed entries have unique non-empty keys, the expected fields and a NULL code for NUL."""
    csv_file = tmp_path / "a.csv"
    _scrie_csv(csv_file, [("OP REALA", 10), ("13 X ITP", 5)])
    seed_file = tmp_path / "seed.json"

    def clasifica(batch):
        # Mark ITP as NUL, everything else as OE-1.
        labels = []
        for denumire in batch:
            labels.append("NUL" if "ITP" in denumire.upper() else "OE-1")
        return labels

    gs.genereaza([str(csv_file)], labels_path=None, seed_path=str(seed_file), etichetare_all=True, clasifica=clasifica)
    data = json.loads(seed_file.read_text(encoding="utf-8"))
    chei = [entry["denumire_normalizata"] for entry in data]
    assert len(chei) == len(set(chei))  # unique
    assert all(entry["denumire_normalizata"] for entry in data)  # non-empty
    required_fields = {"denumire", "denumire_normalizata", "cod", "is_nul", "source", "confidence"}
    for entry in data:
        assert set(entry) >= required_fields
        if entry["is_nul"]:
            assert entry["cod"] is None  # NUL -> NULL code (mirrors the DB CHECK)
        else:
            assert entry["cod"]
    first_nul = [entry for entry in data if entry["is_nul"]][0]
    assert "ITP" in first_nul["denumire_normalizata"]

View File

@@ -272,14 +272,18 @@ def test_embeddings_functional_cand_flag_activ(conn, monkeypatch):
get_settings.cache_clear()
monkeypatch.setattr(emb_mod, "_engine", EmbeddingEngine(backend=_FakeEmbedBackend()))
# Nomenclatorul (din fixtura conn) are OE-1..OE-4; adaug coduri cu denumiri keyword.
# Corpusul sursa = mapping_suggestions (SILVER) -- PRD 5.18 US-005.
# (Inainte era nomenclator_rar; migrat la mapping_suggestions ca k-NN sa
# opereze pe exemple reale etichetate, nu pe categorii generice RAR.)
conn.execute(
"INSERT OR REPLACE INTO nomenclator_rar (cod_prestatie, nume_prestatie) VALUES (?, ?)",
("UL-1", "Schimb ulei"),
"INSERT OR REPLACE INTO mapping_suggestions "
"(denumire_normalizata, cod_prestatie, is_nul, source, confidence) VALUES (?, ?, ?, ?, ?)",
("Schimb ulei", "UL-1", 0, "llm", 0.95),
)
conn.execute(
"INSERT OR REPLACE INTO nomenclator_rar (cod_prestatie, nume_prestatie) VALUES (?, ?)",
("FR-1", "Placute frana"),
"INSERT OR REPLACE INTO mapping_suggestions "
"(denumire_normalizata, cod_prestatie, is_nul, source, confidence) VALUES (?, ?, ?, ?, ?)",
("Placute frana", "FR-1", 0, "llm", 0.95),
)
conn.commit()

113
tests/test_operatii_seed.py Normal file
View File

@@ -0,0 +1,113 @@
"""US-004 (PRD 5.18) — seeder corpus etichetat in mapping_suggestions (SILVER).
INSERT OR IGNORE din artefactul comis -> SILVER nu mai e gol in productie.
NB (F10): confirmarile UMANE stau in shared_mappings, NU aici; deci INSERT OR IGNORE
pastreaza codul LLM existent la re-seed (v1 = ignore, nu upsert).
"""
from __future__ import annotations
import json
import os
import tempfile
import pytest
@pytest.fixture()
def env(monkeypatch, tmp_path):
    """Point the app at a fresh per-test database and yield the temp directory path.

    Sets the AUTOPASS_* environment variables, clears the cached settings so
    they are re-read, runs init_db() and yields the directory as a str. The
    settings cache is cleared again on teardown.
    """
    # tmp_path is managed by pytest, so the directory no longer leaks the way an
    # un-removed tempfile.mkdtemp() directory did. Still yielded as str.
    tmp = str(tmp_path)
    monkeypatch.setenv("AUTOPASS_DB_PATH", os.path.join(tmp, "us004.db"))
    monkeypatch.setenv("AUTOPASS_WEB_AUTH_REQUIRED", "false")
    from app.config import get_settings
    get_settings.cache_clear()
    try:
        from app.db import init_db
        init_db()
        yield tmp
    finally:
        # Also runs when init_db() fails, so stale settings never outlive the fixture.
        get_settings.cache_clear()
@pytest.fixture()
def conn(env):
    """Yield an open database connection and close it after the test."""
    from app.db import get_connection
    connection = get_connection()
    yield connection
    connection.close()
def _scrie_seed(tmp, items) -> str:
    """Dump `items` as UTF-8 JSON to <tmp>/operatii-etichetate.json and return that path."""
    seed_path = os.path.join(tmp, "operatii-etichetate.json")
    payload = json.dumps(items, ensure_ascii=False)
    with open(seed_path, "w", encoding="utf-8") as handle:
        handle.write(payload)
    return seed_path
# Sample seed entry for a real operation: carries a code, is_nul is False.
SEED_OE = {"denumire": "SCHIMB ULEI MOTOR", "denumire_normalizata": "SCHIMB ULEI MOTOR",
"cod": "OE-3", "is_nul": False, "source": "llm_seed", "confidence": 0.7}
# Sample seed entry for a non-operation: is_nul is True and the code is None.
SEED_NUL = {"denumire": "13 X ITP", "denumire_normalizata": "13 X ITP",
"cod": None, "is_nul": True, "source": "llm_seed", "confidence": 0.7}
def test_seed_populeaza_mapping_suggestions(env, conn):
    """Seeding one entry inserts it with its code, source and confidence."""
    from app.operatii_seed import seed_operatii_etichetate
    seed_path = _scrie_seed(env, [SEED_OE])
    inserted = seed_operatii_etichetate(conn, seed_path)
    conn.commit()
    assert inserted == 1
    stored = conn.execute(
        "SELECT cod_prestatie, source, confidence FROM mapping_suggestions "
        "WHERE denumire_normalizata = 'SCHIMB ULEI MOTOR'"
    ).fetchone()
    assert stored["cod_prestatie"] == "OE-3"
    assert stored["source"] == "llm_seed"
    assert abs(stored["confidence"] - 0.7) < 1e-9
def test_is_nul_din_seed(env, conn):
    """A NUL seed entry is stored with is_nul = 1 and a NULL code."""
    from app.operatii_seed import seed_operatii_etichetate
    seed_operatii_etichetate(conn, _scrie_seed(env, [SEED_NUL]))
    conn.commit()
    query = "SELECT cod_prestatie, is_nul FROM mapping_suggestions WHERE denumire_normalizata = '13 X ITP'"
    stored = conn.execute(query).fetchone()
    assert stored["is_nul"] == 1
    assert stored["cod_prestatie"] is None  # honours the CHECK (NUL -> NULL code)
def test_insert_or_ignore_nu_clobber(env, conn):
    """Seeding leaves a pre-existing row on the same key untouched."""
    from app.operatii_seed import seed_operatii_etichetate
    # A pre-existing row (e.g. embedding) on the same key, with a different code.
    preexisting_sql = (
        "INSERT INTO mapping_suggestions (denumire_normalizata, cod_prestatie, is_nul, source, confidence) "
        "VALUES ('SCHIMB ULEI MOTOR', 'OE-1', 0, 'embedding', 0.5)"
    )
    conn.execute(preexisting_sql)
    conn.commit()

    inserted = seed_operatii_etichetate(conn, _scrie_seed(env, [SEED_OE]))
    conn.commit()
    assert inserted == 0  # INSERT OR IGNORE -> no overwrite

    lookup_sql = "SELECT cod_prestatie, source FROM mapping_suggestions WHERE denumire_normalizata = 'SCHIMB ULEI MOTOR'"
    stored = conn.execute(lookup_sql).fetchone()
    assert stored["cod_prestatie"] == "OE-1"  # the existing row stays as it was
    assert stored["source"] == "embedding"
def test_idempotent_la_reinit(env, conn):
    """Running the seeder twice inserts rows only the first time."""
    from app.operatii_seed import seed_operatii_etichetate
    seed_path = _scrie_seed(env, [SEED_OE, SEED_NUL])
    inserted_counts = []
    for _run in range(2):
        inserted_counts.append(seed_operatii_etichetate(conn, seed_path))
        conn.commit()
    assert inserted_counts[0] == 2
    assert inserted_counts[1] == 0  # the second run does not duplicate
    row_count = conn.execute("SELECT COUNT(*) AS n FROM mapping_suggestions").fetchone()["n"]
    assert row_count == 2
def test_seed_inexistent_e_noop(env, conn):
    """A missing seed file yields 0 inserted rows."""
    from app.operatii_seed import seed_operatii_etichetate
    missing_path = os.path.join(env, "nu-exista.json")
    inserted = seed_operatii_etichetate(conn, missing_path)
    assert inserted == 0

View File

@@ -0,0 +1,72 @@
"""US-001 (PRD 5.18) — pre-filtru determinist non-operatii (NUL).
Masuratoarea k-NN (memorie test-precizie-knn-embeddings) arata recall NUL doar 64%:
gunoiul evident (ITP, plata, discount, nr. inmatriculare, tractare) scapa ca OE-1.
Un pre-filtru determinist il marcheaza NUL INAINTE de k-NN.
Garantie non-negociabila (AC): ZERO fals-pozitiv pe operatii reale. Regulile
text/regex au fost calibrate pe `docs/operatii-service/*.csv` (vezi sesiunea de
implementare): triggerele ambigue (TRACTARE, NR INMATRICULARE/placuta) sunt
ECRANATE de un context de piesa/operatie (D/R, CARLIG, CAPAC, INLOCUIT...).
"""
from __future__ import annotations
from app.mapping import prefiltru_nul
def test_itp_e_nul():
    """ITP line items are expected to be flagged NUL."""
    itp_variants = (
        "13 X ITP",
        "11XITP",  # glued together, no spaces
        "ITP",
        "2 X ITP",
    )
    for text in itp_variants:
        assert prefiltru_nul(text) is True
def test_plata_discount_nul():
    """Payment, discount and tax lines are expected to be flagged NUL."""
    payment_lines = (
        "DISCOUNT FIDELITATE 10%",
        "REDUCERE COMERCIALA",
        "ACHITAT DE CONF.URBAN",
        "PLATA AVANS",
        "TAXA DE MEDIU",
    )
    for text in payment_lines:
        assert prefiltru_nul(text) is True
def test_nr_inmatriculare_nul():
    """Registration-number lines and standalone plates are expected to be flagged NUL."""
    plate_lines = (
        "NR INMATRICULARE",
        "NUMAR INMATRICULARE",
        "B 123 ABC",  # standalone plate pattern
        "CT 44 MKY",
    )
    for text in plate_lines:
        assert prefiltru_nul(text) is True
def test_tractare_serviciu_nul():
    """Towing-service lines are expected to be flagged NUL."""
    # The towing service itself is a non-operation for the workshop.
    for text in ("TRACTARE CTA-SLOBOZIA", "TRACTARE 100 KM"):
        assert prefiltru_nul(text) is True
def test_operatie_reala_nu_e_nul():
    """An ambiguous trigger inside a real part/operation context must NOT be flagged NUL."""
    real_operations = (
        "INLOCUIT PLACUTE FRANA",
        "D/R CARLIG TRACTARE",  # tow hook = a part, not a service
        "D/R CAPAC TRACTARE BARA SPATE",
        "D/R NR INMATRICULARE",  # plate holder = a part
        "D/R ELECTROMOTOR CT 44 MKY",  # plate appended to a real operation
    )
    for text in real_operations:
        assert prefiltru_nul(text) is False
def test_zero_fals_pozitiv_pe_set_operatii_reale():
    """AC: zero false positives on a set of 20 real operations."""
    reale = (
        "REVIZIE",
        "SCHIMB ULEI MOTOR",
        "INLOCUIT PLACUTE FRANA FATA",
        "D/R BARA FATA",
        "VOPSIT USA DR FATA",
        "INLOCUIT FILTRU AER",
        "AERISIT INSTALATIE FRANA",
        "INLOCUIT AMORTIZOR SPATE",
        "ABSORBANT SOC BARA SPATE",
        "INLOCUIT CUREA DISTRIBUTIE",
        "REGLAT FARURI",
        "INLOCUIT BUJII",
        "REPARAT ARIPA FATA DR",
        "INLOCUIT DISCURI FRANA",
        "GRESAT PLANETARA",
        "INLOCUIT RULMENT ROATA",
        "MONTAT ANVELOPE",
        "INLOCUIT BATERIE",
        "DIAGNOZA COMPUTERIZATA",
        "INLOCUIT CONTACT PORNIRE",
    )
    for op in reale:
        verdict = prefiltru_nul(op)
        assert verdict is False, f"fals-pozitiv pe operatie reala: {op!r}"
def test_input_gol_nu_e_nul():
    """Empty string and None are expected to come back as not-NUL."""
    empty_inputs = ["", None]
    for value in empty_inputs:
        assert prefiltru_nul(value) is False  # type: ignore[arg-type]