feat(5.18): corpus k-NN exemple etichetate + seed real Haiku (17181 op)
Seed app/data/operatii-etichetate.json regenerat cu subagenti Haiku pe TOATE cele 17181 operatii distincte (ordine frecventa, 100%), inlocuind seed-ul Groq (3758). Validare Haiku vs Groq pe 157 op etichetate: la dezacorduri Haiku corect ~22/30, Groq ~0. Haiku prinde gunoiul ratat de Groq (ITP, chirie anvelope, nume piese fara actiune): NUL 2200 (12.8%) vs ~7.6% Groq; adaptare electronica OE-7 (nu OE-5), placute frana uzura OE-1 (nu OE-F avarie). US-001..006: prefiltru NUL determinist, etichetator offline, generator seed, seeder mapping_suggestions (in init_db, gated seed_operatii_enabled), embeddings indexeaza corpus etichetat, enrich NUL+kNN. Distributie seed: OE-1 80.1%, NUL 12.8%, OE-2 3.5%, restul rar (OE-4/3/7/8/R/I/5, AITLV, R-ODO). config: seed_operatii_enabled=True + embeddings_enabled=True implicit (SILVER populat + sugestii semantice; ambele suggestion-only, dezactivabile prin env). Suita: 1387 passed, 1 deselected (live). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
133
tests/test_enrich_corpus_etichetat.py
Normal file
133
tests/test_enrich_corpus_etichetat.py
Normal file
@@ -0,0 +1,133 @@
|
||||
"""US-006 (PRD 5.18) — enrich_suggestions = pre-filtru NUL + k-NN pe corpus etichetat.
|
||||
|
||||
Ordinea de precedenta: pre-filtru NUL -> (daca NUL: fara cod) altfel GOLD partajat >
|
||||
exact (SILVER) > k-NN embeddings. k-NN sub prag -> abtinere. Vecin k-NN NUL -> supresie.
|
||||
Invariant #13: nimic din asta nu intra in resolve_prestatii/load_mapping.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture()
def env(monkeypatch, tmp_path):
    """Prepare an isolated application environment for a single test.

    Sets ``AUTOPASS_DB_PATH`` to a file inside pytest's per-test temporary
    directory, sets ``AUTOPASS_WEB_AUTH_REQUIRED`` to ``"false"`` and
    ``AUTOPASS_EMBEDDINGS_ENABLED`` to ``"true"``, clears the cached settings
    and calls ``init_db()``.

    The database lives under ``tmp_path`` (managed and cleaned up by pytest)
    instead of a ``tempfile.mkdtemp()`` directory that nothing ever removed.

    Yields:
        The ``monkeypatch`` fixture, so tests can apply further patches.
    """
    monkeypatch.setenv("AUTOPASS_DB_PATH", str(tmp_path / "us006.db"))
    monkeypatch.setenv("AUTOPASS_WEB_AUTH_REQUIRED", "false")
    monkeypatch.setenv("AUTOPASS_EMBEDDINGS_ENABLED", "true")

    # Imported only after the environment is patched; presumably the settings
    # are read from these variables -- TODO confirm against app.config.
    from app.config import get_settings

    # Drop any cached settings so the next lookup is computed afresh.
    get_settings.cache_clear()

    from app.db import init_db

    init_db()

    yield monkeypatch

    # Clear the cache again so this test's settings are not reused later.
    get_settings.cache_clear()
|
||||
|
||||
|
||||
@pytest.fixture()
def conn(env):
    """Yield a connection from ``app.db.get_connection`` and close it afterwards.

    Requests the ``env`` fixture so that fixture's setup runs before the
    connection is opened.
    """
    from app.db import get_connection

    connection = get_connection()

    yield connection

    connection.close()
|
||||
|
||||
|
||||
def _silver(conn, denumire_norm, cod, is_nul=0):
    """Insert one ``llm_seed`` row into ``mapping_suggestions`` and commit.

    Args:
        conn: Connection object exposing ``execute`` and ``commit``.
        denumire_norm: Value stored in ``denumire_normalizata``.
        cod: Value stored in ``cod_prestatie``.
        is_nul: Value stored in ``is_nul`` (defaults to ``0``).

    The statement is ``INSERT OR IGNORE``; ``source`` is always ``'llm_seed'``
    and ``confidence`` is always ``0.7``.
    """
    sql = (
        "INSERT OR IGNORE INTO mapping_suggestions "
        "(denumire_normalizata, cod_prestatie, is_nul, source, confidence) "
        "VALUES (?, ?, ?, 'llm_seed', 0.7)"
    )
    params = (denumire_norm, cod, is_nul)
    conn.execute(sql, params)
    conn.commit()
|
||||
|
||||
|
||||
def _mock_embedding(monkeypatch, cod, sim, is_nul=False):
    """Replace ``has_corpus`` and ``suggest_nearest`` in ``app.embeddings`` with stubs.

    After this call ``has_corpus()`` returns ``True`` and ``suggest_nearest``
    returns a one-element list holding a dict with the keys ``cod``,
    ``is_nul`` and ``similaritate`` set from the arguments.

    Args:
        monkeypatch: pytest ``monkeypatch`` fixture used to apply the patches.
        cod: Value placed under the ``cod`` key of the stubbed result.
        sim: Value placed under the ``similaritate`` key.
        is_nul: Value placed under the ``is_nul`` key (defaults to ``False``).
    """
    import app.embeddings as emb

    def fake_has_corpus():
        return True

    def fake_suggest_nearest(text, top_k=1):
        # The inputs are ignored: every query gets the same canned result.
        return [{"cod": cod, "is_nul": is_nul, "similaritate": sim}]

    monkeypatch.setattr(emb, "has_corpus", fake_has_corpus)
    monkeypatch.setattr(emb, "suggest_nearest", fake_suggest_nearest)
|
||||
|
||||
|
||||
def test_prefiltru_nul_supreseaza_inainte_de_knn(conn, monkeypatch):
    """The NUL pre-filter must short-circuit before the k-NN lookup is made.

    The stubbed embedding lookup WOULD suggest a code, so a passing test shows
    the suggestion was suppressed and the lookup was never called.
    """
    knn_calls = []

    def spy(text, top_k=1):
        # Record the call, then answer with a high-similarity suggestion.
        knn_calls.append(text)
        return [{"cod": "OE-1", "is_nul": False, "similaritate": 0.99}]

    import app.embeddings as emb

    monkeypatch.setattr(emb, "has_corpus", lambda: True)
    monkeypatch.setattr(emb, "suggest_nearest", spy)

    # Imported after patching, in the same order as the original test.
    from app.mapping import enrich_suggestions

    result = enrich_suggestions(conn, "13 X ITP")

    assert result["sugestie_principala"] is None  # non-operation -> no code
    assert result["surse"]["nul"] is True
    assert not knn_calls  # k-NN was not even queried
|
||||
|
||||
|
||||
def test_precedenta_gold_exact_embedding(conn, monkeypatch):
    """Check precedence: shared GOLD > exact SILVER > embedding.

    All three sources are given different codes for the same operation; the
    higher-priority sources are then deleted one at a time and the main
    suggestion is checked after each step.
    """
    from app.shared_store import record_human_validation
    from app.mapping import enrich_suggestions, normalize_for_match

    operation = "OPERATIE DE TEST UNICA"
    normalized = normalize_for_match(operation)

    # Each source proposes a different code.
    record_human_validation(conn, operation, "OE-1")  # shared GOLD
    _silver(conn, normalized, "OE-2")  # exact SILVER
    _mock_embedding(monkeypatch, "OE-3", 0.99)  # embedding
    conn.commit()

    with_gold = enrich_suggestions(conn, operation)
    expected_gold = {"cod_prestatie": "OE-1", "sursa": "gold_partajat"}
    assert with_gold["sugestie_principala"] == expected_gold

    # Without GOLD -> SILVER wins.
    conn.execute("DELETE FROM shared_mappings")
    conn.commit()
    with_silver = enrich_suggestions(conn, operation)
    assert with_silver["sugestie_principala"]["sursa"] == "silver"
    assert with_silver["sugestie_principala"]["cod_prestatie"] == "OE-2"

    # Without GOLD and without SILVER -> the embedding wins.
    conn.execute("DELETE FROM mapping_suggestions")
    conn.commit()
    with_embedding = enrich_suggestions(conn, operation)
    expected_embedding = {"cod_prestatie": "OE-3", "sursa": "embedding"}
    assert with_embedding["sugestie_principala"] == expected_embedding
|
||||
|
||||
|
||||
def test_prag_similaritate(conn, monkeypatch):
    """A neighbour just above ``EMB_MIN_SIMILARITATE`` yields an embedding code."""
    from app.mapping import enrich_suggestions, EMB_MIN_SIMILARITATE

    just_above = EMB_MIN_SIMILARITATE + 0.01
    _mock_embedding(monkeypatch, "OE-3", just_above)

    result = enrich_suggestions(conn, "CEVA NEVAZUT")

    assert result["surse"]["embedding"] == "OE-3"
|
||||
|
||||
|
||||
def test_abtinere_sub_prag(conn, monkeypatch):
    """A neighbour just below ``EMB_MIN_SIMILARITATE`` makes the enricher abstain."""
    from app.mapping import enrich_suggestions, EMB_MIN_SIMILARITATE

    just_below = EMB_MIN_SIMILARITATE - 0.01
    _mock_embedding(monkeypatch, "OE-3", just_below)

    result = enrich_suggestions(conn, "CEVA NEVAZUT")

    assert result["surse"]["embedding"] is None  # below threshold -> abstain
    assert result["sugestie_principala"] is None
|
||||
|
||||
|
||||
def test_vecin_knn_nul_supreseaza(conn, monkeypatch):
    """A NUL k-NN neighbour above the threshold suppresses the suggestion."""
    from app.mapping import enrich_suggestions

    # The stubbed neighbour is flagged NUL, carries no code, similarity 0.99.
    _mock_embedding(monkeypatch, None, 0.99, is_nul=True)

    result = enrich_suggestions(conn, "CEVA CARE SEAMANA CU GUNOI")
    sources = result["surse"]

    assert sources["embedding"] is None  # NUL -> produces no code
    assert sources["nul"] is True
    assert result["sugestie_principala"] is None
|
||||
|
||||
|
||||
def test_invariant_13_resolve_neatins(conn):
    """Regression #13: a populated SILVER row must NOT auto-resolve in resolve_prestatii."""
    from app.mapping import resolve_prestatii, normalize_for_match

    operation = "OPERATIE X"
    _silver(conn, normalize_for_match(operation), "OE-1")

    rows = [{"cod_op_service": operation, "denumire": operation}]
    resolved, unmapped = resolve_prestatii(rows, mapping={}, valid_codes={"OE-1"})

    # The row stays unmapped; the SILVER code is not picked up.
    assert resolved[0]["cod_prestatie"] is None
    assert unmapped and unmapped[0]["cod_op_service"] == operation
|
||||
Reference in New Issue
Block a user