feat(5.18): corpus k-NN exemple etichetate + seed real Haiku (17181 op)
Seed app/data/operatii-etichetate.json regenerat cu subagenti Haiku pe TOATE cele 17181 operatii distincte (ordine frecventa, 100%), inlocuind seed-ul Groq (3758). Validare Haiku vs Groq pe 157 op etichetate: la dezacorduri Haiku corect ~22/30, Groq ~0. Haiku prinde gunoiul ratat de Groq (ITP, chirie anvelope, nume piese fara actiune): NUL 2200 (12.8%) vs ~7.6% Groq; adaptare electronica OE-7 (nu OE-5), placute frana uzura OE-1 (nu OE-F avarie). US-001..006: prefiltru NUL determinist, etichetator offline, generator seed, seeder mapping_suggestions (in init_db, gated seed_operatii_enabled), embeddings indexeaza corpus etichetat, enrich NUL+kNN. Distributie seed: OE-1 80.1%, NUL 12.8%, OE-2 3.5%, restul rar (OE-4/3/7/8/R/I/5, AITLV, R-ODO). config: seed_operatii_enabled=True + embeddings_enabled=True implicit (SILVER populat + sugestii semantice; ambele suggestion-only, dezactivabile prin env). Suita: 1387 passed, 1 deselected (live). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
150
tests/test_embeddings_corpus_etichetat.py
Normal file
150
tests/test_embeddings_corpus_etichetat.py
Normal file
@@ -0,0 +1,150 @@
|
||||
"""US-005 (PRD 5.18) — embeddings indexeaza corpusul etichetat (NU nomenclatorul).
|
||||
|
||||
k-NN peste exemple reale etichetate (denumire_normalizata -> cod, is_nul) e net mai
|
||||
precis decat peste cele 18 categorii generice. Acopera si simetria corpus/query (F1):
|
||||
corpusul e text NORMALIZAT, deci query-ul trebuie normalizat la fel inainte de embedding.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
class MockBackend:
    """Deterministic mock embedding backend for tests.

    Every text is turned into a 27-slot character histogram: slots 0-25 count
    the ASCII letters A-Z (case-insensitively), and the final slot counts every
    other character. The same text always produces the same vector, which keeps
    similarity between texts stable from run to run.
    """

    # Number of per-letter slots; the slot right after them collects the rest.
    _LETTERS = 26

    def embed(self, texts):
        """Return one histogram vector (a list of 27 floats) per item of ``texts``."""
        vectors = []
        for text in texts:
            histogram = [0.0] * (self._LETTERS + 1)
            for ch in text.upper():
                # Only plain A-Z get a slot of their own; digits, whitespace,
                # punctuation and non-ASCII letters share the last slot.
                slot = ord(ch) - ord("A") if "A" <= ch <= "Z" else self._LETTERS
                histogram[slot] += 1.0
            vectors.append(histogram)
        return vectors
|
||||
|
||||
|
||||
@pytest.fixture()
def env(monkeypatch, tmp_path):
    """Configure the app for an isolated test run and initialise the database.

    Sets the ``AUTOPASS_*`` environment variables (database path, web auth off,
    embeddings on), clears the cached settings so they are re-read, and calls
    ``init_db()``. Yields the ``monkeypatch`` fixture; the settings cache is
    cleared again on teardown.

    The database file lives under pytest's managed ``tmp_path`` rather than a
    hand-made ``tempfile.mkdtemp()`` directory, which was never removed and so
    leaked one directory per test.
    """
    monkeypatch.setenv("AUTOPASS_DB_PATH", str(tmp_path / "us005.db"))
    monkeypatch.setenv("AUTOPASS_WEB_AUTH_REQUIRED", "false")
    monkeypatch.setenv("AUTOPASS_EMBEDDINGS_ENABLED", "true")  # US-005 needs embeddings ON

    # Imported only after the environment is patched, as in the original flow.
    from app.config import get_settings

    get_settings.cache_clear()

    from app.db import init_db

    init_db()
    yield monkeypatch
    # Drop the cached settings so they are re-read after this test.
    get_settings.cache_clear()
|
||||
|
||||
|
||||
@pytest.fixture()
def conn(env):
    """Yield an open database connection and close it once the test is done.

    Requests ``env`` so the connection is only obtained after the environment
    has been configured and the database initialised.
    """
    from app.db import get_connection

    connection = get_connection()
    yield connection
    connection.close()
|
||||
|
||||
|
||||
def _inject_mock_engine():
    """Install an ``EmbeddingEngine`` backed by ``MockBackend`` as ``app.embeddings._engine``.

    Returns the ``app.embeddings`` module so the caller can use its functions.

    NOTE(review): the module attribute is overwritten directly and nothing in
    this helper puts the previous value back afterwards.
    """
    import app.embeddings as embeddings_module

    mock_engine = embeddings_module.EmbeddingEngine(backend=MockBackend())
    embeddings_module._engine = mock_engine
    return embeddings_module
|
||||
|
||||
|
||||
def _seed_silver(conn, rows):
    """Insert labelled rows into ``mapping_suggestions`` and commit.

    ``rows`` is a list of ``(denumire_normalizata, cod, is_nul)`` tuples. Each
    row is stored with source ``'llm_seed'`` and confidence ``0.7``. The
    statement is ``INSERT OR IGNORE``, so rows that conflict with existing
    ones are skipped rather than raising.
    """
    insert_sql = (
        "INSERT OR IGNORE INTO mapping_suggestions "
        "(denumire_normalizata, cod_prestatie, is_nul, source, confidence) VALUES (?, ?, ?, 'llm_seed', 0.7)"
    )
    conn.executemany(insert_sql, rows)
    conn.commit()
|
||||
|
||||
|
||||
def test_corpus_din_mapping_suggestions(conn):
    """The indexed corpus consists of the texts seeded into mapping_suggestions."""
    labelled_rows = [
        ("SCHIMB ULEI MOTOR", "OE-3", 0),
        ("INLOCUIT PLACUTE FRANA", "OE-1", 0),
        ("13 X ITP", None, 1),
    ]
    embeddings_module = _inject_mock_engine()
    _seed_silver(conn, labelled_rows)

    from app.mapping import ensure_embeddings_corpus

    ensure_embeddings_corpus(conn)
    assert embeddings_module.has_corpus()

    # The indexed texts must be exactly the seeded names from
    # mapping_suggestions, NOT entries taken from nomenclator_rar.
    indexed_texts = set()
    for item in embeddings_module._engine._corpus_items:
        indexed_texts.add(item["denumire"])
    assert indexed_texts == {"SCHIMB ULEI MOTOR", "INLOCUIT PLACUTE FRANA", "13 X ITP"}
|
||||
|
||||
|
||||
def test_suggest_nearest_intoarce_is_nul(conn):
    """suggest_nearest reports the nearest neighbour's ``is_nul`` flag and code."""
    embeddings_module = _inject_mock_engine()
    _seed_silver(
        conn,
        [
            ("SCHIMB ULEI MOTOR", "OE-3", 0),
            ("13 X ITP", None, 1),
        ],
    )

    from app.mapping import ensure_embeddings_corpus

    ensure_embeddings_corpus(conn)

    # A NUL neighbour is the signal used to suppress a suggestion.
    nul_hits = embeddings_module.suggest_nearest("13 X ITP", top_k=1)
    assert nul_hits
    assert nul_hits[0]["is_nul"] is True

    labelled_hits = embeddings_module.suggest_nearest("SCHIMB ULEI MOTOR", top_k=1)
    assert labelled_hits
    best = labelled_hits[0]
    assert best["is_nul"] is False
    assert best["cod"] == "OE-3"
|
||||
|
||||
|
||||
def test_semnatura_corpus_pe_seed(conn):
    """The corpus signature is stable for unchanged data and changes when a row is added."""
    embeddings_module = _inject_mock_engine()
    _seed_silver(conn, [("SCHIMB ULEI MOTOR", "OE-3", 0)])

    from app.mapping import ensure_embeddings_corpus

    ensure_embeddings_corpus(conn)
    initial_signature = embeddings_module.corpus_signature()
    assert initial_signature is not None

    # Calling again with no data change must leave the signature untouched.
    ensure_embeddings_corpus(conn)
    unchanged_signature = embeddings_module.corpus_signature()
    assert unchanged_signature == initial_signature

    # One more labelled row must produce a different signature.
    _seed_silver(conn, [("INLOCUIT BATERIE", "OE-1", 0)])
    ensure_embeddings_corpus(conn)
    updated_signature = embeddings_module.corpus_signature()
    assert updated_signature != initial_signature
|
||||
|
||||
|
||||
def test_query_normalizat_ca_si_corpusul(conn, monkeypatch):
    """F1 (HIGH): enrich_suggestions queries suggest_nearest with the NORMALIZED text.

    ``has_corpus`` and ``suggest_nearest`` on ``app.embeddings`` are replaced
    with stubs; the stub records the text it receives so the test can compare
    it with ``normalize_for_match`` of the raw input.
    """
    import app.embeddings as emb

    captura = {}
    monkeypatch.setattr(emb, "has_corpus", lambda: True)

    def fake_suggest(text, top_k=1):
        # Record the query text exactly as enrich_suggestions passed it in.
        captura["text"] = text
        return [{"cod": "OE-3", "is_nul": False, "similaritate": 0.99}]

    monkeypatch.setattr(emb, "suggest_nearest", fake_suggest)

    # Imported after patching, matching the original statement order.
    from app.mapping import enrich_suggestions, normalize_for_match

    enrich_suggestions(conn, "Schimb Uleiul Motor")

    # Fail with a readable message instead of a bare KeyError when the stub
    # was never invoked.
    assert "text" in captura, "enrich_suggestions never called suggest_nearest"

    # The corpus holds denumire_normalizata, so the query must be normalized
    # the same way.
    assert captura["text"] == normalize_for_match("Schimb Uleiul Motor")
    assert captura["text"] == "SCHIMB ULEIUL MOTOR"
|
||||
|
||||
|
||||
def test_degradare_gratioasa_pastrata(conn):
    """A backend that raises must not make ensure_embeddings_corpus or enrich_suggestions raise.

    NOTE(review): ``app.embeddings._engine`` is overwritten directly here and
    is not restored by this test.
    """
    import app.embeddings as embeddings_module

    class BrokenBackend:
        """Backend whose ``embed`` always fails."""

        def embed(self, texts):
            raise RuntimeError("model indisponibil")

    broken_engine = embeddings_module.EmbeddingEngine(backend=BrokenBackend())
    embeddings_module._engine = broken_engine
    _seed_silver(conn, [("SCHIMB ULEI MOTOR", "OE-3", 0)])

    from app.mapping import ensure_embeddings_corpus, enrich_suggestions

    # Neither call may propagate the backend's RuntimeError.
    ensure_embeddings_corpus(conn)
    result = enrich_suggestions(conn, "SCHIMB ULEI")
    assert "sugestie_principala" in result
|
||||
Reference in New Issue
Block a user