Files
rar-autopass/tests/test_embeddings_corpus_etichetat.py
Claude Agent 756f77730f feat(5.18): corpus k-NN exemple etichetate + seed real Haiku (17181 op)
Seed app/data/operatii-etichetate.json regenerat cu subagenti Haiku pe TOATE
cele 17181 operatii distincte (ordine frecventa, 100%), inlocuind seed-ul Groq
(3758). Validare Haiku vs Groq pe 157 op etichetate: la dezacorduri Haiku corect
~22/30, Groq ~0. Haiku prinde gunoiul ratat de Groq (ITP, chirie anvelope, nume
piese fara actiune): NUL 2200 (12.8%) vs ~7.6% Groq; adaptare electronica OE-7
(nu OE-5), placute frana uzura OE-1 (nu OE-F avarie).

US-001..006: prefiltru NUL determinist, etichetator offline, generator seed,
seeder mapping_suggestions (in init_db, gated seed_operatii_enabled), embeddings
indexeaza corpus etichetat, enrich NUL+kNN. Distributie seed: OE-1 80.1%, NUL
12.8%, OE-2 3.5%, restul rar (OE-4/3/7/8/R/I/5, AITLV, R-ODO).

config: seed_operatii_enabled=True + embeddings_enabled=True implicit (SILVER
populat + sugestii semantice; ambele suggestion-only, dezactivabile prin env).

Suita: 1387 passed, 1 deselected (live).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 06:59:15 +00:00

151 lines
5.0 KiB
Python

"""US-005 (PRD 5.18) — embeddings indexeaza corpusul etichetat (NU nomenclatorul).
k-NN peste exemple reale etichetate (denumire_normalizata -> cod, is_nul) e net mai
precis decat peste cele 18 categorii generice. Acopera si simetria corpus/query (F1):
corpusul e text NORMALIZAT, deci query-ul trebuie normalizat la fel inainte de embedding.
"""
from __future__ import annotations
import math
import os
import tempfile
import pytest
class MockBackend:
    """Deterministic mock embedding backend.

    Each text is embedded as a character histogram, so similarity between
    texts is stable from run to run. The vector has 27 slots: one per
    letter A-Z (case-insensitive) plus a final slot that counts every
    other character.
    """

    def embed(self, texts):
        """Return one 27-element histogram vector per entry in *texts*."""
        return [self._histogram(text) for text in texts]

    @staticmethod
    def _histogram(text):
        """Count the upper-cased characters of *text* into 27 float buckets."""
        counts = [0.0] * 27
        for ch in text.upper():
            # A-Z map to slots 0..25; anything else lands in the catch-all slot 26.
            slot = ord(ch) - 65 if "A" <= ch <= "Z" else 26
            counts[slot] += 1.0
        return counts
@pytest.fixture()
def env(monkeypatch):
    """Prepare an isolated application environment for one test.

    Points ``AUTOPASS_DB_PATH`` at a database file inside a fresh temporary
    directory, sets ``AUTOPASS_WEB_AUTH_REQUIRED`` to ``"false"`` and
    ``AUTOPASS_EMBEDDINGS_ENABLED`` to ``"true"``, clears the cached settings
    so the new variables are picked up, and runs ``init_db()``.

    Yields:
        The ``monkeypatch`` fixture, so tests can apply further patches.

    On teardown the module-level ``app.embeddings._engine`` is restored to the
    value it had before the test (tests in this module overwrite it with mock
    or failing backends), the settings cache is cleared again, and the
    temporary directory is deleted.
    """
    # TemporaryDirectory (rather than mkdtemp) removes the directory on teardown,
    # so test runs do not accumulate stray database directories.
    with tempfile.TemporaryDirectory() as tmp:
        monkeypatch.setenv("AUTOPASS_DB_PATH", os.path.join(tmp, "us005.db"))
        monkeypatch.setenv("AUTOPASS_WEB_AUTH_REQUIRED", "false")
        monkeypatch.setenv("AUTOPASS_EMBEDDINGS_ENABLED", "true")  # US-005 needs embeddings ON

        from app.config import get_settings

        get_settings.cache_clear()

        from app.db import init_db

        init_db()

        import app.embeddings as emb

        # Remember the engine global so a mock/broken engine installed by a
        # test does not leak into tests that run afterwards.
        saved_engine = getattr(emb, "_engine", None)
        try:
            yield monkeypatch
        finally:
            emb._engine = saved_engine
            get_settings.cache_clear()
@pytest.fixture()
def conn(env):
    """Yield a connection from ``app.db.get_connection()`` and close it afterwards.

    Depends on ``env`` so the connection is opened only after the environment
    has been configured and ``init_db()`` has run.
    """
    import app.db as db

    connection = db.get_connection()
    yield connection
    connection.close()
def _inject_mock_engine():
    """Install an ``EmbeddingEngine`` backed by ``MockBackend`` as the module engine.

    Returns:
        The ``app.embeddings`` module, so callers can use its public helpers
        and inspect the injected engine.
    """
    import app.embeddings as emb

    # Overwrite the module-level engine with one using the deterministic mock.
    mock_engine = emb.EmbeddingEngine(backend=MockBackend())
    emb._engine = mock_engine
    return emb
def _seed_silver(conn, rows):
    """Insert labelled examples into ``mapping_suggestions`` and commit.

    Args:
        conn: Database connection exposing ``executemany`` and ``commit``.
        rows: Iterable of ``(denumire_normalizata, cod, is_nul)`` tuples.

    Every row is stored with source ``'llm_seed'`` and confidence ``0.7``.
    ``INSERT OR IGNORE`` is used, so rows that conflict with existing ones
    are skipped instead of raising.
    """
    insert_sql = (
        "INSERT OR IGNORE INTO mapping_suggestions "
        "(denumire_normalizata, cod_prestatie, is_nul, source, confidence) "
        "VALUES (?, ?, ?, 'llm_seed', 0.7)"
    )
    conn.executemany(insert_sql, rows)
    conn.commit()
def test_corpus_din_mapping_suggestions(conn):
    """The embeddings corpus is built from the rows seeded into mapping_suggestions."""
    emb = _inject_mock_engine()
    labelled_rows = [
        ("SCHIMB ULEI MOTOR", "OE-3", 0),
        ("INLOCUIT PLACUTE FRANA", "OE-1", 0),
        ("13 X ITP", None, 1),
    ]
    _seed_silver(conn, labelled_rows)

    from app.mapping import ensure_embeddings_corpus

    ensure_embeddings_corpus(conn)
    assert emb.has_corpus()

    # Indexed corpus = the names from mapping_suggestions, NOT from nomenclator_rar.
    indexed_names = set()
    for item in emb._engine._corpus_items:
        indexed_names.add(item["denumire"])
    assert indexed_names == {"SCHIMB ULEI MOTOR", "INLOCUIT PLACUTE FRANA", "13 X ITP"}
def test_suggest_nearest_intoarce_is_nul(conn):
    """suggest_nearest reports the ``is_nul`` flag (and code) of the nearest labelled example."""
    emb = _inject_mock_engine()
    seeded = [
        ("SCHIMB ULEI MOTOR", "OE-3", 0),
        ("13 X ITP", None, 1),
    ]
    _seed_silver(conn, seeded)

    from app.mapping import ensure_embeddings_corpus

    ensure_embeddings_corpus(conn)

    # A NUL neighbour is the signal used to suppress a suggestion.
    nul_hits = emb.suggest_nearest("13 X ITP", top_k=1)
    assert nul_hits
    assert nul_hits[0]["is_nul"] is True

    real_hits = emb.suggest_nearest("SCHIMB ULEI MOTOR", top_k=1)
    assert real_hits
    best = real_hits[0]
    assert best["is_nul"] is False
    assert best["cod"] == "OE-3"
def test_semnatura_corpus_pe_seed(conn):
    """The corpus signature stays the same while the seed is unchanged and changes when a row is added."""
    emb = _inject_mock_engine()
    _seed_silver(conn, [("SCHIMB ULEI MOTOR", "OE-3", 0)])

    from app.mapping import ensure_embeddings_corpus

    ensure_embeddings_corpus(conn)
    initial_signature = emb.corpus_signature()
    assert initial_signature is not None

    # Calling again without any change keeps the same signature (no re-index).
    ensure_embeddings_corpus(conn)
    unchanged_signature = emb.corpus_signature()
    assert unchanged_signature == initial_signature

    # Adding a row must produce a different signature.
    _seed_silver(conn, [("INLOCUIT BATERIE", "OE-1", 0)])
    ensure_embeddings_corpus(conn)
    updated_signature = emb.corpus_signature()
    assert updated_signature != initial_signature
def test_query_normalizat_ca_si_corpusul(conn, monkeypatch):
    """F1 (HIGH): enrich_suggestions queries suggest_nearest with the NORMALIZED text.

    ``has_corpus`` and ``suggest_nearest`` on ``app.embeddings`` are replaced
    with stubs; the stub records every query text it receives. The test then
    checks that the last recorded query equals ``normalize_for_match`` of the
    raw input, because the corpus holds normalized names and the query must
    be normalized the same way.
    """
    import app.embeddings as emb

    seen_queries = []

    def fake_suggest(text, top_k=1):
        # Record what enrich_suggestions actually passed as the query.
        seen_queries.append(text)
        return [{"cod": "OE-3", "is_nul": False, "similaritate": 0.99}]

    monkeypatch.setattr(emb, "has_corpus", lambda: True)
    monkeypatch.setattr(emb, "suggest_nearest", fake_suggest)

    from app.mapping import enrich_suggestions

    enrich_suggestions(conn, "Schimb Uleiul Motor")

    # Fail with a readable message (instead of a KeyError/IndexError) when the
    # k-NN lookup was never reached.
    assert seen_queries, "enrich_suggestions did not call suggest_nearest"

    from app.mapping import normalize_for_match

    last_query = seen_queries[-1]
    assert last_query == normalize_for_match("Schimb Uleiul Motor")
    assert last_query == "SCHIMB ULEIUL MOTOR"
def test_degradare_gratioasa_pastrata(conn):
    """A backend that raises must not make ensure_embeddings_corpus or enrich_suggestions raise."""
    import app.embeddings as emb

    class BrokenBackend:
        """Backend whose ``embed`` always fails."""

        def embed(self, texts):
            raise RuntimeError("model indisponibil")

    failing_engine = emb.EmbeddingEngine(backend=BrokenBackend())
    emb._engine = failing_engine
    _seed_silver(conn, [("SCHIMB ULEI MOTOR", "OE-3", 0)])

    from app.mapping import ensure_embeddings_corpus, enrich_suggestions

    # Neither call may propagate the backend error.
    ensure_embeddings_corpus(conn)
    result = enrich_suggestions(conn, "SCHIMB ULEI")
    assert "sugestie_principala" in result