feat(5.18): corpus k-NN exemple etichetate + seed real Haiku (17181 op)

Seed app/data/operatii-etichetate.json regenerat cu subagenti Haiku pe TOATE
cele 17181 operatii distincte (ordine frecventa, 100%), inlocuind seed-ul Groq
(3758). Validare Haiku vs Groq pe 157 op etichetate: la dezacorduri Haiku corect
~22/30, Groq ~0. Haiku prinde gunoiul ratat de Groq (ITP, chirie anvelope, nume
piese fara actiune): NUL 2200 (12.8%) vs ~7.6% Groq; adaptare electronica OE-7
(nu OE-5), placute frana uzura OE-1 (nu OE-F avarie).

US-001..006: prefiltru NUL determinist, etichetator offline, generator seed,
seeder mapping_suggestions (in init_db, gated seed_operatii_enabled), embeddings
indexeaza corpus etichetat, enrich NUL+kNN. Distributie seed: OE-1 80.1%, NUL
12.8%, OE-2 3.5%, restul rar (OE-4/3/7/8/R/I/5, AITLV, R-ODO).

config: seed_operatii_enabled=True + embeddings_enabled=True implicit (SILVER
populat + sugestii semantice; ambele suggestion-only, dezactivabile prin env).

Suita: 1387 passed, 1 deselected (live).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Claude Agent
2026-06-29 06:59:15 +00:00
parent c05fa00007
commit 756f77730f
17 changed files with 139308 additions and 44 deletions

View File

@@ -117,11 +117,21 @@ class Settings(BaseSettings):
enforce_plans: bool = True
# --- Embeddings (sugestie mapare, Stratul 2 PRD 5.14) ---
# DEZACTIVAT implicit: prima folosire lazy-load-eaza modelul fastembed/ONNX
# (~230MB pe disc) sincron in thread-ul de cerere -> hang la prima cerere /mapari.
# Activeaza explicit in productie (start.sh/Docker/.env) cand vrei sugestii semantice.
# OFF pastreaza suita de teste rapida si /mapari instant (cade pe GOLD/SILVER+fuzzy).
embeddings_enabled: bool = False
# ACTIVAT implicit: editorul de mapari ofera sugestii semantice (model fastembed/ONNX).
# Cost: prima folosire lazy-load-eaza modelul (~230MB pe disc) sincron in thread-ul de
# cerere -> prima cerere /mapari poate dura 30-120s pana modelul intra in memorie; cererile
# urmatoare sunt instant. SUGGESTION-ONLY: nu intra in resolve_prestatii (nu auto-trimite).
# Pune-l pe False (start.sh/Docker/.env: AUTOPASS_EMBEDDINGS_ENABLED=false) cand vrei
# /mapari instant la prima cerere sau suita de teste rapida (cade pe GOLD/SILVER+fuzzy).
embeddings_enabled: bool = True
# --- Seed corpus operatii etichetate (SILVER, PRD 5.18 US-004) ---
# ACTIVAT implicit: la init_db, populeaza mapping_suggestions din artefactul comis
# `app/data/operatii-etichetate.json` (INSERT OR IGNORE). Asa SILVER nu mai e gol in
# productie -> sugestii exact-match + corpus k-NN reale. SUGGESTION-ONLY.
# Pune-l pe False (AUTOPASS_SEED_OPERATII_ENABLED=false) cand vrei SILVER gol —
# conftest il dezactiveaza global, testele care-l vor il pornesc punctual.
seed_operatii_enabled: bool = True
@property
def rar_base_url(self) -> str:

File diff suppressed because it is too large Load Diff

View File

@@ -37,6 +37,13 @@ def init_db() -> None:
from .mapping import seed_nomenclator_if_empty
seed_nomenclator_if_empty(conn)
# Seed corpus operatii etichetate -> mapping_suggestions (SILVER, PRD 5.18 US-004).
# Gated: OFF in teste (conftest), ON in productie. INSERT OR IGNORE -> idempotent.
if get_settings().seed_operatii_enabled:
from .operatii_seed import seed_operatii_etichetate
seed_operatii_etichetate(conn)
conn.commit()
finally:
conn.close()

View File

@@ -135,10 +135,12 @@ class EmbeddingEngine:
denumire: str,
top_k: int = 3,
) -> list[dict]:
"""Returneaza top_k vecini cosine [{cod, similaritate}].
"""Returneaza top_k vecini cosine [{cod, is_nul, similaritate}].
Returneaza [] daca backend-ul lipseste, corpus-ul e gol sau apare
orice exceptie (degradare gratioasa -- nu blocheaza ingestia).
`is_nul` (PRD 5.18 US-005): cand corpusul include exemple NUL (non-operatii),
un vecin NUL = semnal de SUPRESIE, nu cod. Default False pe corpusuri vechi
fara `is_nul` in itemi. Returneaza [] daca backend-ul lipseste, corpus-ul e gol
sau apare orice exceptie (degradare gratioasa -- nu blocheaza ingestia).
"""
if not self.is_available() or not self._corpus_items:
return []
@@ -149,6 +151,7 @@ class EmbeddingEngine:
scored = [
{
"cod": item["cod"],
"is_nul": bool(item.get("is_nul", False)),
"similaritate": _cosine_similarity(query_vec, vec),
}
for item, vec in zip(self._corpus_items, self._corpus_vecs)

View File

@@ -16,6 +16,7 @@ from __future__ import annotations
import hashlib
import json
import re
import unicodedata
from typing import Any
@@ -49,6 +50,60 @@ def normalize_for_match(value: object) -> str:
return " ".join(s.upper().split())
# --------------------------------------------------------------------------- #
# Pre-filtru determinist non-operatii (NUL) — US-001 PRD 5.18 #
# --------------------------------------------------------------------------- #
#
# Masuratoarea k-NN (memorie test-precizie-knn-embeddings) arata recall NUL doar
# 64%: gunoiul evident (ITP, plata, discount, nr. inmatriculare, tractare) scapa
# semantic ca OE-1. Un pre-filtru text/regex il marcheaza NUL INAINTE de k-NN.
#
# Garantie: ZERO fals-pozitiv pe operatii reale. Regulile au fost calibrate pe
# `docs/operatii-service/*.csv` (toate aparitiile distincte). Triggerele NEambigue
# (ITP, ACHITAT/PLATA, DISCOUNT/REDUCERE, TAXA) sunt neconditionate (0 FP masurat).
# Triggerele AMBIGUE (TRACTARE, NR INMATRICULARE + pattern placuta) apar si in
# operatii reale ("D/R CARLIG TRACTARE", "D/R ELECTROMOTOR CT 44 MKY") -> sunt
# ECRANATE de un context de piesa/operatie (`_NUL_CTX_PIESA`).
# Unambiguous triggers: regexes searched against the normalized text. A match on
# any of them marks the text as NUL without further screening.
# ITP as a standalone word, quantity forms such as "<digit> X ITP" / "X ITP",
# or ITP immediately followed by a period or comma.
_NUL_ITP = re.compile(r"(?:\bITP\b|\d\s*X\s*ITP|X\s*ITP\b|\bITP[.,])")
# Payment wording (whole words only).
_NUL_PLATA = re.compile(r"\b(ACHITAT|ACHITARE|PLATA|PLATIT|PLATIRE)\b")
# Discount / price-reduction wording (whole words only).
_NUL_DISCOUNT = re.compile(r"\b(DISCOUNT|REDUCERE)\b")
# The standalone word TAXA (fee).
_NUL_TAXA = re.compile(r"\bTAXA\b")
# Ambiguous triggers: they count as NUL ONLY when no part/operation context
# (`_NUL_CTX_PIESA`) is present in the same text.
# Towing wording.
_NUL_TRACTARE = re.compile(r"\b(TRACTARE|TRACTARI)\b")
# Registration-number wording, or a plate-shaped token: one or two letters,
# two or three digits, three letters, with optional single spaces in between.
_NUL_NR_PLACUTA = re.compile(
r"(\bNR\s+INMATRICULARE\b|\bNUMAR\s+INMATRICULARE\b|\b[A-Z]{1,2}\s?\d{2,3}\s?[A-Z]{3}\b)"
)
# If any word below appears, TRACTARE / the plate pattern is part of a part name
# or of a real operation (tow hook / tow cap, plate holder, a plate number quoted
# alongside a repair), so the ambiguous triggers must not fire.
_NUL_CTX_PIESA = re.compile(
r"\b(D/R|D-R|CARLIG|CAPAC|BARA|PROTECTIE|MONTAT|MONTAJ|DEMONTAT|INLOCUIT|"
r"INLOCUIRE|REPARAT|REPARATIE|VOPSIT|SCHIMBAT|SUPORT)\b"
)
def prefiltru_nul(denumire: object) -> bool:
    """Return True when the text is an obvious non-operation (deterministic NUL).

    The input is first passed through `normalize_for_match`; an empty result is
    never NUL. The decision is then made purely from the module-level `_NUL_*`
    regexes (no database access):

    1. Any unambiguous trigger (ITP, payment, discount, fee) -> True.
    2. Otherwise, if a part/operation context word is present -> False.
    3. Otherwise, a towing or registration-plate trigger -> True.
    4. Nothing matched -> False.
    """
    normalized = normalize_for_match(denumire)
    if not normalized:
        return False

    # Unambiguous triggers fire unconditionally, without context screening.
    unconditional = (_NUL_ITP, _NUL_PLATA, _NUL_DISCOUNT, _NUL_TAXA)
    if any(pattern.search(normalized) for pattern in unconditional):
        return True

    # A part/operation context word shields the ambiguous triggers below.
    if _NUL_CTX_PIESA.search(normalized):
        return False

    return bool(_NUL_TRACTARE.search(normalized) or _NUL_NR_PLACUTA.search(normalized))
def suggest_codes(
denumire: object,
nomenclator: list[dict],
@@ -576,51 +631,58 @@ def delete_text_rule(conn, account_id: int | None, pattern: str) -> None:
EMB_MIN_SIMILARITATE = 0.5
def _corpus_signature(nomenclator: list[dict]) -> str:
"""Semnatura stabila a nomenclatorului pentru cache-ul corpusului embeddings.
def _corpus_signature_silver(rows: list) -> str:
"""Semnatura stabila a corpusului SILVER (mapping_suggestions) pentru cache.
Hash pe perechile (cod, denumire) sortate dupa cod -> se schimba la orice
add/remove/redenumire de cod, ramane stabila altfel (evita re-embed inutil).
Hash pe (denumire_normalizata, cod, is_nul) sortat -> se schimba la orice
add/remove/redenumire/relabel, ramane stabila altfel (evita re-embed inutil).
"""
pairs = sorted(
(str(n.get("cod_prestatie") or ""), str(n.get("nume_prestatie") or ""))
for n in nomenclator
triples = sorted(
(str(r["denumire_normalizata"] or ""), str(r["cod_prestatie"] or ""), int(r["is_nul"] or 0))
for r in rows
)
blob = "".join(f"{c}{d}" for c, d in pairs)
blob = "".join(f"{d}|{c}|{n}" for d, c, n in triples)
return hashlib.sha256(blob.encode("utf-8")).hexdigest()
def ensure_embeddings_corpus(conn, nomenclator: list[dict] | None = None) -> None:
"""Construieste/actualizeaza corpusul embeddings din nomenclator (Stratul 2 PRD 5.14).
"""Construieste/actualizeaza corpusul embeddings din corpusul ETICHETAT (PRD 5.18 US-005).
Gated pe `AUTOPASS_EMBEDDINGS_ENABLED` (default OFF): cand e dezactivat, e un
no-op total (nu atinge modelul, nu interogheaza nomenclatorul) -> /mapari instant
+ suita de teste rapida; sugestiile cad pe GOLD/SILVER + fuzzy.
Sursa corpusului = `mapping_suggestions` (SILVER): exemple reale etichetate
{denumire_normalizata -> cod, is_nul}, NU cele 18 categorii generice din
`nomenclator_rar`. k-NN peste exemple reale e net mai precis (94.3% acord LLM).
Parametrul `nomenclator` e pastrat pentru compatibilitatea apelantilor, dar nu mai
e folosit ca sursa.
Cand e activat: indexeaza corpusul {denumire=nume_prestatie, cod=cod_prestatie}
o singura data (lazy-load modelul ~230MB la prima chemare), re-indexeaza doar
cand semnatura nomenclatorului s-a schimbat. Degradare gratioasa: orice eroare
(model absent, embed esuat) lasa corpusul gol -> enrich_suggestions cade pe restul.
Gated pe `AUTOPASS_EMBEDDINGS_ENABLED` (default ON; OFF in teste): cand e
dezactivat, e un no-op total -> /mapari instant + suita de teste rapida.
Apelat de apelantii care imbogatesc sugestii (pending_unmapped,
_nemapate_pentru_submission) INAINTE de bucla de enrich_suggestions, NU din
enrich_suggestions (care ramane o interogare ieftina cu garda has_corpus()).
Cand e activat: indexeaza corpusul o singura data (lazy-load modelul ~230MB la
prima chemare), re-indexeaza doar cand semnatura corpusului SILVER s-a schimbat.
Itemii NUL (is_nul=1, cod NULL) raman in corpus: un vecin NUL e semnal de supresie
(US-006). Degradare gratioasa: orice eroare lasa corpusul gol -> enrich cade pe restul.
"""
from .config import get_settings
if not get_settings().embeddings_enabled:
return
try:
from . import embeddings as _emb
nomen = nomenclator if nomenclator is not None else load_nomenclator(conn)
if not nomen:
rows = conn.execute(
"SELECT denumire_normalizata, cod_prestatie, is_nul FROM mapping_suggestions"
).fetchall()
if not rows:
return
sig = _corpus_signature(nomen)
sig = _corpus_signature_silver(rows)
if _emb.corpus_signature() == sig and _emb.has_corpus():
return # deja indexat pe acelasi nomenclator -> nimic de facut
return # deja indexat pe acelasi corpus SILVER -> nimic de facut
items = [
{"denumire": str(n["nume_prestatie"]), "cod": str(n["cod_prestatie"])}
for n in nomen
if n.get("nume_prestatie") and n.get("cod_prestatie")
{
"denumire": str(r["denumire_normalizata"]),
"cod": (str(r["cod_prestatie"]) if r["cod_prestatie"] is not None else None),
"is_nul": bool(r["is_nul"]),
}
for r in rows
if r["denumire_normalizata"]
]
_emb.index_corpus(items, signature=sig)
except Exception:
@@ -641,26 +703,38 @@ def enrich_suggestions(
(Account GOLD = operations_mapping propriu = deja rezolvat inainte de needs_mapping;
nu apare in needs_mapping, deci nu e in precedenta de sugestie.)
Ordine completa (PRD 5.18 US-006):
pre-filtru NUL determinist -> (daca NUL: fara cod, `surse['nul']=True`)
altfel GOLD partajat > exact (SILVER) > k-NN embeddings.
Returneaza:
{
'sugestie_principala': {'cod_prestatie': str, 'sursa': str} | None,
'surse': {'gold_partajat': str|None, 'silver': str|None, 'embedding': str|None}
'surse': {'gold_partajat': str|None, 'silver': str|None, 'embedding': str|None, 'nul': bool}
}
INVARIANTE:
- Toate sursele = SUGGESTION-ONLY. NU intra in resolve_prestatii/load_mapping (#13).
- SILVER cu is_nul=1 (non-operatie/gunoi) NU produce sugestie (#4).
- Pre-filtru NUL (US-001) ruleaza PRIMUL: gunoiul evident (ITP/plata/discount...) e
marcat non-operatie INAINTE de k-NN, fara sugestie de cod.
- SILVER cu is_nul=1 (non-operatie/gunoi) NU produce sugestie (#4); vecin k-NN NUL idem.
- Degradare gratioasa pe embeddings (#16b): daca motorul nu e disponibil sau arunca,
returneaza sugestia disponibila din celelalte surse, fara exceptie.
- Import local shared_store/embeddings: evita ciclu la import-time (shared_store
importa normalize_for_match din mapping).
"""
sugestie_principala: dict | None = None
surse: dict = {"gold_partajat": None, "silver": None, "embedding": None}
surse: dict = {"gold_partajat": None, "silver": None, "embedding": None, "nul": False}
if not denumire:
return {"sugestie_principala": sugestie_principala, "surse": surse}
# 0. Pre-filtru NUL determinist (US-001) INAINTE de orice k-NN/lookup: non-operatie
# evidenta -> fara cod, scurtcircuit (nu interogheaza embeddings/SILVER pe gunoi).
if prefiltru_nul(denumire):
surse["nul"] = True
return {"sugestie_principala": None, "surse": surse}
# Colecteaza TOATE sursele (fara short-circuit) in `surse`: editorul le poate afisa
# toate, independent de care castiga ca sugestie principala.
# Precedenta Eng-F2 se aplica DOAR la alegerea sugestiei_principale.
@@ -693,10 +767,17 @@ def enrich_suggestions(
# ensure_embeddings_corpus (gated pe AUTOPASS_EMBEDDINGS_ENABLED); cand
# flagul e off, has_corpus() ramane False si calea e un no-op real.
if _emb.has_corpus():
nn = _emb.suggest_nearest(str(denumire), top_k=1)
# F1 (US-005): corpusul k-NN e text NORMALIZAT (denumire_normalizata),
# deci query-ul TREBUIE normalizat la fel — altfel cosine degradeaza si
# nu mai e configul sub care s-a masurat 94.3%.
nn = _emb.suggest_nearest(normalize_for_match(denumire), top_k=1)
# Prag minim: similaritate prea mica = sugestie inutila.
# Evita recomandari irelevante cand corpus-ul e mic/partial.
if nn and nn[0].get("similaritate", 0) >= EMB_MIN_SIMILARITATE:
if nn[0].get("is_nul"):
# Vecin NUL (non-operatie) = semnal de SUPRESIE, nu cod (US-006).
surse["nul"] = True
elif nn[0].get("cod"):
surse["embedding"] = str(nn[0]["cod"])
except Exception:
pass # degradare gratioasa (#16b): motorul absent nu blocheaza

59
app/operatii_seed.py Normal file
View File

@@ -0,0 +1,59 @@
"""Seeder corpus operatii etichetate -> mapping_suggestions (SILVER, PRD 5.18 US-004).
Artefactul `app/data/operatii-etichetate.json` e produs offline de
`tools/mapare-llm/genereaza_seed.py` (etichetare LM Studio, o singura data) si comis
in repo. La `init_db` il incarcam in `mapping_suggestions` cu INSERT OR IGNORE, ca
SILVER sa nu mai fie gol in productie (sugestii exact-match + corpus k-NN reale).
Format seed: [{denumire, denumire_normalizata, cod, is_nul, source, confidence}].
Reutilizeaza `shared_store.seed_suggestions` (normalizeaza cheia + impune NUL->cod NULL,
INSERT OR IGNORE). NB (F10): confirmarile UMANE stau in `shared_mappings`, NU aici —
deci INSERT OR IGNORE pastreaza codul LLM existent la re-seed (v1 = ignore, nu upsert).
SUGGESTION-ONLY (invariant #13): nimic din SILVER nu intra in resolve_prestatii/load_mapping.
"""
from __future__ import annotations
import json
import os
import sqlite3
from .shared_store import seed_suggestions
SEED_PATH = os.path.join(os.path.dirname(__file__), "data", "operatii-etichetate.json")
def load_seed_file(path: str = SEED_PATH) -> list[dict]:
    """Read the seed artifact and return its top-level JSON list.

    Degrades gracefully to an empty list when `path` is falsy, when the file
    cannot be opened or read (missing file, directory, permission problem),
    when the content is not valid UTF-8 JSON, or when the decoded top-level
    value is not a list.
    """
    if not path:
        return []
    try:
        # A missing or unreadable file surfaces as OSError from open(); malformed
        # JSON or undecodable bytes surface as ValueError subclasses.
        with open(path, encoding="utf-8") as handle:
            payload = json.load(handle)
    except (ValueError, OSError):
        return []
    if isinstance(payload, list):
        return payload
    return []
def seed_operatii_etichetate(conn: sqlite3.Connection, path: str = SEED_PATH) -> int:
    """Load the labelled-operations seed into `mapping_suggestions`.

    Reads the seed with `load_seed_file`, converts each entry to the shape passed
    to `seed_suggestions` (seed key `cod` becomes `cod_prestatie`) and delegates
    the insert to `seed_suggestions`.

    Args:
        conn: Open SQLite connection; it is only forwarded to `seed_suggestions`.
        path: Location of the seed JSON file (defaults to `SEED_PATH`).

    Returns:
        0 when the seed is missing, empty, or has no usable entry; otherwise the
        value returned by `seed_suggestions` (documented by this module as the
        number of inserted rows).

    Entries that are not dicts are ignored. Entries without a usable name
    (missing, empty or whitespace-only `denumire` and `denumire_normalizata`)
    are skipped, so a nameless junk row can never claim the empty key in
    `mapping_suggestions`.
    """
    raw = load_seed_file(path)
    if not raw:
        return 0

    items: list[dict] = []
    for entry in raw:
        if not isinstance(entry, dict):
            continue
        name = entry.get("denumire") or entry.get("denumire_normalizata") or ""
        # A blank name would normalize to an empty key; drop it here instead of
        # forwarding a row that carries no operation text.
        if not str(name).strip():
            continue
        items.append(
            {
                "denumire": name,
                "cod_prestatie": entry.get("cod"),
                "is_nul": bool(entry.get("is_nul")),
                "source": entry.get("source") or "llm_seed",
                "confidence": entry.get("confidence") or 0.0,
            }
        )

    if not items:
        return 0
    return seed_suggestions(conn, items)

View File

@@ -0,0 +1,294 @@
# PRD 5.18 — Corpus k-NN din exemple reale etichetate (mapare operatii service)
**Stare**: aprobat + revizuit /autoplan (2026-06-28; intrebari deschise rezolvate de user — vezi §5 Decizii;
cerinte user D4/D5 + 10 constatari Eng incorporate — vezi GSTACK REVIEW REPORT la final)
> Proces: `docs/ROADMAP.md` §5. Contract RAR: `docs/api-rar-contract.md`. Construieste peste
> infrastructura 5.14 (straturi GOLD/SILVER/embeddings, `app/embeddings.py`, `app/shared_store.py`,
> `mapping_suggestions`). NU re-deschide deciziile 5.14 (#11-#19); le foloseste.
## 0. Context si motivatie (de ce acest PRD)
5.14 a livrat embeddings in-proces, dar corpusul indexat = **cele 18 denumiri RAR generice**
din nomenclator (`nume_prestatie` -> `cod_prestatie`). O operatie reala ("inlocuit lubrifiant
la propulsor") se potriveste semantic slab cu etichete generice scurte ("INTRETINERE",
"REPARATIE"). In plus, stratul **SILVER (`mapping_suggestions`) e populat DOAR in teste**
— in productie e gol, deci nu produce nicio sugestie (LLM-ul nu e chemat la runtime).
Acest PRD muta corpusul de la cele 18 categorii la **operatiile reale etichetate** (k-NN peste
exemple): o operatie noua se potriveste semantic cu o operatie deja vazuta si MOSTENESTE codul ei.
**Masuratori care justifica directia** (vezi memorie `test-precizie-knn-embeddings`, rulat 2026-06-28):
- k-NN peste exemple etichetate: **94.3% acord cu LLM pe operatii distincte** (baseline "mereu OE-1" = 86.2%).
- Acoperire IEFTINA: pe volumul real total (155.195 aparitii, 17.181 operatii distincte):
148 operatii = 50% volum, **1.380 = 80%**, 4.368 = 90%, 9.422 = 95%.
- Punct slab masurat: **NUL recall 64%** (ITP/discount/plata scapa ca OE-1) -> de aici pre-filtrul (US-001).
- Etichetarea offline cu **Qwen3-4B local (LM Studio, GPU RX 6600M)** + prompt procedural in 3 pasi:
**91% pe batch greu, 20/20 pe batch de validare**, ambele NUL prinse. Debit ~1.5-2h pentru ~13.5k operatii.
## 1. Obiectiv
Inlocuieste corpusul embeddings (18 categorii generice) cu **corpusul de operatii reale etichetate**
(exemplu -> cod RAR), populat dintr-un seed comis in repo, plus un **pre-filtru determinist** pentru
non-operatii (NUL). Rezultat: sugestii de mapare semnificativ mai precise in editor, fara LLM la runtime.
**Pasul 1 (bootstrap offline, fundatia intregului PRD) = etichetare cu LLM via LM Studio local.**
Tot restul (seeder, corpus embeddings, enrich) consuma artefactul produs aici. Pasul are doua garantii
non-negociabile:
1. **LM Studio = backend implicit aprobat pentru rularea v1** (Qwen3-4B local, GPU RX 6600M, `json_schema`
strict — `json_object` e respins de LM Studio). Groq/OpenRouter raman fallback-uri interschimbabile, dar
NU sunt calea aprobata pentru bootstrap-ul v1 (vezi D4).
2. **Dedup INAINTE de orice apel LLM.** Cele 4 fisiere (`docs/operatii-service/*.csv`) contin **19.456 randuri
brute -> 17.181 operatii distincte dupa `normalize_for_match`** (gain de doar 254 fata de dedup exact-string,
pentru ca datele sunt deja majuscule, fara diacritice — `normalize_for_match` colapseaza spatii + scoate diacritice,
**NU** scoate punctuatie). Din cele 17.181, **3.662 sunt deja etichetate** (in spatiu normalizat) in
`labels-groq-partial.json`. Trimitem la LLM EXACT cele **13.519** operatii distincte ne-etichetate, niciodata un
duplicat normalizat, o cheie normalizata vida sau o operatie deja etichetata (vezi D5). Economie: **31% mai putine
apeluri** vs randuri brute. (Castigul real al pipeline-ului nu e atat normalizarea — 254 chei — cat **reuse-ul
etichetelor existente** + agregarea frecventei; motivul principal pentru spatiul normalizat e **consistenta
end-to-end cu cheia DB/k-NN**, vezi F1/F3 din review.)
## 2. Non-Goals (anti scope-creep)
- **NU auto-send peste GOLD propriu.** Toate sursele (k-NN, exact, NUL pre-filtru) raman SUGGESTION-ONLY,
niciodata in `resolve_prestatii`/`load_mapping` (invariant #13, #11 din 5.14). Singura cale spre `queued`
ramane `operations_mapping` (GOLD propriu confirmat de om).
- **NU LLM la runtime.** Etichetarea LLM se face O SINGURA DATA, offline; runtime = doar embeddings + exact + reguli.
- **NU validare temporala / re-etichetare automata.** Seedul e static; reimprospatarea e un re-run manual al tool-ului.
- **NU schimbare UI majora.** Editorul (`_mapari.html`) consuma deja `sugestie_principala`; doar sursa se schimba.
(Un badge optional de sursa = US-007, jos.)
- **NU eshantion etichetat de om in acest PRD** (doar mentionat la Riscuri ca recomandare — Decision #19).
## 3. Stories atomice
> Fiecare story = cea mai mica unitate care lasa sistemul functional. Refoloseste `mapping_suggestions`
> (SILVER) ca tabela-corpus (are deja: `denumire_normalizata`, `cod_prestatie`, `is_nul`, `source`,
> `confidence`) — populata acum si in productie, nu doar in teste.
### US-001: Pre-filtru determinist non-operatii (NUL)
**Ca** operator **vreau** ca gunoiul evident (ITP, plata, discount, nr. inmatriculare, tractare) sa fie
marcat NUL inainte de k-NN **pentru ca** masuratoarea arata recall NUL doar 64% (scapa ca OE-1).
- **Depinde de**: —
- **Fisiere**: `app/mapping.py` (functie noua `prefiltru_nul(denumire) -> bool`), `tests/test_prefiltru_nul.py` (~2 fisiere)
- **Test intai (RED)**: `tests/test_prefiltru_nul.py` — `test_itp_e_nul`, `test_plata_discount_nul`, `test_nr_inmatriculare_nul`, `test_operatie_reala_nu_e_nul`
- **Acceptance criteria**:
- [ ] Reguli text/regex deterministe (ITP, ACHITAT/PLATA, DISCOUNT/REDUCERE, NR INMATRICULARE + pattern placuta, TRACTARE, TAXA)
- [ ] `prefiltru_nul("13 X ITP")` / `("DISCOUNT FIDELITATE 10%")` -> True; `("INLOCUIT PLACUTE FRANA")` -> False
- [ ] Zero fals-pozitiv pe un set de 20 operatii reale (din `docs/operatii-service`)
- [ ] `python3 -m pytest tests/test_prefiltru_nul.py -q` verde
- **Verificare E2E**: — (pur backend, acoperit de teste)
### US-002: Etichetator offline multi-backend cu prompt procedural
**Ca** dezvoltator **vreau** un tool care eticheteaza operatii->coduri RAR via LM Studio local / Groq /
OpenRouter, cu prompt procedural in 3 pasi si `json_schema` strict **pentru ca** LM Studio respinge
`json_object` si promptul nou ridica precizia (91% vs 80%).
- **Depinde de**: —
- **Fisiere**: `tools/mapare-llm/eticheteaza.py` (NOU, backend-uri interschimbabile), `tests/test_eticheteaza_tool.py` (mock HTTP) (~2 fisiere)
- **Test intai (RED)**: `tests/test_eticheteaza_tool.py` — `test_construieste_prompt_3pasi`, `test_parseaza_json_schema`, `test_backend_selectabil_env`, `test_scrub_pii_inainte_de_request`
- **Acceptance criteria**:
- [ ] Backend selectabil prin env (`ETICHETARE_BACKEND=lmstudio|groq|openrouter`, endpoint+model configurabile);
**default = `lmstudio`** (backend-ul aprobat pentru bootstrap v1, D4). Groq/OpenRouter = fallback.
- [ ] `response_format` = `json_schema` strict cu **envelope complet** `{"type":"json_schema","json_schema":{"name":...,"strict":true,"schema":{...}}}`
(NU `{"type":"json_object"}` ca `or_common.py:57`/`label_common.py:24`); `cod` = **enum** peste cele 19 `ALL_LABELS` (18 + NUL),
cod invalid/lipsa -> `?` (F8 din review). Etichetatorul nou NU reutilizeaza request-ul vechi, doar promptul/codurile/scrub-ul.
- [ ] **Dezactiveaza explicit "thinking"-ul Qwen3** (`/no_think` sau reasoning off) — altfel modelul emite `<think>` si
umfla tokeni/latenta sub structured output strict (F8).
- [ ] **Garda de truncare**: daca raspunsul are mai putine iteme decat batch-ul sau JSON invalid -> log + marcheaza `?`
pe pozitiile lipsa, NU le ascunde tacit (la batch 40 + prompt 3 pasi, `n_ctx=4096` e stramt — F8).
- [ ] Promptul = procedura 3 pasi + ancore (mapare parte caroserie->OE-C etc.), versionat in fisier
- [ ] Scrub PII (nr. inmatriculare, VIN) inainte de orice request (refoloseste `or_common.scrub`, #3)
- [ ] Setari conservatoare documentate in tool (batch 32-40, `n_parallel=1`, `n_ctx=4096`) — vezi Riscuri
- [ ] `python3 -m pytest tests/test_eticheteaza_tool.py -q` verde (fara retea reala)
- **Verificare E2E**: rulare manuala 1 batch pe LM Studio local (`http://<tailscale>:1234`), confirmare JSON valid
### US-003: Generare seed etichetat in faze pe frecventa
**Ca** dezvoltator **vreau** sa generez un fisier seed `operatii-etichetate.json` (operatie->cod) pornind de la
operatiile existente + cele deja etichetate, in ordinea frecventei **pentru ca** 1.380 operatii prind 80% din volum.
- **Depinde de**: US-002
- **Fisiere**: `tools/mapare-llm/genereaza_seed.py` (NOU), `app/data/operatii-etichetate.json` (artefact comis), `tests/test_genereaza_seed.py` (~3 fisiere)
- **Test intai (RED)**: `tests/test_genereaza_seed.py` — `test_dedup_normalizat`, `test_zero_duplicate_trimis_la_llm`, `test_rerun_zero_apeluri_llm`, `test_reuse_conflict_determinist`, `test_skip_cheie_normalizata_vida`, `test_reuse_in_spatiu_normalizat`, `test_ordine_pe_frecventa`, `test_format_seed_valid`
- **Pipeline dedup (ordinea e obligatorie, INAINTE de orice apel LLM):**
1. Agrega cele 4 CSV-uri -> pentru fiecare rand `(denumire, NR)`. Parseaza NR tolerant (skip rand pe NR ne-numeric, nu zero-weight — F9).
2. `cheie = normalize_for_match(denumire)` — ACEEASI functie ca DB/k-NN (`app/mapping.py:40`), NU `.strip()` exact.
**Arunca randurile cu `cheie == ""`** (gunoi gen `"..."`, `" "`) inainte de dedup — altfel se bat pe slotul UNIQUE gol (F6).
3. Dedup pe cheie: un singur reprezentant per cheie, `freq = suma NR` pe toate aparitiile/fisierele.
4. Construieste **harta** `cheie_normalizata -> cod` (NU doar un set) din TOATE sursele de etichete deja existente:
`labels-groq-partial.json` (cheiat pe text BRUT) **PLUS seedul comis anterior** `operatii-etichetate.json` (cheiat normalizat).
Reuse + scaderea se fac in spatiu normalizat. **Rezolvare conflict determinista** cand acelasi `cheie` are coduri diferite
pe variante raw (masurat: 1 azi — `CURATAT CATALIZATOR` OE-2 vs OE-1): castiga varianta cu `freq` (suma NR) maxima, tie-break pe `cod` sortat (F3).
5. `de_etichetat = {cheie in corpus} - {cheie in harta etichete}`. Lista (distincta, ne-etichetata, sortata desc pe freq) = SINGURUL input catre LLM.
- **Acceptance criteria**:
- [ ] `test_zero_duplicate_trimis_la_llm` (within-run): backend LLM mock care inregistreaza fiecare denumire primita;
input cu duplicate intentionate (spatii/case + cross-file) -> mock-ul nu vede NICIODATA doua chei normalizate egale,
nicio cheie deja etichetata, nicio cheie vida.
- [ ] `test_rerun_zero_apeluri_llm` (cross-run, **criteriul real de idempotenta**, F2/F7): ruleaza tool-ul de doua ori cu acelasi
input; a doua rulare consuma seedul comis ca cache -> **0 apeluri LLM**, seed identic byte-cu-byte.
- [ ] `test_reuse_conflict_determinist` (F3/F7): doua variante raw ale aceleiasi chei cu coduri diferite -> codul ales e determinist (freq-max, tie-break cod).
- [ ] Dedup pe `normalize_for_match` (colapseaza spatii + diacritice, **NU** punctuatie; gain real ~254 chei vs exact-string —
valoarea principala e consistenta cu cheia DB/k-NN, nu volumul); NU reutiliza `or_common.corpus_by_freq()` ca atare (dedup exact-string).
- [ ] Eticheteaza DOAR ce lipseste, in ordine descrescatoare de frecventa, cu `--target-volum 0.9` (oprire la prag) sau `--all`
- [ ] Seed format `[{denumire, denumire_normalizata, cod, is_nul, source, confidence}]`, UTF-8, comis in repo;
`denumire_normalizata` unica + ne-vida in seed (oglindeste UNIQUE din `mapping_suggestions`; `test_format_seed_valid` asserta non-empty)
- [ ] `python3 -m pytest tests/test_genereaza_seed.py -q` verde
- **Verificare E2E**: rulare `--target-volum 0.5` pe date reale -> ~150 etichete noi, fisier valid; log-ul tool-ului
raporteaza explicit "{brute} randuri -> {distincte} dupa normalizare -> {de_etichetat} trimise la LLM"
### US-004: Seeder corpus etichetat in DB (mapping_suggestions)
**Ca** sistem **vreau** sa incarc seedul etichetat in `mapping_suggestions` la init (INSERT OR IGNORE)
**pentru ca** SILVER e gol in productie si trebuie populat ca sa dea sugestii exact-match + corpus k-NN.
- **Depinde de**: US-003
- **Fisiere**: `app/operatii_seed.py` (NOU, dupa modelul `nomenclator_seed.py`), `app/db.py` (apel la init), `tests/test_operatii_seed.py` (~3 fisiere)
- **Test intai (RED)**: `tests/test_operatii_seed.py` — `test_seed_populeaza_mapping_suggestions`, `test_insert_or_ignore_nu_clobber_uman`, `test_is_nul_din_seed`, `test_idempotent_la_reinit`
- **Acceptance criteria**:
- [ ] La `init_db`, daca seedul exista si tabela permite, INSERT OR IGNORE randurile (idempotenta re-seed: nu dubla / nu
clobber un rand seedat sau de embedding deja prezent). NB (F10): confirmarile UMANE stau in `shared_mappings`
(`record_human_validation`), NU in `mapping_suggestions` — deci INSERT OR IGNORE pastreaza TACIT codul LLM vechi la re-seed;
daca vrei refresh pe coduri LLM invechite, e decizie explicita upsert-vs-ignore (v1 = ignore)
- [ ] `is_nul=1` -> `cod_prestatie=NULL` (respecta CHECK-ul existent); `source='llm_seed'`, `confidence` din seed
- [ ] Idempotent: a doua initializare nu dubleaza si nu modifica randuri existente
- [ ] `python3 -m pytest tests/test_operatii_seed.py -q` verde
- **Verificare E2E**: pornire app pe DB gol -> `SELECT count(*) FROM mapping_suggestions` > 0
### US-005: Embeddings indexeaza corpusul etichetat (nu nomenclatorul)
**Ca** sistem **vreau** ca `ensure_embeddings_corpus` sa indexeze operatiile etichetate (denumire->cod, cu is_nul)
**pentru ca** k-NN peste exemple reale e net mai precis decat peste 18 categorii generice.
- **Depinde de**: US-004
- **Fisiere**: `app/mapping.py` (`ensure_embeddings_corpus` schimba sursa), `app/embeddings.py` (`suggest_nearest` intoarce si `is_nul`), `tests/test_embeddings_corpus_etichetat.py` (~3 fisiere)
- **Test intai (RED)**: `tests/test_embeddings_corpus_etichetat.py` — `test_corpus_din_mapping_suggestions`, `test_suggest_nearest_intoarce_is_nul`, `test_semnatura_corpus_pe_seed`, `test_degradare_gratioasa_pastrata`, `test_query_normalizat_ca_si_corpusul`
- **Acceptance criteria**:
- [ ] Corpusul = `mapping_suggestions` (denumire_normalizata -> cod, is_nul), NU `nomenclator_rar`
- [ ] **Simetrie corpus/query (F1, HIGH)**: corpusul e text `denumire_normalizata`; deci `enrich_suggestions` trebuie
sa interogheze `suggest_nearest(normalize_for_match(denumire), ...)`, NU `denumire` brut. Altfel corpus normalizat vs
query brut degradeaza cosine si NU e configul sub care s-a masurat 94.3%. `test_query_normalizat_ca_si_corpusul` o asserta.
- [ ] `suggest_nearest` intoarce `[{cod, is_nul, similaritate}]`; un vecin NUL -> semnal de supresie, nu cod
- [ ] Re-index doar la schimbarea semnaturii corpusului (cache pastrat, #16b degradare gratioasa neschimbata)
- [ ] Gated pe `AUTOPASS_EMBEDDINGS_ENABLED` (acum default True — vezi 5.14 CLOSE); off in teste (conftest)
- [ ] `python3 -m pytest tests/test_embeddings_corpus_etichetat.py -q` verde
- **Verificare E2E**: cu flag on + seed incarcat, `suggest_nearest("schimbat uleiul motor")` -> cod revizie/intretinere real
### US-006: enrich_suggestions = pre-filtru NUL + k-NN pe corpus etichetat
**Ca** operator **vreau** ca editorul sa imbine pre-filtrul NUL, exact-match si k-NN semantic in ordinea de
precedenta corecta **pentru ca** vreau sugestia cea mai buna fara junk.
- **Depinde de**: US-001, US-005
- **Fisiere**: `app/mapping.py` (`enrich_suggestions`), `tests/test_enrich_corpus_etichetat.py` (~2 fisiere)
- **Test intai (RED)**: `tests/test_enrich_corpus_etichetat.py` — `test_prefiltru_nul_supreseaza_inainte_de_knn`, `test_precedenta_gold_exact_embedding`, `test_prag_similaritate`, `test_abtinere_sub_prag`
- **Acceptance criteria**:
- [ ] Ordine: pre-filtru NUL -> daca NUL, fara sugestie de cod (marcat non-operatie); altfel GOLD partajat > exact (SILVER) > k-NN embeddings
- [ ] k-NN sub `EMB_MIN_SIMILARITATE` -> abtinere (`embedding=None`), nu sugestie incerta
- [ ] Vecin k-NN cu `is_nul=1` -> tratat ca supresie, nu cod (consecventa cu pre-filtrul)
- [ ] Invariant #13 pastrat: nimic din asta nu intra in `resolve_prestatii`/`load_mapping` (test de regresie)
- [ ] `python3 -m pytest tests/test_enrich_corpus_etichetat.py -q` verde + suita 5.14 (`test_mapare_integrare_l14.py`) ramane verde
- **Verificare E2E**: browser HTMX pe `/_fragments/mapari` — operatie parafraza primeste cod corect pre-selectat din k-NN
### US-007 (optional): Badge sursa sugestie in editor
**Ca** operator **vreau** sa vad de unde vine sugestia (confirmat de om / exemplu similar / non-operatie)
**pentru ca** acum nu pot distinge sursa si nu stiu cata incredere sa am.
- **Depinde de**: US-006
- **Fisiere**: `app/web/templates/_mapari.html`, `tests/test_web_badge_sursa.py` (~2 fisiere)
- **Test intai (RED)**: `tests/test_web_badge_sursa.py` — `test_badge_gold`, `test_badge_embedding`, `test_badge_nul`
- **Acceptance criteria**:
- [ ] Chip mic langa sugestie: "confirmat" (gold), "similar" (embedding/silver), "non-operatie" (NUL)
- [ ] Fara sursa -> fara chip; nu rupe layoutul 5.15/5.16
- [ ] `python3 -m pytest tests/test_web_badge_sursa.py -q` verde
- **Verificare E2E**: browser — chip vizibil si corect colorat pe randul de mapare
## 4. Riscuri
- **Calitate etichetare model local (Qwen3-4B Q4) < model mare (Groq 70b).** Masurat: bun pe cap (frecvent,
clar), mai slab pe coada rara/ambigua (ADAS calibrare, chei, "doar nume piesa"). Mitigare: pre-filtru NUL
(US-001) + optiunea unui al doilea pas de verificare cloud DOAR pe esantionul cu cod rar/incert.
- **Hardware GPU-box instabil sub sarcina (shutdown observat 2026-06-29).** La config-ul rulant erau ~4GB VRAM
liberi -> cauza probabil termica/alimentare, NU memorie. Mitigare OBLIGATORIE pentru pasul de etichetare:
`n_parallel=1`, `n_ctx=4096`, batch 32-40, monitorizare temperatura GPU. NU mari batch/context fara headroom termic.
- **Ground-truth = eticheta LLM, nu om.** 94.3% e ACORD cu LLM, nu acuratete reala; LLM impinge 86% in OE-1
(posibil prea agresiv). **Recomandare (Decision #19):** inainte de a creste increderea/orice auto-send, ruleaza
`heldout_eval.py` cu un esantion etichetat de OM. Ramane in afara scope-ului acestui PRD, dar e poarta pentru orice 5.x viitor de auto-send.
- **`mapping_suggestions` populat schimba comportamentul testelor** care presupuneau SILVER gol. Mitigare: seederul
ruleaza doar daca seedul exista; conftest poate dezactiva seedul in testele care nu-l vor (ca la embeddings).
- **Coada lunga ramane needs_mapping.** Chiar la 90% volum acoperit, 76% din operatiile DISTINCTE raman neetichetate
(frecventa 1). Asteptare corecta: bootstrap-ul reduce mult volumul, dar editorul uman ramane necesar pe coada.
- **(F1, review) Simetrie corpus/query la embeddings.** Corpusul k-NN devine text NORMALIZAT (`denumire_normalizata`),
deci query-ul TREBUIE normalizat la fel inainte de embedding (US-005 AC). Daca raman asimetrice (corpus normalizat,
query brut), similaritatea scade si nu mai e configul masurat (94.3%). Risc de regresie tacuta — acoperit de test in US-005.
- **(F2, review) Idempotenta cross-run a etichetarii.** Etichetele noi produse de o rulare trebuie sa devina cache pentru
urmatoarea (seedul comis = sursa de etichete, nu doar `labels-groq-partial.json`), altfel re-run-ul re-trimite tot la LLM.
Acoperit de `test_rerun_zero_apeluri_llm` (US-003).
## 5. Decizii (intrebari deschise rezolvate la aprobare, 2026-06-28)
> Erau intrebari deschise; rezolvate de user la poarta de aprobare PRD. Devin constrangeri de executie.
- **D1 — Tinta de acoperire la etichetare: 90% din volum** (`--target-volum 0.9`, ~4.368 operatii distincte).
Restul (coada lunga, 76% din operatiile distincte dar doar ~10% din volum) ramane pe editorul uman.
US-003 implementeaza exact acest default; `--all` ramane disponibil dar NU e calea aprobata pentru v1.
- **D2 — Verificare cloud pe esantionul incert: NU in acest PRD.** Toate sursele sunt suggestion-only (blast
radius mic: o sugestie gresita = omul alege altceva in editor). Pre-filtrul NUL (US-001) acopera punctul slab
masurat. Codurile rare/avarii grave sunt volum mic; un pas de verificare cloud adauga un backend in plus pentru
castig marginal. Se reia DOAR daca esantionul uman (Decision #19, vezi Riscuri) arata ca erorile pe coduri rare
sunt o problema reala. `source`/`confidence` din seed raman in DB pentru o eventuala flag-uire ulterioara.
- **D3 — Pastram exact-match (SILVER) separat de k-NN.** Exact-match (`lookup_suggestion` pe text normalizat) =
instant, 100% pe text identic; k-NN = generalizare semantica pentru texte nevazute. Precedenta confirmata:
**GOLD partajat > exact (SILVER) > k-NN embedding** (US-006). k-NN NU inlocuieste exact-match.
- **D4 — Bootstrap-ul v1 ruleaza pe LM Studio local** (Qwen3-4B, `json_schema` strict), nu pe Groq/OpenRouter.
Motiv: zero cost per-token, date pe hardware propriu (PII service local), masurat 91% pe batch greu + 20/20 validare.
Groq/OpenRouter raman in tool ca fallback interschimbabil (US-002), dar nu sunt calea aprobata pentru v1. Cerinta user, 2026-06-28.
- **D5 — Dedup pe `normalize_for_match` INAINTE de orice apel LLM, cu reuse in spatiu normalizat.** Nu se trimite la LLM
niciun duplicat normalizat si nicio operatie deja etichetata. Garantat de `test_zero_duplicate_trimis_la_llm` (within-run) +
`test_rerun_zero_apeluri_llm` (cross-run, idempotenta) — US-003.
Motiv: ~31% randuri redundante (19.456 brute -> 13.519 de etichetat: cross-file + variatii spatii + reuse labels existente);
fara dedup-ul corect platim apeluri LLM inutile si riscam etichete inconsistente pe acelasi text logic. Cerinta user, 2026-06-28.
## 6. Valuri de executie (graful de dependente)
```
PASUL 1 — BOOTSTRAP ETICHETE OFFLINE (LM Studio LLM) — fundatia, ruleaza prima:
Val 1: [US-002] [US-001] ← US-002 (etichetator LM Studio) = pasul 1; US-001 (pre-filtru NUL) paralel, fisiere disjuncte
Val 2: [US-003] ← deblocat de US-002: dedup normalizat -> trimite la LLM -> seed comis
PASUL 2 — CONSUM SEED (fara LLM):
Val 3: [US-004] ← deblocat de US-003 (owns schema/seed loader)
Val 4: [US-005] ← deblocat de US-004
Val 5: [US-006] ← deblocat de US-001 + US-005
Val 6: [US-007] (optional) ← deblocat de US-006
```
---
## Raport VERIFY
> Completat de subagentul verificator (context curat) in faza VERIFY — vezi ROADMAP §5.6.
> PASS/FAIL per criteriu, cu dovezi (output pytest citat, E2E pe RAR test). Lipseste pana la VERIFY.
---
<!-- AUTONOMOUS DECISION LOG -->
## GSTACK REVIEW REPORT (/autoplan — Eng focus, 2026-06-28)
Scope review: Eng (CEO premise gate + Eng dual-voice). Design/DX sarite (UI = doar badge optional US-007, tool intern mono-dezvoltator). Voce Eng: **subagent-only** — Codex a lovit limita de utilizare (degradare conform matricei).
**Premise confirmate** (poarta umana): (1) k-NN peste exemple reale > 18 categorii generice (94.3% vs 86.2% masurat); (2) etichetare LLM o singura data, offline, zero LLM la runtime; (3) SILVER populat in productie din seed comis; (4) pre-filtru NUL necesar (recall 64%); (5) LM Studio Qwen3-4B = calitate acceptabila pt bootstrap (91% batch greu / 20/20 validare).
**Cerinte user incorporate**: D4 (LM Studio = backend default v1), D5 (dedup pe `normalize_for_match` + reuse normalizat, INAINTE de LLM).
### Decision Audit Trail
| # | Faza | Decizie | Clasif. | Principiu | Rationament |
|---|------|---------|---------|-----------|-------------|
| 1 | CEO | Restructurare valuri: Pasul 1 = bootstrap LM Studio (US-002->US-003) | Mecanic | P1 | Cerinta user explicita; reflecta dependenta reala |
| 2 | Eng | F1: query embedding normalizat ca si corpusul (US-005 AC + test) | Mecanic | P5 | Corectitudine; altfel 94.3% nereproductibil. Blast radius (US-005) |
| 3 | Eng | F2: seed comis = cache de etichete cross-run (US-003 pipeline + `test_rerun_zero_apeluri_llm`) | Mecanic | P1 | Criteriul "0 apel LLM la re-run" altfel nesatisfiabil |
| 4 | Eng | F3: harta normalizat->cod cu tie-break determinist (freq-max) | Mecanic | P5 | 1 conflict real azi (CURATAT CATALIZATOR); altfel cod nedeterminist |
| 5 | Eng | F4/F5: corectie cifre (17.181 distinct, 13.519 de etichetat, 31%) + claim "fara punctuatie" | Mecanic | P5 | Cifre verificate cu `normalize_for_match` real |
| 6 | Eng | F6: arunca cheie normalizata vida inainte de dedup | Mecanic | P1 | Coliziune pe slot UNIQUE gol |
| 7 | Eng | F7: teste two-run + conflict adaugate | Mecanic | P1 | Testul single-run nu acopera idempotenta/determinismul |
| 8 | Eng | F8: envelope json_schema strict + enum cod + dezactivare thinking Qwen3 + garda truncare | Mecanic | P1 | Realism integrare LM Studio (cerinta user #1) |
| 9 | Eng | F9: parsare NR toleranta (skip, nu zero-weight) | Mecanic | P3 | Date curate azi; ieftina robustete |
| 10 | Eng | F10: re-justificare INSERT OR IGNORE (confirmari umane = shared_mappings) | Mecanic | P5 | Evita inducerea in eroare a unui maintainer |
Zero decizii de gust (taste) si zero user-challenge: toate constatarile au intarit directia user, nu au contrazis-o.

View File

@@ -18,6 +18,14 @@ import pytest
os.environ.setdefault("AUTOPASS_REQUIRE_API_KEY", "false")
os.environ.setdefault("AUTOPASS_WORKER_USE_TEST_CREDS", "false")
# Embeddings are ON by default in the app (config.py), but we keep them OFF in tests
# so the ~230MB model is not lazy-loaded by every test that touches the mapping editor
# (fast suite, no download in CI). The embeddings tests switch it on selectively.
os.environ.setdefault("AUTOPASS_EMBEDDINGS_ENABLED", "false")
# The labelled-operations seed (SILVER, PRD 5.18) is ON in the app but OFF in tests:
# many tests assume mapping_suggestions is EMPTY after init_db. The US-004/005/006 tests
# switch it on selectively (object.__setattr__ on settings or a direct seeder call).
os.environ.setdefault("AUTOPASS_SEED_OPERATII_ENABLED", "false")
@pytest.fixture(autouse=True)

View File

@@ -0,0 +1,150 @@
"""US-005 (PRD 5.18) — embeddings indexeaza corpusul etichetat (NU nomenclatorul).
k-NN peste exemple reale etichetate (denumire_normalizata -> cod, is_nul) e net mai
precis decat peste cele 18 categorii generice. Acopera si simetria corpus/query (F1):
corpusul e text NORMALIZAT, deci query-ul trebuie normalizat la fel inainte de embedding.
"""
from __future__ import annotations
import math
import os
import tempfile
import pytest
# Backend mock determinist: vector = histograma de caractere (similaritate stabila).
class MockBackend:
    """Deterministic embedding stub used instead of a real model in tests.

    Every text becomes a 27-slot histogram: slots 0-25 count the ASCII letters
    A-Z (after ``str.upper``), slot 26 counts every other character.
    """

    def embed(self, texts):
        """Return one 27-element list of float counts per input text."""
        vectors = []
        for text in texts:
            counts = [0.0] * 27
            for ch in text.upper():
                # Anything outside A-Z lands in the catch-all last slot.
                slot = ord(ch) - 65 if "A" <= ch <= "Z" else 26
                counts[slot] += 1.0
            vectors.append(counts)
        return vectors
@pytest.fixture()
def env(monkeypatch):
    """Point the app at a fresh temporary database with embeddings enabled.

    Sets the AUTOPASS_* environment variables, clears the cached settings,
    runs ``init_db()`` and yields the ``monkeypatch`` fixture. The settings
    cache is cleared again on teardown.
    """
    # NOTE(review): the directory from mkdtemp() is not removed by this fixture.
    workdir = tempfile.mkdtemp()
    db_file = os.path.join(workdir, "us005.db")
    monkeypatch.setenv("AUTOPASS_DB_PATH", db_file)
    monkeypatch.setenv("AUTOPASS_WEB_AUTH_REQUIRED", "false")
    # US-005 needs embeddings switched ON.
    monkeypatch.setenv("AUTOPASS_EMBEDDINGS_ENABLED", "true")

    from app.config import get_settings

    # Drop any cached settings so the variables set above are picked up.
    get_settings.cache_clear()

    from app.db import init_db

    init_db()
    yield monkeypatch
    get_settings.cache_clear()
@pytest.fixture()
def conn(env):
    """Yield a connection from ``app.db.get_connection`` and close it on teardown."""
    from app.db import get_connection

    connection = get_connection()
    yield connection
    connection.close()
def _inject_mock_engine():
    """Install an ``EmbeddingEngine`` backed by ``MockBackend`` as ``app.embeddings._engine``.

    Returns the ``app.embeddings`` module so the caller can query it directly.
    """
    import app.embeddings as emb

    # NOTE(review): plain module-attribute assignment; nothing here restores
    # the previous engine afterwards.
    emb._engine = emb.EmbeddingEngine(backend=MockBackend())
    return emb
def _seed_silver(conn, rows):
    """Insert labelled rows into ``mapping_suggestions`` and commit.

    rows = [(denumire_normalizata, cod, is_nul)]. Every row is stored with
    source 'llm_seed' and confidence 0.7; INSERT OR IGNORE is used, so a row
    that conflicts with an existing one is skipped.
    """
    insert_sql = (
        "INSERT OR IGNORE INTO mapping_suggestions "
        "(denumire_normalizata, cod_prestatie, is_nul, source, confidence) VALUES (?, ?, ?, 'llm_seed', 0.7)"
    )
    conn.executemany(insert_sql, rows)
    conn.commit()
def test_corpus_din_mapping_suggestions(conn):
    """The indexed corpus holds exactly the texts seeded into mapping_suggestions."""
    emb = _inject_mock_engine()
    seeded = [
        ("SCHIMB ULEI MOTOR", "OE-3", 0),
        ("INLOCUIT PLACUTE FRANA", "OE-1", 0),
        ("13 X ITP", None, 1),
    ]
    _seed_silver(conn, seeded)

    from app.mapping import ensure_embeddings_corpus

    ensure_embeddings_corpus(conn)
    assert emb.has_corpus()

    # Indexed corpus = the names from mapping_suggestions, NOT from nomenclator_rar.
    indexed = {item["denumire"] for item in emb._engine._corpus_items}
    expected = {"SCHIMB ULEI MOTOR", "INLOCUIT PLACUTE FRANA", "13 X ITP"}
    assert indexed == expected
def test_suggest_nearest_intoarce_is_nul(conn):
    """suggest_nearest reports ``is_nul`` for each neighbour, next to the code."""
    emb = _inject_mock_engine()
    corpus_rows = [
        ("SCHIMB ULEI MOTOR", "OE-3", 0),
        ("13 X ITP", None, 1),
    ]
    _seed_silver(conn, corpus_rows)

    from app.mapping import ensure_embeddings_corpus

    ensure_embeddings_corpus(conn)

    vecini_nul = emb.suggest_nearest("13 X ITP", top_k=1)
    # A NUL neighbour is a suppression signal, not a code.
    assert vecini_nul and vecini_nul[0]["is_nul"] is True

    vecini_cod = emb.suggest_nearest("SCHIMB ULEI MOTOR", top_k=1)
    assert vecini_cod and vecini_cod[0]["is_nul"] is False
    assert vecini_cod[0]["cod"] == "OE-3"
def test_semnatura_corpus_pe_seed(conn):
    """The corpus signature is stable across no-op calls and changes when a row is added."""
    emb = _inject_mock_engine()
    _seed_silver(conn, [("SCHIMB ULEI MOTOR", "OE-3", 0)])

    from app.mapping import ensure_embeddings_corpus

    ensure_embeddings_corpus(conn)
    initial = emb.corpus_signature()
    assert initial is not None

    # Calling again with no change keeps the same signature.
    ensure_embeddings_corpus(conn)
    assert emb.corpus_signature() == initial

    # Adding a row changes the signature.
    _seed_silver(conn, [("INLOCUIT BATERIE", "OE-1", 0)])
    ensure_embeddings_corpus(conn)
    assert emb.corpus_signature() != initial
def test_query_normalizat_ca_si_corpusul(conn, monkeypatch):
    """F1 (HIGH): enrich_suggestions queries suggest_nearest with the NORMALIZED text."""
    import app.embeddings as emb

    seen = {}

    def fake_suggest(text, top_k=1):
        # Record the query text that reaches the k-NN lookup.
        seen["text"] = text
        return [{"cod": "OE-3", "is_nul": False, "similaritate": 0.99}]

    monkeypatch.setattr(emb, "has_corpus", lambda: True)
    monkeypatch.setattr(emb, "suggest_nearest", fake_suggest)

    from app.mapping import enrich_suggestions

    raw = "Schimb Uleiul Motor"
    enrich_suggestions(conn, raw)

    # The corpus is denumire_normalizata, so the query must be normalized the same way.
    from app.mapping import normalize_for_match

    assert seen["text"] == normalize_for_match(raw)
    assert seen["text"] == "SCHIMB ULEIUL MOTOR"
def test_degradare_gratioasa_pastrata(conn, monkeypatch):
    """A backend that raises must not make ensure_embeddings_corpus or enrich_suggestions raise.

    The broken engine is installed with ``monkeypatch.setattr`` so the previous
    ``app.embeddings._engine`` is restored at teardown; a plain assignment would
    leave the failing engine in place for every test that runs afterwards.
    """
    import app.embeddings as emb
    from app.embeddings import EmbeddingEngine

    class BrokenBackend:
        """Backend whose embed() always fails."""

        def embed(self, texts):
            raise RuntimeError("model indisponibil")

    monkeypatch.setattr(emb, "_engine", EmbeddingEngine(backend=BrokenBackend()))
    _seed_silver(conn, [("SCHIMB ULEI MOTOR", "OE-3", 0)])

    from app.mapping import ensure_embeddings_corpus, enrich_suggestions

    ensure_embeddings_corpus(conn)  # must not raise
    out = enrich_suggestions(conn, "SCHIMB ULEI")  # must not raise
    assert "sugestie_principala" in out

View File

@@ -0,0 +1,133 @@
"""US-006 (PRD 5.18) — enrich_suggestions = pre-filtru NUL + k-NN pe corpus etichetat.
Ordinea de precedenta: pre-filtru NUL -> (daca NUL: fara cod) altfel GOLD partajat >
exact (SILVER) > k-NN embeddings. k-NN sub prag -> abtinere. Vecin k-NN NUL -> supresie.
Invariant #13: nimic din asta nu intra in resolve_prestatii/load_mapping.
"""
from __future__ import annotations
import os
import tempfile
import pytest
@pytest.fixture()
def env(monkeypatch):
    """Point the app at a fresh temporary database with embeddings enabled.

    Sets the AUTOPASS_* environment variables, clears the cached settings,
    runs ``init_db()`` and yields the ``monkeypatch`` fixture. The settings
    cache is cleared again on teardown.
    """
    # NOTE(review): the directory from mkdtemp() is not removed by this fixture.
    workdir = tempfile.mkdtemp()
    db_file = os.path.join(workdir, "us006.db")
    monkeypatch.setenv("AUTOPASS_DB_PATH", db_file)
    monkeypatch.setenv("AUTOPASS_WEB_AUTH_REQUIRED", "false")
    monkeypatch.setenv("AUTOPASS_EMBEDDINGS_ENABLED", "true")

    from app.config import get_settings

    # Drop any cached settings so the variables set above are picked up.
    get_settings.cache_clear()

    from app.db import init_db

    init_db()
    yield monkeypatch
    get_settings.cache_clear()
@pytest.fixture()
def conn(env):
    """Yield a connection from ``app.db.get_connection`` and close it on teardown."""
    from app.db import get_connection

    connection = get_connection()
    yield connection
    connection.close()
def _silver(conn, denumire_norm, cod, is_nul=0):
    """Insert one row into ``mapping_suggestions`` (source 'llm_seed', confidence 0.7) and commit.

    INSERT OR IGNORE is used, so a row that conflicts with an existing one is skipped.
    """
    insert_sql = (
        "INSERT OR IGNORE INTO mapping_suggestions "
        "(denumire_normalizata, cod_prestatie, is_nul, source, confidence) VALUES (?, ?, ?, 'llm_seed', 0.7)"
    )
    params = (denumire_norm, cod, is_nul)
    conn.execute(insert_sql, params)
    conn.commit()
def _mock_embedding(monkeypatch, cod, sim, is_nul=False):
    """Patch ``app.embeddings`` so k-NN always returns one fixed neighbour.

    ``has_corpus`` is forced to True and ``suggest_nearest`` returns a single
    entry built from ``cod``, ``is_nul`` and ``sim``, whatever the query text.
    """
    import app.embeddings as emb

    def _un_vecin(text, top_k=1):
        return [{"cod": cod, "is_nul": is_nul, "similaritate": sim}]

    monkeypatch.setattr(emb, "has_corpus", lambda: True)
    monkeypatch.setattr(emb, "suggest_nearest", _un_vecin)
def test_prefiltru_nul_supreseaza_inainte_de_knn(conn, monkeypatch):
    """The NUL pre-filter short-circuits before the k-NN lookup is ever queried."""
    import app.embeddings as emb

    # The embedding WOULD suggest a code; the spy records whether it was asked at all.
    apeluri = []

    def spion(text, top_k=1):
        apeluri.append(text)
        return [{"cod": "OE-1", "is_nul": False, "similaritate": 0.99}]

    monkeypatch.setattr(emb, "has_corpus", lambda: True)
    monkeypatch.setattr(emb, "suggest_nearest", spion)

    from app.mapping import enrich_suggestions

    out = enrich_suggestions(conn, "13 X ITP")
    assert out["sugestie_principala"] is None  # non-operation -> no code
    assert out["surse"]["nul"] is True
    assert apeluri == []  # k-NN was not even queried
def test_precedenta_gold_exact_embedding(conn, monkeypatch):
    """Precedence: shared GOLD > exact SILVER > k-NN embedding, checked by removing sources in turn."""
    from app.shared_store import record_human_validation
    from app.mapping import enrich_suggestions, normalize_for_match

    den = "OPERATIE DE TEST UNICA"
    cheie = normalize_for_match(den)

    # Each of the three sources proposes a different code.
    record_human_validation(conn, den, "OE-1")  # shared GOLD
    _silver(conn, cheie, "OE-2")  # exact SILVER
    _mock_embedding(monkeypatch, "OE-3", 0.99)  # embedding
    conn.commit()

    cu_gold = enrich_suggestions(conn, den)
    assert cu_gold["sugestie_principala"] == {"cod_prestatie": "OE-1", "sursa": "gold_partajat"}

    # Without GOLD, SILVER wins.
    conn.execute("DELETE FROM shared_mappings")
    conn.commit()
    cu_silver = enrich_suggestions(conn, den)
    assert cu_silver["sugestie_principala"]["sursa"] == "silver"
    assert cu_silver["sugestie_principala"]["cod_prestatie"] == "OE-2"

    # Without GOLD and without SILVER, the embedding wins.
    conn.execute("DELETE FROM mapping_suggestions")
    conn.commit()
    doar_embedding = enrich_suggestions(conn, den)
    assert doar_embedding["sugestie_principala"] == {"cod_prestatie": "OE-3", "sursa": "embedding"}
def test_prag_similaritate(conn, monkeypatch):
    """A neighbour just above EMB_MIN_SIMILARITATE is reported as the embedding source."""
    from app.mapping import enrich_suggestions, EMB_MIN_SIMILARITATE

    peste_prag = EMB_MIN_SIMILARITATE + 0.01
    _mock_embedding(monkeypatch, "OE-3", peste_prag)

    rezultat = enrich_suggestions(conn, "CEVA NEVAZUT")
    assert rezultat["surse"]["embedding"] == "OE-3"
def test_abtinere_sub_prag(conn, monkeypatch):
    """A neighbour just below EMB_MIN_SIMILARITATE yields no embedding and no main suggestion."""
    from app.mapping import enrich_suggestions, EMB_MIN_SIMILARITATE

    sub_prag = EMB_MIN_SIMILARITATE - 0.01
    _mock_embedding(monkeypatch, "OE-3", sub_prag)

    rezultat = enrich_suggestions(conn, "CEVA NEVAZUT")
    assert rezultat["surse"]["embedding"] is None  # below threshold -> abstain
    assert rezultat["sugestie_principala"] is None
def test_vecin_knn_nul_supreseaza(conn, monkeypatch):
    """A NUL neighbour above the threshold suppresses the suggestion instead of giving a code."""
    from app.mapping import enrich_suggestions

    # NUL neighbour with high similarity.
    _mock_embedding(monkeypatch, None, 0.99, is_nul=True)

    rezultat = enrich_suggestions(conn, "CEVA CARE SEAMANA CU GUNOI")
    surse = rezultat["surse"]
    assert surse["embedding"] is None  # NUL -> produces no code
    assert surse["nul"] is True
    assert rezultat["sugestie_principala"] is None
def test_invariant_13_resolve_neatins(conn):
    """Regression #13: a populated SILVER row does NOT auto-resolve in resolve_prestatii."""
    from app.mapping import resolve_prestatii, normalize_for_match

    _silver(conn, normalize_for_match("OPERATIE X"), "OE-1")

    operatii = [{"cod_op_service": "OPERATIE X", "denumire": "OPERATIE X"}]
    resolved, unmapped = resolve_prestatii(operatii, mapping={}, valid_codes={"OE-1"})

    # Stays unmapped; the SILVER code is NOT picked up.
    assert resolved[0]["cod_prestatie"] is None
    assert unmapped and unmapped[0]["cod_op_service"] == "OPERATIE X"

View File

@@ -0,0 +1,103 @@
"""US-002 (PRD 5.18) — etichetator offline multi-backend cu prompt procedural.
Toate testele ruleaza FARA retea reala (transport injectabil / inspectie body).
Acopera: prompt 3 pasi, envelope json_schema strict + enum, backend selectabil
prin env, scrub PII inainte de orice request, garda de truncare.
"""
from __future__ import annotations
# Numele pachetului `tools/mapare-llm` contine cratima -> nu e importabil ca modul.
# Incarcam fisierul direct prin importlib pe cale.
import importlib.util
import os
import sys
# Build the path to eticheteaza.py relative to this test file, then load it by path.
_PATH = os.path.join(os.path.dirname(__file__), "..", "tools", "mapare-llm", "eticheteaza.py")
_spec = importlib.util.spec_from_file_location("eticheteaza", _PATH)
eticheteaza = importlib.util.module_from_spec(_spec)
sys.modules["eticheteaza"] = eticheteaza # required for @dataclass introspection
_spec.loader.exec_module(eticheteaza)
def test_construieste_prompt_3pasi():
    """The built messages contain the 3-step system prompt and an enumerated user message."""
    msgs = eticheteaza.construieste_mesaje(["INLOCUIT PLACUTE FRANA"])
    assert isinstance(msgs, list) and msgs[0]["role"] == "system"

    # Named system_prompt (not `sys`) so the imported `sys` module is not shadowed.
    system_prompt = msgs[0]["content"].upper()

    # Explicit 3-step procedure.
    assert "PAS 1" in system_prompt and "PAS 2" in system_prompt and "PAS 3" in system_prompt
    # NUL rule + severe damage only on accident.
    assert "NUL" in system_prompt
    assert "ACCIDENT" in system_prompt

    # Qwen3 thinking disabled (the /no_think token appears somewhere in the messages).
    joined = " ".join(m["content"] for m in msgs)
    assert "/no_think" in joined

    # The user message enumerates the operations.
    assert "1." in msgs[1]["content"] and "INLOCUIT PLACUTE FRANA" in msgs[1]["content"]
def test_envelope_json_schema_strict_si_enum():
    """The request body carries a strict json_schema envelope whose `cod` is an enum of ALL_LABELS."""
    body = eticheteaza.construieste_body(["REVIZIE"], eticheteaza.get_backend("lmstudio"))

    # FULL envelope, NOT json_object.
    response_format = body["response_format"]
    assert response_format["type"] == "json_schema"
    envelope = response_format["json_schema"]
    assert envelope["strict"] is True
    assert "name" in envelope

    item_schema = envelope["schema"]["properties"]["rez"]["items"]
    # cod = enum over the 19 ALL_LABELS (18 codes + NUL).
    assert set(item_schema["properties"]["cod"]["enum"]) == set(eticheteaza.ALL_LABELS)
    assert len(eticheteaza.ALL_LABELS) == 19
    assert "NUL" in eticheteaza.ALL_LABELS

    # Temperature 0 and no extra properties allowed on items.
    assert body["temperature"] == 0
    assert item_schema["additionalProperties"] is False
def test_parseaza_raspuns_si_garda_truncare():
    """parseaza_raspuns orders codes by position and emits '?' for invalid or missing entries."""
    batch = ["A", "B", "C"]
    asteptat = len(batch)

    # Complete answer, shuffled order, one invalid code.
    complet = {
        "rez": [
            {"i": 2, "cod": "OE-1"},
            {"i": 1, "cod": "NUL"},
            {"i": 3, "cod": "INEXISTENT"},
        ]
    }
    # Invalid code -> '?', NOT hidden.
    assert eticheteaza.parseaza_raspuns(complet, asteptat) == ["NUL", "OE-1", "?"]

    # Truncated answer: position 3 is missing -> '?' for the gap, no error.
    trunchiat = {"rez": [{"i": 1, "cod": "OE-1"}, {"i": 2, "cod": "OE-2"}]}
    coduri = eticheteaza.parseaza_raspuns(trunchiat, asteptat)
    assert coduri == ["OE-1", "OE-2", "?"]
    assert len(coduri) == asteptat
def test_backend_selectabil_env(monkeypatch):
    """get_backend() honours ETICHETARE_BACKEND / _ENDPOINT / _MODEL and defaults to lmstudio."""
    # Default = lmstudio (approved v1 backend, D4).
    monkeypatch.delenv("ETICHETARE_BACKEND", raising=False)
    implicit = eticheteaza.get_backend()
    assert implicit.name == "lmstudio"

    # Selection through the environment.
    monkeypatch.setenv("ETICHETARE_BACKEND", "groq")
    ales = eticheteaza.get_backend()
    assert ales.name == "groq"

    # Endpoint + model are configurable through the environment.
    suprascrieri = (
        ("ETICHETARE_BACKEND", "lmstudio"),
        ("ETICHETARE_ENDPOINT", "http://exemplu:1234/v1/chat/completions"),
        ("ETICHETARE_MODEL", "qwen/qwen3-custom"),
    )
    for cheie, valoare in suprascrieri:
        monkeypatch.setenv(cheie, valoare)
    configurat = eticheteaza.get_backend()
    assert configurat.url == "http://exemplu:1234/v1/chat/completions"
    assert configurat.model == "qwen/qwen3-custom"
def test_scrub_pii_inainte_de_request(monkeypatch):
    """No licence plate reaches the transport: the payload is scrubbed before the call."""
    trimis = {}

    def fake_transport(url, headers, payload, timeout):
        # Keep the outgoing payload for inspection and answer with one valid label.
        trimis["payload"] = payload
        return {"choices": [{"message": {"content": '{"rez":[{"i":1,"cod":"OE-1"}]}'}}]}

    backend = eticheteaza.get_backend("lmstudio")
    codes, meta = eticheteaza.call(["VOPSIT USA B 123 ABC"], backend, transport=fake_transport)
    assert codes == ["OE-1"]

    mesaj_user = trimis["payload"]["messages"][1]["content"]
    assert "B 123 ABC" not in mesaj_user
    assert "[NR]" in mesaj_user
    assert meta["err"] is None

View File

@@ -0,0 +1,175 @@
"""US-003 (PRD 5.18) — generare seed etichetat in faze pe frecventa.
Pipeline dedup OBLIGATORIU inainte de orice apel LLM (D5):
brut -> normalize_for_match -> arunca chei vide -> dedup pe cheie (freq=suma NR)
-> reuse etichete existente (labels-groq + seed comis, conflict freq-max) -> de_etichetat.
Idempotenta cross-run (F2/F7): a doua rulare consuma seedul comis ca cache -> 0 apeluri LLM.
Toate testele FARA retea: `clasifica` e injectat (mock care inregistreaza ce primeste).
"""
from __future__ import annotations
import importlib.util
import json
import os
import sys
def _load(name: str):
    """Load ``tools/mapare-llm/<name>.py`` by file path and return the module object.

    The module is registered in ``sys.modules`` under ``name`` before its code
    is executed.
    """
    tools_dir = os.path.join(os.path.dirname(__file__), "..", "tools", "mapare-llm")
    source_path = os.path.join(tools_dir, f"{name}.py")
    spec = importlib.util.spec_from_file_location(name, source_path)
    module = importlib.util.module_from_spec(spec)
    sys.modules[name] = module
    spec.loader.exec_module(module)
    return module
# Module under test, loaded by path through _load().
gs = _load("genereaza_seed")
def _scrie_csv(path, randuri):
    """Write a `;`-separated, fully quoted CSV with a header row to ``path``.

    randuri = [(denumire, nr)]. Rows are numbered from 1 in the first column;
    the file ends with a trailing newline and is encoded as UTF-8.
    """
    header = '" ";"DENOP";"NR"'
    body = [f'"{idx}";"{den}";"{nr}"' for idx, (den, nr) in enumerate(randuri, start=1)]
    path.write_text("\n".join([header, *body]) + "\n", encoding="utf-8")
def _mock_recorder():
    """Return ``(clasifica, vazute)``: a stub classifier and the log of batches it received.

    ``clasifica`` answers "OE-1" for every item and appends a list copy of each
    incoming batch to ``vazute``.
    """
    received = []

    def clasifica(batch):
        snapshot = list(batch)
        received.append(snapshot)
        return ["OE-1" for _ in range(len(batch))]

    return clasifica, received
# --------------------------------------------------------------------------- #
def test_dedup_normalizat(tmp_path):
    """Rows differing only by case/whitespace collapse onto one key, summing NR across files."""
    primul = tmp_path / "a.csv"
    al_doilea = tmp_path / "b.csv"
    _scrie_csv(primul, [("REVIZIE", 10), ("D/R BARA FATA", 3)])
    # Same logical operation, different case + surrounding spaces.
    _scrie_csv(al_doilea, [(" revizie ", 5)])

    corpus = gs.agrega_corpus([str(primul), str(al_doilea)])

    assert "REVIZIE" in corpus
    assert corpus["REVIZIE"]["freq"] == 15  # 10 + 5, deduplicated by key
    assert len(list(corpus)) == 2  # REVIZIE + D/R BARA FATA
def test_skip_cheie_normalizata_vida(tmp_path):
    """A row whose name is only whitespace is left out of the corpus."""
    sursa = tmp_path / "a.csv"
    # First row has an empty key (spaces only).
    _scrie_csv(sursa, [(" ", 99), ("REVIZIE", 5)])

    corpus = gs.agrega_corpus([str(sursa)])

    assert "" not in corpus
    assert list(corpus) == ["REVIZIE"]
def test_ordine_pe_frecventa(tmp_path):
    """Operations reach the classifier in descending frequency order."""
    sursa = tmp_path / "a.csv"
    _scrie_csv(sursa, [("OP MICA", 5), ("OP MARE", 50), ("OP MEDIE", 20)])
    seed = tmp_path / "seed.json"
    clasifica, vazute = _mock_recorder()

    gs.genereaza(
        [str(sursa)],
        labels_path=None,
        seed_path=str(seed),
        etichetare_all=True,
        clasifica=clasifica,
        batch=32,
    )

    # The order in which the classifier saw the operations = descending frequency.
    assert vazute[0][:3] == ["OP MARE", "OP MEDIE", "OP MICA"]
def test_reuse_in_spatiu_normalizat(tmp_path):
    """An existing label is reused when its key matches after normalization."""
    sursa = tmp_path / "a.csv"
    _scrie_csv(sursa, [("Revizie", 10), ("SCHIMB ULEI", 5)])
    labels = tmp_path / "labels.json"
    # Keyed by raw text, which normalizes to the same key as "Revizie".
    labels.write_text(json.dumps({"REVIZIE": "OE-3"}), encoding="utf-8")
    seed = tmp_path / "seed.json"
    clasifica, vazute = _mock_recorder()

    gs.genereaza(
        [str(sursa)],
        labels_path=str(labels),
        seed_path=str(seed),
        etichetare_all=True,
        clasifica=clasifica,
    )

    trimise = {den for lot in vazute for den in lot}
    # Already labelled -> never sent to the classifier.
    assert "Revizie" not in trimise and "REVIZIE" not in trimise

    intrari = json.loads(seed.read_text(encoding="utf-8"))
    revizie = [e for e in intrari if e["denumire_normalizata"] == "REVIZIE"][0]
    assert revizie["cod"] == "OE-3"
def test_reuse_conflict_determinist(tmp_path):
    """Conflicting labels for one normalized key are resolved by the higher frequency."""
    sursa = tmp_path / "a.csv"
    # Two raw variants of the same key, labelled differently; frequency decides.
    _scrie_csv(sursa, [("CURATAT CATALIZATOR", 100), ("curatat catalizator", 5)])
    etichete = {
        "CURATAT CATALIZATOR": "OE-1",  # freq 100
        "curatat catalizator": "OE-2",  # freq 5
    }
    labels = tmp_path / "labels.json"
    labels.write_text(json.dumps(etichete), encoding="utf-8")
    seed = tmp_path / "seed.json"
    clasifica, _ = _mock_recorder()

    gs.genereaza(
        [str(sursa)],
        labels_path=str(labels),
        seed_path=str(seed),
        etichetare_all=True,
        clasifica=clasifica,
    )

    intrari = json.loads(seed.read_text(encoding="utf-8"))
    catalizator = [e for e in intrari if e["denumire_normalizata"] == "CURATAT CATALIZATOR"][0]
    assert catalizator["cod"] == "OE-1"  # freq-max wins (100 > 5)
def test_zero_duplicate_trimis_la_llm(tmp_path):
    """The classifier never sees a duplicate, empty or already-labelled normalized key."""
    primul = tmp_path / "a.csv"
    al_doilea = tmp_path / "b.csv"
    _scrie_csv(primul, [("REVIZIE", 10), (" revizie ", 4), ("OP NOUA", 7), (" ", 3)])
    _scrie_csv(al_doilea, [("REVIZIE", 2), ("OP NOUA", 1)])  # cross-file duplicates
    labels = tmp_path / "labels.json"
    # REVIZIE is already labelled.
    labels.write_text(json.dumps({"REVIZIE": "OE-3"}), encoding="utf-8")
    seed = tmp_path / "seed.json"
    clasifica, vazute = _mock_recorder()

    from app.mapping import normalize_for_match

    gs.genereaza(
        [str(primul), str(al_doilea)],
        labels_path=str(labels),
        seed_path=str(seed),
        etichetare_all=True,
        clasifica=clasifica,
    )

    trimise = [den for lot in vazute for den in lot]
    chei = [normalize_for_match(den) for den in trimise]
    assert len(chei) == len(set(chei))  # no normalized key sent twice
    assert "" not in chei  # no empty key
    assert "REVIZIE" not in chei  # no already-labelled key
    assert "OP NOUA" in chei  # only what is missing
def test_rerun_zero_apeluri_llm(tmp_path):
    """Idempotence criterion (F2/F7): the second run makes 0 classifier calls and writes an identical seed."""
    sursa = tmp_path / "a.csv"
    _scrie_csv(sursa, [("OP UNU", 10), ("OP DOI", 5)])
    seed = tmp_path / "seed.json"
    argumente = dict(labels_path=None, seed_path=str(seed), etichetare_all=True)

    clasifica1, vazute1 = _mock_recorder()
    gs.genereaza([str(sursa)], clasifica=clasifica1, **argumente)
    # The first run labels both operations.
    assert sum(len(lot) for lot in vazute1) == 2
    dupa_prima = seed.read_bytes()

    clasifica2, vazute2 = _mock_recorder()
    gs.genereaza([str(sursa)], clasifica=clasifica2, **argumente)
    # Second run: 0 classifier calls (the seed acts as cache).
    assert vazute2 == []
    dupa_a_doua = seed.read_bytes()
    assert dupa_prima == dupa_a_doua  # byte-for-byte identical seed
def test_format_seed_valid(tmp_path):
    """Seed entries have unique non-empty keys, the required fields, and cod=None exactly when NUL."""
    sursa = tmp_path / "a.csv"
    _scrie_csv(sursa, [("OP REALA", 10), ("13 X ITP", 5)])
    seed = tmp_path / "seed.json"

    def clasifica(batch):
        # Label anything containing ITP as NUL, everything else as OE-1.
        rezultat = []
        for den in batch:
            rezultat.append("NUL" if "ITP" in den.upper() else "OE-1")
        return rezultat

    gs.genereaza(
        [str(sursa)],
        labels_path=None,
        seed_path=str(seed),
        etichetare_all=True,
        clasifica=clasifica,
    )

    intrari = json.loads(seed.read_text(encoding="utf-8"))
    chei = [e["denumire_normalizata"] for e in intrari]
    assert len(chei) == len(set(chei))  # unique
    assert all(e["denumire_normalizata"] for e in intrari)  # non-empty

    campuri = {"denumire", "denumire_normalizata", "cod", "is_nul", "source", "confidence"}
    for e in intrari:
        assert set(e) >= campuri
        if e["is_nul"]:
            assert e["cod"] is None  # NUL -> cod is null
        else:
            assert e["cod"]

    primul_nul = [e for e in intrari if e["is_nul"]][0]
    assert "ITP" in primul_nul["denumire_normalizata"]

View File

@@ -272,14 +272,18 @@ def test_embeddings_functional_cand_flag_activ(conn, monkeypatch):
get_settings.cache_clear()
monkeypatch.setattr(emb_mod, "_engine", EmbeddingEngine(backend=_FakeEmbedBackend()))
# Nomenclatorul (din fixtura conn) are OE-1..OE-4; adaug coduri cu denumiri keyword.
# Corpusul sursa = mapping_suggestions (SILVER) -- PRD 5.18 US-005.
# (Inainte era nomenclator_rar; migrat la mapping_suggestions ca k-NN sa
# opereze pe exemple reale etichetate, nu pe categorii generice RAR.)
conn.execute(
"INSERT OR REPLACE INTO nomenclator_rar (cod_prestatie, nume_prestatie) VALUES (?, ?)",
("UL-1", "Schimb ulei"),
"INSERT OR REPLACE INTO mapping_suggestions "
"(denumire_normalizata, cod_prestatie, is_nul, source, confidence) VALUES (?, ?, ?, ?, ?)",
("Schimb ulei", "UL-1", 0, "llm", 0.95),
)
conn.execute(
"INSERT OR REPLACE INTO nomenclator_rar (cod_prestatie, nume_prestatie) VALUES (?, ?)",
("FR-1", "Placute frana"),
"INSERT OR REPLACE INTO mapping_suggestions "
"(denumire_normalizata, cod_prestatie, is_nul, source, confidence) VALUES (?, ?, ?, ?, ?)",
("Placute frana", "FR-1", 0, "llm", 0.95),
)
conn.commit()

113
tests/test_operatii_seed.py Normal file
View File

@@ -0,0 +1,113 @@
"""US-004 (PRD 5.18) — seeder corpus etichetat in mapping_suggestions (SILVER).
INSERT OR IGNORE din artefactul comis -> SILVER nu mai e gol in productie.
NB (F10): confirmarile UMANE stau in shared_mappings, NU aici; deci INSERT OR IGNORE
pastreaza codul LLM existent la re-seed (v1 = ignore, nu upsert).
"""
from __future__ import annotations
import json
import os
import tempfile
import pytest
@pytest.fixture()
def env(monkeypatch):
    """Point the app at a fresh temporary database and yield the temp directory path.

    Sets the AUTOPASS_* environment variables, clears the cached settings and
    runs ``init_db()``. The settings cache is cleared again on teardown.
    """
    # NOTE(review): the directory from mkdtemp() is not removed by this fixture.
    workdir = tempfile.mkdtemp()
    db_file = os.path.join(workdir, "us004.db")
    monkeypatch.setenv("AUTOPASS_DB_PATH", db_file)
    monkeypatch.setenv("AUTOPASS_WEB_AUTH_REQUIRED", "false")

    from app.config import get_settings

    # Drop any cached settings so the variables set above are picked up.
    get_settings.cache_clear()

    from app.db import init_db

    init_db()
    yield workdir
    get_settings.cache_clear()
@pytest.fixture()
def conn(env):
    """Yield a connection from ``app.db.get_connection`` and close it on teardown."""
    from app.db import get_connection

    connection = get_connection()
    yield connection
    connection.close()
def _scrie_seed(tmp, items) -> str:
p = os.path.join(tmp, "operatii-etichetate.json")
with open(p, "w", encoding="utf-8") as fh:
json.dump(items, fh, ensure_ascii=False)
return p
# Seed entry for a real operation: carries a concrete code and is_nul=False.
SEED_OE = {"denumire": "SCHIMB ULEI MOTOR", "denumire_normalizata": "SCHIMB ULEI MOTOR",
           "cod": "OE-3", "is_nul": False, "source": "llm_seed", "confidence": 0.7}
# Seed entry for a non-operation (NUL): no code, is_nul=True.
SEED_NUL = {"denumire": "13 X ITP", "denumire_normalizata": "13 X ITP",
            "cod": None, "is_nul": True, "source": "llm_seed", "confidence": 0.7}
def test_seed_populeaza_mapping_suggestions(env, conn):
    """Seeding one labelled operation inserts one row carrying the seed's code, source and confidence."""
    from app.operatii_seed import seed_operatii_etichetate

    seed_file = _scrie_seed(env, [SEED_OE])
    inserted = seed_operatii_etichetate(conn, seed_file)
    conn.commit()
    assert inserted == 1

    stored = conn.execute(
        "SELECT cod_prestatie, source, confidence FROM mapping_suggestions "
        "WHERE denumire_normalizata = 'SCHIMB ULEI MOTOR'"
    ).fetchone()
    assert stored["cod_prestatie"] == "OE-3"
    assert stored["source"] == "llm_seed"
    # Float column: compare with a tolerance rather than exact equality.
    assert abs(stored["confidence"] - 0.7) < 1e-9
def test_is_nul_din_seed(env, conn):
    """A NUL seed entry is stored with is_nul = 1 and a NULL code."""
    from app.operatii_seed import seed_operatii_etichetate

    seed_operatii_etichetate(conn, _scrie_seed(env, [SEED_NUL]))
    conn.commit()

    stored = conn.execute(
        "SELECT cod_prestatie, is_nul FROM mapping_suggestions WHERE denumire_normalizata = '13 X ITP'"
    ).fetchone()
    assert stored["is_nul"] == 1
    # Honours the table CHECK constraint (NUL -> code is NULL).
    assert stored["cod_prestatie"] is None
def test_insert_or_ignore_nu_clobber(env, conn):
    """Seeding must not overwrite a row that already exists for the same normalised name."""
    from app.operatii_seed import seed_operatii_etichetate

    # A pre-existing row (e.g. from embeddings) on the same key, with a different code.
    conn.execute(
        "INSERT INTO mapping_suggestions (denumire_normalizata, cod_prestatie, is_nul, source, confidence) "
        "VALUES ('SCHIMB ULEI MOTOR', 'OE-1', 0, 'embedding', 0.5)"
    )
    conn.commit()

    seed_file = _scrie_seed(env, [SEED_OE])
    inserted = seed_operatii_etichetate(conn, seed_file)
    conn.commit()
    # INSERT OR IGNORE -> nothing is overwritten, so nothing counts as inserted.
    assert inserted == 0

    survivor = conn.execute(
        "SELECT cod_prestatie, source FROM mapping_suggestions WHERE denumire_normalizata = 'SCHIMB ULEI MOTOR'"
    ).fetchone()
    # The existing row is left untouched.
    assert survivor["cod_prestatie"] == "OE-1"
    assert survivor["source"] == "embedding"
def test_idempotent_la_reinit(env, conn):
    """Running the seeder twice on the same file inserts rows only the first time."""
    from app.operatii_seed import seed_operatii_etichetate

    seed_file = _scrie_seed(env, [SEED_OE, SEED_NUL])
    inserted_per_run = []
    for _ in range(2):
        inserted_per_run.append(seed_operatii_etichetate(conn, seed_file))
        conn.commit()

    assert inserted_per_run[0] == 2
    assert inserted_per_run[1] == 0  # the second run must not duplicate rows
    row_count = conn.execute("SELECT COUNT(*) AS n FROM mapping_suggestions").fetchone()["n"]
    assert row_count == 2
def test_seed_inexistent_e_noop(env, conn):
    """A seed path that does not exist is a no-op reporting zero inserted rows."""
    from app.operatii_seed import seed_operatii_etichetate

    missing_file = os.path.join(env, "nu-exista.json")
    assert seed_operatii_etichetate(conn, missing_file) == 0

View File

@@ -0,0 +1,72 @@
"""US-001 (PRD 5.18) — pre-filtru determinist non-operatii (NUL).
Masuratoarea k-NN (memorie test-precizie-knn-embeddings) arata recall NUL doar 64%:
gunoiul evident (ITP, plata, discount, nr. inmatriculare, tractare) scapa ca OE-1.
Un pre-filtru determinist il marcheaza NUL INAINTE de k-NN.
Garantie non-negociabila (AC): ZERO fals-pozitiv pe operatii reale. Regulile
text/regex au fost calibrate pe `docs/operatii-service/*.csv` (vezi sesiunea de
implementare): triggerele ambigue (TRACTARE, NR INMATRICULARE/placuta) sunt
ECRANATE de un context de piesa/operatie (D/R, CARLIG, CAPAC, INLOCUIT...).
"""
from __future__ import annotations
from app.mapping import prefiltru_nul
def test_itp_e_nul():
    """ITP lines are flagged NUL, including the glued, space-less form."""
    cases = (
        "13 X ITP",
        "11XITP",  # glued, no spaces
        "ITP",
        "2 X ITP",
    )
    for text in cases:
        assert prefiltru_nul(text) is True
def test_plata_discount_nul():
    """Payment, discount and tax lines are flagged NUL."""
    cases = (
        "DISCOUNT FIDELITATE 10%",
        "REDUCERE COMERCIALA",
        "ACHITAT DE CONF.URBAN",
        "PLATA AVANS",
        "TAXA DE MEDIU",
    )
    for text in cases:
        assert prefiltru_nul(text) is True
def test_nr_inmatriculare_nul():
    """Registration-number wording and standalone plate patterns are flagged NUL."""
    cases = (
        "NR INMATRICULARE",
        "NUMAR INMATRICULARE",
        "B 123 ABC",  # standalone licence-plate pattern
        "CT 44 MKY",
    )
    for text in cases:
        assert prefiltru_nul(text) is True
def test_tractare_serviciu_nul():
    """The towing service itself is a non-operation and is flagged NUL."""
    for text in ("TRACTARE CTA-SLOBOZIA", "TRACTARE 100 KM"):
        assert prefiltru_nul(text) is True
def test_operatie_reala_nu_e_nul():
    """Critical case: an ambiguous trigger inside a real part/operation context is NOT NUL."""
    cases = (
        "INLOCUIT PLACUTE FRANA",
        "D/R CARLIG TRACTARE",  # tow hook = a part, not the towing service
        "D/R CAPAC TRACTARE BARA SPATE",
        "D/R NR INMATRICULARE",  # plate holder = a part
        "D/R ELECTROMOTOR CT 44 MKY",  # plate text appended to a real operation
    )
    for text in cases:
        assert prefiltru_nul(text) is False
def test_zero_fals_pozitiv_pe_set_operatii_reale():
    """Acceptance criterion: zero false positives on a set of 20 real operations."""
    real_operations = (
        "REVIZIE",
        "SCHIMB ULEI MOTOR",
        "INLOCUIT PLACUTE FRANA FATA",
        "D/R BARA FATA",
        "VOPSIT USA DR FATA",
        "INLOCUIT FILTRU AER",
        "AERISIT INSTALATIE FRANA",
        "INLOCUIT AMORTIZOR SPATE",
        "ABSORBANT SOC BARA SPATE",
        "INLOCUIT CUREA DISTRIBUTIE",
        "REGLAT FARURI",
        "INLOCUIT BUJII",
        "REPARAT ARIPA FATA DR",
        "INLOCUIT DISCURI FRANA",
        "GRESAT PLANETARA",
        "INLOCUIT RULMENT ROATA",
        "MONTAT ANVELOPE",
        "INLOCUIT BATERIE",
        "DIAGNOZA COMPUTERIZATA",
        "INLOCUIT CONTACT PORNIRE",
    )
    for op in real_operations:
        assert prefiltru_nul(op) is False, f"fals-pozitiv pe operatie reala: {op!r}"
def test_input_gol_nu_e_nul():
    """Empty or missing input is never classified as NUL."""
    for empty in ("", None):
        assert prefiltru_nul(empty) is False  # type: ignore[arg-type]

View File

@@ -0,0 +1,258 @@
"""Etichetator offline operatii service -> coduri RAR (US-002, PRD 5.18).
Backend implicit = **LM Studio local** (Qwen3-4B, GPU RX 6600M via Tailscale),
backend-ul APROBAT pentru bootstrap-ul v1 (decizia D4). Groq / OpenRouter raman
fallback-uri interschimbabile, dar NU sunt calea aprobata pentru v1.
Particularitati care justifica un tool NOU (nu reuse de `or_common.call`):
- LM Studio RESPINGE `response_format: json_object` (eroare 400). Cere envelope
`json_schema` STRICT complet: {"type":"json_schema","json_schema":{...,"strict":true}}.
- `cod` e ENUM peste cele 19 etichete (18 coduri RAR + NUL) -> modelul nu poate
inventa coduri; orice abatere e prinsa de garda de truncare ('?').
- Qwen3 emite `<think>...` daca nu dezactivam thinking-ul -> umfla tokeni/latenta
sub structured output strict. Punem `/no_think` in promptul de sistem.
Setari conservatoare OBLIGATORII pe GPU-box (a facut shutdown sub sarcina 2026-06-29,
probabil termic/alimentare): in LM Studio incarca modelul cu `n_parallel=1`,
`n_ctx=4096`, batch 32-40, monitorizeaza temperatura. NU mari batch/context fara
headroom termic. Vezi memorie `lmstudio-gpu-etichetare`.
Reutilizeaza din `or_common`: scrub-ul PII (F3) si lista de coduri.
"""
from __future__ import annotations
import json
import os
import sys
import time
import urllib.error
import urllib.request
from dataclasses import dataclass
# --- Label codes + PII scrub: the source of truth is or_common (same label nomenclature) ---
import importlib.util as _ilu
# or_common.py is loaded by file path from this script's own directory,
# not through a package import.
_OR_PATH = os.path.join(os.path.dirname(__file__), "or_common.py")
_spec = _ilu.spec_from_file_location("or_common", _OR_PATH)
or_common = _ilu.module_from_spec(_spec)
# Register under "or_common" only if no entry exists yet, then run the module body.
sys.modules.setdefault("or_common", or_common)
_spec.loader.exec_module(or_common)
scrub = or_common.scrub  # VIN/plate -> [VIN]/[NR] (per the original note; implemented in or_common)
# The 19 labels (18 RAR codes + NUL), extracted from CODURI (single source: or_common).
# CODURI is split on commas and each item is cut at its first "=";
# presumably the entries look like "CODE=description" -- confirm against or_common.
ALL_LABELS: list[str] = [c.split("=")[0].strip() for c in or_common.CODURI.replace(", ", ",").split(",")]
# Import-time guard: fails loudly if the parsed label list drifts from the expected 19 entries.
assert "NUL" in ALL_LABELS and len(ALL_LABELS) == 19, ALL_LABELS
_VALID = set(ALL_LABELS)  # set form, used for membership checks when validating model output
# --------------------------------------------------------------------------- #
# Three-step procedural prompt (versioned) #
# --------------------------------------------------------------------------- #
PROMPT_VERSION = "3pasi-v1"
_CODURI_LISTA = or_common.CODURI
# System prompt sent as the "system" chat message. The text is runtime data and
# is kept verbatim; it embeds the code list from or_common and ends with
# "/no_think" (per the module docstring: suppresses Qwen3 thinking output).
SYS = (
    "Esti expert RAR AUTOPASS. Clasifici fiecare operatie de service-auto in EXACT unul "
    "din aceste coduri:\n" + _CODURI_LISTA + "\n\n"
    "Urmeaza PROCEDURA in 3 pasi, in ordine:\n"
    "PAS 1 (non-operatie -> NUL): daca textul NU e o operatie tehnica de service "
    "(ITP, plata/achitat, discount/reducere, taxa, nr inmatriculare/placuta, manopera "
    "generica, sau DOAR un nume de piesa fara actiune) -> cod = NUL. Opreste-te.\n"
    "PAS 2 (avarie din ACCIDENT -> avarie grava): foloseste codurile de avarie grava DOAR "
    "pentru daune in urma unui accident, pe sistemul avariat:\n"
    " caroserie/structura rezistenta -> OE-C; sasiu -> OE-S; directie -> OE-D; "
    "franare -> OE-F; sistem de retinere/airbag -> OE-R; ADAS (asistenta condus) -> OE-A.\n"
    " Reparatiile curente, de uzura (NU dintr-un accident) NU sunt avarii grave -> mergi la PAS 3.\n"
    "PAS 3 (operatie obisnuita): \n"
    " inlocuire / D-R / reparare / vopsire / retus piese -> OE-1 (REPARATIE);\n"
    " schimb ulei motor + filtre -> OE-3 (REVIZIE PERIODICA);\n"
    " aerisit / gresat / completat nivele -> OE-2 (INTRETINERE);\n"
    " reglare functionala (geometrie directie, faruri, ralanti) -> OE-4;\n"
    " actualizare/programare software -> OE-7; schimb sezonier anvelope -> OE-8;\n"
    " istoric/reparatie/inlocuire odometru -> OE-I / R-ODO / I-ODO; tahograf -> AITLV.\n\n"
    "Raspunde DOAR cu JSON conform schemei. /no_think"
)
def construieste_mesaje(batch: list[str]) -> list[dict]:
    """Build the chat messages for one batch: the procedural system prompt plus a numbered user list.

    Each item is passed through ``scrub`` (PII scrub) and numbered from 1; the
    numbers are the ``i`` positions expected back in the model's answer.
    """
    numbered = [f"{pos}. {scrub(item)}" for pos, item in enumerate(batch, start=1)]
    system_msg = {"role": "system", "content": SYS}
    user_msg = {"role": "user", "content": "\n".join(numbered)}
    return [system_msg, user_msg]
# --------------------------------------------------------------------------- #
# Schema json_schema strict (envelope complet — LM Studio respinge json_object) #
# --------------------------------------------------------------------------- #
def _response_format() -> dict:
    """Return the strict ``json_schema`` response-format envelope for the labelling call.

    The answer must be ``{"rez": [{"i": <int>, "cod": <label>}, ...]}``; ``cod``
    is restricted to ``ALL_LABELS`` and extra properties are rejected at both levels.
    """
    item_schema = {
        "type": "object",
        "properties": {
            "i": {"type": "integer"},
            "cod": {"type": "string", "enum": ALL_LABELS},
        },
        "required": ["i", "cod"],
        "additionalProperties": False,
    }
    root_schema = {
        "type": "object",
        "properties": {
            "rez": {
                "type": "array",
                "items": item_schema,
            }
        },
        "required": ["rez"],
        "additionalProperties": False,
    }
    envelope = {
        "name": "etichete_operatii",
        "strict": True,
        "schema": root_schema,
    }
    return {"type": "json_schema", "json_schema": envelope}
# --------------------------------------------------------------------------- #
# Backend-uri (LM Studio default; Groq/OpenRouter fallback) #
# --------------------------------------------------------------------------- #
@dataclass
class Backend:
    """Connection settings for one chat-completions backend used by the labeller."""

    name: str  # backend identifier (a key of _BACKENDS when built by get_backend)
    url: str  # endpoint URL handed to the transport
    model: str  # value sent as the request body's "model" field
    api_key: str | None = None  # when set, sent as a Bearer Authorization header; None = no header
# Default LM Studio endpoint = the GPU box on Tailscale (see memory note lmstudio-gpu-etichetare).
_DEFAULT_LMSTUDIO_URL = "http://100.64.151.22:1234/v1/chat/completions"
# Per-backend defaults: endpoint URL, model id, and the NAME of the environment
# variable holding the API key (None = the backend needs no key).
_BACKENDS = {
    "lmstudio": {"url": _DEFAULT_LMSTUDIO_URL, "model": "qwen/qwen3-4b", "key_env": None},
    "groq": {"url": "https://api.groq.com/openai/v1/chat/completions",
             "model": "llama-3.3-70b-versatile", "key_env": "GROQ_KEY"},
    "openrouter": {"url": "https://openrouter.ai/api/v1/chat/completions",
                   "model": "qwen/qwen3-4b:free", "key_env": "OPENROUTER_KEY"},
}
def get_backend(name: str | None = None) -> Backend:
    """Build the ``Backend`` to use, resolving overrides from the environment.

    Selection order for the backend name: the ``name`` argument, then
    ``ETICHETARE_BACKEND``, then ``"lmstudio"``; the name is stripped and lower-cased.
    ``ETICHETARE_ENDPOINT`` and ``ETICHETARE_MODEL`` override the backend's
    default URL and model. The API key is read from the environment variable
    named by the backend's ``key_env`` entry, if it has one.

    Raises:
        ValueError: the resolved name is not a key of ``_BACKENDS``.
    """
    env = os.environ
    chosen = (name or env.get("ETICHETARE_BACKEND") or "lmstudio").strip().lower()
    if chosen not in _BACKENDS:
        raise ValueError(f"backend necunoscut: {chosen} (alege din {list(_BACKENDS)})")

    defaults = _BACKENDS[chosen]
    key_variable = defaults["key_env"]
    return Backend(
        name=chosen,
        url=env.get("ETICHETARE_ENDPOINT") or defaults["url"],
        model=env.get("ETICHETARE_MODEL") or defaults["model"],
        api_key=env.get(key_variable) if key_variable else None,
    )
def construieste_body(batch: list[str], backend: Backend) -> dict:
    """Assemble the chat-completions request body for one batch.

    Uses the backend's model, the messages from ``construieste_mesaje``,
    temperature 0 and the strict ``json_schema`` response format.
    """
    request_body = {"model": backend.model}
    request_body["messages"] = construieste_mesaje(batch)
    request_body["temperature"] = 0
    request_body["response_format"] = _response_format()
    return request_body
# --------------------------------------------------------------------------- #
# Parsare + garda de truncare #
# --------------------------------------------------------------------------- #
def parseaza_raspuns(content: dict, n: int) -> list[str]:
    """Turn a ``{"rez": [{"i", "cod"}, ...]}`` answer into a list parallel to the batch.

    Truncation/validation guard (F8): a position with no entry, or with a code
    outside the known labels, becomes ``'?'`` instead of being dropped silently.
    Entries whose ``i`` is missing or not an integer are ignored; if a position
    appears more than once, the last entry wins.

    Args:
        content: decoded JSON answer from the model.
        n: batch size; the result always has exactly ``n`` items (positions 1..n).
    """
    slots: dict[int, str] = {}
    for entry in content.get("rez") or []:
        try:
            position = int(entry["i"])
        except (KeyError, TypeError, ValueError):
            continue
        label = str(entry.get("cod") or "").strip().upper()
        if label not in _VALID:
            label = "?"
        slots[position] = label
    return [slots.get(position, "?") for position in range(1, n + 1)]
# --------------------------------------------------------------------------- #
# Transport (injectabil in teste) #
# --------------------------------------------------------------------------- #
def _urllib_transport(url: str, headers: dict, payload: dict, timeout: int) -> dict:
    """Default transport: send ``payload`` as a JSON request body and return the decoded JSON reply.

    Errors from ``urllib`` (e.g. ``HTTPError``) propagate to the caller.
    """
    request = urllib.request.Request(url, data=json.dumps(payload).encode(), headers=headers)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return json.load(response)
def _retry_after_seconds(err: urllib.error.HTTPError, attempt: int) -> float:
    """Return how long to wait before retrying after a retryable HTTP error.

    Honours a numeric ``Retry-After`` header when it is a positive, finite
    number of seconds. Anything else (missing header, missing headers object,
    zero, or the HTTP-date form of the header) falls back to exponential
    backoff capped at 30 s.
    """
    fallback = float(min(2 ** attempt, 30))
    headers = getattr(err, "headers", None)
    raw = headers.get("retry-after") if headers is not None else None
    if raw is None:
        return fallback
    try:
        seconds = float(raw)
    except (TypeError, ValueError):
        # Retry-After may legally be an HTTP-date; do not crash on it.
        return fallback
    return seconds if 0 < seconds < float("inf") else fallback


def call(
    batch: list[str],
    backend: Backend,
    *,
    timeout: int = 180,
    max_attempts: int = 5,
    transport=None,
) -> tuple[list[str], dict]:
    """Label one batch with a single (retried) backend call. Returns ``(codes, meta)``.

    Args:
        batch: operation names to classify.
        backend: endpoint, model and optional API key to use.
        timeout: timeout in seconds handed to the transport.
        max_attempts: maximum number of tries before giving up.
        transport: ``callable(url, headers, payload, timeout) -> dict`` returning
            the OpenAI-style response; injectable in tests, defaults to urllib.

    Returns:
        codes: list parallel to ``batch``; ``'?'`` where no valid answer exists (guard F8).
        meta: ``{"ms", "err", "missing"}`` -- elapsed milliseconds, error tag
            (``None`` on success) and how many ``'?'`` remain.

    Failures never raise: HTTP 429/500/502/503 and any other exception are
    retried with backoff; a non-retryable HTTP status, or running out of
    attempts, degrades the whole batch to ``'?'``.
    """
    transport = transport or _urllib_transport
    body = construieste_body(batch, backend)
    headers = {"Content-Type": "application/json", "User-Agent": "Mozilla/5.0"}
    if backend.api_key:
        headers["Authorization"] = f"Bearer {backend.api_key}"
    t0 = time.time()

    def _elapsed_ms() -> int:
        return int((time.time() - t0) * 1000)

    def _failed(err: str) -> tuple[list[str], dict]:
        return ["?"] * len(batch), {"ms": _elapsed_ms(), "err": err, "missing": len(batch)}

    for attempt in range(max_attempts):
        last_attempt = attempt >= max_attempts - 1
        try:
            resp = transport(backend.url, headers, body, timeout)
            content = json.loads(resp["choices"][0]["message"]["content"])
            codes = parseaza_raspuns(content, len(batch))
        except urllib.error.HTTPError as e:
            if e.code not in (429, 500, 502, 503):
                return _failed(f"HTTP {e.code}")
            # Retryable status. The wait is computed defensively so that a bad
            # Retry-After header cannot raise from inside this handler, and no
            # time is wasted sleeping after the final attempt.
            if not last_attempt:
                time.sleep(_retry_after_seconds(e, attempt))
        except Exception as e:  # noqa: BLE001 -- graceful degradation, the batch becomes '?'
            if last_attempt:
                return _failed(type(e).__name__)
            time.sleep(min(2 ** attempt, 20))
        else:
            return codes, {"ms": _elapsed_ms(), "err": None, "missing": codes.count("?")}
    return _failed("max_attempts")
if __name__ == "__main__":
    # Manual sanity check: one small batch against the configured backend (default lmstudio).
    # Operation names can be passed as command-line arguments; otherwise three samples are used.
    import sys
    probe = sys.argv[1:] or ["13 X ITP", "INLOCUIT PLACUTE FRANA FATA", "SCHIMB ULEI MOTOR SI FILTRE"]
    b = get_backend()
    print(f"backend={b.name} url={b.url} model={b.model}")
    codes, meta = call(probe, b)
    # One line per operation: the returned code (padded to 6 characters), then the operation text.
    for op, c in zip(probe, codes):
        print(f" {c:6} {op}")
    print("meta:", meta)

View File

@@ -0,0 +1,344 @@
"""Generare seed etichetat operatie->cod (US-003, PRD 5.18).
Produce artefactul `app/data/operatii-etichetate.json` (comis in repo), consumat de
seeder (US-004) si de corpusul embeddings (US-005). NU cheama LLM la runtime — o
singura data, offline, pe LM Studio (backend implicit, D4).
Pipeline dedup OBLIGATORIU, in ordine, INAINTE de orice apel LLM (D5):
1. Agrega cele N CSV-uri -> freq pe denumire RAW (NR ne-numeric -> skip rand, F9).
2. `cheie = normalize_for_match(denumire)` (ACEEASI functie ca DB/k-NN, NU strip exact).
Arunca randurile cu `cheie == ""` inainte de dedup (coliziune pe slot UNIQUE gol, F6).
3. Dedup pe cheie: un reprezentant per cheie, `freq = suma NR`.
4. Harta `cheie -> cod` din TOATE etichetele existente: `labels-groq-partial.json` (cheiat
brut) + seedul comis anterior (cheiat normalizat). Conflict (aceeasi cheie, coduri diferite
pe variante raw) -> castiga codul cu freq-max, tie-break pe cod sortat (F3).
5. `de_etichetat = corpus(in prag) - harta`. Sortat desc pe freq = SINGURUL input la LLM.
Idempotenta cross-run (F2/F7): seedul comis = cache de etichete -> re-run = 0 apeluri LLM.
"""
from __future__ import annotations
import argparse
import csv
import glob
import importlib.util
import json
import os
import sys
from collections import Counter, defaultdict
# Functia de normalizare = sursa unica de adevar (consistenta cu DB/k-NN).
_APP_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
if _APP_ROOT not in sys.path:
sys.path.insert(0, _APP_ROOT)
from app.mapping import normalize_for_match # noqa: E402
def _load_eticheteaza():
path = os.path.join(os.path.dirname(__file__), "eticheteaza.py")
spec = importlib.util.spec_from_file_location("eticheteaza", path)
mod = importlib.util.module_from_spec(spec)
sys.modules.setdefault("eticheteaza", mod)
spec.loader.exec_module(mod)
return mod
# Default paths (relative to the repository root).
DEFAULT_CSV_GLOB = os.path.join(_APP_ROOT, "docs", "operatii-service", "*.csv")
DEFAULT_LABELS = os.path.join(_APP_ROOT, "tools", "mapare-llm", "labels-groq-partial.json")
DEFAULT_SEED = os.path.join(_APP_ROOT, "app", "data", "operatii-etichetate.json")
NUL_LABEL = "NUL"  # label marking a non-operation; stored in the seed as cod=None, is_nul=True
DEFAULT_CONFIDENCE = 0.7  # confidence written on every generated seed entry
DEFAULT_SOURCE = "llm_seed"  # source tag written on every generated seed entry
# --------------------------------------------------------------------------- #
# Pasul 1-3: corpus agregat pe cheie normalizata #
# --------------------------------------------------------------------------- #
def _freq_raw(csv_paths: list[str]) -> Counter:
"""Counter denumire_raw -> suma NR. NR ne-numeric -> skip rand (F9), nu zero-weight."""
freq: Counter = Counter()
for f in csv_paths:
with open(f, encoding="utf-8", errors="replace") as fh:
for r in list(csv.reader(fh, delimiter=";"))[1:]:
if len(r) <= 2:
continue
den = r[1].strip()
if not den:
continue
nr_raw = (r[2] or "").strip()
try:
nr = int(nr_raw)
except ValueError:
continue # F9: skip rand cu NR ne-numeric
freq[den] += nr
return freq
def _corpus_din_freq(freq_raw: Counter) -> dict[str, dict]:
    """Group raw names by normalised key: ``{key -> {"denumire", "freq"}}``.

    Raw names whose normalised key is empty are dropped (F6). For each key,
    ``freq`` is the sum over its raw variants and ``denumire`` is the variant
    with the highest individual count (ties broken by the raw text, ascending);
    that text is what is sent to the LLM and stored in the seed.
    """
    variants_by_key: dict[str, list[tuple[str, int]]] = defaultdict(list)
    for raw_text, count in freq_raw.items():
        key = normalize_for_match(raw_text)
        if key:  # F6: empty keys are discarded before dedup
            variants_by_key[key].append((raw_text, count))

    return {
        key: {
            # Deterministic representative: highest count first, then raw text ascending.
            "denumire": min(variants, key=lambda variant: (-variant[1], variant[0]))[0],
            "freq": sum(count for _, count in variants),
        }
        for key, variants in variants_by_key.items()
    }
def agrega_corpus(csv_paths: list[str]) -> dict[str, dict]:
    """Build ``{normalised_key -> {"denumire", "freq"}}`` straight from the CSV files (steps 1-3)."""
    raw_counts = _freq_raw(csv_paths)
    return _corpus_din_freq(raw_counts)
# --------------------------------------------------------------------------- #
# Pasul 4: harta cheie -> cod din etichetele existente (reuse + conflict) #
# --------------------------------------------------------------------------- #
def _incarca_seed(seed_path: str | None) -> list[dict]:
if not seed_path or not os.path.exists(seed_path):
return []
try:
return json.loads(open(seed_path, encoding="utf-8").read())
except (ValueError, OSError):
return []
def construieste_harta_etichete(
    freq_raw: Counter,
    corpus: dict[str, dict],
    labels_path: str | None,
    seed_existent: list[dict],
) -> dict[str, str]:
    """Build ``normalised_key -> label`` (a code or ``'NUL'``) from all existing labels.

    Two sources vote, each weighted by frequency:
      * the labels file (keyed by RAW text): each raw text votes for its code
        under its normalised key, weighted by its count in ``freq_raw``;
      * the previously committed seed (keyed by normalised text): each entry
        votes with the key's total frequency in ``corpus``.

    When one key collects several labels, the highest total weight wins and
    ties are broken by label, ascending (F3), so the result is deterministic.

    The labels file is read inside a ``with`` block so its handle is closed.
    """
    votes: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))

    # Labels file: keyed by RAW text.
    if labels_path and os.path.exists(labels_path):
        with open(labels_path, encoding="utf-8") as fh:
            labels = json.load(fh)
        for raw, cod in labels.items():
            cheie = normalize_for_match(raw)
            if not cheie:
                continue
            cod = str(cod or "").strip().upper()
            if not cod:
                continue
            votes[cheie][cod] += freq_raw.get(raw, 0)

    # Previously committed seed: keyed by normalised text (cross-run cache).
    for e in seed_existent:
        cheie = e.get("denumire_normalizata")
        if not cheie:
            continue
        eticheta = NUL_LABEL if e.get("is_nul") else str(e.get("cod") or "").strip().upper()
        if not eticheta:
            continue
        votes[cheie][eticheta] += corpus.get(cheie, {}).get("freq", 0)

    # Winner per key: weight descending, then label ascending.
    return {
        cheie: min(codmap.items(), key=lambda kv: (-kv[1], kv[0]))[0]
        for cheie, codmap in votes.items()
    }
# --------------------------------------------------------------------------- #
# Pasul 5: selectie de_etichetat (prag de volum) + orchestrare #
# --------------------------------------------------------------------------- #
def selecteaza_de_etichetat(
    corpus: dict[str, dict],
    harta: dict[str, str],
    *,
    target_volum: float,
    etichetare_all: bool,
) -> list[str]:
    """Return the still-unlabelled keys inside the volume threshold, most frequent first.

    Keys are ordered by frequency descending, then by key. Unless
    ``etichetare_all`` is set, only the shortest prefix whose cumulative share
    of the total frequency reaches ``target_volum`` is considered (the key that
    crosses the threshold is included). Keys already present in ``harta`` are
    then filtered out.
    """
    by_freq = sorted(corpus, key=lambda cheie: (-corpus[cheie]["freq"], cheie))

    cutoff = len(by_freq)
    if not etichetare_all:
        total = sum(item["freq"] for item in corpus.values()) or 1  # avoid division by zero
        running = 0
        for position, cheie in enumerate(by_freq, start=1):
            running += corpus[cheie]["freq"]
            if running / total >= target_volum:
                cutoff = position
                break

    return [cheie for cheie in by_freq[:cutoff] if cheie not in harta]
def genereaza(
    csv_paths: list[str],
    *,
    labels_path: str | None = DEFAULT_LABELS,
    seed_path: str = DEFAULT_SEED,
    target_volum: float = 0.9,
    etichetare_all: bool = False,
    clasifica=None,
    batch: int = 32,
    confidence: float = DEFAULT_CONFIDENCE,
    source: str = DEFAULT_SOURCE,
    progres=None,
    checkpoint_every: int = 1,
    pauza: float = 0.0,
) -> dict:
    """Generate or update the labelled seed, write it to ``seed_path`` and return statistics.

    Args:
        csv_paths: CSV files to aggregate into the corpus.
        labels_path: optional JSON file of existing labels keyed by raw text.
        seed_path: seed file; read as a label cache, then (re)written.
        target_volum: share of total frequency that must be covered by the
            keys considered for labelling (ignored when ``etichetare_all``).
        etichetare_all: consider the whole corpus, ignoring the threshold.
        clasifica: ``callable(list_of_names) -> list_of_codes``; injectable in
            tests. When None, the sibling ``eticheteaza`` module is loaded and
            its ``call`` is used with the backend from its ``get_backend()``.
        batch: number of names per classifier call.
        confidence, source: values written on every seed entry.
        progres: optional ``callable(message)`` used for progress logging.
        checkpoint_every: write the seed to disk every N batches while running,
            not only at the end, so an interrupted run keeps its progress and a
            re-run continues from the cached seed. 0 = write only at the end.
        pauza: seconds to sleep between batches (not after the last one).

    Returns:
        dict with: ``brute`` (sum of all raw counts), ``distincte`` (corpus
        keys), ``deja_etichetate`` (labels known before any classifier call),
        ``de_etichetat`` (keys sent to the classifier), ``apeluri_llm``
        (classifier calls made), ``seed`` (entries written).
    """
    freq_raw = _freq_raw(csv_paths)
    corpus = _corpus_din_freq(freq_raw)
    seed_existent = _incarca_seed(seed_path)
    harta = construieste_harta_etichete(freq_raw, corpus, labels_path, seed_existent)
    de_etichetat = selecteaza_de_etichetat(
        corpus, harta, target_volum=target_volum, etichetare_all=etichetare_all
    )
    # Captured before the loop below adds new labels to harta.
    reused = len(harta)
    brute = int(sum(freq_raw.values()))
    if progres:
        progres(f"{len(freq_raw)} randuri brute distincte -> {len(corpus)} dupa normalizare "
                f"-> {len(de_etichetat)} trimise la LLM (deja: {len(harta)})")
    clasif = clasifica
    if clasif is None:
        # No classifier injected: fall back to the labeller module and its configured backend.
        et = _load_eticheteaza()
        backend = et.get_backend()
        if progres:
            progres(f"backend={backend.name} url={backend.url} model={backend.model}")

        def clasif(batch_denumiri):
            # call() returns (codes, meta); only the codes are needed here.
            return et.call(batch_denumiri, backend)[0]

    apeluri = 0
    valide = _valid_labels()
    nr_batch = (len(de_etichetat) + batch - 1) // batch  # ceiling division
    for k in range(0, len(de_etichetat), batch):
        chunk = de_etichetat[k:k + batch]
        denumiri = [corpus[c]["denumire"] for c in chunk]
        codes = clasif(denumiri)
        apeluri += 1
        for cheie, cod in zip(chunk, codes):
            cod = str(cod or "").strip().upper()
            if cod in valide:  # '?' / invalid code -> stays unlabelled (retried on the next run)
                harta[cheie] = cod
        if progres:
            progres(f" batch {apeluri}/{nr_batch} "
                    f"-> total etichetat {sum(1 for c in harta if c in corpus)}")
        # Periodic checkpoint: keeps the progress made so far if the run is interrupted.
        if checkpoint_every and apeluri % checkpoint_every == 0:
            _scrie_seed(seed_path, _construieste_seed(corpus, harta, confidence=confidence, source=source))
        # Pause between batches (skipped after the last one): breathing room for the GPU box.
        if pauza and k + batch < len(de_etichetat):
            import time as _t
            _t.sleep(pauza)
    # Final write, regardless of checkpointing.
    seed = _construieste_seed(corpus, harta, confidence=confidence, source=source)
    _scrie_seed(seed_path, seed)
    return {
        "brute": brute,
        "distincte": len(corpus),
        "deja_etichetate": reused,
        "de_etichetat": len(de_etichetat),
        "apeluri_llm": apeluri,
        "seed": len(seed),
    }
def _valid_labels() -> set[str]:
    """Return the set of labels accepted in the seed, taken from the labeller module's ``ALL_LABELS``."""
    return set(_load_eticheteaza().ALL_LABELS)
def _construieste_seed(corpus, harta, *, confidence, source) -> list[dict]:
    """Build the seed entries, ordered by normalised key so output is stable between runs.

    Labels whose key is absent from the current corpus are left out. A ``NUL``
    label is stored as ``cod=None`` with ``is_nul=True``; any other label is
    stored as the code with ``is_nul=False``.
    """
    return [
        {
            "denumire": corpus[key]["denumire"],
            "denumire_normalizata": key,
            "cod": None if harta[key] == NUL_LABEL else harta[key],
            "is_nul": harta[key] == NUL_LABEL,
            "source": source,
            "confidence": confidence,
        }
        for key in sorted(harta)
        if key in corpus  # skip labels with no counterpart in the current corpus
    ]
def _scrie_seed(seed_path: str, seed: list[dict]) -> None:
os.makedirs(os.path.dirname(os.path.abspath(seed_path)), exist_ok=True)
with open(seed_path, "w", encoding="utf-8") as fh:
json.dump(seed, fh, ensure_ascii=False, indent=2)
fh.write("\n")
# --------------------------------------------------------------------------- #
# CLI #
# --------------------------------------------------------------------------- #
def main(argv=None):
    """Command-line entry point: parse options, run ``genereaza`` and print the statistics.

    Exits through ``argparse``'s error path when the CSV glob matches no file.
    """
    parser = argparse.ArgumentParser(description="Genereaza seed etichetat operatie->cod (LM Studio).")
    add = parser.add_argument
    add("--target-volum", type=float, default=0.9,
        help="prag de acoperire pe volum (default 0.9 = D1)")
    add("--all", action="store_true", help="eticheteaza tot corpusul, ignora pragul")
    add("--batch", type=int, default=32, help="dimensiune batch (conservator: 32-40)")
    add("--pauza", type=float, default=1.5,
        help="secunde de pauza intre batch-uri (ragaz termic GPU); 0 = fara")
    add("--checkpoint-every", type=int, default=1,
        help="scrie seedul la fiecare N batch-uri (1 = dupa fiecare, crash-safe)")
    add("--confidence", type=float, default=DEFAULT_CONFIDENCE)
    add("--csv-glob", default=DEFAULT_CSV_GLOB)
    add("--labels", default=DEFAULT_LABELS)
    add("--seed", default=DEFAULT_SEED)
    opts = parser.parse_args(argv)

    found = sorted(glob.glob(opts.csv_glob))
    if not found:
        parser.error(f"niciun CSV gasit la {opts.csv_glob}")

    def _log(message):
        # Flush on every message so progress is visible while a long run is in flight.
        print(message, flush=True)

    summary = genereaza(
        found,
        labels_path=opts.labels,
        seed_path=opts.seed,
        target_volum=opts.target_volum,
        etichetare_all=opts.all,
        batch=opts.batch,
        pauza=opts.pauza,
        checkpoint_every=opts.checkpoint_every,
        confidence=opts.confidence,
        progres=_log,
    )
    print("GATA:", json.dumps(summary, ensure_ascii=False))


if __name__ == "__main__":
    main()