feat(5.15+5.14): CLOSE — fix-uri code-review + embeddings functional

5.15 (propagare design + dashboard editare) si 5.14 (mapare LLM distilata) inchise dupa /code-review high.

8 buguri reparate TDD:
- HIGH modal nu se deschidea pe randul slim (base.html: trimitere-slim)
- HIGH /repune trunchia prestatii (declaratie incompleta la RAR) -> iterare peste existing, codes pozitional
- HIGH embeddings incarca model ~230MB degeaba pe corpus gol -> poarta has_corpus()
- HIGH picker chips gol pe re-render eroare -> conn/account_id pe toate ramurile
- MED obs re-derivat dupa stergere explicita -> _merge_override pastreaza obs=''
- MED mapare salvata fara denumire polua GOLD -> _record_gold_validation guard
- MED typo nome_prestatie -> nume_prestatie in select /repune
- MED bucketare timp +3h gresita iarna -> SQLite localtime + TZ=Europe/Bucharest

Embeddings WIRE-uit functional (PRD #15, decizie user): ensure_embeddings_corpus construieste corpus din nomenclator, gated pe AUTOPASS_EMBEDDINGS_ENABLED (default off). Marime model corectata ~50MB->~230MB (estimare PRD gresita).

Cleanup: hoist load_* din bucla bulk-fix; import re la top.

Regresie: 1256 passed, 1 deselected (live), 0 failed.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-28 20:48:34 +00:00
parent 9e42e7ed6f
commit 3fc53534e2
53 changed files with 9684 additions and 384 deletions
--- a/app/mapping.py
+++ b/app/mapping.py
@@ -14,6 +14,7 @@ unit-testabile direct. Cele cu `conn` sunt helpere de persistenta.

 from __future__ import annotations

+import hashlib
 import json
 import unicodedata
 from typing import Any
@@ -483,10 +484,18 @@ def pending_unmapped(conn, account_id=None) -> list[dict]:
                entry["denumire"] = item.get("denumire")
            entry["_ids"].add(r["id"])

+    # Indexeaza corpusul embeddings o data inainte de bucla (no-op cand flagul e off).
+    ensure_embeddings_corpus(conn, nomenclator)
+
    out: list[dict] = []
    for entry in agg.values():
        entry["blocked"] = len(entry.pop("_ids"))
        entry["suggestions"] = suggest_codes(entry["denumire"], nomenclator, limit=5)
+        # L14-S6: imbogatire sugestii cu GOLD partajat > SILVER > embeddings (Eng-F2).
+        # SUGGESTION-ONLY: nu intra in resolve_prestatii/load_mapping (#13).
+        enriched = enrich_suggestions(conn, entry["denumire"])
+        entry["sugestie_principala"] = enriched["sugestie_principala"]
+        entry["surse_sugestie"] = enriched["surse"]
        out.append(entry)
    out.sort(key=lambda e: (-e["blocked"], e["cod_op_service"]))
    return out
@@ -561,6 +570,148 @@ def delete_text_rule(conn, account_id: int | None, pattern: str) -> None:
    )


+# Minimum cosine-similarity score for an embeddings nearest-neighbour suggestion.
+# Below this score the NN suggestion is too uncertain and is not shown (prevents
+# irrelevant recommendations when the corpus is small or not indexed correctly).
+EMB_MIN_SIMILARITATE = 0.5
+
+
+def _corpus_signature(nomenclator: list[dict]) -> str:
+    """Return a stable signature of the nomenclator for the embeddings-corpus cache.
+
+    The signature is the SHA-256 hex digest of the sorted
+    ``(cod_prestatie, nume_prestatie)`` pairs, so it changes whenever a code is
+    added, removed or renamed and does not depend on row order (this avoids
+    needless re-embedding).
+
+    The pairs are serialised as JSON instead of being concatenated: plain
+    concatenation is ambiguous (``("1", "23")`` and ``("12", "3")`` both give
+    ``"123"``), which would let a changed nomenclator keep the old signature
+    and skip re-indexing.
+
+    Args:
+        nomenclator: Rows exposing ``cod_prestatie`` / ``nume_prestatie`` via
+            ``.get``; a missing or falsy value is treated as the empty string.
+
+    Returns:
+        The 64-character lowercase hexadecimal SHA-256 digest.
+    """
+    pairs = sorted(
+        (str(n.get("cod_prestatie") or ""), str(n.get("nume_prestatie") or ""))
+        for n in nomenclator
+    )
+    # JSON quoting/escaping keeps field and record boundaries unambiguous.
+    blob = json.dumps(pairs, separators=(",", ":"))
+    return hashlib.sha256(blob.encode("utf-8")).hexdigest()
+
+
+def ensure_embeddings_corpus(conn, nomenclator: list[dict] | None = None) -> None:
+    """Build or refresh the embeddings corpus from the nomenclator (PRD 5.14, layer 2).
+
+    Gated on ``get_settings().embeddings_enabled`` (the
+    ``AUTOPASS_EMBEDDINGS_ENABLED`` flag, default OFF): when disabled this is a
+    complete no-op -- the embeddings module is not imported and the nomenclator
+    is not queried -- so suggestions fall back to GOLD/SILVER + fuzzy matching.
+
+    When enabled, the corpus ``{denumire: nume_prestatie, cod: cod_prestatie}``
+    is indexed once and re-indexed only when the nomenclator signature changes.
+    The first indexing is expected to lazy-load the embeddings model (~230MB).
+
+    Graceful degradation (#16b): any failure while indexing is logged and
+    swallowed so the editor is never blocked; ``enrich_suggestions`` then relies
+    on the remaining sources.
+
+    Intended to be called by code that enriches suggestions (for example
+    ``pending_unmapped``) once BEFORE its ``enrich_suggestions`` loop, never
+    from ``enrich_suggestions`` itself, which stays a cheap lookup guarded by
+    ``has_corpus()``.
+
+    Args:
+        conn: Database connection; used only to load the nomenclator when
+            ``nomenclator`` is ``None``.
+        nomenclator: Already-loaded nomenclator rows; pass them to avoid a
+            second load.
+    """
+    from .config import get_settings
+    if not get_settings().embeddings_enabled:
+        return
+    try:
+        from . import embeddings as _emb
+        nomen = nomenclator if nomenclator is not None else load_nomenclator(conn)
+        if not nomen:
+            return
+        sig = _corpus_signature(nomen)
+        if _emb.corpus_signature() == sig and _emb.has_corpus():
+            return  # already indexed for this exact nomenclator -> nothing to do
+        # Only rows with both a name and a code can produce a usable suggestion.
+        items = [
+            {"denumire": str(n["nume_prestatie"]), "cod": str(n["cod_prestatie"])}
+            for n in nomen
+            if n.get("nume_prestatie") and n.get("cod_prestatie")
+        ]
+        _emb.index_corpus(items, signature=sig)
+    except Exception:
+        # Graceful degradation (#16b): an indexing failure must not block the
+        # editor, but it is logged so a broken model/config does not go unnoticed.
+        import logging
+        logging.getLogger(__name__).warning(
+            "ensure_embeddings_corpus: indexing failed; continuing without embeddings",
+            exc_info=True,
+        )
+
+
+def enrich_suggestions(
+    conn,
+    denumire: str | None,
+    *,
+    include_embeddings: bool = True,
+) -> dict:
+    """Enrich suggestions with shared GOLD, SILVER (LLM) and embeddings NN.
+
+    Precedence Eng-F2 (suggestion-only, never auto-send):
+      shared GOLD > SILVER > embeddings
+
+    (Account GOLD = the account's own operations_mapping = already resolved
+    before needs_mapping, so it is not part of the suggestion precedence.)
+
+    Args:
+        conn: Database connection passed to the shared-store lookups.
+        denumire: Operation name to look up; a falsy value yields the empty
+            result without touching any source.
+        include_embeddings: When False, the embeddings source is skipped.
+
+    Returns:
+      {
+        'sugestie_principala': {'cod_prestatie': str, 'sursa': str} | None,
+        'surse': {'gold_partajat': str|None, 'silver': str|None, 'embedding': str|None}
+      }
+
+    INVARIANTS:
+    - Every source is SUGGESTION-ONLY; none feeds resolve_prestatii/load_mapping (#13).
+    - A SILVER row with a truthy ``is_nul`` (non-operation/junk) yields no suggestion (#4).
+    - Graceful degradation (#16b): a source that is unavailable or raises is
+      logged and skipped; whatever the other sources produced is still
+      returned and no exception escapes.
+    - shared_store/embeddings are imported locally to avoid an import-time
+      cycle (shared_store imports normalize_for_match from mapping).
+    """
+    import logging
+    log = logging.getLogger(__name__)
+
+    sugestie_principala: dict | None = None
+    surse: dict = {"gold_partajat": None, "silver": None, "embedding": None}
+
+    if not denumire:
+        return {"sugestie_principala": sugestie_principala, "surse": surse}
+
+    # Collect ALL sources (no short-circuit) into `surse`: the editor may show
+    # every one of them regardless of which wins as the main suggestion.
+    # The Eng-F2 precedence applies ONLY when picking sugestie_principala.
+
+    # 1. Shared cross-account GOLD (human-validated, most trusted).
+    try:
+        from .shared_store import lookup_shared_gold
+        row_gold = lookup_shared_gold(conn, denumire)
+        if row_gold and row_gold["cod_prestatie"]:
+            surse["gold_partajat"] = str(row_gold["cod_prestatie"])
+    except Exception:
+        # Degrade gracefully, but leave a trace: this lookup should not fail.
+        log.warning("enrich_suggestions: shared GOLD lookup failed", exc_info=True)
+
+    # 2. SILVER LLM (bootstrap, not human-validated; is_nul suppresses it).
+    try:
+        from .shared_store import lookup_suggestion
+        row_silver = lookup_suggestion(conn, denumire)
+        if row_silver and not row_silver["is_nul"] and row_silver["cod_prestatie"]:
+            surse["silver"] = str(row_silver["cod_prestatie"])
+    except Exception:
+        log.warning("enrich_suggestions: SILVER lookup failed", exc_info=True)
+
+    # 3. Embeddings NN (semantic similarity, graceful degradation #16b).
+    if include_embeddings:
+        try:
+            from . import embeddings as _emb
+            # CHEAP gate: do not touch is_available()/suggest_nearest while the
+            # corpus is empty -- `is_available()` lazy-loads the ~230MB model
+            # (30-120s on the request thread). The corpus is built by the caller
+            # through ensure_embeddings_corpus (gated on
+            # AUTOPASS_EMBEDDINGS_ENABLED); with the flag off has_corpus() stays
+            # False and this path is a true no-op.
+            if _emb.has_corpus():
+                nn = _emb.suggest_nearest(str(denumire), top_k=1)
+                # Minimum threshold: a too-low similarity is a useless hint and
+                # would surface irrelevant codes on a small/partial corpus.
+                if nn and nn[0].get("similaritate", 0) >= EMB_MIN_SIMILARITATE:
+                    surse["embedding"] = str(nn[0]["cod"])
+        except Exception:
+            # The engine is optional, so its absence is an expected state:
+            # keep this at debug level to avoid flooding the log per entry.
+            log.debug("enrich_suggestions: embeddings lookup failed", exc_info=True)
+
+    # Pick the main suggestion by precedence: GOLD > SILVER > embeddings.
+    for sursa in ("gold_partajat", "silver", "embedding"):
+        if surse[sursa]:
+            sugestie_principala = {"cod_prestatie": surse[sursa], "sursa": sursa}
+            break
+
+    return {"sugestie_principala": sugestie_principala, "surse": surse}
+
+
 def _emite_text_rule_hits(conn, account_id: int, submission_id: int, resolved: list[dict] | None) -> None:
    """Emite `text_rule_hit` in app_events pentru fiecare item rezolvat prin regula text.