"""Generare seed etichetat operatie->cod (US-003, PRD 5.18). Produce artefactul `app/data/operatii-etichetate.json` (comis in repo), consumat de seeder (US-004) si de corpusul embeddings (US-005). NU cheama LLM la runtime — o singura data, offline, pe LM Studio (backend implicit, D4). Pipeline dedup OBLIGATORIU, in ordine, INAINTE de orice apel LLM (D5): 1. Agrega cele N CSV-uri -> freq pe denumire RAW (NR ne-numeric -> skip rand, F9). 2. `cheie = normalize_for_match(denumire)` (ACEEASI functie ca DB/k-NN, NU strip exact). Arunca randurile cu `cheie == ""` inainte de dedup (coliziune pe slot UNIQUE gol, F6). 3. Dedup pe cheie: un reprezentant per cheie, `freq = suma NR`. 4. Harta `cheie -> cod` din TOATE etichetele existente: `labels-groq-partial.json` (cheiat brut) + seedul comis anterior (cheiat normalizat). Conflict (acelasi cheie, coduri diferite pe variante raw) -> castiga codul cu freq-max, tie-break pe cod sortat (F3). 5. `de_etichetat = corpus(in prag) - harta`. Sortat desc pe freq = SINGURUL input la LLM. Idempotenta cross-run (F2/F7): seedul comis = cache de etichete -> re-run = 0 apeluri LLM. """ from __future__ import annotations import argparse import csv import glob import importlib.util import json import os import sys from collections import Counter, defaultdict # Functia de normalizare = sursa unica de adevar (consistenta cu DB/k-NN). _APP_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) if _APP_ROOT not in sys.path: sys.path.insert(0, _APP_ROOT) from app.mapping import normalize_for_match # noqa: E402 def _load_eticheteaza(): path = os.path.join(os.path.dirname(__file__), "eticheteaza.py") spec = importlib.util.spec_from_file_location("eticheteaza", path) mod = importlib.util.module_from_spec(spec) sys.modules.setdefault("eticheteaza", mod) spec.loader.exec_module(mod) return mod # Cai implicite (relative la repo). 
DEFAULT_CSV_GLOB = os.path.join(_APP_ROOT, "docs", "operatii-service", "*.csv")
DEFAULT_LABELS = os.path.join(_APP_ROOT, "tools", "mapare-llm", "labels-groq-partial.json")
DEFAULT_SEED = os.path.join(_APP_ROOT, "app", "data", "operatii-etichetate.json")

NUL_LABEL = "NUL"
DEFAULT_CONFIDENCE = 0.7
DEFAULT_SOURCE = "llm_seed"


# --------------------------------------------------------------------------- #
# Steps 1-3: corpus aggregated on the normalized key                          #
# --------------------------------------------------------------------------- #

def _freq_raw(csv_paths: list[str]) -> Counter:
    """Return a Counter mapping raw operation name -> sum of its NR column.

    Each CSV is `;`-delimited; the first row is treated as a header and
    skipped. Column 1 is the name, column 2 is NR. Rows with fewer than three
    columns, an empty name, or a non-numeric NR are skipped entirely (F9) —
    they are not counted with weight zero.
    """
    freq: Counter = Counter()
    for path in csv_paths:
        # newline="" lets the csv module do its own newline handling, as the
        # csv documentation requires for file objects.
        with open(path, encoding="utf-8", errors="replace", newline="") as fh:
            reader = csv.reader(fh, delimiter=";")
            next(reader, None)  # skip the header row without loading the file
            for row in reader:
                if len(row) <= 2:
                    continue
                den = row[1].strip()
                if not den:
                    continue
                nr_raw = (row[2] or "").strip()
                try:
                    nr = int(nr_raw)
                except ValueError:
                    continue  # F9: skip rows whose NR is not numeric
                freq[den] += nr
    return freq


def _corpus_din_freq(freq_raw: Counter) -> dict[str, dict]:
    """Build `{normalized_key -> {"denumire", "freq"}}`, dropping empty keys (F6).

    `freq` is the sum over all raw variants that normalize to the key.
    `denumire` is the raw variant with the highest individual freq
    (tie-break: raw text ascending); it is the text sent to the LLM and
    stored in the seed.
    """
    grup: dict[str, list[tuple[str, int]]] = defaultdict(list)
    for raw, n in freq_raw.items():
        cheie = normalize_for_match(raw)
        if not cheie:
            continue  # F6
        grup[cheie].append((raw, n))

    corpus: dict[str, dict] = {}
    for cheie, variante in grup.items():
        freq = sum(n for _, n in variante)
        # Deterministic representative: highest freq, tie-break on raw text.
        denumire = min(variante, key=lambda rn: (-rn[1], rn[0]))[0]
        corpus[cheie] = {"denumire": denumire, "freq": freq}
    return corpus


def agrega_corpus(csv_paths: list[str]) -> dict[str, dict]:
    """Return `{normalized_key -> {"denumire", "freq"}}` from the CSVs (steps 1-3)."""
    return _corpus_din_freq(_freq_raw(csv_paths))


# --------------------------------------------------------------------------- #
# Step 4: key -> code map from the existing labels (reuse + conflicts)        #
# --------------------------------------------------------------------------- #

def _incarca_seed(seed_path: str | None) -> list[dict]:
    """Load the previously written seed, best-effort.

    Returns an empty list when the path is empty, the file does not exist, it
    cannot be read or parsed, or its top-level JSON value is not a list.
    Non-dict entries inside the list are dropped so callers can safely use
    `.get(...)` on every element.
    """
    if not seed_path or not os.path.exists(seed_path):
        return []
    try:
        with open(seed_path, encoding="utf-8") as fh:
            data = json.load(fh)
    except (ValueError, OSError):
        return []
    if not isinstance(data, list):
        return []
    return [entry for entry in data if isinstance(entry, dict)]


def construieste_harta_etichete(
    freq_raw: Counter,
    corpus: dict[str, dict],
    labels_path: str | None,
    seed_existent: list[dict],
) -> dict[str, str]:
    """Build the map normalized_key -> label (a code, or 'NUL'), in normalized space.

    Votes are weighted by freq:
      * entries of the labels file (keyed by RAW text) vote with the raw
        text's freq from `freq_raw` (0 if the raw text is not in the CSVs);
      * entries of the existing seed (keyed by normalized text) vote with the
        key's total freq from `corpus` (0 if the key is not in the corpus).

    On conflict for the same key the label with the highest vote wins,
    tie-break on the label sorted ascending (F3).
    """
    votes: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))

    # labels-groq-partial.json: keyed by RAW text.
    if labels_path and os.path.exists(labels_path):
        with open(labels_path, encoding="utf-8") as fh:
            labels = json.load(fh)
        for raw, cod in labels.items():
            cheie = normalize_for_match(raw)
            if not cheie:
                continue
            cod = str(cod or "").strip().upper()
            if not cod:
                continue
            votes[cheie][cod] += freq_raw.get(raw, 0)

    # Previously written seed: keyed by normalized text (cross-run cache).
    for e in seed_existent:
        cheie = e.get("denumire_normalizata")
        if not cheie:
            continue
        eticheta = NUL_LABEL if e.get("is_nul") else str(e.get("cod") or "").strip().upper()
        if not eticheta:
            continue
        votes[cheie][eticheta] += corpus.get(cheie, {}).get("freq", 0)

    harta: dict[str, str] = {}
    for cheie, codmap in votes.items():
        # Vote descending, then label ascending -> deterministic.
        harta[cheie] = min(codmap.items(), key=lambda kv: (-kv[1], kv[0]))[0]
    return harta


# --------------------------------------------------------------------------- #
# Step 5: selection of keys to label (volume threshold) + orchestration       #
# --------------------------------------------------------------------------- #

def selecteaza_de_etichetat(
    corpus: dict[str, dict],
    harta: dict[str, str],
    *,
    target_volum: float,
    etichetare_all: bool,
) -> list[str]:
    """Return the unlabelled keys, sorted by freq descending, inside the volume threshold.

    Keys are ordered by (freq descending, key ascending). Unless
    `etichetare_all` is set, keys are taken in that order until their
    cumulative freq reaches `target_volum` of the total freq (the key that
    crosses the threshold is included). Keys already present in `harta` are
    then removed.
    """
    ordered = sorted(corpus, key=lambda k: (-corpus[k]["freq"], k))
    if etichetare_all:
        in_prag = ordered
    else:
        total = sum(c["freq"] for c in corpus.values()) or 1  # avoid division by zero
        in_prag = []
        cum = 0
        for k in ordered:
            in_prag.append(k)
            cum += corpus[k]["freq"]
            if cum / total >= target_volum:
                break
    return [k for k in in_prag if k not in harta]


def genereaza(
    csv_paths: list[str],
    *,
    labels_path: str | None = DEFAULT_LABELS,
    seed_path: str = DEFAULT_SEED,
    target_volum: float = 0.9,
    etichetare_all: bool = False,
    clasifica=None,
    batch: int = 32,
    confidence: float = DEFAULT_CONFIDENCE,
    source: str = DEFAULT_SOURCE,
    progres=None,
    checkpoint_every: int = 1,
    pauza: float = 0.0,
) -> dict:
    """Generate/update the seed, write it to `seed_path`, and return statistics.

    Args:
        csv_paths: CSV files to aggregate.
        labels_path: JSON file of existing labels keyed by raw text (optional).
        seed_path: seed file; read as the label cache and (re)written.
        target_volum: volume-coverage threshold used to select keys to label.
        etichetare_all: label the whole corpus, ignoring the threshold.
        clasifica: injectable `clasifica(batch_denumiri) -> list[cod]` (tests);
            when None, the backend provided by `eticheteaza.py` is used.
        batch: number of names per classifier call; must be >= 1.
        confidence: value stored in every seed entry's "confidence" field.
        source: value stored in every seed entry's "source" field.
        progres: optional logging callback `progres(message)`.
        checkpoint_every: write the seed to disk every N batches during the
            run, NOT only at the end; 0 = only at the end.
        pauza: seconds to sleep between batches (not after the last one).

    Returns:
        A dict with the keys "brute", "distincte", "deja_etichetate",
        "de_etichetat", "apeluri_llm" and "seed". "deja_etichetate" is the
        size of the label map before any classifier call (it may include keys
        that are not in the current corpus).

    Raises:
        ValueError: if `batch` is smaller than 1.

    Checkpointing matters on the unstable GPU box (thermal shutdown under
    load): a crash at batch 80/104 keeps the progress, and the re-run
    continues from the cache (cross-run idempotence).
    """
    import time  # only needed for the optional pause between batches

    if batch < 1:
        raise ValueError(f"batch trebuie sa fie >= 1 (primit: {batch})")

    freq_raw = _freq_raw(csv_paths)
    corpus = _corpus_din_freq(freq_raw)
    seed_existent = _incarca_seed(seed_path)
    harta = construieste_harta_etichete(freq_raw, corpus, labels_path, seed_existent)
    de_etichetat = selecteaza_de_etichetat(
        corpus, harta, target_volum=target_volum, etichetare_all=etichetare_all
    )
    reused = len(harta)
    brute = int(sum(freq_raw.values()))
    if progres:
        progres(f"{len(freq_raw)} randuri brute distincte -> {len(corpus)} dupa normalizare "
                f"-> {len(de_etichetat)} trimise la LLM (deja: {len(harta)})")

    clasif = clasifica
    if clasif is None:
        et = _load_eticheteaza()
        backend = et.get_backend()
        if progres:
            progres(f"backend={backend.name} url={backend.url} model={backend.model}")

        def clasif(batch_denumiri):
            return et.call(batch_denumiri, backend)[0]

    apeluri = 0
    valide = _valid_labels()
    nr_batch = (len(de_etichetat) + batch - 1) // batch
    for k in range(0, len(de_etichetat), batch):
        chunk = de_etichetat[k:k + batch]
        denumiri = [corpus[c]["denumire"] for c in chunk]
        codes = clasif(denumiri)
        apeluri += 1
        # zip() pairs keys with codes positionally; surplus items on either
        # side are ignored.
        for cheie, cod in zip(chunk, codes):
            cod = str(cod or "").strip().upper()
            # '?' / invalid code -> stays unlabelled (retried on the next run)
            if cod in valide:
                harta[cheie] = cod
        if progres:
            progres(f" batch {apeluri}/{nr_batch} "
                    f"-> total etichetat {sum(1 for c in harta if c in corpus)}")
        # Periodic checkpoint: protects the progress on the unstable GPU box.
        if checkpoint_every and apeluri % checkpoint_every == 0:
            _scrie_seed(seed_path, _construieste_seed(corpus, harta,
                                                      confidence=confidence, source=source))
        # Pause between batches: thermal breathing room for the GPU box
        # (shutdown under sustained load). Skipped after the last batch.
        if pauza and k + batch < len(de_etichetat):
            time.sleep(pauza)

    seed = _construieste_seed(corpus, harta, confidence=confidence, source=source)
    _scrie_seed(seed_path, seed)
    return {
        "brute": brute,
        "distincte": len(corpus),
        "deja_etichetate": reused,
        "de_etichetat": len(de_etichetat),
        "apeluri_llm": apeluri,
        "seed": len(seed),
    }


def _valid_labels() -> set[str]:
    """Return the set of labels accepted from the classifier (`eticheteaza.ALL_LABELS`)."""
    et = _load_eticheteaza()
    return set(et.ALL_LABELS)


def _construieste_seed(corpus, harta, *, confidence, source) -> list[dict]:
    """Build the seed entries, ordered by key so the output is stable between runs.

    Labels whose key is not in the current corpus are left out. A 'NUL' label
    is stored as `cod=None, is_nul=True`.
    """
    out = []
    for cheie in sorted(harta):
        if cheie not in corpus:
            continue  # label without a counterpart in the current corpus
        eticheta = harta[cheie]
        is_nul = eticheta == NUL_LABEL
        out.append({
            "denumire": corpus[cheie]["denumire"],
            "denumire_normalizata": cheie,
            "cod": None if is_nul else eticheta,
            "is_nul": is_nul,
            "source": source,
            "confidence": confidence,
        })
    return out


def _scrie_seed(seed_path: str, seed: list[dict]) -> None:
    """Write `seed` to `seed_path` as indented UTF-8 JSON, replacing it atomically.

    The data is written to a temporary file next to the destination, flushed
    to disk, and then moved over the destination with `os.replace`. The seed
    doubles as the cross-run label cache, so a crash in the middle of a
    checkpoint must not leave a truncated file behind: the destination always
    holds either the previous complete seed or the new one.
    """
    dest = os.path.abspath(seed_path)
    os.makedirs(os.path.dirname(dest), exist_ok=True)
    tmp_path = f"{dest}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as fh:
            json.dump(seed, fh, ensure_ascii=False, indent=2)
            fh.write("\n")
            fh.flush()
            os.fsync(fh.fileno())
        os.replace(tmp_path, dest)
    finally:
        # After a successful replace the temp file no longer exists; this only
        # cleans up when writing or replacing failed.
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # best-effort cleanup; the original error (if any) propagates


# --------------------------------------------------------------------------- #
# CLI                                                                         #
# --------------------------------------------------------------------------- #

def main(argv=None):
    """Command-line entry point: parse options, run `genereaza`, print the statistics."""
    ap = argparse.ArgumentParser(description="Genereaza seed etichetat operatie->cod (LM Studio).")
    ap.add_argument("--target-volum", type=float, default=0.9,
                    help="prag de acoperire pe volum (default 0.9 = D1)")
    ap.add_argument("--all", action="store_true",
                    help="eticheteaza tot corpusul, ignora pragul")
    ap.add_argument("--batch", type=int, default=32,
                    help="dimensiune batch (conservator: 32-40)")
    ap.add_argument("--pauza", type=float, default=1.5,
                    help="secunde de pauza intre batch-uri (ragaz termic GPU); 0 = fara")
    ap.add_argument("--checkpoint-every", type=int, default=1,
                    help="scrie seedul la fiecare N batch-uri (1 = dupa fiecare, crash-safe)")
    ap.add_argument("--confidence", type=float, default=DEFAULT_CONFIDENCE)
    ap.add_argument("--csv-glob", default=DEFAULT_CSV_GLOB)
    ap.add_argument("--labels", default=DEFAULT_LABELS)
    ap.add_argument("--seed", default=DEFAULT_SEED)
    args = ap.parse_args(argv)

    # Report a bad batch size as a usage error instead of a traceback.
    if args.batch < 1:
        ap.error(f"--batch trebuie sa fie >= 1 (primit: {args.batch})")

    csv_paths = sorted(glob.glob(args.csv_glob))
    if not csv_paths:
        ap.error(f"niciun CSV gasit la {args.csv_glob}")

    stats = genereaza(
        csv_paths,
        labels_path=args.labels,
        seed_path=args.seed,
        target_volum=args.target_volum,
        etichetare_all=args.all,
        batch=args.batch,
        pauza=args.pauza,
        checkpoint_every=args.checkpoint_every,
        confidence=args.confidence,
        progres=lambda m: print(m, flush=True),
    )
    print("GATA:", json.dumps(stats, ensure_ascii=False))


if __name__ == "__main__":
    main()