#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ run_enrichment.py — enrichment orchestrator (plan Part B3). Mirror of run_extraction.py, on the *other* side of the rebuild. It reads the already-rebuilt data/activities.db, and for every activity emits one subagent prompt asking for a single bilingual + inferred-filter enrichment pass. Like extraction, this script does NOT call the LLM — the interactive Claude Code orchestrator launches waves of subagents on the emitted prompts. Keying is the crux (plan §"Cheia de keying"): each row's overlay is keyed on import_common.content_key(normalized_name, language, _normalize_text(description)) — the SAME function build_database uses to apply the overlay. The key is stable only while the extraction text is frozen, so enrichment runs AFTER the freezing rebuild. Modes: (default) emit one prompt per activity that has no enrichment part yet (resumable: data/enrichment_parts/.json present => skip) --collect merge data/enrichment_parts/*.json -> data/enrichment.json Pilot scoping (plan B5): --source and/or --limit N narrow the emitted prompts to a single source / category for the sign-off pilot. Usage: python scripts/run_enrichment.py --source teambuilding_corbu # pilot python scripts/run_enrichment.py # all rows python scripts/run_enrichment.py --collect # merge parts """ from __future__ import annotations import argparse import json import sqlite3 import sys from pathlib import Path from typing import Optional SCRIPT_DIR = Path(__file__).resolve().parent REPO_ROOT = SCRIPT_DIR.parent for _p in (str(SCRIPT_DIR), str(REPO_ROOT)): if _p not in sys.path: sys.path.insert(0, _p) from import_common import ( # noqa: E402 content_key, find_chunk_text, normalize_name, ) ENRICHMENT_PROMPT = SCRIPT_DIR / "ENRICHMENT_PROMPT.md" # Columns pulled from the DB into the prompt as the "current value" context. _DB_COLUMNS = ( "id", "name", "description", "rules", "variations", "category", "content_type", "language", "normalized_name", "page_reference", "source_id", "chunk_key", "participants_min", "participants_max", "duration_min", "duration_max", "age_group_min", "age_group_max", ) # How much source-chunk text to inline. Chunks are page-sized; cap so a dense # chunk does not blow the prompt up, but keep enough to ground the expansion. _CHUNK_TEXT_CAP = 12000 def _fetch_rows(db_path: Path, source_substr: Optional[str]) -> list[dict]: conn = sqlite3.connect(db_path) conn.row_factory = sqlite3.Row try: cols = ", ".join(_DB_COLUMNS) sql = f"SELECT {cols} FROM activities" params: list = [] if source_substr: sql += " WHERE (source_id LIKE ? OR chunk_key LIKE ?)" params = [f"%{source_substr}%", f"%{source_substr}%"] sql += " ORDER BY source_id, id" return [dict(r) for r in conn.execute(sql, params).fetchall()] finally: conn.close() def _row_content_key(row: dict) -> str: return content_key( row.get("normalized_name") or normalize_name(row.get("name") or ""), row.get("language"), row.get("description") or "", ) def _chunk_text_for_row(row: dict, chunks_dir: Path) -> Optional[str]: """Locate the source-chunk text via the row's chunk_key / source_id.""" header = {"chunk_key": row.get("chunk_key"), "source_id": row.get("source_id")} if not header["chunk_key"]: return None # find_chunk_text resolves from the header when chunk_key is present; # the json_path arg is only a fallback, so a synthetic path is fine. text = find_chunk_text(Path(f"{row['chunk_key']}.json"), header, chunks_dir) if text and len(text) > _CHUNK_TEXT_CAP: text = text[:_CHUNK_TEXT_CAP] + "\n…[chunk truncated]…" return text def _current_fields_block(row: dict) -> str: """The activity's current DB values, as a compact JSON block for context.""" fields = { "name": row.get("name"), "description": row.get("description"), "rules": row.get("rules"), "variations": row.get("variations"), "category": row.get("category"), "content_type": row.get("content_type"), "language": row.get("language"), "participants_min": row.get("participants_min"), "participants_max": row.get("participants_max"), "duration_min": row.get("duration_min"), "duration_max": row.get("duration_max"), "age_group_min": row.get("age_group_min"), "age_group_max": row.get("age_group_max"), } return json.dumps(fields, ensure_ascii=False, indent=2) def emit_enrichment_prompt( row: dict, key: str, chunks_dir: Path, prompts_dir: Path ) -> Path: """Write the subagent enrichment prompt for one activity.""" chunk_text = _chunk_text_for_row(row, chunks_dir) source_block = ( chunk_text if chunk_text is not None else "[source chunk text unavailable — translate only what is given " "above; do NOT invent steps, and mark any inferred filter field " "as estimated]" ) part_path = f"data/enrichment_parts/{key}.json" text = "\n".join([ f"# ENRICHMENT — activity `{row.get('name')}` (id {row.get('id')})", "", f"Follow the rules in `{ENRICHMENT_PROMPT.relative_to(REPO_ROOT)}` EXACTLY.", "Single pass. Translate faithfully to Romanian; expand description_ro " "ONLY from the source chunk text below; mark inferred filter fields in " "`estimated_fields`.", "", f"Write the result JSON to: `{part_path}`", f'It MUST include `"content_key": "{key}"`.', f'Page reference: {row.get("page_reference") or "?"}', "", "## Current activity values (the text to translate / enrich)", "```json", _current_fields_block(row), "```", "", "## Source chunk text (ground description_ro expansion in THIS only)", "```", source_block, "```", "", ]) prompts_dir.mkdir(parents=True, exist_ok=True) out = prompts_dir / f"{key}.prompt.md" out.write_text(text, encoding="utf-8") return out def collect_enrichment(parts_dir: Path, out_path: Path) -> dict: """Merge data/enrichment_parts/*.json into one flat content_key map.""" merged: dict = {} bad: list[str] = [] if parts_dir.is_dir(): for part in sorted(parts_dir.glob("*.json")): try: data = json.loads(part.read_text(encoding="utf-8")) except (json.JSONDecodeError, OSError): bad.append(part.name) continue key = data.get("content_key") or part.stem entry = {k: v for k, v in data.items() if k != "content_key"} merged[key] = entry out_path.write_text( json.dumps(merged, ensure_ascii=False, indent=2), encoding="utf-8" ) return {"entries": len(merged), "bad_parts": bad, "out": str(out_path)} def run_emit( *, db_path: Path, chunks_dir: Path, parts_dir: Path, prompts_dir: Path, source_substr: Optional[str], limit: Optional[int], ) -> dict: rows = _fetch_rows(db_path, source_substr) emitted, skipped = 0, 0 for row in rows: key = _row_content_key(row) if (parts_dir / f"{key}.json").is_file(): skipped += 1 continue emit_enrichment_prompt(row, key, chunks_dir, prompts_dir) emitted += 1 if limit and emitted >= limit: break return { "rows": len(rows), "emitted": emitted, "skipped_done": skipped, "prompts_dir": str(prompts_dir), } def main(argv: Optional[list[str]] = None) -> int: parser = argparse.ArgumentParser(description="Enrichment orchestrator.") parser.add_argument("--db", default="data/activities.db") parser.add_argument("--chunks", default="data/chunks") parser.add_argument("--parts", default="data/enrichment_parts") parser.add_argument("--prompts", default="data/enrichment_prompts") parser.add_argument("--out", default="data/enrichment.json") parser.add_argument("--source", default=None, help="only rows whose source_id/chunk_key contains this (pilot)") parser.add_argument("--limit", type=int, default=None, help="cap emitted prompts (pilot)") parser.add_argument("--collect", action="store_true", help="merge enrichment parts into the overlay JSON") args = parser.parse_args(argv) print("=" * 60) print("ENRICHMENT ORCHESTRATOR") print("=" * 60) if args.collect: result = collect_enrichment(Path(args.parts), Path(args.out)) print(f"collected : {result['entries']} entries -> {result['out']}") if result["bad_parts"]: print(f"bad parts : {len(result['bad_parts'])} (skipped)") for name in result["bad_parts"]: print(f" - {name}") print("Run build_database.py --rebuild to apply the overlay.") print("=" * 60) return 0 summary = run_emit( db_path=Path(args.db), chunks_dir=Path(args.chunks), parts_dir=Path(args.parts), prompts_dir=Path(args.prompts), source_substr=args.source, limit=args.limit, ) print(f"rows in DB : {summary['rows']}" + (f" (filtered by '{args.source}')" if args.source else "")) print(f"already enriched : {summary['skipped_done']}") print(f"prompts emitted : {summary['emitted']}") if summary["emitted"]: print(f"prompts dir : {summary['prompts_dir']}/") print("Launch waves of ~8-16 Sonnet subagents on those prompts, each " "writing data/enrichment_parts/.json, then run " "run_enrichment.py --collect and build_database.py --rebuild.") else: print("Nothing to emit — run --collect then build_database.py --rebuild.") print("=" * 60) return 0 if __name__ == "__main__": raise SystemExit(main())