Headless cron enrichment system + progress checkpoint at 32%

OS cron fires enrich_wave.sh twice nightly (after the 23:00 UTC reset); each wave is capped at ~700 keys (~75% of one usage window) via enrichment_wave.py --prepare. Fully headless: one `claude -p` per batch, fanned out in parallel via xargs; flock-guarded and idempotent. DB updated to 9541 activities; .gitignore covers enrichment intermediates. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-01 21:26:35 +00:00
parent d6971e47f8
commit f7a37f91ec
6 changed files with 619 additions and 7 deletions
--- a/scripts/enrich_wave.sh
+++ b/scripts/enrich_wave.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+# ============================================================================
+# enrich_wave.sh — ONE throttled enrichment wave, fully headless (no Claude
+# session). Designed to be run by the LXC's OS cron at night.
+#
+#   - Prepares a bounded wave (first N missing keys) via enrichment_wave.py.
+#   - Runs ONE `claude -p` per batch file, PAR batches concurrently (OS-level
+#     parallelism — no Workflow tool, no 2-per-workflow cap, no session needed).
+#   - When the backlog is empty, runs --collect + --rebuild and stops.
+#
+# Throttle = --keys (default 700 ≈ 75% of a 5h usage window ≈ 950 keys).
+# A single flock guarantees waves never overlap.
+#
+# Usage:  scripts/enrich_wave.sh [KEYS] [PAR]
+#         KEYS = max keys this wave   (default 700)
+#         PAR  = concurrent claude -p (default 6)
+# ============================================================================
+# -u: referencing an unset variable is an error; pipefail: a pipeline fails if
+# any stage fails. -e is NOT enabled, so a failing command does not abort the
+# script by itself — failures that matter are checked explicitly.
+set -uo pipefail
+
+# Repository the wave operates in (the script cd's here before doing work).
+REPO="/workspace/game-library"
+# Directory that receives the per-run log files.
+LOG_DIR="/workspace/.claude-logs"
+# Lock file used with flock to keep waves from overlapping.
+LOCK="/tmp/enrich_wave.lock"
+# $1: max keys this wave (default 700).
+KEYS="${1:-700}"
+# $2: number of concurrent `claude -p` processes (default 6).
+PAR="${2:-6}"
+# Passed to `claude --max-turns` for every batch.
+MAX_TURNS=100
+
+# --- environment (cron has a minimal env) ---------------------------------- #
+# Fall back to a fixed HOME when cron did not provide one.
+export HOME="${HOME:-/home/claude}"
+# Source nvm when it is installed; its output is discarded.
+[ -f "$HOME/.nvm/nvm.sh" ] && . "$HOME/.nvm/nvm.sh" >/dev/null 2>&1
+# Prepend a pinned node bin directory and the standard system paths.
+export PATH="$HOME/.nvm/versions/node/v20.19.6/bin:/usr/local/bin:/usr/bin:/bin:$PATH"
+
+mkdir -p "$LOG_DIR"
+# One log file per run, named after the start timestamp.
+TS="$(date +%Y%m%d_%H%M%S)"
+LOG="$LOG_DIR/enrich_${TS}.log"
+
+# log MESSAGE... — print a time-stamped line to stdout and append it to $LOG.
+log() { echo "[$(date '+%H:%M:%S')] $*" | tee -a "$LOG"; }
+
+# --- single-instance lock: skip if a wave is still running ----------------- #
+# fd 9 stays open for the rest of the script, so the lock is held until this
+# process exits.
+exec 9>"$LOCK"
+if ! flock -n 9; then
+  log "another wave holds the lock; exiting."
+  exit 0
+fi
+
+cd "$REPO" || { log "cannot cd $REPO"; exit 1; }
+command -v claude >/dev/null 2>&1 || { log "claude CLI not on PATH"; exit 1; }
+
+log "=== enrichment wave start (keys=$KEYS par=$PAR) ==="
+
+# --- 1) prepare bounded wave (batch files only) ---------------------------- #
+# `set -e` is off, so capture the exit status explicitly: carrying on after a
+# failed --prepare would launch whatever stale batch files are still on disk.
+PREP_RC=0
+PREP="$(python3 scripts/enrichment_wave.py --prepare --keys "$KEYS" --no-shards 2>&1)" || PREP_RC=$?
+printf '%s\n' "$PREP" | tee -a "$LOG"
+if [ "$PREP_RC" -ne 0 ]; then
+  log "prepare failed (exit $PREP_RC); aborting wave."
+  exit 1
+fi
+
+# Match inside the shell rather than `echo | grep -q`: under pipefail, grep -q
+# exiting early can SIGPIPE the writer and turn a real match into a failure.
+if [[ "$PREP" == *"WAVE: COMPLETE"* ]]; then
+  log "backlog empty -> collect + rebuild"
+  # Do not announce completion unless both finishing steps actually succeeded.
+  if ! python3 scripts/run_enrichment.py --collect >>"$LOG" 2>&1; then
+    log "collect failed; see $LOG"
+    exit 1
+  fi
+  if ! python3 scripts/build_database.py --rebuild >>"$LOG" 2>&1; then
+    log "rebuild failed; see $LOG"
+    exit 1
+  fi
+  grep -E "enrichment .*matched" "$LOG" | tail -1 | tee -a "$LOG"
+  log "=== ENRICHMENT COMPLETE ==="
+  exit 0
+fi
+
+# --- 2) per-batch headless enrichment, PAR-way parallel -------------------- #
+# Load the prompt template into BATCH_PROMPT. The quoted 'EOP' delimiter keeps
+# the text literal (no $ or backtick expansion). `read -d ''` consumes the
+# whole heredoc and returns non-zero at EOF; that is harmless here because
+# `set -e` is not enabled. __BATCHFILE__ is a placeholder that run_one replaces
+# with the path of the batch file.
+read -r -d '' BATCH_PROMPT <<'EOP'
+You are an enrichment subagent in the game-library pipeline. Working dir: /workspace/game-library.
+
+Read `scripts/ENRICHMENT_PROMPT.md` FIRST — it defines the rules and output format EXACTLY (translate faithfully to Romanian; expand description_ro ONLY from the source chunk text; mark inferred filter fields in estimated_fields; fixed enum vocab).
+
+Your batch file is __BATCHFILE__ — it lists content_keys, one per line. For EACH key:
+1. IDEMPOTENT SKIP: if `data/enrichment_parts/<key>.json` already exists AND parses as valid JSON, SKIP it (do not rewrite).
+2. Otherwise read its prompt `data/enrichment_prompts/<key>.prompt.md`, produce the enrichment JSON per ENRICHMENT_PROMPT.md, and write it to `data/enrichment_parts/<key>.json` (MUST include the exact "content_key": "<key>").
+3. Validate it parses: python3 -c "import json;json.load(open('data/enrichment_parts/<key>.json'))".
+
+CRITICAL — JSON quote escaping: any literal ASCII double-quote inside a string value MUST be escaped as \". Romanian text uses „cuvant" where the closing mark is a plain ASCII " — written raw it breaks the JSON. Either keep the typographic „ " marks or escape every ASCII ". Re-read and re-validate each file; fix any that fail.
+
+Work through EVERY key in the batch file. If a key's prompt is missing, skip it and continue. When done, reply with one line: the count written and skipped.
+EOP
+
+export REPO LOG MAX_TURNS BATCH_PROMPT
+run_one() {
+  local bf="$1"
+  local name; name="$(basename "$bf")"
+  local prompt="${BATCH_PROMPT/__BATCHFILE__/$bf}"
+  cd "$REPO" || return 1
+  timeout 1200 claude -p "$prompt" \
+    --allowedTools "Bash(python3:*),Read,Write,Bash(cat:*),Bash(ls:*)" \
+    --max-turns "$MAX_TURNS" </dev/null >>"$LOG.$name.out" 2>&1
+  echo "[$(date '+%H:%M:%S')] done $name (exit $?)" >>"$LOG"
+}
+export -f run_one
+
+BATCHES=(data/enrichment_batches/batch_*.txt)
+# nullglob is not set, so an unmatched pattern stays in the array as a literal
+# string; launching on it would spend a claude run on a nonexistent file.
+if [ ! -e "${BATCHES[0]}" ]; then
+  log "no batch files under data/enrichment_batches; nothing to launch."
+  exit 1
+fi
+log "launching ${#BATCHES[@]} batches, $PAR concurrent..."
+# One `bash -c` per batch file, at most $PAR running at once; run_one is
+# available in the children because it was exported with `export -f`.
+printf '%s\n' "${BATCHES[@]}" | xargs -P "$PAR" -I{} bash -c 'run_one "$@"' _ {}
+
+# --- 3) summary ------------------------------------------------------------ #
+# Scan this run's per-batch outputs for the CLI's limit messages.
+if grep -qi "session limit\|usage limit" "$LOG".batch_*.out 2>/dev/null; then
+  log "WINDOW EXHAUSTED (usage limit hit mid-wave) — unfinished keys retry next fire."
+fi
+# Keep only the progress lines of the status report.
+STATUS="$(python3 scripts/enrichment_wave.py --status 2>&1 | grep -E 'good|missing|done')"
+echo "$STATUS" | tee -a "$LOG"
+log "=== wave done ==="
--- a/scripts/enrichment_wave.py
+++ b/scripts/enrichment_wave.py
@@ -0,0 +1,294 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+enrichment_wave.py — throttled, window-paced wave preparation for the corpus
+enrichment pipeline.
+
+The enrichment backlog (~9541 keys) does NOT fit in one 5-hour Anthropic usage
+window. Launching all remaining batches at once always runs the window to
+EXHAUSTION (the "subagent completed without calling StructuredOutput" signature),
+consuming 100% and blocking other work. There is no readable real-time window
+meter, so pacing must be BLIND: cap each wave to a fixed KEY COUNT (sized to
+~75% of empirical window capacity, ~950 keys), and let an external scheduler
+(cron, every 6h) space waves across windows.
+
+This script encapsulates the reconcile + bounded-wave preparation that used to
+live as ad-hoc inline Python. It does NOT call the LLM and does NOT launch
+workflows — it only prepares files on disk and prints what to launch.
+
+Modes:
+  --status                          read-only: print done / missing / pct
+  --prepare --keys N --shards K     drop corrupt parts; take the FIRST N missing
+                                    keys (sorted, deterministic); write batch
+                                    files for ONLY those; regenerate K shard JS
+                                    files covering exactly those batches; print
+                                    machine-greppable WAVE:/SHARD: lines.
+
+Idempotency: a key is "done" iff data/enrichment_parts/<key>.json exists AND
+parses. Re-running --prepare with the same args is deterministic (same sorted
+first-N keys), so a re-fire never reshuffles work. Parts on disk are the durable
+checkpoint.
+
+Output contract (parsed by the cron wave-runner):
+  WAVE: COMPLETE                                  -> backlog empty; run collect+rebuild
+  WAVE: PREPARED keys=.. batches=.. shards=.. remaining_after=..
+  SHARD: data/enrichment_wf/shard_0.js            -> one line per workflow to launch
+  ...
+
+Usage:
+    python3 scripts/enrichment_wave.py --status
+    python3 scripts/enrichment_wave.py --prepare --keys 700 --shards 8
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+# Directory holding this script, and its parent (treated as the repo root).
+SCRIPT_DIR = Path(__file__).resolve().parent
+REPO_ROOT = SCRIPT_DIR.parent
+
+# File-name suffixes: "<key>.prompt.md" for prompts, "<key>.json" for parts.
+PROMPT_SUFFIX = ".prompt.md"
+PART_SUFFIX = ".json"
+# Defaults for --batch-size, --keys and --shards respectively.
+BATCH_SIZE_DEFAULT = 12
+KEYS_DEFAULT = 700
+SHARDS_DEFAULT = 8
+
+# Resolved relative to REPO_ROOT so the script works from any cwd.
+DEF_PROMPTS = "data/enrichment_prompts"
+DEF_PARTS = "data/enrichment_parts"
+DEF_BATCHES = "data/enrichment_batches"
+DEF_WF = "data/enrichment_wf"
+# Shard template file name, looked up inside the workflow directory.
+TEMPLATE_NAME = "shard.js.tmpl"
+
+
+# --------------------------------------------------------------------------- #
+# Helpers
+# --------------------------------------------------------------------------- #
+def _abs(p: str) -> Path:
+    q = Path(p)
+    return q if q.is_absolute() else (REPO_ROOT / q)
+
+
+def part_ok(path: Path) -> bool:
+    """A part counts as done iff it parses as a JSON object."""
+    try:
+        return isinstance(json.load(open(path, encoding="utf-8")), dict)
+    except Exception:
+        return False
+
+
+def corrupt_parts(parts_dir: Path) -> list[Path]:
+    return [p for p in parts_dir.glob("*" + PART_SUFFIX) if not part_ok(p)]
+
+
+def compute_missing(prompts_dir: Path, parts_dir: Path) -> list[str]:
+    """Keys whose prompt exists but whose part is absent. Sorted = deterministic."""
+    missing = []
+    for pr in prompts_dir.glob("*" + PROMPT_SUFFIX):
+        key = pr.name[: -len(PROMPT_SUFFIX)]
+        if not (parts_dir / (key + PART_SUFFIX)).exists():
+            missing.append(key)
+    return sorted(missing)
+
+
+def count_done(prompts_dir: Path, parts_dir: Path) -> tuple[int, int]:
+    """(good_parts_with_prompt, total_prompts)."""
+    total = 0
+    good = 0
+    for pr in prompts_dir.glob("*" + PROMPT_SUFFIX):
+        total += 1
+        key = pr.name[: -len(PROMPT_SUFFIX)]
+        part = parts_dir / (key + PART_SUFFIX)
+        if part.exists() and part_ok(part):
+            good += 1
+    return good, total
+
+
+def write_batches(keys: list[str], batches_dir: Path, size: int) -> int:
+    """Replace all batch_*.txt with fresh files of <= size keys. Returns NB."""
+    batches_dir.mkdir(parents=True, exist_ok=True)
+    for old in batches_dir.glob("batch_*.txt"):
+        old.unlink()
+    nb = 0
+    for i in range(0, len(keys), size):
+        chunk = keys[i : i + size]
+        (batches_dir / f"batch_{nb:04d}.txt").write_text(
+            "\n".join(chunk) + "\n", encoding="utf-8"
+        )
+        nb += 1
+    return nb
+
+
+def shard_ranges(nb: int, k: int) -> list[tuple[int, int]]:
+    """Split [0,nb) into k contiguous, disjoint, total-covering ranges.
+
+    Even distribution: the first (nb % k) shards carry one extra batch. When
+    nb < k the trailing ranges are empty [x,x) and are dropped by the caller.
+    """
+    if nb <= 0 or k <= 0:
+        return []
+    base, extra = divmod(nb, k)
+    ranges = []
+    start = 0
+    for i in range(k):
+        length = base + (1 if i < extra else 0)
+        ranges.append((start, start + length))
+        start += length
+    return ranges
+
+
+def render_shard(template: str, shard: int, start: int, end: int, nshards: int) -> str:
+    return (
+        template.replace("__SHARD__", str(shard))
+        .replace("__START__", str(start))
+        .replace("__END__", str(end))
+        .replace("__NSHARDS__", str(nshards))
+    )
+
+
+def write_shards(ranges: list[tuple[int, int]], template: str, wf_dir: Path) -> list[Path]:
+    """Delete stale shard_*.js, then write one per NON-EMPTY range. Returns paths."""
+    wf_dir.mkdir(parents=True, exist_ok=True)
+    for old in wf_dir.glob("shard_*.js"):
+        old.unlink()
+    non_empty = [(i, s, e) for i, (s, e) in enumerate(ranges) if e > s]
+    nshards = len(non_empty)
+    paths = []
+    # Re-index shards 0..nshards-1 so labels/meta stay contiguous even if some
+    # trailing ranges were empty (tiny final wave with fewer batches than K).
+    for new_idx, (_, s, e) in enumerate(non_empty):
+        path = wf_dir / f"shard_{new_idx}.js"
+        path.write_text(
+            render_shard(template, new_idx, s, e, nshards), encoding="utf-8"
+        )
+        paths.append(path)
+    return paths
+
+
+def rel(path: Path) -> str:
+    try:
+        return str(path.relative_to(REPO_ROOT))
+    except ValueError:
+        return str(path)
+
+
+# --------------------------------------------------------------------------- #
+# Modes
+# --------------------------------------------------------------------------- #
+def cmd_status(prompts_dir: Path, parts_dir: Path) -> int:
+    good, total = count_done(prompts_dir, parts_dir)
+    parts_on_disk = len(list(parts_dir.glob("*" + PART_SUFFIX)))
+    bad = len(corrupt_parts(parts_dir))
+    missing = total - good
+    pct = (100.0 * good / total) if total else 0.0
+    print("=== enrichment status ===")
+    print(f"prompts (universe) : {total}")
+    print(f"parts on disk      : {parts_on_disk}")
+    print(f"good (done)        : {good}")
+    print(f"corrupt parts      : {bad}  (reported only; --prepare drops them)")
+    print(f"missing            : {missing}")
+    print(f"done               : {pct:.1f}%")
+    if total:
+        print(f"WAVE: {'COMPLETE' if missing == 0 else 'PENDING'} missing={missing}")
+    return 0
+
+
+def cmd_prepare(
+    prompts_dir: Path,
+    parts_dir: Path,
+    batches_dir: Path,
+    wf_dir: Path,
+    keys: int,
+    shards: int,
+    batch_size: int,
+    make_shards: bool = True,
+) -> int:
+    """Prepare one bounded wave on disk and print the WAVE:/SHARD: lines.
+
+    Args:
+        prompts_dir: directory of ``<key>.prompt.md`` files.
+        parts_dir: directory of ``<key>.json`` parts; corrupt ones are deleted.
+        batches_dir: directory whose ``batch_*.txt`` files are rewritten.
+        wf_dir: directory holding the shard template and the ``shard_*.js``
+            output.
+        keys: maximum number of missing keys to take for this wave.
+        shards: number of shard ranges to split the batches into.
+        batch_size: maximum keys per batch file.
+        make_shards: when False, no template is required and no shard files
+            are written.
+
+    Returns:
+        2 when shards are requested but the template file is missing (checked
+        before anything on disk is changed); 0 otherwise.
+    """
+    template = ""
+    if make_shards:
+        # Fail before mutating anything if the template is not there.
+        template_path = wf_dir / TEMPLATE_NAME
+        if not template_path.is_file():
+            print(f"ERROR: missing shard template {rel(template_path)}", file=sys.stderr)
+            return 2
+        template = template_path.read_text(encoding="utf-8")
+
+    # 1) drop corrupt parts (only mutation to parts/)
+    dropped = 0
+    for p in corrupt_parts(parts_dir):
+        p.unlink()
+        dropped += 1
+
+    # 2) compute missing (deterministic)
+    # Runs after the drop above, so keys whose part was corrupt count as missing.
+    missing = compute_missing(prompts_dir, parts_dir)
+
+    # 3) empty -> COMPLETE sentinel, no files written
+    if not missing:
+        print(f"dropped_corrupt={dropped}")
+        print("WAVE: COMPLETE")
+        return 0
+
+    # 4) clamp to first N
+    take = missing[:keys]
+
+    # 5) batches for ONLY those keys
+    nb = write_batches(take, batches_dir, batch_size)
+
+    # 6) shard scripts covering exactly those batches (skipped on the bash path)
+    paths = []
+    if make_shards:
+        ranges = shard_ranges(nb, shards)
+        paths = write_shards(ranges, template, wf_dir)
+
+    # Keys left in the backlog once this wave's keys are done.
+    remaining_after = len(missing) - len(take)
+    print(f"dropped_corrupt={dropped}")
+    print(
+        f"WAVE: PREPARED keys={len(take)} batches={nb} "
+        f"shards={len(paths)} remaining_after={remaining_after}"
+    )
+    for p in paths:
+        print(f"SHARD: {rel(p)}")
+    return 0
+
+
+def main(argv=None) -> int:
+    """Parse the command line and run the selected mode.
+
+    ``argv`` is handed to argparse (``None`` means ``sys.argv[1:]``).
+    ``--prepare`` runs ``cmd_prepare``; anything else, including no mode flag
+    at all, runs ``cmd_status``. When both flags are given, ``--prepare`` wins.
+
+    Returns the process exit code: 2 for a missing prompts directory or an
+    argument below 1, otherwise whatever the selected mode returns.
+    """
+    ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument("--status", action="store_true", help="read-only progress report")
+    ap.add_argument("--prepare", action="store_true", help="prepare one bounded wave")
+    ap.add_argument("--keys", type=int, default=KEYS_DEFAULT, help=f"max keys this wave (default {KEYS_DEFAULT})")
+    ap.add_argument("--shards", type=int, default=SHARDS_DEFAULT, help=f"workflow shards (default {SHARDS_DEFAULT})")
+    ap.add_argument("--batch-size", type=int, default=BATCH_SIZE_DEFAULT, help=f"keys per batch (default {BATCH_SIZE_DEFAULT})")
+    ap.add_argument("--no-shards", action="store_true", help="prepare batch files only; skip shard JS generation (bash/headless path)")
+    ap.add_argument("--prompts", default=DEF_PROMPTS)
+    ap.add_argument("--parts", default=DEF_PARTS)
+    ap.add_argument("--batches", default=DEF_BATCHES)
+    ap.add_argument("--wf-dir", default=DEF_WF)
+    args = ap.parse_args(argv)
+
+    # Relative directory arguments are resolved against the repo root.
+    prompts_dir = _abs(args.prompts)
+    parts_dir = _abs(args.parts)
+    batches_dir = _abs(args.batches)
+    wf_dir = _abs(args.wf_dir)
+
+    if not prompts_dir.is_dir():
+        print(f"ERROR: prompts dir not found: {rel(prompts_dir)}", file=sys.stderr)
+        return 2
+    # The parts directory is created on demand, in status mode as well.
+    parts_dir.mkdir(parents=True, exist_ok=True)
+
+    # --shards is validated even when --no-shards is given.
+    if args.keys < 1 or args.shards < 1 or args.batch_size < 1:
+        print("ERROR: --keys/--shards/--batch-size must be >= 1", file=sys.stderr)
+        return 2
+
+    if args.prepare:
+        return cmd_prepare(
+            prompts_dir, parts_dir, batches_dir, wf_dir,
+            args.keys, args.shards, args.batch_size,
+            make_shards=not args.no_shards,
+        )
+    # default to status
+    return cmd_status(prompts_dir, parts_dir)
+
+
+# Script entry point: exit with main()'s return code.
+if __name__ == "__main__":
+    raise SystemExit(main())