#!/bin/bash
# ============================================================================
# enrich_wave.sh — ONE throttled enrichment wave, fully headless (no Claude
# session). Designed to be run by the LXC's OS cron at night.
#
#  - Prepares a bounded wave (first N missing keys) via enrichment_wave.py.
#  - Runs ONE `claude -p` per batch file, PAR batches concurrently (OS-level
#    parallelism — no Workflow tool, no 2-per-workflow cap, no session needed).
#  - When the backlog is empty, runs --collect + --rebuild and stops.
#
# Throttle = --keys (default 700 ≈ 75% of a 5h usage window ≈ 950 keys).
# A single flock guarantees waves never overlap.
#
# Usage: scripts/enrich_wave.sh [KEYS] [PAR]
#   KEYS = max keys this wave (default 700)
#   PAR  = concurrent claude -p (default 6)
#
# Exit status: 0 when the wave ran (or another wave holds the lock),
#              1 on setup failure or when prepare/collect/rebuild fails.
# ============================================================================

# No `set -e` on purpose: `read -d ''` returns non-zero at EOF and a failing
# batch must not abort the whole wave. Critical steps are checked explicitly.
set -uo pipefail

REPO="/workspace/game-library"
LOG_DIR="/workspace/.claude-logs"
LOCK="/tmp/enrich_wave.lock"
KEYS="${1:-700}"
PAR="${2:-6}"
MAX_TURNS=100

# --- environment (cron has a minimal env) ---------------------------------- #
export HOME="${HOME:-/home/claude}"
[ -f "$HOME/.nvm/nvm.sh" ] && . "$HOME/.nvm/nvm.sh" >/dev/null 2>&1
export PATH="$HOME/.nvm/versions/node/v20.19.6/bin:/usr/local/bin:/usr/bin:/bin:$PATH"

mkdir -p "$LOG_DIR" || { echo "cannot create $LOG_DIR" >&2; exit 1; }
TS="$(date +%Y%m%d_%H%M%S)"
LOG="$LOG_DIR/enrich_${TS}.log"

# Print a timestamped message to stdout and append it to the wave log.
log() {
  printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$*" | tee -a "$LOG"
}

# Reject non-numeric arguments up front instead of letting python3 / xargs
# fail later with a less obvious message.
if ! [[ "$KEYS" =~ ^[0-9]+$ && "$PAR" =~ ^[0-9]+$ ]]; then
  log "KEYS and PAR must be non-negative integers (got keys=$KEYS par=$PAR)"
  exit 1
fi

# --- single-instance lock: skip if a wave is still running ----------------- #
# fd 9 stays open for the lifetime of the script, so the lock is released
# automatically when the script exits.
exec 9>"$LOCK"
if ! flock -n 9; then
  log "another wave holds the lock; exiting."
  exit 0
fi

cd "$REPO" || { log "cannot cd $REPO"; exit 1; }
command -v claude >/dev/null 2>&1 || { log "claude CLI not on PATH"; exit 1; }

log "=== enrichment wave start (keys=$KEYS par=$PAR) ==="

# --- 1) prepare bounded wave (batch files only) ---------------------------- #
PREP="$(python3 scripts/enrichment_wave.py --prepare --keys "$KEYS" --no-shards 2>&1)"
prep_rc=$?
printf '%s\n' "$PREP" | tee -a "$LOG"

if grep -q "WAVE: COMPLETE" <<<"$PREP"; then
  log "backlog empty -> collect + rebuild"
  if ! python3 scripts/run_enrichment.py --collect >>"$LOG" 2>&1; then
    log "collect failed; see $LOG"
    exit 1
  fi
  if ! python3 scripts/build_database.py --rebuild >>"$LOG" 2>&1; then
    log "rebuild failed; see $LOG"
    exit 1
  fi
  # Surface the last "matched" summary line from the rebuild output.
  matched_line="$(grep -E "enrichment .*matched" "$LOG" | tail -n 1)"
  if [ -n "$matched_line" ]; then
    printf '%s\n' "$matched_line" | tee -a "$LOG"
  fi
  log "=== ENRICHMENT COMPLETE ==="
  exit 0
fi

# A failed prepare may leave stale batch files from an earlier wave behind;
# do not launch anything on top of them.
if (( prep_rc != 0 )); then
  log "prepare failed (exit $prep_rc); aborting wave."
  exit 1
fi

# --- 2) per-batch headless enrichment, PAR-way parallel -------------------- #
# `read -d ''` slurps the whole here-document and returns non-zero at EOF;
# that status is expected, hence `|| true`. __BATCHFILE__ is substituted per
# batch in run_one; <key> stands for each content_key listed in the batch file.
read -r -d '' BATCH_PROMPT <<'EOP' || true
You are an enrichment subagent in the game-library pipeline. Working dir:
/workspace/game-library.

Read `scripts/ENRICHMENT_PROMPT.md` FIRST — it defines the rules and output
format EXACTLY (translate faithfully to Romanian; expand description_ro ONLY
from the source chunk text; mark inferred filter fields in estimated_fields;
fixed enum vocab).

Your batch file is __BATCHFILE__ — it lists content_keys, one per line.

For EACH key:
  1. IDEMPOTENT SKIP: if `data/enrichment_parts/<key>.json` already exists AND
     parses as valid JSON, SKIP it (do not rewrite).
  2. Otherwise read its prompt `data/enrichment_prompts/<key>.prompt.md`,
     produce the enrichment JSON per ENRICHMENT_PROMPT.md, and write it to
     `data/enrichment_parts/<key>.json` (MUST include the exact
     "content_key": "<key>").
  3. Validate it parses:
     python3 -c "import json;json.load(open('data/enrichment_parts/<key>.json'))".

CRITICAL — JSON quote escaping: any literal ASCII double-quote inside a string
value MUST be escaped as \". Romanian text uses „cuvant" where the closing mark
is a plain ASCII " — written raw it breaks the JSON. Either keep the typographic
„ " marks or escape every ASCII ". Re-read and re-validate each file; fix any
that fail.

Work through EVERY key in the batch file. If a key's prompt is missing, skip it
and continue. When done, reply with one line: the count written and skipped.
EOP

# run_one and the variables it reads must be exported because xargs runs it
# in fresh `bash -c` child processes.
export REPO LOG MAX_TURNS BATCH_PROMPT

#######################################
# Run one headless `claude -p` enrichment pass for a single batch file.
# Globals:   REPO, LOG, MAX_TURNS, BATCH_PROMPT (all read)
# Arguments: $1 - path to the batch file (relative to REPO)
# Outputs:   claude output goes to "$LOG.<batch name>.out"; one "done" line
#            carrying the real exit status is appended to "$LOG"
# Returns:   1 if REPO is unreachable, otherwise 0. The batch status is only
#            logged, not returned: an exit status of 255 would make xargs
#            stop dispatching the remaining batches.
#######################################
run_one() {
  local bf="$1"
  local name prompt rc
  name="$(basename "$bf")"
  prompt="${BATCH_PROMPT/__BATCHFILE__/$bf}"

  cd "$REPO" || return 1

  # Hard cap of 1200 s per batch so a hung session cannot block the wave.
  timeout 1200 claude -p "$prompt" \
    --allowedTools "Bash(python3:*),Read,Write,Bash(cat:*),Bash(ls:*)" \
    --max-turns "$MAX_TURNS" >"$LOG.$name.out" 2>&1
  # Capture the status immediately: the $(date) substitution in the log line
  # below would otherwise overwrite $? before it is expanded.
  rc=$?

  echo "[$(date '+%H:%M:%S')] done $name (exit $rc)" >>"$LOG"
  return 0
}
export -f run_one

# nullglob only for this expansion: without it an unmatched pattern stays
# literal and a bogus "batch" would be launched. It is switched off again so
# the grep further down never ends up with zero file arguments.
shopt -s nullglob
BATCHES=(data/enrichment_batches/batch_*.txt)
shopt -u nullglob

if (( ${#BATCHES[@]} == 0 )); then
  log "no batch files found in data/enrichment_batches; nothing to launch."
else
  log "launching ${#BATCHES[@]} batches, $PAR concurrent..."
  # NUL-delimited hand-off keeps paths intact whatever characters they hold.
  printf '%s\0' "${BATCHES[@]}" \
    | xargs -0 -P "$PAR" -I{} bash -c 'run_one "$@"' _ {}
fi

# --- 3) summary ------------------------------------------------------------ #
# 2>/dev/null is intentional: there may be no .out files to scan.
if grep -qi "session limit\|usage limit" "$LOG".batch_*.out 2>/dev/null; then
  log "WINDOW EXHAUSTED (usage limit hit mid-wave) — unfinished keys retry next fire."
fi

STATUS="$(python3 scripts/enrichment_wave.py --status 2>&1 | grep -E 'good|missing|done')"
echo "$STATUS" | tee -a "$LOG"

log "=== wave done ==="