OS cron fires enrich_wave.sh twice nightly (post 23:00 UTC reset); each wave caps at ~700 keys (~75% window) via enrichment_wave.py --prepare. Fully headless: one claude -p per batch via xargs, flock-guarded, idempotent. DB updated to 9541 activities; .gitignore covers enrichment intermediates. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
103 lines
4.8 KiB
Bash
Executable File
103 lines
4.8 KiB
Bash
Executable File
#!/bin/bash
|
|
# ============================================================================
|
|
# enrich_wave.sh — ONE throttled enrichment wave, fully headless (no Claude
|
|
# session). Designed to be run by the LXC's OS cron at night.
|
|
#
|
|
# - Prepares a bounded wave (first N missing keys) via enrichment_wave.py.
|
|
# - Runs ONE `claude -p` per batch file, PAR batches concurrently (OS-level
|
|
# parallelism — no Workflow tool, no 2-per-workflow cap, no session needed).
|
|
# - When the backlog is empty, runs --collect + --rebuild and stops.
|
|
#
|
|
# Throttle = --keys (default 700, i.e. ≈ 75% of a 5h usage window of ≈ 950 keys).
|
|
# A single flock guarantees waves never overlap.
|
|
#
|
|
# Usage: scripts/enrich_wave.sh [KEYS] [PAR]
|
|
# KEYS = max keys this wave (default 700)
|
|
# PAR = concurrent claude -p (default 6)
|
|
# ============================================================================
|
|
# Unset variables are errors and a pipeline fails if any stage fails.
# `-e` is NOT enabled, so failures that matter are checked explicitly.
set -uo pipefail

# --- configuration ---------------------------------------------------------- #
REPO="/workspace/game-library"      # repository the wave runs in
LOG_DIR="/workspace/.claude-logs"   # one log file per wave is written here
LOCK="/tmp/enrich_wave.lock"        # lock file used for the single-instance guard
KEYS="${1:-700}"                    # arg 1: max keys this wave
PAR="${2:-6}"                       # arg 2: concurrent claude -p processes
MAX_TURNS=100                       # forwarded to `claude --max-turns`

# is_uint VALUE -> status 0 when VALUE is a non-empty string of decimal
# digits, non-zero otherwise.
is_uint() {
  [[ "${1-}" =~ ^[0-9]+$ ]]
}

# Reject malformed arguments up front. Without -e a bad value would only make
# the later python/xargs call fail while the script carried on to "wave done".
# The log file does not exist yet, so the message goes to stderr.
if ! is_uint "$KEYS"; then
  echo "enrich_wave.sh: KEYS must be a non-negative integer (got '$KEYS')" >&2
  exit 2
fi
if ! is_uint "$PAR"; then
  echo "enrich_wave.sh: PAR must be a non-negative integer (got '$PAR')" >&2
  exit 2
fi

# --- environment (cron has a minimal env) ---------------------------------- #
export HOME="${HOME:-/home/claude}"
# Load nvm when it is installed; its output is discarded.
[ -f "$HOME/.nvm/nvm.sh" ] && . "$HOME/.nvm/nvm.sh" >/dev/null 2>&1
export PATH="$HOME/.nvm/versions/node/v20.19.6/bin:/usr/local/bin:/usr/bin:/bin:$PATH"

mkdir -p "$LOG_DIR"
TS="$(date +%Y%m%d_%H%M%S)"
LOG="$LOG_DIR/enrich_${TS}.log"
|
|
|
|
# log MESSAGE... — print "[HH:MM:SS] MESSAGE" to stdout and append the same
# line to the wave log file "$LOG".
log() {
  local stamp
  stamp="$(date '+%H:%M:%S')"
  printf '[%s] %s\n' "$stamp" "$*" | tee -a "$LOG"
}
|
|
|
|
# --- single-instance lock: skip if a wave is still running ----------------- #
# A missing flock binary or an unopenable lock file would make `flock -n 9`
# fail too, which would be misreported as "another wave holds the lock" with
# exit 0. Detect both cases first and report them as real errors.
command -v flock >/dev/null 2>&1 || { log "flock not on PATH"; exit 1; }
if ! exec 9>"$LOCK"; then
  log "cannot open lock file $LOCK"
  exit 1
fi
# Non-blocking: if the lock is already held, this fire is a deliberate no-op.
if ! flock -n 9; then
  log "another wave holds the lock; exiting."
  exit 0
fi

# --- preflight -------------------------------------------------------------- #
cd "$REPO" || { log "cannot cd $REPO"; exit 1; }
command -v claude >/dev/null 2>&1 || { log "claude CLI not on PATH"; exit 1; }

log "=== enrichment wave start (keys=$KEYS par=$PAR) ==="
|
|
|
|
# --- 1) prepare bounded wave (batch files only) ---------------------------- #
# Capture output and exit status separately; the output is echoed to the
# console and appended to the wave log.
PREP="$(python3 scripts/enrichment_wave.py --prepare --keys "$KEYS" --no-shards 2>&1)"
prep_rc=$?
printf '%s\n' "$PREP" | tee -a "$LOG"

# Match inside the shell rather than with `echo | grep -q`: under pipefail,
# grep -q exiting early can kill the writer with SIGPIPE and make the whole
# pipeline report failure even though the marker was found.
if [[ "$PREP" == *"WAVE: COMPLETE"* ]]; then
  log "backlog empty -> collect + rebuild"
  if ! python3 scripts/run_enrichment.py --collect >>"$LOG" 2>&1; then
    log "collect failed; see $LOG"
    exit 1
  fi
  if ! python3 scripts/build_database.py --rebuild >>"$LOG" 2>&1; then
    log "rebuild failed; see $LOG"
    exit 1
  fi
  # Re-surface the last "enrichment ... matched" line from the log. It is read
  # into a variable first so the log is not read and appended to by the same
  # pipeline.
  matched_line="$(grep -E "enrichment .*matched" "$LOG" | tail -1)"
  if [[ -n "$matched_line" ]]; then
    printf '%s\n' "$matched_line" | tee -a "$LOG"
  fi
  log "=== ENRICHMENT COMPLETE ==="
  exit 0
fi

# NOTE(review): assumes enrichment_wave.py exits non-zero only on failure —
# confirm. Without this guard a failed prepare would go on to launch whatever
# batch files already exist on disk.
if (( prep_rc != 0 )); then
  log "prepare failed (exit $prep_rc); aborting wave."
  exit 1
fi
|
|
|
|
# --- 2) per-batch headless enrichment, PAR-way parallel -------------------- #
|
|
# Prompt template given to each headless `claude -p` worker. The quoted
# heredoc delimiter keeps the text literal (no expansion). __BATCHFILE__ is a
# placeholder that run_one replaces with the batch file path.
# `read -d ''` reads to end of input and therefore returns non-zero; that is
# harmless because `-e` is not enabled.
read -r -d '' BATCH_PROMPT <<'PROMPT_END'
You are an enrichment subagent in the game-library pipeline. Working dir: /workspace/game-library.

Read `scripts/ENRICHMENT_PROMPT.md` FIRST — it defines the rules and output format EXACTLY (translate faithfully to Romanian; expand description_ro ONLY from the source chunk text; mark inferred filter fields in estimated_fields; fixed enum vocab).

Your batch file is __BATCHFILE__ — it lists content_keys, one per line. For EACH key:
1. IDEMPOTENT SKIP: if `data/enrichment_parts/<key>.json` already exists AND parses as valid JSON, SKIP it (do not rewrite).
2. Otherwise read its prompt `data/enrichment_prompts/<key>.prompt.md`, produce the enrichment JSON per ENRICHMENT_PROMPT.md, and write it to `data/enrichment_parts/<key>.json` (MUST include the exact "content_key": "<key>").
3. Validate it parses: python3 -c "import json;json.load(open('data/enrichment_parts/<key>.json'))".

CRITICAL — JSON quote escaping: any literal ASCII double-quote inside a string value MUST be escaped as \". Romanian text uses „cuvant" where the closing mark is a plain ASCII " — written raw it breaks the JSON. Either keep the typographic „ " marks or escape every ASCII ". Re-read and re-validate each file; fix any that fail.

Work through EVERY key in the batch file. If a key's prompt is missing, skip it and continue. When done, reply with one line: the count written and skipped.
PROMPT_END
|
|
|
|
# run_one is invoked through `bash -c` child shells, so both the function and
# the variables it reads must be exported.
export REPO LOG MAX_TURNS BATCH_PROMPT

#######################################
# Run one headless `claude -p` over a single batch file.
# Globals:   REPO, LOG, MAX_TURNS, BATCH_PROMPT (all read)
# Arguments: $1 - path of the batch file
# Outputs:   claude's stdout/stderr appended to "$LOG.<batch basename>.out";
#            one "done <name> (exit N)" line appended to "$LOG"
# Returns:   1 if $REPO cannot be entered; otherwise the status of the final
#            log write. claude's own status is logged, not returned, so a
#            failing batch does not change the worker's exit status.
#######################################
run_one() {
  local bf="$1"
  local name prompt rc
  name="$(basename "$bf")"
  # Substitute the first __BATCHFILE__ placeholder with this batch's path.
  prompt="${BATCH_PROMPT/__BATCHFILE__/$bf}"
  cd "$REPO" || return 1
  # 1200 s wall-clock cap per batch; stdin is closed so claude never waits
  # for interactive input.
  timeout 1200 claude -p "$prompt" \
    --allowedTools "Bash(python3:*),Read,Write,Bash(cat:*),Bash(ls:*)" \
    --max-turns "$MAX_TURNS" </dev/null >>"$LOG.$name.out" 2>&1
  # Save the status immediately: the $(date ...) substitution in the log line
  # below would otherwise overwrite $? before it is expanded.
  rc=$?
  echo "[$(date '+%H:%M:%S')] done $name (exit $rc)" >>"$LOG"
}
export -f run_one
|
|
|
|
# Collect this wave's batch files. nullglob is enabled only for the glob so
# that "no batch files" yields an empty array instead of the literal pattern,
# which would otherwise be launched as a bogus batch.
shopt -s nullglob
BATCHES=(data/enrichment_batches/batch_*.txt)
shopt -u nullglob

if (( ${#BATCHES[@]} == 0 )); then
  log "no batch files in data/enrichment_batches; nothing to launch."
else
  log "launching ${#BATCHES[@]} batches, $PAR concurrent..."
  # NUL-delimited hand-off so xargs never reinterprets quotes, backslashes or
  # blanks in a path. Each item runs the exported run_one in its own bash.
  printf '%s\0' "${BATCHES[@]}" \
    | xargs -0 -P "$PAR" -I{} bash -c 'run_one "$@"' _ {}
fi

# --- 3) summary ------------------------------------------------------------ #
# Scan the per-batch outputs for a usage/session limit message. If no .out
# files exist, grep's error is discarded and the branch is skipped.
if grep -qiE "session limit|usage limit" "$LOG".batch_*.out 2>/dev/null; then
  log "WINDOW EXHAUSTED (usage limit hit mid-wave) — unfinished keys retry next fire."
fi
# Keep only the --status lines that mention good / missing / done.
STATUS="$(python3 scripts/enrichment_wave.py --status 2>&1 | grep -E 'good|missing|done')"
printf '%s\n' "$STATUS" | tee -a "$LOG"
log "=== wave done ==="
|