Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
141 lines
4.9 KiB
Python
141 lines
4.9 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
run_extraction.py — extraction orchestrator (plan §3).
|
|
|
|
The pipeline is script-only up to the LLM step: this script normalizes the
|
|
corpus, chunks the normalized sources, and emits one subagent prompt per
|
|
`pending` chunk. It does NOT run the extraction itself — that step is the
|
|
interactive Claude Code orchestrator launching waves of subagents.
|
|
|
|
Steps:
|
|
1. normalize data/carti-camp-jocuri/ -> data/sources/*.txt
|
|
2. chunk data/sources/*.txt -> data/chunks/<id>/*.txt + manifest.json
|
|
3. emit one prompt per `pending` chunk -> data/chunks/_prompts/*.md
|
|
4. report how many chunks remain `pending`
|
|
|
|
Usage:
|
|
python scripts/run_extraction.py
|
|
python scripts/run_extraction.py --skip-normalize # re-chunk only
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
# Directory that holds this script, and the repository root one level above.
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent

# Make both directories importable: each one is pushed to the front of
# sys.path unless it is already listed there.
for _p in map(str, (SCRIPT_DIR, REPO_ROOT)):
    if _p in sys.path:
        continue
    sys.path.insert(0, _p)
|
|
|
|
import chunk_sources # noqa: E402
|
|
import normalize_sources # noqa: E402
|
|
|
|
# Rules file that every emitted chunk prompt tells the subagent to follow.
SUBAGENT_PROMPT = SCRIPT_DIR.joinpath("SUBAGENT_PROMPT.md")
|
|
|
|
|
|
def emit_chunk_prompt(chunk_key: str, meta: dict, prompts_dir: Path) -> Path:
    """Write the subagent prompt for one pending chunk.

    Args:
        chunk_key: Manifest key of the chunk. It appears in the prompt
            heading, in the header fields, and in the output file name.
        meta: Manifest entry for the chunk. The keys read here are
            ``chunk_file``, ``expected_json``, ``chunk_range``,
            ``source_id`` and ``source_hash``; each falls back to a
            placeholder value when it is missing.
        prompts_dir: Directory that receives the prompt file. It is created
            (including parents) when it does not exist yet.

    Returns:
        Path of the written ``<chunk_key>.prompt.md`` file.
    """
    chunk_file = meta.get("chunk_file", f"data/chunks/<id>/{chunk_key}.txt")
    expected_json = meta.get("expected_json", f"{chunk_key}.json")
    # Render the rules path with forward slashes on every platform so it
    # matches the other repo-relative paths written into the prompt
    # (str(Path) would yield backslashes on Windows).
    rules_path = SUBAGENT_PROMPT.relative_to(REPO_ROOT).as_posix()
    text = "\n".join([
        f"# EXTRACTION — chunk `{chunk_key}`",
        "",
        f"Read ONLY this chunk: `{chunk_file}`",
        f"Chunk range: {meta.get('chunk_range', '?')}",
        "",
        f"Follow the rules in `{rules_path}`.",
        "Identify every distinct activity, fill the schema "
        "(`scripts/activity_schema.json`), and write the result to:",
        "",
        f" data/extracted/{expected_json}",
        "",
        "Header fields to set: "
        f'source_id="{meta.get("source_id", "")}", chunk_key="{chunk_key}", '
        f'source_hash="{meta.get("source_hash", "")}".',
        "",
    ])
    prompts_dir.mkdir(parents=True, exist_ok=True)
    out = prompts_dir / f"{chunk_key}.prompt.md"
    out.write_text(text, encoding="utf-8")
    return out
|
|
|
|
|
|
def run(
    *,
    corpus_root: Path,
    sources_dir: Path,
    chunks_dir: Path,
    skip_normalize: bool = False,
) -> dict:
    """Normalize, chunk, and emit one prompt per pending chunk.

    Args:
        corpus_root: Passed to ``normalize_sources.run`` as the input root.
        sources_dir: Output of normalization and input of chunking.
        chunks_dir: Chunking output directory; ``manifest.json`` is read
            from it and prompts are written to its ``_prompts`` child.
        skip_normalize: When true, the normalization step is not run and
            the summary carries no ``normalized`` entry.

    Returns:
        Summary dict with the keys ``normalized`` (only when normalization
        ran), ``chunks``, ``states`` (count per manifest state, ``"?"`` for
        entries without one), ``pending`` and ``prompts_dir``.
    """
    summary: dict = {}

    if not skip_normalize:
        norm = normalize_sources.run(corpus_root, sources_dir)
        summary["normalized"] = {
            field: norm[field] for field in ("ok", "total", "errors")
        }

    summary["chunks"] = chunk_sources.run(sources_dir, chunks_dir)

    manifest = chunk_sources.load_manifest(chunks_dir / "manifest.json")
    entries = manifest["chunks"]
    prompts_dir = chunks_dir / "_prompts"

    # Prompts are emitted in key order, for pending chunks only.
    pending_keys = sorted(
        key for key, entry in entries.items()
        if entry.get("state") == "pending"
    )
    for key in pending_keys:
        emit_chunk_prompt(key, entries[key], prompts_dir)

    # Tally every manifest entry by state.
    state_counts: dict[str, int] = {}
    for entry in entries.values():
        label = entry.get("state", "?")
        state_counts[label] = state_counts.get(label, 0) + 1

    summary["states"] = state_counts
    summary["pending"] = len(pending_keys)
    summary["prompts_dir"] = str(prompts_dir)
    return summary
|
|
|
|
|
|
def main(argv: Optional[list[str]] = None) -> int:
    """Command-line entry point: run the pipeline and print a report.

    Args:
        argv: Argument list handed to ``argparse``; ``None`` lets argparse
            read the process arguments.

    Returns:
        Always ``0``.
    """
    parser = argparse.ArgumentParser(description="Extraction orchestrator.")
    path_options = (
        ("--corpus", "data/carti-camp-jocuri"),
        ("--sources", "data/sources"),
        ("--chunks", "data/chunks"),
    )
    for flag, default in path_options:
        parser.add_argument(flag, default=default)
    parser.add_argument(
        "--skip-normalize",
        action="store_true",
        help="skip normalization, re-chunk existing sources only",
    )
    opts = parser.parse_args(argv)

    summary = run(
        corpus_root=Path(opts.corpus),
        sources_dir=Path(opts.sources),
        chunks_dir=Path(opts.chunks),
        skip_normalize=opts.skip_normalize,
    )

    # Collect the report first, then print it line by line.
    rule = "=" * 60
    report = [rule, "EXTRACTION ORCHESTRATOR", rule]
    if "normalized" in summary:
        n = summary["normalized"]
        report.append(
            f"normalized : {n['ok']}/{n['total']} (errors {n['errors']})"
        )
    report.append(f"chunks : {summary['chunks']['chunks']}")
    report.extend(
        f" {state:<10}: {count}"
        for state, count in sorted(summary["states"].items())
    )
    report.append(f"\npending chunks remaining : {summary['pending']}")
    if not summary["pending"]:
        report.append("All chunks extracted — run build_database.py --rebuild.")
    else:
        report.append(f"subagent prompts written : {summary['prompts_dir']}/")
        report.append(
            "Launch waves of ~5-10 subagents on those prompts, then run "
            "validate_extractions.py and build_database.py --rebuild."
        )
    report.append(rule)

    for line in report:
        print(line)
    return 0
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|