Files
game-library/scripts/validate_extractions.py
Claude Agent 66ae831c36 Rebuild extraction pipeline infrastructure (Faza 0 prep)
Implements the approved plan to replace the broken regex/index-master
extraction with an LLM-subagent pipeline. Four parallel lanes:

Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no
  max_pages truncation), normalize_sources.py, chunk_sources.py
  (~20pg chunks + overlap, manifest registry), activity_schema.json.
Lane B — app/config_taxonomy.py (16 fixed category slugs), schema
  rebuilt from scratch in app/models/ with content_type, language,
  source_files, source_excerpt, normalized_name, extraction_confidence,
  needs_review; FTS5 + 3 triggers extended with materials_list and
  skills_developed.
Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy
  source_excerpt validation, dedup with needs_review band),
  validate_extractions.py, review_queue.py, new run_extraction.py
  orchestrator, SUBAGENT_PROMPT.md.
Lane D — search.py content_type/language filters (default search
  excludes non-game content), E7 schema-compat audit; fixed a NULL
  keywords AttributeError in _boost_search_relevance.

Removes 8 orphaned/dead scripts and app/services/parser.py +
indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent).

Note: Lane D made one additive edit to app/models/database.py
(_update_category_counts) to surface content_type/language in
get_filter_options, outside its nominal lane boundary but after
Lane B completed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 17:43:38 +00:00

209 lines
7.3 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
validate_extractions.py — validate every data/extracted/*.json (plan §5b).
For each extraction file it runs two checks:
1. JSON-schema validation against scripts/activity_schema.json,
2. the source_excerpt anti-hallucination check (each excerpt must be a fuzzy
substring of the chunk it came from).
For every failing chunk it:
* writes the exact re-extraction prompt to data/extracted/_reextract/<chunk>.prompt.md,
* marks the chunk `rejected` in data/chunks/manifest.json.
The orchestrator then re-launches subagents only on the `rejected` chunks; the
loop repeats until nothing is rejected.
Usage:
python scripts/validate_extractions.py
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
from typing import Optional
# Absolute directory of this script and its parent (the repository root).
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent

# Put both directories at the front of sys.path (skipping any that are already
# present) so the helper module imported just below resolves when this file is
# run directly as a script.
for _p in map(str, (SCRIPT_DIR, REPO_ROOT)):
    if _p in sys.path:
        continue
    sys.path.insert(0, _p)
from import_common import ( # noqa: E402
DEFAULT_SCHEMA_PATH,
chunk_key_for,
excerpt_matches,
excerpt_score,
find_chunk_text,
iter_extraction_files,
load_schema,
validate_extraction,
)
# Path of the subagent extraction-rules document that sits next to this script;
# re-extraction prompts point the subagent at it.
SUBAGENT_PROMPT = SCRIPT_DIR.joinpath("SUBAGENT_PROMPT.md")
# --------------------------------------------------------------------------
# re-extraction prompt
# --------------------------------------------------------------------------
def build_reextraction_prompt(
    chunk_key: str, chunk_file: Optional[str], errors: list[str]
) -> str:
    """Render the Markdown prompt asking a subagent to redo a rejected chunk.

    Args:
        chunk_key: Identifier of the rejected chunk; used in the title and in
            the path of the extraction file to overwrite.
        chunk_file: Path of the chunk text to read. When empty or ``None`` a
            placeholder path built from ``chunk_key`` is shown instead.
        errors: Rejection reasons, rendered as one bullet each.

    Returns:
        The prompt text, newline-joined and ending with a trailing newline.
    """
    if chunk_file:
        target = chunk_file
    else:
        target = f"data/chunks/<source_id>/{chunk_key}.txt"

    preamble = (
        f"# RE-EXTRACTION — chunk `{chunk_key}`",
        "",
        "The previous extraction for this chunk was **REJECTED**. Reasons:",
        "",
    )
    reasons = tuple(f"- {reason}" for reason in errors)
    # The rules file is referenced relative to the repository root.
    rules_path = SUBAGENT_PROMPT.relative_to(REPO_ROOT)
    instructions = (
        "",
        "## What to do",
        "",
        f"1. Read ONLY this chunk: `{target}`",
        f"2. Follow the extraction rules in `{rules_path}`.",
        "3. Fix every problem listed above. In particular:",
        " - every `source_excerpt` must be copied **verbatim** from the chunk",
        " (it is checked as a fuzzy substring — invented quotes are rejected);",
        " - `source_excerpt` and `page_reference` are mandatory on every activity;",
        " - the output must validate against `scripts/activity_schema.json`.",
        f"4. Overwrite the extraction file `data/extracted/{chunk_key}.json`.",
        # The empty last element yields the trailing newline after joining.
        "",
    )
    return "\n".join((*preamble, *reasons, *instructions))
# --------------------------------------------------------------------------
# manifest
# --------------------------------------------------------------------------
def load_manifest(manifest_path: Path) -> dict:
    """Load the chunk manifest, falling back to an empty one when unusable.

    Args:
        manifest_path: Location of the manifest JSON file.

    Returns:
        The parsed manifest, guaranteed to be a dict whose ``"chunks"`` value
        is a dict. ``{"chunks": {}}`` is returned when the file is missing,
        unreadable, not valid UTF-8/JSON, or when its top-level value is not a
        JSON object. A missing file is silent; an unusable one is reported on
        stderr because the caller may later overwrite it.
    """
    if not manifest_path.is_file():
        return {"chunks": {}}

    try:
        data = json.loads(manifest_path.read_text(encoding="utf-8"))
    except (ValueError, OSError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(
            f"warning: ignoring unreadable manifest {manifest_path}: {exc}",
            file=sys.stderr,
        )
        return {"chunks": {}}

    if not isinstance(data, dict):
        # Valid JSON, but e.g. a list or a string: there is nothing to look
        # chunk entries up in, so treat it like a corrupt manifest.
        print(
            f"warning: ignoring manifest {manifest_path}: top-level JSON value "
            f"is {type(data).__name__}, expected an object",
            file=sys.stderr,
        )
        return {"chunks": {}}

    # Callers index manifest["chunks"] as a dict; normalise a missing or
    # non-dict (e.g. null) value instead of letting them crash later.
    if not isinstance(data.get("chunks"), dict):
        data["chunks"] = {}
    return data
def save_manifest(manifest: dict, manifest_path: Path) -> None:
    """Write the manifest as indented UTF-8 JSON, replacing the file atomically.

    The JSON is written to a sibling ``<name>.tmp`` file and then renamed over
    ``manifest_path``, so an interrupted write cannot leave a truncated
    manifest behind (a truncated manifest would otherwise be read back as
    empty on the next load).

    Args:
        manifest: The manifest dict to serialise.
        manifest_path: Destination file; parent directories are created.

    Raises:
        OSError: If the temporary file cannot be written or renamed.
    """
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    # Serialise first so a non-serialisable manifest never touches the disk.
    payload = json.dumps(manifest, indent=2, ensure_ascii=False)
    tmp_path = manifest_path.with_name(manifest_path.name + ".tmp")
    try:
        tmp_path.write_text(payload, encoding="utf-8")
        # Same-directory rename: replaces the destination in one step.
        tmp_path.replace(manifest_path)
    except OSError:
        # Do not leave a stray temp file next to the manifest on failure.
        if tmp_path.exists():
            tmp_path.unlink()
        raise
def mark_rejected(manifest: dict, chunk_key: str) -> None:
    """Set the chunk's manifest ``state`` to ``"rejected"``.

    An entry is created for ``chunk_key`` if the manifest has none yet; any
    other fields already stored on an existing entry are left as they are.
    The manifest is modified in place.
    """
    registry = manifest["chunks"]
    # setdefault returns the stored entry, inserting an empty one when absent.
    registry.setdefault(chunk_key, {})["state"] = "rejected"
# --------------------------------------------------------------------------
# validation
# --------------------------------------------------------------------------
def validate_file(json_path: Path, schema: dict, chunks_dir: Path) -> list[str]:
    """Return the list of errors for one extraction file (empty == valid).

    Checks run in order and stop at the first stage that fails:

    1. the file must decode as UTF-8 and parse as JSON;
    2. ``validate_extraction`` must report no errors against ``schema``;
    3. the source chunk text must be found via ``find_chunk_text``;
    4. every activity's ``source_excerpt`` must satisfy ``excerpt_matches``
       against that chunk text (all failing activities are reported).

    Args:
        json_path: The extraction JSON file to check.
        schema: The loaded activity schema.
        chunks_dir: Root directory searched for the source chunk text.

    Returns:
        Human-readable error strings; an empty list means the file is valid.
    """
    try:
        data = json.loads(json_path.read_text(encoding="utf-8"))
    except UnicodeDecodeError as exc:
        # Bytes that are not UTF-8 are a defect of this one file; report it
        # like any other invalid extraction instead of aborting the caller.
        return [f"invalid UTF-8: {exc}"]
    except json.JSONDecodeError as exc:
        return [f"invalid JSON: {exc}"]

    errors = validate_extraction(data, schema)
    if errors:
        return errors

    header = data.get("header", {})
    chunk_text = find_chunk_text(json_path, header, chunks_dir)
    if chunk_text is None:
        return [f"source chunk not found for {chunk_key_for(json_path, header)}"]

    for adict in data.get("activities", []):
        # A missing or null excerpt is checked as the empty string.
        excerpt = adict.get("source_excerpt") or ""
        if not excerpt_matches(excerpt, chunk_text):
            # The score is only computed on a miss, for the error message.
            score = excerpt_score(excerpt, chunk_text)
            errors.append(
                f"activity {adict.get('name')!r}: source_excerpt not found in "
                f"chunk (best match {score:.0f}/100) — possible hallucination"
            )
    return errors
def run(
    extracted_dir: Path,
    chunks_dir: Path,
    manifest_path: Path,
    schema_path: Path = DEFAULT_SCHEMA_PATH,
) -> dict:
    """Validate every extraction file and record the rejected chunks.

    For each file yielded by ``iter_extraction_files(extracted_dir)`` the file
    is validated with ``validate_file``. For every file with errors this
    writes ``<extracted_dir>/_reextract/<chunk_key>.prompt.md`` and marks the
    chunk ``rejected`` in the manifest. The manifest is saved once at the end.

    Args:
        extracted_dir: Directory holding the extraction JSON files.
        chunks_dir: Root directory of the source chunk texts.
        manifest_path: Chunk manifest to update.
        schema_path: JSON schema used for validation.

    Returns:
        A report dict with the counters ``total``, ``valid`` and ``rejected``
        and ``rejected_chunks``, a list of ``{"chunk": key, "errors": [...]}``.
    """
    schema = load_schema(schema_path)
    manifest = load_manifest(manifest_path)
    reextract_dir = extracted_dir / "_reextract"
    report = {"total": 0, "valid": 0, "rejected": 0, "rejected_chunks": []}

    for json_path in iter_extraction_files(extracted_dir):
        report["total"] += 1
        errors = validate_file(json_path, schema, chunks_dir)
        if not errors:
            # NOTE(review): a passing chunk is not touched in the manifest, so
            # an earlier `rejected` state is presumably reset by another
            # pipeline step — confirm.
            report["valid"] += 1
            continue
        report["rejected"] += 1

        # Re-read the file only to recover its header for the chunk key. The
        # file was just rejected, so it may be undecodable, unparsable, or
        # have a non-object top level or header; all of those fall back to an
        # empty header rather than crashing the whole run.
        header = {}
        try:
            data = json.loads(json_path.read_text(encoding="utf-8"))
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            data = None
        if isinstance(data, dict) and isinstance(data.get("header"), dict):
            header = data["header"]
        chunk_key = chunk_key_for(json_path, header)

        # Use the chunk path recorded in the manifest when there is one.
        chunk_file = None
        meta = manifest["chunks"].get(chunk_key)
        if meta:
            chunk_file = meta.get("chunk_file")

        # Created lazily so the directory only appears when something failed.
        reextract_dir.mkdir(parents=True, exist_ok=True)
        prompt = build_reextraction_prompt(chunk_key, chunk_file, errors)
        (reextract_dir / f"{chunk_key}.prompt.md").write_text(prompt, encoding="utf-8")
        mark_rejected(manifest, chunk_key)
        report["rejected_chunks"].append({"chunk": chunk_key, "errors": errors})

    save_manifest(manifest, manifest_path)
    return report
# --------------------------------------------------------------------------
# CLI
# --------------------------------------------------------------------------
def main(argv: Optional[list[str]] = None) -> int:
    """Command-line entry point: validate extractions and print a summary.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read the
            process arguments.

    Returns:
        Always ``0``, whether or not any chunk was rejected.
    """
    cli = argparse.ArgumentParser(description="Validate extraction JSON files.")
    # Each option is a plain string path with a repository-relative default.
    for flag, fallback in (
        ("--extracted", "data/extracted"),
        ("--chunks", "data/chunks"),
        ("--manifest", "data/chunks/manifest.json"),
        ("--schema", str(DEFAULT_SCHEMA_PATH)),
    ):
        cli.add_argument(flag, default=fallback)
    opts = cli.parse_args(argv)

    summary = run(
        Path(opts.extracted),
        Path(opts.chunks),
        Path(opts.manifest),
        Path(opts.schema),
    )

    total = summary["total"]
    print(f"extraction files : {total}")
    valid = summary["valid"]
    print(f" valid : {valid}")
    rejected = summary["rejected"]
    print(f" rejected : {rejected}")

    # One heading per rejected chunk, followed by its error bullets.
    for entry in summary["rejected_chunks"]:
        chunk = entry["chunk"]
        print(f" [rejected] {chunk}")
        for problem in entry["errors"]:
            print(f" - {problem}")

    if rejected:
        print(f"\nRe-extraction prompts written to {opts.extracted}/_reextract/")
    return 0
# When executed as a script, exit the process with main()'s return code.
if __name__ == "__main__":
    sys.exit(main())