Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
209 lines
7.3 KiB
Python
209 lines
7.3 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
validate_extractions.py — validate every data/extracted/*.json (plan §5b).
|
|
|
|
For each extraction file it runs two checks:
|
|
1. JSON-schema validation against scripts/activity_schema.json,
|
|
2. the source_excerpt anti-hallucination check (each excerpt must be a fuzzy
|
|
substring of the chunk it came from).
|
|
|
|
For every failing chunk it:
|
|
* writes the exact re-extraction prompt to data/extracted/_reextract/<chunk>.prompt.md,
|
|
* marks the chunk `rejected` in data/chunks/manifest.json.
|
|
|
|
The orchestrator then re-launches subagents only on the `rejected` chunks; the
|
|
loop repeats until nothing is rejected.
|
|
|
|
Usage:
|
|
python scripts/validate_extractions.py
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
# Absolute directory of this script and its parent (the repository root).
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent
# Put both directories at the front of the import search path unless they are
# already listed. Each one is prepended in turn, so the repository root (added
# last) ends up ahead of the script directory when both are new.
for _p in map(str, (SCRIPT_DIR, REPO_ROOT)):
    if _p in sys.path:
        continue
    sys.path.insert(0, _p)
|
|
|
|
from import_common import ( # noqa: E402
|
|
DEFAULT_SCHEMA_PATH,
|
|
chunk_key_for,
|
|
excerpt_matches,
|
|
excerpt_score,
|
|
find_chunk_text,
|
|
iter_extraction_files,
|
|
load_schema,
|
|
validate_extraction,
|
|
)
|
|
|
|
# Location of the subagent extraction-rules document, next to this script.
SUBAGENT_PROMPT = SCRIPT_DIR.joinpath("SUBAGENT_PROMPT.md")
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# re-extraction prompt
|
|
# --------------------------------------------------------------------------
|
|
def build_reextraction_prompt(
    chunk_key: str, chunk_file: Optional[str], errors: list[str]
) -> str:
    """Render the Markdown prompt that asks a subagent to redo a rejected chunk.

    Args:
        chunk_key: Identifier of the rejected chunk. It appears in the title
            and in the name of the extraction file the subagent must overwrite.
        chunk_file: Path of the chunk text to read. When falsy, a placeholder
            path built from ``chunk_key`` is shown instead.
        errors: Rejection reasons; each one is rendered as a bullet.

    Returns:
        The prompt text, one entry per line, ending with a newline.
    """
    if chunk_file:
        chunk_ref = chunk_file
    else:
        chunk_ref = f"data/chunks/<source_id>/{chunk_key}.txt"
    reasons = [f"- {e}" for e in errors]

    return "\n".join(
        [
            f"# RE-EXTRACTION — chunk `{chunk_key}`",
            "",
            "The previous extraction for this chunk was **REJECTED**. Reasons:",
            "",
            *reasons,
            "",
            "## What to do",
            "",
            f"1. Read ONLY this chunk: `{chunk_ref}`",
            f"2. Follow the extraction rules in `{SUBAGENT_PROMPT.relative_to(REPO_ROOT)}`.",
            "3. Fix every problem listed above. In particular:",
            " - every `source_excerpt` must be copied **verbatim** from the chunk",
            " (it is checked as a fuzzy substring — invented quotes are rejected);",
            " - `source_excerpt` and `page_reference` are mandatory on every activity;",
            " - the output must validate against `scripts/activity_schema.json`.",
            f"4. Overwrite the extraction file `data/extracted/{chunk_key}.json`.",
            # Trailing empty entry so the joined text ends with a newline.
            "",
        ]
    )
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# manifest
|
|
# --------------------------------------------------------------------------
|
|
def load_manifest(manifest_path: Path) -> dict:
    """Load the chunk manifest, falling back to an empty one.

    Args:
        manifest_path: Location of the manifest JSON file.

    Returns:
        The parsed manifest, guaranteed to be a dict holding a dict under
        ``"chunks"``. ``{"chunks": {}}`` is returned when the file is missing,
        cannot be read, is not valid UTF-8 JSON, or does not contain a JSON
        object at the top level. A missing or non-object ``"chunks"`` value is
        replaced by an empty dict; all other keys are kept.
    """
    if not manifest_path.is_file():
        return {"chunks": {}}
    try:
        data = json.loads(manifest_path.read_text(encoding="utf-8"))
    except (ValueError, OSError):
        # ValueError covers json.JSONDecodeError (malformed JSON) as well as
        # UnicodeDecodeError (file is not valid UTF-8).
        return {"chunks": {}}
    if not isinstance(data, dict):
        # Valid JSON, but e.g. a list or a string: not usable as a manifest.
        return {"chunks": {}}
    if not isinstance(data.get("chunks"), dict):
        # Callers index manifest["chunks"] as a mapping, so normalise it.
        data["chunks"] = {}
    return data
|
|
|
|
|
|
def save_manifest(manifest: dict, manifest_path: Path) -> None:
    """Write the manifest as indented UTF-8 JSON, swapping it in atomically.

    The JSON is first written to a ``<name>.tmp`` sibling and then renamed
    over ``manifest_path``, so an interrupted write cannot leave a truncated
    manifest behind.

    Args:
        manifest: The manifest to serialise.
        manifest_path: Destination file; missing parent directories are
            created.

    Raises:
        OSError: If the temporary file cannot be written or renamed. The
            temporary file is removed before the error propagates.
    """
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(manifest, indent=2, ensure_ascii=False)
    # Same directory as the target so the rename stays on one filesystem.
    tmp_path = manifest_path.with_name(manifest_path.name + ".tmp")
    try:
        tmp_path.write_text(payload, encoding="utf-8")
        tmp_path.replace(manifest_path)
    except OSError:
        # Do not leave a half-written temporary file next to the manifest.
        if tmp_path.exists():
            tmp_path.unlink()
        raise
|
|
|
|
|
|
def mark_rejected(manifest: dict, chunk_key: str) -> None:
    """Set the state of ``chunk_key`` to ``rejected`` in the manifest.

    A new entry is created when the chunk has none yet; any other fields of an
    existing entry are left as they are. The manifest is modified in place.
    """
    manifest["chunks"].setdefault(chunk_key, {})["state"] = "rejected"
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# validation
|
|
# --------------------------------------------------------------------------
|
|
def validate_file(json_path: Path, schema: dict, chunks_dir: Path) -> list[str]:
    """Return the list of errors for one extraction file (empty == valid).

    The checks run in stages and stop at the first stage that fails:

    1. the file must decode as UTF-8 and parse as JSON;
    2. the parsed data must pass ``validate_extraction`` against ``schema``;
    3. the chunk text must be found by ``find_chunk_text``;
    4. every activity's ``source_excerpt`` must pass ``excerpt_matches``
       against that chunk text.

    Args:
        json_path: Extraction file to check.
        schema: Schema handed to ``validate_extraction``.
        chunks_dir: Directory handed to ``find_chunk_text``.

    Returns:
        Human-readable error strings; an empty list means the file is valid.
    """
    try:
        data = json.loads(json_path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        # A file that is not valid UTF-8 is reported like any other malformed
        # extraction instead of aborting the whole validation run.
        return [f"invalid JSON: {exc}"]

    schema_errors = validate_extraction(data, schema)
    if schema_errors:
        return schema_errors

    header = data.get("header", {})
    chunk_text = find_chunk_text(json_path, header, chunks_dir)
    if chunk_text is None:
        return [f"source chunk not found for {chunk_key_for(json_path, header)}"]

    # Collect excerpt failures in a fresh list rather than appending to
    # whatever object validate_extraction returned.
    excerpt_errors: list[str] = []
    for adict in data.get("activities", []):
        excerpt = adict.get("source_excerpt") or ""
        if excerpt_matches(excerpt, chunk_text):
            continue
        # The score is only computed for the message of a failed excerpt.
        score = excerpt_score(excerpt, chunk_text)
        excerpt_errors.append(
            f"activity {adict.get('name')!r}: source_excerpt not found in "
            f"chunk (best match {score:.0f}/100) — possible hallucination"
        )
    return excerpt_errors
|
|
|
|
|
|
def _read_header(json_path: Path) -> dict:
    """Best-effort read of an extraction file's ``header`` object.

    Returns an empty dict when the file is not valid UTF-8 JSON, when its top
    level is not an object, or when ``header`` is missing or not an object.
    """
    try:
        data = json.loads(json_path.read_text(encoding="utf-8"))
    except ValueError:
        # Covers json.JSONDecodeError and UnicodeDecodeError.
        return {}
    header = data.get("header") if isinstance(data, dict) else None
    return header if isinstance(header, dict) else {}


def run(
    extracted_dir: Path,
    chunks_dir: Path,
    manifest_path: Path,
    schema_path: Path = DEFAULT_SCHEMA_PATH,
) -> dict:
    """Validate every extraction file and record the rejected chunks.

    For each rejected file a re-extraction prompt is written to
    ``<extracted_dir>/_reextract/<chunk_key>.prompt.md`` and the chunk is
    marked ``rejected`` in the manifest. The manifest is saved once, after all
    files have been processed.

    Args:
        extracted_dir: Directory passed to ``iter_extraction_files``.
        chunks_dir: Directory passed on to ``validate_file``.
        manifest_path: Manifest file to load and save.
        schema_path: Schema file passed to ``load_schema``.

    Returns:
        A report dict with the counters ``total``, ``valid`` and ``rejected``
        and a ``rejected_chunks`` list of ``{"chunk": ..., "errors": ...}``
        entries.
    """
    schema = load_schema(schema_path)
    manifest = load_manifest(manifest_path)
    reextract_dir = extracted_dir / "_reextract"

    report = {"total": 0, "valid": 0, "rejected": 0, "rejected_chunks": []}
    for json_path in iter_extraction_files(extracted_dir):
        report["total"] += 1
        errors = validate_file(json_path, schema, chunks_dir)
        if not errors:
            report["valid"] += 1
            continue

        report["rejected"] += 1
        # The rejected file may be malformed in any way (bad JSON, bad
        # encoding, a non-object top level), so the header is read
        # defensively: one broken file must not abort the run before the
        # manifest is saved.
        chunk_key = chunk_key_for(json_path, _read_header(json_path))
        meta = manifest["chunks"].get(chunk_key)
        chunk_file = meta.get("chunk_file") if meta else None

        reextract_dir.mkdir(parents=True, exist_ok=True)
        prompt = build_reextraction_prompt(chunk_key, chunk_file, errors)
        (reextract_dir / f"{chunk_key}.prompt.md").write_text(prompt, encoding="utf-8")

        mark_rejected(manifest, chunk_key)
        report["rejected_chunks"].append({"chunk": chunk_key, "errors": errors})

    save_manifest(manifest, manifest_path)
    return report
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# CLI
|
|
# --------------------------------------------------------------------------
|
|
def main(argv: Optional[list[str]] = None) -> int:
    """Command-line entry point: validate the extractions and print a summary.

    Options (all optional): ``--extracted``, ``--chunks``, ``--manifest`` and
    ``--schema``; each takes a path and defaults to the value listed below.

    Args:
        argv: Argument list to parse; ``None`` lets ``argparse`` use the
            process arguments.

    Returns:
        Always ``0``, whether or not any chunk was rejected.
    """
    parser = argparse.ArgumentParser(description="Validate extraction JSON files.")
    option_defaults = (
        ("--extracted", "data/extracted"),
        ("--chunks", "data/chunks"),
        ("--manifest", "data/chunks/manifest.json"),
        ("--schema", str(DEFAULT_SCHEMA_PATH)),
    )
    for flag, default in option_defaults:
        parser.add_argument(flag, default=default)
    args = parser.parse_args(argv)

    extracted_dir = Path(args.extracted)
    chunks_dir = Path(args.chunks)
    manifest_path = Path(args.manifest)
    schema_path = Path(args.schema)
    report = run(extracted_dir, chunks_dir, manifest_path, schema_path)

    # Summary counters first, then one section per rejected chunk.
    print(f"extraction files : {report['total']}")
    print(f" valid : {report['valid']}")
    print(f" rejected : {report['rejected']}")
    for item in report["rejected_chunks"]:
        print(f" [rejected] {item['chunk']}")
        for err in item["errors"]:
            print(f" - {err}")

    if not report["rejected"]:
        return 0
    print(f"\nRe-extraction prompts written to {args.extracted}/_reextract/")
    return 0
|
|
|
|
|
|
# Script entry point: exit the process with the status returned by main().
if __name__ == "__main__":
    sys.exit(main())
|