Rebuild extraction pipeline infrastructure (Faza 0 prep)
Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
156
tests/test_validate_extractions.py
Normal file
156
tests/test_validate_extractions.py
Normal file
@@ -0,0 +1,156 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Tests for scripts/validate_extractions.py.
|
||||
|
||||
Covers: schema rejection, the source_excerpt hallucination check, the content
|
||||
of the generated re-extraction prompt, and the manifest `rejected` marking.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Make both the repository root and its scripts/ directory importable so the
# script under test can be imported as a plain top-level module.
REPO_ROOT = Path(__file__).resolve().parents[1]
SCRIPTS_DIR = REPO_ROOT.joinpath("scripts")
for _p in (str(REPO_ROOT), str(SCRIPTS_DIR)):
    if _p in sys.path:
        continue
    # Each new entry is pushed to the front, so scripts/ ends up ahead of the
    # repo root when neither was on the path before.
    sys.path.insert(0, _p)
|
||||
|
||||
import validate_extractions as ve # noqa: E402
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# helpers
|
||||
# --------------------------------------------------------------------------
|
||||
def _ext_activity(**over):
    """Return an extracted-activity dict for use as test input.

    A fixed set of default field values is used as the starting point and any
    keyword arguments are layered on top, replacing defaults of the same name
    or adding new keys.
    """
    defaults = {
        "name": "Jocul testului",
        "description": "O activitate de echipa in aer liber.",
        "category": "team-building",
        "content_type": "joc",
        "language": "ro",
        "extraction_confidence": "high",
        "source_excerpt": "ancora din bucata sursa",
        "page_reference": "page 1",
    }
    return {**defaults, **over}
|
||||
|
||||
|
||||
def _write_extraction(extracted_dir, chunk_key, activities, header_extra=None):
    """Write an extraction JSON file named ``<chunk_key>.json``.

    The file is created inside *extracted_dir* (which is created on demand)
    and holds a ``header`` object plus the given *activities* list. Entries in
    *header_extra*, when provided, override or extend the default header.
    """
    extracted_dir.mkdir(parents=True, exist_ok=True)

    header = {
        "source_hash": "hash1234deadbeef",
        "schema_version": "1.0",
        "prompt_version": "1.0",
        "chunk_range": "pages 1-20",
        "source_id": "src01",
        "chunk_key": chunk_key,
    }
    # A falsy header_extra (None, empty mapping) leaves the defaults untouched.
    header.update(header_extra or {})

    target = extracted_dir / f"{chunk_key}.json"
    serialized = json.dumps(
        {"header": header, "activities": activities}, ensure_ascii=False
    )
    target.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def _write_chunk(chunks_dir, source_id, chunk_key, text):
    """Write *text* to ``<chunks_dir>/<source_id>/<chunk_key>.txt`` as UTF-8.

    The per-source directory is created if it does not exist yet.
    """
    source_dir = chunks_dir / source_id
    source_dir.mkdir(parents=True, exist_ok=True)
    source_dir.joinpath(f"{chunk_key}.txt").write_text(text, encoding="utf-8")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# tests
|
||||
# --------------------------------------------------------------------------
|
||||
def test_valid_file_passes(tmp_path):
    """An excerpt that appears in the chunk text yields one valid, zero rejected."""
    extracted_dir = tmp_path / "extracted"
    chunks_dir = tmp_path / "chunks"
    quote = "ancora din bucata sursa apare aici"

    activity = _ext_activity(source_excerpt=quote)
    _write_extraction(extracted_dir, "src01.part01", [activity])
    # The chunk text contains the excerpt verbatim.
    _write_chunk(chunks_dir, "src01", "src01.part01", f"--- PAGE 1 ---\n{quote}\n")

    result = ve.run(extracted_dir, chunks_dir, tmp_path / "manifest.json")

    assert result["valid"] == 1
    assert result["rejected"] == 0
|
||||
|
||||
|
||||
def test_schema_invalid_file_rejected(tmp_path):
    """A file with an empty header and a name-only activity is rejected.

    Also checks that a re-extraction prompt file is written for the chunk
    under ``_reextract`` inside the extracted directory.
    """
    extracted_dir = tmp_path / "extracted"
    chunks_dir = tmp_path / "chunks"  # intentionally never created on disk
    extracted_dir.mkdir(parents=True)

    incomplete = {"header": {}, "activities": [{"name": "x"}]}
    extraction_file = extracted_dir / "src01.part01.json"
    extraction_file.write_text(json.dumps(incomplete), encoding="utf-8")

    result = ve.run(extracted_dir, chunks_dir, tmp_path / "manifest.json")

    assert result["rejected"] == 1
    prompt_file = extracted_dir / "_reextract" / "src01.part01.prompt.md"
    assert prompt_file.exists()
|
||||
|
||||
|
||||
def test_hallucinated_excerpt_rejected(tmp_path):
    """An excerpt absent from the chunk text is rejected with a hallucination error."""
    extracted_dir = tmp_path / "extracted"
    chunks_dir = tmp_path / "chunks"

    invented = _ext_activity(
        source_excerpt="citat complet inventat care nu exista qqqq"
    )
    _write_extraction(extracted_dir, "src01.part01", [invented])
    # Chunk text shares nothing with the excerpt above.
    _write_chunk(
        chunks_dir,
        "src01",
        "src01.part01",
        "--- PAGE 1 ---\ntext complet diferit despre altceva.\n",
    )

    result = ve.run(extracted_dir, chunks_dir, tmp_path / "manifest.json")

    assert result["rejected"] == 1
    messages = result["rejected_chunks"][0]["errors"]
    assert any("hallucination" in message for message in messages)
|
||||
|
||||
|
||||
def test_reextraction_prompt_content(tmp_path):
    """The prompt written for a rejected chunk contains the expected fragments."""
    extracted_dir = tmp_path / "extracted"
    chunks_dir = tmp_path / "chunks"

    invented = _ext_activity(source_excerpt="citat inventat care nu exista zzzz")
    _write_extraction(extracted_dir, "src01.part01", [invented])
    _write_chunk(
        chunks_dir,
        "src01",
        "src01.part01",
        "--- PAGE 1 ---\ntext despre cu totul altceva aici.\n",
    )

    ve.run(extracted_dir, chunks_dir, tmp_path / "manifest.json")

    prompt_file = extracted_dir / "_reextract" / "src01.part01.prompt.md"
    prompt_text = prompt_file.read_text(encoding="utf-8")

    # Checked in order: chunk key, rejection marker, verbatim wording, and the
    # output path of the extraction file.
    expected_fragments = (
        "src01.part01",
        "REJECTED",
        "verbatim",
        "data/extracted/src01.part01.json",
    )
    for fragment in expected_fragments:
        assert fragment in prompt_text
|
||||
|
||||
|
||||
def test_manifest_marks_chunk_rejected(tmp_path):
    """A manifest entry in state ``done`` is rewritten to ``rejected``.

    The manifest is seeded on disk before the run and re-read afterwards.
    """
    extracted_dir = tmp_path / "extracted"
    chunks_dir = tmp_path / "chunks"
    manifest_file = tmp_path / "manifest.json"

    seed = {
        "chunks": {
            "src01.part01": {
                "state": "done",
                "chunk_file": "chunks/src01/src01.part01.txt",
            }
        }
    }
    manifest_file.write_text(json.dumps(seed), encoding="utf-8")

    fabricated = _ext_activity(source_excerpt="citat fabricat absent vvvv")
    _write_extraction(extracted_dir, "src01.part01", [fabricated])
    _write_chunk(
        chunks_dir,
        "src01",
        "src01.part01",
        "--- PAGE 1 ---\nun continut neinrudit.\n",
    )

    ve.run(extracted_dir, chunks_dir, manifest_file)

    reloaded = json.loads(manifest_file.read_text(encoding="utf-8"))
    assert reloaded["chunks"]["src01.part01"]["state"] == "rejected"
|
||||
|
||||
|
||||
def test_build_reextraction_prompt_lists_errors():
    """The built prompt mentions both the chunk key and the reported error text."""
    chunk_key = "abc.part03"
    chunk_path = "data/chunks/abc/abc.part03.txt"
    problems = ["header: 'source_hash' is a required property"]

    prompt_text = ve.build_reextraction_prompt(chunk_key, chunk_path, problems)

    assert "abc.part03" in prompt_text
    assert "source_hash" in prompt_text
|
||||
Reference in New Issue
Block a user