# game-library/tests/test_validate_extractions.py

# -*- coding: utf-8 -*-
"""
Tests for scripts/validate_extractions.py.

Covers: schema rejection, the source_excerpt hallucination check, the content
of the generated re-extraction prompt, and the manifest `rejected` marking.
"""

import json
import sys
from pathlib import Path

# Locate the repository root (the parent of this file's directory) and its
# scripts/ folder, then make both importable for the module import below.
REPO_ROOT = Path(__file__).resolve().parent.parent
SCRIPTS_DIR = REPO_ROOT.joinpath("scripts")
for _p in map(str, (REPO_ROOT, SCRIPTS_DIR)):
    # Each new entry goes to the front, so the scripts directory ends up
    # ahead of the repository root when both are added.
    if _p in sys.path:
        continue
    sys.path.insert(0, _p)

import validate_extractions as ve  # noqa: E402


# --------------------------------------------------------------------------
# helpers
# --------------------------------------------------------------------------
def _ext_activity(**over):
    """Build one activity record for use in extraction fixtures.

    Starts from a fixed set of default field values and layers the keyword
    arguments on top, so a test only spells out the fields it cares about.
    Keywords that are not among the defaults are appended as extra keys.
    A fresh dict is returned on every call.
    """
    defaults = {
        "name": "Jocul testului",
        "description": "O activitate de echipa in aer liber.",
        "category": "team-building",
        "content_type": "joc",
        "language": "ro",
        "extraction_confidence": "high",
        "source_excerpt": "ancora din bucata sursa",
        "page_reference": "page 1",
    }
    return {**defaults, **over}


def _write_extraction(extracted_dir, chunk_key, activities, header_extra=None):
    """Write ``<chunk_key>.json`` into *extracted_dir*, creating the directory.

    The file holds a JSON object with a ``header`` and the given
    ``activities``. The header carries fixed fixture values plus
    *chunk_key*; entries in *header_extra*, when provided, are applied last
    and therefore override the fixed values.
    """
    extracted_dir.mkdir(parents=True, exist_ok=True)

    header = {
        "source_hash": "hash1234deadbeef",
        "schema_version": "1.0",
        "prompt_version": "1.0",
        "chunk_range": "pages 1-20",
        "source_id": "src01",
        "chunk_key": chunk_key,
    }
    header.update(header_extra or {})

    # ensure_ascii=False keeps non-ASCII characters as-is in the UTF-8 file.
    serialized = json.dumps(
        {"header": header, "activities": activities}, ensure_ascii=False
    )
    target = extracted_dir / f"{chunk_key}.json"
    target.write_text(serialized, encoding="utf-8")


def _write_chunk(chunks_dir, source_id, chunk_key, text):
    """Write *text* to ``<chunks_dir>/<source_id>/<chunk_key>.txt`` as UTF-8.

    The per-source directory is created first if it does not exist yet.
    """
    source_dir = chunks_dir.joinpath(source_id)
    source_dir.mkdir(parents=True, exist_ok=True)
    source_dir.joinpath(f"{chunk_key}.txt").write_text(text, encoding="utf-8")


# --------------------------------------------------------------------------
# tests
# --------------------------------------------------------------------------
def test_valid_file_passes(tmp_path):
    """An activity whose excerpt appears verbatim in its chunk counts as valid."""
    extracted_dir, chunks_dir = tmp_path / "extracted", tmp_path / "chunks"
    quote = "ancora din bucata sursa apare aici"

    # The chunk text embeds the very same string the activity cites.
    _write_chunk(chunks_dir, "src01", "src01.part01", f"--- PAGE 1 ---\n{quote}\n")
    activity = _ext_activity(source_excerpt=quote)
    _write_extraction(extracted_dir, "src01.part01", [activity])

    report = ve.run(extracted_dir, chunks_dir, tmp_path / "manifest.json")

    assert report["valid"] == 1
    assert report["rejected"] == 0


def test_schema_invalid_file_rejected(tmp_path):
    """A file with an empty header and a name-only activity is rejected.

    Also checks that a re-extraction prompt is written under ``_reextract``
    inside the extraction directory. The chunks directory is deliberately
    never created for this test.
    """
    extracted_dir = tmp_path / "extracted"
    extracted_dir.mkdir(parents=True)
    bad_payload = {"header": {}, "activities": [{"name": "x"}]}
    extracted_dir.joinpath("src01.part01.json").write_text(
        json.dumps(bad_payload), encoding="utf-8"
    )

    report = ve.run(extracted_dir, tmp_path / "chunks", tmp_path / "manifest.json")

    assert report["rejected"] == 1
    prompt_file = extracted_dir / "_reextract" / "src01.part01.prompt.md"
    assert prompt_file.exists()


def test_hallucinated_excerpt_rejected(tmp_path):
    """An excerpt absent from the chunk text is rejected as a hallucination."""
    extracted_dir = tmp_path / "extracted"
    chunks_dir = tmp_path / "chunks"

    fabricated = _ext_activity(
        source_excerpt="citat complet inventat care nu exista qqqq"
    )
    _write_extraction(extracted_dir, "src01.part01", [fabricated])
    # Chunk content shares nothing with the excerpt cited above.
    unrelated_text = "--- PAGE 1 ---\ntext complet diferit despre altceva.\n"
    _write_chunk(chunks_dir, "src01", "src01.part01", unrelated_text)

    report = ve.run(extracted_dir, chunks_dir, tmp_path / "manifest.json")

    assert report["rejected"] == 1
    first_rejection = report["rejected_chunks"][0]
    assert any("hallucination" in message for message in first_rejection["errors"])


def test_reextraction_prompt_content(tmp_path):
    """The prompt written for a rejected chunk contains the expected markers."""
    extracted_dir = tmp_path / "extracted"
    chunks_dir = tmp_path / "chunks"

    invented = _ext_activity(source_excerpt="citat inventat care nu exista zzzz")
    _write_extraction(extracted_dir, "src01.part01", [invented])
    _write_chunk(
        chunks_dir,
        "src01",
        "src01.part01",
        "--- PAGE 1 ---\ntext despre cu totul altceva aici.\n",
    )

    ve.run(extracted_dir, chunks_dir, tmp_path / "manifest.json")

    prompt_file = extracted_dir / "_reextract" / "src01.part01.prompt.md"
    prompt_text = prompt_file.read_text(encoding="utf-8")
    # Chunk key, rejection marker, the "verbatim" wording and the output path.
    expected_fragments = (
        "src01.part01",
        "REJECTED",
        "verbatim",
        "data/extracted/src01.part01.json",
    )
    for fragment in expected_fragments:
        assert fragment in prompt_text


def test_manifest_marks_chunk_rejected(tmp_path):
    """A chunk listed as ``done`` in the manifest becomes ``rejected`` after a failed run."""
    extracted_dir = tmp_path / "extracted"
    chunks_dir = tmp_path / "chunks"
    manifest_path = tmp_path / "manifest.json"

    # Seed the manifest with the chunk already in the "done" state.
    initial_manifest = {
        "chunks": {
            "src01.part01": {
                "state": "done",
                "chunk_file": "chunks/src01/src01.part01.txt",
            }
        }
    }
    manifest_path.write_text(json.dumps(initial_manifest), encoding="utf-8")

    bogus = _ext_activity(source_excerpt="citat fabricat absent vvvv")
    _write_extraction(extracted_dir, "src01.part01", [bogus])
    _write_chunk(
        chunks_dir, "src01", "src01.part01", "--- PAGE 1 ---\nun continut neinrudit.\n"
    )

    ve.run(extracted_dir, chunks_dir, manifest_path)

    updated = json.loads(manifest_path.read_text(encoding="utf-8"))
    assert updated["chunks"]["src01.part01"]["state"] == "rejected"


def test_build_reextraction_prompt_lists_errors():
    """The generated prompt mentions the chunk key and the reported error text."""
    chunk_key = "abc.part03"
    chunk_path = "data/chunks/abc/abc.part03.txt"
    errors = ["header: 'source_hash' is a required property"]

    prompt = ve.build_reextraction_prompt(chunk_key, chunk_path, errors)

    for expected in ("abc.part03", "source_hash"):
        assert expected in prompt