Files
game-library/tests/conftest.py
Claude Agent 66ae831c36 Rebuild extraction pipeline infrastructure (Faza 0 prep)
Implements the approved plan to replace the broken regex/index-master
extraction with an LLM-subagent pipeline. Four parallel lanes:

Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no
  max_pages truncation), normalize_sources.py, chunk_sources.py
  (~20pg chunks + overlap, manifest registry), activity_schema.json.
Lane B — app/config_taxonomy.py (16 fixed category slugs), schema
  rebuilt from scratch in app/models/ with content_type, language,
  source_files, source_excerpt, normalized_name, extraction_confidence,
  needs_review; FTS5 + 3 triggers extended with materials_list and
  skills_developed.
Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy
  source_excerpt validation, dedup with needs_review band),
  validate_extractions.py, review_queue.py, new run_extraction.py
  orchestrator, SUBAGENT_PROMPT.md.
Lane D — search.py content_type/language filters (default search
  excludes non-game content), E7 schema-compat audit; fixed a NULL
  keywords AttributeError in _boost_search_relevance.

Removes 8 orphaned/dead scripts and app/services/parser.py +
indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent).

Note: Lane D made one additive edit to app/models/database.py
(_update_category_counts) to surface content_type/language in
get_filter_options, outside its nominal lane boundary but after
Lane B completed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 17:43:38 +00:00

115 lines
4.0 KiB
Python

# -*- coding: utf-8 -*-
"""
Shared pytest fixtures for the extraction-pipeline tests.
scripts/ is not a package, so it is added to sys.path here. Synthetic fixtures
(PDF, docx, zip, HTML) are generated at runtime — no binary blobs in the repo.
"""
import sys
import zipfile
from pathlib import Path
import pytest
# Repository root is two levels up from this file: <repo>/tests/conftest.py.
REPO_ROOT = Path(__file__).resolve().parents[1]
SCRIPTS_DIR = REPO_ROOT.joinpath("scripts")

# scripts/ is not a package, so make its modules importable by path.
# The membership check keeps the insert idempotent across re-imports.
if not any(entry == str(SCRIPTS_DIR) for entry in sys.path):
    sys.path.insert(0, str(SCRIPTS_DIR))
# --------------------------------------------------------------------------
# synthetic PDF — deliberately large to pin the "no max_pages" regression
# --------------------------------------------------------------------------
@pytest.fixture
def big_pdf(tmp_path):
    """Build a 60-page PDF under ``tmp_path`` and return its path.

    Each page carries a unique ``PDFMARK-<n>`` token (n runs from 1 to 60)
    plus one fixed sentence, so a test can tell which pages were read.
    """
    # Function-scope imports: reportlab is loaded only when this fixture runs.
    from reportlab.lib.pagesizes import letter
    from reportlab.pdfgen import canvas

    page_count = 60
    pdf_path = tmp_path / "big.pdf"
    pdf = canvas.Canvas(str(pdf_path), pagesize=letter)
    for page_no in range(1, page_count + 1):
        marker_line = f"PDFMARK-{page_no} synthetic activity page number {page_no}"
        pdf.drawString(72, 720, marker_line)
        pdf.drawString(72, 700, "Acest joc educativ se joaca in echipa.")
        pdf.showPage()  # close the current page before drawing the next
    pdf.save()
    return pdf_path
# --------------------------------------------------------------------------
# synthetic docx — 100 paragraphs => 3 synthetic pages at 40 paras/page
# --------------------------------------------------------------------------
@pytest.fixture
def sample_docx(tmp_path):
    """Write a 100-paragraph .docx file under ``tmp_path`` and return its path."""
    # Function-scope import: python-docx is loaded only when this fixture runs.
    import docx

    docx_path = tmp_path / "sample.docx"
    doc = docx.Document()
    for index in range(100):
        doc.add_paragraph(f"Paragraf {index}: continut joc team-building.")
    doc.save(str(docx_path))
    return docx_path
# --------------------------------------------------------------------------
# synthetic HTML mirror page — with nav/script/footer chrome to strip
# --------------------------------------------------------------------------
# Page body used by the html_with_nav fixture: a <main> section with the real
# content, surrounded by <style>, <script>, <nav>, <header> and <footer>
# elements.
HTML_WITH_NAV = """<!doctype html>
<html><head><title>Joc</title>
<style>.x{color:red}</style>
<script>var tracking = 1;</script>
</head><body>
<nav><a href="/">Home</a><a href="/games">Games</a></nav>
<header>Site Banner Junk</header>
<main>
<h1>Vanatoarea de comori</h1>
<p>Acesta este un joc real de orientare pentru cercetasi.</p>
<p>Jucatorii cauta indicii ascunse in tabara.</p>
</main>
<footer>Copyright 2024 - toate drepturile rezervate</footer>
</body></html>
"""


@pytest.fixture
def html_with_nav(tmp_path):
    """Write ``HTML_WITH_NAV`` to ``page.html`` under ``tmp_path``; return the path."""
    html_path = tmp_path / "page.html"
    html_path.write_text(HTML_WITH_NAV, encoding="utf-8")
    return html_path
# --------------------------------------------------------------------------
# synthetic zip — contains a docx and a stray junk file
# --------------------------------------------------------------------------
@pytest.fixture
def sample_zip(tmp_path, sample_docx):
    """Create ``archive.zip`` under ``tmp_path`` and return its path.

    The archive holds the ``sample_docx`` file stored as ``inner/sample.docx``
    and a ``desktop.ini`` entry whose content is the text ``junk``.
    """
    archive_path = tmp_path / "archive.zip"
    with zipfile.ZipFile(archive_path, "w") as archive:
        archive.write(sample_docx, arcname="inner/sample.docx")
        archive.writestr("desktop.ini", "junk")
    return archive_path
# --------------------------------------------------------------------------
# synthetic normalized source — paginated, with an activity straddling a
# page boundary so the chunker overlap can be verified.
# --------------------------------------------------------------------------
@pytest.fixture
def paginated_source(tmp_path):
    """Write a 50-page normalized source file and return its path.

    The file starts with a five-line header, then has one
    ``--- PAGE <n> ---`` section per page. Page 20 holds the ACTIVITY-START
    line and page 21 the ACTIVITY-END line, so the activity crosses the
    20/21 page boundary.
    """
    header = [
        "SOURCE: synthetic/test.pdf",
        "CONVERTED: 2026-05-19",
        "FORMAT: pdf",
        "=" * 50,
        "",
    ]
    # Pages with special content; every other page gets a generic filler line.
    boundary_text = {
        20: "ACTIVITY-START jocul podului care traverseaza pagina",
        21: "continuare a jocului podului ACTIVITY-END",
    }

    body = []
    for page_no in range(1, 51):
        page_text = boundary_text.get(
            page_no, f"continut obisnuit pe pagina {page_no}"
        )
        # Each page contributes: marker line, one content line, blank separator.
        body.extend((f"--- PAGE {page_no} ---", page_text, ""))

    source_path = tmp_path / "src_paginated.txt"
    source_path.write_text("\n".join(header + body), encoding="utf-8")
    return source_path