Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
115 lines
4.0 KiB
Python
115 lines
4.0 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
Shared pytest fixtures for the extraction-pipeline tests.
|
|
|
|
scripts/ is not a package, so it is added to sys.path here. Synthetic fixtures
|
|
(PDF, docx, zip, HTML) are generated at runtime — no binary blobs in the repo.
|
|
"""
|
|
|
|
import sys
|
|
import zipfile
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
# Repository root: this file sits one directory below it.
REPO_ROOT = Path(__file__).resolve().parent.parent
SCRIPTS_DIR = REPO_ROOT.joinpath("scripts")

# Put scripts/ at the front of sys.path exactly once so its modules can be
# imported by name from the tests.
_scripts_entry = str(SCRIPTS_DIR)
if _scripts_entry not in sys.path:
    sys.path.insert(0, _scripts_entry)
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# synthetic PDF — deliberately large to pin the "no max_pages" regression
|
|
# --------------------------------------------------------------------------
|
|
@pytest.fixture
def big_pdf(tmp_path):
    """Build a 60-page PDF named ``big.pdf`` in ``tmp_path`` and return its path.

    Page ``n`` (1..60) carries its own ``PDFMARK-<n>`` token on the first text
    line and a fixed Romanian sentence on the second.
    """
    # Imported inside the fixture, so reportlab is only loaded when it runs.
    from reportlab.lib.pagesizes import letter
    from reportlab.pdfgen import canvas

    page_count = 60
    pdf_path = tmp_path / "big.pdf"
    pdf = canvas.Canvas(str(pdf_path), pagesize=letter)
    for page_no in range(1, page_count + 1):
        marker_line = f"PDFMARK-{page_no} synthetic activity page number {page_no}"
        pdf.drawString(72, 720, marker_line)
        pdf.drawString(72, 700, "Acest joc educativ se joaca in echipa.")
        # Finish the current page; the next drawString starts a new one.
        pdf.showPage()
    pdf.save()
    return pdf_path
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# synthetic docx — 100 paragraphs => 3 synthetic pages at 40 paras/page
|
|
# --------------------------------------------------------------------------
|
|
@pytest.fixture
def sample_docx(tmp_path):
    """Write a 100-paragraph ``sample.docx`` into ``tmp_path`` and return its path.

    Paragraph ``i`` (0..99) reads ``Paragraf <i>: continut joc team-building.``
    """
    # Imported inside the fixture, so python-docx is only loaded when it runs.
    import docx

    docx_path = tmp_path / "sample.docx"
    doc = docx.Document()
    paragraph_texts = (
        f"Paragraf {index}: continut joc team-building." for index in range(100)
    )
    for text in paragraph_texts:
        doc.add_paragraph(text)
    doc.save(str(docx_path))
    return docx_path
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# synthetic HTML mirror page — with nav/script/footer chrome to strip
|
|
# --------------------------------------------------------------------------
|
|
# Page markup used by the html_with_nav fixture: a <main> section holding the
# game text, surrounded by style/script/nav/header/footer elements.
HTML_WITH_NAV = """<!doctype html>
<html><head><title>Joc</title>
<style>.x{color:red}</style>
<script>var tracking = 1;</script>
</head><body>
<nav><a href="/">Home</a><a href="/games">Games</a></nav>
<header>Site Banner Junk</header>
<main>
<h1>Vanatoarea de comori</h1>
<p>Acesta este un joc real de orientare pentru cercetasi.</p>
<p>Jucatorii cauta indicii ascunse in tabara.</p>
</main>
<footer>Copyright 2024 - toate drepturile rezervate</footer>
</body></html>
"""
|
|
|
|
|
|
@pytest.fixture
def html_with_nav(tmp_path):
    """Write ``HTML_WITH_NAV`` to ``page.html`` in ``tmp_path`` as UTF-8.

    Returns the path of the written file.
    """
    html_path = tmp_path.joinpath("page.html")
    with html_path.open("w", encoding="utf-8") as handle:
        handle.write(HTML_WITH_NAV)
    return html_path
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# synthetic zip — contains a docx and a stray junk file
|
|
# --------------------------------------------------------------------------
|
|
@pytest.fixture
def sample_zip(tmp_path, sample_docx):
    """Create ``archive.zip`` in ``tmp_path`` and return its path.

    The archive holds two members: the ``sample_docx`` file stored as
    ``inner/sample.docx`` and a ``desktop.ini`` entry whose content is the
    text ``junk``.
    """
    zip_path = tmp_path / "archive.zip"
    archive = zipfile.ZipFile(zip_path, "w")
    # The context manager closes the archive, which finalizes the zip file.
    with archive:
        archive.write(sample_docx, arcname="inner/sample.docx")
        archive.writestr("desktop.ini", "junk")
    return zip_path
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# synthetic normalized source — paginated, with an activity straddling a
|
|
# page boundary so the chunker overlap can be verified.
|
|
# --------------------------------------------------------------------------
|
|
@pytest.fixture
def paginated_source(tmp_path):
    """Write a 50-page normalized text source and return its path.

    The file starts with a five-line header (SOURCE / CONVERTED / FORMAT, a
    50-character ``=`` rule, and a blank line). Each page then contributes a
    ``--- PAGE <n> ---`` marker, one line of text, and a blank line. Page 20
    holds the ``ACTIVITY-START`` line and page 21 the ``ACTIVITY-END`` line,
    so that activity spans the 20/21 boundary.
    """
    # Pages whose text differs from the generic filler line.
    boundary_pages = {
        20: "ACTIVITY-START jocul podului care traverseaza pagina",
        21: "continuare a jocului podului ACTIVITY-END",
    }
    rows = [
        "SOURCE: synthetic/test.pdf",
        "CONVERTED: 2026-05-19",
        "FORMAT: pdf",
        "=" * 50,
        "",
    ]
    for page_no in range(1, 51):
        filler = f"continut obisnuit pe pagina {page_no}"
        rows += [
            f"--- PAGE {page_no} ---",
            boundary_pages.get(page_no, filler),
            "",
        ]
    out_path = tmp_path / "src_paginated.txt"
    out_path.write_text("\n".join(rows), encoding="utf-8")
    return out_path
|