Files
game-library/tests/test_extract_common.py
Claude Agent 66ae831c36 Rebuild extraction pipeline infrastructure (Faza 0 prep)
Implements the approved plan to replace the broken regex/index-master
extraction with an LLM-subagent pipeline. Four parallel lanes:

Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no
  max_pages truncation), normalize_sources.py, chunk_sources.py
  (~20pg chunks + overlap, manifest registry), activity_schema.json.
Lane B — app/config_taxonomy.py (16 fixed category slugs), schema
  rebuilt from scratch in app/models/ with content_type, language,
  source_files, source_excerpt, normalized_name, extraction_confidence,
  needs_review; FTS5 + 3 triggers extended with materials_list and
  skills_developed.
Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy
  source_excerpt validation, dedup with needs_review band),
  validate_extractions.py, review_queue.py, new run_extraction.py
  orchestrator, SUBAGENT_PROMPT.md.
Lane D — search.py content_type/language filters (default search
  excludes non-game content), E7 schema-compat audit; fixed a NULL
  keywords AttributeError in _boost_search_relevance.

Removes 8 orphaned/dead scripts and app/services/parser.py +
indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent).

Note: Lane D made one additive edit to app/models/database.py
(_update_category_counts) to surface content_type/language in
get_filter_options, outside its nominal lane boundary but after
Lane B completed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 17:43:38 +00:00

178 lines
6.4 KiB
Python

# -*- coding: utf-8 -*-
"""Tests for scripts/extract_common.py."""
import shutil
import zipfile
import pytest
import extract_common as ec
# --------------------------------------------------------------------------
# format detection
# --------------------------------------------------------------------------
def test_detect_format():
    """Check the format tag detect_format returns for each sample path."""
    # (path, expected tag) pairs. The first path has an upper-case extension
    # and a directory prefix; the last has an extension expected to yield
    # "unknown".
    expectations = [
        ("a/b/file.PDF", "pdf"),
        ("x.docx", "docx"),
        ("x.doc", "doc"),
        ("x.pptx", "pptx"),
        ("x.html", "html"),
        ("x.zip", "zip"),
        ("x.epub", "epub"),
        ("x.xyz", "unknown"),
    ]
    for path, expected in expectations:
        assert ec.detect_format(path) == expected
def test_is_junk():
    """is_junk must accept the junk samples and reject a real PDF name."""
    junk_samples = ("some/desktop.ini", "notes.bak", "README.md")
    for sample in junk_samples:
        assert ec.is_junk(sample)
    # A genuine source document must not be classified as junk.
    assert not ec.is_junk("1000 Scout Games.pdf")
# --------------------------------------------------------------------------
# PDF — the critical "no max_pages" regression
# --------------------------------------------------------------------------
def test_pdf_extracts_all_60_pages(big_pdf):
    """Extracting the ``big_pdf`` fixture must yield all 60 pages."""
    text = ec.extract_pdf(big_pdf)
    # The previous converter stopped at 50 pages, so both the marker and the
    # content token of page 60 have to show up in the output.
    for fragment in ("--- PAGE 60 ---", "PDFMARK-60"):
        assert fragment in text
    assert ec.count_page_markers(text) == 60
def test_pdf_does_not_truncate_mid_document(big_pdf):
    """The last page split out of the extracted PDF must be numbered 60."""
    pages = ec.split_pages(ec.extract_pdf(big_pdf))
    final_page = pages[-1]
    # Index 0 of each entry is the page number: the final marker has to be
    # the document's real last page.
    assert final_page[0] == 60
# --------------------------------------------------------------------------
# page join / split round-trip
# --------------------------------------------------------------------------
def test_join_split_round_trip():
    """split_pages must recover what join_pages combined, numbered from 1."""
    joined = ec.join_pages(["alpha", "beta", "gamma"])
    pages = ec.split_pages(joined)

    numbers = [number for number, _ in pages]
    assert numbers == [1, 2, 3]

    texts = [text for _, text in pages]
    assert texts == ["alpha", "beta", "gamma"]
def test_split_pages_no_markers_returns_empty():
    """Text that contains no page markers must split into an empty list."""
    pages = ec.split_pages("plain text with no markers")
    assert pages == []
# --------------------------------------------------------------------------
# docx — synthetic page markers
# --------------------------------------------------------------------------
def test_docx_synthetic_page_markers(sample_docx):
    """extract_docx must emit three page markers for the ``sample_docx`` fixture."""
    text = ec.extract_docx(sample_docx)
    # 100 paragraphs at 40 per page round up to 3 pages.
    marker_count = ec.count_page_markers(text)
    assert marker_count == 3
    # The final paragraph must have been extracted too.
    assert "Paragraf 99" in text
# --------------------------------------------------------------------------
# HTML mirror — nav/script/footer stripped
# --------------------------------------------------------------------------
def test_html_strips_chrome(html_with_nav):
    """extract_html must keep the article text and drop the page chrome."""
    text = ec.extract_html(html_with_nav)

    # Real content that has to survive extraction.
    for kept in ("Vanatoarea de comori", "joc real de orientare"):
        assert kept in text

    # Chrome strings that must no longer be present.
    removed_strings = (
        "tracking",
        "Site Banner Junk",
        "toate drepturile rezervate",
        "Games",
    )
    for removed in removed_strings:
        assert removed not in text
# --------------------------------------------------------------------------
# content hash + near-duplicate elimination
# --------------------------------------------------------------------------
def test_content_hash_ignores_whitespace():
    """A trailing newline must not change the hash; different text must."""
    plain = ec.content_hash("hello world")
    with_newline = ec.content_hash("hello world\n")
    assert plain == with_newline

    same_text = ec.content_hash("hello world")
    other_text = ec.content_hash("goodbye world")
    assert same_text != other_text
def test_dedupe_exact_duplicates():
    """Of two entries with identical text, only the first one is kept."""
    entries = [
        ("a", "joc identic"),
        ("b", "joc identic"),
        ("c", "alt joc"),
    ]
    survivors = ec.dedupe_texts(entries)
    surviving_keys = [key for key, _ in survivors]
    assert surviving_keys == ["a", "c"]
def test_dedupe_near_duplicates():
    """A text that only appends a short suffix is dropped as a near-duplicate."""
    original = "Vanatoarea de comori este un joc de orientare pentru cercetasi in tabara."
    # Same sentence plus a short trailer; expected to score above the
    # threshold=85.0 passed to dedupe_texts below.
    printed_copy = original + " Pagina printata."
    candidates = [
        ("orig", original),
        ("print", printed_copy),
        ("other", "Cu totul alt continut diferit aici."),
    ]

    survivors = ec.dedupe_texts(candidates, threshold=85.0)
    surviving_keys = [key for key, _ in survivors]

    assert "orig" in surviving_keys
    assert "print" not in surviving_keys
    assert "other" in surviving_keys
# --------------------------------------------------------------------------
# zip recursion
# --------------------------------------------------------------------------
def test_zip_recurses_into_inner_files(sample_zip):
    """extract_zip must surface text and page markers from archived files."""
    text = ec.extract_zip(sample_zip)
    assert "Paragraf 0" in text
    marker_count = ec.count_page_markers(text)
    assert marker_count > 0
def test_zip_bad_archive_returns_empty(tmp_path):
    """A file that is not a valid zip archive must extract to an empty string."""
    broken_archive = tmp_path / "broken.zip"
    # Plain text under a .zip name: not a readable archive.
    broken_archive.write_text("not a zip", encoding="utf-8")
    extracted = ec.extract_zip(broken_archive)
    assert extracted == ""
def test_nested_zip(tmp_path, sample_zip):
    """extract_zip must reach content inside a zip stored within a zip."""
    outer_path = tmp_path / "outer.zip"
    # Store the fixture archive as a member of a new outer archive; the
    # outer file is closed before extraction starts.
    with zipfile.ZipFile(outer_path, "w") as outer_archive:
        outer_archive.write(sample_zip, arcname="nested/archive.zip")

    text = ec.extract_zip(outer_path)
    assert "Paragraf 0" in text
# --------------------------------------------------------------------------
# preflight
# --------------------------------------------------------------------------
def test_preflight_python_packages_present():
    """preflight must report no missing Python packages."""
    # Every required package is expected to be installed in the test
    # environment, so the list has to be empty.
    missing = ec.preflight()["missing_python"]
    assert missing == []
def test_preflight_reports_libreoffice_state():
    """preflight warnings mention libreoffice exactly when it is not on PATH."""
    report = ec.preflight()
    # Either executable name counts as an installed LibreOffice.
    installed = any(shutil.which(exe) for exe in ("libreoffice", "soffice"))
    mentions = ["libreoffice" in warning for warning in report["warnings"]]
    if installed:
        assert not any(mentions)
    else:
        assert any(mentions)
def test_preflight_ocr_flag():
    """With check_ocr=True, a missing tesseract must be listed in missing_system."""
    report = ec.preflight(check_ocr=True)
    if shutil.which("tesseract"):
        # NOTE(review): nothing is asserted when tesseract is installed.
        return
    missing_system = report["missing_system"]
    assert any("tesseract" in entry for entry in missing_system)
# --------------------------------------------------------------------------
# legacy .doc — skipped unless libreoffice is installed
# --------------------------------------------------------------------------
@pytest.mark.skipif(
    not any(shutil.which(exe) for exe in ("libreoffice", "soffice")),
    reason="libreoffice not installed",
)
def test_doc_conversion(tmp_path, sample_docx):
    """extract_doc must produce at least one page marker for a ``.doc`` file."""
    legacy_path = tmp_path / "legacy.doc"
    # The input is the docx fixture copied under a .doc name, so this is
    # only a smoke test of the docx path.
    shutil.copy(sample_docx, legacy_path)
    text = ec.extract_doc(legacy_path)
    marker_count = ec.count_page_markers(text)
    assert marker_count >= 1
def test_doc_without_libreoffice_raises(tmp_path, monkeypatch):
    """extract_doc must raise RuntimeError when no converter executable is found.

    ``shutil.which`` is replaced, via the ``shutil`` attribute of
    ``extract_common``, with a stub that always returns ``None``. The
    ``monkeypatch`` fixture restores the original when the test ends.
    """
    # The stub accepts any positional/keyword arguments so that a call such
    # as which(cmd, path=...) still returns None rather than raising
    # TypeError, which would fail the test for the wrong reason.
    monkeypatch.setattr(ec.shutil, "which", lambda *_args, **_kwargs: None)
    with pytest.raises(RuntimeError):
        ec.extract_doc(tmp_path / "whatever.doc")