Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
178 lines
6.4 KiB
Python
178 lines
6.4 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""Tests for scripts/extract_common.py."""
|
|
|
|
import shutil
|
|
import zipfile
|
|
|
|
import pytest
|
|
|
|
import extract_common as ec
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# format detection
|
|
# --------------------------------------------------------------------------
|
|
def test_detect_format():
    """detect_format names the format from the file extension.

    The upper-case ``.PDF`` case checks that matching is case-insensitive,
    and the ``.xyz`` case checks the ``"unknown"`` fallback.
    """
    expected_by_path = {
        "a/b/file.PDF": "pdf",
        "x.docx": "docx",
        "x.doc": "doc",
        "x.pptx": "pptx",
        "x.html": "html",
        "x.zip": "zip",
        "x.epub": "epub",
        "x.xyz": "unknown",
    }
    for path, expected in expected_by_path.items():
        assert ec.detect_format(path) == expected
|
|
|
|
|
|
def test_is_junk():
    """is_junk flags housekeeping files and leaves real source documents alone."""
    junk_paths = ("some/desktop.ini", "notes.bak", "README.md")
    for path in junk_paths:
        assert ec.is_junk(path)

    # A genuine source document must never be classified as junk.
    assert not ec.is_junk("1000 Scout Games.pdf")
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# PDF — the critical "no max_pages" regression
|
|
# --------------------------------------------------------------------------
|
|
def test_pdf_extracts_all_60_pages(big_pdf):
    """extract_pdf returns every page of the 60-page fixture.

    Regression guard: the old converter capped output at 50 pages, so the
    marker and the payload of page 60 must both be present.
    """
    text = ec.extract_pdf(big_pdf)

    for needle in ("--- PAGE 60 ---", "PDFMARK-60"):
        assert needle in text

    marker_count = ec.count_page_markers(text)
    assert marker_count == 60
|
|
|
|
|
|
def test_pdf_does_not_truncate_mid_document(big_pdf):
    """The final page split out of the extracted PDF is numbered 60."""
    pages = ec.split_pages(ec.extract_pdf(big_pdf))

    # Each entry starts with its page number; the last one must be the
    # document's real last page.
    last_page_number = pages[-1][0]
    assert last_page_number == 60
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# page join / split round-trip
|
|
# --------------------------------------------------------------------------
|
|
def test_join_split_round_trip():
    """split_pages inverts join_pages: pages come back numbered from 1, text intact."""
    joined = ec.join_pages(["alpha", "beta", "gamma"])

    numbers = []
    texts = []
    for number, text in ec.split_pages(joined):
        numbers.append(number)
        texts.append(text)

    assert numbers == [1, 2, 3]
    assert texts == ["alpha", "beta", "gamma"]
|
|
|
|
|
|
def test_split_pages_no_markers_returns_empty():
    """Text without any page marker splits into an empty list."""
    pages = ec.split_pages("plain text with no markers")
    assert pages == []
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# docx — synthetic page markers
|
|
# --------------------------------------------------------------------------
|
|
def test_docx_synthetic_page_markers(sample_docx):
    """extract_docx inserts synthetic page markers and keeps the last paragraph."""
    text = ec.extract_docx(sample_docx)

    # 100 paragraphs at 40 per synthetic page => 3 pages.
    marker_count = ec.count_page_markers(text)
    assert marker_count == 3

    # The final paragraph must survive extraction.
    assert "Paragraf 99" in text
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# HTML mirror — nav/script/footer stripped
|
|
# --------------------------------------------------------------------------
|
|
def test_html_strips_chrome(html_with_nav):
    """extract_html keeps the article text and drops the page chrome."""
    text = ec.extract_html(html_with_nav)

    # Real content that must survive extraction.
    kept_phrases = ("Vanatoarea de comori", "joc real de orientare")
    for phrase in kept_phrases:
        assert phrase in text

    # Chrome that must be gone.
    stripped_phrases = (
        "tracking",
        "Site Banner Junk",
        "toate drepturile rezervate",
        "Games",
    )
    for phrase in stripped_phrases:
        assert phrase not in text
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# content hash + near-duplicate elimination
|
|
# --------------------------------------------------------------------------
|
|
def test_content_hash_ignores_whitespace():
    """A trailing newline does not change the hash; different words do."""
    reference = ec.content_hash("hello world")

    assert reference == ec.content_hash("hello world\n")
    assert reference != ec.content_hash("goodbye world")
|
|
|
|
|
|
def test_dedupe_exact_duplicates():
    """Of two entries with identical text, only the first one is kept."""
    candidates = [
        ("a", "joc identic"),
        ("b", "joc identic"),
        ("c", "alt joc"),
    ]

    surviving_keys = [key for key, _ in ec.dedupe_texts(candidates)]

    assert surviving_keys == ["a", "c"]
|
|
|
|
|
|
def test_dedupe_near_duplicates():
    """A near-identical variant is dropped at threshold=85.0; unrelated text stays."""
    original = "Vanatoarea de comori este un joc de orientare pentru cercetasi in tabara."
    # Same text with one short sentence appended, imitating a "print" copy.
    printed = original + " Pagina printata."
    unrelated = "Cu totul alt continut diferit aici."

    kept = ec.dedupe_texts(
        [("orig", original), ("print", printed), ("other", unrelated)],
        threshold=85.0,
    )
    kept_keys = [key for key, _ in kept]

    assert "orig" in kept_keys
    assert "print" not in kept_keys
    assert "other" in kept_keys
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# zip recursion
|
|
# --------------------------------------------------------------------------
|
|
def test_zip_recurses_into_inner_files(sample_zip):
    """extract_zip surfaces the text and page markers of files inside the archive."""
    text = ec.extract_zip(sample_zip)

    assert "Paragraf 0" in text

    marker_count = ec.count_page_markers(text)
    assert marker_count > 0
|
|
|
|
|
|
def test_zip_bad_archive_returns_empty(tmp_path):
    """A file that is not a valid zip archive yields an empty string."""
    broken_archive = tmp_path / "broken.zip"
    broken_archive.write_text("not a zip", encoding="utf-8")

    extracted = ec.extract_zip(broken_archive)

    assert extracted == ""
|
|
|
|
|
|
def test_nested_zip(tmp_path, sample_zip):
    """extract_zip reaches content stored in a zip nested inside another zip."""
    wrapper = tmp_path / "outer.zip"
    with zipfile.ZipFile(wrapper, "w") as archive:
        archive.write(sample_zip, arcname="nested/archive.zip")

    # The outer archive is closed (fully written) before it is read back.
    assert "Paragraf 0" in ec.extract_zip(wrapper)
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# preflight
|
|
# --------------------------------------------------------------------------
|
|
def test_preflight_python_packages_present():
    """preflight reports no missing Python packages in the test environment."""
    # All required packages are expected to be installed wherever tests run.
    missing_packages = ec.preflight()["missing_python"]
    assert missing_packages == []
|
|
|
|
|
|
def test_preflight_reports_libreoffice_state():
    """preflight warns about libreoffice exactly when no binary is on PATH."""
    report = ec.preflight()
    installed = bool(shutil.which("libreoffice") or shutil.which("soffice"))

    warned = any("libreoffice" in warning for warning in report["warnings"])

    # Installed => no warning may mention it; absent => some warning must.
    assert warned != installed
|
|
|
|
|
|
def test_preflight_ocr_flag():
    """With check_ocr=True, a missing tesseract is listed in missing_system.

    When tesseract is installed this test makes no assertion.
    """
    report = ec.preflight(check_ocr=True)

    if shutil.which("tesseract"):
        return

    assert any("tesseract" in entry for entry in report["missing_system"])
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# legacy .doc — skipped unless libreoffice is installed
|
|
# --------------------------------------------------------------------------
|
|
@pytest.mark.skipif(
    not shutil.which("libreoffice") and not shutil.which("soffice"),
    reason="libreoffice not installed",
)
def test_doc_conversion(tmp_path, sample_docx):
    """extract_doc yields at least one page marker for a ``.doc`` input.

    Smoke test only: the input is the docx fixture copied under a ``.doc``
    name, not a genuine legacy Word file.
    """
    legacy_path = tmp_path / "legacy.doc"
    shutil.copy(sample_docx, legacy_path)

    marker_count = ec.count_page_markers(ec.extract_doc(legacy_path))

    assert marker_count >= 1
|
|
|
|
|
|
def test_doc_without_libreoffice_raises(tmp_path, monkeypatch):
    """extract_doc raises RuntimeError when no executable can be located."""

    def _nothing_on_path(_):
        # Stand-in for shutil.which that never finds any binary.
        return None

    monkeypatch.setattr(ec.shutil, "which", _nothing_on_path)

    # The file is never created.
    missing_doc = tmp_path / "whatever.doc"
    with pytest.raises(RuntimeError):
        ec.extract_doc(missing_doc)
|