# game-library/tests/test_extract_common.py

# -*- coding: utf-8 -*-
"""Tests for scripts/extract_common.py."""

import shutil
import zipfile

import pytest

import extract_common as ec


# --------------------------------------------------------------------------
# format detection
# --------------------------------------------------------------------------
def test_detect_format():
    """Check the label ``detect_format`` returns for each sample path."""
    expected_by_path = {
        "a/b/file.PDF": "pdf",  # upper-case extension inside a nested path
        "x.docx": "docx",
        "x.doc": "doc",
        "x.pptx": "pptx",
        "x.html": "html",
        "x.zip": "zip",
        "x.epub": "epub",
        "x.xyz": "unknown",  # extension with no dedicated label
    }
    for sample_path, label in expected_by_path.items():
        assert ec.detect_format(sample_path) == label


def test_is_junk():
    """``is_junk`` flags the three junk samples but not the real PDF name."""
    junk_samples = ("some/desktop.ini", "notes.bak", "README.md")
    for sample in junk_samples:
        assert ec.is_junk(sample)
    assert not ec.is_junk("1000 Scout Games.pdf")


# --------------------------------------------------------------------------
# PDF — the critical "no max_pages" regression
# --------------------------------------------------------------------------
def test_pdf_extracts_all_60_pages(big_pdf):
    """Regression guard: extracting ``big_pdf`` must reach page 60."""
    text = ec.extract_pdf(big_pdf)
    # The old converter capped output at 50 pages, so finding page 60's
    # marker and payload shows the cap no longer applies.
    for needle in ("--- PAGE 60 ---", "PDFMARK-60"):
        assert needle in text
    marker_total = ec.count_page_markers(text)
    assert marker_total == 60


def test_pdf_does_not_truncate_mid_document(big_pdf):
    """The final entry from ``split_pages`` must carry page number 60."""
    final_page = ec.split_pages(ec.extract_pdf(big_pdf))[-1]
    # First element of the entry is the page number: the last marker has to
    # be the document's real last page.
    assert final_page[0] == 60


# --------------------------------------------------------------------------
# page join / split round-trip
# --------------------------------------------------------------------------
def test_join_split_round_trip():
    """Joining three texts and splitting again yields them numbered 1..3."""
    joined = ec.join_pages(["alpha", "beta", "gamma"])
    recovered = ec.split_pages(joined)

    page_numbers = [number for number, _ in recovered]
    assert page_numbers == [1, 2, 3]

    page_texts = [text for _, text in recovered]
    assert page_texts == ["alpha", "beta", "gamma"]


def test_split_pages_no_markers_returns_empty():
    """Text without any page marker splits into an empty list."""
    pages = ec.split_pages("plain text with no markers")
    assert pages == []


# --------------------------------------------------------------------------
# docx — synthetic page markers
# --------------------------------------------------------------------------
def test_docx_synthetic_page_markers(sample_docx):
    """Extracting ``sample_docx`` yields three page markers and the last paragraph."""
    text = ec.extract_docx(sample_docx)
    # 100 paragraphs at 40 per synthetic page => 3 pages
    marker_total = ec.count_page_markers(text)
    assert marker_total == 3
    assert "Paragraf 99" in text


# --------------------------------------------------------------------------
# HTML mirror — nav/script/footer stripped
# --------------------------------------------------------------------------
def test_html_strips_chrome(html_with_nav):
    """HTML extraction keeps the article text and drops the page chrome."""
    text = ec.extract_html(html_with_nav)

    # content that has to survive extraction
    for kept in ("Vanatoarea de comori", "joc real de orientare"):
        assert kept in text

    # chrome must be gone
    stripped_fragments = (
        "tracking",
        "Site Banner Junk",
        "toate drepturile rezervate",
        "Games",
    )
    for fragment in stripped_fragments:
        assert fragment not in text


# --------------------------------------------------------------------------
# content hash + near-duplicate elimination
# --------------------------------------------------------------------------
def test_content_hash_ignores_whitespace():
    """Whitespace-only differences hash equal; different words hash apart."""
    double_spaced = ec.content_hash("hello  world")
    trailing_newline = ec.content_hash("hello world\n")
    assert double_spaced == trailing_newline

    greeting = ec.content_hash("hello world")
    farewell = ec.content_hash("goodbye world")
    assert greeting != farewell


def test_dedupe_exact_duplicates():
    """Of two entries with identical text, only the first key survives."""
    entries = [
        ("a", "joc identic"),
        ("b", "joc identic"),
        ("c", "alt joc"),
    ]
    survivors = ec.dedupe_texts(entries)
    surviving_keys = [key for key, _ in survivors]
    assert surviving_keys == ["a", "c"]


def test_dedupe_near_duplicates():
    """With ``threshold=85.0`` a lightly extended copy is dropped, unrelated text kept."""
    base = "Vanatoarea de comori este un joc de orientare pentru cercetasi in tabara."
    # Same sentence plus a short suffix; the test expects this to count as a
    # near-duplicate of ``base`` at the threshold used below.
    near = base + " Pagina printata."
    entries = [
        ("orig", base),
        ("print", near),
        ("other", "Cu totul alt continut diferit aici."),
    ]
    survivors = ec.dedupe_texts(entries, threshold=85.0)
    surviving_keys = [key for key, _ in survivors]
    assert "orig" in surviving_keys
    assert "print" not in surviving_keys
    assert "other" in surviving_keys


# --------------------------------------------------------------------------
# zip recursion
# --------------------------------------------------------------------------
def test_zip_recurses_into_inner_files(sample_zip):
    """Extracting ``sample_zip`` yields inner text and at least one page marker."""
    text = ec.extract_zip(sample_zip)
    assert "Paragraf 0" in text
    marker_total = ec.count_page_markers(text)
    assert marker_total > 0


def test_zip_bad_archive_returns_empty(tmp_path):
    """A ``.zip`` file holding plain text extracts to an empty string."""
    broken_archive = tmp_path / "broken.zip"
    broken_archive.write_text("not a zip", encoding="utf-8")
    extracted = ec.extract_zip(broken_archive)
    assert extracted == ""


def test_nested_zip(tmp_path, sample_zip):
    """Text inside a zip stored within another zip is still extracted."""
    wrapper_path = tmp_path / "outer.zip"
    # Store the fixture archive as a member of a fresh outer archive.
    with zipfile.ZipFile(wrapper_path, "w") as wrapper:
        wrapper.write(sample_zip, arcname="nested/archive.zip")
    extracted = ec.extract_zip(wrapper_path)
    assert "Paragraf 0" in extracted


# --------------------------------------------------------------------------
# preflight
# --------------------------------------------------------------------------
def test_preflight_python_packages_present():
    """``preflight`` reports no missing Python packages."""
    # all required packages are installed in the test environment
    missing_packages = ec.preflight()["missing_python"]
    assert missing_packages == []


def test_preflight_reports_libreoffice_state():
    """A "libreoffice" warning appears exactly when neither binary is on PATH."""
    report = ec.preflight()
    office_found = bool(shutil.which("libreoffice") or shutil.which("soffice"))
    warning_present = any(
        "libreoffice" in warning for warning in report["warnings"]
    )
    if office_found:
        assert not warning_present
    else:
        assert warning_present


def test_preflight_ocr_flag():
    """With ``check_ocr=True`` a missing tesseract shows up in ``missing_system``."""
    report = ec.preflight(check_ocr=True)
    if shutil.which("tesseract"):
        # Nothing is asserted when tesseract is available on PATH.
        return
    assert any("tesseract" in entry for entry in report["missing_system"])


# --------------------------------------------------------------------------
# legacy .doc — skipped unless libreoffice is installed
# --------------------------------------------------------------------------
@pytest.mark.skipif(
    not shutil.which("libreoffice") and not shutil.which("soffice"),
    reason="libreoffice not installed",
)
def test_doc_conversion(tmp_path, sample_docx):
    """Smoke test: ``extract_doc`` on a ``.doc``-named file yields a page marker."""
    legacy_copy = tmp_path / "legacy.doc"
    # The docx fixture is copied under a .doc name, so this is only a smoke
    # test of the docx path rather than a genuine legacy document.
    shutil.copy(sample_docx, legacy_copy)
    text = ec.extract_doc(legacy_copy)
    marker_total = ec.count_page_markers(text)
    assert marker_total >= 1


def test_doc_without_libreoffice_raises(tmp_path, monkeypatch):
    """``extract_doc`` raises ``RuntimeError`` when ``which`` finds no binary."""

    def _nothing_on_path(_):
        # Stand-in for ``shutil.which`` that reports every lookup as missing.
        return None

    monkeypatch.setattr(ec.shutil, "which", _nothing_on_path)
    target = tmp_path / "whatever.doc"
    with pytest.raises(RuntimeError):
        ec.extract_doc(target)