# game-library/tests/conftest.py

# -*- coding: utf-8 -*-
"""
Shared pytest fixtures for the extraction-pipeline tests.

scripts/ is not a package, so it is added to sys.path here. Synthetic fixtures
(PDF, docx, zip, HTML) are generated at runtime — no binary blobs in the repo.
"""

import sys
import zipfile
from pathlib import Path

import pytest

# Repository root: two directory levels above this file.
REPO_ROOT = Path(__file__).resolve().parent.parent
# scripts/ is not a package (see the module docstring), so the directory
# itself has to be on sys.path for its modules to be importable.
SCRIPTS_DIR = REPO_ROOT.joinpath("scripts")
# Prepend only when the entry is absent, so it is never duplicated.
if not sys.path.count(str(SCRIPTS_DIR)):
    sys.path.insert(0, str(SCRIPTS_DIR))


# --------------------------------------------------------------------------
# synthetic PDF — deliberately large to pin the "no max_pages" regression
# --------------------------------------------------------------------------
@pytest.fixture
def big_pdf(tmp_path):
    """Build ``big.pdf`` under ``tmp_path`` and return its path.

    The document has 60 pages. Every page gets two text lines: one that
    starts with a page-specific ``PDFMARK-<n>`` token (n = 1..60) and one
    fixed sentence that is identical on every page.
    """
    # Function-scope import: reportlab is only loaded when this fixture is
    # actually requested by a test.
    from reportlab.lib.pagesizes import letter
    from reportlab.pdfgen.canvas import Canvas

    pdf_path = tmp_path / "big.pdf"
    pdf = Canvas(str(pdf_path), pagesize=letter)
    for page_no in range(1, 61):
        marker_line = f"PDFMARK-{page_no} synthetic activity page number {page_no}"
        pdf.drawString(72, 720, marker_line)
        pdf.drawString(72, 700, "Acest joc educativ se joaca in echipa.")
        pdf.showPage()
    pdf.save()
    return pdf_path


# --------------------------------------------------------------------------
# synthetic docx — 100 paragraphs => 3 synthetic pages at 40 paras/page
# --------------------------------------------------------------------------
@pytest.fixture
def sample_docx(tmp_path):
    """Create ``sample.docx`` under ``tmp_path`` and return its path.

    The document consists of 100 paragraphs; each paragraph's text embeds
    its own index, running from 0 through 99.
    """
    # Function-scope import: python-docx is only loaded when this fixture
    # is actually requested by a test.
    from docx import Document

    target = tmp_path / "sample.docx"
    doc = Document()
    paragraph_texts = (
        f"Paragraf {index}: continut joc team-building." for index in range(100)
    )
    for text in paragraph_texts:
        doc.add_paragraph(text)
    doc.save(str(target))
    return target


# --------------------------------------------------------------------------
# synthetic HTML mirror page — with nav/script/footer chrome to strip
# --------------------------------------------------------------------------
# One entry per line of the page. The trailing empty entry makes the joined
# text end with a newline, exactly like the final line of a text file.
HTML_WITH_NAV = "\n".join((
    "<!doctype html>",
    "<html><head><title>Joc</title>",
    # Non-content markup in <head>.
    "<style>.x{color:red}</style>",
    "<script>var tracking = 1;</script>",
    "</head><body>",
    # Site chrome that precedes the content.
    '<nav><a href="/">Home</a><a href="/games">Games</a></nav>',
    "<header>Site Banner Junk</header>",
    # The actual page content.
    "<main>",
    "<h1>Vanatoarea de comori</h1>",
    "<p>Acesta este un joc real de orientare pentru cercetasi.</p>",
    "<p>Jucatorii cauta indicii ascunse in tabara.</p>",
    "</main>",
    # Site chrome that follows the content.
    "<footer>Copyright 2024 - toate drepturile rezervate</footer>",
    "</body></html>",
    "",
))


@pytest.fixture
def html_with_nav(tmp_path):
    """Write ``HTML_WITH_NAV`` to ``page.html`` under ``tmp_path``.

    The file is written as UTF-8 text; the fixture returns its path.
    """
    page = tmp_path.joinpath("page.html")
    with page.open("w", encoding="utf-8") as handle:
        handle.write(HTML_WITH_NAV)
    return page


# --------------------------------------------------------------------------
# synthetic zip — contains a docx and a stray junk file
# --------------------------------------------------------------------------
@pytest.fixture
def sample_zip(tmp_path, sample_docx):
    """Create ``archive.zip`` under ``tmp_path`` and return its path.

    The archive holds two entries, added in this order:

    * ``inner/sample.docx`` -- the file produced by the ``sample_docx`` fixture
    * ``desktop.ini`` -- a text entry whose whole content is ``junk``
    """
    archive_path = tmp_path / "archive.zip"
    with zipfile.ZipFile(archive_path, mode="w") as archive:
        # Stored under a sub-directory name inside the archive.
        archive.write(sample_docx, "inner/sample.docx")
        archive.writestr("desktop.ini", "junk")
    return archive_path


# --------------------------------------------------------------------------
# synthetic normalized source — paginated, with an activity straddling a
# page boundary so the chunker overlap can be verified.
# --------------------------------------------------------------------------
@pytest.fixture
def paginated_source(tmp_path):
    """Write a 50-page text source to ``src_paginated.txt`` and return its path.

    Layout of the file: a five-line header (SOURCE, CONVERTED, FORMAT, a rule
    of 50 ``=`` characters, a blank line) followed by one three-line group per
    page -- a ``--- PAGE <n> ---`` marker, one line of body text, and a blank
    line. Page 20 carries the ``ACTIVITY-START`` line and page 21 the
    ``ACTIVITY-END`` line, so that activity spans the page 20/21 boundary.
    """
    # Pages whose body differs from the generic filler text.
    special_bodies = {
        20: "ACTIVITY-START jocul podului care traverseaza pagina",
        21: "continuare a jocului podului ACTIVITY-END",
    }
    header = [
        "SOURCE: synthetic/test.pdf",
        "CONVERTED: 2026-05-19",
        "FORMAT: pdf",
        "=" * 50,
        "",
    ]
    pages = []
    for page_no in range(1, 51):
        body = special_bodies.get(page_no, f"continut obisnuit pe pagina {page_no}")
        pages += [f"--- PAGE {page_no} ---", body, ""]
    target = tmp_path / "src_paginated.txt"
    target.write_text("\n".join(header + pages), encoding="utf-8")
    return target