"""Tests for scripts/stats.py.""" from __future__ import annotations import csv import sys from pathlib import Path import pytest sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from scripts.append_row import CSV_COLUMNS # noqa: E402 from scripts.stats import ( # noqa: E402 BACKTEST_SOURCES, CORE_CALIBRATION_FIELDS, bootstrap_ci, calibration_mismatch, compute_group_stats, expectancy, format_calibration_report, format_report, group_by, load_trades, main, win_rate, wilson_ci, ) # --------------------------------------------------------------------------- # Synthetic CSV fixture: 30 trades # --------------------------------------------------------------------------- def _base_row(**overrides) -> dict[str, str]: base = { "id": "0", "screenshot_file": "", "source": "vision", "data": "2026-05-13", "zi": "Mi", "ora_ro": "17:30", "ora_utc": "14:30", "instrument": "DIA", "directie": "Buy", "tf_mare": "5min", "tf_mic": "1min", "calitate": "Clară", "entry": "400.0", "sl": "399.0", "tp0": "400.5", "tp1": "401.0", "tp2": "402.0", "risc_pct": "0.25", "outcome_path": "TP0→TP1", "max_reached": "TP1", "be_moved": "True", "pl_marius": "0.5000", "pl_theoretical": "0.3330", "set": "A2", "indicator_version": "v-2026-05", "pl_overlay_version": "marius-v1", "csv_schema_version": "1", "extracted_at": "2026-05-13T10:00:00Z", "note": "", } base.update({k: str(v) for k, v in overrides.items()}) return base def _write_csv(path: Path, rows: list[dict[str, str]]) -> None: path.parent.mkdir(parents=True, exist_ok=True) with path.open("w", encoding="utf-8", newline="") as fh: w = csv.DictWriter(fh, fieldnames=list(CSV_COLUMNS)) w.writeheader() for r in rows: w.writerow({k: r.get(k, "") for k in CSV_COLUMNS}) def _synthetic_30(tmp_path: Path) -> Path: """30 vision-source trades engineered for known stats. 
Layout (by Set): - A1: 10 trades — 6 wins TP0->TP1 (+0.5), 4 losses SL (-1.0) → WR 60% - A2: 10 trades — 7 wins TP0->TP2 (+0.5), 3 losses SL (-1.0) → WR 70% - A3: 10 trades — 4 wins TP0->TP1 (+0.5), 6 losses SL (-1.0) → WR 40% Overall: 17 wins / 30, WR ≈ 56.67%. """ rows: list[dict[str, str]] = [] rid = 0 def add(set_label: str, n_win: int, n_loss: int, calitate: str = "Clară") -> None: nonlocal rid for _ in range(n_win): rid += 1 rows.append( _base_row( id=rid, screenshot_file=f"win-{rid}.png", set=set_label, calitate=calitate, outcome_path="TP0→TP1", max_reached="TP1", be_moved="True", pl_marius="0.5000", pl_theoretical="0.3330", ) ) for _ in range(n_loss): rid += 1 rows.append( _base_row( id=rid, screenshot_file=f"loss-{rid}.png", set=set_label, calitate=calitate, outcome_path="SL", max_reached="SL_first", be_moved="False", pl_marius="-1.0000", pl_theoretical="-1.0000", ) ) add("A1", 6, 4) add("A2", 7, 3) add("A3", 4, 6) path = tmp_path / "jurnal.csv" _write_csv(path, rows) return path # --------------------------------------------------------------------------- # Wilson CI — reference values # --------------------------------------------------------------------------- class TestWilsonCI: def test_n_zero(self) -> None: assert wilson_ci(0, 0) == (0.0, 0.0) def test_50pct_at_n40(self) -> None: lo, hi = wilson_ci(20, 40) assert lo == pytest.approx(0.3519927879709976, abs=1e-9) assert hi == pytest.approx(0.6480072120290024, abs=1e-9) def test_55pct_at_n40(self) -> None: lo, hi = wilson_ci(22, 40) assert lo == pytest.approx(0.3982882988844078, abs=1e-9) assert hi == pytest.approx(0.6929492471905531, abs=1e-9) def test_55pct_at_n100(self) -> None: # Larger N tightens the CI; lower bound rises above 45%. 
lo, hi = wilson_ci(55, 100) assert lo == pytest.approx(0.4524442703164345, abs=1e-9) assert hi == pytest.approx(0.6438562489359655, abs=1e-9) assert lo > 0.45 # STOPPING_RULE GO-LIVE gate def test_zero_wins(self) -> None: lo, hi = wilson_ci(0, 10) assert lo == pytest.approx(0.0, abs=1e-12) assert hi == pytest.approx(0.2775401687666165, abs=1e-9) def test_all_wins(self) -> None: lo, hi = wilson_ci(10, 10) assert lo == pytest.approx(0.7224598312333834, abs=1e-9) assert hi == pytest.approx(1.0, abs=1e-12) def test_wins_out_of_range(self) -> None: with pytest.raises(ValueError): wilson_ci(11, 10) with pytest.raises(ValueError): wilson_ci(-1, 10) # --------------------------------------------------------------------------- # Bootstrap CI — determinism + sanity # --------------------------------------------------------------------------- class TestBootstrapCI: def test_deterministic_with_seed(self) -> None: vals = [0.5, -1.0, 0.5, 0.5, -1.0, 0.2, -0.3, 0.5, -1.0, 0.5] lo1, hi1 = bootstrap_ci(vals, iterations=500, seed=42) lo2, hi2 = bootstrap_ci(vals, iterations=500, seed=42) assert (lo1, hi1) == (lo2, hi2) def test_different_seed_different_result(self) -> None: vals = [0.5, -1.0, 0.5, 0.5, -1.0, 0.2, -0.3, 0.5, -1.0, 0.5] r1 = bootstrap_ci(vals, iterations=500, seed=1) r2 = bootstrap_ci(vals, iterations=500, seed=2) assert r1 != r2 def test_brackets_the_mean(self) -> None: vals = [0.5, -1.0, 0.5, 0.5, -1.0, 0.2, -0.3, 0.5, -1.0, 0.5] * 5 mean = sum(vals) / len(vals) lo, hi = bootstrap_ci(vals, iterations=1000, seed=7) assert lo <= mean <= hi def test_empty_input(self) -> None: assert bootstrap_ci([], iterations=100, seed=0) == (0.0, 0.0) def test_single_value(self) -> None: lo, hi = bootstrap_ci([0.5], iterations=100, seed=0) # No variance with n=1: short-circuited to (mean, mean). 
assert lo == pytest.approx(0.5) assert hi == pytest.approx(0.5) # --------------------------------------------------------------------------- # Loading + group stats on the 30-trade fixture # --------------------------------------------------------------------------- class TestSyntheticFixture: def test_load_30(self, tmp_path: Path) -> None: path = _synthetic_30(tmp_path) trades = load_trades(path) assert len(trades) == 30 assert all(t.source == "vision" for t in trades) def test_overall_wr(self, tmp_path: Path) -> None: trades = load_trades(_synthetic_30(tmp_path)) wins, n, wr = win_rate(trades) assert wins == 17 assert n == 30 assert wr == pytest.approx(17 / 30) def test_overall_expectancy(self, tmp_path: Path) -> None: trades = load_trades(_synthetic_30(tmp_path)) # 17 wins * 0.5 + 13 losses * -1.0 = 8.5 - 13.0 = -4.5 → mean = -0.15 assert expectancy(trades) == pytest.approx(-0.15, abs=1e-9) def test_per_set_wr(self, tmp_path: Path) -> None: trades = load_trades(_synthetic_30(tmp_path)) by_set = group_by(trades, "set") wr_a1 = win_rate(by_set["A1"])[2] wr_a2 = win_rate(by_set["A2"])[2] wr_a3 = win_rate(by_set["A3"])[2] assert wr_a1 == pytest.approx(0.60) assert wr_a2 == pytest.approx(0.70) assert wr_a3 == pytest.approx(0.40) def test_group_stats_a2(self, tmp_path: Path) -> None: trades = load_trades(_synthetic_30(tmp_path)) a2 = [t for t in trades if t.set == "A2"] s = compute_group_stats( a2, label="A2", bootstrap_iterations=500, seed=11 ) assert s.n_total == 10 assert s.n_resolved == 10 assert s.wins == 7 assert s.wr == pytest.approx(0.70) # Wilson 7/10 assert s.wr_ci_lo == pytest.approx(0.3967732199795652, abs=1e-9) assert s.wr_ci_hi == pytest.approx(0.892210712513788, abs=1e-9) # Expectancy A2 = 7*0.5 + 3*(-1.0) = 0.5 → mean = 0.05 assert s.exp_marius == pytest.approx(0.05, abs=1e-9) assert s.exp_marius_ci_lo <= s.exp_marius <= s.exp_marius_ci_hi # --------------------------------------------------------------------------- # Pending-trade handling # 
# ---------------------------------------------------------------------------


class TestPendingHandling:
    def test_pending_excluded_from_wr(self, tmp_path: Path) -> None:
        rows = [
            _base_row(
                id=1,
                screenshot_file="a.png",
                outcome_path="TP0→TP1",
                max_reached="TP1",
                be_moved="True",
                pl_marius="0.5000",
                pl_theoretical="0.3330",
            ),
            _base_row(
                id=2,
                screenshot_file="b.png",
                outcome_path="pending",
                max_reached="TP0",
                be_moved="False",
                pl_marius="",
                pl_theoretical="0.1330",
            ),
            _base_row(
                id=3,
                screenshot_file="c.png",
                outcome_path="SL",
                max_reached="SL_first",
                be_moved="False",
                pl_marius="-1.0000",
                pl_theoretical="-1.0000",
            ),
        ]
        p = tmp_path / "j.csv"
        _write_csv(p, rows)
        trades = load_trades(p)
        wins, n, wr = win_rate(trades)
        assert wins == 1
        assert n == 2  # pending excluded
        assert wr == pytest.approx(0.5)
        # Expectancy on pl_marius averages only resolved rows: (0.5 + -1.0) / 2 = -0.25
        assert expectancy(trades, "pl_marius") == pytest.approx(-0.25)


# ---------------------------------------------------------------------------
# Source filtering: calibration rows excluded from main report
# ---------------------------------------------------------------------------


class TestSourceFiltering:
    def test_calibration_rows_excluded_from_backtest_stats(
        self, tmp_path: Path
    ) -> None:
        rows = [
            _base_row(id=1, source="vision", screenshot_file="v.png", pl_marius="0.5000"),
            _base_row(id=2, source="manual", screenshot_file="m.png", pl_marius="0.5000"),
            _base_row(id=3, source="manual_calibration", screenshot_file="c.png", pl_marius="-1.0000"),
            _base_row(id=4, source="vision_calibration", screenshot_file="c.png", pl_marius="-1.0000"),
        ]
        p = tmp_path / "j.csv"
        _write_csv(p, rows)
        trades = load_trades(p)
        backtest = [t for t in trades if t.source in BACKTEST_SOURCES]
        assert len(backtest) == 2
        wins, n, wr = win_rate(backtest)
        assert (wins, n) == (2, 2)
        assert wr == pytest.approx(1.0)


# ---------------------------------------------------------------------------
# Calibration mode: pairing + mismatch
# ---------------------------------------------------------------------------


class TestCalibration:
    def test_pairs_and_zero_mismatch(self, tmp_path: Path) -> None:
        m = _base_row(
            id=1, source="manual_calibration", screenshot_file="cal-1.png"
        )
        v = _base_row(
            id=2, source="vision_calibration", screenshot_file="cal-1.png"
        )
        p = tmp_path / "j.csv"
        _write_csv(p, [m, v])
        trades = load_trades(p)
        rep = calibration_mismatch(trades)
        assert rep.pairs == 1
        assert sum(rep.field_mismatches.values()) == 0
        assert rep.overall_mismatch_rate == 0.0

    def test_one_field_mismatch(self, tmp_path: Path) -> None:
        m = _base_row(
            id=1,
            source="manual_calibration",
            screenshot_file="cal-1.png",
            entry="400.0",
        )
        v = _base_row(
            id=2,
            source="vision_calibration",
            screenshot_file="cal-1.png",
            entry="400.10",  # different entry
        )
        p = tmp_path / "j.csv"
        _write_csv(p, [m, v])
        trades = load_trades(p)
        rep = calibration_mismatch(trades)
        assert rep.pairs == 1
        assert rep.field_mismatches["entry"] == 1
        # all other core fields match
        for fld in CORE_CALIBRATION_FIELDS:
            if fld == "entry":
                continue
            assert rep.field_mismatches[fld] == 0
        # 1 mismatch / (1 pair * 8 fields) = 12.5%
        assert rep.overall_mismatch_rate == pytest.approx(1.0 / len(CORE_CALIBRATION_FIELDS))

    def test_unpaired_rows_ignored(self, tmp_path: Path) -> None:
        # Only a manual leg — no pair → 0 pairs.
        m = _base_row(
            id=1, source="manual_calibration", screenshot_file="lonely.png"
        )
        p = tmp_path / "j.csv"
        _write_csv(p, [m])
        trades = load_trades(p)
        rep = calibration_mismatch(trades)
        assert rep.pairs == 0
        assert rep.total_comparisons == 0
        assert rep.overall_mismatch_rate == 0.0

    def test_numeric_equivalence_tolerated(self, tmp_path: Path) -> None:
        """'400' and '400.0000' should NOT count as a mismatch on entry."""
        m = _base_row(
            id=1,
            source="manual_calibration",
            screenshot_file="cal-1.png",
            entry="400",
        )
        v = _base_row(
            id=2,
            source="vision_calibration",
            screenshot_file="cal-1.png",
            entry="400.0000",
        )
        p = tmp_path / "j.csv"
        _write_csv(p, [m, v])
        rep = calibration_mismatch(load_trades(p))
        assert rep.field_mismatches["entry"] == 0


# ---------------------------------------------------------------------------
# Report formatting + CLI
# ---------------------------------------------------------------------------


class TestReporting:
    def test_format_report_contains_sections(self, tmp_path: Path) -> None:
        out = format_report(
            load_trades(_synthetic_30(tmp_path)),
            bootstrap_iterations=200,
            seed=0,
        )
        assert "M2D Backtest Stats" in out
        assert "Overall" in out
        assert "By Set" in out
        assert "A1" in out and "A2" in out and "A3" in out
        # calitate warning present
        assert "descriptor only" in out.lower() or "biased" in out.lower()

    def test_format_calibration_report(self, tmp_path: Path) -> None:
        rows = [
            _base_row(
                id=1, source="manual_calibration", screenshot_file="cal-1.png"
            ),
            _base_row(
                id=2,
                source="vision_calibration",
                screenshot_file="cal-1.png",
                directie="Sell",  # mismatch on directie
                entry="400.0",
                sl="401.0",
                tp0="399.5",
                tp1="399.0",
                tp2="398.0",
            ),
        ]
        p = tmp_path / "j.csv"
        _write_csv(p, rows)
        out = format_calibration_report(load_trades(p))
        assert "Paired screenshots" in out
        assert "directie" in out
        # 1 mismatch (directie) of 8 fields = 12.5% → FAIL P4 gate
        assert "FAIL" in out

    def test_empty_csv_report(self, tmp_path: Path) -> None:
        p = tmp_path / "empty.csv"
        _write_csv(p, [])
        out = format_report(load_trades(p))
        assert "no backtest trades" in out.lower()

    def test_main_cli_runs(
        self, tmp_path: Path, capsys: pytest.CaptureFixture
    ) -> None:
        path = _synthetic_30(tmp_path)
        rc = main(["--csv", str(path), "--seed", "0", "--bootstrap-iterations", "100"])
        assert rc == 0
        captured = capsys.readouterr()
        assert "M2D Backtest Stats" in captured.out

    def test_main_cli_calibration(
        self, tmp_path: Path, capsys: pytest.CaptureFixture
    ) -> None:
        rows = [
            _base_row(id=1, source="manual_calibration", screenshot_file="cal-1.png"),
            _base_row(id=2, source="vision_calibration", screenshot_file="cal-1.png"),
        ]
        p = tmp_path / "j.csv"
        _write_csv(p, rows)
        rc = main(["--csv", str(p), "--calibration"])
        assert rc == 0
        out = capsys.readouterr().out
        assert "Calibration P4 gate" in out
        assert "PASS" in out  # all fields match → PASS