atm-backtesting/scripts/stats.py

"""Backtest statistics for ``data/jurnal.csv``.

Outputs:
- Overall + per-Set + per-calitate + per-instrument WR, expectancy.
- Wilson 95% CI for WR (closed form).
- Bootstrap percentile 95% CI for expectancy (deterministic via ``seed``).
- ``--calibration`` mode: joins ``manual_calibration`` rows with their
  ``vision_calibration`` counterparts on ``screenshot_file`` and reports
  field-by-field mismatch rates for the P4 gate (see ``STOPPING_RULE.md``).

A "win" is any trade with ``pl_marius > 0``. Pending trades
(``pl_marius`` blank, i.e. ``outcome_path in {pending, TP0->pending}``) are
excluded from both WR and expectancy: there is no realised outcome yet.

The ``calitate`` field is a known-biased descriptor (post-outcome
classification — see ``STOPPING_RULE.md`` §3). It is reported as
informational only and explicitly flagged as such; do NOT use it as a
filter for GO LIVE decisions.
"""

from __future__ import annotations

import argparse
import csv
import math
import random
import sys
from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterable

__all__ = [
    "CORE_CALIBRATION_FIELDS",
    "BACKTEST_SOURCES",
    "CALIBRATION_SOURCES",
    "Trade",
    "GroupStats",
    "load_trades",
    "wilson_ci",
    "bootstrap_ci",
    "win_rate",
    "expectancy",
    "group_by",
    "compute_group_stats",
    "calibration_mismatch",
    "format_report",
    "main",
]


# Fields compared in the calibration mismatch gate (STOPPING_RULE.md §P4).
CORE_CALIBRATION_FIELDS: tuple[str, ...] = (
    "entry",
    "sl",
    "tp0",
    "tp1",
    "tp2",
    "outcome_path",
    "max_reached",
    "directie",
)


BACKTEST_SOURCES: frozenset[str] = frozenset({"vision", "manual"})
CALIBRATION_SOURCES: frozenset[str] = frozenset(
    {"manual_calibration", "vision_calibration"}
)


# ---------------------------------------------------------------------------
# Loading / typed access
# ---------------------------------------------------------------------------


@dataclass(frozen=True)
class Trade:
    """One realised (or pending) trade row, typed."""

    id: int
    screenshot_file: str
    source: str
    data: str
    zi: str
    ora_ro: str
    instrument: str
    directie: str
    calitate: str
    set: str
    outcome_path: str
    max_reached: str
    be_moved: bool
    pl_marius: float | None
    pl_theoretical: float
    raw: dict[str, str] = field(default_factory=dict)

    @property
    def is_pending(self) -> bool:
        return self.pl_marius is None

    @property
    def is_win(self) -> bool:
        return self.pl_marius is not None and self.pl_marius > 0


def _parse_optional_float(value: str) -> float | None:
    s = (value or "").strip()
    if s == "":
        return None
    return float(s)


def _parse_bool(value: str) -> bool:
    return (value or "").strip().lower() in {"true", "1", "yes", "da"}


def _row_to_trade(row: dict[str, str]) -> Trade:
    return Trade(
        id=int(row.get("id") or 0),
        screenshot_file=row.get("screenshot_file", ""),
        source=row.get("source", ""),
        data=row.get("data", ""),
        zi=row.get("zi", ""),
        ora_ro=row.get("ora_ro", ""),
        instrument=row.get("instrument", ""),
        directie=row.get("directie", ""),
        calitate=row.get("calitate", ""),
        set=row.get("set", ""),
        outcome_path=row.get("outcome_path", ""),
        max_reached=row.get("max_reached", ""),
        be_moved=_parse_bool(row.get("be_moved", "")),
        pl_marius=_parse_optional_float(row.get("pl_marius", "")),
        pl_theoretical=float(row.get("pl_theoretical") or 0.0),
        raw=dict(row),
    )


def load_trades(csv_path: Path | str) -> list[Trade]:
    """Load all rows of ``csv_path`` as :class:`Trade` objects.

    Returns ``[]`` if the file does not exist or is empty.
    """
    p = Path(csv_path)
    if not p.exists() or p.stat().st_size == 0:
        return []
    with p.open("r", encoding="utf-8", newline="") as fh:
        reader = csv.DictReader(fh)
        return [_row_to_trade(r) for r in reader]


# ---------------------------------------------------------------------------
# Statistics primitives
# ---------------------------------------------------------------------------


def wilson_ci(wins: int, n: int, z: float = 1.96) -> tuple[float, float]:
    """Wilson score interval for a binomial proportion.

    Returns ``(lo, hi)`` as proportions in [0, 1]. For ``n == 0`` returns
    ``(0.0, 0.0)``. ``z = 1.96`` corresponds to a 95% CI.
    """
    if n <= 0:
        return (0.0, 0.0)
    if wins < 0 or wins > n:
        raise ValueError(f"wins={wins} out of range for n={n}")
    p_hat = wins / n
    denom = 1.0 + (z * z) / n
    center = p_hat + (z * z) / (2.0 * n)
    half = z * math.sqrt((p_hat * (1.0 - p_hat) + (z * z) / (4.0 * n)) / n)
    lo = (center - half) / denom
    hi = (center + half) / denom
    return (max(0.0, lo), min(1.0, hi))


def bootstrap_ci(
    values: list[float],
    *,
    iterations: int = 2000,
    alpha: float = 0.05,
    seed: int | None = None,
) -> tuple[float, float]:
    """Percentile-method bootstrap CI for the mean of ``values``.

    Deterministic when ``seed`` is provided. Returns ``(lo, hi)``. For
    ``len(values) < 2`` returns ``(mean, mean)``.
    """
    if not values:
        return (0.0, 0.0)
    n = len(values)
    mean = sum(values) / n
    if n < 2 or iterations <= 0:
        return (mean, mean)

    rng = random.Random(seed)
    means: list[float] = []
    for _ in range(iterations):
        s = 0.0
        for _ in range(n):
            s += values[rng.randrange(n)]
        means.append(s / n)
    means.sort()
    lo_idx = int(math.floor((alpha / 2.0) * iterations))
    hi_idx = int(math.ceil((1.0 - alpha / 2.0) * iterations)) - 1
    lo_idx = max(0, min(iterations - 1, lo_idx))
    hi_idx = max(0, min(iterations - 1, hi_idx))
    return (means[lo_idx], means[hi_idx])


def win_rate(trades: Iterable[Trade]) -> tuple[int, int, float]:
    """Return ``(wins, n_resolved, wr)`` ignoring pending trades."""
    resolved = [t for t in trades if not t.is_pending]
    wins = sum(1 for t in resolved if t.is_win)
    n = len(resolved)
    wr = (wins / n) if n else 0.0
    return wins, n, wr


def expectancy(trades: Iterable[Trade], overlay: str = "pl_marius") -> float:
    """Mean P/L (in R) over non-pending trades, on the given overlay."""
    if overlay not in {"pl_marius", "pl_theoretical"}:
        raise ValueError(f"unknown overlay {overlay!r}")
    if overlay == "pl_marius":
        vals = [t.pl_marius for t in trades if t.pl_marius is not None]
    else:
        vals = [t.pl_theoretical for t in trades if not t.is_pending]
    if not vals:
        return 0.0
    return sum(vals) / len(vals)


# ---------------------------------------------------------------------------
# Group stats
# ---------------------------------------------------------------------------


@dataclass(frozen=True)
class GroupStats:
    key: str
    n_total: int
    n_resolved: int
    wins: int
    wr: float
    wr_ci_lo: float
    wr_ci_hi: float
    exp_marius: float
    exp_marius_ci_lo: float
    exp_marius_ci_hi: float
    exp_theoretical: float
    exp_theoretical_ci_lo: float
    exp_theoretical_ci_hi: float


def group_by(trades: Iterable[Trade], field_name: str) -> dict[str, list[Trade]]:
    out: dict[str, list[Trade]] = {}
    for t in trades:
        key = getattr(t, field_name, "") or "(blank)"
        out.setdefault(key, []).append(t)
    return out


def compute_group_stats(
    trades: list[Trade],
    *,
    label: str,
    bootstrap_iterations: int = 2000,
    seed: int | None = None,
) -> GroupStats:
    wins, n_resolved, wr = win_rate(trades)
    wr_lo, wr_hi = wilson_ci(wins, n_resolved)

    pl_m_vals = [t.pl_marius for t in trades if t.pl_marius is not None]
    exp_m = (sum(pl_m_vals) / len(pl_m_vals)) if pl_m_vals else 0.0
    exp_m_lo, exp_m_hi = bootstrap_ci(
        pl_m_vals, iterations=bootstrap_iterations, seed=seed
    )

    pl_t_vals = [t.pl_theoretical for t in trades if not t.is_pending]
    exp_t = (sum(pl_t_vals) / len(pl_t_vals)) if pl_t_vals else 0.0
    exp_t_lo, exp_t_hi = bootstrap_ci(
        pl_t_vals,
        iterations=bootstrap_iterations,
        seed=None if seed is None else seed + 1,
    )

    return GroupStats(
        key=label,
        n_total=len(trades),
        n_resolved=n_resolved,
        wins=wins,
        wr=wr,
        wr_ci_lo=wr_lo,
        wr_ci_hi=wr_hi,
        exp_marius=exp_m,
        exp_marius_ci_lo=exp_m_lo,
        exp_marius_ci_hi=exp_m_hi,
        exp_theoretical=exp_t,
        exp_theoretical_ci_lo=exp_t_lo,
        exp_theoretical_ci_hi=exp_t_hi,
    )


# ---------------------------------------------------------------------------
# Calibration mode
# ---------------------------------------------------------------------------


@dataclass(frozen=True)
class CalibrationReport:
    pairs: int
    field_mismatches: dict[str, int]
    total_comparisons: int

    @property
    def overall_mismatch_rate(self) -> float:
        if self.total_comparisons == 0:
            return 0.0
        total = sum(self.field_mismatches.values())
        return total / self.total_comparisons


def _normalise_for_compare(field_name: str, value: str) -> str:
    s = (value or "").strip()
    if field_name in {"entry", "sl", "tp0", "tp1", "tp2"}:
        try:
            return f"{float(s):.4f}"
        except ValueError:
            return s
    return s


def calibration_mismatch(
    trades: Iterable[Trade],
    *,
    fields: tuple[str, ...] = CORE_CALIBRATION_FIELDS,
) -> CalibrationReport:
    """Pair ``manual_calibration`` and ``vision_calibration`` rows by
    ``screenshot_file``, then count mismatches per ``fields``.

    Returns a :class:`CalibrationReport`. Unpaired calibration rows are
    silently ignored — they cannot contribute to a comparison.
    """
    manual: dict[str, Trade] = {}
    vision: dict[str, Trade] = {}
    for t in trades:
        if t.source == "manual_calibration":
            manual[t.screenshot_file] = t
        elif t.source == "vision_calibration":
            vision[t.screenshot_file] = t

    paired_files = sorted(set(manual) & set(vision))
    field_mismatches: dict[str, int] = {f: 0 for f in fields}
    for f in paired_files:
        m = manual[f]
        v = vision[f]
        for fld in fields:
            mv = _normalise_for_compare(fld, m.raw.get(fld, ""))
            vv = _normalise_for_compare(fld, v.raw.get(fld, ""))
            if mv != vv:
                field_mismatches[fld] += 1

    total_comparisons = len(paired_files) * len(fields)
    return CalibrationReport(
        pairs=len(paired_files),
        field_mismatches=field_mismatches,
        total_comparisons=total_comparisons,
    )


# ---------------------------------------------------------------------------
# Reporting
# ---------------------------------------------------------------------------


def _fmt_pct(p: float) -> str:
    return f"{100.0 * p:5.1f}%"


def _fmt_r(x: float) -> str:
    return f"{x:+.3f}R"


def _fmt_stats_row(s: GroupStats) -> str:
    return (
        f"{s.key:<14} N={s.n_total:>3} (resolved {s.n_resolved:>3})  "
        f"WR={_fmt_pct(s.wr)} [{_fmt_pct(s.wr_ci_lo)}, {_fmt_pct(s.wr_ci_hi)}]  "
        f"E_marius={_fmt_r(s.exp_marius)} "
        f"[{_fmt_r(s.exp_marius_ci_lo)}, {_fmt_r(s.exp_marius_ci_hi)}]  "
        f"E_theor={_fmt_r(s.exp_theoretical)}"
    )


def format_report(
    trades: list[Trade],
    *,
    bootstrap_iterations: int = 2000,
    seed: int | None = None,
) -> str:
    """Render the main stats report.

    Only ``source in {vision, manual}`` rows are included in the WR /
    expectancy computations; calibration rows are reported separately via
    ``--calibration``.
    """
    backtest = [t for t in trades if t.source in BACKTEST_SOURCES]
    lines: list[str] = []
    lines.append("=== M2D Backtest Stats ===")
    lines.append(f"Backtest rows: {len(backtest)} (calibration excluded)")
    lines.append("")

    if not backtest:
        lines.append("(no backtest trades yet)")
        return "\n".join(lines)

    overall = compute_group_stats(
        backtest,
        label="OVERALL",
        bootstrap_iterations=bootstrap_iterations,
        seed=seed,
    )
    lines.append("-- Overall --")
    lines.append(_fmt_stats_row(overall))
    lines.append("")

    def _emit_group(title: str, field_name: str, key_order: list[str] | None = None) -> None:
        lines.append(f"-- By {title} --")
        groups = group_by(backtest, field_name)
        keys = key_order if key_order is not None else sorted(groups)
        for k in keys:
            if k not in groups:
                continue
            sub_seed = None if seed is None else seed + abs(hash(k)) % 10_000
            s = compute_group_stats(
                groups[k],
                label=k,
                bootstrap_iterations=bootstrap_iterations,
                seed=sub_seed,
            )
            lines.append(_fmt_stats_row(s))
        lines.append("")

    _emit_group(
        "Set",
        "set",
        key_order=["A1", "A2", "A3", "B", "C", "D", "Other"],
    )
    _emit_group("Instrument", "instrument")
    lines.append(
        "[!] By calitate — descriptor only (post-outcome, biased; do not use "
        "as a GO LIVE filter — see STOPPING_RULE.md §3)."
    )
    _emit_group(
        "calitate",
        "calitate",
        key_order=["Clară", "Mai mare ca impuls", "Slabă", "n/a"],
    )

    return "\n".join(lines).rstrip() + "\n"


def format_calibration_report(trades: list[Trade]) -> str:
    cal = calibration_mismatch(trades)
    lines: list[str] = []
    lines.append("=== Calibration P4 gate ===")
    lines.append(f"Paired screenshots (manual ∩ vision): {cal.pairs}")
    if cal.pairs == 0:
        lines.append("(no calibration pairs yet)")
        return "\n".join(lines) + "\n"

    lines.append("")
    lines.append(f"{'field':<14} mismatches / pairs    rate")
    for fld in CORE_CALIBRATION_FIELDS:
        m = cal.field_mismatches.get(fld, 0)
        rate = (m / cal.pairs) if cal.pairs else 0.0
        lines.append(f"{fld:<14} {m:>3} / {cal.pairs:<3}        {_fmt_pct(rate)}")
    lines.append("")
    lines.append(
        f"Overall mismatch rate: {_fmt_pct(cal.overall_mismatch_rate)} "
        f"({sum(cal.field_mismatches.values())} of {cal.total_comparisons} comparisons)"
    )
    threshold = 0.10
    verdict = "PASS" if cal.overall_mismatch_rate <= threshold else "FAIL"
    lines.append(f"P4 gate (<= 10%): {verdict}")
    return "\n".join(lines) + "\n"


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------


def main(argv: list[str] | None = None) -> int:
    parser = argparse.ArgumentParser(
        prog="stats",
        description="Backtest statistics for data/jurnal.csv",
    )
    parser.add_argument(
        "--csv",
        type=Path,
        default=Path("data/jurnal.csv"),
        help="Path to the jurnal CSV (default: data/jurnal.csv).",
    )
    parser.add_argument(
        "--calibration",
        action="store_true",
        help="Show P4 calibration mismatch report instead of backtest stats.",
    )
    parser.add_argument(
        "--bootstrap-iterations",
        type=int,
        default=2000,
        help="Bootstrap iterations for expectancy CI (default: 2000).",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Seed for the bootstrap RNG (set for deterministic output).",
    )
    args = parser.parse_args(argv)

    trades = load_trades(args.csv)
    if args.calibration:
        out = format_calibration_report(trades)
    else:
        out = format_report(
            trades,
            bootstrap_iterations=args.bootstrap_iterations,
            seed=args.seed,
        )
    # Force UTF-8 on stdout: the report contains diacritics ("Clară", "Slabă")
    # and a console codepage like cp1252 would crash on those.
    try:
        sys.stdout.reconfigure(encoding="utf-8")  # type: ignore[attr-defined]
    except (AttributeError, OSError):
        pass
    sys.stdout.write(out)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())