"""Backtest statistics for ``data/jurnal.csv``. Outputs: - Overall + per-Set + per-calitate + per-instrument WR, expectancy. - Wilson 95% CI for WR (closed form). - Bootstrap percentile 95% CI for expectancy (deterministic via ``seed``). - ``--calibration`` mode: joins ``manual_calibration`` rows with their ``vision_calibration`` counterparts on ``screenshot_file`` and reports field-by-field mismatch rates for the P4 gate (see ``STOPPING_RULE.md``). A "win" is any trade with ``pl_marius > 0``. Pending trades (``pl_marius`` blank, i.e. ``outcome_path in {pending, TP0->pending}``) are excluded from both WR and expectancy: there is no realised outcome yet. The ``calitate`` field is a known-biased descriptor (post-outcome classification — see ``STOPPING_RULE.md`` §3). It is reported as informational only and explicitly flagged as such; do NOT use it as a filter for GO LIVE decisions. """ from __future__ import annotations import argparse import csv import math import random import sys from dataclasses import dataclass, field from pathlib import Path from typing import Iterable __all__ = [ "CORE_CALIBRATION_FIELDS", "BACKTEST_SOURCES", "CALIBRATION_SOURCES", "Trade", "GroupStats", "load_trades", "wilson_ci", "bootstrap_ci", "win_rate", "expectancy", "group_by", "compute_group_stats", "calibration_mismatch", "format_report", "main", ] # Fields compared in the calibration mismatch gate (STOPPING_RULE.md §P4). CORE_CALIBRATION_FIELDS: tuple[str, ...] = ( "entry", "sl", "tp0", "tp1", "tp2", "outcome_path", "max_reached", "directie", ) BACKTEST_SOURCES: frozenset[str] = frozenset({"vision", "manual"}) CALIBRATION_SOURCES: frozenset[str] = frozenset( {"manual_calibration", "vision_calibration"} ) # --------------------------------------------------------------------------- # Loading / typed access # --------------------------------------------------------------------------- @dataclass(frozen=True) class Trade: """One realised (or pending) trade row, typed.""" id: int screenshot_file: str source: str data: str zi: str ora_ro: str instrument: str directie: str calitate: str set: str outcome_path: str max_reached: str be_moved: bool pl_marius: float | None pl_theoretical: float raw: dict[str, str] = field(default_factory=dict) @property def is_pending(self) -> bool: return self.pl_marius is None @property def is_win(self) -> bool: return self.pl_marius is not None and self.pl_marius > 0 def _parse_optional_float(value: str) -> float | None: s = (value or "").strip() if s == "": return None return float(s) def _parse_bool(value: str) -> bool: return (value or "").strip().lower() in {"true", "1", "yes", "da"} def _row_to_trade(row: dict[str, str]) -> Trade: return Trade( id=int(row.get("id") or 0), screenshot_file=row.get("screenshot_file", ""), source=row.get("source", ""), data=row.get("data", ""), zi=row.get("zi", ""), ora_ro=row.get("ora_ro", ""), instrument=row.get("instrument", ""), directie=row.get("directie", ""), calitate=row.get("calitate", ""), set=row.get("set", ""), outcome_path=row.get("outcome_path", ""), max_reached=row.get("max_reached", ""), be_moved=_parse_bool(row.get("be_moved", "")), pl_marius=_parse_optional_float(row.get("pl_marius", "")), pl_theoretical=float(row.get("pl_theoretical") or 0.0), raw=dict(row), ) def load_trades(csv_path: Path | str) -> list[Trade]: """Load all rows of ``csv_path`` as :class:`Trade` objects. Returns ``[]`` if the file does not exist or is empty. 
""" p = Path(csv_path) if not p.exists() or p.stat().st_size == 0: return [] with p.open("r", encoding="utf-8", newline="") as fh: reader = csv.DictReader(fh) return [_row_to_trade(r) for r in reader] # --------------------------------------------------------------------------- # Statistics primitives # --------------------------------------------------------------------------- def wilson_ci(wins: int, n: int, z: float = 1.96) -> tuple[float, float]: """Wilson score interval for a binomial proportion. Returns ``(lo, hi)`` as proportions in [0, 1]. For ``n == 0`` returns ``(0.0, 0.0)``. ``z = 1.96`` corresponds to a 95% CI. """ if n <= 0: return (0.0, 0.0) if wins < 0 or wins > n: raise ValueError(f"wins={wins} out of range for n={n}") p_hat = wins / n denom = 1.0 + (z * z) / n center = p_hat + (z * z) / (2.0 * n) half = z * math.sqrt((p_hat * (1.0 - p_hat) + (z * z) / (4.0 * n)) / n) lo = (center - half) / denom hi = (center + half) / denom return (max(0.0, lo), min(1.0, hi)) def bootstrap_ci( values: list[float], *, iterations: int = 2000, alpha: float = 0.05, seed: int | None = None, ) -> tuple[float, float]: """Percentile-method bootstrap CI for the mean of ``values``. Deterministic when ``seed`` is provided. Returns ``(lo, hi)``. For ``len(values) < 2`` returns ``(mean, mean)``. """ if not values: return (0.0, 0.0) n = len(values) mean = sum(values) / n if n < 2 or iterations <= 0: return (mean, mean) rng = random.Random(seed) means: list[float] = [] for _ in range(iterations): s = 0.0 for _ in range(n): s += values[rng.randrange(n)] means.append(s / n) means.sort() lo_idx = int(math.floor((alpha / 2.0) * iterations)) hi_idx = int(math.ceil((1.0 - alpha / 2.0) * iterations)) - 1 lo_idx = max(0, min(iterations - 1, lo_idx)) hi_idx = max(0, min(iterations - 1, hi_idx)) return (means[lo_idx], means[hi_idx]) def win_rate(trades: Iterable[Trade]) -> tuple[int, int, float]: """Return ``(wins, n_resolved, wr)`` ignoring pending trades.""" resolved = [t for t in trades if not t.is_pending] wins = sum(1 for t in resolved if t.is_win) n = len(resolved) wr = (wins / n) if n else 0.0 return wins, n, wr def expectancy(trades: Iterable[Trade], overlay: str = "pl_marius") -> float: """Mean P/L (in R) over non-pending trades, on the given overlay.""" if overlay not in {"pl_marius", "pl_theoretical"}: raise ValueError(f"unknown overlay {overlay!r}") if overlay == "pl_marius": vals = [t.pl_marius for t in trades if t.pl_marius is not None] else: vals = [t.pl_theoretical for t in trades if not t.is_pending] if not vals: return 0.0 return sum(vals) / len(vals) # --------------------------------------------------------------------------- # Group stats # --------------------------------------------------------------------------- @dataclass(frozen=True) class GroupStats: key: str n_total: int n_resolved: int wins: int wr: float wr_ci_lo: float wr_ci_hi: float exp_marius: float exp_marius_ci_lo: float exp_marius_ci_hi: float exp_theoretical: float exp_theoretical_ci_lo: float exp_theoretical_ci_hi: float def group_by(trades: Iterable[Trade], field_name: str) -> dict[str, list[Trade]]: out: dict[str, list[Trade]] = {} for t in trades: key = getattr(t, field_name, "") or "(blank)" out.setdefault(key, []).append(t) return out def compute_group_stats( trades: list[Trade], *, label: str, bootstrap_iterations: int = 2000, seed: int | None = None, ) -> GroupStats: wins, n_resolved, wr = win_rate(trades) wr_lo, wr_hi = wilson_ci(wins, n_resolved) pl_m_vals = [t.pl_marius for t in trades if t.pl_marius is not None] 
    exp_m = (sum(pl_m_vals) / len(pl_m_vals)) if pl_m_vals else 0.0
    exp_m_lo, exp_m_hi = bootstrap_ci(
        pl_m_vals, iterations=bootstrap_iterations, seed=seed
    )

    pl_t_vals = [t.pl_theoretical for t in trades if not t.is_pending]
    exp_t = (sum(pl_t_vals) / len(pl_t_vals)) if pl_t_vals else 0.0
    exp_t_lo, exp_t_hi = bootstrap_ci(
        pl_t_vals,
        iterations=bootstrap_iterations,
        seed=None if seed is None else seed + 1,
    )

    return GroupStats(
        key=label,
        n_total=len(trades),
        n_resolved=n_resolved,
        wins=wins,
        wr=wr,
        wr_ci_lo=wr_lo,
        wr_ci_hi=wr_hi,
        exp_marius=exp_m,
        exp_marius_ci_lo=exp_m_lo,
        exp_marius_ci_hi=exp_m_hi,
        exp_theoretical=exp_t,
        exp_theoretical_ci_lo=exp_t_lo,
        exp_theoretical_ci_hi=exp_t_hi,
    )


# ---------------------------------------------------------------------------
# Calibration mode
# ---------------------------------------------------------------------------


@dataclass(frozen=True)
class CalibrationReport:
    pairs: int
    field_mismatches: dict[str, int]
    total_comparisons: int

    @property
    def overall_mismatch_rate(self) -> float:
        if self.total_comparisons == 0:
            return 0.0
        total = sum(self.field_mismatches.values())
        return total / self.total_comparisons


def _normalise_for_compare(field_name: str, value: str) -> str:
    s = (value or "").strip()
    if field_name in {"entry", "sl", "tp0", "tp1", "tp2"}:
        try:
            return f"{float(s):.4f}"
        except ValueError:
            return s
    return s


def calibration_mismatch(
    trades: Iterable[Trade],
    *,
    fields: tuple[str, ...] = CORE_CALIBRATION_FIELDS,
) -> CalibrationReport:
    """Pair ``manual_calibration`` and ``vision_calibration`` rows by
    ``screenshot_file``, then count mismatches for each field in ``fields``.

    Returns a :class:`CalibrationReport`. Unpaired calibration rows are
    silently ignored — they cannot contribute to a comparison.
    """
    manual: dict[str, Trade] = {}
    vision: dict[str, Trade] = {}
    for t in trades:
        if t.source == "manual_calibration":
            manual[t.screenshot_file] = t
        elif t.source == "vision_calibration":
            vision[t.screenshot_file] = t

    paired_files = sorted(set(manual) & set(vision))
    field_mismatches: dict[str, int] = {f: 0 for f in fields}
    for f in paired_files:
        m = manual[f]
        v = vision[f]
        for fld in fields:
            mv = _normalise_for_compare(fld, m.raw.get(fld, ""))
            vv = _normalise_for_compare(fld, v.raw.get(fld, ""))
            if mv != vv:
                field_mismatches[fld] += 1

    total_comparisons = len(paired_files) * len(fields)
    return CalibrationReport(
        pairs=len(paired_files),
        field_mismatches=field_mismatches,
        total_comparisons=total_comparisons,
    )


# ---------------------------------------------------------------------------
# Reporting
# ---------------------------------------------------------------------------


def _fmt_pct(p: float) -> str:
    return f"{100.0 * p:5.1f}%"


def _fmt_r(x: float) -> str:
    return f"{x:+.3f}R"


def _fmt_stats_row(s: GroupStats) -> str:
    return (
        f"{s.key:<14} N={s.n_total:>3} (resolved {s.n_resolved:>3}) "
        f"WR={_fmt_pct(s.wr)} [{_fmt_pct(s.wr_ci_lo)}, {_fmt_pct(s.wr_ci_hi)}] "
        f"E_marius={_fmt_r(s.exp_marius)} "
        f"[{_fmt_r(s.exp_marius_ci_lo)}, {_fmt_r(s.exp_marius_ci_hi)}] "
        f"E_theor={_fmt_r(s.exp_theoretical)}"
    )


def format_report(
    trades: list[Trade],
    *,
    bootstrap_iterations: int = 2000,
    seed: int | None = None,
) -> str:
    """Render the main stats report.

    Only ``source in {vision, manual}`` rows are included in the WR /
    expectancy computations; calibration rows are reported separately via
    ``--calibration``.
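
    A minimal usage sketch (the path and seed are illustrative)::

        print(format_report(load_trades("data/jurnal.csv"), seed=42))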
""" backtest = [t for t in trades if t.source in BACKTEST_SOURCES] lines: list[str] = [] lines.append("=== M2D Backtest Stats ===") lines.append(f"Backtest rows: {len(backtest)} (calibration excluded)") lines.append("") if not backtest: lines.append("(no backtest trades yet)") return "\n".join(lines) overall = compute_group_stats( backtest, label="OVERALL", bootstrap_iterations=bootstrap_iterations, seed=seed, ) lines.append("-- Overall --") lines.append(_fmt_stats_row(overall)) lines.append("") def _emit_group(title: str, field_name: str, key_order: list[str] | None = None) -> None: lines.append(f"-- By {title} --") groups = group_by(backtest, field_name) keys = key_order if key_order is not None else sorted(groups) for k in keys: if k not in groups: continue sub_seed = None if seed is None else seed + abs(hash(k)) % 10_000 s = compute_group_stats( groups[k], label=k, bootstrap_iterations=bootstrap_iterations, seed=sub_seed, ) lines.append(_fmt_stats_row(s)) lines.append("") _emit_group( "Set", "set", key_order=["A1", "A2", "A3", "B", "C", "D", "Other"], ) _emit_group("Instrument", "instrument") lines.append( "[!] By calitate — descriptor only (post-outcome, biased; do not use " "as a GO LIVE filter — see STOPPING_RULE.md §3)." ) _emit_group( "calitate", "calitate", key_order=["Clară", "Mai mare ca impuls", "Slabă", "n/a"], ) return "\n".join(lines).rstrip() + "\n" def format_calibration_report(trades: list[Trade]) -> str: cal = calibration_mismatch(trades) lines: list[str] = [] lines.append("=== Calibration P4 gate ===") lines.append(f"Paired screenshots (manual ∩ vision): {cal.pairs}") if cal.pairs == 0: lines.append("(no calibration pairs yet)") return "\n".join(lines) + "\n" lines.append("") lines.append(f"{'field':<14} mismatches / pairs rate") for fld in CORE_CALIBRATION_FIELDS: m = cal.field_mismatches.get(fld, 0) rate = (m / cal.pairs) if cal.pairs else 0.0 lines.append(f"{fld:<14} {m:>3} / {cal.pairs:<3} {_fmt_pct(rate)}") lines.append("") lines.append( f"Overall mismatch rate: {_fmt_pct(cal.overall_mismatch_rate)} " f"({sum(cal.field_mismatches.values())} of {cal.total_comparisons} comparisons)" ) threshold = 0.10 verdict = "PASS" if cal.overall_mismatch_rate <= threshold else "FAIL" lines.append(f"P4 gate (<= 10%): {verdict}") return "\n".join(lines) + "\n" # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser( prog="stats", description="Backtest statistics for data/jurnal.csv", ) parser.add_argument( "--csv", type=Path, default=Path("data/jurnal.csv"), help="Path to the jurnal CSV (default: data/jurnal.csv).", ) parser.add_argument( "--calibration", action="store_true", help="Show P4 calibration mismatch report instead of backtest stats.", ) parser.add_argument( "--bootstrap-iterations", type=int, default=2000, help="Bootstrap iterations for expectancy CI (default: 2000).", ) parser.add_argument( "--seed", type=int, default=None, help="Seed for the bootstrap RNG (set for deterministic output).", ) args = parser.parse_args(argv) trades = load_trades(args.csv) if args.calibration: out = format_calibration_report(trades) else: out = format_report( trades, bootstrap_iterations=args.bootstrap_iterations, seed=args.seed, ) # Force UTF-8 on stdout: the report contains diacritics ("Clară", "Slabă") # and a console codepage like cp1252 would crash on those. 
    try:
        sys.stdout.reconfigure(encoding="utf-8")  # type: ignore[attr-defined]
    except (AttributeError, OSError):
        pass
    sys.stdout.write(out)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
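
# Typical invocations (illustrative; assumes this module is saved as stats.py):
#
#   python stats.py --csv data/jurnal.csv --seed 42
#   python stats.py --calibration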