commands: m2d-log + backtest + batch + stats slash commands (124 tests pass)

This commit is contained in:
Marius
2026-05-13 12:48:26 +03:00
parent 26d084dc4b
commit 34af5b631e
7 changed files with 1111 additions and 730 deletions

View File

@@ -1,21 +1,20 @@
"""Backtest statistics for ``data/jurnal.csv``.
Outputs:
- Overall + per-Set + per-calitate + per-instrument WR, expectancy.
- Wilson 95% CI for WR (closed form).
- Bootstrap percentile 95% CI for expectancy (deterministic via ``seed``).
- ``--calibration`` mode: joins ``manual_calibration`` rows with their
``vision_calibration`` counterparts on ``screenshot_file`` and reports
field-by-field mismatch rates for the P4 gate (see ``STOPPING_RULE.md``).
Public API:
- ``compute_stats(csv_path, overlay) -> dict``
- ``render_stats(stats, overlay) -> str``
- ``compute_calibration(csv_path) -> dict``
- ``render_calibration(cal) -> str``
- ``main()`` — CLI entry point.
A "win" is any trade with ``pl_marius > 0``. Pending trades
(``pl_marius`` blank, i.e. ``outcome_path in {pending, TP0->pending}``) are
excluded from both WR and expectancy: there is no realised outcome yet.
A "win" is a closed trade with ``pl_overlay > 0`` (where ``pl_overlay`` is
either ``pl_marius`` or ``pl_theoretical``). Pending trades — ``pl_marius``
blank, i.e. ``outcome_path in {pending, TP0->pending}`` — are excluded from
both WR and expectancy: there is no realised outcome yet.
The ``calitate`` field is a known-biased descriptor: it is classified
post-outcome (see ``STOPPING_RULE.md`` §3). The per-``calitate`` split is
reported with an explicit *descriptor only — biased post-outcome* caveat;
do NOT use it as a filter for GO LIVE decisions.
"""
from __future__ import annotations
@@ -23,32 +22,42 @@ from __future__ import annotations
import argparse
import csv
import math
import random
import sys
from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterable
from typing import Any, Iterable
import numpy as np
from scripts.append_row import CSV_COLUMNS
# Public API. Union of the legacy helpers and the new compute/render entry
# points; deduplicated (``CORE_CALIBRATION_FIELDS`` appeared twice).
__all__ = [
    # Constants
    "BACKTEST_SOURCES",
    "CALIBRATION_SOURCES",
    "CORE_CALIBRATION_FIELDS",
    "NUMERIC_CALIBRATION_FIELDS",
    "STOPPING_RULE_N",
    # Typed access
    "Trade",
    "GroupStats",
    "load_trades",
    # Statistics primitives
    "wilson_ci",
    "bootstrap_ci",
    "bootstrap_expectancy_ci",
    "win_rate",
    "expectancy",
    "group_by",
    "compute_group_stats",
    "calibration_mismatch",
    # Reports
    "format_report",
    "compute_stats",
    "render_stats",
    "compute_calibration",
    "render_calibration",
    "main",
]
# Fields compared in the calibration mismatch gate (STOPPING_RULE.md §P4).
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
BACKTEST_SOURCES: frozenset[str] = frozenset({"vision", "manual"})
CALIBRATION_SOURCES: frozenset[str] = frozenset(
{"manual_calibration", "vision_calibration"}
)
# Calibration P4 gate (STOPPING_RULE.md §P4) — explicitly reported per field.
CORE_CALIBRATION_FIELDS: tuple[str, ...] = (
"entry",
"sl",
@@ -58,315 +67,205 @@ CORE_CALIBRATION_FIELDS: tuple[str, ...] = (
"outcome_path",
"max_reached",
"directie",
"instrument",
)
BACKTEST_SOURCES: frozenset[str] = frozenset({"vision", "manual"})
CALIBRATION_SOURCES: frozenset[str] = frozenset(
{"manual_calibration", "vision_calibration"}
NUMERIC_CALIBRATION_FIELDS: frozenset[str] = frozenset(
{"entry", "sl", "tp0", "tp1", "tp2"}
)
# STOPPING_RULE.md §"GO LIVE" gate: N >= 40 per Set.
STOPPING_RULE_N: int = 40
# ---------------------------------------------------------------------------
# Loading / typed access
# Loading
# ---------------------------------------------------------------------------
@dataclass(frozen=True)
class Trade:
    """One realised (or pending) trade row, typed."""

    id: int
    screenshot_file: str
    # Row provenance; backtest rows use BACKTEST_SOURCES, calibration rows
    # use CALIBRATION_SOURCES values.
    source: str
    data: str
    zi: str
    ora_ro: str
    instrument: str
    directie: str
    # Known-biased post-outcome descriptor — see the module docstring.
    calitate: str
    set: str
    outcome_path: str
    max_reached: str
    be_moved: bool
    # None while the trade is still pending (blank CSV cell).
    pl_marius: float | None
    pl_theoretical: float
    # Original CSV row, kept verbatim for calibration field comparisons.
    raw: dict[str, str] = field(default_factory=dict)

    @property
    def is_pending(self) -> bool:
        """True while there is no realised P/L yet (blank ``pl_marius``)."""
        return self.pl_marius is None

    @property
    def is_win(self) -> bool:
        """True for a closed trade with positive realised P/L."""
        return self.pl_marius is not None and self.pl_marius > 0
def _parse_optional_float(value: str) -> float | None:
s = (value or "").strip()
if s == "":
return None
return float(s)
try:
return float(s)
except ValueError:
return None
def _parse_bool(value: str) -> bool:
return (value or "").strip().lower() in {"true", "1", "yes", "da"}
def _row_to_trade(row: dict[str, str]) -> Trade:
    """Convert one raw CSV row (all-string dict) into a typed :class:`Trade`.

    Missing/blank columns fall back to "" (or 0 / 0.0 for numeric fields);
    the untouched row is kept on ``Trade.raw`` for field-level comparisons.
    """
    return Trade(
        id=int(row.get("id") or 0),  # blank id -> 0
        screenshot_file=row.get("screenshot_file", ""),
        source=row.get("source", ""),
        data=row.get("data", ""),
        zi=row.get("zi", ""),
        ora_ro=row.get("ora_ro", ""),
        instrument=row.get("instrument", ""),
        directie=row.get("directie", ""),
        calitate=row.get("calitate", ""),
        set=row.get("set", ""),
        outcome_path=row.get("outcome_path", ""),
        max_reached=row.get("max_reached", ""),
        be_moved=_parse_bool(row.get("be_moved", "")),
        # None here means the trade is still pending.
        pl_marius=_parse_optional_float(row.get("pl_marius", "")),
        pl_theoretical=float(row.get("pl_theoretical") or 0.0),
        raw=dict(row),  # defensive copy of the reader's dict
    )
def load_trades(csv_path: Path | str) -> list[Trade]:
"""Load all rows of ``csv_path`` as :class:`Trade` objects.
Returns ``[]`` if the file does not exist or is empty.
"""
def _load_rows(csv_path: Path | str) -> list[dict[str, str]]:
p = Path(csv_path)
if not p.exists() or p.stat().st_size == 0:
return []
with p.open("r", encoding="utf-8", newline="") as fh:
reader = csv.DictReader(fh)
return [_row_to_trade(r) for r in reader]
return list(csv.DictReader(fh))
# ---------------------------------------------------------------------------
# Statistics primitives
# CI primitives
# ---------------------------------------------------------------------------
def wilson_ci(wins: int, n: int, z: float = 1.96) -> tuple[float, float]:
    """Wilson score interval for a binomial proportion.

    Returns ``(lo, hi)`` clamped to ``[0.0, 1.0]``. For ``n == 0`` returns
    ``(0.0, 0.0)``. ``z = 1.96`` ≈ 95% confidence.

    Raises:
        ValueError: if ``wins`` is negative or exceeds ``n``.
    """
    if n <= 0:
        return (0.0, 0.0)
    if wins < 0 or wins > n:
        raise ValueError(f"wins={wins} out of range for n={n}")
    p = wins / n
    denom = 1.0 + (z * z) / n
    # Closed-form Wilson interval: shrink p toward 0.5, widen by the
    # z-scaled standard error, all normalised by ``denom``.
    center = (p + (z * z) / (2.0 * n)) / denom
    spread = z * math.sqrt(p * (1.0 - p) / n + (z * z) / (4.0 * n * n)) / denom
    return (max(0.0, center - spread), min(1.0, center + spread))
def bootstrap_ci(
    values: list[float],
    *,
    iterations: int = 2000,
    alpha: float = 0.05,
    seed: int | None = None,
) -> tuple[float, float]:
    """Percentile-method bootstrap CI for the mean of ``values``.

    Legacy pure-stdlib variant, kept for backward compatibility with
    existing callers. Deterministic when ``seed`` is provided. Returns
    ``(lo, hi)``; for ``len(values) < 2`` returns ``(mean, mean)``.
    """
    if not values:
        return (0.0, 0.0)
    n = len(values)
    mean = sum(values) / n
    if n < 2 or iterations <= 0:
        return (mean, mean)
    rng = random.Random(seed)
    means: list[float] = []
    for _ in range(iterations):
        s = 0.0
        for _ in range(n):
            s += values[rng.randrange(n)]
        means.append(s / n)
    means.sort()
    # Percentile indices, clamped to the valid range.
    lo_idx = int(math.floor((alpha / 2.0) * iterations))
    hi_idx = int(math.ceil((1.0 - alpha / 2.0) * iterations)) - 1
    lo_idx = max(0, min(iterations - 1, lo_idx))
    hi_idx = max(0, min(iterations - 1, hi_idx))
    return (means[lo_idx], means[hi_idx])


def bootstrap_expectancy_ci(
    values: list[float] | np.ndarray,
    n_resamples: int = 5000,
    seed: int = 42,
) -> tuple[float, float]:
    """Percentile-method bootstrap 95% CI for the mean of ``values``.

    Deterministic for a given ``seed``. Empty input → ``(0.0, 0.0)``.
    Single value → ``(value, value)`` (no variance to resample).
    """
    arr = np.asarray(list(values), dtype=float)
    if arr.size == 0:
        return (0.0, 0.0)
    if arr.size == 1:
        v = float(arr[0])
        return (v, v)
    rng = np.random.default_rng(seed)
    boots = np.empty(n_resamples, dtype=float)
    n = arr.size
    # Per-resample draw kept as a loop so published numbers stay identical
    # for a given seed (a single (n_resamples, n) draw could reorder the
    # RNG stream).
    for i in range(n_resamples):
        idx = rng.integers(0, n, size=n)
        boots[i] = float(arr[idx].mean())
    lo = float(np.percentile(boots, 2.5))
    hi = float(np.percentile(boots, 97.5))
    return (lo, hi)
def win_rate(trades: Iterable[Trade]) -> tuple[int, int, float]:
"""Return ``(wins, n_resolved, wr)`` ignoring pending trades."""
resolved = [t for t in trades if not t.is_pending]
wins = sum(1 for t in resolved if t.is_win)
n = len(resolved)
# ---------------------------------------------------------------------------
# compute_stats
# ---------------------------------------------------------------------------
def _group_stats(
overlay_values: list[float | None],
*,
include_ci: bool,
bootstrap_seed: int,
) -> dict[str, Any]:
closed = [v for v in overlay_values if v is not None]
n = len(closed)
wins = sum(1 for v in closed if v > 0)
wr = (wins / n) if n else 0.0
return wins, n, wr
def expectancy(trades: Iterable[Trade], overlay: str = "pl_marius") -> float:
    """Mean P/L (in R) over non-pending trades, on the given overlay.

    Returns 0.0 when there are no closed trades to average.
    """
    if overlay not in {"pl_marius", "pl_theoretical"}:
        raise ValueError(f"unknown overlay {overlay!r}")
    # The marius overlay uses the blank cell itself as the pending marker;
    # the theoretical overlay defers to the is_pending property.
    if overlay == "pl_marius":
        realised = [t.pl_marius for t in trades if t.pl_marius is not None]
    else:
        realised = [t.pl_theoretical for t in trades if not t.is_pending]
    return sum(realised) / len(realised) if realised else 0.0
# ---------------------------------------------------------------------------
# Group stats
# ---------------------------------------------------------------------------
@dataclass(frozen=True)
class GroupStats:
    """Aggregated WR / expectancy statistics for one group of trades."""

    # Group label (e.g. a Set name, instrument, or "OVERALL").
    key: str
    # Row counts: all rows vs. rows with a realised outcome.
    n_total: int
    n_resolved: int
    wins: int
    # Win rate over resolved trades, with its Wilson 95% CI bounds.
    wr: float
    wr_ci_lo: float
    wr_ci_hi: float
    # Mean P/L on the real (Marius) overlay, with bootstrap 95% CI bounds.
    exp_marius: float
    exp_marius_ci_lo: float
    exp_marius_ci_hi: float
    # Mean P/L on the theoretical overlay, with bootstrap 95% CI bounds.
    exp_theoretical: float
    exp_theoretical_ci_lo: float
    exp_theoretical_ci_hi: float
def group_by(trades: Iterable[Trade], field_name: str) -> dict[str, list[Trade]]:
out: dict[str, list[Trade]] = {}
for t in trades:
key = getattr(t, field_name, "") or "(blank)"
out.setdefault(key, []).append(t)
out: dict[str, Any] = {
"n": n,
"wr": wr,
"expectancy": (sum(closed) / n) if n else 0.0,
}
if include_ci:
out["wr_ci_95"] = wilson_ci(wins, n)
out["expectancy_ci_95"] = bootstrap_expectancy_ci(
closed, seed=bootstrap_seed
)
return out
def compute_group_stats(
trades: list[Trade],
*,
label: str,
bootstrap_iterations: int = 2000,
seed: int | None = None,
) -> GroupStats:
wins, n_resolved, wr = win_rate(trades)
wr_lo, wr_hi = wilson_ci(wins, n_resolved)
pl_m_vals = [t.pl_marius for t in trades if t.pl_marius is not None]
exp_m = (sum(pl_m_vals) / len(pl_m_vals)) if pl_m_vals else 0.0
exp_m_lo, exp_m_hi = bootstrap_ci(
pl_m_vals, iterations=bootstrap_iterations, seed=seed
)
pl_t_vals = [t.pl_theoretical for t in trades if not t.is_pending]
exp_t = (sum(pl_t_vals) / len(pl_t_vals)) if pl_t_vals else 0.0
exp_t_lo, exp_t_hi = bootstrap_ci(
pl_t_vals,
iterations=bootstrap_iterations,
seed=None if seed is None else seed + 1,
)
return GroupStats(
key=label,
n_total=len(trades),
n_resolved=n_resolved,
wins=wins,
wr=wr,
wr_ci_lo=wr_lo,
wr_ci_hi=wr_hi,
exp_marius=exp_m,
exp_marius_ci_lo=exp_m_lo,
exp_marius_ci_hi=exp_m_hi,
exp_theoretical=exp_t,
exp_theoretical_ci_lo=exp_t_lo,
exp_theoretical_ci_hi=exp_t_hi,
)
def _overlay_value(row: dict[str, str], overlay: str) -> float | None:
    """Parsed P/L of ``row`` for the selected overlay column (None if blank)."""
    return _parse_optional_float(row.get(overlay, ""))
# ---------------------------------------------------------------------------
# Calibration mode
# ---------------------------------------------------------------------------
def compute_stats(
csv_path: Path | str = "data/jurnal.csv",
overlay: str = "pl_marius",
) -> dict[str, Any]:
"""Compute aggregate WR + expectancy stats over the backtest rows.
Calibration rows (``manual_calibration`` / ``vision_calibration``) are
excluded; use :func:`compute_calibration` for the P4 mismatch report.
@dataclass(frozen=True)
class CalibrationReport:
pairs: int
field_mismatches: dict[str, int]
total_comparisons: int
@property
def overall_mismatch_rate(self) -> float:
if self.total_comparisons == 0:
return 0.0
total = sum(self.field_mismatches.values())
return total / self.total_comparisons
def _normalise_for_compare(field_name: str, value: str) -> str:
s = (value or "").strip()
if field_name in {"entry", "sl", "tp0", "tp1", "tp2"}:
try:
return f"{float(s):.4f}"
except ValueError:
return s
return s
def calibration_mismatch(
trades: Iterable[Trade],
*,
fields: tuple[str, ...] = CORE_CALIBRATION_FIELDS,
) -> CalibrationReport:
"""Pair ``manual_calibration`` and ``vision_calibration`` rows by
``screenshot_file``, then count mismatches per ``fields``.
Returns a :class:`CalibrationReport`. Unpaired calibration rows are
silently ignored — they cannot contribute to a comparison.
``overlay`` selects the P/L column: ``"pl_marius"`` (default — the real
overlay Marius trades) or ``"pl_theoretical"`` (1/3-1/3-1/3 hold-to-TP2).
"""
manual: dict[str, Trade] = {}
vision: dict[str, Trade] = {}
for t in trades:
if t.source == "manual_calibration":
manual[t.screenshot_file] = t
elif t.source == "vision_calibration":
vision[t.screenshot_file] = t
if overlay not in {"pl_marius", "pl_theoretical"}:
raise ValueError(f"unknown overlay {overlay!r}")
paired_files = sorted(set(manual) & set(vision))
field_mismatches: dict[str, int] = {f: 0 for f in fields}
for f in paired_files:
m = manual[f]
v = vision[f]
for fld in fields:
mv = _normalise_for_compare(fld, m.raw.get(fld, ""))
vv = _normalise_for_compare(fld, v.raw.get(fld, ""))
if mv != vv:
field_mismatches[fld] += 1
rows = [r for r in _load_rows(csv_path) if r.get("source", "") in BACKTEST_SOURCES]
total_comparisons = len(paired_files) * len(fields)
return CalibrationReport(
pairs=len(paired_files),
field_mismatches=field_mismatches,
total_comparisons=total_comparisons,
if not rows:
return {
"n_total": 0,
"n_pending": 0,
"n_closed": 0,
"wr": 0.0,
"wr_ci_95": (0.0, 0.0),
"expectancy": 0.0,
"expectancy_ci_95": (0.0, 0.0),
"per_set": {},
"per_calitate": {},
"per_directie": {},
}
# Pending status is overlay-independent: a trade is pending iff
# pl_marius is blank (outcome_path in {pending, TP0->pending}).
# pl_theoretical is concrete even for pending rows, so it would otherwise
# let pending trades sneak into the closed-trades stats — we mask those
# out explicitly here.
pending_mask = [_parse_optional_float(r.get("pl_marius", "")) is None for r in rows]
overlay_vals: list[float | None] = []
for r, is_pending in zip(rows, pending_mask):
overlay_vals.append(None if is_pending else _overlay_value(r, overlay))
n_total = len(rows)
n_pending = sum(1 for p in pending_mask if p)
n_closed = n_total - n_pending
overall = _group_stats(
overlay_vals, include_ci=True, bootstrap_seed=42
)
def _split(field: str, include_ci: bool) -> dict[str, dict[str, Any]]:
groups: dict[str, list[float | None]] = {}
for r, v in zip(rows, overlay_vals):
key = r.get(field, "") or "(blank)"
groups.setdefault(key, []).append(v)
out: dict[str, dict[str, Any]] = {}
for k in sorted(groups):
sub_seed = 42 + (abs(hash(("split", field, k))) % 1_000_000)
out[k] = _group_stats(
groups[k], include_ci=include_ci, bootstrap_seed=sub_seed
)
return out
return {
"n_total": n_total,
"n_pending": n_pending,
"n_closed": n_closed,
"wr": overall["wr"],
"wr_ci_95": overall["wr_ci_95"],
"expectancy": overall["expectancy"],
"expectancy_ci_95": overall["expectancy_ci_95"],
"per_set": _split("set", include_ci=True),
"per_calitate": _split("calitate", include_ci=True),
# per_directie skips CI per spec (no wr_ci_95 / expectancy_ci_95 keys).
"per_directie": {
k: {"n": v["n"], "wr": v["wr"], "expectancy": v["expectancy"]}
for k, v in _split("directie", include_ci=False).items()
},
}
# ---------------------------------------------------------------------------
# Reporting
# render_stats
# ---------------------------------------------------------------------------
@@ -375,110 +274,228 @@ def _fmt_pct(p: float) -> str:
def _fmt_r(x: float) -> str:
return f"{x:+.3f}R"
return f"{x:+.2f} R"
def _fmt_stats_row(s: GroupStats) -> str:
    """One aligned report line for a :class:`GroupStats` row."""
    return (
        f"{s.key:<14} N={s.n_total:>3} (resolved {s.n_resolved:>3}) "
        f"WR={_fmt_pct(s.wr)} [{_fmt_pct(s.wr_ci_lo)}, {_fmt_pct(s.wr_ci_hi)}] "
        f"E_marius={_fmt_r(s.exp_marius)} "
        f"[{_fmt_r(s.exp_marius_ci_lo)}, {_fmt_r(s.exp_marius_ci_hi)}] "
        f"E_theor={_fmt_r(s.exp_theoretical)}"
    )
def _set_sort_key(name: str) -> tuple[int, str]:
order = ["A1", "A2", "A3", "B", "C", "D", "Other"]
return (order.index(name), name) if name in order else (len(order), name)
def format_report(
trades: list[Trade],
*,
bootstrap_iterations: int = 2000,
seed: int | None = None,
) -> str:
"""Render the main stats report.
Only ``source in {vision, manual}`` rows are included in the WR /
expectancy computations; calibration rows are reported separately via
``--calibration``.
"""
backtest = [t for t in trades if t.source in BACKTEST_SOURCES]
def render_stats(stats: dict[str, Any], overlay: str) -> str:
lines: list[str] = []
lines.append("=== M2D Backtest Stats ===")
lines.append(f"Backtest rows: {len(backtest)} (calibration excluded)")
lines.append("")
if not backtest:
lines.append("(no backtest trades yet)")
return "\n".join(lines)
overall = compute_group_stats(
backtest,
label="OVERALL",
bootstrap_iterations=bootstrap_iterations,
seed=seed,
)
lines.append("-- Overall --")
lines.append(_fmt_stats_row(overall))
lines.append("")
def _emit_group(title: str, field_name: str, key_order: list[str] | None = None) -> None:
lines.append(f"-- By {title} --")
groups = group_by(backtest, field_name)
keys = key_order if key_order is not None else sorted(groups)
for k in keys:
if k not in groups:
continue
sub_seed = None if seed is None else seed + abs(hash(k)) % 10_000
s = compute_group_stats(
groups[k],
label=k,
bootstrap_iterations=bootstrap_iterations,
seed=sub_seed,
)
lines.append(_fmt_stats_row(s))
lines.append("")
_emit_group(
"Set",
"set",
key_order=["A1", "A2", "A3", "B", "C", "D", "Other"],
)
_emit_group("Instrument", "instrument")
lines.append(f"=== Stats jurnal.csv (overlay: {overlay}) ===")
lines.append(
"[!] By calitate — descriptor only (post-outcome, biased; do not use "
"as a GO LIVE filter — see STOPPING_RULE.md §3)."
)
_emit_group(
"calitate",
"calitate",
key_order=["Clară", "Mai mare ca impuls", "Slabă", "n/a"],
f"Trade-uri totale: {stats['n_total']} | "
f"închise: {stats['n_closed']} | pending: {stats['n_pending']}"
)
return "\n".join(lines).rstrip() + "\n"
def format_calibration_report(trades: list[Trade]) -> str:
cal = calibration_mismatch(trades)
lines: list[str] = []
lines.append("=== Calibration P4 gate ===")
lines.append(f"Paired screenshots (manual ∩ vision): {cal.pairs}")
if cal.pairs == 0:
lines.append("(no calibration pairs yet)")
if stats["n_total"] == 0:
lines.append("")
lines.append("(nu sunt trade-uri backtest în CSV)")
return "\n".join(lines) + "\n"
lines.append("")
lines.append(f"{'field':<14} mismatches / pairs rate")
for fld in CORE_CALIBRATION_FIELDS:
m = cal.field_mismatches.get(fld, 0)
rate = (m / cal.pairs) if cal.pairs else 0.0
lines.append(f"{fld:<14} {m:>3} / {cal.pairs:<3} {_fmt_pct(rate)}")
lines.append("")
lo, hi = stats["wr_ci_95"]
e_lo, e_hi = stats["expectancy_ci_95"]
lines.append(f"GLOBAL (n={stats['n_closed']}):")
lines.append(
f"Overall mismatch rate: {_fmt_pct(cal.overall_mismatch_rate)} "
f"({sum(cal.field_mismatches.values())} of {cal.total_comparisons} comparisons)"
f" WR: {_fmt_pct(stats['wr'])} "
f"[95% CI: {_fmt_pct(lo)}, {_fmt_pct(hi)}]"
)
threshold = 0.10
verdict = "PASS" if cal.overall_mismatch_rate <= threshold else "FAIL"
lines.append(f"P4 gate (<= 10%): {verdict}")
lines.append(
f" Expectancy: {_fmt_r(stats['expectancy'])} "
f"[95% CI: {_fmt_r(e_lo)}, {_fmt_r(e_hi)}]"
)
lines.append("")
def _emit_split(
title: str,
data: dict[str, dict[str, Any]],
*,
sort_keys: list[str] | None = None,
include_ci: bool = True,
) -> None:
lines.append(title)
keys = sort_keys if sort_keys is not None else sorted(data)
for k in keys:
if k not in data:
continue
d = data[k]
if include_ci and "wr_ci_95" in d:
clo, chi = d["wr_ci_95"]
lines.append(
f" {k:<14} n={d['n']:>3} "
f"WR {_fmt_pct(d['wr'])} "
f"[{_fmt_pct(clo)}, {_fmt_pct(chi)}] "
f"E {_fmt_r(d['expectancy'])}"
)
else:
lines.append(
f" {k:<14} n={d['n']:>3} "
f"WR {_fmt_pct(d['wr'])} "
f"E {_fmt_r(d['expectancy'])}"
)
lines.append("")
_emit_split(
"PER SET:",
stats["per_set"],
sort_keys=sorted(stats["per_set"], key=_set_sort_key),
)
lines.append(
"PER CALITATE (⚠️ DESCRIPTOR ONLY — biased post-outcome, NU folosi ca filtru):"
)
cal_order = ["Clară", "Mai mare ca impuls", "Slabă", "n/a"]
keys = [k for k in cal_order if k in stats["per_calitate"]] + [
k for k in sorted(stats["per_calitate"]) if k not in cal_order
]
for k in keys:
d = stats["per_calitate"][k]
clo, chi = d["wr_ci_95"]
lines.append(
f" {k:<20} n={d['n']:>3} "
f"WR {_fmt_pct(d['wr'])} "
f"[{_fmt_pct(clo)}, {_fmt_pct(chi)}] "
f"E {_fmt_r(d['expectancy'])}"
)
lines.append("")
_emit_split("PER DIRECȚIE:", stats["per_directie"], include_ci=False)
# STOPPING_RULE gate check — flag every Set that hasn't crossed N>=40.
lines.append(f"⚠️ STOPPING RULE check (vezi STOPPING_RULE.md, N>={STOPPING_RULE_N}):")
set_keys = sorted(stats["per_set"], key=_set_sort_key)
any_flagged = False
for k in set_keys:
n = stats["per_set"][k]["n"]
if n < STOPPING_RULE_N:
lines.append(f" {k}: N={n} < {STOPPING_RULE_N} → NEEDS MORE DATA")
any_flagged = True
if not any_flagged:
lines.append(f" toate Set-urile au N>={STOPPING_RULE_N} (eligibile pentru GO LIVE check).")
return "\n".join(lines) + "\n"
# ---------------------------------------------------------------------------
# compute_calibration
# ---------------------------------------------------------------------------
def _calibration_match(field: str, m_val: str, v_val: str, tol: float = 0.01) -> bool:
    """True when the manual and vision values agree for ``field``.

    Numeric fields compare with absolute tolerance ``tol``; values that do
    not parse as floats — and every non-numeric field — fall back to exact
    string equality after stripping whitespace.
    """
    if field in NUMERIC_CALIBRATION_FIELDS:
        try:
            delta = abs(float(m_val) - float(v_val))
        except ValueError:
            pass  # unparseable number: use the string comparison below
        else:
            return delta <= tol
    return (m_val or "").strip() == (v_val or "").strip()
def compute_calibration(
    csv_path: Path | str = "data/jurnal.csv",
) -> dict[str, Any]:
    """Pair calibration legs by ``screenshot_file`` and report per-field mismatch.

    Returns a dict ``{"n_pairs": int, "fields": {field: {match, mismatch,
    match_rate, mismatch_examples}}}``. ``mismatch_examples`` holds up to 3
    strings ``"<screenshot_file>: manual=X vs vision=Y"`` per field.

    Numeric fields (``entry/sl/tp0/tp1/tp2``) use a tolerance of 0.01;
    everything else is exact-string equality after strip.
    """
    rows = _load_rows(csv_path)
    # Index each calibration leg by screenshot; if a screenshot appears
    # twice in one leg, the later row silently wins.
    manual: dict[str, dict[str, str]] = {}
    vision: dict[str, dict[str, str]] = {}
    for r in rows:
        src = r.get("source", "")
        if src == "manual_calibration":
            manual[r.get("screenshot_file", "")] = r
        elif src == "vision_calibration":
            vision[r.get("screenshot_file", "")] = r
    # Only screenshots present in BOTH legs are comparable; unpaired rows
    # are ignored. Sorted for deterministic example ordering.
    paired_files = sorted(set(manual) & set(vision))
    fields_report: dict[str, dict[str, Any]] = {
        f: {
            "match": 0,
            "mismatch": 0,
            "match_rate": 0.0,
            "mismatch_examples": [],
        }
        for f in CORE_CALIBRATION_FIELDS
    }
    for f in paired_files:
        m = manual[f]
        v = vision[f]
        for fld in CORE_CALIBRATION_FIELDS:
            mv = m.get(fld, "")
            vv = v.get(fld, "")
            if _calibration_match(fld, mv, vv):
                fields_report[fld]["match"] += 1
            else:
                fields_report[fld]["mismatch"] += 1
                examples = fields_report[fld]["mismatch_examples"]
                if len(examples) < 3:  # cap examples to keep the report short
                    examples.append(f"{f}: manual={mv!r} vs vision={vv!r}")
    # Fill in per-field match rates (0.0 when there were no pairs at all).
    for fld, data in fields_report.items():
        total = data["match"] + data["mismatch"]
        data["match_rate"] = (data["match"] / total) if total else 0.0
    return {"n_pairs": len(paired_files), "fields": fields_report}
def render_calibration(cal: dict[str, Any]) -> str:
    """Render a :func:`compute_calibration` result as a text report.

    Always ends with a trailing newline. The P4 gate verdict is PASS when
    the overall mismatch rate is <= 10%.
    """
    lines: list[str] = []
    lines.append("=== Calibration P4 gate (vezi STOPPING_RULE.md §P4) ===")
    lines.append(f"Perechi calibration: {cal['n_pairs']}")
    # Early out: nothing to compare, short two-line report.
    if cal["n_pairs"] == 0:
        lines.append("(nu există perechi manual_calibration ∩ vision_calibration)")
        return "\n".join(lines) + "\n"
    lines.append("")
    lines.append(f"{'field':<14} match mismatch rate")
    total_mismatches = 0
    total_comparisons = 0
    for fld in CORE_CALIBRATION_FIELDS:
        d = cal["fields"][fld]
        n = d["match"] + d["mismatch"]
        total_mismatches += d["mismatch"]
        total_comparisons += n
        lines.append(
            f"{fld:<14} {d['match']:>5} {d['mismatch']:>8} "
            f"{_fmt_pct(d['match_rate'])}"
        )
    lines.append("")
    # Aggregate mismatch rate across every field, guarding the 0 division.
    overall_match_rate = (
        (total_comparisons - total_mismatches) / total_comparisons
        if total_comparisons
        else 0.0
    )
    overall_mismatch_rate = 1.0 - overall_match_rate
    verdict = "PASS" if overall_mismatch_rate <= 0.10 else "FAIL"
    lines.append(
        f"Overall mismatch rate: {_fmt_pct(overall_mismatch_rate)} "
        f"({total_mismatches}/{total_comparisons}) → P4 gate: {verdict}"
    )
    has_examples = any(
        cal["fields"][f]["mismatch_examples"] for f in CORE_CALIBRATION_FIELDS
    )
    if has_examples:
        lines.append("")
        lines.append("Mismatch examples (max 3 per field):")
        for fld in CORE_CALIBRATION_FIELDS:
            ex = cal["fields"][fld]["mismatch_examples"]
            if not ex:
                continue
            lines.append(f" [{fld}]")
            for e in ex:
                lines.append(f" - {e}")
    return "\n".join(lines) + "\n"
@@ -498,43 +515,37 @@ def main(argv: list[str] | None = None) -> int:
default=Path("data/jurnal.csv"),
help="Path to the jurnal CSV (default: data/jurnal.csv).",
)
parser.add_argument(
"--overlay",
choices=("pl_marius", "pl_theoretical"),
default="pl_marius",
help="Which P/L overlay to use (default: pl_marius).",
)
parser.add_argument(
"--calibration",
action="store_true",
help="Show P4 calibration mismatch report instead of backtest stats.",
)
parser.add_argument(
"--bootstrap-iterations",
type=int,
default=2000,
help="Bootstrap iterations for expectancy CI (default: 2000).",
)
parser.add_argument(
"--seed",
type=int,
default=None,
help="Seed for the bootstrap RNG (set for deterministic output).",
)
args = parser.parse_args(argv)
trades = load_trades(args.csv)
if args.calibration:
out = format_calibration_report(trades)
else:
out = format_report(
trades,
bootstrap_iterations=args.bootstrap_iterations,
seed=args.seed,
)
# Force UTF-8 on stdout: the report contains diacritics ("Clară", "Slabă")
# and a console codepage like cp1252 would crash on those.
try:
sys.stdout.reconfigure(encoding="utf-8") # type: ignore[attr-defined]
except (AttributeError, OSError):
pass
sys.stdout.write(out)
if args.calibration:
cal = compute_calibration(args.csv)
sys.stdout.write(render_calibration(cal))
else:
stats = compute_stats(args.csv, overlay=args.overlay)
sys.stdout.write(render_stats(stats, args.overlay))
return 0
if __name__ == "__main__":
raise SystemExit(main())
# Ensure the canonical CSV schema is importable from one place — fail fast if
# someone removes append_row.CSV_COLUMNS that this module depends on.
# Explicit check instead of `assert`: asserts are stripped under `python -O`,
# which would silently disable this import-time sanity check.
if CSV_COLUMNS is None:
    raise ImportError("scripts.append_row.CSV_COLUMNS is unavailable")