commands: m2d-log + backtest + batch + stats slash commands (124 tests pass)

This commit is contained in:
Marius
2026-05-13 12:48:26 +03:00
parent 26d084dc4b
commit 34af5b631e
7 changed files with 1111 additions and 730 deletions

View File

@@ -1,21 +1,20 @@
"""Backtest statistics for ``data/jurnal.csv``.
Outputs:
- Overall + per-Set + per-calitate + per-instrument WR, expectancy.
- Wilson 95% CI for WR (closed form).
- Bootstrap percentile 95% CI for expectancy (deterministic via ``seed``).
- ``--calibration`` mode: joins ``manual_calibration`` rows with their
``vision_calibration`` counterparts on ``screenshot_file`` and reports
field-by-field mismatch rates for the P4 gate (see ``STOPPING_RULE.md``).
Public API:
- ``compute_stats(csv_path, overlay) -> dict``
- ``render_stats(stats, overlay) -> str``
- ``compute_calibration(csv_path) -> dict``
- ``render_calibration(cal) -> str``
- ``main()`` — CLI entry point.
A "win" is any trade with ``pl_marius > 0``. Pending trades
(``pl_marius`` blank, i.e. ``outcome_path in {pending, TP0->pending}``) are
excluded from both WR and expectancy: there is no realised outcome yet.
A "win" is a closed trade with ``pl_overlay > 0`` (where ``pl_overlay`` is
either ``pl_marius`` or ``pl_theoretical``). Pending trades — ``pl_marius``
blank, i.e. ``outcome_path in {pending, TP0->pending}`` — are excluded from
both WR and expectancy: there is no realised outcome yet.
The ``calitate`` field is a known-biased descriptor: it is classified
post-outcome (see ``STOPPING_RULE.md`` §3). The per-``calitate`` split is
reported with an explicit *descriptor only — biased post-outcome* caveat;
do NOT use it as a filter for GO LIVE decisions.
"""
from __future__ import annotations
@@ -23,32 +22,42 @@ from __future__ import annotations
import argparse
import csv
import math
import random
import sys
from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterable
from typing import Any, Iterable
import numpy as np
from scripts.append_row import CSV_COLUMNS
# Public API. Union of the legacy helpers and the new compute/render entry
# points; deduplicated (``CORE_CALIBRATION_FIELDS`` appeared twice).
__all__ = [
    # Constants
    "BACKTEST_SOURCES",
    "CALIBRATION_SOURCES",
    "CORE_CALIBRATION_FIELDS",
    "NUMERIC_CALIBRATION_FIELDS",
    "STOPPING_RULE_N",
    # Typed access
    "Trade",
    "GroupStats",
    "load_trades",
    # Statistics primitives
    "wilson_ci",
    "bootstrap_ci",
    "bootstrap_expectancy_ci",
    "win_rate",
    "expectancy",
    "group_by",
    "compute_group_stats",
    "calibration_mismatch",
    # Reports
    "format_report",
    "compute_stats",
    "render_stats",
    "compute_calibration",
    "render_calibration",
    "main",
]
# Fields compared in the calibration mismatch gate (STOPPING_RULE.md §P4).
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
BACKTEST_SOURCES: frozenset[str] = frozenset({"vision", "manual"})
CALIBRATION_SOURCES: frozenset[str] = frozenset(
{"manual_calibration", "vision_calibration"}
)
# Calibration P4 gate (STOPPING_RULE.md §P4) — explicitly reported per field.
CORE_CALIBRATION_FIELDS: tuple[str, ...] = (
"entry",
"sl",
@@ -58,315 +67,205 @@ CORE_CALIBRATION_FIELDS: tuple[str, ...] = (
"outcome_path",
"max_reached",
"directie",
"instrument",
)
BACKTEST_SOURCES: frozenset[str] = frozenset({"vision", "manual"})
CALIBRATION_SOURCES: frozenset[str] = frozenset(
{"manual_calibration", "vision_calibration"}
NUMERIC_CALIBRATION_FIELDS: frozenset[str] = frozenset(
{"entry", "sl", "tp0", "tp1", "tp2"}
)
# STOPPING_RULE.md §"GO LIVE" gate: N >= 40 per Set.
STOPPING_RULE_N: int = 40
# ---------------------------------------------------------------------------
# Loading / typed access
# Loading
# ---------------------------------------------------------------------------
@dataclass(frozen=True)
class Trade:
    """One realised (or pending) trade row, typed."""

    id: int
    screenshot_file: str
    # Row provenance; backtest rows use BACKTEST_SOURCES, calibration rows
    # use CALIBRATION_SOURCES values.
    source: str
    data: str
    zi: str
    ora_ro: str
    instrument: str
    directie: str
    # Known-biased post-outcome descriptor — see the module docstring.
    calitate: str
    set: str
    outcome_path: str
    max_reached: str
    be_moved: bool
    # None while the trade is still pending (blank CSV cell).
    pl_marius: float | None
    pl_theoretical: float
    # Original CSV row, kept verbatim for calibration field comparisons.
    raw: dict[str, str] = field(default_factory=dict)

    @property
    def is_pending(self) -> bool:
        """True while there is no realised P/L yet (blank ``pl_marius``)."""
        return self.pl_marius is None

    @property
    def is_win(self) -> bool:
        """True for a closed trade with positive realised P/L."""
        return self.pl_marius is not None and self.pl_marius > 0
def _parse_optional_float(value: str) -> float | None:
s = (value or "").strip()
if s == "":
return None
return float(s)
try:
return float(s)
except ValueError:
return None
def _parse_bool(value: str) -> bool:
return (value or "").strip().lower() in {"true", "1", "yes", "da"}
def _row_to_trade(row: dict[str, str]) -> Trade:
    """Convert one raw CSV row (all-string dict) into a typed :class:`Trade`.

    Missing/blank columns fall back to "" (or 0 / 0.0 for numeric fields);
    the untouched row is kept on ``Trade.raw`` for field-level comparisons.
    """
    return Trade(
        id=int(row.get("id") or 0),  # blank id -> 0
        screenshot_file=row.get("screenshot_file", ""),
        source=row.get("source", ""),
        data=row.get("data", ""),
        zi=row.get("zi", ""),
        ora_ro=row.get("ora_ro", ""),
        instrument=row.get("instrument", ""),
        directie=row.get("directie", ""),
        calitate=row.get("calitate", ""),
        set=row.get("set", ""),
        outcome_path=row.get("outcome_path", ""),
        max_reached=row.get("max_reached", ""),
        be_moved=_parse_bool(row.get("be_moved", "")),
        # None here means the trade is still pending.
        pl_marius=_parse_optional_float(row.get("pl_marius", "")),
        pl_theoretical=float(row.get("pl_theoretical") or 0.0),
        raw=dict(row),  # defensive copy of the reader's dict
    )
def load_trades(csv_path: Path | str) -> list[Trade]:
"""Load all rows of ``csv_path`` as :class:`Trade` objects.
Returns ``[]`` if the file does not exist or is empty.
"""
def _load_rows(csv_path: Path | str) -> list[dict[str, str]]:
p = Path(csv_path)
if not p.exists() or p.stat().st_size == 0:
return []
with p.open("r", encoding="utf-8", newline="") as fh:
reader = csv.DictReader(fh)
return [_row_to_trade(r) for r in reader]
return list(csv.DictReader(fh))
# ---------------------------------------------------------------------------
# Statistics primitives
# CI primitives
# ---------------------------------------------------------------------------
def wilson_ci(wins: int, n: int, z: float = 1.96) -> tuple[float, float]:
    """Wilson score interval for a binomial proportion.

    Returns ``(lo, hi)`` clamped to ``[0.0, 1.0]``. For ``n == 0`` returns
    ``(0.0, 0.0)``. ``z = 1.96`` ≈ 95% confidence.

    Raises:
        ValueError: if ``wins`` is negative or exceeds ``n``.
    """
    if n <= 0:
        return (0.0, 0.0)
    if wins < 0 or wins > n:
        raise ValueError(f"wins={wins} out of range for n={n}")
    p = wins / n
    denom = 1.0 + (z * z) / n
    # Closed-form Wilson interval: shrink p toward 0.5, widen by the
    # z-scaled standard error, all normalised by ``denom``.
    center = (p + (z * z) / (2.0 * n)) / denom
    spread = z * math.sqrt(p * (1.0 - p) / n + (z * z) / (4.0 * n * n)) / denom
    return (max(0.0, center - spread), min(1.0, center + spread))
def bootstrap_ci(
    values: list[float],
    *,
    iterations: int = 2000,
    alpha: float = 0.05,
    seed: int | None = None,
) -> tuple[float, float]:
    """Percentile-method bootstrap CI for the mean of ``values``.

    Legacy pure-stdlib variant, kept for backward compatibility with
    existing callers. Deterministic when ``seed`` is provided. Returns
    ``(lo, hi)``; for ``len(values) < 2`` returns ``(mean, mean)``.
    """
    if not values:
        return (0.0, 0.0)
    n = len(values)
    mean = sum(values) / n
    if n < 2 or iterations <= 0:
        return (mean, mean)
    rng = random.Random(seed)
    means: list[float] = []
    for _ in range(iterations):
        s = 0.0
        for _ in range(n):
            s += values[rng.randrange(n)]
        means.append(s / n)
    means.sort()
    # Percentile indices, clamped to the valid range.
    lo_idx = int(math.floor((alpha / 2.0) * iterations))
    hi_idx = int(math.ceil((1.0 - alpha / 2.0) * iterations)) - 1
    lo_idx = max(0, min(iterations - 1, lo_idx))
    hi_idx = max(0, min(iterations - 1, hi_idx))
    return (means[lo_idx], means[hi_idx])


def bootstrap_expectancy_ci(
    values: list[float] | np.ndarray,
    n_resamples: int = 5000,
    seed: int = 42,
) -> tuple[float, float]:
    """Percentile-method bootstrap 95% CI for the mean of ``values``.

    Deterministic for a given ``seed``. Empty input → ``(0.0, 0.0)``.
    Single value → ``(value, value)`` (no variance to resample).
    """
    arr = np.asarray(list(values), dtype=float)
    if arr.size == 0:
        return (0.0, 0.0)
    if arr.size == 1:
        v = float(arr[0])
        return (v, v)
    rng = np.random.default_rng(seed)
    boots = np.empty(n_resamples, dtype=float)
    n = arr.size
    # Per-resample draw kept as a loop so published numbers stay identical
    # for a given seed (a single (n_resamples, n) draw could reorder the
    # RNG stream).
    for i in range(n_resamples):
        idx = rng.integers(0, n, size=n)
        boots[i] = float(arr[idx].mean())
    lo = float(np.percentile(boots, 2.5))
    hi = float(np.percentile(boots, 97.5))
    return (lo, hi)
def win_rate(trades: Iterable[Trade]) -> tuple[int, int, float]:
"""Return ``(wins, n_resolved, wr)`` ignoring pending trades."""
resolved = [t for t in trades if not t.is_pending]
wins = sum(1 for t in resolved if t.is_win)
n = len(resolved)
# ---------------------------------------------------------------------------
# compute_stats
# ---------------------------------------------------------------------------
def _group_stats(
overlay_values: list[float | None],
*,
include_ci: bool,
bootstrap_seed: int,
) -> dict[str, Any]:
closed = [v for v in overlay_values if v is not None]
n = len(closed)
wins = sum(1 for v in closed if v > 0)
wr = (wins / n) if n else 0.0
return wins, n, wr
def expectancy(trades: Iterable[Trade], overlay: str = "pl_marius") -> float:
    """Mean P/L (in R) over non-pending trades, on the given overlay.

    Returns 0.0 when there are no closed trades to average.
    """
    if overlay not in {"pl_marius", "pl_theoretical"}:
        raise ValueError(f"unknown overlay {overlay!r}")
    # The marius overlay uses the blank cell itself as the pending marker;
    # the theoretical overlay defers to the is_pending property.
    if overlay == "pl_marius":
        realised = [t.pl_marius for t in trades if t.pl_marius is not None]
    else:
        realised = [t.pl_theoretical for t in trades if not t.is_pending]
    return sum(realised) / len(realised) if realised else 0.0
# ---------------------------------------------------------------------------
# Group stats
# ---------------------------------------------------------------------------
@dataclass(frozen=True)
class GroupStats:
    """Aggregated WR / expectancy statistics for one group of trades."""

    # Group label (e.g. a Set name, instrument, or "OVERALL").
    key: str
    # Row counts: all rows vs. rows with a realised outcome.
    n_total: int
    n_resolved: int
    wins: int
    # Win rate over resolved trades, with its Wilson 95% CI bounds.
    wr: float
    wr_ci_lo: float
    wr_ci_hi: float
    # Mean P/L on the real (Marius) overlay, with bootstrap 95% CI bounds.
    exp_marius: float
    exp_marius_ci_lo: float
    exp_marius_ci_hi: float
    # Mean P/L on the theoretical overlay, with bootstrap 95% CI bounds.
    exp_theoretical: float
    exp_theoretical_ci_lo: float
    exp_theoretical_ci_hi: float
def group_by(trades: Iterable[Trade], field_name: str) -> dict[str, list[Trade]]:
out: dict[str, list[Trade]] = {}
for t in trades:
key = getattr(t, field_name, "") or "(blank)"
out.setdefault(key, []).append(t)
out: dict[str, Any] = {
"n": n,
"wr": wr,
"expectancy": (sum(closed) / n) if n else 0.0,
}
if include_ci:
out["wr_ci_95"] = wilson_ci(wins, n)
out["expectancy_ci_95"] = bootstrap_expectancy_ci(
closed, seed=bootstrap_seed
)
return out
def compute_group_stats(
trades: list[Trade],
*,
label: str,
bootstrap_iterations: int = 2000,
seed: int | None = None,
) -> GroupStats:
wins, n_resolved, wr = win_rate(trades)
wr_lo, wr_hi = wilson_ci(wins, n_resolved)
pl_m_vals = [t.pl_marius for t in trades if t.pl_marius is not None]
exp_m = (sum(pl_m_vals) / len(pl_m_vals)) if pl_m_vals else 0.0
exp_m_lo, exp_m_hi = bootstrap_ci(
pl_m_vals, iterations=bootstrap_iterations, seed=seed
)
pl_t_vals = [t.pl_theoretical for t in trades if not t.is_pending]
exp_t = (sum(pl_t_vals) / len(pl_t_vals)) if pl_t_vals else 0.0
exp_t_lo, exp_t_hi = bootstrap_ci(
pl_t_vals,
iterations=bootstrap_iterations,
seed=None if seed is None else seed + 1,
)
return GroupStats(
key=label,
n_total=len(trades),
n_resolved=n_resolved,
wins=wins,
wr=wr,
wr_ci_lo=wr_lo,
wr_ci_hi=wr_hi,
exp_marius=exp_m,
exp_marius_ci_lo=exp_m_lo,
exp_marius_ci_hi=exp_m_hi,
exp_theoretical=exp_t,
exp_theoretical_ci_lo=exp_t_lo,
exp_theoretical_ci_hi=exp_t_hi,
)
def _overlay_value(row: dict[str, str], overlay: str) -> float | None:
    """Parsed P/L of ``row`` for the selected overlay column (None if blank)."""
    return _parse_optional_float(row.get(overlay, ""))
# ---------------------------------------------------------------------------
# Calibration mode
# ---------------------------------------------------------------------------
def compute_stats(
csv_path: Path | str = "data/jurnal.csv",
overlay: str = "pl_marius",
) -> dict[str, Any]:
"""Compute aggregate WR + expectancy stats over the backtest rows.
Calibration rows (``manual_calibration`` / ``vision_calibration``) are
excluded; use :func:`compute_calibration` for the P4 mismatch report.
@dataclass(frozen=True)
class CalibrationReport:
pairs: int
field_mismatches: dict[str, int]
total_comparisons: int
@property
def overall_mismatch_rate(self) -> float:
if self.total_comparisons == 0:
return 0.0
total = sum(self.field_mismatches.values())
return total / self.total_comparisons
def _normalise_for_compare(field_name: str, value: str) -> str:
s = (value or "").strip()
if field_name in {"entry", "sl", "tp0", "tp1", "tp2"}:
try:
return f"{float(s):.4f}"
except ValueError:
return s
return s
def calibration_mismatch(
trades: Iterable[Trade],
*,
fields: tuple[str, ...] = CORE_CALIBRATION_FIELDS,
) -> CalibrationReport:
"""Pair ``manual_calibration`` and ``vision_calibration`` rows by
``screenshot_file``, then count mismatches per ``fields``.
Returns a :class:`CalibrationReport`. Unpaired calibration rows are
silently ignored — they cannot contribute to a comparison.
``overlay`` selects the P/L column: ``"pl_marius"`` (default — the real
overlay Marius trades) or ``"pl_theoretical"`` (1/3-1/3-1/3 hold-to-TP2).
"""
manual: dict[str, Trade] = {}
vision: dict[str, Trade] = {}
for t in trades:
if t.source == "manual_calibration":
manual[t.screenshot_file] = t
elif t.source == "vision_calibration":
vision[t.screenshot_file] = t
if overlay not in {"pl_marius", "pl_theoretical"}:
raise ValueError(f"unknown overlay {overlay!r}")
paired_files = sorted(set(manual) & set(vision))
field_mismatches: dict[str, int] = {f: 0 for f in fields}
for f in paired_files:
m = manual[f]
v = vision[f]
for fld in fields:
mv = _normalise_for_compare(fld, m.raw.get(fld, ""))
vv = _normalise_for_compare(fld, v.raw.get(fld, ""))
if mv != vv:
field_mismatches[fld] += 1
rows = [r for r in _load_rows(csv_path) if r.get("source", "") in BACKTEST_SOURCES]
total_comparisons = len(paired_files) * len(fields)
return CalibrationReport(
pairs=len(paired_files),
field_mismatches=field_mismatches,
total_comparisons=total_comparisons,
if not rows:
return {
"n_total": 0,
"n_pending": 0,
"n_closed": 0,
"wr": 0.0,
"wr_ci_95": (0.0, 0.0),
"expectancy": 0.0,
"expectancy_ci_95": (0.0, 0.0),
"per_set": {},
"per_calitate": {},
"per_directie": {},
}
# Pending status is overlay-independent: a trade is pending iff
# pl_marius is blank (outcome_path in {pending, TP0->pending}).
# pl_theoretical is concrete even for pending rows, so it would otherwise
# let pending trades sneak into the closed-trades stats — we mask those
# out explicitly here.
pending_mask = [_parse_optional_float(r.get("pl_marius", "")) is None for r in rows]
overlay_vals: list[float | None] = []
for r, is_pending in zip(rows, pending_mask):
overlay_vals.append(None if is_pending else _overlay_value(r, overlay))
n_total = len(rows)
n_pending = sum(1 for p in pending_mask if p)
n_closed = n_total - n_pending
overall = _group_stats(
overlay_vals, include_ci=True, bootstrap_seed=42
)
def _split(field: str, include_ci: bool) -> dict[str, dict[str, Any]]:
groups: dict[str, list[float | None]] = {}
for r, v in zip(rows, overlay_vals):
key = r.get(field, "") or "(blank)"
groups.setdefault(key, []).append(v)
out: dict[str, dict[str, Any]] = {}
for k in sorted(groups):
sub_seed = 42 + (abs(hash(("split", field, k))) % 1_000_000)
out[k] = _group_stats(
groups[k], include_ci=include_ci, bootstrap_seed=sub_seed
)
return out
return {
"n_total": n_total,
"n_pending": n_pending,
"n_closed": n_closed,
"wr": overall["wr"],
"wr_ci_95": overall["wr_ci_95"],
"expectancy": overall["expectancy"],
"expectancy_ci_95": overall["expectancy_ci_95"],
"per_set": _split("set", include_ci=True),
"per_calitate": _split("calitate", include_ci=True),
# per_directie skips CI per spec (no wr_ci_95 / expectancy_ci_95 keys).
"per_directie": {
k: {"n": v["n"], "wr": v["wr"], "expectancy": v["expectancy"]}
for k, v in _split("directie", include_ci=False).items()
},
}
# ---------------------------------------------------------------------------
# Reporting
# render_stats
# ---------------------------------------------------------------------------
@@ -375,110 +274,228 @@ def _fmt_pct(p: float) -> str:
def _fmt_r(x: float) -> str:
return f"{x:+.3f}R"
return f"{x:+.2f} R"
def _fmt_stats_row(s: GroupStats) -> str:
    """One aligned report line for a :class:`GroupStats` row."""
    return (
        f"{s.key:<14} N={s.n_total:>3} (resolved {s.n_resolved:>3}) "
        f"WR={_fmt_pct(s.wr)} [{_fmt_pct(s.wr_ci_lo)}, {_fmt_pct(s.wr_ci_hi)}] "
        f"E_marius={_fmt_r(s.exp_marius)} "
        f"[{_fmt_r(s.exp_marius_ci_lo)}, {_fmt_r(s.exp_marius_ci_hi)}] "
        f"E_theor={_fmt_r(s.exp_theoretical)}"
    )
def _set_sort_key(name: str) -> tuple[int, str]:
order = ["A1", "A2", "A3", "B", "C", "D", "Other"]
return (order.index(name), name) if name in order else (len(order), name)
def format_report(
trades: list[Trade],
*,
bootstrap_iterations: int = 2000,
seed: int | None = None,
) -> str:
"""Render the main stats report.
Only ``source in {vision, manual}`` rows are included in the WR /
expectancy computations; calibration rows are reported separately via
``--calibration``.
"""
backtest = [t for t in trades if t.source in BACKTEST_SOURCES]
def render_stats(stats: dict[str, Any], overlay: str) -> str:
lines: list[str] = []
lines.append("=== M2D Backtest Stats ===")
lines.append(f"Backtest rows: {len(backtest)} (calibration excluded)")
lines.append("")
if not backtest:
lines.append("(no backtest trades yet)")
return "\n".join(lines)
overall = compute_group_stats(
backtest,
label="OVERALL",
bootstrap_iterations=bootstrap_iterations,
seed=seed,
)
lines.append("-- Overall --")
lines.append(_fmt_stats_row(overall))
lines.append("")
def _emit_group(title: str, field_name: str, key_order: list[str] | None = None) -> None:
lines.append(f"-- By {title} --")
groups = group_by(backtest, field_name)
keys = key_order if key_order is not None else sorted(groups)
for k in keys:
if k not in groups:
continue
sub_seed = None if seed is None else seed + abs(hash(k)) % 10_000
s = compute_group_stats(
groups[k],
label=k,
bootstrap_iterations=bootstrap_iterations,
seed=sub_seed,
)
lines.append(_fmt_stats_row(s))
lines.append("")
_emit_group(
"Set",
"set",
key_order=["A1", "A2", "A3", "B", "C", "D", "Other"],
)
_emit_group("Instrument", "instrument")
lines.append(f"=== Stats jurnal.csv (overlay: {overlay}) ===")
lines.append(
"[!] By calitate — descriptor only (post-outcome, biased; do not use "
"as a GO LIVE filter — see STOPPING_RULE.md §3)."
)
_emit_group(
"calitate",
"calitate",
key_order=["Clară", "Mai mare ca impuls", "Slabă", "n/a"],
f"Trade-uri totale: {stats['n_total']} | "
f"închise: {stats['n_closed']} | pending: {stats['n_pending']}"
)
return "\n".join(lines).rstrip() + "\n"
def format_calibration_report(trades: list[Trade]) -> str:
cal = calibration_mismatch(trades)
lines: list[str] = []
lines.append("=== Calibration P4 gate ===")
lines.append(f"Paired screenshots (manual ∩ vision): {cal.pairs}")
if cal.pairs == 0:
lines.append("(no calibration pairs yet)")
if stats["n_total"] == 0:
lines.append("")
lines.append("(nu sunt trade-uri backtest în CSV)")
return "\n".join(lines) + "\n"
lines.append("")
lines.append(f"{'field':<14} mismatches / pairs rate")
for fld in CORE_CALIBRATION_FIELDS:
m = cal.field_mismatches.get(fld, 0)
rate = (m / cal.pairs) if cal.pairs else 0.0
lines.append(f"{fld:<14} {m:>3} / {cal.pairs:<3} {_fmt_pct(rate)}")
lines.append("")
lo, hi = stats["wr_ci_95"]
e_lo, e_hi = stats["expectancy_ci_95"]
lines.append(f"GLOBAL (n={stats['n_closed']}):")
lines.append(
f"Overall mismatch rate: {_fmt_pct(cal.overall_mismatch_rate)} "
f"({sum(cal.field_mismatches.values())} of {cal.total_comparisons} comparisons)"
f" WR: {_fmt_pct(stats['wr'])} "
f"[95% CI: {_fmt_pct(lo)}, {_fmt_pct(hi)}]"
)
threshold = 0.10
verdict = "PASS" if cal.overall_mismatch_rate <= threshold else "FAIL"
lines.append(f"P4 gate (<= 10%): {verdict}")
lines.append(
f" Expectancy: {_fmt_r(stats['expectancy'])} "
f"[95% CI: {_fmt_r(e_lo)}, {_fmt_r(e_hi)}]"
)
lines.append("")
def _emit_split(
title: str,
data: dict[str, dict[str, Any]],
*,
sort_keys: list[str] | None = None,
include_ci: bool = True,
) -> None:
lines.append(title)
keys = sort_keys if sort_keys is not None else sorted(data)
for k in keys:
if k not in data:
continue
d = data[k]
if include_ci and "wr_ci_95" in d:
clo, chi = d["wr_ci_95"]
lines.append(
f" {k:<14} n={d['n']:>3} "
f"WR {_fmt_pct(d['wr'])} "
f"[{_fmt_pct(clo)}, {_fmt_pct(chi)}] "
f"E {_fmt_r(d['expectancy'])}"
)
else:
lines.append(
f" {k:<14} n={d['n']:>3} "
f"WR {_fmt_pct(d['wr'])} "
f"E {_fmt_r(d['expectancy'])}"
)
lines.append("")
_emit_split(
"PER SET:",
stats["per_set"],
sort_keys=sorted(stats["per_set"], key=_set_sort_key),
)
lines.append(
"PER CALITATE (⚠️ DESCRIPTOR ONLY — biased post-outcome, NU folosi ca filtru):"
)
cal_order = ["Clară", "Mai mare ca impuls", "Slabă", "n/a"]
keys = [k for k in cal_order if k in stats["per_calitate"]] + [
k for k in sorted(stats["per_calitate"]) if k not in cal_order
]
for k in keys:
d = stats["per_calitate"][k]
clo, chi = d["wr_ci_95"]
lines.append(
f" {k:<20} n={d['n']:>3} "
f"WR {_fmt_pct(d['wr'])} "
f"[{_fmt_pct(clo)}, {_fmt_pct(chi)}] "
f"E {_fmt_r(d['expectancy'])}"
)
lines.append("")
_emit_split("PER DIRECȚIE:", stats["per_directie"], include_ci=False)
# STOPPING_RULE gate check — flag every Set that hasn't crossed N>=40.
lines.append(f"⚠️ STOPPING RULE check (vezi STOPPING_RULE.md, N>={STOPPING_RULE_N}):")
set_keys = sorted(stats["per_set"], key=_set_sort_key)
any_flagged = False
for k in set_keys:
n = stats["per_set"][k]["n"]
if n < STOPPING_RULE_N:
lines.append(f" {k}: N={n} < {STOPPING_RULE_N} → NEEDS MORE DATA")
any_flagged = True
if not any_flagged:
lines.append(f" toate Set-urile au N>={STOPPING_RULE_N} (eligibile pentru GO LIVE check).")
return "\n".join(lines) + "\n"
# ---------------------------------------------------------------------------
# compute_calibration
# ---------------------------------------------------------------------------
def _calibration_match(field: str, m_val: str, v_val: str, tol: float = 0.01) -> bool:
    """True when the manual and vision values agree for ``field``.

    Numeric fields compare with absolute tolerance ``tol``; values that do
    not parse as floats — and every non-numeric field — fall back to exact
    string equality after stripping whitespace.
    """
    if field in NUMERIC_CALIBRATION_FIELDS:
        try:
            delta = abs(float(m_val) - float(v_val))
        except ValueError:
            pass  # unparseable number: use the string comparison below
        else:
            return delta <= tol
    return (m_val or "").strip() == (v_val or "").strip()
def compute_calibration(
    csv_path: Path | str = "data/jurnal.csv",
) -> dict[str, Any]:
    """Pair calibration legs by ``screenshot_file`` and report per-field mismatch.

    Returns a dict ``{"n_pairs": int, "fields": {field: {match, mismatch,
    match_rate, mismatch_examples}}}``. ``mismatch_examples`` holds up to 3
    strings ``"<screenshot_file>: manual=X vs vision=Y"`` per field.

    Numeric fields (``entry/sl/tp0/tp1/tp2``) use a tolerance of 0.01;
    everything else is exact-string equality after strip.
    """
    rows = _load_rows(csv_path)
    # Index each calibration leg by screenshot; if a screenshot appears
    # twice in one leg, the later row silently wins.
    manual: dict[str, dict[str, str]] = {}
    vision: dict[str, dict[str, str]] = {}
    for r in rows:
        src = r.get("source", "")
        if src == "manual_calibration":
            manual[r.get("screenshot_file", "")] = r
        elif src == "vision_calibration":
            vision[r.get("screenshot_file", "")] = r
    # Only screenshots present in BOTH legs are comparable; unpaired rows
    # are ignored. Sorted for deterministic example ordering.
    paired_files = sorted(set(manual) & set(vision))
    fields_report: dict[str, dict[str, Any]] = {
        f: {
            "match": 0,
            "mismatch": 0,
            "match_rate": 0.0,
            "mismatch_examples": [],
        }
        for f in CORE_CALIBRATION_FIELDS
    }
    for f in paired_files:
        m = manual[f]
        v = vision[f]
        for fld in CORE_CALIBRATION_FIELDS:
            mv = m.get(fld, "")
            vv = v.get(fld, "")
            if _calibration_match(fld, mv, vv):
                fields_report[fld]["match"] += 1
            else:
                fields_report[fld]["mismatch"] += 1
                examples = fields_report[fld]["mismatch_examples"]
                if len(examples) < 3:  # cap examples to keep the report short
                    examples.append(f"{f}: manual={mv!r} vs vision={vv!r}")
    # Fill in per-field match rates (0.0 when there were no pairs at all).
    for fld, data in fields_report.items():
        total = data["match"] + data["mismatch"]
        data["match_rate"] = (data["match"] / total) if total else 0.0
    return {"n_pairs": len(paired_files), "fields": fields_report}
def render_calibration(cal: dict[str, Any]) -> str:
    """Render a :func:`compute_calibration` result as a text report.

    Always ends with a trailing newline. The P4 gate verdict is PASS when
    the overall mismatch rate is <= 10%.
    """
    lines: list[str] = []
    lines.append("=== Calibration P4 gate (vezi STOPPING_RULE.md §P4) ===")
    lines.append(f"Perechi calibration: {cal['n_pairs']}")
    # Early out: nothing to compare, short two-line report.
    if cal["n_pairs"] == 0:
        lines.append("(nu există perechi manual_calibration ∩ vision_calibration)")
        return "\n".join(lines) + "\n"
    lines.append("")
    lines.append(f"{'field':<14} match mismatch rate")
    total_mismatches = 0
    total_comparisons = 0
    for fld in CORE_CALIBRATION_FIELDS:
        d = cal["fields"][fld]
        n = d["match"] + d["mismatch"]
        total_mismatches += d["mismatch"]
        total_comparisons += n
        lines.append(
            f"{fld:<14} {d['match']:>5} {d['mismatch']:>8} "
            f"{_fmt_pct(d['match_rate'])}"
        )
    lines.append("")
    # Aggregate mismatch rate across every field, guarding the 0 division.
    overall_match_rate = (
        (total_comparisons - total_mismatches) / total_comparisons
        if total_comparisons
        else 0.0
    )
    overall_mismatch_rate = 1.0 - overall_match_rate
    verdict = "PASS" if overall_mismatch_rate <= 0.10 else "FAIL"
    lines.append(
        f"Overall mismatch rate: {_fmt_pct(overall_mismatch_rate)} "
        f"({total_mismatches}/{total_comparisons}) → P4 gate: {verdict}"
    )
    has_examples = any(
        cal["fields"][f]["mismatch_examples"] for f in CORE_CALIBRATION_FIELDS
    )
    if has_examples:
        lines.append("")
        lines.append("Mismatch examples (max 3 per field):")
        for fld in CORE_CALIBRATION_FIELDS:
            ex = cal["fields"][fld]["mismatch_examples"]
            if not ex:
                continue
            lines.append(f" [{fld}]")
            for e in ex:
                lines.append(f" - {e}")
    return "\n".join(lines) + "\n"
@@ -498,43 +515,37 @@ def main(argv: list[str] | None = None) -> int:
default=Path("data/jurnal.csv"),
help="Path to the jurnal CSV (default: data/jurnal.csv).",
)
parser.add_argument(
"--overlay",
choices=("pl_marius", "pl_theoretical"),
default="pl_marius",
help="Which P/L overlay to use (default: pl_marius).",
)
parser.add_argument(
"--calibration",
action="store_true",
help="Show P4 calibration mismatch report instead of backtest stats.",
)
parser.add_argument(
"--bootstrap-iterations",
type=int,
default=2000,
help="Bootstrap iterations for expectancy CI (default: 2000).",
)
parser.add_argument(
"--seed",
type=int,
default=None,
help="Seed for the bootstrap RNG (set for deterministic output).",
)
args = parser.parse_args(argv)
trades = load_trades(args.csv)
if args.calibration:
out = format_calibration_report(trades)
else:
out = format_report(
trades,
bootstrap_iterations=args.bootstrap_iterations,
seed=args.seed,
)
# Force UTF-8 on stdout: the report contains diacritics ("Clară", "Slabă")
# and a console codepage like cp1252 would crash on those.
try:
sys.stdout.reconfigure(encoding="utf-8") # type: ignore[attr-defined]
except (AttributeError, OSError):
pass
sys.stdout.write(out)
if args.calibration:
cal = compute_calibration(args.csv)
sys.stdout.write(render_calibration(cal))
else:
stats = compute_stats(args.csv, overlay=args.overlay)
sys.stdout.write(render_stats(stats, args.overlay))
return 0
if __name__ == "__main__":
raise SystemExit(main())
# Ensure the canonical CSV schema is importable from one place — fail fast if
# someone removes append_row.CSV_COLUMNS that this module depends on.
# Explicit check instead of `assert`: asserts are stripped under `python -O`,
# which would silently disable this import-time sanity check.
if CSV_COLUMNS is None:
    raise ImportError("scripts.append_row.CSV_COLUMNS is unavailable")