feat(cli): atm validate-calibration — offline color classification gate

Adds `atm validate-calibration LABEL_FILE` subcommand that runs the Detector on a set of labeled PNG frames and reports per-sample PASS/FAIL with top-3 candidate colors and RGB-distance suggestions for failures. Exits 0 on 100% PASS, 1 on any FAIL, 2 on missing/malformed label file. - New module src/atm/validate.py with ValidationReport + SampleRecord dataclasses; reuses Detector.step(frame), does not reimplement color classification. - main.py: new `validate-calibration` subparser and _cmd_validate_calibration handler wired into the dispatch map. - samples/calibration_labels.json seeded with 3 entries from the 2026-04-17 incident, plus a README describing the schema. - tests/test_validate.py covers the 3 planned cases: PASS, FAIL w/ top-3 + suggestion, missing file (graceful error, no traceback). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-18 11:54:48 +03:00
parent 23865776e3
commit 8bae507bbd
5 changed files with 535 additions and 0 deletions
--- a/samples/calibration_labels.README.md
+++ b/samples/calibration_labels.README.md
@@ -0,0 +1,33 @@
+# calibration_labels.json — schema
+
+Used by `atm validate-calibration` to check that the current color calibration
+classifies known-good screenshots correctly before a live session.
+
+## Schema
+
+A JSON array of entries. Each entry:
+
+| Field      | Type    | Required | Description                                                    |
+|------------|---------|----------|----------------------------------------------------------------|
+| `path`     | string  | yes      | Path to a PNG frame (relative to CWD or absolute).             |
+| `expected` | string  | yes      | Expected color name: one of `turquoise`, `yellow`, `dark_green`, `dark_red`, `light_green`, `light_red`, `gray`. |
+| `note`     | string  | no       | Freeform annotation; shown in SUGGESTIONS output.              |
+
+## Usage
+
+```bash
+atm validate-calibration samples/calibration_labels.json
+```
+
+Exit codes:
+- `0` — every sample PASS
+- `1` — one or more FAIL
+- `2` — label file missing or malformed JSON
+
+## Adding new samples
+
+1. Find a screenshot in `logs/fires/` whose dot color you can verify by eye.
+2. Append an entry with `path`, `expected`, and an optional `note`.
+3. Re-run validation. If it FAILs, the SUGGESTIONS section will tell you the
+   RGB distance between the observed pixel and the expected color's center —
+   use that as input for `atm calibrate`.
--- a/samples/calibration_labels.json
+++ b/samples/calibration_labels.json
@@ -0,0 +1,17 @@
+[
+  {
+    "path": "logs/fires/20260417_201500_arm_sell.png",
+    "expected": "yellow",
+    "note": "first arm of SELL cycle 2026-04-17"
+  },
+  {
+    "path": "logs/fires/20260417_205302_ss.png",
+    "expected": "dark_red",
+    "note": "user confirmed via screenshot (missed live alert)"
+  },
+  {
+    "path": "logs/fires/20260417_210441_ss.png",
+    "expected": "light_red",
+    "note": "fire phase (missed live alert)"
+  }
+]
--- a/src/atm/main.py
+++ b/src/atm/main.py
@@ -135,6 +135,16 @@ def main(argv=None) -> None:
        metavar="PATH", help="Journal JSONL file (default: trades.jsonl)",
    )

+    # validate-calibration
+    p_valid = sub.add_parser(
+        "validate-calibration",
+        help="Offline: run Detector on labeled frames and report PASS/FAIL",
+    )
+    p_valid.add_argument(
+        "label_file", type=Path, metavar="LABEL_FILE",
+        help="JSON array with [{path, expected, note?}, ...] entries",
+    )
+
    args = parser.parse_args(argv)

    _dispatch = {
@@ -145,6 +155,7 @@ def main(argv=None) -> None:
        "debug": _cmd_debug,
        "journal": _cmd_journal,
        "report": _cmd_report,
+        "validate-calibration": _cmd_validate_calibration,
    }
    _dispatch[args.command](args)

@@ -418,6 +429,37 @@ def _cmd_report(args) -> None:
    )


+def _cmd_validate_calibration(args) -> None:
+    """Run offline calibration validation; exit 0 on 100% PASS, 1 otherwise."""
+    try:
+        from atm.validate import validate_calibration, ValidationError
+    except ImportError as exc:
+        sys.exit(f"validate module not available: {exc}")
+
+    label_file = Path(args.label_file)
+    try:
+        cfg = Config.load_current(Path("configs"))
+    except FileNotFoundError as exc:
+        sys.exit(f"config not found: {exc}")
+
+    try:
+        config_name = ""
+        cur_ptr = Path("configs") / "current.txt"
+        if cur_ptr.exists():
+            config_name = cur_ptr.read_text(encoding="utf-8").strip()
+    except Exception:
+        config_name = ""
+
+    try:
+        report = validate_calibration(label_file, cfg, config_name=config_name)
+    except ValidationError as exc:
+        print(f"error: {exc}", file=sys.stderr)
+        sys.exit(2)
+
+    print(report.render())
+    sys.exit(0 if report.all_pass else 1)
+
+
 # ---------------------------------------------------------------------------
 # Live loop
 # ---------------------------------------------------------------------------
--- a/src/atm/validate.py
+++ b/src/atm/validate.py
@@ -0,0 +1,229 @@
+"""Offline calibration validation: run Detector on labeled frames, report PASS/FAIL.
+
+Used by the `atm validate-calibration` subcommand. Reports per-sample detection
+results against expected labels, and for failures, computes RGB distance to
+each color threshold and emits tuning suggestions.
+
+Reuses `Detector.step(frame)` - does NOT reimplement color classification.
+"""
+from __future__ import annotations
+
+import json
+import math
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from .config import Config
+
+
+@dataclass
+class SampleRecord:
+    path: str
+    expected: str
+    detected: str | None
+    confidence: float
+    rgb: tuple[int, int, int] | None
+    top3: list[tuple[str, float]]  # [(name, score), ...] ranked by RGB distance
+    passed: bool
+    note: str = ""
+    error: str | None = None  # non-None if frame load failed / schema bad
+
+
+@dataclass
+class ValidationReport:
+    records: list[SampleRecord] = field(default_factory=list)
+    config_name: str = ""
+
+    @property
+    def total(self) -> int:
+        return len(self.records)
+
+    @property
+    def passed(self) -> int:
+        return sum(1 for r in self.records if r.passed)
+
+    @property
+    def failed(self) -> int:
+        return self.total - self.passed
+
+    @property
+    def all_pass(self) -> bool:
+        return self.total > 0 and self.failed == 0
+
+    def render(self) -> str:
+        lines: list[str] = []
+        hdr = f"Testing {self.total} frames"
+        if self.config_name:
+            hdr += f" against config {self.config_name}"
+        hdr += "..."
+        lines.append(hdr)
+        lines.append("")
+
+        for r in self.records:
+            name = Path(r.path).name or r.path
+            if r.error:
+                lines.append(f"  [FAIL] {name}")
+                lines.append(f"         error: {r.error}")
+                continue
+            tag = "PASS" if r.passed else "FAIL"
+            rgb_str = f"RGB {r.rgb}" if r.rgb is not None else "RGB n/a"
+            detected = r.detected if r.detected is not None else "none"
+            lines.append(f"  [{tag}] {name}")
+            lines.append(
+                f"         expected={r.expected}  detected={detected}  "
+                f"(conf {r.confidence:.2f}, {rgb_str})"
+            )
+            if not r.passed and r.top3:
+                top3_str = " ".join(f"{n}({c:.2f})" for n, c in r.top3)
+                lines.append(f"         Top 3 candidates: {top3_str}")
+
+        lines.append("")
+        pct = (self.passed / self.total * 100.0) if self.total else 0.0
+        lines.append(f"SUMMARY: {self.passed}/{self.total} PASS ({pct:.0f}%)")
+
+        fails = [r for r in self.records if not r.passed]
+        if fails:
+            lines.append("FAILED:")
+            for r in fails:
+                name = Path(r.path).name or r.path
+                if r.error:
+                    lines.append(f"  - {name}: {r.error}")
+                    continue
+                detected = r.detected if r.detected is not None else "none"
+                lines.append(
+                    f"  - {name}: expected {r.expected}, got {detected}"
+                )
+
+            sug_lines = [
+                r._suggestion  # type: ignore[attr-defined]
+                for r in fails
+                if getattr(r, "_suggestion", "")
+            ]
+            if sug_lines:
+                lines.append("")
+                lines.append("SUGGESTIONS:")
+                for s in sug_lines:
+                    lines.append(f"  - {s}")
+
+        return "\n".join(lines)
+
+    def __str__(self) -> str:
+        return self.render()
+
+
+class ValidationError(Exception):
+    """Raised for missing label files or invalid schema."""
+
+
+def _rgb_distance(a: tuple[int, int, int], b: tuple[int, int, int]) -> float:
+    return math.sqrt(sum((a[i] - b[i]) ** 2 for i in range(3)))
+
+
+def _load_labels(label_file: Path) -> list[dict[str, Any]]:
+    if not label_file.exists():
+        raise ValidationError(f"label file not found: {label_file}")
+    try:
+        data = json.loads(label_file.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as exc:
+        raise ValidationError(f"invalid JSON in {label_file}: {exc}") from exc
+    if not isinstance(data, list):
+        raise ValidationError(
+            f"label file must be a JSON array; got {type(data).__name__}"
+        )
+    return data
+
+
+def validate_calibration(
+    label_file: Path,
+    cfg: Config,
+    config_name: str = "",
+) -> ValidationReport:
+    """Run Detector on each labeled frame; return a ValidationReport.
+
+    Reuses `Detector.step(frame)`. Loads frames via cv2.imread.
+    Raises ValidationError if the label file is missing or malformed.
+    """
+    import cv2  # local import keeps module import cheap
+    from .detector import Detector
+
+    entries = _load_labels(label_file)
+    report = ValidationReport(config_name=config_name)
+
+    palette = {
+        name: spec.rgb
+        for name, spec in cfg.colors.items()
+        if name != "background"
+    }
+
+    detector = Detector(cfg=cfg, capture=lambda: None)
+
+    for entry in entries:
+        path = str(entry.get("path", ""))
+        expected = str(entry.get("expected", ""))
+        note = str(entry.get("note", ""))
+
+        if not path or not expected:
+            rec = SampleRecord(
+                path=path, expected=expected, detected=None, confidence=0.0,
+                rgb=None, top3=[], passed=False, note=note,
+                error="missing 'path' or 'expected' field",
+            )
+            rec._suggestion = ""  # type: ignore[attr-defined]
+            report.records.append(rec)
+            continue
+
+        frame = cv2.imread(path)
+        if frame is None:
+            rec = SampleRecord(
+                path=path, expected=expected, detected=None, confidence=0.0,
+                rgb=None, top3=[], passed=False, note=note,
+                error=f"cv2.imread failed for {path}",
+            )
+            rec._suggestion = ""  # type: ignore[attr-defined]
+            report.records.append(rec)
+            continue
+
+        result = detector.step(ts=0.0, frame=frame)
+
+        match = result.match
+        if match is None:
+            detected: str | None = None
+            confidence = 0.0
+        else:
+            detected = match.name if match.name != "UNKNOWN" else None
+            confidence = match.confidence
+
+        rgb = result.rgb
+
+        # Top 3 candidates: rank palette entries by RGB distance to observed.
+        top3: list[tuple[str, float]] = []
+        if rgb is not None:
+            scored: list[tuple[str, float]] = []
+            for name, ref in palette.items():
+                scored.append((name, _rgb_distance(rgb, ref)))
+            scored.sort(key=lambda t: t[1])
+            top3 = [(n, 1.0 / (1.0 + d / 20.0)) for n, d in scored[:3]]
+
+        passed = detected == expected
+
+        rec = SampleRecord(
+            path=path, expected=expected, detected=detected,
+            confidence=confidence, rgb=rgb, top3=top3, passed=passed, note=note,
+        )
+
+        if not passed and rgb is not None and expected in palette:
+            ref = palette[expected]
+            tol = cfg.colors[expected].tolerance
+            dist = _rgb_distance(rgb, ref)
+            rec._suggestion = (  # type: ignore[attr-defined]
+                f"{expected} praguri curente: RGB{ref} +/- {tol:.0f}. "
+                f"Pixelul observat {rgb} e la distanta {dist:.1f} "
+                f"-> recalibreaza cu acest sample."
+            )
+        else:
+            rec._suggestion = ""  # type: ignore[attr-defined]
+
+        report.records.append(rec)
+
+    return report
--- a/tests/test_validate.py
+++ b/tests/test_validate.py
@@ -0,0 +1,214 @@
+"""Tests for atm.validate — offline calibration validation.
+
+Covers the 3 tests from plan section D':
+  17. test_validate_calibration_pass
+  18. test_validate_calibration_fail_reports_top_candidates
+  19. test_validate_calibration_file_not_found
+"""
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import numpy as np
+import pytest
+
+from atm.config import (
+    CanaryRegion,
+    ColorSpec,
+    Config,
+    DiscordCfg,
+    ROI,
+    TelegramCfg,
+    YAxisCalib,
+)
+from atm.detector import DetectionResult
+from atm.vision import ColorMatch
+
+
+def _make_config() -> Config:
+    """Minimal Config with a palette large enough to support top-3 candidates."""
+    colors = {
+        "turquoise":   ColorSpec(rgb=(0, 200, 200),    tolerance=30),
+        "yellow":      ColorSpec(rgb=(255, 255, 0),    tolerance=30),
+        "dark_green":  ColorSpec(rgb=(0, 100, 0),      tolerance=30),
+        "dark_red":    ColorSpec(rgb=(165, 42, 42),    tolerance=30),
+        "light_green": ColorSpec(rgb=(144, 238, 144),  tolerance=30),
+        "light_red":   ColorSpec(rgb=(255, 182, 193),  tolerance=30),
+        "gray":        ColorSpec(rgb=(128, 128, 128),  tolerance=30),
+        "background":  ColorSpec(rgb=(18, 18, 18),     tolerance=15),
+    }
+    return Config(
+        window_title="test",
+        dot_roi=ROI(x=0, y=0, w=100, h=100),
+        chart_roi=ROI(x=0, y=0, w=100, h=100),
+        colors=colors,
+        y_axis=YAxisCalib(p1_y=0, p1_price=100.0, p2_y=100, p2_price=0.0),
+        canary=CanaryRegion(
+            roi=ROI(x=0, y=0, w=10, h=10),
+            baseline_phash="0" * 64,
+        ),
+        discord=DiscordCfg(webhook_url="http://localhost/fake"),
+        telegram=TelegramCfg(bot_token="fake_token", chat_id="123"),
+        debounce_depth=1,
+    )
+
+
+def _write_labels(tmp_path: Path, entries: list[dict]) -> Path:
+    f = tmp_path / "labels.json"
+    f.write_text(json.dumps(entries), encoding="utf-8")
+    return f
+
+
+def _write_blank_png(tmp_path: Path, name: str) -> Path:
+    """Write a trivially-valid 10x10 BGR image so cv2.imread returns non-None."""
+    import cv2
+    p = tmp_path / name
+    arr = np.zeros((10, 10, 3), dtype=np.uint8)
+    cv2.imwrite(str(p), arr)
+    return p
+
+
+# ---------------------------------------------------------------------------
+# Test 17: PASS path — mocked Detector.step returns expected color
+# ---------------------------------------------------------------------------
+
+def test_validate_calibration_pass(monkeypatch, tmp_path):
+    from atm import validate as validate_mod
+
+    img_path = _write_blank_png(tmp_path, "yellow_sample.png")
+    labels = _write_labels(
+        tmp_path,
+        [{"path": str(img_path), "expected": "yellow", "note": "test"}],
+    )
+
+    def fake_step(self, ts, frame=None):
+        return DetectionResult(
+            ts=ts,
+            window_found=True,
+            dot_found=True,
+            rgb=(250, 250, 5),
+            match=ColorMatch(name="yellow", distance=6.0, confidence=0.94),
+            accepted=True,
+            color="yellow",
+        )
+
+    monkeypatch.setattr("atm.detector.Detector.step", fake_step)
+
+    report = validate_mod.validate_calibration(labels, _make_config())
+
+    assert report.total == 1
+    assert report.passed == 1
+    assert report.failed == 0
+    assert report.all_pass is True
+    rec = report.records[0]
+    assert rec.passed is True
+    assert rec.detected == "yellow"
+    assert rec.expected == "yellow"
+    assert "[PASS]" in report.render()
+
+    # CLI wiring: exit 0
+    import atm.main as _main
+
+    class _Args:
+        label_file = labels
+
+    monkeypatch.setattr("atm.config.Config.load_current", classmethod(lambda cls, d: _make_config()))
+    with pytest.raises(SystemExit) as exc_info:
+        _main._cmd_validate_calibration(_Args())
+    assert exc_info.value.code == 0
+
+
+# ---------------------------------------------------------------------------
+# Test 18: FAIL path — Detector returns wrong color; report lists top 3
+#                      candidates and a SUGGESTIONS line with RGB distance.
+# ---------------------------------------------------------------------------
+
+def test_validate_calibration_fail_reports_top_candidates(monkeypatch, tmp_path):
+    from atm import validate as validate_mod
+
+    img_path = _write_blank_png(tmp_path, "dark_red_sample.png")
+    labels = _write_labels(
+        tmp_path,
+        [{"path": str(img_path), "expected": "dark_red", "note": "missed dark_red"}],
+    )
+
+    # Observed RGB closer to gray than dark_red (like the real 2026-04-17 miss).
+    def fake_step(self, ts, frame=None):
+        return DetectionResult(
+            ts=ts,
+            window_found=True,
+            dot_found=True,
+            rgb=(135, 62, 67),
+            match=ColorMatch(name="gray", distance=45.0, confidence=0.12),
+            accepted=True,
+            color="gray",
+        )
+
+    monkeypatch.setattr("atm.detector.Detector.step", fake_step)
+
+    report = validate_mod.validate_calibration(labels, _make_config())
+
+    assert report.total == 1
+    assert report.failed == 1
+    assert report.all_pass is False
+
+    rec = report.records[0]
+    assert rec.passed is False
+    assert rec.detected == "gray"
+    assert rec.expected == "dark_red"
+    # Top 3 candidates populated (name, score) sorted by RGB distance.
+    assert len(rec.top3) == 3
+    names = [n for n, _ in rec.top3]
+    # dark_red should appear in top candidates since observed RGB(135,62,67)
+    # is reasonably close to dark_red(165,42,42).
+    assert "dark_red" in names
+
+    rendered = report.render()
+    assert "[FAIL]" in rendered
+    assert "Top 3 candidates:" in rendered
+    assert "SUGGESTIONS:" in rendered
+    # The suggestion must mention the expected color's RGB and the measured distance.
+    assert "dark_red" in rendered
+    assert "(165, 42, 42)" in rendered
+
+    # CLI wiring: exit 1
+    import atm.main as _main
+
+    class _Args:
+        label_file = labels
+
+    monkeypatch.setattr("atm.config.Config.load_current", classmethod(lambda cls, d: _make_config()))
+    with pytest.raises(SystemExit) as exc_info:
+        _main._cmd_validate_calibration(_Args())
+    assert exc_info.value.code == 1
+
+
+# ---------------------------------------------------------------------------
+# Test 19: missing label file — clean error, non-zero exit, no stack trace
+# ---------------------------------------------------------------------------
+
+def test_validate_calibration_file_not_found(monkeypatch, tmp_path, capsys):
+    from atm import validate as validate_mod
+
+    missing = tmp_path / "nope.json"
+
+    # Library-level: raises ValidationError (not bare FileNotFoundError).
+    with pytest.raises(validate_mod.ValidationError) as exc_info:
+        validate_mod.validate_calibration(missing, _make_config())
+    assert "not found" in str(exc_info.value).lower()
+
+    # CLI-level: graceful sys.exit with non-zero code, message on stderr.
+    import atm.main as _main
+
+    class _Args:
+        label_file = missing
+
+    monkeypatch.setattr("atm.config.Config.load_current", classmethod(lambda cls, d: _make_config()))
+    with pytest.raises(SystemExit) as exc_info:
+        _main._cmd_validate_calibration(_Args())
+    assert exc_info.value.code != 0
+    err = capsys.readouterr().err
+    assert "not found" in err.lower()
+    # Ensure no python traceback leaked through.
+    assert "Traceback" not in err