"""Append a validated M2D extraction to ``data/trades.csv``. Pipeline: JSON file --> pydantic validate (M2DExtraction) --> load data/_meta.yaml (versions + schema) --> compute ora_ro, zi, set, pl_marius, pl_theoretical --> dedup on (screenshot_file, source) --> atomic CSV write (temp file + os.replace) Source values - ``manual`` : Marius logged by hand - ``vision`` : produced by the vision subagent - ``manual_calibration`` : calibration P4 — manual leg - ``vision_calibration`` : calibration P4 — vision leg A row with ``source=manual_calibration`` and a row with ``source=vision_calibration`` for the *same* screenshot are allowed to coexist (different dedup keys); a duplicate ``(screenshot_file, source)`` pair is rejected (or skipped — see ``append_row`` ``on_duplicate`` argument). """ from __future__ import annotations import csv import json import os import tempfile from pathlib import Path from typing import Any, Literal import yaml from scripts.calendar_parse import calc_set, load_calendar, utc_to_ro from scripts.pl_calc import pl_marius, pl_theoretical from scripts.vision_schema import M2DExtraction, parse_extraction_dict __all__ = [ "CSV_COLUMNS", "VALID_SOURCES", "build_row", "read_rows", "append_row", "append_row_from_json", ] Source = Literal["manual", "vision", "manual_calibration", "vision_calibration"] VALID_SOURCES: frozenset[str] = frozenset( {"manual", "vision", "manual_calibration", "vision_calibration"} ) CSV_COLUMNS: tuple[str, ...] = ( "screenshot_file", "source", "data", "ora_utc", "ora_ro", "zi", "set", "instrument", "directie", "tf_mare", "tf_mic", "calitate", "entry", "sl", "tp0", "tp1", "tp2", "risc_pct", "outcome_path", "max_reached", "be_moved", "confidence", "ambiguities", "note", "pl_marius", "pl_theoretical", "indicator_version", "pl_overlay_version", "csv_schema_version", ) def _load_meta(meta_path: Path) -> dict[str, Any]: with meta_path.open("r", encoding="utf-8") as fh: meta = yaml.safe_load(fh) or {} required = ("indicator_version", "pl_overlay_version", "csv_schema_version") missing = [k for k in required if k not in meta] if missing: raise ValueError(f"_meta.yaml missing required keys: {missing}") return meta def _format_optional(value: float | None) -> str: return "" if value is None else f"{value:.4f}" def build_row( extraction: M2DExtraction, source: str, meta: dict[str, Any], calendar: list[dict[str, Any]], ) -> dict[str, str]: """Compute the full CSV row dict for one extraction.""" if source not in VALID_SOURCES: raise ValueError( f"invalid source {source!r}; must be one of {sorted(VALID_SOURCES)}" ) d_ro, t_ro, zi = utc_to_ro(extraction.data, extraction.ora_utc) set_label = calc_set(d_ro, t_ro, zi, calendar) pl_m = pl_marius(extraction.outcome_path, extraction.be_moved) pl_t = pl_theoretical(extraction.max_reached) return { "screenshot_file": extraction.screenshot_file, "source": source, "data": extraction.data, "ora_utc": extraction.ora_utc, "ora_ro": t_ro.strftime("%H:%M"), "zi": zi, "set": set_label, "instrument": extraction.instrument, "directie": extraction.directie, "tf_mare": extraction.tf_mare, "tf_mic": extraction.tf_mic, "calitate": extraction.calitate, "entry": f"{extraction.entry}", "sl": f"{extraction.sl}", "tp0": f"{extraction.tp0}", "tp1": f"{extraction.tp1}", "tp2": f"{extraction.tp2}", "risc_pct": f"{extraction.risc_pct}", "outcome_path": extraction.outcome_path, "max_reached": extraction.max_reached, "be_moved": "true" if extraction.be_moved else "false", "confidence": extraction.confidence, "ambiguities": json.dumps(extraction.ambiguities, ensure_ascii=False), "note": extraction.note, "pl_marius": _format_optional(pl_m), "pl_theoretical": _format_optional(pl_t), "indicator_version": str(meta["indicator_version"]), "pl_overlay_version": str(meta["pl_overlay_version"]), "csv_schema_version": str(meta["csv_schema_version"]), } def read_rows(csv_path: Path) -> list[dict[str, str]]: """Read existing rows; return [] if the file does not exist or is empty.""" if not csv_path.exists() or csv_path.stat().st_size == 0: return [] with csv_path.open("r", encoding="utf-8", newline="") as fh: reader = csv.DictReader(fh) return list(reader) def _atomic_write(csv_path: Path, rows: list[dict[str, str]]) -> None: csv_path.parent.mkdir(parents=True, exist_ok=True) fd, tmp_name = tempfile.mkstemp( prefix=csv_path.name + ".", suffix=".tmp", dir=str(csv_path.parent), ) try: with os.fdopen(fd, "w", encoding="utf-8", newline="") as fh: writer = csv.DictWriter(fh, fieldnames=list(CSV_COLUMNS)) writer.writeheader() for r in rows: writer.writerow({k: r.get(k, "") for k in CSV_COLUMNS}) os.replace(tmp_name, csv_path) except Exception: try: os.unlink(tmp_name) except OSError: pass raise def append_row( extraction: M2DExtraction, source: str, csv_path: Path, meta_path: Path, calendar_path: Path, on_duplicate: Literal["raise", "skip"] = "raise", ) -> dict[str, str]: """Append one extraction to the CSV. Dedup key: ``(screenshot_file, source)``. If a row with the same key already exists, behaviour is controlled by ``on_duplicate``: - ``"raise"`` (default): raise ``ValueError``. - ``"skip"``: leave the CSV untouched and return the *existing* row. """ meta = _load_meta(meta_path) calendar = load_calendar(calendar_path) row = build_row(extraction, source, meta, calendar) existing = read_rows(csv_path) key = (row["screenshot_file"], row["source"]) for r in existing: if (r.get("screenshot_file"), r.get("source")) == key: if on_duplicate == "skip": return r raise ValueError( f"duplicate row: screenshot_file={key[0]!r} source={key[1]!r} " f"already exists in {csv_path}" ) existing.append(row) _atomic_write(csv_path, existing) return row def append_row_from_json( json_path: Path, source: str, csv_path: Path, meta_path: Path, calendar_path: Path, on_duplicate: Literal["raise", "skip"] = "raise", ) -> dict[str, str]: """Convenience wrapper: load JSON, validate, append.""" with Path(json_path).open("r", encoding="utf-8") as fh: payload = json.load(fh) extraction = parse_extraction_dict(payload) return append_row( extraction=extraction, source=source, csv_path=csv_path, meta_path=meta_path, calendar_path=calendar_path, on_duplicate=on_duplicate, )