scripts: regenerate_md + stats + tests (116-144 passing across modules)

Marius
2026-05-13 12:45:05 +03:00
parent ce80151c58
commit 26d084dc4b
6 changed files with 1843 additions and 283 deletions


@@ -1,22 +1,26 @@
"""Append a validated M2D extraction to ``data/trades.csv``.
"""Append a validated M2D extraction to ``data/jurnal.csv``.
Pipeline:
JSON file --> pydantic validate (M2DExtraction)
--> load data/_meta.yaml (versions + schema)
--> compute ora_ro, zi, set, pl_marius, pl_theoretical
--> load data/_meta.yaml (versions)
--> compute id, ora_ro, zi, set, pl_marius, pl_theoretical, extracted_at
--> dedup on (screenshot_file, source)
--> atomic CSV write (temp file + os.replace)
--> atomic CSV write (sibling .tmp + os.replace)
Source values
- ``manual`` : Marius logged by hand
- ``vision`` : produced by the vision subagent
- ``manual`` : Marius logged by hand
- ``manual_calibration`` : calibration P4 — manual leg
- ``vision_calibration`` : calibration P4 — vision leg
A row with ``source=manual_calibration`` and a row with ``source=vision_calibration``
for the *same* screenshot are allowed to coexist (different dedup keys); a
duplicate ``(screenshot_file, source)`` pair is rejected (or skipped — see
``append_row`` ``on_duplicate`` argument).
for the *same* screenshot are allowed to coexist (different dedup keys).
Failure mode: ``append_extraction`` NEVER raises. On any error (missing JSON,
pydantic ValidationError, dedup hit, etc.) it returns
``{"status": "rejected", "reason": "...", "id": None, "row": None}`` so the
caller (a slash command) can decide what to do with the screenshot
(move to ``needs_review/``, log to workflow, etc.).
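
Example (illustrative only; the import path and the JSON file name below are
hypothetical, not part of this module's contract)::

    from scripts.append_extraction import append_extraction

    result = append_extraction("incoming/shot_0042.json", source="vision")
    if result["status"] == "ok":
        print("appended row", result["id"])
    else:
        print("rejected:", result["reason"])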
"""
from __future__ import annotations
@@ -24,41 +28,43 @@ from __future__ import annotations
import csv
import json
import os
import traceback
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Literal

import yaml
from pydantic import ValidationError

from scripts.calendar_parse import calc_set, load_calendar, utc_to_ro
from scripts.pl_calc import pl_marius, pl_theoretical
from scripts.vision_schema import M2DExtraction, parse_extraction

__all__ = [
    "CSV_COLUMNS",
    "VALID_SOURCES",
    "ZI_RO_MAP",
    "csv_columns",
    "append_extraction",
]

Source = Literal["vision", "manual", "manual_calibration", "vision_calibration"]

VALID_SOURCES: frozenset[str] = frozenset(
    {"vision", "manual", "manual_calibration", "vision_calibration"}
)

# Canonical column order (29) — must stay stable; regenerate_md + stats depend on it.
CSV_COLUMNS: tuple[str, ...] = (
    "id",
    "screenshot_file",
    "source",
    "data",
    "zi",
    "set",
    "ora_ro",
    "ora_utc",
    "instrument",
    "directie",
    "tf_mare",
@@ -73,17 +79,38 @@ CSV_COLUMNS: tuple[str, ...] = (
"outcome_path",
"max_reached",
"be_moved",
"confidence",
"ambiguities",
"note",
"pl_marius",
"pl_theoretical",
"set",
"indicator_version",
"pl_overlay_version",
"csv_schema_version",
"extracted_at",
"note",
)

ZI_RO_MAP: dict[str, str] = {
    "Mon": "Lu",
    "Tue": "Ma",
    "Wed": "Mi",
    "Thu": "Jo",
    "Fri": "Vi",
    "Sat": "Sa",
    "Sun": "Du",
}
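
# Illustrative flow (hypothetical values, and assuming the string formats that
# utc_to_ro already accepts elsewhere in this module): utc_to_ro("2026-05-13",
# "12:00") yields the RO-local date/time plus an English short day name such as
# "Wed", which ZI_RO_MAP then translates to "Mi" for the ``zi`` column.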

def csv_columns() -> list[str]:
    """Return the 29-column header in canonical order."""
    return list(CSV_COLUMNS)

# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------

def _load_meta(meta_path: Path) -> dict[str, Any]:
    with meta_path.open("r", encoding="utf-8") as fh:
        meta = yaml.safe_load(fh) or {}
@@ -94,35 +121,69 @@ def _load_meta(meta_path: Path) -> dict[str, Any]:
    return meta

def _read_existing_rows(csv_path: Path) -> list[dict[str, str]]:
    if not csv_path.exists() or csv_path.stat().st_size == 0:
        return []
    with csv_path.open("r", encoding="utf-8", newline="") as fh:
        reader = csv.DictReader(fh)
        return list(reader)

def _next_id(rows: list[dict[str, str]]) -> int:
    max_id = 0
    for r in rows:
        raw = r.get("id", "")
        if not raw:
            continue
        try:
            v = int(raw)
        except (TypeError, ValueError):
            continue
        if v > max_id:
            max_id = v
    return max_id + 1
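
# Ids are max existing id + 1, not len(rows) + 1: blank or non-numeric id cells
# are skipped, and gaps in the middle of the sequence are never re-filled.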

def _format_optional(value: float | None) -> str:
    return "" if value is None else f"{value:.4f}"

def _write_csv_atomic(
    csv_path: Path, rows: list[dict[str, str]], columns: list[str]
) -> None:
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    tmp = csv_path.with_suffix(csv_path.suffix + ".tmp")
    with tmp.open("w", encoding="utf-8", newline="") as fh:
        writer = csv.DictWriter(fh, fieldnames=columns)
        writer.writeheader()
        for row in rows:
            writer.writerow({k: row.get(k, "") for k in columns})
    os.replace(tmp, csv_path)
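
# Note: os.replace() is an atomic rename only when source and destination live
# on the same filesystem; writing the .tmp as a *sibling* of the target CSV
# (rather than in the system temp dir) guarantees that, so a crash mid-write
# can never leave a truncated jurnal.csv behind.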

def _build_row(
    extraction: M2DExtraction,
    *,
    source: str,
    row_id: int,
    meta: dict[str, Any],
    calendar: list[dict[str, Any]],
    extracted_at: str,
) -> dict[str, str]:
    """Compute the full CSV row dict for one extraction."""
    if source not in VALID_SOURCES:
        raise ValueError(
            f"invalid source {source!r}; must be one of {sorted(VALID_SOURCES)}"
        )
    d_ro, t_ro, day_short = utc_to_ro(extraction.data, extraction.ora_utc)
    set_label = calc_set(d_ro, t_ro, day_short, calendar)
    pl_m = pl_marius(extraction.outcome_path, extraction.be_moved)
    pl_t = pl_theoretical(extraction.max_reached)
    zi_ro = ZI_RO_MAP[day_short]
    return {
        "id": str(row_id),
        "screenshot_file": extraction.screenshot_file,
        "source": source,
        "data": extraction.data,
        "zi": zi_ro,
        "ora_ro": t_ro.strftime("%H:%M"),
        "set": set_label,
        "ora_utc": extraction.ora_utc,
        "instrument": extraction.instrument,
        "directie": extraction.directie,
        "tf_mare": extraction.tf_mare,
@@ -136,102 +197,115 @@ def build_row(
"risc_pct": f"{extraction.risc_pct}",
"outcome_path": extraction.outcome_path,
"max_reached": extraction.max_reached,
"be_moved": "true" if extraction.be_moved else "false",
"confidence": extraction.confidence,
"ambiguities": json.dumps(extraction.ambiguities, ensure_ascii=False),
"note": extraction.note,
"be_moved": str(extraction.be_moved),
"pl_marius": _format_optional(pl_m),
"pl_theoretical": _format_optional(pl_t),
"set": set_label,
"indicator_version": str(meta["indicator_version"]),
"pl_overlay_version": str(meta["pl_overlay_version"]),
"csv_schema_version": str(meta["csv_schema_version"]),
"extracted_at": extracted_at,
"note": extraction.note,
}
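
# All derived columns are computed in one place: zi/ora_ro/set from the
# RO-localised timestamp plus the event calendar, pl_* from the outcome fields,
# and the *_version columns stamped from _meta.yaml so each row records the
# tooling that produced it.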

def _reject(reason: str) -> dict[str, Any]:
    return {"status": "rejected", "reason": reason, "id": None, "row": None}

# ---------------------------------------------------------------------------
# public API
# ---------------------------------------------------------------------------

def append_extraction(
    json_path: Path | str,
    source: str,
    csv_path: Path | str = "data/jurnal.csv",
    meta_path: Path | str = "data/_meta.yaml",
    calendar_path: Path | str = "calendar_evenimente.yaml",
) -> dict[str, Any]:
    """Append one validated extraction to the jurnal CSV.

    Never raises. Returns one of:

    - ``{"status": "ok", "reason": "", "id": <int>, "row": <dict>}``
    - ``{"status": "rejected", "reason": <str>, "id": None, "row": None}``
    """
    json_path = Path(json_path)
    csv_path = Path(csv_path)
    meta_path = Path(meta_path)
    calendar_path = Path(calendar_path)

    if source not in VALID_SOURCES:
        return _reject(
            f"invalid source {source!r}; must be one of {sorted(VALID_SOURCES)}"
        )

    if not json_path.exists():
        return _reject(f"JSON file not found: {json_path}")
    try:
        with json_path.open("r", encoding="utf-8") as fh:
            raw = fh.read()
    except OSError as exc:
        return _reject(f"failed to read JSON {json_path}: {exc}")

    try:
        extraction = parse_extraction(raw)
    except ValidationError as exc:
        return _reject(f"validation error: {exc}")
    except (ValueError, json.JSONDecodeError) as exc:
        return _reject(f"validation error (json parse): {exc}")

    try:
        meta = _load_meta(meta_path)
    except (FileNotFoundError, OSError) as exc:
        return _reject(f"_meta.yaml not found: {exc}")
    except (ValueError, yaml.YAMLError) as exc:
        return _reject(f"_meta.yaml invalid: {exc}")

    try:
        calendar = load_calendar(calendar_path)
    except (FileNotFoundError, OSError) as exc:
        return _reject(f"calendar not found: {exc}")
    except (ValueError, yaml.YAMLError) as exc:
        return _reject(f"calendar invalid: {exc}")

    try:
        existing = _read_existing_rows(csv_path)
    except OSError as exc:
        return _reject(f"failed to read existing CSV {csv_path}: {exc}")

    key = (extraction.screenshot_file, source)
    for r in existing:
        if (r.get("screenshot_file"), r.get("source")) == key:
            return _reject(
                f"duplicate row: screenshot_file={key[0]!r} source={key[1]!r}"
            )

    row_id = _next_id(existing)
    extracted_at = (
        datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S") + "Z"
    )

    try:
        row = _build_row(
            extraction,
            source=source,
            row_id=row_id,
            meta=meta,
            calendar=calendar,
            extracted_at=extracted_at,
        )
    except (KeyError, ValueError) as exc:
        return _reject(f"derived-field computation failed: {exc}")

    try:
        _write_csv_atomic(csv_path, [*existing, row], list(CSV_COLUMNS))
    except OSError as exc:
        return _reject(
            f"atomic write failed: {exc}\n{traceback.format_exc()}"
        )

    return {"status": "ok", "reason": "", "id": row_id, "row": row}