scripts: regenerate_md + stats + tests (116-144 passing across modules)

Marius
2026-05-13 12:45:05 +03:00
parent ce80151c58
commit 26d084dc4b
6 changed files with 1843 additions and 283 deletions


@@ -1,22 +1,26 @@
"""Append a validated M2D extraction to ``data/trades.csv``.
"""Append a validated M2D extraction to ``data/jurnal.csv``.
Pipeline:
JSON file --> pydantic validate (M2DExtraction)
--> load data/_meta.yaml (versions + schema)
--> compute ora_ro, zi, set, pl_marius, pl_theoretical
--> load data/_meta.yaml (versions)
--> compute id, ora_ro, zi, set, pl_marius, pl_theoretical, extracted_at
--> dedup on (screenshot_file, source)
--> atomic CSV write (temp file + os.replace)
--> atomic CSV write (sibling .tmp + os.replace)
Source values
- ``manual`` : Marius logged by hand
- ``vision`` : produced by the vision subagent
- ``manual`` : Marius logged by hand
- ``manual_calibration`` : calibration P4 — manual leg
- ``vision_calibration`` : calibration P4 — vision leg
A row with ``source=manual_calibration`` and a row with ``source=vision_calibration``
for the *same* screenshot are allowed to coexist (different dedup keys); a
duplicate ``(screenshot_file, source)`` pair is rejected (or skipped — see
``append_row`` ``on_duplicate`` argument).
for the *same* screenshot are allowed to coexist (different dedup keys).
Failure mode: ``append_extraction`` NEVER raises. On any error (missing JSON,
pydantic ValidationError, dedup hit, etc.) it returns
``{"status": "rejected", "reason": "...", "id": None, "row": None}`` so the
caller (a slash command) can decide what to do with the screenshot
(move to ``needs_review/``, log to workflow, etc.).
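
Example (illustrative only; the import path and the JSON file name below are
hypothetical, not part of this module's contract)::

    from scripts.append_extraction import append_extraction

    result = append_extraction("incoming/shot_0042.json", source="vision")
    if result["status"] == "ok":
        print("appended row", result["id"])
    else:
        print("rejected:", result["reason"])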
"""
from __future__ import annotations
@@ -24,41 +28,43 @@ from __future__ import annotations
import csv
import json
import os
import traceback
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Literal

import yaml
from pydantic import ValidationError

from scripts.calendar_parse import calc_set, load_calendar, utc_to_ro
from scripts.pl_calc import pl_marius, pl_theoretical
from scripts.vision_schema import M2DExtraction, parse_extraction

__all__ = [
    "CSV_COLUMNS",
    "VALID_SOURCES",
    "ZI_RO_MAP",
    "csv_columns",
    "append_extraction",
]

Source = Literal["vision", "manual", "manual_calibration", "vision_calibration"]

VALID_SOURCES: frozenset[str] = frozenset(
    {"vision", "manual", "manual_calibration", "vision_calibration"}
)

# Canonical column order (29) — must stay stable; regenerate_md + stats depend on it.
CSV_COLUMNS: tuple[str, ...] = (
    "id",
    "screenshot_file",
    "source",
    "data",
    "zi",
    "set",
    "ora_ro",
    "ora_utc",
    "instrument",
    "directie",
    "tf_mare",
@@ -73,17 +79,38 @@ CSV_COLUMNS: tuple[str, ...] = (
"outcome_path",
"max_reached",
"be_moved",
"confidence",
"ambiguities",
"note",
"pl_marius",
"pl_theoretical",
"set",
"indicator_version",
"pl_overlay_version",
"csv_schema_version",
"extracted_at",
"note",
)

ZI_RO_MAP: dict[str, str] = {
    "Mon": "Lu",
    "Tue": "Ma",
    "Wed": "Mi",
    "Thu": "Jo",
    "Fri": "Vi",
    "Sat": "Sa",
    "Sun": "Du",
}
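
# Illustrative flow (hypothetical values, and assuming the string formats that
# utc_to_ro already accepts elsewhere in this module): utc_to_ro("2026-05-13",
# "12:00") yields the RO-local date/time plus an English short day name such as
# "Wed", which ZI_RO_MAP then translates to "Mi" for the ``zi`` column.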

def csv_columns() -> list[str]:
    """Return the 29-column header in canonical order."""
    return list(CSV_COLUMNS)

# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------

def _load_meta(meta_path: Path) -> dict[str, Any]:
    with meta_path.open("r", encoding="utf-8") as fh:
        meta = yaml.safe_load(fh) or {}
@@ -94,35 +121,69 @@ def _load_meta(meta_path: Path) -> dict[str, Any]:
    return meta

def _read_existing_rows(csv_path: Path) -> list[dict[str, str]]:
    if not csv_path.exists() or csv_path.stat().st_size == 0:
        return []
    with csv_path.open("r", encoding="utf-8", newline="") as fh:
        reader = csv.DictReader(fh)
        return list(reader)

def _next_id(rows: list[dict[str, str]]) -> int:
    max_id = 0
    for r in rows:
        raw = r.get("id", "")
        if not raw:
            continue
        try:
            v = int(raw)
        except (TypeError, ValueError):
            continue
        if v > max_id:
            max_id = v
    return max_id + 1
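
# Ids are max existing id + 1, not len(rows) + 1: blank or non-numeric id cells
# are skipped, and gaps in the middle of the sequence are never re-filled.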

def _format_optional(value: float | None) -> str:
    return "" if value is None else f"{value:.4f}"

def _write_csv_atomic(
    csv_path: Path, rows: list[dict[str, str]], columns: list[str]
) -> None:
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    tmp = csv_path.with_suffix(csv_path.suffix + ".tmp")
    with tmp.open("w", encoding="utf-8", newline="") as fh:
        writer = csv.DictWriter(fh, fieldnames=columns)
        writer.writeheader()
        for row in rows:
            writer.writerow({k: row.get(k, "") for k in columns})
    os.replace(tmp, csv_path)
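
# Note: os.replace() is an atomic rename only when source and destination live
# on the same filesystem; writing the .tmp as a *sibling* of the target CSV
# (rather than in the system temp dir) guarantees that, so a crash mid-write
# can never leave a truncated jurnal.csv behind.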

def _build_row(
    extraction: M2DExtraction,
    *,
    source: str,
    row_id: int,
    meta: dict[str, Any],
    calendar: list[dict[str, Any]],
    extracted_at: str,
) -> dict[str, str]:
    """Compute the full CSV row dict for one extraction."""
    if source not in VALID_SOURCES:
        raise ValueError(
            f"invalid source {source!r}; must be one of {sorted(VALID_SOURCES)}"
        )
    d_ro, t_ro, day_short = utc_to_ro(extraction.data, extraction.ora_utc)
    set_label = calc_set(d_ro, t_ro, day_short, calendar)
    pl_m = pl_marius(extraction.outcome_path, extraction.be_moved)
    pl_t = pl_theoretical(extraction.max_reached)
    zi_ro = ZI_RO_MAP[day_short]
    return {
        "id": str(row_id),
        "screenshot_file": extraction.screenshot_file,
        "source": source,
        "data": extraction.data,
        "zi": zi_ro,
        "ora_ro": t_ro.strftime("%H:%M"),
        "set": set_label,
        "ora_utc": extraction.ora_utc,
        "instrument": extraction.instrument,
        "directie": extraction.directie,
        "tf_mare": extraction.tf_mare,
@@ -136,102 +197,115 @@ def build_row(
"risc_pct": f"{extraction.risc_pct}",
"outcome_path": extraction.outcome_path,
"max_reached": extraction.max_reached,
"be_moved": "true" if extraction.be_moved else "false",
"confidence": extraction.confidence,
"ambiguities": json.dumps(extraction.ambiguities, ensure_ascii=False),
"note": extraction.note,
"be_moved": str(extraction.be_moved),
"pl_marius": _format_optional(pl_m),
"pl_theoretical": _format_optional(pl_t),
"set": set_label,
"indicator_version": str(meta["indicator_version"]),
"pl_overlay_version": str(meta["pl_overlay_version"]),
"csv_schema_version": str(meta["csv_schema_version"]),
"extracted_at": extracted_at,
"note": extraction.note,
}
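
# All derived columns are computed in one place: zi/ora_ro/set from the
# RO-localised timestamp plus the event calendar, pl_* from the outcome fields,
# and the *_version columns stamped from _meta.yaml so each row records the
# tooling that produced it.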

def _reject(reason: str) -> dict[str, Any]:
    return {"status": "rejected", "reason": reason, "id": None, "row": None}

# ---------------------------------------------------------------------------
# public API
# ---------------------------------------------------------------------------

def append_extraction(
    json_path: Path | str,
    source: str,
    csv_path: Path | str = "data/jurnal.csv",
    meta_path: Path | str = "data/_meta.yaml",
    calendar_path: Path | str = "calendar_evenimente.yaml",
) -> dict[str, Any]:
    """Append one validated extraction to the jurnal CSV.

    Never raises. Returns one of:

    - ``{"status": "ok", "reason": "", "id": <int>, "row": <dict>}``
    - ``{"status": "rejected", "reason": <str>, "id": None, "row": None}``
    """
    json_path = Path(json_path)
    csv_path = Path(csv_path)
    meta_path = Path(meta_path)
    calendar_path = Path(calendar_path)

    if source not in VALID_SOURCES:
        return _reject(
            f"invalid source {source!r}; must be one of {sorted(VALID_SOURCES)}"
        )

    if not json_path.exists():
        return _reject(f"JSON file not found: {json_path}")
    try:
        with json_path.open("r", encoding="utf-8") as fh:
            raw = fh.read()
    except OSError as exc:
        return _reject(f"failed to read JSON {json_path}: {exc}")

    try:
        extraction = parse_extraction(raw)
    except ValidationError as exc:
        return _reject(f"validation error: {exc}")
    except (ValueError, json.JSONDecodeError) as exc:
        return _reject(f"validation error (json parse): {exc}")

    try:
        meta = _load_meta(meta_path)
    except (FileNotFoundError, OSError) as exc:
        return _reject(f"_meta.yaml not found: {exc}")
    except (ValueError, yaml.YAMLError) as exc:
        return _reject(f"_meta.yaml invalid: {exc}")

    try:
        calendar = load_calendar(calendar_path)
    except (FileNotFoundError, OSError) as exc:
        return _reject(f"calendar not found: {exc}")
    except (ValueError, yaml.YAMLError) as exc:
        return _reject(f"calendar invalid: {exc}")

    try:
        existing = _read_existing_rows(csv_path)
    except OSError as exc:
        return _reject(f"failed to read existing CSV {csv_path}: {exc}")

    key = (extraction.screenshot_file, source)
    for r in existing:
        if (r.get("screenshot_file"), r.get("source")) == key:
            return _reject(
                f"duplicate row: screenshot_file={key[0]!r} source={key[1]!r}"
            )

    row_id = _next_id(existing)
    extracted_at = (
        datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S") + "Z"
    )

    try:
        row = _build_row(
            extraction,
            source=source,
            row_id=row_id,
            meta=meta,
            calendar=calendar,
            extracted_at=extracted_at,
        )
    except (KeyError, ValueError) as exc:
        return _reject(f"derived-field computation failed: {exc}")

    try:
        _write_csv_atomic(csv_path, [*existing, row], list(CSV_COLUMNS))
    except OSError as exc:
        return _reject(
            f"atomic write failed: {exc}\n{traceback.format_exc()}"
        )

    return {"status": "ok", "reason": "", "id": row_id, "row": row}