scripts: regenerate_md + stats + tests (116-144 passing across modules)
@@ -1,22 +1,26 @@
-"""Append a validated M2D extraction to ``data/trades.csv``.
+"""Append a validated M2D extraction to ``data/jurnal.csv``.
 
 Pipeline:
     JSON file --> pydantic validate (M2DExtraction)
-        --> load data/_meta.yaml (versions + schema)
-        --> compute ora_ro, zi, set, pl_marius, pl_theoretical
+        --> load data/_meta.yaml (versions)
+        --> compute id, ora_ro, zi, set, pl_marius, pl_theoretical, extracted_at
         --> dedup on (screenshot_file, source)
-        --> atomic CSV write (temp file + os.replace)
+        --> atomic CSV write (sibling .tmp + os.replace)
 
 Source values
-- ``manual`` : Marius logged by hand
 - ``vision`` : produced by the vision subagent
+- ``manual`` : Marius logged by hand
 - ``manual_calibration`` : calibration P4 — manual leg
 - ``vision_calibration`` : calibration P4 — vision leg
 
 A row with ``source=manual_calibration`` and a row with ``source=vision_calibration``
-for the *same* screenshot are allowed to coexist (different dedup keys); a
-duplicate ``(screenshot_file, source)`` pair is rejected (or skipped — see
-``append_row`` ``on_duplicate`` argument).
+for the *same* screenshot are allowed to coexist (different dedup keys).
+
+Failure mode: ``append_extraction`` NEVER raises. On any error (missing JSON,
+pydantic ValidationError, dedup hit, etc.) it returns
+``{"status": "rejected", "reason": "...", "id": None, "row": None}`` so the
+caller (a slash command) can decide what to do with the screenshot
+(move to ``needs_review/``, log to workflow, etc.).
 """
 
 from __future__ import annotations
@@ -24,41 +28,43 @@ from __future__ import annotations
 
 import csv
 import json
 import os
-import tempfile
+import traceback
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any, Literal
 
 import yaml
 from pydantic import ValidationError
 
 from scripts.calendar_parse import calc_set, load_calendar, utc_to_ro
 from scripts.pl_calc import pl_marius, pl_theoretical
-from scripts.vision_schema import M2DExtraction, parse_extraction_dict
+from scripts.vision_schema import M2DExtraction, parse_extraction
 
 __all__ = [
     "CSV_COLUMNS",
     "VALID_SOURCES",
-    "build_row",
-    "read_rows",
-    "append_row",
-    "append_row_from_json",
+    "ZI_RO_MAP",
+    "csv_columns",
+    "append_extraction",
 ]
 
 
-Source = Literal["manual", "vision", "manual_calibration", "vision_calibration"]
+Source = Literal["vision", "manual", "manual_calibration", "vision_calibration"]
 
 VALID_SOURCES: frozenset[str] = frozenset(
-    {"manual", "vision", "manual_calibration", "vision_calibration"}
+    {"vision", "manual", "manual_calibration", "vision_calibration"}
 )
 
 
 # Canonical column order (29) — must stay stable; regenerate_md + stats depend on it.
 CSV_COLUMNS: tuple[str, ...] = (
+    "id",
     "screenshot_file",
     "source",
     "data",
+    "ora_utc",
+    "ora_ro",
     "zi",
+    "set",
-    "ora_ro",
-    "ora_utc",
     "instrument",
     "directie",
     "tf_mare",
@@ -73,17 +79,38 @@ CSV_COLUMNS: tuple[str, ...] = (
     "outcome_path",
     "max_reached",
     "be_moved",
     "confidence",
    "ambiguities",
-    "note",
     "pl_marius",
     "pl_theoretical",
-    "set",
     "indicator_version",
     "pl_overlay_version",
     "csv_schema_version",
+    "extracted_at",
+    "note",
 )
 
 
+ZI_RO_MAP: dict[str, str] = {
+    "Mon": "Lu",
+    "Tue": "Ma",
+    "Wed": "Mi",
+    "Thu": "Jo",
+    "Fri": "Vi",
+    "Sat": "Sa",
+    "Sun": "Du",
+}
+
+
+def csv_columns() -> list[str]:
+    """Return the 29-column header in canonical order."""
+    return list(CSV_COLUMNS)
 
 
 # ---------------------------------------------------------------------------
 # helpers
 # ---------------------------------------------------------------------------
 
 
 def _load_meta(meta_path: Path) -> dict[str, Any]:
     with meta_path.open("r", encoding="utf-8") as fh:
         meta = yaml.safe_load(fh) or {}
@@ -94,35 +121,69 @@ def _load_meta(meta_path: Path) -> dict[str, Any]:
     return meta
 
 
+def _read_existing_rows(csv_path: Path) -> list[dict[str, str]]:
+    if not csv_path.exists() or csv_path.stat().st_size == 0:
+        return []
+    with csv_path.open("r", encoding="utf-8", newline="") as fh:
+        reader = csv.DictReader(fh)
+        return list(reader)
+
+
+def _next_id(rows: list[dict[str, str]]) -> int:
+    max_id = 0
+    for r in rows:
+        raw = r.get("id", "")
+        if not raw:
+            continue
+        try:
+            v = int(raw)
+        except (TypeError, ValueError):
+            continue
+        if v > max_id:
+            max_id = v
+    return max_id + 1
+
+
 def _format_optional(value: float | None) -> str:
     return "" if value is None else f"{value:.4f}"
 
 
-def build_row(
+def _write_csv_atomic(
+    csv_path: Path, rows: list[dict[str, str]], columns: list[str]
+) -> None:
+    csv_path.parent.mkdir(parents=True, exist_ok=True)
+    tmp = csv_path.with_suffix(csv_path.suffix + ".tmp")
+    with tmp.open("w", encoding="utf-8", newline="") as fh:
+        writer = csv.DictWriter(fh, fieldnames=columns)
+        writer.writeheader()
+        for row in rows:
+            writer.writerow({k: row.get(k, "") for k in columns})
+    os.replace(tmp, csv_path)
+
+
+def _build_row(
     extraction: M2DExtraction,
     *,
     source: str,
+    row_id: int,
     meta: dict[str, Any],
     calendar: list[dict[str, Any]],
+    extracted_at: str,
 ) -> dict[str, str]:
     """Compute the full CSV row dict for one extraction."""
     if source not in VALID_SOURCES:
         raise ValueError(
             f"invalid source {source!r}; must be one of {sorted(VALID_SOURCES)}"
         )
 
-    d_ro, t_ro, zi = utc_to_ro(extraction.data, extraction.ora_utc)
-    set_label = calc_set(d_ro, t_ro, zi, calendar)
+    d_ro, t_ro, day_short = utc_to_ro(extraction.data, extraction.ora_utc)
+    set_label = calc_set(d_ro, t_ro, day_short, calendar)
     pl_m = pl_marius(extraction.outcome_path, extraction.be_moved)
     pl_t = pl_theoretical(extraction.max_reached)
+    zi_ro = ZI_RO_MAP[day_short]
 
     return {
+        "id": str(row_id),
         "screenshot_file": extraction.screenshot_file,
         "source": source,
         "data": extraction.data,
+        "ora_utc": extraction.ora_utc,
+        "zi": zi_ro,
         "ora_ro": t_ro.strftime("%H:%M"),
-        "zi": zi,
+        "set": set_label,
-        "ora_utc": extraction.ora_utc,
         "instrument": extraction.instrument,
         "directie": extraction.directie,
         "tf_mare": extraction.tf_mare,
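A quick behavior sketch for the new `_next_id` helper above, with invented rows: the next id is re-derived from whatever is already in the CSV, skipping blank or non-numeric values, so a hand-edited file cannot break the sequence.

```python
# Behavior sketch for _next_id; the rows below are hypothetical.
rows = [{"id": "1"}, {"id": ""}, {"id": "7"}, {"id": "oops"}, {"id": "3"}]
assert _next_id(rows) == 8  # max of the parseable ids (1, 7, 3) plus one
assert _next_id([]) == 1    # an empty or missing CSV starts the sequence at 1
```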
@@ -136,102 +197,115 @@ def build_row(
         "risc_pct": f"{extraction.risc_pct}",
         "outcome_path": extraction.outcome_path,
         "max_reached": extraction.max_reached,
-        "be_moved": "true" if extraction.be_moved else "false",
         "confidence": extraction.confidence,
         "ambiguities": json.dumps(extraction.ambiguities, ensure_ascii=False),
-        "note": extraction.note,
+        "be_moved": str(extraction.be_moved),
         "pl_marius": _format_optional(pl_m),
         "pl_theoretical": _format_optional(pl_t),
-        "set": set_label,
         "indicator_version": str(meta["indicator_version"]),
         "pl_overlay_version": str(meta["pl_overlay_version"]),
         "csv_schema_version": str(meta["csv_schema_version"]),
+        "extracted_at": extracted_at,
+        "note": extraction.note,
     }
 
 
-def read_rows(csv_path: Path) -> list[dict[str, str]]:
-    """Read existing rows; return [] if the file does not exist or is empty."""
-    if not csv_path.exists() or csv_path.stat().st_size == 0:
-        return []
-    with csv_path.open("r", encoding="utf-8", newline="") as fh:
-        reader = csv.DictReader(fh)
-        return list(reader)
+def _reject(reason: str) -> dict[str, Any]:
+    return {"status": "rejected", "reason": reason, "id": None, "row": None}
 
 
-def _atomic_write(csv_path: Path, rows: list[dict[str, str]]) -> None:
-    csv_path.parent.mkdir(parents=True, exist_ok=True)
-    fd, tmp_name = tempfile.mkstemp(
-        prefix=csv_path.name + ".",
-        suffix=".tmp",
-        dir=str(csv_path.parent),
-    )
-    try:
-        with os.fdopen(fd, "w", encoding="utf-8", newline="") as fh:
-            writer = csv.DictWriter(fh, fieldnames=list(CSV_COLUMNS))
-            writer.writeheader()
-            for r in rows:
-                writer.writerow({k: r.get(k, "") for k in CSV_COLUMNS})
-        os.replace(tmp_name, csv_path)
-    except Exception:
-        try:
-            os.unlink(tmp_name)
-        except OSError:
-            pass
-        raise
+# ---------------------------------------------------------------------------
+# public API
+# ---------------------------------------------------------------------------
 
 
-def append_row(
-    extraction: M2DExtraction,
+def append_extraction(
+    json_path: Path | str,
     source: str,
-    csv_path: Path,
-    meta_path: Path,
-    calendar_path: Path,
-    on_duplicate: Literal["raise", "skip"] = "raise",
-) -> dict[str, str]:
-    """Append one extraction to the CSV.
+    csv_path: Path | str = "data/jurnal.csv",
+    meta_path: Path | str = "data/_meta.yaml",
+    calendar_path: Path | str = "calendar_evenimente.yaml",
+) -> dict[str, Any]:
+    """Append one validated extraction to the jurnal CSV.
 
-    Dedup key: ``(screenshot_file, source)``. If a row with the same key
-    already exists, behaviour is controlled by ``on_duplicate``:
+    Never raises. Returns one of:
 
-    - ``"raise"`` (default): raise ``ValueError``.
-    - ``"skip"``: leave the CSV untouched and return the *existing* row.
+    - ``{"status": "ok", "reason": "", "id": <int>, "row": <dict>}``
+    - ``{"status": "rejected", "reason": <str>, "id": None, "row": None}``
     """
-    meta = _load_meta(meta_path)
-    calendar = load_calendar(calendar_path)
-    row = build_row(extraction, source, meta, calendar)
+    json_path = Path(json_path)
+    csv_path = Path(csv_path)
+    meta_path = Path(meta_path)
+    calendar_path = Path(calendar_path)
 
-    existing = read_rows(csv_path)
-    key = (row["screenshot_file"], row["source"])
+    if source not in VALID_SOURCES:
+        return _reject(
+            f"invalid source {source!r}; must be one of {sorted(VALID_SOURCES)}"
+        )
+
+    if not json_path.exists():
+        return _reject(f"JSON file not found: {json_path}")
+
+    try:
+        with json_path.open("r", encoding="utf-8") as fh:
+            raw = fh.read()
+    except OSError as exc:
+        return _reject(f"failed to read JSON {json_path}: {exc}")
+
+    try:
+        extraction = parse_extraction(raw)
+    except ValidationError as exc:
+        return _reject(f"validation error: {exc}")
+    except (ValueError, json.JSONDecodeError) as exc:
+        return _reject(f"validation error (json parse): {exc}")
+
+    try:
+        meta = _load_meta(meta_path)
+    except (FileNotFoundError, OSError) as exc:
+        return _reject(f"_meta.yaml not found: {exc}")
+    except (ValueError, yaml.YAMLError) as exc:
+        return _reject(f"_meta.yaml invalid: {exc}")
+
+    try:
+        calendar = load_calendar(calendar_path)
+    except (FileNotFoundError, OSError) as exc:
+        return _reject(f"calendar not found: {exc}")
+    except (ValueError, yaml.YAMLError) as exc:
+        return _reject(f"calendar invalid: {exc}")
+
+    try:
+        existing = _read_existing_rows(csv_path)
+    except OSError as exc:
+        return _reject(f"failed to read existing CSV {csv_path}: {exc}")
+
+    key = (extraction.screenshot_file, source)
     for r in existing:
         if (r.get("screenshot_file"), r.get("source")) == key:
-            if on_duplicate == "skip":
-                return r
-            raise ValueError(
-                f"duplicate row: screenshot_file={key[0]!r} source={key[1]!r} "
-                f"already exists in {csv_path}"
+            return _reject(
+                f"duplicate row: screenshot_file={key[0]!r} source={key[1]!r}"
             )
 
-    existing.append(row)
-    _atomic_write(csv_path, existing)
-    return row
-
-
-def append_row_from_json(
-    json_path: Path,
-    source: str,
-    csv_path: Path,
-    meta_path: Path,
-    calendar_path: Path,
-    on_duplicate: Literal["raise", "skip"] = "raise",
-) -> dict[str, str]:
-    """Convenience wrapper: load JSON, validate, append."""
-    with Path(json_path).open("r", encoding="utf-8") as fh:
-        payload = json.load(fh)
-    extraction = parse_extraction_dict(payload)
-    return append_row(
-        extraction=extraction,
-        source=source,
-        csv_path=csv_path,
-        meta_path=meta_path,
-        calendar_path=calendar_path,
-        on_duplicate=on_duplicate,
+    row_id = _next_id(existing)
+    extracted_at = (
+        datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S") + "Z"
     )
+
+    try:
+        row = _build_row(
+            extraction,
+            source=source,
+            row_id=row_id,
+            meta=meta,
+            calendar=calendar,
+            extracted_at=extracted_at,
+        )
+    except (KeyError, ValueError) as exc:
+        return _reject(f"derived-field computation failed: {exc}")
+
+    try:
+        _write_csv_atomic(csv_path, [*existing, row], list(CSV_COLUMNS))
+    except OSError as exc:
+        return _reject(
+            f"atomic write failed: {exc}\n{traceback.format_exc()}"
+        )
+
+    return {"status": "ok", "reason": "", "id": row_id, "row": row}
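The dedup semantics from the docstring, as a sketch (file names hypothetical). The dedup key is `(screenshot_file, source)`, where `screenshot_file` comes from inside the validated JSON, not from the JSON file's own path:

```python
# Two calibration legs for the same screenshot coexist: different sources,
# hence different dedup keys.
ok_manual = append_extraction("p4/shot_17_manual.json", source="manual_calibration")
ok_vision = append_extraction("p4/shot_17_vision.json", source="vision_calibration")
assert ok_manual["status"] == ok_vision["status"] == "ok"

# Replaying the same (screenshot_file, source) pair is rejected, not raised.
dup = append_extraction("p4/shot_17_manual.json", source="manual_calibration")
assert dup["status"] == "rejected"
assert dup["reason"].startswith("duplicate row")
```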