#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
repair_extractions.py — one-shot repair of malformed extraction JSON.

Subagents systematically emit unescaped ASCII double-quotes inside string
values (Romanian text like „Unu" uses a closing " that terminates the JSON
string early). Re-extraction reproduces the bug, so we repair instead.

IMPORTANT — why NOT json_repair:
json_repair "recovers" an unescaped quote by ending the string at the stray
quote and reinterpreting the trailing text as a new key, which (a) TRUNCATES
the value and (b) injects garbage keys. The truncation is silent (the field is
still non-empty) and slips past a naive presence check. So we use a faithful
char-scanner that ESCAPES stray quotes (\\") instead of splitting on them, then
validate the result against the real activity schema
(additionalProperties:false also catches any residual split).

This is an OFFLINE maintenance tool. build_database.py must NOT depend on it —
the "DB regenerable from data/extracted/" invariant requires plain valid JSON
on disk. We write clean JSON back to data/extracted/ and the build reads
vanilla json.

Source selection (faithful recovery needs the ORIGINAL malformed text):
  * a chunk is a candidate when a MALFORMED original exists — either the
    top-level data/extracted/<key>.json is itself invalid, or a malformed
    original sits in data/extracted/_rejected/<key>.json.
  * the malformed original is preferred as the repair source.
  * chunks whose only artifact is already-valid JSON (e.g. a prior json_repair
    output that lost the original) are NOT silently "repaired" — if such a
    chunk has no valid top-level file it is reported as needing RE-EXTRACTION.

Usage:
    python scripts/repair_extractions.py            # report only (dry run)
    python scripts/repair_extractions.py --apply    # write repaired JSON
"""

from __future__ import annotations

import argparse
import glob
import json
import sys
from pathlib import Path
from typing import Optional

SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent
EXTRACTED = REPO_ROOT / "data" / "extracted"
REJECTED = EXTRACTED / "_rejected"

# Make the sibling helper module importable when this file is run as a script.
if str(SCRIPT_DIR) not in sys.path:
    sys.path.insert(0, str(SCRIPT_DIR))
from import_common import DEFAULT_SCHEMA_PATH, load_schema, validate_extraction  # noqa: E402

# Characters that may legitimately follow a closing string quote in JSON.
_STRING_TERMINATORS = ",}]:"
# Whitespace skipped when looking ahead past a candidate closing quote.
_JSON_WHITESPACE = " \t\r\n"


def escape_stray_quotes(s: str) -> str:
    """Escape ASCII double-quotes that occur INSIDE a JSON string value.

    A `"` inside a string is treated as a real string-close only when the next
    non-whitespace char is structural (`,` `}` `]` `:`) or EOF; otherwise it is
    content and is escaped to `\\"`. This preserves the full value instead of
    truncating it (the json_repair failure mode).

    Args:
        s: raw (possibly malformed) JSON text.

    Returns:
        The text with content quotes escaped. It is a heuristic: a stray quote
        that happens to be followed by a structural character is still taken
        as a string close, so callers must re-parse and validate the result.
    """
    out: list[str] = []
    in_str = False
    esc = False
    n = len(s)
    i = 0
    while i < n:
        c = s[i]
        i += 1
        if esc:
            # Character following a backslash is copied verbatim.
            out.append(c)
            esc = False
            continue
        if c == "\\":
            out.append(c)
            esc = True
            continue
        if c != '"':
            out.append(c)
            continue
        if not in_str:
            in_str = True
            out.append(c)
            continue
        # Inside a string: look past whitespace to decide close vs. content.
        j = i
        while j < n and s[j] in _JSON_WHITESPACE:
            j += 1
        if j >= n or s[j] in _STRING_TERMINATORS:
            in_str = False
            out.append(c)
        else:
            out.append('\\"')  # content quote → escape, keep value whole
    return "".join(out)


def _read_json(path: Path) -> tuple[bool, object]:
    """Read and parse *path* as UTF-8 JSON.

    Returns:
        ``(True, parsed)`` on success, ``(False, None)`` when the file is
        missing/unreadable, is not valid UTF-8, or is not valid JSON.
    """
    try:
        return True, json.loads(path.read_text(encoding="utf-8"))
    except (ValueError, OSError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return False, None


def _is_valid_json(path: Path) -> bool:
    """Return True when *path* can be read and parsed as UTF-8 JSON."""
    return _read_json(path)[0]


def _malformed_source(key: str) -> Optional[Path]:
    """Return the malformed-original file for a chunk, preferring top-level."""
    for candidate in (EXTRACTED / f"{key}.json", REJECTED / f"{key}.json"):
        if candidate.exists() and not _is_valid_json(candidate):
            return candidate
    return None


def _candidate_keys() -> tuple[dict[str, Path], list[str]]:
    """
    (repair_candidates, needs_reextraction).

    repair_candidates: key -> malformed source file (faithfully repairable).
    needs_reextraction: chunks with no malformed original AND no valid
    top-level file (their original was lost) — must be re-extracted.
    """
    keys = {
        Path(fn).stem
        for folder in (EXTRACTED, REJECTED)
        for fn in glob.glob(str(folder / "*.json"))
    }

    candidates: dict[str, Path] = {}
    needs_reextraction: list[str] = []
    for key in sorted(keys):
        # A malformed original anywhere is faithfully repairable, and is the
        # source of truth even if a (json_repair-produced, possibly truncated)
        # valid top-level file exists — escaping the original never truncates,
        # so re-repairing from it is always >= the json_repair output.
        src = _malformed_source(key)
        if src is not None:
            candidates[key] = src
            continue
        live = EXTRACTED / f"{key}.json"
        if live.exists() and _is_valid_json(live):
            continue  # genuinely-valid extraction, nothing to do
        # no valid top-level and no malformed original to repair from
        needs_reextraction.append(key)
    return candidates, needs_reextraction


def _text_length(data: object) -> int:
    """Total length of the top-level string fields of every activity.

    Tolerates unexpected shapes (non-dict document, missing or non-list
    ``activities``) by counting them as zero, because the live top-level file
    it is applied to has not been schema-validated.
    """
    if not isinstance(data, dict):
        return 0
    activities = data.get("activities")
    if not isinstance(activities, list):
        return 0
    return sum(
        len(value)
        for activity in activities
        if isinstance(activity, dict)
        for value in activity.values()
        if isinstance(value, str)
    )


def repair(apply: bool) -> int:
    """Repair every candidate chunk and print a report.

    Args:
        apply: when True, write repaired JSON to data/extracted/<key>.json;
            when False, only report what would be written.

    Returns:
        Always 0; failures are reported in the printed summary.
    """
    schema = load_schema(DEFAULT_SCHEMA_PATH)
    candidates, needs_reextraction = _candidate_keys()

    print("=" * 64)
    print(f"REPAIR EXTRACTIONS ({'APPLY' if apply else 'dry run'})")
    print("=" * 64)
    print(f"repair candidates: {len(candidates)}")

    ok = 0
    kept_toplevel = 0
    still_bad: list[str] = []
    schema_fail: list[tuple[str, str]] = []

    for key, src in candidates.items():
        live = EXTRACTED / f"{key}.json"
        # Parse the live file once; a missing file simply reads as not valid.
        live_valid, live_data = _read_json(live)

        try:
            raw = src.read_text(encoding="utf-8")
        except (UnicodeDecodeError, OSError) as exc:
            # Undecodable/unreadable original: nothing faithful to repair from.
            if live_valid:
                kept_toplevel += 1
            else:
                still_bad.append(f"{key}: cannot read source ({exc})")
            continue

        try:
            data = json.loads(escape_stray_quotes(raw))
        except json.JSONDecodeError as exc:
            if live_valid:
                kept_toplevel += 1  # genuine top-level is fine; stale _rejected
            else:
                still_bad.append(f"{key}: still invalid after escape ({exc})")
            continue

        errors = validate_extraction(data, schema)
        if errors:
            if live_valid:
                kept_toplevel += 1
            else:
                schema_fail.append((key, errors[0]))
                print(f" {key[:50]:<50} SCHEMA-FAIL: {errors[0][:40]}")
            continue

        # Faithfulness guard: only replace a valid top-level when the escaped
        # repair carries STRICTLY more text (i.e. the top-level was a truncated
        # json_repair output). Genuine extractions are kept untouched.
        if live_valid and _text_length(data) <= _text_length(live_data):
            kept_toplevel += 1
            continue

        n = len(data.get("activities", []))
        print(f" {key[:50]:<50} {n:>3} acts REPAIR")
        if apply:
            live.write_text(
                json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8"
            )
        ok += 1

    print("-" * 64)
    print(f"repaired: {ok} | kept genuine top-level: {kept_toplevel} | "
          f"schema-fail: {len(schema_fail)} | still-bad: {len(still_bad)} | "
          f"needs re-extraction: {len(needs_reextraction)}")
    for key, err in schema_fail:
        print(f" ⚠ schema {key}: {err[:60]}")
    for msg in still_bad:
        print(f" ✘ {msg}")
    for key in needs_reextraction:
        print(f" ↻ re-extract: {key}")
    if not apply:
        print("\nDry run — re-run with --apply to write repaired JSON.")
    print("=" * 64)
    return 0


def main(argv: Optional[list[str]] = None) -> int:
    """Parse command-line arguments and run the repair; returns the exit code."""
    parser = argparse.ArgumentParser(description="Repair malformed extraction JSON.")
    parser.add_argument("--apply", action="store_true",
                        help="write repaired JSON (default: dry run)")
    args = parser.parse_args(argv)
    return repair(args.apply)


if __name__ == "__main__":
    raise SystemExit(main())