Prevent + net the unescaped-quote bug in the durable prompts/pipeline

The escape-ASCII-quote rule previously lived only in ephemeral Agent-call strings. Bake it into the durable artifacts so the next session doesn't have to re-derive it:

- SUBAGENT_PROMPT.md + ENRICHMENT_PROMPT.md: add an explicit rule to escape any ASCII " inside JSON string values (Romanian „cuvânt" is the trap).
- run_enrichment.py collect_enrichment: repair malformed parts with escape_stray_quotes instead of dropping them. The enrichment path had no repair net: bad parts were silently dropped, losing that activity's enrichment. Extraction already had one; now both do.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-29 18:16:04 +00:00
parent bcfb6841eb
commit d6971e47f8
3 changed files with 46 additions and 3 deletions
--- a/scripts/run_enrichment.py
+++ b/scripts/run_enrichment.py
@@ -49,6 +49,7 @@ from import_common import (  # noqa: E402
    find_chunk_text,
    normalize_name,
 )
+from repair_extractions import escape_stray_quotes  # noqa: E402

 ENRICHMENT_PROMPT = SCRIPT_DIR / "ENRICHMENT_PROMPT.md"

@@ -169,11 +170,26 @@ def collect_enrichment(parts_dir: Path, out_path: Path) -> dict:
    """Merge data/enrichment_parts/*.json into one flat content_key map."""
    merged: dict = {}
    bad: list[str] = []
+    repaired: list[str] = []
    if parts_dir.is_dir():
        for part in sorted(parts_dir.glob("*.json")):
+            raw = part.read_text(encoding="utf-8")
            try:
-                data = json.loads(part.read_text(encoding="utf-8"))
-            except (json.JSONDecodeError, OSError):
+                data = json.loads(raw)
+            except json.JSONDecodeError:
+                # Enrichment subagents hit the same unescaped-ASCII-quote bug as
+                # extraction (description_ro is full of Romanian „…"). Repair by
+                # escaping rather than dropping the activity's enrichment.
+                try:
+                    data = json.loads(escape_stray_quotes(raw))
+                    repaired.append(part.name)
+                except json.JSONDecodeError:
+                    bad.append(part.name)
+                    continue
+            except OSError:
+                bad.append(part.name)
+                continue
+            if not isinstance(data, dict):
                bad.append(part.name)
                continue
            key = data.get("content_key") or part.stem
@@ -182,7 +198,8 @@ def collect_enrichment(parts_dir: Path, out_path: Path) -> dict:
    out_path.write_text(
        json.dumps(merged, ensure_ascii=False, indent=2), encoding="utf-8"
    )
-    return {"entries": len(merged), "bad_parts": bad, "out": str(out_path)}
+    return {"entries": len(merged), "repaired": repaired,
+            "bad_parts": bad, "out": str(out_path)}


 def run_emit(
@@ -235,6 +252,8 @@ def main(argv: Optional[list[str]] = None) -> int:
    if args.collect:
        result = collect_enrichment(Path(args.parts), Path(args.out))
        print(f"collected  : {result['entries']} entries -> {result['out']}")
+        if result["repaired"]:
+            print(f"repaired   : {len(result['repaired'])} parts (unescaped-quote fix)")
        if result["bad_parts"]:
            print(f"bad parts  : {len(result['bad_parts'])} (skipped)")
            for name in result["bad_parts"]: