From d6971e47f8e71da24007f683232ea2af134c99aa Mon Sep 17 00:00:00 2001
From: Claude Agent <claude-agent@romfast.ro>
Date: Fri, 29 May 2026 18:16:04 +0000
Subject: [PATCH] Prevent + net the unescaped-quote bug in the durable
 prompts/pipeline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The escape-ASCII-quote rule previously lived only in ephemeral Agent-call
strings. Bake it into the durable artifacts so the next session doesn't
re-derive it:
- SUBAGENT_PROMPT.md + ENRICHMENT_PROMPT.md: explicit rule to escape any
  ASCII " inside JSON string values (Romanian „cuvânt" is the trap).
- run_enrichment.py collect_enrichment: repair malformed parts with
  escape_stray_quotes instead of dropping them — the enrichment path had no
  repair net (bad parts were silently dropped, losing that activity's
  enrichment). Extraction already had one; now both do.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 scripts/ENRICHMENT_PROMPT.md |  9 +++++++++
 scripts/SUBAGENT_PROMPT.md   | 15 +++++++++++++++
 scripts/run_enrichment.py    | 25 ++++++++++++++++++++++---
 3 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/scripts/ENRICHMENT_PROMPT.md b/scripts/ENRICHMENT_PROMPT.md
index 6c63232..7e474f7 100644
--- a/scripts/ENRICHMENT_PROMPT.md
+++ b/scripts/ENRICHMENT_PROMPT.md
@@ -92,6 +92,15 @@ Include only the fields you actually fill. Always include `content_key` and
 `estimated_fields` (use `[]` if nothing was inferred). Output valid JSON only —
 no commentary, no markdown fences in the file itself.
 
+### CRITICAL — escape quotes inside string values
+
+Any ASCII double-quote (`"`, U+0022) inside a string value MUST be escaped as
+`\"`. Your Romanian text is full of `„cuvânt"` — written raw, the closing ASCII
+`"` terminates the JSON string early and the whole file fails to parse (and your
+enrichment for this activity is silently lost). Either keep the typographic
+marks (`„ ”`) or escape every literal ASCII `"`. After writing, re-read the file
+and confirm it parses as valid JSON.
+
 ## Report
 
 After writing the file, report in under 30 words: the activity name and which
diff --git a/scripts/SUBAGENT_PROMPT.md b/scripts/SUBAGENT_PROMPT.md
index feb0574..373767e 100644
--- a/scripts/SUBAGENT_PROMPT.md
+++ b/scripts/SUBAGENT_PROMPT.md
@@ -74,6 +74,21 @@ The file is one JSON object: a `header` plus an `activities` array.
 - Do **not** paraphrase the `source_excerpt` — copy it character for character.
 - Better to extract fewer activities accurately than to pad the output.
 
+## Escaping quotes inside JSON strings (CRITICAL)
+
+Any ASCII double-quote (`"`, U+0022) that appears **inside a string value** must
+be written escaped as `\"`. This is the single most common way these extractions
+break: Romanian source text uses typographic quotes like `„cuvânt"` where the
+closing mark is a plain ASCII `"`. Written raw, it terminates the JSON string
+early and corrupts the whole file. So:
+
+- `"description": "grupul cântă „Unu\" în cor"`  ← correct (inner `"` escaped)
+- `"description": "grupul cântă „Unu" în cor"`   ← BROKEN (unescaped `"`)
+
+Prefer keeping the source's typographic quotes (`„ ”`), but whenever a literal
+ASCII `"` lands inside a value, escape it. After writing, re-read the file and
+confirm it parses as valid JSON.
+
 ## Writing large outputs in batches (IMPORTANT)
 
 A single Write tool call has a hard ~32K output-token limit. Dense chunks
diff --git a/scripts/run_enrichment.py b/scripts/run_enrichment.py
index dcf434a..0245cd8 100644
--- a/scripts/run_enrichment.py
+++ b/scripts/run_enrichment.py
@@ -49,6 +49,7 @@ from import_common import (  # noqa: E402
     find_chunk_text,
     normalize_name,
 )
+from repair_extractions import escape_stray_quotes  # noqa: E402
 
 ENRICHMENT_PROMPT = SCRIPT_DIR / "ENRICHMENT_PROMPT.md"
 
@@ -169,11 +170,26 @@ def collect_enrichment(parts_dir: Path, out_path: Path) -> dict:
     """Merge data/enrichment_parts/*.json into one flat content_key map."""
     merged: dict = {}
     bad: list[str] = []
+    repaired: list[str] = []
     if parts_dir.is_dir():
         for part in sorted(parts_dir.glob("*.json")):
             try:
-                data = json.loads(part.read_text(encoding="utf-8"))
-            except (json.JSONDecodeError, OSError):
+                raw = part.read_text(encoding="utf-8")
+                data = json.loads(raw)
+            except json.JSONDecodeError:
+                # Enrichment subagents hit the same unescaped-ASCII-quote bug as
+                # extraction (description_ro is full of Romanian „…"). Repair by
+                # escaping rather than dropping the activity's enrichment.
+                try:
+                    data = json.loads(escape_stray_quotes(raw))
+                    repaired.append(part.name)
+                except json.JSONDecodeError:
+                    bad.append(part.name)
+                    continue
+            except OSError:  # unreadable part file (read_text is inside the try)
+                bad.append(part.name)
+                continue
+            if not isinstance(data, dict):
                 bad.append(part.name)
                 continue
             key = data.get("content_key") or part.stem
@@ -182,7 +198,8 @@ def collect_enrichment(parts_dir: Path, out_path: Path) -> dict:
     out_path.write_text(
         json.dumps(merged, ensure_ascii=False, indent=2), encoding="utf-8"
     )
-    return {"entries": len(merged), "bad_parts": bad, "out": str(out_path)}
+    return {"entries": len(merged), "repaired": repaired,
+            "bad_parts": bad, "out": str(out_path)}
 
 
 def run_emit(
@@ -235,6 +252,8 @@ def main(argv: Optional[list[str]] = None) -> int:
     if args.collect:
         result = collect_enrichment(Path(args.parts), Path(args.out))
         print(f"collected  : {result['entries']} entries -> {result['out']}")
+        if result["repaired"]:
+            print(f"repaired   : {len(result['repaired'])} parts (unescaped-quote fix)")
         if result["bad_parts"]:
             print(f"bad parts  : {len(result['bad_parts'])} (skipped)")
             for name in result["bad_parts"]: