From d6971e47f8e71da24007f683232ea2af134c99aa Mon Sep 17 00:00:00 2001 From: Claude Agent Date: Fri, 29 May 2026 18:16:04 +0000 Subject: [PATCH] Prevent + net the unescaped-quote bug in the durable prompts/pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The escape-ASCII-quote rule previously lived only in ephemeral Agent-call strings. Bake it into the durable artifacts so the next session doesn't re-derive it: - SUBAGENT_PROMPT.md + ENRICHMENT_PROMPT.md: explicit rule to escape any ASCII " inside JSON string values (Romanian „cuvânt" is the trap). - run_enrichment.py collect_enrichment: repair malformed parts with escape_stray_quotes instead of dropping them — the enrichment path had no repair net (bad parts were silently dropped, losing that activity's enrichment). Extraction already had one; now both do. Co-Authored-By: Claude Opus 4.8 (1M context) --- scripts/ENRICHMENT_PROMPT.md | 9 +++++++++ scripts/SUBAGENT_PROMPT.md | 15 +++++++++++++++ scripts/run_enrichment.py | 25 ++++++++++++++++++++++--- 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/scripts/ENRICHMENT_PROMPT.md b/scripts/ENRICHMENT_PROMPT.md index 6c63232..7e474f7 100644 --- a/scripts/ENRICHMENT_PROMPT.md +++ b/scripts/ENRICHMENT_PROMPT.md @@ -92,6 +92,15 @@ Include only the fields you actually fill. Always include `content_key` and `estimated_fields` (use `[]` if nothing was inferred). Output valid JSON only — no commentary, no markdown fences in the file itself. +### CRITICAL — escape quotes inside string values + +Any ASCII double-quote (`"`, U+0022) inside a string value MUST be escaped as +`\"`. Your Romanian text is full of `„cuvânt"` — written raw, the closing ASCII +`"` terminates the JSON string early and the whole file fails to parse (and your +enrichment for this activity is silently lost). Either keep the typographic +marks (`„ ”`) or escape every literal ASCII `"`. 
After writing, re-read the file +and confirm it parses as valid JSON. + ## Report After writing the file, report in under 30 words: the activity name and which diff --git a/scripts/SUBAGENT_PROMPT.md b/scripts/SUBAGENT_PROMPT.md index feb0574..373767e 100644 --- a/scripts/SUBAGENT_PROMPT.md +++ b/scripts/SUBAGENT_PROMPT.md @@ -74,6 +74,21 @@ The file is one JSON object: a `header` plus an `activities` array. - Do **not** paraphrase the `source_excerpt` — copy it character for character. - Better to extract fewer activities accurately than to pad the output. +## Escaping quotes inside JSON strings (CRITICAL) + +Any ASCII double-quote (`"`, U+0022) that appears **inside a string value** must +be written escaped as `\"`. This is the single most common way these extractions +break: Romanian source text uses typographic quotes like `„cuvânt"` where the +closing mark is a plain ASCII `"`. Written raw, it terminates the JSON string +early and corrupts the whole file. So: + +- `"description": "grupul cântă „Unu\" în cor"` ← correct (inner `"` escaped) +- `"description": "grupul cântă „Unu" în cor"` ← BROKEN (unescaped `"`) + +Prefer keeping the source's typographic quotes (`„ ”`), but whenever a literal +ASCII `"` lands inside a value, escape it. After writing, re-read the file and +confirm it parses as valid JSON. + ## Writing large outputs in batches (IMPORTANT) A single Write tool call has a hard ~32K output-token limit. 
Dense chunks diff --git a/scripts/run_enrichment.py b/scripts/run_enrichment.py index dcf434a..0245cd8 100644 --- a/scripts/run_enrichment.py +++ b/scripts/run_enrichment.py @@ -49,6 +49,7 @@ from import_common import ( # noqa: E402 find_chunk_text, normalize_name, ) +from repair_extractions import escape_stray_quotes # noqa: E402 ENRICHMENT_PROMPT = SCRIPT_DIR / "ENRICHMENT_PROMPT.md" @@ -169,11 +170,26 @@ def collect_enrichment(parts_dir: Path, out_path: Path) -> dict: """Merge data/enrichment_parts/*.json into one flat content_key map.""" merged: dict = {} bad: list[str] = [] + repaired: list[str] = [] if parts_dir.is_dir(): for part in sorted(parts_dir.glob("*.json")): + raw = part.read_text(encoding="utf-8") try: - data = json.loads(part.read_text(encoding="utf-8")) - except (json.JSONDecodeError, OSError): + data = json.loads(raw) + except json.JSONDecodeError: + # Enrichment subagents hit the same unescaped-ASCII-quote bug as + # extraction (description_ro is full of Romanian „…"). Repair by + # escaping rather than dropping the activity's enrichment. 
+ try: + data = json.loads(escape_stray_quotes(raw)) + repaired.append(part.name) + except json.JSONDecodeError: + bad.append(part.name) + continue + except OSError: + bad.append(part.name) + continue + if not isinstance(data, dict): bad.append(part.name) continue key = data.get("content_key") or part.stem @@ -182,7 +198,8 @@ def collect_enrichment(parts_dir: Path, out_path: Path) -> dict: out_path.write_text( json.dumps(merged, ensure_ascii=False, indent=2), encoding="utf-8" ) - return {"entries": len(merged), "bad_parts": bad, "out": str(out_path)} + return {"entries": len(merged), "repaired": repaired, + "bad_parts": bad, "out": str(out_path)} def run_emit( @@ -235,6 +252,8 @@ def main(argv: Optional[list[str]] = None) -> int: if args.collect: result = collect_enrichment(Path(args.parts), Path(args.out)) print(f"collected : {result['entries']} entries -> {result['out']}") + if result["repaired"]: + print(f"repaired : {len(result['repaired'])} parts (unescaped-quote fix)") if result["bad_parts"]: print(f"bad parts : {len(result['bad_parts'])} (skipped)") for name in result["bad_parts"]: