Prevent + net the unescaped-quote bug in the durable prompts/pipeline
The escape-ASCII-quote rule previously lived only in ephemeral Agent-call strings. Bake it into the durable artifacts so the next session doesn't have to re-derive it:
- SUBAGENT_PROMPT.md + ENRICHMENT_PROMPT.md: add an explicit rule to escape any ASCII " inside JSON string values (Romanian „cuvânt" is the trap).
- run_enrichment.py collect_enrichment: repair malformed parts with escape_stray_quotes instead of dropping them. The enrichment path had no repair net (bad parts were silently dropped, losing that activity's enrichment). Extraction already had one; now both do.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -92,6 +92,15 @@ Include only the fields you actually fill. Always include `content_key` and
|
|||||||
`estimated_fields` (use `[]` if nothing was inferred). Output valid JSON only —
|
`estimated_fields` (use `[]` if nothing was inferred). Output valid JSON only —
|
||||||
no commentary, no markdown fences in the file itself.
|
no commentary, no markdown fences in the file itself.
|
||||||
|
|
||||||
|
### CRITICAL — escape quotes inside string values
|
||||||
|
|
||||||
|
Any ASCII double-quote (`"`, U+0022) inside a string value MUST be escaped as
|
||||||
|
`\"`. Your Romanian text is full of `„cuvânt"` — written raw, the closing ASCII
|
||||||
|
`"` terminates the JSON string early and the whole file fails to parse (and your
|
||||||
|
enrichment for this activity is silently lost). Either keep the typographic
|
||||||
|
marks (`„ ”`) or escape every literal ASCII `"`. After writing, re-read the file
|
||||||
|
and confirm it parses as valid JSON.
|
||||||
|
|
||||||
## Report
|
## Report
|
||||||
|
|
||||||
After writing the file, report in under 30 words: the activity name and which
|
After writing the file, report in under 30 words: the activity name and which
|
||||||
|
|||||||
@@ -74,6 +74,21 @@ The file is one JSON object: a `header` plus an `activities` array.
|
|||||||
- Do **not** paraphrase the `source_excerpt` — copy it character for character.
|
- Do **not** paraphrase the `source_excerpt` — copy it character for character.
|
||||||
- Better to extract fewer activities accurately than to pad the output.
|
- Better to extract fewer activities accurately than to pad the output.
|
||||||
|
|
||||||
|
## Escaping quotes inside JSON strings (CRITICAL)
|
||||||
|
|
||||||
|
Any ASCII double-quote (`"`, U+0022) that appears **inside a string value** must
|
||||||
|
be written escaped as `\"`. This is the single most common way these extractions
|
||||||
|
break: Romanian source text uses typographic quotes like `„cuvânt"` where the
|
||||||
|
closing mark is a plain ASCII `"`. Written raw, it terminates the JSON string
|
||||||
|
early and corrupts the whole file. So:
|
||||||
|
|
||||||
|
- `"description": "grupul cântă „Unu\" în cor"` ← correct (inner `"` escaped)
|
||||||
|
- `"description": "grupul cântă „Unu" în cor"` ← BROKEN (unescaped `"`)
|
||||||
|
|
||||||
|
Prefer keeping the source's typographic quotes (`„ ”`), but whenever a literal
|
||||||
|
ASCII `"` lands inside a value, escape it. After writing, re-read the file and
|
||||||
|
confirm it parses as valid JSON.
|
||||||
|
|
||||||
## Writing large outputs in batches (IMPORTANT)
|
## Writing large outputs in batches (IMPORTANT)
|
||||||
|
|
||||||
A single Write tool call has a hard ~32K output-token limit. Dense chunks
|
A single Write tool call has a hard ~32K output-token limit. Dense chunks
|
||||||
|
|||||||
@@ -49,6 +49,7 @@ from import_common import ( # noqa: E402
|
|||||||
find_chunk_text,
|
find_chunk_text,
|
||||||
normalize_name,
|
normalize_name,
|
||||||
)
|
)
|
||||||
|
from repair_extractions import escape_stray_quotes # noqa: E402
|
||||||
|
|
||||||
ENRICHMENT_PROMPT = SCRIPT_DIR / "ENRICHMENT_PROMPT.md"
|
ENRICHMENT_PROMPT = SCRIPT_DIR / "ENRICHMENT_PROMPT.md"
|
||||||
|
|
||||||
@@ -169,11 +170,26 @@ def collect_enrichment(parts_dir: Path, out_path: Path) -> dict:
|
|||||||
"""Merge data/enrichment_parts/*.json into one flat content_key map."""
|
"""Merge data/enrichment_parts/*.json into one flat content_key map."""
|
||||||
merged: dict = {}
|
merged: dict = {}
|
||||||
bad: list[str] = []
|
bad: list[str] = []
|
||||||
|
repaired: list[str] = []
|
||||||
if parts_dir.is_dir():
|
if parts_dir.is_dir():
|
||||||
for part in sorted(parts_dir.glob("*.json")):
|
for part in sorted(parts_dir.glob("*.json")):
|
||||||
|
raw = part.read_text(encoding="utf-8")
|
||||||
try:
|
try:
|
||||||
data = json.loads(part.read_text(encoding="utf-8"))
|
data = json.loads(raw)
|
||||||
except (json.JSONDecodeError, OSError):
|
except json.JSONDecodeError:
|
||||||
|
# Enrichment subagents hit the same unescaped-ASCII-quote bug as
|
||||||
|
# extraction (description_ro is full of Romanian „…"). Repair by
|
||||||
|
# escaping rather than dropping the activity's enrichment.
|
||||||
|
try:
|
||||||
|
data = json.loads(escape_stray_quotes(raw))
|
||||||
|
repaired.append(part.name)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
bad.append(part.name)
|
||||||
|
continue
|
||||||
|
except OSError:
|
||||||
|
bad.append(part.name)
|
||||||
|
continue
|
||||||
|
if not isinstance(data, dict):
|
||||||
bad.append(part.name)
|
bad.append(part.name)
|
||||||
continue
|
continue
|
||||||
key = data.get("content_key") or part.stem
|
key = data.get("content_key") or part.stem
|
||||||
@@ -182,7 +198,8 @@ def collect_enrichment(parts_dir: Path, out_path: Path) -> dict:
|
|||||||
out_path.write_text(
|
out_path.write_text(
|
||||||
json.dumps(merged, ensure_ascii=False, indent=2), encoding="utf-8"
|
json.dumps(merged, ensure_ascii=False, indent=2), encoding="utf-8"
|
||||||
)
|
)
|
||||||
return {"entries": len(merged), "bad_parts": bad, "out": str(out_path)}
|
return {"entries": len(merged), "repaired": repaired,
|
||||||
|
"bad_parts": bad, "out": str(out_path)}
|
||||||
|
|
||||||
|
|
||||||
def run_emit(
|
def run_emit(
|
||||||
@@ -235,6 +252,8 @@ def main(argv: Optional[list[str]] = None) -> int:
|
|||||||
if args.collect:
|
if args.collect:
|
||||||
result = collect_enrichment(Path(args.parts), Path(args.out))
|
result = collect_enrichment(Path(args.parts), Path(args.out))
|
||||||
print(f"collected : {result['entries']} entries -> {result['out']}")
|
print(f"collected : {result['entries']} entries -> {result['out']}")
|
||||||
|
if result["repaired"]:
|
||||||
|
print(f"repaired : {len(result['repaired'])} parts (unescaped-quote fix)")
|
||||||
if result["bad_parts"]:
|
if result["bad_parts"]:
|
||||||
print(f"bad parts : {len(result['bad_parts'])} (skipped)")
|
print(f"bad parts : {len(result['bad_parts'])} (skipped)")
|
||||||
for name in result["bad_parts"]:
|
for name in result["bad_parts"]:
|
||||||
|
|||||||
Reference in New Issue
Block a user