Faza 1 complete: bilingual+enrichment plumbing, UI/filters, frozen DB
Extraction finished (575/588 chunks; 6 content-filter-blocked, 7 await re-extraction). DB rebuilt and frozen at 9418 activities — content_keys are now stable for the enrichment overlay.

Part A (plumbing + UI):
- database.py: name_ro/description_ro/rules_ro/variations_ro, indoor_outdoor, space_needed, estimated_fields, source_id/source_ids/chunk_key columns; FTS5 indexes the 4 *_ro columns across CREATE + all 3 triggers; new equality filters + category counts for both axes.
- activity.py: new fields + bilingual display helpers (get_display_*, is_estimated, axis displays).
- config_taxonomy.py: INDOOR_OUTDOOR/SPACE_NEEDED enums + normalizers (None on unrecognised, no fabrication).
- search.py / routes.py / config.py / templates / css: new dropdowns, RO-primary rendering with "(estimat)" markers and collapsible original text, and a /source/<id> download route shipped DARK behind SOURCE_DOWNLOAD_ENABLED (copyright opt-in).
- build_database.py: source_id/chunk_key in dict_to_activity; merge_cluster unions source_ids without touching enrichment fields.

Part B (enrichment pipeline, built, not yet run):
- build_database.py: load_enrichment + apply_enrichment (post-dedup, keyed on content_key) + --enrichment CLI + stated-vs-estimated QA.
- run_enrichment.py (resumable, --source/--limit pilot scoping, --collect), ENRICHMENT_PROMPT.md.

Repair: scripts/repair_extractions.py fixes the subagents' systematic unescaped-ASCII-quote bug with a faithful char-scanner (escapes, never truncates) + schema validation + a strictly-more-text guard. json_repair was tried first, truncated silently, and is NOT used. build_database has no repair dependency.

Tests: tests/test_enrichment.py added; 99 pass.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -86,7 +86,12 @@ def _split_csv(value: Optional[str]) -> list[str]:
|
||||
return [p.strip() for p in str(value).split(",") if p.strip()]
|
||||
|
||||
|
||||
def dict_to_activity(adict: dict, source_file: str) -> Activity:
|
||||
def dict_to_activity(
|
||||
adict: dict,
|
||||
source_file: str,
|
||||
source_id: Optional[str] = None,
|
||||
chunk_key: Optional[str] = None,
|
||||
) -> Activity:
|
||||
"""Build an Activity from one extraction-JSON activity object."""
|
||||
tags = adict.get("tags") or []
|
||||
if isinstance(tags, str):
|
||||
@@ -99,6 +104,9 @@ def dict_to_activity(adict: dict, source_file: str) -> Activity:
|
||||
source_files = [source_file, *source_files]
|
||||
|
||||
return Activity(
|
||||
source_id=source_id,
|
||||
source_ids=[source_id] if source_id else [],
|
||||
chunk_key=chunk_key,
|
||||
name=(adict.get("name") or "").strip(),
|
||||
description=(adict.get("description") or "").strip(),
|
||||
rules=adict.get("rules"),
|
||||
@@ -206,6 +214,19 @@ def merge_cluster(cluster: list[Activity]) -> Activity:
|
||||
if s and s not in sources:
|
||||
sources.append(s)
|
||||
merged.source_files = sources
|
||||
# source provenance: keep rep's chunk_key/source_id as primary, union the
|
||||
# source_ids for the download route. Enrichment fields (name_ro,
|
||||
# description_ro, indoor_outdoor, ...) are intentionally NOT carried here:
|
||||
# enrichment is applied AFTER dedup (plan Part B2), keyed on the merged
|
||||
# row's content_key, so merging must not pre-populate them.
|
||||
merged.source_id = rep.source_id
|
||||
merged.chunk_key = rep.chunk_key
|
||||
source_ids: list[str] = []
|
||||
for a in cluster:
|
||||
for sid in [a.source_id, *(a.source_ids or [])]:
|
||||
if sid and sid not in source_ids:
|
||||
source_ids.append(sid)
|
||||
merged.source_ids = source_ids
|
||||
# popularity_score++ per merged duplicate (plan §4)
|
||||
merged.popularity_score = max(a.popularity_score for a in cluster) + (len(cluster) - 1)
|
||||
return merged
|
||||
@@ -313,6 +334,108 @@ def apply_review_decisions(
|
||||
return kept, stats
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# step 5b — enrichment overlay (plan Part B)
|
||||
# --------------------------------------------------------------------------
|
||||
# Translation / inferred-filter fields written by run_enrichment.py. Applied
|
||||
# AFTER dedup + review decisions, keyed on the same stable content_key, so the
|
||||
# overlay survives rebuilds as long as extraction text is frozen.
|
||||
# Enrichment-entry keys that apply_enrichment copies onto the activity as
# stripped strings, and only when the value is a non-blank str.
_ENRICHMENT_TEXT_FIELDS = ("name_ro", "description_ro", "rules_ro", "variations_ro")
|
||||
# Enrichment-entry keys that apply_enrichment coerces with int() before
# writing; None values and values that fail coercion are skipped.
_ENRICHMENT_INT_FIELDS = (
|
||||
"participants_min", "participants_max",
|
||||
"duration_min", "duration_max",
|
||||
"age_group_min", "age_group_max",
|
||||
)
|
||||
|
||||
|
||||
def load_enrichment(path: Optional[Path]) -> dict:
    """Load the enrichment overlay (flat map content_key -> field dict).

    Args:
        path: Location of the enrichment JSON file (e.g. data/enrichment.json).
            ``None`` or a path that is not an existing file yields ``{}``.

    Returns:
        The parsed top-level JSON object, or ``{}`` when the file is absent,
        unreadable, not valid UTF-8 JSON, or its top level is not an object.
        Loading is best-effort and never raises for a bad file, but every
        unusable-file case is reported on stderr so a rebuild cannot lose its
        overlay silently.
    """
    if not path or not path.is_file():
        # A missing overlay is the normal "enrichment not produced" case:
        # stay quiet.
        return {}

    import sys  # local import: only needed for the warning path

    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError, json.JSONDecodeError) as exc:
        # UnicodeDecodeError is listed explicitly: it is not a JSONDecodeError,
        # so a mis-encoded file would otherwise crash the caller.
        print(
            f"warning: ignoring unreadable enrichment file {path}: {exc}",
            file=sys.stderr,
        )
        return {}

    if not isinstance(data, dict):
        print(
            f"warning: ignoring enrichment file {path}: "
            f"top-level JSON is {type(data).__name__}, expected object",
            file=sys.stderr,
        )
        return {}
    return data
|
||||
|
||||
|
||||
def apply_enrichment(activities: list[Activity], enrichment: dict) -> dict:
    """
    Overlay enrichment fields onto the post-dedup activity list (plan B2).

    Each activity is looked up in `enrichment` by its content_key (built from
    the normalized name, language and description). Activities whose key has
    no dict entry are left untouched. For a matched activity:

    * text twins (`_ENRICHMENT_TEXT_FIELDS`) are written, stripped, when the
      entry holds a non-blank string;
    * numeric fields (`_ENRICHMENT_INT_FIELDS`) are written when the entry
      value is not None and coerces with int(); values that do not coerce
      are skipped and leave the existing value in place;
    * indoor_outdoor / space_needed are normalized to slugs and written only
      when the normalizer returns a truthy slug;
    * `estimated_fields` is replaced with the sorted string names listed in
      the entry (a bare string counts as one name; non-string items and
      non-list values are ignored).

    Translated / expanded text is NOT re-validated against the source here —
    expansion fidelity is the enrichment prompt's responsibility (plan B2
    comment).

    Args:
        activities: Activities to mutate in place.
        enrichment: Map of content_key -> field dict, as loaded from the
            enrichment JSON file.

    Returns:
        {entries, matched, orphaned, fields_stated, fields_estimated}. The two
        per-field tallies count only numeric/enum values that were actually
        written, split by whether the field is listed in the entry's
        estimated_fields.
    """
    from app.config_taxonomy import normalize_indoor_outdoor, normalize_space_needed

    enum_normalizers = (
        ("indoor_outdoor", normalize_indoor_outdoor),
        ("space_needed", normalize_space_needed),
    )

    matched_keys: set[str] = set()
    fields_stated: dict[str, int] = defaultdict(int)
    fields_estimated: dict[str, int] = defaultdict(int)

    for act in activities:
        key = content_key(
            act.normalized_name or normalize_name(act.name),
            act.language,
            act.description or "",
        )
        entry = enrichment.get(key)
        if not isinstance(entry, dict):
            continue
        matched_keys.add(key)

        # Parse estimated_fields defensively: set("abc") would explode a bare
        # string into characters, and unhashable or mixed-type items would
        # make set()/sorted() raise.
        raw_estimated = entry.get("estimated_fields") or []
        if isinstance(raw_estimated, str):
            raw_estimated = [raw_estimated]
        elif not isinstance(raw_estimated, (list, tuple, set, frozenset)):
            raw_estimated = []
        estimated = {name for name in raw_estimated if isinstance(name, str)}

        # bilingual text twins
        for fld in _ENRICHMENT_TEXT_FIELDS:
            text = entry.get(fld)
            if isinstance(text, str) and text.strip():
                setattr(act, fld, text.strip())

        # Structured fields that really landed on the activity, in tally order.
        applied: list[str] = []

        # inferred / clarified structured numeric fields
        for fld in _ENRICHMENT_INT_FIELDS:
            raw = entry.get(fld)
            if raw is None:
                continue
            try:
                number = int(raw)
            except (TypeError, ValueError, OverflowError):
                # OverflowError: json.loads accepts Infinity, int(inf) raises.
                continue
            setattr(act, fld, number)
            applied.append(fld)

        # enum filters — normalized to slug, dropped if unrecognised
        for fld, normalize in enum_normalizers:
            raw = entry.get(fld)
            if raw is None:
                continue
            slug = normalize(raw)
            if slug:
                setattr(act, fld, slug)
                applied.append(fld)

        act.estimated_fields = sorted(estimated)

        # QA tally: stated vs estimated population, per field. Rejected values
        # are excluded so the report reflects what was actually populated.
        for fld in applied:
            if fld in estimated:
                fields_estimated[fld] += 1
            else:
                fields_stated[fld] += 1

    return {
        "entries": len(enrichment),
        "matched": len(matched_keys),
        "orphaned": len(enrichment) - len(matched_keys),
        "fields_stated": dict(fields_stated),
        "fields_estimated": dict(fields_estimated),
    }
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# golden-set recall (plan §7)
|
||||
# --------------------------------------------------------------------------
|
||||
@@ -390,9 +513,8 @@ def collect_activities(
|
||||
|
||||
header = data.get("header", {})
|
||||
chunk_text = find_chunk_text(json_path, header, chunks_dir)
|
||||
source_id = header.get("source_id") or chunk_key_for(json_path, header).rsplit(
|
||||
".part", 1
|
||||
)[0]
|
||||
chunk_key = chunk_key_for(json_path, header)
|
||||
source_id = header.get("source_id") or chunk_key.rsplit(".part", 1)[0]
|
||||
fallback_source = (
|
||||
source_path_for(source_id, sources_dir) or source_id or json_path.stem
|
||||
)
|
||||
@@ -409,7 +531,7 @@ def collect_activities(
|
||||
continue
|
||||
src = adict.get("source_file") or fallback_source
|
||||
raw_categories.append((adict.get("category", ""), normalize_category(adict.get("category", ""))))
|
||||
activities.append(dict_to_activity(adict, src))
|
||||
activities.append(dict_to_activity(adict, src, source_id, chunk_key))
|
||||
|
||||
if hallucinated:
|
||||
_log_hallucinations(json_path, rejected_dir, hallucinated)
|
||||
@@ -496,6 +618,7 @@ def rebuild(
|
||||
sources_dir: Path,
|
||||
db_path: Path,
|
||||
decisions_path: Optional[Path] = None,
|
||||
enrichment_path: Optional[Path] = None,
|
||||
schema_path: Path = DEFAULT_SCHEMA_PATH,
|
||||
golden_dir: Optional[Path] = None,
|
||||
do_swap: bool = True,
|
||||
@@ -517,6 +640,11 @@ def rebuild(
|
||||
decisions = load_review_decisions(Path(decisions_path)) if decisions_path else {}
|
||||
final, decision_stats = apply_review_decisions(deduped, decisions)
|
||||
|
||||
# Enrichment overlay — applied immediately after review decisions, on the
|
||||
# post-dedup list, keyed on the same stable content_key (plan B2).
|
||||
enrichment = load_enrichment(Path(enrichment_path)) if enrichment_path else {}
|
||||
enrichment_stats = apply_enrichment(final, enrichment)
|
||||
|
||||
try:
|
||||
write_database(db_tmp_path, final)
|
||||
backup = atomic_swap(db_tmp_path, db_path) if do_swap else None
|
||||
@@ -529,6 +657,7 @@ def rebuild(
|
||||
**collected,
|
||||
"dedup": dedup_stats,
|
||||
"decisions": decision_stats,
|
||||
"enrichment": enrichment_stats,
|
||||
"final_count": len(final),
|
||||
"backup": str(backup) if backup else None,
|
||||
"swapped": do_swap,
|
||||
@@ -579,6 +708,16 @@ def print_report(report: dict) -> None:
|
||||
f"(auto-merged {d['auto_merged']}, borderline {d['borderline']})")
|
||||
print(f"review decisions : dropped {report['decisions']['dropped']}, "
|
||||
f"resolved {report['decisions']['resolved']}")
|
||||
enr = report.get("enrichment")
|
||||
if enr and enr.get("entries"):
|
||||
print(f"enrichment : {enr['entries']} entries "
|
||||
f"(matched {enr['matched']}, orphaned {enr['orphaned']})")
|
||||
stated, estimated = enr.get("fields_stated", {}), enr.get("fields_estimated", {})
|
||||
all_fields = sorted(set(stated) | set(estimated))
|
||||
if all_fields:
|
||||
print(" field population : (stated / estimated)")
|
||||
for fld in all_fields:
|
||||
print(f" {fld:<18}: {stated.get(fld, 0)} / {estimated.get(fld, 0)}")
|
||||
print(f"final inserted : {report['final_count']}")
|
||||
print(f"% with rules : {qa['pct_with_rules']}")
|
||||
print(f"needs_review rows : {qa['needs_review']}")
|
||||
@@ -615,6 +754,7 @@ def main(argv: Optional[list[str]] = None) -> int:
|
||||
parser.add_argument("--sources", default="data/sources")
|
||||
parser.add_argument("--db", default="data/activities.db")
|
||||
parser.add_argument("--decisions", default="data/review_decisions.json")
|
||||
parser.add_argument("--enrichment", default="data/enrichment.json")
|
||||
parser.add_argument("--golden", default="data/golden")
|
||||
parser.add_argument("--schema", default=str(DEFAULT_SCHEMA_PATH))
|
||||
args = parser.parse_args(argv)
|
||||
@@ -628,6 +768,7 @@ def main(argv: Optional[list[str]] = None) -> int:
|
||||
sources_dir=Path(args.sources),
|
||||
db_path=Path(args.db),
|
||||
decisions_path=Path(args.decisions),
|
||||
enrichment_path=Path(args.enrichment),
|
||||
schema_path=Path(args.schema),
|
||||
golden_dir=Path(args.golden),
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user