feat(memory): hybrid retrieval — navigation index.md + RAG hardening
Expose a navigation layer to the agent and harden RAG, after analyzing the OKF note and testing on the real KB.

- memory_search.search(): dedupe to the best chunk per file (a relevant note can no longer be buried by another file's chunks), plus a keyword fallback tagged degraded:True when Ollama is unreachable (no more hard crash).
- update_notes_index.py: emit a per-folder index.md plus a root router; prune empty folders; fix a latent subcategory->project bug.
- Exclude generated index.md files from the RAG rglob (reindex/incremental), the indexer scans, and the heartbeat freshness check (prevents self-pollution / reindex thrash).
- CLAUDE.md: reframe memory as hybrid (navigation first, RAG for fuzzy recall).
- Delete the stale orphan kb/youtube/index.json; correct the OKF source note.
- Tests: dedup, keyword fallback, index.md exclusion.

Plan + review in docs/.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -316,6 +316,10 @@ def _check_kb_index() -> str | None:
|
||||
|
||||
newer = 0
|
||||
for md in kb_dir.rglob("*.md"):
|
||||
# Skip generated nav files — they're written by the reindex itself, so
|
||||
# comparing them against index.json mtime would cause perpetual reindex.
|
||||
if md.name == "index.md":
|
||||
continue
|
||||
if md.stat().st_mtime > index_mtime:
|
||||
newer += 1
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ Uses Ollama all-minilm embeddings stored in SQLite for cosine similarity search.
|
||||
|
||||
import logging
|
||||
import math
|
||||
import re
|
||||
import sqlite3
|
||||
import struct
|
||||
from datetime import datetime, timezone
|
||||
@@ -62,6 +63,11 @@ def init_config(config=None) -> None:
|
||||
init_config()
|
||||
|
||||
|
||||
def _is_indexable(md_file: Path) -> bool:
|
||||
"""Skip generated navigation files so they aren't embedded as if they were notes."""
|
||||
return md_file.name != "index.md"
|
||||
|
||||
|
||||
def get_db() -> sqlite3.Connection:
|
||||
"""Get SQLite connection, create table if needed."""
|
||||
DB_PATH.parent.mkdir(parents=True, exist_ok=True)
|
||||
@@ -211,6 +217,8 @@ def reindex() -> dict:
|
||||
files_count = 0
|
||||
chunks_count = 0
|
||||
for md_file in sorted(MEMORY_DIR.rglob("*.md")):
|
||||
if not _is_indexable(md_file):
|
||||
continue
|
||||
try:
|
||||
n = index_file(md_file)
|
||||
files_count += 1
|
||||
@@ -242,6 +250,8 @@ def incremental_index() -> dict:
|
||||
files_indexed = 0
|
||||
chunks_total = 0
|
||||
for md_file in sorted(MEMORY_DIR.rglob("*.md")):
|
||||
if not _is_indexable(md_file):
|
||||
continue
|
||||
rel_path = str(md_file.relative_to(MEMORY_DIR))
|
||||
file_mtime = datetime.fromtimestamp(
|
||||
md_file.stat().st_mtime, tz=timezone.utc
|
||||
@@ -264,9 +274,55 @@ def incremental_index() -> dict:
|
||||
return {"indexed": files_indexed, "chunks": chunks_total}
|
||||
|
||||
|
||||
def _keyword_fallback(query: str, top_k: int = 5) -> list[dict]:
|
||||
"""Keyword search over indexed chunks. Used when the embedding backend is down.
|
||||
|
||||
Returns the same shape as search() plus "degraded": True so callers can
|
||||
tell the user that semantic recall was unavailable. Ranks best-chunk-per-file
|
||||
by raw term-occurrence count.
|
||||
"""
|
||||
terms = [t for t in re.findall(r"\w+", query.lower()) if len(t) > 2]
|
||||
|
||||
conn = get_db()
|
||||
try:
|
||||
rows = conn.execute("SELECT file_path, chunk_text FROM chunks").fetchall()
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
best: dict[str, dict] = {}
|
||||
for file_path, chunk_text in rows:
|
||||
low = chunk_text.lower()
|
||||
hits = sum(low.count(t) for t in terms) if terms else 0
|
||||
if hits == 0:
|
||||
continue
|
||||
cur = best.get(file_path)
|
||||
if cur is None or hits > cur["score"]:
|
||||
best[file_path] = {
|
||||
"file": file_path,
|
||||
"chunk": chunk_text,
|
||||
"score": float(hits),
|
||||
"degraded": True,
|
||||
}
|
||||
|
||||
scored = sorted(best.values(), key=lambda x: x["score"], reverse=True)
|
||||
return scored[:top_k]
|
||||
|
||||
|
||||
def search(query: str, top_k: int = 5) -> list[dict]:
|
||||
"""Search for query. Returns list of {"file": str, "chunk": str, "score": float}."""
|
||||
query_embedding = get_embedding(query)
|
||||
"""Search for query. Returns list of {"file": str, "chunk": str, "score": float}.
|
||||
|
||||
Results are deduped to the best-scoring chunk per file, so a relevant note
|
||||
can't be buried by another file contributing several chunks. If the embedding
|
||||
backend (Ollama) is unreachable, falls back to keyword search and tags each
|
||||
result with "degraded": True instead of raising.
|
||||
"""
|
||||
try:
|
||||
query_embedding = get_embedding(query)
|
||||
except ConnectionError as e:
|
||||
log.warning(
|
||||
"Embedding backend unavailable (%s); falling back to keyword search", e
|
||||
)
|
||||
return _keyword_fallback(query, top_k)
|
||||
|
||||
conn = get_db()
|
||||
try:
|
||||
@@ -279,11 +335,13 @@ def search(query: str, top_k: int = 5) -> list[dict]:
|
||||
if not rows:
|
||||
return []
|
||||
|
||||
scored = []
|
||||
best: dict[str, dict] = {}
|
||||
for file_path, chunk_text, emb_blob in rows:
|
||||
emb = deserialize_embedding(emb_blob)
|
||||
score = cosine_similarity(query_embedding, emb)
|
||||
scored.append({"file": file_path, "chunk": chunk_text, "score": score})
|
||||
cur = best.get(file_path)
|
||||
if cur is None or score > cur["score"]:
|
||||
best[file_path] = {"file": file_path, "chunk": chunk_text, "score": score}
|
||||
|
||||
scored.sort(key=lambda x: x["score"], reverse=True)
|
||||
scored = sorted(best.values(), key=lambda x: x["score"], reverse=True)
|
||||
return scored[:top_k]
|
||||
|
||||
Reference in New Issue
Block a user