- Move and reorganize project folders - Update paths in TOOLS.md - Sync agent configurations - 79 files updated
290 lines · 10 KiB · Python
#!/usr/bin/env python3
"""
Generates index.json for the KB from the .md files.
Scans: kb/, memory/, conversations/
Extracts the title, date, tags, and domains (@work, @health, etc.)
"""

import re
import json
from pathlib import Path
from datetime import datetime

BASE_DIR = Path(__file__).parent.parent
KB_ROOT = BASE_DIR / "kb"
MEMORY_DIR = BASE_DIR / "memory"
CONVERSATIONS_DIR = BASE_DIR / "conversations"
INDEX_FILE = KB_ROOT / "index.json"

# Agent domains
VALID_DOMAINS = ['work', 'health', 'growth', 'sprijin', 'scout']

# Special types (for grup-sprijin etc.)
VALID_TYPES = ['exercitiu', 'meditatie', 'reflectie', 'intrebare', 'fisa', 'project', 'memory', 'conversation', 'coaching']

# Cache for rules files, keyed by directory path
_rules_cache = {}

def load_rules(filepath):
    """Loads the rules from a .rules.json in the file's directory or its parents."""
    dir_path = filepath.parent

    # Check cache
    if str(dir_path) in _rules_cache:
        return _rules_cache[str(dir_path)]

    # Look for .rules.json in the current dir and its parents (up to kb/)
    rules = {
        "defaultDomains": [],
        "defaultTypes": [],
        "defaultTags": [],
        "inferTypeFromFilename": False,
        "filenameTypeMap": {}
    }

    # Collect rules from all levels (child rules override parent)
    rules_chain = []
    current = dir_path
    while current == KB_ROOT or KB_ROOT in current.parents:
        rules_file = current / ".rules.json"
        if rules_file.exists():
            try:
                with open(rules_file, 'r', encoding='utf-8') as f:
                    rules_chain.insert(0, json.load(f))  # Parent first
            except (OSError, ValueError):
                pass  # Skip unreadable or invalid rules files
        current = current.parent

    # Merge rules (child overrides parent; lists are merged)
    for r in rules_chain:
        for key in rules:
            if key in r:
                if isinstance(rules[key], list):
                    # Extend lists (don't override)
                    rules[key] = list(set(rules[key] + r[key]))
                else:
                    rules[key] = r[key]

    _rules_cache[str(dir_path)] = rules
    return rules
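
# Illustrative .rules.json (the key names match the defaults in load_rules above;
# the values and the location are made up):
#
#   kb/projects/grup-sprijin/.rules.json
#   {
#     "defaultDomains": ["sprijin"],
#     "defaultTypes": ["fisa"],
#     "defaultTags": ["grup-sprijin"],
#     "inferTypeFromFilename": true,
#     "filenameTypeMap": {"meditatie": "meditatie", "exercitiu": "exercitiu"}
#   }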

def extract_metadata(filepath, category, subcategory=None):
    """Extracts metadata from a markdown file."""
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Extract the title (first line starting with #)
    title_match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
    title = title_match.group(1) if title_match else filepath.stem

    # Extract tags (the line with **Tags:** or Tags:)
    tags = []
    domains = []
    types = []
    tags_match = re.search(r'\*\*Tags?:\*\*\s*(.+)$|^Tags?:\s*(.+)$', content, re.MULTILINE | re.IGNORECASE)
    if tags_match:
        tags_str = tags_match.group(1) or tags_match.group(2)

        # Extract domains (@work, @health, etc.)
        domain_matches = re.findall(r'@(\w+)', tags_str)
        domains = [d for d in domain_matches if d in VALID_DOMAINS]
        types = [d for d in domain_matches if d in VALID_TYPES]

        # Extract regular tags (#tag)
        all_tags = re.findall(r'#([\w-]+)', tags_str)
        tags = [t for t in all_tags if t not in VALID_DOMAINS and t not in VALID_TYPES]

    # Apply rules from .rules.json (if present)
    rules = load_rules(filepath)

    # Add default domains (if not already present)
    for d in rules.get("defaultDomains", []):
        if d not in domains:
            domains.append(d)

    # Add default types
    for t in rules.get("defaultTypes", []):
        if t not in types:
            types.append(t)

    # Add default tags
    for t in rules.get("defaultTags", []):
        if t not in tags:
            tags.append(t)

    # Infer the type from the filename (if configured)
    if rules.get("inferTypeFromFilename"):
        filename_lower = filepath.stem.lower()
        for pattern, type_name in rules.get("filenameTypeMap", {}).items():
            if pattern in filename_lower and type_name not in types:
                types.append(type_name)
                break

    # Extract the date from the filename (YYYY-MM-DD_slug.md or YYYY-MM-DD.md)
    date_match = re.match(r'(\d{4}-\d{2}-\d{2})', filepath.name)
    date = date_match.group(1) if date_match else ""

    # For files without a date in the name, use mtime
    if not date:
        mtime = filepath.stat().st_mtime
        date = datetime.fromtimestamp(mtime).strftime('%Y-%m-%d')

    # Extract the video URL
    video_match = re.search(r'\*\*(?:Video|Link):\*\*\s*(https?://[^\s]+)', content)
    video_url = video_match.group(1) if video_match else ""

    # Extract the TL;DR or the first ~200 characters of content
    tldr = ""
    tldr_match = re.search(r'##\s*📋?\s*TL;DR\s*\n+(.+?)(?=\n##|\n---|\Z)', content, re.DOTALL)
    if tldr_match:
        tldr = tldr_match.group(1).strip()[:200]
    else:
        # Fallback: the first paragraph after the title
        para_match = re.search(r'^#.+\n+(.+?)(?=\n\n|\n#|\Z)', content, re.DOTALL)
        if para_match:
            tldr = para_match.group(1).strip()[:200]
    if len(tldr) >= 200:
        tldr += "..."

    # Build the relative path for the web (served from dashboard/)
    # The dashboard has symlinks: notes-data -> ../kb, memory -> ../memory, conversations -> ../conversations
    rel_path = str(filepath.relative_to(BASE_DIR))
    # Turn kb/... into notes-data/... for the web
    if rel_path.startswith('kb/'):
        rel_path = 'notes-data/' + rel_path[3:]

    return {
        "file": rel_path,
        "title": title,
        "date": date,
        "tags": tags,
        "domains": domains,
        "types": types,
        "category": category,
        "project": subcategory,  # first level under projects/ (grup-sprijin, vending-master)
        "subdir": None,  # set in scan_directory for deeper levels
        "video": video_url,
        "tldr": tldr
    }
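
# Illustrative entry returned by extract_metadata for the example note at the top
# of the file, when called from the recursive scan (keys match the dict above,
# values are made up):
#
#   {
#     "file": "notes-data/projects/grup-sprijin/2025-01-15_intalnire-saptamanala.md",
#     "title": "Întâlnire săptămânală",
#     "date": "2025-01-15",
#     "tags": ["intalnire", "planificare"],
#     "domains": ["sprijin", "work"],
#     "types": ["fisa"],
#     "category": "projects",
#     "project": "grup-sprijin",
#     "subdir": None,
#     "video": "",
#     "tldr": "..."
#   }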

def scan_directory(dir_path, category, subcategory=None, recursive=False):
    """Scans a directory for .md files."""
    notes = []

    if not dir_path.exists():
        return notes

    # Defaults for the special categories (memory/, conversations/)
    category_defaults = {
        "memory": {"types": ["memory"], "domains": []},
        "conversations": {"types": ["conversation"], "domains": []}
    }

    if recursive:
        # Scan recursively
        for filepath in dir_path.rglob("*.md"):
            if filepath.name.startswith('.') or 'template' in filepath.name.lower():
                continue
            try:
                # Determine project and subdir from the path
                # Ex: projects/grup-sprijin/biblioteca/file.md
                #     -> project = grup-sprijin, subdir = biblioteca
                rel_to_dir = filepath.relative_to(dir_path)
                parts = rel_to_dir.parts[:-1]  # exclude the filename

                project = parts[0] if len(parts) > 0 else None
                subdir = parts[1] if len(parts) > 1 else None

                metadata = extract_metadata(filepath, category, project)
                metadata['subdir'] = subdir
                notes.append(metadata)
            except Exception as e:
                print(f"  ! Error processing {filepath}: {e}")
    else:
        # Scan only the files in this directory (no subdirectories)
        for filepath in sorted(dir_path.glob("*.md"), reverse=True):
            if filepath.name.startswith('.') or 'template' in filepath.name.lower():
                continue
            try:
                metadata = extract_metadata(filepath, category, subcategory)
                # Apply defaults for the special category
                if category in category_defaults:
                    defaults = category_defaults[category]
                    for t in defaults.get("types", []):
                        if t not in metadata["types"]:
                            metadata["types"].append(t)
                    for d in defaults.get("domains", []):
                        if d not in metadata["domains"]:
                            metadata["domains"].append(d)
                notes.append(metadata)
            except Exception as e:
                print(f"  ! Error processing {filepath}: {e}")

    return notes
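
# Illustrative calls, mirroring how generate_index uses this function below
# (the "projects" subdirectory name is just an example):
#   scan_directory(KB_ROOT / "projects", "projects", recursive=True)  # recursive kb/ category
#   scan_directory(MEMORY_DIR, "memory")                              # flat special directory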

def generate_index():
    """Generates index.json from all sources."""
    all_notes = []

    # Stats
    domain_stats = {d: 0 for d in VALID_DOMAINS}
    category_stats = {}

    # Scan ALL subdirectories of kb/ recursively
    print("Scanning kb/ (all subdirectories)...")
    for subdir in sorted(KB_ROOT.iterdir()):
        if subdir.is_dir() and not subdir.name.startswith('.'):
            category = subdir.name
            print(f"  [{category}]")
            notes = scan_directory(subdir, category, recursive=True)
            all_notes.extend(notes)
            category_stats[category] = len(notes)
            for n in notes:
                print(f"    + {n['title'][:42]}...")
                for d in n['domains']:
                    domain_stats[d] += 1

    # Scan memory/
    print("Scanning memory/...")
    memory_notes = scan_directory(MEMORY_DIR, "memory")
    all_notes.extend(memory_notes)
    category_stats["memory"] = len(memory_notes)
    for n in memory_notes:
        print(f"  + {n['title'][:45]}...")

    # Scan conversations/
    print("Scanning conversations/...")
    conv_notes = scan_directory(CONVERSATIONS_DIR, "conversations")
    all_notes.extend(conv_notes)
    category_stats["conversations"] = len(conv_notes)
    for n in conv_notes:
        print(f"  + {n['title'][:45]}...")

    # Sort by date, descending
    all_notes.sort(key=lambda x: x['date'], reverse=True)

    # Add the global metadata
    output = {
        "notes": all_notes,
        "stats": {
            "total": len(all_notes),
            "by_domain": domain_stats,
            "by_category": category_stats
        },
        "domains": VALID_DOMAINS,
        "types": VALID_TYPES,
        "categories": list(category_stats.keys())
    }

    with open(INDEX_FILE, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Generated {INDEX_FILE} with {len(all_notes)} notes")
    print(f"   Categories: {category_stats}")
    return output

if __name__ == "__main__":
    generate_index()
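
# To rebuild the index manually, run this script with Python 3 (the exact path of
# the script in the repo is not shown here, so the one below is illustrative):
#   python3 scripts/generate_index.py
# The output is written to kb/index.json (INDEX_FILE above).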