clawd/tools/update_notes_index.py

#!/usr/bin/env python3
"""
Generează index.json pentru notes din fișierele .md
Extrage titlu, dată, tags, și domenii (@work, @health, etc.)
Scanează TOATE subdirectoarele din notes/ (youtube, retete, etc.)
"""
import re
import json
from pathlib import Path

NOTES_ROOT = Path(__file__).parent.parent / "notes"
INDEX_FILE = NOTES_ROOT / "index.json"

# Subdirectories to scan (add more here)
SCAN_DIRS = ['youtube', 'retete']

# Agent domains
VALID_DOMAINS = ['work', 'health', 'growth', 'sprijin', 'scout']
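
# A hypothetical note in the shape this parser expects (illustrative only,
# not taken from the repo; real notes may order these fields differently):
#
#   # How to batch-cook lentils
#   **Video:** https://example.com/watch?v=abc123
#   **Tags:** #cooking #meal-prep @health
#
#   ## 📋 TL;DR
#   Soak, simmer 20 minutes, portion into containers.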


def extract_metadata(filepath):
    """Extract metadata from a markdown file."""
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Extract the title (first heading line starting with #)
    title_match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
    title = title_match.group(1) if title_match else filepath.stem

    # Extract tags (a line with **Tags:** or Tags:)
    tags = []
    domains = []
    tags_match = re.search(r'\*\*Tags?:\*\*\s*(.+)$|^Tags?:\s*(.+)$', content, re.MULTILINE | re.IGNORECASE)
    if tags_match:
        tags_str = tags_match.group(1) or tags_match.group(2)
        # Extract domains (@work, @health, etc.)
        domain_matches = re.findall(r'@(\w+)', tags_str)
        domains = [d for d in domain_matches if d in VALID_DOMAINS]
        # Extract regular tags (#tag), excluding the domains
        all_tags = re.findall(r'#([\w-]+)', tags_str)
        tags = [t for t in all_tags if t not in VALID_DOMAINS]

    # Extract the date from the filename (YYYY-MM-DD_slug.md)
    date_match = re.match(r'(\d{4}-\d{2}-\d{2})_', filepath.name)
    date = date_match.group(1) if date_match else ""

    # Extract the video URL
    video_match = re.search(r'\*\*(?:Video|Link):\*\*\s*(https?://[^\s]+)', content)
    video_url = video_match.group(1) if video_match else ""

    # Extract the TL;DR section (truncated to 200 characters)
    tldr_match = re.search(r'##\s*📋?\s*TL;DR\s*\n+(.+?)(?=\n##|\n---|\Z)', content, re.DOTALL)
    tldr = ""
    if tldr_match:
        tldr = tldr_match.group(1).strip()[:200]
        if len(tldr_match.group(1).strip()) > 200:
            tldr += "..."

    return {
        "file": filepath.name,
        "title": title,
        "date": date,
        "tags": tags,
        "domains": domains,
        "video": video_url,
        "tldr": tldr
    }
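
# For the hypothetical note shown above, saved as 2026-01-15_lentils.md,
# extract_metadata would return roughly (illustrative, not real data):
#   {"file": "2026-01-15_lentils.md", "title": "How to batch-cook lentils",
#    "date": "2026-01-15", "tags": ["cooking", "meal-prep"],
#    "domains": ["health"], "video": "https://example.com/watch?v=abc123",
#    "tldr": "Soak, simmer 20 minutes, portion into containers."}
# Note that generate_index() later rewrites "file" to include the category
# subdirectory, e.g. "retete/2026-01-15_lentils.md".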


def generate_index():
    """Generate index.json from every .md file in every scanned subdirectory."""
    notes = []
    # Stats per domain
    domain_stats = {d: 0 for d in VALID_DOMAINS}
    # Stats per category
    category_stats = {}

    for subdir in SCAN_DIRS:
        notes_dir = NOTES_ROOT / subdir
        if not notes_dir.exists():
            print(f" (skipping {subdir}/ - not found)")
            continue
        print(f"Scanning notes/{subdir}/...")
        category_stats[subdir] = 0
        for filepath in sorted(notes_dir.glob("*.md"), reverse=True):
            if filepath.name == 'index.json':
                continue
            try:
                metadata = extract_metadata(filepath)
                # Add the category (the subdirectory name)
                metadata['category'] = subdir
                # Rewrite the file path to include the subdirectory
                metadata['file'] = f"{subdir}/{filepath.name}"
                notes.append(metadata)
                # Update stats
                category_stats[subdir] += 1
                for d in metadata['domains']:
                    domain_stats[d] += 1
                domains_str = ' '.join(f'@{d}' for d in metadata['domains']) if metadata['domains'] else ''
                print(f" + {metadata['title'][:40]}... {domains_str}")
            except Exception as e:
                print(f" ! Error processing {filepath.name}: {e}")

    # Sort by date, newest first
    notes.sort(key=lambda x: x['date'], reverse=True)

    # Add global metadata
    output = {
        "notes": notes,
        "stats": {
            "total": len(notes),
            "by_domain": domain_stats,
            "by_category": category_stats
        },
        "domains": VALID_DOMAINS,
        "categories": SCAN_DIRS
    }
    with open(INDEX_FILE, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Generated {INDEX_FILE} with {len(notes)} notes")
    print(f" Domains: {domain_stats}")
    print(f" Categories: {category_stats}")
    return output


if __name__ == "__main__":
    generate_index()
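
# A sketch of a typical run, assuming the script is invoked from the clawd/
# directory, both scan dirs exist, and only the hypothetical note above is
# present under notes/youtube/ (illustrative values throughout):
#
#   $ python3 tools/update_notes_index.py
#   Scanning notes/youtube/...
#    + How to batch-cook lentils... @health
#   Scanning notes/retete/...
#   ✅ Generated <NOTES_ROOT>/index.json with 1 notes
#    Domains: {'work': 0, 'health': 1, 'growth': 0, 'sprijin': 0, 'scout': 0}
#    Categories: {'youtube': 1, 'retete': 0}
#
# The resulting index.json would look roughly like:
#   {"notes": [{...}],
#    "stats": {"total": 1,
#              "by_domain": {"work": 0, "health": 1, "growth": 0, "sprijin": 0, "scout": 0},
#              "by_category": {"youtube": 1, "retete": 0}},
#    "domains": ["work", "health", "growth", "sprijin", "scout"],
#    "categories": ["youtube", "retete"]}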