#!/usr/bin/env python3
"""
Content Discovery - automatic content search based on interests.
Runs at night and prepares notes for the morning report.

Usage: python3 content_discovery.py [--dry-run]
"""
import os
import json
import re
from datetime import datetime, timedelta
from pathlib import Path

# Workspace layout: this script lives one directory below the workspace root,
# so the root is two levels up from this file.
WORKSPACE = Path(__file__).parent.parent
MEMORY_DIR = WORKSPACE / "memory"
INSIGHTS_DIR = WORKSPACE / "kb" / "insights"
USER_MD = WORKSPACE / "USER.md"  # NOTE(review): not used in this file — presumably read elsewhere

# Base interests (fallback when little recent activity is found)
BASE_INTERESTS = [
    "NLP Sleight of Mouth patterns",
    "comunicare nonviolentă Marshall Rosenberg",
    "James Clear atomic habits productivity",
    "Monica Ion mindset antreprenor",
    "dezvoltare personală coaching",
    "Rumi quotes wisdom philosophy",
    "stoicism practical philosophy",
    "noua medicină germanică",
    "post negru fasting benefits",
    "80/20 principle productivity",
    "leadership entrepreneurship",
]


def get_recent_files(directory: Path, days: int = 3) -> list:
    """Return the .md files in *directory* modified in the last *days* days,
    newest first. A non-existent directory yields an empty list."""
    cutoff = (datetime.now() - timedelta(days=days)).timestamp()
    files = []
    if directory.exists():
        files = [f for f in directory.glob("*.md") if f.stat().st_mtime > cutoff]
    return sorted(files, key=lambda x: x.stat().st_mtime, reverse=True)


def extract_topics_from_file(filepath: Path) -> list:
    """Extract candidate topic strings from a markdown file.

    Sources, in order: section headers (# / ##), YouTube video titles
    (top-level #), @tags, and short bold terms. Returns a de-duplicated
    list in first-seen order.
    """
    topics = []
    try:
        content = filepath.read_text(encoding='utf-8')
        # Extract from headers (# or ##)
        headers = re.findall(r'^##?\s+(.+)$', content, re.MULTILINE)
        topics.extend(headers[:5])
        # Extract YouTube video titles (top-level # headers)
        yt_titles = re.findall(r'^#\s+(.+)$', content, re.MULTILINE)
        topics.extend(yt_titles[:3])
        # Extract @tags
        tags = re.findall(r'@(\w+)', content)
        topics.extend(tags[:5])
        # Extract bold terms, skipping very long ones
        bold = re.findall(r'\*\*([^*]+)\*\*', content)
        topics.extend([b for b in bold if len(b) < 50][:5])
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole scan.
        print(f" Warning: Could not read {filepath}: {e}")
    # FIX: dict.fromkeys preserves first-seen order; list(set(...)) made the
    # downstream [:N] topic selection nondeterministic between runs.
    return list(dict.fromkeys(topics))


def _scan_directory(directory: Path, label: str) -> list:
    """Print *label*, scan *directory* for recent files, and collect topics."""
    collected = []
    print(label)
    for f in get_recent_files(directory, days=3):
        topics = extract_topics_from_file(f)
        collected.extend(topics)
        print(f" {f.name}: {len(topics)} topics")
    return collected


def get_recent_topics() -> list:
    """Analyze recent memory, insights and YouTube notes to find current interests."""
    recent_topics = []
    recent_topics += _scan_directory(MEMORY_DIR, "Scanning recent memory...")
    recent_topics += _scan_directory(INSIGHTS_DIR, "Scanning recent insights...")
    recent_topics += _scan_directory(WORKSPACE / "kb" / "youtube",
                                     "Scanning recent YouTube notes...")
    # Order-preserving de-duplication (see extract_topics_from_file).
    return list(dict.fromkeys(recent_topics))


def build_search_queries(recent_topics: list, base_interests: list) -> list:
    """Build up to five search queries: ~60% from recent topics, ~40% from
    base interests rotated by day of year.

    Returns a list of dicts with keys "query", "source" ("recent"/"base")
    and "topic".
    """
    queries = []

    # Keep only reasonably sized topic strings.
    recent_clean = [t for t in recent_topics if 3 < len(t) < 100][:10]

    # Up to 3 queries from recent activity (60% share when available).
    for topic in recent_clean[:3]:
        queries.append({
            "query": f"{topic} YouTube tutorial",
            "source": "recent",
            "topic": topic,
        })

    # Up to 2 queries from base interests, rotated daily so the same
    # interests are not searched every night.
    # FIX: guard against an empty interest list (modulo by zero).
    if base_interests:
        offset = datetime.now().timetuple().tm_yday % len(base_interests)
        rotated_base = base_interests[offset:] + base_interests[:offset]
        for interest in rotated_base[:2]:
            queries.append({
                "query": interest,
                "source": "base",
                "topic": interest,
            })

    return queries[:5]  # Max 5 queries


def save_discovery_plan(queries: list) -> Path:
    """Save the discovery plan as JSON for the agent to execute; return its path."""
    plan = {
        "generated_at": datetime.now().isoformat(),
        "queries": queries,
        "status": "pending",
        "results": [],
    }
    # Use the shared MEMORY_DIR constant instead of rebuilding the path.
    plan_file = MEMORY_DIR / "content-discovery-plan.json"
    with open(plan_file, 'w', encoding='utf-8') as f:
        json.dump(plan, f, indent=2, ensure_ascii=False)
    print(f"\n✅ Plan saved to {plan_file}")
    return plan_file


def main(dry_run: bool = False):
    """Entry point: scan recent notes, build queries, and (unless *dry_run*)
    persist the plan for the agent to execute."""
    print("=" * 50)
    print("🔍 Content Discovery - Pregătire căutare")
    print(f" Data: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    print("=" * 50)

    # 1. Gather topics from recent activity
    recent_topics = get_recent_topics()
    print(f"\n📌 Topics recente găsite: {len(recent_topics)}")
    if recent_topics:
        print(f" Exemple: {recent_topics[:5]}")

    # 2. Build search queries
    queries = build_search_queries(recent_topics, BASE_INTERESTS)
    print(f"\n🔎 Queries generate: {len(queries)}")
    for i, q in enumerate(queries, 1):
        print(f" {i}. [{q['source']}] {q['query'][:60]}...")

    if dry_run:
        print("\n⚠️ DRY RUN - nu salvez planul")
        return

    # 3. Save plan for agent execution (path is printed by the saver).
    save_discovery_plan(queries)
    print("\n📋 Următorul pas:")
    print(" Agentul va citi planul și va executa căutările")
    print(" Rezultatele vor fi în morning report")


if __name__ == "__main__":
    import sys
    dry_run = "--dry-run" in sys.argv
    main(dry_run)