# echo-core/dashboard/handlers/youtube.py

"""YouTube subtitle-download + note-creation endpoint."""
import json
import logging
import os
import re
import subprocess
import sys
import traceback
from datetime import datetime
from pathlib import Path

import constants

log = logging.getLogger(__name__)


def _clean_vtt(content):
    """Convert VTT captions to plain text."""
    lines = []
    seen = set()
    for line in content.split('\n'):
        if any([
            line.startswith('WEBVTT'),
            line.startswith('Kind:'),
            line.startswith('Language:'),
            '-->' in line,
            line.strip().startswith('<'),
            not line.strip(),
            re.match(r'^\d+$', line.strip()),
        ]):
            continue
        clean = re.sub(r'<[^>]+>', '', line).strip()
        if clean and clean not in seen:
            seen.add(clean)
            lines.append(clean)
    return ' '.join(lines)


def _is_description_about_video(description):
    """Return True if description contains info about the video (chapters/topics)."""
    if not description or len(description.strip()) < 50:
        return False
    timestamp_pattern = re.compile(r'\b\d{1,2}:\d{2}(:\d{2})?\b')
    if len(timestamp_pattern.findall(description)) >= 3:
        return True
    lines = description.strip().split('\n')
    bullet_lines = [l for l in lines if re.match(r'^\s*[◼•\-\*▶►]\s+\S', l)]
    if len(bullet_lines) >= 3:
        return True
    numbered_lines = [l for l in lines if re.match(r'^\s*\d+[\.\)]\s+\S', l)]
    if len(numbered_lines) >= 3:
        return True
    return False


def _extract_relevant_description(description):
    """Strip promotional tails (links, social media) from description."""
    if not description:
        return ""
    promo_patterns = [
        re.compile(r'https?://\S+'),
        re.compile(r'instagram|twitter|facebook|tiktok|linkedin|patreon|spotify', re.I),
        re.compile(r'follow|subscribe|newsletter|merch|sponsor|affiliate', re.I),
        re.compile(r'purchase|buy|order|shop|store', re.I),
    ]
    result_lines = []
    promo_streak = 0
    for line in description.strip().split('\n'):
        stripped = line.strip()
        is_promo = any(p.search(stripped) for p in promo_patterns)
        if is_promo:
            promo_streak += 1
            if promo_streak >= 2:
                break
        else:
            promo_streak = 0
            result_lines.append(line)
    while result_lines and not result_lines[-1].strip():
        result_lines.pop()
    return '\n'.join(result_lines)


# Instruction preamble (written in Romanian) that _analyze_with_claude places
# ahead of the video title, description and transcript. It asks for a Romanian
# Markdown note made of a TL;DR, key points, memorable quotes, actionable ideas
# and thematic prose sections, without metadata, starting directly at "## TL;DR".
# This text is sent to the model verbatim, so any edit changes the output.
ANALYSIS_PROMPT = """\
Ai primit transcriptul unui video YouTube și descrierea lui. Scrie o notiță KB în română, format Markdown.

Structura notei (în ordine):
1. ## TL;DR — un paragraf de 3-5 rânduri care surprinde esența
2. ## Puncte cheie — 6-10 puncte concise (pot fi bullets, dar scurte și dense)
3. ## Quote-uri memorabile — 4-6 citate directe din transcript, în limba originală, între ghilimele
4. ## Idei acționabile — 4-8 lucruri concrete pe care cititorul le poate face
5. Secțiuni tematice cu ## heading — câte teme apar natural, în proze curgătoare (NU bullets), fiecare cu conținut real din transcript: cifre, exemple, mecanisme, argumente

Nu scrie metadate (titlu, url, tags, dată) — vor fi adăugate separat.
Nu scrie fraze introductive despre tine sau despre video. Începe direct cu ## TL;DR.
Scrie în română. Citatele rămân în engleză dacă sursa e engleză.
"""


def _analyze_with_claude(title, description, transcript):
    """Generate a Markdown analysis of a video by running ``claude -p``.

    The prompt is ANALYSIS_PROMPT followed by the title, an optional
    description section and the transcript.

    Args:
        title: Video title, embedded in the prompt.
        description: Video description. Only the first 3000 characters are
            sent; the section is omitted entirely when this is empty.
        transcript: Plain-text transcript. Only the first 40000 characters
            are sent.

    Returns:
        The stripped stdout of the CLI on success, or None when the CLI
        cannot be started, exceeds the 300 second timeout, exits with a
        non-zero status or prints nothing. Returning None (instead of
        raising) lets the caller fall back to saving the raw transcript.
    """
    claude_bin = os.path.expanduser('~/.local/bin/claude')
    if not os.path.exists(claude_bin):
        # Not installed in the user-local location: rely on PATH lookup.
        claude_bin = 'claude'

    desc_section = ""
    if description:
        desc_section = f"DESCRIERE VIDEO:\n{description[:3000]}\n\n"

    prompt = (
        f"{ANALYSIS_PROMPT}\n\n"
        f"TITLU: {title}\n\n"
        f"{desc_section}"
        f"TRANSCRIPT (primele 40000 caractere):\n{transcript[:40000]}"
    )

    try:
        result = subprocess.run(
            [claude_bin, '-p', prompt],
            capture_output=True, text=True, timeout=300,
        )
    except (OSError, subprocess.TimeoutExpired) as exc:
        # Missing/unrunnable binary or a hung CLI is an analysis failure,
        # not a reason to abort the whole note creation.
        log.warning("Claude analysis failed: %s", exc)
        return None

    output = result.stdout.strip()
    if result.returncode == 0 and output:
        return output
    log.warning("Claude analysis failed: %s", result.stderr[:300])
    return None


def _download_transcript(yt_dlp, url):
    """Download English auto-generated captions for *url* as plain text.

    Args:
        yt_dlp: Path or name of the yt-dlp executable.
        url: Video URL. It is passed after ``--`` so it can never be parsed
            as a yt-dlp option.

    Returns:
        The cleaned text of the first ``.vtt`` file yt-dlp wrote, or None
        when no caption file was produced.

    Raises:
        subprocess.TimeoutExpired: If yt-dlp runs longer than 120 seconds.
        OSError: If the yt-dlp executable cannot be started.
    """
    import tempfile

    # A private directory per call: concurrent requests cannot delete or pick
    # up each other's caption files, and the directory is removed on exit.
    with tempfile.TemporaryDirectory(prefix='yt_subs_') as tmp:
        temp_dir = Path(tmp)
        subprocess.run([
            yt_dlp, '--write-auto-subs', '--sub-langs', 'en',
            '--skip-download', '--sub-format', 'vtt',
            '-o', str(temp_dir / '%(id)s'),
            '--', url,
        ], capture_output=True, timeout=120)

        for sub_file in sorted(temp_dir.glob('*.vtt')):
            content = sub_file.read_text(encoding='utf-8', errors='replace')
            return _clean_vtt(content)
    return None


def _process_youtube(url):
    """Download subtitles for *url*, write a Markdown note and index it.

    Steps: fetch video metadata with yt-dlp, download and clean the English
    auto-generated captions, ask Claude for an analysis (falling back to the
    first 15000 characters of the raw transcript when that returns nothing),
    write the note into ``constants.NOTES_DIR``, run ``update_notes_index.py``
    and finally try to index the note via ``src.memory_search.index_file``.

    Args:
        url: Video URL to process.

    Returns:
        The file name of the created note, or None when the metadata lookup
        fails or no subtitles are available.

    Raises:
        subprocess.TimeoutExpired: If a yt-dlp invocation times out.
        OSError: If yt-dlp cannot be started or the note cannot be written.
        ValueError: If yt-dlp's metadata output is not valid JSON.
    """
    yt_dlp = os.path.expanduser('~/.local/bin/yt-dlp')
    if not os.path.exists(yt_dlp):
        # Same fallback as the claude binary lookup: rely on PATH.
        yt_dlp = 'yt-dlp'

    result = subprocess.run(
        [yt_dlp, '--dump-json', '--no-download', '--', url],
        capture_output=True, text=True, timeout=30,
    )
    if result.returncode != 0:
        print(f"Failed to get video info: {result.stderr}")
        return None

    info = json.loads(result.stdout)
    # Keys may be present with a null value, so `.get(key, default)` alone
    # is not enough to guarantee usable types.
    title = info.get('title') or 'Unknown'
    description = info.get('description') or ''
    try:
        # Coerce to int so the `//`, `%` and `:02d` formatting below cannot
        # fail on a missing or fractional duration.
        duration = int(info.get('duration') or 0)
    except (TypeError, ValueError):
        duration = 0

    transcript = _download_transcript(yt_dlp, url)
    if not transcript:
        print("No subtitles found")
        return None

    date_str = datetime.now().strftime('%Y-%m-%d')
    slug = re.sub(r'[^\w\s-]', '', title.lower())[:50].strip().replace(' ', '-')
    if not slug:
        # Titles made only of punctuation/emoji would otherwise produce
        # "<date>_.md"; use the video id instead.
        slug = re.sub(r'[^\w-]', '', str(info.get('id') or '')) or 'video'
    filename = f"{date_str}_{slug}.md"

    # Description block: only kept when it looks like chapters/topics.
    desc_block = ""
    if _is_description_about_video(description):
        relevant_desc = _extract_relevant_description(description)
        if relevant_desc:
            desc_block = f"\n## Descriere / Index\n\n{relevant_desc}\n\n---\n"

    # Claude analysis: TL;DR + key points + quotes + thematic prose sections.
    print("Running Claude analysis...")
    analysis = _analyze_with_claude(title, description, transcript)

    if analysis:
        tags = "#youtube"
        body = analysis
    else:
        # Fallback: save the raw transcript and tag the note for later
        # summarization.
        tags = "#youtube #to-summarize"
        body = f"## Transcript\n\n{transcript[:15000]}"

    note_content = (
        f"# {title}\n\n"
        f"**Video:** {url}\n"
        f"**Duration:** {duration // 60}:{duration % 60:02d}\n"
        f"**Saved:** {date_str}\n"
        f"**Tags:** {tags}\n\n"
        f"---\n"
        f"{desc_block}\n"
        f"{body}\n"
    )

    constants.NOTES_DIR.mkdir(parents=True, exist_ok=True)
    note_path = constants.NOTES_DIR / filename
    note_path.write_text(note_content, encoding='utf-8')

    index_proc = subprocess.run(
        [sys.executable, str(constants.TOOLS_DIR / 'update_notes_index.py')],
        capture_output=True,
    )
    if index_proc.returncode != 0:
        # Best effort: the note itself is already saved.
        log.warning(
            "update_notes_index.py failed (%d): %s",
            index_proc.returncode,
            index_proc.stderr[:300].decode('utf-8', errors='replace'),
        )

    # Index the new note with Ollama semantic embeddings (best effort).
    try:
        base_dir = str(constants.BASE_DIR)
        if base_dir not in sys.path:
            # Insert once; inserting on every call grows sys.path unboundedly.
            sys.path.insert(0, base_dir)
        from src.memory_search import index_file
        n = index_file(note_path)
        log.info("Ollama indexed %s (%d chunks)", filename, n)
    except Exception as e:
        log.warning("Ollama indexing failed for %s: %s", filename, e)

    print(f"Created note: {filename}")
    return filename


class YoutubeHandlers:
    """Mixin for /api/youtube.

    The host class must provide ``headers``, ``rfile`` and
    ``send_json(payload, status)``.
    """

    def handle_youtube(self):
        """Process a YouTube URL posted as JSON: download subs, save a note.

        Reads a JSON body of the form ``{"url": "..."}`` and responds via
        ``send_json``:

        * 400 when the body is not valid JSON or the URL is missing, does
          not mention youtube.com / youtu.be, or starts with ``-``.
        * 500 with ``status: error`` when processing raises or no note
          could be created.
        * ``status: done`` when a note was written.
        """
        try:
            try:
                content_length = int(self.headers.get('Content-Length') or 0)
                post_data = self.rfile.read(content_length).decode('utf-8')
                data = json.loads(post_data)
            except (TypeError, ValueError) as e:
                # Bad Content-Length, undecodable bytes or malformed JSON are
                # client errors (UnicodeDecodeError and JSONDecodeError are
                # both ValueError subclasses).
                self.send_json({'error': str(e)}, 400)
                return

            raw_url = data.get('url') if isinstance(data, dict) else None
            url = raw_url.strip() if isinstance(raw_url, str) else ''

            # A leading '-' would be read as a command-line option by the
            # downloader subprocess, so it is never a valid URL here.
            if (
                not url
                or url.startswith('-')
                or ('youtube.com' not in url and 'youtu.be' not in url)
            ):
                self.send_json({'error': 'URL YouTube invalid'}, 400)
                return

            try:
                print(f"Processing YouTube URL: {url}")
                filename = _process_youtube(url)
            except Exception as e:
                print(f"YouTube processing error: {e}")
                traceback.print_exc()
                self.send_json({'status': 'error', 'message': f'Eroare: {str(e)}'}, 500)
                return

            if not filename:
                # _process_youtube returns None when the video info could not
                # be fetched or no subtitles exist: nothing was written, so
                # do not report success.
                self.send_json({
                    'status': 'error',
                    'message': 'Eroare: notița nu a putut fi creată '
                               '(video indisponibil sau fără subtitrări).',
                }, 500)
                return

            self.send_json({
                'status': 'done',
                'message': 'Notița a fost creată! Refresh pagina Notes.',
            })
        except Exception as e:
            self.send_json({'error': str(e)}, 500)