# echo-core/dashboard/handlers/youtube.py

"""YouTube subtitle-download + note-creation endpoint."""
import json
import logging
import os
import re
import subprocess
import sys
import tempfile
import traceback
from datetime import datetime
from pathlib import Path
from urllib.parse import urlparse

import constants

log = logging.getLogger(__name__)


def _clean_vtt(content):
    """Convert VTT captions to plain text."""
    lines = []
    seen = set()
    for line in content.split('\n'):
        if any([
            line.startswith('WEBVTT'),
            line.startswith('Kind:'),
            line.startswith('Language:'),
            '-->' in line,
            line.strip().startswith('<'),
            not line.strip(),
            re.match(r'^\d+$', line.strip()),
        ]):
            continue
        clean = re.sub(r'<[^>]+>', '', line).strip()
        if clean and clean not in seen:
            seen.add(clean)
            lines.append(clean)
    return ' '.join(lines)


def _is_description_about_video(description):
    """Return True if description contains info about the video (chapters/topics)."""
    if not description or len(description.strip()) < 50:
        return False
    timestamp_pattern = re.compile(r'\b\d{1,2}:\d{2}(:\d{2})?\b')
    if len(timestamp_pattern.findall(description)) >= 3:
        return True
    lines = description.strip().split('\n')
    bullet_lines = [l for l in lines if re.match(r'^\s*[◼•\-\*▶►]\s+\S', l)]
    if len(bullet_lines) >= 3:
        return True
    numbered_lines = [l for l in lines if re.match(r'^\s*\d+[\.\)]\s+\S', l)]
    if len(numbered_lines) >= 3:
        return True
    return False


def _extract_relevant_description(description):
    """Strip promotional tails (links, social media) from description."""
    if not description:
        return ""
    promo_patterns = [
        re.compile(r'https?://\S+'),
        re.compile(r'instagram|twitter|facebook|tiktok|linkedin|patreon|spotify', re.I),
        re.compile(r'follow|subscribe|newsletter|merch|sponsor|affiliate', re.I),
        re.compile(r'purchase|buy|order|shop|store', re.I),
    ]
    result_lines = []
    promo_streak = 0
    for line in description.strip().split('\n'):
        stripped = line.strip()
        is_promo = any(p.search(stripped) for p in promo_patterns)
        if is_promo:
            promo_streak += 1
            if promo_streak >= 2:
                break
        else:
            promo_streak = 0
            result_lines.append(line)
    while result_lines and not result_lines[-1].strip():
        result_lines.pop()
    return '\n'.join(result_lines)


def _download_transcript(yt_dlp, url):
    """Download the English auto-generated captions for *url* as plain text.

    Args:
        yt_dlp: Path to the yt-dlp executable.
        url: Video URL to fetch captions for.

    Returns:
        The cleaned caption text (possibly empty), or None when yt-dlp
        produced no ``.vtt`` file.

    Raises:
        subprocess.TimeoutExpired: If yt-dlp runs longer than 120 seconds.
    """
    # Use a private per-call directory: a fixed shared path lets concurrent
    # requests delete or pick up each other's subtitle files.
    with tempfile.TemporaryDirectory(prefix='yt_subs_') as tmp:
        temp_dir = Path(tmp)
        result = subprocess.run([
            yt_dlp, '--write-auto-subs', '--sub-langs', 'en',
            '--skip-download', '--sub-format', 'vtt', '--no-playlist',
            '-o', str(temp_dir / '%(id)s'),
            # "--" ends option parsing so the URL can never be read as a flag.
            '--', url,
        ], capture_output=True, timeout=120)
        if result.returncode != 0:
            log.warning(
                "yt-dlp subtitle download failed: %s",
                result.stderr.decode('utf-8', errors='replace').strip(),
            )

        # Sorted so the choice is deterministic if several files exist.
        for sub_file in sorted(temp_dir.glob('*.vtt')):
            content = sub_file.read_text(encoding='utf-8', errors='replace')
            return _clean_vtt(content)
    return None


def _process_youtube(url):
    """Download a video's subtitles and save them as a Markdown note.

    Fetches the video metadata with yt-dlp, downloads the English
    auto-generated captions, writes a note into ``constants.NOTES_DIR``,
    runs the notes-index update script and finally attempts semantic
    indexing of the new note (best effort; failures are only logged).

    Args:
        url: Video URL, passed to yt-dlp as a positional argument.

    Returns:
        The created note's file name, or None when the video metadata could
        not be fetched or no subtitles were found.

    Raises:
        subprocess.TimeoutExpired: If a yt-dlp call exceeds its timeout.
        json.JSONDecodeError: If yt-dlp's metadata output is not valid JSON.
        OSError: If yt-dlp cannot be executed or the note cannot be written.
    """
    yt_dlp = os.path.expanduser('~/.local/bin/yt-dlp')

    # --no-playlist keeps the output to one JSON document even when the URL
    # also names a playlist; "--" stops a URL starting with "-" from being
    # interpreted as an option.
    result = subprocess.run(
        [yt_dlp, '--dump-json', '--no-download', '--no-playlist', '--', url],
        capture_output=True, text=True, timeout=30,
    )
    if result.returncode != 0:
        print(f"Failed to get video info: {result.stderr}")
        return None

    info = json.loads(result.stdout)
    # Metadata fields may be present but null, so fall back on falsy values
    # rather than only on missing keys.
    title = info.get('title') or 'Unknown'
    duration = int(info.get('duration') or 0)
    description = info.get('description') or ''

    transcript = _download_transcript(yt_dlp, url)
    if not transcript:
        print("No subtitles found")
        return None

    date_str = datetime.now().strftime('%Y-%m-%d')
    slug = re.sub(r'[^\w\s-]', '', title.lower())[:50].strip()
    # Map every whitespace character (not just spaces) to a dash so tabs or
    # newlines cannot end up in the file name.
    slug = re.sub(r'\s', '-', slug)
    if not slug:
        # Title had no usable characters; fall back to the video id.
        slug = re.sub(r'[^\w-]', '', str(info.get('id') or '')) or 'untitled'
    filename = f"{date_str}_{slug}.md"

    # Build optional description block
    desc_block = ""
    if _is_description_about_video(description):
        relevant_desc = _extract_relevant_description(description)
        if relevant_desc:
            desc_block = f"\n## Descriere / Index\n\n{relevant_desc}\n\n---\n"

    note_content = f"""# {title}

**Video:** {url}
**Duration:** {duration // 60}:{duration % 60:02d}
**Saved:** {date_str}
**Tags:** #youtube #to-summarize

---
{desc_block}
## Transcript

{transcript[:15000]}

---

*Notă: Sumarizarea va fi adăugată de Echo.*
"""

    constants.NOTES_DIR.mkdir(parents=True, exist_ok=True)
    note_path = constants.NOTES_DIR / filename
    note_path.write_text(note_content, encoding='utf-8')

    index_run = subprocess.run(
        [sys.executable, str(constants.TOOLS_DIR / 'update_notes_index.py')],
        capture_output=True,
    )
    if index_run.returncode != 0:
        log.warning(
            "update_notes_index.py failed: %s",
            index_run.stderr.decode('utf-8', errors='replace').strip(),
        )

    # Index new note with Ollama semantic embeddings
    try:
        base_dir = str(constants.BASE_DIR)
        # Only add the path once; inserting on every call grows sys.path
        # without bound in a long-running process.
        if base_dir not in sys.path:
            sys.path.insert(0, base_dir)
        from src.memory_search import index_file
        n = index_file(note_path)
        log.info("Ollama indexed %s (%d chunks)", filename, n)
    except Exception as e:
        log.warning("Ollama indexing failed for %s: %s", filename, e)

    print(f"Created note: {filename}")
    return filename


class YoutubeHandlers:
    """Mixin for /api/youtube.

    Relies on the host class for ``headers``, ``rfile`` and
    ``send_json(payload, status)``.
    """

    @staticmethod
    def _is_youtube_url(url):
        """Return True if *url* points at youtube.com (or a subdomain) or youtu.be.

        The host name is parsed rather than substring-matched, so strings
        that merely contain "youtube.com" (query strings, user-info parts,
        option-like values such as ``--flag=... youtube.com``) are rejected.
        URLs without a scheme are accepted and treated as https.
        """
        if not isinstance(url, str) or not url:
            return False
        candidate = url if '://' in url else f'https://{url}'
        try:
            parsed = urlparse(candidate)
            host = (parsed.hostname or '').lower()
        except ValueError:
            return False
        if parsed.scheme not in ('http', 'https'):
            return False
        return host in ('youtube.com', 'youtu.be') or host.endswith('.youtube.com')

    def handle_youtube(self):
        """Process a YouTube URL: download subs, save note.

        Reads a JSON body of the form ``{"url": ...}``, validates the URL and
        runs ``_process_youtube``. Responds with:

        * 400 and an ``error`` field for a malformed body or a non-YouTube URL,
        * 500 and ``status: error`` when processing raises or produces no note,
        * ``status: done`` when the note was created.
        """
        try:
            try:
                content_length = int(self.headers.get('Content-Length') or 0)
                if content_length < 0:
                    # read(-1) would block until the client closes the socket.
                    raise ValueError('negative Content-Length')
                post_data = self.rfile.read(content_length).decode('utf-8')
                data = json.loads(post_data)
            except (TypeError, ValueError):
                # Covers bad Content-Length, undecodable bytes and invalid JSON.
                self.send_json({'error': 'Invalid request body'}, 400)
                return

            url = data.get('url') if isinstance(data, dict) else None
            url = url.strip() if isinstance(url, str) else ''

            if not self._is_youtube_url(url):
                self.send_json({'error': 'URL YouTube invalid'}, 400)
                return

            try:
                print(f"Processing YouTube URL: {url}")
                filename = _process_youtube(url)
            except Exception as e:
                print(f"YouTube processing error: {e}")
                traceback.print_exc()
                self.send_json({'status': 'error', 'message': f'Eroare: {str(e)}'}, 500)
                return

            if filename is None:
                # _process_youtube signals "no video info / no subtitles" by
                # returning None; that must not be reported as success.
                self.send_json({
                    'status': 'error',
                    'message': 'Eroare: nu s-au găsit subtitrări sau informații pentru acest video.',
                }, 500)
                return

            self.send_json({
                'status': 'done',
                'message': 'Notița a fost creată! Refresh pagina Notes.',
            })
        except Exception as e:
            traceback.print_exc()
            self.send_json({'error': str(e)}, 500)