echo-core/dashboard/handlers/youtube.py

"""YouTube subtitle-download + note-creation endpoint."""
import json
import logging
import os
import re
import subprocess
import sys
import tempfile
import traceback
from datetime import datetime
from pathlib import Path
from urllib.parse import urlparse

import constants

log = logging.getLogger(__name__)


def _clean_vtt(content):
    """Convert VTT captions to plain text."""
    lines = []
    seen = set()
    for line in content.split('\n'):
        if any([
            line.startswith('WEBVTT'),
            line.startswith('Kind:'),
            line.startswith('Language:'),
            '-->' in line,
            line.strip().startswith('<'),
            not line.strip(),
            re.match(r'^\d+$', line.strip()),
        ]):
            continue
        clean = re.sub(r'<[^>]+>', '', line).strip()
        if clean and clean not in seen:
            seen.add(clean)
            lines.append(clean)
    return ' '.join(lines)


def _process_youtube(url):
    """Download a video's English auto-captions and save them as a note.

    Workflow:
      1. Ask yt-dlp for the video's metadata (title, duration, id).
      2. Download the auto-generated English VTT subtitles into a private
         temporary directory and flatten them with ``_clean_vtt``.
      3. Write a Markdown note into ``constants.NOTES_DIR``.
      4. Run ``update_notes_index.py`` and, best-effort, index the note via
         ``src.memory_search.index_file``.

    Args:
        url: Video URL passed to yt-dlp.

    Returns:
        The filename of the created note, or ``None`` when the metadata
        lookup failed or no subtitles were found.

    Raises:
        subprocess.TimeoutExpired: A yt-dlp call exceeded its timeout.
        json.JSONDecodeError: yt-dlp's metadata output was not valid JSON.
        OSError: yt-dlp could not be started or the note could not be written.
    """
    yt_dlp = os.path.expanduser('~/.local/bin/yt-dlp')

    # '--' ends option parsing so a URL starting with '-' can never be read
    # as a yt-dlp flag; '--no-playlist' keeps the output to ONE JSON document
    # even for watch URLs that carry a '&list=' parameter.
    result = subprocess.run(
        [yt_dlp, '--dump-json', '--no-download', '--no-playlist', '--', url],
        capture_output=True, text=True, timeout=30,
    )
    if result.returncode != 0:
        print(f"Failed to get video info: {result.stderr}")
        return None

    info = json.loads(result.stdout)
    # Metadata fields may be present but null (e.g. live streams), and the
    # duration may be a float; normalise so the formatting below cannot fail.
    title = info.get('title') or 'Unknown'
    duration = int(info.get('duration') or 0)

    transcript = None
    # A fresh private directory per call: no races between concurrent
    # requests, no stale files, and automatic cleanup.
    with tempfile.TemporaryDirectory(prefix='yt_subs_') as tmp:
        temp_dir = Path(tmp)
        # Exit status is deliberately not checked; success is judged by
        # whether a .vtt file was produced.
        subprocess.run([
            yt_dlp, '--write-auto-subs', '--sub-langs', 'en',
            '--skip-download', '--sub-format', 'vtt',
            '--no-playlist',
            '-o', str(temp_dir / '%(id)s'),
            '--', url,
        ], capture_output=True, timeout=120)

        sub_file = next(iter(sorted(temp_dir.glob('*.vtt'))), None)
        if sub_file is not None:
            content = sub_file.read_text(encoding='utf-8', errors='replace')
            transcript = _clean_vtt(content)

    if not transcript:
        print("No subtitles found")
        return None

    date_str = datetime.now().strftime('%Y-%m-%d')
    slug = re.sub(r'[^\w\s-]', '', title.lower())[:50].strip().replace(' ', '-')
    if not slug:
        # Title consisted only of stripped characters; fall back to the
        # video id (sanitised) so the file is not named "<date>_.md".
        slug = re.sub(r'[^\w-]', '', str(info.get('id') or '')) or 'video'
    filename = f"{date_str}_{slug}.md"

    note_content = f"""# {title}

**Video:** {url}
**Duration:** {duration // 60}:{duration % 60:02d}
**Saved:** {date_str}
**Tags:** #youtube #to-summarize

---

## Transcript

{transcript[:15000]}

---

*Notă: Sumarizarea va fi adăugată de Echo.*
"""

    constants.NOTES_DIR.mkdir(parents=True, exist_ok=True)
    note_path = constants.NOTES_DIR / filename
    note_path.write_text(note_content, encoding='utf-8')

    # Refresh the notes index; a failure here must not lose the note, but it
    # should be visible in the logs.
    index_result = subprocess.run(
        [sys.executable, str(constants.TOOLS_DIR / 'update_notes_index.py')],
        capture_output=True, text=True,
    )
    if index_result.returncode != 0:
        log.warning(
            "update_notes_index.py exited with %d: %s",
            index_result.returncode, index_result.stderr,
        )

    # Index new note with Ollama semantic embeddings (best-effort).
    try:
        base_dir = str(constants.BASE_DIR)
        # Avoid growing sys.path by one entry on every request.
        if base_dir not in sys.path:
            sys.path.insert(0, base_dir)
        from src.memory_search import index_file
        n = index_file(note_path)
        log.info("Ollama indexed %s (%d chunks)", filename, n)
    except Exception as e:
        log.warning("Ollama indexing failed for %s: %s", filename, e)

    print(f"Created note: {filename}")
    return filename


class YoutubeHandlers:
    """Mixin for /api/youtube.

    The host class is expected to provide ``headers``, ``rfile`` and
    ``send_json(payload, status)`` (as used by the methods below).
    """

    @staticmethod
    def _is_youtube_url(url):
        """Return True if *url* is an http(s) URL on youtube.com or youtu.be.

        The host name is parsed and compared exactly (sub-domains allowed),
        so a URL that merely *contains* "youtube.com" somewhere, or a value
        that looks like a command-line option, is rejected.  Scheme-less
        input such as ``youtu.be/abc`` is still accepted.
        """
        if not url or url.startswith('-'):
            return False
        try:
            parsed = urlparse(url)
            if not parsed.netloc:
                # No "scheme://host" part: treat as scheme-less shorthand.
                parsed = urlparse('https://' + url)
            host = (parsed.hostname or '').lower()
        except ValueError:
            return False
        if parsed.scheme not in ('http', 'https'):
            return False
        return any(
            host == domain or host.endswith('.' + domain)
            for domain in ('youtube.com', 'youtu.be')
        )

    def handle_youtube(self):
        """Process a YouTube URL: download subs, save note.

        Reads a JSON body of the form ``{"url": "..."}`` and answers with:
          * 400 when the body is malformed or the URL is not a YouTube URL,
          * 500 with ``status: error`` when processing raised or produced
            no note,
          * 200 with ``status: done`` when the note was created.
        """
        try:
            # --- Parse the request body; client mistakes are 400s. ---
            try:
                content_length = int(self.headers['Content-Length'])
                if content_length < 0:
                    # read(-1) would block until the peer closes the socket.
                    raise ValueError('negative Content-Length')
                post_data = self.rfile.read(content_length).decode('utf-8')
                data = json.loads(post_data)
            except (KeyError, TypeError, ValueError) as e:
                self.send_json({'error': f'Cerere invalidă: {e}'}, 400)
                return

            url = data.get('url') if isinstance(data, dict) else None
            url = url.strip() if isinstance(url, str) else ''

            if not self._is_youtube_url(url):
                self.send_json({'error': 'URL YouTube invalid'}, 400)
                return

            # --- Do the work. ---
            print(f"Processing YouTube URL: {url}")
            try:
                filename = _process_youtube(url)
            except Exception as e:
                print(f"YouTube processing error: {e}")
                traceback.print_exc()
                self.send_json({'status': 'error', 'message': f'Eroare: {str(e)}'}, 500)
                return

            # _process_youtube signals "nothing created" by returning None;
            # do not report success in that case.
            if not filename:
                self.send_json({
                    'status': 'error',
                    'message': 'Eroare: notița nu a putut fi creată '
                               '(video indisponibil sau fără subtitrări).',
                }, 500)
                return

            self.send_json({
                'status': 'done',
                'message': 'Notița a fost creată! Refresh pagina Notes.',
            })
        except Exception as e:
            # Last-resort boundary so the client always gets a response.
            self.send_json({'error': str(e)}, 500)