echo-core/tools/youtube_subs.py

#!/usr/bin/env python3
"""
Download YouTube subtitles/transcript for summarization.
Usage: python3 youtube_subs.py <video_url> [language]
"""

import subprocess
import sys
import os
import json
import re
from pathlib import Path

def clean_vtt(content):
    """Convert subtitle text (WebVTT-style) to a single plain-text string.

    Header lines (``WEBVTT``, ``Kind:``, ``Language:``), timestamp lines
    (anything containing ``-->``), blank lines and purely numeric sequence
    numbers are discarded. Inline markup such as ``<i>``, ``<c>``,
    ``<v Speaker>`` or ``<00:00:01.000>`` is stripped from the remaining cue
    text and HTML entities (``&amp;``, ``&nbsp;`` ...) are decoded.

    Each distinct cleaned line is kept only once (first occurrence wins), which
    collapses the repeated lines produced by rolling captions.

    Args:
        content: Raw subtitle file contents as a string.

    Returns:
        The remaining cue lines joined with single spaces; ``""`` when nothing
        is left.
    """
    import html  # local import: only needed here, keeps this function self-contained

    lines = []
    seen = set()

    for line in content.split('\n'):
        stripped = line.strip()
        if not stripped:
            continue
        # Skip VTT headers and cue timing lines.
        if line.startswith(('WEBVTT', 'Kind:', 'Language:')):
            continue
        if '-->' in line:
            continue
        # Skip SRT-style sequence numbers.
        if re.fullmatch(r'\d+', stripped):
            continue

        # Strip inline tags first and only then decide whether the line is
        # empty. Lines that merely *start* with a tag (e.g. "<i>Hello</i>")
        # still carry cue text and must not be discarded wholesale.
        clean = re.sub(r'<[^>]+>', '', line)
        # Decode entities after tag removal so an escaped "&lt;b&gt;" survives
        # as literal text; non-breaking spaces become ordinary spaces.
        clean = html.unescape(clean).replace('\xa0', ' ').strip()
        if clean and clean not in seen:
            seen.add(clean)
            lines.append(clean)

    return ' '.join(lines)


def is_description_about_video(description):
    """
    Decide whether a video description is worth showing next to the transcript.

    A description qualifies when it is at least 50 characters long (after
    trimming) and shows one of these structural signals:
      * three or more timestamps such as 0:00, 12:34 or 1:02:03;
      * three or more bullet lines (◼ • - * ▶ ►);
      * three or more numbered lines ("1." / "1)").

    Returns True when a signal is found, False otherwise (including for
    empty or missing descriptions).
    """
    text = description.strip() if description else ""
    if len(text) < 50:
        return False

    # Chapter-style timestamps are the strongest hint.
    timestamps = re.finditer(r'\b\d{1,2}:\d{2}(:\d{2})?\b', description)
    if sum(1 for _ in timestamps) >= 3:
        return True

    # Otherwise look for list-like structure: bullets first, then numbering.
    rows = text.split('\n')
    for line_marker in (r'^\s*[◼•\-\*▶►]\s+\S', r'^\s*\d+[\.\)]\s+\S'):
        hits = sum(1 for row in rows if re.match(line_marker, row))
        if hits >= 3:
            return True

    return False


def extract_relevant_description(description):
    """
    Extract only the parts of the description that are about the video.

    The description is scanned line by line:
      * a promotional line (URL, social network name, call to action or
        shopping keyword) is dropped from the output;
      * a single promotional line between content lines is merely skipped;
      * the second consecutive promotional line ends the scan, discarding
        everything after it;
      * a line that begins with a timestamp (a chapter entry such as
        "1:00 Shop tour") is never treated as promotional, because the promo
        keywords are matched as substrings and would otherwise truncate a
        chapter index.

    Trailing blank lines are removed from the result.

    Args:
        description: The raw description text (may be empty or None).

    Returns:
        The retained lines joined with newlines, or "" when there is nothing
        to keep.
    """
    if not description:
        return ""

    promo_patterns = (
        re.compile(r'https?://\S+'),      # URLs
        re.compile(r'instagram|twitter|facebook|tiktok|linkedin|patreon|spotify', re.I),
        re.compile(r'follow|subscribe|newsletter|merch|sponsor|affiliate', re.I),
        re.compile(r'purchase|buy|order|shop|store', re.I),
    )
    # A chapter entry: optional opening bracket, then M:SS / MM:SS / H:MM:SS.
    chapter_line = re.compile(r'^\s*[\[\(]?\d{1,2}:\d{2}(?::\d{2})?\b')

    result_lines = []
    promo_streak = 0

    for line in description.strip().split('\n'):
        stripped = line.strip()

        is_promo = (
            not chapter_line.match(stripped)
            and any(p.search(stripped) for p in promo_patterns)
        )

        if is_promo:
            promo_streak += 1
            # One stray promo line is tolerated (skipped); two in a row mark
            # the start of the promotional footer, so stop there.
            if promo_streak >= 2:
                break
            continue

        promo_streak = 0
        result_lines.append(line)

    # Drop blank lines left dangling at the end.
    while result_lines and not result_lines[-1].strip():
        result_lines.pop()

    return '\n'.join(result_lines)


def _run_yt_dlp(cmd, timeout):
    """Run one yt-dlp command; return the CompletedProcess, or None if it timed out or could not start."""
    try:
        return subprocess.run(
            cmd, capture_output=True, text=True, errors='replace', timeout=timeout
        )
    except subprocess.TimeoutExpired:
        print(f"yt-dlp timed out after {timeout}s", file=sys.stderr)
    except OSError as e:
        # Typically the binary is missing or not executable.
        print(f"yt-dlp could not be started: {e}", file=sys.stderr)
    return None


def _first_subtitle_text(directory, stem_pattern):
    """Return the cleaned text of the first non-empty subtitle file matching the pattern, else None."""
    for ext in ('vtt', 'srt', 'ass'):
        # sorted() makes the pick deterministic; glob order is unspecified.
        for sub_file in sorted(directory.glob(f'{stem_pattern}.{ext}')):
            content = sub_file.read_text(encoding='utf-8', errors='replace')
            text = clean_vtt(content)
            if text:
                return text
    return None


def get_subtitles(url, lang='en'):
    """Download subtitles for a YouTube video and return them as plain text.

    Video metadata is fetched first with ``yt-dlp --dump-json``. Subtitles are
    then requested one language at a time: manually authored subtitles for
    every candidate language, then auto-generated captions. The first
    subtitle file that yields non-empty text after ``clean_vtt`` wins.

    Candidate languages are ``lang`` followed by ``ro``, ``en``, ``en-US`` and
    ``en-GB``, with duplicates removed. A yt-dlp invocation that times out or
    cannot be started is reported on stderr and treated as "nothing found"
    rather than raising.

    Args:
        url: Video URL passed straight to yt-dlp.
        lang: Preferred subtitle language code.

    Returns:
        A ``(title, description, transcript)`` tuple. ``title`` falls back to
        ``"Unknown"`` and ``description`` to ``""`` when metadata is
        unavailable; ``transcript`` is ``None`` when no subtitles were found.
    """
    import tempfile  # local import: not part of the module's top-level imports

    yt_dlp = os.path.expanduser('~/.local/bin/yt-dlp')

    # First, get video info
    title = "Unknown"
    description = ""
    info_cmd = [yt_dlp, '--js-runtimes', 'node', '--dump-json', '--no-download', url]
    result = _run_yt_dlp(info_cmd, timeout=30)
    if result is not None:
        print(f"INFO: returncode={result.returncode}, stderr={result.stderr[:200]}", file=sys.stderr)
        if result.returncode == 0:
            try:
                info = json.loads(result.stdout)
            except ValueError as e:
                print(f"JSON parse error: {e}", file=sys.stderr)
            else:
                if isinstance(info, dict):
                    # Fields may be present but null, so use `or` fallbacks
                    # instead of .get() defaults.
                    title = info.get('title') or 'Unknown'
                    description = info.get('description') or ''
                    raw_duration = info.get('duration')
                    # Duration can be missing or a float; normalise before
                    # the integer formatting below.
                    duration = int(raw_duration) if isinstance(raw_duration, (int, float)) else 0
                    print(f"Title: {title}", file=sys.stderr)
                    print(f"Duration: {duration // 60}:{duration % 60:02d}", file=sys.stderr)
        else:
            print(f"yt-dlp failed: {result.stderr[:500]}", file=sys.stderr)

    # Languages in order of preference, without repeating an attempt.
    lang_preferences = [
        code for code in dict.fromkeys([lang, 'ro', 'en', 'en-US', 'en-GB']) if code
    ]

    # A private scratch directory per call: concurrent runs cannot delete or
    # pick up each other's files, and it is removed automatically afterwards.
    with tempfile.TemporaryDirectory(prefix='yt_subs_') as tmp:
        temp_dir = Path(tmp)
        output_template = str(temp_dir / '%(id)s.%(ext)s')

        # Manual subtitles for every language first, then auto-generated ones.
        for subs_flag, any_language in (('--write-subs', False), ('--write-auto-subs', True)):
            for try_lang in lang_preferences:
                cmd = [
                    yt_dlp,
                    '--js-runtimes', 'node',
                    subs_flag,
                    '--sub-langs', try_lang,
                    '--skip-download',
                    '-o', output_template,
                    url,
                ]
                _run_yt_dlp(cmd, timeout=60)

                # Manual pass: only files tagged with the requested language.
                # Auto pass: any subtitle file present in the directory.
                stem_pattern = '*' if any_language else f'*.{try_lang}*'
                text = _first_subtitle_text(temp_dir, stem_pattern)
                if text:
                    return title, description, text

    return title or "Unknown", description, None

if __name__ == '__main__':
    # Script entry point: <video_url> is required, [language] defaults to 'en'.
    if len(sys.argv) < 2:
        print("Usage: python3 youtube_subs.py <video_url> [language]")
        sys.exit(1)

    url, lang = sys.argv[1], (sys.argv[2] if len(sys.argv) > 2 else 'en')
    title, description, transcript = get_subtitles(url, lang)

    # Nothing to print: report on stderr and signal failure to the caller.
    if not transcript:
        print(f"No subtitles found for: {title}", file=sys.stderr)
        sys.exit(1)

    print(f"\n=== {title} ===\n")

    # Prepend the description only when it describes the video content
    # (chapters, topic lists) and something is left after promo filtering.
    if description and is_description_about_video(description):
        relevant_desc = extract_relevant_description(description)
        if relevant_desc:
            print("--- Descriere / Index ---", relevant_desc, "--- Transcript ---", sep='\n')

    print(transcript)