#!/usr/bin/env python3
"""
Download YouTube subtitles/transcript for summarization.

Usage: python3 youtube_subs.py <url> [language]
"""

import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path

# Lines starting with one of these are VTT file headers, not spoken text.
_VTT_HEADER_PREFIXES = ('WEBVTT', 'Kind:', 'Language:')
# Inline markup such as <c>...</c> or <00:00:01.000> word-timing tags.
_TAG_RE = re.compile(r'<[^>]+>')
# A line holding only digits is treated as a cue sequence number.
_SEQUENCE_RE = re.compile(r'^\d+$')

# Timestamp markers like 00:00, 0:00:00, 1:23.
_TIMESTAMP_RE = re.compile(r'\b\d{1,2}:\d{2}(?::\d{2})?\b')
_BULLET_RE = re.compile(r'^\s*[◼•\-\*▶►]\s+\S')
_NUMBERED_RE = re.compile(r'^\s*\d+[\.\)]\s+\S')

# A line matching any of these is considered promotional.
_PROMO_PATTERNS = [
    re.compile(r'https?://\S+'),  # URLs
    re.compile(r'instagram|twitter|facebook|tiktok|linkedin|patreon|spotify', re.I),
    re.compile(r'follow|subscribe|newsletter|merch|sponsor|affiliate', re.I),
    re.compile(r'purchase|buy|order|shop|store', re.I),
]

# Subtitle file extensions probed after each download, in this order.
_SUB_EXTENSIONS = ('vtt', 'srt', 'ass')


def clean_vtt(content):
    """Convert VTT subtitle text to plain text.

    Drops headers, timestamp lines, blank lines, numeric sequence lines and
    lines that begin with a tag, strips inline tags from the rest, and keeps
    only the first occurrence of each distinct text line (auto-generated
    captions repeat lines from cue to cue).

    Args:
        content: Raw subtitle file contents.

    Returns:
        The remaining lines joined with single spaces ('' if none remain).
    """
    lines = []
    seen = set()
    # A leading BOM would make the 'WEBVTT' header check fail and leak the
    # header into the transcript, so remove it before scanning.
    for line in content.lstrip('\ufeff').split('\n'):
        stripped = line.strip()
        if not stripped:
            continue
        if line.startswith(_VTT_HEADER_PREFIXES):
            continue
        if '-->' in line:  # Timestamp line
            continue
        if stripped.startswith('<'):  # Positioning tags
            continue
        if _SEQUENCE_RE.match(stripped):  # Sequence numbers
            continue

        clean = _TAG_RE.sub('', line).strip()
        if clean and clean not in seen:
            seen.add(clean)
            lines.append(clean)
    return ' '.join(lines)


def is_description_about_video(description):
    """Decide whether a description documents the video's content.

    A description qualifies when it is at least 50 characters long (after
    stripping) and contains at least three timestamps, three bullet lines,
    or three numbered-list lines.

    Args:
        description: Video description text; may be None or empty.

    Returns:
        True if the description is worth including, otherwise False.
    """
    if not description or len(description.strip()) < 50:
        return False

    # Strong signal: chapter-style timestamps.
    if len(_TIMESTAMP_RE.findall(description)) >= 3:
        return True

    lines = description.strip().split('\n')

    # Strong signal: chapter/topic-like bullet lines.
    if sum(1 for entry in lines if _BULLET_RE.match(entry)) >= 3:
        return True

    # Signal: numbered list or clear topic breakdown.
    return sum(1 for entry in lines if _NUMBERED_RE.match(entry)) >= 3


def extract_relevant_description(description):
    """Return the leading, content-related part of a description.

    Lines are kept from the top until two consecutive promotional lines
    (URLs, social-media names, subscribe/buy wording) are found. A single
    promotional line surrounded by other lines is kept.

    Args:
        description: Video description text; may be None or empty.

    Returns:
        The kept lines joined with newlines, without trailing blank lines
        ('' when the description is empty).
    """
    if not description:
        return ""

    result_lines = []
    promo_streak = 0
    for line in description.strip().split('\n'):
        if any(p.search(line.strip()) for p in _PROMO_PATTERNS):
            promo_streak += 1
            if promo_streak >= 2:
                # The previous line opened this promotional run and was kept
                # only on the chance that it was isolated; it was not, so
                # drop it together with everything that follows.
                result_lines.pop()
                break
        else:
            promo_streak = 0
        result_lines.append(line)

    # Also strip trailing empty lines.
    while result_lines and not result_lines[-1].strip():
        result_lines.pop()

    return '\n'.join(result_lines)


def _find_yt_dlp():
    """Return the yt-dlp path: the user-local install if present, else one on PATH."""
    local = os.path.expanduser('~/.local/bin/yt-dlp')
    if os.path.isfile(local):
        return local
    return shutil.which('yt-dlp') or local


def _run_yt_dlp(cmd, timeout, **kwargs):
    """Run a yt-dlp command, returning the CompletedProcess or None on timeout."""
    try:
        return subprocess.run(cmd, capture_output=True, timeout=timeout, **kwargs)
    except subprocess.TimeoutExpired:
        print(f"yt-dlp timed out after {timeout}s", file=sys.stderr)
        return None


def _fetch_video_info(yt_dlp, url):
    """Return (title, description) for url, falling back to ("Unknown", "")."""
    title = "Unknown"
    description = ""
    info_cmd = [yt_dlp, '--js-runtimes', 'node', '--dump-json', '--no-download', url]
    result = _run_yt_dlp(info_cmd, timeout=30, text=True)
    if result is None:
        return title, description

    print(f"INFO: returncode={result.returncode}, stderr={result.stderr[:200]}", file=sys.stderr)
    if result.returncode != 0:
        print(f"yt-dlp failed: {result.stderr[:500]}", file=sys.stderr)
        return title, description

    try:
        info = json.loads(result.stdout)
    except json.JSONDecodeError as e:
        print(f"JSON parse error: {e}", file=sys.stderr)
        return title, description
    if not isinstance(info, dict):
        return title, description

    # Fields may be present but null; never hand None back to the caller.
    title = info.get('title') or 'Unknown'
    description = info.get('description') or ''
    duration = info.get('duration')
    # Duration may be missing or a float; the :02d format needs an int.
    seconds = int(duration) if isinstance(duration, (int, float)) else 0
    print(f"Title: {title}", file=sys.stderr)
    print(f"Duration: {seconds // 60}:{seconds % 60:02d}", file=sys.stderr)
    return title, description


def _download_subs(yt_dlp, url, lang, out_dir, auto):
    """Ask yt-dlp to write manual (or auto-generated) subtitles for lang into out_dir."""
    cmd = [
        yt_dlp, '--js-runtimes', 'node',
        '--write-auto-subs' if auto else '--write-subs',
        '--sub-langs', lang,
        '--skip-download',
        '-o', str(out_dir / '%(id)s.%(ext)s'),
        url,
    ]
    _run_yt_dlp(cmd, timeout=60)


def _first_transcript(out_dir, stem_pattern):
    """Return cleaned text of the first non-empty subtitle file matching stem_pattern, or None."""
    for ext in _SUB_EXTENSIONS:
        for sub_file in sorted(out_dir.glob(f'{stem_pattern}.{ext}')):
            content = sub_file.read_text(encoding='utf-8', errors='replace')
            text = clean_vtt(content)
            if text:
                return text
    return None


def get_subtitles(url, lang='en'):
    """Download subtitles for a YouTube video.

    Manual subtitles are tried first for each preferred language (the
    requested one, then 'ro', 'en', 'en-US', 'en-GB'); if none yields text,
    auto-generated subtitles are tried in the same order. A yt-dlp call that
    times out is treated as a failed attempt.

    Args:
        url: Video URL passed to yt-dlp.
        lang: Preferred subtitle language code.

    Returns:
        A (title, description, transcript) tuple. title falls back to
        "Unknown" and description to ''; transcript is None when no
        subtitles with text were found.
    """
    yt_dlp = _find_yt_dlp()
    title, description = _fetch_video_info(yt_dlp, url)

    # De-duplicate while preserving order so e.g. lang='en' is not fetched twice.
    lang_preferences = list(dict.fromkeys([lang, 'ro', 'en', 'en-US', 'en-GB']))

    # A private scratch directory keeps concurrent runs from reading or
    # deleting each other's files and is removed when we are done.
    with tempfile.TemporaryDirectory(prefix='yt_subs_') as tmp:
        temp_dir = Path(tmp)

        # Try manual subtitles first.
        for try_lang in lang_preferences:
            _download_subs(yt_dlp, url, try_lang, temp_dir, auto=False)
            text = _first_transcript(temp_dir, f'*.{try_lang}*')
            if text:
                return title, description, text

        # Try auto-generated subtitles.
        for try_lang in lang_preferences:
            _download_subs(yt_dlp, url, try_lang, temp_dir, auto=True)
            text = _first_transcript(temp_dir, '*')
            if text:
                return title, description, text

    return title or "Unknown", description, None


def main():
    """Command-line entry point: print the transcript (and useful description) for a URL."""
    if len(sys.argv) < 2:
        print("Usage: python3 youtube_subs.py <url> [language]")
        sys.exit(1)

    url = sys.argv[1]
    lang = sys.argv[2] if len(sys.argv) > 2 else 'en'

    title, description, transcript = get_subtitles(url, lang)

    if not transcript:
        print(f"No subtitles found for: {title}", file=sys.stderr)
        sys.exit(1)

    print(f"\n=== {title} ===\n")

    # Include description if it's about the video content.
    if description and is_description_about_video(description):
        relevant_desc = extract_relevant_description(description)
        if relevant_desc:
            print("--- Descriere / Index ---")
            print(relevant_desc)
            print("--- Transcript ---")

    print(transcript)


if __name__ == '__main__':
    main()