Files
clawd/tools/youtube_subs.py
Echo f9912e0081 Initial commit - workspace setup
- AGENTS.md, SOUL.md, USER.md, IDENTITY.md
- ANAF monitor (declarații fiscale)
- Kanban board + Notes UI
- Email tools
- Memory system
2026-01-29 13:11:59 +00:00

125 lines
3.8 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Download YouTube subtitles/transcript for summarization.
Usage: python3 youtube_subs.py <video_url> [language]
"""
import subprocess
import sys
import os
import json
import re
from pathlib import Path
def clean_vtt(content):
    """Convert WebVTT/SRT subtitle text to a single plain-text string.

    Header lines (``WEBVTT``, ``Kind:``, ``Language:``), cue timing lines
    (anything containing ``-->``), blank lines and purely numeric cue
    sequence numbers are discarded.  Inline markup such as ``<c>``, ``<i>``,
    ``<v Speaker>`` or ``<00:00:01.000>`` is stripped from the remaining
    lines, and each distinct line of text is kept only the first time it
    appears (rolling captions repeat the previous line in every cue).

    Args:
        content: Raw subtitle file contents.

    Returns:
        The surviving caption lines joined by single spaces, or an empty
        string when nothing is left.
    """
    if not content:
        return ''

    header_prefixes = ('WEBVTT', 'Kind:', 'Language:')
    lines = []
    seen = set()
    # str.strip() does not remove a byte-order mark, so drop it explicitly;
    # otherwise "\ufeffWEBVTT" would slip past the header check below.
    for line in content.lstrip('\ufeff').split('\n'):
        stripped = line.strip()
        if not stripped:
            continue
        # Skip VTT headers
        if line.startswith(header_prefixes):
            continue
        if '-->' in line:  # Timestamp line
            continue
        if re.match(r'^\d+$', stripped):  # Sequence numbers
            continue
        # Remove inline tags *before* deciding whether the line is empty, so
        # that a line such as "<i>Hello</i>" keeps its text instead of being
        # thrown away just because it starts with "<".
        clean = re.sub(r'<[^>]+>', '', line).strip()
        if clean and clean not in seen:
            seen.add(clean)
            lines.append(clean)
    return ' '.join(lines)
def _fetch_video_title(yt_dlp, url):
    """Return the title yt-dlp reports for *url*, or "Unknown" on any failure."""
    info_cmd = [yt_dlp, '--dump-json', '--no-download', url]
    try:
        result = subprocess.run(info_cmd, capture_output=True, text=True, timeout=30)
        if result.returncode != 0:
            print(f"Could not get video info: yt-dlp exited with code {result.returncode}",
                  file=sys.stderr)
            return "Unknown"
        info = json.loads(result.stdout)
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        # Covers timeouts, a binary that cannot be launched, and invalid JSON.
        print(f"Could not get video info: {e}", file=sys.stderr)
        return "Unknown"
    if not isinstance(info, dict):
        return "Unknown"

    title = info.get('title') or "Unknown"
    # The duration may be missing, None or a float; coerce it so that the
    # mm:ss formatting below cannot fail and discard a valid title.
    try:
        duration = int(info.get('duration') or 0)
    except (TypeError, ValueError, OverflowError):
        duration = 0
    print(f"Title: {title}", file=sys.stderr)
    print(f"Duration: {duration//60}:{duration%60:02d}", file=sys.stderr)
    return title


def _run_subtitle_download(cmd):
    """Run a yt-dlp download command; return False if it timed out or could not start."""
    try:
        subprocess.run(cmd, capture_output=True, timeout=60)
    except (subprocess.TimeoutExpired, OSError) as e:
        print(f"Subtitle download failed: {e}", file=sys.stderr)
        return False
    return True


def _first_transcript(temp_dir, stem_pattern):
    """Return cleaned text of the first non-empty subtitle file matching the pattern."""
    for ext in ('vtt', 'srt', 'ass'):
        for sub_file in sorted(temp_dir.glob(f'{stem_pattern}.{ext}')):
            content = sub_file.read_text(encoding='utf-8', errors='replace')
            text = clean_vtt(content)
            if text:
                return text
    return None


def get_subtitles(url, lang='en'):
    """Download subtitles for a YouTube video.

    Manually written subtitles are tried first, then auto-generated ones.
    In each pass the requested language is tried first, followed by the
    fallbacks 'ro', 'en', 'en-US' and 'en-GB'.  Subtitle files are written
    to /tmp/yt_subs, which is emptied of files at the start of every call.

    Args:
        url: Video URL handed to yt-dlp.
        lang: Preferred subtitle language code.

    Returns:
        A ``(title, transcript)`` tuple.  ``title`` is "Unknown" when the
        video metadata could not be fetched; ``transcript`` is the cleaned
        plain text, or ``None`` when no usable subtitles were found.
    """
    import shutil  # local import: only needed for the PATH fallback below

    yt_dlp = os.path.expanduser('~/.local/bin/yt-dlp')
    if not os.path.isfile(yt_dlp):
        # Fall back to whatever yt-dlp is on PATH when the per-user install is absent.
        yt_dlp = shutil.which('yt-dlp') or yt_dlp
    if not os.path.isfile(yt_dlp):
        print(f"yt-dlp not found at {yt_dlp} or on PATH", file=sys.stderr)
        return "Unknown", None

    temp_dir = Path('/tmp/yt_subs')
    temp_dir.mkdir(exist_ok=True)
    # Clean old files (unlink() would raise on a directory, so leave those alone)
    for stale in temp_dir.glob('*'):
        if stale.is_file() or stale.is_symlink():
            stale.unlink()

    # First, get video info
    title = _fetch_video_title(yt_dlp, url)

    # Languages in order of preference; dict.fromkeys drops repeats (e.g. the
    # default lang='en') while keeping order, avoiding redundant yt-dlp runs.
    lang_preferences = list(dict.fromkeys([lang, 'ro', 'en', 'en-US', 'en-GB']))
    output_template = str(temp_dir / '%(id)s.%(ext)s')

    # Manual subtitles first, then auto-generated ones.
    for write_flag in ('--write-subs', '--write-auto-subs'):
        for try_lang in lang_preferences:
            cmd = [
                yt_dlp,
                write_flag,
                '--sub-langs', try_lang,
                '--skip-download',
                '-o', output_template,
                url
            ]
            if not _run_subtitle_download(cmd):
                continue
            # The manual pass only accepts files tagged with the language just
            # requested; the auto pass accepts any subtitle file present.
            stem_pattern = f'*.{try_lang}*' if write_flag == '--write-subs' else '*'
            text = _first_transcript(temp_dir, stem_pattern)
            if text:
                return title, text
    return title, None
if __name__ == '__main__':
    # Command line: <video_url> is required, [language] is optional.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python3 youtube_subs.py <video_url> [language]")
        sys.exit(1)

    url = cli_args[0]
    lang = cli_args[1] if len(cli_args) > 1 else 'en'
    title, transcript = get_subtitles(url, lang)

    # Without a transcript, report on stderr and signal failure to the caller.
    if not transcript:
        print(f"No subtitles found for: {title}", file=sys.stderr)
        sys.exit(1)

    # The transcript goes to stdout, preceded by a banner carrying the title.
    print(f"\n=== {title} ===\n")
    print(transcript)