Initial commit - workspace setup
- AGENTS.md, SOUL.md, USER.md, IDENTITY.md - ANAF monitor (declarații fiscale) - Kanban board + Notes UI - Email tools - Memory system
This commit is contained in:
124
tools/youtube_subs.py
Executable file
124
tools/youtube_subs.py
Executable file
@@ -0,0 +1,124 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Download YouTube subtitles/transcript for summarization.
|
||||
Usage: python3 youtube_subs.py <video_url> [language]
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
def clean_vtt(content):
    """Convert subtitle text (WebVTT; SRT-style input is tolerated) to plain text.

    Header lines, cue timing lines, purely numeric cue identifiers and blank
    lines are dropped. Inline markup tags are stripped, character references
    such as ``&amp;`` are decoded, and any caption line that has already been
    emitted is skipped, so rolling captions that repeat the previous line do
    not produce duplicated text.

    Args:
        content: Raw subtitle file contents as a string. ``None`` or an empty
            string yields an empty result.

    Returns:
        The remaining caption lines joined by single spaces, or an empty
        string when nothing is left after cleaning.
    """
    import html  # local import: only this function needs entity decoding

    if not content:
        return ''

    lines = []
    seen = set()

    # A leading BOM would otherwise hide the WEBVTT header from startswith().
    for line in content.lstrip('\ufeff').split('\n'):
        stripped = line.strip()
        if not stripped:
            continue
        # File-level headers.
        if line.startswith(('WEBVTT', 'Kind:', 'Language:')):
            continue
        # Cue timing line (VTT and SRT both use "-->").
        if '-->' in line:
            continue
        # Numeric cue identifiers / SRT sequence numbers. Note that a caption
        # consisting only of digits is indistinguishable and is dropped too.
        if re.match(r'^\d+$', stripped):
            continue

        # Strip tags first and decode entities afterwards, so that escaped
        # text such as "&lt;b&gt;" survives as literal "<b>". Lines that start
        # with a tag (e.g. "<i>Hello</i>") are kept; only lines that become
        # empty once the tags are removed are skipped.
        clean = html.unescape(re.sub(r'<[^>]+>', '', line)).strip()
        if clean and clean not in seen:
            seen.add(clean)
            lines.append(clean)

    return ' '.join(lines)
|
||||
|
||||
def _fetch_title(yt_dlp, url):
    """Return the video title reported by yt-dlp, or 'Unknown' if unavailable.

    Title and duration are printed to stderr as progress information.
    """
    info_cmd = [yt_dlp, '--dump-json', '--no-download', url]
    try:
        result = subprocess.run(info_cmd, capture_output=True, text=True, timeout=30)
        if result.returncode != 0:
            print(
                f"Could not get video info: yt-dlp exited with code {result.returncode}",
                file=sys.stderr,
            )
            return 'Unknown'
        info = json.loads(result.stdout)
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # ValueError covers both malformed JSON and undecodable output.
        print(f"Could not get video info: {e}", file=sys.stderr)
        return 'Unknown'

    if not isinstance(info, dict):
        print("Could not get video info: unexpected JSON payload", file=sys.stderr)
        return 'Unknown'

    title = info.get('title') or 'Unknown'
    print(f"Title: {title}", file=sys.stderr)

    # Duration may be missing, null or a float; never let formatting it
    # discard a title that was fetched successfully.
    duration = info.get('duration')
    seconds = int(duration) if isinstance(duration, (int, float)) else 0
    print(f"Duration: {seconds // 60}:{seconds % 60:02d}", file=sys.stderr)
    return title


def _download_subs(yt_dlp, url, temp_dir, sub_flag, sub_lang):
    """Run yt-dlp to write subtitles for one language; return False if it could not run."""
    cmd = [
        yt_dlp,
        sub_flag,
        '--sub-langs', sub_lang,
        '--skip-download',
        '-o', str(temp_dir / '%(id)s.%(ext)s'),
        url,
    ]
    try:
        subprocess.run(cmd, capture_output=True, timeout=60)
    except (OSError, subprocess.TimeoutExpired) as e:
        print(f"yt-dlp {sub_flag} ({sub_lang}) failed: {e}", file=sys.stderr)
        return False
    return True


def _first_subtitle_text(temp_dir, patterns):
    """Return cleaned text of the first non-empty subtitle file matching any pattern, else None."""
    for pattern in patterns:
        for sub_file in temp_dir.glob(pattern):
            content = sub_file.read_text(encoding='utf-8', errors='replace')
            text = clean_vtt(content)
            if text:
                return text
    return None


def get_subtitles(url, lang='en'):
    """Download subtitles for a YouTube video and return them as plain text.

    Manually authored subtitles are tried first for each preferred language
    (the requested one, then 'ro', 'en', 'en-US', 'en-GB'); if none yield any
    text, auto-generated subtitles are tried in the same order. Subtitle
    files are written to ``/tmp/yt_subs``, which is emptied of files first.

    Args:
        url: Video URL passed to yt-dlp.
        lang: Preferred subtitle language code; defaults to 'en'.

    Returns:
        A ``(title, transcript)`` tuple. ``title`` is 'Unknown' when the
        video metadata could not be read. ``transcript`` is ``None`` when no
        subtitles with usable text were found or yt-dlp is unavailable.
    """
    import shutil  # local import: only needed for the PATH fallback

    # Prefer the per-user install, but fall back to whatever is on PATH.
    yt_dlp = os.path.expanduser('~/.local/bin/yt-dlp')
    if not os.path.exists(yt_dlp):
        yt_dlp = shutil.which('yt-dlp')
        if yt_dlp is None:
            print("yt-dlp not found (looked in ~/.local/bin and PATH)", file=sys.stderr)
            return 'Unknown', None

    temp_dir = Path('/tmp/yt_subs')
    temp_dir.mkdir(parents=True, exist_ok=True)

    # Clean old files; directories are left alone because unlink() would raise.
    for stale in temp_dir.glob('*'):
        if stale.is_file() or stale.is_symlink():
            stale.unlink()

    title = _fetch_title(yt_dlp, url)

    # Order of preference, without duplicates (the default lang is 'en',
    # which also appears in the fallback list) and without empty entries.
    candidates = [lang, 'ro', 'en', 'en-US', 'en-GB']
    lang_preferences = list(dict.fromkeys(code for code in candidates if code))
    extensions = ('vtt', 'srt', 'ass')

    # Try manual subtitles first; only accept files for the language asked for.
    for try_lang in lang_preferences:
        if not _download_subs(yt_dlp, url, temp_dir, '--write-subs', try_lang):
            continue
        text = _first_subtitle_text(
            temp_dir, [f'*.{try_lang}*.{ext}' for ext in extensions]
        )
        if text:
            return title, text

    # Fall back to auto-generated subtitles; any subtitle file is accepted.
    for try_lang in lang_preferences:
        if not _download_subs(yt_dlp, url, temp_dir, '--write-auto-subs', try_lang):
            continue
        text = _first_subtitle_text(temp_dir, [f'*.{ext}' for ext in extensions])
        if text:
            return title, text

    return title, None
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: <video_url> is required, [language] is optional.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python3 youtube_subs.py <video_url> [language]")
        sys.exit(1)

    video_url = cli_args[0]
    language = cli_args[1] if len(cli_args) > 1 else 'en'

    video_title, text = get_subtitles(video_url, language)

    # Failure path first: report on stderr and signal it via the exit code.
    if not text:
        print(f"No subtitles found for: {video_title}", file=sys.stderr)
        sys.exit(1)

    # The transcript goes to stdout so it can be piped into other tools.
    print(f"\n=== {video_title} ===\n")
    print(text)
|
||||
Reference in New Issue
Block a user