- New: tools/yt_download.py with cookies support + rate limit tracking
- New: docs/YOUTUBE-SETUP.md complete documentation
- Updated: night-execute jobs to use new script
- Updated: TOOLS.md with YouTube section
- Added: 5 new YouTube notes (OpenClaw, cost optimization, task system, leads)
- Added: credentials/ to .gitignore

193 lines · 6.1 KiB · Python · Executable File
#!/usr/bin/env python3
|
|
"""
|
|
YouTube subtitle downloader with rate limit protection and progressive retry.
|
|
Usage: python3 yt_download.py URL [URL2] [URL3] ...
|
|
|
|
Features:
|
|
- Cookies support for higher limits
|
|
- Sleep between downloads to avoid rate limiting
|
|
- Progressive retry: 2h → 4h → 24h on 429
|
|
- Tracks rate limit state in JSON file
|
|
"""
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
import os
|
|
import json
|
|
from pathlib import Path
|
|
from datetime import datetime, timedelta
|
|
|
|
# Add deno to PATH
|
|
os.environ["PATH"] = f"{Path.home()}/.deno/bin:" + os.environ.get("PATH", "")
|
|
|
|
COOKIES_FILE = Path(__file__).parent.parent / "credentials" / "youtube-cookies.txt"
|
|
RATE_LIMIT_FILE = Path(__file__).parent.parent / "memory" / "youtube-rate-limit.json"
|
|
SLEEP_BETWEEN = 20 # seconds between downloads
|
|
MAX_PER_SESSION = 30
|
|
|
|
# Progressive retry delays (in hours)
|
|
RETRY_DELAYS = [2, 4, 24]
|
|
|
|
def load_rate_limit_state() -> dict:
|
|
"""Load rate limit state from JSON file."""
|
|
if RATE_LIMIT_FILE.exists():
|
|
try:
|
|
with open(RATE_LIMIT_FILE) as f:
|
|
return json.load(f)
|
|
except:
|
|
pass
|
|
return {"last_429": None, "retry_count": 0, "blocked_until": None}
|
|
|
|
def save_rate_limit_state(state: dict):
|
|
"""Save rate limit state to JSON file."""
|
|
RATE_LIMIT_FILE.parent.mkdir(parents=True, exist_ok=True)
|
|
with open(RATE_LIMIT_FILE, "w") as f:
|
|
json.dump(state, f, indent=2, default=str)
|
|
|
|
def check_rate_limit() -> tuple[bool, str]:
|
|
"""
|
|
Check if we're still in a rate limit cooldown period.
|
|
Returns: (can_proceed, message)
|
|
"""
|
|
state = load_rate_limit_state()
|
|
|
|
if state.get("blocked_until"):
|
|
blocked_until = datetime.fromisoformat(state["blocked_until"])
|
|
now = datetime.now()
|
|
|
|
if now < blocked_until:
|
|
remaining = blocked_until - now
|
|
hours = remaining.total_seconds() / 3600
|
|
return False, f"⏳ Rate limited. Retry în {hours:.1f}h ({blocked_until.strftime('%H:%M')})"
|
|
else:
|
|
# Cooldown period expired, reset retry count
|
|
state["retry_count"] = 0
|
|
state["blocked_until"] = None
|
|
save_rate_limit_state(state)
|
|
|
|
return True, "OK"
|
|
|
|
def record_rate_limit():
|
|
"""Record a rate limit hit and calculate next retry time."""
|
|
state = load_rate_limit_state()
|
|
|
|
retry_count = state.get("retry_count", 0)
|
|
delay_hours = RETRY_DELAYS[min(retry_count, len(RETRY_DELAYS) - 1)]
|
|
|
|
blocked_until = datetime.now() + timedelta(hours=delay_hours)
|
|
|
|
state["last_429"] = datetime.now().isoformat()
|
|
state["retry_count"] = retry_count + 1
|
|
state["blocked_until"] = blocked_until.isoformat()
|
|
|
|
save_rate_limit_state(state)
|
|
|
|
return delay_hours, blocked_until
|
|
|
|
def clear_rate_limit():
|
|
"""Clear rate limit state after successful downloads."""
|
|
state = load_rate_limit_state()
|
|
if state.get("retry_count", 0) > 0:
|
|
state["retry_count"] = 0
|
|
state["blocked_until"] = None
|
|
save_rate_limit_state(state)
|
|
|
|
def download_subtitles(url: str, use_cookies: bool = True) -> tuple[bool, bool]:
|
|
"""
|
|
Download subtitles for a single video.
|
|
Returns: (success, rate_limited)
|
|
"""
|
|
cmd = [
|
|
"yt-dlp",
|
|
"--remote-components", "ejs:github",
|
|
"--write-auto-sub",
|
|
"--sub-lang", "en,ro",
|
|
"--skip-download",
|
|
"-o", "temp_%(id)s",
|
|
]
|
|
|
|
if use_cookies and COOKIES_FILE.exists():
|
|
cmd.extend(["--cookies", str(COOKIES_FILE)])
|
|
|
|
cmd.append(url)
|
|
|
|
print(f"📥 Downloading: {url}")
|
|
result = subprocess.run(cmd, capture_output=True, text=True)
|
|
|
|
combined_output = result.stdout + result.stderr
|
|
|
|
if "429" in combined_output:
|
|
print(f" ⚠️ Rate limited (429)")
|
|
return False, True
|
|
elif "Writing video subtitles" in combined_output:
|
|
print(f" ✅ Success")
|
|
return True, False
|
|
elif result.returncode == 0:
|
|
print(f" ✅ Success")
|
|
return True, False
|
|
else:
|
|
print(f" ❌ Error: {combined_output[:200]}")
|
|
return False, False
|
|
|
|
def main():
|
|
urls = sys.argv[1:]
|
|
if not urls:
|
|
print("Usage: python3 yt_download.py URL [URL2] ...")
|
|
print("\nRate limit state:")
|
|
state = load_rate_limit_state()
|
|
print(f" Retry count: {state.get('retry_count', 0)}")
|
|
print(f" Blocked until: {state.get('blocked_until', 'Not blocked')}")
|
|
sys.exit(1)
|
|
|
|
# Check if we're in cooldown
|
|
can_proceed, message = check_rate_limit()
|
|
if not can_proceed:
|
|
print(message)
|
|
sys.exit(3) # Special exit code for cooldown
|
|
|
|
if len(urls) > MAX_PER_SESSION:
|
|
print(f"⚠️ Max {MAX_PER_SESSION} per session. Processing first {MAX_PER_SESSION}.")
|
|
urls = urls[:MAX_PER_SESSION]
|
|
|
|
has_cookies = COOKIES_FILE.exists()
|
|
state = load_rate_limit_state()
|
|
|
|
print(f"🍪 Cookies: {'YES' if has_cookies else 'NO'}")
|
|
print(f"⏱️ Sleep: {SLEEP_BETWEEN}s between videos")
|
|
print(f"📊 Videos: {len(urls)}")
|
|
print(f"🔄 Retry count: {state.get('retry_count', 0)}")
|
|
print("-" * 40)
|
|
|
|
success = 0
|
|
rate_limited = False
|
|
|
|
for i, url in enumerate(urls):
|
|
ok, limited = download_subtitles(url, has_cookies)
|
|
if ok:
|
|
success += 1
|
|
if limited:
|
|
rate_limited = True
|
|
delay_hours, blocked_until = record_rate_limit()
|
|
print(f"🛑 Rate limit hit! Retry în {delay_hours}h ({blocked_until.strftime('%H:%M')})")
|
|
print(f" {len(urls) - i - 1} videos rămase pentru retry.")
|
|
break
|
|
|
|
if i < len(urls) - 1:
|
|
print(f" 💤 Sleeping {SLEEP_BETWEEN}s...")
|
|
time.sleep(SLEEP_BETWEEN)
|
|
|
|
print("-" * 40)
|
|
print(f"✅ Done: {success}/{len(urls)} videos")
|
|
|
|
# Clear rate limit state if we had successful downloads without hitting limit
|
|
if success > 0 and not rate_limited:
|
|
clear_rate_limit()
|
|
|
|
if rate_limited:
|
|
sys.exit(2) # Rate limit hit
|
|
|
|
sys.exit(0 if success == len(urls) else 1)
|
|
|
|
if __name__ == "__main__":
|
|
main()
|