YouTube rate limit protection + progressive retry system

- New: tools/yt_download.py with cookies support + rate limit tracking
- New: docs/YOUTUBE-SETUP.md complete documentation
- Updated: night-execute jobs to use new script
- Updated: TOOLS.md with YouTube section
- Added: 5 new YouTube notes (OpenClaw, cost optimization, task system, leads)
- Added: credentials/ to .gitignore
This commit is contained in:
Echo
2026-02-03 07:11:21 +00:00
parent b0c9b254f1
commit 762ac21681
24 changed files with 2295 additions and 45 deletions

192
tools/yt_download.py Executable file
View File

@@ -0,0 +1,192 @@
#!/usr/bin/env python3
"""
YouTube subtitle downloader with rate limit protection and progressive retry.
Usage: python3 yt_download.py URL [URL2] [URL3] ...
Features:
- Cookies support for higher limits
- Sleep between downloads to avoid rate limiting
- Progressive retry: 2h → 4h → 24h on 429
- Tracks rate limit state in JSON file
"""
import subprocess
import sys
import time
import os
import json
from pathlib import Path
from datetime import datetime, timedelta
# Add deno to PATH — NOTE(review): presumably required by yt-dlp's
# "--remote-components ejs:github" option used below; confirm.
os.environ["PATH"] = f"{Path.home()}/.deno/bin:" + os.environ.get("PATH", "")
# Optional cookie jar passed to yt-dlp; used only when the file exists.
COOKIES_FILE = Path(__file__).parent.parent / "credentials" / "youtube-cookies.txt"
# JSON file persisting 429 bookkeeping (last hit, retry count, cooldown) across runs.
RATE_LIMIT_FILE = Path(__file__).parent.parent / "memory" / "youtube-rate-limit.json"
SLEEP_BETWEEN = 20 # seconds between downloads
MAX_PER_SESSION = 30  # hard cap on URLs processed in one invocation
# Progressive retry delays (in hours); escalates per consecutive 429, capped at the last entry
RETRY_DELAYS = [2, 4, 24]
def load_rate_limit_state() -> dict:
    """Load rate limit state from JSON file.

    Returns:
        The persisted state dict, or a fresh default state when the file
        is missing, unreadable, or contains invalid JSON.
    """
    if RATE_LIMIT_FILE.exists():
        try:
            with open(RATE_LIMIT_FILE) as f:
                return json.load(f)
        except (OSError, json.JSONDecodeError):
            # State is best-effort bookkeeping: a corrupt or unreadable
            # file falls through to defaults instead of crashing.
            # (Was a bare `except:`, which also swallowed SystemExit
            # and KeyboardInterrupt.)
            pass
    return {"last_429": None, "retry_count": 0, "blocked_until": None}
def save_rate_limit_state(state: dict):
    """Persist the rate-limit state as pretty-printed JSON, creating parent dirs."""
    RATE_LIMIT_FILE.parent.mkdir(parents=True, exist_ok=True)
    # default=str serializes any non-JSON value (e.g. datetime) via str().
    RATE_LIMIT_FILE.write_text(json.dumps(state, indent=2, default=str))
def check_rate_limit() -> tuple[bool, str]:
    """
    Check if we're still in a rate limit cooldown period.
    Returns: (can_proceed, message)
    """
    state = load_rate_limit_state()
    blocked_raw = state.get("blocked_until")
    if not blocked_raw:
        return True, "OK"
    blocked_until = datetime.fromisoformat(blocked_raw)
    now = datetime.now()
    if now >= blocked_until:
        # Cooldown elapsed: reset the retry ladder for the next session.
        state["retry_count"] = 0
        state["blocked_until"] = None
        save_rate_limit_state(state)
        return True, "OK"
    hours = (blocked_until - now).total_seconds() / 3600
    return False, f"⏳ Rate limited. Retry în {hours:.1f}h ({blocked_until.strftime('%H:%M')})"
def record_rate_limit():
    """Record a 429 hit and compute the next allowed retry time.

    Returns: (delay_hours, blocked_until) for the caller to report.
    """
    state = load_rate_limit_state()
    attempts = state.get("retry_count", 0)
    # Walk up the escalation ladder; clamp at the final (longest) delay.
    delay_hours = RETRY_DELAYS[min(attempts, len(RETRY_DELAYS) - 1)]
    blocked_until = datetime.now() + timedelta(hours=delay_hours)
    state["last_429"] = datetime.now().isoformat()
    state["retry_count"] = attempts + 1
    state["blocked_until"] = blocked_until.isoformat()
    save_rate_limit_state(state)
    return delay_hours, blocked_until
def clear_rate_limit():
    """Reset the retry ladder after a session with no 429 hits."""
    state = load_rate_limit_state()
    if not state.get("retry_count", 0):
        # Nothing to clear — avoid a pointless disk write.
        return
    state["retry_count"] = 0
    state["blocked_until"] = None
    save_rate_limit_state(state)
def download_subtitles(url: str, use_cookies: bool = True) -> tuple[bool, bool]:
    """
    Download subtitles for a single video via yt-dlp.
    Returns: (success, rate_limited)
    """
    args = [
        "yt-dlp",
        "--remote-components", "ejs:github",
        "--write-auto-sub",
        "--sub-lang", "en,ro",
        "--skip-download",
        "-o", "temp_%(id)s",
    ]
    # Cookies are optional; attach them only when the jar actually exists.
    if use_cookies and COOKIES_FILE.exists():
        args += ["--cookies", str(COOKIES_FILE)]
    args.append(url)
    print(f"📥 Downloading: {url}")
    proc = subprocess.run(args, capture_output=True, text=True)
    combined_output = proc.stdout + proc.stderr
    # "429" anywhere in the output is treated as a YouTube throttle signal.
    if "429" in combined_output:
        print(f" ⚠️ Rate limited (429)")
        return False, True
    # Success when yt-dlp reports writing subtitles OR exits cleanly.
    if "Writing video subtitles" in combined_output or proc.returncode == 0:
        print(f" ✅ Success")
        return True, False
    print(f" ❌ Error: {combined_output[:200]}")
    return False, False
def main():
    """CLI entry point: download subtitles for each URL on the command line.

    Exit codes: 0 all succeeded, 1 partial/none, 2 hit a 429, 3 in cooldown.
    """
    urls = sys.argv[1:]
    if not urls:
        # No arguments: print usage plus the persisted rate-limit state.
        print("Usage: python3 yt_download.py URL [URL2] ...")
        print("\nRate limit state:")
        state = load_rate_limit_state()
        print(f" Retry count: {state.get('retry_count', 0)}")
        print(f" Blocked until: {state.get('blocked_until', 'Not blocked')}")
        sys.exit(1)
    # Refuse to run while a 429 cooldown is active.
    allowed, message = check_rate_limit()
    if not allowed:
        print(message)
        sys.exit(3)  # Special exit code for cooldown
    if len(urls) > MAX_PER_SESSION:
        print(f"⚠️ Max {MAX_PER_SESSION} per session. Processing first {MAX_PER_SESSION}.")
        urls = urls[:MAX_PER_SESSION]
    has_cookies = COOKIES_FILE.exists()
    state = load_rate_limit_state()
    print(f"🍪 Cookies: {'YES' if has_cookies else 'NO'}")
    print(f"⏱️ Sleep: {SLEEP_BETWEEN}s between videos")
    print(f"📊 Videos: {len(urls)}")
    print(f"🔄 Retry count: {state.get('retry_count', 0)}")
    print("-" * 40)
    completed = 0
    hit_limit = False
    total = len(urls)
    for idx, url in enumerate(urls):
        ok, limited = download_subtitles(url, has_cookies)
        if ok:
            completed += 1
        if limited:
            # Stop immediately on a 429 and schedule the next retry window.
            hit_limit = True
            delay_hours, blocked_until = record_rate_limit()
            print(f"🛑 Rate limit hit! Retry în {delay_hours}h ({blocked_until.strftime('%H:%M')})")
            print(f" {total - idx - 1} videos rămase pentru retry.")
            break
        if idx != total - 1:
            # Pace requests between videos to stay under the radar.
            print(f" 💤 Sleeping {SLEEP_BETWEEN}s...")
            time.sleep(SLEEP_BETWEEN)
    print("-" * 40)
    print(f"✅ Done: {completed}/{total} videos")
    # Only a session with successes and zero 429s resets the retry ladder.
    if completed > 0 and not hit_limit:
        clear_rate_limit()
    if hit_limit:
        sys.exit(2)  # Rate limit hit
    sys.exit(0 if completed == total else 1)
# Script entry point guard: run only when executed directly, not on import.
if __name__ == "__main__":
    main()