136 lines
3.8 KiB
Python
136 lines
3.8 KiB
Python
"""YouTube subtitle-download + note-creation endpoint."""
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import traceback
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
import constants
|
|
|
|
|
|
def _clean_vtt(content):
|
|
"""Convert VTT captions to plain text."""
|
|
lines = []
|
|
seen = set()
|
|
for line in content.split('\n'):
|
|
if any([
|
|
line.startswith('WEBVTT'),
|
|
line.startswith('Kind:'),
|
|
line.startswith('Language:'),
|
|
'-->' in line,
|
|
line.strip().startswith('<'),
|
|
not line.strip(),
|
|
re.match(r'^\d+$', line.strip()),
|
|
]):
|
|
continue
|
|
clean = re.sub(r'<[^>]+>', '', line).strip()
|
|
if clean and clean not in seen:
|
|
seen.add(clean)
|
|
lines.append(clean)
|
|
return ' '.join(lines)
|
|
|
|
|
|
def _process_youtube(url):
    """Download English auto-subtitles for *url* and save them as a note.

    Steps:
      1. Ask yt-dlp for the video metadata (title, duration, id).
      2. Download the auto-generated English subtitles as VTT into a
         private temporary directory.
      3. Flatten the VTT with ``_clean_vtt``.
      4. Write a Markdown note into ``constants.NOTES_DIR`` and run the
         ``update_notes_index.py`` tool from ``constants.TOOLS_DIR``.

    Args:
        url: Video URL handed to yt-dlp.

    Returns:
        The created note's filename, or ``None`` when the metadata could not
        be fetched or no subtitles were found.

    Raises:
        subprocess.TimeoutExpired: a yt-dlp call exceeded its timeout.
        json.JSONDecodeError: yt-dlp's metadata output was not valid JSON.
    """
    # Function-scope import: tempfile is not among the module-level imports.
    import tempfile

    yt_dlp = os.path.expanduser('~/.local/bin/yt-dlp')

    # '--' ends option parsing so a URL that starts with '-' can never be
    # read as a yt-dlp flag.  '--no-playlist' restricts a
    # "watch?v=...&list=..." URL to the single video; without it
    # --dump-json emits one JSON document per playlist entry and
    # json.loads() below fails.
    result = subprocess.run(
        [yt_dlp, '--dump-json', '--no-download', '--no-playlist', '--', url],
        capture_output=True, text=True, timeout=30,
    )
    if result.returncode != 0:
        print(f"Failed to get video info: {result.stderr}")
        return None

    info = json.loads(result.stdout)
    # The keys may be present but null (e.g. no known duration), so a plain
    # .get() default is not enough.
    title = str(info.get('title') or 'Unknown')
    try:
        # Coerce to int: a float would break the ':02d' format below.
        duration = int(info.get('duration') or 0)
    except (TypeError, ValueError):
        duration = 0

    transcript = None
    # A private temp dir per call: concurrent requests cannot delete or pick
    # up each other's subtitle files, and cleanup happens even on timeout.
    with tempfile.TemporaryDirectory(prefix='yt_subs_') as tmp:
        temp_dir = Path(tmp)
        subs = subprocess.run([
            yt_dlp, '--write-auto-subs', '--sub-langs', 'en',
            '--skip-download', '--sub-format', 'vtt',
            '--no-playlist',
            '-o', str(temp_dir / '%(id)s'),
            '--', url,
        ], capture_output=True, timeout=120)
        if subs.returncode != 0:
            stderr = subs.stderr.decode('utf-8', errors='replace')
            print(f"Subtitle download failed: {stderr}")

        # Only the first subtitle file is used; sorting makes the choice
        # deterministic if yt-dlp ever writes more than one.
        for sub_file in sorted(temp_dir.glob('*.vtt')):
            content = sub_file.read_text(encoding='utf-8', errors='replace')
            transcript = _clean_vtt(content)
            break

    if not transcript:
        print("No subtitles found")
        return None

    date_str = datetime.now().strftime('%Y-%m-%d')
    slug = re.sub(r'[^\w\s-]', '', title.lower())[:50].strip().replace(' ', '-')
    if not slug:
        # A title made only of punctuation/symbols leaves nothing behind;
        # fall back to the video id so the note is not named "DATE_.md".
        slug = re.sub(r'[^\w-]', '', str(info.get('id') or '')) or 'video'
    filename = f"{date_str}_{slug}.md"

    # The transcript is capped at 15000 characters in the note body.
    note_content = f"""# {title}

**Video:** {url}
**Duration:** {duration // 60}:{duration % 60:02d}
**Saved:** {date_str}
**Tags:** #youtube #to-summarize

---

## Transcript

{transcript[:15000]}

---

*Notă: Sumarizarea va fi adăugată de Echo.*
"""

    constants.NOTES_DIR.mkdir(parents=True, exist_ok=True)
    note_path = constants.NOTES_DIR / filename
    note_path.write_text(note_content, encoding='utf-8')

    # Best effort: the note is already saved, so an index failure is only
    # logged and does not fail the request.
    index_run = subprocess.run(
        [sys.executable, str(constants.TOOLS_DIR / 'update_notes_index.py')],
        capture_output=True,
    )
    if index_run.returncode != 0:
        print(f"Notes index update failed (exit {index_run.returncode})")

    print(f"Created note: {filename}")
    return filename
|
|
|
|
|
|
class YoutubeHandlers:
    """Mixin for /api/youtube.

    The host class is expected to provide ``headers``, ``rfile`` and
    ``send_json(payload, status)``, all of which ``handle_youtube`` uses.
    """

    @staticmethod
    def _is_youtube_url(url):
        """Return True when *url* points at a youtube.com / youtu.be host.

        The check is on the parsed hostname rather than a substring, so
        values such as ``https://evil.example/?youtube.com`` or option-like
        strings starting with ``-`` are rejected.  Scheme-less input such as
        ``youtu.be/ID`` is accepted by assuming https.
        """
        # Function-scope import: urllib is not among the module-level imports.
        from urllib.parse import urlsplit

        if not isinstance(url, str) or not url or url.startswith('-'):
            return False
        has_scheme = url.lower().startswith(('http://', 'https://'))
        candidate = url if has_scheme else f'https://{url}'
        try:
            host = (urlsplit(candidate).hostname or '').lower()
        except ValueError:
            # urlsplit rejects some malformed authorities (e.g. bad IPv6).
            return False
        if host in ('youtube.com', 'youtu.be'):
            return True
        return host.endswith(('.youtube.com', '.youtu.be'))

    def handle_youtube(self):
        """Process a YouTube URL: download subs, save note.

        Reads a JSON body of the form ``{"url": "..."}`` and replies through
        ``send_json``:

        * 400 ``{'error': ...}`` for a missing/invalid body or a URL that is
          not a YouTube URL;
        * 200 ``{'status': 'done', ...}`` when the note was created;
        * 502 ``{'status': 'error', ...}`` when processing finished without
          producing a note (metadata fetch failed or no subtitles);
        * 500 ``{'status': 'error', ...}`` when processing raised.
        """
        try:
            try:
                content_length = int(self.headers.get('Content-Length') or 0)
            except (TypeError, ValueError):
                content_length = 0
            if content_length <= 0:
                # Also guards against read(-1), which would read until EOF.
                self.send_json({'error': 'JSON invalid'}, 400)
                return

            try:
                post_data = self.rfile.read(content_length).decode('utf-8')
                data = json.loads(post_data)
            except ValueError:
                # Covers both UnicodeDecodeError and json.JSONDecodeError.
                self.send_json({'error': 'JSON invalid'}, 400)
                return

            url = data.get('url') if isinstance(data, dict) else None
            url = url.strip() if isinstance(url, str) else ''

            if not self._is_youtube_url(url):
                self.send_json({'error': 'URL YouTube invalid'}, 400)
                return

            try:
                print(f"Processing YouTube URL: {url}")
                filename = _process_youtube(url)
                if not filename:
                    # _process_youtube returns None when it could not fetch
                    # the video info or found no subtitles; do not claim
                    # that a note was created.
                    self.send_json({
                        'status': 'error',
                        'message': 'Eroare: nu am putut obține subtitrările pentru acest video.',
                    }, 502)
                    return
                self.send_json({
                    'status': 'done',
                    'message': 'Notița a fost creată! Refresh pagina Notes.',
                })
            except Exception as e:
                print(f"YouTube processing error: {e}")
                traceback.print_exc()
                self.send_json({'status': 'error', 'message': f'Eroare: {str(e)}'}, 500)
        except Exception as e:
            self.send_json({'error': str(e)}, 500)
|