Extraction: Replace youtube-dl with custom-built watch page extraction
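In outline, the youtube-dl call is replaced by fetching the watch page's polymer JSON directly and feeding it to the new extractor in yt_data_extract. A rough sketch of the new flow, simplified from the hunks below (names as they appear in this commit):

    polymer_json = util.fetch_url('https://m.youtube.com/watch?v=' + video_id + '&pbj=1',
                                  headers=headers, debug_name='watch')      # mobile headers defined in watch.py
    info = yt_data_extract.extract_watch_info(json.loads(polymer_json))     # dict with 'formats', 'title', 'author', 'error', ...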
@@ -187,8 +187,17 @@
.format-ext{
    width: 60px;
}
.format-res{
    width: 90px;
}
.format-video-quality{
    width: 140px;
}
.format-audio-quality{
    width: 120px;
}
.format-file-size{
    width: 80px;
}
.format-codecs{
    width: 120px;
}
{% endblock style %}

@@ -227,8 +236,10 @@
<a class="download-link" href="{{ format['url'] }}">
    <ol class="format-attributes">
        <li class="format-ext">{{ format['ext'] }}</li>
        <li class="format-res">{{ format['resolution'] }}</li>
        <li class="format-note">{{ format['note'] }}</li>
        <li class="format-video-quality">{{ format['video_quality'] }}</li>
        <li class="format-audio-quality">{{ format['audio_quality'] }}</li>
        <li class="format-file-size">{{ format['file_size'] }}</li>
        <li class="format-codecs">{{ format['codecs'] }}</li>
    </ol>
</a>
</li>

@@ -238,7 +249,7 @@
<input class="checkbox" name="video_info_list" value="{{ video_info }}" form="playlist-edit" type="checkbox">

<span class="description">{{ description }}</span>
<span class="description">{{ common_elements.text_runs(description) }}</span>
<div class="music-list">
    {% if music_list.__len__() != 0 %}
    <hr>
@@ -176,7 +176,7 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, cookieja
        return content, response
    return content

mobile_user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
mobile_user_agent = 'Mozilla/5.0 (Linux; Android 7.0; Redmi Note 4 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36'
mobile_ua = (('User-Agent', mobile_user_agent),)
desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0'
desktop_ua = (('User-Agent', desktop_user_agent),)

@@ -312,3 +312,10 @@ def uppercase_escape(s):
def prefix_url(url):
    url = url.lstrip('/') # some urls have // before them, which has a special meaning
    return '/' + url

def left_remove(string, substring):
    '''removes substring from the start of string, if present'''
    if string.startswith(substring):
        return string[len(substring):]
    return string
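For instance, left_remove is what later strips the 'published on ' prefix from dateText in yt_data_extract (illustrative calls, not part of the diff):

    left_remove('published on sep 3 2019', 'published on ')   # -> 'sep 3 2019'
    left_remove('sep 3 2019', 'published on ')                 # -> unchanged when the prefix is absent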

youtube/watch.py
@@ -5,49 +5,15 @@ import settings
from flask import request
import flask

from youtube_dl.YoutubeDL import YoutubeDL
from youtube_dl.extractor.youtube import YoutubeError
import json
import html
import gevent
import os
import math
import traceback


def get_related_items(info):
    results = []
    for item in info['related_vids']:
        if 'list' in item: # playlist:
            result = watch_page_related_playlist_info(item)
        else:
            result = watch_page_related_video_info(item)
        yt_data_extract.prefix_urls(result)
        yt_data_extract.add_extra_html_info(result)
        results.append(result)
    return results


# json of related items retrieved directly from the watch page has different names for everything
# converts these to standard names
def watch_page_related_video_info(item):
    result = {key: item[key] for key in ('id', 'title', 'author')}
    result['duration'] = util.seconds_to_timestamp(item['length_seconds'])
    try:
        result['views'] = item['short_view_count_text']
    except KeyError:
        result['views'] = ''
    result['thumbnail'] = util.get_thumbnail_url(item['id'])
    result['type'] = 'video'
    return result

def watch_page_related_playlist_info(item):
    return {
        'size': item['playlist_length'] if item['playlist_length'] != "0" else "50+",
        'title': item['playlist_title'],
        'id': item['list'],
        'first_video_id': item['video_id'],
        'thumbnail': util.get_thumbnail_url(item['video_id']),
        'type': 'playlist',
    }

def get_video_sources(info):
    video_sources = []
@@ -55,9 +21,10 @@ def get_video_sources(info):
        max_resolution = 360
    else:
        max_resolution = settings.default_resolution

    for format in info['formats']:
        if format['acodec'] != 'none' and format['vcodec'] != 'none' and format['height'] <= max_resolution:
        if not all(attr in format for attr in ('height', 'width', 'ext', 'url')):
            continue
        if 'acodec' in format and 'vcodec' in format and format['height'] <= max_resolution:
            video_sources.append({
                'src': format['url'],
                'type': 'video/' + format['ext'],

@@ -134,14 +101,57 @@ def get_ordered_music_list_attributes(music_list):

    return ordered_attributes

headers = (
    ('Accept', '*/*'),
    ('Accept-Language', 'en-US,en;q=0.5'),
    ('X-YouTube-Client-Name', '2'),
    ('X-YouTube-Client-Version', '2.20180830'),
) + util.mobile_ua

def extract_info(downloader, *args, **kwargs):
def extract_info(video_id):
    polymer_json = util.fetch_url('https://m.youtube.com/watch?v=' + video_id + '&pbj=1', headers=headers, debug_name='watch')
    try:
        return downloader.extract_info(*args, **kwargs)
    except YoutubeError as e:
        return str(e)
        polymer_json = json.loads(polymer_json)
    except json.decoder.JSONDecodeError:
        traceback.print_exc()
        return {'error': 'Failed to parse json response'}
    return yt_data_extract.extract_watch_info(polymer_json)

def video_quality_string(format):
    if 'vcodec' in format:
        result = str(format.get('width', '?')) + 'x' + str(format.get('height', '?'))
        if 'fps' in format:
            result += ' ' + str(format['fps']) + 'fps'
        return result
    elif 'acodec' in format:
        return 'audio only'

    return '?'

def audio_quality_string(format):
    if 'acodec' in format:
        result = str(format.get('abr', '?')) + 'k'
        if 'audio_sample_rate' in format:
            result += ' ' + str(format['audio_sample_rate']) + ' Hz'
        return result
    elif 'vcodec' in format:
        return 'video only'

    return '?'

# from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py
def format_bytes(bytes):
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
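For reference, the helper renders sizes like this (illustrative calls, not part of the diff):

    format_bytes(None)     # -> 'N/A'
    format_bytes(1536)     # -> '1.50KiB'
    format_bytes('2048')   # -> '2.00KiB' (strings are converted to float first)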


@yt_app.route('/watch')
@@ -152,38 +162,26 @@ def get_watch_page():
        flask.abort(flask.Response('Incomplete video id (too short): ' + video_id))

    lc = request.args.get('lc', '')
    if settings.route_tor:
        proxy = 'socks5://127.0.0.1:9150/'
    else:
        proxy = ''
    yt_dl_downloader = YoutubeDL(params={'youtube_include_dash_manifest':False, 'proxy':proxy})
    tasks = (
        gevent.spawn(comments.video_comments, video_id, int(settings.default_comment_sorting), lc=lc ),
        gevent.spawn(extract_info, yt_dl_downloader, "https://www.youtube.com/watch?v=" + video_id, download=False)
        gevent.spawn(extract_info, video_id)
    )
    gevent.joinall(tasks)
    comments_info, info = tasks[0].value, tasks[1].value

    if isinstance(info, str): # youtube error
        return flask.render_template('error.html', error_message = info)
    if info['error']:
        return flask.render_template('error.html', error_message = info['error'])

    video_info = {
        "duration": util.seconds_to_timestamp(info["duration"]),
        "duration": util.seconds_to_timestamp(info["duration"] or 0),
        "id": info['id'],
        "title": info['title'],
        "author": info['uploader'],
        "author": info['author'],
    }

    upload_year = info["upload_date"][0:4]
    upload_month = info["upload_date"][4:6]
    upload_day = info["upload_date"][6:8]
    upload_date = upload_month + "/" + upload_day + "/" + upload_year

    if settings.related_videos_mode:
        related_videos = get_related_items(info)
    else:
        related_videos = []

    for item in info['related_videos']:
        yt_data_extract.prefix_urls(item)
        yt_data_extract.add_extra_html_info(item)

    if settings.gather_googlevideo_domains:
        with open(os.path.join(settings.data_dir, 'googlevideo-domains.txt'), 'a+', encoding='utf-8') as f:
@@ -195,23 +193,29 @@ def get_watch_page():
    download_formats = []

    for format in info['formats']:
        if 'acodec' in format and 'vcodec' in format:
            codecs_string = format['acodec'] + ', ' + format['vcodec']
        else:
            codecs_string = format.get('acodec') or format.get('vcodec') or '?'
        download_formats.append({
            'url': format['url'],
            'ext': format['ext'],
            'resolution': yt_dl_downloader.format_resolution(format),
            'note': yt_dl_downloader._format_note(format),
            'ext': format.get('ext', '?'),
            'audio_quality': audio_quality_string(format),
            'video_quality': video_quality_string(format),
            'file_size': format_bytes(format['file_size']),
            'codecs': codecs_string,
        })

    video_sources = get_video_sources(info)
    video_height = video_sources[0]['height']

    video_height = yt_data_extract.default_multi_get(video_sources, 0, 'height', default=360)
    video_width = yt_data_extract.default_multi_get(video_sources, 0, 'width', default=640)
    # 1 second per pixel, or the actual video width
    theater_video_target_width = max(640, info['duration'], video_sources[0]['width'])
    theater_video_target_width = max(640, info['duration'] or 0, video_width)

    return flask.render_template('watch.html',
        header_playlist_names = local_playlist.get_playlist_names(),
        uploader_channel_url = '/' + info['uploader_url'],
        upload_date = upload_date,
        uploader_channel_url = ('/' + info['author_url']) if info['author_url'] else '',
        upload_date = info['published_date'],
        views = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("view_count", None)),
        likes = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("like_count", None)),
        dislikes = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("dislike_count", None)),
@@ -219,7 +223,7 @@ def get_watch_page():
        video_info = json.dumps(video_info),
        video_sources = video_sources,
        subtitle_sources = get_subtitle_sources(info),
        related = related_videos,
        related = info['related_videos'],
        music_list = info['music_list'],
        music_attributes = get_ordered_music_list_attributes(info['music_list']),
        comments_info = comments_info,
@@ -232,7 +236,7 @@ def get_watch_page():
        theater_video_target_width = theater_video_target_width,

        title = info['title'],
        uploader = info['uploader'],
        uploader = info['author'],
        description = info['description'],
        unlisted = info['unlisted'],
    )
@@ -6,6 +6,7 @@ import re
import urllib
import collections
from math import ceil
import traceback

# videos (all of type str):

@@ -36,8 +37,112 @@ from math import ceil
#   size
#   first_video_id

# from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py
_formats = {
    '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
    '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
    '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
    '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
    '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
    '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
    '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
    '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
    # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
    '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
    '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
    '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
    '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
    '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
    '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
    '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
    '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
    '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},


    # 3D videos
    '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
    '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
    '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
    '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
    '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
    '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
    '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},

    # Apple HTTP Live Streaming
    '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264'},
    '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264'},
    '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
    '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
    '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264'},
    '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264'},
    '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264'},
    '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264'},

    # DASH mp4 video
    '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
    '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
    '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
    '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},

    # Dash mp4 audio
    '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
    '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
    '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
    '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
    '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
    '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
    '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},

    # Dash webm
    '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
    '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
    '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
    '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
    '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
    '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
    '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
    '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
    '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
    '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
    '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
    '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

    # Dash webm audio
    '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
    '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},

    # Dash webm audio with opus inside
    '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
    '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
    '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},

    # RTMP (unnamed)
    '_rtmp': {'protocol': 'rtmp'},

    # av01 video only formats sometimes served with "unknown" codecs
    '394': {'vcodec': 'av01.0.05M.08'},
    '395': {'vcodec': 'av01.0.05M.08'},
    '396': {'vcodec': 'av01.0.05M.08'},
    '397': {'vcodec': 'av01.0.05M.08'},
}

def get_plain_text(node):
@@ -59,7 +164,7 @@ def format_text_runs(runs):
            result += html.escape(text_run["text"])
    return result

def default_get(object, key, default, types=()):
def default_get(object, key, default=None, types=()):
    '''Like dict.get(), but returns default if the result doesn't match one of the types.
       Also works for indexing lists.'''
    try:
@@ -74,7 +179,7 @@ def default_get(object, key, default, types=()):



def default_multi_get(object, *keys, default, types=()):
def default_multi_get(object, *keys, default=None, types=()):
    '''Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices.
       Last argument is the default value to use in case of any IndexErrors or KeyErrors.
       If types is given and the result doesn't match one of those types, default is returned'''
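The function body falls outside this hunk; going by the docstring, lookups behave roughly like this (illustrative calls, not part of the diff):

    default_multi_get({'a': [{'b': 1}]}, 'a', 0, 'b', default=None)    # -> 1
    default_multi_get({'a': []}, 'a', 0, 'b', default=None)            # -> None (IndexError swallowed)
    default_multi_get({'a': 5}, 'a', default=None, types=(str,))       # -> None (5 is not a str)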
@@ -106,6 +211,11 @@ def multi_default_multi_get(object, *key_sequences, default=None, types=()):
            continue
    return default

def remove_redirect(url):
    if re.fullmatch(r'(((https?:)?//)?(www.)?youtube.com)?/redirect\?.*', url) is not None: # youtube puts these on external links to do tracking
        query_string = url[url.find('?')+1: ]
        return urllib.parse.parse_qs(query_string)['q'][0]
    return url
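For example, a tracked external link is unwrapped to its q parameter (illustrative calls, not part of the diff):

    remove_redirect('https://www.youtube.com/redirect?q=https%3A%2F%2Fexample.com&v=abc')   # -> 'https://example.com'
    remove_redirect('https://example.com/page')   # non-redirect urls pass through unchanged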

def get_url(node):
    try:
@@ -239,9 +349,9 @@ def renderer_info(renderer, additional_info={}):
    type = list(renderer.keys())[0]
    renderer = renderer[type]
    info = {}
    if type == 'itemSectionRenderer':
    if type in ('itemSectionRenderer', 'compactAutoplayRenderer'):
        return renderer_info(renderer['contents'][0], additional_info)

    if type in ('movieRenderer', 'clarificationRenderer'):
        info['type'] = 'unsupported'
        return info
@@ -345,6 +455,7 @@ item_types = {

    'videoRenderer',
    'compactVideoRenderer',
    'compactAutoplayRenderer',
    'gridVideoRenderer',
    'playlistVideoRenderer',

@@ -378,6 +489,11 @@ def traverse_browse_renderer(renderer):
    print('Could not find tab with content')
    return {}

def traverse_standard_list(renderer):
    renderer_list = multi_default_multi_get(renderer, ['contents'], ['items'], default=(), types=(list, tuple))
    continuation = default_multi_get(renderer, 'continuations', 0, 'nextContinuationData', 'continuation')
    return renderer_list, continuation

# these renderers contain one inside them
nested_renderer_dispatch = {
    'singleColumnBrowseResultsRenderer': traverse_browse_renderer,
@@ -385,7 +501,16 @@ nested_renderer_dispatch = {
    'twoColumnSearchResultsRenderer': lambda renderer: default_get(renderer, 'primaryContents', {}, types=dict),
}

def extract_items(response):
# these renderers contain a list of renderers inside them
nested_renderer_list_dispatch = {
    'sectionListRenderer': traverse_standard_list,
    'itemSectionRenderer': traverse_standard_list,
    'gridRenderer': traverse_standard_list,
    'playlistVideoListRenderer': traverse_standard_list,
    'singleColumnWatchNextResults': lambda r: (default_multi_get(r, 'results', 'results', 'contents', default=[], types=(list, tuple)), None),
}

def extract_items(response, item_types=item_types):
    '''return items, ctoken'''
    if 'continuationContents' in response:
        # always has just the one [something]Continuation key, but do this just in case they add some tracking key or something
@@ -414,13 +539,11 @@ def extract_items(response):
            key, value = list(renderer.items())[0]

            # has a list in it, add it to the iter stack
            if key in list_types:
                renderer_list = multi_default_multi_get(value, ['contents'], ['items'], default=(), types=(list, tuple))
            if key in nested_renderer_list_dispatch:
                renderer_list, continuation = nested_renderer_list_dispatch[key](value)
                if renderer_list:
                    iter_stack.append(current_iter)
                    current_iter = iter(renderer_list)

                continuation = default_multi_get(value, 'continuations', 0, 'nextContinuationData', 'continuation', default=None, types=str)
                if continuation:
                    ctoken = continuation

@@ -506,10 +629,7 @@ def extract_channel_info(polymer_json, tab):

    info['links'] = []
    for link_json in channel_metadata.get('primaryLinks', ()):
        url = link_json['navigationEndpoint']['urlEndpoint']['url']
        if url.startswith('/redirect'): # youtube puts these on external links to do tracking
            query_string = url[url.find('?')+1: ]
            url = urllib.parse.parse_qs(query_string)['q'][0]
        url = remove_redirect(link_json['navigationEndpoint']['urlEndpoint']['url'])

        text = get_plain_text(link_json['title'])

@@ -699,5 +819,290 @@ def parse_comments_polymer(polymer_json):
        'sort': metadata['sort'],
    }

def check_missing_keys(object, *key_sequences):
    for key_sequence in key_sequences:
        _object = object
        try:
            for key in key_sequence:
                _object = _object[key]
        except (KeyError, IndexError, TypeError):
            return 'Could not find ' + key

    return None
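Usage mirrors the calls in extract_watch_info below (illustrative calls, not part of the diff):

    check_missing_keys({'player': {'args': {}}}, ['player', 'args'])   # -> None (all keys present)
    check_missing_keys({}, ['playerResponse'])                          # -> 'Could not find playerResponse'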

def extract_plain_text(node, default=None):
    if isinstance(node, str):
        return node

    try:
        return node['simpleText']
    except (KeyError, TypeError):
        pass

    try:
        return ''.join(text_run['text'] for text_run in node['runs'])
    except (KeyError, TypeError):
        pass

    return default
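It accepts the two text shapes YouTube uses, simpleText and runs (illustrative calls, not part of the diff):

    extract_plain_text({'simpleText': '1,234 views'})                       # -> '1,234 views'
    extract_plain_text({'runs': [{'text': '1,234'}, {'text': ' views'}]})   # -> '1,234 views'
    extract_plain_text(None, default='')                                    # -> ''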

def extract_formatted_text(node):
    try:
        result = []
        runs = node['runs']
        for run in runs:
            url = default_multi_get(run, 'navigationEndpoint', 'urlEndpoint', 'url')
            if url is not None:
                run['url'] = remove_redirect(url)
                run['text'] = run['url'] # youtube truncates the url text, we don't want that nonsense
        return runs
    except (KeyError, TypeError):
        traceback.print_exc()
        pass

    try:
        return [{'text': node['simpleText']}]
    except (KeyError, TypeError):
        pass

    return []

def extract_integer(string):
    if not isinstance(string, str):
        return None
    match = re.search(r'(\d+)', string.replace(',', ''))
    if match is None:
        return None
    try:
        return int(match.group(1))
    except ValueError:
        return None
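For example, the count strings YouTube returns parse like this (illustrative calls, not part of the diff):

    extract_integer('1,234,567 views')   # -> 1234567
    extract_integer('Comments')          # -> None (no digits)
    extract_integer(None)                # -> None (not a string)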

def extract_metadata_row_info(video_renderer_info):
    # extract category and music list
    info = {
        'category': None,
        'music_list': [],
    }

    current_song = {}
    for row in default_multi_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]):
        row_title = extract_plain_text(default_multi_get(row, 'metadataRowRenderer', 'title'), default='')
        row_content = extract_plain_text(default_multi_get(row, 'metadataRowRenderer', 'contents', 0))
        if row_title == 'Category':
            info['category'] = row_content
        elif row_title in ('Song', 'Music'):
            if current_song:
                info['music_list'].append(current_song)
            current_song = {'title': row_content}
        elif row_title == 'Artist':
            current_song['artist'] = row_content
        elif row_title == 'Album':
            current_song['album'] = row_content
        elif row_title == 'Writers':
            current_song['writers'] = row_content
        elif row_title.startswith('Licensed'):
            current_song['licensor'] = row_content
    if current_song:
        info['music_list'].append(current_song)

    return info


def extract_watch_info_mobile(top_level):
    info = {}
    microformat = default_multi_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={})

    info['allowed_countries'] = microformat.get('availableCountries', [])
    info['published_date'] = microformat.get('publishDate')

    response = top_level.get('response', {})

    # video info from metadata renderers
    items, _ = extract_items(response, item_types={'slimVideoMetadataRenderer'})
    if items:
        video_info = items[0]['slimVideoMetadataRenderer']
    else:
        print('Failed to extract video metadata')
        video_info = {}

    info.update(extract_metadata_row_info(video_info))
    #info['description'] = extract_formatted_text(video_info.get('description'))
    info['like_count'] = None
    info['dislike_count'] = None
    for button in video_info.get('buttons', ()):
        button_renderer = button.get('slimMetadataToggleButtonRenderer', {})

        # all the digits can be found in the accessibility data
        count = extract_integer(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label'))

        # this count doesn't have all the digits, it's like 53K for instance
        dumb_count = extract_integer(extract_plain_text(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText')))

        # the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0
        if dumb_count == 0:
            count = 0

        if 'isLike' in button_renderer:
            info['like_count'] = count
        elif 'isDislike' in button_renderer:
            info['dislike_count'] = count

    # comment section info
    items, _ = extract_items(response, item_types={'commentSectionRenderer'})
    if items:
        comment_info = items[0]['commentSectionRenderer']
        comment_count_text = extract_plain_text(default_multi_get(comment_info, 'header', 'commentSectionHeaderRenderer', 'countText'))
        if comment_count_text == 'Comments': # just this with no number, means 0 comments
            info['comment_count'] = 0
        else:
            info['comment_count'] = extract_integer(comment_count_text)
        info['comments_disabled'] = False
    else: # no comment section present means comments are disabled
        info['comment_count'] = 0
        info['comments_disabled'] = True

    # related videos
    related, _ = extract_items(response)
    info['related_videos'] = [renderer_info(renderer) for renderer in related]

    return info

month_abbreviations = {'jan':'1', 'feb':'2', 'mar':'3', 'apr':'4', 'may':'5', 'jun':'6', 'jul':'7', 'aug':'8', 'sep':'9', 'oct':'10', 'nov':'11', 'dec':'12'}
def extract_watch_info_desktop(top_level):
    info = {
        'comment_count': None,
        'comments_disabled': None,
        'allowed_countries': None,
    }

    video_info = {}
    for renderer in default_multi_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', default=()):
        if renderer and list(renderer.keys())[0] in ('videoPrimaryInfoRenderer', 'videoSecondaryInfoRenderer'):
            video_info.update(list(renderer.values())[0])

    info.update(extract_metadata_row_info(video_info))
    #info['description'] = extract_formatted_text(video_info.get('description', None))
    info['published_date'] = None
    date_text = extract_plain_text(video_info.get('dateText', None))
    if date_text is not None:
        date_text = util.left_remove(date_text.lower(), 'published on ').replace(',', '')
        parts = date_text.split()
        if len(parts) == 3:
            month, day, year = date_text.split()
            month = month_abbreviations.get(month[0:3]) # slicing in case they start writing out the full month name
            if month and (re.fullmatch(r'\d\d?', day) is not None) and (re.fullmatch(r'\d{4}', year) is not None):
                info['published_date'] = year + '-' + month + '-' + day
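    # For example, a dateText of 'Published on Sep 3, 2019' is lowercased and stripped to
    # 'sep 3 2019', which parses to year='2019', month='9', day='3' and yields '2019-9-3'.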

    likes_dislikes = default_multi_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/')
    if len(likes_dislikes) == 2:
        info['like_count'] = extract_integer(likes_dislikes[0])
        info['dislike_count'] = extract_integer(likes_dislikes[1])
    else:
        info['like_count'] = None
        info['dislike_count'] = None

    #info['title'] = extract_plain_text(video_info.get('title', None))
    #info['author'] = extract_plain_text(default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'title'))
    #info['author_id'] = default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId')
    #info['view_count'] = extract_integer(extract_plain_text(default_multi_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount')))

    related = default_multi_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[])
    info['related_videos'] = [renderer_info(renderer) for renderer in related]

    return info


def extract_watch_info(polymer_json):
    info = {'playability_error': None, 'error': None}

    if isinstance(polymer_json, dict):
        top_level = polymer_json
    elif isinstance(polymer_json, (list, tuple)):
        top_level = {}
        for page_part in polymer_json:
            if not isinstance(page_part, dict):
                return {'error': 'Invalid page part'}
            top_level.update(page_part)
    else:
        return {'error': 'Invalid top level polymer data'}

    error = check_missing_keys(top_level,
        ['playerResponse'],
    )
    if error:
        return {'error': error}

    error = check_missing_keys(top_level,
        ['player', 'args'],
        ['player', 'assets', 'js'],
    )
    if error:
        info['playability_error'] = error


    player_args = default_multi_get(top_level, 'player', 'args', default={})
    parsed_formats = []

    if 'url_encoded_fmt_stream_map' in player_args:
        string_formats = player_args['url_encoded_fmt_stream_map'].split(',')
        parsed_formats += [dict(urllib.parse.parse_qsl(fmt_string)) for fmt_string in string_formats if fmt_string]

    if 'adaptive_fmts' in player_args:
        string_formats = player_args['adaptive_fmts'].split(',')
        parsed_formats += [dict(urllib.parse.parse_qsl(fmt_string)) for fmt_string in string_formats if fmt_string]

    info['formats'] = []

    for parsed_fmt in parsed_formats:
        # start with defaults from the big table at the top
        if 'itag' in parsed_fmt:
            fmt = _formats.get(parsed_fmt['itag'], {}).copy()
        else:
            fmt = {}

        # then override them
        fmt.update(parsed_fmt)
        try:
            fmt['width'], fmt['height'] = map(int, fmt['size'].split('x'))
        except (KeyError, ValueError, TypeError):
            pass

        fmt['file_size'] = None
        if 'clen' in fmt:
            fmt['file_size'] = int(fmt.get('clen'))
        else:
            match = re.search(r'&clen=(\d+)', fmt.get('url', ''))
            if match:
                fmt['file_size'] = int(match.group(1))
        info['formats'].append(fmt)

    info['base_js'] = default_multi_get(top_level, 'player', 'assets', 'js')
    if info['base_js']:
        info['base_js'] = normalize_url(info['base_js'])

    mobile = 'singleColumnWatchNextResults' in default_multi_get(top_level, 'response', 'contents', default={})
    if mobile:
        info.update(extract_watch_info_mobile(top_level))
    else:
        info.update(extract_watch_info_desktop(top_level))

    # stuff from videoDetails
    video_details = default_multi_get(top_level, 'playerResponse', 'videoDetails', default={})
    info['title'] = extract_plain_text(video_details.get('title'))
    info['duration'] = extract_integer(video_details.get('lengthSeconds'))
    info['view_count'] = extract_integer(video_details.get('viewCount'))
    # videos with no description have a blank string
    info['description'] = video_details.get('shortDescription')
    info['id'] = video_details.get('videoId')
    info['author'] = video_details.get('author')
    info['author_id'] = video_details.get('channelId')
    info['live'] = video_details.get('isLiveContent')
    info['unlisted'] = not video_details.get('isCrawlable', True)
    info['tags'] = video_details.get('keywords', [])

    # other stuff
    info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
    info['subtitles'] = {} # TODO

    return info