Extraction: Replace youtube-dl with custom-built watch page extraction

This commit is contained in:
James Taylor
2019-10-17 19:58:13 -07:00
parent 9abb83fdbc
commit 4c07546e7a
45 changed files with 518 additions and 31348 deletions

View File

@@ -187,8 +187,17 @@
.format-ext{
width: 60px;
}
.format-res{
width:90px;
.format-video-quality{
width: 140px;
}
.format-audio-quality{
width: 120px;
}
.format-file-size{
width: 80px;
}
.format-codecs{
width: 120px;
}
{% endblock style %}
@@ -227,8 +236,10 @@
<a class="download-link" href="{{ format['url'] }}">
<ol class="format-attributes">
<li class="format-ext">{{ format['ext'] }}</li>
<li class="format-res">{{ format['resolution'] }}</li>
<li class="format-note">{{ format['note'] }}</li>
<li class="format-video-quality">{{ format['video_quality'] }}</li>
<li class="format-audio-quality">{{ format['audio_quality'] }}</li>
<li class="format-file-size">{{ format['file_size'] }}</li>
<li class="format-codecs">{{ format['codecs'] }}</li>
</ol>
</a>
</li>
@@ -238,7 +249,7 @@
<input class="checkbox" name="video_info_list" value="{{ video_info }}" form="playlist-edit" type="checkbox">
<span class="description">{{ description }}</span>
<span class="description">{{ common_elements.text_runs(description) }}</span>
<div class="music-list">
{% if music_list.__len__() != 0 %}
<hr>

View File

@@ -176,7 +176,7 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, cookieja
return content, response
return content
mobile_user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
mobile_user_agent = 'Mozilla/5.0 (Linux; Android 7.0; Redmi Note 4 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36'
mobile_ua = (('User-Agent', mobile_user_agent),)
desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0'
desktop_ua = (('User-Agent', desktop_user_agent),)
@@ -312,3 +312,10 @@ def uppercase_escape(s):
def prefix_url(url):
url = url.lstrip('/') # some urls have // before them, which has a special meaning
return '/' + url
def left_remove(string, substring):
    '''Strip a single leading occurrence of `substring` from `string`.

    Returns `string` unchanged when it does not start with `substring`.
    '''
    if not string.startswith(substring):
        return string
    return string[len(substring):]

View File

@@ -5,49 +5,15 @@ import settings
from flask import request
import flask
from youtube_dl.YoutubeDL import YoutubeDL
from youtube_dl.extractor.youtube import YoutubeError
import json
import html
import gevent
import os
import math
import traceback
def get_related_items(info):
    '''Convert the watch page's raw related_vids entries into the standard
    result dicts used by the templates, with urls prefixed and extra html
    info attached.'''
    results = []
    for item in info['related_vids']:
        # playlist entries are distinguished by the presence of a 'list' key
        if 'list' in item:
            converted = watch_page_related_playlist_info(item)
        else:
            converted = watch_page_related_video_info(item)
        yt_data_extract.prefix_urls(converted)
        yt_data_extract.add_extra_html_info(converted)
        results.append(converted)
    return results
# json of related items retrieved directly from the watch page has different names for everything
# converts these to standard names
def watch_page_related_video_info(item):
    '''Translate a related-video item from the watch page json (which uses
    different key names for everything) into the standard video info names
    used by the templates.'''
    result = {
        'id': item['id'],
        'title': item['title'],
        'author': item['author'],
        'duration': util.seconds_to_timestamp(item['length_seconds']),
        # the view count text is sometimes absent; fall back to blank
        'views': item.get('short_view_count_text', ''),
        'thumbnail': util.get_thumbnail_url(item['id']),
        'type': 'video',
    }
    return result
def watch_page_related_playlist_info(item):
    '''Translate a related-playlist item from the watch page json into the
    standard playlist info names used by the templates.'''
    # a reported length of "0" is displayed as "50+"
    size = item['playlist_length']
    if size == "0":
        size = "50+"
    return {
        'size': size,
        'title': item['playlist_title'],
        'id': item['list'],
        'first_video_id': item['video_id'],
        'thumbnail': util.get_thumbnail_url(item['video_id']),
        'type': 'playlist',
    }
def get_video_sources(info):
video_sources = []
@@ -55,9 +21,10 @@ def get_video_sources(info):
max_resolution = 360
else:
max_resolution = settings.default_resolution
for format in info['formats']:
if format['acodec'] != 'none' and format['vcodec'] != 'none' and format['height'] <= max_resolution:
if not all(attr in format for attr in ('height', 'width', 'ext', 'url')):
continue
if 'acodec' in format and 'vcodec' in format and format['height'] <= max_resolution:
video_sources.append({
'src': format['url'],
'type': 'video/' + format['ext'],
@@ -134,14 +101,57 @@ def get_ordered_music_list_attributes(music_list):
return ordered_attributes
headers = (
('Accept', '*/*'),
('Accept-Language', 'en-US,en;q=0.5'),
('X-YouTube-Client-Name', '2'),
('X-YouTube-Client-Version', '2.20180830'),
) + util.mobile_ua
def extract_info(downloader, *args, **kwargs):
def extract_info(video_id):
polymer_json = util.fetch_url('https://m.youtube.com/watch?v=' + video_id + '&pbj=1', headers=headers, debug_name='watch')
try:
return downloader.extract_info(*args, **kwargs)
except YoutubeError as e:
return str(e)
polymer_json = json.loads(polymer_json)
except json.decoder.JSONDecodeError:
traceback.print_exc()
return {'error': 'Failed to parse json response'}
return yt_data_extract.extract_watch_info(polymer_json)
def video_quality_string(format):
    '''Human-readable video quality for a format dict, e.g. "1280x720 60fps".

    Returns 'audio only' for audio-only formats and '?' when neither a
    video nor an audio codec is present. Missing width/height show as '?'.
    '''
    if 'vcodec' in format:
        result = str(format.get('width', '?')) + 'x' + str(format.get('height', '?'))
        if 'fps' in format:
            # fps comes from the itag table as an int; it must be converted
            # with str() here (bare concatenation raised TypeError)
            result += ' ' + str(format['fps']) + 'fps'
        return result
    elif 'acodec' in format:
        return 'audio only'

    return '?'
def audio_quality_string(format):
    '''Human-readable audio quality for a format dict, e.g. "128k 44100 Hz".

    Returns 'video only' for video-only formats and '?' when neither an
    audio nor a video codec is present.
    '''
    if 'acodec' not in format:
        return 'video only' if 'vcodec' in format else '?'
    parts = [str(format.get('abr', '?')) + 'k']
    if 'audio_sample_rate' in format:
        parts.append(str(format['audio_sample_rate']) + ' Hz')
    return ' '.join(parts)
def format_bytes(bytes):
    '''Format a byte count as a human-readable binary-prefixed string,
    e.g. 1536 -> "1.50KiB".

    Accepts an int, float, or numeric str; returns 'N/A' for None.
    (adapted from
    https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py)
    '''
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffix = ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB')[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
@yt_app.route('/watch')
@@ -152,38 +162,26 @@ def get_watch_page():
flask.abort(flask.Response('Incomplete video id (too short): ' + video_id))
lc = request.args.get('lc', '')
if settings.route_tor:
proxy = 'socks5://127.0.0.1:9150/'
else:
proxy = ''
yt_dl_downloader = YoutubeDL(params={'youtube_include_dash_manifest':False, 'proxy':proxy})
tasks = (
gevent.spawn(comments.video_comments, video_id, int(settings.default_comment_sorting), lc=lc ),
gevent.spawn(extract_info, yt_dl_downloader, "https://www.youtube.com/watch?v=" + video_id, download=False)
gevent.spawn(extract_info, video_id)
)
gevent.joinall(tasks)
comments_info, info = tasks[0].value, tasks[1].value
if isinstance(info, str): # youtube error
return flask.render_template('error.html', error_message = info)
if info['error']:
return flask.render_template('error.html', error_message = info['error'])
video_info = {
"duration": util.seconds_to_timestamp(info["duration"]),
"duration": util.seconds_to_timestamp(info["duration"] or 0),
"id": info['id'],
"title": info['title'],
"author": info['uploader'],
"author": info['author'],
}
upload_year = info["upload_date"][0:4]
upload_month = info["upload_date"][4:6]
upload_day = info["upload_date"][6:8]
upload_date = upload_month + "/" + upload_day + "/" + upload_year
if settings.related_videos_mode:
related_videos = get_related_items(info)
else:
related_videos = []
for item in info['related_videos']:
yt_data_extract.prefix_urls(item)
yt_data_extract.add_extra_html_info(item)
if settings.gather_googlevideo_domains:
with open(os.path.join(settings.data_dir, 'googlevideo-domains.txt'), 'a+', encoding='utf-8') as f:
@@ -195,23 +193,29 @@ def get_watch_page():
download_formats = []
for format in info['formats']:
if 'acodec' in format and 'vcodec' in format:
codecs_string = format['acodec'] + ', ' + format['vcodec']
else:
codecs_string = format.get('acodec') or format.get('vcodec') or '?'
download_formats.append({
'url': format['url'],
'ext': format['ext'],
'resolution': yt_dl_downloader.format_resolution(format),
'note': yt_dl_downloader._format_note(format),
'ext': format.get('ext', '?'),
'audio_quality': audio_quality_string(format),
'video_quality': video_quality_string(format),
'file_size': format_bytes(format['file_size']),
'codecs': codecs_string,
})
video_sources = get_video_sources(info)
video_height = video_sources[0]['height']
video_height = yt_data_extract.default_multi_get(video_sources, 0, 'height', default=360)
video_width = yt_data_extract.default_multi_get(video_sources, 0, 'width', default=640)
# 1 second per pixel, or the actual video width
theater_video_target_width = max(640, info['duration'], video_sources[0]['width'])
theater_video_target_width = max(640, info['duration'] or 0, video_width)
return flask.render_template('watch.html',
header_playlist_names = local_playlist.get_playlist_names(),
uploader_channel_url = '/' + info['uploader_url'],
upload_date = upload_date,
uploader_channel_url = ('/' + info['author_url']) if info['author_url'] else '',
upload_date = info['published_date'],
views = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("view_count", None)),
likes = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("like_count", None)),
dislikes = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("dislike_count", None)),
@@ -219,7 +223,7 @@ def get_watch_page():
video_info = json.dumps(video_info),
video_sources = video_sources,
subtitle_sources = get_subtitle_sources(info),
related = related_videos,
related = info['related_videos'],
music_list = info['music_list'],
music_attributes = get_ordered_music_list_attributes(info['music_list']),
comments_info = comments_info,
@@ -232,7 +236,7 @@ def get_watch_page():
theater_video_target_width = theater_video_target_width,
title = info['title'],
uploader = info['uploader'],
uploader = info['author'],
description = info['description'],
unlisted = info['unlisted'],
)

View File

@@ -6,6 +6,7 @@ import re
import urllib
import collections
from math import ceil
import traceback
# videos (all of type str):
@@ -36,8 +37,112 @@ from math import ceil
# size
# first_video_id
# from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py
_formats = {
'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
# itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
'36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
# 3D videos
'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
# Apple HTTP Live Streaming
'91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264'},
'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264'},
'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264'},
'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264'},
'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264'},
'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264'},
# DASH mp4 video
'133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
'134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
'138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
'212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
'298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
'299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
'266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
# Dash mp4 audio
'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
'256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
'258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
'325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
'328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
# Dash webm
'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
'168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
'218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
'219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
'278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
'308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
'313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
'315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
# Dash webm audio
'171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
'172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
# Dash webm audio with opus inside
'249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
'250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
'251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
# RTMP (unnamed)
'_rtmp': {'protocol': 'rtmp'},
# av01 video only formats sometimes served with "unknown" codecs
'394': {'vcodec': 'av01.0.05M.08'},
'395': {'vcodec': 'av01.0.05M.08'},
'396': {'vcodec': 'av01.0.05M.08'},
'397': {'vcodec': 'av01.0.05M.08'},
}
def get_plain_text(node):
@@ -59,7 +164,7 @@ def format_text_runs(runs):
result += html.escape(text_run["text"])
return result
def default_get(object, key, default, types=()):
def default_get(object, key, default=None, types=()):
'''Like dict.get(), but returns default if the result doesn't match one of the types.
Also works for indexing lists.'''
try:
@@ -74,7 +179,7 @@ def default_get(object, key, default, types=()):
def default_multi_get(object, *keys, default, types=()):
def default_multi_get(object, *keys, default=None, types=()):
'''Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices.
Last argument is the default value to use in case of any IndexErrors or KeyErrors.
If types is given and the result doesn't match one of those types, default is returned'''
@@ -106,6 +211,11 @@ def multi_default_multi_get(object, *key_sequences, default=None, types=()):
continue
return default
def remove_redirect(url):
    '''Strip youtube's click-tracking /redirect wrapper from an external link.

    If `url` is a youtube /redirect link, return the real destination taken
    from its `q` query parameter; otherwise return `url` unchanged.
    '''
    # youtube puts these on external links to do tracking.
    # Dots are escaped here; the original pattern used bare '.' which matched
    # any character (e.g. 'wwwXyoutubeYcom').
    if re.fullmatch(r'(((https?:)?//)?(www\.)?youtube\.com)?/redirect\?.*', url) is not None:
        query_string = url[url.find('?')+1:]
        return urllib.parse.parse_qs(query_string)['q'][0]
    return url
def get_url(node):
try:
@@ -239,9 +349,9 @@ def renderer_info(renderer, additional_info={}):
type = list(renderer.keys())[0]
renderer = renderer[type]
info = {}
if type == 'itemSectionRenderer':
if type in ('itemSectionRenderer', 'compactAutoplayRenderer'):
return renderer_info(renderer['contents'][0], additional_info)
if type in ('movieRenderer', 'clarificationRenderer'):
info['type'] = 'unsupported'
return info
@@ -345,6 +455,7 @@ item_types = {
'videoRenderer',
'compactVideoRenderer',
'compactAutoplayRenderer',
'gridVideoRenderer',
'playlistVideoRenderer',
@@ -378,6 +489,11 @@ def traverse_browse_renderer(renderer):
print('Could not find tab with content')
return {}
def traverse_standard_list(renderer):
    '''Return (renderer_list, continuation_token) for a renderer that holds
    its children under 'contents' or 'items'. The continuation token (for
    fetching the next page) may be None.'''
    token = default_multi_get(renderer, 'continuations', 0, 'nextContinuationData', 'continuation')
    children = multi_default_multi_get(renderer, ['contents'], ['items'], default=(), types=(list, tuple))
    return children, token
# these renderers contain one inside them
nested_renderer_dispatch = {
'singleColumnBrowseResultsRenderer': traverse_browse_renderer,
@@ -385,7 +501,16 @@ nested_renderer_dispatch = {
'twoColumnSearchResultsRenderer': lambda renderer: default_get(renderer, 'primaryContents', {}, types=dict),
}
def extract_items(response):
# these renderers contain a list of renderers in side them
nested_renderer_list_dispatch = {
'sectionListRenderer': traverse_standard_list,
'itemSectionRenderer': traverse_standard_list,
'gridRenderer': traverse_standard_list,
'playlistVideoListRenderer': traverse_standard_list,
'singleColumnWatchNextResults': lambda r: (default_multi_get(r, 'results', 'results', 'contents', default=[], types=(list, tuple)), None),
}
def extract_items(response, item_types=item_types):
'''return items, ctoken'''
if 'continuationContents' in response:
# always has just the one [something]Continuation key, but do this just in case they add some tracking key or something
@@ -414,13 +539,11 @@ def extract_items(response):
key, value = list(renderer.items())[0]
# has a list in it, add it to the iter stack
if key in list_types:
renderer_list = multi_default_multi_get(value, ['contents'], ['items'], default=(), types=(list, tuple))
if key in nested_renderer_list_dispatch:
renderer_list, continuation = nested_renderer_list_dispatch[key](value)
if renderer_list:
iter_stack.append(current_iter)
current_iter = iter(renderer_list)
continuation = default_multi_get(value, 'continuations', 0, 'nextContinuationData', 'continuation', default=None, types=str)
if continuation:
ctoken = continuation
@@ -506,10 +629,7 @@ def extract_channel_info(polymer_json, tab):
info['links'] = []
for link_json in channel_metadata.get('primaryLinks', ()):
url = link_json['navigationEndpoint']['urlEndpoint']['url']
if url.startswith('/redirect'): # youtube puts these on external links to do tracking
query_string = url[url.find('?')+1: ]
url = urllib.parse.parse_qs(query_string)['q'][0]
url = remove_redirect(link_json['navigationEndpoint']['urlEndpoint']['url'])
text = get_plain_text(link_json['title'])
@@ -699,5 +819,290 @@ def parse_comments_polymer(polymer_json):
'sort': metadata['sort'],
}
def check_missing_keys(object, *key_sequences):
    '''Verify that each key sequence can be fully traversed through `object`.

    Each key_sequence is a list of dict keys / list indices to follow in
    order. Returns None if every sequence resolves; otherwise an error
    string naming the first key that could not be found.
    '''
    for key_sequence in key_sequences:
        _object = object
        try:
            for key in key_sequence:
                # descend one level per key (the original indexed the
                # top-level `object` every iteration, so any sequence deeper
                # than one key falsely reported the second key missing)
                _object = _object[key]
        except (KeyError, IndexError, TypeError):
            return 'Could not find ' + str(key)
    return None
def extract_plain_text(node, default=None):
    '''Collapse a youtube text object into a plain string.

    Accepts a bare string, a {'simpleText': ...} node, or a
    {'runs': [{'text': ...}, ...]} node whose run texts are concatenated.
    Anything else yields `default`.
    '''
    if isinstance(node, str):
        return node
    for attempt in (
            lambda: node['simpleText'],
            lambda: ''.join(run['text'] for run in node['runs'])):
        try:
            return attempt()
        except (KeyError, TypeError):
            continue
    return default
def extract_formatted_text(node):
    '''Convert a youtube text object into a list of text runs.

    Each run is a dict with at least 'text'; runs that link somewhere also
    get a 'url' key with youtube's tracking redirect stripped. A
    {'simpleText': ...} node becomes a single run. Returns [] when the node
    has neither shape.

    Note: the run dicts are modified in place (url/text overwritten), so
    callers receive views of the same objects that were passed in.
    '''
    try:
        runs = node['runs']
        for run in runs:
            url = default_multi_get(run, 'navigationEndpoint', 'urlEndpoint', 'url')
            if url is not None:
                run['url'] = remove_redirect(url)
                # youtube truncates the url text, we don't want that nonsense
                run['text'] = run['url']
        return runs
    except (KeyError, TypeError):
        # node has no 'runs' (or isn't subscriptable): this is the expected
        # fallback path, so no traceback is printed (the original left a
        # debug traceback.print_exc() here, spamming the log for every
        # simpleText node)
        pass
    try:
        return [{'text': node['simpleText']}]
    except (KeyError, TypeError):
        pass
    return []
def extract_integer(string):
    '''Return the first run of digits in `string` as an int.

    Thousands-separator commas are stripped first, so '1,234 views' gives
    1234. Returns None for non-string input or when no digits are present.
    '''
    if not isinstance(string, str):
        return None
    digits = re.search(r'(\d+)', string.replace(',', ''))
    if digits is None:
        return None
    try:
        return int(digits.group(1))
    except ValueError:
        return None
def extract_metadata_row_info(video_renderer_info):
    '''Extract the category and attributed-music list from the metadata rows
    under a video's description.

    Returns a dict with keys:
        category   - plain-text category name, or None if absent
        music_list - list of dicts, one per song, each containing whichever
                     of title/artist/album/writers/licensor rows were present
    '''
    # extract category and music list
    info = {
        'category': None,
        'music_list': [],
    }

    current_song = {}
    for row in default_multi_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]):
        row_title = extract_plain_text(default_multi_get(row, 'metadataRowRenderer', 'title'), default='')
        row_content = extract_plain_text(default_multi_get(row, 'metadataRowRenderer', 'contents', 0))
        if row_title == 'Category':
            info['category'] = row_content
        elif row_title in ('Song', 'Music'):
            # a new Song/Music row starts the next song entry; flush the one
            # accumulated so far
            if current_song:
                info['music_list'].append(current_song)
            current_song = {'title': row_content}
        elif row_title == 'Artist':
            current_song['artist'] = row_content
        elif row_title == 'Album':
            current_song['album'] = row_content
        elif row_title == 'Writers':
            current_song['writers'] = row_content
        elif row_title.startswith('Licensed'):
            # row title is e.g. 'Licensed to YouTube by'
            current_song['licensor'] = row_content
    # flush the final song (the loop only flushes when a new one begins)
    if current_song:
        info['music_list'].append(current_song)
    return info
def extract_watch_info_mobile(top_level):
    '''Extract watch page info from the mobile (singleColumnWatchNextResults)
    page layout.

    Returns a dict with published_date, allowed_countries, like/dislike
    counts, comment count/disabled flags, category/music_list, and
    related_videos.
    '''
    info = {}
    microformat = default_multi_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={})
    info['allowed_countries'] = microformat.get('availableCountries', [])
    info['published_date'] = microformat.get('publishDate')

    response = top_level.get('response', {})

    # video info from metadata renderers
    items, _ = extract_items(response, item_types={'slimVideoMetadataRenderer'})
    if items:
        video_info = items[0]['slimVideoMetadataRenderer']
    else:
        print('Failed to extract video metadata')
        video_info = {}

    info.update(extract_metadata_row_info(video_info))
    #info['description'] = extract_formatted_text(video_info.get('description'))
    info['like_count'] = None
    info['dislike_count'] = None
    for button in video_info.get('buttons', ()):
        button_renderer = button.get('slimMetadataToggleButtonRenderer', {})
        # all the digits can be found in the accessibility data
        count = extract_integer(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label'))
        # this count doesn't have all the digits, it's like 53K for instance
        dumb_count = extract_integer(extract_plain_text(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText')))

        # the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0
        if dumb_count == 0:
            count = 0
        if 'isLike' in button_renderer:
            info['like_count'] = count
        elif 'isDislike' in button_renderer:
            info['dislike_count'] = count

    # comment section info
    items, _ = extract_items(response, item_types={'commentSectionRenderer'})
    if items:
        comment_info = items[0]['commentSectionRenderer']
        comment_count_text = extract_plain_text(default_multi_get(comment_info, 'header', 'commentSectionHeaderRenderer', 'countText'))
        if comment_count_text == 'Comments': # just this with no number, means 0 comments
            info['comment_count'] = 0
        else:
            info['comment_count'] = extract_integer(comment_count_text)
        info['comments_disabled'] = False
    else: # no comment section present means comments are disabled
        info['comment_count'] = 0
        info['comments_disabled'] = True

    # related videos
    related, _ = extract_items(response)
    info['related_videos'] = [renderer_info(renderer) for renderer in related]

    return info
month_abbreviations = {'jan':'1', 'feb':'2', 'mar':'3', 'apr':'4', 'may':'5', 'jun':'6', 'jul':'7', 'aug':'8', 'sep':'9', 'oct':'10', 'nov':'11', 'dec':'12'}
def extract_watch_info_desktop(top_level):
    '''Extract watch page info from the desktop (twoColumnWatchNextResults)
    page layout.

    Returns a dict with published_date, like/dislike counts,
    category/music_list, and related_videos. comment_count,
    comments_disabled and allowed_countries are not available on the
    desktop page, so they are set to None.
    '''
    info = {
        'comment_count': None,
        'comments_disabled': None,
        'allowed_countries': None,
    }

    # merge the primary and secondary info renderers into one dict
    video_info = {}
    for renderer in default_multi_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', default=()):
        if renderer and list(renderer.keys())[0] in ('videoPrimaryInfoRenderer', 'videoSecondaryInfoRenderer'):
            video_info.update(list(renderer.values())[0])

    info.update(extract_metadata_row_info(video_info))
    #info['description'] = extract_formatted_text(video_info.get('description', None))

    # e.g. 'Published on Oct 17, 2019' -> '2019-10-17'
    info['published_date'] = None
    date_text = extract_plain_text(video_info.get('dateText', None))
    if date_text is not None:
        date_text = util.left_remove(date_text.lower(), 'published on ').replace(',', '')
        parts = date_text.split()
        if len(parts) == 3:
            month, day, year = date_text.split()
            month = month_abbreviations.get(month[0:3]) # slicing in case they start writing out the full month name
            if month and (re.fullmatch(r'\d\d?', day) is not None) and (re.fullmatch(r'\d{4}', year) is not None):
                info['published_date'] = year + '-' + month + '-' + day

    # the sentiment bar tooltip is 'likes / dislikes'
    likes_dislikes = default_multi_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/')
    if len(likes_dislikes) == 2:
        info['like_count'] = extract_integer(likes_dislikes[0])
        info['dislike_count'] = extract_integer(likes_dislikes[1])
    else:
        info['like_count'] = None
        info['dislike_count'] = None

    #info['title'] = extract_plain_text(video_info.get('title', None))
    #info['author'] = extract_plain_text(default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'title'))
    #info['author_id'] = default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId')
    #info['view_count'] = extract_integer(extract_plain_text(default_multi_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount')))

    related = default_multi_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[])
    info['related_videos'] = [renderer_info(renderer) for renderer in related]

    return info
def extract_watch_info(polymer_json):
    '''Turn the raw polymer (pbj) json of a watch page into a standard info dict.

    polymer_json may be a single dict, or a list/tuple of dicts (the page
    can return a list of page parts) which are merged together.

    Returns a dict which always contains 'error' (None on success) and
    'playability_error' (None, or a message when the player data needed for
    stream urls is missing), plus formats, base_js, layout-specific info
    (mobile or desktop), and the videoDetails fields (title, duration,
    author, etc.).
    '''
    info = {'playability_error': None, 'error': None}

    # normalize the top level structure to a single dict
    if isinstance(polymer_json, dict):
        top_level = polymer_json
    elif isinstance(polymer_json, (list, tuple)):
        top_level = {}
        for page_part in polymer_json:
            if not isinstance(page_part, dict):
                return {'error': 'Invalid page part'}
            top_level.update(page_part)
    else:
        return {'error': 'Invalid top level polymer data'}

    error = check_missing_keys(top_level,
        ['playerResponse'],
    )
    if error:
        return {'error': error}

    # player/args is only needed for the stream formats; its absence is a
    # playability problem, not a fatal error
    error = check_missing_keys(top_level,
        ['player', 'args'],
        ['player', 'assets', 'js'],
    )
    if error:
        info['playability_error'] = error

    player_args = default_multi_get(top_level, 'player', 'args', default={})

    # formats arrive as comma-separated, url-encoded strings
    parsed_formats = []
    if 'url_encoded_fmt_stream_map' in player_args:
        string_formats = player_args['url_encoded_fmt_stream_map'].split(',')
        parsed_formats += [dict(urllib.parse.parse_qsl(fmt_string)) for fmt_string in string_formats if fmt_string]
    if 'adaptive_fmts' in player_args:
        string_formats = player_args['adaptive_fmts'].split(',')
        parsed_formats += [dict(urllib.parse.parse_qsl(fmt_string)) for fmt_string in string_formats if fmt_string]

    info['formats'] = []
    for parsed_fmt in parsed_formats:
        # start with defaults from the big itag table at the top of the file
        if 'itag' in parsed_fmt:
            fmt = _formats.get(parsed_fmt['itag'], {}).copy()
        else:
            fmt = {}
        # then override them with what the page reported
        fmt.update(parsed_fmt)
        try:
            fmt['width'], fmt['height'] = map(int, fmt['size'].split('x'))
        except (KeyError, ValueError, TypeError):
            pass

        fmt['file_size'] = None
        if 'clen' in fmt:
            fmt['file_size'] = int(fmt['clen'])
        else:
            # fall back to the clen embedded in the stream url; guard against
            # formats with no url at all (fmt.get('url') may be None, which
            # made re.search raise TypeError in the original)
            match = re.search(r'&clen=(\d+)', fmt.get('url') or '')
            if match:
                fmt['file_size'] = int(match.group(1))
        info['formats'].append(fmt)

    info['base_js'] = default_multi_get(top_level, 'player', 'assets', 'js')
    if info['base_js']:
        info['base_js'] = normalize_url(info['base_js'])

    # mobile and desktop watch pages use entirely different json structures
    mobile = 'singleColumnWatchNextResults' in default_multi_get(top_level, 'response', 'contents', default={})
    if mobile:
        info.update(extract_watch_info_mobile(top_level))
    else:
        info.update(extract_watch_info_desktop(top_level))

    # stuff from videoDetails
    video_details = default_multi_get(top_level, 'playerResponse', 'videoDetails', default={})
    info['title'] = extract_plain_text(video_details.get('title'))
    info['duration'] = extract_integer(video_details.get('lengthSeconds'))
    info['view_count'] = extract_integer(video_details.get('viewCount'))
    # videos with no description have a blank string
    info['description'] = video_details.get('shortDescription')
    info['id'] = video_details.get('videoId')
    info['author'] = video_details.get('author')
    info['author_id'] = video_details.get('channelId')
    info['live'] = video_details.get('isLiveContent')
    info['unlisted'] = not video_details.get('isCrawlable', True)
    info['tags'] = video_details.get('keywords', [])

    # other stuff
    info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
    info['subtitles'] = {}  # TODO
    return info