Extraction: rename multi_get functions to more descriptive names

This commit is contained in:
James Taylor 2019-12-18 19:43:55 -08:00
parent 98777ee825
commit f6bf5213a5
3 changed files with 68 additions and 68 deletions

View File

@ -104,7 +104,7 @@ def get_playlist_page():
if 'id' in item: if 'id' in item:
item['thumbnail'] = '/https://i.ytimg.com/vi/' + item['id'] + '/default.jpg' item['thumbnail'] = '/https://i.ytimg.com/vi/' + item['id'] + '/default.jpg'
video_count = yt_data_extract.default_multi_get(info, 'metadata', 'video_count') video_count = yt_data_extract.deep_get(info, 'metadata', 'video_count')
if video_count is None: if video_count is None:
video_count = 40 video_count = 40

View File

@ -185,7 +185,7 @@ def decrypt_signatures(info):
return False # No decryption needed return False # No decryption needed
if not info['base_js']: if not info['base_js']:
return 'Failed to find base.js' return 'Failed to find base.js'
player_name = yt_data_extract.default_get(info['base_js'].split('/'), -2) player_name = yt_data_extract.get(info['base_js'].split('/'), -2)
if not player_name: if not player_name:
return 'Could not find player name' return 'Could not find player name'
@ -204,7 +204,7 @@ def decrypt_signatures(info):
if not function_body: if not function_body:
return 'Empty decryption function body' return 'Empty decryption function body'
var_name = yt_data_extract.default_get(function_body[0].split('.'), 0) var_name = yt_data_extract.get(function_body[0].split('.'), 0)
if var_name is None: if var_name is None:
return 'Could not find var_name' return 'Could not find var_name'
@ -397,8 +397,8 @@ def get_watch_page():
}) })
video_sources = get_video_sources(info) video_sources = get_video_sources(info)
video_height = yt_data_extract.default_multi_get(video_sources, 0, 'height', default=360) video_height = yt_data_extract.deep_get(video_sources, 0, 'height', default=360)
video_width = yt_data_extract.default_multi_get(video_sources, 0, 'width', default=640) video_width = yt_data_extract.deep_get(video_sources, 0, 'width', default=640)
# 1 second per pixel, or the actual video width # 1 second per pixel, or the actual video width
theater_video_target_width = max(640, info['duration'] or 0, video_width) theater_video_target_width = max(640, info['duration'] or 0, video_width)

View File

@ -145,7 +145,7 @@ _formats = {
'397': {'vcodec': 'av01.0.05M.08'}, '397': {'vcodec': 'av01.0.05M.08'},
} }
def default_get(object, key, default=None, types=()): def get(object, key, default=None, types=()):
'''Like dict.get(), but returns default if the result doesn't match one of the types. '''Like dict.get(), but returns default if the result doesn't match one of the types.
Also works for indexing lists.''' Also works for indexing lists.'''
try: try:
@ -158,8 +158,8 @@ def default_get(object, key, default=None, types=()):
else: else:
return default return default
def multi_default_get(object, *keys, default=None, types=()): def multi_get(object, *keys, default=None, types=()):
'''Like default_get, but try other keys if the first fails''' '''Like get, but try other keys if the first fails'''
for key in keys: for key in keys:
try: try:
result = object[key] result = object[key]
@ -173,7 +173,7 @@ def multi_default_get(object, *keys, default=None, types=()):
return default return default
def default_multi_get(object, *keys, default=None, types=()): def deep_get(object, *keys, default=None, types=()):
'''Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. '''Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices.
Last argument is the default value to use in case of any IndexErrors or KeyErrors. Last argument is the default value to use in case of any IndexErrors or KeyErrors.
If types is given and the result doesn't match one of those types, default is returned''' If types is given and the result doesn't match one of those types, default is returned'''
@ -188,8 +188,8 @@ def default_multi_get(object, *keys, default=None, types=()):
else: else:
return default return default
def multi_default_multi_get(object, *key_sequences, default=None, types=()): def multi_deep_get(object, *key_sequences, default=None, types=()):
'''Like default_multi_get, but can try different key sequences in case one fails. '''Like deep_get, but can try different key sequences in case one fails.
Return default if all of them fail. key_sequences is a list of lists''' Return default if all of them fail. key_sequences is a list of lists'''
for key_sequence in key_sequences: for key_sequence in key_sequences:
_object = object _object = object
@ -224,7 +224,7 @@ def remove_redirect(url):
def _recover_urls(runs): def _recover_urls(runs):
for run in runs: for run in runs:
url = default_multi_get(run, 'navigationEndpoint', 'urlEndpoint', 'url') url = deep_get(run, 'navigationEndpoint', 'urlEndpoint', 'url')
text = run.get('text', '') text = run.get('text', '')
# second condition is necessary because youtube makes other things into urls, such as hashtags, which we want to keep as text # second condition is necessary because youtube makes other things into urls, such as hashtags, which we want to keep as text
if url is not None and (text.startswith('http://') or text.startswith('https://')): if url is not None and (text.startswith('http://') or text.startswith('https://')):
@ -328,14 +328,14 @@ def extract_item_info(item, additional_info={}):
if not item: if not item:
return {'error': 'No item given'} return {'error': 'No item given'}
type = default_get(list(item.keys()), 0) type = get(list(item.keys()), 0)
if not type: if not type:
return {'error': 'Could not find type'} return {'error': 'Could not find type'}
item = item[type] item = item[type]
info = {'error': None} info = {'error': None}
if type in ('itemSectionRenderer', 'compactAutoplayRenderer'): if type in ('itemSectionRenderer', 'compactAutoplayRenderer'):
return extract_item_info(default_multi_get(item, 'contents', 0), additional_info) return extract_item_info(deep_get(item, 'contents', 0), additional_info)
if type in ('movieRenderer', 'clarificationRenderer'): if type in ('movieRenderer', 'clarificationRenderer'):
info['type'] = 'unsupported' info['type'] = 'unsupported'
@ -360,23 +360,23 @@ def extract_item_info(item, additional_info={}):
info['type'] = 'unsupported' info['type'] = 'unsupported'
info['title'] = extract_str(item.get('title')) info['title'] = extract_str(item.get('title'))
info['author'] = extract_str(multi_default_get(item, 'longBylineText', 'shortBylineText', 'ownerText')) info['author'] = extract_str(multi_get(item, 'longBylineText', 'shortBylineText', 'ownerText'))
info['author_id'] = extract_str(multi_default_multi_get(item, info['author_id'] = extract_str(multi_deep_get(item,
['longBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'], ['longBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
['shortBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'], ['shortBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'] ['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId']
)) ))
info['author_url'] = ('https://www.youtube.com/channel/' + info['author_id']) if info['author_id'] else None info['author_url'] = ('https://www.youtube.com/channel/' + info['author_id']) if info['author_id'] else None
info['description'] = extract_formatted_text(multi_default_get(item, 'descriptionSnippet', 'descriptionText')) info['description'] = extract_formatted_text(multi_get(item, 'descriptionSnippet', 'descriptionText'))
info['thumbnail'] = multi_default_multi_get(item, info['thumbnail'] = multi_deep_get(item,
['thumbnail', 'thumbnails', 0, 'url'], # videos ['thumbnail', 'thumbnails', 0, 'url'], # videos
['thumbnails', 0, 'thumbnails', 0, 'url'], # playlists ['thumbnails', 0, 'thumbnails', 0, 'url'], # playlists
['thumbnailRenderer', 'showCustomThumbnailRenderer', 'thumbnail', 'thumbnails', 0, 'url'], # shows ['thumbnailRenderer', 'showCustomThumbnailRenderer', 'thumbnail', 'thumbnails', 0, 'url'], # shows
) )
info['badges'] = [] info['badges'] = []
for badge_node in multi_default_get(item, 'badges', 'ownerBadges', default=()): for badge_node in multi_get(item, 'badges', 'ownerBadges', default=()):
badge = default_multi_get(badge_node, 'metadataBadgeRenderer', 'label') badge = deep_get(badge_node, 'metadataBadgeRenderer', 'label')
if badge: if badge:
info['badges'].append(badge) info['badges'].append(badge)
@ -389,7 +389,7 @@ def extract_item_info(item, additional_info={}):
if info['view_count']: if info['view_count']:
info['approx_view_count'] = '{:,}'.format(info['view_count']) info['approx_view_count'] = '{:,}'.format(info['view_count'])
else: else:
info['approx_view_count'] = extract_approx_int(multi_default_get(item, 'shortViewCountText')) info['approx_view_count'] = extract_approx_int(multi_get(item, 'shortViewCountText'))
info['duration'] = extract_str(item.get('lengthText')) info['duration'] = extract_str(item.get('lengthText'))
elif primary_type == 'playlist': elif primary_type == 'playlist':
info['id'] = item.get('playlistId') info['id'] = item.get('playlistId')
@ -398,17 +398,17 @@ def extract_item_info(item, additional_info={}):
info['id'] = item.get('channelId') info['id'] = item.get('channelId')
info['approx_subscriber_count'] = extract_approx_int(item.get('subscriberCountText')) info['approx_subscriber_count'] = extract_approx_int(item.get('subscriberCountText'))
elif primary_type == 'show': elif primary_type == 'show':
info['id'] = default_multi_get(item, 'navigationEndpoint', 'watchEndpoint', 'playlistId') info['id'] = deep_get(item, 'navigationEndpoint', 'watchEndpoint', 'playlistId')
if primary_type in ('playlist', 'channel'): if primary_type in ('playlist', 'channel'):
conservative_update(info, 'video_count', extract_int(item.get('videoCountText'))) conservative_update(info, 'video_count', extract_int(item.get('videoCountText')))
for overlay in item.get('thumbnailOverlays', []): for overlay in item.get('thumbnailOverlays', []):
conservative_update(info, 'duration', extract_str(default_multi_get( conservative_update(info, 'duration', extract_str(deep_get(
overlay, 'thumbnailOverlayTimeStatusRenderer', 'text' overlay, 'thumbnailOverlayTimeStatusRenderer', 'text'
))) )))
# show renderers don't have videoCountText # show renderers don't have videoCountText
conservative_update(info, 'video_count', extract_int(default_multi_get( conservative_update(info, 'video_count', extract_int(deep_get(
overlay, 'thumbnailOverlayBottomPanelRenderer', 'text' overlay, 'thumbnailOverlayBottomPanelRenderer', 'text'
))) )))
return info return info
@ -422,7 +422,7 @@ def parse_info_prepare_for_html(renderer, additional_info={}):
def extract_response(polymer_json): def extract_response(polymer_json):
'''return response, error''' '''return response, error'''
response = multi_default_multi_get(polymer_json, [1, 'response'], ['response'], default=None, types=dict) response = multi_deep_get(polymer_json, [1, 'response'], ['response'], default=None, types=dict)
if response is None: if response is None:
return None, 'Failed to extract response' return None, 'Failed to extract response'
else: else:
@ -468,25 +468,25 @@ item_types = {
} }
def traverse_browse_renderer(renderer): def traverse_browse_renderer(renderer):
for tab in default_get(renderer, 'tabs', (), types=(list, tuple)): for tab in get(renderer, 'tabs', (), types=(list, tuple)):
tab_renderer = multi_default_multi_get(tab, ['tabRenderer'], ['expandableTabRenderer'], default=None, types=dict) tab_renderer = multi_deep_get(tab, ['tabRenderer'], ['expandableTabRenderer'], default=None, types=dict)
if tab_renderer is None: if tab_renderer is None:
continue continue
if tab_renderer.get('selected', False): if tab_renderer.get('selected', False):
return default_get(tab_renderer, 'content', {}, types=(dict)) return get(tab_renderer, 'content', {}, types=(dict))
print('Could not find tab with content') print('Could not find tab with content')
return {} return {}
def traverse_standard_list(renderer): def traverse_standard_list(renderer):
renderer_list = multi_default_multi_get(renderer, ['contents'], ['items'], default=(), types=(list, tuple)) renderer_list = multi_deep_get(renderer, ['contents'], ['items'], default=(), types=(list, tuple))
continuation = default_multi_get(renderer, 'continuations', 0, 'nextContinuationData', 'continuation') continuation = deep_get(renderer, 'continuations', 0, 'nextContinuationData', 'continuation')
return renderer_list, continuation return renderer_list, continuation
# these renderers contain one inside them # these renderers contain one inside them
nested_renderer_dispatch = { nested_renderer_dispatch = {
'singleColumnBrowseResultsRenderer': traverse_browse_renderer, 'singleColumnBrowseResultsRenderer': traverse_browse_renderer,
'twoColumnBrowseResultsRenderer': traverse_browse_renderer, 'twoColumnBrowseResultsRenderer': traverse_browse_renderer,
'twoColumnSearchResultsRenderer': lambda renderer: default_get(renderer, 'primaryContents', {}, types=dict), 'twoColumnSearchResultsRenderer': lambda renderer: get(renderer, 'primaryContents', {}, types=dict),
} }
# these renderers contain a list of renderers inside them # these renderers contain a list of renderers inside them
@ -495,17 +495,17 @@ nested_renderer_list_dispatch = {
'itemSectionRenderer': traverse_standard_list, 'itemSectionRenderer': traverse_standard_list,
'gridRenderer': traverse_standard_list, 'gridRenderer': traverse_standard_list,
'playlistVideoListRenderer': traverse_standard_list, 'playlistVideoListRenderer': traverse_standard_list,
'singleColumnWatchNextResults': lambda r: (default_multi_get(r, 'results', 'results', 'contents', default=[], types=(list, tuple)), None), 'singleColumnWatchNextResults': lambda r: (deep_get(r, 'results', 'results', 'contents', default=[], types=(list, tuple)), None),
} }
def extract_items(response, item_types=item_types): def extract_items(response, item_types=item_types):
'''return items, ctoken''' '''return items, ctoken'''
if 'continuationContents' in response: if 'continuationContents' in response:
# always has just the one [something]Continuation key, but do this just in case they add some tracking key or something # always has just the one [something]Continuation key, but do this just in case they add some tracking key or something
for key, renderer_continuation in default_get(response, 'continuationContents', {}, types=dict).items(): for key, renderer_continuation in get(response, 'continuationContents', {}, types=dict).items():
if key.endswith('Continuation'): # e.g. commentSectionContinuation, playlistVideoListContinuation if key.endswith('Continuation'): # e.g. commentSectionContinuation, playlistVideoListContinuation
items = multi_default_multi_get(renderer_continuation, ['contents'], ['items'], default=None, types=(list, tuple)) items = multi_deep_get(renderer_continuation, ['contents'], ['items'], default=None, types=(list, tuple))
ctoken = default_multi_get(renderer_continuation, 'continuations', 0, 'nextContinuationData', 'continuation', default=None, types=str) ctoken = deep_get(renderer_continuation, 'continuations', 0, 'nextContinuationData', 'continuation', default=None, types=str)
return items, ctoken return items, ctoken
return [], None return [], None
elif 'contents' in response: elif 'contents' in response:
@ -515,7 +515,7 @@ def extract_items(response, item_types=item_types):
iter_stack = collections.deque() iter_stack = collections.deque()
current_iter = iter(()) current_iter = iter(())
renderer = default_get(response, 'contents', {}, types=dict) renderer = get(response, 'contents', {}, types=dict)
while True: while True:
# mode 1: dig into the current renderer # mode 1: dig into the current renderer
@ -692,11 +692,11 @@ def extract_playlist_metadata(polymer_json):
return {'error': err} return {'error': err}
metadata = {'error': None} metadata = {'error': None}
header = default_multi_get(response, 'header', 'playlistHeaderRenderer', default={}) header = deep_get(response, 'header', 'playlistHeaderRenderer', default={})
metadata['title'] = extract_str(header.get('title')) metadata['title'] = extract_str(header.get('title'))
metadata['first_video_id'] = default_multi_get(header, 'playEndpoint', 'watchEndpoint', 'videoId') metadata['first_video_id'] = deep_get(header, 'playEndpoint', 'watchEndpoint', 'videoId')
first_id = re.search(r'([a-z_\-]{11})', default_multi_get(header, first_id = re.search(r'([a-z_\-]{11})', deep_get(header,
'thumbnail', 'thumbnails', 0, 'url', default='')) 'thumbnail', 'thumbnails', 0, 'url', default=''))
if first_id: if first_id:
conservative_update(metadata, 'first_video_id', first_id.group(1)) conservative_update(metadata, 'first_video_id', first_id.group(1))
@ -708,7 +708,7 @@ def extract_playlist_metadata(polymer_json):
metadata['video_count'] = extract_int(header.get('numVideosText')) metadata['video_count'] = extract_int(header.get('numVideosText'))
metadata['description'] = extract_str(header.get('descriptionText'), default='') metadata['description'] = extract_str(header.get('descriptionText'), default='')
metadata['author'] = extract_str(header.get('ownerText')) metadata['author'] = extract_str(header.get('ownerText'))
metadata['author_id'] = multi_default_multi_get(header, metadata['author_id'] = multi_deep_get(header,
['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'], ['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
['ownerEndpoint', 'browseEndpoint', 'browseId']) ['ownerEndpoint', 'browseEndpoint', 'browseId'])
if metadata['author_id']: if metadata['author_id']:
@ -854,9 +854,9 @@ def extract_metadata_row_info(video_renderer_info):
} }
current_song = {} current_song = {}
for row in default_multi_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]): for row in deep_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]):
row_title = extract_str(default_multi_get(row, 'metadataRowRenderer', 'title'), default='') row_title = extract_str(deep_get(row, 'metadataRowRenderer', 'title'), default='')
row_content = extract_str(default_multi_get(row, 'metadataRowRenderer', 'contents', 0)) row_content = extract_str(deep_get(row, 'metadataRowRenderer', 'contents', 0))
if row_title == 'Category': if row_title == 'Category':
info['category'] = row_content info['category'] = row_content
elif row_title in ('Song', 'Music'): elif row_title in ('Song', 'Music'):
@ -890,7 +890,7 @@ def extract_date(date_text):
def extract_watch_info_mobile(top_level): def extract_watch_info_mobile(top_level):
info = {} info = {}
microformat = default_multi_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={}) microformat = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={})
family_safe = microformat.get('isFamilySafe') family_safe = microformat.get('isFamilySafe')
if family_safe is None: if family_safe is None:
@ -913,13 +913,13 @@ def extract_watch_info_mobile(top_level):
info.update(extract_metadata_row_info(video_info)) info.update(extract_metadata_row_info(video_info))
info['description'] = extract_str(video_info.get('description'), recover_urls=True) info['description'] = extract_str(video_info.get('description'), recover_urls=True)
info['view_count'] = extract_int(extract_str(video_info.get('expandedSubtitle'))) info['view_count'] = extract_int(extract_str(video_info.get('expandedSubtitle')))
info['author'] = extract_str(default_multi_get(video_info, 'owner', 'slimOwnerRenderer', 'title')) info['author'] = extract_str(deep_get(video_info, 'owner', 'slimOwnerRenderer', 'title'))
info['author_id'] = default_multi_get(video_info, 'owner', 'slimOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') info['author_id'] = deep_get(video_info, 'owner', 'slimOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId')
info['title'] = extract_str(video_info.get('title')) info['title'] = extract_str(video_info.get('title'))
info['live'] = 'watching' in extract_str(video_info.get('expandedSubtitle'), default='') info['live'] = 'watching' in extract_str(video_info.get('expandedSubtitle'), default='')
info['unlisted'] = False info['unlisted'] = False
for badge in video_info.get('badges', []): for badge in video_info.get('badges', []):
if default_multi_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted': if deep_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted':
info['unlisted'] = True info['unlisted'] = True
info['like_count'] = None info['like_count'] = None
info['dislike_count'] = None info['dislike_count'] = None
@ -929,10 +929,10 @@ def extract_watch_info_mobile(top_level):
button_renderer = button.get('slimMetadataToggleButtonRenderer', {}) button_renderer = button.get('slimMetadataToggleButtonRenderer', {})
# all the digits can be found in the accessibility data # all the digits can be found in the accessibility data
count = extract_int(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label')) count = extract_int(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label'))
# this count doesn't have all the digits, it's like 53K for instance # this count doesn't have all the digits, it's like 53K for instance
dumb_count = extract_int(extract_str(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText'))) dumb_count = extract_int(extract_str(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText')))
# the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0 # the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0
if dumb_count == 0: if dumb_count == 0:
@ -947,7 +947,7 @@ def extract_watch_info_mobile(top_level):
items, _ = extract_items(response, item_types={'commentSectionRenderer'}) items, _ = extract_items(response, item_types={'commentSectionRenderer'})
if items: if items:
comment_info = items[0]['commentSectionRenderer'] comment_info = items[0]['commentSectionRenderer']
comment_count_text = extract_str(default_multi_get(comment_info, 'header', 'commentSectionHeaderRenderer', 'countText')) comment_count_text = extract_str(deep_get(comment_info, 'header', 'commentSectionHeaderRenderer', 'countText'))
if comment_count_text == 'Comments': # just this with no number, means 0 comments if comment_count_text == 'Comments': # just this with no number, means 0 comments
info['comment_count'] = 0 info['comment_count'] = 0
else: else:
@ -980,7 +980,7 @@ def extract_watch_info_desktop(top_level):
} }
video_info = {} video_info = {}
for renderer in default_multi_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', default=()): for renderer in deep_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', default=()):
if renderer and list(renderer.keys())[0] in ('videoPrimaryInfoRenderer', 'videoSecondaryInfoRenderer'): if renderer and list(renderer.keys())[0] in ('videoPrimaryInfoRenderer', 'videoSecondaryInfoRenderer'):
video_info.update(list(renderer.values())[0]) video_info.update(list(renderer.values())[0])
@ -988,7 +988,7 @@ def extract_watch_info_desktop(top_level):
info['description'] = extract_str(video_info.get('description', None), recover_urls=True) info['description'] = extract_str(video_info.get('description', None), recover_urls=True)
info['time_published'] = extract_date(extract_str(video_info.get('dateText', None))) info['time_published'] = extract_date(extract_str(video_info.get('dateText', None)))
likes_dislikes = default_multi_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/') likes_dislikes = deep_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/')
if len(likes_dislikes) == 2: if len(likes_dislikes) == 2:
info['like_count'] = extract_int(likes_dislikes[0]) info['like_count'] = extract_int(likes_dislikes[0])
info['dislike_count'] = extract_int(likes_dislikes[1]) info['dislike_count'] = extract_int(likes_dislikes[1])
@ -997,11 +997,11 @@ def extract_watch_info_desktop(top_level):
info['dislike_count'] = None info['dislike_count'] = None
info['title'] = extract_str(video_info.get('title', None)) info['title'] = extract_str(video_info.get('title', None))
info['author'] = extract_str(default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'title')) info['author'] = extract_str(deep_get(video_info, 'owner', 'videoOwnerRenderer', 'title'))
info['author_id'] = default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') info['author_id'] = deep_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId')
info['view_count'] = extract_int(extract_str(default_multi_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount'))) info['view_count'] = extract_int(extract_str(deep_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount')))
related = default_multi_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[]) related = deep_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[])
info['related_videos'] = [extract_item_info(renderer) for renderer in related] info['related_videos'] = [extract_item_info(renderer) for renderer in related]
return info return info
@ -1054,10 +1054,10 @@ def extract_playability_error(info, player_response, error_prefix=''):
info['playability_error'] = None info['playability_error'] = None
return return
playability_status = default_multi_get(player_response, 'playabilityStatus', 'status', default=None) playability_status = deep_get(player_response, 'playabilityStatus', 'status', default=None)
info['playability_status'] = playability_status info['playability_status'] = playability_status
playability_reason = extract_str(multi_default_multi_get(player_response, playability_reason = extract_str(multi_deep_get(player_response,
['playabilityStatus', 'reason'], ['playabilityStatus', 'reason'],
['playabilityStatus', 'errorScreen', 'playerErrorMessageRenderer', 'reason'], ['playabilityStatus', 'errorScreen', 'playerErrorMessageRenderer', 'reason'],
default='Could not find playability error') default='Could not find playability error')
@ -1091,7 +1091,7 @@ def extract_watch_info(polymer_json):
if error: if error:
info['playability_error'] = error info['playability_error'] = error
player_args = default_multi_get(top_level, 'player', 'args', default={}) player_args = deep_get(top_level, 'player', 'args', default={})
player_response = json.loads(player_args['player_response']) if 'player_response' in player_args else {} player_response = json.loads(player_args['player_response']) if 'player_response' in player_args else {}
# captions # captions
@ -1100,8 +1100,8 @@ def extract_watch_info(polymer_json):
info['_manual_caption_language_names'] = {} # language name written in that language, needed in some cases to create the url info['_manual_caption_language_names'] = {} # language name written in that language, needed in some cases to create the url
info['translation_languages'] = [] info['translation_languages'] = []
captions_info = player_response.get('captions', {}) captions_info = player_response.get('captions', {})
info['_captions_base_url'] = normalize_url(default_multi_get(captions_info, 'playerCaptionsRenderer', 'baseUrl')) info['_captions_base_url'] = normalize_url(deep_get(captions_info, 'playerCaptionsRenderer', 'baseUrl'))
for caption_track in default_multi_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()): for caption_track in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()):
lang_code = caption_track.get('languageCode') lang_code = caption_track.get('languageCode')
if not lang_code: if not lang_code:
continue continue
@ -1110,11 +1110,11 @@ def extract_watch_info(polymer_json):
else: else:
info['manual_caption_languages'].append(lang_code) info['manual_caption_languages'].append(lang_code)
base_url = caption_track.get('baseUrl', '') base_url = caption_track.get('baseUrl', '')
lang_name = default_multi_get(urllib.parse.parse_qs(urllib.parse.urlparse(base_url).query), 'name', 0) lang_name = deep_get(urllib.parse.parse_qs(urllib.parse.urlparse(base_url).query), 'name', 0)
if lang_name: if lang_name:
info['_manual_caption_language_names'][lang_code] = lang_name info['_manual_caption_language_names'][lang_code] = lang_name
for translation_lang_info in default_multi_get(captions_info, 'playerCaptionsTracklistRenderer', 'translationLanguages', default=()): for translation_lang_info in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'translationLanguages', default=()):
lang_code = translation_lang_info.get('languageCode') lang_code = translation_lang_info.get('languageCode')
if lang_code: if lang_code:
info['translation_languages'].append(lang_code) info['translation_languages'].append(lang_code)
@ -1131,18 +1131,18 @@ def extract_watch_info(polymer_json):
info['age_restricted'] = (info['playability_status'] == 'LOGIN_REQUIRED' and info['playability_error'] and ' age' in info['playability_error']) info['age_restricted'] = (info['playability_status'] == 'LOGIN_REQUIRED' and info['playability_error'] and ' age' in info['playability_error'])
# base_js (for decryption of signatures) # base_js (for decryption of signatures)
info['base_js'] = default_multi_get(top_level, 'player', 'assets', 'js') info['base_js'] = deep_get(top_level, 'player', 'assets', 'js')
if info['base_js']: if info['base_js']:
info['base_js'] = normalize_url(info['base_js']) info['base_js'] = normalize_url(info['base_js'])
mobile = 'singleColumnWatchNextResults' in default_multi_get(top_level, 'response', 'contents', default={}) mobile = 'singleColumnWatchNextResults' in deep_get(top_level, 'response', 'contents', default={})
if mobile: if mobile:
info.update(extract_watch_info_mobile(top_level)) info.update(extract_watch_info_mobile(top_level))
else: else:
info.update(extract_watch_info_desktop(top_level)) info.update(extract_watch_info_desktop(top_level))
# stuff from videoDetails. Use liberal_update to prioritize info from videoDetails over existing info # stuff from videoDetails. Use liberal_update to prioritize info from videoDetails over existing info
vd = default_multi_get(top_level, 'playerResponse', 'videoDetails', default={}) vd = deep_get(top_level, 'playerResponse', 'videoDetails', default={})
liberal_update(info, 'title', extract_str(vd.get('title'))) liberal_update(info, 'title', extract_str(vd.get('title')))
liberal_update(info, 'duration', extract_int(vd.get('lengthSeconds'))) liberal_update(info, 'duration', extract_int(vd.get('lengthSeconds')))
liberal_update(info, 'view_count', extract_int(vd.get('viewCount'))) liberal_update(info, 'view_count', extract_int(vd.get('viewCount')))
@ -1156,7 +1156,7 @@ def extract_watch_info(polymer_json):
liberal_update(info, 'tags', vd.get('keywords', [])) liberal_update(info, 'tags', vd.get('keywords', []))
# fallback stuff from microformat # fallback stuff from microformat
mf = default_multi_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={}) mf = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={})
conservative_update(info, 'title', extract_str(mf.get('title'))) conservative_update(info, 'title', extract_str(mf.get('title')))
conservative_update(info, 'duration', extract_int(mf.get('lengthSeconds'))) conservative_update(info, 'duration', extract_int(mf.get('lengthSeconds')))
# this gives the view count for limited state videos # this gives the view count for limited state videos
@ -1177,7 +1177,7 @@ def update_with_age_restricted_info(info, video_info_page):
ERROR_PREFIX = 'Error bypassing age-restriction: ' ERROR_PREFIX = 'Error bypassing age-restriction: '
video_info = urllib.parse.parse_qs(video_info_page) video_info = urllib.parse.parse_qs(video_info_page)
player_response = default_multi_get(video_info, 'player_response', 0) player_response = deep_get(video_info, 'player_response', 0)
if player_response is None: if player_response is None:
info['playability_error'] = ERROR_PREFIX + 'Could not find player_response in video_info_page' info['playability_error'] = ERROR_PREFIX + 'Could not find player_response in video_info_page'
return return