Extraction: rename multi_get functions to more descriptive names
This commit is contained in:
parent
98777ee825
commit
f6bf5213a5
@ -104,7 +104,7 @@ def get_playlist_page():
|
|||||||
if 'id' in item:
|
if 'id' in item:
|
||||||
item['thumbnail'] = '/https://i.ytimg.com/vi/' + item['id'] + '/default.jpg'
|
item['thumbnail'] = '/https://i.ytimg.com/vi/' + item['id'] + '/default.jpg'
|
||||||
|
|
||||||
video_count = yt_data_extract.default_multi_get(info, 'metadata', 'video_count')
|
video_count = yt_data_extract.deep_get(info, 'metadata', 'video_count')
|
||||||
if video_count is None:
|
if video_count is None:
|
||||||
video_count = 40
|
video_count = 40
|
||||||
|
|
||||||
|
@ -185,7 +185,7 @@ def decrypt_signatures(info):
|
|||||||
return False # No decryption needed
|
return False # No decryption needed
|
||||||
if not info['base_js']:
|
if not info['base_js']:
|
||||||
return 'Failed to find base.js'
|
return 'Failed to find base.js'
|
||||||
player_name = yt_data_extract.default_get(info['base_js'].split('/'), -2)
|
player_name = yt_data_extract.get(info['base_js'].split('/'), -2)
|
||||||
if not player_name:
|
if not player_name:
|
||||||
return 'Could not find player name'
|
return 'Could not find player name'
|
||||||
|
|
||||||
@ -204,7 +204,7 @@ def decrypt_signatures(info):
|
|||||||
if not function_body:
|
if not function_body:
|
||||||
return 'Empty decryption function body'
|
return 'Empty decryption function body'
|
||||||
|
|
||||||
var_name = yt_data_extract.default_get(function_body[0].split('.'), 0)
|
var_name = yt_data_extract.get(function_body[0].split('.'), 0)
|
||||||
if var_name is None:
|
if var_name is None:
|
||||||
return 'Could not find var_name'
|
return 'Could not find var_name'
|
||||||
|
|
||||||
@ -397,8 +397,8 @@ def get_watch_page():
|
|||||||
})
|
})
|
||||||
|
|
||||||
video_sources = get_video_sources(info)
|
video_sources = get_video_sources(info)
|
||||||
video_height = yt_data_extract.default_multi_get(video_sources, 0, 'height', default=360)
|
video_height = yt_data_extract.deep_get(video_sources, 0, 'height', default=360)
|
||||||
video_width = yt_data_extract.default_multi_get(video_sources, 0, 'width', default=640)
|
video_width = yt_data_extract.deep_get(video_sources, 0, 'width', default=640)
|
||||||
# 1 second per pixel, or the actual video width
|
# 1 second per pixel, or the actual video width
|
||||||
theater_video_target_width = max(640, info['duration'] or 0, video_width)
|
theater_video_target_width = max(640, info['duration'] or 0, video_width)
|
||||||
|
|
||||||
|
@ -145,7 +145,7 @@ _formats = {
|
|||||||
'397': {'vcodec': 'av01.0.05M.08'},
|
'397': {'vcodec': 'av01.0.05M.08'},
|
||||||
}
|
}
|
||||||
|
|
||||||
def default_get(object, key, default=None, types=()):
|
def get(object, key, default=None, types=()):
|
||||||
'''Like dict.get(), but returns default if the result doesn't match one of the types.
|
'''Like dict.get(), but returns default if the result doesn't match one of the types.
|
||||||
Also works for indexing lists.'''
|
Also works for indexing lists.'''
|
||||||
try:
|
try:
|
||||||
@ -158,8 +158,8 @@ def default_get(object, key, default=None, types=()):
|
|||||||
else:
|
else:
|
||||||
return default
|
return default
|
||||||
|
|
||||||
def multi_default_get(object, *keys, default=None, types=()):
|
def multi_get(object, *keys, default=None, types=()):
|
||||||
'''Like default_get, but try other keys if the first fails'''
|
'''Like get, but try other keys if the first fails'''
|
||||||
for key in keys:
|
for key in keys:
|
||||||
try:
|
try:
|
||||||
result = object[key]
|
result = object[key]
|
||||||
@ -173,7 +173,7 @@ def multi_default_get(object, *keys, default=None, types=()):
|
|||||||
return default
|
return default
|
||||||
|
|
||||||
|
|
||||||
def default_multi_get(object, *keys, default=None, types=()):
|
def deep_get(object, *keys, default=None, types=()):
|
||||||
'''Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices.
|
'''Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices.
|
||||||
The default keyword argument is the value returned in case of any IndexErrors or KeyErrors.
|
The default keyword argument is the value returned in case of any IndexErrors or KeyErrors.
|
||||||
If types is given and the result doesn't match one of those types, default is returned'''
|
If types is given and the result doesn't match one of those types, default is returned'''
|
||||||
@ -188,8 +188,8 @@ def default_multi_get(object, *keys, default=None, types=()):
|
|||||||
else:
|
else:
|
||||||
return default
|
return default
|
||||||
|
|
||||||
def multi_default_multi_get(object, *key_sequences, default=None, types=()):
|
def multi_deep_get(object, *key_sequences, default=None, types=()):
|
||||||
'''Like default_multi_get, but can try different key sequences in case one fails.
|
'''Like deep_get, but can try different key sequences in case one fails.
|
||||||
Return default if all of them fail. key_sequences is a list of lists'''
|
Return default if all of them fail. key_sequences is a list of lists'''
|
||||||
for key_sequence in key_sequences:
|
for key_sequence in key_sequences:
|
||||||
_object = object
|
_object = object
|
||||||
@ -224,7 +224,7 @@ def remove_redirect(url):
|
|||||||
|
|
||||||
def _recover_urls(runs):
|
def _recover_urls(runs):
|
||||||
for run in runs:
|
for run in runs:
|
||||||
url = default_multi_get(run, 'navigationEndpoint', 'urlEndpoint', 'url')
|
url = deep_get(run, 'navigationEndpoint', 'urlEndpoint', 'url')
|
||||||
text = run.get('text', '')
|
text = run.get('text', '')
|
||||||
# second condition is necessary because youtube makes other things into urls, such as hashtags, which we want to keep as text
|
# second condition is necessary because youtube makes other things into urls, such as hashtags, which we want to keep as text
|
||||||
if url is not None and (text.startswith('http://') or text.startswith('https://')):
|
if url is not None and (text.startswith('http://') or text.startswith('https://')):
|
||||||
@ -328,14 +328,14 @@ def extract_item_info(item, additional_info={}):
|
|||||||
if not item:
|
if not item:
|
||||||
return {'error': 'No item given'}
|
return {'error': 'No item given'}
|
||||||
|
|
||||||
type = default_get(list(item.keys()), 0)
|
type = get(list(item.keys()), 0)
|
||||||
if not type:
|
if not type:
|
||||||
return {'error': 'Could not find type'}
|
return {'error': 'Could not find type'}
|
||||||
item = item[type]
|
item = item[type]
|
||||||
|
|
||||||
info = {'error': None}
|
info = {'error': None}
|
||||||
if type in ('itemSectionRenderer', 'compactAutoplayRenderer'):
|
if type in ('itemSectionRenderer', 'compactAutoplayRenderer'):
|
||||||
return extract_item_info(default_multi_get(item, 'contents', 0), additional_info)
|
return extract_item_info(deep_get(item, 'contents', 0), additional_info)
|
||||||
|
|
||||||
if type in ('movieRenderer', 'clarificationRenderer'):
|
if type in ('movieRenderer', 'clarificationRenderer'):
|
||||||
info['type'] = 'unsupported'
|
info['type'] = 'unsupported'
|
||||||
@ -360,23 +360,23 @@ def extract_item_info(item, additional_info={}):
|
|||||||
info['type'] = 'unsupported'
|
info['type'] = 'unsupported'
|
||||||
|
|
||||||
info['title'] = extract_str(item.get('title'))
|
info['title'] = extract_str(item.get('title'))
|
||||||
info['author'] = extract_str(multi_default_get(item, 'longBylineText', 'shortBylineText', 'ownerText'))
|
info['author'] = extract_str(multi_get(item, 'longBylineText', 'shortBylineText', 'ownerText'))
|
||||||
info['author_id'] = extract_str(multi_default_multi_get(item,
|
info['author_id'] = extract_str(multi_deep_get(item,
|
||||||
['longBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
|
['longBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
|
||||||
['shortBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
|
['shortBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
|
||||||
['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId']
|
['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId']
|
||||||
))
|
))
|
||||||
info['author_url'] = ('https://www.youtube.com/channel/' + info['author_id']) if info['author_id'] else None
|
info['author_url'] = ('https://www.youtube.com/channel/' + info['author_id']) if info['author_id'] else None
|
||||||
info['description'] = extract_formatted_text(multi_default_get(item, 'descriptionSnippet', 'descriptionText'))
|
info['description'] = extract_formatted_text(multi_get(item, 'descriptionSnippet', 'descriptionText'))
|
||||||
info['thumbnail'] = multi_default_multi_get(item,
|
info['thumbnail'] = multi_deep_get(item,
|
||||||
['thumbnail', 'thumbnails', 0, 'url'], # videos
|
['thumbnail', 'thumbnails', 0, 'url'], # videos
|
||||||
['thumbnails', 0, 'thumbnails', 0, 'url'], # playlists
|
['thumbnails', 0, 'thumbnails', 0, 'url'], # playlists
|
||||||
['thumbnailRenderer', 'showCustomThumbnailRenderer', 'thumbnail', 'thumbnails', 0, 'url'], # shows
|
['thumbnailRenderer', 'showCustomThumbnailRenderer', 'thumbnail', 'thumbnails', 0, 'url'], # shows
|
||||||
)
|
)
|
||||||
|
|
||||||
info['badges'] = []
|
info['badges'] = []
|
||||||
for badge_node in multi_default_get(item, 'badges', 'ownerBadges', default=()):
|
for badge_node in multi_get(item, 'badges', 'ownerBadges', default=()):
|
||||||
badge = default_multi_get(badge_node, 'metadataBadgeRenderer', 'label')
|
badge = deep_get(badge_node, 'metadataBadgeRenderer', 'label')
|
||||||
if badge:
|
if badge:
|
||||||
info['badges'].append(badge)
|
info['badges'].append(badge)
|
||||||
|
|
||||||
@ -389,7 +389,7 @@ def extract_item_info(item, additional_info={}):
|
|||||||
if info['view_count']:
|
if info['view_count']:
|
||||||
info['approx_view_count'] = '{:,}'.format(info['view_count'])
|
info['approx_view_count'] = '{:,}'.format(info['view_count'])
|
||||||
else:
|
else:
|
||||||
info['approx_view_count'] = extract_approx_int(multi_default_get(item, 'shortViewCountText'))
|
info['approx_view_count'] = extract_approx_int(multi_get(item, 'shortViewCountText'))
|
||||||
info['duration'] = extract_str(item.get('lengthText'))
|
info['duration'] = extract_str(item.get('lengthText'))
|
||||||
elif primary_type == 'playlist':
|
elif primary_type == 'playlist':
|
||||||
info['id'] = item.get('playlistId')
|
info['id'] = item.get('playlistId')
|
||||||
@ -398,17 +398,17 @@ def extract_item_info(item, additional_info={}):
|
|||||||
info['id'] = item.get('channelId')
|
info['id'] = item.get('channelId')
|
||||||
info['approx_subscriber_count'] = extract_approx_int(item.get('subscriberCountText'))
|
info['approx_subscriber_count'] = extract_approx_int(item.get('subscriberCountText'))
|
||||||
elif primary_type == 'show':
|
elif primary_type == 'show':
|
||||||
info['id'] = default_multi_get(item, 'navigationEndpoint', 'watchEndpoint', 'playlistId')
|
info['id'] = deep_get(item, 'navigationEndpoint', 'watchEndpoint', 'playlistId')
|
||||||
|
|
||||||
if primary_type in ('playlist', 'channel'):
|
if primary_type in ('playlist', 'channel'):
|
||||||
conservative_update(info, 'video_count', extract_int(item.get('videoCountText')))
|
conservative_update(info, 'video_count', extract_int(item.get('videoCountText')))
|
||||||
|
|
||||||
for overlay in item.get('thumbnailOverlays', []):
|
for overlay in item.get('thumbnailOverlays', []):
|
||||||
conservative_update(info, 'duration', extract_str(default_multi_get(
|
conservative_update(info, 'duration', extract_str(deep_get(
|
||||||
overlay, 'thumbnailOverlayTimeStatusRenderer', 'text'
|
overlay, 'thumbnailOverlayTimeStatusRenderer', 'text'
|
||||||
)))
|
)))
|
||||||
# show renderers don't have videoCountText
|
# show renderers don't have videoCountText
|
||||||
conservative_update(info, 'video_count', extract_int(default_multi_get(
|
conservative_update(info, 'video_count', extract_int(deep_get(
|
||||||
overlay, 'thumbnailOverlayBottomPanelRenderer', 'text'
|
overlay, 'thumbnailOverlayBottomPanelRenderer', 'text'
|
||||||
)))
|
)))
|
||||||
return info
|
return info
|
||||||
@ -422,7 +422,7 @@ def parse_info_prepare_for_html(renderer, additional_info={}):
|
|||||||
|
|
||||||
def extract_response(polymer_json):
|
def extract_response(polymer_json):
|
||||||
'''return response, error'''
|
'''return response, error'''
|
||||||
response = multi_default_multi_get(polymer_json, [1, 'response'], ['response'], default=None, types=dict)
|
response = multi_deep_get(polymer_json, [1, 'response'], ['response'], default=None, types=dict)
|
||||||
if response is None:
|
if response is None:
|
||||||
return None, 'Failed to extract response'
|
return None, 'Failed to extract response'
|
||||||
else:
|
else:
|
||||||
@ -468,25 +468,25 @@ item_types = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
def traverse_browse_renderer(renderer):
|
def traverse_browse_renderer(renderer):
|
||||||
for tab in default_get(renderer, 'tabs', (), types=(list, tuple)):
|
for tab in get(renderer, 'tabs', (), types=(list, tuple)):
|
||||||
tab_renderer = multi_default_multi_get(tab, ['tabRenderer'], ['expandableTabRenderer'], default=None, types=dict)
|
tab_renderer = multi_deep_get(tab, ['tabRenderer'], ['expandableTabRenderer'], default=None, types=dict)
|
||||||
if tab_renderer is None:
|
if tab_renderer is None:
|
||||||
continue
|
continue
|
||||||
if tab_renderer.get('selected', False):
|
if tab_renderer.get('selected', False):
|
||||||
return default_get(tab_renderer, 'content', {}, types=(dict))
|
return get(tab_renderer, 'content', {}, types=(dict))
|
||||||
print('Could not find tab with content')
|
print('Could not find tab with content')
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
def traverse_standard_list(renderer):
|
def traverse_standard_list(renderer):
|
||||||
renderer_list = multi_default_multi_get(renderer, ['contents'], ['items'], default=(), types=(list, tuple))
|
renderer_list = multi_deep_get(renderer, ['contents'], ['items'], default=(), types=(list, tuple))
|
||||||
continuation = default_multi_get(renderer, 'continuations', 0, 'nextContinuationData', 'continuation')
|
continuation = deep_get(renderer, 'continuations', 0, 'nextContinuationData', 'continuation')
|
||||||
return renderer_list, continuation
|
return renderer_list, continuation
|
||||||
|
|
||||||
# these renderers contain one inside them
|
# these renderers contain one inside them
|
||||||
nested_renderer_dispatch = {
|
nested_renderer_dispatch = {
|
||||||
'singleColumnBrowseResultsRenderer': traverse_browse_renderer,
|
'singleColumnBrowseResultsRenderer': traverse_browse_renderer,
|
||||||
'twoColumnBrowseResultsRenderer': traverse_browse_renderer,
|
'twoColumnBrowseResultsRenderer': traverse_browse_renderer,
|
||||||
'twoColumnSearchResultsRenderer': lambda renderer: default_get(renderer, 'primaryContents', {}, types=dict),
|
'twoColumnSearchResultsRenderer': lambda renderer: get(renderer, 'primaryContents', {}, types=dict),
|
||||||
}
|
}
|
||||||
|
|
||||||
# these renderers contain a list of renderers inside them
|
# these renderers contain a list of renderers inside them
|
||||||
@ -495,17 +495,17 @@ nested_renderer_list_dispatch = {
|
|||||||
'itemSectionRenderer': traverse_standard_list,
|
'itemSectionRenderer': traverse_standard_list,
|
||||||
'gridRenderer': traverse_standard_list,
|
'gridRenderer': traverse_standard_list,
|
||||||
'playlistVideoListRenderer': traverse_standard_list,
|
'playlistVideoListRenderer': traverse_standard_list,
|
||||||
'singleColumnWatchNextResults': lambda r: (default_multi_get(r, 'results', 'results', 'contents', default=[], types=(list, tuple)), None),
|
'singleColumnWatchNextResults': lambda r: (deep_get(r, 'results', 'results', 'contents', default=[], types=(list, tuple)), None),
|
||||||
}
|
}
|
||||||
|
|
||||||
def extract_items(response, item_types=item_types):
|
def extract_items(response, item_types=item_types):
|
||||||
'''return items, ctoken'''
|
'''return items, ctoken'''
|
||||||
if 'continuationContents' in response:
|
if 'continuationContents' in response:
|
||||||
# always has just the one [something]Continuation key, but do this just in case they add some tracking key or something
|
# always has just the one [something]Continuation key, but do this just in case they add some tracking key or something
|
||||||
for key, renderer_continuation in default_get(response, 'continuationContents', {}, types=dict).items():
|
for key, renderer_continuation in get(response, 'continuationContents', {}, types=dict).items():
|
||||||
if key.endswith('Continuation'): # e.g. commentSectionContinuation, playlistVideoListContinuation
|
if key.endswith('Continuation'): # e.g. commentSectionContinuation, playlistVideoListContinuation
|
||||||
items = multi_default_multi_get(renderer_continuation, ['contents'], ['items'], default=None, types=(list, tuple))
|
items = multi_deep_get(renderer_continuation, ['contents'], ['items'], default=None, types=(list, tuple))
|
||||||
ctoken = default_multi_get(renderer_continuation, 'continuations', 0, 'nextContinuationData', 'continuation', default=None, types=str)
|
ctoken = deep_get(renderer_continuation, 'continuations', 0, 'nextContinuationData', 'continuation', default=None, types=str)
|
||||||
return items, ctoken
|
return items, ctoken
|
||||||
return [], None
|
return [], None
|
||||||
elif 'contents' in response:
|
elif 'contents' in response:
|
||||||
@ -515,7 +515,7 @@ def extract_items(response, item_types=item_types):
|
|||||||
iter_stack = collections.deque()
|
iter_stack = collections.deque()
|
||||||
current_iter = iter(())
|
current_iter = iter(())
|
||||||
|
|
||||||
renderer = default_get(response, 'contents', {}, types=dict)
|
renderer = get(response, 'contents', {}, types=dict)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
# mode 1: dig into the current renderer
|
# mode 1: dig into the current renderer
|
||||||
@ -692,11 +692,11 @@ def extract_playlist_metadata(polymer_json):
|
|||||||
return {'error': err}
|
return {'error': err}
|
||||||
|
|
||||||
metadata = {'error': None}
|
metadata = {'error': None}
|
||||||
header = default_multi_get(response, 'header', 'playlistHeaderRenderer', default={})
|
header = deep_get(response, 'header', 'playlistHeaderRenderer', default={})
|
||||||
metadata['title'] = extract_str(header.get('title'))
|
metadata['title'] = extract_str(header.get('title'))
|
||||||
|
|
||||||
metadata['first_video_id'] = default_multi_get(header, 'playEndpoint', 'watchEndpoint', 'videoId')
|
metadata['first_video_id'] = deep_get(header, 'playEndpoint', 'watchEndpoint', 'videoId')
|
||||||
first_id = re.search(r'([a-z_\-]{11})', default_multi_get(header,
|
first_id = re.search(r'([a-z_\-]{11})', deep_get(header,
|
||||||
'thumbnail', 'thumbnails', 0, 'url', default=''))
|
'thumbnail', 'thumbnails', 0, 'url', default=''))
|
||||||
if first_id:
|
if first_id:
|
||||||
conservative_update(metadata, 'first_video_id', first_id.group(1))
|
conservative_update(metadata, 'first_video_id', first_id.group(1))
|
||||||
@ -708,7 +708,7 @@ def extract_playlist_metadata(polymer_json):
|
|||||||
metadata['video_count'] = extract_int(header.get('numVideosText'))
|
metadata['video_count'] = extract_int(header.get('numVideosText'))
|
||||||
metadata['description'] = extract_str(header.get('descriptionText'), default='')
|
metadata['description'] = extract_str(header.get('descriptionText'), default='')
|
||||||
metadata['author'] = extract_str(header.get('ownerText'))
|
metadata['author'] = extract_str(header.get('ownerText'))
|
||||||
metadata['author_id'] = multi_default_multi_get(header,
|
metadata['author_id'] = multi_deep_get(header,
|
||||||
['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
|
['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
|
||||||
['ownerEndpoint', 'browseEndpoint', 'browseId'])
|
['ownerEndpoint', 'browseEndpoint', 'browseId'])
|
||||||
if metadata['author_id']:
|
if metadata['author_id']:
|
||||||
@ -854,9 +854,9 @@ def extract_metadata_row_info(video_renderer_info):
|
|||||||
}
|
}
|
||||||
|
|
||||||
current_song = {}
|
current_song = {}
|
||||||
for row in default_multi_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]):
|
for row in deep_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]):
|
||||||
row_title = extract_str(default_multi_get(row, 'metadataRowRenderer', 'title'), default='')
|
row_title = extract_str(deep_get(row, 'metadataRowRenderer', 'title'), default='')
|
||||||
row_content = extract_str(default_multi_get(row, 'metadataRowRenderer', 'contents', 0))
|
row_content = extract_str(deep_get(row, 'metadataRowRenderer', 'contents', 0))
|
||||||
if row_title == 'Category':
|
if row_title == 'Category':
|
||||||
info['category'] = row_content
|
info['category'] = row_content
|
||||||
elif row_title in ('Song', 'Music'):
|
elif row_title in ('Song', 'Music'):
|
||||||
@ -890,7 +890,7 @@ def extract_date(date_text):
|
|||||||
|
|
||||||
def extract_watch_info_mobile(top_level):
|
def extract_watch_info_mobile(top_level):
|
||||||
info = {}
|
info = {}
|
||||||
microformat = default_multi_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={})
|
microformat = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={})
|
||||||
|
|
||||||
family_safe = microformat.get('isFamilySafe')
|
family_safe = microformat.get('isFamilySafe')
|
||||||
if family_safe is None:
|
if family_safe is None:
|
||||||
@ -913,13 +913,13 @@ def extract_watch_info_mobile(top_level):
|
|||||||
info.update(extract_metadata_row_info(video_info))
|
info.update(extract_metadata_row_info(video_info))
|
||||||
info['description'] = extract_str(video_info.get('description'), recover_urls=True)
|
info['description'] = extract_str(video_info.get('description'), recover_urls=True)
|
||||||
info['view_count'] = extract_int(extract_str(video_info.get('expandedSubtitle')))
|
info['view_count'] = extract_int(extract_str(video_info.get('expandedSubtitle')))
|
||||||
info['author'] = extract_str(default_multi_get(video_info, 'owner', 'slimOwnerRenderer', 'title'))
|
info['author'] = extract_str(deep_get(video_info, 'owner', 'slimOwnerRenderer', 'title'))
|
||||||
info['author_id'] = default_multi_get(video_info, 'owner', 'slimOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId')
|
info['author_id'] = deep_get(video_info, 'owner', 'slimOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId')
|
||||||
info['title'] = extract_str(video_info.get('title'))
|
info['title'] = extract_str(video_info.get('title'))
|
||||||
info['live'] = 'watching' in extract_str(video_info.get('expandedSubtitle'), default='')
|
info['live'] = 'watching' in extract_str(video_info.get('expandedSubtitle'), default='')
|
||||||
info['unlisted'] = False
|
info['unlisted'] = False
|
||||||
for badge in video_info.get('badges', []):
|
for badge in video_info.get('badges', []):
|
||||||
if default_multi_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted':
|
if deep_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted':
|
||||||
info['unlisted'] = True
|
info['unlisted'] = True
|
||||||
info['like_count'] = None
|
info['like_count'] = None
|
||||||
info['dislike_count'] = None
|
info['dislike_count'] = None
|
||||||
@ -929,10 +929,10 @@ def extract_watch_info_mobile(top_level):
|
|||||||
button_renderer = button.get('slimMetadataToggleButtonRenderer', {})
|
button_renderer = button.get('slimMetadataToggleButtonRenderer', {})
|
||||||
|
|
||||||
# all the digits can be found in the accessibility data
|
# all the digits can be found in the accessibility data
|
||||||
count = extract_int(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label'))
|
count = extract_int(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label'))
|
||||||
|
|
||||||
# this count doesn't have all the digits, it's like 53K for instance
|
# this count doesn't have all the digits, it's like 53K for instance
|
||||||
dumb_count = extract_int(extract_str(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText')))
|
dumb_count = extract_int(extract_str(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText')))
|
||||||
|
|
||||||
# the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0
|
# the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0
|
||||||
if dumb_count == 0:
|
if dumb_count == 0:
|
||||||
@ -947,7 +947,7 @@ def extract_watch_info_mobile(top_level):
|
|||||||
items, _ = extract_items(response, item_types={'commentSectionRenderer'})
|
items, _ = extract_items(response, item_types={'commentSectionRenderer'})
|
||||||
if items:
|
if items:
|
||||||
comment_info = items[0]['commentSectionRenderer']
|
comment_info = items[0]['commentSectionRenderer']
|
||||||
comment_count_text = extract_str(default_multi_get(comment_info, 'header', 'commentSectionHeaderRenderer', 'countText'))
|
comment_count_text = extract_str(deep_get(comment_info, 'header', 'commentSectionHeaderRenderer', 'countText'))
|
||||||
if comment_count_text == 'Comments': # just this with no number, means 0 comments
|
if comment_count_text == 'Comments': # just this with no number, means 0 comments
|
||||||
info['comment_count'] = 0
|
info['comment_count'] = 0
|
||||||
else:
|
else:
|
||||||
@ -980,7 +980,7 @@ def extract_watch_info_desktop(top_level):
|
|||||||
}
|
}
|
||||||
|
|
||||||
video_info = {}
|
video_info = {}
|
||||||
for renderer in default_multi_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', default=()):
|
for renderer in deep_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', default=()):
|
||||||
if renderer and list(renderer.keys())[0] in ('videoPrimaryInfoRenderer', 'videoSecondaryInfoRenderer'):
|
if renderer and list(renderer.keys())[0] in ('videoPrimaryInfoRenderer', 'videoSecondaryInfoRenderer'):
|
||||||
video_info.update(list(renderer.values())[0])
|
video_info.update(list(renderer.values())[0])
|
||||||
|
|
||||||
@ -988,7 +988,7 @@ def extract_watch_info_desktop(top_level):
|
|||||||
info['description'] = extract_str(video_info.get('description', None), recover_urls=True)
|
info['description'] = extract_str(video_info.get('description', None), recover_urls=True)
|
||||||
info['time_published'] = extract_date(extract_str(video_info.get('dateText', None)))
|
info['time_published'] = extract_date(extract_str(video_info.get('dateText', None)))
|
||||||
|
|
||||||
likes_dislikes = default_multi_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/')
|
likes_dislikes = deep_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/')
|
||||||
if len(likes_dislikes) == 2:
|
if len(likes_dislikes) == 2:
|
||||||
info['like_count'] = extract_int(likes_dislikes[0])
|
info['like_count'] = extract_int(likes_dislikes[0])
|
||||||
info['dislike_count'] = extract_int(likes_dislikes[1])
|
info['dislike_count'] = extract_int(likes_dislikes[1])
|
||||||
@ -997,11 +997,11 @@ def extract_watch_info_desktop(top_level):
|
|||||||
info['dislike_count'] = None
|
info['dislike_count'] = None
|
||||||
|
|
||||||
info['title'] = extract_str(video_info.get('title', None))
|
info['title'] = extract_str(video_info.get('title', None))
|
||||||
info['author'] = extract_str(default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'title'))
|
info['author'] = extract_str(deep_get(video_info, 'owner', 'videoOwnerRenderer', 'title'))
|
||||||
info['author_id'] = default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId')
|
info['author_id'] = deep_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId')
|
||||||
info['view_count'] = extract_int(extract_str(default_multi_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount')))
|
info['view_count'] = extract_int(extract_str(deep_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount')))
|
||||||
|
|
||||||
related = default_multi_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[])
|
related = deep_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[])
|
||||||
info['related_videos'] = [extract_item_info(renderer) for renderer in related]
|
info['related_videos'] = [extract_item_info(renderer) for renderer in related]
|
||||||
|
|
||||||
return info
|
return info
|
||||||
@ -1054,10 +1054,10 @@ def extract_playability_error(info, player_response, error_prefix=''):
|
|||||||
info['playability_error'] = None
|
info['playability_error'] = None
|
||||||
return
|
return
|
||||||
|
|
||||||
playability_status = default_multi_get(player_response, 'playabilityStatus', 'status', default=None)
|
playability_status = deep_get(player_response, 'playabilityStatus', 'status', default=None)
|
||||||
info['playability_status'] = playability_status
|
info['playability_status'] = playability_status
|
||||||
|
|
||||||
playability_reason = extract_str(multi_default_multi_get(player_response,
|
playability_reason = extract_str(multi_deep_get(player_response,
|
||||||
['playabilityStatus', 'reason'],
|
['playabilityStatus', 'reason'],
|
||||||
['playabilityStatus', 'errorScreen', 'playerErrorMessageRenderer', 'reason'],
|
['playabilityStatus', 'errorScreen', 'playerErrorMessageRenderer', 'reason'],
|
||||||
default='Could not find playability error')
|
default='Could not find playability error')
|
||||||
@ -1091,7 +1091,7 @@ def extract_watch_info(polymer_json):
|
|||||||
if error:
|
if error:
|
||||||
info['playability_error'] = error
|
info['playability_error'] = error
|
||||||
|
|
||||||
player_args = default_multi_get(top_level, 'player', 'args', default={})
|
player_args = deep_get(top_level, 'player', 'args', default={})
|
||||||
player_response = json.loads(player_args['player_response']) if 'player_response' in player_args else {}
|
player_response = json.loads(player_args['player_response']) if 'player_response' in player_args else {}
|
||||||
|
|
||||||
# captions
|
# captions
|
||||||
@ -1100,8 +1100,8 @@ def extract_watch_info(polymer_json):
|
|||||||
info['_manual_caption_language_names'] = {} # language name written in that language, needed in some cases to create the url
|
info['_manual_caption_language_names'] = {} # language name written in that language, needed in some cases to create the url
|
||||||
info['translation_languages'] = []
|
info['translation_languages'] = []
|
||||||
captions_info = player_response.get('captions', {})
|
captions_info = player_response.get('captions', {})
|
||||||
info['_captions_base_url'] = normalize_url(default_multi_get(captions_info, 'playerCaptionsRenderer', 'baseUrl'))
|
info['_captions_base_url'] = normalize_url(deep_get(captions_info, 'playerCaptionsRenderer', 'baseUrl'))
|
||||||
for caption_track in default_multi_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()):
|
for caption_track in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()):
|
||||||
lang_code = caption_track.get('languageCode')
|
lang_code = caption_track.get('languageCode')
|
||||||
if not lang_code:
|
if not lang_code:
|
||||||
continue
|
continue
|
||||||
@ -1110,11 +1110,11 @@ def extract_watch_info(polymer_json):
|
|||||||
else:
|
else:
|
||||||
info['manual_caption_languages'].append(lang_code)
|
info['manual_caption_languages'].append(lang_code)
|
||||||
base_url = caption_track.get('baseUrl', '')
|
base_url = caption_track.get('baseUrl', '')
|
||||||
lang_name = default_multi_get(urllib.parse.parse_qs(urllib.parse.urlparse(base_url).query), 'name', 0)
|
lang_name = deep_get(urllib.parse.parse_qs(urllib.parse.urlparse(base_url).query), 'name', 0)
|
||||||
if lang_name:
|
if lang_name:
|
||||||
info['_manual_caption_language_names'][lang_code] = lang_name
|
info['_manual_caption_language_names'][lang_code] = lang_name
|
||||||
|
|
||||||
for translation_lang_info in default_multi_get(captions_info, 'playerCaptionsTracklistRenderer', 'translationLanguages', default=()):
|
for translation_lang_info in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'translationLanguages', default=()):
|
||||||
lang_code = translation_lang_info.get('languageCode')
|
lang_code = translation_lang_info.get('languageCode')
|
||||||
if lang_code:
|
if lang_code:
|
||||||
info['translation_languages'].append(lang_code)
|
info['translation_languages'].append(lang_code)
|
||||||
@ -1131,18 +1131,18 @@ def extract_watch_info(polymer_json):
|
|||||||
info['age_restricted'] = (info['playability_status'] == 'LOGIN_REQUIRED' and info['playability_error'] and ' age' in info['playability_error'])
|
info['age_restricted'] = (info['playability_status'] == 'LOGIN_REQUIRED' and info['playability_error'] and ' age' in info['playability_error'])
|
||||||
|
|
||||||
# base_js (for decryption of signatures)
|
# base_js (for decryption of signatures)
|
||||||
info['base_js'] = default_multi_get(top_level, 'player', 'assets', 'js')
|
info['base_js'] = deep_get(top_level, 'player', 'assets', 'js')
|
||||||
if info['base_js']:
|
if info['base_js']:
|
||||||
info['base_js'] = normalize_url(info['base_js'])
|
info['base_js'] = normalize_url(info['base_js'])
|
||||||
|
|
||||||
mobile = 'singleColumnWatchNextResults' in default_multi_get(top_level, 'response', 'contents', default={})
|
mobile = 'singleColumnWatchNextResults' in deep_get(top_level, 'response', 'contents', default={})
|
||||||
if mobile:
|
if mobile:
|
||||||
info.update(extract_watch_info_mobile(top_level))
|
info.update(extract_watch_info_mobile(top_level))
|
||||||
else:
|
else:
|
||||||
info.update(extract_watch_info_desktop(top_level))
|
info.update(extract_watch_info_desktop(top_level))
|
||||||
|
|
||||||
# stuff from videoDetails. Use liberal_update to prioritize info from videoDetails over existing info
|
# stuff from videoDetails. Use liberal_update to prioritize info from videoDetails over existing info
|
||||||
vd = default_multi_get(top_level, 'playerResponse', 'videoDetails', default={})
|
vd = deep_get(top_level, 'playerResponse', 'videoDetails', default={})
|
||||||
liberal_update(info, 'title', extract_str(vd.get('title')))
|
liberal_update(info, 'title', extract_str(vd.get('title')))
|
||||||
liberal_update(info, 'duration', extract_int(vd.get('lengthSeconds')))
|
liberal_update(info, 'duration', extract_int(vd.get('lengthSeconds')))
|
||||||
liberal_update(info, 'view_count', extract_int(vd.get('viewCount')))
|
liberal_update(info, 'view_count', extract_int(vd.get('viewCount')))
|
||||||
@ -1156,7 +1156,7 @@ def extract_watch_info(polymer_json):
|
|||||||
liberal_update(info, 'tags', vd.get('keywords', []))
|
liberal_update(info, 'tags', vd.get('keywords', []))
|
||||||
|
|
||||||
# fallback stuff from microformat
|
# fallback stuff from microformat
|
||||||
mf = default_multi_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={})
|
mf = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={})
|
||||||
conservative_update(info, 'title', extract_str(mf.get('title')))
|
conservative_update(info, 'title', extract_str(mf.get('title')))
|
||||||
conservative_update(info, 'duration', extract_int(mf.get('lengthSeconds')))
|
conservative_update(info, 'duration', extract_int(mf.get('lengthSeconds')))
|
||||||
# this gives the view count for limited state videos
|
# this gives the view count for limited state videos
|
||||||
@ -1177,7 +1177,7 @@ def update_with_age_restricted_info(info, video_info_page):
|
|||||||
ERROR_PREFIX = 'Error bypassing age-restriction: '
|
ERROR_PREFIX = 'Error bypassing age-restriction: '
|
||||||
|
|
||||||
video_info = urllib.parse.parse_qs(video_info_page)
|
video_info = urllib.parse.parse_qs(video_info_page)
|
||||||
player_response = default_multi_get(video_info, 'player_response', 0)
|
player_response = deep_get(video_info, 'player_response', 0)
|
||||||
if player_response is None:
|
if player_response is None:
|
||||||
info['playability_error'] = ERROR_PREFIX + 'Could not find player_response in video_info_page'
|
info['playability_error'] = ERROR_PREFIX + 'Could not find player_response in video_info_page'
|
||||||
return
|
return
|
||||||
|
Loading…
x
Reference in New Issue
Block a user