Fix missing likes, dislikes, & music list due to YouTube changes
Also move some microformat extraction from _extract_watch_info_mobile to extract_watch_info, where it belongs. _extract_watch_info_mobile is really only for stuff visible on the page, and is thus specialized for either mobile or desktop.

Signed-off-by: Jesús <heckyel@hyperbola.info>
This commit is contained in:
parent
f5f9b1c181
commit
54b39f1303
@ -403,6 +403,7 @@ nested_renderer_dispatch = {
|
||||
'twoColumnBrowseResultsRenderer': _traverse_browse_renderer,
|
||||
'twoColumnSearchResultsRenderer': lambda r: get(r, 'primaryContents', {}),
|
||||
'richItemRenderer': lambda r: get(r, 'content', {}),
|
||||
'engagementPanelSectionListRenderer': lambda r: get(r, 'content', {}),
|
||||
}
|
||||
|
||||
# these renderers contain a list of renderers inside them
|
||||
@ -412,6 +413,8 @@ nested_renderer_list_dispatch = {
|
||||
'gridRenderer': _traverse_standard_list,
|
||||
'richGridRenderer': _traverse_standard_list,
|
||||
'playlistVideoListRenderer': _traverse_standard_list,
|
||||
'structuredDescriptionContentRenderer': _traverse_standard_list,
|
||||
'slimVideoMetadataSectionRenderer': _traverse_standard_list,
|
||||
'singleColumnWatchNextResults': lambda r: (deep_get(r, 'results', 'results', 'contents', default=[]), None),
|
||||
}
|
||||
def get_nested_renderer_list_function(key):
|
||||
@ -475,8 +478,11 @@ def extract_items_from_renderer(renderer, item_types=_item_types):
|
||||
|
||||
renderer = None
|
||||
|
||||
def extract_items(response, item_types=_item_types):
|
||||
def extract_items(response, item_types=_item_types,
|
||||
search_engagement_panels=False):
|
||||
'''return items, ctoken'''
|
||||
items = []
|
||||
ctoken = None
|
||||
if 'continuationContents' in response:
|
||||
# sometimes there's another, empty, junk [something]Continuation key
|
||||
# find real one
|
||||
@ -484,13 +490,23 @@ def extract_items(response, item_types=_item_types):
|
||||
'continuationContents', {}).items():
|
||||
# e.g. commentSectionContinuation, playlistVideoListContinuation
|
||||
if key.endswith('Continuation'):
|
||||
items, cont = extract_items_from_renderer({key: renderer_cont},
|
||||
items, ctoken = extract_items_from_renderer(
|
||||
{key: renderer_cont},
|
||||
item_types=item_types)
|
||||
if items:
|
||||
return items, cont
|
||||
return [], None
|
||||
break
|
||||
elif 'contents' in response:
|
||||
renderer = get(response, 'contents', {})
|
||||
return extract_items_from_renderer(renderer, item_types=item_types)
|
||||
else:
|
||||
return [], None
|
||||
items, ctoken = extract_items_from_renderer(
|
||||
renderer,
|
||||
item_types=item_types)
|
||||
|
||||
if search_engagement_panels and 'engagementPanels' in response:
|
||||
for engagement_renderer in response['engagementPanels']:
|
||||
additional_items, cont = extract_items_from_renderer(
|
||||
engagement_renderer,
|
||||
item_types=item_types)
|
||||
items += additional_items
|
||||
if cont and not ctoken:
|
||||
ctoken = cont
|
||||
return items, ctoken
|
||||
|
@ -116,7 +116,72 @@ _formats = {
|
||||
'397': {'vcodec': 'av01.0.05M.08'},
|
||||
}
|
||||
|
||||
def _extract_metadata_row_info(video_renderer_info):
|
||||
|
||||
def _extract_from_video_information_renderer(renderer_content):
    '''Return title, view_count, unlisted and live from a renderer dict.

    renderer_content is the dict stored under a renderer key (the dispatch
    table registers this function for 'slimVideoInformationRenderer').
    Calling it with an empty dict still yields every key.
    '''
    expanded_subtitle = extract_str(
        renderer_content.get('expandedSubtitle'), default='')

    # Collect the label of every badge; the video counts as unlisted when
    # any of them reads exactly 'Unlisted'.
    badge_labels = [
        deep_get(badge, 'metadataBadgeRenderer', 'label')
        for badge in renderer_content.get('badges', [])
    ]

    return {
        'title': extract_str(renderer_content.get('title')),
        # the view count is parsed out of the expanded subtitle text
        'view_count': extract_int(expanded_subtitle),
        'unlisted': 'Unlisted' in badge_labels,
        # a subtitle containing "watching" marks the video as live
        'live': 'watching' in expanded_subtitle,
    }
|
||||
|
||||
def _extract_likes_dislikes(renderer_content):
    '''Return like_count and dislike_count read from the 'buttons' list.

    Both values stay None when no matching button is present, so calling
    this with an empty dict still yields both keys.
    '''
    default_text_path = ('button', 'toggleButtonRenderer', 'defaultText')
    accessibility_path = default_text_path + (
        'accessibility', 'accessibilityData', 'label')

    like_count = None
    dislike_count = None
    for entry in renderer_content.get('buttons', ()):
        toggle = entry.get('slimMetadataToggleButtonRenderer', {})

        # all the digits can be found in the accessibility data
        exact_count = extract_int(deep_get(toggle, *accessibility_path))

        # this count doesn't have all the digits, it's like 53K for instance
        short_count = extract_int(
            extract_str(deep_get(toggle, *default_text_path)))

        # The accessibility text will be "No likes" or "No dislikes" or
        # something like that, but the short count will be 0
        total = 0 if short_count == 0 else exact_count

        if 'isLike' in toggle:
            like_count = total
        elif 'isDislike' in toggle:
            dislike_count = total

    return {
        'like_count': like_count,
        'dislike_count': dislike_count,
    }
|
||||
|
||||
def _extract_from_owner_renderer(renderer_content):
    '''Return author and author_id taken from an owner renderer dict.

    author comes from the renderer's 'title'; author_id is the browseId
    nested under navigationEndpoint -> browseEndpoint.
    '''
    author = extract_str(renderer_content.get('title'))
    author_id = deep_get(
        renderer_content,
        'navigationEndpoint', 'browseEndpoint', 'browseId')
    return {'author': author, 'author_id': author_id}
|
||||
|
||||
def _extract_from_video_header_renderer(renderer_content):
    '''Return title and time_published taken from a header renderer dict.

    time_published is the renderer's 'publishDate' text passed through
    extract_date.
    '''
    title = extract_str(renderer_content.get('title'))
    publish_text = extract_str(renderer_content.get('publishDate'))
    return {
        'title': title,
        'time_published': extract_date(publish_text),
    }
|
||||
|
||||
def _extract_from_description_renderer(renderer_content):
    '''Return the description text taken from a description renderer dict.

    The text is read from 'descriptionBodyText' and extracted with
    recover_urls=True.
    '''
    body_text = renderer_content.get('descriptionBodyText')
    return {'description': extract_str(body_text, recover_urls=True)}
|
||||
|
||||
def _extract_metadata_row_info(renderer_content):
|
||||
# extract category and music list
|
||||
info = {
|
||||
'category': None,
|
||||
@ -124,7 +189,7 @@ def _extract_metadata_row_info(video_renderer_info):
|
||||
}
|
||||
|
||||
current_song = {}
|
||||
for row in deep_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]):
|
||||
for row in deep_get(renderer_content, 'rows', default=[]):
|
||||
row_title = extract_str(deep_get(row, 'metadataRowRenderer', 'title'), default='')
|
||||
row_content = extract_str(deep_get(row, 'metadataRowRenderer', 'contents', 0))
|
||||
if row_title == 'Category':
|
||||
@ -146,18 +211,18 @@ def _extract_metadata_row_info(video_renderer_info):
|
||||
|
||||
return info
|
||||
|
||||
# Maps a renderer key to the function that turns that renderer's content
# dict into a partial info dict. Each function is also called with {} when
# its renderer is not found, so that its keys are still present in the result.
visible_extraction_dispatch = {
    # title, view_count, unlisted, live
    'slimVideoInformationRenderer': _extract_from_video_information_renderer,
    # like_count, dislike_count
    'slimVideoActionBarRenderer': _extract_likes_dislikes,
    # author, author_id
    'slimOwnerRenderer': _extract_from_owner_renderer,
    # title, time_published
    'videoDescriptionHeaderRenderer': _extract_from_video_header_renderer,
    # description
    'expandableVideoDescriptionRenderer': _extract_from_description_renderer,
    # category and music list
    'metadataRowContainerRenderer': _extract_metadata_row_info,
}
|
||||
|
||||
def _extract_watch_info_mobile(top_level):
|
||||
'''Scrapes information from the visible page'''
|
||||
info = {}
|
||||
microformat = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={})
|
||||
|
||||
family_safe = microformat.get('isFamilySafe')
|
||||
if family_safe is None:
|
||||
info['age_restricted'] = None
|
||||
else:
|
||||
info['age_restricted'] = not family_safe
|
||||
info['allowed_countries'] = microformat.get('availableCountries', [])
|
||||
info['time_published'] = microformat.get('publishDate')
|
||||
|
||||
response = top_level.get('response', {})
|
||||
|
||||
# this renderer has the stuff visible on the page
|
||||
@ -190,47 +255,22 @@ def _extract_watch_info_mobile(top_level):
|
||||
else:
|
||||
info['playlist'] = None
|
||||
|
||||
# Holds the visible video info. It is inside singleColumnWatchNextResults
|
||||
# but use our convenience function instead
|
||||
items, _ = extract_items(response, item_types={'slimVideoMetadataRenderer'})
|
||||
if items:
|
||||
video_info = items[0]['slimVideoMetadataRenderer']
|
||||
else:
|
||||
print('Failed to extract video metadata')
|
||||
video_info = {}
|
||||
# use dispatch table to get information scattered in various renderers
|
||||
items, _ = extract_items(
|
||||
response,
|
||||
item_types=visible_extraction_dispatch.keys(),
|
||||
search_engagement_panels=True
|
||||
)
|
||||
found = set()
|
||||
for renderer in items:
|
||||
name, renderer_content = list(renderer.items())[0]
|
||||
found.add(name)
|
||||
info.update(visible_extraction_dispatch[name](renderer_content))
|
||||
# Call the function on blank dict for any that weren't found
|
||||
# so that the empty keys get added
|
||||
for name in visible_extraction_dispatch.keys() - found:
|
||||
info.update(visible_extraction_dispatch[name]({}))
|
||||
|
||||
info.update(_extract_metadata_row_info(video_info))
|
||||
info['description'] = extract_str(video_info.get('description'), recover_urls=True)
|
||||
info['view_count'] = extract_int(extract_str(video_info.get('expandedSubtitle')))
|
||||
info['author'] = extract_str(deep_get(video_info, 'owner', 'slimOwnerRenderer', 'title'))
|
||||
info['author_id'] = deep_get(video_info, 'owner', 'slimOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId')
|
||||
info['title'] = extract_str(video_info.get('title'))
|
||||
info['live'] = 'watching' in extract_str(video_info.get('expandedSubtitle'), default='')
|
||||
info['unlisted'] = False
|
||||
for badge in video_info.get('badges', []):
|
||||
if deep_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted':
|
||||
info['unlisted'] = True
|
||||
info['like_count'] = None
|
||||
info['dislike_count'] = None
|
||||
if not info['time_published']:
|
||||
info['time_published'] = extract_date(extract_str(video_info.get('dateText', None)))
|
||||
for button in video_info.get('buttons', ()):
|
||||
button_renderer = button.get('slimMetadataToggleButtonRenderer', {})
|
||||
|
||||
# all the digits can be found in the accessibility data
|
||||
count = extract_int(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label'))
|
||||
|
||||
# this count doesn't have all the digits, it's like 53K for instance
|
||||
dumb_count = extract_int(extract_str(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText')))
|
||||
|
||||
# the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0
|
||||
if dumb_count == 0:
|
||||
count = 0
|
||||
|
||||
if 'isLike' in button_renderer:
|
||||
info['like_count'] = count
|
||||
elif 'isDislike' in button_renderer:
|
||||
info['dislike_count'] = count
|
||||
|
||||
# comment section info
|
||||
items, _ = extract_items(response, item_types={
|
||||
@ -274,7 +314,6 @@ def _extract_watch_info_desktop(top_level):
|
||||
info = {
|
||||
'comment_count': None,
|
||||
'comments_disabled': None,
|
||||
'allowed_countries': [],
|
||||
'limited_state': None,
|
||||
'playlist': None,
|
||||
}
|
||||
@ -564,6 +603,12 @@ def extract_watch_info(polymer_json):
|
||||
liberal_update(info, 'category', mf.get('category'))
|
||||
liberal_update(info, 'time_published', mf.get('publishDate'))
|
||||
liberal_update(info, 'time_uploaded', mf.get('uploadDate'))
|
||||
family_safe = mf.get('isFamilySafe')
|
||||
if family_safe is None:
|
||||
conservative_update(info, 'age_restricted', None)
|
||||
else:
|
||||
conservative_update(info, 'age_restricted', not family_safe)
|
||||
info['allowed_countries'] = mf.get('availableCountries', [])
|
||||
|
||||
# other stuff
|
||||
info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
|
||||
|
Loading…
x
Reference in New Issue
Block a user