Fix missing likes, dislikes, & music list due to YouTube changes

Also moves some microformat extraction from
_extract_watch_info_mobile to extract_watch_info where it belongs.
_extract_watch_info_mobile is really only for stuff visible on the
page, and thus specialized for either mobile or desktop.

Signed-off-by: Jesús <heckyel@hyperbola.info>
This commit is contained in:
James Taylor 2021-07-27 21:35:11 -07:00 committed by Jesús
parent f5f9b1c181
commit 54b39f1303
No known key found for this signature in database
GPG Key ID: F6EE7BC59A315766
2 changed files with 121 additions and 60 deletions

View File

@ -403,6 +403,7 @@ nested_renderer_dispatch = {
'twoColumnBrowseResultsRenderer': _traverse_browse_renderer,
'twoColumnSearchResultsRenderer': lambda r: get(r, 'primaryContents', {}),
'richItemRenderer': lambda r: get(r, 'content', {}),
'engagementPanelSectionListRenderer': lambda r: get(r, 'content', {}),
}
# these renderers contain a list of renderers inside them
@ -412,6 +413,8 @@ nested_renderer_list_dispatch = {
'gridRenderer': _traverse_standard_list,
'richGridRenderer': _traverse_standard_list,
'playlistVideoListRenderer': _traverse_standard_list,
'structuredDescriptionContentRenderer': _traverse_standard_list,
'slimVideoMetadataSectionRenderer': _traverse_standard_list,
'singleColumnWatchNextResults': lambda r: (deep_get(r, 'results', 'results', 'contents', default=[]), None),
}
def get_nested_renderer_list_function(key):
@ -475,8 +478,11 @@ def extract_items_from_renderer(renderer, item_types=_item_types):
renderer = None
def extract_items(response, item_types=_item_types):
def extract_items(response, item_types=_item_types,
search_engagement_panels=False):
'''return items, ctoken'''
items = []
ctoken = None
if 'continuationContents' in response:
# sometimes there's another, empty, junk [something]Continuation key
# find real one
@ -484,13 +490,23 @@ def extract_items(response, item_types=_item_types):
'continuationContents', {}).items():
# e.g. commentSectionContinuation, playlistVideoListContinuation
if key.endswith('Continuation'):
items, cont = extract_items_from_renderer({key: renderer_cont},
items, ctoken = extract_items_from_renderer(
{key: renderer_cont},
item_types=item_types)
if items:
return items, cont
return [], None
break
elif 'contents' in response:
renderer = get(response, 'contents', {})
return extract_items_from_renderer(renderer, item_types=item_types)
else:
return [], None
items, ctoken = extract_items_from_renderer(
renderer,
item_types=item_types)
if search_engagement_panels and 'engagementPanels' in response:
for engagement_renderer in response['engagementPanels']:
additional_items, cont = extract_items_from_renderer(
engagement_renderer,
item_types=item_types)
items += additional_items
if cont and not ctoken:
ctoken = cont
return items, ctoken

View File

@ -116,7 +116,72 @@ _formats = {
'397': {'vcodec': 'av01.0.05M.08'},
}
def _extract_metadata_row_info(video_renderer_info):
def _extract_from_video_information_renderer(renderer_content):
    '''Build the title / view_count / unlisted / live entries from the
    content of a slimVideoInformationRenderer.

    renderer_content is the dict stored under the renderer's name; an
    empty dict is accepted.
    '''
    # The expanded subtitle carries the view count text, and contains the
    # word "watching" when the video is a live stream.
    subtitle_text = extract_str(
        renderer_content.get('expandedSubtitle'), default='')
    return {
        'title': extract_str(renderer_content.get('title')),
        'view_count': extract_int(subtitle_text),
        # flagged as unlisted when any badge is labelled exactly 'Unlisted'
        'unlisted': any(
            deep_get(entry, 'metadataBadgeRenderer', 'label') == 'Unlisted'
            for entry in renderer_content.get('badges', [])
        ),
        'live': 'watching' in subtitle_text,
    }
def _extract_likes_dislikes(renderer_content):
    '''Read like_count and dislike_count from the 'buttons' list of a
    slimVideoActionBarRenderer's content.

    Both keys are always present in the returned dict and stay None when
    no matching button is found.
    '''
    counts = {
        'like_count': None,
        'dislike_count': None,
    }
    text_path = ('button', 'toggleButtonRenderer', 'defaultText')
    label_path = text_path + ('accessibility', 'accessibilityData', 'label')

    for entry in renderer_content.get('buttons', ()):
        toggle = entry.get('slimMetadataToggleButtonRenderer', {})

        # The accessibility label spells out every digit of the count
        exact = extract_int(deep_get(toggle, *label_path))
        # The visible text is abbreviated (53K for instance)
        abbreviated = extract_int(extract_str(deep_get(toggle, *text_path)))

        if 'isLike' in toggle:
            target = 'like_count'
        elif 'isDislike' in toggle:
            target = 'dislike_count'
        else:
            # neither a like nor a dislike button
            continue

        # With zero votes the accessibility label reads "No likes" /
        # "No dislikes" or similar, while the visible text gives 0, so
        # the visible text wins in that case.
        counts[target] = 0 if abbreviated == 0 else exact
    return counts
def _extract_from_owner_renderer(renderer_content):
    '''Return the author and author_id entries taken from the content of
    a slimOwnerRenderer.'''
    owner_name = extract_str(renderer_content.get('title'))
    # the id is read from the browse endpoint of the owner's navigation link
    owner_id = deep_get(
        renderer_content,
        'navigationEndpoint', 'browseEndpoint', 'browseId')
    return {'author': owner_name, 'author_id': owner_id}
def _extract_from_video_header_renderer(renderer_content):
    '''Return the title and time_published entries taken from the content
    of a videoDescriptionHeaderRenderer.'''
    header_title = extract_str(renderer_content.get('title'))
    publish_text = extract_str(renderer_content.get('publishDate'))
    return {
        'title': header_title,
        'time_published': extract_date(publish_text),
    }
def _extract_from_description_renderer(renderer_content):
    '''Return the description entry taken from the content of an
    expandableVideoDescriptionRenderer.'''
    body = renderer_content.get('descriptionBodyText')
    description_text = extract_str(body, recover_urls=True)
    return {'description': description_text}
def _extract_metadata_row_info(renderer_content):
# extract category and music list
info = {
'category': None,
@ -124,7 +189,7 @@ def _extract_metadata_row_info(video_renderer_info):
}
current_song = {}
for row in deep_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]):
for row in deep_get(renderer_content, 'rows', default=[]):
row_title = extract_str(deep_get(row, 'metadataRowRenderer', 'title'), default='')
row_content = extract_str(deep_get(row, 'metadataRowRenderer', 'contents', 0))
if row_title == 'Category':
@ -146,18 +211,18 @@ def _extract_metadata_row_info(video_renderer_info):
return info
# Maps the name of a renderer found on the watch page to the function that
# turns that renderer's content dict into a partial info dict.
# _extract_watch_info_mobile passes the keys as item_types to extract_items,
# merges each function's result into info, and calls the function with {}
# for any renderer that was not found so that its keys still get added.
visible_extraction_dispatch = {
'slimVideoInformationRenderer': _extract_from_video_information_renderer,
'slimVideoActionBarRenderer': _extract_likes_dislikes,
'slimOwnerRenderer': _extract_from_owner_renderer,
'videoDescriptionHeaderRenderer': _extract_from_video_header_renderer,
'expandableVideoDescriptionRenderer': _extract_from_description_renderer,
# category and music list rows
'metadataRowContainerRenderer': _extract_metadata_row_info,
}
def _extract_watch_info_mobile(top_level):
'''Scrapes information from the visible page'''
info = {}
microformat = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={})
family_safe = microformat.get('isFamilySafe')
if family_safe is None:
info['age_restricted'] = None
else:
info['age_restricted'] = not family_safe
info['allowed_countries'] = microformat.get('availableCountries', [])
info['time_published'] = microformat.get('publishDate')
response = top_level.get('response', {})
# this renderer has the stuff visible on the page
@ -190,47 +255,22 @@ def _extract_watch_info_mobile(top_level):
else:
info['playlist'] = None
# Holds the visible video info. It is inside singleColumnWatchNextResults
# but use our convenience function instead
items, _ = extract_items(response, item_types={'slimVideoMetadataRenderer'})
if items:
video_info = items[0]['slimVideoMetadataRenderer']
else:
print('Failed to extract video metadata')
video_info = {}
# use dispatch table to get information scattered in various renderers
items, _ = extract_items(
response,
item_types=visible_extraction_dispatch.keys(),
search_engagement_panels=True
)
found = set()
for renderer in items:
name, renderer_content = list(renderer.items())[0]
found.add(name)
info.update(visible_extraction_dispatch[name](renderer_content))
# Call the function on blank dict for any that weren't found
# so that the empty keys get added
for name in visible_extraction_dispatch.keys() - found:
info.update(visible_extraction_dispatch[name]({}))
info.update(_extract_metadata_row_info(video_info))
info['description'] = extract_str(video_info.get('description'), recover_urls=True)
info['view_count'] = extract_int(extract_str(video_info.get('expandedSubtitle')))
info['author'] = extract_str(deep_get(video_info, 'owner', 'slimOwnerRenderer', 'title'))
info['author_id'] = deep_get(video_info, 'owner', 'slimOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId')
info['title'] = extract_str(video_info.get('title'))
info['live'] = 'watching' in extract_str(video_info.get('expandedSubtitle'), default='')
info['unlisted'] = False
for badge in video_info.get('badges', []):
if deep_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted':
info['unlisted'] = True
info['like_count'] = None
info['dislike_count'] = None
if not info['time_published']:
info['time_published'] = extract_date(extract_str(video_info.get('dateText', None)))
for button in video_info.get('buttons', ()):
button_renderer = button.get('slimMetadataToggleButtonRenderer', {})
# all the digits can be found in the accessibility data
count = extract_int(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label'))
# this count doesn't have all the digits, it's like 53K for instance
dumb_count = extract_int(extract_str(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText')))
# the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0
if dumb_count == 0:
count = 0
if 'isLike' in button_renderer:
info['like_count'] = count
elif 'isDislike' in button_renderer:
info['dislike_count'] = count
# comment section info
items, _ = extract_items(response, item_types={
@ -274,7 +314,6 @@ def _extract_watch_info_desktop(top_level):
info = {
'comment_count': None,
'comments_disabled': None,
'allowed_countries': [],
'limited_state': None,
'playlist': None,
}
@ -564,6 +603,12 @@ def extract_watch_info(polymer_json):
liberal_update(info, 'category', mf.get('category'))
liberal_update(info, 'time_published', mf.get('publishDate'))
liberal_update(info, 'time_uploaded', mf.get('uploadDate'))
family_safe = mf.get('isFamilySafe')
if family_safe is None:
conservative_update(info, 'age_restricted', None)
else:
conservative_update(info, 'age_restricted', not family_safe)
info['allowed_countries'] = mf.get('availableCountries', [])
# other stuff
info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None