Fix missing likes, dislikes, & music list due to YouTube changes
Also moves some microformat extraction from _extract_watch_info_mobile to extract_watch_info where it belongs. _extract_watch_info_mobile is really only for stuff visible on the page, and thus specialized for either mobile or desktop. Signed-off-by: Jesús <heckyel@hyperbola.info>
This commit is contained in:
parent
f5f9b1c181
commit
54b39f1303
@ -403,6 +403,7 @@ nested_renderer_dispatch = {
|
|||||||
'twoColumnBrowseResultsRenderer': _traverse_browse_renderer,
|
'twoColumnBrowseResultsRenderer': _traverse_browse_renderer,
|
||||||
'twoColumnSearchResultsRenderer': lambda r: get(r, 'primaryContents', {}),
|
'twoColumnSearchResultsRenderer': lambda r: get(r, 'primaryContents', {}),
|
||||||
'richItemRenderer': lambda r: get(r, 'content', {}),
|
'richItemRenderer': lambda r: get(r, 'content', {}),
|
||||||
|
'engagementPanelSectionListRenderer': lambda r: get(r, 'content', {}),
|
||||||
}
|
}
|
||||||
|
|
||||||
# these renderers contain a list of renderers inside them
|
# these renderers contain a list of renderers inside them
|
||||||
@ -412,6 +413,8 @@ nested_renderer_list_dispatch = {
|
|||||||
'gridRenderer': _traverse_standard_list,
|
'gridRenderer': _traverse_standard_list,
|
||||||
'richGridRenderer': _traverse_standard_list,
|
'richGridRenderer': _traverse_standard_list,
|
||||||
'playlistVideoListRenderer': _traverse_standard_list,
|
'playlistVideoListRenderer': _traverse_standard_list,
|
||||||
|
'structuredDescriptionContentRenderer': _traverse_standard_list,
|
||||||
|
'slimVideoMetadataSectionRenderer': _traverse_standard_list,
|
||||||
'singleColumnWatchNextResults': lambda r: (deep_get(r, 'results', 'results', 'contents', default=[]), None),
|
'singleColumnWatchNextResults': lambda r: (deep_get(r, 'results', 'results', 'contents', default=[]), None),
|
||||||
}
|
}
|
||||||
def get_nested_renderer_list_function(key):
|
def get_nested_renderer_list_function(key):
|
||||||
@ -475,8 +478,11 @@ def extract_items_from_renderer(renderer, item_types=_item_types):
|
|||||||
|
|
||||||
renderer = None
|
renderer = None
|
||||||
|
|
||||||
def extract_items(response, item_types=_item_types):
|
def extract_items(response, item_types=_item_types,
|
||||||
|
search_engagement_panels=False):
|
||||||
'''return items, ctoken'''
|
'''return items, ctoken'''
|
||||||
|
items = []
|
||||||
|
ctoken = None
|
||||||
if 'continuationContents' in response:
|
if 'continuationContents' in response:
|
||||||
# sometimes there's another, empty, junk [something]Continuation key
|
# sometimes there's another, empty, junk [something]Continuation key
|
||||||
# find real one
|
# find real one
|
||||||
@ -484,13 +490,23 @@ def extract_items(response, item_types=_item_types):
|
|||||||
'continuationContents', {}).items():
|
'continuationContents', {}).items():
|
||||||
# e.g. commentSectionContinuation, playlistVideoListContinuation
|
# e.g. commentSectionContinuation, playlistVideoListContinuation
|
||||||
if key.endswith('Continuation'):
|
if key.endswith('Continuation'):
|
||||||
items, cont = extract_items_from_renderer({key: renderer_cont},
|
items, ctoken = extract_items_from_renderer(
|
||||||
|
{key: renderer_cont},
|
||||||
item_types=item_types)
|
item_types=item_types)
|
||||||
if items:
|
if items:
|
||||||
return items, cont
|
break
|
||||||
return [], None
|
|
||||||
elif 'contents' in response:
|
elif 'contents' in response:
|
||||||
renderer = get(response, 'contents', {})
|
renderer = get(response, 'contents', {})
|
||||||
return extract_items_from_renderer(renderer, item_types=item_types)
|
items, ctoken = extract_items_from_renderer(
|
||||||
else:
|
renderer,
|
||||||
return [], None
|
item_types=item_types)
|
||||||
|
|
||||||
|
if search_engagement_panels and 'engagementPanels' in response:
|
||||||
|
for engagement_renderer in response['engagementPanels']:
|
||||||
|
additional_items, cont = extract_items_from_renderer(
|
||||||
|
engagement_renderer,
|
||||||
|
item_types=item_types)
|
||||||
|
items += additional_items
|
||||||
|
if cont and not ctoken:
|
||||||
|
ctoken = cont
|
||||||
|
return items, ctoken
|
||||||
|
@ -116,7 +116,72 @@ _formats = {
|
|||||||
'397': {'vcodec': 'av01.0.05M.08'},
|
'397': {'vcodec': 'av01.0.05M.08'},
|
||||||
}
|
}
|
||||||
|
|
||||||
def _extract_metadata_row_info(video_renderer_info):
|
|
||||||
|
def _extract_from_video_information_renderer(renderer_content):
    '''Return title, view_count, unlisted and live scraped from the content
    of a slimVideoInformationRenderer.

    renderer_content may be an empty dict, in which case the keys are still
    present in the returned dict.'''
    # The expanded subtitle is used both for the view count and to detect
    # a live stream (its text contains "watching" in that case)
    expanded_subtitle = extract_str(
        renderer_content.get('expandedSubtitle'), default='')
    title = extract_str(renderer_content.get('title'))
    views = extract_int(expanded_subtitle)
    is_live = 'watching' in expanded_subtitle

    # The video is unlisted if any badge carries the 'Unlisted' label
    badge_labels = [
        deep_get(badge, 'metadataBadgeRenderer', 'label')
        for badge in renderer_content.get('badges', [])
    ]

    return {
        'title': title,
        'view_count': views,
        'unlisted': 'Unlisted' in badge_labels,
        'live': is_live,
    }
|
||||||
|
|
||||||
|
def _extract_likes_dislikes(renderer_content):
    '''Return like_count and dislike_count read from the buttons of a
    slimVideoActionBarRenderer.

    Either count stays None when no matching button is present.'''
    like_count = None
    dislike_count = None

    # key path to the visible button text, and to its accessibility label
    text_path = ('button', 'toggleButtonRenderer', 'defaultText')
    label_path = text_path + ('accessibility', 'accessibilityData', 'label')

    for button in renderer_content.get('buttons', ()):
        toggle = button.get('slimMetadataToggleButtonRenderer', {})

        # the accessibility label carries every digit of the count
        exact = extract_int(deep_get(toggle, *label_path))

        # the visible text is abbreviated, e.g. 53K
        abbreviated = extract_int(extract_str(deep_get(toggle, *text_path)))

        # With zero votes the accessibility label reads "No likes" or
        # "No dislikes" (or similar), while the abbreviated count is 0,
        # so trust the abbreviated one in that case
        count = 0 if abbreviated == 0 else exact

        if 'isLike' in toggle:
            like_count = count
        elif 'isDislike' in toggle:
            dislike_count = count

    return {
        'like_count': like_count,
        'dislike_count': dislike_count,
    }
|
||||||
|
|
||||||
|
def _extract_from_owner_renderer(renderer_content):
    '''Return author and author_id read from the content of a
    slimOwnerRenderer.'''
    author = extract_str(renderer_content.get('title'))
    # the id is the browseId of the renderer's navigation endpoint
    author_id = deep_get(
        renderer_content,
        'navigationEndpoint', 'browseEndpoint', 'browseId')
    return {'author': author, 'author_id': author_id}
|
||||||
|
|
||||||
|
def _extract_from_video_header_renderer(renderer_content):
    '''Return title and time_published read from the content of a
    videoDescriptionHeaderRenderer.'''
    title = extract_str(renderer_content.get('title'))
    # publishDate is converted to text first, then parsed by extract_date
    publish_text = extract_str(renderer_content.get('publishDate'))
    return {
        'title': title,
        'time_published': extract_date(publish_text),
    }
|
||||||
|
|
||||||
|
def _extract_from_description_renderer(renderer_content):
    '''Return the description read from the content of an
    expandableVideoDescriptionRenderer.'''
    body_text = renderer_content.get('descriptionBodyText')
    return {'description': extract_str(body_text, recover_urls=True)}
|
||||||
|
|
||||||
|
def _extract_metadata_row_info(renderer_content):
|
||||||
# extract category and music list
|
# extract category and music list
|
||||||
info = {
|
info = {
|
||||||
'category': None,
|
'category': None,
|
||||||
@ -124,7 +189,7 @@ def _extract_metadata_row_info(video_renderer_info):
|
|||||||
}
|
}
|
||||||
|
|
||||||
current_song = {}
|
current_song = {}
|
||||||
for row in deep_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]):
|
for row in deep_get(renderer_content, 'rows', default=[]):
|
||||||
row_title = extract_str(deep_get(row, 'metadataRowRenderer', 'title'), default='')
|
row_title = extract_str(deep_get(row, 'metadataRowRenderer', 'title'), default='')
|
||||||
row_content = extract_str(deep_get(row, 'metadataRowRenderer', 'contents', 0))
|
row_content = extract_str(deep_get(row, 'metadataRowRenderer', 'contents', 0))
|
||||||
if row_title == 'Category':
|
if row_title == 'Category':
|
||||||
@ -146,18 +211,18 @@ def _extract_metadata_row_info(video_renderer_info):
|
|||||||
|
|
||||||
return info
|
return info
|
||||||
|
|
||||||
|
# Maps the name of a renderer found on the visible page to the function
# that extracts info from that renderer's content. Each function takes the
# renderer content (a dict) and returns a dict to be merged into the info.
visible_extraction_dispatch = {
    # title, view count, unlisted & live status
    'slimVideoInformationRenderer': _extract_from_video_information_renderer,
    # like & dislike counts
    'slimVideoActionBarRenderer': _extract_likes_dislikes,
    # author & author id
    'slimOwnerRenderer': _extract_from_owner_renderer,
    # title & publish date
    'videoDescriptionHeaderRenderer': _extract_from_video_header_renderer,
    # description
    'expandableVideoDescriptionRenderer': _extract_from_description_renderer,
    # category & music list
    'metadataRowContainerRenderer': _extract_metadata_row_info,
}
|
||||||
|
|
||||||
def _extract_watch_info_mobile(top_level):
|
def _extract_watch_info_mobile(top_level):
|
||||||
|
'''Scrapes information from the visible page'''
|
||||||
info = {}
|
info = {}
|
||||||
microformat = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={})
|
|
||||||
|
|
||||||
family_safe = microformat.get('isFamilySafe')
|
|
||||||
if family_safe is None:
|
|
||||||
info['age_restricted'] = None
|
|
||||||
else:
|
|
||||||
info['age_restricted'] = not family_safe
|
|
||||||
info['allowed_countries'] = microformat.get('availableCountries', [])
|
|
||||||
info['time_published'] = microformat.get('publishDate')
|
|
||||||
|
|
||||||
response = top_level.get('response', {})
|
response = top_level.get('response', {})
|
||||||
|
|
||||||
# this renderer has the stuff visible on the page
|
# this renderer has the stuff visible on the page
|
||||||
@ -190,47 +255,22 @@ def _extract_watch_info_mobile(top_level):
|
|||||||
else:
|
else:
|
||||||
info['playlist'] = None
|
info['playlist'] = None
|
||||||
|
|
||||||
# Holds the visible video info. It is inside singleColumnWatchNextResults
|
# use dispatch table to get information scattered in various renderers
|
||||||
# but use our convenience function instead
|
items, _ = extract_items(
|
||||||
items, _ = extract_items(response, item_types={'slimVideoMetadataRenderer'})
|
response,
|
||||||
if items:
|
item_types=visible_extraction_dispatch.keys(),
|
||||||
video_info = items[0]['slimVideoMetadataRenderer']
|
search_engagement_panels=True
|
||||||
else:
|
)
|
||||||
print('Failed to extract video metadata')
|
found = set()
|
||||||
video_info = {}
|
for renderer in items:
|
||||||
|
name, renderer_content = list(renderer.items())[0]
|
||||||
|
found.add(name)
|
||||||
|
info.update(visible_extraction_dispatch[name](renderer_content))
|
||||||
|
# Call the function on blank dict for any that weren't found
|
||||||
|
# so that the empty keys get added
|
||||||
|
for name in visible_extraction_dispatch.keys() - found:
|
||||||
|
info.update(visible_extraction_dispatch[name]({}))
|
||||||
|
|
||||||
info.update(_extract_metadata_row_info(video_info))
|
|
||||||
info['description'] = extract_str(video_info.get('description'), recover_urls=True)
|
|
||||||
info['view_count'] = extract_int(extract_str(video_info.get('expandedSubtitle')))
|
|
||||||
info['author'] = extract_str(deep_get(video_info, 'owner', 'slimOwnerRenderer', 'title'))
|
|
||||||
info['author_id'] = deep_get(video_info, 'owner', 'slimOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId')
|
|
||||||
info['title'] = extract_str(video_info.get('title'))
|
|
||||||
info['live'] = 'watching' in extract_str(video_info.get('expandedSubtitle'), default='')
|
|
||||||
info['unlisted'] = False
|
|
||||||
for badge in video_info.get('badges', []):
|
|
||||||
if deep_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted':
|
|
||||||
info['unlisted'] = True
|
|
||||||
info['like_count'] = None
|
|
||||||
info['dislike_count'] = None
|
|
||||||
if not info['time_published']:
|
|
||||||
info['time_published'] = extract_date(extract_str(video_info.get('dateText', None)))
|
|
||||||
for button in video_info.get('buttons', ()):
|
|
||||||
button_renderer = button.get('slimMetadataToggleButtonRenderer', {})
|
|
||||||
|
|
||||||
# all the digits can be found in the accessibility data
|
|
||||||
count = extract_int(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label'))
|
|
||||||
|
|
||||||
# this count doesn't have all the digits, it's like 53K for instance
|
|
||||||
dumb_count = extract_int(extract_str(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText')))
|
|
||||||
|
|
||||||
# the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0
|
|
||||||
if dumb_count == 0:
|
|
||||||
count = 0
|
|
||||||
|
|
||||||
if 'isLike' in button_renderer:
|
|
||||||
info['like_count'] = count
|
|
||||||
elif 'isDislike' in button_renderer:
|
|
||||||
info['dislike_count'] = count
|
|
||||||
|
|
||||||
# comment section info
|
# comment section info
|
||||||
items, _ = extract_items(response, item_types={
|
items, _ = extract_items(response, item_types={
|
||||||
@ -274,7 +314,6 @@ def _extract_watch_info_desktop(top_level):
|
|||||||
info = {
|
info = {
|
||||||
'comment_count': None,
|
'comment_count': None,
|
||||||
'comments_disabled': None,
|
'comments_disabled': None,
|
||||||
'allowed_countries': [],
|
|
||||||
'limited_state': None,
|
'limited_state': None,
|
||||||
'playlist': None,
|
'playlist': None,
|
||||||
}
|
}
|
||||||
@ -564,6 +603,12 @@ def extract_watch_info(polymer_json):
|
|||||||
liberal_update(info, 'category', mf.get('category'))
|
liberal_update(info, 'category', mf.get('category'))
|
||||||
liberal_update(info, 'time_published', mf.get('publishDate'))
|
liberal_update(info, 'time_published', mf.get('publishDate'))
|
||||||
liberal_update(info, 'time_uploaded', mf.get('uploadDate'))
|
liberal_update(info, 'time_uploaded', mf.get('uploadDate'))
|
||||||
|
family_safe = mf.get('isFamilySafe')
|
||||||
|
if family_safe is None:
|
||||||
|
conservative_update(info, 'age_restricted', None)
|
||||||
|
else:
|
||||||
|
conservative_update(info, 'age_restricted', not family_safe)
|
||||||
|
info['allowed_countries'] = mf.get('availableCountries', [])
|
||||||
|
|
||||||
# other stuff
|
# other stuff
|
||||||
info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
|
info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
|
||||||
|
Loading…
x
Reference in New Issue
Block a user