Extraction: Extract info from microformat to get views for limited state videos, and as a fallback. Shorten some function names

This commit is contained in:
James Taylor 2019-12-17 16:02:23 -08:00
parent e870eea057
commit 40de1b74ed

View File

@ -832,7 +832,7 @@ def check_missing_keys(object, *key_sequences):
return None return None
def extract_plain_text(node, default=None, recover_urls=False): def extract_str(node, default=None, recover_urls=False):
'''default is the value returned if the extraction fails. If recover_urls is true, will attempt to fix Youtube's truncation of url text (most prominently seen in descriptions)''' '''default is the value returned if the extraction fails. If recover_urls is true, will attempt to fix Youtube's truncation of url text (most prominently seen in descriptions)'''
if isinstance(node, str): if isinstance(node, str):
return node return node
@ -881,7 +881,7 @@ def extract_formatted_text(node):
return [] return []
def extract_integer(string): def extract_int(string):
if not isinstance(string, str): if not isinstance(string, str):
return None return None
match = re.search(r'(\d+)', string.replace(',', '')) match = re.search(r'(\d+)', string.replace(',', ''))
@ -892,11 +892,6 @@ def extract_integer(string):
except ValueError: except ValueError:
return None return None
def update_if_not_none(dictionary, key, value):
    '''Set dictionary[key] to value unless value is None.

    A None value never overwrites an existing entry. If key is not yet
    present, it is stored even when value is None, so the key is
    guaranteed to exist in dictionary after the call.'''
    if key not in dictionary or value is not None:
        dictionary[key] = value
def extract_metadata_row_info(video_renderer_info): def extract_metadata_row_info(video_renderer_info):
# extract category and music list # extract category and music list
info = { info = {
@ -906,8 +901,8 @@ def extract_metadata_row_info(video_renderer_info):
current_song = {} current_song = {}
for row in default_multi_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]): for row in default_multi_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]):
row_title = extract_plain_text(default_multi_get(row, 'metadataRowRenderer', 'title'), default='') row_title = extract_str(default_multi_get(row, 'metadataRowRenderer', 'title'), default='')
row_content = extract_plain_text(default_multi_get(row, 'metadataRowRenderer', 'contents', 0)) row_content = extract_str(default_multi_get(row, 'metadataRowRenderer', 'contents', 0))
if row_title == 'Category': if row_title == 'Category':
info['category'] = row_content info['category'] = row_content
elif row_title in ('Song', 'Music'): elif row_title in ('Song', 'Music'):
@ -962,12 +957,12 @@ def extract_watch_info_mobile(top_level):
video_info = {} video_info = {}
info.update(extract_metadata_row_info(video_info)) info.update(extract_metadata_row_info(video_info))
info['description'] = extract_plain_text(video_info.get('description'), recover_urls=True) info['description'] = extract_str(video_info.get('description'), recover_urls=True)
info['view_count'] = extract_integer(extract_plain_text(video_info.get('expandedSubtitle'))) info['view_count'] = extract_int(extract_str(video_info.get('expandedSubtitle')))
info['author'] = extract_plain_text(default_multi_get(video_info, 'owner', 'slimOwnerRenderer', 'title')) info['author'] = extract_str(default_multi_get(video_info, 'owner', 'slimOwnerRenderer', 'title'))
info['author_id'] = default_multi_get(video_info, 'owner', 'slimOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') info['author_id'] = default_multi_get(video_info, 'owner', 'slimOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId')
info['title'] = extract_plain_text(video_info.get('title')) info['title'] = extract_str(video_info.get('title'))
info['live'] = 'watching' in extract_plain_text(video_info.get('expandedSubtitle')) info['live'] = 'watching' in extract_str(video_info.get('expandedSubtitle'))
info['unlisted'] = False info['unlisted'] = False
for badge in video_info.get('badges', []): for badge in video_info.get('badges', []):
if default_multi_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted': if default_multi_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted':
@ -975,15 +970,15 @@ def extract_watch_info_mobile(top_level):
info['like_count'] = None info['like_count'] = None
info['dislike_count'] = None info['dislike_count'] = None
if not info['published_date']: if not info['published_date']:
info['published_date'] = extract_date(extract_plain_text(video_info.get('dateText', None))) info['published_date'] = extract_date(extract_str(video_info.get('dateText', None)))
for button in video_info.get('buttons', ()): for button in video_info.get('buttons', ()):
button_renderer = button.get('slimMetadataToggleButtonRenderer', {}) button_renderer = button.get('slimMetadataToggleButtonRenderer', {})
# all the digits can be found in the accessibility data # all the digits can be found in the accessibility data
count = extract_integer(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label')) count = extract_int(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label'))
# this count doesn't have all the digits, it's like 53K for instance # this count doesn't have all the digits, it's like 53K for instance
dumb_count = extract_integer(extract_plain_text(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText'))) dumb_count = extract_int(extract_str(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText')))
# the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0 # the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0
if dumb_count == 0: if dumb_count == 0:
@ -998,11 +993,11 @@ def extract_watch_info_mobile(top_level):
items, _ = extract_items(response, item_types={'commentSectionRenderer'}) items, _ = extract_items(response, item_types={'commentSectionRenderer'})
if items: if items:
comment_info = items[0]['commentSectionRenderer'] comment_info = items[0]['commentSectionRenderer']
comment_count_text = extract_plain_text(default_multi_get(comment_info, 'header', 'commentSectionHeaderRenderer', 'countText')) comment_count_text = extract_str(default_multi_get(comment_info, 'header', 'commentSectionHeaderRenderer', 'countText'))
if comment_count_text == 'Comments': # just this with no number, means 0 comments if comment_count_text == 'Comments': # just this with no number, means 0 comments
info['comment_count'] = 0 info['comment_count'] = 0
else: else:
info['comment_count'] = extract_integer(comment_count_text) info['comment_count'] = extract_int(comment_count_text)
info['comments_disabled'] = False info['comments_disabled'] = False
else: # no comment section present means comments are disabled else: # no comment section present means comments are disabled
info['comment_count'] = 0 info['comment_count'] = 0
@ -1028,21 +1023,21 @@ def extract_watch_info_desktop(top_level):
video_info.update(list(renderer.values())[0]) video_info.update(list(renderer.values())[0])
info.update(extract_metadata_row_info(video_info)) info.update(extract_metadata_row_info(video_info))
info['description'] = extract_plain_text(video_info.get('description', None), recover_urls=True) info['description'] = extract_str(video_info.get('description', None), recover_urls=True)
info['published_date'] = extract_date(extract_plain_text(video_info.get('dateText', None))) info['published_date'] = extract_date(extract_str(video_info.get('dateText', None)))
likes_dislikes = default_multi_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/') likes_dislikes = default_multi_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/')
if len(likes_dislikes) == 2: if len(likes_dislikes) == 2:
info['like_count'] = extract_integer(likes_dislikes[0]) info['like_count'] = extract_int(likes_dislikes[0])
info['dislike_count'] = extract_integer(likes_dislikes[1]) info['dislike_count'] = extract_int(likes_dislikes[1])
else: else:
info['like_count'] = None info['like_count'] = None
info['dislike_count'] = None info['dislike_count'] = None
info['title'] = extract_plain_text(video_info.get('title', None)) info['title'] = extract_str(video_info.get('title', None))
info['author'] = extract_plain_text(default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'title')) info['author'] = extract_str(default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'title'))
info['author_id'] = default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') info['author_id'] = default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId')
info['view_count'] = extract_integer(extract_plain_text(default_multi_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount'))) info['view_count'] = extract_int(extract_str(default_multi_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount')))
related = default_multi_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[]) related = default_multi_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[])
info['related_videos'] = [renderer_info(renderer) for renderer in related] info['related_videos'] = [renderer_info(renderer) for renderer in related]
@ -1093,13 +1088,14 @@ def extract_formats(info, player_response):
def extract_playability_error(info, player_response, error_prefix=''): def extract_playability_error(info, player_response, error_prefix=''):
if info['formats']: if info['formats']:
info['playability_status'] = None
info['playability_error'] = None info['playability_error'] = None
return return
playability_status = default_multi_get(player_response, 'playabilityStatus', 'status', default=None) playability_status = default_multi_get(player_response, 'playabilityStatus', 'status', default=None)
info['playability_status'] = playability_status info['playability_status'] = playability_status
playability_reason = extract_plain_text(multi_default_multi_get(player_response, playability_reason = extract_str(multi_default_multi_get(player_response,
['playabilityStatus', 'reason'], ['playabilityStatus', 'reason'],
['playabilityStatus', 'errorScreen', 'playerErrorMessageRenderer', 'reason'], ['playabilityStatus', 'errorScreen', 'playerErrorMessageRenderer', 'reason'],
default='Could not find playability error') default='Could not find playability error')
@ -1110,6 +1106,17 @@ def extract_playability_error(info, player_response, error_prefix=''):
else: else:
info['playability_error'] = error_prefix + 'Unknown playability error' info['playability_error'] = error_prefix + 'Unknown playability error'
def liberal_update(obj, key, value):
    '''Overwrite obj[key] with value whenever value is not None.

    Used to let a preferred source take priority over whatever is
    already stored. A None value never replaces an existing entry, but
    if key is missing it is still created (holding None), so obj[key]
    can always be read afterwards.'''
    if (value is not None) or (key not in obj):
        obj[key] = value
def conservative_update(obj, key, value):
    '''Fill in obj[key] with value only when nothing useful is stored yet.

    The assignment happens if key is absent or obj[key] is None. Any
    other existing value, including falsy ones such as 0, '' or False,
    is kept. If value itself is None, a missing key is still created
    holding None.'''
    if obj.get(key) is None:
        obj[key] = value
SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
def extract_watch_info(polymer_json): def extract_watch_info(polymer_json):
info = {'playability_error': None, 'error': None} info = {'playability_error': None, 'error': None}
@ -1183,19 +1190,33 @@ def extract_watch_info(polymer_json):
else: else:
info.update(extract_watch_info_desktop(top_level)) info.update(extract_watch_info_desktop(top_level))
# stuff from videoDetails # stuff from videoDetails. Use liberal_update to prioritize info from videoDetails over existing info
video_details = default_multi_get(top_level, 'playerResponse', 'videoDetails', default={}) vd = default_multi_get(top_level, 'playerResponse', 'videoDetails', default={})
update_if_not_none(info, 'title', extract_plain_text(video_details.get('title'))) liberal_update(info, 'title', extract_str(vd.get('title')))
update_if_not_none(info, 'duration', extract_integer(video_details.get('lengthSeconds'))) liberal_update(info, 'duration', extract_int(vd.get('lengthSeconds')))
update_if_not_none(info, 'view_count', extract_integer(video_details.get('viewCount'))) liberal_update(info, 'view_count', extract_int(vd.get('viewCount')))
# videos with no description have a blank string # videos with no description have a blank string
update_if_not_none(info, 'description', video_details.get('shortDescription')) liberal_update(info, 'description', vd.get('shortDescription'))
update_if_not_none(info, 'id', video_details.get('videoId')) liberal_update(info, 'id', vd.get('videoId'))
update_if_not_none(info, 'author', video_details.get('author')) liberal_update(info, 'author', vd.get('author'))
update_if_not_none(info, 'author_id', video_details.get('channelId')) liberal_update(info, 'author_id', vd.get('channelId'))
update_if_not_none(info, 'live', video_details.get('isLiveContent')) liberal_update(info, 'live', vd.get('isLiveContent'))
update_if_not_none(info, 'unlisted', not video_details.get('isCrawlable', True)) liberal_update(info, 'unlisted', not vd.get('isCrawlable', True))
update_if_not_none(info, 'tags', video_details.get('keywords', [])) liberal_update(info, 'tags', vd.get('keywords', []))
# fallback stuff from microformat
mf = default_multi_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={})
conservative_update(info, 'title', extract_str(mf.get('title')))
conservative_update(info, 'duration', extract_int(mf.get('lengthSeconds')))
# this gives the view count for limited state videos
conservative_update(info, 'view_count', extract_int(mf.get('viewCount')))
conservative_update(info, 'description', extract_str(mf.get('description'), recover_urls=True))
conservative_update(info, 'author', mf.get('ownerChannelName'))
conservative_update(info, 'author_id', mf.get('externalChannelId'))
conservative_update(info, 'unlisted', mf.get('isUnlisted'))
liberal_update(info, 'category', mf.get('category'))
liberal_update(info, 'published_date', mf.get('publishDate'))
liberal_update(info, 'uploaded_date', mf.get('uploadDate'))
# other stuff # other stuff
info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None