Extraction: Reorder functions within files and prefix internal helper function names with underscores

Move get_caption_url in watch_extraction to the bottom, next to the other exported, public functions
This commit is contained in:
James Taylor 2019-12-19 20:12:37 -08:00
parent d1d908d5b1
commit 4a3529df95
3 changed files with 37 additions and 38 deletions

View File

@ -322,7 +322,7 @@ item_types = {
'channelAboutFullMetadataRenderer', 'channelAboutFullMetadataRenderer',
} }
def traverse_browse_renderer(renderer): def _traverse_browse_renderer(renderer):
for tab in get(renderer, 'tabs', (), types=(list, tuple)): for tab in get(renderer, 'tabs', (), types=(list, tuple)):
tab_renderer = multi_deep_get(tab, ['tabRenderer'], ['expandableTabRenderer'], default=None, types=dict) tab_renderer = multi_deep_get(tab, ['tabRenderer'], ['expandableTabRenderer'], default=None, types=dict)
if tab_renderer is None: if tab_renderer is None:
@ -332,24 +332,24 @@ def traverse_browse_renderer(renderer):
print('Could not find tab with content') print('Could not find tab with content')
return {} return {}
def traverse_standard_list(renderer): def _traverse_standard_list(renderer):
renderer_list = multi_deep_get(renderer, ['contents'], ['items'], default=(), types=(list, tuple)) renderer_list = multi_deep_get(renderer, ['contents'], ['items'], default=(), types=(list, tuple))
continuation = deep_get(renderer, 'continuations', 0, 'nextContinuationData', 'continuation') continuation = deep_get(renderer, 'continuations', 0, 'nextContinuationData', 'continuation')
return renderer_list, continuation return renderer_list, continuation
# these renderers contain one inside them # these renderers contain one inside them
nested_renderer_dispatch = { nested_renderer_dispatch = {
'singleColumnBrowseResultsRenderer': traverse_browse_renderer, 'singleColumnBrowseResultsRenderer': _traverse_browse_renderer,
'twoColumnBrowseResultsRenderer': traverse_browse_renderer, 'twoColumnBrowseResultsRenderer': _traverse_browse_renderer,
'twoColumnSearchResultsRenderer': lambda renderer: get(renderer, 'primaryContents', {}, types=dict), 'twoColumnSearchResultsRenderer': lambda renderer: get(renderer, 'primaryContents', {}, types=dict),
} }
# these renderers contain a list of renderers inside them # these renderers contain a list of renderers inside them
nested_renderer_list_dispatch = { nested_renderer_list_dispatch = {
'sectionListRenderer': traverse_standard_list, 'sectionListRenderer': _traverse_standard_list,
'itemSectionRenderer': traverse_standard_list, 'itemSectionRenderer': _traverse_standard_list,
'gridRenderer': traverse_standard_list, 'gridRenderer': _traverse_standard_list,
'playlistVideoListRenderer': traverse_standard_list, 'playlistVideoListRenderer': _traverse_standard_list,
'singleColumnWatchNextResults': lambda r: (deep_get(r, 'results', 'results', 'contents', default=[], types=(list, tuple)), None), 'singleColumnWatchNextResults': lambda r: (deep_get(r, 'results', 'results', 'contents', default=[], types=(list, tuple)), None),
} }
@ -411,6 +411,5 @@ def extract_items(response, item_types=item_types):
current_iter = iter_stack.pop() # go back up the stack current_iter = iter_stack.pop() # go back up the stack
except IndexError: except IndexError:
return items, ctoken return items, ctoken
else: else:
return [], None return [], None

View File

@ -192,7 +192,7 @@ def extract_playlist_info(polymer_json):
return info return info
def ctoken_metadata(ctoken): def _ctoken_metadata(ctoken):
result = dict() result = dict()
params = proto.parse(proto.b64_to_bytes(ctoken)) params = proto.parse(proto.b64_to_bytes(ctoken))
result['video_id'] = proto.parse(params[2])[2].decode('ascii') result['video_id'] = proto.parse(params[2])[2].decode('ascii')
@ -220,7 +220,7 @@ def extract_comments_info(polymer_json):
url = multi_deep_get(polymer_json, [1, 'url'], ['url']) url = multi_deep_get(polymer_json, [1, 'url'], ['url'])
if url: if url:
ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0] ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0]
metadata = ctoken_metadata(ctoken) metadata = _ctoken_metadata(ctoken)
else: else:
metadata = {} metadata = {}
info['video_id'] = metadata.get('video_id') info['video_id'] = metadata.get('video_id')

View File

@ -115,7 +115,7 @@ _formats = {
'397': {'vcodec': 'av01.0.05M.08'}, '397': {'vcodec': 'av01.0.05M.08'},
} }
def extract_metadata_row_info(video_renderer_info): def _extract_metadata_row_info(video_renderer_info):
# extract category and music list # extract category and music list
info = { info = {
'category': None, 'category': None,
@ -145,7 +145,7 @@ def extract_metadata_row_info(video_renderer_info):
return info return info
def extract_watch_info_mobile(top_level): def _extract_watch_info_mobile(top_level):
info = {} info = {}
microformat = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={}) microformat = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={})
@ -167,7 +167,7 @@ def extract_watch_info_mobile(top_level):
print('Failed to extract video metadata') print('Failed to extract video metadata')
video_info = {} video_info = {}
info.update(extract_metadata_row_info(video_info)) info.update(_extract_metadata_row_info(video_info))
info['description'] = extract_str(video_info.get('description'), recover_urls=True) info['description'] = extract_str(video_info.get('description'), recover_urls=True)
info['view_count'] = extract_int(extract_str(video_info.get('expandedSubtitle'))) info['view_count'] = extract_int(extract_str(video_info.get('expandedSubtitle')))
info['author'] = extract_str(deep_get(video_info, 'owner', 'slimOwnerRenderer', 'title')) info['author'] = extract_str(deep_get(video_info, 'owner', 'slimOwnerRenderer', 'title'))
@ -228,7 +228,7 @@ def extract_watch_info_mobile(top_level):
return info return info
month_abbreviations = {'jan':'1', 'feb':'2', 'mar':'3', 'apr':'4', 'may':'5', 'jun':'6', 'jul':'7', 'aug':'8', 'sep':'9', 'oct':'10', 'nov':'11', 'dec':'12'} month_abbreviations = {'jan':'1', 'feb':'2', 'mar':'3', 'apr':'4', 'may':'5', 'jun':'6', 'jul':'7', 'aug':'8', 'sep':'9', 'oct':'10', 'nov':'11', 'dec':'12'}
def extract_watch_info_desktop(top_level): def _extract_watch_info_desktop(top_level):
info = { info = {
'comment_count': None, 'comment_count': None,
'comments_disabled': None, 'comments_disabled': None,
@ -241,7 +241,7 @@ def extract_watch_info_desktop(top_level):
if renderer and list(renderer.keys())[0] in ('videoPrimaryInfoRenderer', 'videoSecondaryInfoRenderer'): if renderer and list(renderer.keys())[0] in ('videoPrimaryInfoRenderer', 'videoSecondaryInfoRenderer'):
video_info.update(list(renderer.values())[0]) video_info.update(list(renderer.values())[0])
info.update(extract_metadata_row_info(video_info)) info.update(_extract_metadata_row_info(video_info))
info['description'] = extract_str(video_info.get('description', None), recover_urls=True) info['description'] = extract_str(video_info.get('description', None), recover_urls=True)
info['time_published'] = extract_date(extract_str(video_info.get('dateText', None))) info['time_published'] = extract_date(extract_str(video_info.get('dateText', None)))
@ -263,21 +263,7 @@ def extract_watch_info_desktop(top_level):
return info return info
def get_caption_url(info, language, format, automatic=False, translation_language=None): def _extract_formats(info, player_response):
'''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. If automatic is true and translation_language is given, the automatic captions will be translated.'''
url = info['_captions_base_url']
url += '&lang=' + language
url += '&fmt=' + format
if automatic:
url += '&kind=asr'
elif language in info['_manual_caption_language_names']:
url += '&name=' + urllib.parse.quote(info['_manual_caption_language_names'][language], safe='')
if translation_language:
url += '&tlang=' + translation_language
return url
def extract_formats(info, player_response):
streaming_data = player_response.get('streamingData', {}) streaming_data = player_response.get('streamingData', {})
yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', []) yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', [])
@ -305,7 +291,7 @@ def extract_formats(info, player_response):
info['formats'].append(fmt) info['formats'].append(fmt)
def extract_playability_error(info, player_response, error_prefix=''): def _extract_playability_error(info, player_response, error_prefix=''):
if info['formats']: if info['formats']:
info['playability_status'] = None info['playability_status'] = None
info['playability_error'] = None info['playability_error'] = None
@ -379,10 +365,10 @@ def extract_watch_info(polymer_json):
print('WARNING: Found non-translatable caption language') print('WARNING: Found non-translatable caption language')
# formats # formats
extract_formats(info, player_response) _extract_formats(info, player_response)
# playability errors # playability errors
extract_playability_error(info, player_response) _extract_playability_error(info, player_response)
# check age-restriction # check age-restriction
info['age_restricted'] = (info['playability_status'] == 'LOGIN_REQUIRED' and info['playability_error'] and ' age' in info['playability_error']) info['age_restricted'] = (info['playability_status'] == 'LOGIN_REQUIRED' and info['playability_error'] and ' age' in info['playability_error'])
@ -394,9 +380,9 @@ def extract_watch_info(polymer_json):
mobile = 'singleColumnWatchNextResults' in deep_get(top_level, 'response', 'contents', default={}) mobile = 'singleColumnWatchNextResults' in deep_get(top_level, 'response', 'contents', default={})
if mobile: if mobile:
info.update(extract_watch_info_mobile(top_level)) info.update(_extract_watch_info_mobile(top_level))
else: else:
info.update(extract_watch_info_desktop(top_level)) info.update(_extract_watch_info_desktop(top_level))
# stuff from videoDetails. Use liberal_update to prioritize info from videoDetails over existing info # stuff from videoDetails. Use liberal_update to prioritize info from videoDetails over existing info
vd = deep_get(top_level, 'playerResponse', 'videoDetails', default={}) vd = deep_get(top_level, 'playerResponse', 'videoDetails', default={})
@ -430,6 +416,20 @@ def extract_watch_info(polymer_json):
info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
return info return info
def get_caption_url(info, language, format, automatic=False, translation_language=None):
    '''Gets the url for captions with the given language and format.

    If automatic is True, get the automatic captions for that language.
    If translation_language is given, translate the captions from
    `language` to `translation_language`. If automatic is true and
    translation_language is given, the automatic captions will be
    translated.

    info must contain '_captions_base_url' (the url the query parameters
    are appended to) and '_manual_caption_language_names' (a mapping from
    language to the name of its manual caption track).

    All values placed in the query string are percent-encoded, so a value
    containing characters such as '&' or '=' cannot corrupt the url or
    inject additional parameters.
    '''
    # safe='' so that '/' is escaped too, matching how the name is escaped
    def escape(value):
        return urllib.parse.quote(value, safe='')

    url = info['_captions_base_url']
    url += '&lang=' + escape(language)
    url += '&fmt=' + escape(format)
    if automatic:
        url += '&kind=asr'
    elif language in info['_manual_caption_language_names']:
        # manual caption tracks are selected by their name as well
        url += '&name=' + escape(info['_manual_caption_language_names'][language])

    if translation_language:
        url += '&tlang=' + escape(translation_language)
    return url
def update_with_age_restricted_info(info, video_info_page): def update_with_age_restricted_info(info, video_info_page):
ERROR_PREFIX = 'Error bypassing age-restriction: ' ERROR_PREFIX = 'Error bypassing age-restriction: '
@ -445,5 +445,5 @@ def update_with_age_restricted_info(info, video_info_page):
info['playability_error'] = ERROR_PREFIX + 'Failed to parse json response' info['playability_error'] = ERROR_PREFIX + 'Failed to parse json response'
return return
extract_formats(info, player_response) _extract_formats(info, player_response)
extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX) _extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX)