Extraction: refactor response extraction to work with both mobile & desktop responses; also improve errors

This commit is contained in:
James Taylor 2019-09-18 21:39:53 -07:00
parent 89e5761f8d
commit dc6c370152
5 changed files with 54 additions and 23 deletions

View File

@ -186,8 +186,8 @@ def get_channel_page(channel_id, tab='videos'):
info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab) info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab)
if info['errors']: if info['error']:
return flask.render_template('error.html', error_message = '\n'.join(info['errors'])) return flask.render_template('error.html', error_message = info['error'])
post_process_channel_info(info) post_process_channel_info(info)
if tab in ('videos', 'search'): if tab in ('videos', 'search'):
info['number_of_videos'] = number_of_videos info['number_of_videos'] = number_of_videos
@ -228,8 +228,8 @@ def get_channel_page_general_url(base_url, tab, request):
info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab) info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab)
if info['errors']: if info['error']:
return flask.render_template('error.html', error_message = '\n'.join(info['errors'])) return flask.render_template('error.html', error_message = info['error'])
post_process_channel_info(info) post_process_channel_info(info)
if tab in ('videos', 'search'): if tab in ('videos', 'search'):

View File

@ -91,6 +91,9 @@ def get_playlist_page():
first_page_json, this_page_json = tasks[0].value, tasks[1].value first_page_json, this_page_json = tasks[0].value, tasks[1].value
info = yt_data_extract.extract_playlist_info(this_page_json) info = yt_data_extract.extract_playlist_info(this_page_json)
if info['error']:
return flask.render_template('error.html', error_message = info['error'])
if page != '1': if page != '1':
info['metadata'] = yt_data_extract.extract_playlist_metadata(first_page_json) info['metadata'] = yt_data_extract.extract_playlist_metadata(first_page_json)

View File

@ -76,6 +76,9 @@ def get_search_page():
polymer_json = get_search_json(query, page, autocorrect, sort, filters) polymer_json = get_search_json(query, page, autocorrect, sort, filters)
search_info = yt_data_extract.extract_search_info(polymer_json) search_info = yt_data_extract.extract_search_info(polymer_json)
if search_info['error']:
return flask.render_template('error.html', error_message = search_info['error'])
for item_info in search_info['items']: for item_info in search_info['items']:
yt_data_extract.prefix_urls(item_info) yt_data_extract.prefix_urls(item_info)
yt_data_extract.add_extra_html_info(item_info) yt_data_extract.add_extra_html_info(item_info)

View File

@ -456,8 +456,8 @@ def _get_upstream_videos(channel_id):
traceback.print_exc() traceback.print_exc()
channel_info = yt_data_extract.extract_channel_info(json.loads(channel_tab), 'videos') channel_info = yt_data_extract.extract_channel_info(json.loads(channel_tab), 'videos')
if channel_info['errors']: if channel_info['error']:
print('Error checking channel ' + channel_status_name + ': ' + ', '.join(channel_info['errors'])) print('Error checking channel ' + channel_status_name + ': ' + channel_info['error'])
return return
videos = channel_info['items'] videos = channel_info['items']

View File

@ -280,10 +280,29 @@ def parse_info_prepare_for_html(renderer, additional_info={}):
return item return item
def get_response(polymer_json):
'''return response, error'''
# responses returned for desktop version
try:
return polymer_json[1]['response'], None
except (TypeError, KeyError, IndexError):
pass
# responses returned for mobile version
try:
return polymer_json['response'], None
except (TypeError, KeyError):
pass
return None, 'Failed to extract response'
def extract_channel_info(polymer_json, tab): def extract_channel_info(polymer_json, tab):
info = {'errors': []} response, err = get_response(polymer_json)
response = polymer_json[1]['response'] if err:
return {'error': err}
try: try:
microformat = response['microformat']['microformatDataRenderer'] microformat = response['microformat']['microformatDataRenderer']
@ -291,18 +310,14 @@ def extract_channel_info(polymer_json, tab):
# example terminated channel: https://www.youtube.com/channel/UCnKJeK_r90jDdIuzHXC0Org # example terminated channel: https://www.youtube.com/channel/UCnKJeK_r90jDdIuzHXC0Org
except KeyError: except KeyError:
if 'alerts' in response and len(response['alerts']) > 0: if 'alerts' in response and len(response['alerts']) > 0:
for alert in response['alerts']: return {'error': ' '.join(alert['alertRenderer']['text']['simpleText'] for alert in response['alerts']) }
info['errors'].append(alert['alertRenderer']['text']['simpleText'])
return info
elif 'errors' in response['responseContext']: elif 'errors' in response['responseContext']:
for error in response['responseContext']['errors']['error']: for error in response['responseContext']['errors']['error']:
if error['code'] == 'INVALID_VALUE' and error['location'] == 'browse_id': if error['code'] == 'INVALID_VALUE' and error['location'] == 'browse_id':
info['errors'].append('This channel does not exist') return {'error': 'This channel does not exist'}
return info return {'error': 'Failure getting microformat'}
info['errors'].append('Failure getting microformat')
return info
info = {'error': None}
info['current_tab'] = tab info['current_tab'] = tab
@ -402,13 +417,16 @@ def extract_channel_info(polymer_json, tab):
return info return info
def extract_search_info(polymer_json): def extract_search_info(polymer_json):
info = {} response, err = get_response(polymer_json)
info['estimated_results'] = int(polymer_json[1]['response']['estimatedResults']) if err:
return {'error': err}
info = {'error': None}
info['estimated_results'] = int(response['estimatedResults'])
info['estimated_pages'] = ceil(info['estimated_results']/20) info['estimated_pages'] = ceil(info['estimated_results']/20)
# almost always is the first "section", but if there's an advertisement for a google product like Stadia or Home in the search results, then that becomes the first "section" and the search results are in the second. So just join all of them for resiliency # almost always is the first "section", but if there's an advertisement for a google product like Stadia or Home in the search results, then that becomes the first "section" and the search results are in the second. So just join all of them for resiliency
results = [] results = []
for section in polymer_json[1]['response']['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents']: for section in response['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents']:
results += section['itemSectionRenderer']['contents'] results += section['itemSectionRenderer']['contents']
info['items'] = [] info['items'] = []
@ -451,7 +469,11 @@ def extract_search_info(polymer_json):
return info return info
def extract_playlist_metadata(polymer_json): def extract_playlist_metadata(polymer_json):
metadata = renderer_info(polymer_json['response']['header']) response, err = get_response(polymer_json)
if err:
return {'error': err}
metadata = renderer_info(response['header'])
metadata['error'] = None
if 'description' not in metadata: if 'description' not in metadata:
metadata['description'] = '' metadata['description'] = ''
@ -461,12 +483,15 @@ def extract_playlist_metadata(polymer_json):
return metadata return metadata
def extract_playlist_info(polymer_json): def extract_playlist_info(polymer_json):
info = {} response, err = get_response(polymer_json)
if err:
return {'error': err}
info = {'error': None}
try: # first page try: # first page
video_list = polymer_json['response']['contents']['singleColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['playlistVideoListRenderer']['contents'] video_list = response['contents']['singleColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['playlistVideoListRenderer']['contents']
first_page = True first_page = True
except KeyError: # other pages except KeyError: # other pages
video_list = polymer_json['response']['continuationContents']['playlistVideoListContinuation']['contents'] video_list = response['continuationContents']['playlistVideoListContinuation']['contents']
first_page = False first_page = False
info['items'] = [renderer_info(renderer) for renderer in video_list] info['items'] = [renderer_info(renderer) for renderer in video_list]