Extraction: refactor response extraction to work with both mobile & desktop responses; also improve errors

This commit is contained in:
James Taylor 2019-09-18 21:39:53 -07:00
parent 89e5761f8d
commit dc6c370152
5 changed files with 54 additions and 23 deletions

View File

@ -186,8 +186,8 @@ def get_channel_page(channel_id, tab='videos'):
info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab) info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab)
if info['errors']: if info['error']:
return flask.render_template('error.html', error_message = '\n'.join(info['errors'])) return flask.render_template('error.html', error_message = info['error'])
post_process_channel_info(info) post_process_channel_info(info)
if tab in ('videos', 'search'): if tab in ('videos', 'search'):
info['number_of_videos'] = number_of_videos info['number_of_videos'] = number_of_videos
@ -228,8 +228,8 @@ def get_channel_page_general_url(base_url, tab, request):
info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab) info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab)
if info['errors']: if info['error']:
return flask.render_template('error.html', error_message = '\n'.join(info['errors'])) return flask.render_template('error.html', error_message = info['error'])
post_process_channel_info(info) post_process_channel_info(info)
if tab in ('videos', 'search'): if tab in ('videos', 'search'):

View File

@ -91,6 +91,9 @@ def get_playlist_page():
first_page_json, this_page_json = tasks[0].value, tasks[1].value first_page_json, this_page_json = tasks[0].value, tasks[1].value
info = yt_data_extract.extract_playlist_info(this_page_json) info = yt_data_extract.extract_playlist_info(this_page_json)
if info['error']:
return flask.render_template('error.html', error_message = info['error'])
if page != '1': if page != '1':
info['metadata'] = yt_data_extract.extract_playlist_metadata(first_page_json) info['metadata'] = yt_data_extract.extract_playlist_metadata(first_page_json)

View File

@ -76,6 +76,9 @@ def get_search_page():
polymer_json = get_search_json(query, page, autocorrect, sort, filters) polymer_json = get_search_json(query, page, autocorrect, sort, filters)
search_info = yt_data_extract.extract_search_info(polymer_json) search_info = yt_data_extract.extract_search_info(polymer_json)
if search_info['error']:
return flask.render_template('error.html', error_message = search_info['error'])
for item_info in search_info['items']: for item_info in search_info['items']:
yt_data_extract.prefix_urls(item_info) yt_data_extract.prefix_urls(item_info)
yt_data_extract.add_extra_html_info(item_info) yt_data_extract.add_extra_html_info(item_info)

View File

@ -456,8 +456,8 @@ def _get_upstream_videos(channel_id):
traceback.print_exc() traceback.print_exc()
channel_info = yt_data_extract.extract_channel_info(json.loads(channel_tab), 'videos') channel_info = yt_data_extract.extract_channel_info(json.loads(channel_tab), 'videos')
if channel_info['errors']: if channel_info['error']:
print('Error checking channel ' + channel_status_name + ': ' + ', '.join(channel_info['errors'])) print('Error checking channel ' + channel_status_name + ': ' + channel_info['error'])
return return
videos = channel_info['items'] videos = channel_info['items']

View File

@ -280,10 +280,29 @@ def parse_info_prepare_for_html(renderer, additional_info={}):
return item return item
def get_response(polymer_json):
'''return response, error'''
# responses returned for desktop version
try:
return polymer_json[1]['response'], None
except (TypeError, KeyError, IndexError):
pass
# responses returned for mobile version
try:
return polymer_json['response'], None
except (TypeError, KeyError):
pass
return None, 'Failed to extract response'
def extract_channel_info(polymer_json, tab): def extract_channel_info(polymer_json, tab):
info = {'errors': []} response, err = get_response(polymer_json)
response = polymer_json[1]['response'] if err:
return {'error': err}
try: try:
microformat = response['microformat']['microformatDataRenderer'] microformat = response['microformat']['microformatDataRenderer']
@ -291,18 +310,14 @@ def extract_channel_info(polymer_json, tab):
# example terminated channel: https://www.youtube.com/channel/UCnKJeK_r90jDdIuzHXC0Org # example terminated channel: https://www.youtube.com/channel/UCnKJeK_r90jDdIuzHXC0Org
except KeyError: except KeyError:
if 'alerts' in response and len(response['alerts']) > 0: if 'alerts' in response and len(response['alerts']) > 0:
for alert in response['alerts']: return {'error': ' '.join(alert['alertRenderer']['text']['simpleText'] for alert in response['alerts']) }
info['errors'].append(alert['alertRenderer']['text']['simpleText'])
return info
elif 'errors' in response['responseContext']: elif 'errors' in response['responseContext']:
for error in response['responseContext']['errors']['error']: for error in response['responseContext']['errors']['error']:
if error['code'] == 'INVALID_VALUE' and error['location'] == 'browse_id': if error['code'] == 'INVALID_VALUE' and error['location'] == 'browse_id':
info['errors'].append('This channel does not exist') return {'error': 'This channel does not exist'}
return info return {'error': 'Failure getting microformat'}
info['errors'].append('Failure getting microformat')
return info
info = {'error': None}
info['current_tab'] = tab info['current_tab'] = tab
@ -402,13 +417,16 @@ def extract_channel_info(polymer_json, tab):
return info return info
def extract_search_info(polymer_json): def extract_search_info(polymer_json):
info = {} response, err = get_response(polymer_json)
info['estimated_results'] = int(polymer_json[1]['response']['estimatedResults']) if err:
return {'error': err}
info = {'error': None}
info['estimated_results'] = int(response['estimatedResults'])
info['estimated_pages'] = ceil(info['estimated_results']/20) info['estimated_pages'] = ceil(info['estimated_results']/20)
# almost always is the first "section", but if there's an advertisement for a google product like Stadia or Home in the search results, then that becomes the first "section" and the search results are in the second. So just join all of them for resiliency # almost always is the first "section", but if there's an advertisement for a google product like Stadia or Home in the search results, then that becomes the first "section" and the search results are in the second. So just join all of them for resiliency
results = [] results = []
for section in polymer_json[1]['response']['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents']: for section in response['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents']:
results += section['itemSectionRenderer']['contents'] results += section['itemSectionRenderer']['contents']
info['items'] = [] info['items'] = []
@ -451,7 +469,11 @@ def extract_search_info(polymer_json):
return info return info
def extract_playlist_metadata(polymer_json): def extract_playlist_metadata(polymer_json):
metadata = renderer_info(polymer_json['response']['header']) response, err = get_response(polymer_json)
if err:
return {'error': err}
metadata = renderer_info(response['header'])
metadata['error'] = None
if 'description' not in metadata: if 'description' not in metadata:
metadata['description'] = '' metadata['description'] = ''
@ -461,12 +483,15 @@ def extract_playlist_metadata(polymer_json):
return metadata return metadata
def extract_playlist_info(polymer_json): def extract_playlist_info(polymer_json):
info = {} response, err = get_response(polymer_json)
if err:
return {'error': err}
info = {'error': None}
try: # first page try: # first page
video_list = polymer_json['response']['contents']['singleColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['playlistVideoListRenderer']['contents'] video_list = response['contents']['singleColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['playlistVideoListRenderer']['contents']
first_page = True first_page = True
except KeyError: # other pages except KeyError: # other pages
video_list = polymer_json['response']['continuationContents']['playlistVideoListContinuation']['contents'] video_list = response['continuationContents']['playlistVideoListContinuation']['contents']
first_page = False first_page = False
info['items'] = [renderer_info(renderer) for renderer in video_list] info['items'] = [renderer_info(renderer) for renderer in video_list]