extract_items: Handle case where continuation has multiple

[something]Continuation renderers, all of which are junk
except one. Check the items in each one until the one which
contains the items being sought is found.
The usage in extract_comments_info needed to be changed to
specify the items being sought. Previously they were left
unspecified, which is strictly incorrect, since extract_items by
default looks for video/playlist/channel thumbnail items. It was
relying on the old special case for continuations, but that
no longer works.
This commit is contained in:
James Taylor 2020-08-11 19:59:25 -07:00
parent 81ff5ab99c
commit fa61874f97
2 changed files with 23 additions and 11 deletions

View File

@ -392,6 +392,13 @@ nested_renderer_list_dispatch = {
'playlistVideoListRenderer': _traverse_standard_list, 'playlistVideoListRenderer': _traverse_standard_list,
'singleColumnWatchNextResults': lambda r: (deep_get(r, 'results', 'results', 'contents', default=[]), None), 'singleColumnWatchNextResults': lambda r: (deep_get(r, 'results', 'results', 'contents', default=[]), None),
} }
def get_nested_renderer_list_function(key):
    '''Return the function that unpacks the nested renderer list stored
    under key, or None if key has no such function.

    Keys registered in nested_renderer_list_dispatch get their registered
    function. Any other key whose name ends in 'Continuation' gets
    _traverse_standard_list.'''
    try:
        return nested_renderer_list_dispatch[key]
    except KeyError:
        # not explicitly registered; fall through to the suffix rule
        pass
    return _traverse_standard_list if key.endswith('Continuation') else None
def extract_items_from_renderer(renderer, item_types=_item_types): def extract_items_from_renderer(renderer, item_types=_item_types):
ctoken = None ctoken = None
items = [] items = []
@ -423,13 +430,13 @@ def extract_items_from_renderer(renderer, item_types=_item_types):
items.append(renderer) items.append(renderer)
# has a list in it, add it to the iter stack # has a list in it, add it to the iter stack
elif key in nested_renderer_list_dispatch: elif get_nested_renderer_list_function(key):
renderer_list, continuation = nested_renderer_list_dispatch[key](value) renderer_list, cont = get_nested_renderer_list_function(key)(value)
if renderer_list: if renderer_list:
iter_stack.append(current_iter) iter_stack.append(current_iter)
current_iter = iter(renderer_list) current_iter = iter(renderer_list)
if continuation: if cont:
ctoken = continuation ctoken = cont
# new renderer nested inside this one # new renderer nested inside this one
elif key in nested_renderer_dispatch: elif key in nested_renderer_dispatch:
@ -441,12 +448,16 @@ def extract_items_from_renderer(renderer, item_types=_item_types):
def extract_items(response, item_types=_item_types): def extract_items(response, item_types=_item_types):
'''return items, ctoken''' '''return items, ctoken'''
if 'continuationContents' in response: if 'continuationContents' in response:
# always has just the one [something]Continuation key, but do this just in case they add some tracking key or something # sometimes there's another, empty, junk [something]Continuation key
for key, renderer_continuation in get(response, 'continuationContents', {}).items(): # find real one
if key.endswith('Continuation'): # e.g. commentSectionContinuation, playlistVideoListContinuation for key, renderer_cont in get(response,
items = multi_get(renderer_continuation, 'contents', 'items', default=[]) 'continuationContents', {}).items():
ctoken = deep_get(renderer_continuation, 'continuations', 0, 'nextContinuationData', 'continuation') # e.g. commentSectionContinuation, playlistVideoListContinuation
return items, ctoken if key.endswith('Continuation'):
items, cont = extract_items_from_renderer({key: renderer_cont},
item_types=item_types)
if items:
return items, cont
return [], None return [], None
elif 'contents' in response: elif 'contents' in response:
renderer = get(response, 'contents', {}) renderer = get(response, 'contents', {})

View File

@ -227,7 +227,8 @@ def extract_comments_info(polymer_json):
info['sort'] = metadata.get('sort') info['sort'] = metadata.get('sort')
info['video_title'] = None info['video_title'] = None
comments, ctoken = extract_items(response) comments, ctoken = extract_items(response,
item_types={'commentThreadRenderer', 'commentRenderer'})
info['comments'] = [] info['comments'] = []
info['ctoken'] = ctoken info['ctoken'] = ctoken
for comment in comments: for comment in comments: