Fix related videos, like_count, and playlist sometimes being missing
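The cause is that some pages have the onResponseReceivedEndpoints key at the top level with nothing useful in it, and the extract_items function was searching in that key instead of the 'contents' key. The fix changes the elif blocks in the extract_items function to independent if blocks, so that every key present in the response is searched and the items found under each are combined rather than stopping at the first matching key.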
This commit is contained in:
parent
8f9c5eeb48
commit
5594d017e2
@ -109,7 +109,7 @@ def concat_or_none(*strings):
|
|||||||
def remove_redirect(url):
|
def remove_redirect(url):
|
||||||
if url is None:
|
if url is None:
|
||||||
return None
|
return None
|
||||||
if re.fullmatch(r'(((https?:)?//)?(www.)?youtube.com)?/redirect\?.*', url) is not None: # youtube puts these on external links to do tracking
|
if re.fullmatch(r'(((https?:)?//)?(www.)?youtube.com)?/redirect\?.*', url) is not None: # YouTube puts these on external links to do tracking
|
||||||
query_string = url[url.find('?')+1: ]
|
query_string = url[url.find('?')+1: ]
|
||||||
return urllib.parse.parse_qs(query_string)['q'][0]
|
return urllib.parse.parse_qs(query_string)['q'][0]
|
||||||
return url
|
return url
|
||||||
@ -133,11 +133,11 @@ def _recover_urls(runs):
|
|||||||
for run in runs:
|
for run in runs:
|
||||||
url = deep_get(run, 'navigationEndpoint', 'urlEndpoint', 'url')
|
url = deep_get(run, 'navigationEndpoint', 'urlEndpoint', 'url')
|
||||||
text = run.get('text', '')
|
text = run.get('text', '')
|
||||||
# second condition is necessary because youtube makes other things into urls, such as hashtags, which we want to keep as text
|
# second condition is necessary because YouTube makes other things into urls, such as hashtags, which we want to keep as text
|
||||||
if url is not None and (text.startswith('http://') or text.startswith('https://')):
|
if url is not None and (text.startswith('http://') or text.startswith('https://')):
|
||||||
url = remove_redirect(url)
|
url = remove_redirect(url)
|
||||||
run['url'] = url
|
run['url'] = url
|
||||||
run['text'] = url # youtube truncates the url text, use actual url instead
|
run['text'] = url # YouTube truncates the url text, use actual url instead
|
||||||
|
|
||||||
def extract_str(node, default=None, recover_urls=False):
|
def extract_str(node, default=None, recover_urls=False):
|
||||||
'''default is the value returned if the extraction fails. If recover_urls is true, will attempt to fix YouTube's truncation of url text (most prominently seen in descriptions)'''
|
'''default is the value returned if the extraction fails. If recover_urls is true, will attempt to fix YouTube's truncation of url text (most prominently seen in descriptions)'''
|
||||||
@ -569,13 +569,13 @@ def extract_items(response, item_types=_item_types,
|
|||||||
item_types=item_types)
|
item_types=item_types)
|
||||||
if items:
|
if items:
|
||||||
break
|
break
|
||||||
elif ('onResponseReceivedEndpoints' in response
|
if ('onResponseReceivedEndpoints' in response
|
||||||
or 'onResponseReceivedActions' in response):
|
or 'onResponseReceivedActions' in response):
|
||||||
for endpoint in multi_get(response,
|
for endpoint in multi_get(response,
|
||||||
'onResponseReceivedEndpoints',
|
'onResponseReceivedEndpoints',
|
||||||
'onResponseReceivedActions',
|
'onResponseReceivedActions',
|
||||||
[]):
|
[]):
|
||||||
items, ctoken = extract_items_from_renderer_list(
|
new_items, new_ctoken = extract_items_from_renderer_list(
|
||||||
multi_deep_get(
|
multi_deep_get(
|
||||||
endpoint,
|
endpoint,
|
||||||
['reloadContinuationItemsCommand', 'continuationItems'],
|
['reloadContinuationItemsCommand', 'continuationItems'],
|
||||||
@ -584,13 +584,17 @@ def extract_items(response, item_types=_item_types,
|
|||||||
),
|
),
|
||||||
item_types=item_types,
|
item_types=item_types,
|
||||||
)
|
)
|
||||||
if items:
|
items += new_items
|
||||||
break
|
if (not ctoken) or (new_ctoken and new_items):
|
||||||
elif 'contents' in response:
|
ctoken = new_ctoken
|
||||||
|
if 'contents' in response:
|
||||||
renderer = get(response, 'contents', {})
|
renderer = get(response, 'contents', {})
|
||||||
items, ctoken = extract_items_from_renderer(
|
new_items, new_ctoken = extract_items_from_renderer(
|
||||||
renderer,
|
renderer,
|
||||||
item_types=item_types)
|
item_types=item_types)
|
||||||
|
items += new_items
|
||||||
|
if (not ctoken) or (new_ctoken and new_items):
|
||||||
|
ctoken = new_ctoken
|
||||||
|
|
||||||
if search_engagement_panels and 'engagementPanels' in response:
|
if search_engagement_panels and 'engagementPanels' in response:
|
||||||
new_items, new_ctoken = extract_items_from_renderer_list(
|
new_items, new_ctoken = extract_items_from_renderer_list(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user