Fix related videos, like_count, and playlist sometimes missing

The cause is that some pages have the onResponseReceivedEndpoints key
at the top level with nothing useful in it, and the extract_items
function was searching in that key instead of the 'contents' key.

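Change the extract_items function to use independent if blocks instead
of an elif chain, so that every key present in the response is searched
and the items found under each one are accumulated.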
This commit is contained in:
Jesus 2023-09-11 04:13:48 +08:00
parent 8f9c5eeb48
commit 5594d017e2
No known key found for this signature in database
GPG Key ID: E607CE7149F4D71C

View File

@ -109,7 +109,7 @@ def concat_or_none(*strings):
def remove_redirect(url): def remove_redirect(url):
if url is None: if url is None:
return None return None
if re.fullmatch(r'(((https?:)?//)?(www.)?youtube.com)?/redirect\?.*', url) is not None: # youtube puts these on external links to do tracking if re.fullmatch(r'(((https?:)?//)?(www.)?youtube.com)?/redirect\?.*', url) is not None: # YouTube puts these on external links to do tracking
query_string = url[url.find('?')+1: ] query_string = url[url.find('?')+1: ]
return urllib.parse.parse_qs(query_string)['q'][0] return urllib.parse.parse_qs(query_string)['q'][0]
return url return url
@ -133,11 +133,11 @@ def _recover_urls(runs):
for run in runs: for run in runs:
url = deep_get(run, 'navigationEndpoint', 'urlEndpoint', 'url') url = deep_get(run, 'navigationEndpoint', 'urlEndpoint', 'url')
text = run.get('text', '') text = run.get('text', '')
# second condition is necessary because youtube makes other things into urls, such as hashtags, which we want to keep as text # second condition is necessary because YouTube makes other things into urls, such as hashtags, which we want to keep as text
if url is not None and (text.startswith('http://') or text.startswith('https://')): if url is not None and (text.startswith('http://') or text.startswith('https://')):
url = remove_redirect(url) url = remove_redirect(url)
run['url'] = url run['url'] = url
run['text'] = url # youtube truncates the url text, use actual url instead run['text'] = url # YouTube truncates the url text, use actual url instead
def extract_str(node, default=None, recover_urls=False): def extract_str(node, default=None, recover_urls=False):
'''default is the value returned if the extraction fails. If recover_urls is true, will attempt to fix YouTube's truncation of url text (most prominently seen in descriptions)''' '''default is the value returned if the extraction fails. If recover_urls is true, will attempt to fix YouTube's truncation of url text (most prominently seen in descriptions)'''
@ -569,13 +569,13 @@ def extract_items(response, item_types=_item_types,
item_types=item_types) item_types=item_types)
if items: if items:
break break
elif ('onResponseReceivedEndpoints' in response if ('onResponseReceivedEndpoints' in response
or 'onResponseReceivedActions' in response): or 'onResponseReceivedActions' in response):
for endpoint in multi_get(response, for endpoint in multi_get(response,
'onResponseReceivedEndpoints', 'onResponseReceivedEndpoints',
'onResponseReceivedActions', 'onResponseReceivedActions',
[]): []):
items, ctoken = extract_items_from_renderer_list( new_items, new_ctoken = extract_items_from_renderer_list(
multi_deep_get( multi_deep_get(
endpoint, endpoint,
['reloadContinuationItemsCommand', 'continuationItems'], ['reloadContinuationItemsCommand', 'continuationItems'],
@ -584,13 +584,17 @@ def extract_items(response, item_types=_item_types,
), ),
item_types=item_types, item_types=item_types,
) )
if items: items += new_items
break if (not ctoken) or (new_ctoken and new_items):
elif 'contents' in response: ctoken = new_ctoken
if 'contents' in response:
renderer = get(response, 'contents', {}) renderer = get(response, 'contents', {})
items, ctoken = extract_items_from_renderer( new_items, new_ctoken = extract_items_from_renderer(
renderer, renderer,
item_types=item_types) item_types=item_types)
items += new_items
if (not ctoken) or (new_ctoken and new_items):
ctoken = new_ctoken
if search_engagement_panels and 'engagementPanels' in response: if search_engagement_panels and 'engagementPanels' in response:
new_items, new_ctoken = extract_items_from_renderer_list( new_items, new_ctoken = extract_items_from_renderer_list(