yt_data_extract: normalize thumbnail and author urls

For instance, urls that start with // become https://. An adjustment was required in comments.py because the url was left as a relative url in yt_data_extract by mistake, and the URL_ORIGIN prefix was being used as a workaround. See #31
parent e3c311e10a
commit 75e8930958
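The net effect: every url the extractor hands back is absolute and canonical. Illustrative inputs (not taken from the diff):

    //yt3.ggpht.com/photo.jpg   ->  https://yt3.ggpht.com/photo.jpg
    /watch?v=abcdefghijk        ->  https://www.youtube.com/watch?v=abcdefghijk
    https://youtube.com/feed    ->  https://www.youtube.com/feed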
@@ -90,7 +90,7 @@ def single_comment_ctoken(video_id, comment_id):
 def post_process_comments_info(comments_info):
     for comment in comments_info['comments']:
         comment['author_url'] = concat_or_none(
-            util.URL_ORIGIN, comment['author_url'])
+            '/', comment['author_url'])
         comment['author_avatar'] = concat_or_none(
             settings.img_prefix, comment['author_avatar'])
 
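The switch from util.URL_ORIGIN to '/' works because author_url now arrives as a fully qualified url (e.g. https://www.youtube.com/channel/UC...), and the frontend apparently serves external urls under a leading slash; the URL_ORIGIN prefix was only ever a workaround for the relative url the extractor used to leak. concat_or_none is a util helper; a minimal sketch of its assumed behavior (concatenate the pieces, propagate None), not the project's exact code:

    def concat_or_none(*strings):
        # Assumed: join the arguments, but return None if any argument is None
        result = ''
        for s in strings:
            if s is None:
                return None
            result += s
        return result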
@@ -90,15 +90,20 @@ def remove_redirect(url):
         return urllib.parse.parse_qs(query_string)['q'][0]
     return url
 
-youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$')
+norm_url_re = re.compile(r'^(?:(?:https?:)?//)?((?:[\w-]+\.)+[\w-]+)?(/.*)$')
 def normalize_url(url):
+    '''Insert https, resolve relative paths for youtube.com, and put www. in front of youtube.com'''
     if url is None:
         return None
-    match = youtube_url_re.fullmatch(url)
+    match = norm_url_re.fullmatch(url)
     if match is None:
-        raise Exception()
+        raise Exception(url)
 
-    return 'https://www.youtube.com' + match.group(1)
+    domain = match.group(1) or 'www.youtube.com'
+    if domain == 'youtube.com':
+        domain = 'www.youtube.com'
+
+    return 'https://' + domain + match.group(2)
 
 def _recover_urls(runs):
     for run in runs:
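Dropped into a standalone snippet, the new function behaves as follows (inputs are illustrative; the old youtube_url_re would have mis-prefixed the non-YouTube host in the first case with www.youtube.com rather than handling it):

    import re

    norm_url_re = re.compile(r'^(?:(?:https?:)?//)?((?:[\w-]+\.)+[\w-]+)?(/.*)$')

    def normalize_url(url):
        if url is None:
            return None
        match = norm_url_re.fullmatch(url)
        if match is None:
            raise Exception(url)
        domain = match.group(1) or 'www.youtube.com'   # bare path: assume youtube
        if domain == 'youtube.com':
            domain = 'www.youtube.com'
        return 'https://' + domain + match.group(2)

    assert normalize_url('//yt3.ggpht.com/a/photo.jpg') == 'https://yt3.ggpht.com/a/photo.jpg'
    assert normalize_url('/watch?v=abcdefghijk') == 'https://www.youtube.com/watch?v=abcdefghijk'
    assert normalize_url('https://youtube.com/feed') == 'https://www.youtube.com/feed'
    assert normalize_url('http://www.youtube.com/results?search_query=x') == 'https://www.youtube.com/results?search_query=x'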
@@ -240,11 +245,11 @@ def extract_item_info(item, additional_info={}):
     ))
     info['author_url'] = ('https://www.youtube.com/channel/' + info['author_id']) if info['author_id'] else None
     info['description'] = extract_formatted_text(multi_get(item, 'descriptionSnippet', 'descriptionText'))
-    info['thumbnail'] = multi_deep_get(item,
+    info['thumbnail'] = normalize_url(multi_deep_get(item,
         ['thumbnail', 'thumbnails', 0, 'url'], # videos
         ['thumbnails', 0, 'thumbnails', 0, 'url'], # playlists
         ['thumbnailRenderer', 'showCustomThumbnailRenderer', 'thumbnail', 'thumbnails', 0, 'url'], # shows
-    )
+    ))
 
     info['badges'] = []
     for badge_node in multi_get(item, 'badges', 'ownerBadges', default=()):
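multi_deep_get tries each key path in turn against the three renderer layouts (videos, playlists, shows) and returns the first value found; wrapping it in normalize_url then repairs protocol-relative thumbnails such as //i.ytimg.com/... (example host is illustrative). A simplified reconstruction of the helpers' assumed behavior, not the project's exact code:

    def deep_get(obj, *keys, default=None):
        # Assumed: walk nested dicts/lists by key or index; default on any miss
        for key in keys:
            try:
                obj = obj[key]
            except (KeyError, IndexError, TypeError):
                return default
        return obj

    def multi_deep_get(obj, *key_sequences, default=None):
        # Assumed: try each path in order, return the first hit
        for keys in key_sequences:
            result = deep_get(obj, *keys)
            if result is not None:
                return result
        return default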
@@ -49,10 +49,10 @@ def extract_channel_info(polymer_json, tab):
     if info['short_description'] and len(info['short_description']) > 730:
         info['short_description'] = info['short_description'][0:730] + '...'
     info['channel_name'] = metadata.get('title')
-    info['avatar'] = multi_deep_get(metadata,
+    info['avatar'] = normalize_url(multi_deep_get(metadata,
         ['avatar', 'thumbnails', 0, 'url'],
         ['thumbnail', 'thumbnails', 0, 'url'],
-    )
+    ))
     channel_url = multi_get(metadata, 'urlCanonical', 'channelUrl')
     if channel_url:
         channel_id = get(channel_url.rstrip('/').split('/'), -1)
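The unchanged channel_id line needs no normalization because it only keeps the last path segment. Assuming get is a safe list indexer (an assumption, analogous to dict.get):

    def get(lst, index, default=None):
        # Assumed: indexing that returns default instead of raising
        try:
            return lst[index]
        except (IndexError, TypeError):
            return default

    channel_url = 'https://www.youtube.com/channel/UCabc123/'   # illustrative
    channel_id = get(channel_url.rstrip('/').split('/'), -1)    # -> 'UCabc123'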
@@ -263,13 +263,13 @@ def extract_comments_info(polymer_json):
 
         # These 3 are sometimes absent, likely because the channel was deleted
         comment_info['author'] = extract_str(comment_renderer.get('authorText'))
-        comment_info['author_url'] = deep_get(comment_renderer,
-            'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url')
+        comment_info['author_url'] = normalize_url(deep_get(comment_renderer,
+            'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'))
         comment_info['author_id'] = deep_get(comment_renderer,
             'authorEndpoint', 'browseEndpoint', 'browseId')
 
-        comment_info['author_avatar'] = deep_get(comment_renderer,
-            'authorThumbnail', 'thumbnails', 0, 'url')
+        comment_info['author_avatar'] = normalize_url(deep_get(
+            comment_renderer, 'authorThumbnail', 'thumbnails', 0, 'url'))
         comment_info['id'] = comment_renderer.get('commentId')
         comment_info['text'] = extract_formatted_text(comment_renderer.get('contentText'))
         comment_info['time_published'] = extract_str(comment_renderer.get('publishedTimeText'))
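End to end, with a made-up renderer fragment (using the deep_get sketch above and normalize_url from the diff):

    comment_renderer = {
        'authorEndpoint': {
            'commandMetadata': {'webCommandMetadata': {'url': '/channel/UCabc123'}},
            'browseEndpoint': {'browseId': 'UCabc123'},
        },
        'authorThumbnail': {'thumbnails': [{'url': '//yt3.ggpht.com/avatar.jpg'}]},
    }

    author_url = normalize_url(deep_get(comment_renderer,
        'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'))
    # -> 'https://www.youtube.com/channel/UCabc123'

    author_avatar = normalize_url(deep_get(
        comment_renderer, 'authorThumbnail', 'thumbnails', 0, 'url'))
    # -> 'https://yt3.ggpht.com/avatar.jpg'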