yt_data_extract: normalize thumbnail and author urls
For instance, URLs that start with // become https://. An adjustment was required in comments.py because the URL was left as a relative URL in yt_data_extract by mistake, and the URL_ORIGIN prefix was being used as a fix. See #31.
This commit is contained in:
@@ -49,10 +49,10 @@ def extract_channel_info(polymer_json, tab):
|
||||
if info['short_description'] and len(info['short_description']) > 730:
|
||||
info['short_description'] = info['short_description'][0:730] + '...'
|
||||
info['channel_name'] = metadata.get('title')
|
||||
info['avatar'] = multi_deep_get(metadata,
|
||||
info['avatar'] = normalize_url(multi_deep_get(metadata,
|
||||
['avatar', 'thumbnails', 0, 'url'],
|
||||
['thumbnail', 'thumbnails', 0, 'url'],
|
||||
)
|
||||
))
|
||||
channel_url = multi_get(metadata, 'urlCanonical', 'channelUrl')
|
||||
if channel_url:
|
||||
channel_id = get(channel_url.rstrip('/').split('/'), -1)
|
||||
@@ -263,13 +263,13 @@ def extract_comments_info(polymer_json):
|
||||
|
||||
# These 3 are sometimes absent, likely because the channel was deleted
|
||||
comment_info['author'] = extract_str(comment_renderer.get('authorText'))
|
||||
comment_info['author_url'] = deep_get(comment_renderer,
|
||||
'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url')
|
||||
comment_info['author_url'] = normalize_url(deep_get(comment_renderer,
|
||||
'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'))
|
||||
comment_info['author_id'] = deep_get(comment_renderer,
|
||||
'authorEndpoint', 'browseEndpoint', 'browseId')
|
||||
|
||||
comment_info['author_avatar'] = deep_get(comment_renderer,
|
||||
'authorThumbnail', 'thumbnails', 0, 'url')
|
||||
comment_info['author_avatar'] = normalize_url(deep_get(
|
||||
comment_renderer, 'authorThumbnail', 'thumbnails', 0, 'url'))
|
||||
comment_info['id'] = comment_renderer.get('commentId')
|
||||
comment_info['text'] = extract_formatted_text(comment_renderer.get('contentText'))
|
||||
comment_info['time_published'] = extract_str(comment_renderer.get('publishedTimeText'))
|
||||
|
||||
Reference in New Issue
Block a user