Extraction: Fix url prefixing

James Taylor 2019-09-08 17:20:02 -07:00
parent 1b6fb4e100
commit fb1a3531c5
3 changed files with 19 additions and 8 deletions

View File

@@ -219,8 +219,7 @@ def extract_info(polymer_json, tab):
         else:
             items = contents    # for search
 
-        # TODO: Fix this URL prefixing shit
-        additional_info = {'author': info['channel_name'], 'author_url': '/channel/' + channel_id}
+        additional_info = {'author': info['channel_name'], 'author_url': 'https://www.youtube.com/channel/' + channel_id}
         info['items'] = [yt_data_extract.renderer_info(renderer, additional_info) for renderer in items]
 
     elif tab == 'about':
@@ -258,8 +257,8 @@ def extract_info(polymer_json, tab):
     return info
 
 def post_process_channel_info(info):
-    info['avatar'] = '/' + info['avatar']
-    info['channel_url'] = '/' + info['channel_url']
+    info['avatar'] = util.prefix_url(info['avatar'])
+    info['channel_url'] = util.prefix_url(info['channel_url'])
     for item in info['items']:
         yt_data_extract.prefix_urls(item)
         yt_data_extract.add_extra_html_info(item)
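
As a quick illustration of the new scheme, post_process_channel_info now defers the local '/' prefix to util.prefix_url (the helper added in the second file of this commit). A minimal sketch with made-up field values:

# Illustration only -- the values below are invented.
info = {
    'avatar': '//yt3.ggpht.com/some_avatar.jpg',
    'channel_url': 'channel/UC0000000000000000000000',
    'items': [],
}
post_process_channel_info(info)
# info['avatar']      == '/yt3.ggpht.com/some_avatar.jpg'
# info['channel_url'] == '/channel/UC0000000000000000000000'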

View File

@@ -317,3 +317,7 @@ def uppercase_escape(s):
     return re.sub(
         r'\\U([0-9a-fA-F]{8})',
         lambda m: chr(int(m.group(1), base=16)), s)
+
+def prefix_url(url):
+    url = url.lstrip('/')     # some urls have // before them, which has a special meaning
+    return '/' + url
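
For illustration, here is how the new prefix_url behaves on a few hypothetical inputs; the "special meaning" in the comment is presumably the protocol-relative '//host/path' form, which a browser would resolve against the current scheme and so leave the local site:

# Hypothetical inputs, shown only to illustrate prefix_url above.
prefix_url('channel/UC0000000000000000000000')     # -> '/channel/UC0000000000000000000000'
prefix_url('/channel/UC0000000000000000000000')    # -> '/channel/UC0000000000000000000000' (single slash kept)
prefix_url('//i.ytimg.com/vi/abcdefghijk/mqdefault.jpg')
                                                   # -> '/i.ytimg.com/vi/abcdefghijk/mqdefault.jpg'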

View File

@@ -2,6 +2,7 @@ from youtube import util
 import html
 import json
+import re
 
 
 # videos (all of type str):
@@ -152,15 +153,22 @@ def ajax_info(item_json):
         raise
 
+youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$')
+def normalize_url(url):
+    match = youtube_url_re.fullmatch(url)
+    if match is None:
+        raise Exception()
+    return 'https://www.youtube.com' + match.group(1)
+
 def prefix_urls(item):
     try:
-        item['thumbnail'] = '/' + item['thumbnail'].lstrip('/')
+        item['thumbnail'] = util.prefix_url(item['thumbnail'])
     except KeyError:
         pass
 
     try:
-        item['author_url'] = util.URL_ORIGIN + item['author_url']
+        item['author_url'] = util.prefix_url(item['author_url'])
     except KeyError:
         pass
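
A short sketch of what the new normalize_url accepts and returns; the URLs below are invented and only their shapes matter:

# Hypothetical inputs, illustrating normalize_url above.
normalize_url('https://www.youtube.com/watch?v=abcdefghijk')   # -> unchanged
normalize_url('//www.youtube.com/channel/UC0000000000000000000000')
                                   # -> 'https://www.youtube.com/channel/UC0000000000000000000000'
normalize_url('/user/example')     # -> 'https://www.youtube.com/user/example'
normalize_url('watch?v=abcdefghijk')   # no leading slash: regex does not match, raises Exception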
@@ -219,7 +227,7 @@ def renderer_info(renderer, additional_info={}):
     if 'ownerText' in renderer:
         info['author'] = renderer['ownerText']['runs'][0]['text']
-        info['author_url'] = renderer['ownerText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']
+        info['author_url'] = normalize_url(renderer['ownerText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'])
 
     try:
         overlays = renderer['thumbnailOverlays']
     except KeyError:
@@ -241,7 +249,7 @@ def renderer_info(renderer, additional_info={}):
         if key in ('longBylineText', 'shortBylineText'):
             info['author'] = get_text(node)
             try:
-                info['author_url'] = get_url(node)
+                info['author_url'] = normalize_url(get_url(node))
             except KeyError:
                 pass
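
To make the renderer change concrete, here is an illustrative ownerText fragment and what the new code computes from it; the nested keys mirror the path accessed above, but all values are invented, and normalize_url is the helper added earlier in this file:

# Made-up renderer fragment, shaped like the path accessed above.
renderer = {
    'ownerText': {'runs': [{
        'text': 'Example Channel',
        'navigationEndpoint': {'commandMetadata': {'webCommandMetadata': {
            'url': '/channel/UC0000000000000000000000'}}},
    }]},
}
run = renderer['ownerText']['runs'][0]
info = {}
info['author'] = run['text']
info['author_url'] = normalize_url(
    run['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'])
# info['author_url'] == 'https://www.youtube.com/channel/UC0000000000000000000000'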