Extraction: Fix url prefixing
This commit is contained in:
parent
1b6fb4e100
commit
fb1a3531c5
@ -219,8 +219,7 @@ def extract_info(polymer_json, tab):
|
|||||||
else:
|
else:
|
||||||
items = contents # for search
|
items = contents # for search
|
||||||
|
|
||||||
# TODO: Fix this URL prefixing shit
|
additional_info = {'author': info['channel_name'], 'author_url': 'https://www.youtube.com/channel/' + channel_id}
|
||||||
additional_info = {'author': info['channel_name'], 'author_url': '/channel/' + channel_id}
|
|
||||||
info['items'] = [yt_data_extract.renderer_info(renderer, additional_info) for renderer in items]
|
info['items'] = [yt_data_extract.renderer_info(renderer, additional_info) for renderer in items]
|
||||||
|
|
||||||
elif tab == 'about':
|
elif tab == 'about':
|
||||||
@ -258,8 +257,8 @@ def extract_info(polymer_json, tab):
|
|||||||
return info
|
return info
|
||||||
|
|
||||||
def post_process_channel_info(info):
|
def post_process_channel_info(info):
|
||||||
info['avatar'] = '/' + info['avatar']
|
info['avatar'] = util.prefix_url(info['avatar'])
|
||||||
info['channel_url'] = '/' + info['channel_url']
|
info['channel_url'] = util.prefix_url(info['channel_url'])
|
||||||
for item in info['items']:
|
for item in info['items']:
|
||||||
yt_data_extract.prefix_urls(item)
|
yt_data_extract.prefix_urls(item)
|
||||||
yt_data_extract.add_extra_html_info(item)
|
yt_data_extract.add_extra_html_info(item)
|
||||||
|
@ -317,3 +317,7 @@ def uppercase_escape(s):
|
|||||||
return re.sub(
|
return re.sub(
|
||||||
r'\\U([0-9a-fA-F]{8})',
|
r'\\U([0-9a-fA-F]{8})',
|
||||||
lambda m: chr(int(m.group(1), base=16)), s)
|
lambda m: chr(int(m.group(1), base=16)), s)
|
||||||
|
|
||||||
|
def prefix_url(url):
    """Return *url* rewritten so that it begins with exactly one ``/``.

    Every leading slash is stripped before the single ``/`` is prepended,
    because some URLs arrive with ``//`` in front of them, which has a
    special meaning.
    """
    return '/' + url.lstrip('/')
|
||||||
|
@ -2,6 +2,7 @@ from youtube import util
|
|||||||
|
|
||||||
import html
|
import html
|
||||||
import json
|
import json
|
||||||
|
import re
|
||||||
|
|
||||||
# videos (all of type str):
|
# videos (all of type str):
|
||||||
|
|
||||||
@ -152,15 +153,22 @@ def ajax_info(item_json):
|
|||||||
raise
|
raise
|
||||||
|
|
||||||
|
|
||||||
|
# Accepts either a bare path starting with "/" or a youtube.com URL whose
# host part may carry an optional "http(s)://" or "//" prefix and an optional
# "www."; the only capture group is the path (including any query string).
youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$')


def normalize_url(url):
    """Return *url* as an absolute ``https://www.youtube.com`` URL.

    The path captured by ``youtube_url_re`` is appended to the canonical
    ``https://www.youtube.com`` origin, so relative paths, scheme-less
    URLs and ``http``/non-``www`` variants all come out in the same form.

    Args:
        url: A path beginning with ``/`` or a ``youtube.com`` URL.

    Returns:
        The URL rewritten onto ``https://www.youtube.com``.

    Raises:
        ValueError: If *url* does not match ``youtube_url_re``.
    """
    match = youtube_url_re.fullmatch(url)
    if match is None:
        # Include the offending value so a failure can be diagnosed from the
        # traceback alone.
        raise ValueError('Not a recognized YouTube URL: {!r}'.format(url))

    return 'https://www.youtube.com' + match.group(1)
|
||||||
|
|
||||||
def prefix_urls(item):
|
def prefix_urls(item):
|
||||||
try:
|
try:
|
||||||
item['thumbnail'] = '/' + item['thumbnail'].lstrip('/')
|
item['thumbnail'] = util.prefix_url(item['thumbnail'])
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
item['author_url'] = util.URL_ORIGIN + item['author_url']
|
item['author_url'] = util.prefix_url(item['author_url'])
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -219,7 +227,7 @@ def renderer_info(renderer, additional_info={}):
|
|||||||
|
|
||||||
if 'ownerText' in renderer:
|
if 'ownerText' in renderer:
|
||||||
info['author'] = renderer['ownerText']['runs'][0]['text']
|
info['author'] = renderer['ownerText']['runs'][0]['text']
|
||||||
info['author_url'] = renderer['ownerText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']
|
info['author_url'] = normalize_url(renderer['ownerText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'])
|
||||||
try:
|
try:
|
||||||
overlays = renderer['thumbnailOverlays']
|
overlays = renderer['thumbnailOverlays']
|
||||||
except KeyError:
|
except KeyError:
|
||||||
@ -241,7 +249,7 @@ def renderer_info(renderer, additional_info={}):
|
|||||||
if key in ('longBylineText', 'shortBylineText'):
|
if key in ('longBylineText', 'shortBylineText'):
|
||||||
info['author'] = get_text(node)
|
info['author'] = get_text(node)
|
||||||
try:
|
try:
|
||||||
info['author_url'] = get_url(node)
|
info['author_url'] = normalize_url(get_url(node))
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user