yt-dl: update youtube extractor file to fix js-sig extraction issues

This commit is contained in:
James Taylor 2018-11-09 00:58:46 -08:00
parent f6778f8c3d
commit 630b476e42
2 changed files with 3015 additions and 16 deletions

View File

@@ -67,7 +67,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
# If True it will raise an error if no login info is provided # If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False _LOGIN_REQUIRED = False
_PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,}' _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}'
def _set_language(self): def _set_language(self):
self._set_cookie( self._set_cookie(
@@ -262,7 +262,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
return True return True
def _download_webpage_handle(self, *args, **kwargs): def _download_webpage_handle(self, *args, **kwargs):
kwargs.setdefault('query', {})['disable_polymer'] = 'true' query = kwargs.get('query', {}).copy()
query['disable_polymer'] = 'true'
kwargs['query'] = query
return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
*args, **compat_kwargs(kwargs)) *args, **compat_kwargs(kwargs))
@@ -350,6 +352,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
(?:www\.)?hooktube\.com/| (?:www\.)?hooktube\.com/|
(?:www\.)?yourepeat\.com/| (?:www\.)?yourepeat\.com/|
tube\.majestyc\.net/| tube\.majestyc\.net/|
(?:www\.)?invidio\.us/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls (?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID: (?: # the various things that can precede the ID:
@@ -493,12 +496,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'Philipp Hagemeister', 'uploader': 'Philipp Hagemeister',
'uploader_id': 'phihag', 'uploader_id': 'phihag',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
'upload_date': '20121002', 'upload_date': '20121002',
'license': 'Standard YouTube License', 'license': 'Standard YouTube License',
'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
'categories': ['Science & Technology'], 'categories': ['Science & Technology'],
'tags': ['youtube-dl'], 'tags': ['youtube-dl'],
'duration': 10, 'duration': 10,
'view_count': int,
'like_count': int, 'like_count': int,
'dislike_count': int, 'dislike_count': int,
'start_time': 1, 'start_time': 1,
@@ -581,6 +587,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'categories': ['Science & Technology'], 'categories': ['Science & Technology'],
'tags': ['youtube-dl'], 'tags': ['youtube-dl'],
'duration': 10, 'duration': 10,
'view_count': int,
'like_count': int, 'like_count': int,
'dislike_count': int, 'dislike_count': int,
}, },
@@ -1067,6 +1074,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
'only_matching': True, 'only_matching': True,
}, },
{
'url': 'https://invidio.us/watch?v=BaW_jenozKc',
'only_matching': True,
},
] ]
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
@@ -1183,7 +1194,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
(r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', (r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(', r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\('), r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
jscode, 'Initial JS player signature function name', group='sig') jscode, 'Initial JS player signature function name', group='sig')
jsi = JSInterpreter(jscode) jsi = JSInterpreter(jscode)
@@ -1536,6 +1548,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
player_response = {} player_response = {}
# Is it unlisted? # Is it unlisted?
unlisted = (self._search_regex('''<meta itemprop="unlisted" content="(\w*)">''', video_webpage, 'is_unlisted', default='False') == "True") unlisted = (self._search_regex('''<meta itemprop="unlisted" content="(\w*)">''', video_webpage, 'is_unlisted', default='False') == "True")
@@ -1590,6 +1604,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
current_song[title.lower()] = value current_song[title.lower()] = value
last_index = match.end() last_index = match.end()
# Get video info # Get video info
embed_webpage = None embed_webpage = None
if re.search(r'player-age-gate-content">', video_webpage) is not None: if re.search(r'player-age-gate-content">', video_webpage) is not None:
@@ -1605,7 +1622,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
}) })
video_info_url = proto + '://www.youtube.com/get_video_info?' + data video_info_url = proto + '://www.youtube.com/get_video_info?' + data
video_info_webpage = self._download_webpage( video_info_webpage = self._download_webpage(
video_info_url, video_id, video_info_url, video_id,
note='Refetching age-gated info webpage', note='Refetching age-gated info webpage',
@@ -1697,7 +1713,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if 'token' not in video_info: if 'token' not in video_info:
if 'reason' in video_info: if 'reason' in video_info:
print(video_info['reason'])
if 'The uploader has not made this video available in your country.' in video_info['reason']: if 'The uploader has not made this video available in your country.' in video_info['reason']:
regions_allowed = self._html_search_meta( regions_allowed = self._html_search_meta(
'regionsAllowed', video_webpage, default=None) 'regionsAllowed', video_webpage, default=None)
@@ -1709,7 +1724,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
unavailable_message = extract_unavailable_message() unavailable_message = extract_unavailable_message()
if unavailable_message: if unavailable_message:
reason = unavailable_message reason = unavailable_message
raise YoutubeError( raise ExtractorError(
'YouTube said: %s' % reason, 'YouTube said: %s' % reason,
expected=True, video_id=video_id) expected=True, video_id=video_id)
else: else:
@@ -1961,13 +1976,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
formats.append(a_format) formats.append(a_format)
else: else:
error_message = extract_unavailable_message() error_message = clean_html(video_info.get('reason', [None])[0])
alt_error_message = clean_html(video_info.get('reason', [None])[0])
print(alt_error_message)
if not error_message: if not error_message:
error_message = alt_error_message error_message = extract_unavailable_message()
if error_message: if error_message:
raise YoutubeError(error_message) raise ExtractorError(error_message, expected=True)
raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
# uploader # uploader
@@ -1991,6 +2004,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
else: else:
self._downloader.report_warning('unable to extract uploader nickname') self._downloader.report_warning('unable to extract uploader nickname')
channel_id = self._html_search_meta(
'channelId', video_webpage, 'channel id')
channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
# thumbnail image # thumbnail image
# We try first to get a high quality image: # We try first to get a high quality image:
m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">', m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
@@ -2088,8 +2105,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'view count', default=None)) 'view count', default=None))
# subtitles # subtitles
video_subtitles = self._get_subtitles(video_id, video_webpage) video_subtitles = self.extract_subtitles(video_id, video_webpage)
automatic_captions = self._get_automatic_captions(video_id, video_webpage) automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
video_duration = try_get( video_duration = try_get(
video_info, lambda x: int_or_none(x['length_seconds'][0])) video_info, lambda x: int_or_none(x['length_seconds'][0]))
@@ -2169,6 +2186,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': video_uploader, 'uploader': video_uploader,
'uploader_id': video_uploader_id, 'uploader_id': video_uploader_id,
'uploader_url': video_uploader_url, 'uploader_url': video_uploader_url,
'channel_id': channel_id,
'channel_url': channel_url,
'upload_date': upload_date, 'upload_date': upload_date,
'license': video_license, 'license': video_license,
'creator': video_creator or artist, 'creator': video_creator or artist,
@@ -2210,7 +2229,11 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
(?:https?://)? (?:https?://)?
(?:\w+\.)? (?:\w+\.)?
(?: (?:
youtube\.com/ (?:
youtube\.com|
invidio\.us
)
/
(?: (?:
(?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11})) (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
\? (?:.*?[&;])*? (?:p|a|list)= \? (?:.*?[&;])*? (?:p|a|list)=
@@ -2219,7 +2242,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
) )
( (
(?:PL|LL|EC|UU|FL|RD|UL|TL)?[0-9A-Za-z-_]{10,} (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,}
# Top tracks, they can also include dots # Top tracks, they can also include dots
|(?:MC)[\w\.]* |(?:MC)[\w\.]*
) )
@@ -2323,6 +2346,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
'description': 'md5:507cdcb5a49ac0da37a920ece610be80', 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
'categories': ['People & Blogs'], 'categories': ['People & Blogs'],
'tags': list, 'tags': list,
'view_count': int,
'like_count': int, 'like_count': int,
'dislike_count': int, 'dislike_count': int,
}, },
@@ -2357,6 +2381,13 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
}, { }, {
'url': 'TLGGrESM50VT6acwMjAyMjAxNw', 'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
'only_matching': True, 'only_matching': True,
}, {
# music album playlist
'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
'only_matching': True,
}, {
'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
'only_matching': True,
}] }]
def _real_initialize(self): def _real_initialize(self):
@@ -2499,7 +2530,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
IE_DESC = 'YouTube.com channels' IE_DESC = 'YouTube.com channels'
_VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)' _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
_TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
_VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
IE_NAME = 'youtube:channel' IE_NAME = 'youtube:channel'
@@ -2520,6 +2551,9 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
'id': 'UUs0ifCMCm1icqRbqhUINa0w', 'id': 'UUs0ifCMCm1icqRbqhUINa0w',
'title': 'Uploads from Deus Ex', 'title': 'Uploads from Deus Ex',
}, },
}, {
'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
'only_matching': True,
}] }]
@classmethod @classmethod

File diff suppressed because it is too large. Load Diff