Fix breakage due to youtube changes

This commit is contained in:
James Taylor 2019-09-11 15:09:20 -07:00
parent 21c617faf7
commit f48de1aad2
2 changed files with 499 additions and 236 deletions

View File

@ -28,9 +28,11 @@ from ..compat import (
compat_str,
)
from ..utils import (
bool_or_none,
clean_html,
dict_get,
error_to_compat_str,
extract_attributes,
ExtractorError,
float_or_none,
get_element_by_attribute,
@ -40,7 +42,6 @@ from ..utils import (
orderedSet,
parse_codecs,
parse_duration,
qualities,
remove_quotes,
remove_start,
smuggle_url,
@ -118,6 +119,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
'f.req': json.dumps(f_req),
'flowName': 'GlifWebSignIn',
'flowEntry': 'ServiceLogin',
# TODO: reverse actual botguard identifier generation algo
'bgRequest': '["identifier",""]',
})
return self._download_json(
url, None, note=note, errnote=errnote,
@ -323,17 +326,18 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
for video_id, video_title in self.extract_videos_from_page(content):
yield self.url_result(video_id, 'Youtube', video_id, video_title)
def extract_videos_from_page(self, page):
ids_in_page = []
titles_in_page = []
for mobj in re.finditer(self._VIDEO_RE, page):
def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
for mobj in re.finditer(video_re, page):
# The link with index 0 is not the first video of the playlist (not sure if still actual)
if 'index' in mobj.groupdict() and mobj.group('id') == '0':
continue
video_id = mobj.group('id')
video_title = unescapeHTML(mobj.group('title'))
video_title = unescapeHTML(
mobj.group('title')) if 'title' in mobj.groupdict() else None
if video_title:
video_title = video_title.strip()
if video_title == '► Play all':
video_title = None
try:
idx = ids_in_page.index(video_id)
if video_title and not titles_in_page[idx]:
@ -341,6 +345,12 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
except ValueError:
ids_in_page.append(video_id)
titles_in_page.append(video_title)
def extract_videos_from_page(self, page):
ids_in_page = []
titles_in_page = []
self.extract_videos_from_page_impl(
self._VIDEO_RE, page, ids_in_page, titles_in_page)
return zip(ids_in_page, titles_in_page)
@ -370,11 +380,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
(?:www\.)?hooktube\.com/|
(?:www\.)?yourepeat\.com/|
tube\.majestyc\.net/|
# Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
(?:(?:www|dev)\.)?invidio\.us/|
(?:www\.)?invidiou\.sh/|
(?:www\.)?invidious\.snopyta\.org/|
(?:(?:www|no)\.)?invidiou\.sh/|
(?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
(?:www\.)?invidious\.kabi\.tk/|
(?:www\.)?invidious\.enkirton\.net/|
(?:www\.)?invidious\.13ad\.de/|
(?:www\.)?invidious\.mastodon\.host/|
(?:www\.)?invidious\.nixnet\.xyz/|
(?:www\.)?tube\.poal\.co/|
(?:www\.)?vid\.wxzm\.sx/|
(?:www\.)?yt\.elukerio\.org/|
(?:www\.)?kgg2m7yk5aybusll\.onion/|
(?:www\.)?qklhadlycap4cnod\.onion/|
(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:
@ -1589,17 +1612,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_id = mobj.group(2)
return video_id
def _extract_annotations(self, video_id):
return self._download_webpage(
'https://www.youtube.com/annotations_invideo', video_id,
note='Downloading annotations',
errnote='Unable to download video annotations', fatal=False,
query={
'features': 1,
'legacy': 1,
'video_id': video_id,
})
@staticmethod
def _extract_chapters(description, duration):
if not description:
@ -1696,6 +1708,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def extract_token(v_info):
return dict_get(v_info, ('account_playback_token', 'accountPlaybackToken', 'token'))
def extract_player_response(player_response, video_id):
pl_response = str_or_none(player_response)
if not pl_response:
return
pl_response = self._parse_json(pl_response, video_id, fatal=False)
if isinstance(pl_response, dict):
add_dash_mpd_pr(pl_response)
return pl_response
player_response = {}
@ -1780,7 +1801,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
note='Refetching age-gated info webpage',
errnote='unable to download video info webpage')
video_info = compat_parse_qs(video_info_webpage)
pl_response = video_info.get('player_response', [None])[0]
player_response = extract_player_response(pl_response, video_id)
add_dash_mpd(video_info)
view_count = extract_view_count(video_info)
else:
age_gate = False
video_info = None
@ -1803,11 +1827,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
is_live = True
sts = ytplayer_config.get('sts')
if not player_response:
pl_response = str_or_none(args.get('player_response'))
if pl_response:
pl_response = self._parse_json(pl_response, video_id, fatal=False)
if isinstance(pl_response, dict):
player_response = pl_response
player_response = extract_player_response(args.get('player_response'), video_id)
if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
add_dash_mpd_pr(player_response)
# We also try looking in get_video_info since it may contain different dashmpd
@ -1839,9 +1859,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
get_video_info = compat_parse_qs(video_info_webpage)
if not player_response:
pl_response = get_video_info.get('player_response', [None])[0]
if isinstance(pl_response, dict):
player_response = pl_response
add_dash_mpd_pr(player_response)
player_response = extract_player_response(pl_response, video_id)
add_dash_mpd(get_video_info)
if view_count is None:
view_count = extract_view_count(get_video_info)
@ -1864,9 +1882,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
break
def extract_unavailable_message():
return self._html_search_regex(
r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
video_webpage, 'unavailable message', default=None)
messages = []
for tag, kind in (('h1', 'message'), ('div', 'submessage')):
msg = self._html_search_regex(
r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
video_webpage, 'unavailable %s' % kind, default=None)
if msg:
messages.append(msg)
if messages:
return '\n'.join(messages)
if not video_info:
unavailable_message = extract_unavailable_message()
@ -1883,7 +1907,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self._downloader.report_warning('Unable to extract video title')
video_title = '_'
# description
description_original = video_description = get_element_by_id("eow-description", video_webpage)
if video_description:
@ -1946,6 +1969,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if view_count is None and video_details:
view_count = int_or_none(video_details.get('viewCount'))
if is_live is None:
is_live = bool_or_none(video_details.get('isLive'))
# Check for "rental" videos
if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
@ -1954,6 +1980,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return int_or_none(self._search_regex(
r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
self.report_rtmp_download()
formats = [{
@ -1962,10 +1991,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'url': video_info['conn'][0],
'player_url': player_url,
}]
elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
if 'rtmpe%3Dyes' in encoded_url_map:
raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
formats = []
formats_spec = {}
fmt_list = video_info.get('fmt_list', [''])[0]
if fmt_list:
@ -1979,91 +2009,104 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'width': int_or_none(width_height[0]),
'height': int_or_none(width_height[1]),
}
q = qualities(['small', 'medium', 'hd720'])
streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list)
if streaming_formats:
for fmt in streaming_formats:
itag = str_or_none(fmt.get('itag'))
if not itag:
continue
quality = fmt.get('quality')
quality_label = fmt.get('qualityLabel') or quality
formats_spec[itag] = {
'asr': int_or_none(fmt.get('audioSampleRate')),
'filesize': int_or_none(fmt.get('contentLength')),
'format_note': quality_label,
'fps': int_or_none(fmt.get('fps')),
'height': int_or_none(fmt.get('height')),
'quality': q(quality),
# bitrate for itag 43 is always 2147483647
'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
'width': int_or_none(fmt.get('width')),
}
formats = []
for url_data_str in encoded_url_map.split(','):
url_data = compat_parse_qs(url_data_str)
if 'itag' not in url_data or 'url' not in url_data or url_data.get('drm_families'):
for fmt in streaming_formats:
itag = str_or_none(fmt.get('itag'))
if not itag:
continue
quality = fmt.get('quality')
quality_label = fmt.get('qualityLabel') or quality
formats_spec[itag] = {
'asr': int_or_none(fmt.get('audioSampleRate')),
'filesize': int_or_none(fmt.get('contentLength')),
'format_note': quality_label,
'fps': int_or_none(fmt.get('fps')),
'height': int_or_none(fmt.get('height')),
# bitrate for itag 43 is always 2147483647
'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
'width': int_or_none(fmt.get('width')),
}
for fmt in streaming_formats:
if fmt.get('drm_families'):
continue
url = url_or_none(fmt.get('url'))
if not url:
cipher = fmt.get('cipher')
if not cipher:
continue
url_data = compat_parse_qs(cipher)
url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
if not url:
continue
else:
cipher = None
url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
# Unsupported FORMAT_STREAM_TYPE_OTF
if stream_type == 3:
continue
format_id = url_data['itag'][0]
url = url_data['url'][0]
if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
jsplayer_url_json = self._search_regex(
ASSETS_RE,
embed_webpage if age_gate else video_webpage,
'JS player URL (1)', default=None)
if not jsplayer_url_json and not age_gate:
# We need the embed website after all
if embed_webpage is None:
embed_url = proto + '://www.youtube.com/embed/%s' % video_id
embed_webpage = self._download_webpage(
embed_url, video_id, 'Downloading embed webpage')
format_id = fmt.get('itag') or url_data['itag'][0]
if not format_id:
continue
format_id = compat_str(format_id)
if cipher:
if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
jsplayer_url_json = self._search_regex(
ASSETS_RE, embed_webpage, 'JS player URL')
ASSETS_RE,
embed_webpage if age_gate else video_webpage,
'JS player URL (1)', default=None)
if not jsplayer_url_json and not age_gate:
# We need the embed website after all
if embed_webpage is None:
embed_url = proto + '://www.youtube.com/embed/%s' % video_id
embed_webpage = self._download_webpage(
embed_url, video_id, 'Downloading embed webpage')
jsplayer_url_json = self._search_regex(
ASSETS_RE, embed_webpage, 'JS player URL')
player_url = json.loads(jsplayer_url_json)
if player_url is None:
player_url_json = self._search_regex(
r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
video_webpage, 'age gate player URL')
player_url = json.loads(player_url_json)
if 'sig' in url_data:
url += '&signature=' + url_data['sig'][0]
elif 's' in url_data:
encrypted_sig = url_data['s'][0]
if self._downloader.params.get('verbose'):
player_url = json.loads(jsplayer_url_json)
if player_url is None:
player_version = 'unknown'
player_desc = 'unknown'
else:
if player_url.endswith('swf'):
player_version = self._search_regex(
r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
'flash player', fatal=False)
player_desc = 'flash player %s' % player_version
player_url_json = self._search_regex(
r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
video_webpage, 'age gate player URL')
player_url = json.loads(player_url_json)
if 'sig' in url_data:
url += '&signature=' + url_data['sig'][0]
elif 's' in url_data:
encrypted_sig = url_data['s'][0]
if self._downloader.params.get('verbose'):
if player_url is None:
player_version = 'unknown'
player_desc = 'unknown'
else:
player_version = self._search_regex(
[r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'],
player_url,
'html5 player', fatal=False)
player_desc = 'html5 player %s' % player_version
if player_url.endswith('swf'):
player_version = self._search_regex(
r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
'flash player', fatal=False)
player_desc = 'flash player %s' % player_version
else:
player_version = self._search_regex(
[r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'],
player_url,
'html5 player', fatal=False)
player_desc = 'html5 player %s' % player_version
parts_sizes = self._signature_cache_id(encrypted_sig)
self.to_screen('{%s} signature length %s, %s' %
(format_id, parts_sizes, player_desc))
parts_sizes = self._signature_cache_id(encrypted_sig)
self.to_screen('{%s} signature length %s, %s' %
(format_id, parts_sizes, player_desc))
signature = self._decrypt_signature(
encrypted_sig, video_id, player_url, age_gate)
sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
url += '&%s=%s' % (sp, signature)
signature = self._decrypt_signature(
encrypted_sig, video_id, player_url, age_gate)
sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
url += '&%s=%s' % (sp, signature)
if 'ratebypass' not in url:
url += '&ratebypass=yes'
@ -2083,24 +2126,33 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
if width is None:
width = int_or_none(fmt.get('width'))
if height is None:
height = int_or_none(fmt.get('height'))
filesize = int_or_none(url_data.get(
'clen', [None])[0]) or _extract_filesize(url)
quality = url_data.get('quality', [None])[0]
quality = url_data.get('quality', [None])[0] or fmt.get('quality')
quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
more_fields = {
'filesize': filesize,
'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
'tbr': tbr,
'width': width,
'height': height,
'fps': int_or_none(url_data.get('fps', [None])[0]),
'format_note': url_data.get('quality_label', [None])[0] or quality,
'quality': q(quality),
'fps': fps,
'format_note': quality_label or quality,
}
for key, value in more_fields.items():
if value:
dct[key] = value
type_ = url_data.get('type', [None])[0]
type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
if type_:
type_split = type_.split(';')
kind_ext = type_split[0].split('/')
@ -2148,11 +2200,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
formats.append(a_format)
else:
error_message = clean_html(video_info.get('reason', [None])[0])
error_message = extract_unavailable_message()
alt_error_message = clean_html(video_info.get('reason', [None])[0])
print(alt_error_message)
if not error_message:
error_message = alt_error_message
if not error_message:
error_message = clean_html(
try_get(video_info, lambda x: x['reason'][0], compat_str))
if error_message:
raise YoutubeError(error_message)
raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
@ -2323,7 +2378,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# annotations
video_annotations = None
if self._downloader.params.get('writeannotations', False):
video_annotations = self._extract_annotations(video_id)
xsrf_token = self._search_regex(
r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
invideo_url = try_get(
player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
if xsrf_token and invideo_url:
xsrf_field_name = self._search_regex(
r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
video_webpage, 'xsrf field name',
group='xsrf_field_name', default='session_token')
video_annotations = self._download_webpage(
self._proto_relative_url(invideo_url),
video_id, note='Downloading annotations',
errnote='Unable to download video annotations', fatal=False,
data=urlencode_postdata({xsrf_field_name: xsrf_token}))
chapters = self._extract_chapters(description_original, video_duration)
@ -2484,7 +2553,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
(%(playlist_id)s)
)""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
_VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
_VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
IE_NAME = 'youtube:playlist'
_TESTS = [{
'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
@ -2507,6 +2577,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
'info_dict': {
'title': '29C3: Not my department',
'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
'uploader': 'Christiaan008',
'uploader_id': 'ChRiStIaAn008',
},
'playlist_count': 95,
}, {
@ -2515,6 +2587,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
'info_dict': {
'title': '[OLD]Team Fortress 2 (Class-based LP)',
'id': 'PLBB231211A4F62143',
'uploader': 'Wickydoo',
'uploader_id': 'Wickydoo',
},
'playlist_mincount': 26,
}, {
@ -2523,6 +2597,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
'info_dict': {
'title': 'Uploads from Cauchemar',
'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
'uploader': 'Cauchemar',
'uploader_id': 'Cauchemar89',
},
'playlist_mincount': 799,
}, {
@ -2540,13 +2616,17 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
'info_dict': {
'title': 'JODA15',
'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
'uploader': 'milan',
'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
}
}, {
'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
'playlist_mincount': 485,
'info_dict': {
'title': '2017 華語最新單曲 (2/24更新)',
'title': '2018 Chinese New Singles (11/6 updated)',
'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
'uploader': 'LBK',
'uploader_id': 'sdragonfang',
}
}, {
'note': 'Embedded SWF player',
@ -2555,13 +2635,16 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
'info_dict': {
'title': 'JODA7',
'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
}
},
'skip': 'This playlist does not exist',
}, {
'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
'info_dict': {
'title': 'Uploads from Interstellar Movie',
'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
'uploader': 'Interstellar Movie',
'uploader_id': 'InterstellarMovie1',
},
'playlist_mincount': 21,
}, {
@ -2586,6 +2669,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
'params': {
'skip_download': True,
},
'skip': 'This video is not available.',
'add_ie': [YoutubeIE.ie_key()],
}, {
'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
@ -2597,7 +2681,6 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
'uploader_id': 'backuspagemuseum',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
'upload_date': '20161008',
'license': 'Standard YouTube License',
'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
'categories': ['Nonprofits & Activism'],
'tags': list,
@ -2608,6 +2691,16 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
'noplaylist': True,
'skip_download': True,
},
}, {
# https://github.com/ytdl-org/youtube-dl/issues/21844
'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
'info_dict': {
'title': 'Data Analysis with Dr Mike Pound',
'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
'uploader_id': 'Computerphile',
'uploader': 'Computerphile',
},
'playlist_mincount': 11,
}, {
'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
'only_matching': True,
@ -2626,6 +2719,34 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
def _real_initialize(self):
self._login()
def extract_videos_from_page(self, page):
ids_in_page = []
titles_in_page = []
for item in re.findall(
r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page):
attrs = extract_attributes(item)
video_id = attrs['data-video-id']
video_title = unescapeHTML(attrs.get('data-title'))
if video_title:
video_title = video_title.strip()
ids_in_page.append(video_id)
titles_in_page.append(video_title)
# Fallback with old _VIDEO_RE
self.extract_videos_from_page_impl(
self._VIDEO_RE, page, ids_in_page, titles_in_page)
# Relaxed fallbacks
self.extract_videos_from_page_impl(
r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page,
ids_in_page, titles_in_page)
self.extract_videos_from_page_impl(
r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page,
ids_in_page, titles_in_page)
return zip(ids_in_page, titles_in_page)
def _extract_mix(self, playlist_id):
# The mixes are generated from a single video
# the id of the playlist is just 'RD' + video_id
@ -2774,6 +2895,8 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
'info_dict': {
'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
'title': 'Uploads from lex will',
'uploader': 'lex will',
'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
}
}, {
'note': 'Age restricted channel',
@ -2783,6 +2906,8 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
'info_dict': {
'id': 'UUs0ifCMCm1icqRbqhUINa0w',
'title': 'Uploads from Deus Ex',
'uploader': 'Deus Ex',
'uploader_id': 'DeusExOfficial',
},
}, {
'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
@ -2867,6 +2992,8 @@ class YoutubeUserIE(YoutubeChannelIE):
'info_dict': {
'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
'title': 'Uploads from The Linux Foundation',
'uploader': 'The Linux Foundation',
'uploader_id': 'TheLinuxFoundation',
}
}, {
# Only available via https://www.youtube.com/c/12minuteathlete/videos
@ -2876,6 +3003,8 @@ class YoutubeUserIE(YoutubeChannelIE):
'info_dict': {
'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
'title': 'Uploads from 12 Minute Athlete',
'uploader': '12 Minute Athlete',
'uploader_id': 'the12minuteathlete',
}
}, {
'url': 'ytuser:phihag',
@ -2969,7 +3098,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
'playlist_mincount': 4,
'info_dict': {
'id': 'ThirstForScience',
'title': 'Thirst for Science',
'title': 'ThirstForScience',
},
}, {
# with "Load more" button
@ -2986,6 +3115,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
'title': 'Chem Player',
},
'skip': 'Blocked',
}]

View File

@ -27,9 +27,11 @@ from ..compat import (
compat_str,
)
from ..utils import (
bool_or_none,
clean_html,
dict_get,
error_to_compat_str,
extract_attributes,
ExtractorError,
float_or_none,
get_element_by_attribute,
@ -39,7 +41,6 @@ from ..utils import (
orderedSet,
parse_codecs,
parse_duration,
qualities,
remove_quotes,
remove_start,
smuggle_url,
@ -116,6 +117,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
'f.req': json.dumps(f_req),
'flowName': 'GlifWebSignIn',
'flowEntry': 'ServiceLogin',
# TODO: reverse actual botguard identifier generation algo
'bgRequest': '["identifier",""]',
})
return self._download_json(
url, None, note=note, errnote=errnote,
@ -321,17 +324,18 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
for video_id, video_title in self.extract_videos_from_page(content):
yield self.url_result(video_id, 'Youtube', video_id, video_title)
def extract_videos_from_page(self, page):
ids_in_page = []
titles_in_page = []
for mobj in re.finditer(self._VIDEO_RE, page):
def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
for mobj in re.finditer(video_re, page):
# The link with index 0 is not the first video of the playlist (not sure if still actual)
if 'index' in mobj.groupdict() and mobj.group('id') == '0':
continue
video_id = mobj.group('id')
video_title = unescapeHTML(mobj.group('title'))
video_title = unescapeHTML(
mobj.group('title')) if 'title' in mobj.groupdict() else None
if video_title:
video_title = video_title.strip()
if video_title == '► Play all':
video_title = None
try:
idx = ids_in_page.index(video_id)
if video_title and not titles_in_page[idx]:
@ -339,6 +343,12 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
except ValueError:
ids_in_page.append(video_id)
titles_in_page.append(video_title)
def extract_videos_from_page(self, page):
ids_in_page = []
titles_in_page = []
self.extract_videos_from_page_impl(
self._VIDEO_RE, page, ids_in_page, titles_in_page)
return zip(ids_in_page, titles_in_page)
@ -368,11 +378,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
(?:www\.)?hooktube\.com/|
(?:www\.)?yourepeat\.com/|
tube\.majestyc\.net/|
# Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
(?:(?:www|dev)\.)?invidio\.us/|
(?:www\.)?invidiou\.sh/|
(?:www\.)?invidious\.snopyta\.org/|
(?:(?:www|no)\.)?invidiou\.sh/|
(?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
(?:www\.)?invidious\.kabi\.tk/|
(?:www\.)?invidious\.enkirton\.net/|
(?:www\.)?invidious\.13ad\.de/|
(?:www\.)?invidious\.mastodon\.host/|
(?:www\.)?invidious\.nixnet\.xyz/|
(?:www\.)?tube\.poal\.co/|
(?:www\.)?vid\.wxzm\.sx/|
(?:www\.)?yt\.elukerio\.org/|
(?:www\.)?kgg2m7yk5aybusll\.onion/|
(?:www\.)?qklhadlycap4cnod\.onion/|
(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:
@ -1587,17 +1610,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_id = mobj.group(2)
return video_id
def _extract_annotations(self, video_id):
return self._download_webpage(
'https://www.youtube.com/annotations_invideo', video_id,
note='Downloading annotations',
errnote='Unable to download video annotations', fatal=False,
query={
'features': 1,
'legacy': 1,
'video_id': video_id,
})
@staticmethod
def _extract_chapters(description, duration):
if not description:
@ -1692,6 +1704,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def extract_token(v_info):
return dict_get(v_info, ('account_playback_token', 'accountPlaybackToken', 'token'))
def extract_player_response(player_response, video_id):
pl_response = str_or_none(player_response)
if not pl_response:
return
pl_response = self._parse_json(pl_response, video_id, fatal=False)
if isinstance(pl_response, dict):
add_dash_mpd_pr(pl_response)
return pl_response
player_response = {}
# Get video info
@ -1714,7 +1735,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
note='Refetching age-gated info webpage',
errnote='unable to download video info webpage')
video_info = compat_parse_qs(video_info_webpage)
pl_response = video_info.get('player_response', [None])[0]
player_response = extract_player_response(pl_response, video_id)
add_dash_mpd(video_info)
view_count = extract_view_count(video_info)
else:
age_gate = False
video_info = None
@ -1737,11 +1761,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
is_live = True
sts = ytplayer_config.get('sts')
if not player_response:
pl_response = str_or_none(args.get('player_response'))
if pl_response:
pl_response = self._parse_json(pl_response, video_id, fatal=False)
if isinstance(pl_response, dict):
player_response = pl_response
player_response = extract_player_response(args.get('player_response'), video_id)
if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
add_dash_mpd_pr(player_response)
# We also try looking in get_video_info since it may contain different dashmpd
@ -1773,9 +1793,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
get_video_info = compat_parse_qs(video_info_webpage)
if not player_response:
pl_response = get_video_info.get('player_response', [None])[0]
if isinstance(pl_response, dict):
player_response = pl_response
add_dash_mpd_pr(player_response)
player_response = extract_player_response(pl_response, video_id)
add_dash_mpd(get_video_info)
if view_count is None:
view_count = extract_view_count(get_video_info)
@ -1798,9 +1816,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
break
def extract_unavailable_message():
return self._html_search_regex(
r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
video_webpage, 'unavailable message', default=None)
messages = []
for tag, kind in (('h1', 'message'), ('div', 'submessage')):
msg = self._html_search_regex(
r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
video_webpage, 'unavailable %s' % kind, default=None)
if msg:
messages.append(msg)
if messages:
return '\n'.join(messages)
if not video_info:
unavailable_message = extract_unavailable_message()
@ -1879,6 +1903,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if view_count is None and video_details:
view_count = int_or_none(video_details.get('viewCount'))
if is_live is None:
is_live = bool_or_none(video_details.get('isLive'))
# Check for "rental" videos
if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
@ -1887,6 +1914,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return int_or_none(self._search_regex(
r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
self.report_rtmp_download()
formats = [{
@ -1895,10 +1925,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'url': video_info['conn'][0],
'player_url': player_url,
}]
elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
if 'rtmpe%3Dyes' in encoded_url_map:
raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
formats = []
formats_spec = {}
fmt_list = video_info.get('fmt_list', [''])[0]
if fmt_list:
@ -1912,91 +1943,104 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'width': int_or_none(width_height[0]),
'height': int_or_none(width_height[1]),
}
q = qualities(['small', 'medium', 'hd720'])
streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list)
if streaming_formats:
for fmt in streaming_formats:
itag = str_or_none(fmt.get('itag'))
if not itag:
continue
quality = fmt.get('quality')
quality_label = fmt.get('qualityLabel') or quality
formats_spec[itag] = {
'asr': int_or_none(fmt.get('audioSampleRate')),
'filesize': int_or_none(fmt.get('contentLength')),
'format_note': quality_label,
'fps': int_or_none(fmt.get('fps')),
'height': int_or_none(fmt.get('height')),
'quality': q(quality),
# bitrate for itag 43 is always 2147483647
'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
'width': int_or_none(fmt.get('width')),
}
formats = []
for url_data_str in encoded_url_map.split(','):
url_data = compat_parse_qs(url_data_str)
if 'itag' not in url_data or 'url' not in url_data or url_data.get('drm_families'):
for fmt in streaming_formats:
itag = str_or_none(fmt.get('itag'))
if not itag:
continue
quality = fmt.get('quality')
quality_label = fmt.get('qualityLabel') or quality
formats_spec[itag] = {
'asr': int_or_none(fmt.get('audioSampleRate')),
'filesize': int_or_none(fmt.get('contentLength')),
'format_note': quality_label,
'fps': int_or_none(fmt.get('fps')),
'height': int_or_none(fmt.get('height')),
# bitrate for itag 43 is always 2147483647
'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
'width': int_or_none(fmt.get('width')),
}
for fmt in streaming_formats:
if fmt.get('drm_families'):
continue
url = url_or_none(fmt.get('url'))
if not url:
cipher = fmt.get('cipher')
if not cipher:
continue
url_data = compat_parse_qs(cipher)
url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
if not url:
continue
else:
cipher = None
url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
# Unsupported FORMAT_STREAM_TYPE_OTF
if stream_type == 3:
continue
format_id = url_data['itag'][0]
url = url_data['url'][0]
if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
jsplayer_url_json = self._search_regex(
ASSETS_RE,
embed_webpage if age_gate else video_webpage,
'JS player URL (1)', default=None)
if not jsplayer_url_json and not age_gate:
# We need the embed website after all
if embed_webpage is None:
embed_url = proto + '://www.youtube.com/embed/%s' % video_id
embed_webpage = self._download_webpage(
embed_url, video_id, 'Downloading embed webpage')
format_id = fmt.get('itag') or url_data['itag'][0]
if not format_id:
continue
format_id = compat_str(format_id)
if cipher:
if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
jsplayer_url_json = self._search_regex(
ASSETS_RE, embed_webpage, 'JS player URL')
ASSETS_RE,
embed_webpage if age_gate else video_webpage,
'JS player URL (1)', default=None)
if not jsplayer_url_json and not age_gate:
# We need the embed website after all
if embed_webpage is None:
embed_url = proto + '://www.youtube.com/embed/%s' % video_id
embed_webpage = self._download_webpage(
embed_url, video_id, 'Downloading embed webpage')
jsplayer_url_json = self._search_regex(
ASSETS_RE, embed_webpage, 'JS player URL')
player_url = json.loads(jsplayer_url_json)
if player_url is None:
player_url_json = self._search_regex(
r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
video_webpage, 'age gate player URL')
player_url = json.loads(player_url_json)
if 'sig' in url_data:
url += '&signature=' + url_data['sig'][0]
elif 's' in url_data:
encrypted_sig = url_data['s'][0]
if self._downloader.params.get('verbose'):
player_url = json.loads(jsplayer_url_json)
if player_url is None:
player_version = 'unknown'
player_desc = 'unknown'
else:
if player_url.endswith('swf'):
player_version = self._search_regex(
r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
'flash player', fatal=False)
player_desc = 'flash player %s' % player_version
player_url_json = self._search_regex(
r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
video_webpage, 'age gate player URL')
player_url = json.loads(player_url_json)
if 'sig' in url_data:
url += '&signature=' + url_data['sig'][0]
elif 's' in url_data:
encrypted_sig = url_data['s'][0]
if self._downloader.params.get('verbose'):
if player_url is None:
player_version = 'unknown'
player_desc = 'unknown'
else:
player_version = self._search_regex(
[r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'],
player_url,
'html5 player', fatal=False)
player_desc = 'html5 player %s' % player_version
if player_url.endswith('swf'):
player_version = self._search_regex(
r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
'flash player', fatal=False)
player_desc = 'flash player %s' % player_version
else:
player_version = self._search_regex(
[r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'],
player_url,
'html5 player', fatal=False)
player_desc = 'html5 player %s' % player_version
parts_sizes = self._signature_cache_id(encrypted_sig)
self.to_screen('{%s} signature length %s, %s' %
(format_id, parts_sizes, player_desc))
parts_sizes = self._signature_cache_id(encrypted_sig)
self.to_screen('{%s} signature length %s, %s' %
(format_id, parts_sizes, player_desc))
signature = self._decrypt_signature(
encrypted_sig, video_id, player_url, age_gate)
sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
url += '&%s=%s' % (sp, signature)
signature = self._decrypt_signature(
encrypted_sig, video_id, player_url, age_gate)
sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
url += '&%s=%s' % (sp, signature)
if 'ratebypass' not in url:
url += '&ratebypass=yes'
@ -2016,24 +2060,33 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
if width is None:
width = int_or_none(fmt.get('width'))
if height is None:
height = int_or_none(fmt.get('height'))
filesize = int_or_none(url_data.get(
'clen', [None])[0]) or _extract_filesize(url)
quality = url_data.get('quality', [None])[0]
quality = url_data.get('quality', [None])[0] or fmt.get('quality')
quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
more_fields = {
'filesize': filesize,
'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
'tbr': tbr,
'width': width,
'height': height,
'fps': int_or_none(url_data.get('fps', [None])[0]),
'format_note': url_data.get('quality_label', [None])[0] or quality,
'quality': q(quality),
'fps': fps,
'format_note': quality_label or quality,
}
for key, value in more_fields.items():
if value:
dct[key] = value
type_ = url_data.get('type', [None])[0]
type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
if type_:
type_split = type_.split(';')
kind_ext = type_split[0].split('/')
@ -2081,9 +2134,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
formats.append(a_format)
else:
error_message = clean_html(video_info.get('reason', [None])[0])
error_message = extract_unavailable_message()
if not error_message:
error_message = extract_unavailable_message()
error_message = clean_html(try_get(
player_response, lambda x: x['playabilityStatus']['reason'],
compat_str))
if not error_message:
error_message = clean_html(
try_get(video_info, lambda x: x['reason'][0], compat_str))
if error_message:
raise ExtractorError(error_message, expected=True)
raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
@ -2254,7 +2312,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# annotations
video_annotations = None
if self._downloader.params.get('writeannotations', False):
video_annotations = self._extract_annotations(video_id)
xsrf_token = self._search_regex(
r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
invideo_url = try_get(
player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
if xsrf_token and invideo_url:
xsrf_field_name = self._search_regex(
r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
video_webpage, 'xsrf field name',
group='xsrf_field_name', default='session_token')
video_annotations = self._download_webpage(
self._proto_relative_url(invideo_url),
video_id, note='Downloading annotations',
errnote='Unable to download video annotations', fatal=False,
data=urlencode_postdata({xsrf_field_name: xsrf_token}))
chapters = self._extract_chapters(description_original, video_duration)
@ -2412,7 +2484,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
(%(playlist_id)s)
)""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
_VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
_VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
IE_NAME = 'youtube:playlist'
_TESTS = [{
'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
@ -2435,6 +2508,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
'info_dict': {
'title': '29C3: Not my department',
'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
'uploader': 'Christiaan008',
'uploader_id': 'ChRiStIaAn008',
},
'playlist_count': 95,
}, {
@ -2443,6 +2518,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
'info_dict': {
'title': '[OLD]Team Fortress 2 (Class-based LP)',
'id': 'PLBB231211A4F62143',
'uploader': 'Wickydoo',
'uploader_id': 'Wickydoo',
},
'playlist_mincount': 26,
}, {
@ -2451,6 +2528,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
'info_dict': {
'title': 'Uploads from Cauchemar',
'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
'uploader': 'Cauchemar',
'uploader_id': 'Cauchemar89',
},
'playlist_mincount': 799,
}, {
@ -2468,13 +2547,17 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
'info_dict': {
'title': 'JODA15',
'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
'uploader': 'milan',
'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
}
}, {
'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
'playlist_mincount': 485,
'info_dict': {
'title': '2017 華語最新單曲 (2/24更新)',
'title': '2018 Chinese New Singles (11/6 updated)',
'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
'uploader': 'LBK',
'uploader_id': 'sdragonfang',
}
}, {
'note': 'Embedded SWF player',
@ -2483,13 +2566,16 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
'info_dict': {
'title': 'JODA7',
'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
}
},
'skip': 'This playlist does not exist',
}, {
'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
'info_dict': {
'title': 'Uploads from Interstellar Movie',
'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
'uploader': 'Interstellar Movie',
'uploader_id': 'InterstellarMovie1',
},
'playlist_mincount': 21,
}, {
@ -2514,6 +2600,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
'params': {
'skip_download': True,
},
'skip': 'This video is not available.',
'add_ie': [YoutubeIE.ie_key()],
}, {
'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
@ -2525,7 +2612,6 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
'uploader_id': 'backuspagemuseum',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
'upload_date': '20161008',
'license': 'Standard YouTube License',
'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
'categories': ['Nonprofits & Activism'],
'tags': list,
@ -2536,6 +2622,16 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
'noplaylist': True,
'skip_download': True,
},
}, {
# https://github.com/ytdl-org/youtube-dl/issues/21844
'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
'info_dict': {
'title': 'Data Analysis with Dr Mike Pound',
'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
'uploader_id': 'Computerphile',
'uploader': 'Computerphile',
},
'playlist_mincount': 11,
}, {
'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
'only_matching': True,
@ -2554,6 +2650,34 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
def _real_initialize(self):
self._login()
def extract_videos_from_page(self, page):
ids_in_page = []
titles_in_page = []
for item in re.findall(
r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page):
attrs = extract_attributes(item)
video_id = attrs['data-video-id']
video_title = unescapeHTML(attrs.get('data-title'))
if video_title:
video_title = video_title.strip()
ids_in_page.append(video_id)
titles_in_page.append(video_title)
# Fallback with old _VIDEO_RE
self.extract_videos_from_page_impl(
self._VIDEO_RE, page, ids_in_page, titles_in_page)
# Relaxed fallbacks
self.extract_videos_from_page_impl(
r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page,
ids_in_page, titles_in_page)
self.extract_videos_from_page_impl(
r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page,
ids_in_page, titles_in_page)
return zip(ids_in_page, titles_in_page)
def _extract_mix(self, playlist_id):
# The mixes are generated from a single video
# the id of the playlist is just 'RD' + video_id
@ -2702,6 +2826,8 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
'info_dict': {
'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
'title': 'Uploads from lex will',
'uploader': 'lex will',
'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
}
}, {
'note': 'Age restricted channel',
@ -2711,6 +2837,8 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
'info_dict': {
'id': 'UUs0ifCMCm1icqRbqhUINa0w',
'title': 'Uploads from Deus Ex',
'uploader': 'Deus Ex',
'uploader_id': 'DeusExOfficial',
},
}, {
'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
@ -2795,6 +2923,8 @@ class YoutubeUserIE(YoutubeChannelIE):
'info_dict': {
'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
'title': 'Uploads from The Linux Foundation',
'uploader': 'The Linux Foundation',
'uploader_id': 'TheLinuxFoundation',
}
}, {
# Only available via https://www.youtube.com/c/12minuteathlete/videos
@ -2804,6 +2934,8 @@ class YoutubeUserIE(YoutubeChannelIE):
'info_dict': {
'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
'title': 'Uploads from 12 Minute Athlete',
'uploader': '12 Minute Athlete',
'uploader_id': 'the12minuteathlete',
}
}, {
'url': 'ytuser:phihag',
@ -2897,7 +3029,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
'playlist_mincount': 4,
'info_dict': {
'id': 'ThirstForScience',
'title': 'Thirst for Science',
'title': 'ThirstForScience',
},
}, {
# with "Load more" button
@ -2914,6 +3046,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
'title': 'Chem Player',
},
'skip': 'Blocked',
}]