yt-dl: update YouTube extractor file to fix JS player signature-function extraction issues

2018-11-09 00:58:46 -08:00
parent f6778f8c3d
commit 630b476e42
2 changed files with 3015 additions and 16 deletions
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -67,7 +67,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

-    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,}'
+    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    def _set_language(self):
        self._set_cookie(
@@ -262,7 +262,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
        return True

    def _download_webpage_handle(self, *args, **kwargs):
-        kwargs.setdefault('query', {})['disable_polymer'] = 'true'
+        query = kwargs.get('query', {}).copy()
+        query['disable_polymer'] = 'true'
+        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

@@ -350,6 +352,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                            (?:www\.)?hooktube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
+                            (?:www\.)?invidio\.us/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
@@ -493,12 +496,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader': 'Philipp Hagemeister',
                'uploader_id': 'phihag',
                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
+                'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
+                'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
                'upload_date': '20121002',
                'license': 'Standard YouTube License',
                'description': 'test chars:  "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
                'categories': ['Science & Technology'],
                'tags': ['youtube-dl'],
                'duration': 10,
+                'view_count': int,
                'like_count': int,
                'dislike_count': int,
                'start_time': 1,
@@ -581,6 +587,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'categories': ['Science & Technology'],
                'tags': ['youtube-dl'],
                'duration': 10,
+                'view_count': int,
                'like_count': int,
                'dislike_count': int,
            },
@@ -1067,6 +1074,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
            'only_matching': True,
        },
+        {
+            'url': 'https://invidio.us/watch?v=BaW_jenozKc',
+            'only_matching': True,
+        },
    ]

    def __init__(self, *args, **kwargs):
@@ -1183,7 +1194,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            (r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
-             r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\('),
+             r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')

        jsi = JSInterpreter(jscode)
@@ -1536,6 +1548,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):

        player_response = {}

+
+
        # Is it unlisted?
        unlisted = (self._search_regex('''<meta itemprop="unlisted" content="(\w*)">''', video_webpage, 'is_unlisted', default='False') == "True")

@@ -1590,6 +1604,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                        current_song[title.lower()] = value
                    last_index = match.end()

+
+
+
        # Get video info
        embed_webpage = None
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
@@ -1605,7 +1622,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                    r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
            })
            video_info_url = proto + '://www.youtube.com/get_video_info?' + data
-            
            video_info_webpage = self._download_webpage(
                video_info_url, video_id,
                note='Refetching age-gated info webpage',
@@ -1697,7 +1713,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):

        if 'token' not in video_info:
            if 'reason' in video_info:
-                print(video_info['reason'])
                if 'The uploader has not made this video available in your country.' in video_info['reason']:
                    regions_allowed = self._html_search_meta(
                        'regionsAllowed', video_webpage, default=None)
@@ -1709,7 +1724,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                    unavailable_message = extract_unavailable_message()
                    if unavailable_message:
                        reason = unavailable_message
-                raise YoutubeError(
+                raise ExtractorError(
                    'YouTube said: %s' % reason,
                    expected=True, video_id=video_id)
            else:
@@ -1961,13 +1976,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
                formats.append(a_format)
        else:
-            error_message = extract_unavailable_message()
-            alt_error_message = clean_html(video_info.get('reason', [None])[0])
-            print(alt_error_message)
+            error_message = clean_html(video_info.get('reason', [None])[0])
            if not error_message:
-                error_message = alt_error_message
+                error_message = extract_unavailable_message()
            if error_message:
-                raise YoutubeError(error_message)
+                raise ExtractorError(error_message, expected=True)
            raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # uploader
@@ -1991,6 +2004,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        else:
            self._downloader.report_warning('unable to extract uploader nickname')

+        channel_id = self._html_search_meta(
+            'channelId', video_webpage, 'channel id')
+        channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
+
        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
@@ -2088,8 +2105,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'view count', default=None))

        # subtitles
-        video_subtitles = self._get_subtitles(video_id, video_webpage)
-        automatic_captions = self._get_automatic_captions(video_id, video_webpage)
+        video_subtitles = self.extract_subtitles(video_id, video_webpage)
+        automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

        video_duration = try_get(
            video_info, lambda x: int_or_none(x['length_seconds'][0]))
@@ -2169,6 +2186,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'uploader_url': video_uploader_url,
+            'channel_id': channel_id,
+            'channel_url': channel_url,
            'upload_date': upload_date,
            'license': video_license,
            'creator': video_creator or artist,
@@ -2210,7 +2229,11 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
-                            youtube\.com/
+                            (?:
+                                youtube\.com|
+                                invidio\.us
+                            )
+                            /
                            (?:
                               (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
                               \? (?:.*?[&;])*? (?:p|a|list)=
@@ -2219,7 +2242,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
                            youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
                        )
                        (
-                            (?:PL|LL|EC|UU|FL|RD|UL|TL)?[0-9A-Za-z-_]{10,}
+                            (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
@@ -2323,6 +2346,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
            'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
            'categories': ['People & Blogs'],
            'tags': list,
+            'view_count': int,
            'like_count': int,
            'dislike_count': int,
        },
@@ -2357,6 +2381,13 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
+    }, {
+        # music album playlist
+        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
+        'only_matching': True,
+    }, {
+        'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
+        'only_matching': True,
    }]

    def _real_initialize(self):
@@ -2499,7 +2530,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):

 class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    IE_DESC = 'YouTube.com channels'
-    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
+    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    IE_NAME = 'youtube:channel'
@@ -2520,6 +2551,9 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
        },
+    }, {
+        'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
+        'only_matching': True,
    }]

    @classmethod
--- a/youtube_dl/extractor/youtube_unmodified_reference.py
+++ b/youtube_dl/extractor/youtube_unmodified_reference.py