Switch to new comments API now that the old one is being disabled

The watch_comment API periodically gives the error "Top level
comments mweb servlet is turned down."

The continuation items for the new API are arranged differently
in the JSON, so changes to the extract_items function were
necessary.

Signed-off-by: Jesús <heckyel@hyperbola.info>
James Taylor 2021-08-07 17:05:58 -07:00 committed by Jesús
parent bee14ea9ea
commit 3dee7ea0d1
4 changed files with 66 additions and 31 deletions
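
To illustrate the commit message's point about the new arrangement: a minimal sketch of where the continuation items sit in a youtubei/v1/next response, using the key names from the extract_items change below. The renderer payloads and the surrounding structure are simplified placeholders, not real API output.

    new_style_response = {
        'onResponseReceivedEndpoints': [
            {'appendContinuationItemsAction': {
                'continuationItems': [
                    {'commentThreadRenderer': {}},     # one comment thread
                    {'continuationItemRenderer': {}},  # token for the next page
                ],
            }},
        ],
    }

    # Roughly the walk the new extract_items branch performs with deep_get:
    items = []
    for endpoint in new_style_response.get('onResponseReceivedEndpoints', []):
        items += (endpoint.get('appendContinuationItemsAction', {})
                  .get('continuationItems', []))
    print(len(items))  # 2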

@@ -47,25 +47,23 @@ def make_comment_ctoken(video_id, sort=0, offset=0, lc='', secret_key=''):
     return base64.urlsafe_b64encode(result).decode('ascii')
-mobile_headers = {
-    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
-    'Accept': '*/*',
-    'Accept-Language': 'en-US,en;q=0.5',
-    'X-YouTube-Client-Name': '2',
-    'X-YouTube-Client-Version': '2.20180823',
-}
 def request_comments(ctoken, replies=False):
-    base_url = 'https://m.youtube.com/watch_comment?'
-    if replies:
-        base_url += 'action_get_comment_replies=1&ctoken='
-    else:
-        base_url += 'action_get_comments=1&ctoken='
-    url = base_url + ctoken.replace("=", "%3D") + "&pbj=1"
+    url = 'https://m.youtube.com/youtubei/v1/next'
+    url += '?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'
+    data = json.dumps({
+        'context': {
+            'client': {
+                'hl': 'en',
+                'gl': 'US',
+                'clientName': 'MWEB',
+                'clientVersion': '2.20210804.02.00',
+            },
+        },
+        'continuation': ctoken.replace('=', '%3D'),
+    })
     content = util.fetch_url(
-        url, headers=mobile_headers,
+        url, headers=util.mobile_xhr_headers + util.json_header, data=data,
         report_text='Retrieved comments', debug_name='request_comments')
     content = content.decode('utf-8')
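
A minimal standalone sketch of the request the new request_comments assembles, using urllib.request instead of the project's util.fetch_url. The URL, API key, and JSON payload come from the hunk above; the ctoken and the header values here are illustrative placeholders.

    import json
    import urllib.request

    ctoken = 'EXAMPLE_CONTINUATION_TOKEN'  # placeholder, not a real token
    url = ('https://m.youtube.com/youtubei/v1/next'
           '?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8')
    data = json.dumps({
        'context': {
            'client': {
                'hl': 'en',
                'gl': 'US',
                'clientName': 'MWEB',
                'clientVersion': '2.20210804.02.00',
            },
        },
        'continuation': ctoken.replace('=', '%3D'),
    }).encode('utf-8')

    req = urllib.request.Request(url, data=data, headers={
        'Content-Type': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Linux; Android 7.0)',  # placeholder mobile UA
        'X-YouTube-Client-Name': '2',
        'X-YouTube-Client-Version': '2.20180830',
    })
    # response = json.loads(urllib.request.urlopen(req).read().decode('utf-8'))
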
@@ -178,10 +176,9 @@ def video_comments(video_id, sort=0, offset=0, lc='', secret_key=''):
             ('Direct link', this_sort_url)
         ]
+        ctoken = make_comment_ctoken(video_id, sort, offset, lc)
         comments_info.update(yt_data_extract.extract_comments_info(
-            request_comments(
-                make_comment_ctoken(video_id, sort, offset, lc, secret_key)
-            )
+            request_comments(ctoken), ctoken=ctoken
         ))
         post_process_comments_info(comments_info)
@@ -212,7 +209,9 @@ def get_comments_page():
     ctoken = request.args.get('ctoken', '')
     replies = request.args.get('replies', '0') == '1'
-    comments_info = yt_data_extract.extract_comments_info(request_comments(ctoken, replies))
+    comments_info = yt_data_extract.extract_comments_info(
+        request_comments(ctoken, replies), ctoken=ctoken
+    )
     post_process_comments_info(comments_info)
     if not replies:

@@ -387,6 +387,19 @@ mobile_user_agent = 'Mozilla/5.0 (Linux; Android 7.0; Redmi Note 4 Build/NRD90M)
 mobile_ua = (('User-Agent', mobile_user_agent),)
 desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0'
 desktop_ua = (('User-Agent', desktop_user_agent),)
+json_header = (('Content-Type', 'application/json'),)
+desktop_xhr_headers = (
+    ('Accept', '*/*'),
+    ('Accept-Language', 'en-US,en;q=0.5'),
+    ('X-YouTube-Client-Name', '1'),
+    ('X-YouTube-Client-Version', '2.20180830'),
+) + desktop_ua
+mobile_xhr_headers = (
+    ('Accept', '*/*'),
+    ('Accept-Language', 'en-US,en;q=0.5'),
+    ('X-YouTube-Client-Name', '2'),
+    ('X-YouTube-Client-Version', '2.20180830'),
+) + mobile_ua
 class RateLimitedQueue(gevent.queue.Queue):
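
These header constants are tuples of (name, value) pairs rather than dicts, so call sites can concatenate them with +. A small illustrative sketch (not the project's fetch_url logic) of how the combination used by request_comments collapses into an ordinary header dict; the user agent string is a truncated placeholder.

    mobile_user_agent = 'Mozilla/5.0 (Linux; Android 7.0)'  # placeholder
    mobile_ua = (('User-Agent', mobile_user_agent),)
    json_header = (('Content-Type', 'application/json'),)
    mobile_xhr_headers = (
        ('Accept', '*/*'),
        ('Accept-Language', 'en-US,en;q=0.5'),
        ('X-YouTube-Client-Name', '2'),
        ('X-YouTube-Client-Version', '2.20180830'),
    ) + mobile_ua

    combined = dict(mobile_xhr_headers + json_header)
    # {'Accept': '*/*', ..., 'User-Agent': ..., 'Content-Type': 'application/json'}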

@@ -478,6 +478,22 @@ def extract_items_from_renderer(renderer, item_types=_item_types):
             renderer = None
+def extract_items_from_renderer_list(renderers, item_types=_item_types):
+    '''Same as extract_items_from_renderer, but provide a list of renderers'''
+    items = []
+    ctoken = None
+    for renderer in renderers:
+        new_items, new_ctoken = extract_items_from_renderer(
+            renderer,
+            item_types=item_types)
+        items += new_items
+        # prioritize ctoken associated with items
+        if (not ctoken) or (new_ctoken and new_items):
+            ctoken = new_ctoken
+    return items, ctoken
 def extract_items(response, item_types=_item_types,
                   search_engagement_panels=False):
     '''return items, ctoken'''
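
A standalone restatement (not the project's code) of the ctoken selection rule used by extract_items_from_renderer_list above: keep the first ctoken seen, but let a later one win when it arrives together with actual items.

    def pick_ctoken(results):
        """results: list of (items, ctoken) pairs, one per renderer."""
        chosen = None
        for items, ctoken in results:
            # mirrors: if (not ctoken) or (new_ctoken and new_items)
            if (not chosen) or (ctoken and items):
                chosen = ctoken
        return chosen

    # A ctoken that came with items beats an earlier one that came with none.
    assert pick_ctoken([([], 'tokenA'), (['item'], 'tokenB')]) == 'tokenB'
    # But an items-less ctoken never displaces one already chosen.
    assert pick_ctoken([(['item'], 'tokenA'), ([], 'tokenB')]) == 'tokenA'
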
@@ -495,6 +511,15 @@ def extract_items(response, item_types=_item_types,
                 item_types=item_types)
             if items:
                 break
+    elif 'onResponseReceivedEndpoints' in response:
+        for endpoint in response.get('onResponseReceivedEndpoints', []):
+            items, ctoken = extract_items_from_renderer_list(
+                deep_get(endpoint, 'appendContinuationItemsAction',
+                         'continuationItems', default=[]),
+                item_types=item_types,
+            )
+            if items:
+                break
     elif 'contents' in response:
         renderer = get(response, 'contents', {})
         items, ctoken = extract_items_from_renderer(
@@ -502,11 +527,11 @@ def extract_items(response, item_types=_item_types,
             item_types=item_types)
     if search_engagement_panels and 'engagementPanels' in response:
-        for engagement_renderer in response['engagementPanels']:
-            additional_items, cont = extract_items_from_renderer(
-                engagement_renderer,
-                item_types=item_types)
-            items += additional_items
-            if cont and not ctoken:
-                ctoken = cont
+        new_items, new_ctoken = extract_items_from_renderer_list(
+            response['engagementPanels'], item_types=item_types
+        )
+        items += new_items
+        if (not ctoken) or (new_ctoken and new_items):
+            ctoken = new_ctoken
     return items, ctoken

@@ -222,15 +222,13 @@ def _ctoken_metadata(ctoken):
         result['sort'] = 0
     return result
-def extract_comments_info(polymer_json):
+def extract_comments_info(polymer_json, ctoken=None):
     response, err = extract_response(polymer_json)
     if err:
         return {'error': err}
     info = {'error': None}
-    url = multi_deep_get(polymer_json, [1, 'url'], ['url'])
-    if url:
-        ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0]
+    if ctoken:
         metadata = _ctoken_metadata(ctoken)
     else:
         metadata = {}
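
For comparison, an illustrative sketch of what the removed lines did: recover the ctoken from a request URL embedded in the old-style response. The new youtubei responses apparently carry no such url field, which is why the ctoken is now passed in explicitly; the URL below is made up.

    import urllib.parse

    # Old style: the pbj response echoed the request URL, and the ctoken was
    # pulled back out of its query string.
    url = 'https://m.youtube.com/watch_comment?action_get_comments=1&ctoken=EXAMPLE%3D&pbj=1'
    ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0]
    print(ctoken)  # EXAMPLE=

    # New style: the caller already knows which ctoken it sent, so it simply
    # passes it in: extract_comments_info(polymer_json, ctoken=ctoken)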