Fix comment replies

Comment reply protobuf now requires the channel id of the uploader of the video. Otherwise the endpoint returns 500. Instead of making the protobuf ourselves and passing this data around through query parameters, just use the ctoken provided to us but modify the max_replies field from 10 to 250. Fixes #53 Signed-off-by: Jesús <heckyel@hyperbola.info>
2021-02-25 15:55:23 -08:00
parent f26c9be85e
commit 00ef1c8627
4 changed files with 129 additions and 51 deletions
--- a/youtube/comments.py
+++ b/youtube/comments.py
@@ -33,8 +33,8 @@ def make_comment_ctoken(video_id, sort=0, offset=0, lc='', secret_key=''):
    video_id = proto.as_bytes(video_id)
    secret_key = proto.as_bytes(secret_key)

-    page_info = proto.string(4, video_id) + proto.uint(6, sort)

+    page_info = proto.string(4,video_id) + proto.uint(6, sort)
    offset_information = proto.nested(4, page_info) + proto.uint(5, offset)
    if secret_key:
        offset_information = proto.string(1, secret_key) + offset_information
@@ -47,15 +47,6 @@ def make_comment_ctoken(video_id, sort=0, offset=0, lc='', secret_key=''):
    return base64.urlsafe_b64encode(result).decode('ascii')


-def comment_replies_ctoken(video_id, comment_id, max_results=500):
-
-    params = proto.string(2, comment_id) + proto.uint(9, max_results)
-    params = proto.nested(3, params)
-
-    result = proto.nested(2, proto.string(2, video_id)) + proto.uint(3, 6) + proto.nested(6, params)
-    return base64.urlsafe_b64encode(result).decode('ascii')
-
-
 mobile_headers = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
    'Accept': '*/*',
@@ -66,10 +57,11 @@ mobile_headers = {


 def request_comments(ctoken, replies=False):
-    if replies: # let's make it use different urls for no reason despite all the data being encoded
-        base_url = "https://m.youtube.com/watch_comment?action_get_comment_replies=1&ctoken="
+    base_url = 'https://m.youtube.com/watch_comment?'
+    if replies:
+        base_url += 'action_get_comment_replies=1&ctoken='
    else:
-        base_url = "https://m.youtube.com/watch_comment?action_get_comments=1&ctoken="
+        base_url += 'action_get_comments=1&ctoken='
    url = base_url + ctoken.replace("=", "%3D") + "&pbj=1"

    content = util.fetch_url(
@@ -99,17 +91,24 @@ def post_process_comments_info(comments_info):

        comment['permalink'] = concat_or_none(
            util.URL_ORIGIN, '/watch?v=',
-            comments_info['video_id'], '&lc=', comment['id'])
+            comments_info['video_id'],
+            '&lc=', comment['id']
+        )

        reply_count = comment['reply_count']
-
-        if reply_count == 0:
-            comment['replies_url'] = None
-        else:
-            comment['replies_url'] = concat_or_none(
-                util.URL_ORIGIN,
-                '/comments?parent_id=', comment['id'],
-                '&video_id=', comments_info['video_id'])
+        comment['replies_url'] = None
+        if comment['reply_ctoken']:
+            # change max_replies field to 250 in ctoken
+            ctoken = comment['reply_ctoken']
+            ctoken, err = proto.set_protobuf_value(
+                ctoken,
+                'base64p', 6, 3, 9, value=250)
+            if err:
+                print('Error setting ctoken value:')
+                print(err)
+                comment['replies_url'] = None
+            comment['replies_url'] = concat_or_none(util.URL_ORIGIN,
+                '/comments?replies=1&ctoken=' + ctoken)

        if reply_count == 0:
            comment['view_replies_text'] = 'Reply'
@@ -118,6 +117,7 @@ def post_process_comments_info(comments_info):
        else:
            comment['view_replies_text'] = str(reply_count) + ' replies'

+
        if comment['like_count'] == 1:
            comment['likes_text'] = '1 like'
        else:
@@ -125,10 +125,12 @@ def post_process_comments_info(comments_info):

    comments_info['include_avatars'] = settings.enable_comment_avatars
    if comments_info['ctoken']:
+        replies_param = '&replies=1' if comments_info['is_replies'] else ''
        comments_info['more_comments_url'] = concat_or_none(
            util.URL_ORIGIN,
            '/comments?ctoken=',
-            comments_info['ctoken']
+            comments_info['ctoken'],
+            replies_param
        )

    comments_info['page_number'] = page_number = str(int(comments_info['offset']/20) + 1)
@@ -137,14 +139,11 @@ def post_process_comments_info(comments_info):
        comments_info['sort_text'] = 'top' if comments_info['sort'] == 0 else 'newest'

    comments_info['video_url'] = concat_or_none(
-        util.URL_ORIGIN,
-        '/watch?v=',
-        comments_info['video_id']
-    )
-
+        util.URL_ORIGIN, '/watch?v=', comments_info['video_id'])
    comments_info['video_thumbnail'] = concat_or_none(
        settings.img_prefix, 'https://i.ytimg.com/vi/',
-        comments_info['video_id'], '/mqdefault.jpg')
+        comments_info['video_id'], '/mqdefault.jpg'
+    )


 def video_comments(video_id, sort=0, offset=0, lc='', secret_key=''):
@@ -198,17 +197,9 @@ def video_comments(video_id, sort=0, offset=0, lc='', secret_key=''):
@yt_app.route('/comments')
 def get_comments_page():
    ctoken = request.args.get('ctoken', '')
-    replies = False
-    if not ctoken:
-        video_id = request.args['video_id']
-        parent_id = request.args['parent_id']
-
-        ctoken = comment_replies_ctoken(video_id, parent_id)
-        replies = True
-
-    comments_info = yt_data_extract.extract_comments_info(
-        request_comments(ctoken, replies))
+    replies = request.args.get('replies', '0') == '1'

+    comments_info = yt_data_extract.extract_comments_info(request_comments(ctoken, replies))
    post_process_comments_info(comments_info)

    if not replies:
--- a/youtube/proto.py
+++ b/youtube/proto.py
@@ -1,6 +1,7 @@
 from math import ceil
 import base64
 import io
+import traceback


 def byte(n):
@@ -92,7 +93,6 @@ def read_group(data, end_sequence):
    data.seek(index + len(end_sequence))
    return data.original[start:index]

-
 def read_protobuf(data):
    data_original = data
    data = io.BytesIO(data)
@@ -122,12 +122,89 @@ def read_protobuf(data):
        yield (wire_type, field_number, value)


-def parse(data):
-    return {field_number: value for _, field_number, value in read_protobuf(data)}
+def parse(data, include_wire_type=False):
+    '''Decode one level of a raw protobuf message into a dict
+
+    data is the serialized protobuf structure, which must not be
+    b64-encoded; it is handed directly to read_protobuf.
+
+    Returns a dict keyed by field number. Each entry is the field's value,
+    or a [wire_type, value] list when include_wire_type is true. If a field
+    number occurs more than once, the last occurrence wins.'''
+    fields = {}
+    for wire_type, field_number, value in read_protobuf(data):
+        if include_wire_type:
+            fields[field_number] = [wire_type, value]
+        else:
+            fields[field_number] = value
+    return fields
+
+
+base64_enc_funcs = {  # encoding directive name -> encoder function
+    'base64': base64.urlsafe_b64encode,  # URL-safe alphabet, '=' padding
+    'base64s': unpadded_b64encode,  # presumably drops padding; verify helper
+    'base64p': percent_b64encode,  # presumably pads with %3D; verify helper
+}
+
+
+def _make_protobuf(data):
+    '''Serialize a nested description of a protobuf message
+
+    data may be:
+        dict: maps field_number to [wire_type, value]; the fields are
+            emitted in ascending field-number order
+        list: (wire_type, field_number, value) entries, emitted in order
+        two-item sequence whose first item is a key of base64_enc_funcs:
+            the second item is serialized, then passed through that encoder
+        str: encoded as UTF-8
+        anything else (e.g. bytes): returned unchanged
+
+    Only wire type 0 (written with uint) and wire type 2 (written with
+    string, after serializing the value recursively) are supported; any
+    other wire type raises NotImplementedError.'''
+    # must be dict mapping field_number to [wire_type, value]
+    if isinstance(data, dict):
+        data = [(wire_type, field_num, value)
+                for field_num, (wire_type, value) in sorted(data.items())]
+    if isinstance(data, str):
+        return data.encode('utf-8')
+    # The isinstance check keeps a two-field list whose first entry holds an
+    # unhashable value from raising TypeError on the dict membership test.
+    if (len(data) == 2 and isinstance(data[0], str)
+            and data[0] in base64_enc_funcs):
+        # Recurse through _make_protobuf; the name make_proto used here
+        # before is not defined anywhere and raised NameError.
+        return base64_enc_funcs[data[0]](_make_protobuf(data[1]))
+    if isinstance(data, list):
+        encoded_fields = []
+        for field in data:
+            if field[0] == 0:
+                encoded_fields.append(uint(field[1], field[2]))
+            elif field[0] == 2:
+                encoded_fields.append(
+                    string(field[1], _make_protobuf(field[2])))
+            else:
+                raise NotImplementedError('Wire type ' + str(field[0])
+                    + ' not implemented')
+        return b''.join(encoded_fields)
+    return data
+
+
+def make_protobuf(data):
+    '''Serialize data with _make_protobuf and return the result as a str
+
+    The serialized bytes are decoded as ASCII, so UnicodeDecodeError is
+    raised if they contain any non-ASCII byte.'''
+    serialized = _make_protobuf(data)
+    return serialized.decode('ascii')
+
+
+def _set_protobuf_value(data, *path, value):
+    '''Return data re-serialized with the field addressed by path replaced
+
+    One element of path is consumed per recursion level:
+        - a key of base64_enc_funcs: data is decoded with b64_to_bytes, the
+          remaining path is applied to the decoded bytes, and the result is
+          re-encoded with that encoder
+        - anything else: treated as a field number of the message parsed
+          from data (KeyError if the field is absent); the remaining path
+          is applied to that field's value and the message is rebuilt with
+          _make_protobuf
+    An empty path returns value itself.'''
+    if not path:
+        return value
+    step, remaining = path[0], path[1:]
+    if step in base64_enc_funcs:
+        encode = base64_enc_funcs[step]
+        decoded = b64_to_bytes(data)
+        return encode(_set_protobuf_value(decoded, *remaining, value=value))
+    fields = parse(data, include_wire_type=True)
+    # Each entry is [wire_type, value]; only the value slot is replaced.
+    entry = fields[step]
+    entry[1] = _set_protobuf_value(entry[1], *remaining, value=value)
+    return _make_protobuf(fields)
+
+
+def set_protobuf_value(data, *path, value):
+    '''Replace one field's value inside a raw protobuf structure
+
+    path is a sequence of field numbers and/or base64 encoding directives,
+    applied from the outermost level inwards.
+
+    The directives are
+        base64: normal base64 encoding with equal signs padding
+        base64s ("stripped"): no padding
+        base64p: %3D instead of = for padding
+
+    Returns (new_protobuf, err). On success new_protobuf is the rebuilt
+    structure decoded to an ASCII str and err is None. If anything raises,
+    new_protobuf is None and err is the formatted traceback.'''
+    try:
+        updated = _set_protobuf_value(data, *path, value=value)
+        result = updated.decode('ascii')
+    except Exception:
+        # Callers get the failure as text rather than as an exception.
+        return None, traceback.format_exc()
+    return result, None


 def b64_to_bytes(data):
    if isinstance(data, bytes):
        data = data.decode('ascii')
    data = data.replace("%3D", "=")
-    return base64.urlsafe_b64decode(data + "="*((4 - len(data)%4)%4))
+    return base64.urlsafe_b64decode(data + "="*((4 - len(data) % 4) % 4))
--- a/youtube/templates/comments.html
+++ b/youtube/templates/comments.html
@@ -23,14 +23,18 @@

            <span class="comment-likes">{{ comment['likes_text'] if comment['like_count'] else ''}}</span>
            <div class="button-row">
-                {% if settings.use_comments_js and comment['reply_count'] %}
-                    <details class="replies" data-src="{{ comment['replies_url'] }}">
-                        <summary>{{ comment['view_replies_text'] }}</summary>
-                        <a href="{{ comment['replies_url'] }}" class="replies-open-new-tab" target="_blank">Open in new tab</a>
-                        <div class="comment_page">loading..</div>
-                    </details>
-                {% elif comment['reply_count'] %}
-                    <a href="{{ comment['replies_url'] }}" class="replies">{{ comment['view_replies_text'] }}</a>
+                {% if comment['reply_count'] %}
+                    {% if settings.use_comments_js and comment['replies_url'] %}
+                        <details class="replies" src="{{ comment['replies_url'] }}">
+                            <summary>{{ comment['view_replies_text'] }}</summary>
+                            <a href="{{ comment['replies_url'] }}" class="replies-open-new-tab" target="_blank">Open in new tab</a>
+                            <div class="comment_page">loading...</div>
+                        </details>
+                    {% elif comment['replies_url'] %}
+                        <a href="{{ comment['replies_url'] }}" class="replies">{{ comment['view_replies_text'] }}</a>
+                    {% else %}
+                        <a class="replies">{{ comment['view_replies_text'] }} (error constructing url)</a>
+                    {% endif %}
                {% endif %}
            </div>
        </div>
--- a/youtube/yt_data_extract/everything_else.py
+++ b/youtube/yt_data_extract/everything_else.py
@@ -251,13 +251,19 @@ def extract_comments_info(polymer_json):
            info['video_title'] = extract_str(comment_thread.get('commentTargetTitle'))
            if 'replies' not in comment_thread:
                comment_info['reply_count'] = 0
+                comment_info['reply_ctoken'] = None
            else:
                comment_info['reply_count'] = extract_int(deep_get(comment_thread,
                    'replies', 'commentRepliesRenderer', 'moreText'
                ), default=1)   # With 1 reply, the text reads "View reply"
+                comment_info['reply_ctoken'] = deep_get(comment_thread,
+                    'replies', 'commentRepliesRenderer', 'continuations', 0,
+                    'nextContinuationData', 'continuation'
+                )
            comment_renderer = deep_get(comment_thread, 'comment', 'commentRenderer', default={})
        elif 'commentRenderer' in comment:  # replies
            comment_info['reply_count'] = 0     # replyCount, below, not present for replies even if the reply has further replies to it
+            comment_info['reply_ctoken'] = None
            conservative_update(info, 'is_replies', True)
            comment_renderer = comment['commentRenderer']
        else: