Extraction: Rewrite comment extraction, remove author_id and rename author_channel_id to that, fix bug in extract_items
author_id (an internal SQL-like integer previously required for deleting and editing comments) has been removed by YouTube and is no longer required. Remove it for simplicity. Rename author_channel_id to author_id for consistency with other extraction attributes. extract_items returned None instead of [] for items in empty continuation responses; this commit fixes that.
This commit is contained in:
parent
02848a1a32
commit
beb0976b5b
@ -93,11 +93,10 @@ def post_process_comments_info(comments_info):
|
|||||||
|
|
||||||
comment['permalink'] = util.URL_ORIGIN + '/watch?v=' + comments_info['video_id'] + '&lc=' + comment['id']
|
comment['permalink'] = util.URL_ORIGIN + '/watch?v=' + comments_info['video_id'] + '&lc=' + comment['id']
|
||||||
|
|
||||||
if comment['author_channel_id'] in accounts.accounts:
|
if comment['author_id'] in accounts.accounts:
|
||||||
comment['delete_url'] = (util.URL_ORIGIN + '/delete_comment?video_id='
|
comment['delete_url'] = (util.URL_ORIGIN + '/delete_comment?video_id='
|
||||||
+ comments_info['video_id']
|
+ comments_info['video_id']
|
||||||
+ '&channel_id='+ comment['author_channel_id']
|
+ '&channel_id='+ comment['author_id']
|
||||||
+ '&author_id=' + comment['author_id']
|
|
||||||
+ '&comment_id=' + comment['id'])
|
+ '&comment_id=' + comment['id'])
|
||||||
|
|
||||||
reply_count = comment['reply_count']
|
reply_count = comment['reply_count']
|
||||||
@ -135,7 +134,7 @@ def post_process_comments_info(comments_info):
|
|||||||
|
|
||||||
def video_comments(video_id, sort=0, offset=0, lc='', secret_key=''):
|
def video_comments(video_id, sort=0, offset=0, lc='', secret_key=''):
|
||||||
if settings.comments_mode:
|
if settings.comments_mode:
|
||||||
comments_info = yt_data_extract.parse_comments_polymer(request_comments(make_comment_ctoken(video_id, sort, offset, lc, secret_key)))
|
comments_info = yt_data_extract.extract_comments_info(request_comments(make_comment_ctoken(video_id, sort, offset, lc, secret_key)))
|
||||||
post_process_comments_info(comments_info)
|
post_process_comments_info(comments_info)
|
||||||
|
|
||||||
post_comment_url = util.URL_ORIGIN + "/post_comment?video_id=" + video_id
|
post_comment_url = util.URL_ORIGIN + "/post_comment?video_id=" + video_id
|
||||||
@ -160,7 +159,7 @@ def get_comments_page():
|
|||||||
ctoken = comment_replies_ctoken(video_id, parent_id)
|
ctoken = comment_replies_ctoken(video_id, parent_id)
|
||||||
replies = True
|
replies = True
|
||||||
|
|
||||||
comments_info = yt_data_extract.parse_comments_polymer(request_comments(ctoken, replies))
|
comments_info = yt_data_extract.extract_comments_info(request_comments(ctoken, replies))
|
||||||
post_process_comments_info(comments_info)
|
post_process_comments_info(comments_info)
|
||||||
|
|
||||||
if not replies:
|
if not replies:
|
||||||
|
@ -70,7 +70,7 @@ def _post_comment_reply(text, video_id, parent_comment_id, session_token, cookie
|
|||||||
print("Comment posting code: " + code)
|
print("Comment posting code: " + code)
|
||||||
return code
|
return code
|
||||||
|
|
||||||
def _delete_comment(video_id, comment_id, author_id, session_token, cookiejar):
|
def _delete_comment(video_id, comment_id, session_token, cookiejar):
|
||||||
headers = {
|
headers = {
|
||||||
'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
|
'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
|
||||||
'Accept': '*/*',
|
'Accept': '*/*',
|
||||||
@ -79,7 +79,7 @@ def _delete_comment(video_id, comment_id, author_id, session_token, cookiejar):
|
|||||||
'X-YouTube-Client-Version': '2.20180823',
|
'X-YouTube-Client-Version': '2.20180823',
|
||||||
'Content-Type': 'application/x-www-form-urlencoded',
|
'Content-Type': 'application/x-www-form-urlencoded',
|
||||||
}
|
}
|
||||||
action = proto.uint(1,6) + proto.string(3, comment_id) + proto.string(5, video_id) + proto.string(9, author_id)
|
action = proto.uint(1,6) + proto.string(3, comment_id) + proto.string(5, video_id)
|
||||||
action = proto.percent_b64encode(action).decode('ascii')
|
action = proto.percent_b64encode(action).decode('ascii')
|
||||||
|
|
||||||
sej = json.dumps({"clickTrackingParams":"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=","commandMetadata":{"webCommandMetadata":{"url":"/service_ajax","sendPost":True}},"performCommentActionEndpoint":{"action":action}})
|
sej = json.dumps({"clickTrackingParams":"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=","commandMetadata":{"webCommandMetadata":{"url":"/service_ajax","sendPost":True}},"performCommentActionEndpoint":{"action":action}})
|
||||||
@ -115,7 +115,7 @@ def delete_comment():
|
|||||||
cookiejar = accounts.account_cookiejar(request.values['channel_id'])
|
cookiejar = accounts.account_cookiejar(request.values['channel_id'])
|
||||||
token = get_session_token(video_id, cookiejar)
|
token = get_session_token(video_id, cookiejar)
|
||||||
|
|
||||||
code = _delete_comment(video_id, request.values['comment_id'], request.values['author_id'], token, cookiejar)
|
code = _delete_comment(video_id, request.values['comment_id'], token, cookiejar)
|
||||||
|
|
||||||
if code == "SUCCESS":
|
if code == "SUCCESS":
|
||||||
return flask.redirect(util.URL_ORIGIN + '/comment_delete_success', 303)
|
return flask.redirect(util.URL_ORIGIN + '/comment_delete_success', 303)
|
||||||
@ -147,7 +147,7 @@ def post_comment():
|
|||||||
|
|
||||||
@yt_app.route('/delete_comment', methods=['GET'])
def get_delete_comment_page():
    '''Render the confirmation page for deleting a comment.

    Forwards the video_id, channel_id and comment_id query parameters
    into the template so the subsequent POST can carry them through.
    Raises a 400 (werkzeug BadRequestKeyError) if any is missing.
    '''
    required_params = ('video_id', 'channel_id', 'comment_id')
    parameters = [(name, request.args[name]) for name in required_params]
    return flask.render_template('delete_comment.html', parameters = parameters)
|
||||||
|
|
||||||
|
|
||||||
|
@ -259,20 +259,20 @@ def extract_formatted_text(node):
|
|||||||
return [{'text': node['simpleText']}]
|
return [{'text': node['simpleText']}]
|
||||||
return []
|
return []
|
||||||
|
|
||||||
def extract_int(string, default=None):
    '''Return the first run of digits in `string` as an int.

    Accepts an int (returned unchanged), a str, or any text node that
    extract_str can flatten. Commas are stripped first, so "1,234"
    parses as 1234. Returns `default` when no integer can be recovered.
    '''
    if isinstance(string, int):
        return string
    text = string if isinstance(string, str) else extract_str(string)
    if not text:
        return default
    digits = re.search(r'(\d+)', text.replace(',', ''))
    if not digits:
        return default
    try:
        return int(digits.group(1))
    except ValueError:
        return default
|
||||||
|
|
||||||
def extract_approx_int(string):
|
def extract_approx_int(string):
|
||||||
'''e.g. "15M" from "15M subscribers"'''
|
'''e.g. "15M" from "15M subscribers"'''
|
||||||
@ -514,7 +514,7 @@ def extract_items(response, item_types=item_types):
|
|||||||
# always has just the one [something]Continuation key, but do this just in case they add some tracking key or something
|
# always has just the one [something]Continuation key, but do this just in case they add some tracking key or something
|
||||||
for key, renderer_continuation in get(response, 'continuationContents', {}, types=dict).items():
|
for key, renderer_continuation in get(response, 'continuationContents', {}, types=dict).items():
|
||||||
if key.endswith('Continuation'): # e.g. commentSectionContinuation, playlistVideoListContinuation
|
if key.endswith('Continuation'): # e.g. commentSectionContinuation, playlistVideoListContinuation
|
||||||
items = multi_deep_get(renderer_continuation, ['contents'], ['items'], default=None, types=(list, tuple))
|
items = multi_deep_get(renderer_continuation, ['contents'], ['items'], default=[], types=(list, tuple))
|
||||||
ctoken = deep_get(renderer_continuation, 'continuations', 0, 'nextContinuationData', 'continuation', default=None, types=str)
|
ctoken = deep_get(renderer_continuation, 'continuations', 0, 'nextContinuationData', 'continuation', default=None, types=str)
|
||||||
return items, ctoken
|
return items, ctoken
|
||||||
return [], None
|
return [], None
|
||||||
@ -772,78 +772,66 @@ def ctoken_metadata(ctoken):
|
|||||||
result['sort'] = 0
|
result['sort'] = 0
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def extract_comments_info(polymer_json):
    '''Build a normalized comments dict from a polymer comments response.

    Returns {'error': message} when the response itself is bad; otherwise a
    dict with 'error': None, request metadata (video_id, offset, is_replies,
    sort — recovered from the ctoken in the request url when present),
    'video_title', the next-page 'ctoken', and a list of per-comment dicts
    under 'comments'.
    '''
    response, err = extract_response(polymer_json)
    if err:
        return {'error': err}

    info = {'error': None}

    # The request url (when present) embeds the ctoken we were called with,
    # which encodes which video / sort / offset this page belongs to.
    url = multi_deep_get(polymer_json, [1, 'url'], ['url'])
    if url:
        ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0]
        metadata = ctoken_metadata(ctoken)
    else:
        metadata = {}
    info['video_id'] = metadata.get('video_id')
    info['offset'] = metadata.get('offset')
    info['is_replies'] = metadata.get('is_replies')
    info['sort'] = metadata.get('sort')
    info['video_title'] = None

    items, next_ctoken = extract_items(response)
    info['comments'] = []
    info['ctoken'] = next_ctoken
    for item in items:
        current = {}

        if 'commentThreadRenderer' in item:     # top level comments
            conservative_update(info, 'is_replies', False)
            thread = item['commentThreadRenderer']
            info['video_title'] = extract_str(thread.get('commentTargetTitle'))
            if 'replies' in thread:
                # With 1 reply, the text reads "View reply"
                current['reply_count'] = extract_int(deep_get(thread,
                    'replies', 'commentRepliesRenderer', 'moreText'
                ), default=1)
            else:
                current['reply_count'] = 0
            renderer = deep_get(thread, 'comment', 'commentRenderer',
                                default={})
        elif 'commentRenderer' in item:         # replies
            # replyCount, read below, is not present for replies even when
            # the reply has further replies to it
            current['reply_count'] = 0
            conservative_update(info, 'is_replies', True)
            renderer = item['commentRenderer']
        else:
            renderer = {}

        # These 3 are sometimes absent, likely because the channel was deleted
        current['author'] = extract_str(renderer.get('authorText'))
        current['author_url'] = deep_get(renderer,
            'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url')
        current['author_id'] = deep_get(renderer,
            'authorEndpoint', 'browseEndpoint', 'browseId')

        current['author_avatar'] = deep_get(renderer,
            'authorThumbnail', 'thumbnails', 0, 'url')
        current['id'] = renderer.get('commentId')
        current['text'] = extract_formatted_text(renderer.get('contentText'))
        current['time_published'] = extract_str(
            renderer.get('publishedTimeText'))
        current['like_count'] = renderer.get('likeCount')
        liberal_update(current, 'reply_count', renderer.get('replyCount'))

        info['comments'].append(current)

    return info
||||||
except Exception as e:
|
|
||||||
print('Error parsing comments: ' + str(e))
|
|
||||||
comments = ()
|
|
||||||
ctoken = ''
|
|
||||||
|
|
||||||
return {
|
|
||||||
'ctoken': ctoken,
|
|
||||||
'comments': comments,
|
|
||||||
'video_title': video_title,
|
|
||||||
'video_id': metadata['video_id'],
|
|
||||||
'offset': metadata['offset'],
|
|
||||||
'is_replies': metadata['is_replies'],
|
|
||||||
'sort': metadata['sort'],
|
|
||||||
}
|
|
||||||
|
|
||||||
def check_missing_keys(object, *key_sequences):
|
def check_missing_keys(object, *key_sequences):
|
||||||
for key_sequence in key_sequences:
|
for key_sequence in key_sequences:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user