refactor common.py into 3 files

James Taylor 2019-02-21 21:32:31 -08:00
parent a61ba6b8f4
commit b32330be4f
12 changed files with 519 additions and 478 deletions
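
The split is mechanical and the rest of this diff applies it file by file: networking and URL helpers move to util, HTML rendering to html_common, and YouTube response parsing to yt_data_extract, with every call site switching to whichever module now owns the helper. A before/after sketch of the pattern (names taken from the hunks below):

    # before: one catch-all module
    from youtube import common
    content = common.fetch_url(url, common.desktop_ua)   # networking
    header = common.get_header()                         # HTML rendering
    title = common.get_plain_text(node)                  # response parsing

    # after: each helper lives with its concern
    from youtube import util, html_common, yt_data_extract
    content = util.fetch_url(url, util.desktop_ua)
    header = html_common.get_header()
    title = yt_data_extract.get_plain_text(node)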

View File

@@ -1,10 +1,10 @@
 # Contains functions having to do with logging in
+from youtube import util, html_common
+import settings
 import urllib
 import json
-from youtube import common
 import re
-import settings
 import http.cookiejar
 import io
 import os
@@ -106,7 +106,7 @@ def get_account_login_page(env, start_response):
     '''
     page = '''
-<form action="''' + common.URL_ORIGIN + '''/login" method="POST">
+<form action="''' + util.URL_ORIGIN + '''/login" method="POST">
     <div class="form-field">
         <label for="username">Username:</label>
         <input type="text" id="username" name="username">
@@ -130,10 +130,10 @@ Using Tor to log in should only be done if the account was created using a proxy
     </div>
 '''
-    return common.yt_basic_template.substitute(
+    return html_common.yt_basic_template.substitute(
         page_title = "Login",
         style = style,
-        header = common.get_header(),
+        header = html_common.get_header(),
         page = page,
     ).encode('utf-8')
@@ -229,7 +229,7 @@ def _login(username, password, cookiejar, use_tor):
    Taken from youtube-dl
    """
-    login_page = common.fetch_url(_LOGIN_URL, yt_dl_headers, report_text='Downloaded login page', cookiejar_receive=cookiejar, use_tor=use_tor).decode('utf-8')
+    login_page = util.fetch_url(_LOGIN_URL, yt_dl_headers, report_text='Downloaded login page', cookiejar_receive=cookiejar, use_tor=use_tor).decode('utf-8')
    '''with open('debug/login_page', 'w', encoding='utf-8') as f:
        f.write(login_page)'''
    #print(cookiejar.as_lwp_str())
@@ -255,7 +255,7 @@ def _login(username, password, cookiejar, use_tor):
            'Google-Accounts-XSRF': 1,
        }
        headers.update(yt_dl_headers)
-        result = common.fetch_url(url, headers, report_text=note, data=data, cookiejar_send=cookiejar, cookiejar_receive=cookiejar, use_tor=use_tor).decode('utf-8')
+        result = util.fetch_url(url, headers, report_text=note, data=data, cookiejar_send=cookiejar, cookiejar_receive=cookiejar, use_tor=use_tor).decode('utf-8')
        #print(cookiejar.as_lwp_str())
        '''with open('debug/' + note, 'w', encoding='utf-8') as f:
            f.write(result)'''
@@ -387,7 +387,7 @@ def _login(username, password, cookiejar, use_tor):
        return False

    try:
-        check_cookie_results = common.fetch_url(check_cookie_url, headers=yt_dl_headers, report_text="Checked cookie", cookiejar_send=cookiejar, cookiejar_receive=cookiejar, use_tor=use_tor).decode('utf-8')
+        check_cookie_results = util.fetch_url(check_cookie_url, headers=yt_dl_headers, report_text="Checked cookie", cookiejar_send=cookiejar, cookiejar_receive=cookiejar, use_tor=use_tor).decode('utf-8')
    except (urllib.error.URLError, compat_http_client.HTTPException, socket.error) as err:
        return False
@@ -398,7 +398,7 @@ def _login(username, password, cookiejar, use_tor):
        warn('Unable to log in')
        return False

-    select_site_page = common.fetch_url('https://m.youtube.com/select_site', headers=common.mobile_ua, report_text="Retrieved page for channel id", cookiejar_send=cookiejar, use_tor=use_tor).decode('utf-8')
+    select_site_page = util.fetch_url('https://m.youtube.com/select_site', headers=util.mobile_ua, report_text="Retrieved page for channel id", cookiejar_send=cookiejar, use_tor=use_tor).decode('utf-8')
    match = _CHANNEL_ID_RE.search(select_site_page)
    if match is None:
        warn('Failed to find channel id')
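
The fetch_url calls above exercise the asymmetric cookie jar support (documented in the fetch_url docstring, visible in the common.py hunks below): a receive-only jar captures the login page's cookies without sending any, and a send-and-receive jar round-trips the growing session. A condensed sketch of that flow, with placeholder values:

    import http.cookiejar
    from youtube import util

    cookiejar = http.cookiejar.CookieJar()
    # receive-only: response cookies are stored in the jar, none are sent
    login_page = util.fetch_url(_LOGIN_URL, yt_dl_headers,
        cookiejar_receive=cookiejar, use_tor=False).decode('utf-8')
    # send and receive: the jar accumulates session cookies across requests
    result = util.fetch_url(url, headers, data=data,
        cookiejar_send=cookiejar, cookiejar_receive=cookiejar,
        use_tor=False).decode('utf-8')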

View File

@@ -1,6 +1,6 @@
 import base64
-import youtube.common as common
-from youtube.common import default_multi_get, URL_ORIGIN, get_thumbnail_url, video_id
+from youtube import util, yt_data_extract, html_common
 import http_errors
 import urllib
 import json
@@ -91,7 +91,7 @@ def get_channel_tab(channel_id, page="1", sort=3, tab='videos', view=1):
    url = "https://www.youtube.com/browse_ajax?ctoken=" + ctoken

    print("Sending channel tab ajax request")
-    content = common.fetch_url(url, common.desktop_ua + headers_1)
+    content = util.fetch_url(url, util.desktop_ua + headers_1)
    print("Finished recieving channel tab response")

    '''with open('debug/channel_debug', 'wb') as f:
@@ -110,7 +110,7 @@ def get_number_of_videos(channel_id):
    # Sometimes retrieving playlist info fails with 403 for no discernable reason
    try:
-        response = common.fetch_url(url, common.mobile_ua + headers_pbj)
+        response = util.fetch_url(url, util.mobile_ua + headers_pbj)
    except urllib.error.HTTPError as e:
        if e.code != 403:
            raise
@@ -133,20 +133,20 @@ def get_channel_id(username):
    # method that gives the smallest possible response at ~10 kb
    # needs to be as fast as possible
    url = 'https://m.youtube.com/user/' + username + '/about?ajax=1&disable_polymer=true'
-    response = common.fetch_url(url, common.mobile_ua + headers_1).decode('utf-8')
+    response = util.fetch_url(url, util.mobile_ua + headers_1).decode('utf-8')
    return re.search(r'"channel_id":\s*"([a-zA-Z0-9_-]*)"', response).group(1)

def grid_items_html(items, additional_info={}):
    result = ''' <nav class="item-grid">\n'''
    for item in items:
-        result += common.renderer_html(item, additional_info)
+        result += html_common.renderer_html(item, additional_info)
    result += '''\n</nav>'''
    return result

def list_items_html(items, additional_info={}):
    result = ''' <nav class="item-list">'''
    for item in items:
-        result += common.renderer_html(item, additional_info)
+        result += html_common.renderer_html(item, additional_info)
    result += '''\n</nav>'''
    return result
@@ -168,11 +168,11 @@ def channel_tabs_html(channel_id, current_tab, search_box_value=''):
            )
        else:
            result += channel_tab_template.substitute(
-                href_attribute = ' href="' + URL_ORIGIN + '/channel/' + channel_id + '/' + tab_name.lower() + '"',
+                href_attribute = ' href="' + util.URL_ORIGIN + '/channel/' + channel_id + '/' + tab_name.lower() + '"',
                tab_name = tab_name,
            )
    result += channel_search_template.substitute(
-        action = URL_ORIGIN + "/channel/" + channel_id + "/search",
+        action = util.URL_ORIGIN + "/channel/" + channel_id + "/search",
        search_box_value = html.escape(search_box_value),
    )
    return result
@@ -192,7 +192,7 @@ def channel_sort_buttons_html(channel_id, tab, current_sort):
            )
        else:
            result += channel_sort_button_template.substitute(
-                href_attribute=' href="' + URL_ORIGIN + '/channel/' + channel_id + '/' + tab + '?sort=' + sort_number + '"',
+                href_attribute=' href="' + util.URL_ORIGIN + '/channel/' + channel_id + '/' + tab + '?sort=' + sort_number + '"',
                text = 'Sort by ' + sort_name
            )
    return result
@@ -246,14 +246,14 @@ def channel_videos_html(polymer_json, current_page=1, current_sort=3, number_of_
    items_html = grid_items_html(items, {'author': microformat['title']})

    return yt_channel_items_template.substitute(
-        header = common.get_header(),
+        header = html_common.get_header(),
        channel_title = microformat['title'],
        channel_tabs = channel_tabs_html(channel_id, 'Videos'),
        sort_buttons = channel_sort_buttons_html(channel_id, 'videos', current_sort),
        avatar = '/' + microformat['thumbnail']['thumbnails'][0]['url'],
        page_title = microformat['title'] + ' - Channel',
        items = items_html,
-        page_buttons = common.page_buttons_html(current_page, math.ceil(number_of_videos/30), URL_ORIGIN + "/channel/" + channel_id + "/videos", current_query_string),
+        page_buttons = html_common.page_buttons_html(current_page, math.ceil(number_of_videos/30), util.URL_ORIGIN + "/channel/" + channel_id + "/videos", current_query_string),
        number_of_results = '{:,}'.format(number_of_videos) + " videos",
    )
@@ -267,7 +267,7 @@ def channel_playlists_html(polymer_json, current_sort=3):
    items_html = grid_items_html(items, {'author': microformat['title']})

    return yt_channel_items_template.substitute(
-        header = common.get_header(),
+        header = html_common.get_header(),
        channel_title = microformat['title'],
        channel_tabs = channel_tabs_html(channel_id, 'Playlists'),
        sort_buttons = channel_sort_buttons_html(channel_id, 'playlists', current_sort),
@@ -310,25 +310,25 @@ def channel_about_page(polymer_json):
        channel_links += channel_link_template.substitute(
            url = html.escape(url),
-            text = common.get_plain_text(link_json['title']),
+            text = yt_data_extract.get_plain_text(link_json['title']),
        )

    stats = ''
    for stat_name in ('subscriberCountText', 'joinedDateText', 'viewCountText', 'country'):
        try:
-            stat_value = common.get_plain_text(channel_metadata[stat_name])
+            stat_value = yt_data_extract.get_plain_text(channel_metadata[stat_name])
        except KeyError:
            continue
        else:
            stats += stat_template.substitute(stat_value=stat_value)
    try:
-        description = common.format_text_runs(common.get_formatted_text(channel_metadata['description']))
+        description = yt_data_extract.format_text_runs(yt_data_extract.get_formatted_text(channel_metadata['description']))
    except KeyError:
        description = ''

    return yt_channel_about_template.substitute(
-        header = common.get_header(),
-        page_title = common.get_plain_text(channel_metadata['title']) + ' - About',
-        channel_title = common.get_plain_text(channel_metadata['title']),
+        header = html_common.get_header(),
+        page_title = yt_data_extract.get_plain_text(channel_metadata['title']) + ' - About',
+        channel_title = yt_data_extract.get_plain_text(channel_metadata['title']),
        avatar = html.escape(avatar),
        description = description,
        links = channel_links,
@@ -351,13 +351,13 @@ def channel_search_page(polymer_json, query, current_page=1, number_of_videos =
    items_html = list_items_html(items)

    return yt_channel_items_template.substitute(
-        header = common.get_header(),
+        header = html_common.get_header(),
        channel_title = html.escape(microformat['title']),
        channel_tabs = channel_tabs_html(channel_id, '', query),
        avatar = '/' + microformat['thumbnail']['thumbnails'][0]['url'],
        page_title = html.escape(query + ' - Channel search'),
        items = items_html,
-        page_buttons = common.page_buttons_html(current_page, math.ceil(number_of_videos/29), URL_ORIGIN + "/channel/" + channel_id + "/search", current_query_string),
+        page_buttons = html_common.page_buttons_html(current_page, math.ceil(number_of_videos/29), util.URL_ORIGIN + "/channel/" + channel_id + "/search", current_query_string),
        number_of_results = '',
        sort_buttons = '',
    )
@@ -367,7 +367,7 @@ def get_channel_search_json(channel_id, query, page):
    ctoken = proto.string(2, channel_id) + proto.string(3, params) + proto.string(11, query)
    ctoken = base64.urlsafe_b64encode(proto.nested(80226972, ctoken)).decode('ascii')

-    polymer_json = common.fetch_url("https://www.youtube.com/browse_ajax?ctoken=" + ctoken, common.desktop_ua + headers_1)
+    polymer_json = util.fetch_url("https://www.youtube.com/browse_ajax?ctoken=" + ctoken, util.desktop_ua + headers_1)
    '''with open('debug/channel_search_debug', 'wb') as f:
        f.write(polymer_json)'''
    polymer_json = json.loads(polymer_json)
@@ -384,10 +384,10 @@ def get_channel_page(env, start_response):
        tab = 'videos'

    parameters = env['parameters']
-    page_number = int(common.default_multi_get(parameters, 'page', 0, default='1'))
-    sort = common.default_multi_get(parameters, 'sort', 0, default='3')
-    view = common.default_multi_get(parameters, 'view', 0, default='1')
-    query = common.default_multi_get(parameters, 'query', 0, default='')
+    page_number = int(util.default_multi_get(parameters, 'page', 0, default='1'))
+    sort = util.default_multi_get(parameters, 'sort', 0, default='3')
+    view = util.default_multi_get(parameters, 'view', 0, default='1')
+    query = util.default_multi_get(parameters, 'query', 0, default='')

    if tab == 'videos':
        tasks = (
@@ -399,11 +399,11 @@ def get_channel_page(env, start_response):
        result = channel_videos_html(polymer_json, page_number, sort, number_of_videos, env['QUERY_STRING'])
    elif tab == 'about':
-        polymer_json = common.fetch_url('https://www.youtube.com/channel/' + channel_id + '/about?pbj=1', common.desktop_ua + headers_1)
+        polymer_json = util.fetch_url('https://www.youtube.com/channel/' + channel_id + '/about?pbj=1', util.desktop_ua + headers_1)
        polymer_json = json.loads(polymer_json)
        result = channel_about_page(polymer_json)
    elif tab == 'playlists':
-        polymer_json = common.fetch_url('https://www.youtube.com/channel/' + channel_id + '/playlists?pbj=1&view=1&sort=' + playlist_sort_codes[sort], common.desktop_ua + headers_1)
+        polymer_json = util.fetch_url('https://www.youtube.com/channel/' + channel_id + '/playlists?pbj=1&view=1&sort=' + playlist_sort_codes[sort], util.desktop_ua + headers_1)
        '''with open('debug/channel_playlists_debug', 'wb') as f:
            f.write(polymer_json)'''
        polymer_json = json.loads(polymer_json)
@@ -443,22 +443,22 @@ def get_channel_page_general_url(env, start_response):
        return b'Invalid channel url'

    if page == 'videos':
-        polymer_json = common.fetch_url(base_url + '/videos?pbj=1&view=0', common.desktop_ua + headers_1)
+        polymer_json = util.fetch_url(base_url + '/videos?pbj=1&view=0', util.desktop_ua + headers_1)
        '''with open('debug/user_page_videos', 'wb') as f:
            f.write(polymer_json)'''
        polymer_json = json.loads(polymer_json)
        result = channel_videos_html(polymer_json)
    elif page == 'about':
-        polymer_json = common.fetch_url(base_url + '/about?pbj=1', common.desktop_ua + headers_1)
+        polymer_json = util.fetch_url(base_url + '/about?pbj=1', util.desktop_ua + headers_1)
        polymer_json = json.loads(polymer_json)
        result = channel_about_page(polymer_json)
    elif page == 'playlists':
-        polymer_json = common.fetch_url(base_url+ '/playlists?pbj=1&view=1', common.desktop_ua + headers_1)
+        polymer_json = util.fetch_url(base_url+ '/playlists?pbj=1&view=1', util.desktop_ua + headers_1)
        polymer_json = json.loads(polymer_json)
        result = channel_playlists_html(polymer_json)
    elif page == 'search':
        raise NotImplementedError()
-        '''polymer_json = common.fetch_url('https://www.youtube.com/user' + username + '/search?pbj=1&' + query_string, common.desktop_ua + headers_1)
+        '''polymer_json = util.fetch_url('https://www.youtube.com/user' + username + '/search?pbj=1&' + query_string, util.desktop_ua + headers_1)
        polymer_json = json.loads(polymer_json)
        return channel_search_page('''
    else:
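
The ctoken construction in get_channel_search_json is worth isolating, since the same proto pattern recurs in comments.py and playlist.py. A sketch, assuming proto.string and proto.nested encode protobuf fields as in youtube/proto.py (the wrapper function name here is illustrative, not from the commit):

    import base64
    from youtube import proto

    def make_channel_search_ctoken(channel_id, query, params):
        # field 2: channel id, field 3: params, field 11: the search query
        inner = proto.string(2, channel_id) + proto.string(3, params) + proto.string(11, query)
        # wrap in field 80226972 and URL-safe base64-encode for the ctoken parameter
        return base64.urlsafe_b64encode(proto.nested(80226972, inner)).decode('ascii')

    url = "https://www.youtube.com/browse_ajax?ctoken=" + make_channel_search_ctoken(channel_id, query, params)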

View File

@@ -1,13 +1,14 @@
+from youtube import proto, util, html_common, yt_data_extract, accounts
+import settings
 import json
-from youtube import proto, common, accounts
 import base64
-from youtube.common import uppercase_escape, default_multi_get, format_text_runs, URL_ORIGIN, fetch_url
 from string import Template
 import urllib.request
 import urllib
 import html
-import settings
 import re

 comment_area_template = Template('''
 <section class="comment-area">
 $video-metadata
@@ -130,7 +131,7 @@ def request_comments(ctoken, replies=False):
    url = base_url + ctoken.replace("=", "%3D") + "&pbj=1"

    for i in range(0,8):    # don't retry more than 8 times
-        content = fetch_url(url, headers=mobile_headers, report_text="Retrieved comments")
+        content = util.fetch_url(url, headers=mobile_headers, report_text="Retrieved comments")
        if content[0:4] == b")]}'":   # random closing characters included at beginning of response for some reason
            content = content[4:]
        elif content[0:10] == b'\n<!DOCTYPE':   # occasionally returns html instead of json for no reason
@@ -151,10 +152,10 @@ def single_comment_ctoken(video_id, comment_id):

def parse_comments_ajax(content, replies=False):
    try:
-        content = json.loads(uppercase_escape(content.decode('utf-8')))
+        content = json.loads(util.uppercase_escape(content.decode('utf-8')))
        #print(content)
        comments_raw = content['content']['continuation_contents']['contents']
-        ctoken = default_multi_get(content, 'content', 'continuation_contents', 'continuations', 0, 'continuation', default='')
+        ctoken = util.default_multi_get(content, 'content', 'continuation_contents', 'continuations', 0, 'continuation', default='')

        comments = []
        for comment_raw in comments_raw:
@@ -163,7 +164,7 @@ def parse_comments_ajax(content, replies=False):
            if comment_raw['replies'] is not None:
                reply_ctoken = comment_raw['replies']['continuations'][0]['continuation']
                comment_id, video_id = get_ids(reply_ctoken)
-                replies_url = URL_ORIGIN + '/comments?parent_id=' + comment_id + "&video_id=" + video_id
+                replies_url = util.URL_ORIGIN + '/comments?parent_id=' + comment_id + "&video_id=" + video_id
            comment_raw = comment_raw['comment']
            comment = {
                'author': comment_raw['author']['runs'][0]['text'],
@@ -189,7 +190,7 @@ reply_count_regex = re.compile(r'(\d+)')
def parse_comments_polymer(content, replies=False):
    try:
        video_title = ''
-        content = json.loads(uppercase_escape(content.decode('utf-8')))
+        content = json.loads(util.uppercase_escape(content.decode('utf-8')))
        url = content[1]['url']
        ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0]
        video_id = ctoken_metadata(ctoken)['video_id']
@@ -200,7 +201,7 @@ def parse_comments_polymer(content, replies=False):
            comments_raw = content[1]['response']['continuationContents']['commentRepliesContinuation']['contents']
            replies = True

-        ctoken = default_multi_get(content, 1, 'response', 'continuationContents', 'commentSectionContinuation', 'continuations', 0, 'nextContinuationData', 'continuation', default='')
+        ctoken = util.default_multi_get(content, 1, 'response', 'continuationContents', 'commentSectionContinuation', 'continuations', 0, 'nextContinuationData', 'continuation', default='')

        comments = []
        for comment_raw in comments_raw:
@@ -219,8 +220,8 @@ def parse_comments_polymer(content, replies=False):
            if 'replies' in comment_raw:
                #reply_ctoken = comment_raw['replies']['commentRepliesRenderer']['continuations'][0]['nextContinuationData']['continuation']
                #comment_id, video_id = get_ids(reply_ctoken)
-                replies_url = URL_ORIGIN + '/comments?parent_id=' + parent_id + "&video_id=" + video_id
-                view_replies_text = common.get_plain_text(comment_raw['replies']['commentRepliesRenderer']['moreText'])
+                replies_url = util.URL_ORIGIN + '/comments?parent_id=' + parent_id + "&video_id=" + video_id
+                view_replies_text = yt_data_extract.get_plain_text(comment_raw['replies']['commentRepliesRenderer']['moreText'])
                match = reply_count_regex.search(view_replies_text)
                if match is None:
                    view_replies_text = '1 reply'
@@ -228,18 +229,18 @@ def parse_comments_polymer(content, replies=False):
                    view_replies_text = match.group(1) + " replies"
            elif not replies:
                view_replies_text = "Reply"
-                replies_url = URL_ORIGIN + '/post_comment?parent_id=' + parent_id + "&video_id=" + video_id
+                replies_url = util.URL_ORIGIN + '/post_comment?parent_id=' + parent_id + "&video_id=" + video_id

            comment_raw = comment_raw['comment']
            comment_raw = comment_raw['commentRenderer']
            comment = {
-                'author': common.get_plain_text(comment_raw['authorText']),
+                'author': yt_data_extract.get_plain_text(comment_raw['authorText']),
                'author_url': comment_raw['authorEndpoint']['commandMetadata']['webCommandMetadata']['url'],
                'author_channel_id': comment_raw['authorEndpoint']['browseEndpoint']['browseId'],
                'author_id': comment_raw['authorId'],
                'author_avatar': comment_raw['authorThumbnail']['thumbnails'][0]['url'],
                'likes': comment_raw['likeCount'],
-                'published': common.get_plain_text(comment_raw['publishedTimeText']),
+                'published': yt_data_extract.get_plain_text(comment_raw['publishedTimeText']),
                'text': comment_raw['contentText'].get('runs', ''),
                'view_replies_text': view_replies_text,
                'replies_url': replies_url,
@@ -264,13 +265,13 @@ def get_comments_html(comments):
        replies = reply_link_template.substitute(url=comment['replies_url'], view_replies_text=html.escape(comment['view_replies_text']))
        if settings.enable_comment_avatars:
            avatar = comment_avatar_template.substitute(
-                author_url = URL_ORIGIN + comment['author_url'],
+                author_url = util.URL_ORIGIN + comment['author_url'],
                author_avatar = '/' + comment['author_avatar'],
            )
        else:
            avatar = ''
        if comment['author_channel_id'] in accounts.accounts:
-            delete_url = (URL_ORIGIN + '/delete_comment?video_id='
+            delete_url = (util.URL_ORIGIN + '/delete_comment?video_id='
                + comment['video_id']
                + '&channel_id='+ comment['author_channel_id']
                + '&author_id=' + comment['author_id']
@@ -280,14 +281,14 @@ def get_comments_html(comments):
        else:
            action_buttons = ''

-        permalink = URL_ORIGIN + '/watch?v=' + comment['video_id'] + '&lc=' + comment['comment_id']
+        permalink = util.URL_ORIGIN + '/watch?v=' + comment['video_id'] + '&lc=' + comment['comment_id']
        html_result += comment_template.substitute(
            author=comment['author'],
-            author_url = URL_ORIGIN + comment['author_url'],
+            author_url = util.URL_ORIGIN + comment['author_url'],
            avatar = avatar,
            likes = str(comment['likes']) + ' likes' if str(comment['likes']) != '0' else '',
            published = comment['published'],
-            text = format_text_runs(comment['text']),
+            text = yt_data_extract.format_text_runs(comment['text']),
            datetime = '', #TODO
            replies = replies,
            action_buttons = action_buttons,
@@ -297,10 +298,10 @@ def get_comments_html(comments):

def video_comments(video_id, sort=0, offset=0, lc='', secret_key=''):
    if settings.enable_comments:
-        post_comment_url = common.URL_ORIGIN + "/post_comment?video_id=" + video_id
+        post_comment_url = util.URL_ORIGIN + "/post_comment?video_id=" + video_id
        post_comment_link = '''<a class="sort-button" href="''' + post_comment_url + '''">Post comment</a>'''

-        other_sort_url = common.URL_ORIGIN + '/comments?ctoken=' + make_comment_ctoken(video_id, sort=1 - sort, lc=lc)
+        other_sort_url = util.URL_ORIGIN + '/comments?ctoken=' + make_comment_ctoken(video_id, sort=1 - sort, lc=lc)
        other_sort_name = 'newest' if sort == 0 else 'top'
        other_sort_link = '''<a class="sort-button" href="''' + other_sort_url + '''">Sort by ''' + other_sort_name + '''</a>'''
@@ -314,7 +315,7 @@ def video_comments(video_id, sort=0, offset=0, lc='', secret_key=''):
        if ctoken == '':
            more_comments_button = ''
        else:
-            more_comments_button = more_comments_template.substitute(url = common.URL_ORIGIN + '/comments?ctoken=' + ctoken)
+            more_comments_button = more_comments_template.substitute(url = util.URL_ORIGIN + '/comments?ctoken=' + ctoken)

        result = '''<section class="comments-area">\n'''
        result += comment_links + '\n'
@@ -350,7 +351,7 @@ comment_box_template = Template('''
        <select id="account-selection" name="channel_id">
        $options
        </select>
-        <a href="''' + common.URL_ORIGIN + '''/login" target="_blank">Add account</a>
+        <a href="''' + util.URL_ORIGIN + '''/login" target="_blank">Add account</a>
    </div>
    <textarea name="comment_text"></textarea>
    $video_id_input
@@ -359,7 +360,7 @@ $options
def get_comments_page(env, start_response):
    start_response('200 OK', [('Content-type','text/html'),] )
    parameters = env['parameters']
-    ctoken = default_multi_get(parameters, 'ctoken', 0, default='')
+    ctoken = util.default_multi_get(parameters, 'ctoken', 0, default='')
    replies = False
    if not ctoken:
        video_id = parameters['video_id'][0]
@@ -384,17 +385,17 @@ def get_comments_page(env, start_response):
            page_number = page_number,
            sort = 'top' if metadata['sort'] == 0 else 'newest',
            title = html.escape(comment_info['video_title']),
-            url = common.URL_ORIGIN + '/watch?v=' + metadata['video_id'],
+            url = util.URL_ORIGIN + '/watch?v=' + metadata['video_id'],
            thumbnail = '/i.ytimg.com/vi/'+ metadata['video_id'] + '/mqdefault.jpg',
        )
        comment_box = comment_box_template.substitute(
-            form_action= common.URL_ORIGIN + '/post_comment',
+            form_action= util.URL_ORIGIN + '/post_comment',
            video_id_input='''<input type="hidden" name="video_id" value="''' + metadata['video_id'] + '''">''',
            post_text='Post comment',
            options=comment_box_account_options(),
        )

-        other_sort_url = common.URL_ORIGIN + '/comments?ctoken=' + make_comment_ctoken(metadata['video_id'], sort=1 - metadata['sort'])
+        other_sort_url = util.URL_ORIGIN + '/comments?ctoken=' + make_comment_ctoken(metadata['video_id'], sort=1 - metadata['sort'])
        other_sort_name = 'newest' if metadata['sort'] == 0 else 'top'
        other_sort_link = '''<a class="sort-button" href="''' + other_sort_url + '''">Sort by ''' + other_sort_name + '''</a>'''
@@ -408,7 +409,7 @@ def get_comments_page(env, start_response):
    if ctoken == '':
        more_comments_button = ''
    else:
-        more_comments_button = more_comments_template.substitute(url = URL_ORIGIN + '/comments?ctoken=' + ctoken)
+        more_comments_button = more_comments_template.substitute(url = util.URL_ORIGIN + '/comments?ctoken=' + ctoken)

    comments_area = '<section class="comments-area">\n'
    comments_area += video_metadata + comment_box + comment_links + '\n'
    comments_area += '<div class="comments">\n'
@@ -417,7 +418,7 @@ def get_comments_page(env, start_response):
    comments_area += more_comments_button + '\n'
    comments_area += '</section>\n'

    return yt_comments_template.substitute(
-        header = common.get_header(),
+        header = html_common.get_header(),
        comments_area = comments_area,
        page_title = page_title,
    ).encode('utf-8')
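
parse_comments_polymer leans on util.default_multi_get to walk deeply nested response JSON without a try/except at every level; per its docstring (visible in the common.py hunks below), it returns the default on any KeyError or IndexError along the path. A small sketch with a placeholder token value:

    content = {'response': {'continuationContents': {'commentSectionContinuation': {
        'continuations': [{'nextContinuationData': {'continuation': 'token123'}}]}}}}
    util.default_multi_get(content, 'response', 'continuationContents',
        'commentSectionContinuation', 'continuations', 0,
        'nextContinuationData', 'continuation', default='')   # -> 'token123'
    util.default_multi_get(content, 'response', 'missing', 0, default='')  # -> ''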

View File

@@ -1,46 +1,8 @@
 from youtube.template import Template
-from youtube import local_playlist
+from youtube import local_playlist, yt_data_extract, util
+import settings
-import html
 import json
-import re
+import html
-import urllib.parse
-import gzip
-import brotli
-import time
-import socks, sockshandler
-
-URL_ORIGIN = "/https://www.youtube.com"
-
-# videos (all of type str):
-#     id
-#     title
-#     url
-#     author
-#     author_url
-#     thumbnail
-#     description
-#     published
-#     duration
-#     likes
-#     dislikes
-#     views
-#     playlist_index
-
-# playlists:
-#     id
-#     title
-#     url
-#     author
-#     author_url
-#     thumbnail
-#     description
-#     updated
-#     size
-#     first_video_id

 with open('yt_basic_template.html', 'r', encoding='utf-8') as file:
@@ -139,153 +101,6 @@ medium_channel_item_template = Template('''
 ''')

-class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler):
-    '''Separate cookiejars for receiving and sending'''
-    def __init__(self, cookiejar_send=None, cookiejar_receive=None):
-        import http.cookiejar
-        self.cookiejar_send = cookiejar_send
-        self.cookiejar_receive = cookiejar_receive
-
-    def http_request(self, request):
-        if self.cookiejar_send is not None:
-            self.cookiejar_send.add_cookie_header(request)
-        return request
-
-    def http_response(self, request, response):
-        if self.cookiejar_receive is not None:
-            self.cookiejar_receive.extract_cookies(response, request)
-        return response
-
-    https_request = http_request
-    https_response = http_response
-
-def decode_content(content, encoding_header):
-    encodings = encoding_header.replace(' ', '').split(',')
-    for encoding in reversed(encodings):
-        if encoding == 'identity':
-            continue
-        if encoding == 'br':
-            content = brotli.decompress(content)
-        elif encoding == 'gzip':
-            content = gzip.decompress(content)
-    return content
-
-def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, cookiejar_send=None, cookiejar_receive=None, use_tor=True):
-    '''
-    When cookiejar_send is set to a CookieJar object,
-     those cookies will be sent in the request (but cookies in response will not be merged into it)
-    When cookiejar_receive is set to a CookieJar object,
-     cookies received in the response will be merged into the object (nothing will be sent from it)
-    When both are set to the same object, cookies will be sent from the object,
-     and response cookies will be merged into it.
-    '''
-    headers = dict(headers)     # Note: Calling dict() on a dict will make a copy
-    headers['Accept-Encoding'] = 'gzip, br'
-
-    # prevent python version being leaked by urllib if User-Agent isn't provided
-    #  (urllib will use ex. Python-urllib/3.6 otherwise)
-    if 'User-Agent' not in headers and 'user-agent' not in headers and 'User-agent' not in headers:
-        headers['User-Agent'] = 'Python-urllib'
-
-    if data is not None:
-        if isinstance(data, str):
-            data = data.encode('ascii')
-        elif not isinstance(data, bytes):
-            data = urllib.parse.urlencode(data).encode('ascii')
-
-    start_time = time.time()
-
-    req = urllib.request.Request(url, data=data, headers=headers)
-    cookie_processor = HTTPAsymmetricCookieProcessor(cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive)
-    if use_tor and settings.route_tor:
-        opener = urllib.request.build_opener(sockshandler.SocksiPyHandler(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150), cookie_processor)
-    else:
-        opener = urllib.request.build_opener(cookie_processor)
-    response = opener.open(req, timeout=timeout)
-    response_time = time.time()
-
-    content = response.read()
-    read_finish = time.time()
-    if report_text:
-        print(report_text, '    Latency:', round(response_time - start_time,3), '    Read time:', round(read_finish - response_time,3))
-    content = decode_content(content, response.getheader('Content-Encoding', default='identity'))
-    return content
-
-mobile_user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
-mobile_ua = (('User-Agent', mobile_user_agent),)
-desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0'
-desktop_ua = (('User-Agent', desktop_user_agent),)
-
-def dict_add(*dicts):
-    for dictionary in dicts[1:]:
-        dicts[0].update(dictionary)
-    return dicts[0]
-
-def video_id(url):
-    url_parts = urllib.parse.urlparse(url)
-    return urllib.parse.parse_qs(url_parts.query)['v'][0]
-
-def uppercase_escape(s):
-    return re.sub(
-        r'\\U([0-9a-fA-F]{8})',
-        lambda m: chr(int(m.group(1), base=16)), s)
-
-def default_multi_get(object, *keys, default):
-    ''' Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. Last argument is the default value to use in case of any IndexErrors or KeyErrors '''
-    try:
-        for key in keys:
-            object = object[key]
-        return object
-    except (IndexError, KeyError):
-        return default
-
-def get_plain_text(node):
-    try:
-        return html.escape(node['simpleText'])
-    except KeyError:
-        return unformmated_text_runs(node['runs'])
-
-def unformmated_text_runs(runs):
-    result = ''
-    for text_run in runs:
-        result += html.escape(text_run["text"])
-    return result
-
-def format_text_runs(runs):
-    if isinstance(runs, str):
-        return runs
-    result = ''
-    for text_run in runs:
-        if text_run.get("bold", False):
-            result += "<b>" + html.escape(text_run["text"]) + "</b>"
-        elif text_run.get('italics', False):
-            result += "<i>" + html.escape(text_run["text"]) + "</i>"
-        else:
-            result += html.escape(text_run["text"])
-    return result
-
-# default, sddefault, mqdefault, hqdefault, hq720
-def get_thumbnail_url(video_id):
-    return "/i.ytimg.com/vi/" + video_id + "/mqdefault.jpg"
-
-def seconds_to_timestamp(seconds):
-    seconds = int(seconds)
-    hours, seconds = divmod(seconds,3600)
-    minutes, seconds = divmod(seconds,60)
-    if hours != 0:
-        timestamp = str(hours) + ":"
-        timestamp += str(minutes).zfill(2)  # zfill pads with zeros
-    else:
-        timestamp = str(minutes)
-    timestamp += ":" + str(seconds).zfill(2)
-    return timestamp

 # -----
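
Two of the helpers being moved out here are easy to sanity-check by hand. format_text_runs (headed to yt_data_extract) renders YouTube's runs format, HTML-escaping as it goes, and seconds_to_timestamp (headed to util) zero-pads the trailing fields:

    runs = [{'text': 'Watch '}, {'text': 'this', 'bold': True}, {'text': ' <now>'}]
    format_text_runs(runs)        # -> 'Watch <b>this</b> &lt;now&gt;'
    seconds_to_timestamp(3670)    # -> '1:01:10'
    seconds_to_timestamp(70)      # -> '1:10'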
@@ -299,8 +114,8 @@ def small_video_item_html(item):
        views = item["views"],
        author = html.escape(item["author"]),
        duration = item["duration"],
-        url = URL_ORIGIN + "/watch?v=" + item["id"],
-        thumbnail = get_thumbnail_url(item['id']),
+        url = util.URL_ORIGIN + "/watch?v=" + item["id"],
+        thumbnail = util.get_thumbnail_url(item['id']),
        video_info = html.escape(video_info),
    )
@@ -309,8 +124,8 @@ def small_playlist_item_html(item):
        title=html.escape(item["title"]),
        size = item['size'],
        author="",
-        url = URL_ORIGIN + "/playlist?list=" + item["id"],
-        thumbnail= get_thumbnail_url(item['first_video_id']),
+        url = util.URL_ORIGIN + "/playlist?list=" + item["id"],
+        thumbnail= util.get_thumbnail_url(item['first_video_id']),
    )

def medium_playlist_item_html(item):
@@ -318,8 +133,8 @@ def medium_playlist_item_html(item):
        title=html.escape(item["title"]),
        size = item['size'],
        author=item['author'],
-        author_url= URL_ORIGIN + item['author_url'],
-        url = URL_ORIGIN + "/playlist?list=" + item["id"],
+        author_url= util.URL_ORIGIN + item['author_url'],
+        url = util.URL_ORIGIN + "/playlist?list=" + item["id"],
        thumbnail= item['thumbnail'],
    )
@@ -330,11 +145,11 @@ def medium_video_item_html(medium_video_info):
        title=html.escape(info["title"]),
        views=info["views"],
        published = info["published"],
-        description = format_text_runs(info["description"]),
+        description = yt_data_extract.format_text_runs(info["description"]),
        author=html.escape(info["author"]),
        author_url=info["author_url"],
        duration=info["duration"],
-        url = URL_ORIGIN + "/watch?v=" + info["id"],
+        url = util.URL_ORIGIN + "/watch?v=" + info["id"],
        thumbnail=info['thumbnail'],
        datetime='', # TODO
    )
@@ -440,158 +255,28 @@ def get_header(search_box_value=""):

-def get_url(node):
-    try:
-        return node['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']
-    except KeyError:
-        return node['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']
-
-def get_text(node):
-    try:
-        return node['simpleText']
-    except KeyError:
-        pass
-    try:
-        return node['runs'][0]['text']
-    except IndexError: # empty text runs
-        return ''
-
-def get_formatted_text(node):
-    try:
-        return node['runs']
-    except KeyError:
-        return node['simpleText']
-
-def get_badges(node):
-    badges = []
-    for badge_node in node:
-        badge = badge_node['metadataBadgeRenderer']['label']
-        if badge.lower() != 'new':
-            badges.append(badge)
-    return badges
-
-def get_thumbnail(node):
-    try:
-        return node['thumbnails'][0]['url']     # polymer format
-    except KeyError:
-        return node['url']     # ajax format
-
-dispatch = {
-    # polymer format
-    'title': ('title', get_text),
-    'publishedTimeText': ('published', get_text),
-    'videoId': ('id', lambda node: node),
-    'descriptionSnippet': ('description', get_formatted_text),
-    'lengthText': ('duration', get_text),
-    'thumbnail': ('thumbnail', get_thumbnail),
-    'thumbnails': ('thumbnail', lambda node: node[0]['thumbnails'][0]['url']),
-    'viewCountText': ('views', get_text),
-    'numVideosText': ('size', lambda node: get_text(node).split(' ')[0]),   # the format is "324 videos"
-    'videoCountText': ('size', get_text),
-    'playlistId': ('id', lambda node: node),
-    'descriptionText': ('description', get_formatted_text),
-    'subscriberCountText': ('subscriber_count', get_text),
-    'channelId': ('id', lambda node: node),
-    'badges': ('badges', get_badges),
-
-    # ajax format
-    'view_count_text': ('views', get_text),
-    'num_videos_text': ('size', lambda node: get_text(node).split(' ')[0]),
-    'owner_text': ('author', get_text),
-    'owner_endpoint': ('author_url', lambda node: node['url']),
-    'description': ('description', get_formatted_text),
-    'index': ('playlist_index', get_text),
-    'short_byline': ('author', get_text),
-    'length': ('duration', get_text),
-    'video_id': ('id', lambda node: node),
-}
-
-def renderer_info(renderer):
-    try:
-        info = {}
-        if 'viewCountText' in renderer:     # prefer this one as it contains all the digits
-            info['views'] = get_text(renderer['viewCountText'])
-        elif 'shortViewCountText' in renderer:
-            info['views'] = get_text(renderer['shortViewCountText'])
-
-        if 'ownerText' in renderer:
-            info['author'] = renderer['ownerText']['runs'][0]['text']
-            info['author_url'] = renderer['ownerText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']
-        try:
-            overlays = renderer['thumbnailOverlays']
-        except KeyError:
-            pass
-        else:
-            for overlay in overlays:
-                if 'thumbnailOverlayTimeStatusRenderer' in overlay:
-                    info['duration'] = get_text(overlay['thumbnailOverlayTimeStatusRenderer']['text'])
-                # show renderers don't have videoCountText
-                elif 'thumbnailOverlayBottomPanelRenderer' in overlay:
-                    info['size'] = get_text(overlay['thumbnailOverlayBottomPanelRenderer']['text'])
-
-        # show renderers don't have playlistId, have to dig into the url to get it
-        try:
-            info['id'] = renderer['navigationEndpoint']['watchEndpoint']['playlistId']
-        except KeyError:
-            pass
-        for key, node in renderer.items():
-            if key in ('longBylineText', 'shortBylineText'):
-                info['author'] = get_text(node)
-                try:
-                    info['author_url'] = get_url(node)
-                except KeyError:
-                    pass
-
-            # show renderers don't have thumbnail key at top level, dig into thumbnailRenderer
-            elif key == 'thumbnailRenderer' and 'showCustomThumbnailRenderer' in node:
-                info['thumbnail'] = node['showCustomThumbnailRenderer']['thumbnail']['thumbnails'][0]['url']
-            else:
-                try:
-                    simple_key, function = dispatch[key]
-                except KeyError:
-                    continue
-                info[simple_key] = function(node)
-        return info
-    except KeyError:
-        print(renderer)
-        raise
-
-def ajax_info(item_json):
-    try:
-        info = {}
-        for key, node in item_json.items():
-            try:
-                simple_key, function = dispatch[key]
-            except KeyError:
-                continue
-            info[simple_key] = function(node)
-        return info
-    except KeyError:
-        print(item_json)
-        raise
-
 def badges_html(badges):
    return ' | '.join(map(html.escape, badges))

 html_transform_dispatch = {
    'title': html.escape,
    'published': html.escape,
    'id': html.escape,
-    'description': format_text_runs,
+    'description': yt_data_extract.format_text_runs,
    'duration': html.escape,
    'thumbnail': lambda url: html.escape('/' + url.lstrip('/')),
    'size': html.escape,
    'author': html.escape,
-    'author_url': lambda url: html.escape(URL_ORIGIN + url),
+    'author_url': lambda url: html.escape(util.URL_ORIGIN + url),
    'views': html.escape,
    'subscriber_count': html.escape,
    'badges': badges_html,
@@ -645,7 +330,7 @@ def video_item_html(item, template, html_exclude=set()):
    html_ready = get_html_ready(item)

    html_ready['video_info'] = html.escape(json.dumps(video_info) )
-    html_ready['url'] = URL_ORIGIN + "/watch?v=" + html_ready['id']
+    html_ready['url'] = util.URL_ORIGIN + "/watch?v=" + html_ready['id']
    html_ready['datetime'] = '' #TODO

    for key in html_exclude:
@@ -658,7 +343,7 @@ def video_item_html(item, template, html_exclude=set()):
def playlist_item_html(item, template, html_exclude=set()):
    html_ready = get_html_ready(item)

-    html_ready['url'] = URL_ORIGIN + "/playlist?list=" + html_ready['id']
+    html_ready['url'] = util.URL_ORIGIN + "/playlist?list=" + html_ready['id']
    html_ready['datetime'] = '' #TODO

    for key in html_exclude:
@@ -672,10 +357,6 @@ def playlist_item_html(item, template, html_exclude=set()):

-def update_query_string(query_string, items):
-    parameters = urllib.parse.parse_qs(query_string)
-    parameters.update(items)
-    return urllib.parse.urlencode(parameters, doseq=True)
-
 page_button_template = Template('''<a class="page-button" href="$href">$page</a>''')
 current_page_button_template = Template('''<div class="page-button">$page</div>''')
@@ -694,7 +375,7 @@ def page_buttons_html(current_page, estimated_pages, url, current_query_string):
            template = current_page_button_template
        else:
            template = page_button_template
-        result += template.substitute(page=page, href = url + "?" + update_query_string(current_query_string, {'page': [str(page)]}) )
+        result += template.substitute(page=page, href = url + "?" + util.update_query_string(current_query_string, {'page': [str(page)]}) )
    return result
@@ -723,15 +404,15 @@ def renderer_html(renderer, additional_info={}, current_query_string=''):
        return renderer_html(renderer['contents'][0], additional_info, current_query_string)

    if type == 'channelRenderer':
-        info = renderer_info(renderer)
+        info = yt_data_extract.renderer_info(renderer)
        html_ready = get_html_ready(info)
-        html_ready['url'] = URL_ORIGIN + "/channel/" + html_ready['id']
+        html_ready['url'] = util.URL_ORIGIN + "/channel/" + html_ready['id']
        return medium_channel_item_template.substitute(html_ready)

    if type in ('movieRenderer', 'clarificationRenderer'):
        return ''

-    info = renderer_info(renderer)
+    info = yt_data_extract.renderer_info(renderer)
    info.update(additional_info)
    html_exclude = set(additional_info.keys())
    if type == 'compactVideoRenderer':
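
renderer_info and the dispatch table above (both headed to yt_data_extract) flatten a polymer renderer into the plain dict the item templates consume; keys missing from dispatch are simply skipped. A worked example with a minimal videoRenderer-style input:

    renderer = {
        'videoId': 'dQw4w9WgXcQ',
        'title': {'simpleText': 'Example video'},
        'lengthText': {'simpleText': '3:32'},
        'viewCountText': {'simpleText': '123 views'},
    }
    renderer_info(renderer)
    # -> {'views': '123 views', 'id': 'dQw4w9WgXcQ',
    #     'title': 'Example video', 'duration': '3:32'}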

View File

@@ -1,11 +1,12 @@
+from youtube.template import Template
+from youtube import util, html_common
+import settings
 import os
 import json
-from youtube.template import Template
-from youtube import common
 import html
 import gevent
 import urllib
-import settings

 playlists_directory = os.path.join(settings.data_dir, "playlists")
 thumbnails_directory = os.path.join(settings.data_dir, "playlist_thumbnails")
@@ -38,7 +39,7 @@ def download_thumbnail(playlist_name, video_id):
    url = "https://i.ytimg.com/vi/" + video_id + "/mqdefault.jpg"
    save_location = os.path.join(thumbnails_directory, playlist_name, video_id + ".jpg")
    try:
-        thumbnail = common.fetch_url(url, report_text="Saved local playlist thumbnail: " + video_id)
+        thumbnail = util.fetch_url(url, report_text="Saved local playlist thumbnail: " + video_id)
    except urllib.error.HTTPError as e:
        print("Failed to download thumbnail for " + video_id + ": " + str(e))
        return
@@ -78,15 +79,15 @@ def get_local_playlist_page(name):
                if info['id'] + ".jpg" in thumbnails:
                    info['thumbnail'] = "/youtube.com/data/playlist_thumbnails/" + name + "/" + info['id'] + ".jpg"
                else:
-                    info['thumbnail'] = common.get_thumbnail_url(info['id'])
+                    info['thumbnail'] = util.get_thumbnail_url(info['id'])
                    missing_thumbnails.append(info['id'])
-                videos_html += common.video_item_html(info, common.small_video_item_template)
+                videos_html += html_common.video_item_html(info, html_common.small_video_item_template)
            except json.decoder.JSONDecodeError:
                pass
    gevent.spawn(download_thumbnails, name, missing_thumbnails)
    return local_playlist_template.substitute(
        page_title = name + ' - Local playlist',
-        header = common.get_header(),
+        header = html_common.get_header(),
        videos = videos_html,
        title = name,
        page_buttons = ''
@@ -127,11 +128,11 @@ def get_playlists_list_page():
    page = '''<ul>\n'''
    list_item_template = Template('''    <li><a href="$url">$name</a></li>\n''')
    for name in get_playlist_names():
-        page += list_item_template.substitute(url = html.escape(common.URL_ORIGIN + '/playlists/' + name), name = html.escape(name))
+        page += list_item_template.substitute(url = html.escape(util.URL_ORIGIN + '/playlists/' + name), name = html.escape(name))
    page += '''</ul>\n'''
-    return common.yt_basic_template.substitute(
+    return html_common.yt_basic_template.substitute(
        page_title = "Local playlists",
-        header = common.get_header(),
+        header = html_common.get_header(),
        style = '',
        page = page,
    )
@@ -151,7 +152,7 @@ def path_edit_playlist(env, start_response):
    if parameters['action'][0] == 'remove':
        playlist_name = env['path_parts'][1]
        remove_from_playlist(playlist_name, parameters['video_info_list'])
-        start_response('303 See Other', [('Location', common.URL_ORIGIN + env['PATH_INFO']),] )
+        start_response('303 See Other', [('Location', util.URL_ORIGIN + env['PATH_INFO']),] )
        return b''
    else:

View File

@@ -1,10 +1,9 @@
+from youtube import util, yt_data_extract, html_common, template, proto
 import base64
-import youtube.common as common
 import urllib
 import json
 import string
-from youtube import template
-import youtube.proto as proto
 import gevent
 import math
@@ -49,10 +48,10 @@ headers_1 = (
def playlist_first_page(playlist_id, report_text = "Retrieved playlist"):
    url = 'https://m.youtube.com/playlist?list=' + playlist_id + '&pbj=1'
-    content = common.fetch_url(url, common.mobile_ua + headers_1, report_text=report_text)
+    content = util.fetch_url(url, util.mobile_ua + headers_1, report_text=report_text)
    '''with open('debug/playlist_debug', 'wb') as f:
        f.write(content)'''
-    content = json.loads(common.uppercase_escape(content.decode('utf-8')))
+    content = json.loads(util.uppercase_escape(content.decode('utf-8')))
    return content
@@ -69,11 +68,11 @@ def get_videos(playlist_id, page):
        'X-YouTube-Client-Version': '2.20180508',
    }

-    content = common.fetch_url(url, headers, report_text="Retrieved playlist")
+    content = util.fetch_url(url, headers, report_text="Retrieved playlist")
    '''with open('debug/playlist_debug', 'wb') as f:
        f.write(content)'''

-    info = json.loads(common.uppercase_escape(content.decode('utf-8')))
+    info = json.loads(util.uppercase_escape(content.decode('utf-8')))
    return info
@@ -101,22 +100,22 @@ def get_playlist_page(env, start_response):
    video_list = this_page_json['response']['continuationContents']['playlistVideoListContinuation']['contents']
    videos_html = ''
    for video_json in video_list:
-        info = common.renderer_info(video_json['playlistVideoRenderer'])
-        videos_html += common.video_item_html(info, common.small_video_item_template)
+        info = yt_data_extract.renderer_info(video_json['playlistVideoRenderer'])
+        videos_html += html_common.video_item_html(info, html_common.small_video_item_template)

-    metadata = common.renderer_info(first_page_json['response']['header']['playlistHeaderRenderer'])
+    metadata = yt_data_extract.renderer_info(first_page_json['response']['header']['playlistHeaderRenderer'])
    video_count = int(metadata['size'].replace(',', ''))
-    page_buttons = common.page_buttons_html(int(page), math.ceil(video_count/20), common.URL_ORIGIN + "/playlist", env['QUERY_STRING'])
+    page_buttons = html_common.page_buttons_html(int(page), math.ceil(video_count/20), util.URL_ORIGIN + "/playlist", env['QUERY_STRING'])

-    html_ready = common.get_html_ready(metadata)
+    html_ready = html_common.get_html_ready(metadata)
    html_ready['page_title'] = html_ready['title'] + ' - Page ' + str(page)

    stats = ''
    stats += playlist_stat_template.substitute(stat=html_ready['size'] + ' videos')
    stats += playlist_stat_template.substitute(stat=html_ready['views'])

    return yt_playlist_template.substitute(
-        header = common.get_header(),
+        header = html_common.get_header(),
        videos = videos_html,
        page_buttons = page_buttons,
        stats = stats,

youtube/post_comment.py

@ -1,11 +1,11 @@
 # Contains functions having to do with posting/editing/deleting comments
+from youtube import util, html_common, proto, comments, accounts
+import settings
 import urllib
 import json
-from youtube import common, proto, comments, accounts
 import re
 import traceback
-import settings
 import os
 def _post_comment(text, video_id, session_token, cookiejar):
@ -31,7 +31,7 @@ def _post_comment(text, video_id, session_token, cookiejar):
     data = urllib.parse.urlencode(data_dict).encode()
-    content = common.fetch_url("https://m.youtube.com/service_ajax?name=createCommentEndpoint", headers=headers, data=data, cookiejar_send=cookiejar)
+    content = util.fetch_url("https://m.youtube.com/service_ajax?name=createCommentEndpoint", headers=headers, data=data, cookiejar_send=cookiejar)
     code = json.loads(content)['code']
     print("Comment posting code: " + code)
@ -62,7 +62,7 @@ def _post_comment_reply(text, video_id, parent_comment_id, session_token, cookie
     }
     data = urllib.parse.urlencode(data_dict).encode()
-    content = common.fetch_url("https://m.youtube.com/service_ajax?name=createCommentReplyEndpoint", headers=headers, data=data, cookiejar_send=cookiejar)
+    content = util.fetch_url("https://m.youtube.com/service_ajax?name=createCommentReplyEndpoint", headers=headers, data=data, cookiejar_send=cookiejar)
     code = json.loads(content)['code']
     print("Comment posting code: " + code)
@ -90,7 +90,7 @@ def _delete_comment(video_id, comment_id, author_id, session_token, cookiejar):
     }
     data = urllib.parse.urlencode(data_dict).encode()
-    content = common.fetch_url("https://m.youtube.com/service_ajax?name=performCommentActionEndpoint", headers=headers, data=data, cookiejar_send=cookiejar)
+    content = util.fetch_url("https://m.youtube.com/service_ajax?name=performCommentActionEndpoint", headers=headers, data=data, cookiejar_send=cookiejar)
     code = json.loads(content)['code']
     print("Comment deletion code: " + code)
     return code
@ -101,8 +101,8 @@ def get_session_token(video_id, cookiejar):
     # youtube-dl uses disable_polymer=1 which uses a different request format which has an obfuscated javascript algorithm to generate a parameter called "bgr"
     # Tokens retrieved from disable_polymer pages only work with that format. Tokens retrieved on mobile only work using mobile requests
     # Additionally, tokens retrieved without sending the same cookie won't work. So this is necessary even if the bgr and stuff was reverse engineered.
-    headers = {'User-Agent': common.mobile_user_agent}
-    mobile_page = common.fetch_url('https://m.youtube.com/watch?v=' + video_id, headers, report_text="Retrieved session token for comment", cookiejar_send=cookiejar, cookiejar_receive=cookiejar).decode()
+    headers = {'User-Agent': util.mobile_user_agent}
+    mobile_page = util.fetch_url('https://m.youtube.com/watch?v=' + video_id, headers, report_text="Retrieved session token for comment", cookiejar_send=cookiejar, cookiejar_receive=cookiejar).decode()
     match = xsrf_token_regex.search(mobile_page)
     if match:
         return match.group(1).replace("%3D", "=")
@ -118,9 +118,9 @@ def delete_comment(env, start_response):
     code = _delete_comment(video_id, parameters['comment_id'][0], parameters['author_id'][0], token, cookiejar)
     if code == "SUCCESS":
-        start_response('303 See Other', [('Location', common.URL_ORIGIN + '/comment_delete_success'),] )
+        start_response('303 See Other', [('Location', util.URL_ORIGIN + '/comment_delete_success'),] )
     else:
-        start_response('303 See Other', [('Location', common.URL_ORIGIN + '/comment_delete_fail'),] )
+        start_response('303 See Other', [('Location', util.URL_ORIGIN + '/comment_delete_fail'),] )
 def post_comment(env, start_response):
     parameters = env['parameters']
@ -131,11 +131,11 @@ def post_comment(env, start_response):
     if 'parent_id' in parameters:
         code = _post_comment_reply(parameters['comment_text'][0], parameters['video_id'][0], parameters['parent_id'][0], token, cookiejar)
-        start_response('303 See Other', (('Location', common.URL_ORIGIN + '/comments?' + env['QUERY_STRING']),) )
+        start_response('303 See Other', (('Location', util.URL_ORIGIN + '/comments?' + env['QUERY_STRING']),) )
     else:
         code = _post_comment(parameters['comment_text'][0], parameters['video_id'][0], token, cookiejar)
-        start_response('303 See Other', (('Location', common.URL_ORIGIN + '/comments?ctoken=' + comments.make_comment_ctoken(video_id, sort=1)),) )
+        start_response('303 See Other', (('Location', util.URL_ORIGIN + '/comments?ctoken=' + comments.make_comment_ctoken(video_id, sort=1)),) )
     return b''
@ -163,10 +163,10 @@ def get_delete_comment_page(env, start_response):
     page += '''
 <input type="submit" value="Yes, delete it">
 </form>'''
-    return common.yt_basic_template.substitute(
+    return html_common.yt_basic_template.substitute(
         page_title = "Delete comment?",
         style = style,
-        header = common.get_header(),
+        header = html_common.get_header(),
         page = page,
     ).encode('utf-8')
@ -174,7 +174,7 @@ def get_post_comment_page(env, start_response):
     start_response('200 OK', [('Content-type','text/html'),])
     parameters = env['parameters']
     video_id = parameters['video_id'][0]
-    parent_id = common.default_multi_get(parameters, 'parent_id', 0, default='')
+    parent_id = util.default_multi_get(parameters, 'parent_id', 0, default='')
     style = ''' main{
 display: grid;
@ -194,23 +194,23 @@ textarea{
 }'''
     if parent_id: # comment reply
         comment_box = comments.comment_box_template.substitute(
-            form_action = common.URL_ORIGIN + '/comments?parent_id=' + parent_id + "&video_id=" + video_id,
+            form_action = util.URL_ORIGIN + '/comments?parent_id=' + parent_id + "&video_id=" + video_id,
             video_id_input = '',
             post_text = "Post reply",
             options=comments.comment_box_account_options(),
         )
     else:
         comment_box = comments.comment_box_template.substitute(
-            form_action = common.URL_ORIGIN + '/post_comment',
+            form_action = util.URL_ORIGIN + '/post_comment',
             video_id_input = '''<input type="hidden" name="video_id" value="''' + video_id + '''">''',
             post_text = "Post comment",
             options=comments.comment_box_account_options(),
         )
     page = '''<div class="left">\n''' + comment_box + '''</div>\n'''
-    return common.yt_basic_template.substitute(
+    return html_common.yt_basic_template.substitute(
         page_title = "Post comment reply" if parent_id else "Post a comment",
         style = style,
-        header = common.get_header(),
+        header = html_common.get_header(),
         page = page,
     ).encode('utf-8')
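
The pieces above compose into a short flow: scrape a session token from the mobile watch page using the account's cookie, then submit the comment with that token and the same cookie. A minimal sketch, not code from the commit; it assumes cookiejar already holds a logged-in session, and the video id is hypothetical:

import http.cookiejar
cookiejar = http.cookiejar.CookieJar()   # must already contain a logged-in session
video_id = 'dQw4w9WgXcQ'                 # hypothetical video id

token = get_session_token(video_id, cookiejar)   # scraped from the mobile watch page
code = _post_comment('Nice video', video_id, token, cookiejar)
# code is YouTube's status string, e.g. "SUCCESS"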

youtube/search.py

@ -1,11 +1,12 @@
+from youtube import util, html_common, yt_data_extract, proto
 import json
 import urllib
 import html
 from string import Template
 import base64
 from math import ceil
-from youtube.common import default_multi_get, get_thumbnail_url, URL_ORIGIN
-from youtube import common, proto
 with open("yt_search_results_template.html", "r") as file:
     yt_search_results_template = file.read()
@ -54,7 +55,7 @@ def get_search_json(query, page, autocorrect, sort, filters):
         'X-YouTube-Client-Version': '2.20180418',
     }
     url += "&pbj=1&sp=" + page_number_to_sp_parameter(page, autocorrect, sort, filters).replace("=", "%3D")
-    content = common.fetch_url(url, headers=headers, report_text="Got search results")
+    content = util.fetch_url(url, headers=headers, report_text="Got search results")
     info = json.loads(content)
     return info
@ -70,9 +71,9 @@ def get_search_page(env, start_response):
     start_response('200 OK', [('Content-type','text/html'),])
     parameters = env['parameters']
     if len(parameters) == 0:
-        return common.yt_basic_template.substitute(
+        return html_common.yt_basic_template.substitute(
             page_title = "Search",
-            header = common.get_header(),
+            header = html_common.get_header(),
             style = '',
             page = '',
         ).encode('utf-8')
@ -100,24 +101,24 @@ def get_search_page(env, start_response):
             renderer = renderer[type]
             corrected_query_string = parameters.copy()
             corrected_query_string['query'] = [renderer['correctedQueryEndpoint']['searchEndpoint']['query']]
-            corrected_query_url = URL_ORIGIN + '/search?' + urllib.parse.urlencode(corrected_query_string, doseq=True)
+            corrected_query_url = util.URL_ORIGIN + '/search?' + urllib.parse.urlencode(corrected_query_string, doseq=True)
             corrections = did_you_mean.substitute(
                 corrected_query_url = corrected_query_url,
-                corrected_query = common.format_text_runs(renderer['correctedQuery']['runs']),
+                corrected_query = yt_data_extract.format_text_runs(renderer['correctedQuery']['runs']),
             )
             continue
         if type == 'showingResultsForRenderer':
             renderer = renderer[type]
             no_autocorrect_query_string = parameters.copy()
             no_autocorrect_query_string['autocorrect'] = ['0']
-            no_autocorrect_query_url = URL_ORIGIN + '/search?' + urllib.parse.urlencode(no_autocorrect_query_string, doseq=True)
+            no_autocorrect_query_url = util.URL_ORIGIN + '/search?' + urllib.parse.urlencode(no_autocorrect_query_string, doseq=True)
             corrections = showing_results_for.substitute(
-                corrected_query = common.format_text_runs(renderer['correctedQuery']['runs']),
+                corrected_query = yt_data_extract.format_text_runs(renderer['correctedQuery']['runs']),
                 original_query_url = no_autocorrect_query_url,
                 original_query = html.escape(renderer['originalQuery']['simpleText']),
             )
             continue
-        result_list_html += common.renderer_html(renderer, current_query_string=env['QUERY_STRING'])
+        result_list_html += html_common.renderer_html(renderer, current_query_string=env['QUERY_STRING'])
     page = int(page)
     if page <= 5:
@ -129,13 +130,13 @@ def get_search_page(env, start_response):
     result = Template(yt_search_results_template).substitute(
-        header = common.get_header(query),
+        header = html_common.get_header(query),
         results = result_list_html,
         page_title = query + " - Search",
         search_box_value = html.escape(query),
         number_of_results = '{:,}'.format(estimated_results),
         number_of_pages = '{:,}'.format(estimated_pages),
-        page_buttons = common.page_buttons_html(page, estimated_pages, URL_ORIGIN + "/search", env['QUERY_STRING']),
+        page_buttons = html_common.page_buttons_html(page, estimated_pages, util.URL_ORIGIN + "/search", env['QUERY_STRING']),
         corrections = corrections
     )
     return result.encode('utf-8')

youtube/util.py Normal file (153 lines)

@ -0,0 +1,153 @@
import socks, sockshandler
import gzip
import brotli
import urllib.parse
import urllib.request   # used below; urllib.parse alone doesn't guarantee urllib.request is loaded
import re
import time
import settings

URL_ORIGIN = "/https://www.youtube.com"

class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler):
    '''Separate cookiejars for receiving and sending'''
    def __init__(self, cookiejar_send=None, cookiejar_receive=None):
        import http.cookiejar
        self.cookiejar_send = cookiejar_send
        self.cookiejar_receive = cookiejar_receive

    def http_request(self, request):
        if self.cookiejar_send is not None:
            self.cookiejar_send.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        if self.cookiejar_receive is not None:
            self.cookiejar_receive.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response

def decode_content(content, encoding_header):
    encodings = encoding_header.replace(' ', '').split(',')
    for encoding in reversed(encodings):
        if encoding == 'identity':
            continue
        if encoding == 'br':
            content = brotli.decompress(content)
        elif encoding == 'gzip':
            content = gzip.decompress(content)
    return content

def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, cookiejar_send=None, cookiejar_receive=None, use_tor=True):
    '''
    When cookiejar_send is set to a CookieJar object,
    those cookies will be sent in the request (but cookies in response will not be merged into it)
    When cookiejar_receive is set to a CookieJar object,
    cookies received in the response will be merged into the object (nothing will be sent from it)
    When both are set to the same object, cookies will be sent from the object,
    and response cookies will be merged into it.
    '''
    headers = dict(headers)     # Note: Calling dict() on a dict will make a copy
    headers['Accept-Encoding'] = 'gzip, br'

    # prevent python version being leaked by urllib if User-Agent isn't provided
    # (urllib will use ex. Python-urllib/3.6 otherwise)
    if 'User-Agent' not in headers and 'user-agent' not in headers and 'User-agent' not in headers:
        headers['User-Agent'] = 'Python-urllib'

    if data is not None:
        if isinstance(data, str):
            data = data.encode('ascii')
        elif not isinstance(data, bytes):
            data = urllib.parse.urlencode(data).encode('ascii')

    start_time = time.time()

    req = urllib.request.Request(url, data=data, headers=headers)
    cookie_processor = HTTPAsymmetricCookieProcessor(cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive)

    if use_tor and settings.route_tor:
        opener = urllib.request.build_opener(sockshandler.SocksiPyHandler(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150), cookie_processor)
    else:
        opener = urllib.request.build_opener(cookie_processor)

    response = opener.open(req, timeout=timeout)
    response_time = time.time()

    content = response.read()
    read_finish = time.time()
    if report_text:
        print(report_text, ' Latency:', round(response_time - start_time,3), ' Read time:', round(read_finish - response_time,3))
    content = decode_content(content, response.getheader('Content-Encoding', default='identity'))
    return content

mobile_user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
mobile_ua = (('User-Agent', mobile_user_agent),)
desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0'
desktop_ua = (('User-Agent', desktop_user_agent),)

def dict_add(*dicts):
    for dictionary in dicts[1:]:
        dicts[0].update(dictionary)
    return dicts[0]

def video_id(url):
    url_parts = urllib.parse.urlparse(url)
    return urllib.parse.parse_qs(url_parts.query)['v'][0]

def default_multi_get(object, *keys, default):
    ''' Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. Last argument is the default value to use in case of any IndexErrors or KeyErrors '''
    try:
        for key in keys:
            object = object[key]
        return object
    except (IndexError, KeyError):
        return default

# default, sddefault, mqdefault, hqdefault, hq720
def get_thumbnail_url(video_id):
    return "/i.ytimg.com/vi/" + video_id + "/mqdefault.jpg"

def seconds_to_timestamp(seconds):
    seconds = int(seconds)

    hours, seconds = divmod(seconds, 3600)
    minutes, seconds = divmod(seconds, 60)

    if hours != 0:
        timestamp = str(hours) + ":"
        timestamp += str(minutes).zfill(2)  # zfill pads with zeros
    else:
        timestamp = str(minutes)

    timestamp += ":" + str(seconds).zfill(2)
    return timestamp

def update_query_string(query_string, items):
    parameters = urllib.parse.parse_qs(query_string)
    parameters.update(items)
    return urllib.parse.urlencode(parameters, doseq=True)

def uppercase_escape(s):
    return re.sub(
        r'\\U([0-9a-fA-F]{8})',
        lambda m: chr(int(m.group(1), base=16)), s)
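
The cookiejar parameters are the subtle part of the new fetch_url. A minimal usage sketch, not part of the commit (it performs real requests, and honors settings.route_tor via use_tor):

import http.cookiejar
from youtube import util

cookiejar = http.cookiejar.CookieJar()

# Receive-only: cookies set by the response are merged into the jar,
# but none are sent with the request.
page = util.fetch_url('https://m.youtube.com/', util.mobile_ua,
                      report_text='Fetched mobile homepage',
                      cookiejar_receive=cookiejar)

# Send-and-receive: pass the same jar to both parameters and fetch_url
# behaves like an ordinary cookie-aware session.
page = util.fetch_url('https://m.youtube.com/feed/subscriptions',
                      util.mobile_ua,
                      cookiejar_send=cookiejar,
                      cookiejar_receive=cookiejar)

# The small helpers, for reference:
formats = {'18': {'height': 360}}
util.default_multi_get(formats, '18', 'height', default=0)   # -> 360
util.default_multi_get(formats, '22', 'height', default=0)   # -> 0 (no KeyError)
util.seconds_to_timestamp(3725)                              # -> '1:02:05'
util.uppercase_escape('\\U0001F600')                         # -> '😀'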

youtube/watch.py

@ -1,12 +1,12 @@
+from youtube import util, html_common, comments
 from youtube_dl.YoutubeDL import YoutubeDL
 from youtube_dl.extractor.youtube import YoutubeError
 import json
 import urllib
 from string import Template
 import html
-import youtube.common as common
-from youtube.common import default_multi_get, get_thumbnail_url, video_id, URL_ORIGIN
-import youtube.comments as comments
 import gevent
 import settings
 import os
@ -127,9 +127,9 @@ def get_related_items_html(info):
     result = ""
     for item in info['related_vids']:
         if 'list' in item: # playlist:
-            result += common.small_playlist_item_html(watch_page_related_playlist_info(item))
+            result += html_common.small_playlist_item_html(watch_page_related_playlist_info(item))
         else:
-            result += common.small_video_item_html(watch_page_related_video_info(item))
+            result += html_common.small_video_item_html(watch_page_related_video_info(item))
     return result
@ -137,7 +137,7 @@ def get_related_items_html(info):
 # converts these to standard names
 def watch_page_related_video_info(item):
     result = {key: item[key] for key in ('id', 'title', 'author')}
-    result['duration'] = common.seconds_to_timestamp(item['length_seconds'])
+    result['duration'] = util.seconds_to_timestamp(item['length_seconds'])
     try:
         result['views'] = item['short_view_count_text']
     except KeyError:
@ -155,9 +155,9 @@ def watch_page_related_playlist_info(item):
 def sort_formats(info):
     sorted_formats = info['formats'].copy()
-    sorted_formats.sort(key=lambda x: default_multi_get(_formats, x['format_id'], 'height', default=0))
+    sorted_formats.sort(key=lambda x: util.default_multi_get(_formats, x['format_id'], 'height', default=0))
     for index, format in enumerate(sorted_formats):
-        if default_multi_get(_formats, format['format_id'], 'height', default=0) >= 360:
+        if util.default_multi_get(_formats, format['format_id'], 'height', default=0) >= 360:
             break
     sorted_formats = sorted_formats[index:] + sorted_formats[0:index]
     sorted_formats = [format for format in info['formats'] if format['acodec'] != 'none' and format['vcodec'] != 'none']
@ -236,7 +236,7 @@ def get_watch_page(env, start_response):
     start_response('200 OK', [('Content-type','text/html'),])
-    lc = common.default_multi_get(env['parameters'], 'lc', 0, default='')
+    lc = util.default_multi_get(env['parameters'], 'lc', 0, default='')
     if settings.route_tor:
         proxy = 'socks5://127.0.0.1:9150/'
     else:
@ -256,17 +256,17 @@ def get_watch_page(env, start_response):
     #chosen_format = choose_format(info)
     if isinstance(info, str): # youtube error
-        return common.yt_basic_template.substitute(
+        return html_common.yt_basic_template.substitute(
             page_title = "Error",
             style = "",
-            header = common.get_header(),
+            header = html_common.get_header(),
             page = html.escape(info),
         ).encode('utf-8')
     sorted_formats = sort_formats(info)
     video_info = {
-        "duration": common.seconds_to_timestamp(info["duration"]),
+        "duration": util.seconds_to_timestamp(info["duration"]),
         "id": info['id'],
         "title": info['title'],
         "author": info['uploader'],
@ -338,7 +338,7 @@ def get_watch_page(env, start_response):
     page = yt_watch_template.substitute(
         video_title = html.escape(info["title"]),
         page_title = html.escape(info["title"]),
-        header = common.get_header(),
+        header = html_common.get_header(),
         uploader = html.escape(info["uploader"]),
         uploader_channel_url = '/' + info["uploader_url"],
         upload_date = upload_date,

youtube/youtube.py

@ -1,7 +1,7 @@
 import mimetypes
 import urllib.parse
 import os
-from youtube import local_playlist, watch, search, playlist, channel, comments, common, post_comment, accounts
+from youtube import local_playlist, watch, search, playlist, channel, comments, post_comment, accounts, util
 import settings
 YOUTUBE_FILES = (
     "/shared.css",
@ -64,7 +64,7 @@ def youtube(env, start_response):
     elif path.startswith("/api/"):
         start_response('200 OK', [('Content-type', 'text/vtt'),] )
-        result = common.fetch_url('https://www.youtube.com' + path + ('?' + query_string if query_string else ''))
+        result = util.fetch_url('https://www.youtube.com' + path + ('?' + query_string if query_string else ''))
         result = result.replace(b"align:start position:0%", b"")
         return result

youtube/yt_data_extract.py Normal file (205 lines)

@ -0,0 +1,205 @@
import html

# videos (all of type str):
#   id
#   title
#   url
#   author
#   author_url
#   thumbnail
#   description
#   published
#   duration
#   likes
#   dislikes
#   views
#   playlist_index

# playlists:
#   id
#   title
#   url
#   author
#   author_url
#   thumbnail
#   description
#   updated
#   size
#   first_video_id

def get_plain_text(node):
    try:
        return html.escape(node['simpleText'])
    except KeyError:
        return unformmated_text_runs(node['runs'])

def unformmated_text_runs(runs):
    result = ''
    for text_run in runs:
        result += html.escape(text_run["text"])
    return result

def format_text_runs(runs):
    if isinstance(runs, str):
        return runs
    result = ''
    for text_run in runs:
        if text_run.get("bold", False):
            result += "<b>" + html.escape(text_run["text"]) + "</b>"
        elif text_run.get('italics', False):
            result += "<i>" + html.escape(text_run["text"]) + "</i>"
        else:
            result += html.escape(text_run["text"])
    return result

def get_url(node):
    try:
        return node['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']
    except KeyError:
        return node['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']

def get_text(node):
    try:
        return node['simpleText']
    except KeyError:
        pass
    try:
        return node['runs'][0]['text']
    except IndexError: # empty text runs
        return ''

def get_formatted_text(node):
    try:
        return node['runs']
    except KeyError:
        return node['simpleText']

def get_badges(node):
    badges = []
    for badge_node in node:
        badge = badge_node['metadataBadgeRenderer']['label']
        if badge.lower() != 'new':
            badges.append(badge)
    return badges

def get_thumbnail(node):
    try:
        return node['thumbnails'][0]['url']     # polymer format
    except KeyError:
        return node['url']     # ajax format

dispatch = {
    # polymer format
    'title': ('title', get_text),
    'publishedTimeText': ('published', get_text),
    'videoId': ('id', lambda node: node),
    'descriptionSnippet': ('description', get_formatted_text),
    'lengthText': ('duration', get_text),
    'thumbnail': ('thumbnail', get_thumbnail),
    'thumbnails': ('thumbnail', lambda node: node[0]['thumbnails'][0]['url']),
    'viewCountText': ('views', get_text),
    'numVideosText': ('size', lambda node: get_text(node).split(' ')[0]),   # the format is "324 videos"
    'videoCountText': ('size', get_text),
    'playlistId': ('id', lambda node: node),
    'descriptionText': ('description', get_formatted_text),
    'subscriberCountText': ('subscriber_count', get_text),
    'channelId': ('id', lambda node: node),
    'badges': ('badges', get_badges),

    # ajax format
    'view_count_text': ('views', get_text),
    'num_videos_text': ('size', lambda node: get_text(node).split(' ')[0]),
    'owner_text': ('author', get_text),
    'owner_endpoint': ('author_url', lambda node: node['url']),
    'description': ('description', get_formatted_text),
    'index': ('playlist_index', get_text),
    'short_byline': ('author', get_text),
    'length': ('duration', get_text),
    'video_id': ('id', lambda node: node),
}

def renderer_info(renderer):
    try:
        info = {}
        if 'viewCountText' in renderer:     # prefer this one as it contains all the digits
            info['views'] = get_text(renderer['viewCountText'])
        elif 'shortViewCountText' in renderer:
            info['views'] = get_text(renderer['shortViewCountText'])

        if 'ownerText' in renderer:
            info['author'] = renderer['ownerText']['runs'][0]['text']
            info['author_url'] = renderer['ownerText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']
        try:
            overlays = renderer['thumbnailOverlays']
        except KeyError:
            pass
        else:
            for overlay in overlays:
                if 'thumbnailOverlayTimeStatusRenderer' in overlay:
                    info['duration'] = get_text(overlay['thumbnailOverlayTimeStatusRenderer']['text'])
                # show renderers don't have videoCountText
                elif 'thumbnailOverlayBottomPanelRenderer' in overlay:
                    info['size'] = get_text(overlay['thumbnailOverlayBottomPanelRenderer']['text'])

        # show renderers don't have playlistId, have to dig into the url to get it
        try:
            info['id'] = renderer['navigationEndpoint']['watchEndpoint']['playlistId']
        except KeyError:
            pass
        for key, node in renderer.items():
            if key in ('longBylineText', 'shortBylineText'):
                info['author'] = get_text(node)
                try:
                    info['author_url'] = get_url(node)
                except KeyError:
                    pass

            # show renderers don't have thumbnail key at top level, dig into thumbnailRenderer
            elif key == 'thumbnailRenderer' and 'showCustomThumbnailRenderer' in node:
                info['thumbnail'] = node['showCustomThumbnailRenderer']['thumbnail']['thumbnails'][0]['url']
            else:
                try:
                    simple_key, function = dispatch[key]
                except KeyError:
                    continue
                info[simple_key] = function(node)
        return info
    except KeyError:
        print(renderer)
        raise

def ajax_info(item_json):
    try:
        info = {}
        for key, node in item_json.items():
            try:
                simple_key, function = dispatch[key]
            except KeyError:
                continue
            info[simple_key] = function(node)
        return info
    except KeyError:
        print(item_json)
        raise
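
For a sense of what renderer_info produces, here is a sketch run against a hand-written polymer-format renderer. The sample dict is hypothetical; its keys are the ones the dispatch table above understands:

from youtube import yt_data_extract

sample_renderer = {
    'videoId': 'abc123xyz00',
    'title': {'runs': [{'text': 'Example video'}]},
    'lengthText': {'simpleText': '3:32'},
    'viewCountText': {'simpleText': '1,234 views'},
    'shortBylineText': {'runs': [{
        'text': 'Example channel',
        'navigationEndpoint': {'commandMetadata': {'webCommandMetadata': {
            'url': '/channel/UCxxxxxxxxxxxxxxxxxxxxxx'}}},
    }]},
}

info = yt_data_extract.renderer_info(sample_renderer)
# info == {'views': '1,234 views', 'id': 'abc123xyz00',
#          'title': 'Example video', 'duration': '3:32',
#          'author': 'Example channel',
#          'author_url': '/channel/UCxxxxxxxxxxxxxxxxxxxxxx'}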