Finally fix the video count on channels accessed through general URLs, rather than just by channel ID.

It was set to a fake value of 1000 previously in order to ensure there would be enough page buttons.
A placeholder was used because obtaining the real count requires two sequential requests (one to get the channel ID corresponding to the custom URL, and another to get the number of videos from the "all uploaded videos" playlist, whose URL can be generated from the channel ID).

Since Tor has high latency, I thought at the time that this would be too slow, but in practice it's not a big deal.

Introduces a cachetools dependency in order to cache the result of the function which gets the number of videos.

The get_channel_id function has also been fixed, since the AJAX API it relied on seems to have been removed.
This commit is contained in:
James Taylor 2019-12-22 18:29:31 -08:00
parent bafae2837e
commit 222117143f
2 changed files with 35 additions and 19 deletions

View File

@ -4,3 +4,4 @@ Brotli>=1.0.7
PySocks>=1.6.8 PySocks>=1.6.8
urllib3>=1.24.1 urllib3>=1.24.1
defusedxml>=0.5.0 defusedxml>=0.5.0
cachetools>=4.0.0

View File

@ -10,7 +10,8 @@ import html
import math import math
import gevent import gevent
import re import re
import functools import cachetools.func
import traceback
import flask import flask
from flask import request from flask import request
@ -95,23 +96,25 @@ def get_channel_tab(channel_id, page="1", sort=3, tab='videos', view=1, print_st
return content return content
def get_number_of_videos(channel_id): # cache entries expire after 30 minutes
@cachetools.func.ttl_cache(maxsize=128, ttl=30*60)
def get_number_of_videos_channel(channel_id):
if channel_id is None:
return 1000
# Uploads playlist # Uploads playlist
playlist_id = 'UU' + channel_id[2:] playlist_id = 'UU' + channel_id[2:]
url = 'https://m.youtube.com/playlist?list=' + playlist_id + '&pbj=1' url = 'https://m.youtube.com/playlist?list=' + playlist_id + '&pbj=1'
print("Getting number of videos")
# Sometimes retrieving playlist info fails with 403 for no discernable reason
try: try:
response = util.fetch_url(url, util.mobile_ua + headers_pbj, debug_name='number_of_videos') response = util.fetch_url(url, util.mobile_ua + headers_pbj,
debug_name='number_of_videos', report_text='Got number of videos')
except urllib.error.HTTPError as e: except urllib.error.HTTPError as e:
if e.code != 403: traceback.print_exc()
raise
print("Couldn't retrieve number of videos") print("Couldn't retrieve number of videos")
return 1000 return 1000
response = response.decode('utf-8') response = response.decode('utf-8')
print("Got response for number of videos")
match = re.search(r'"numVideosText":\s*{\s*"runs":\s*\[{"text":\s*"([\d,]*) videos"', response) match = re.search(r'"numVideosText":\s*{\s*"runs":\s*\[{"text":\s*"([\d,]*) videos"', response)
if match: if match:
@ -119,13 +122,21 @@ def get_number_of_videos(channel_id):
else: else:
return 0 return 0
@functools.lru_cache(maxsize=128) channel_id_re = re.compile(r'videos\.xml\?channel_id=([a-zA-Z0-9_-]{24})"')
def get_channel_id(username): @cachetools.func.lru_cache(maxsize=128)
# method that gives the smallest possible response at ~10 kb def get_channel_id(base_url):
# method that gives the smallest possible response at ~4 kb
# needs to be as fast as possible # needs to be as fast as possible
url = 'https://m.youtube.com/user/' + username + '/about?ajax=1&disable_polymer=true' base_url = base_url.replace('https://www', 'https://m') # avoid redirect
response = util.fetch_url(url, util.mobile_ua + headers_1).decode('utf-8') response = util.fetch_url(base_url + '/about?pbj=1', util.mobile_ua + headers_pbj,
return re.search(r'"channel_id":\s*"([a-zA-Z0-9_-]*)"', response).group(1) debug_name='get_channel_id', report_text='Got channel id').decode('utf-8')
match = channel_id_re.search(response)
if match:
return match.group(1)
return None
def get_number_of_videos_general(base_url):
return get_number_of_videos_channel(get_channel_id(base_url))
def get_channel_search_json(channel_id, query, page): def get_channel_search_json(channel_id, query, page):
params = proto.string(2, 'search') + proto.string(15, str(page)) params = proto.string(2, 'search') + proto.string(15, str(page))
@ -164,21 +175,25 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None):
if tab == 'videos' and channel_id: if tab == 'videos' and channel_id:
tasks = ( tasks = (
gevent.spawn(get_number_of_videos, channel_id ), gevent.spawn(get_number_of_videos_channel, channel_id),
gevent.spawn(get_channel_tab, channel_id, page_number, sort, 'videos', view) gevent.spawn(get_channel_tab, channel_id, page_number, sort, 'videos', view)
) )
gevent.joinall(tasks) gevent.joinall(tasks)
number_of_videos, polymer_json = tasks[0].value, tasks[1].value number_of_videos, polymer_json = tasks[0].value, tasks[1].value
elif tab == 'videos': elif tab == 'videos':
polymer_json = util.fetch_url(base_url + '/videos?pbj=1&view=0', util.desktop_ua + headers_1, debug_name='gen_channel_videos') tasks = (
number_of_videos = 1000 gevent.spawn(get_number_of_videos_general, base_url),
gevent.spawn(util.fetch_url, base_url + '/videos?pbj=1&view=0', util.desktop_ua + headers_1, debug_name='gen_channel_videos')
)
gevent.joinall(tasks)
number_of_videos, polymer_json = tasks[0].value, tasks[1].value
elif tab == 'about': elif tab == 'about':
polymer_json = util.fetch_url(base_url + '/about?pbj=1', util.desktop_ua + headers_1, debug_name='gen_channel_about') polymer_json = util.fetch_url(base_url + '/about?pbj=1', util.desktop_ua + headers_1, debug_name='gen_channel_about')
elif tab == 'playlists': elif tab == 'playlists':
polymer_json = util.fetch_url(base_url+ '/playlists?pbj=1&view=1&sort=' + playlist_sort_codes[sort], util.desktop_ua + headers_1, debug_name='gen_channel_playlists') polymer_json = util.fetch_url(base_url+ '/playlists?pbj=1&view=1&sort=' + playlist_sort_codes[sort], util.desktop_ua + headers_1, debug_name='gen_channel_playlists')
elif tab == 'search' and channel_id: elif tab == 'search' and channel_id:
tasks = ( tasks = (
gevent.spawn(get_number_of_videos, channel_id ), gevent.spawn(get_number_of_videos_channel, channel_id ),
gevent.spawn(get_channel_search_json, channel_id, query, page_number) gevent.spawn(get_channel_search_json, channel_id, query, page_number)
) )
gevent.joinall(tasks) gevent.joinall(tasks)
@ -199,11 +214,11 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None):
if tab in ('videos', 'search'): if tab in ('videos', 'search'):
info['number_of_videos'] = number_of_videos info['number_of_videos'] = number_of_videos
info['number_of_pages'] = math.ceil(number_of_videos/30) info['number_of_pages'] = math.ceil(number_of_videos/30)
info['header_playlist_names'] = local_playlist.get_playlist_names()
if tab in ('videos', 'playlists'): if tab in ('videos', 'playlists'):
info['current_sort'] = sort info['current_sort'] = sort
elif tab == 'search': elif tab == 'search':
info['search_box_value'] = query info['search_box_value'] = query
info['header_playlist_names'] = local_playlist.get_playlist_names()
info['subscribed'] = subscriptions.is_subscribed(info['channel_id']) info['subscribed'] = subscriptions.is_subscribed(info['channel_id'])
return flask.render_template('channel.html', return flask.render_template('channel.html',