yt-local/youtube/subscriptions.py
2019-08-10 00:09:03 -07:00

596 lines
24 KiB
Python

import contextlib
import html
import json
import os
import sqlite3
import time
import traceback

import defusedxml.ElementTree
import flask
import gevent
from flask import request

from youtube import util, yt_data_extract, channel
from youtube import yt_app
import settings
# Directory holding the thumbnail files downloaded for subscription videos
thumbnails_directory = os.path.join(settings.data_dir, "subscription_thumbnails")
# https://stackabuse.com/a-sqlite-tutorial-with-python/
# Path of the SQLite file that stores subscribed channels, their videos and tags
database_path = os.path.join(settings.data_dir, "subscriptions.sqlite")
def open_database():
    ''' Open the subscriptions database, creating the data directory and
    the tables if they do not exist yet.

    Returns the sqlite3 connection wrapped in contextlib.closing, so
    `with open_database() as connection:` closes it when the block exits.
    If setting up the schema fails, the connection is rolled back and
    closed and the original exception is re-raised. '''
    # exist_ok avoids the race between checking for the directory and creating it
    os.makedirs(settings.data_dir, exist_ok=True)
    connection = sqlite3.connect(database_path, check_same_thread=False)

    schema_statements = (
        'CREATE TABLE IF NOT EXISTS subscribed_channels (\n'
        'id integer PRIMARY KEY,\n'
        'yt_channel_id text UNIQUE NOT NULL,\n'
        'channel_name text NOT NULL,\n'
        'time_last_checked integer,\n'
        'muted integer DEFAULT 0,\n'
        'upload_frequency integer\n'
        ')',

        'CREATE TABLE IF NOT EXISTS videos (\n'
        'id integer PRIMARY KEY,\n'
        'sql_channel_id integer NOT NULL REFERENCES subscribed_channels(id) ON UPDATE CASCADE ON DELETE CASCADE,\n'
        'video_id text UNIQUE NOT NULL,\n'
        'title text NOT NULL,\n'
        'duration text,\n'
        'time_published integer NOT NULL,\n'
        'description text\n'
        ')',

        'CREATE TABLE IF NOT EXISTS tag_associations (\n'
        'id integer PRIMARY KEY,\n'
        'tag text NOT NULL,\n'
        'sql_channel_id integer NOT NULL REFERENCES subscribed_channels(id) ON UPDATE CASCADE ON DELETE CASCADE,\n'
        'UNIQUE(tag, sql_channel_id)\n'
        ')',
    )

    try:
        cursor = connection.cursor()
        # SQLite only enforces foreign keys when enabled on each connection.
        # Without this the ON DELETE CASCADE clauses are ignored and
        # unsubscribing leaves the channel's videos and tags behind.
        cursor.execute('PRAGMA foreign_keys = ON')

        # Create tables if they don't exist
        for statement in schema_statements:
            cursor.execute(statement)
        connection.commit()
    except BaseException:
        # Clean up on any failure, then let the caller see the error
        connection.rollback()
        connection.close()
        raise

    # https://stackoverflow.com/questions/19522505/using-sqlite3-in-python-with-with-keyword
    return contextlib.closing(connection)
def with_open_db(function, *args, **kwargs):
    ''' Open the database, call function(cursor, *args, **kwargs) and
    return its result. The database is closed afterwards. '''
    with open_database() as db:
        # Per the sqlite3 docs, using the connection as a context manager
        # commits on success and rolls back if an exception escapes.
        with db as cursor:
            outcome = function(cursor, *args, **kwargs)
        return outcome
def is_subscribed(channel_id):
    ''' Return True if a row for the YouTube channel id exists in
    subscribed_channels, False otherwise. '''
    # Without a database file there cannot be any subscriptions
    if not os.path.exists(database_path):
        return False

    query = (
        'SELECT EXISTS(\n'
        'SELECT 1\n'
        'FROM subscribed_channels\n'
        'WHERE yt_channel_id=?\n'
        'LIMIT 1\n'
        ')'
    )
    with open_database() as connection:
        with connection as cursor:
            row = cursor.execute(query, [channel_id]).fetchone()
            found = row[0]
            return bool(found)
def _subscribe(cursor, channels):
''' channels is a list of (channel_id, channel_name) '''
# set time_last_checked to 0 on all channels being subscribed to
channels = ( (channel_id, channel_name, 0) for channel_id, channel_name in channels)
cursor.executemany('''INSERT OR IGNORE INTO subscribed_channels (yt_channel_id, channel_name, time_last_checked)
VALUES (?, ?, ?)''', channels)
# TODO: delete thumbnails
def _unsubscribe(cursor, channel_ids):
''' channel_ids is a list of channel_ids '''
cursor.executemany("DELETE FROM subscribed_channels WHERE yt_channel_id=?", ((channel_id, ) for channel_id in channel_ids))
def _get_videos(cursor, number, offset):
db_videos = cursor.execute('''SELECT video_id, title, duration, channel_name
FROM videos
INNER JOIN subscribed_channels on videos.sql_channel_id = subscribed_channels.id
ORDER BY time_published DESC
LIMIT ? OFFSET ?''', (number, offset))
for db_video in db_videos:
yield {
'id': db_video[0],
'title': db_video[1],
'duration': db_video[2],
'author': db_video[3],
}
def _get_subscribed_channels(cursor):
for item in cursor.execute('''SELECT channel_name, yt_channel_id, muted
FROM subscribed_channels
ORDER BY channel_name COLLATE NOCASE'''):
yield item
def _add_tags(cursor, channel_ids, tags):
pairs = [(tag, yt_channel_id) for tag in tags for yt_channel_id in channel_ids]
cursor.executemany('''INSERT OR IGNORE INTO tag_associations (tag, sql_channel_id)
SELECT ?, id FROM subscribed_channels WHERE yt_channel_id = ? ''', pairs)
def _remove_tags(cursor, channel_ids, tags):
pairs = [(tag, yt_channel_id) for tag in tags for yt_channel_id in channel_ids]
cursor.executemany('''DELETE FROM tag_associations
WHERE tag = ? AND sql_channel_id = (
SELECT id FROM subscribed_channels WHERE yt_channel_id = ?
)''', pairs)
def _get_tags(cursor, channel_id):
return [row[0] for row in cursor.execute('''SELECT tag
FROM tag_associations
WHERE sql_channel_id = (
SELECT id FROM subscribed_channels WHERE yt_channel_id = ?
)''', (channel_id,))]
def _get_all_tags(cursor):
return [row[0] for row in cursor.execute('''SELECT DISTINCT tag FROM tag_associations''')]
def _get_channel_names(cursor, channel_ids):
''' returns list of (channel_id, channel_name) '''
result = []
for channel_id in channel_ids:
row = cursor.execute('''SELECT channel_name
FROM subscribed_channels
WHERE yt_channel_id = ?''', (channel_id,)).fetchone()
result.append( (channel_id, row[0]) )
return result
def _channels_with_tag(cursor, tag, order=False, exclude_muted=False, include_muted_status=False):
''' returns list of (channel_id, channel_name) '''
statement = '''SELECT yt_channel_id, channel_name'''
if include_muted_status:
statement += ''', muted'''
statement += '''
FROM subscribed_channels
WHERE subscribed_channels.id IN (
SELECT tag_associations.sql_channel_id FROM tag_associations WHERE tag=?
)
'''
if exclude_muted:
statement += '''AND muted != 1\n'''
if order:
statement += '''ORDER BY channel_name COLLATE NOCASE'''
return cursor.execute(statement, [tag]).fetchall()
# Approximate number of seconds in each unit that appears in relative timestamps
units = {
    'year': 31536000,   # 365*24*3600
    'month': 2592000,   # 30*24*3600
    'week': 604800,     # 7*24*3600
    'day': 86400,       # 24*3600
    'hour': 3600,
    'minute': 60,
    'second': 1,
}


def youtube_timestamp_to_posix(dumb_timestamp):
    ''' Given a dumbed down timestamp such as 1 year ago, 3 hours ago,
    approximates the unix time (seconds since 1/1/1970)

    The text must be "just now" or start with "<number> <unit>", matched
    case-insensitively. Raises ValueError if the first word is not an
    integer, IndexError if there is no second word and KeyError if the
    unit is not in `units`. '''
    dumb_timestamp = dumb_timestamp.lower()
    now = time.time()
    if dumb_timestamp == "just now":
        return now
    split = dumb_timestamp.split(' ')
    number, unit = int(split[0]), split[1]
    # Drop the plural s whenever it is there. Checking number > 1 instead
    # made "0 seconds ago" fail with KeyError('seconds').
    if unit.endswith('s'):
        unit = unit[:-1]
    return now - number*units[unit]
# Names (without extension) of the files already in the thumbnails directory
try:
    existing_thumbnails = {
        os.path.splitext(filename)[0]
        for filename in os.listdir(thumbnails_directory)
    }
except FileNotFoundError:
    # The directory has not been created yet
    existing_thumbnails = set()

# Queues feeding the worker greenlets
thumbnails_queue = util.RateLimitedQueue()
check_channels_queue = util.RateLimitedQueue()

# Use this to mark a thumbnail acceptable to be retrieved at the request of the browser
# can't simply check if it's in the queue because items are removed when the download starts, not when it finishes
downloading_thumbnails = set()

# Ids of channels that are queued for, or in the middle of, a check
checking_channels = set()

# Just to use for printing channel checking status to console without opening database
channel_names = {}
def download_thumbnail_worker():
    ''' Worker loop: take video ids off thumbnails_queue forever and
    download the thumbnail of each one. Errors are printed and do not
    stop the loop. '''
    while True:
        video_id = thumbnails_queue.get()
        try:
            if util.download_thumbnail(thumbnails_directory, video_id):
                existing_thumbnails.add(video_id)
        except Exception:
            traceback.print_exc()
        finally:
            # Whatever happened, this id is no longer being downloaded
            downloading_thumbnails.remove(video_id)
def check_channel_worker():
    ''' Worker loop: take channel ids off check_channels_queue forever and
    fetch the upstream videos of each one. '''
    while True:
        channel_id = check_channels_queue.get()
        try:
            _get_upstream_videos(channel_id)
        except Exception:
            # Same policy as download_thumbnail_worker: report the failure
            # and keep going. An uncaught error would end this worker for
            # good, and once every worker is gone nothing gets checked.
            traceback.print_exc()
        finally:
            checking_channels.remove(channel_id)
# Start five greenlets of each worker kind
for _ in range(5):
    gevent.spawn(download_thumbnail_worker)
    gevent.spawn(check_channel_worker)
def download_thumbnails_if_necessary(thumbnails):
    ''' Queue a download for every video id in thumbnails that is neither
    already on disk nor already marked as downloading. '''
    for video_id in thumbnails:
        if video_id in existing_thumbnails or video_id in downloading_thumbnails:
            continue
        # Mark it before queueing so the same id is not queued twice
        downloading_thumbnails.add(video_id)
        thumbnails_queue.put(video_id)
def check_channels_if_necessary(channel_ids):
    ''' Queue a check for every channel id in channel_ids that is not
    already marked as being checked. '''
    for channel_id in channel_ids:
        if channel_id in checking_channels:
            continue
        # Mark it before queueing so the same channel is not queued twice
        checking_channels.add(channel_id)
        check_channels_queue.put(channel_id)
def _get_upstream_videos(channel_id):
    ''' Fetch the videos tab of the channel, store videos not yet in the
    database, queue thumbnail downloads for the recent ones and update
    the channel's time_last_checked. '''
    try:
        print("Checking channel: " + channel_names[channel_id])
    except KeyError:
        print("Checking channel " + channel_id)

    videos = []
    channel_videos = channel.extract_info(json.loads(channel.get_channel_tab(channel_id)), 'videos')['items']
    for i, video_item in enumerate(channel_videos):
        if 'description' not in video_item:
            video_item['description'] = ''
        try:
            video_item['time_published'] = youtube_timestamp_to_posix(video_item['published']) - i  # subtract a few seconds off the videos so they will be in the right order
        except KeyError:
            # No usable publish time. Print the item for debugging and skip
            # it; falling through would raise the same KeyError again on
            # 'time_published' and abort the whole channel check.
            print(video_item)
            continue
        videos.append((channel_id, video_item['id'], video_item['title'], video_item['duration'], video_item['time_published'], video_item['description']))

    now = time.time()
    download_thumbnails_if_necessary(video[1] for video in videos if (now - video[4]) < 30*24*3600)  # Don't download thumbnails from videos older than a month

    with open_database() as connection:
        with connection as cursor:
            # OR IGNORE leaves videos that are already stored untouched
            cursor.executemany(
                'INSERT OR IGNORE INTO videos (sql_channel_id, video_id, title, duration, time_published, description)\n'
                'VALUES ((SELECT id FROM subscribed_channels WHERE yt_channel_id=?), ?, ?, ?, ?, ?)',
                videos,
            )
            cursor.execute(
                'UPDATE subscribed_channels\n'
                'SET time_last_checked = ?\n'
                'WHERE yt_channel_id=?',
                [int(time.time()), channel_id],
            )
def check_all_channels():
    ''' Queue a check for every subscribed channel that is not muted '''
    query = (
        'SELECT yt_channel_id, channel_name\n'
        'FROM subscribed_channels\n'
        'WHERE muted != 1'
    )
    with open_database() as connection:
        with connection as cursor:
            unmuted = cursor.execute(query).fetchall()

    # Remember the names so the workers can print them
    channel_names.update(unmuted)
    check_channels_if_necessary([row[0] for row in unmuted])
def check_tags(tags):
    ''' Queue a check for every unmuted channel carrying any of the tags '''
    tagged = []
    with open_database() as connection:
        with connection as cursor:
            for tag in tags:
                tagged.extend(_channels_with_tag(cursor, tag, exclude_muted=True))

    # Remember the names so the workers can print them
    channel_names.update(tagged)
    check_channels_if_necessary([row[0] for row in tagged])
def check_specific_channels(channel_ids):
    ''' Queue a check for each of the given YouTube channel ids '''
    query = (
        'SELECT yt_channel_id, channel_name\n'
        'FROM subscribed_channels\n'
        'WHERE yt_channel_id=?'
    )
    known = []
    with open_database() as connection:
        with connection as cursor:
            for channel_id in channel_ids:
                known.extend(cursor.execute(query, [channel_id]).fetchall())

    # Only the names come from the database; every id passed in is queued
    channel_names.update(known)
    check_channels_if_necessary(channel_ids)
def import_subscriptions(env, start_response):
    ''' Subscribe to the channels listed in an uploaded subscriptions file.

    env['parameters']['subscriptions_file'] is (content_type, file_bytes).
    A JSON list with snippet.resourceId.channelId and snippet.title in each
    item, or an OPML XML document, is accepted. Responds 400 with a short
    message when the file cannot be used, otherwise 303 to the
    subscription manager. Returns the response body as bytes. '''
    content_type = env['parameters']['subscriptions_file'][0]
    file = env['parameters']['subscriptions_file'][1]
    file = file.decode('utf-8')

    if content_type == 'application/json':
        try:
            file = json.loads(file)
        except json.decoder.JSONDecodeError:
            traceback.print_exc()
            start_response('400 Bad Request', () )
            return b'400 Bad Request: Invalid json file'

        try:
            # Must be a list, not a generator: a lazy generator would only
            # hit a malformed item later, outside this try block.
            channels = [(item['snippet']['resourceId']['channelId'], item['snippet']['title']) for item in file]
        except (KeyError, IndexError, TypeError):
            traceback.print_exc()
            start_response('400 Bad Request', () )
            return b'400 Bad Request: unknown json structure'

    elif content_type in ('application/xml', 'text/xml', 'text/x-opml'):
        try:
            root = defusedxml.ElementTree.fromstring(file)
            # An explicit check rather than assert, which is removed under -O
            if root.tag != 'opml':
                raise ValueError('root element is not opml')
            channels = []
            for outline_element in root[0][0]:
                if (outline_element.tag != 'outline') or ('xmlUrl' not in outline_element.attrib):
                    continue
                channel_name = outline_element.attrib['text']
                channel_rss_url = outline_element.attrib['xmlUrl']
                # Everything after the first "channel_id=" is the id
                _, marker, channel_id = channel_rss_url.partition('channel_id=')
                if not marker:
                    # Not a channel feed url, so there is no id to take
                    continue
                channels.append( (channel_id.strip(), channel_name) )
        except (ValueError, KeyError, IndexError, defusedxml.ElementTree.ParseError):
            start_response('400 Bad Request', () )
            return b'400 Bad Request: Unable to read opml xml file, or the file is not the expected format'
    else:
        start_response('400 Bad Request', () )
        return b'400 Bad Request: Unsupported file format: ' + html.escape(content_type).encode('utf-8') + b'. Only subscription.json files (from Google Takeouts) and XML OPML files exported from Youtube\'s subscription manager page are supported'

    with_open_db(_subscribe, channels)

    start_response('303 See Other', [('Location', util.URL_ORIGIN + '/subscription_manager'),] )
    return b''
# NOTE: the two template definitions below are disabled: they sit inside a bare
# string literal, so sub_list_item_template and tag_group_template are not
# defined here although get_subscription_manager_page refers to them.
"""sub_list_item_template = Template('''
<li class="sub-list-item $mute_class">
<input class="sub-list-checkbox" name="channel_ids" value="$channel_id" form="subscription-manager-form" type="checkbox">
<a href="$channel_url" class="sub-list-item-name" title="$channel_name">$channel_name</a>
<span class="tag-list">$tags</span>
</li>''')
tag_group_template = Template('''
<li class="tag-group">
<h2 class="tag-group-name">$tag</h2>
<ol class="sub-list">
$sub_list
</ol>
</li>
''')"""
def _sub_list_item_html(channel_id, channel_name, muted, tags):
    ''' Render the list entry of one channel. Every value is HTML-escaped
    because channel ids, names and tags can come from imported files or
    user input. '''
    return sub_list_item_template.substitute(
        channel_url = html.escape(util.URL_ORIGIN + '/channel/' + channel_id),
        channel_name = html.escape(channel_name),
        channel_id = html.escape(channel_id),
        tags = ', '.join(html.escape(tag) for tag in tags),
        mute_class = 'muted' if muted else '',
    )


def get_subscription_manager_page(env, start_response):
    ''' Render the subscription manager page and return it as utf-8 bytes.

    With the parameter group_by_tags=1 the channels are listed once under
    each of their tags, followed by a "No tags" group. Otherwise all
    channels are listed in one list sorted by name.

    NOTE(review): the templates and html_common used here are not defined
    or imported in the visible part of this file - confirm before use. '''
    group_by_tags = env['parameters'].get('group_by_tags', '0')[0] == '1'

    with open_database() as connection:
        with connection as cursor:
            if group_by_tags:
                sort_name = "Don't group"
                sort_link = util.URL_ORIGIN + '/subscription_manager'

                groups = ['<ul class="tag-group-list">']
                for tag in _get_all_tags(cursor):
                    items = [
                        _sub_list_item_html(
                            channel_id, channel_name, muted,
                            # the group's own tag is not repeated on its entries
                            [t for t in _get_tags(cursor, channel_id) if t != tag],
                        )
                        for channel_id, channel_name, muted
                        in _channels_with_tag(cursor, tag, order=True, include_muted_status=True)
                    ]
                    groups.append(tag_group_template.substitute(
                        tag = html.escape(tag),
                        sub_list = ''.join(items),
                    ))

                # Channels with no tags
                channel_list = cursor.execute(
                    'SELECT yt_channel_id, channel_name, muted\n'
                    'FROM subscribed_channels\n'
                    'WHERE id NOT IN (\n'
                    'SELECT sql_channel_id FROM tag_associations\n'
                    ')\n'
                    'ORDER BY channel_name COLLATE NOCASE'
                ).fetchall()
                if channel_list:
                    items = [
                        _sub_list_item_html(channel_id, channel_name, muted, [])
                        for channel_id, channel_name, muted in channel_list
                    ]
                    groups.append(tag_group_template.substitute(
                        tag = "No tags",
                        sub_list = ''.join(items),
                    ))
                groups.append('</ul>')
                main_list_html = ''.join(groups)
            else:
                sort_name = "Group by tags"
                sort_link = util.URL_ORIGIN + '/subscription_manager?group_by_tags=1'

                entries = ['<ol class="sub-list">']
                for channel_name, channel_id, muted in _get_subscribed_channels(cursor):
                    entries.append(_sub_list_item_html(
                        channel_id, channel_name, muted, _get_tags(cursor, channel_id),
                    ))
                entries.append('</ol>')
                main_list_html = ''.join(entries)

    start_response('200 OK', [('Content-type','text/html'),])
    return subscription_manager_template.substitute(
        header = html_common.get_header(),
        main_list = main_list_html,
        sort_name = sort_name,
        sort_link = sort_link,
        page_buttons = '',
    ).encode('utf-8')
def list_from_comma_separated_tags(string):
    ''' Split a comma separated string into a list of tags. Whitespace
    around each tag is removed and empty entries are dropped. '''
    stripped_pieces = (piece.strip() for piece in string.split(','))
    return [tag for tag in stripped_pieces if tag]
# NOTE: this template definition is disabled: it sits inside a bare string
# literal, so unsubscribe_list_item_template is not defined here although
# post_subscription_manager_page refers to it.
"""unsubscribe_list_item_template = Template('''
<li><a href="$channel_url" title="$channel_name">$channel_name</a></li>''')"""
def post_subscription_manager_page(env, start_response):
    ''' Carry out an action posted from the subscription manager page.

    env['parameters']['action'][0] selects the action: add_tags,
    remove_tags, unsubscribe, unsubscribe_verify, mute or unmute. The
    channels come from env['parameters']['channel_ids'] and, for the tag
    actions, the tags from the comma separated env['parameters']['tags'][0].

    unsubscribe_verify responds 200 with a confirmation page, an unknown
    action responds 400, and every other action responds 303 back to the
    subscription manager. Returns the response body as bytes.

    NOTE(review): html_common and unsubscribe_list_item_template are not
    defined or imported in the visible part of this file - confirm. '''
    params = env['parameters']
    action = params['action'][0]

    # Reject unknown actions before touching the database
    known_actions = ('add_tags', 'remove_tags', 'unsubscribe', 'unsubscribe_verify', 'mute', 'unmute')
    if action not in known_actions:
        start_response('400 Bad Request', ())
        return b'400 Bad Request'

    # The key is absent when no channel was selected; treat that as an empty selection
    channel_ids = params.get('channel_ids', [])

    with open_database() as connection:
        with connection as cursor:
            if action == 'add_tags':
                _add_tags(cursor, channel_ids, [tag.lower() for tag in list_from_comma_separated_tags(params['tags'][0])])
            elif action == 'remove_tags':
                _remove_tags(cursor, channel_ids, [tag.lower() for tag in list_from_comma_separated_tags(params['tags'][0])])
            elif action == 'unsubscribe':
                _unsubscribe(cursor, channel_ids)
            elif action == 'unsubscribe_verify':
                page_parts = [
                    '\n'
                    '<span>Are you sure you want to unsubscribe from these channels?</span>\n'
                    '<form class="subscriptions-import-form" action="/youtube.com/subscription_manager" method="POST">'
                ]
                for channel_id in channel_ids:
                    # channel_id comes straight from the request, so escape it for the attribute
                    page_parts.append('<input type="hidden" name="channel_ids" value="' + html.escape(channel_id) + '">\n')
                page_parts.append(
                    '\n'
                    '<input type="hidden" name="action" value="unsubscribe">\n'
                    '<input type="submit" value="Yes, unsubscribe">\n'
                    '</form>\n'
                    '<ul>'
                )
                for channel_id, channel_name in _get_channel_names(cursor, channel_ids):
                    page_parts.append(unsubscribe_list_item_template.substitute(
                        channel_url = html.escape(util.URL_ORIGIN + '/channel/' + channel_id),
                        channel_name = html.escape(channel_name),
                    ))
                page_parts.append('</ul>')

                start_response('200 OK', [('Content-type','text/html'),])
                return html_common.yt_basic_template.substitute(
                    page_title = 'Unsubscribe?',
                    style = '',
                    header = html_common.get_header(),
                    page = ''.join(page_parts),
                ).encode('utf-8')
            elif action == 'mute':
                cursor.executemany(
                    'UPDATE subscribed_channels\n'
                    'SET muted = 1\n'
                    'WHERE yt_channel_id = ?',
                    [(ci,) for ci in channel_ids],
                )
            elif action == 'unmute':
                cursor.executemany(
                    'UPDATE subscribed_channels\n'
                    'SET muted = 0\n'
                    'WHERE yt_channel_id = ?',
                    [(ci,) for ci in channel_ids],
                )

    start_response('303 See Other', [('Location', util.URL_ORIGIN + '/subscription_manager'),] )
    return b''
@yt_app.route('/subscriptions', methods=['GET'])
@yt_app.route('/feed/subscriptions', methods=['GET'])
def get_subscriptions_page():
    ''' Render the subscriptions feed: the 60 most recently published
    videos, all tags and the list of subscribed channels. '''
    with open_database() as connection:
        with connection as cursor:
            videos = []
            for video in _get_videos(cursor, 60, 0):
                video_id = video['id']
                if video_id in downloading_thumbnails:
                    # Still being downloaded, so use the url from util.get_thumbnail_url
                    thumbnail = util.get_thumbnail_url(video_id)
                else:
                    thumbnail = util.URL_ORIGIN + '/data/subscription_thumbnails/' + video_id + '.jpg'
                video.update(thumbnail=thumbnail, type='video', item_size='small')
                videos.append(video)

            tags = _get_all_tags(cursor)

            subscription_list = [
                {
                    'channel_url': util.URL_ORIGIN + '/channel/' + channel_id,
                    'channel_name': channel_name,
                    'channel_id': channel_id,
                    'muted': muted,
                }
                for channel_name, channel_id, muted in _get_subscribed_channels(cursor)
            ]

    return flask.render_template('subscriptions.html',
        videos = videos,
        tags = tags,
        subscription_list = subscription_list,
    )
@yt_app.route('/subscriptions', methods=['POST'])
@yt_app.route('/feed/subscriptions', methods=['POST'])
def post_subscriptions_page():
    ''' Handle an action posted to the subscriptions page.

    action=subscribe: subscribe to the channel_id/channel_name pairs
    action=unsubscribe: unsubscribe from each channel_id
    action=refresh: queue checks; type is all, tag (with tag_name values)
        or channel (with channel_id values)

    Returns an empty 204 response on success and a 400 response when the
    action or refresh type is unknown, or when the numbers of channel ids
    and channel names differ. '''
    action = request.values['action']
    if action == 'subscribe':
        ids = request.values.getlist('channel_id')
        # getlist, not request.values(...): calling the values mapping raises TypeError
        names = request.values.getlist('channel_name')
        if len(ids) != len(names):
            return '400 Bad Request, length of channel_id != length of channel_name', 400
        with_open_db(_subscribe, zip(ids, names))

    elif action == 'unsubscribe':
        with_open_db(_unsubscribe, request.values.getlist('channel_id'))

    elif action == 'refresh':
        refresh_type = request.values['type']
        if refresh_type == 'all':
            check_all_channels()
        elif refresh_type == 'tag':
            check_tags(request.values.getlist('tag_name'))
        elif refresh_type == 'channel':
            check_specific_channels(request.values.getlist('channel_id'))
        else:
            flask.abort(400)
    else:
        flask.abort(400)

    return '', 204
@yt_app.route('/data/subscription_thumbnails/<thumbnail>')
def serve_subscription_thumbnail(thumbnail):
    ''' Serve the file named `thumbnail` from the thumbnails directory '''
    # .. is necessary because flask always uses the application directory at ./youtube, not the working directory
    directory = os.path.join('..', thumbnails_directory)
    return flask.send_from_directory(directory, thumbnail)