Subscriptions: check channel's atom feed to get exact time published
This commit is contained in:
parent
899b088cdc
commit
056f6fd2b5
@ -14,6 +14,7 @@ import urllib
|
|||||||
import math
|
import math
|
||||||
import secrets
|
import secrets
|
||||||
import collections
|
import collections
|
||||||
|
import calendar # bullshit! https://bugs.python.org/issue6280
|
||||||
|
|
||||||
import flask
|
import flask
|
||||||
from flask import request
|
from flask import request
|
||||||
@ -278,7 +279,10 @@ def posix_to_dumbed_down(posix_time):
|
|||||||
raise Exception()
|
raise Exception()
|
||||||
|
|
||||||
def exact_timestamp(posix_time):
|
def exact_timestamp(posix_time):
|
||||||
return time.strftime('%m/%d/%y %I:%M %p', time.localtime(posix_time))
|
result = time.strftime('%I:%M %p %m/%d/%y', time.localtime(posix_time))
|
||||||
|
if result[0] == '0': # remove 0 infront of hour (like 01:00 PM)
|
||||||
|
return result[1:]
|
||||||
|
return result
|
||||||
|
|
||||||
try:
|
try:
|
||||||
existing_thumbnails = set(os.path.splitext(name)[0] for name in os.listdir(thumbnails_directory))
|
existing_thumbnails = set(os.path.splitext(name)[0] for name in os.listdir(thumbnails_directory))
|
||||||
@ -385,15 +389,64 @@ def _get_upstream_videos(channel_id):
|
|||||||
|
|
||||||
videos = []
|
videos = []
|
||||||
|
|
||||||
channel_videos = channel.extract_info(json.loads(channel.get_channel_tab(channel_id, print_status=False)), 'videos')['items']
|
tasks = (
|
||||||
|
gevent.spawn(channel.get_channel_tab, channel_id, print_status=False), # channel page, need for video duration
|
||||||
|
gevent.spawn(util.fetch_url, 'https://www.youtube.com/feeds/videos.xml?channel_id=' + channel_id) # atoma feed, need for exact published time
|
||||||
|
)
|
||||||
|
gevent.joinall(tasks)
|
||||||
|
|
||||||
|
channel_tab, feed = tasks[0].value, tasks[1].value
|
||||||
|
|
||||||
|
# extract published times from atoma feed
|
||||||
|
times_published = {}
|
||||||
|
try:
|
||||||
|
def remove_bullshit(tag):
|
||||||
|
'''Remove XML namespace bullshit from tagname. https://bugs.python.org/issue18304'''
|
||||||
|
if '}' in tag:
|
||||||
|
return tag[tag.rfind('}')+1:]
|
||||||
|
return tag
|
||||||
|
|
||||||
|
def find_element(base, tag_name):
|
||||||
|
for element in base:
|
||||||
|
if remove_bullshit(element.tag) == tag_name:
|
||||||
|
return element
|
||||||
|
return None
|
||||||
|
|
||||||
|
root = defusedxml.ElementTree.fromstring(feed.decode('utf-8'))
|
||||||
|
assert remove_bullshit(root.tag) == 'feed'
|
||||||
|
for entry in root:
|
||||||
|
if (remove_bullshit(entry.tag) != 'entry'):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# it's yt:videoId in the xml but the yt: is turned into a namespace which is removed by remove_bullshit
|
||||||
|
video_id_element = find_element(entry, 'videoId')
|
||||||
|
time_published_element = find_element(entry, 'published')
|
||||||
|
assert video_id_element is not None
|
||||||
|
assert time_published_element is not None
|
||||||
|
|
||||||
|
time_published = int(calendar.timegm(time.strptime(time_published_element.text, '%Y-%m-%dT%H:%M:%S+00:00')))
|
||||||
|
times_published[video_id_element.text] = time_published
|
||||||
|
|
||||||
|
except (AssertionError, defusedxml.ElementTree.ParseError) as e:
|
||||||
|
print('Failed to read atoma feed for ' + channel_status_name)
|
||||||
|
traceback.print_exc()
|
||||||
|
|
||||||
|
|
||||||
|
channel_videos = channel.extract_info(json.loads(channel_tab), 'videos')['items']
|
||||||
for i, video_item in enumerate(channel_videos):
|
for i, video_item in enumerate(channel_videos):
|
||||||
if 'description' not in video_item:
|
if 'description' not in video_item:
|
||||||
video_item['description'] = ''
|
video_item['description'] = ''
|
||||||
try:
|
|
||||||
video_item['time_published'] = youtube_timestamp_to_posix(video_item['published']) - i # subtract a few seconds off the videos so they will be in the right order
|
if video_item['id'] in times_published:
|
||||||
except KeyError:
|
video_item['time_published'] = times_published[video_item['id']]
|
||||||
print(video_item)
|
video_item['is_time_published_exact'] = True
|
||||||
videos.append((channel_id, video_item['id'], video_item['title'], video_item['duration'], video_item['time_published'], video_item['description']))
|
else:
|
||||||
|
video_item['is_time_published_exact'] = False
|
||||||
|
try:
|
||||||
|
video_item['time_published'] = youtube_timestamp_to_posix(video_item['published']) - i # subtract a few seconds off the videos so they will be in the right order
|
||||||
|
except KeyError:
|
||||||
|
print(video_item)
|
||||||
|
videos.append((channel_id, video_item['id'], video_item['title'], video_item['duration'], video_item['time_published'], video_item['is_time_published_exact'], video_item['description']))
|
||||||
|
|
||||||
|
|
||||||
if len(videos) == 0:
|
if len(videos) == 0:
|
||||||
@ -430,8 +483,8 @@ def _get_upstream_videos(channel_id):
|
|||||||
index += 1
|
index += 1
|
||||||
number_of_new_videos = index
|
number_of_new_videos = index
|
||||||
|
|
||||||
cursor.executemany('''INSERT OR IGNORE INTO videos (sql_channel_id, video_id, title, duration, time_published, description)
|
cursor.executemany('''INSERT OR IGNORE INTO videos (sql_channel_id, video_id, title, duration, time_published, is_time_published_exact, description)
|
||||||
VALUES ((SELECT id FROM subscribed_channels WHERE yt_channel_id=?), ?, ?, ?, ?, ?)''', videos)
|
VALUES ((SELECT id FROM subscribed_channels WHERE yt_channel_id=?), ?, ?, ?, ?, ?, ?)''', videos)
|
||||||
cursor.execute('''UPDATE subscribed_channels
|
cursor.execute('''UPDATE subscribed_channels
|
||||||
SET time_last_checked = ?, next_check_time = ?
|
SET time_last_checked = ?, next_check_time = ?
|
||||||
WHERE yt_channel_id=?''', [int(time.time()), next_check_time, channel_id])
|
WHERE yt_channel_id=?''', [int(time.time()), next_check_time, channel_id])
|
||||||
|
Loading…
x
Reference in New Issue
Block a user