Merge branch 'modular-data-extract'

Commits in this branch are prefixed with "Extraction:"
This branch refactors data extraction. All such functionality has been moved to the yt_data_extract module.
Responses from requests are handed to the module, which parses them into a consistent, more useful format (a minimal usage sketch follows the list below).
The dependency on youtube-dl has also been dropped, and its functionality rebuilt from scratch, for these reasons:
(1) I've noticed that youtube-dl breaks more often than Invidious (whose watch-page extraction is likewise built from scratch) in response to changes from YouTube, so I'm hoping what I wrote will also be less brittle.
(2) Such breakage is inconvenient because I have to merge the fixes manually: I had to modify youtube-dl to do things such as extracting related videos, so upstream fixes don't apply cleanly.
(3) With youtube-dl I have no control over error handling and request pooling, since it performs all the requests itself; changing that would require intrusive modifications I don't want to maintain.
(4) I will finally be able to display the number of comments, and whether comments are disabled, without making additional requests.
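As a rough illustration of the new calling pattern (a minimal sketch: channel_tab is assumed to already hold the JSON text of a fetched channel page, and the import path is assumed; extract_channel_info and its 'error'/'items' keys are taken from the diff below):

    import json
    from youtube import yt_data_extract  # import path assumed

    # The caller performs the request itself and hands the raw response text
    # to the module; errors come back as a plain dict field, not an exception.
    channel_info = yt_data_extract.extract_channel_info(json.loads(channel_tab), 'videos')
    if channel_info['error']:
        print('Error checking channel: ' + channel_info['error'])
    else:
        for video in channel_info['items']:
            print(video['id'])  # each item is a plain dict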
James Taylor
2019-12-19 21:33:54 -08:00
61 changed files with 1753 additions and 32293 deletions


@@ -172,7 +172,7 @@ def _get_videos(cursor, number_per_page, offset, tag = None):
             'id': db_video[0],
             'title': db_video[1],
             'duration': db_video[2],
-            'published': exact_timestamp(db_video[3]) if db_video[4] else posix_to_dumbed_down(db_video[3]),
+            'time_published': exact_timestamp(db_video[3]) if db_video[4] else posix_to_dumbed_down(db_video[3]),
             'author': db_video[5],
         })
@@ -455,10 +455,17 @@ def _get_upstream_videos(channel_id):
         print('Failed to read atoma feed for ' + channel_status_name)
         traceback.print_exc()
 
-    videos = channel.extract_info(json.loads(channel_tab), 'videos')['items']
+    channel_info = yt_data_extract.extract_channel_info(json.loads(channel_tab), 'videos')
+    if channel_info['error']:
+        print('Error checking channel ' + channel_status_name + ': ' + channel_info['error'])
+        return
+
+    videos = channel_info['items']
     for i, video_item in enumerate(videos):
-        if 'description' not in video_item:
+        if not video_item.get('description'):
             video_item['description'] = ''
+        else:
+            video_item['description'] = ''.join(run.get('text', '') for run in video_item['description'])
 
         if video_item['id'] in times_published:
             video_item['time_published'] = times_published[video_item['id']]
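A note on the description handling above: the extractor returns descriptions as a list of YouTube text "runs", which the join flattens into plain text. A minimal sketch of that shape (the example values are invented, and any keys besides 'text' are assumptions):

    # Assumed shape: a description arrives as a list of "run" dicts, each
    # carrying a text fragment (links and formatting come as separate runs).
    description = [
        {'text': 'New video out now: '},
        {'text': 'https://example.com'},  # e.g. a link run
    ]
    flattened = ''.join(run.get('text', '') for run in description)
    assert flattened == 'New video out now: https://example.com'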
@@ -466,7 +473,7 @@ def _get_upstream_videos(channel_id):
         else:
             video_item['is_time_published_exact'] = False
             try:
-                video_item['time_published'] = youtube_timestamp_to_posix(video_item['published']) - i # subtract a few seconds off the videos so they will be in the right order
+                video_item['time_published'] = youtube_timestamp_to_posix(video_item['time_published']) - i # subtract a few seconds off the videos so they will be in the right order
             except KeyError:
                 print(video_item)
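Why the '- i' in the hunk above: coarse timestamps such as '1 week ago' map several videos to the same POSIX value, so subtracting the feed index breaks the ties while preserving feed order. A worked micro-example (the POSIX value is hypothetical):

    # Three consecutive feed entries that all say '1 week ago' would collapse
    # to one timestamp; subtracting the index keeps them sortable in feed order.
    base = 1576800000  # hypothetical POSIX time for '1 week ago'
    times = [base - i for i in range(3)]  # [1576800000, 1576799999, 1576799998]
    newest_first = sorted(range(3), key=lambda i: times[i], reverse=True)
    assert newest_first == [0, 1, 2]  # original feed order preserved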
@@ -759,7 +766,7 @@ def get_subscriptions_page():
         video['thumbnail'] = util.URL_ORIGIN + '/data/subscription_thumbnails/' + video['id'] + '.jpg'
         video['type'] = 'video'
         video['item_size'] = 'small'
-        yt_data_extract.add_extra_html_info(video)
+        util.add_extra_html_info(video)
 
     tags = _get_all_tags(cursor)