Merge branch 'modular-data-extract'

Commits in this branch are prefixed with "Extraction:"
This branch refactors data extraction. All such functionality has been moved to the yt_data_extract module.
Responses from requests are handed to the module, which parses them into a consistent, more useful format (a minimal usage sketch follows the list below).
The dependency on youtube-dl has also been dropped, and its functionality rebuilt from scratch, for these reasons:
(1) I've noticed that youtube-dl breaks more often than Invidious (whose watch-page extraction is likewise built from scratch) in response to changes from YouTube, so I'm hoping what I wrote will also be less brittle.
(2) Such breakage is inconvenient because I have to merge the fixes manually: I had to modify youtube-dl to do things such as extracting related videos, so upstream fixes don't apply cleanly.
(3) With youtube-dl I have no control over error handling and request pooling, since it performs all the requests itself; changing that would require intrusive modifications I don't want to maintain.
(4) I will finally be able to display the number of comments, and whether comments are disabled, without making additional requests.
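As a rough illustration of the new calling pattern (a minimal sketch: channel_tab is assumed to already hold the JSON text of a fetched channel page, and the import path is assumed; extract_channel_info and its 'error'/'items' keys are taken from the diff below):

    import json
    from youtube import yt_data_extract  # import path assumed

    # The caller performs the request itself and hands the raw response text
    # to the module; errors come back as a plain dict field, not an exception.
    channel_info = yt_data_extract.extract_channel_info(json.loads(channel_tab), 'videos')
    if channel_info['error']:
        print('Error checking channel: ' + channel_info['error'])
    else:
        for video in channel_info['items']:
            print(video['id'])  # each item is a plain dict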
James Taylor
2019-12-19 21:33:54 -08:00
61 changed files with 1753 additions and 32293 deletions


@@ -172,7 +172,7 @@ def _get_videos(cursor, number_per_page, offset, tag = None):
             'id': db_video[0],
             'title': db_video[1],
             'duration': db_video[2],
-            'published': exact_timestamp(db_video[3]) if db_video[4] else posix_to_dumbed_down(db_video[3]),
+            'time_published': exact_timestamp(db_video[3]) if db_video[4] else posix_to_dumbed_down(db_video[3]),
             'author': db_video[5],
         })
@@ -455,10 +455,17 @@ def _get_upstream_videos(channel_id):
         print('Failed to read atoma feed for ' + channel_status_name)
         traceback.print_exc()
 
-    videos = channel.extract_info(json.loads(channel_tab), 'videos')['items']
+    channel_info = yt_data_extract.extract_channel_info(json.loads(channel_tab), 'videos')
+    if channel_info['error']:
+        print('Error checking channel ' + channel_status_name + ': ' + channel_info['error'])
+        return
+
+    videos = channel_info['items']
     for i, video_item in enumerate(videos):
-        if 'description' not in video_item:
+        if not video_item.get('description'):
             video_item['description'] = ''
+        else:
+            video_item['description'] = ''.join(run.get('text', '') for run in video_item['description'])
 
         if video_item['id'] in times_published:
             video_item['time_published'] = times_published[video_item['id']]
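A note on the description handling above: the extractor returns descriptions as a list of YouTube text "runs", which the join flattens into plain text. A minimal sketch of that shape (the example values are invented, and any keys besides 'text' are assumptions):

    # Assumed shape: a description arrives as a list of "run" dicts, each
    # carrying a text fragment (links and formatting come as separate runs).
    description = [
        {'text': 'New video out now: '},
        {'text': 'https://example.com'},  # e.g. a link run
    ]
    flattened = ''.join(run.get('text', '') for run in description)
    assert flattened == 'New video out now: https://example.com'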
@@ -466,7 +473,7 @@ def _get_upstream_videos(channel_id):
         else:
             video_item['is_time_published_exact'] = False
             try:
-                video_item['time_published'] = youtube_timestamp_to_posix(video_item['published']) - i # subtract a few seconds off the videos so they will be in the right order
+                video_item['time_published'] = youtube_timestamp_to_posix(video_item['time_published']) - i # subtract a few seconds off the videos so they will be in the right order
             except KeyError:
                 print(video_item)
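Why the '- i' in the hunk above: coarse timestamps such as '1 week ago' map several videos to the same POSIX value, so subtracting the feed index breaks the ties while preserving feed order. A worked micro-example (the POSIX value is hypothetical):

    # Three consecutive feed entries that all say '1 week ago' would collapse
    # to one timestamp; subtracting the index keeps them sortable in feed order.
    base = 1576800000  # hypothetical POSIX time for '1 week ago'
    times = [base - i for i in range(3)]  # [1576800000, 1576799999, 1576799998]
    newest_first = sorted(range(3), key=lambda i: times[i], reverse=True)
    assert newest_first == [0, 1, 2]  # original feed order preserved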
@@ -759,7 +766,7 @@ def get_subscriptions_page():
         video['thumbnail'] = util.URL_ORIGIN + '/data/subscription_thumbnails/' + video['id'] + '.jpg'
         video['type'] = 'video'
         video['item_size'] = 'small'
-        yt_data_extract.add_extra_html_info(video)
+        util.add_extra_html_info(video)
 
     tags = _get_all_tags(cursor)