Extraction: Replace youtube-dl with custom-built watch page extraction

2019-10-17 19:58:13 -07:00
parent 9abb83fdbc
commit 4c07546e7a
45 changed files with 518 additions and 31348 deletions
--- a/youtube/watch.py
+++ b/youtube/watch.py
@@ -5,49 +5,15 @@ import settings
 from flask import request
 import flask

-from youtube_dl.YoutubeDL import YoutubeDL
-from youtube_dl.extractor.youtube import YoutubeError
 import json
 import html
 import gevent
 import os
+import math
+import traceback


-def get_related_items(info):
-    results = []
-    for item in info['related_vids']:
-        if 'list' in item:  # playlist:
-            result = watch_page_related_playlist_info(item)
-        else:
-            result = watch_page_related_video_info(item)
-        yt_data_extract.prefix_urls(result)
-        yt_data_extract.add_extra_html_info(result)
-        results.append(result)
-    return results

-    
-# json of related items retrieved directly from the watch page has different names for everything
-# converts these to standard names
-def watch_page_related_video_info(item):
-    result = {key: item[key] for key in ('id', 'title', 'author')}
-    result['duration'] = util.seconds_to_timestamp(item['length_seconds'])
-    try:
-        result['views'] = item['short_view_count_text']
-    except KeyError:
-        result['views'] = ''
-    result['thumbnail'] = util.get_thumbnail_url(item['id'])
-    result['type'] = 'video'
-    return result
-    
-def watch_page_related_playlist_info(item):
-    return {
-        'size': item['playlist_length'] if item['playlist_length'] != "0" else "50+",
-        'title': item['playlist_title'],
-        'id': item['list'],
-        'first_video_id': item['video_id'],
-        'thumbnail': util.get_thumbnail_url(item['video_id']),
-        'type': 'playlist',
-    }

 def get_video_sources(info):
    video_sources = []
@@ -55,9 +21,10 @@ def get_video_sources(info):
        max_resolution = 360
    else:
        max_resolution = settings.default_resolution
-
    for format in info['formats']:
-        if format['acodec'] != 'none' and format['vcodec'] != 'none' and format['height'] <= max_resolution:
+        if not all(attr in format for attr in ('height', 'width', 'ext', 'url')):
+            continue
+        if 'acodec' in format and 'vcodec' in format and format['height'] <= max_resolution:
            video_sources.append({
                'src': format['url'],
                'type': 'video/' + format['ext'],
@@ -134,14 +101,57 @@ def get_ordered_music_list_attributes(music_list):

    return ordered_attributes

+headers = (
+    ('Accept', '*/*'),
+    ('Accept-Language', 'en-US,en;q=0.5'),
+    ('X-YouTube-Client-Name', '2'),
+    ('X-YouTube-Client-Version', '2.20180830'),
+) + util.mobile_ua

-def extract_info(downloader, *args, **kwargs):
+def extract_info(video_id):
+    polymer_json = util.fetch_url('https://m.youtube.com/watch?v=' + video_id + '&pbj=1', headers=headers, debug_name='watch')
    try:
-        return downloader.extract_info(*args, **kwargs)
-    except YoutubeError as e:
-        return str(e)
+        polymer_json = json.loads(polymer_json)
+    except json.decoder.JSONDecodeError:
+        traceback.print_exc()
+        return {'error': 'Failed to parse json response'}
+    return yt_data_extract.extract_watch_info(polymer_json)

+def video_quality_string(format):
+    if 'vcodec' in format:
+        result =str(format.get('width', '?')) + 'x' + str(format.get('height', '?'))
+        if 'fps' in format:
+            result += ' ' + format['fps'] + 'fps'
+        return result
+    elif 'acodec' in format:
+        return 'audio only'

+    return '?'
+
+def audio_quality_string(format):
+    if 'acodec' in format:
+        result = str(format.get('abr', '?')) + 'k'
+        if 'audio_sample_rate' in format:
+            result += ' ' + str(format['audio_sample_rate']) + ' Hz'
+        return result
+    elif 'vcodec' in format:
+        return 'video only'
+
+    return '?'
+
+# from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py
+def format_bytes(bytes):
+    if bytes is None:
+        return 'N/A'
+    if type(bytes) is str:
+        bytes = float(bytes)
+    if bytes == 0.0:
+        exponent = 0
+    else:
+        exponent = int(math.log(bytes, 1024.0))
+    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
+    converted = float(bytes) / float(1024 ** exponent)
+    return '%.2f%s' % (converted, suffix)


@yt_app.route('/watch')
@@ -152,38 +162,26 @@ def get_watch_page():
        flask.abort(flask.Response('Incomplete video id (too short): ' + video_id))

    lc = request.args.get('lc', '')
-    if settings.route_tor:
-        proxy = 'socks5://127.0.0.1:9150/'
-    else:
-        proxy = ''
-    yt_dl_downloader = YoutubeDL(params={'youtube_include_dash_manifest':False, 'proxy':proxy})
    tasks = (
        gevent.spawn(comments.video_comments, video_id, int(settings.default_comment_sorting), lc=lc ),
-        gevent.spawn(extract_info, yt_dl_downloader, "https://www.youtube.com/watch?v=" + video_id, download=False)
+        gevent.spawn(extract_info, video_id)
    )
    gevent.joinall(tasks)
    comments_info, info = tasks[0].value, tasks[1].value

-    if isinstance(info, str): # youtube error
-        return flask.render_template('error.html', error_message = info)
+    if info['error']:
+        return flask.render_template('error.html', error_message = info['error'])

    video_info = {
-        "duration": util.seconds_to_timestamp(info["duration"]),
+        "duration": util.seconds_to_timestamp(info["duration"] or 0),
        "id":       info['id'],
        "title":    info['title'],
-        "author":   info['uploader'],
+        "author":   info['author'],
    }

-    upload_year = info["upload_date"][0:4]
-    upload_month = info["upload_date"][4:6]
-    upload_day = info["upload_date"][6:8]
-    upload_date = upload_month + "/" + upload_day + "/" + upload_year
-    
-    if settings.related_videos_mode:
-        related_videos = get_related_items(info)
-    else:
-        related_videos = []
-
+    for item in info['related_videos']:
+        yt_data_extract.prefix_urls(item)
+        yt_data_extract.add_extra_html_info(item)

    if settings.gather_googlevideo_domains:
        with open(os.path.join(settings.data_dir, 'googlevideo-domains.txt'), 'a+', encoding='utf-8') as f:
@@ -195,23 +193,29 @@ def get_watch_page():
    download_formats = []

    for format in info['formats']:
+        if 'acodec' in format and 'vcodec' in format:
+            codecs_string = format['acodec'] + ', ' + format['vcodec']
+        else:
+            codecs_string = format.get('acodec') or format.get('vcodec') or '?'
        download_formats.append({
            'url': format['url'],
-            'ext': format['ext'],
-            'resolution': yt_dl_downloader.format_resolution(format),
-            'note': yt_dl_downloader._format_note(format),
+            'ext': format.get('ext', '?'),
+            'audio_quality': audio_quality_string(format),
+            'video_quality': video_quality_string(format),
+            'file_size': format_bytes(format['file_size']),
+            'codecs': codecs_string,
        })

    video_sources = get_video_sources(info)
-    video_height = video_sources[0]['height']
-
+    video_height = yt_data_extract.default_multi_get(video_sources, 0, 'height', default=360)
+    video_width = yt_data_extract.default_multi_get(video_sources, 0, 'width', default=640)
    # 1 second per pixel, or the actual video width
-    theater_video_target_width = max(640, info['duration'], video_sources[0]['width'])
+    theater_video_target_width = max(640, info['duration'] or 0, video_width)

    return flask.render_template('watch.html',
        header_playlist_names   = local_playlist.get_playlist_names(),
-        uploader_channel_url    = '/' + info['uploader_url'],
-        upload_date             = upload_date,
+        uploader_channel_url    = ('/' + info['author_url']) if info['author_url'] else '',
+        upload_date             = info['published_date'],
        views           = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("view_count", None)),
        likes           = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("like_count", None)),
        dislikes        = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("dislike_count", None)),
@@ -219,7 +223,7 @@ def get_watch_page():
        video_info              = json.dumps(video_info),
        video_sources           = video_sources,
        subtitle_sources        = get_subtitle_sources(info),
-        related                 = related_videos,
+        related                 = info['related_videos'],
        music_list              = info['music_list'],
        music_attributes        = get_ordered_music_list_attributes(info['music_list']),
        comments_info           = comments_info,
@@ -232,7 +236,7 @@ def get_watch_page():
        theater_video_target_width = theater_video_target_width,

        title       = info['title'],
-        uploader    = info['uploader'],
+        uploader    = info['author'],
        description = info['description'],
        unlisted    = info['unlisted'],
    )