Extraction: Replace youtube-dl with custom-built watch page extraction
This commit is contained in:
148
youtube/watch.py
148
youtube/watch.py
@@ -5,49 +5,15 @@ import settings
|
||||
from flask import request
|
||||
import flask
|
||||
|
||||
from youtube_dl.YoutubeDL import YoutubeDL
|
||||
from youtube_dl.extractor.youtube import YoutubeError
|
||||
import json
|
||||
import html
|
||||
import gevent
|
||||
import os
|
||||
import math
|
||||
import traceback
|
||||
|
||||
|
||||
def get_related_items(info):
|
||||
results = []
|
||||
for item in info['related_vids']:
|
||||
if 'list' in item: # playlist:
|
||||
result = watch_page_related_playlist_info(item)
|
||||
else:
|
||||
result = watch_page_related_video_info(item)
|
||||
yt_data_extract.prefix_urls(result)
|
||||
yt_data_extract.add_extra_html_info(result)
|
||||
results.append(result)
|
||||
return results
|
||||
|
||||
|
||||
# json of related items retrieved directly from the watch page has different names for everything
|
||||
# converts these to standard names
|
||||
def watch_page_related_video_info(item):
|
||||
result = {key: item[key] for key in ('id', 'title', 'author')}
|
||||
result['duration'] = util.seconds_to_timestamp(item['length_seconds'])
|
||||
try:
|
||||
result['views'] = item['short_view_count_text']
|
||||
except KeyError:
|
||||
result['views'] = ''
|
||||
result['thumbnail'] = util.get_thumbnail_url(item['id'])
|
||||
result['type'] = 'video'
|
||||
return result
|
||||
|
||||
def watch_page_related_playlist_info(item):
|
||||
return {
|
||||
'size': item['playlist_length'] if item['playlist_length'] != "0" else "50+",
|
||||
'title': item['playlist_title'],
|
||||
'id': item['list'],
|
||||
'first_video_id': item['video_id'],
|
||||
'thumbnail': util.get_thumbnail_url(item['video_id']),
|
||||
'type': 'playlist',
|
||||
}
|
||||
|
||||
def get_video_sources(info):
|
||||
video_sources = []
|
||||
@@ -55,9 +21,10 @@ def get_video_sources(info):
|
||||
max_resolution = 360
|
||||
else:
|
||||
max_resolution = settings.default_resolution
|
||||
|
||||
for format in info['formats']:
|
||||
if format['acodec'] != 'none' and format['vcodec'] != 'none' and format['height'] <= max_resolution:
|
||||
if not all(attr in format for attr in ('height', 'width', 'ext', 'url')):
|
||||
continue
|
||||
if 'acodec' in format and 'vcodec' in format and format['height'] <= max_resolution:
|
||||
video_sources.append({
|
||||
'src': format['url'],
|
||||
'type': 'video/' + format['ext'],
|
||||
@@ -134,14 +101,57 @@ def get_ordered_music_list_attributes(music_list):
|
||||
|
||||
return ordered_attributes
|
||||
|
||||
headers = (
|
||||
('Accept', '*/*'),
|
||||
('Accept-Language', 'en-US,en;q=0.5'),
|
||||
('X-YouTube-Client-Name', '2'),
|
||||
('X-YouTube-Client-Version', '2.20180830'),
|
||||
) + util.mobile_ua
|
||||
|
||||
def extract_info(downloader, *args, **kwargs):
|
||||
def extract_info(video_id):
|
||||
polymer_json = util.fetch_url('https://m.youtube.com/watch?v=' + video_id + '&pbj=1', headers=headers, debug_name='watch')
|
||||
try:
|
||||
return downloader.extract_info(*args, **kwargs)
|
||||
except YoutubeError as e:
|
||||
return str(e)
|
||||
polymer_json = json.loads(polymer_json)
|
||||
except json.decoder.JSONDecodeError:
|
||||
traceback.print_exc()
|
||||
return {'error': 'Failed to parse json response'}
|
||||
return yt_data_extract.extract_watch_info(polymer_json)
|
||||
|
||||
def video_quality_string(format):
|
||||
if 'vcodec' in format:
|
||||
result =str(format.get('width', '?')) + 'x' + str(format.get('height', '?'))
|
||||
if 'fps' in format:
|
||||
result += ' ' + format['fps'] + 'fps'
|
||||
return result
|
||||
elif 'acodec' in format:
|
||||
return 'audio only'
|
||||
|
||||
return '?'
|
||||
|
||||
def audio_quality_string(format):
|
||||
if 'acodec' in format:
|
||||
result = str(format.get('abr', '?')) + 'k'
|
||||
if 'audio_sample_rate' in format:
|
||||
result += ' ' + str(format['audio_sample_rate']) + ' Hz'
|
||||
return result
|
||||
elif 'vcodec' in format:
|
||||
return 'video only'
|
||||
|
||||
return '?'
|
||||
|
||||
# from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py
|
||||
def format_bytes(bytes):
|
||||
if bytes is None:
|
||||
return 'N/A'
|
||||
if type(bytes) is str:
|
||||
bytes = float(bytes)
|
||||
if bytes == 0.0:
|
||||
exponent = 0
|
||||
else:
|
||||
exponent = int(math.log(bytes, 1024.0))
|
||||
suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
|
||||
converted = float(bytes) / float(1024 ** exponent)
|
||||
return '%.2f%s' % (converted, suffix)
|
||||
|
||||
|
||||
@yt_app.route('/watch')
|
||||
@@ -152,38 +162,26 @@ def get_watch_page():
|
||||
flask.abort(flask.Response('Incomplete video id (too short): ' + video_id))
|
||||
|
||||
lc = request.args.get('lc', '')
|
||||
if settings.route_tor:
|
||||
proxy = 'socks5://127.0.0.1:9150/'
|
||||
else:
|
||||
proxy = ''
|
||||
yt_dl_downloader = YoutubeDL(params={'youtube_include_dash_manifest':False, 'proxy':proxy})
|
||||
tasks = (
|
||||
gevent.spawn(comments.video_comments, video_id, int(settings.default_comment_sorting), lc=lc ),
|
||||
gevent.spawn(extract_info, yt_dl_downloader, "https://www.youtube.com/watch?v=" + video_id, download=False)
|
||||
gevent.spawn(extract_info, video_id)
|
||||
)
|
||||
gevent.joinall(tasks)
|
||||
comments_info, info = tasks[0].value, tasks[1].value
|
||||
|
||||
if isinstance(info, str): # youtube error
|
||||
return flask.render_template('error.html', error_message = info)
|
||||
if info['error']:
|
||||
return flask.render_template('error.html', error_message = info['error'])
|
||||
|
||||
video_info = {
|
||||
"duration": util.seconds_to_timestamp(info["duration"]),
|
||||
"duration": util.seconds_to_timestamp(info["duration"] or 0),
|
||||
"id": info['id'],
|
||||
"title": info['title'],
|
||||
"author": info['uploader'],
|
||||
"author": info['author'],
|
||||
}
|
||||
|
||||
upload_year = info["upload_date"][0:4]
|
||||
upload_month = info["upload_date"][4:6]
|
||||
upload_day = info["upload_date"][6:8]
|
||||
upload_date = upload_month + "/" + upload_day + "/" + upload_year
|
||||
|
||||
if settings.related_videos_mode:
|
||||
related_videos = get_related_items(info)
|
||||
else:
|
||||
related_videos = []
|
||||
|
||||
for item in info['related_videos']:
|
||||
yt_data_extract.prefix_urls(item)
|
||||
yt_data_extract.add_extra_html_info(item)
|
||||
|
||||
if settings.gather_googlevideo_domains:
|
||||
with open(os.path.join(settings.data_dir, 'googlevideo-domains.txt'), 'a+', encoding='utf-8') as f:
|
||||
@@ -195,23 +193,29 @@ def get_watch_page():
|
||||
download_formats = []
|
||||
|
||||
for format in info['formats']:
|
||||
if 'acodec' in format and 'vcodec' in format:
|
||||
codecs_string = format['acodec'] + ', ' + format['vcodec']
|
||||
else:
|
||||
codecs_string = format.get('acodec') or format.get('vcodec') or '?'
|
||||
download_formats.append({
|
||||
'url': format['url'],
|
||||
'ext': format['ext'],
|
||||
'resolution': yt_dl_downloader.format_resolution(format),
|
||||
'note': yt_dl_downloader._format_note(format),
|
||||
'ext': format.get('ext', '?'),
|
||||
'audio_quality': audio_quality_string(format),
|
||||
'video_quality': video_quality_string(format),
|
||||
'file_size': format_bytes(format['file_size']),
|
||||
'codecs': codecs_string,
|
||||
})
|
||||
|
||||
video_sources = get_video_sources(info)
|
||||
video_height = video_sources[0]['height']
|
||||
|
||||
video_height = yt_data_extract.default_multi_get(video_sources, 0, 'height', default=360)
|
||||
video_width = yt_data_extract.default_multi_get(video_sources, 0, 'width', default=640)
|
||||
# 1 second per pixel, or the actual video width
|
||||
theater_video_target_width = max(640, info['duration'], video_sources[0]['width'])
|
||||
theater_video_target_width = max(640, info['duration'] or 0, video_width)
|
||||
|
||||
return flask.render_template('watch.html',
|
||||
header_playlist_names = local_playlist.get_playlist_names(),
|
||||
uploader_channel_url = '/' + info['uploader_url'],
|
||||
upload_date = upload_date,
|
||||
uploader_channel_url = ('/' + info['author_url']) if info['author_url'] else '',
|
||||
upload_date = info['published_date'],
|
||||
views = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("view_count", None)),
|
||||
likes = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("like_count", None)),
|
||||
dislikes = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("dislike_count", None)),
|
||||
@@ -219,7 +223,7 @@ def get_watch_page():
|
||||
video_info = json.dumps(video_info),
|
||||
video_sources = video_sources,
|
||||
subtitle_sources = get_subtitle_sources(info),
|
||||
related = related_videos,
|
||||
related = info['related_videos'],
|
||||
music_list = info['music_list'],
|
||||
music_attributes = get_ordered_music_list_attributes(info['music_list']),
|
||||
comments_info = comments_info,
|
||||
@@ -232,7 +236,7 @@ def get_watch_page():
|
||||
theater_video_target_width = theater_video_target_width,
|
||||
|
||||
title = info['title'],
|
||||
uploader = info['uploader'],
|
||||
uploader = info['author'],
|
||||
description = info['description'],
|
||||
unlisted = info['unlisted'],
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user