
Adds support for AV1-encoded videos, which includes any videos above 1080p. These weren't getting included because they did not have a quality entry in the format table at the top of watch_extraction.py. So get the quality from the quality labels of the format if it's not there. Because YouTube often includes BOTH AV1 and H.264 (AVC) for each quality, after these are included, there will be way too many quality options and the code needs to choose which one to use. The choice is somewhat hard: AV1 is encoded in fewer bytes than H.264 and is patent-free, however, it has less hardware support, so might be more difficult to play. For instance, on my system, AV1 does not work on 1080p, but H.264 does. Adds a setting about which to prefer, set to H.264 as the default. Also adds support for the lower quality mp4 audio quality, which now gets used at 144p to save network bandwidth. For similar reasons, this was not getting included because it did not have an audio_bitrate entry in the table. Prefer bitrate instead for the quality. Signed-off-by: Jesús <heckyel@hyperbola.info>
855 lines
40 KiB
Python
855 lines
40 KiB
Python
from .common import (get, multi_get, deep_get, multi_deep_get,
|
|
liberal_update, conservative_update, remove_redirect, normalize_url,
|
|
extract_str, extract_formatted_text, extract_int, extract_approx_int,
|
|
extract_date, check_missing_keys, extract_item_info, extract_items,
|
|
extract_response, concat_or_none, liberal_dict_update,
|
|
conservative_dict_update)
|
|
|
|
import json
|
|
import urllib.parse
|
|
import traceback
|
|
import re
|
|
|
|
# from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py
|
|
_formats = {
|
|
'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'audio_bitrate': 64, 'vcodec': 'h263'},
|
|
'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'audio_bitrate': 64, 'vcodec': 'h263'},
|
|
'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
|
|
'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'audio_bitrate': 24, 'vcodec': 'mp4v'},
|
|
'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'audio_bitrate': 96, 'vcodec': 'h264'},
|
|
'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
|
|
'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
|
'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
|
# itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), audio_bitrate varies as well
|
|
'36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
|
|
'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
|
|
'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
|
|
'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'},
|
|
'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'},
|
|
'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},
|
|
'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},
|
|
'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
|
'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
|
|
|
|
|
# 3D videos
|
|
'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
|
'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
|
'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
|
|
'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
|
|
'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'},
|
|
'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},
|
|
'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},
|
|
|
|
# Apple HTTP Live Streaming
|
|
'91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'},
|
|
'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'},
|
|
'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
|
'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
|
|
'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 256, 'vcodec': 'h264'},
|
|
'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 256, 'vcodec': 'h264'},
|
|
'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'},
|
|
'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 24, 'vcodec': 'h264'},
|
|
|
|
# DASH mp4 video
|
|
'133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
|
'134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
|
'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
|
'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
|
'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
|
'138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
|
|
'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
|
'212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
|
'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
|
'298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
|
|
'299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
|
|
'266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
|
|
|
|
# Dash mp4 audio
|
|
'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 48, 'container': 'm4a_dash'},
|
|
'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 128, 'container': 'm4a_dash'},
|
|
'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 256, 'container': 'm4a_dash'},
|
|
'256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
|
|
'258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
|
|
'325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
|
|
'328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
|
|
|
|
# Dash webm
|
|
'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
|
|
'168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
|
|
'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
|
|
'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
|
|
'218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
|
|
'219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
|
|
'278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
|
|
'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
|
'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
|
'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
|
'245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
|
'246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
|
'247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
|
'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
|
'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
|
# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
|
|
'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
|
'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
|
|
'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
|
|
'308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
|
|
'313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
|
|
'315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
|
|
|
|
# Dash webm audio
|
|
'171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'audio_bitrate': 128},
|
|
'172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'audio_bitrate': 256},
|
|
|
|
# Dash webm audio with opus inside
|
|
'249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 50},
|
|
'250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 70},
|
|
'251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 160},
|
|
|
|
# RTMP (unnamed)
|
|
'_rtmp': {'protocol': 'rtmp'},
|
|
|
|
# av01 video only formats sometimes served with "unknown" codecs
|
|
'394': {'vcodec': 'av01.0.05M.08'},
|
|
'395': {'vcodec': 'av01.0.05M.08'},
|
|
'396': {'vcodec': 'av01.0.05M.08'},
|
|
'397': {'vcodec': 'av01.0.05M.08'},
|
|
}
|
|
|
|
|
|
def _extract_from_video_information_renderer(renderer_content):
|
|
subtitle = extract_str(renderer_content.get('expandedSubtitle'),
|
|
default='')
|
|
info = {
|
|
'title': extract_str(renderer_content.get('title')),
|
|
'view_count': extract_int(subtitle),
|
|
'unlisted': False,
|
|
'live': 'watching' in subtitle,
|
|
}
|
|
for badge in renderer_content.get('badges', []):
|
|
if deep_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted':
|
|
info['unlisted'] = True
|
|
return info
|
|
|
|
def _extract_likes_dislikes(renderer_content):
|
|
info = {
|
|
'like_count': None,
|
|
'dislike_count': None,
|
|
}
|
|
for button in renderer_content.get('buttons', ()):
|
|
button_renderer = button.get('slimMetadataToggleButtonRenderer', {})
|
|
|
|
# all the digits can be found in the accessibility data
|
|
count = extract_int(deep_get(
|
|
button_renderer,
|
|
'button', 'toggleButtonRenderer', 'defaultText',
|
|
'accessibility', 'accessibilityData', 'label'))
|
|
|
|
# this count doesn't have all the digits, it's like 53K for instance
|
|
dumb_count = extract_int(extract_str(deep_get(
|
|
button_renderer, 'button', 'toggleButtonRenderer', 'defaultText')))
|
|
|
|
# The accessibility text will be "No likes" or "No dislikes" or
|
|
# something like that, but dumb count will be 0
|
|
if dumb_count == 0:
|
|
count = 0
|
|
|
|
if 'isLike' in button_renderer:
|
|
info['like_count'] = count
|
|
elif 'isDislike' in button_renderer:
|
|
info['dislike_count'] = count
|
|
return info
|
|
|
|
def _extract_from_owner_renderer(renderer_content):
|
|
return {
|
|
'author': extract_str(renderer_content.get('title')),
|
|
'author_id': deep_get(
|
|
renderer_content,
|
|
'navigationEndpoint', 'browseEndpoint', 'browseId'),
|
|
}
|
|
|
|
def _extract_from_video_header_renderer(renderer_content):
|
|
return {
|
|
'title': extract_str(renderer_content.get('title')),
|
|
'time_published': extract_date(extract_str(
|
|
renderer_content.get('publishDate'))),
|
|
}
|
|
|
|
def _extract_from_description_renderer(renderer_content):
|
|
return {
|
|
'description': extract_str(
|
|
renderer_content.get('descriptionBodyText'), recover_urls=True),
|
|
}
|
|
|
|
def _extract_metadata_row_info(renderer_content):
|
|
# extract category and music list
|
|
info = {
|
|
'category': None,
|
|
'music_list': [],
|
|
}
|
|
|
|
current_song = {}
|
|
for row in deep_get(renderer_content, 'rows', default=[]):
|
|
row_title = extract_str(deep_get(row, 'metadataRowRenderer', 'title'), default='')
|
|
row_content = extract_str(deep_get(row, 'metadataRowRenderer', 'contents', 0))
|
|
if row_title == 'Category':
|
|
info['category'] = row_content
|
|
elif row_title in ('Song', 'Music'):
|
|
if current_song:
|
|
info['music_list'].append(current_song)
|
|
current_song = {'title': row_content}
|
|
elif row_title == 'Artist':
|
|
current_song['artist'] = row_content
|
|
elif row_title == 'Album':
|
|
current_song['album'] = row_content
|
|
elif row_title == 'Writers':
|
|
current_song['writers'] = row_content
|
|
elif row_title.startswith('Licensed'):
|
|
current_song['licensor'] = row_content
|
|
if current_song:
|
|
info['music_list'].append(current_song)
|
|
|
|
return info
|
|
|
|
def _extract_from_video_metadata(renderer_content):
|
|
info = _extract_from_video_information_renderer(renderer_content)
|
|
liberal_dict_update(info, _extract_likes_dislikes(renderer_content))
|
|
liberal_dict_update(info, _extract_from_owner_renderer(renderer_content))
|
|
liberal_dict_update(info, _extract_metadata_row_info(deep_get(
|
|
renderer_content, 'metadataRowContainer',
|
|
'metadataRowContainerRenderer', default={}
|
|
)))
|
|
liberal_update(info, 'title', extract_str(renderer_content.get('title')))
|
|
liberal_update(
|
|
info, 'description',
|
|
extract_str(renderer_content.get('description'), recover_urls=True)
|
|
)
|
|
liberal_update(info, 'time_published',
|
|
extract_date(renderer_content.get('dateText')))
|
|
return info
|
|
|
|
visible_extraction_dispatch = {
|
|
# Either these ones spread around in various places
|
|
'slimVideoInformationRenderer': _extract_from_video_information_renderer,
|
|
'slimVideoActionBarRenderer': _extract_likes_dislikes,
|
|
'slimOwnerRenderer': _extract_from_owner_renderer,
|
|
'videoDescriptionHeaderRenderer': _extract_from_video_header_renderer,
|
|
'expandableVideoDescriptionRenderer': _extract_from_description_renderer,
|
|
'metadataRowContainerRenderer': _extract_metadata_row_info,
|
|
# OR just this one, which contains SOME of the above inside it
|
|
'slimVideoMetadataRenderer': _extract_from_video_metadata,
|
|
}
|
|
|
|
def _extract_watch_info_mobile(top_level):
|
|
'''Scrapes information from the visible page'''
|
|
info = {}
|
|
response = top_level.get('response', {})
|
|
|
|
# this renderer has the stuff visible on the page
|
|
# check for playlist
|
|
items, _ = extract_items(response,
|
|
item_types={'singleColumnWatchNextResults'})
|
|
if items:
|
|
watch_next_results = items[0]['singleColumnWatchNextResults']
|
|
playlist = deep_get(watch_next_results, 'playlist', 'playlist')
|
|
if playlist is None:
|
|
info['playlist'] = None
|
|
else:
|
|
info['playlist'] = {}
|
|
info['playlist']['title'] = playlist.get('title')
|
|
info['playlist']['author'] = extract_str(multi_get(playlist,
|
|
'ownerName', 'longBylineText', 'shortBylineText', 'ownerText'))
|
|
author_id = deep_get(playlist, 'longBylineText', 'runs', 0,
|
|
'navigationEndpoint', 'browseEndpoint', 'browseId')
|
|
info['playlist']['author_id'] = author_id
|
|
info['playlist']['author_url'] = concat_or_none(
|
|
'https://www.youtube.com/channel/', author_id)
|
|
info['playlist']['id'] = playlist.get('playlistId')
|
|
info['playlist']['url'] = concat_or_none(
|
|
'https://www.youtube.com/playlist?list=',
|
|
info['playlist']['id'])
|
|
info['playlist']['video_count'] = playlist.get('totalVideos')
|
|
info['playlist']['current_index'] = playlist.get('currentIndex')
|
|
info['playlist']['items'] = [
|
|
extract_item_info(i) for i in playlist.get('contents', ())]
|
|
else:
|
|
info['playlist'] = None
|
|
|
|
# use dispatch table to get information scattered in various renderers
|
|
items, _ = extract_items(
|
|
response,
|
|
item_types=visible_extraction_dispatch.keys(),
|
|
search_engagement_panels=True
|
|
)
|
|
found = set()
|
|
for renderer in items:
|
|
name, renderer_content = list(renderer.items())[0]
|
|
found.add(name)
|
|
liberal_dict_update(
|
|
info,
|
|
visible_extraction_dispatch[name](renderer_content)
|
|
)
|
|
# Call the function on blank dict for any that weren't found
|
|
# so that the empty keys get added
|
|
for name in visible_extraction_dispatch.keys() - found:
|
|
liberal_dict_update(info, visible_extraction_dispatch[name]({}))
|
|
|
|
# comment section info
|
|
items, _ = extract_items(response, item_types={
|
|
'commentSectionRenderer', 'commentsEntryPointHeaderRenderer'})
|
|
if items:
|
|
header_type = list(items[0])[0]
|
|
comment_info = items[0][header_type]
|
|
# This seems to be some kind of A/B test being done on mobile, where
|
|
# this is present instead of the normal commentSectionRenderer. It can
|
|
# be seen here:
|
|
# https://www.androidpolice.com/2019/10/31/google-youtube-app-comment-section-below-videos/
|
|
# https://www.youtube.com/watch?v=bR5Q-wD-6qo
|
|
if header_type == 'commentsEntryPointHeaderRenderer':
|
|
comment_count_text = extract_str(comment_info.get('headerText'))
|
|
else:
|
|
comment_count_text = extract_str(deep_get(comment_info,
|
|
'header', 'commentSectionHeaderRenderer', 'countText'))
|
|
if comment_count_text == 'Comments': # just this with no number, means 0 comments
|
|
info['comment_count'] = 0
|
|
else:
|
|
info['comment_count'] = extract_int(comment_count_text)
|
|
info['comments_disabled'] = False
|
|
else: # no comment section present means comments are disabled
|
|
info['comment_count'] = 0
|
|
info['comments_disabled'] = True
|
|
|
|
# check for limited state
|
|
items, _ = extract_items(response, item_types={'limitedStateMessageRenderer'})
|
|
if items:
|
|
info['limited_state'] = True
|
|
else:
|
|
info['limited_state'] = False
|
|
|
|
# related videos
|
|
related, _ = extract_items(response)
|
|
info['related_videos'] = [extract_item_info(renderer) for renderer in related]
|
|
|
|
return info
|
|
|
|
def _extract_watch_info_desktop(top_level):
|
|
info = {
|
|
'comment_count': None,
|
|
'comments_disabled': None,
|
|
'limited_state': None,
|
|
'playlist': None,
|
|
}
|
|
|
|
video_info = {}
|
|
for renderer in deep_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', default=()):
|
|
if renderer and list(renderer.keys())[0] in ('videoPrimaryInfoRenderer', 'videoSecondaryInfoRenderer'):
|
|
video_info.update(list(renderer.values())[0])
|
|
|
|
info.update(_extract_metadata_row_info(video_info))
|
|
info['description'] = extract_str(video_info.get('description', None), recover_urls=True)
|
|
info['time_published'] = extract_date(extract_str(video_info.get('dateText', None)))
|
|
|
|
likes_dislikes = deep_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/')
|
|
if len(likes_dislikes) == 2:
|
|
info['like_count'] = extract_int(likes_dislikes[0])
|
|
info['dislike_count'] = extract_int(likes_dislikes[1])
|
|
else:
|
|
info['like_count'] = None
|
|
info['dislike_count'] = None
|
|
|
|
info['title'] = extract_str(video_info.get('title', None))
|
|
info['author'] = extract_str(deep_get(video_info, 'owner', 'videoOwnerRenderer', 'title'))
|
|
info['author_id'] = deep_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId')
|
|
info['view_count'] = extract_int(extract_str(deep_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount')))
|
|
|
|
related = deep_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[])
|
|
info['related_videos'] = [extract_item_info(renderer) for renderer in related]
|
|
|
|
return info
|
|
|
|
def update_format_with_codec_info(fmt, codec):
|
|
if (codec.startswith('av')
|
|
or codec in ('vp9', 'vp8', 'vp8.0', 'h263', 'h264', 'mp4v')):
|
|
if codec == 'vp8.0':
|
|
codec = 'vp8'
|
|
conservative_update(fmt, 'vcodec', codec)
|
|
elif (codec.startswith('mp4a')
|
|
or codec in ('opus', 'mp3', 'aac', 'dtse', 'ec-3', 'vorbis')):
|
|
conservative_update(fmt, 'acodec', codec)
|
|
else:
|
|
print('Warning: unrecognized codec: ' + codec)
|
|
|
|
fmt_type_re = re.compile(
|
|
r'(text|audio|video)/([\w0-9]+); codecs="([\w0-9\.]+(?:, [\w0-9\.]+)*)"')
|
|
def update_format_with_type_info(fmt, yt_fmt):
|
|
# 'type' for invidious api format
|
|
mime_type = multi_get(yt_fmt, 'mimeType', 'type')
|
|
if mime_type is None:
|
|
return
|
|
match = re.fullmatch(fmt_type_re, mime_type)
|
|
|
|
type, fmt['ext'], codecs = match.groups()
|
|
codecs = codecs.split(', ')
|
|
for codec in codecs:
|
|
update_format_with_codec_info(fmt, codec)
|
|
if type == 'audio':
|
|
assert len(codecs) == 1
|
|
|
|
def _extract_formats(info, player_response):
|
|
streaming_data = player_response.get('streamingData', {})
|
|
yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', [])
|
|
|
|
info['formats'] = []
|
|
# because we may retry the extract_formats with a different player_response
|
|
# so keep what we have
|
|
conservative_update(info, 'hls_manifest_url',
|
|
streaming_data.get('hlsManifestUrl'))
|
|
conservative_update(info, 'dash_manifest_url',
|
|
streaming_data.get('dash_manifest_url'))
|
|
|
|
for yt_fmt in yt_formats:
|
|
itag = yt_fmt.get('itag')
|
|
|
|
fmt = {}
|
|
fmt['itag'] = itag
|
|
fmt['ext'] = None
|
|
fmt['audio_bitrate'] = None
|
|
fmt['bitrate'] = yt_fmt.get('bitrate')
|
|
fmt['acodec'] = None
|
|
fmt['vcodec'] = None
|
|
fmt['width'] = yt_fmt.get('width')
|
|
fmt['height'] = yt_fmt.get('height')
|
|
fmt['file_size'] = yt_fmt.get('contentLength')
|
|
fmt['audio_sample_rate'] = extract_int(yt_fmt.get('audioSampleRate'))
|
|
fmt['duration_ms'] = yt_fmt.get('approxDurationMs')
|
|
fmt['fps'] = yt_fmt.get('fps')
|
|
fmt['init_range'] = yt_fmt.get('initRange')
|
|
fmt['index_range'] = yt_fmt.get('indexRange')
|
|
for key in ('init_range', 'index_range'):
|
|
if fmt[key]:
|
|
fmt[key]['start'] = int(fmt[key]['start'])
|
|
fmt[key]['end'] = int(fmt[key]['end'])
|
|
update_format_with_type_info(fmt, yt_fmt)
|
|
cipher = dict(urllib.parse.parse_qsl(multi_get(yt_fmt,
|
|
'cipher', 'signatureCipher', default='')))
|
|
if cipher:
|
|
fmt['url'] = cipher.get('url')
|
|
else:
|
|
fmt['url'] = yt_fmt.get('url')
|
|
fmt['s'] = cipher.get('s')
|
|
fmt['sp'] = cipher.get('sp')
|
|
|
|
# update with information from big table
|
|
hardcoded_itag_info = _formats.get(str(itag), {})
|
|
for key, value in hardcoded_itag_info.items():
|
|
conservative_update(fmt, key, value) # prefer info from YouTube
|
|
fmt['quality'] = hardcoded_itag_info.get('height')
|
|
conservative_update(
|
|
fmt, 'quality',
|
|
extract_int(yt_fmt.get('quality'), whole_word=False)
|
|
)
|
|
conservative_update(
|
|
fmt, 'quality',
|
|
extract_int(yt_fmt.get('qualityLabel'), whole_word=False)
|
|
)
|
|
|
|
info['formats'].append(fmt)
|
|
|
|
# get ip address
|
|
if info['formats']:
|
|
query_string = (info['formats'][0].get('url') or '?').split('?')[1]
|
|
info['ip_address'] = deep_get(
|
|
urllib.parse.parse_qs(query_string), 'ip', 0)
|
|
else:
|
|
info['ip_address'] = None
|
|
|
|
hls_regex = re.compile(r'[\w_-]+=(?:"[^"]+"|[^",]+),')
|
|
def extract_hls_formats(hls_manifest):
|
|
'''returns hls_formats, err'''
|
|
hls_formats = []
|
|
try:
|
|
lines = hls_manifest.splitlines()
|
|
i = 0
|
|
while i < len(lines):
|
|
if lines[i].startswith('#EXT-X-STREAM-INF'):
|
|
fmt = {'acodec': None, 'vcodec': None, 'height': None,
|
|
'width': None, 'fps': None, 'audio_bitrate': None,
|
|
'itag': None, 'file_size': None, 'duration_ms': None,
|
|
'audio_sample_rate': None, 'url': None}
|
|
properties = lines[i].split(':')[1]
|
|
properties += ',' # make regex work for last key-value pair
|
|
|
|
for pair in hls_regex.findall(properties):
|
|
key, value = pair.rstrip(',').split('=')
|
|
if key == 'CODECS':
|
|
for codec in value.strip('"').split(','):
|
|
update_format_with_codec_info(fmt, codec)
|
|
elif key == 'RESOLUTION':
|
|
fmt['width'], fmt['height'] = map(int, value.split('x'))
|
|
fmt['resolution'] = value
|
|
elif key == 'FRAME-RATE':
|
|
fmt['fps'] = int(value)
|
|
i += 1
|
|
fmt['url'] = lines[i]
|
|
assert fmt['url'].startswith('http')
|
|
fmt['ext'] = 'm3u8'
|
|
hls_formats.append(fmt)
|
|
i += 1
|
|
except Exception as e:
|
|
traceback.print_exc()
|
|
return [], str(e)
|
|
return hls_formats, None
|
|
|
|
|
|
def _extract_playability_error(info, player_response, error_prefix=''):
|
|
if info['formats']:
|
|
info['playability_status'] = None
|
|
info['playability_error'] = None
|
|
return
|
|
|
|
playability_status = deep_get(player_response, 'playabilityStatus', 'status', default=None)
|
|
info['playability_status'] = playability_status
|
|
|
|
playability_reason = extract_str(multi_deep_get(player_response,
|
|
['playabilityStatus', 'reason'],
|
|
['playabilityStatus', 'errorScreen', 'playerErrorMessageRenderer', 'reason'],
|
|
default='Could not find playability error')
|
|
)
|
|
|
|
if playability_status not in (None, 'OK'):
|
|
info['playability_error'] = error_prefix + playability_reason
|
|
elif not info['playability_error']: # do not override
|
|
info['playability_error'] = error_prefix + 'Unknown playability error'
|
|
|
|
SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
|
|
def extract_watch_info(polymer_json):
|
|
info = {'playability_error': None, 'error': None,
|
|
'player_response_missing': None}
|
|
|
|
if isinstance(polymer_json, dict):
|
|
top_level = polymer_json
|
|
elif isinstance(polymer_json, (list, tuple)):
|
|
top_level = {}
|
|
for page_part in polymer_json:
|
|
if not isinstance(page_part, dict):
|
|
return {'error': 'Invalid page part'}
|
|
top_level.update(page_part)
|
|
else:
|
|
return {'error': 'Invalid top level polymer data'}
|
|
|
|
error = check_missing_keys(top_level,
|
|
['player', 'args'],
|
|
['player', 'assets', 'js'],
|
|
['playerResponse'],
|
|
)
|
|
if error:
|
|
info['playability_error'] = error
|
|
|
|
player_response = top_level.get('playerResponse', {})
|
|
|
|
# usually, only the embedded one has the urls
|
|
player_args = deep_get(top_level, 'player', 'args', default={})
|
|
if 'player_response' in player_args:
|
|
embedded_player_response = json.loads(player_args['player_response'])
|
|
else:
|
|
embedded_player_response = {}
|
|
|
|
# captions
|
|
info['automatic_caption_languages'] = []
|
|
info['manual_caption_languages'] = []
|
|
info['_manual_caption_language_names'] = {} # language name written in that language, needed in some cases to create the url
|
|
info['translation_languages'] = []
|
|
captions_info = player_response.get('captions', {})
|
|
info['_captions_base_url'] = normalize_url(deep_get(captions_info, 'playerCaptionsRenderer', 'baseUrl'))
|
|
for caption_track in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()):
|
|
lang_code = caption_track.get('languageCode')
|
|
if not lang_code:
|
|
continue
|
|
if caption_track.get('kind') == 'asr':
|
|
info['automatic_caption_languages'].append(lang_code)
|
|
else:
|
|
info['manual_caption_languages'].append(lang_code)
|
|
base_url = caption_track.get('baseUrl', '')
|
|
lang_name = deep_get(urllib.parse.parse_qs(urllib.parse.urlparse(base_url).query), 'name', 0)
|
|
if lang_name:
|
|
info['_manual_caption_language_names'][lang_code] = lang_name
|
|
|
|
for translation_lang_info in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'translationLanguages', default=()):
|
|
lang_code = translation_lang_info.get('languageCode')
|
|
if lang_code:
|
|
info['translation_languages'].append(lang_code)
|
|
if translation_lang_info.get('isTranslatable') == False:
|
|
print('WARNING: Found non-translatable caption language')
|
|
|
|
# formats
|
|
_extract_formats(info, embedded_player_response)
|
|
if not info['formats']:
|
|
_extract_formats(info, player_response)
|
|
|
|
# see https://github.com/user234683/youtube-local/issues/22#issuecomment-706395160
|
|
info['player_urls_missing'] = (
|
|
not info['formats'] and not embedded_player_response)
|
|
|
|
# playability errors
|
|
_extract_playability_error(info, player_response)
|
|
|
|
# check age-restriction
|
|
info['age_restricted'] = (info['playability_status'] == 'LOGIN_REQUIRED' and info['playability_error'] and ' age' in info['playability_error'])
|
|
|
|
# base_js (for decryption of signatures)
|
|
info['base_js'] = deep_get(top_level, 'player', 'assets', 'js')
|
|
if info['base_js']:
|
|
info['base_js'] = normalize_url(info['base_js'])
|
|
# must uniquely identify url
|
|
info['player_name'] = urllib.parse.urlparse(info['base_js']).path
|
|
else:
|
|
info['player_name'] = None
|
|
|
|
# extract stuff from visible parts of page
|
|
mobile = 'singleColumnWatchNextResults' in deep_get(top_level, 'response', 'contents', default={})
|
|
if mobile:
|
|
info.update(_extract_watch_info_mobile(top_level))
|
|
else:
|
|
info.update(_extract_watch_info_desktop(top_level))
|
|
|
|
# stuff from videoDetails. Use liberal_update to prioritize info from videoDetails over existing info
|
|
vd = deep_get(top_level, 'playerResponse', 'videoDetails', default={})
|
|
liberal_update(info, 'title', extract_str(vd.get('title')))
|
|
liberal_update(info, 'duration', extract_int(vd.get('lengthSeconds')))
|
|
liberal_update(info, 'view_count', extract_int(vd.get('viewCount')))
|
|
# videos with no description have a blank string
|
|
liberal_update(info, 'description', vd.get('shortDescription'))
|
|
liberal_update(info, 'id', vd.get('videoId'))
|
|
liberal_update(info, 'author', vd.get('author'))
|
|
liberal_update(info, 'author_id', vd.get('channelId'))
|
|
info['was_live'] = vd.get('isLiveContent')
|
|
conservative_update(info, 'unlisted', not vd.get('isCrawlable', True)) #isCrawlable is false on limited state videos even if they aren't unlisted
|
|
liberal_update(info, 'tags', vd.get('keywords', []))
|
|
|
|
# fallback stuff from microformat
|
|
mf = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={})
|
|
conservative_update(info, 'title', extract_str(mf.get('title')))
|
|
conservative_update(info, 'duration', extract_int(mf.get('lengthSeconds')))
|
|
# this gives the view count for limited state videos
|
|
conservative_update(info, 'view_count', extract_int(mf.get('viewCount')))
|
|
conservative_update(info, 'description', extract_str(mf.get('description'), recover_urls=True))
|
|
conservative_update(info, 'author', mf.get('ownerChannelName'))
|
|
conservative_update(info, 'author_id', mf.get('externalChannelId'))
|
|
conservative_update(info, 'live', deep_get(mf, 'liveBroadcastDetails',
|
|
'isLiveNow'))
|
|
liberal_update(info, 'unlisted', mf.get('isUnlisted'))
|
|
liberal_update(info, 'category', mf.get('category'))
|
|
liberal_update(info, 'time_published', mf.get('publishDate'))
|
|
liberal_update(info, 'time_uploaded', mf.get('uploadDate'))
|
|
family_safe = mf.get('isFamilySafe')
|
|
if family_safe is None:
|
|
conservative_update(info, 'age_restricted', None)
|
|
else:
|
|
conservative_update(info, 'age_restricted', not family_safe)
|
|
info['allowed_countries'] = mf.get('availableCountries', [])
|
|
|
|
# other stuff
|
|
info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
|
|
return info
|
|
|
|
single_char_codes = {
|
|
'n': '\n',
|
|
'\\': '\\',
|
|
'"': '"',
|
|
"'": "'",
|
|
'b': '\b',
|
|
'f': '\f',
|
|
'n': '\n',
|
|
'r': '\r',
|
|
't': '\t',
|
|
'v': '\x0b',
|
|
'0': '\x00',
|
|
'\n': '', # backslash followed by literal newline joins lines
|
|
}
|
|
def js_escape_replace(match):
|
|
r'''Resolves javascript string escape sequences such as \x..'''
|
|
# some js-strings in the watch page html include them for no reason
|
|
# https://mathiasbynens.be/notes/javascript-escapes
|
|
escaped_sequence = match.group(1)
|
|
if escaped_sequence[0] in ('x', 'u'):
|
|
return chr(int(escaped_sequence[1:], base=16))
|
|
|
|
# In javascript, if it's not one of those escape codes, it's just the
|
|
# literal character. e.g., "\a" = "a"
|
|
return single_char_codes.get(escaped_sequence, escaped_sequence)
|
|
|
|
# works but complicated and unsafe:
|
|
#PLAYER_RESPONSE_RE = re.compile(r'<script[^>]*?>[^<]*?var ytInitialPlayerResponse = ({(?:"(?:[^"\\]|\\.)*?"|[^"])+?});')
|
|
|
|
# Because there are sometimes additional statements after the json object
|
|
# so we just capture all of those until end of script and tell json decoder
|
|
# to ignore extra stuff after the json object
|
|
PLAYER_RESPONSE_RE = re.compile(r'<script[^>]*?>[^<]*?var ytInitialPlayerResponse = ({.*?)</script>')
|
|
INITIAL_DATA_RE = re.compile(r"<script[^>]*?>var ytInitialData = '(.+?[^\\])';")
|
|
BASE_JS_RE = re.compile(r'jsUrl":\s*"([\w\-\./]+?/base.js)"')
|
|
JS_STRING_ESCAPE_RE = re.compile(r'\\([^xu]|x..|u....)')
|
|
def extract_watch_info_from_html(watch_html):
|
|
base_js_match = BASE_JS_RE.search(watch_html)
|
|
player_response_match = PLAYER_RESPONSE_RE.search(watch_html)
|
|
initial_data_match = INITIAL_DATA_RE.search(watch_html)
|
|
|
|
if base_js_match is not None:
|
|
base_js_url = base_js_match.group(1)
|
|
else:
|
|
base_js_url = None
|
|
|
|
if player_response_match is not None:
|
|
decoder = json.JSONDecoder()
|
|
# this will make it ignore extra stuff after end of object
|
|
player_response = decoder.raw_decode(player_response_match.group(1))[0]
|
|
else:
|
|
return {'error': 'Could not find ytInitialPlayerResponse'}
|
|
player_response = None
|
|
|
|
if initial_data_match is not None:
|
|
initial_data = initial_data_match.group(1)
|
|
initial_data = JS_STRING_ESCAPE_RE.sub(js_escape_replace, initial_data)
|
|
initial_data = json.loads(initial_data)
|
|
else:
|
|
print('extract_watch_info_from_html: failed to find initialData')
|
|
initial_data = None
|
|
|
|
# imitate old format expected by extract_watch_info
|
|
fake_polymer_json = {
|
|
'player': {
|
|
'args': {},
|
|
'assets': {
|
|
'js': base_js_url
|
|
}
|
|
},
|
|
'playerResponse': player_response,
|
|
'response': initial_data,
|
|
}
|
|
|
|
return extract_watch_info(fake_polymer_json)
|
|
|
|
|
|
|
|
def get_caption_url(info, language, format, automatic=False, translation_language=None):
|
|
'''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. If automatic is true and translation_language is given, the automatic captions will be translated.'''
|
|
url = info['_captions_base_url']
|
|
url += '&lang=' + language
|
|
url += '&fmt=' + format
|
|
if automatic:
|
|
url += '&kind=asr'
|
|
elif language in info['_manual_caption_language_names']:
|
|
url += '&name=' + urllib.parse.quote(info['_manual_caption_language_names'][language], safe='')
|
|
|
|
if translation_language:
|
|
url += '&tlang=' + translation_language
|
|
return url
|
|
|
|
def update_with_age_restricted_info(info, player_response):
|
|
'''Inserts urls from player_response json'''
|
|
ERROR_PREFIX = 'Error getting missing player or bypassing age-restriction: '
|
|
|
|
try:
|
|
player_response = json.loads(player_response)
|
|
except json.decoder.JSONDecodeError:
|
|
traceback.print_exc()
|
|
info['playability_error'] = ERROR_PREFIX + 'Failed to parse json response'
|
|
return
|
|
|
|
_extract_formats(info, player_response)
|
|
_extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX)
|
|
|
|
def requires_decryption(info):
|
|
return ('formats' in info) and info['formats'] and info['formats'][0]['s']
|
|
|
|
# adapted from youtube-dl and invidious:
|
|
# https://github.com/omarroth/invidious/blob/master/src/invidious/helpers/signatures.cr
|
|
decrypt_function_re = re.compile(r'function\(a\)\{(a=a\.split\(""\)[^\}{]+)return a\.join\(""\)\}')
|
|
# gives us e.g. rt, .xK, 5 from rt.xK(a,5) or rt, ["xK"], 5 from rt["xK"](a,5)
|
|
# (var, operation, argument)
|
|
var_op_arg_re = re.compile(r'(\w+)(\.\w+|\["[^"]+"\])\(a,(\d+)\)')
|
|
def extract_decryption_function(info, base_js):
|
|
'''Insert decryption function into info. Return error string if not successful.
|
|
Decryption function is a list of list[2] of numbers.
|
|
It is advisable to cache the decryption function (uniquely identified by info['player_name']) so base.js (1 MB) doesn't need to be redownloaded each time'''
|
|
info['decryption_function'] = None
|
|
decrypt_function_match = decrypt_function_re.search(base_js)
|
|
if decrypt_function_match is None:
|
|
return 'Could not find decryption function in base.js'
|
|
|
|
function_body = decrypt_function_match.group(1).split(';')[1:-1]
|
|
if not function_body:
|
|
return 'Empty decryption function body'
|
|
|
|
var_with_operation_match = var_op_arg_re.fullmatch(function_body[0])
|
|
if var_with_operation_match is None:
|
|
return 'Could not find var_name'
|
|
|
|
var_name = var_with_operation_match.group(1)
|
|
var_body_match = re.search(r'var ' + re.escape(var_name) + r'=\{(.*?)\};', base_js, flags=re.DOTALL)
|
|
if var_body_match is None:
|
|
return 'Could not find var_body'
|
|
|
|
operations = var_body_match.group(1).replace('\n', '').split('},')
|
|
if not operations:
|
|
return 'Did not find any definitions in var_body'
|
|
operations[-1] = operations[-1][:-1] # remove the trailing '}' since we split by '},' on the others
|
|
operation_definitions = {}
|
|
for op in operations:
|
|
colon_index = op.find(':')
|
|
opening_brace_index = op.find('{')
|
|
|
|
if colon_index == -1 or opening_brace_index == -1:
|
|
return 'Could not parse operation'
|
|
op_name = op[:colon_index]
|
|
op_body = op[opening_brace_index+1:]
|
|
if op_body == 'a.reverse()':
|
|
operation_definitions[op_name] = 0
|
|
elif op_body == 'a.splice(0,b)':
|
|
operation_definitions[op_name] = 1
|
|
elif op_body.startswith('var c=a[0]'):
|
|
operation_definitions[op_name] = 2
|
|
else:
|
|
return 'Unknown op_body: ' + op_body
|
|
|
|
decryption_function = []
|
|
for op_with_arg in function_body:
|
|
match = var_op_arg_re.fullmatch(op_with_arg)
|
|
if match is None:
|
|
return 'Could not parse operation with arg'
|
|
op_name = match.group(2).strip('[].')
|
|
if op_name not in operation_definitions:
|
|
return 'Unknown op_name: ' + str(op_name)
|
|
op_argument = match.group(3)
|
|
decryption_function.append([operation_definitions[op_name], int(op_argument)])
|
|
|
|
info['decryption_function'] = decryption_function
|
|
return False
|
|
|
|
def _operation_2(a, b):
|
|
c = a[0]
|
|
a[0] = a[b % len(a)]
|
|
a[b % len(a)] = c
|
|
|
|
def decrypt_signatures(info):
|
|
'''Applies info['decryption_function'] to decrypt all the signatures. Return err.'''
|
|
if not info.get('decryption_function'):
|
|
return 'decryption_function not in info'
|
|
for format in info['formats']:
|
|
if not format['s'] or not format['sp'] or not format['url']:
|
|
print('Warning: s, sp, or url not in format')
|
|
continue
|
|
|
|
a = list(format['s'])
|
|
for op, argument in info['decryption_function']:
|
|
if op == 0:
|
|
a.reverse()
|
|
elif op == 1:
|
|
a = a[argument:]
|
|
else:
|
|
_operation_2(a, argument)
|
|
|
|
signature = ''.join(a)
|
|
format['url'] += '&' + format['sp'] + '=' + signature
|
|
return False
|