Always extract from html watch page to get base.js url
YouTube removed the base.js url from the pbj responses; it is now only present in the html watch page. This replaces the previous fix for the missing base.js issue.
This commit is contained in:
parent
6443cedf62
commit
9d0be82e74
@ -189,20 +189,7 @@ def decrypt_signatures(info, video_id):
|
|||||||
if not yt_data_extract.requires_decryption(info):
|
if not yt_data_extract.requires_decryption(info):
|
||||||
return False
|
return False
|
||||||
if not info['player_name']:
|
if not info['player_name']:
|
||||||
# base.js urls missing. Usually this is because there is no
|
return 'Could not find player name'
|
||||||
# embedded player response; instead it's in the json as playerResponse,
|
|
||||||
# but there's no base.js key.
|
|
||||||
# Example: https://www.youtube.com/watch?v=W6iQPK3F16U
|
|
||||||
# See https://github.com/user234683/youtube-local/issues/22#issuecomment-706395160
|
|
||||||
url = 'https://m.youtube.com/watch?v=' + video_id + '&bpctr=9999999999'
|
|
||||||
html_watch_page = util.fetch_url(
|
|
||||||
url,
|
|
||||||
headers=watch_headers,
|
|
||||||
report_text='Fetching html watch page to retrieve missing base.js',
|
|
||||||
debug_name='watch_page_html').decode('utf-8')
|
|
||||||
err = yt_data_extract.update_with_missing_base_js(info, html_watch_page)
|
|
||||||
if err:
|
|
||||||
return err
|
|
||||||
|
|
||||||
player_name = info['player_name']
|
player_name = info['player_name']
|
||||||
if player_name in decrypt_cache:
|
if player_name in decrypt_cache:
|
||||||
@ -222,21 +209,15 @@ def decrypt_signatures(info, video_id):
|
|||||||
def extract_info(video_id, use_invidious, playlist_id=None, index=None):
|
def extract_info(video_id, use_invidious, playlist_id=None, index=None):
|
||||||
# bpctr=9999999999 will bypass are-you-sure dialogs for controversial
|
# bpctr=9999999999 will bypass are-you-sure dialogs for controversial
|
||||||
# videos
|
# videos
|
||||||
url = 'https://m.youtube.com/watch?v=' + video_id + '&pbj=1&bpctr=9999999999'
|
url = 'https://m.youtube.com/watch?v=' + video_id + '&bpctr=9999999999'
|
||||||
if playlist_id:
|
if playlist_id:
|
||||||
url += '&list=' + playlist_id
|
url += '&list=' + playlist_id
|
||||||
if index:
|
if index:
|
||||||
url += '&index=' + index
|
url += '&index=' + index
|
||||||
polymer_json = util.fetch_url(url, headers=watch_headers,
|
watch_page = util.fetch_url(url, headers=watch_headers,
|
||||||
debug_name='watch')
|
debug_name='watch')
|
||||||
polymer_json = polymer_json.decode('utf-8')
|
watch_page = watch_page.decode('utf-8')
|
||||||
# TODO: Decide whether this should be done in yt_data_extract.extract_watch_info
|
info = yt_data_extract.extract_watch_info_from_html(watch_page)
|
||||||
try:
|
|
||||||
polymer_json = json.loads(polymer_json)
|
|
||||||
except json.decoder.JSONDecodeError:
|
|
||||||
traceback.print_exc()
|
|
||||||
return {'error': 'Failed to parse json response'}
|
|
||||||
info = yt_data_extract.extract_watch_info(polymer_json)
|
|
||||||
|
|
||||||
# request player urls if it's missing
|
# request player urls if it's missing
|
||||||
# see https://github.com/user234683/youtube-local/issues/22#issuecomment-706395160
|
# see https://github.com/user234683/youtube-local/issues/22#issuecomment-706395160
|
||||||
|
@ -10,4 +10,4 @@ from .watch_extraction import (extract_watch_info, get_caption_url,
|
|||||||
update_with_age_restricted_info, requires_decryption,
|
update_with_age_restricted_info, requires_decryption,
|
||||||
extract_decryption_function, decrypt_signatures, _formats,
|
extract_decryption_function, decrypt_signatures, _formats,
|
||||||
update_format_with_type_info, extract_hls_formats,
|
update_format_with_type_info, extract_hls_formats,
|
||||||
update_with_missing_base_js)
|
extract_watch_info_from_html)
|
||||||
|
@ -569,6 +569,76 @@ def extract_watch_info(polymer_json):
|
|||||||
info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
|
info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
|
||||||
return info
|
return info
|
||||||
|
|
||||||
|
# Maps the character that follows a backslash in a javascript string literal
# to the text the escape sequence stands for. Characters that are not listed
# here are handled by the caller (js_escape_replace falls back to the
# character itself).
# Each key appears exactly once: a repeated key in a dict literal silently
# overwrites the earlier entry.
single_char_codes = {
    'n': '\n',
    '\\': '\\',
    '"': '"',
    "'": "'",
    'b': '\b',
    'f': '\f',
    'r': '\r',
    't': '\t',
    'v': '\x0b',
    '0': '\x00',
    '\n': '',  # backslash followed by literal newline joins lines
}
|
||||||
|
def js_escape_replace(match):
    r'''Regex substitution callback that resolves one javascript escape.

    match.group(1) must hold the text after the backslash: a single
    character, or x / u followed by hexadecimal digits (\x.. or \u....).
    Returns the string that the escape sequence stands for.
    '''
    # Some js-strings in the watch page html use these escapes needlessly.
    # Reference: https://mathiasbynens.be/notes/javascript-escapes
    sequence = match.group(1)
    kind = sequence[0]

    if kind == 'x' or kind == 'u':
        # Everything after the x/u marker is the code point in hexadecimal
        code_point = int(sequence[1:], base=16)
        return chr(code_point)

    # Javascript treats an unrecognised escape as the bare character,
    # e.g. "\a" = "a", so fall back to the sequence itself.
    try:
        return single_char_codes[sequence]
    except KeyError:
        return sequence
|
||||||
|
|
||||||
|
# Captures the object literal assigned to ytInitialPlayerResponse inside a
# <script> tag of the watch page html.
PLAYER_RESPONSE_RE = re.compile(r'<script[^>]*?>var ytInitialPlayerResponse = ({.*?});</script>')

# Captures the contents of the single-quoted js string assigned to
# ytInitialData. The closing quote must not be preceded by a backslash.
INITIAL_DATA_RE = re.compile(r"<script[^>]*?>var ytInitialData = '(.+?[^\\])';")

# Captures the path of base.js from a "jsUrl" entry. The dot in "base.js" is
# escaped so that only a literal "base.js" file name is accepted.
BASE_JS_RE = re.compile(r'jsUrl":\s*"([\w\-\./]+?/base\.js)"')

# Matches one js escape sequence: a backslash followed by a single character
# other than x/u, by x plus two characters, or by u plus four characters.
# Group 1 is the text after the backslash.
JS_STRING_ESCAPE_RE = re.compile(r'\\([^xu]|x..|u....)')
|
||||||
|
def extract_watch_info_from_html(watch_html):
    '''Extracts watch info from the html of a watch page.

    Pulls the base.js url, the ytInitialPlayerResponse json and the
    ytInitialData js string out of watch_html, arranges them in the old
    polymer json layout and returns the result of extract_watch_info on it.

    Returns a dict with an 'error' key instead if ytInitialPlayerResponse is
    missing or cannot be parsed. A missing or unparsable ytInitialData is
    not fatal: it is reported with print and passed on as None.
    '''
    base_js_match = BASE_JS_RE.search(watch_html)
    base_js_url = base_js_match.group(1) if base_js_match is not None else None

    player_response_match = PLAYER_RESPONSE_RE.search(watch_html)
    if player_response_match is None:
        return {'error': 'Could not find ytInitialPlayerResponse'}
    try:
        player_response = json.loads(player_response_match.group(1))
    except ValueError:  # json.JSONDecodeError is a subclass of ValueError
        return {'error': 'Failed to parse ytInitialPlayerResponse'}

    initial_data = None
    initial_data_match = INITIAL_DATA_RE.search(watch_html)
    if initial_data_match is None:
        print('extract_watch_info_from_html: failed to find initialData')
    else:
        try:
            # The data is embedded as a js string, so its escape sequences
            # have to be resolved before it can be parsed as json
            unescaped = JS_STRING_ESCAPE_RE.sub(js_escape_replace,
                                                initial_data_match.group(1))
            initial_data = json.loads(unescaped)
        except ValueError:
            # Covers malformed hex escapes (int) as well as invalid json
            print('extract_watch_info_from_html: failed to parse initialData')
            initial_data = None

    # imitate old format expected by extract_watch_info
    fake_polymer_json = {
        'player': {
            'args': {},
            'assets': {
                'js': base_js_url,
            },
        },
        'playerResponse': player_response,
        'response': initial_data,
    }

    return extract_watch_info(fake_polymer_json)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def get_caption_url(info, language, format, automatic=False, translation_language=None):
|
def get_caption_url(info, language, format, automatic=False, translation_language=None):
|
||||||
'''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. If automatic is true and translation_language is given, the automatic captions will be translated.'''
|
'''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. If automatic is true and translation_language is given, the automatic captions will be translated.'''
|
||||||
url = info['_captions_base_url']
|
url = info['_captions_base_url']
|
||||||
@ -602,19 +672,6 @@ def update_with_age_restricted_info(info, video_info_page):
|
|||||||
_extract_formats(info, player_response)
|
_extract_formats(info, player_response)
|
||||||
_extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX)
|
_extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX)
|
||||||
|
|
||||||
# Captures the base.js path from a "jsUrl" entry in the watch page html
html_watch_page_base_js_re = re.compile(r'jsUrl":\s*"([\w\-\./]+/base.js)"')


def update_with_missing_base_js(info, html_watch_page):
    '''Fills info['base_js'] and info['player_name'] from the html watch page.

    Intended for when base_js is missing from the json page. Returns False
    on success, or an error message string if no base.js url is found.'''
    found = html_watch_page_base_js_re.search(html_watch_page)
    if not found:
        return 'Could not find base_js url in watch page html'

    info['base_js'] = normalize_url(found.group(1))
    # the url path serves as the player name; it must uniquely identify url
    parsed_url = urllib.parse.urlparse(info['base_js'])
    info['player_name'] = parsed_url.path
    return False
|
|
||||||
|
|
||||||
def requires_decryption(info):
    '''Returns a truthy value if the first format in info has a value under
    its 's' key, and a falsy value otherwise (including when info has no
    formats).'''
    if 'formats' not in info:
        return False
    formats = info['formats']
    if not formats:
        # hand back the falsy value itself, as an `and` chain would
        return formats
    return formats[0]['s']
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user