Retrieve base.js URL from the HTML watch page when it's missing from the JSON page

Fixes failure mode 3 in #22
This commit is contained in:
James Taylor
2020-12-09 17:08:12 -08:00
parent 1a7ed0a981
commit 6443cedf62
3 changed files with 40 additions and 14 deletions

View File

@@ -9,4 +9,5 @@ from .everything_else import (extract_channel_info, extract_search_info,
from .watch_extraction import (extract_watch_info, get_caption_url,
update_with_age_restricted_info, requires_decryption,
extract_decryption_function, decrypt_signatures, _formats,
update_format_with_type_info, extract_hls_formats)
update_format_with_type_info, extract_hls_formats,
update_with_missing_base_js)

View File

@@ -602,6 +602,19 @@ def update_with_age_restricted_info(info, video_info_page):
_extract_formats(info, player_response)
_extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX)
# Matches the "jsUrl" entry in the html watch page and captures the url/path
# ending in /base.js. The dot in "base.js" is escaped so that only a literal
# dot matches (an unescaped dot would also accept e.g. "base_js").
html_watch_page_base_js_re = re.compile(r'jsUrl":\s*"([\w\-\./]+/base\.js)"')


def update_with_missing_base_js(info, html_watch_page):
    '''Extracts base_js url and player_name from html watch page. return err

    Use when base_js is missing from the json page.

    On success, sets info['base_js'] (the captured url passed through
    normalize_url) and info['player_name'] (the path component of that url),
    then returns False. If no base.js url is found in html_watch_page, info
    is left untouched and an error message string is returned.'''
    match = html_watch_page_base_js_re.search(html_watch_page)
    if not match:
        return 'Could not find base_js url in watch page html'

    info['base_js'] = normalize_url(match.group(1))
    # must uniquely identify url
    info['player_name'] = urllib.parse.urlparse(info['base_js']).path
    return False
def requires_decryption(info):
    '''Truthy when info has a non-empty 'formats' whose first entry has a
    truthy 's' value.

    Returns False if 'formats' is absent, the falsy 'formats' value itself if
    it is empty/None, and otherwise the first format's 's' value.'''
    if 'formats' not in info:
        return False
    formats = info['formats']
    if not formats:
        return formats
    # NOTE(review): 's' is presumably the encrypted signature - confirm
    return formats[0]['s']