diff --git a/youtube/watch.py b/youtube/watch.py
index f7b8051..7f3b5be 100644
--- a/youtube/watch.py
+++ b/youtube/watch.py
@@ -189,20 +189,7 @@ def decrypt_signatures(info, video_id):
if not yt_data_extract.requires_decryption(info):
return False
if not info['player_name']:
- # base.js urls missing. Usually this is because there is no
- # embedded player response; instead it's in the json as playerResponse,
- # but there's no base.js key.
- # Example: https://www.youtube.com/watch?v=W6iQPK3F16U
- # See https://github.com/user234683/youtube-local/issues/22#issuecomment-706395160
- url = 'https://m.youtube.com/watch?v=' + video_id + '&bpctr=9999999999'
- html_watch_page = util.fetch_url(
- url,
- headers=watch_headers,
- report_text='Fetching html watch page to retrieve missing base.js',
- debug_name='watch_page_html').decode('utf-8')
- err = yt_data_extract.update_with_missing_base_js(info, html_watch_page)
- if err:
- return err
+ return 'Could not find player name'
player_name = info['player_name']
if player_name in decrypt_cache:
@@ -222,21 +209,15 @@ def decrypt_signatures(info, video_id):
def extract_info(video_id, use_invidious, playlist_id=None, index=None):
# bpctr=9999999999 will bypass are-you-sure dialogs for controversial
# videos
- url = 'https://m.youtube.com/watch?v=' + video_id + '&pbj=1&bpctr=9999999999'
+ url = 'https://m.youtube.com/watch?v=' + video_id + '&bpctr=9999999999'
if playlist_id:
url += '&list=' + playlist_id
if index:
url += '&index=' + index
- polymer_json = util.fetch_url(url, headers=watch_headers,
- debug_name='watch')
- polymer_json = polymer_json.decode('utf-8')
- # TODO: Decide whether this should be done in yt_data_extract.extract_watch_info
- try:
- polymer_json = json.loads(polymer_json)
- except json.decoder.JSONDecodeError:
- traceback.print_exc()
- return {'error': 'Failed to parse json response'}
- info = yt_data_extract.extract_watch_info(polymer_json)
+ watch_page = util.fetch_url(url, headers=watch_headers,
+ debug_name='watch')
+ watch_page = watch_page.decode('utf-8')
+ info = yt_data_extract.extract_watch_info_from_html(watch_page)
# request player urls if it's missing
# see https://github.com/user234683/youtube-local/issues/22#issuecomment-706395160
diff --git a/youtube/yt_data_extract/__init__.py b/youtube/yt_data_extract/__init__.py
index 697e003..ad7bd03 100644
--- a/youtube/yt_data_extract/__init__.py
+++ b/youtube/yt_data_extract/__init__.py
@@ -10,4 +10,4 @@ from .watch_extraction import (extract_watch_info, get_caption_url,
update_with_age_restricted_info, requires_decryption,
extract_decryption_function, decrypt_signatures, _formats,
update_format_with_type_info, extract_hls_formats,
- update_with_missing_base_js)
+ extract_watch_info_from_html)
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index 75fa206..c304d23 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -569,6 +569,76 @@ def extract_watch_info(polymer_json):
info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
return info
+# Single-character javascript string escapes, keyed by the character that
+# follows the backslash and mapped to the text the escape stands for.
+# Looked up by js_escape_replace; characters that are not listed here
+# stand for themselves in javascript (e.g. "\a" == "a").
+single_char_codes = {
+    '\\': '\\',
+    '"': '"',
+    "'": "'",
+    'b': '\b',
+    'f': '\f',
+    'n': '\n',
+    'r': '\r',
+    't': '\t',
+    'v': '\x0b',
+    '0': '\x00',
+    '\n': '',   # backslash followed by literal newline joins lines
+}
+def js_escape_replace(match):
+    r'''Resolves one javascript string escape sequence such as \x..
+
+    Meant to be used as the replacement callback of a regex substitution.
+    match.group(1) must hold the escape without its leading backslash:
+    "x41", "u00e9", "u{1F600}", or a single character such as "n".
+
+    Returns the text the escape stands for. A malformed \x or \u escape
+    (missing or non-hex digits, or a code point outside the unicode range)
+    is left untouched by returning the whole matched text, rather than
+    raising ValueError.
+
+    Each escape is resolved on its own, so a UTF-16 surrogate pair written
+    as two \uXXXX escapes comes out as two lone surrogate characters.
+    '''
+    # some js-strings in the watch page html include them for no reason
+    # https://mathiasbynens.be/notes/javascript-escapes
+    escaped_sequence = match.group(1)
+    # Slice instead of index so an empty group cannot raise IndexError
+    if escaped_sequence[:1] in ('x', 'u'):
+        hex_digits = escaped_sequence[1:]
+        # ES6 code point form, e.g. \u{1F600}
+        if hex_digits.startswith('{') and hex_digits.endswith('}'):
+            hex_digits = hex_digits[1:-1]
+        # int() also accepts signs, underscores and whitespace, none of
+        # which are valid in a javascript escape, so check the digits first
+        is_hex = bool(hex_digits) and all(
+            char in '0123456789abcdefABCDEF' for char in hex_digits)
+        if not is_hex:
+            return match.group(0)
+        try:
+            return chr(int(hex_digits, base=16))
+        except (ValueError, OverflowError):
+            # code point beyond U+10FFFF
+            return match.group(0)
+
+    # In javascript, if it's not one of those escape codes, it's just the
+    # literal character. e.g., "\a" = "a"
+    return single_char_codes.get(escaped_sequence, escaped_sequence)
+
+PLAYER_RESPONSE_RE = re.compile(r'')
+INITIAL_DATA_RE = re.compile(r"