Retrieve base.js url from html watch page when it's missing

Fixes failure mode 3 in #22
2020-12-09 17:08:12 -08:00
parent 1a7ed0a981
commit 6443cedf62
3 changed files with 40 additions and 14 deletions
--- a/youtube/watch.py
+++ b/youtube/watch.py
@@ -177,14 +177,32 @@ def save_decrypt_cache():
    f.write(json.dumps({'version': 1, 'decrypt_cache':decrypt_cache}, indent=4, sort_keys=True))
    f.close()

-def decrypt_signatures(info):
+watch_headers = (
+    ('Accept', '*/*'),
+    ('Accept-Language', 'en-US,en;q=0.5'),
+    ('X-YouTube-Client-Name', '2'),
+    ('X-YouTube-Client-Version', '2.20180830'),
+) + util.mobile_ua
+
+def decrypt_signatures(info, video_id):
    '''return error string, or False if no errors'''
    if not yt_data_extract.requires_decryption(info):
        return False
    if not info['player_name']:
-        return 'Could not find player name'
-    if not info['base_js']:
-        return 'Failed to find base.js'
+        # base.js url is missing. Usually this is because there is no
+        # embedded player response; instead it's in the json as playerResponse,
+        # which has no base.js key.
+        # Example: https://www.youtube.com/watch?v=W6iQPK3F16U
+        # See https://github.com/user234683/youtube-local/issues/22#issuecomment-706395160
+        url = 'https://m.youtube.com/watch?v=' + video_id + '&bpctr=9999999999'
+        html_watch_page = util.fetch_url(
+            url,
+            headers=watch_headers,
+            report_text='Fetching html watch page to retrieve missing base.js',
+            debug_name='watch_page_html').decode('utf-8')
+        err = yt_data_extract.update_with_missing_base_js(info, html_watch_page)
+        if err:
+            return err

    player_name = info['player_name']
    if player_name in decrypt_cache:
@@ -201,13 +219,6 @@ def decrypt_signatures(info):
    err = yt_data_extract.decrypt_signatures(info)
    return err

-headers = (
-    ('Accept', '*/*'),
-    ('Accept-Language', 'en-US,en;q=0.5'),
-    ('X-YouTube-Client-Name', '2'),
-    ('X-YouTube-Client-Version', '2.20180830'),
-) + util.mobile_ua
-
 def extract_info(video_id, use_invidious, playlist_id=None, index=None):
    # bpctr=9999999999 will bypass are-you-sure dialogs for controversial
    # videos
@@ -216,7 +227,8 @@ def extract_info(video_id, use_invidious, playlist_id=None, index=None):
        url += '&list=' + playlist_id
    if index:
        url += '&index=' + index
-    polymer_json = util.fetch_url(url, headers=headers, debug_name='watch')
+    polymer_json = util.fetch_url(url, headers=watch_headers,
+                                  debug_name='watch')
    polymer_json = polymer_json.decode('utf-8')
    # TODO: Decide whether this should be done in yt_data_extract.extract_watch_info
    try:
@@ -242,7 +254,7 @@ def extract_info(video_id, use_invidious, playlist_id=None, index=None):
        yt_data_extract.update_with_age_restricted_info(info, video_info_page)

    # signature decryption
-    decryption_error = decrypt_signatures(info)
+    decryption_error = decrypt_signatures(info, video_id)
    if decryption_error:
        decryption_error = 'Error decrypting url signatures: ' + decryption_error
        info['playability_error'] = decryption_error
--- a/youtube/yt_data_extract/init.py
+++ b/youtube/yt_data_extract/init.py
@@ -9,4 +9,5 @@ from .everything_else import (extract_channel_info, extract_search_info,
 from .watch_extraction import (extract_watch_info, get_caption_url,
    update_with_age_restricted_info, requires_decryption,
    extract_decryption_function, decrypt_signatures, _formats,
-    update_format_with_type_info, extract_hls_formats)
+    update_format_with_type_info, extract_hls_formats,
+    update_with_missing_base_js)
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -602,6 +602,19 @@ def update_with_age_restricted_info(info, video_info_page):
    _extract_formats(info, player_response)
    _extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX)

+html_watch_page_base_js_re = re.compile(r'jsUrl":\s*"([\w\-\./]+/base\.js)"')
+def update_with_missing_base_js(info, html_watch_page):
+    '''Sets info['base_js'] and info['player_name'] from the html watch page.
+    Use when base_js is missing from the json page.
+    Returns an error string, or False if the base.js url was found.'''
+    match = html_watch_page_base_js_re.search(html_watch_page)
+    if not match:
+        return 'Could not find base_js url in watch page html'
+    info['base_js'] = normalize_url(match.group(1))
+    # player_name keys the decrypt cache, so it must uniquely identify the url
+    info['player_name'] = urllib.parse.urlparse(info['base_js']).path
+    return False
+
 def requires_decryption(info):
    return ('formats' in info) and info['formats'] and info['formats'][0]['s']