Extraction: Add signature decryption

2019-10-18 14:02:28 -07:00
parent 4c07546e7a
commit 70b56d6eef
2 changed files with 193 additions and 84 deletions
--- a/youtube/watch.py
+++ b/youtube/watch.py
@@ -11,8 +11,14 @@ import gevent
 import os
 import math
 import traceback
+import re
+import urllib

-
+# Load the decryption functions saved by a previous run. This is only a
+# cache, so any problem reading it falls back to an empty cache and the
+# entries are rebuilt on demand.
+try:
+    with open(os.path.join(settings.data_dir, 'decrypt_function_cache.json'), 'r') as f:
+        decrypt_cache = json.load(f)['decrypt_cache']
+except FileNotFoundError:
+    # Nothing has been cached yet
+    decrypt_cache = {}
+except (ValueError, KeyError, TypeError):
+    # ValueError covers json.JSONDecodeError (truncated or malformed file);
+    # KeyError/TypeError cover a file whose top level lacks 'decrypt_cache'.
+    # Report the problem but do not prevent the module from importing.
+    traceback.print_exc()
+    decrypt_cache = {}


 def get_video_sources(info):
@@ -22,9 +28,9 @@ def get_video_sources(info):
    else:
        max_resolution = settings.default_resolution
    for format in info['formats']:
-        if not all(attr in format for attr in ('height', 'width', 'ext', 'url')):
+        if not all(format[attr] for attr in ('height', 'width', 'ext', 'url')):
            continue
-        if 'acodec' in format and 'vcodec' in format and format['height'] <= max_resolution:
+        if format['acodec'] and format['vcodec'] and format['height'] <= max_resolution:
            video_sources.append({
                'src': format['url'],
                'type': 'video/' + format['ext'],
@@ -101,6 +107,112 @@ def get_ordered_music_list_attributes(music_list):

    return ordered_attributes

+def save_decrypt_cache():
+    '''Write decrypt_cache to decrypt_function_cache.json in settings.data_dir.
+
+    The directory is created first if it does not exist. The file holds a
+    JSON object with a 'version' field (1) and the cache under 'decrypt_cache'.'''
+    # Serialize before opening the file: opening with 'w' truncates it, so a
+    # serialization failure should happen while the old contents still exist.
+    serialized = json.dumps({'version': 1, 'decrypt_cache': decrypt_cache}, indent=4, sort_keys=True)
+
+    os.makedirs(settings.data_dir, exist_ok=True)
+    # The with-block closes the file even if the write raises
+    with open(os.path.join(settings.data_dir, 'decrypt_function_cache.json'), 'w') as f:
+        f.write(serialized)
+
+
+# adapted from youtube-dl and invidious:
+# https://github.com/omarroth/invidious/blob/master/src/invidious/helpers/signatures.cr
+decrypt_function_re = re.compile(r'function\(a\)\{(a=a\.split\(""\)[^\}]+)\}')
+op_with_arg_re = re.compile(r'[^\.]+\.([^\(]+)\(a,(\d+)\)')
+
+def _parse_decryption_function(base_js):
+    '''Parse the signature decryption routine out of the text of base.js.
+
+    Returns a (decryption_function, error) pair. On success, error is False
+    and decryption_function is a list of [operation, argument] pairs, where
+    operation is 0 (reverse the list), 1 (drop the first `argument` items) or
+    2 (swap item 0 with another item, see operation_2). On failure,
+    decryption_function is None and error is a message string.'''
+    decrypt_function_match = decrypt_function_re.search(base_js)
+    if decrypt_function_match is None:
+        return None, 'Could not find decryption function in base.js'
+
+    # Skip the first statement (the a=a.split("") required by the regex) and
+    # the last one (presumably the return statement); what remains are the
+    # calls of the form obj.name(a,number).
+    function_body = decrypt_function_match.group(1).split(';')[1:-1]
+    if not function_body:
+        return None, 'Empty decryption function body'
+
+    # The object holding the operations is whatever precedes the first '.'
+    var_name = yt_data_extract.default_get(function_body[0].split('.'), 0)
+    if var_name is None:
+        return None, 'Could not find var_name'
+
+    var_body_match = re.search(r'var ' + re.escape(var_name) + r'=\{(.*?)\};', base_js, flags=re.DOTALL)
+    if var_body_match is None:
+        return None, 'Could not find var_body'
+
+    var_body = var_body_match.group(1).replace('\n', '')
+    # str.split never returns an empty list, so emptiness has to be checked
+    # on the text itself rather than on the split result.
+    if not var_body.strip():
+        return None, 'Did not find any definitions in var_body'
+    operations = var_body.split('},')
+    operations[-1] = operations[-1][:-1]    # remove the trailing '}' since we split by '},' on the others
+
+    # Map each operation name to a numeric code by recognizing its body
+    operation_definitions = {}
+    for op in operations:
+        colon_index = op.find(':')
+        opening_brace_index = op.find('{')
+
+        if colon_index == -1 or opening_brace_index == -1:
+            return None, 'Could not parse operation'
+        op_name = op[:colon_index]
+        op_body = op[opening_brace_index+1:]
+        if op_body == 'a.reverse()':
+            operation_definitions[op_name] = 0
+        elif op_body == 'a.splice(0,b)':
+            operation_definitions[op_name] = 1
+        elif op_body.startswith('var c=a[0]'):
+            operation_definitions[op_name] = 2
+        else:
+            return None, 'Unknown op_body: ' + op_body
+
+    # Translate each call in the function body into [code, argument]
+    decryption_function = []
+    for op_with_arg in function_body:
+        match = op_with_arg_re.fullmatch(op_with_arg)
+        if match is None:
+            return None, 'Could not parse operation with arg'
+        op_name = match.group(1)
+        if op_name not in operation_definitions:
+            return None, 'Unknown op_name: ' + op_name
+        op_argument = match.group(2)
+        decryption_function.append([operation_definitions[op_name], int(op_argument)])
+
+    return decryption_function, False
+
+def _apply_decryption_function(encrypted_signature, decryption_function):
+    '''Run the [operation, argument] pairs over the signature and return the result as a string.'''
+    a = list(encrypted_signature)
+    for op, argument in decryption_function:
+        if op == 0:
+            a.reverse()
+        elif op == 1:
+            a = a[argument:]
+        else:
+            operation_2(a, argument)
+    return ''.join(a)
+
+def decrypt_signatures(info):
+    '''Decrypt the signature of every format in info['formats'] and append it to the format's url.
+
+    Does nothing when there are no formats or the first format has no 's'
+    value. Otherwise the decryption function for the player named in
+    info['base_js'] is taken from decrypt_cache, or, if it is not cached,
+    base.js is fetched and parsed and the result is stored in decrypt_cache
+    and saved with save_decrypt_cache(). Each format's 'url' is modified in
+    place; formats lacking 's', 'sp' or 'url' are skipped with a warning.
+
+    return error string, or False if no errors'''
+    if not info['formats'] or not info['formats'][0]['s']:
+        return False    # No decryption needed
+    if not info['base_js']:
+        return 'Failed to find base.js'
+    # The player name is the second-to-last component of the base.js url
+    player_name = yt_data_extract.default_get(info['base_js'].split('/'), -2)
+    if not player_name:
+        return 'Could not find player name'
+
+    if player_name in decrypt_cache:
+        print('Using cached decryption function for: ' + player_name)
+        decryption_function = decrypt_cache[player_name]
+    else:
+        base_js = util.fetch_url(info['base_js'], debug_name='base.js', report_text='Fetched player ' + player_name)
+        decryption_function, error = _parse_decryption_function(base_js.decode('utf-8'))
+        if error:
+            return error
+
+        decrypt_cache[player_name] = decryption_function
+        save_decrypt_cache()
+
+    for fmt in info['formats']:
+        if not fmt['s'] or not fmt['sp'] or not fmt['url']:
+            print('Warning: s, sp, or url not in format')
+            continue
+
+        signature = _apply_decryption_function(fmt['s'], decryption_function)
+        # 'sp' holds the name of the query parameter that carries the signature
+        fmt['url'] += '&' + fmt['sp'] + '=' + signature
+    return False
+
+def operation_2(a, b):
+    '''Swap a[0] with a[b % len(a)] in place (the operation coded as 2 by decrypt_signatures).
+
+    a is a list and is modified directly; nothing is returned. An empty
+    list is left unchanged instead of raising ZeroDivisionError from the
+    modulo.'''
+    if not a:
+        return
+    index = b % len(a)
+    a[0], a[index] = a[index], a[0]
+
 headers = (
    ('Accept', '*/*'),
    ('Accept-Language', 'en-US,en;q=0.5'),
@@ -115,26 +227,31 @@ def extract_info(video_id):
    except json.decoder.JSONDecodeError:
        traceback.print_exc()
        return {'error': 'Failed to parse json response'}
-    return yt_data_extract.extract_watch_info(polymer_json)
+    info = yt_data_extract.extract_watch_info(polymer_json)
+    error = decrypt_signatures(info)
+    if error:
+        print('Error decrypting url signatures: ' + error)
+        info['playability_error'] = error
+    return info

 def video_quality_string(format):
-    if 'vcodec' in format:
-        result =str(format.get('width', '?')) + 'x' + str(format.get('height', '?'))
-        if 'fps' in format:
-            result += ' ' + format['fps'] + 'fps'
+    if format['vcodec']:
+        result =str(format['width'] or '?') + 'x' + str(format['height'] or '?')
+        if format['fps']:
+            result += ' ' + str(format['fps']) + 'fps'
        return result
-    elif 'acodec' in format:
+    elif format['acodec']:
        return 'audio only'

    return '?'

 def audio_quality_string(format):
-    if 'acodec' in format:
-        result = str(format.get('abr', '?')) + 'k'
-        if 'audio_sample_rate' in format:
+    if format['acodec']:
+        result = str(format['audio_bitrate'] or '?') + 'k'
+        if format['audio_sample_rate']:
            result += ' ' + str(format['audio_sample_rate']) + ' Hz'
        return result
-    elif 'vcodec' in format:
+    elif format['vcodec']:
        return 'video only'

    return '?'
@@ -193,13 +310,13 @@ def get_watch_page():
    download_formats = []

    for format in info['formats']:
-        if 'acodec' in format and 'vcodec' in format:
+        if format['acodec'] and format['vcodec']:
            codecs_string = format['acodec'] + ', ' + format['vcodec']
        else:
-            codecs_string = format.get('acodec') or format.get('vcodec') or '?'
+            codecs_string = format['acodec'] or format['vcodec'] or '?'
        download_formats.append({
            'url': format['url'],
-            'ext': format.get('ext', '?'),
+            'ext': format['ext'] or '?',
            'audio_quality': audio_quality_string(format),
            'video_quality': video_quality_string(format),
            'file_size': format_bytes(format['file_size']),
--- a/youtube/yt_data_extract.py
+++ b/youtube/yt_data_extract.py
@@ -39,44 +39,44 @@ import traceback

 # from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py
 _formats = {
-    '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
-    '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
+    '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'audio_bitrate': 64, 'vcodec': 'h263'},
+    '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'audio_bitrate': 64, 'vcodec': 'h263'},
    '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
-    '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
-    '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
-    '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
-    '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
-    '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
-    # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
+    '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'audio_bitrate': 24, 'vcodec': 'mp4v'},
+    '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'audio_bitrate': 96, 'vcodec': 'h264'},
+    '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
+    '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
+    '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
+    # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), audio_bitrate varies as well
    '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
-    '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
-    '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
-    '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
-    '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
-    '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
-    '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
-    '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
-    '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+    '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
+    '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
+    '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'},
+    '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'},
+    '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},
+    '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},
+    '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
+    '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},


    # 3D videos
-    '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
-    '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
-    '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
-    '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
-    '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
-    '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
-    '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
+    '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
+    '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
+    '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
+    '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
+    '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'},
+    '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},
+    '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},

    # Apple HTTP Live Streaming
-    '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264'},
-    '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264'},
-    '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
-    '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
-    '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264'},
-    '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264'},
-    '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264'},
-    '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264'},
+    '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'},
+    '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'},
+    '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
+    '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
+    '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 256, 'vcodec': 'h264'},
+    '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 256, 'vcodec': 'h264'},
+    '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'},
+    '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 24, 'vcodec': 'h264'},

    # DASH mp4 video
    '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
@@ -93,9 +93,9 @@ _formats = {
    '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},

    # Dash mp4 audio
-    '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
-    '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
-    '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
+    '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 48, 'container': 'm4a_dash'},
+    '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 128, 'container': 'm4a_dash'},
+    '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 256, 'container': 'm4a_dash'},
    '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
    '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
    '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
@@ -126,13 +126,13 @@ _formats = {
    '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

    # Dash webm audio
-    '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
-    '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
+    '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'audio_bitrate': 128},
+    '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'audio_bitrate': 256},

    # Dash webm audio with opus inside
-    '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
-    '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
-    '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
+    '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 50},
+    '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 70},
+    '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 160},

    # RTMP (unnamed)
    '_rtmp': {'protocol': 'rtmp'},
@@ -1042,39 +1042,32 @@ def extract_watch_info(polymer_json):


    player_args = default_multi_get(top_level, 'player', 'args', default={})
-    parsed_formats = []
-
-    if 'url_encoded_fmt_stream_map' in player_args:
-        string_formats = player_args['url_encoded_fmt_stream_map'].split(',')
-        parsed_formats += [dict(urllib.parse.parse_qsl(fmt_string)) for fmt_string in string_formats if fmt_string]
-
-    if 'adaptive_fmts' in player_args:
-        string_formats = player_args['adaptive_fmts'].split(',')
-        parsed_formats += [dict(urllib.parse.parse_qsl(fmt_string)) for fmt_string in string_formats if fmt_string]
+    player_response = json.loads(player_args['player_response']) if 'player_response' in player_args else {}
+    streaming_data = player_response.get('streamingData', {})
+    yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', [])

    info['formats'] = []

-    for parsed_fmt in parsed_formats:
-        # start with defaults from the big table at the top
-        if 'itag' in parsed_fmt:
-            fmt = _formats.get(parsed_fmt['itag'], {}).copy()
-        else:
+    for yt_fmt in yt_formats:
        fmt = {}
-
-        # then override them
-        fmt.update(parsed_fmt)
-        try:
-            fmt['width'], fmt['height'] = map(int, fmt['size'].split('x'))
-        except (KeyError, ValueError, TypeError):
-            pass
-
-        fmt['file_size'] = None
-        if 'clen' in fmt:
-            fmt['file_size'] = int(fmt.get('clen'))
+        fmt['ext'] = None
+        fmt['audio_bitrate'] = None
+        fmt['acodec'] = None
+        fmt['vcodec'] = None
+        fmt['width'] = yt_fmt.get('width')
+        fmt['height'] = yt_fmt.get('height')
+        fmt['file_size'] = yt_fmt.get('contentLength')
+        fmt['audio_sample_rate'] = yt_fmt.get('audioSampleRate')
+        fmt['fps'] = yt_fmt.get('fps')
+        cipher = dict(urllib.parse.parse_qsl(yt_fmt.get('cipher', '')))
+        if cipher:
+            fmt['url'] = cipher.get('url')
        else:
-            match = re.search(r'&clen=(\d+)', fmt.get('url'))
-            if match:
-                fmt['file_size'] = int(match.group(1))
+            fmt['url'] = yt_fmt.get('url')
+        fmt['s'] = cipher.get('s')
+        fmt['sp'] = cipher.get('sp')
+        fmt.update(_formats.get(str(yt_fmt.get('itag')), {}))
+
        info['formats'].append(fmt)

    info['base_js'] = default_multi_get(top_level, 'player', 'assets', 'js')
@@ -1104,5 +1097,4 @@ def extract_watch_info(polymer_json):
    # other stuff
    info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
    info['subtitles'] = {}  # TODO
-
    return info