Extraction: Add general subtitle extraction and translation
parent 95da24a206
commit 205ad29cb0

youtube/watch.py | 126

youtube/watch.py
@@ -44,50 +44,104 @@ def get_video_sources(info):
 
     return video_sources
 
+
+def make_caption_src(info, lang, auto=False, trans_lang=None):
+    label = lang
+    if auto:
+        label += ' (Automatic)'
+    if trans_lang:
+        label += ' -> ' + trans_lang
+    return {
+        'url': '/' + yt_data_extract.get_caption_url(info, lang, 'vtt', auto, trans_lang),
+        'label': label,
+        'srclang': trans_lang[0:2] if trans_lang else lang[0:2],
+        'on': False,
+    }
+
+def lang_in(lang, sequence):
+    '''Tests if the language is in sequence, with e.g. en and en-US considered the same'''
+    lang = lang[0:2]
+    return lang in (l[0:2] for l in sequence)
+
+def lang_eq(lang1, lang2):
+    '''Tests if two ISO 639-1 codes are equal, with en and en-US considered the same.
+    Just because the codes are equal does not mean the dialects are mutually intelligible, but this will have to do for now without a complex language model.'''
+    return lang1[0:2] == lang2[0:2]
+
+def equiv_lang_in(lang, sequence):
+    '''Extracts a language from sequence which is equivalent to lang.
+    e.g. if lang is en, extracts en-GB from sequence.
+    Necessary because if only a specific variant like en-GB is available, we can't ask Youtube for simply en; we need to get the available variant.'''
+    lang = lang[0:2]
+    for l in sequence:
+        if l[0:2] == lang:
+            return l
+    return None
+
 def get_subtitle_sources(info):
+    '''Returns these sources, ordered from least to most intelligible:
+    native_video_lang (Automatic)
+    foreign_langs (Manual)
+    native_video_lang (Automatic) -> pref_lang
+    foreign_langs (Manual) -> pref_lang
+    native_video_lang (Manual) -> pref_lang
+    pref_lang (Automatic)
+    pref_lang (Manual)'''
     sources = []
-    default_found = False
-    default = None
-    for language, formats in info['subtitles'].items():
-        for format in formats:
-            if format['ext'] == 'vtt':
-                source = {
-                    'url': '/' + format['url'],
-                    'label': language,
-                    'srclang': language,
-
-                    # set as on by default if this is the preferred language and a default-on subtitles mode is in settings
-                    'on': language == settings.subtitles_language and settings.subtitles_mode > 0,
-                }
-
-                if language == settings.subtitles_language:
-                    default_found = True
-                    default = source
-                else:
-                    sources.append(source)
-                break
-
-    # Put it at the end to avoid browser bug when there are too many languages
-    if default_found:
-        sources.append(default)
-
-    try:
-        formats = info['automatic_captions'][settings.subtitles_language]
-    except KeyError:
-        pass
-    else:
-        for format in formats:
-            if format['ext'] == 'vtt':
-                sources.append({
-                    'url': '/' + format['url'],
-                    'label': settings.subtitles_language + ' - Automatic',
-                    'srclang': settings.subtitles_language,
-
-                    # set as on by default if this is the preferred language and a default-on subtitles mode is in settings
-                    'on': settings.subtitles_mode == 2 and not default_found,
-                })
+    pref_lang = settings.subtitles_language
+    native_video_lang = None
+    if info['automatic_caption_languages']:
+        native_video_lang = info['automatic_caption_languages'][0]
+
+    highest_fidelity_is_manual = False
+
+    # Sources are added in the very specific order outlined above.
+    # More intelligible sources are put further down to avoid a browser bug when there are too many languages
+    # (in firefox, it is impossible to select a language near the top of the list because it is cut off)
+
+    # native_video_lang (Automatic)
+    if native_video_lang and not lang_eq(native_video_lang, pref_lang):
+        sources.append(make_caption_src(info, native_video_lang, auto=True))
+
+    # foreign_langs (Manual)
+    for lang in info['manual_caption_languages']:
+        if not lang_eq(lang, pref_lang):
+            sources.append(make_caption_src(info, lang))
+
+    if (lang_in(pref_lang, info['translation_languages'])
+            and not lang_in(pref_lang, info['automatic_caption_languages'])
+            and not lang_in(pref_lang, info['manual_caption_languages'])):
+        # native_video_lang (Automatic) -> pref_lang
+        if native_video_lang and not lang_eq(pref_lang, native_video_lang):
+            sources.append(make_caption_src(info, native_video_lang, auto=True, trans_lang=pref_lang))
+
+        # foreign_langs (Manual) -> pref_lang
+        for lang in info['manual_caption_languages']:
+            if not lang_eq(lang, native_video_lang):
+                sources.append(make_caption_src(info, lang, trans_lang=pref_lang))
+
+        # native_video_lang (Manual) -> pref_lang
+        if lang_in(native_video_lang, info['manual_caption_languages']):
+            sources.append(make_caption_src(info, native_video_lang, trans_lang=pref_lang))
+
+    # pref_lang (Automatic)
+    if lang_in(pref_lang, info['automatic_caption_languages']):
+        sources.append(make_caption_src(info, equiv_lang_in(pref_lang, info['automatic_caption_languages']), auto=True))
+
+    # pref_lang (Manual)
+    if lang_in(pref_lang, info['manual_caption_languages']):
+        sources.append(make_caption_src(info, equiv_lang_in(pref_lang, info['manual_caption_languages'])))
+        highest_fidelity_is_manual = True
+
+    if sources and sources[-1]['srclang'] == pref_lang:
+        # set as on by default since it's manual and a default-on subtitles mode is in settings
+        if highest_fidelity_is_manual and settings.subtitles_mode > 0:
+            sources[-1]['on'] = True
+        # set as on by default since settings indicate to turn it on even if it's not manual
+        elif settings.subtitles_mode == 2:
+            sources[-1]['on'] = True
+
+    if len(sources) == 0:
+        assert len(info['automatic_caption_languages']) == 0 and len(info['manual_caption_languages']) == 0
 
     return sources
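As an aside on how the language matching above behaves: the three helpers are self-contained, so they can be exercised on their own. The following sketch condenses them from the hunk above and runs them against hypothetical caption-language lists (the lists and the preferred language are made up for illustration):

# Illustrative sketch only; the caption-language lists are hypothetical.
def lang_eq(lang1, lang2):
    return lang1[0:2] == lang2[0:2]

def lang_in(lang, sequence):
    return lang[0:2] in (l[0:2] for l in sequence)

def equiv_lang_in(lang, sequence):
    lang = lang[0:2]
    for l in sequence:
        if l[0:2] == lang:
            return l
    return None

manual_caption_languages = ['en-GB', 'fr']   # hypothetical manual tracks
automatic_caption_languages = ['fr']         # hypothetical automatic track
pref_lang = 'en'

print(lang_eq('en', 'en-US'))                               # True: dialects compare equal
print(lang_in(pref_lang, manual_caption_languages))         # True: en matches en-GB
print(equiv_lang_in(pref_lang, manual_caption_languages))   # 'en-GB': the variant to request
print(lang_in(pref_lang, automatic_caption_languages))      # False: only fr is automatic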
youtube/yt_data_extract.py

@@ -309,6 +309,8 @@ def ajax_info(item_json):
 
 
 youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$')
 def normalize_url(url):
+    if url is None:
+        return None
     match = youtube_url_re.fullmatch(url)
     if match is None:
         raise Exception()
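The two added lines let normalize_url accept None, which matters because the captions base URL extracted further down may be absent. For reference, here is a small sketch of what the regex itself does; the example URLs are hypothetical:

import re

# Same pattern as youtube_url_re above: an optional scheme/host is stripped
# and only the path (plus query) is captured. Example URLs are hypothetical.
youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$')

for url in ('https://www.youtube.com/api/timedtext?v=abcdefghijk',
            '//www.youtube.com/api/timedtext?v=abcdefghijk',
            '/api/timedtext?v=abcdefghijk'):
    match = youtube_url_re.fullmatch(url)
    print(match.group(1))   # prints /api/timedtext?v=abcdefghijk each time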
@@ -1042,7 +1044,18 @@ def extract_watch_info_desktop(top_level):
 
     return info
 
-_SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
+def get_caption_url(info, language, format, automatic=False, translation_language=None):
+    '''Gets the URL for captions with the given language and format. If automatic is True, gets the automatic captions for that language. If translation_language is given, translates the captions from `language` to `translation_language`. If automatic is True and translation_language is given, the automatic captions will be translated.'''
+    url = info['_captions_base_url']
+    url += '&lang=' + language
+    url += '&fmt=' + format
+    if automatic:
+        url += '&kind=asr'
+    if translation_language:
+        url += '&tlang=' + translation_language
+    return url
+
+SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
 def extract_watch_info(polymer_json):
     info = {'playability_error': None, 'error': None}
 
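To make the query-string composition concrete, this sketch calls the new get_caption_url with a stand-in info dict; the '_captions_base_url' value is a made-up placeholder, not a URL extracted from a real video:

# Sketch only: the base URL below is a placeholder.
def get_caption_url(info, language, format, automatic=False, translation_language=None):
    url = info['_captions_base_url']
    url += '&lang=' + language
    url += '&fmt=' + format
    if automatic:
        url += '&kind=asr'
    if translation_language:
        url += '&tlang=' + translation_language
    return url

info = {'_captions_base_url': '/api/timedtext?v=abcdefghijk'}
print(get_caption_url(info, 'en', 'vtt'))
# /api/timedtext?v=abcdefghijk&lang=en&fmt=vtt
print(get_caption_url(info, 'fr', 'vtt', automatic=True, translation_language='en'))
# /api/timedtext?v=abcdefghijk&lang=fr&fmt=vtt&kind=asr&tlang=en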
@@ -1072,34 +1085,25 @@ def extract_watch_info(polymer_json):
     if playability_status not in (None, 'OK'):
         info['playability_error'] = playability_reason
 
-    # automatic captions
-
-    # adapted from youtube_dl:
-    # https://github.com/ytdl-org/youtube-dl/blob/76e510b92c4a1c4b0001f892504ba2cbb4b8d486/youtube_dl/extractor/youtube.py#L1490-#L1523
-    info['automatic_captions'] = {}
-
-    renderer = default_multi_get(player_response, 'captions', 'playerCaptionsTracklistRenderer', default={})
-    base_url = default_multi_get(renderer, 'captionTracks', 0, 'baseUrl')
-
-    if base_url and '?' in base_url:
-        base_url = normalize_url(base_url)
-        base_url_path, base_url_query_string = base_url.split('?')
-        url_info = urllib.parse.parse_qs(base_url_query_string)
-
-        for lang in renderer.get('translationLanguages', []):
-            lang_code = lang.get('languageCode')
-            if not lang_code:
-                continue
-            formats_for_this_lang = []
-            for ext in _SUBTITLE_FORMATS:
-                url_info['tlang'] = [lang_code]
-                url_info['fmt'] = [ext]
-                url = base_url_path + '?' + urllib.parse.urlencode(url_info, doseq=True)
-                formats_for_this_lang.append({
-                    'url': url,
-                    'ext': ext,
-                })
-            info['automatic_captions'][lang_code] = formats_for_this_lang
+    # captions
+    info['automatic_caption_languages'] = []
+    info['manual_caption_languages'] = []
+    info['translation_languages'] = []
+    captions_info = player_response.get('captions', {})
+    info['_captions_base_url'] = normalize_url(default_multi_get(captions_info, 'playerCaptionsRenderer', 'baseUrl'))
+    for caption_track in default_multi_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()):
+        lang_code = caption_track.get('languageCode')
+        if lang_code:
+            if caption_track.get('kind') == 'asr':
+                info['automatic_caption_languages'].append(lang_code)
+            else:
+                info['manual_caption_languages'].append(lang_code)
+    for translation_lang_info in default_multi_get(captions_info, 'playerCaptionsTracklistRenderer', 'translationLanguages', default=()):
+        lang_code = translation_lang_info.get('languageCode')
+        if lang_code:
+            info['translation_languages'].append(lang_code)
+        if translation_lang_info.get('isTranslatable') == False:
+            print('WARNING: Found non-translatable caption language')
 
     # formats
     streaming_data = player_response.get('streamingData', {})
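The shape of the caption metadata being read in this hunk can be illustrated with a trimmed-down, hypothetical fragment of player_response['captions']; the sketch uses plain dict access in place of the project's default_multi_get helper:

# Hypothetical, trimmed-down captions data in the shape consumed above.
captions_info = {
    'playerCaptionsTracklistRenderer': {
        'captionTracks': [
            {'languageCode': 'en', 'kind': 'asr'},   # automatic (speech recognition)
            {'languageCode': 'en'},                  # manual
            {'languageCode': 'de'},                  # manual
        ],
        'translationLanguages': [
            {'languageCode': 'en', 'isTranslatable': True},
            {'languageCode': 'es', 'isTranslatable': True},
        ],
    },
}

renderer = captions_info.get('playerCaptionsTracklistRenderer', {})
automatic, manual, translations = [], [], []
for track in renderer.get('captionTracks', ()):
    lang_code = track.get('languageCode')
    if lang_code:
        (automatic if track.get('kind') == 'asr' else manual).append(lang_code)
for lang in renderer.get('translationLanguages', ()):
    if lang.get('languageCode'):
        translations.append(lang['languageCode'])

print(automatic)      # ['en']
print(manual)         # ['en', 'de']
print(translations)   # ['en', 'es']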
@@ -1157,5 +1161,4 @@ def extract_watch_info(polymer_json):
 
     # other stuff
     info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
-    info['subtitles'] = {} # TODO
     return info