Extraction: Add general subtitle extraction and translation

This commit is contained in:
James Taylor 2019-11-29 18:36:27 -08:00
parent 95da24a206
commit 205ad29cb0
2 changed files with 123 additions and 66 deletions

View File

@ -44,50 +44,104 @@ def get_video_sources(info):
return video_sources
def make_caption_src(info, lang, auto=False, trans_lang=None):
label = lang
if auto:
label += ' (Automatic)'
if trans_lang:
label += ' -> ' + trans_lang
return {
'url': '/' + yt_data_extract.get_caption_url(info, lang, 'vtt', auto, trans_lang),
'label': label,
'srclang': trans_lang[0:2] if trans_lang else lang[0:2],
'on': False,
}
def lang_in(lang, sequence):
'''Tests if the language is in sequence, with e.g. en and en-US considered the same'''
lang = lang[0:2]
return lang in (l[0:2] for l in sequence)
def lang_eq(lang1, lang2):
'''Tests if two iso 639-1 codes are equal, with en and en-US considered the same.
Just because the codes are equal does not mean the dialects are mutually intelligible, but this will have to do for now without a complex language model'''
return lang1[0:2] == lang2[0:2]
def equiv_lang_in(lang, sequence):
'''Extracts a language in sequence which is equivalent to lang.
e.g. if lang is en, extracts en-GB from sequence.
Necessary because if only a specific variant like en-GB is available, can't ask Youtube for simply en. Need to get the available variant.'''
lang = lang[0:2]
for l in sequence:
if l[0:2] == lang:
return l
return None
def get_subtitle_sources(info):
'''Returns these sources, ordered from least to most intelligible:
native_video_lang (Automatic)
foreign_langs (Manual)
native_video_lang (Automatic) -> pref_lang
foreign_langs (Manual) -> pref_lang
native_video_lang (Manual) -> pref_lang
pref_lang (Automatic)
pref_lang (Manual)'''
sources = []
default_found = False
default = None
for language, formats in info['subtitles'].items():
for format in formats:
if format['ext'] == 'vtt':
source = {
'url': '/' + format['url'],
'label': language,
'srclang': language,
pref_lang = settings.subtitles_language
native_video_lang = None
if info['automatic_caption_languages']:
native_video_lang = info['automatic_caption_languages'][0]
# set as on by default if this is the preferred language and a default-on subtitles mode is in settings
'on': language == settings.subtitles_language and settings.subtitles_mode > 0,
}
highest_fidelity_is_manual = False
if language == settings.subtitles_language:
default_found = True
default = source
else:
sources.append(source)
break
# Put it at the end to avoid browser bug when there are too many languages
# Sources are added in very specific order outlined above
# More intelligible sources are put further down to avoid browser bug when there are too many languages
# (in firefox, it is impossible to select a language near the top of the list because it is cut off)
if default_found:
sources.append(default)
try:
formats = info['automatic_captions'][settings.subtitles_language]
except KeyError:
pass
else:
for format in formats:
if format['ext'] == 'vtt':
sources.append({
'url': '/' + format['url'],
'label': settings.subtitles_language + ' - Automatic',
'srclang': settings.subtitles_language,
# native_video_lang (Automatic)
if native_video_lang and not lang_eq(native_video_lang, pref_lang):
sources.append(make_caption_src(info, native_video_lang, auto=True))
# set as on by default if this is the preferred language and a default-on subtitles mode is in settings
'on': settings.subtitles_mode == 2 and not default_found,
# foreign_langs (Manual)
for lang in info['manual_caption_languages']:
if not lang_eq(lang, pref_lang):
sources.append(make_caption_src(info, lang))
})
if (lang_in(pref_lang, info['translation_languages'])
and not lang_in(pref_lang, info['automatic_caption_languages'])
and not lang_in(pref_lang, info['manual_caption_languages'])):
# native_video_lang (Automatic) -> pref_lang
if native_video_lang and not lang_eq(pref_lang, native_video_lang):
sources.append(make_caption_src(info, native_video_lang, auto=True, trans_lang=pref_lang))
# foreign_langs (Manual) -> pref_lang
for lang in info['manual_caption_languages']:
if not lang_eq(lang, native_video_lang):
sources.append(make_caption_src(info, lang, trans_lang=pref_lang))
# native_video_lang (Manual) -> pref_lang
if lang_in(native_video_lang, info['manual_caption_languages']):
sources.append(make_caption_src(info, native_video_lang, trans_lang=pref_lang))
# pref_lang (Automatic)
if lang_in(pref_lang, info['automatic_caption_languages']):
sources.append(make_caption_src(info, equiv_lang_in(pref_lang, info['automatic_caption_languages']), auto=True))
# pref_lang (Manual)
if lang_in(pref_lang, info['manual_caption_languages']):
sources.append(make_caption_src(info, equiv_lang_in(pref_lang, info['manual_caption_languages'])))
highest_fidelity_is_manual = True
if sources and sources[-1]['srclang'] == pref_lang:
# set as on by default since it's manual a default-on subtitles mode is in settings
if highest_fidelity_is_manual and settings.subtitles_mode > 0:
sources[-1]['on'] = True
# set as on by default since settings indicate to set it as such even if it's not manual
elif settings.subtitles_mode == 2:
sources[-1]['on'] = True
if len(sources) == 0:
assert len(info['automatic_caption_languages']) == 0 and len(info['manual_caption_languages']) == 0
return sources

View File

@ -309,6 +309,8 @@ def ajax_info(item_json):
youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$')
def normalize_url(url):
if url is None:
return None
match = youtube_url_re.fullmatch(url)
if match is None:
raise Exception()
@ -1042,7 +1044,18 @@ def extract_watch_info_desktop(top_level):
return info
_SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
def get_caption_url(info, language, format, automatic=False, translation_language=None):
'''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. If automatic is true and translation_language is given, the automatic captions will be translated.'''
url = info['_captions_base_url']
url += '&lang=' + language
url += '&fmt=' + format
if automatic:
url += '&kind=asr'
if translation_language:
url += '&tlang=' + translation_language
return url
SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
def extract_watch_info(polymer_json):
info = {'playability_error': None, 'error': None}
@ -1072,34 +1085,25 @@ def extract_watch_info(polymer_json):
if playability_status not in (None, 'OK'):
info['playability_error'] = playability_reason
# automatic captions
# adapted from youtube_dl:
# https://github.com/ytdl-org/youtube-dl/blob/76e510b92c4a1c4b0001f892504ba2cbb4b8d486/youtube_dl/extractor/youtube.py#L1490-#L1523
info['automatic_captions'] = {}
renderer = default_multi_get(player_response, 'captions', 'playerCaptionsTracklistRenderer', default={})
base_url = default_multi_get(renderer, 'captionTracks', 0, 'baseUrl')
if base_url and '?' in base_url:
base_url = normalize_url(base_url)
base_url_path, base_url_query_string = base_url.split('?')
url_info = urllib.parse.parse_qs(base_url_query_string)
for lang in renderer.get('translationLanguages', []):
lang_code = lang.get('languageCode')
if not lang_code:
continue
formats_for_this_lang = []
for ext in _SUBTITLE_FORMATS:
url_info['tlang'] = [lang_code]
url_info['fmt'] = [ext]
url = base_url_path + '?' + urllib.parse.urlencode(url_info, doseq=True)
formats_for_this_lang.append({
'url': url,
'ext': ext,
})
info['automatic_captions'][lang_code] = formats_for_this_lang
# captions
info['automatic_caption_languages'] = []
info['manual_caption_languages'] = []
info['translation_languages'] = []
captions_info = player_response.get('captions', {})
info['_captions_base_url'] = normalize_url(default_multi_get(captions_info, 'playerCaptionsRenderer', 'baseUrl'))
for caption_track in default_multi_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()):
lang_code = caption_track.get('languageCode')
if lang_code:
if caption_track.get('kind') == 'asr':
info['automatic_caption_languages'].append(lang_code)
else:
info['manual_caption_languages'].append(lang_code)
for translation_lang_info in default_multi_get(captions_info, 'playerCaptionsTracklistRenderer', 'translationLanguages', default=()):
lang_code = translation_lang_info.get('languageCode')
if lang_code:
info['translation_languages'].append(lang_code)
if translation_lang_info.get('isTranslatable') == False:
print('WARNING: Found non-translatable caption language')
# formats
streaming_data = player_response.get('streamingData', {})
@ -1157,5 +1161,4 @@ def extract_watch_info(polymer_json):
# other stuff
info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
info['subtitles'] = {} # TODO
return info