Extraction: Add general subtitle extraction and translation
parent 95da24a206
commit 205ad29cb0

youtube/watch.py | 126

youtube/watch.py
@@ -44,50 +44,104 @@ def get_video_sources(info):
 
     return video_sources
 
+
+def make_caption_src(info, lang, auto=False, trans_lang=None):
+    label = lang
+    if auto:
+        label += ' (Automatic)'
+    if trans_lang:
+        label += ' -> ' + trans_lang
+    return {
+        'url': '/' + yt_data_extract.get_caption_url(info, lang, 'vtt', auto, trans_lang),
+        'label': label,
+        'srclang': trans_lang[0:2] if trans_lang else lang[0:2],
+        'on': False,
+    }
+
+def lang_in(lang, sequence):
+    '''Tests if the language is in sequence, with e.g. en and en-US considered the same'''
+    lang = lang[0:2]
+    return lang in (l[0:2] for l in sequence)
+
+def lang_eq(lang1, lang2):
+    '''Tests if two ISO 639-1 codes are equal, with en and en-US considered the same.
+    Just because the codes are equal does not mean the dialects are mutually intelligible, but this will have to do for now without a complex language model.'''
+    return lang1[0:2] == lang2[0:2]
+
+def equiv_lang_in(lang, sequence):
+    '''Extracts a language from sequence which is equivalent to lang.
+    e.g. if lang is en, extracts en-GB from sequence.
+    Necessary because if only a specific variant like en-GB is available, we can't ask Youtube for simply en; we need to get the available variant.'''
+    lang = lang[0:2]
+    for l in sequence:
+        if l[0:2] == lang:
+            return l
+    return None
+
 def get_subtitle_sources(info):
+    '''Returns these sources, ordered from least to most intelligible:
+    native_video_lang (Automatic)
+    foreign_langs (Manual)
+    native_video_lang (Automatic) -> pref_lang
+    foreign_langs (Manual) -> pref_lang
+    native_video_lang (Manual) -> pref_lang
+    pref_lang (Automatic)
+    pref_lang (Manual)'''
     sources = []
-    default_found = False
-    default = None
-    for language, formats in info['subtitles'].items():
-        for format in formats:
-            if format['ext'] == 'vtt':
-                source = {
-                    'url': '/' + format['url'],
-                    'label': language,
-                    'srclang': language,
-
-                    # set as on by default if this is the preferred language and a default-on subtitles mode is in settings
-                    'on': language == settings.subtitles_language and settings.subtitles_mode > 0,
-                }
-
-                if language == settings.subtitles_language:
-                    default_found = True
-                    default = source
-                else:
-                    sources.append(source)
-                break
-
-    # Put it at the end to avoid browser bug when there are too many languages
-    if default_found:
-        sources.append(default)
-
-    try:
-        formats = info['automatic_captions'][settings.subtitles_language]
-    except KeyError:
-        pass
-    else:
-        for format in formats:
-            if format['ext'] == 'vtt':
-                sources.append({
-                    'url': '/' + format['url'],
-                    'label': settings.subtitles_language + ' - Automatic',
-                    'srclang': settings.subtitles_language,
-
-                    # set as on by default if this is the preferred language and a default-on subtitles mode is in settings
-                    'on': settings.subtitles_mode == 2 and not default_found,
-                })
+    pref_lang = settings.subtitles_language
+    native_video_lang = None
+    if info['automatic_caption_languages']:
+        native_video_lang = info['automatic_caption_languages'][0]
+
+    highest_fidelity_is_manual = False
+
+    # Sources are added in the very specific order outlined above.
+    # More intelligible sources are put further down to avoid a browser bug when there are too many languages
+    # (in firefox, it is impossible to select a language near the top of the list because it is cut off)
+
+    # native_video_lang (Automatic)
+    if native_video_lang and not lang_eq(native_video_lang, pref_lang):
+        sources.append(make_caption_src(info, native_video_lang, auto=True))
+
+    # foreign_langs (Manual)
+    for lang in info['manual_caption_languages']:
+        if not lang_eq(lang, pref_lang):
+            sources.append(make_caption_src(info, lang))
+
+    if (lang_in(pref_lang, info['translation_languages'])
+            and not lang_in(pref_lang, info['automatic_caption_languages'])
+            and not lang_in(pref_lang, info['manual_caption_languages'])):
+        # native_video_lang (Automatic) -> pref_lang
+        if native_video_lang and not lang_eq(pref_lang, native_video_lang):
+            sources.append(make_caption_src(info, native_video_lang, auto=True, trans_lang=pref_lang))
+
+        # foreign_langs (Manual) -> pref_lang
+        for lang in info['manual_caption_languages']:
+            if not lang_eq(lang, native_video_lang):
+                sources.append(make_caption_src(info, lang, trans_lang=pref_lang))
+
+        # native_video_lang (Manual) -> pref_lang
+        if lang_in(native_video_lang, info['manual_caption_languages']):
+            sources.append(make_caption_src(info, native_video_lang, trans_lang=pref_lang))
+
+    # pref_lang (Automatic)
+    if lang_in(pref_lang, info['automatic_caption_languages']):
+        sources.append(make_caption_src(info, equiv_lang_in(pref_lang, info['automatic_caption_languages']), auto=True))
+
+    # pref_lang (Manual)
+    if lang_in(pref_lang, info['manual_caption_languages']):
+        sources.append(make_caption_src(info, equiv_lang_in(pref_lang, info['manual_caption_languages'])))
+        highest_fidelity_is_manual = True
+
+    if sources and sources[-1]['srclang'] == pref_lang:
+        # set as on by default since it's manual and a default-on subtitles mode is in settings
+        if highest_fidelity_is_manual and settings.subtitles_mode > 0:
+            sources[-1]['on'] = True
+        # set as on by default since settings indicate to turn it on even if it's not manual
+        elif settings.subtitles_mode == 2:
+            sources[-1]['on'] = True
+
+    if len(sources) == 0:
+        assert len(info['automatic_caption_languages']) == 0 and len(info['manual_caption_languages']) == 0
 
     return sources
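As an aside on how the language matching above behaves: the three helpers are self-contained, so they can be exercised on their own. The following sketch condenses them from the hunk above and runs them against hypothetical caption-language lists (the lists and the preferred language are made up for illustration):

# Illustrative sketch only; the caption-language lists are hypothetical.
def lang_eq(lang1, lang2):
    return lang1[0:2] == lang2[0:2]

def lang_in(lang, sequence):
    return lang[0:2] in (l[0:2] for l in sequence)

def equiv_lang_in(lang, sequence):
    lang = lang[0:2]
    for l in sequence:
        if l[0:2] == lang:
            return l
    return None

manual_caption_languages = ['en-GB', 'fr']   # hypothetical manual tracks
automatic_caption_languages = ['fr']         # hypothetical automatic track
pref_lang = 'en'

print(lang_eq('en', 'en-US'))                               # True: dialects compare equal
print(lang_in(pref_lang, manual_caption_languages))         # True: en matches en-GB
print(equiv_lang_in(pref_lang, manual_caption_languages))   # 'en-GB': the variant to request
print(lang_in(pref_lang, automatic_caption_languages))      # False: only fr is automatic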
youtube/yt_data_extract.py

@@ -309,6 +309,8 @@ def ajax_info(item_json):
 
 
 youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$')
 def normalize_url(url):
+    if url is None:
+        return None
     match = youtube_url_re.fullmatch(url)
     if match is None:
         raise Exception()
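The two added lines let normalize_url accept None, which matters because the captions base URL extracted further down may be absent. For reference, here is a small sketch of what the regex itself does; the example URLs are hypothetical:

import re

# Same pattern as youtube_url_re above: an optional scheme/host is stripped
# and only the path (plus query) is captured. Example URLs are hypothetical.
youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$')

for url in ('https://www.youtube.com/api/timedtext?v=abcdefghijk',
            '//www.youtube.com/api/timedtext?v=abcdefghijk',
            '/api/timedtext?v=abcdefghijk'):
    match = youtube_url_re.fullmatch(url)
    print(match.group(1))   # prints /api/timedtext?v=abcdefghijk each time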
@@ -1042,7 +1044,18 @@ def extract_watch_info_desktop(top_level):
 
     return info
 
-_SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
+def get_caption_url(info, language, format, automatic=False, translation_language=None):
+    '''Gets the URL for captions with the given language and format. If automatic is True, gets the automatic captions for that language. If translation_language is given, translates the captions from `language` to `translation_language`. If automatic is True and translation_language is given, the automatic captions will be translated.'''
+    url = info['_captions_base_url']
+    url += '&lang=' + language
+    url += '&fmt=' + format
+    if automatic:
+        url += '&kind=asr'
+    if translation_language:
+        url += '&tlang=' + translation_language
+    return url
+
+SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
 def extract_watch_info(polymer_json):
     info = {'playability_error': None, 'error': None}
 
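To make the query-string composition concrete, this sketch calls the new get_caption_url with a stand-in info dict; the '_captions_base_url' value is a made-up placeholder, not a URL extracted from a real video:

# Sketch only: the base URL below is a placeholder.
def get_caption_url(info, language, format, automatic=False, translation_language=None):
    url = info['_captions_base_url']
    url += '&lang=' + language
    url += '&fmt=' + format
    if automatic:
        url += '&kind=asr'
    if translation_language:
        url += '&tlang=' + translation_language
    return url

info = {'_captions_base_url': '/api/timedtext?v=abcdefghijk'}
print(get_caption_url(info, 'en', 'vtt'))
# /api/timedtext?v=abcdefghijk&lang=en&fmt=vtt
print(get_caption_url(info, 'fr', 'vtt', automatic=True, translation_language='en'))
# /api/timedtext?v=abcdefghijk&lang=fr&fmt=vtt&kind=asr&tlang=en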
@@ -1072,34 +1085,25 @@ def extract_watch_info(polymer_json):
     if playability_status not in (None, 'OK'):
         info['playability_error'] = playability_reason
 
-    # automatic captions
-
-    # adapted from youtube_dl:
-    # https://github.com/ytdl-org/youtube-dl/blob/76e510b92c4a1c4b0001f892504ba2cbb4b8d486/youtube_dl/extractor/youtube.py#L1490-#L1523
-    info['automatic_captions'] = {}
-
-    renderer = default_multi_get(player_response, 'captions', 'playerCaptionsTracklistRenderer', default={})
-    base_url = default_multi_get(renderer, 'captionTracks', 0, 'baseUrl')
-
-    if base_url and '?' in base_url:
-        base_url = normalize_url(base_url)
-        base_url_path, base_url_query_string = base_url.split('?')
-        url_info = urllib.parse.parse_qs(base_url_query_string)
-
-        for lang in renderer.get('translationLanguages', []):
-            lang_code = lang.get('languageCode')
-            if not lang_code:
-                continue
-            formats_for_this_lang = []
-            for ext in _SUBTITLE_FORMATS:
-                url_info['tlang'] = [lang_code]
-                url_info['fmt'] = [ext]
-                url = base_url_path + '?' + urllib.parse.urlencode(url_info, doseq=True)
-                formats_for_this_lang.append({
-                    'url': url,
-                    'ext': ext,
-                })
-            info['automatic_captions'][lang_code] = formats_for_this_lang
+    # captions
+    info['automatic_caption_languages'] = []
+    info['manual_caption_languages'] = []
+    info['translation_languages'] = []
+    captions_info = player_response.get('captions', {})
+    info['_captions_base_url'] = normalize_url(default_multi_get(captions_info, 'playerCaptionsRenderer', 'baseUrl'))
+    for caption_track in default_multi_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()):
+        lang_code = caption_track.get('languageCode')
+        if lang_code:
+            if caption_track.get('kind') == 'asr':
+                info['automatic_caption_languages'].append(lang_code)
+            else:
+                info['manual_caption_languages'].append(lang_code)
+    for translation_lang_info in default_multi_get(captions_info, 'playerCaptionsTracklistRenderer', 'translationLanguages', default=()):
+        lang_code = translation_lang_info.get('languageCode')
+        if lang_code:
+            info['translation_languages'].append(lang_code)
+        if translation_lang_info.get('isTranslatable') == False:
+            print('WARNING: Found non-translatable caption language')
 
     # formats
     streaming_data = player_response.get('streamingData', {})
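The shape of the caption metadata being read in this hunk can be illustrated with a trimmed-down, hypothetical fragment of player_response['captions']; the sketch uses plain dict access in place of the project's default_multi_get helper:

# Hypothetical, trimmed-down captions data in the shape consumed above.
captions_info = {
    'playerCaptionsTracklistRenderer': {
        'captionTracks': [
            {'languageCode': 'en', 'kind': 'asr'},   # automatic (speech recognition)
            {'languageCode': 'en'},                  # manual
            {'languageCode': 'de'},                  # manual
        ],
        'translationLanguages': [
            {'languageCode': 'en', 'isTranslatable': True},
            {'languageCode': 'es', 'isTranslatable': True},
        ],
    },
}

renderer = captions_info.get('playerCaptionsTracklistRenderer', {})
automatic, manual, translations = [], [], []
for track in renderer.get('captionTracks', ()):
    lang_code = track.get('languageCode')
    if lang_code:
        (automatic if track.get('kind') == 'asr' else manual).append(lang_code)
for lang in renderer.get('translationLanguages', ()):
    if lang.get('languageCode'):
        translations.append(lang['languageCode'])

print(automatic)      # ['en']
print(manual)         # ['en', 'de']
print(translations)   # ['en', 'es']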
@@ -1157,5 +1161,4 @@ def extract_watch_info(polymer_json):
 
     # other stuff
     info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
-    info['subtitles'] = {} # TODO
     return info