Extraction: extract automatic captions

This commit is contained in:
James Taylor 2019-11-28 22:24:17 -08:00
parent 782a826397
commit 95da24a206

View File

@ -3,7 +3,7 @@ from youtube import util, proto
import html import html
import json import json
import re import re
import urllib import urllib.parse
import collections import collections
from math import ceil from math import ceil
import traceback import traceback
@ -1042,7 +1042,7 @@ def extract_watch_info_desktop(top_level):
return info return info
_SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
def extract_watch_info(polymer_json): def extract_watch_info(polymer_json):
info = {'playability_error': None, 'error': None} info = {'playability_error': None, 'error': None}
@ -1072,6 +1072,36 @@ def extract_watch_info(polymer_json):
if playability_status not in (None, 'OK'): if playability_status not in (None, 'OK'):
info['playability_error'] = playability_reason info['playability_error'] = playability_reason
# automatic captions
# adapted from youtube_dl:
# https://github.com/ytdl-org/youtube-dl/blob/76e510b92c4a1c4b0001f892504ba2cbb4b8d486/youtube_dl/extractor/youtube.py#L1490-#L1523
info['automatic_captions'] = {}
renderer = default_multi_get(player_response, 'captions', 'playerCaptionsTracklistRenderer', default={})
base_url = default_multi_get(renderer, 'captionTracks', 0, 'baseUrl')
if base_url and '?' in base_url:
base_url = normalize_url(base_url)
base_url_path, base_url_query_string = base_url.split('?')
url_info = urllib.parse.parse_qs(base_url_query_string)
for lang in renderer.get('translationLanguages', []):
lang_code = lang.get('languageCode')
if not lang_code:
continue
formats_for_this_lang = []
for ext in _SUBTITLE_FORMATS:
url_info['tlang'] = [lang_code]
url_info['fmt'] = [ext]
url = base_url_path + '?' + urllib.parse.urlencode(url_info, doseq=True)
formats_for_this_lang.append({
'url': url,
'ext': ext,
})
info['automatic_captions'][lang_code] = formats_for_this_lang
# formats
streaming_data = player_response.get('streamingData', {}) streaming_data = player_response.get('streamingData', {})
yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', []) yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', [])