Add video transcript to downloads

Generated from the video captions
This commit is contained in:
James Taylor 2020-07-25 19:40:37 -07:00
parent 44d7f9da99
commit 9fc347e093
2 changed files with 90 additions and 1 deletions

View File

@ -381,6 +381,16 @@ Reload without invidious (for usage of new identity button).</a>
</a>
</li>
{% endfor %}
{# Extra (non-video) downloads: each entry must provide 'url', 'ext' and 'label' #}
{% for download in other_downloads %}
<li class="download-format">
<a href="{{ download['url'] }}">
<ol class="format-attributes">
<li class="format-ext">{{ download['ext'] }}</li>
<li class="format-label">{{ download['label'] }}</li>
</ol>
</a>
</li>
{% endfor %}
</ul>
</details>
<input class="checkbox" name="video_info_list" value="{{ video_info }}" form="playlist-edit" type="checkbox">

View File

@ -453,6 +453,22 @@ def get_watch_page(video_id=None):
print('Comment count:', info['comment_count'])
info['comment_count'] = None # hack to make it obvious there's a bug
# captions and transcript
subtitle_sources = get_subtitle_sources(info)
other_downloads = []
for source in subtitle_sources:
best_caption_parse = urllib.parse.urlparse(
source['url'].lstrip('/'))
transcript_url = (util.URL_ORIGIN
+ '/watch/transcript'
+ best_caption_parse.path
+ '?' + best_caption_parse.query)
other_downloads.append({
'label': 'Video Transcript: ' + source['label'],
'ext': 'txt',
'url': transcript_url
})
return flask.render_template('watch.html',
header_playlist_names = local_playlist.get_playlist_names(),
uploader_channel_url = ('/' + info['author_url']) if info['author_url'] else '',
@ -461,10 +477,11 @@ def get_watch_page(video_id=None):
like_count = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("like_count", None)),
dislike_count = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("dislike_count", None)),
download_formats = download_formats,
other_downloads = other_downloads,
video_info = json.dumps(video_info),
video_sources = video_sources,
hls_formats = info['hls_formats'],
subtitle_sources = get_subtitle_sources(info),
subtitle_sources = subtitle_sources,
related = info['related_videos'],
playlist = info['playlist'],
music_list = info['music_list'],
@ -504,5 +521,67 @@ def get_captions(dummy):
return result
# Matches a caption cue timing line of the form
# "HH:MM:SS.mmm --> HH:MM:SS.mmm", optionally followed by trailing text
# on the same line.
times_reg = re.compile(r'^\d\d:\d\d:\d\d\.\d\d\d --> \d\d:\d\d:\d\d\.\d\d\d.*$')
# Matches any <...> tag; get_transcript substitutes these with '' so that
# only the plain caption text remains.
inner_timestamp_removal_reg = re.compile(r'<[^>]+>')
@yt_app.route('/watch/transcript/<path:caption_path>')
def get_transcript(caption_path):
    """Return a plain-text transcript generated from a captions file.

    The captions are fetched from https://www.youtube.com/<caption_path>
    with the query string of the incoming request forwarded unchanged.
    Each caption cue becomes one output line of the form
    "<begin timestamp> <text>" terminated by CRLF.

    Args:
        caption_path: Path portion of the caption url (captured by the
            route), relative to https://www.youtube.com/.

    Returns:
        A flask.Response with mimetype text/plain (UTF-8). If fetching the
        captions raises util.FetchError, the response carries an error
        message and uses the error's code as the HTTP status.
    """
    try:
        captions = util.fetch_url('https://www.youtube.com/'
            + caption_path
            + '?' + request.environ['QUERY_STRING']).decode('utf-8')
    except util.FetchError as e:
        msg = ('Error retrieving captions: ' + str(e) + '\n\n'
            + 'The caption url may have expired.')
        print(msg)
        return flask.Response(msg,
            status = e.code,
            mimetype='text/plain;charset=UTF-8')

    lines = captions.splitlines()
    segments = []

    # Skip the captions file header (everything up to the first blank line).
    # The bounds check keeps an empty or header-only response from raising
    # IndexError.
    i = 0
    while i < len(lines) and lines[i] != '':
        i += 1

    current_segment = None
    while i < len(lines):
        line = lines[i]
        if line == '':
            # A blank line ends the current cue; keep it only if a timing
            # line was actually seen for it.
            if ((current_segment is not None)
                    and (current_segment['begin'] is not None)):
                segments.append(current_segment)
            current_segment = {
                'begin': None,
                'end': None,
                'lines': [],
            }
        elif times_reg.fullmatch(line.rstrip()):
            # Split only once so the unpacking cannot fail if the trailing
            # part of the timing line happens to contain ' --> ' again.
            current_segment['begin'], current_segment['end'] = (
                line.split(' --> ', 1))
        else:
            current_segment['lines'].append(
                inner_timestamp_removal_reg.sub('', line))
        i += 1

    # The file may end without a trailing blank line; flush the final cue
    # so it is not silently dropped.
    if ((current_segment is not None)
            and (current_segment['begin'] is not None)):
        segments.append(current_segment)

    # if automatic captions, but not translated
    if request.args.get('kind') == 'asr' and not request.args.get('tlang'):
        # Automatic captions repeat content. The new segment is displayed
        # on the bottom row; the old one is displayed on the top row.
        # So grab the bottom row only
        for seg in segments:
            rows = seg['lines']
            if len(rows) > 1:
                seg['text'] = rows[1]
            else:
                # Cue without a second row: use whatever text there is
                # instead of raising IndexError.
                seg['text'] = ' '.join(map(str.rstrip, rows))
    else:
        for seg in segments:
            seg['text'] = ' '.join(map(str.rstrip, seg['lines']))

    # Cues whose text is empty or whitespace-only are omitted.
    result = ''.join(
        seg['begin'] + ' ' + seg['text'] + '\r\n'
        for seg in segments
        if seg['text'].strip()
    )

    return flask.Response(result.encode('utf-8'),
        mimetype='text/plain;charset=UTF-8')