Add video transcript to downloads
Generated from the video captions
This commit is contained in:
parent
44d7f9da99
commit
9fc347e093
@ -381,6 +381,16 @@ Reload without invidious (for usage of new identity button).</a>
|
|||||||
</a>
|
</a>
|
||||||
</li>
|
</li>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
|
{% for download in other_downloads %}
|
||||||
|
<li class="download-format">
|
||||||
|
<a href="{{ download['url'] }}">
|
||||||
|
<ol class="format-attributes">
|
||||||
|
<li class="format-ext">{{ download['ext'] }}</li>
|
||||||
|
<li class="format-label">{{ download['label'] }}</li>
|
||||||
|
</ol>
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
{% endfor %}
|
||||||
</ul>
|
</ul>
|
||||||
</details>
|
</details>
|
||||||
<input class="checkbox" name="video_info_list" value="{{ video_info }}" form="playlist-edit" type="checkbox">
|
<input class="checkbox" name="video_info_list" value="{{ video_info }}" form="playlist-edit" type="checkbox">
|
||||||
|
@ -453,6 +453,22 @@ def get_watch_page(video_id=None):
|
|||||||
print('Comment count:', info['comment_count'])
|
print('Comment count:', info['comment_count'])
|
||||||
info['comment_count'] = None # hack to make it obvious there's a bug
|
info['comment_count'] = None # hack to make it obvious there's a bug
|
||||||
|
|
||||||
|
# captions and transcript
|
||||||
|
subtitle_sources = get_subtitle_sources(info)
|
||||||
|
other_downloads = []
|
||||||
|
for source in subtitle_sources:
|
||||||
|
best_caption_parse = urllib.parse.urlparse(
|
||||||
|
source['url'].lstrip('/'))
|
||||||
|
transcript_url = (util.URL_ORIGIN
|
||||||
|
+ '/watch/transcript'
|
||||||
|
+ best_caption_parse.path
|
||||||
|
+ '?' + best_caption_parse.query)
|
||||||
|
other_downloads.append({
|
||||||
|
'label': 'Video Transcript: ' + source['label'],
|
||||||
|
'ext': 'txt',
|
||||||
|
'url': transcript_url
|
||||||
|
})
|
||||||
|
|
||||||
return flask.render_template('watch.html',
|
return flask.render_template('watch.html',
|
||||||
header_playlist_names = local_playlist.get_playlist_names(),
|
header_playlist_names = local_playlist.get_playlist_names(),
|
||||||
uploader_channel_url = ('/' + info['author_url']) if info['author_url'] else '',
|
uploader_channel_url = ('/' + info['author_url']) if info['author_url'] else '',
|
||||||
@ -461,10 +477,11 @@ def get_watch_page(video_id=None):
|
|||||||
like_count = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("like_count", None)),
|
like_count = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("like_count", None)),
|
||||||
dislike_count = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("dislike_count", None)),
|
dislike_count = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("dislike_count", None)),
|
||||||
download_formats = download_formats,
|
download_formats = download_formats,
|
||||||
|
other_downloads = other_downloads,
|
||||||
video_info = json.dumps(video_info),
|
video_info = json.dumps(video_info),
|
||||||
video_sources = video_sources,
|
video_sources = video_sources,
|
||||||
hls_formats = info['hls_formats'],
|
hls_formats = info['hls_formats'],
|
||||||
subtitle_sources = get_subtitle_sources(info),
|
subtitle_sources = subtitle_sources,
|
||||||
related = info['related_videos'],
|
related = info['related_videos'],
|
||||||
playlist = info['playlist'],
|
playlist = info['playlist'],
|
||||||
music_list = info['music_list'],
|
music_list = info['music_list'],
|
||||||
@ -504,5 +521,67 @@ def get_captions(dummy):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
times_reg = re.compile(r'^\d\d:\d\d:\d\d\.\d\d\d --> \d\d:\d\d:\d\d\.\d\d\d.*$')
|
||||||
|
inner_timestamp_removal_reg = re.compile(r'<[^>]+>')
|
||||||
|
@yt_app.route('/watch/transcript/<path:caption_path>')
|
||||||
|
def get_transcript(caption_path):
|
||||||
|
try:
|
||||||
|
captions = util.fetch_url('https://www.youtube.com/'
|
||||||
|
+ caption_path
|
||||||
|
+ '?' + request.environ['QUERY_STRING']).decode('utf-8')
|
||||||
|
except util.FetchError as e:
|
||||||
|
msg = ('Error retrieving captions: ' + str(e) + '\n\n'
|
||||||
|
+ 'The caption url may have expired.')
|
||||||
|
print(msg)
|
||||||
|
return flask.Response(msg,
|
||||||
|
status = e.code,
|
||||||
|
mimetype='text/plain;charset=UTF-8')
|
||||||
|
|
||||||
|
lines = captions.splitlines()
|
||||||
|
segments = []
|
||||||
|
|
||||||
|
# skip captions file header
|
||||||
|
i = 0
|
||||||
|
while lines[i] != '':
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
current_segment = None
|
||||||
|
while i < len(lines):
|
||||||
|
line = lines[i]
|
||||||
|
if line == '':
|
||||||
|
if ((current_segment is not None)
|
||||||
|
and (current_segment['begin'] is not None)):
|
||||||
|
segments.append(current_segment)
|
||||||
|
current_segment = {
|
||||||
|
'begin': None,
|
||||||
|
'end': None,
|
||||||
|
'lines': [],
|
||||||
|
}
|
||||||
|
elif times_reg.fullmatch(line.rstrip()):
|
||||||
|
current_segment['begin'], current_segment['end'] = line.split(' --> ')
|
||||||
|
else:
|
||||||
|
current_segment['lines'].append(
|
||||||
|
inner_timestamp_removal_reg.sub('', line))
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
# if automatic captions, but not translated
|
||||||
|
if request.args.get('kind') == 'asr' and not request.args.get('tlang'):
|
||||||
|
# Automatic captions repeat content. The new segment is displayed
|
||||||
|
# on the bottom row; the old one is displayed on the top row.
|
||||||
|
# So grab the bottom row only
|
||||||
|
for seg in segments:
|
||||||
|
seg['text'] = seg['lines'][1]
|
||||||
|
else:
|
||||||
|
for seg in segments:
|
||||||
|
seg['text'] = ' '.join(map(str.rstrip, seg['lines']))
|
||||||
|
|
||||||
|
result = ''
|
||||||
|
for seg in segments:
|
||||||
|
if seg['text'] != ' ':
|
||||||
|
result += seg['begin'] + ' ' + seg['text'] + '\r\n'
|
||||||
|
|
||||||
|
return flask.Response(result.encode('utf-8'),
|
||||||
|
mimetype='text/plain;charset=UTF-8')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user