Add video transcript to downloads

Generated from the video captions
2020-07-25 19:40:37 -07:00
parent 44d7f9da99
commit 9fc347e093
2 changed files with 90 additions and 1 deletions
--- a/youtube/templates/watch.html
+++ b/youtube/templates/watch.html
@@ -381,6 +381,16 @@ Reload without invidious (for usage of new identity button).</a>
                        </a>
                    </li>
                {% endfor %}
                {% for download in other_downloads %}
                    <li class="download-format">
                        <a href="{{ download['url'] }}">
                            <ol class="format-attributes">
                                <li class="format-ext">{{ download['ext'] }}</li>
                                <li class="format-label">{{ download['label'] }}</li>
                            </ol>
                        </a>
                    </li>
                {% endfor %}
            </ul>
        </details>
        <input class="checkbox" name="video_info_list" value="{{ video_info }}" form="playlist-edit" type="checkbox">
--- a/youtube/watch.py
+++ b/youtube/watch.py
@@ -453,6 +453,22 @@ def get_watch_page(video_id=None):
        print('Comment count:', info['comment_count'])
        info['comment_count'] = None # hack to make it obvious there's a bug
    # captions and transcript
    subtitle_sources = get_subtitle_sources(info)
    other_downloads = []
    for source in subtitle_sources:
        best_caption_parse = urllib.parse.urlparse(
            source['url'].lstrip('/'))
        transcript_url = (util.URL_ORIGIN
            + '/watch/transcript'
            + best_caption_parse.path
            + '?' + best_caption_parse.query)
        other_downloads.append({
            'label': 'Video Transcript: ' + source['label'],
            'ext': 'txt',
            'url': transcript_url
        })
    return flask.render_template('watch.html',
        header_playlist_names   = local_playlist.get_playlist_names(),
        uploader_channel_url    = ('/' + info['author_url']) if info['author_url'] else '',
@@ -461,10 +477,11 @@ def get_watch_page(video_id=None):
        like_count    = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("like_count", None)),
        dislike_count = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("dislike_count", None)),
        download_formats        = download_formats,
        other_downloads         = other_downloads,
        video_info              = json.dumps(video_info),
        video_sources           = video_sources,
        hls_formats             = info['hls_formats'],
-        subtitle_sources        = get_subtitle_sources(info),
+        subtitle_sources        = subtitle_sources,
        related                 = info['related_videos'],
        playlist                = info['playlist'],
        music_list              = info['music_list'],
@@ -504,5 +521,67 @@ def get_captions(dummy):
    return result
 times_reg = re.compile(r'^\d\d:\d\d:\d\d\.\d\d\d --> \d\d:\d\d:\d\d\.\d\d\d.*$')
 inner_timestamp_removal_reg = re.compile(r'<[^>]+>')
@yt_app.route('/watch/transcript/<path:caption_path>')
 def get_transcript(caption_path):
    try:
        captions = util.fetch_url('https://www.youtube.com/'
            + caption_path
            + '?' + request.environ['QUERY_STRING']).decode('utf-8')
    except util.FetchError as e:
        msg = ('Error retrieving captions: ' + str(e) + '\n\n'
            + 'The caption url may have expired.')
        print(msg)
        return flask.Response(msg,
            status = e.code,
            mimetype='text/plain;charset=UTF-8')
    lines = captions.splitlines()
    segments = []
    # skip captions file header
    i = 0
    while lines[i] != '':
        i += 1
    current_segment = None
    while i < len(lines):
        line = lines[i]
        if line == '':
            if ((current_segment is not None)
                    and (current_segment['begin'] is not None)):
                segments.append(current_segment)
            current_segment = {
                'begin': None,
                'end': None,
                'lines': [],
            }
        elif times_reg.fullmatch(line.rstrip()):
            current_segment['begin'], current_segment['end'] = line.split(' --> ')
        else:
            current_segment['lines'].append(
                inner_timestamp_removal_reg.sub('', line))
        i += 1
    # if automatic captions, but not translated
    if request.args.get('kind') == 'asr' and not request.args.get('tlang'):
        # Automatic captions repeat content. The new segment is displayed
        # on the bottom row; the old one is displayed on the top row.
        # So grab the bottom row only
        for seg in segments:
            seg['text'] = seg['lines'][1]
    else:
        for seg in segments:
            seg['text'] = ' '.join(map(str.rstrip, seg['lines']))
    result = ''
    for seg in segments:
        if seg['text'] != ' ':
            result += seg['begin'] + ' ' + seg['text'] + '\r\n'
    return flask.Response(result.encode('utf-8'),
        mimetype='text/plain;charset=UTF-8')