security: harden code against command injection and path traversal

Core changes:

* enforce HTTPS URLs and remove shell usage in generate_release.py
* replace os.system calls with subprocess across the codebase
* validate external inputs (playlist names, video IDs)

Improvements and fixes:

* settings.py: fix typo (node.lineno → line_number); use isinstance() over type()
* youtube/get_app_version: improve git detection using subprocess.DEVNULL
* youtube/util.py: add cleanup helpers; use shutil.which for binary resolution

YouTube modules:

* watch.py: detect and flag HLS streams; remove unused audio_track_sources
* comments.py: return early when comments are disabled; add error handling
* local_playlist.py: validate playlist names to prevent path traversal
* subscriptions.py: replace asserts with proper error handling; validate video IDs

Cleanup:

* remove unused imports across modules (playlist, search, channel)
* reorganize package imports in youtube/__init__.py
* simplify test imports and fix cleanup_func in tests

Tests:

* tests/test_shorts.py: simplify imports
* tests/test_util.py: fix cleanup_func definition
This commit is contained in:
2026-04-20 00:39:35 -05:00
parent 155bd4df49
commit d6190a2d0b
16 changed files with 237 additions and 146 deletions

View File

@@ -44,6 +44,10 @@ def remove_files_with_extensions(path, extensions):
def download_if_not_exists(file_name, url, sha256=None): def download_if_not_exists(file_name, url, sha256=None):
if not os.path.exists('./' + file_name): if not os.path.exists('./' + file_name):
# Reject non-https URLs so a mistaken constant cannot cause a
# plaintext download (bandit B310 hardening).
if not url.startswith('https://'):
raise Exception('Refusing to download over non-https URL: ' + url)
log('Downloading ' + file_name + '..') log('Downloading ' + file_name + '..')
data = urllib.request.urlopen(url).read() data = urllib.request.urlopen(url).read()
log('Finished downloading ' + file_name) log('Finished downloading ' + file_name)
@@ -58,12 +62,14 @@ def download_if_not_exists(file_name, url, sha256=None):
log('Using existing ' + file_name) log('Using existing ' + file_name)
def wine_run_shell(command): def wine_run_shell(command):
# Keep argv-style invocation (no shell) to avoid command injection.
if os.name == 'posix': if os.name == 'posix':
check(os.system('wine ' + command.replace('\\', '/'))) parts = ['wine'] + command.replace('\\', '/').split()
elif os.name == 'nt': elif os.name == 'nt':
check(os.system(command)) parts = command.split()
else: else:
raise Exception('Unsupported OS') raise Exception('Unsupported OS')
check(subprocess.run(parts).returncode)
def wine_run(command_parts): def wine_run(command_parts):
if os.name == 'posix': if os.name == 'posix':
@@ -92,7 +98,20 @@ if os.path.exists('./yt-local'):
# confused with working directory. I'm calling it the same thing so it will # confused with working directory. I'm calling it the same thing so it will
# have that name when extracted from the final release zip archive) # have that name when extracted from the final release zip archive)
log('Making copy of yt-local files') log('Making copy of yt-local files')
check(os.system('git archive --format tar master | 7z x -si -ttar -oyt-local')) # Avoid the shell: pipe `git archive` into 7z directly via subprocess.
_git_archive = subprocess.Popen(
['git', 'archive', '--format', 'tar', 'master'],
stdout=subprocess.PIPE,
)
_sevenz = subprocess.Popen(
['7z', 'x', '-si', '-ttar', '-oyt-local'],
stdin=_git_archive.stdout,
)
_git_archive.stdout.close()
_sevenz.wait()
_git_archive.wait()
check(_sevenz.returncode)
check(_git_archive.returncode)
if len(os.listdir('./yt-local')) == 0: if len(os.listdir('./yt-local')) == 0:
raise Exception('Failed to copy yt-local files') raise Exception('Failed to copy yt-local files')
@@ -136,7 +155,7 @@ if os.path.exists('./python'):
log('Extracting python distribution') log('Extracting python distribution')
check(os.system(r'7z -y x -opython ' + python_dist_name)) check_subp(subprocess.run(['7z', '-y', 'x', '-opython', python_dist_name]))
log('Executing get-pip.py') log('Executing get-pip.py')
wine_run(['./python/python.exe', '-I', 'get-pip.py']) wine_run(['./python/python.exe', '-I', 'get-pip.py'])
@@ -241,7 +260,7 @@ if os.path.exists('./' + output_filename):
log('Removing previous zipped release') log('Removing previous zipped release')
os.remove('./' + output_filename) os.remove('./' + output_filename)
log('Zipping release') log('Zipping release')
check(os.system(r'7z -mx=9 a ' + output_filename + ' ./yt-local')) check_subp(subprocess.run(['7z', '-mx=9', 'a', output_filename, './yt-local']))
print('\n') print('\n')
log('Finished') log('Finished')

View File

@@ -1,22 +1,28 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# E402 is deliberately ignored in this file: `monkey.patch_all()` must run
# before any stdlib networking or gevent-dependent modules are imported.
from gevent import monkey from gevent import monkey
monkey.patch_all() monkey.patch_all()
import gevent.socket
from youtube import yt_app from youtube import yt_app
from youtube import util from youtube import util
# these are just so the files get run - they import yt_app and add routes to it # these are just so the files get run - they import yt_app and add routes to it
from youtube import watch, search, playlist, channel, local_playlist, comments, subscriptions from youtube import (
watch,
search,
playlist,
channel,
local_playlist,
comments,
subscriptions,
)
import settings import settings
from gevent.pywsgi import WSGIServer from gevent.pywsgi import WSGIServer
import urllib import urllib
import urllib3 import urllib3
import socket
import socks, sockshandler
import subprocess
import re import re
import sys import sys
import time import time
@@ -55,8 +61,6 @@ def proxy_site(env, start_response, video=False):
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)',
'Accept': '*/*', 'Accept': '*/*',
} }
current_range_start = 0
range_end = None
if 'HTTP_RANGE' in env: if 'HTTP_RANGE' in env:
send_headers['Range'] = env['HTTP_RANGE'] send_headers['Range'] = env['HTTP_RANGE']
@@ -274,6 +278,8 @@ class FilteredRequestLog:
if __name__ == '__main__': if __name__ == '__main__':
if settings.allow_foreign_addresses: if settings.allow_foreign_addresses:
# Binding to all interfaces is opt-in via the
# `allow_foreign_addresses` setting and documented as discouraged.
server = WSGIServer(('0.0.0.0', settings.port_number), site_dispatch, server = WSGIServer(('0.0.0.0', settings.port_number), site_dispatch,
log=FilteredRequestLog()) log=FilteredRequestLog())
ip_server = '0.0.0.0' ip_server = '0.0.0.0'

View File

@@ -264,7 +264,6 @@ For security reasons, enabling this is not recommended.''',
('use_video_download', { ('use_video_download', {
'type': int, 'type': int,
'default': 0, 'default': 0,
'comment': '',
'options': [ 'options': [
(0, 'Disabled'), (0, 'Disabled'),
(1, 'Enabled'), (1, 'Enabled'),
@@ -471,7 +470,7 @@ upgrade_functions = {
def log_ignored_line(line_number, message): def log_ignored_line(line_number, message):
print("WARNING: Ignoring settings.txt line " + str(node.lineno) + " (" + message + ")") print('WARNING: Ignoring settings.txt line ' + str(line_number) + ' (' + message + ')')
if os.path.isfile("settings.txt"): if os.path.isfile("settings.txt"):
@@ -511,17 +510,17 @@ else:
pass # Removed in Python 3.12+ pass # Removed in Python 3.12+
module_node = ast.parse(settings_text) module_node = ast.parse(settings_text)
for node in module_node.body: for node in module_node.body:
if type(node) != ast.Assign: if not isinstance(node, ast.Assign):
log_ignored_line(node.lineno, "only assignments are allowed") log_ignored_line(node.lineno, 'only assignments are allowed')
continue continue
if len(node.targets) > 1: if len(node.targets) > 1:
log_ignored_line(node.lineno, "only simple single-variable assignments allowed") log_ignored_line(node.lineno, 'only simple single-variable assignments allowed')
continue continue
target = node.targets[0] target = node.targets[0]
if type(target) != ast.Name: if not isinstance(target, ast.Name):
log_ignored_line(node.lineno, "only simple single-variable assignments allowed") log_ignored_line(node.lineno, 'only simple single-variable assignments allowed')
continue continue
if target.id not in acceptable_targets: if target.id not in acceptable_targets:

View File

@@ -11,8 +11,7 @@ import pytest
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
import youtube.proto as proto import youtube.proto as proto
from youtube.yt_data_extract.common import ( from youtube.yt_data_extract.common import (
extract_item_info, extract_items, extract_shorts_lockup_view_model_info, extract_item_info, extract_items,
extract_approx_int,
) )

View File

@@ -39,7 +39,8 @@ class NewIdentityState():
self.new_identities_till_success -= 1 self.new_identities_till_success -= 1
def fetch_url_response(self, *args, **kwargs): def fetch_url_response(self, *args, **kwargs):
cleanup_func = (lambda r: None) def cleanup_func(response):
return None
if self.new_identities_till_success == 0: if self.new_identities_till_success == 0:
return MockResponse(), cleanup_func return MockResponse(), cleanup_func
return MockResponse(body=html429, status=429), cleanup_func return MockResponse(body=html429, status=429), cleanup_func

View File

@@ -1,14 +1,17 @@
import logging
import os
import re
import traceback
from sys import exc_info
import flask
import jinja2
from flask import request
from flask_babel import Babel
from youtube import util from youtube import util
from .get_app_version import app_version from .get_app_version import app_version
import flask
from flask import request
import jinja2
import settings import settings
import traceback
import logging
import re
from sys import exc_info
from flask_babel import Babel
yt_app = flask.Flask(__name__) yt_app = flask.Flask(__name__)
yt_app.config['TEMPLATES_AUTO_RELOAD'] = True yt_app.config['TEMPLATES_AUTO_RELOAD'] = True
@@ -26,7 +29,6 @@ yt_app.logger.addFilter(FetchErrorFilter())
# yt_app.jinja_env.lstrip_blocks = True # yt_app.jinja_env.lstrip_blocks = True
# Configure Babel for i18n # Configure Babel for i18n
import os
yt_app.config['BABEL_DEFAULT_LOCALE'] = 'en' yt_app.config['BABEL_DEFAULT_LOCALE'] = 'en'
# Use absolute path for translations directory to avoid issues with package structure changes # Use absolute path for translations directory to avoid issues with package structure changes
_app_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) _app_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

View File

@@ -6,9 +6,7 @@ import settings
import urllib import urllib
import json import json
from string import Template
import youtube.proto as proto import youtube.proto as proto
import html
import math import math
import gevent import gevent
import re import re
@@ -293,7 +291,7 @@ def get_number_of_videos_channel(channel_id):
try: try:
response = util.fetch_url(url, headers_mobile, response = util.fetch_url(url, headers_mobile,
debug_name='number_of_videos', report_text='Got number of videos') debug_name='number_of_videos', report_text='Got number of videos')
except (urllib.error.HTTPError, util.FetchError) as e: except (urllib.error.HTTPError, util.FetchError):
traceback.print_exc() traceback.print_exc()
print("Couldn't retrieve number of videos") print("Couldn't retrieve number of videos")
return 1000 return 1000

View File

@@ -155,9 +155,13 @@ def post_process_comments_info(comments_info):
def video_comments(video_id, sort=0, offset=0, lc='', secret_key=''): def video_comments(video_id, sort=0, offset=0, lc='', secret_key=''):
try: if not settings.comments_mode:
if settings.comments_mode: return {}
# Initialize the result dict up-front so that any exception path below
# can safely attach an 'error' field without risking UnboundLocalError.
comments_info = {'error': None} comments_info = {'error': None}
try:
other_sort_url = ( other_sort_url = (
util.URL_ORIGIN + '/comments?ctoken=' util.URL_ORIGIN + '/comments?ctoken='
+ make_comment_ctoken(video_id, sort=1 - sort, lc=lc) + make_comment_ctoken(video_id, sort=1 - sort, lc=lc)
@@ -180,8 +184,6 @@ def video_comments(video_id, sort=0, offset=0, lc='', secret_key=''):
post_process_comments_info(comments_info) post_process_comments_info(comments_info)
return comments_info return comments_info
else:
return {}
except util.FetchError as e: except util.FetchError as e:
if e.code == '429' and settings.route_tor: if e.code == '429' and settings.route_tor:
comments_info['error'] = 'Error: YouTube blocked the request because the Tor exit node is overutilized.' comments_info['error'] = 'Error: YouTube blocked the request because the Tor exit node is overutilized.'

View File

@@ -1 +1,3 @@
from .get_app_version import * from .get_app_version import app_version
__all__ = ['app_version']

View File

@@ -1,47 +1,56 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from subprocess import (
call,
STDOUT
)
from ..version import __version__
import os import os
import shutil
import subprocess import subprocess
from ..version import __version__
def app_version(): def app_version():
def minimal_env_cmd(cmd): def minimal_env_cmd(cmd):
# make minimal environment # make minimal environment
env = {k: os.environ[k] for k in ['SYSTEMROOT', 'PATH'] if k in os.environ} env = {k: os.environ[k] for k in ['SYSTEMROOT', 'PATH'] if k in os.environ}
env.update({'LANGUAGE': 'C', 'LANG': 'C', 'LC_ALL': 'C'}) env.update({'LANGUAGE': 'C', 'LANG': 'C', 'LC_ALL': 'C'})
out = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env).communicate()[0] out = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env).communicate()[0]
return out return out
subst_list = { subst_list = {
"version": __version__, 'version': __version__,
"branch": None, 'branch': None,
"commit": None 'commit': None,
} }
if os.system("command -v git > /dev/null 2>&1") != 0: # Use shutil.which instead of `command -v`/os.system so we don't spawn a
# shell (CWE-78 hardening) and so it works cross-platform.
if shutil.which('git') is None:
return subst_list return subst_list
if call(["git", "branch"], stderr=STDOUT, stdout=open(os.devnull, 'w')) != 0: try:
# Check we are inside a git work tree. Using DEVNULL avoids the
# file-handle leak from `open(os.devnull, 'w')`.
rc = subprocess.call(
['git', 'branch'],
stderr=subprocess.DEVNULL,
stdout=subprocess.DEVNULL,
)
except OSError:
return subst_list
if rc != 0:
return subst_list return subst_list
describe = minimal_env_cmd(["git", "describe", "--tags", "--always"]) describe = minimal_env_cmd(['git', 'describe', '--tags', '--always'])
git_revision = describe.strip().decode('ascii') git_revision = describe.strip().decode('ascii')
branch = minimal_env_cmd(["git", "branch"]) branch = minimal_env_cmd(['git', 'branch'])
git_branch = branch.strip().decode('ascii').replace('* ', '') git_branch = branch.strip().decode('ascii').replace('* ', '')
subst_list.update({ subst_list.update({
"branch": git_branch, 'branch': git_branch,
"commit": git_revision 'commit': git_revision,
}) })
return subst_list return subst_list
if __name__ == "__main__": if __name__ == '__main__':
app_version() app_version()

View File

@@ -1,28 +1,42 @@
from youtube import util, yt_data_extract from youtube import util
from youtube import yt_app from youtube import yt_app
import settings import settings
import os import os
import json import json
import html
import gevent import gevent
import urllib
import math import math
import glob import glob
import re
import flask import flask
from flask import request from flask import request
playlists_directory = os.path.join(settings.data_dir, "playlists") playlists_directory = os.path.join(settings.data_dir, 'playlists')
thumbnails_directory = os.path.join(settings.data_dir, "playlist_thumbnails") thumbnails_directory = os.path.join(settings.data_dir, 'playlist_thumbnails')
# Whitelist accepted playlist names so user input cannot escape
# `playlists_directory` / `thumbnails_directory` (CWE-22, OWASP A01:2021).
# Allow letters, digits, spaces, dot, dash and underscore.
_PLAYLIST_NAME_RE = re.compile(r'^[\w .\-]{1,128}$')
def _validate_playlist_name(name):
'''Return the stripped name if safe, otherwise abort with 400.'''
if name is None:
flask.abort(400)
name = name.strip()
if not _PLAYLIST_NAME_RE.match(name):
flask.abort(400)
return name
def _find_playlist_path(name): def _find_playlist_path(name):
"""Find playlist file robustly, handling trailing spaces in filenames""" '''Find playlist file robustly, handling trailing spaces in filenames'''
name = name.strip() name = _validate_playlist_name(name)
pattern = os.path.join(playlists_directory, name + "*.txt") pattern = os.path.join(playlists_directory, name + '*.txt')
files = glob.glob(pattern) files = glob.glob(pattern)
return files[0] if files else os.path.join(playlists_directory, name + ".txt") return files[0] if files else os.path.join(playlists_directory, name + '.txt')
def _parse_playlist_lines(data): def _parse_playlist_lines(data):
@@ -179,8 +193,9 @@ def path_edit_playlist(playlist_name):
redirect_page_number = min(int(request.values.get('page', 1)), math.ceil(number_of_videos_remaining/50)) redirect_page_number = min(int(request.values.get('page', 1)), math.ceil(number_of_videos_remaining/50))
return flask.redirect(util.URL_ORIGIN + request.path + '?page=' + str(redirect_page_number)) return flask.redirect(util.URL_ORIGIN + request.path + '?page=' + str(redirect_page_number))
elif request.values['action'] == 'remove_playlist': elif request.values['action'] == 'remove_playlist':
safe_name = _validate_playlist_name(playlist_name)
try: try:
os.remove(os.path.join(playlists_directory, playlist_name + ".txt")) os.remove(os.path.join(playlists_directory, safe_name + '.txt'))
except OSError: except OSError:
pass pass
return flask.redirect(util.URL_ORIGIN + '/playlists') return flask.redirect(util.URL_ORIGIN + '/playlists')
@@ -220,8 +235,17 @@ def edit_playlist():
flask.abort(400) flask.abort(400)
_THUMBNAIL_RE = re.compile(r'^[A-Za-z0-9_-]{11}\.jpg$')
@yt_app.route('/data/playlist_thumbnails/<playlist_name>/<thumbnail>') @yt_app.route('/data/playlist_thumbnails/<playlist_name>/<thumbnail>')
def serve_thumbnail(playlist_name, thumbnail): def serve_thumbnail(playlist_name, thumbnail):
# .. is necessary because flask always uses the application directory at ./youtube, not the working directory # Validate both path components so a crafted URL cannot escape
# `thumbnails_directory` via `..` or NUL tricks (CWE-22).
safe_name = _validate_playlist_name(playlist_name)
if not _THUMBNAIL_RE.match(thumbnail):
flask.abort(400)
# .. is necessary because flask always uses the application directory at
# ./youtube, not the working directory.
return flask.send_from_directory( return flask.send_from_directory(
os.path.join('..', thumbnails_directory, playlist_name), thumbnail) os.path.join('..', thumbnails_directory, safe_name), thumbnail)

View File

@@ -3,9 +3,7 @@ from youtube import yt_app
import settings import settings
import base64 import base64
import urllib
import json import json
import string
import gevent import gevent
import math import math
from flask import request, abort from flask import request, abort

View File

@@ -5,7 +5,6 @@ import settings
import json import json
import urllib import urllib
import base64 import base64
import mimetypes
from flask import request from flask import request
import flask import flask
import os import os

View File

@@ -292,7 +292,10 @@ def youtube_timestamp_to_posix(dumb_timestamp):
def posix_to_dumbed_down(posix_time): def posix_to_dumbed_down(posix_time):
'''Inverse of youtube_timestamp_to_posix.''' '''Inverse of youtube_timestamp_to_posix.'''
delta = int(time.time() - posix_time) delta = int(time.time() - posix_time)
assert delta >= 0 # Guard against future timestamps (clock drift) without relying on
# `assert` (which is stripped under `python -O`).
if delta < 0:
delta = 0
if delta == 0: if delta == 0:
return '0 seconds ago' return '0 seconds ago'
@@ -531,7 +534,8 @@ def _get_upstream_videos(channel_id):
return None return None
root = defusedxml.ElementTree.fromstring(feed) root = defusedxml.ElementTree.fromstring(feed)
assert remove_bullshit(root.tag) == 'feed' if remove_bullshit(root.tag) != 'feed':
raise ValueError('Root element is not <feed>')
for entry in root: for entry in root:
if (remove_bullshit(entry.tag) != 'entry'): if (remove_bullshit(entry.tag) != 'entry'):
continue continue
@@ -539,13 +543,13 @@ def _get_upstream_videos(channel_id):
# it's yt:videoId in the xml but the yt: is turned into a namespace which is removed by remove_bullshit # it's yt:videoId in the xml but the yt: is turned into a namespace which is removed by remove_bullshit
video_id_element = find_element(entry, 'videoId') video_id_element = find_element(entry, 'videoId')
time_published_element = find_element(entry, 'published') time_published_element = find_element(entry, 'published')
assert video_id_element is not None if video_id_element is None or time_published_element is None:
assert time_published_element is not None raise ValueError('Missing videoId or published element')
time_published = int(calendar.timegm(time.strptime(time_published_element.text, '%Y-%m-%dT%H:%M:%S+00:00'))) time_published = int(calendar.timegm(time.strptime(time_published_element.text, '%Y-%m-%dT%H:%M:%S+00:00')))
times_published[video_id_element.text] = time_published times_published[video_id_element.text] = time_published
except AssertionError: except ValueError:
print('Failed to read atoma feed for ' + channel_status_name) print('Failed to read atoma feed for ' + channel_status_name)
traceback.print_exc() traceback.print_exc()
except defusedxml.ElementTree.ParseError: except defusedxml.ElementTree.ParseError:
@@ -593,7 +597,10 @@ def _get_upstream_videos(channel_id):
# Special case: none of the videos have a time published. # Special case: none of the videos have a time published.
# In this case, make something up # In this case, make something up
if videos and videos[0]['time_published'] is None: if videos and videos[0]['time_published'] is None:
assert all(v['time_published'] is None for v in videos) # Invariant: if the first video has no timestamp, earlier passes
# ensure all of them are unset. Don't rely on `assert`.
if not all(v['time_published'] is None for v in videos):
raise RuntimeError('Inconsistent time_published state')
now = time.time() now = time.time()
for i in range(len(videos)): for i in range(len(videos)):
# 1 month between videos # 1 month between videos
@@ -808,7 +815,8 @@ def import_subscriptions():
file = file.read().decode('utf-8') file = file.read().decode('utf-8')
try: try:
root = defusedxml.ElementTree.fromstring(file) root = defusedxml.ElementTree.fromstring(file)
assert root.tag == 'opml' if root.tag != 'opml':
raise ValueError('Root element is not <opml>')
channels = [] channels = []
for outline_element in root[0][0]: for outline_element in root[0][0]:
if (outline_element.tag != 'outline') or ('xmlUrl' not in outline_element.attrib): if (outline_element.tag != 'outline') or ('xmlUrl' not in outline_element.attrib):
@@ -819,7 +827,7 @@ def import_subscriptions():
channel_id = channel_rss_url[channel_rss_url.find('channel_id=')+11:].strip() channel_id = channel_rss_url[channel_rss_url.find('channel_id=')+11:].strip()
channels.append((channel_id, channel_name)) channels.append((channel_id, channel_name))
except (AssertionError, IndexError, defusedxml.ElementTree.ParseError) as e: except (ValueError, IndexError, defusedxml.ElementTree.ParseError):
return '400 Bad Request: Unable to read opml xml file, or the file is not the expected format', 400 return '400 Bad Request: Unable to read opml xml file, or the file is not the expected format', 400
elif mime_type in ('text/csv', 'application/vnd.ms-excel'): elif mime_type in ('text/csv', 'application/vnd.ms-excel'):
content = file.read().decode('utf-8') content = file.read().decode('utf-8')
@@ -1071,11 +1079,20 @@ def post_subscriptions_page():
return '', 204 return '', 204
# YouTube video IDs are exactly 11 chars from [A-Za-z0-9_-]. Enforce this
# before using the value in filesystem paths to prevent path traversal
# (CWE-22, OWASP A01:2021).
_VIDEO_ID_RE = re.compile(r'^[A-Za-z0-9_-]{11}$')
@yt_app.route('/data/subscription_thumbnails/<thumbnail>') @yt_app.route('/data/subscription_thumbnails/<thumbnail>')
def serve_subscription_thumbnail(thumbnail): def serve_subscription_thumbnail(thumbnail):
'''Serves thumbnail from disk if it's been saved already. If not, downloads the thumbnail, saves to disk, and serves it.''' '''Serves thumbnail from disk if it's been saved already. If not, downloads the thumbnail, saves to disk, and serves it.'''
assert thumbnail[-4:] == '.jpg' if not thumbnail.endswith('.jpg'):
flask.abort(400)
video_id = thumbnail[0:-4] video_id = thumbnail[0:-4]
if not _VIDEO_ID_RE.match(video_id):
flask.abort(400)
thumbnail_path = os.path.join(thumbnails_directory, thumbnail) thumbnail_path = os.path.join(thumbnails_directory, thumbnail)
if video_id in existing_thumbnails: if video_id in existing_thumbnails:

View File

@@ -1,5 +1,6 @@
from datetime import datetime from datetime import datetime
import logging import logging
import random
import settings import settings
import socks import socks
import sockshandler import sockshandler
@@ -19,11 +20,11 @@ import gevent.queue
import gevent.lock import gevent.lock
import collections import collections
import stem import stem
logger = logging.getLogger(__name__)
import stem.control import stem.control
import traceback import traceback
logger = logging.getLogger(__name__)
# The trouble with the requests library: It ships its own certificate bundle via certifi # The trouble with the requests library: It ships its own certificate bundle via certifi
# instead of using the system certificate store, meaning self-signed certificates # instead of using the system certificate store, meaning self-signed certificates
# configured by the user will not work. Some draconian networks block TLS unless a corporate # configured by the user will not work. Some draconian networks block TLS unless a corporate
@@ -54,8 +55,8 @@ import traceback
# https://github.com/kennethreitz/requests/issues/2966 # https://github.com/kennethreitz/requests/issues/2966
# Until then, I will use a mix of urllib3 and urllib. # Until then, I will use a mix of urllib3 and urllib.
import urllib3 import urllib3 # noqa: E402 (imported here intentionally after the long note above)
import urllib3.contrib.socks import urllib3.contrib.socks # noqa: E402
URL_ORIGIN = "/https://www.youtube.com" URL_ORIGIN = "/https://www.youtube.com"
@@ -177,7 +178,6 @@ def get_pool(use_tor):
class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler): class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler):
'''Separate cookiejars for receiving and sending''' '''Separate cookiejars for receiving and sending'''
def __init__(self, cookiejar_send=None, cookiejar_receive=None): def __init__(self, cookiejar_send=None, cookiejar_receive=None):
import http.cookiejar
self.cookiejar_send = cookiejar_send self.cookiejar_send = cookiejar_send
self.cookiejar_receive = cookiejar_receive self.cookiejar_receive = cookiejar_receive
@@ -208,6 +208,16 @@ class FetchError(Exception):
self.error_message = error_message self.error_message = error_message
def _noop_cleanup(response):
'''No-op cleanup used when the urllib opener owns the response.'''
return None
def _release_conn_cleanup(response):
'''Release the urllib3 pooled connection back to the pool.'''
response.release_conn()
def decode_content(content, encoding_header): def decode_content(content, encoding_header):
encodings = encoding_header.replace(' ', '').split(',') encodings = encoding_header.replace(' ', '').split(',')
for encoding in reversed(encodings): for encoding in reversed(encodings):
@@ -263,7 +273,7 @@ def fetch_url_response(url, headers=(), timeout=15, data=None,
opener = urllib.request.build_opener(cookie_processor) opener = urllib.request.build_opener(cookie_processor)
response = opener.open(req, timeout=timeout) response = opener.open(req, timeout=timeout)
cleanup_func = (lambda r: None) cleanup_func = _noop_cleanup
else: # Use a urllib3 pool. Cookies can't be used since urllib3 doesn't have easy support for them. else: # Use a urllib3 pool. Cookies can't be used since urllib3 doesn't have easy support for them.
# default: Retry.DEFAULT = Retry(3) # default: Retry.DEFAULT = Retry(3)
@@ -297,7 +307,7 @@ def fetch_url_response(url, headers=(), timeout=15, data=None,
error_message=msg) error_message=msg)
else: else:
raise raise
cleanup_func = (lambda r: r.release_conn()) cleanup_func = _release_conn_cleanup
return response, cleanup_func return response, cleanup_func
@@ -315,8 +325,6 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None,
Max retries: 5 attempts with exponential backoff Max retries: 5 attempts with exponential backoff
""" """
import random
max_retries = 5 max_retries = 5
base_delay = 1.0 # Base delay in seconds base_delay = 1.0 # Base delay in seconds
@@ -401,7 +409,7 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None,
logger.error(f'Server error {response.status} after {max_retries} retries') logger.error(f'Server error {response.status} after {max_retries} retries')
raise FetchError(str(response.status), reason=response.reason, ip=None) raise FetchError(str(response.status), reason=response.reason, ip=None)
# Exponential backoff for server errors # Exponential backoff for server errors. Non-crypto jitter.
delay = (base_delay * (2 ** attempt)) + random.uniform(0, 1) delay = (base_delay * (2 ** attempt)) + random.uniform(0, 1)
logger.warning(f'Server error ({response.status}). Waiting {delay:.1f}s before retry {attempt + 1}/{max_retries}...') logger.warning(f'Server error ({response.status}). Waiting {delay:.1f}s before retry {attempt + 1}/{max_retries}...')
time.sleep(delay) time.sleep(delay)
@@ -432,7 +440,7 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None,
else: else:
raise raise
# Wait and retry # Wait and retry. Non-crypto jitter.
delay = (base_delay * (2 ** attempt)) + random.uniform(0, 1) delay = (base_delay * (2 ** attempt)) + random.uniform(0, 1)
logger.warning(f'Connection error. Waiting {delay:.1f}s before retry {attempt + 1}/{max_retries}...') logger.warning(f'Connection error. Waiting {delay:.1f}s before retry {attempt + 1}/{max_retries}...')
time.sleep(delay) time.sleep(delay)
@@ -532,30 +540,30 @@ class RateLimitedQueue(gevent.queue.Queue):
def download_thumbnail(save_directory, video_id): def download_thumbnail(save_directory, video_id):
save_location = os.path.join(save_directory, video_id + ".jpg") save_location = os.path.join(save_directory, video_id + '.jpg')
for quality in ('hq720.jpg', 'sddefault.jpg', 'hqdefault.jpg'): for quality in ('hq720.jpg', 'sddefault.jpg', 'hqdefault.jpg'):
url = f"https://i.ytimg.com/vi/{video_id}/{quality}" url = f'https://i.ytimg.com/vi/{video_id}/{quality}'
try: try:
thumbnail = fetch_url(url, report_text="Saved thumbnail: " + video_id) thumbnail = fetch_url(url, report_text='Saved thumbnail: ' + video_id)
except FetchError as e: except FetchError as e:
if '404' in str(e): if '404' in str(e):
continue continue
print("Failed to download thumbnail for " + video_id + ": " + str(e)) print('Failed to download thumbnail for ' + video_id + ': ' + str(e))
return False return False
except urllib.error.HTTPError as e: except urllib.error.HTTPError as e:
if e.code == 404: if e.code == 404:
continue continue
print("Failed to download thumbnail for " + video_id + ": " + str(e)) print('Failed to download thumbnail for ' + video_id + ': ' + str(e))
return False return False
try: try:
f = open(save_location, 'wb') with open(save_location, 'wb') as f:
f.write(thumbnail)
except FileNotFoundError: except FileNotFoundError:
os.makedirs(save_directory, exist_ok=True) os.makedirs(save_directory, exist_ok=True)
f = open(save_location, 'wb') with open(save_location, 'wb') as f:
f.write(thumbnail) f.write(thumbnail)
f.close()
return True return True
print("No thumbnail available for " + video_id) print('No thumbnail available for ' + video_id)
return False return False

View File

@@ -1,27 +1,26 @@
import json
import logging
import math
import os
import re
import traceback
import urllib
from math import ceil
from types import SimpleNamespace
from urllib.parse import parse_qs, urlencode
import flask
import gevent
import urllib3.exceptions
from flask import request
import youtube import youtube
from youtube import yt_app from youtube import yt_app
from youtube import util, comments, local_playlist, yt_data_extract from youtube import util, comments, local_playlist, yt_data_extract
from youtube.util import time_utc_isoformat
import settings import settings
from flask import request
import flask
import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
import json
import gevent
import os
import math
import traceback
import urllib
import re
import urllib3.exceptions
from urllib.parse import parse_qs, urlencode
from types import SimpleNamespace
from math import ceil
try: try:
with open(os.path.join(settings.data_dir, 'decrypt_function_cache.json'), 'r') as f: with open(os.path.join(settings.data_dir, 'decrypt_function_cache.json'), 'r') as f:
@@ -62,7 +61,9 @@ def get_video_sources(info, target_resolution):
continue continue
if not (fmt['init_range'] and fmt['index_range']): if not (fmt['init_range'] and fmt['index_range']):
# Allow HLS-backed audio tracks (served locally, no init/index needed) # Allow HLS-backed audio tracks (served locally, no init/index needed)
if not fmt.get('url', '').startswith('http://127.') and not '/ytl-api/' in fmt.get('url', ''): url_value = fmt.get('url', '')
if (not url_value.startswith('http://127.')
and '/ytl-api/' not in url_value):
continue continue
# Mark as HLS for frontend # Mark as HLS for frontend
fmt['is_hls'] = True fmt['is_hls'] = True
def lang_in(lang, sequence):
    # True when lang's two-letter primary subtag matches the primary
    # subtag of any entry in sequence; None never matches anything.
    if lang is None:
        return False
    prefix = lang[:2]
    return any(entry[:2] == prefix for entry in sequence)
def lang_eq(lang1, lang2): def lang_eq(lang1, lang2):
def equiv_lang_in(lang, sequence):
    '''Extracts a language in sequence which is equivalent to lang.
    e.g. if lang is en, extracts en-GB from sequence.
    Necessary because if only a specific variant like en-GB is available, can't ask YouTube for simply en. Need to get the available variant.'''
    prefix = lang[:2]
    # First entry whose primary subtag matches, or None when absent.
    return next(
        (candidate for candidate in sequence if candidate[:2] == prefix),
        None,
    )
@@ -310,7 +311,15 @@ def get_subtitle_sources(info):
sources[-1]['on'] = True sources[-1]['on'] = True
if len(sources) == 0: if len(sources) == 0:
assert len(info['automatic_caption_languages']) == 0 and len(info['manual_caption_languages']) == 0 # Invariant: with no caption sources there should be no languages
# either. Don't rely on `assert` which is stripped under `python -O`.
if (len(info['automatic_caption_languages']) != 0
or len(info['manual_caption_languages']) != 0):
logger.warning(
'Unexpected state: no subtitle sources but %d auto / %d manual languages',
len(info['automatic_caption_languages']),
len(info['manual_caption_languages']),
)
return sources return sources
@@ -669,7 +678,6 @@ def format_bytes(bytes):
@yt_app.route('/ytl-api/audio-track-proxy') @yt_app.route('/ytl-api/audio-track-proxy')
def audio_track_proxy(): def audio_track_proxy():
"""Proxy for DASH audio tracks to avoid throttling.""" """Proxy for DASH audio tracks to avoid throttling."""
cache_key = request.args.get('id', '')
audio_url = request.args.get('url', '') audio_url = request.args.get('url', '')
if not audio_url: if not audio_url:
@@ -692,7 +700,7 @@ def audio_track_proxy():
@yt_app.route('/ytl-api/audio-track') @yt_app.route('/ytl-api/audio-track')
def get_audio_track(): def get_audio_track():
"""Proxy HLS audio/video: playlist or individual segment.""" """Proxy HLS audio/video: playlist or individual segment."""
from youtube.hls_cache import get_hls_url, _tracks from youtube.hls_cache import get_hls_url
cache_key = request.args.get('id', '') cache_key = request.args.get('id', '')
seg_url = request.args.get('seg', '') seg_url = request.args.get('seg', '')
@@ -916,7 +924,7 @@ def get_hls_manifest():
flask.abort(404, 'HLS manifest not found') flask.abort(404, 'HLS manifest not found')
try: try:
print(f'[hls-manifest] Fetching HLS manifest...') print('[hls-manifest] Fetching HLS manifest...')
manifest = util.fetch_url(hls_url, manifest = util.fetch_url(hls_url,
headers=(('User-Agent', 'Mozilla/5.0'),), headers=(('User-Agent', 'Mozilla/5.0'),),
debug_name='hls_manifest').decode('utf-8') debug_name='hls_manifest').decode('utf-8')
@@ -1018,7 +1026,8 @@ def get_storyboard_vtt():
for i, board in enumerate(boards): for i, board in enumerate(boards):
*t, _, sigh = board.split("#") *t, _, sigh = board.split("#")
width, height, count, width_cnt, height_cnt, interval = map(int, t) width, height, count, width_cnt, height_cnt, interval = map(int, t)
if height != wanted_height: continue if height != wanted_height:
continue
q['sigh'] = [sigh] q['sigh'] = [sigh]
url = f"{base_url}?{urlencode(q, doseq=True)}" url = f"{base_url}?{urlencode(q, doseq=True)}"
storyboard = SimpleNamespace( storyboard = SimpleNamespace(
@@ -1182,7 +1191,6 @@ def get_watch_page(video_id=None):
uni_sources = video_sources['uni_sources'] uni_sources = video_sources['uni_sources']
pair_sources = video_sources['pair_sources'] pair_sources = video_sources['pair_sources']
pair_idx = video_sources['pair_idx'] pair_idx = video_sources['pair_idx']
audio_track_sources = video_sources['audio_track_sources']
# Build audio tracks list from HLS # Build audio tracks list from HLS
audio_tracks = [] audio_tracks = []