yt-local/server.py

#!/usr/bin/env python3
from gevent import monkey
monkey.patch_all()
import gevent.socket

from youtube import yt_app
from youtube import util

# these are just so the files get run - they import yt_app and add routes to it
from youtube import watch, search, playlist, channel, local_playlist, comments, subscriptions

import settings

from gevent.pywsgi import WSGIServer
import urllib
import urllib3
import socket
import socks, sockshandler
import subprocess
import re
import sys
import time


def youtu_be(env, start_response):
    '''Serve a youtu.be short link as the equivalent /watch request.

    Everything after the leading "/" of PATH_INFO is taken as the video id.
    The request is rewritten in place (PATH_INFO becomes "/watch" and
    "v=<id>" is added to QUERY_STRING, after any existing parameters) and
    then delegated to yt_app.
    '''
    video_id = env['PATH_INFO'][1:]
    env['PATH_INFO'] = '/watch'

    video_param = 'v=' + video_id
    existing_query = env['QUERY_STRING']
    if existing_query:
        env['QUERY_STRING'] = existing_query + '&' + video_param
    else:
        env['QUERY_STRING'] = video_param

    yield from yt_app(env, start_response)


RANGE_RE = re.compile(r'bytes=(\d+-(?:\d+)?)')
def parse_range(range_header, content_length):
    '''Convert a Range header value into a (start_byte, end_byte) tuple.

    Two forms are understood: "bytes=200-1000" and the open-ended
    "bytes=200-". For the open-ended form the end is computed as
    start + content_length - 1, i.e. content_length is taken to be the
    number of bytes available from start onwards.

    Returns None (after printing a message) when the header has any other
    format.
    '''
    parsed = RANGE_RE.fullmatch(range_header.strip())
    if parsed is None:
        print('Unsupported range header format:', range_header)
        return None

    # The matched group always contains exactly one '-'
    first, _, last = parsed.group(1).partition('-')
    start_byte = int(first)
    end_byte = int(last) if last else start_byte + content_length - 1
    return start_byte, end_byte


def proxy_site(env, start_response, video=False):
    '''Generator WSGI handler that relays an upstream https resource.

    The upstream url is "https://" + SERVER_NAME + PATH_INFO, plus
    "?" + QUERY_STRING when the query string is non-empty. It is fetched
    with util.fetch_url_response, forwarding the client's Range header if
    there is one. The upstream status line and headers are handed to
    start_response once, on the first attempt only, and the body is yielded
    in chunks read with response.read(32*8192).

    If the upstream stream ends before Content-Length bytes were received,
    the missing tail is requested again with a Range header. A given byte
    position is tried at most three times; the counter starts over whenever
    a retry fails at a different position than the previous one.

    When video is true:
      * the trailing "/name/..." part of a "/videoplayback/name/" url is
        dropped,
      * use_tor is passed to the fetch as true if settings.route_tor == 2
        or the use_tor query parameter is a nonzero integer,
      * max_redirects=10 is passed to the fetch,
      * an "Access-Control-Allow-Origin: *" header is appended.

    NOTE(review): cleanup_func(response) is only reached when the read loop
    ends normally. If the consumer stops iterating early or response.read
    raises, it is skipped - confirm whether that leaks the connection.
    '''
    send_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)',
        'Accept': '*/*',
    }
    # NOTE(review): these two locals are assigned but never read below
    current_range_start = 0
    range_end = None
    if 'HTTP_RANGE' in env:
        send_headers['Range'] = env['HTTP_RANGE']

    url = "https://" + env['SERVER_NAME'] + env['PATH_INFO']
    # remove /name portion
    if video and '/videoplayback/name/' in url:
        url = url[0:url.rfind('/name/')]
    if env['QUERY_STRING']:
        url += '?' + env['QUERY_STRING']

    try_num = 1
    first_attempt = True
    # byte position at which the most recent attempt was cut short
    current_attempt_position = 0
    while try_num <= 3:   # Try a given byte position three times
        if not first_attempt:
            print('(Try %d)' % try_num, 'Trying with', send_headers['Range'])

        if video:
            params = urllib.parse.parse_qs(env['QUERY_STRING'])
            # parse_qs values are lists; the '0' default is a string, so
            # [0] yields '0' in both cases
            params_use_tor = int(params.get('use_tor', '0')[0])
            use_tor = (settings.route_tor == 2) or params_use_tor
            response, cleanup_func = util.fetch_url_response(url, send_headers,
                                                             use_tor=use_tor,
                                                             max_redirects=10)
        else:
            response, cleanup_func = util.fetch_url_response(url, send_headers)

        response_headers = response.getheaders()
        if isinstance(response_headers, urllib3._collections.HTTPHeaderDict):
            response_headers = response_headers.items()
        if video:
            response_headers = (list(response_headers)
                                +[('Access-Control-Allow-Origin', '*')])

        # Status and headers go to the client only once; retries just
        # continue the body that was already started.
        if first_attempt:
            start_response(str(response.status) + ' ' + response.reason,
                           response_headers)

        # NOTE(review): this lookup is case-sensitive. If the header is
        # missing or spelled differently, content_length is 0 and the
        # truncation check below can never trigger.
        content_length = int(dict(response_headers).get('Content-Length', 0))
        if response.status >= 400:
            print('Error: YouTube returned "%d %s" while routing %s' % (
                response.status, response.reason, url.split('?')[0]))

        # bytes received from this attempt's response (not the whole file)
        total_received = 0
        retry = False
        while True:
            # a bit over 3 seconds of 360p video
            # we want each TCP packet to transmit in large multiples,
            # such as 65,536, so we shouldn't read in small chunks
            # such as 8192 lest that causes the socket library to limit the
            # TCP window size
            # Might need fine-tuning, since this gives us 4*65536
            # The tradeoff is that larger values (such as 6 seconds) only
            # allows video to buffer in those increments, meaning user must
            # wait until the entire chunk is downloaded before video starts
            # playing
            content_part = response.read(32*8192)
            total_received += len(content_part)
            if not content_part:
                # Sometimes YouTube closes the connection before sending all of
                # the content. Retry with a range request for the missing
                # content. See
                # https://github.com/user234683/youtube-local/issues/40
                if total_received < content_length:
                    if 'Range' in send_headers:
                        int_range = parse_range(send_headers['Range'],
                                                content_length)
                        if not int_range: # give up b/c unrecognized range
                            break
                        start, end = int_range
                    else:
                        start, end = 0, (content_length - 1)

                    # first byte that is still missing; the next attempt
                    # asks for everything from here to the original end
                    fail_byte = start + total_received
                    send_headers['Range'] = 'bytes=%d-%d' % (fail_byte, end)
                    print(
                        'Warning: YouTube closed the connection before byte',
                        str(fail_byte) + '.', 'Expected', start+content_length,
                        'bytes.'
                    )

                    retry = True
                    first_attempt = False
                    # Failing at the same position again counts against the
                    # three tries; any progress starts the count over.
                    if fail_byte == current_attempt_position:
                        try_num += 1
                    else:
                        try_num = 1
                        current_attempt_position = fail_byte
                break
            yield content_part
        cleanup_func(response)
        if retry:
            # YouTube will return 503 Service Unavailable if you do a bunch
            # of range requests too quickly.
            time.sleep(1)
            continue
        else:
            break
    else: # no break
        # reached only when try_num exceeded 3 without a complete response
        print('Error: YouTube closed the connection before',
              'providing all content. Retried three times:', url.split('?')[0])


def proxy_video(env, start_response):
    '''Relay a media request through proxy_site with video=True.

    This exists so it can be registered as a handler taking only
    (env, start_response).
    '''
    video_chunks = proxy_site(env, start_response, video=True)
    yield from video_chunks


# Maps a domain suffix to the WSGI handler that serves it.
# site_dispatch looks the requested host up by progressively longer suffixes
# (for i.ytimg.com: 'com', then 'ytimg.com', then 'i.ytimg.com') and uses the
# first one found here, so each entry also covers all of its subdomains.
site_handlers = {
    'youtube.com': yt_app,
    'youtube-nocookie.com': yt_app,
    'youtu.be': youtu_be,
    'ytimg.com': proxy_site,
    'yt3.ggpht.com': proxy_site,
    'lh3.googleusercontent.com': proxy_site,
    'sponsor.ajay.app': proxy_site,
    'googlevideo.com': proxy_video,
}


def split_url(url):
    '''Separate a url into its host name and everything after it.

    Example: https://sub.example.com/foo/bar.html gives
    ('sub.example.com', '/foo/bar.html'). The http:// or https:// prefix is
    optional, the host must contain at least one dot, and the second element
    is '' when nothing follows the host.

    Raises ValueError if the url does not have this shape.
    '''
    # XXX: not yet verified that this pattern is safe from ReDoS
    pattern = r'(?:https?://)?([\w-]+(?:\.[\w-]+)+?)(/.*|$)'
    url_parts = re.match(pattern, url)
    if url_parts is not None:
        host, remainder = url_parts.groups()
        return host, remainder

    raise ValueError('Invalid or unsupported url: ' + url)


def error_code(code, start_response):
    '''Start a response with status `code` and return its body.

    code is a full status line such as '404 Not Found'. The response is
    started with no headers, and the same text, encoded to bytes, is
    returned for the caller to send as the body.
    '''
    # PEP 3333 requires the headers argument to be a real list (not a
    # tuple or other sequence), so pass [] rather than ().
    start_response(code, [])
    return code.encode()


def site_dispatch(env, start_response):
    '''WSGI application: route a /<url> request to the handler for its site.

    The request path is expected to hold the target url after the leading
    "/", e.g. /https://www.youtube.com/watch. The host of that url is
    matched against site_handlers by progressively longer domain suffixes,
    and the first handler found serves the request with SERVER_NAME and
    PATH_INFO rewritten to the target's host and path.

    Responses produced here directly:
      * 403 for POST requests from a non-loopback client unless
        settings.allow_foreign_post_requests is set,
      * 302 to /https://youtube.com for an empty or "/" path,
      * 404 when the path is not a usable url or no handler matches,
      * 500 when anything raises; the exception is re-raised afterwards so
        the server can log it.
    '''
    client_address = env['REMOTE_ADDR']
    try:
        # correct malformed query string with ? separators instead of &
        # QUERY_STRING may be missing from the environ; store '' in that
        # case because the handlers index env['QUERY_STRING'] directly.
        env['QUERY_STRING'] = env.get('QUERY_STRING', '').replace('?', '&')

        # Fix PATH_INFO for UWSGI
        if 'REQUEST_URI' in env:
            env['PATH_INFO'] = urllib.parse.unquote(
                env['REQUEST_URI'].split('?')[0]
            )

        method = env['REQUEST_METHOD']
        path = env['PATH_INFO']

        if (method == "POST"
                and client_address not in ('127.0.0.1', '::1')
                and not settings.allow_foreign_post_requests):
            yield error_code('403 Forbidden', start_response)
            return

        # redirect localhost:8080 to localhost:8080/https://youtube.com
        if path == '' or path == '/':
            start_response('302 Found', [('Location', '/https://youtube.com')])
            return

        try:
            env['SERVER_NAME'], env['PATH_INFO'] = split_url(path[1:])
        except ValueError:
            yield error_code('404 Not Found', start_response)
            return

        # Grow the candidate suffix one label at a time from the right
        # ('com', 'youtube.com', 'www.youtube.com') until a handler matches.
        base_name = ''
        for domain in reversed(env['SERVER_NAME'].split('.')):
            if base_name == '':
                base_name = domain
            else:
                base_name = domain + '.' + base_name

            try:
                handler = site_handlers[base_name]
            except KeyError:
                continue
            else:
                yield from handler(env, start_response)
                break
        else:   # did not break
            yield error_code('404 Not Found', start_response)
            return
    except Exception:
        # A handler may already have called start_response before failing.
        # PEP 3333 only allows a second call when exc_info is supplied: the
        # server then replaces the headers if none were sent yet, or
        # re-raises the original error if the response is already under way.
        # The headers argument must be a real list.
        start_response('500 Internal Server Error', [], sys.exc_info())
        yield b'500 Internal Server Error'
        raise


class FilteredRequestLog:
    '''Don't log noisy thumbnail and avatar requests

    Used as the server's log object: every line passed to write() goes to
    stderr unless it is a successful (200) GET for a thumbnail, avatar,
    subscription thumbnail or timedtext url.
    '''
    # The pattern is in verbose mode, where unescaped whitespace is ignored,
    # so the literal spaces of the log line are written as [ ].
    # There is no ^ anchor: the quoted request line is preceded by the
    # client address and timestamp in gevent's access-log format, so
    # search() has to find it in the middle of the line.
    filter_re = re.compile(r"""(?x)
                            "GET[ ]/https://(i[.]ytimg[.]com/|
                            www[.]youtube[.]com/data/subscription_thumbnails/|
                            yt3[.]ggpht[.]com/|
                            www[.]youtube[.]com/api/timedtext).*"[ ]200
                            """)

    def write(self, s):
        '''Forward the log line s to stderr unless the filter matches it.'''
        if not self.filter_re.search(s):
            sys.stderr.write(s)


if __name__ == '__main__':
    # Listen on every interface only when foreign addresses are allowed;
    # otherwise stay on the loopback interface.
    ip_server = '0.0.0.0' if settings.allow_foreign_addresses else '127.0.0.1'
    server = WSGIServer((ip_server, settings.port_number), site_dispatch,
                        log=FilteredRequestLog())

    print('Starting httpserver at http://%s:%s/' %
          (ip_server, settings.port_number))
    server.serve_forever()

# Entry point looked up by external WSGI servers (uwsgi, gunicorn, etc.)
application = site_dispatch