feat(channels): fix pagination for "Sorted by newest - no shorts"

Replace UU-uploads playlist workaround (proto field 104) with direct requests to the channel Videos tab API (tab="videos"), aligning with Invidious content-type handling. This restores proper continuation tokens and stable pagination (~30 videos per page). Update display logic: - Show channel total upload count as an upper-bound while continuation tokens exist. - On final page, display exact fetched video count. - Ensure page number never falls below current page (fix page reset to "1"). Maintain separate handling: - Shorts and streams tabs continue using tab-specific continuation tokens. Add test: - TestChannelCtokenV5::test_include_shorts_false_adds_filter Fixes issue where channels with many Shorts (e.g., Celine Dept) showed only a few videos and broken pagination under "no shorts" sorting.
2026-04-19 22:34:14 -05:00
parent 3795d9e4ff
commit 5577e9e1f2
2 changed files with 171 additions and 94 deletions
--- a/tests/test_shorts.py
+++ b/tests/test_shorts.py
@@ -58,6 +58,59 @@ class TestChannelCtokenV5:
        assert t_shorts != t_streams
        assert t_videos != t_streams
    def test_include_shorts_false_adds_filter(self):
        """Test that include_shorts=False adds the shorts filter (field 104)."""
        # Token with shorts included (default)
        t_with_shorts = self.channel_ctoken_v5('UCtest', '1', '3', 'videos', include_shorts=True)
        # Token with shorts excluded
        t_without_shorts = self.channel_ctoken_v5('UCtest', '1', '3', 'videos', include_shorts=False)
        # The tokens should be different because of the shorts filter
        assert t_with_shorts != t_without_shorts
        # Decode and verify the filter is present
        raw_with_shorts = base64.urlsafe_b64decode(t_with_shorts + '==')
        raw_without_shorts = base64.urlsafe_b64decode(t_without_shorts + '==')
        # Parse the outer protobuf structure
        import youtube.proto as proto
        outer_fields_with = list(proto.read_protobuf(raw_with_shorts))
        outer_fields_without = list(proto.read_protobuf(raw_without_shorts))
        # Field 80226972 contains the inner data
        inner_with = [v for _, fn, v in outer_fields_with if fn == 80226972][0]
        inner_without = [v for _, fn, v in outer_fields_without if fn == 80226972][0]
        # Parse the inner data - field 3 contains percent-encoded base64 data
        inner_fields_with = list(proto.read_protobuf(inner_with))
        inner_fields_without = list(proto.read_protobuf(inner_without))
        # Get field 3 data (the encoded inner which is percent-encoded base64)
        encoded_inner_with = [v for _, fn, v in inner_fields_with if fn == 3][0]
        encoded_inner_without = [v for _, fn, v in inner_fields_without if fn == 3][0]
        # The inner without shorts should contain field 104
        # Decode the percent-encoded base64 data
        import urllib.parse
        decoded_with = urllib.parse.unquote(encoded_inner_with.decode('ascii'))
        decoded_without = urllib.parse.unquote(encoded_inner_without.decode('ascii'))
        # Decode the base64 data
        decoded_with_bytes = base64.urlsafe_b64decode(decoded_with + '==')
        decoded_without_bytes = base64.urlsafe_b64decode(decoded_without + '==')
        # Parse the decoded protobuf data
        fields_with = list(proto.read_protobuf(decoded_with_bytes))
        fields_without = list(proto.read_protobuf(decoded_without_bytes))
        field_numbers_with = [fn for _, fn, _ in fields_with]
        field_numbers_without = [fn for _, fn, _ in fields_without]
        # The 'with' version should NOT have field 104
        assert 104 not in field_numbers_with
        # The 'without' version SHOULD have field 104
        assert 104 in field_numbers_without
 # --- shortsLockupViewModel parsing ---
--- a/youtube/channel.py
+++ b/youtube/channel.py
@@ -33,9 +33,9 @@ headers_mobile = (
 real_cookie = (('Cookie', 'VISITOR_INFO1_LIVE=8XihrAcN1l4'),)
 generic_cookie = (('Cookie', 'VISITOR_INFO1_LIVE=ST1Ti53r4fU'),)
 # FIXED 2026: YouTube changed continuation token structure (from Invidious commit a9f8127)
 # Sort values for YouTube API (from Invidious): 2=popular, 4=newest, 5=oldest
-def channel_ctoken_v5(channel_id, page, sort, tab, view=1):
+# include_shorts only applies to tab='videos'; tab='shorts'/'streams' always include their own content.
 def channel_ctoken_v5(channel_id, page, sort, tab, view=1, include_shorts=True):
    # Tab-specific protobuf field numbers (from Invidious source)
    # Each tab uses different field numbers in the protobuf structure:
    #   videos:  110 -> 3 -> 15 -> { 2:{1:UUID}, 4:sort, 8:{1:UUID, 3:sort} }
@@ -74,6 +74,11 @@ def channel_ctoken_v5(channel_id, page, sort, tab, view=1):
    inner_container = proto.string(3, tab_wrapper)
    outer_container = proto.string(110, inner_container)
    # Add shorts filter when include_shorts=False (field 104, same as playlist.py)
    # This tells YouTube to exclude shorts from the results
    if not include_shorts:
        outer_container += proto.string(104, proto.uint(2, 1))
    encoded_inner = proto.percent_b64encode(outer_container)
    pointless_nest = proto.string(80226972,
@@ -236,12 +241,12 @@ def channel_ctoken_v1(channel_id, page, sort, tab, view=1):
 def get_channel_tab(channel_id, page="1", sort=3, tab='videos', view=1,
-                    ctoken=None, print_status=True):
+                    ctoken=None, print_status=True, include_shorts=True):
    message = 'Got channel tab' if print_status else None
    if not ctoken:
        if tab in ('videos', 'shorts', 'streams'):
-            ctoken = channel_ctoken_v5(channel_id, page, sort, tab, view)
+            ctoken = channel_ctoken_v5(channel_id, page, sort, tab, view, include_shorts)
        else:
            ctoken = channel_ctoken_v3(channel_id, page, sort, tab, view)
        ctoken = ctoken.replace('=', '%3D')
@@ -295,12 +300,23 @@ def get_number_of_videos_channel(channel_id):
    response = response.decode('utf-8')
-    # match = re.search(r'"numVideosText":\s*{\s*"runs":\s*\[{"text":\s*"([\d,]*) videos"', response)
+    # Try several patterns since YouTube's format changes:
-    match = re.search(r'"numVideosText".*?([,\d]+)', response)
+    #   "numVideosText":{"runs":[{"text":"1,234"},{"text":" videos"}]}
-    if match:
+    #   "stats":[..., {"runs":[{"text":"1,234"},{"text":" videos"}]}]
-        return int(match.group(1).replace(',',''))
+    for pattern in (
-    else:
+        r'"numVideosText".*?"text":\s*"([\d,]+)"',
-        return 0
+        r'"numVideosText".*?([\d,]+)\s*videos?',
        r'"numVideosText".*?([,\d]+)',
        r'([\d,]+)\s*videos?\s*</span>',
    ):
        match = re.search(pattern, response)
        if match:
            try:
                return int(match.group(1).replace(',', ''))
            except ValueError:
                continue
    # Fallback: unknown count
    return 0
 def set_cached_number_of_videos(channel_id, num_videos):
    @cachetools.cached(number_of_videos_cache)
    def dummy_func_using_same_cache(channel_id):
@@ -425,24 +441,27 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None):
    page_number = int(request.args.get('page', 1))
    # sort 1: views
    # sort 2: oldest
-    # sort 4: newest - no shorts (Just a kludge on our end, not internal to yt)
+    # sort 3: newest (includes shorts, via UU uploads playlist)
    # sort 4: newest - no shorts (uses channel Videos tab API directly, like Invidious)
    default_sort = '3' if settings.include_shorts_in_channel else '4'
    sort = request.args.get('sort', default_sort)
    view = request.args.get('view', '1')
    query = request.args.get('query', '')
    ctoken = request.args.get('ctoken', '')
    include_shorts = (sort != '4')
    default_params = (page_number == 1 and sort in ('3', '4') and view == '1')
-    continuation = bool(ctoken) # whether or not we're using a continuation
+    continuation = bool(ctoken)
    page_size = 30
    try_channel_api = True
    polymer_json = None
    number_of_videos = 0
    info = None
-    # Use the special UU playlist which contains all the channel's uploads
+    # -------------------------------------------------------------------------
-    if tab == 'videos' and sort in ('3', '4'):
+    # sort=3: use UU uploads playlist (includes shorts)
    # -------------------------------------------------------------------------
    if tab == 'videos' and sort == '3':
        if not channel_id:
            channel_id = get_channel_id(base_url)
-        if page_number == 1 and include_shorts:
+        if page_number == 1:
            tasks = (
                gevent.spawn(playlist.playlist_first_page,
                             'UU' + channel_id[2:],
@@ -451,9 +470,6 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None):
            )
            gevent.joinall(tasks)
            util.check_gevent_exceptions(*tasks)
            # Ignore the metadata for now, it is cached and will be
            # recalled later
            pl_json = tasks[0].value
            pl_info = yt_data_extract.extract_playlist_info(pl_json)
            number_of_videos = pl_info['metadata']['video_count']
@@ -464,86 +480,70 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None):
        else:
            tasks = (
                gevent.spawn(playlist.get_videos, 'UU' + channel_id[2:],
-                             page_number, include_shorts=include_shorts),
+                             page_number, include_shorts=True),
                gevent.spawn(get_metadata, channel_id),
                gevent.spawn(get_number_of_videos_channel, channel_id),
                gevent.spawn(playlist.playlist_first_page, 'UU' + channel_id[2:],
                             report_text='Retrieved channel video count'),
            )
            gevent.joinall(tasks)
            util.check_gevent_exceptions(*tasks)
            pl_json = tasks[0].value
            pl_info = yt_data_extract.extract_playlist_info(pl_json)
-            number_of_videos = tasks[2].value
+            first_page_meta = yt_data_extract.extract_playlist_metadata(tasks[3].value)
            number_of_videos = (tasks[2].value
                                or first_page_meta.get('video_count')
                                or 0)
-        info = pl_info
+        if pl_info['items']:
-        info['channel_id'] = channel_id
+            info = pl_info
-        info['current_tab'] = 'videos'
+            info['channel_id'] = channel_id
-        if info['items']:   # Success
+            info['current_tab'] = 'videos'
            page_size = 100
-            try_channel_api = False
+        # else fall through to the channel browse API below
        else:   # Try the first-page method next
            try_channel_api = True
-    # Use the regular channel API
+    # -------------------------------------------------------------------------
-    if tab in ('shorts', 'streams') or (tab=='videos' and try_channel_api):
+    # Channel browse API: sort=4 (videos tab, no shorts), shorts, streams,
    # or fallback when the UU playlist returned no items.
    # Uses channel_ctoken_v5 per-tab tokens, mirroring Invidious's approach.
    # Pagination is driven by the continuation token YouTube returns each page.
    # -------------------------------------------------------------------------
    used_channel_api = False
    if info is None and (
        tab in ('shorts', 'streams')
        or (tab == 'videos' and sort == '4')
        or (tab == 'videos' and sort == '3')   # UU-playlist fallback
    ):
        if not channel_id:
            channel_id = get_channel_id(base_url)
        used_channel_api = True
-        # For shorts/streams, use continuation token from cache or request
+        # Determine what browse call to make
-        if tab in ('shorts', 'streams'):
+        if ctoken:
-            if ctoken:
+            browse_call = (util.call_youtube_api, 'web', 'browse',
-                # Use ctoken directly from request (passed via pagination)
+                           {'continuation': ctoken})
-                polymer_json = util.call_youtube_api('web', 'browse', {
+            continuation = True
-                    'continuation': ctoken,
+        elif page_number > 1:
-                })
+            cache_key = (channel_id, tab, sort, page_number - 1)
-                continuation = True
+            cached_ctoken = continuation_token_cache.get(cache_key)
-            elif page_number > 1:
+            if cached_ctoken:
-                # For page 2+, get ctoken from cache
+                browse_call = (util.call_youtube_api, 'web', 'browse',
-                cache_key = (channel_id, tab, sort, page_number - 1)
+                               {'continuation': cached_ctoken})
                cached_ctoken = continuation_token_cache.get(cache_key)
                if cached_ctoken:
                    polymer_json = util.call_youtube_api('web', 'browse', {
                        'continuation': cached_ctoken,
                    })
                    continuation = True
                else:
                    # Fallback: generate fresh ctoken
                    page_call = (get_channel_tab, channel_id, str(page_number), sort, tab, int(view))
                    continuation = True
                    polymer_json = gevent.spawn(*page_call)
                    polymer_json.join()
                    if polymer_json.exception:
                        raise polymer_json.exception
                    polymer_json = polymer_json.value
            else:
-                # Page 1: generate fresh ctoken
+                # Cache miss — restart from page 1 (better than an error)
-                page_call = (get_channel_tab, channel_id, str(page_number), sort, tab, int(view))
+                browse_call = (get_channel_tab, channel_id, '1', sort, tab, int(view))
-                continuation = True
+            continuation = True
                polymer_json = gevent.spawn(*page_call)
                polymer_json.join()
                if polymer_json.exception:
                    raise polymer_json.exception
                polymer_json = polymer_json.value
        else:
-            # videos tab - original logic
+            browse_call = (get_channel_tab, channel_id, '1', sort, tab, int(view))
            page_call = (get_channel_tab, channel_id, str(page_number), sort,
                         tab, int(view))
            continuation = True
-        if tab == 'videos':
+        # Single browse call; number_of_videos is computed from items actually
-            # Only need video count for the videos tab
+        # fetched so we don't mislead the user with a total that includes
-            if channel_id:
+        # shorts (which this branch is explicitly excluding for sort=4).
-                num_videos_call = (get_number_of_videos_channel, channel_id)
+        task = gevent.spawn(*browse_call)
-            else:
+        task.join()
-                num_videos_call = (get_number_of_videos_general, base_url)
+        util.check_gevent_exceptions(task)
-            tasks = (
+        polymer_json = task.value
                gevent.spawn(*num_videos_call),
                gevent.spawn(*page_call),
            )
            gevent.joinall(tasks)
            util.check_gevent_exceptions(*tasks)
            number_of_videos, polymer_json = tasks[0].value, tasks[1].value
        # For shorts/streams, polymer_json is already set above, nothing to do here
    elif tab == 'about':
        # polymer_json = util.fetch_url(base_url + '/about?pbj=1', headers_desktop, debug_name='gen_channel_about')
@@ -571,16 +571,16 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None):
    elif tab == 'search':
        url = base_url + '/search?pbj=1&query=' + urllib.parse.quote(query, safe='')
        polymer_json = util.fetch_url(url, headers_desktop, debug_name='gen_channel_search')
-    elif tab == 'videos':
+    elif tab != 'videos':
        pass
    else:
        flask.abort(404, 'Unknown channel tab: ' + tab)
-    if polymer_json is not None:
+    if polymer_json is not None and info is None:
        info = yt_data_extract.extract_channel_info(
            json.loads(polymer_json), tab, continuation=continuation
        )
    if info is None:
        return flask.render_template('error.html', error_message='Could not retrieve channel data')
    if info['error'] is not None:
        return flask.render_template('error.html', error_message=info['error'])
@@ -610,16 +610,40 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None):
            item.update(additional_info)
    if tab in ('videos', 'shorts', 'streams'):
-        if tab in ('shorts', 'streams'):
+        # For any tab using the channel browse API (sort=4, shorts, streams),
-            # For shorts/streams, use ctoken to determine pagination
+        # pagination is driven by the ctoken YouTube returns in the response.
        # Cache it so the next page request can use it.
        if info.get('ctoken'):
            cache_key = (channel_id, tab, sort, page_number)
            continuation_token_cache[cache_key] = info['ctoken']
        # Determine is_last_page and final number_of_pages.
        # For channel-API-driven tabs (sort=4, shorts, streams, UU fallback),
        # YouTube doesn't give us a reliable total filtered count. So instead
        # of displaying a misleading number (the total-including-shorts from
        # get_number_of_videos_channel), we count only what we've actually
        # paged through, and use the ctoken to know whether to show "next".
        if used_channel_api:
            info['is_last_page'] = (info.get('ctoken') is None)
-            number_of_videos = len(info.get('items', []))
+            items_on_page = len(info.get('items', []))
-            # Cache the ctoken for next page
+            items_seen_so_far = (page_number - 1) * page_size + items_on_page
            # Use accumulated count as the displayed total so "N videos" shown
            # to the user always matches what they could actually reach.
            number_of_videos = items_seen_so_far
            # If there's more content, bump by 1 so the Next-page button exists
            if info.get('ctoken'):
-                cache_key = (channel_id, tab, sort, page_number)
+                number_of_videos = max(number_of_videos,
-                continuation_token_cache[cache_key] = info['ctoken']
+                                       page_number * page_size + 1)
        # For sort=3 via UU playlist (used_channel_api=False), number_of_videos
        # was already set from playlist metadata above.
        info['number_of_videos'] = number_of_videos
-        info['number_of_pages'] = math.ceil(number_of_videos/page_size) if number_of_videos else 1
+        info['number_of_pages'] = math.ceil(number_of_videos / page_size) if number_of_videos else 1
        # Never show fewer pages than the page the user is actually on
        if info['number_of_pages'] < page_number:
            info['number_of_pages'] = page_number
        info['header_playlist_names'] = local_playlist.get_playlist_names()
    if tab in ('videos', 'shorts', 'streams', 'playlists'):
        info['current_sort'] = sort