fix: add support for YouTube Shorts tab on channel pages

- Rewrite channel_ctoken_v5 with correct protobuf field numbers per tab (videos=15, shorts=10, streams=14) based on Invidious source
- Replace broken pbj=1 endpoint with youtubei browse API for shorts/streams
- Add shortsLockupViewModel parser to extract video data from new YT format
- Fix channel metadata not loading (get_metadata now uses browse API)
- Fix metadata caching: skip caching when channel_name is absent
- Show actual item count instead of UU playlist count for shorts/streams
- Format view counts with spaced suffixes (7.1 K, 1.2 M, 3 B)
2026-04-01 11:43:46 -05:00
parent bed14713ad
commit a374f90f6e
2 changed files with 152 additions and 81 deletions
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -332,6 +332,84 @@ def extract_lockup_view_model_info(item, additional_info={}):
    return info


+def extract_shorts_lockup_view_model_info(item, additional_info={}):
+    """Extract info from shortsLockupViewModel format (YouTube Shorts)"""
+    info = {'error': None, 'type': 'video'}
+
+    # Video ID from reelWatchEndpoint or entityId
+    info['id'] = deep_get(item,
+        'onTap', 'innertubeCommand', 'reelWatchEndpoint', 'videoId')
+    if not info['id']:
+        entity_id = item.get('entityId', '')
+        if entity_id.startswith('shorts-shelf-item-'):
+            info['id'] = entity_id[len('shorts-shelf-item-'):]
+
+    # Thumbnail
+    info['thumbnail'] = normalize_url(deep_get(item,
+        'onTap', 'innertubeCommand', 'reelWatchEndpoint',
+        'thumbnail', 'thumbnails', 0, 'url'))
+
+    # Parse title and views from accessibilityText
+    # Format: "Title, N views - play Short"
+    acc_text = item.get('accessibilityText', '')
+    info['title'] = ''
+    info['view_count'] = None
+    info['approx_view_count'] = None
+    if acc_text:
+        # Remove trailing " - play Short"
+        cleaned = re.sub(r'\s*-\s*play Short$', '', acc_text)
+        # Split on last comma+views pattern to separate title from view count
+        match = re.match(r'^(.*?),\s*([\d,.]+\s*(?:thousand|million|billion|)\s*views?)$',
+                         cleaned, re.IGNORECASE)
+        if match:
+            info['title'] = match.group(1).strip()
+            view_text = match.group(2)
+            info['view_count'] = extract_int(view_text)
+            # Convert "7.1 thousand" -> "7.1 K" for display
+            suffix_map = {'thousand': 'K', 'million': 'M', 'billion': 'B'}
+            suffix_match = re.search(r'([\d,.]+)\s*(thousand|million|billion)?', view_text, re.IGNORECASE)
+            if suffix_match:
+                num = suffix_match.group(1)
+                word = suffix_match.group(2)
+                if word:
+                    info['approx_view_count'] = num + ' ' + suffix_map[word.lower()]
+                else:
+                    info['approx_view_count'] = '{:,}'.format(int(num.replace(',', ''))) if num.isdigit() or num.replace(',','').isdigit() else num
+            else:
+                info['approx_view_count'] = extract_approx_int(view_text)
+        else:
+            # Fallback: try "N views" at end
+            match2 = re.match(r'^(.*?),\s*(.+views?)$', cleaned, re.IGNORECASE)
+            if match2:
+                info['title'] = match2.group(1).strip()
+                info['approx_view_count'] = extract_approx_int(match2.group(2))
+            else:
+                info['title'] = cleaned
+
+    # Overlay text (usually has the title too)
+    overlay_metadata = deep_get(item, 'overlayMetadata',
+        'secondaryText', 'content')
+    if overlay_metadata and not info['approx_view_count']:
+        info['approx_view_count'] = extract_approx_int(overlay_metadata)
+
+    primary_text = deep_get(item, 'overlayMetadata',
+        'primaryText', 'content')
+    if primary_text and not info['title']:
+        info['title'] = primary_text
+
+    info['duration'] = ''
+    info['time_published'] = None
+    info['description'] = None
+    info['badges'] = []
+    info['author'] = None
+    info['author_id'] = None
+    info['author_url'] = None
+    info['index'] = None
+
+    info.update(additional_info)
+    return info
+
+
 def extract_item_info(item, additional_info={}):
    if not item:
        return {'error': 'No item given'}
@@ -353,6 +431,10 @@ def extract_item_info(item, additional_info={}):
    if type == 'lockupViewModel':
        return extract_lockup_view_model_info(item, additional_info)

+    # Handle shortsLockupViewModel format (YouTube Shorts)
+    if type == 'shortsLockupViewModel':
+        return extract_shorts_lockup_view_model_info(item, additional_info)
+
    # type looks like e.g. 'compactVideoRenderer' or 'gridVideoRenderer'
    # camelCase split, https://stackoverflow.com/a/37697078
    type_parts = [s.lower() for s in re.sub(r'([A-Z][a-z]+)', r' \1', type).split()]
@@ -561,6 +643,7 @@ _item_types = {

    # New viewModel format (YouTube 2024+)
    'lockupViewModel',
+    'shortsLockupViewModel',
 }

 def _traverse_browse_renderer(renderer):