222 lines
5.7 KiB
Python
222 lines
5.7 KiB
Python
from datetime import datetime
|
|
from io import BytesIO
|
|
from typing import Optional, List
|
|
from xml.etree.ElementTree import Element
|
|
|
|
import attr
|
|
|
|
from .utils import (
|
|
parse_xml, get_child, get_text, get_int, get_datetime, FeedParseError
|
|
)
|
|
|
|
|
|
@attr.attrs
class RSSImage:
    """Image attached to an RSS channel (parsed from its ``image`` child)."""

    url: str = attr.attrib()
    title: Optional[str] = attr.attrib()
    link: str = attr.attrib()
    width: int = attr.attrib()
    height: int = attr.attrib()
    description: Optional[str] = attr.attrib()
|
|
|
|
|
|
@attr.attrs
class RSSEnclosure:
    """Attributes of an item's ``enclosure`` element."""

    url: str = attr.attrib()
    # None when the ``length`` attribute is absent or not an integer.
    length: Optional[int] = attr.attrib()
    type: Optional[str] = attr.attrib()
|
|
|
|
|
|
@attr.attrs
class RSSSource:
    """An item's ``source`` element: its text and optional ``url`` attribute."""

    title: str = attr.attrib()
    url: Optional[str] = attr.attrib()
|
|
|
|
|
|
@attr.attrs
class RSSItem:
    """A single ``item`` entry of an RSS channel."""

    title: Optional[str] = attr.attrib()
    link: Optional[str] = attr.attrib()
    description: Optional[str] = attr.attrib()
    author: Optional[str] = attr.attrib()
    categories: List[str] = attr.attrib()
    comments: Optional[str] = attr.attrib()
    enclosures: List[RSSEnclosure] = attr.attrib()
    guid: Optional[str] = attr.attrib()
    pub_date: Optional[datetime] = attr.attrib()
    source: Optional[RSSSource] = attr.attrib()

    # Extension: text of the ``content:encoded`` element.
    content_encoded: Optional[str] = attr.attrib()
|
|
|
|
|
|
@attr.attrs
class RSSChannel:
    """Parsed contents of the ``channel`` element of an RSS 2.0 feed."""

    title: Optional[str] = attr.attrib()
    link: Optional[str] = attr.attrib()
    description: Optional[str] = attr.attrib()
    language: Optional[str] = attr.attrib()
    copyright: Optional[str] = attr.attrib()
    managing_editor: Optional[str] = attr.attrib()
    web_master: Optional[str] = attr.attrib()
    pub_date: Optional[datetime] = attr.attrib()
    last_build_date: Optional[datetime] = attr.attrib()
    categories: List[str] = attr.attrib()
    generator: Optional[str] = attr.attrib()
    docs: Optional[str] = attr.attrib()
    ttl: Optional[int] = attr.attrib()
    image: Optional[RSSImage] = attr.attrib()

    # One entry per ``item`` child of the channel.
    items: List[RSSItem] = attr.attrib()

    # Extension: text of the ``content:encoded`` element.
    content_encoded: Optional[str] = attr.attrib()
|
|
|
|
|
|
def _get_image(element: Element, name,
               optional: bool=True) -> Optional[RSSImage]:
    """Build an RSSImage from the child of *element* called *name*.

    Returns None when ``get_child`` finds no such child. ``url`` and
    ``link`` are requested with ``optional=False``; ``width`` and
    ``height`` fall back to 88 and 31 whenever ``get_int`` returns a
    falsy value (None or 0).
    """
    image = get_child(element, name, optional)
    if image is None:
        return None

    # Fields are read in the same order as the RSSImage attributes.
    url = get_text(image, 'url', optional=False)
    title = get_text(image, 'title')
    link = get_text(image, 'link', optional=False)
    width = get_int(image, 'width') or 88
    height = get_int(image, 'height') or 31
    description = get_text(image, 'description')

    return RSSImage(
        url=url,
        title=title,
        link=link,
        width=width,
        height=height,
        description=description,
    )
|
|
|
|
|
|
def _get_source(element: Element, name,
                optional: bool=True) -> Optional[RSSSource]:
    """Build an RSSSource from the child of *element* called *name*.

    Returns None when ``get_child`` finds no such child. The title is the
    child's stripped text and the URL is its ``url`` attribute (None when
    the attribute is absent).
    """
    child = get_child(element, name, optional)
    if child is None:
        return None

    # Element.text is None for an element without text content, e.g.
    # <source url="..."/>; use an empty title instead of failing with
    # AttributeError on None.strip().
    title = (child.text or '').strip()

    return RSSSource(
        title,
        child.attrib.get('url'),
    )
|
|
|
|
|
|
def _get_enclosure(element: Element) -> RSSEnclosure:
    """Build an RSSEnclosure from the attributes of *element*.

    ``length`` becomes None when the attribute is missing or is not a
    valid integer. ``url`` is looked up with indexing, so a missing
    ``url`` attribute raises KeyError.
    """
    attributes = element.attrib

    try:
        size = int(attributes.get('length'))
    except (TypeError, ValueError):
        # TypeError: attribute absent (None); ValueError: not an integer.
        size = None

    return RSSEnclosure(
        url=attributes['url'],
        length=size,
        type=attributes.get('type'),
    )
|
|
|
|
|
|
def _get_link(element: Element) -> Optional[str]:
    """Return the link of an item, or None if it has none.

    The ``link`` child is preferred. Without one, the ``guid`` text is
    used instead, but only if its ``isPermaLink`` attribute is 'true'.
    """
    link = get_text(element, 'link')

    if link is None:
        guid = get_child(element, 'guid')
        is_permalink = (
            guid is not None and guid.attrib.get('isPermaLink') == 'true'
        )
        if is_permalink:
            link = get_text(element, 'guid')

    return link
|
|
|
|
|
|
def _get_item(element: Element) -> RSSItem:
    """Build an RSSItem from an ``item`` element.

    Every ``category`` child contributes its text to ``categories`` and
    every ``enclosure`` child becomes one RSSEnclosure.
    """
    # Keyword arguments are evaluated top to bottom, which keeps the
    # children being read in the same order as the RSSItem attributes.
    return RSSItem(
        title=get_text(element, 'title'),
        link=_get_link(element),
        description=get_text(element, 'description'),
        author=get_text(element, 'author'),
        categories=[
            category.text for category in element.findall('category')
        ],
        comments=get_text(element, 'comments'),
        enclosures=[
            _get_enclosure(enclosure)
            for enclosure in element.findall('enclosure')
        ],
        guid=get_text(element, 'guid'),
        pub_date=get_datetime(element, 'pubDate'),
        source=_get_source(element, 'source'),
        content_encoded=get_text(element, 'content:encoded'),
    )
|
|
|
|
|
|
def _parse_rss(root: Element) -> RSSChannel:
    """Build an RSSChannel from the root element of an RSS document.

    Only RSS 2.0 is supported.

    Raises:
        FeedParseError: if the ``version`` attribute of *root* is not
            '2.0', or if *root* has no ``channel`` child.
    """
    rss_version = root.get('version')
    if rss_version != '2.0':
        raise FeedParseError('Cannot process RSS feed version "{}"'
                             .format(rss_version))

    channel = root.find('channel')
    if channel is None:
        # Without this check, None would be passed to every helper below
        # and fail with an unrelated error instead of a parse error.
        raise FeedParseError('RSS feed does not contain a "channel" element')

    title = get_text(channel, 'title')
    link = get_text(channel, 'link')
    description = get_text(channel, 'description')
    language = get_text(channel, 'language')
    # Named copyright_text so the ``copyright`` builtin is not shadowed.
    copyright_text = get_text(channel, 'copyright')
    managing_editor = get_text(channel, 'managingEditor')
    web_master = get_text(channel, 'webMaster')
    pub_date = get_datetime(channel, 'pubDate')
    last_build_date = get_datetime(channel, 'lastBuildDate')
    categories = [e.text for e in channel.findall('category')]
    generator = get_text(channel, 'generator')
    docs = get_text(channel, 'docs')
    ttl = get_int(channel, 'ttl')

    image = _get_image(channel, 'image')
    items = [_get_item(e) for e in channel.findall('item')]

    # Extension element.
    content_encoded = get_text(channel, 'content:encoded')

    return RSSChannel(
        title,
        link,
        description,
        language,
        copyright_text,
        managing_editor,
        web_master,
        pub_date,
        last_build_date,
        categories,
        generator,
        docs,
        ttl,
        image,
        items,
        content_encoded
    )
|
|
|
|
|
|
def parse_rss_file(filename: str) -> RSSChannel:
    """Load the XML file at *filename* and parse it as an RSS feed."""
    tree = parse_xml(filename)
    return _parse_rss(tree.getroot())
|
|
|
|
|
|
def parse_rss_bytes(data: bytes) -> RSSChannel:
    """Parse an RSS feed held in memory as a byte-string of XML."""
    # Wrap the bytes in a BytesIO stream before handing them to parse_xml.
    stream = BytesIO(data)
    tree = parse_xml(stream)
    return _parse_rss(tree.getroot())
|