# yt-local/python/atoma/atom.py

from datetime import datetime
import enum
from io import BytesIO
from typing import Optional, List
from xml.etree.ElementTree import Element

import attr

from .utils import (
    parse_xml, get_child, get_text, get_datetime, FeedParseError, ns
)


class AtomTextType(enum.Enum):
    """Content kinds that the parser recognises on a text construct.

    The member values are the strings looked up from an element's ``type``
    attribute when a text construct is parsed.
    """

    text = 'text'
    html = 'html'
    xhtml = 'xhtml'


@attr.s
class AtomTextConstruct:
    """Text of an Atom element together with its declared type and language.

    Instances are built by ``_get_text_construct``; the fields are positional
    and must stay in this order.
    """
    # NOTE(review): annotated as ``str``, but ``_get_text_construct`` passes
    # an ``AtomTextType`` member here.
    text_type: str = attr.ib()
    # Language of the text, or None when no language is available.
    lang: Optional[str] = attr.ib()
    # The element text with surrounding whitespace stripped.
    value: str = attr.ib()


@attr.s
class AtomEntry:
    """One parsed ``entry`` element of an Atom feed.

    Instances are built by ``_get_entry``, which passes the fields
    positionally in the order declared here.
    """
    # Read from the ``feed:title`` and ``feed:id`` children of the entry.
    title: AtomTextConstruct = attr.ib()
    id_: str = attr.ib()

    # Should be mandatory but many feeds use published instead
    updated: Optional[datetime] = attr.ib()

    # The entry's own authors; when it has none, the parser falls back to the
    # authors of the enclosing feed or of the entry's ``source`` element.
    authors: List['AtomPerson'] = attr.ib()
    contributors: List['AtomPerson'] = attr.ib()
    links: List['AtomLink'] = attr.ib()
    categories: List['AtomCategory'] = attr.ib()
    published: Optional[datetime] = attr.ib()
    rights: Optional[AtomTextConstruct] = attr.ib()
    summary: Optional[AtomTextConstruct] = attr.ib()
    content: Optional[AtomTextConstruct] = attr.ib()
    # Feed metadata parsed from the ``feed:source`` child (without entries),
    # or None when that child is missing or cannot be parsed.
    source: Optional['AtomFeed'] = attr.ib()


@attr.s
class AtomFeed:
    """Parsed Atom feed metadata plus its entries.

    Instances are built by ``_parse_atom``, which passes the fields
    positionally in the order declared here. The same class also represents
    the ``source`` element of an entry.
    """
    title: Optional[AtomTextConstruct] = attr.ib()
    # The only field the parser treats as required.
    id_: str = attr.ib()

    # Should be mandatory but many feeds do not include it
    updated: Optional[datetime] = attr.ib()

    authors: List['AtomPerson'] = attr.ib()
    contributors: List['AtomPerson'] = attr.ib()
    links: List['AtomLink'] = attr.ib()
    categories: List['AtomCategory'] = attr.ib()
    generator: Optional['AtomGenerator'] = attr.ib()
    subtitle: Optional[AtomTextConstruct] = attr.ib()
    rights: Optional[AtomTextConstruct] = attr.ib()
    icon: Optional[str] = attr.ib()
    logo: Optional[str] = attr.ib()

    # Empty when the feed was parsed with ``parse_entries=False`` (as is done
    # for an entry's ``source`` element).
    entries: List[AtomEntry] = attr.ib()


@attr.s
class AtomPerson:
    """An author or contributor, as built by ``_get_person``."""
    # Read from the ``feed:name`` child, which ``_get_person`` requires.
    name: str = attr.ib()
    # Read from the optional ``feed:uri`` and ``feed:email`` children.
    uri: Optional[str] = attr.ib()
    email: Optional[str] = attr.ib()


@attr.s
class AtomLink:
    """Attributes of a ``link`` element, as built by ``_get_link``."""
    # The only attribute ``_get_link`` requires to be present.
    href: str = attr.ib()
    rel: Optional[str] = attr.ib()
    # Value of the ``type`` attribute of the link element.
    type_: Optional[str] = attr.ib()
    hreflang: Optional[str] = attr.ib()
    title: Optional[str] = attr.ib()
    # The ``length`` attribute converted with ``int()``, or None when the
    # attribute is absent or empty.
    length: Optional[int] = attr.ib()


@attr.s
class AtomCategory:
    """Attributes of a ``category`` element, as built by ``_get_category``."""
    # The only attribute ``_get_category`` requires to be present.
    term: str = attr.ib()
    scheme: Optional[str] = attr.ib()
    label: Optional[str] = attr.ib()


@attr.s
class AtomGenerator:
    """Contents of a ``generator`` element, as built by ``_get_generator``."""
    # Text content of the generator element, stripped of whitespace.
    name: str = attr.ib()
    # Values of the element's ``uri`` and ``version`` attributes, if any.
    uri: Optional[str] = attr.ib()
    version: Optional[str] = attr.ib()


def _get_generator(element: Element, name,
                   optional: bool=True) -> Optional[AtomGenerator]:
    """Build an AtomGenerator from the child of ``element`` called ``name``.

    ``optional`` is forwarded to ``get_child``. When ``get_child`` returns
    no element, None is returned. Otherwise the generator name is the
    child's stripped text and ``uri``/``version`` come from its attributes.

    A generator element without any text (for example
    ``<generator uri="..."/>``) yields an empty name instead of failing.
    """
    child = get_child(element, name, optional)
    if child is None:
        return None

    # Element.text is None when the element has no text node; calling
    # .strip() on it directly would raise AttributeError and abort the
    # whole feed parse over a harmless empty element.
    generator_name = (child.text or '').strip()

    return AtomGenerator(
        generator_name,
        child.attrib.get('uri'),
        child.attrib.get('version'),
    )


def _get_text_construct(element: Element, name,
                        optional: bool=True) -> Optional[AtomTextConstruct]:
    """Read the child ``name`` of ``element`` as an Atom text construct.

    ``optional`` is forwarded to ``get_child`` and also decides what happens
    when the child exists but carries no text: None is returned if it is
    true, otherwise FeedParseError is raised.

    Returns None when ``get_child`` finds no element. Otherwise returns an
    AtomTextConstruct holding the declared type (``text`` when the ``type``
    attribute is missing or not one of the AtomTextType values), the
    ``xml:lang`` attribute value if present, and the stripped text.
    """
    child = get_child(element, name, optional)
    if child is None:
        return None

    try:
        text_type = AtomTextType(child.attrib['type'])
    except (KeyError, ValueError):
        # KeyError: no type attribute, which means plain text.
        # ValueError: a value the enum cannot represent (atom:content may
        # declare a MIME media type); fall back to plain text rather than
        # letting one unusual attribute abort the whole feed parse.
        text_type = AtomTextType.text

    # ElementTree exposes xml:lang as a namespaced attribute; Element objects
    # have no ``lang`` attribute of their own, so it must be read from attrib.
    lang = child.attrib.get('{http://www.w3.org/XML/1998/namespace}lang')

    if child.text is None:
        if optional:
            return None

        raise FeedParseError(
            'Could not parse atom feed: "{}" text is required but is empty'
            .format(name)
        )

    return AtomTextConstruct(
        text_type,
        lang,
        child.text.strip()
    )


def _get_person(element: Element) -> Optional[AtomPerson]:
    """Turn an author/contributor element into an AtomPerson.

    The ``feed:name`` child is looked up as required, ``feed:uri`` and
    ``feed:email`` with the default lookup. If any lookup raises
    FeedParseError the person is dropped and None is returned.
    """
    try:
        person_name = get_text(element, 'feed:name', optional=False)
        person_uri = get_text(element, 'feed:uri')
        person_email = get_text(element, 'feed:email')
    except FeedParseError:
        return None

    return AtomPerson(person_name, person_uri, person_email)


def _get_link(element: Element) -> AtomLink:
    """Build an AtomLink from the attributes of a link element.

    ``href`` must be present (a missing one raises KeyError); every other
    attribute is optional. A non-empty ``length`` is converted with ``int()``,
    an absent or empty one becomes None.
    """
    attributes = element.attrib

    raw_length = attributes.get('length')
    if raw_length:
        link_length = int(raw_length)
    else:
        link_length = None

    href = attributes['href']
    rel = attributes.get('rel')
    media_type = attributes.get('type')
    hreflang = attributes.get('hreflang')
    link_title = attributes.get('title')

    return AtomLink(href, rel, media_type, hreflang, link_title, link_length)


def _get_category(element: Element) -> AtomCategory:
    """Build an AtomCategory from the attributes of a category element.

    ``term`` must be present (a missing one raises KeyError); ``scheme`` and
    ``label`` default to None.
    """
    attributes = element.attrib
    term = attributes['term']
    scheme = attributes.get('scheme')
    label = attributes.get('label')
    return AtomCategory(term, scheme, label)


def _get_entry(element: Element,
               default_authors: List[AtomPerson]) -> AtomEntry:
    """Parse one entry element into an AtomEntry.

    ``default_authors`` are the authors of the enclosing feed. They are used
    only when the entry has no usable author of its own and its
    ``feed:source`` element supplies none either.
    """
    root = element

    # Nominally mandatory, but looked up without optional=False so that an
    # entry lacking them does not abort the parse.
    title = _get_text_construct(root, 'feed:title')
    id_ = get_text(root, 'feed:id')

    # Optional: metadata of the feed this entry was copied from. A missing
    # or unparsable source is treated as absent.
    try:
        source = _parse_atom(get_child(root, 'feed:source', optional=False),
                             parse_entries=False)
    except FeedParseError:
        source = None
        source_authors = []
    else:
        source_authors = source.authors

    authors = [_get_person(e) for e in root.findall('feed:author', ns)]
    authors = [a for a in authors if a is not None]
    if not authors:
        # RFC 4287 section 4.2.1: without authors of its own, an entry takes
        # those of its source element, and only after that the authors of
        # the containing feed.
        authors = source_authors or [
            a for a in default_authors if a is not None
        ]

    # len(e) keeps the existing "skip elements without children" filter
    # without relying on Element truth-value testing, which is deprecated
    # since Python 3.12.
    contributors = [_get_person(e)
                    for e in root.findall('feed:contributor', ns) if len(e)]
    contributors = [c for c in contributors if c is not None]

    links = [_get_link(e) for e in root.findall('feed:link', ns)]
    categories = [_get_category(e) for e in root.findall('feed:category', ns)]

    updated = get_datetime(root, 'feed:updated')
    published = get_datetime(root, 'feed:published')
    rights = _get_text_construct(root, 'feed:rights')
    summary = _get_text_construct(root, 'feed:summary')
    content = _get_text_construct(root, 'feed:content')

    return AtomEntry(
        title,
        id_,
        updated,
        authors,
        contributors,
        links,
        categories,
        published,
        rights,
        summary,
        content,
        source
    )


def _parse_atom(root: Element, parse_entries: bool=True) -> AtomFeed:
    """Parse a feed (or an entry's source) element into an AtomFeed.

    ``feed:id`` is looked up as required; every other piece of metadata is
    optional. When ``parse_entries`` is false the ``feed:entry`` children are
    ignored and the returned feed has an empty ``entries`` list.
    """
    # Mandatory
    id_ = get_text(root, 'feed:id', optional=False)

    # Optional
    title = _get_text_construct(root, 'feed:title')
    updated = get_datetime(root, 'feed:updated')

    # len(e) keeps the existing "skip elements without children" filter
    # without relying on Element truth-value testing, which is deprecated
    # since Python 3.12. People that cannot be parsed come back as None and
    # are dropped.
    authors = [_get_person(e)
               for e in root.findall('feed:author', ns) if len(e)]
    authors = [a for a in authors if a is not None]
    contributors = [_get_person(e)
                    for e in root.findall('feed:contributor', ns) if len(e)]
    contributors = [c for c in contributors if c is not None]
    links = [_get_link(e)
             for e in root.findall('feed:link', ns)]
    categories = [_get_category(e)
                  for e in root.findall('feed:category', ns)]

    generator = _get_generator(root, 'feed:generator')
    subtitle = _get_text_construct(root, 'feed:subtitle')
    rights = _get_text_construct(root, 'feed:rights')
    icon = get_text(root, 'feed:icon')
    logo = get_text(root, 'feed:logo')

    if parse_entries:
        # The feed-level authors are handed down as each entry's fallback.
        entries = [_get_entry(e, authors)
                   for e in root.findall('feed:entry', ns)]
    else:
        entries = []

    return AtomFeed(
        title,
        id_,
        updated,
        authors,
        contributors,
        links,
        categories,
        generator,
        subtitle,
        rights,
        icon,
        logo,
        entries
    )


def parse_atom_file(filename: str) -> AtomFeed:
    """Load the XML document at ``filename`` and parse it as an Atom feed."""
    document = parse_xml(filename)
    feed_root = document.getroot()
    return _parse_atom(feed_root)


def parse_atom_bytes(data: bytes) -> AtomFeed:
    """Parse an Atom feed held in memory as XML bytes."""
    # parse_xml is given a file-like object wrapping the raw bytes.
    stream = BytesIO(data)
    document = parse_xml(stream)
    return _parse_atom(document.getroot())