yt-local/python/atoma/utils.py
2019-02-16 23:41:52 -08:00

85 lines
2.1 KiB
Python

from datetime import datetime, timezone
from xml.etree.ElementTree import Element
from typing import Optional
import dateutil.parser
from defusedxml.ElementTree import parse as defused_xml_parse, ParseError
from .exceptions import FeedXMLError, FeedParseError
# Prefix -> namespace URI mapping; passed as ``namespaces`` to
# ``Element.find`` so lookups can use prefixed tag names.
ns = dict(
    content='http://purl.org/rss/1.0/modules/content/',
    feed='http://www.w3.org/2005/Atom',
)
def parse_xml(xml_content):
    """Parse ``xml_content`` with defusedxml and return the parser's result.

    ``xml_content`` is passed unchanged to ``defusedxml.ElementTree.parse``.

    Raises:
        FeedXMLError: if the parser raises ``ParseError``. The original
            error is attached as ``__cause__``.
    """
    try:
        return defused_xml_parse(xml_content)
    except ParseError as exc:
        # Chain explicitly so the underlying parser error (with its
        # position details) is reported as the direct cause.
        raise FeedXMLError('Not a valid XML document') from exc
def get_child(element: Element, name,
              optional: bool=True) -> Optional[Element]:
    """Return the first sub-element of ``element`` matching ``name``.

    The lookup uses ``Element.find`` with the module-level ``ns`` prefix
    mapping.

    Returns:
        The matching element, or ``None`` when nothing matches and
        ``optional`` is true.

    Raises:
        FeedParseError: when nothing matches and ``optional`` is false.
    """
    found = element.find(name, namespaces=ns)
    # A hit is always returned; a miss is only acceptable when optional.
    if found is not None or optional:
        return found

    message = 'Could not parse feed: "{}" does not have a "{}"'.format(
        element.tag, name
    )
    raise FeedParseError(message)
def get_text(element: Element, name, optional: bool=True) -> Optional[str]:
    """Return the stripped text of the child of ``element`` named ``name``.

    Returns:
        The child's text with surrounding whitespace removed, or ``None``
        when the child is missing or has no text and ``optional`` is true.

    Raises:
        FeedParseError: when ``optional`` is false and the child is
            missing (raised by ``get_child``) or its text is ``None``.
    """
    node = get_child(element, name, optional)
    if node is None:
        return None

    raw = node.text
    if raw is not None:
        return raw.strip()

    # The element exists but carries no text at all.
    if optional:
        return None
    raise FeedParseError(
        'Could not parse feed: "{}" text is required but is empty'
        .format(name)
    )
def get_int(element: Element, name, optional: bool=True) -> Optional[int]:
    """Return the text of child ``name`` converted with ``int()``.

    Returns ``None`` when ``get_text`` yields no text. A ``ValueError``
    from ``int()`` is not caught here and propagates to the caller.
    """
    raw = get_text(element, name, optional)
    return None if raw is None else int(raw)
def get_datetime(element: Element, name,
                 optional: bool=True) -> Optional[datetime]:
    """Return the text of child ``name`` parsed by ``try_parse_date``.

    Returns ``None`` when ``get_text`` yields no text, or when
    ``try_parse_date`` cannot parse it.
    """
    raw = get_text(element, name, optional)
    return try_parse_date(raw) if raw is not None else None
def try_parse_date(date_str: str) -> Optional[datetime]:
    """Parse ``date_str`` with dateutil's fuzzy parser.

    Returns:
        A timezone-aware ``datetime``, or ``None`` when parsing raises
        ``ValueError`` or ``OverflowError``.
    """
    try:
        parsed = dateutil.parser.parse(date_str, fuzzy=True)
    except (ValueError, OverflowError):
        return None

    if parsed.tzinfo is not None:
        return parsed
    # Naive result: assume the value is expressed in UTC and tag it so
    # callers always receive an aware datetime.
    return parsed.replace(tzinfo=timezone.utc)