#!/usr/bin/python
"""Extract approved comments from a WordPress export XML file.

Every approved comment attached to a published post is written to its own
file in ``COMMENT_DIR``, named ``<post_name>-<n>.md``, where ``n`` is a
counter that runs across the whole export (it is not reset per post).

Usage: pass the path of the export XML file as the first argument.
"""
from codecs import open
import sys
import os

from lxml import etree

COMMENT_DIR = 'comments'


def _child_text(element, tag, nsmap):
    """Return the text of the first ``tag`` child of ``element``.

    Returns an empty string when the child is missing or has no text, so
    callers never end up formatting ``None`` into the output.
    """
    node = element.find(tag, namespaces=nsmap)
    if node is None or node.text is None:
        return ''
    return node.text


def _ensure_comment_dir():
    """Create ``COMMENT_DIR`` if needed; exit(1) if the path is not a directory."""
    if not os.path.exists(COMMENT_DIR):
        os.makedirs(COMMENT_DIR)
    elif not os.path.isdir(COMMENT_DIR):
        print('"%s" exists but is not a directory!' % COMMENT_DIR)
        sys.exit(1)


def wp2comments(xml):
    """Write one file per approved comment found in the export ``xml``.

    Args:
        xml: Path (or any source accepted by ``lxml.etree.parse``) of the
            WordPress export document.

    Items are skipped when they have no ``title``, are not in ``publish``
    status, or have no usable ``wp:post_name``. Comments are skipped unless
    ``wp:comment_approved`` is ``'1'``.

    Exits the process with status 1 if ``COMMENT_DIR`` exists but is not a
    directory.
    """
    root = etree.parse(xml).getroot()
    # Prefixes such as "wp" are resolved through the namespace declarations
    # on the document root.
    nsmap = root.nsmap

    _ensure_comment_dir()

    n = 0  # Global comment counter; part of every output file name.
    for item in root.findall('.//item'):
        title = item.find('title')
        if title is None:
            continue

        # Only fetch comments from published posts.
        if _child_text(item, 'wp:status', nsmap) != 'publish':
            continue

        # The slug becomes part of the file name, so an absent or empty
        # post_name cannot be used.
        slug = _child_text(item, 'wp:post_name', nsmap)
        if not slug:
            print('WARNING: skipping "%s" with no post_name'
                  % (title.text or '', ))
            continue

        for comment in item.findall('wp:comment', namespaces=nsmap):
            if _child_text(comment, 'wp:comment_approved', nsmap) != '1':
                continue

            author = _child_text(comment, 'wp:comment_author', nsmap)
            ip = _child_text(comment, 'wp:comment_author_IP', nsmap)
            date = _child_text(comment, 'wp:comment_date_gmt', nsmap)
            email = _child_text(comment, 'wp:comment_author_email', nsmap)
            content = _child_text(comment, 'wp:comment_content', nsmap)

            n += 1
            path = os.path.join(COMMENT_DIR, '%s-%d.md' % (slug, n))
            # The context manager closes the file even if a write fails.
            with open(path, encoding='utf-8', mode='w') as f:
                f.write(u'post_id: %s\n' % (slug, ))
                f.write(u'Author: %s\n' % (author, ))
                f.write(u'Date: %s\n' % (date, ))
                f.write(u'Author_Email: %s\n' % (email, ))
                f.write(u'Author_IP: %s\n' % (ip, ))
                f.write(u'\n%s\n' % (content, ))


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.stderr.write('Usage: %s WORDPRESS_EXPORT_XML\n' % sys.argv[0])
        sys.exit(2)
    wp2comments(sys.argv[1])