A super strict HTML cleaner method with mediocre tests.

This commit is contained in:
Christopher Allan Webber 2011-06-13 21:01:19 -05:00
parent 1e85d28e01
commit a68ee5556e
2 changed files with 46 additions and 0 deletions

View File

@@ -103,3 +103,22 @@ def test_locale_to_lower_lower():
# crazy renditions. Useful? # crazy renditions. Useful?
assert util.locale_to_lower_lower('en-US') == 'en-us' assert util.locale_to_lower_lower('en-US') == 'en-us'
assert util.locale_to_lower_lower('en_us') == 'en-us' assert util.locale_to_lower_lower('en_us') == 'en-us'
def test_html_cleaner():
    """Check util.clean_html against an image and a javascript: link."""
    # Case 1: the <img> element must be gone from the output.  The input
    # has two sibling <p> elements and the expected output has them
    # wrapped in a single <div>.
    markup_with_image = (
        '<p>Hi everybody! '
        '<img src="http://example.org/huge-purple-barney.png" /></p>\n'
        '<p>:)</p>')
    expected_without_image = (
        '<div>'
        '<p>Hi everybody! </p>\n'
        '<p>:)</p>'
        '</div>')
    assert util.clean_html(markup_with_image) == expected_without_image

    # Case 2: a javascript: href must be blanked out while the link text
    # and the surrounding markup are kept.
    markup_with_js_link = (
        '<p><a href="javascript:nasty_surprise">innocent link!</a></p>')
    expected_defanged_link = (
        '<p><a href="">innocent link!</a></p>')
    assert util.clean_html(markup_with_js_link) == expected_defanged_link

View File

@@ -30,6 +30,7 @@ import jinja2
import translitcodec import translitcodec
from paste.deploy.loadwsgi import NicerConfigParser from paste.deploy.loadwsgi import NicerConfigParser
from webob import Response, exc from webob import Response, exc
from lxml.html.clean import Cleaner
from mediagoblin import mg_globals from mediagoblin import mg_globals
from mediagoblin.db.util import ObjectId from mediagoblin.db.util import ObjectId
@@ -373,6 +374,32 @@ def read_config_file(conf_file):
return mgoblin_conf return mgoblin_conf
# Module-wide lxml.html.clean.Cleaner instance, configured to be as strict
# as we can make it.  Every removal option listed below is switched on and
# only a short list of basic formatting tags is allowed.
HTML_CLEANER = Cleaner(
    # Tags that are allowed in cleaned output.
    allow_tags=[
        'div', 'b', 'i', 'em', 'strong', 'p', 'ul', 'ol', 'li', 'a', 'br'],
    remove_unknown_tags=False,  # can't be used with allow_tags
    # Only keep attributes that lxml considers safe.
    safe_attrs_only=True,
    # Active content and scripting.
    scripts=True,
    javascript=True,
    embedded=True,
    frames=True,
    forms=True,
    # Styling, structure and other non-content markup.
    style=True,
    links=True,
    page_structure=True,
    annoying_tags=True,
    comments=True,
    processing_instructions=True,
    # Link handling.
    add_nofollow=True,  # for now
    # Both whitelists are deliberately left empty.
    host_whitelist=(),
    whitelist_tags=set())
def clean_html(html):
    """
    Run ``html`` through the module-level HTML_CLEANER and return whatever
    its ``clean_html`` method returns.
    """
    cleaned = HTML_CLEANER.clean_html(html)
    return cleaned
SETUP_GETTEXTS = {} SETUP_GETTEXTS = {}
def setup_gettext(locale): def setup_gettext(locale):