A super strict HTML cleaner method with mediocre tests.

This commit is contained in:
Christopher Allan Webber 2011-06-13 21:01:19 -05:00
parent 1e85d28e01
commit a68ee5556e
2 changed files with 46 additions and 0 deletions

View File

@ -103,3 +103,22 @@ def test_locale_to_lower_lower():
# crazy renditions. Useful?
assert util.locale_to_lower_lower('en-US') == 'en-us'
assert util.locale_to_lower_lower('en_us') == 'en-us'
def test_html_cleaner():
    # Exercises util.clean_html on two inputs: markup containing an
    # <img> tag, and a link whose href is a javascript: URL.

    # Remove images
    markup_with_image = (
        '<p>Hi everybody! '
        '<img src="http://example.org/huge-purple-barney.png" /></p>\n'
        '<p>:)</p>')
    expected_without_image = (
        '<div>'
        '<p>Hi everybody! </p>\n'
        '<p>:)</p>'
        '</div>')
    assert util.clean_html(markup_with_image) == expected_without_image

    # Remove evil javascript
    markup_with_js_link = (
        '<p><a href="javascript:nasty_surprise">innocent link!</a></p>')
    expected_neutralized_link = (
        '<p><a href="">innocent link!</a></p>')
    assert util.clean_html(markup_with_js_link) == expected_neutralized_link

View File

@ -30,6 +30,7 @@ import jinja2
import translitcodec
from paste.deploy.loadwsgi import NicerConfigParser
from webob import Response, exc
from lxml.html.clean import Cleaner
from mediagoblin import mg_globals
from mediagoblin.db.util import ObjectId
@ -373,6 +374,32 @@ def read_config_file(conf_file):
return mgoblin_conf
# A deliberately strict configuration of the lxml.html Cleaner class,
# shared by clean_html() below.
HTML_CLEANER = Cleaner(
    # Only these tags are allowed.
    allow_tags=[
        'div', 'b', 'i', 'em', 'strong', 'p',
        'ul', 'ol', 'li', 'a', 'br'],
    remove_unknown_tags=False,  # can't be used with allow_tags
    # Cleaning switches, all turned on.
    scripts=True,
    javascript=True,
    comments=True,
    style=True,
    links=True,
    page_structure=True,
    processing_instructions=True,
    embedded=True,
    frames=True,
    forms=True,
    annoying_tags=True,
    safe_attrs_only=True,
    add_nofollow=True,  # for now
    # Both whitelists are left empty.
    host_whitelist=(),
    whitelist_tags=set())
def clean_html(html):
    """
    Sanitize a piece of HTML using the module-level HTML_CLEANER.

    :param html: the markup to clean.
    :returns: the cleaned markup as produced by
        ``HTML_CLEANER.clean_html``; an empty string when ``html`` is
        empty or None.
    """
    # lxml cannot parse an empty document and raises on '' (None fails
    # as well), so short-circuit instead of handing it to the cleaner.
    if not html:
        return u''

    return HTML_CLEANER.clean_html(html)
SETUP_GETTEXTS = {}
def setup_gettext(locale):