On Tuesday 06 October 2009 07.14:17 Adrian von Bidder wrote:
> I want to allow HTML content. I'm sure there already is code for
> allowing restricted HTML subset - pointers very welcome.
Seeing as there was no answer...
Comments very welcome:
def check_restricted_html(text):
    '''Parse ``text`` and verify it only uses a very simple HTML subset.

    The supported very simple subset of HTML is:
     - blockquote, ol, p, pre, ul tags at toplevel or inside a blockquote.
     - li in ol and ul
     - b, i, a anywhere but not nested

    All of these tags except ``a`` must carry no attributes.  An ``a``
    element must carry exactly one attribute, ``href``, whose value starts
    with ``mailto:``, ``http://`` or ``https://``.

    text -- markup fragment to check; it may contain several toplevel
            elements, so it is wrapped in a synthetic root before parsing.

    Returns the parsed tree (the synthetic root element).

    Raises ValueError at the first element that is not part of the subset.
    Whatever the parser raises for markup that is not well-formed
    propagates unchanged.
    '''
    # NOTE(review): the snippet as posted never bound ``xml`` and never used
    # ``text``.  The checks below need a root element that is the parent of
    # every toplevel tag, so the fragment is wrapped and parsed here.  The
    # wrapper tag name is an assumption -- confirm against callers that
    # inspect the returned root.
    # NOTE(review): ``text`` is presumably untrusted; confirm the parser
    # configuration (entity resolution etc.) is acceptable for such input.
    xml = etree.fromstring('<xml>%s</xml>' % text)

    def hasparent(e, tag):
        '''Return True if any ancestor of ``e`` has the tag ``tag``.'''
        e = e.getparent()
        while e is not None:
            if e.tag == tag:
                return True
            e = e.getparent()
        return False

    block_tags = ('blockquote', 'ol', 'p', 'pre', 'ul')
    list_tags = ('ol', 'ul')
    inline_tags = ('b', 'i')

    for element in xml.iter():
        # The synthetic root itself is not part of the user's markup.
        if element is xml:
            continue

        # Safe to dereference: only the root (skipped above) has no parent.
        parent = element.getparent()
        bare = len(element.attrib) == 0

        # Block-level tags: no attributes, toplevel or inside a blockquote.
        if element.tag in block_tags and bare \
                and (parent is xml or parent.tag == 'blockquote'):
            continue

        # List items: no attributes, directly inside a list.
        if element.tag == 'li' and bare and parent.tag in list_tags:
            continue

        # Inline formatting: no attributes, not nested inside the same tag.
        if element.tag in inline_tags and bare \
                and not hasparent(element, element.tag):
            continue

        # Links: not nested, exactly one attribute (href), allowed scheme.
        # ``list(...)`` keeps the comparison valid whether ``keys()`` returns
        # a list or a view object.
        if element.tag == 'a' and not hasparent(element, 'a') \
                and list(element.attrib.keys()) == ['href'] \
                and re.match('^(mailto:|https?://)', element.attrib['href']):
            # TODO sanitize href
            continue

        # Anything else (unknown tag, stray attribute, misplaced element,
        # comment, processing instruction) is rejected.
        raise ValueError('Unsupported construct in restricted HTML at "%s"' %
                         etree.tostring(element)[:40])
    return xml
cheers
-- vbi
--
SCO's lawsuit is a lost cause. The implications for Linux users are
rather like the implications for passengers on an ocean liner of a
seagull diving into the water nearby.
-- Thomas Carey, Bromberg & Sunstein, LLP, attorney
signature.asc
Description: This is a digitally signed message part.

