Last week I was surprised to discover that there are Unicode characters that aren't valid in an XML document. That is regardless of escaping (e.g. &#x13;) and Unicode encoding (e.g. UTF-8) - not every Unicode string can be stored in XML. The valid characters are (as of XML 1.0) #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]. Others such as #x13 must be stripped, replaced or placed inside a wrapper such as base64.
I didn't find an existing function to strip these so I wrote some and benchmarked them. I'd be interested in thoughts, suggestions and improvements. regsub_p2 was the fastest on a string containing mostly printable ASCII. regsub_p1 0.422097921371 True regsub_p2 0.353546857834 True regsub_p3 0.697242021561 True regsub_p4 0.677567005157 True genexp_p1 6.43633103371 True genexp_p2 6.43329787254 True genexp_p3 6.80837488174 True genexp_p4 6.81470417976 True filter_p1 7.21444416046 True filter_p2 7.46805095673 True filter_p3 7.37018704414 True filter_p4 7.03261303902 True genexp_f1 12.8470640182 True genexp_f2 5.43630099297 True genexp_f3 4.9708840847 True genexp_f4 12.2384109497 True genexp_f5 6.95861411095 True genexp_f6 4.7168610096 True genexp_f7 20.2065701485 True genexp_f8 21.1112251282 True Regards, Alex #!/usr/bin/python # Valid XML 1.0 characters are # #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] # http://www.w3.org/TR/2008/PER-xml-20080205/#charsets # # Before passing an arbitrary unicode string to an XML encoder invalid characters # must be stripped or replaced. Escaping them doesn't help - they're simply not # allowed in a well-formed XML 1.0 document. 
# The following script benchmarks several functions that strip them

import re
import string
import sys
import timeit

try:
    unichr
except NameError:
    # Python 3 has no unichr(); chr() already returns a text character.
    unichr = chr

# string.maketrans on Python 2, bytes.maketrans on Python 3.
_maketrans = getattr(string, 'maketrans', None) or bytes.maketrans


# --- Regular expressions matching characters that are NOT valid XML 1.0 ---
# p1 and p2 differ only in the order of the alternatives inside the class;
# p3 and p4 are the same classes with a trailing '+' to match whole runs.
# NOTE(review): on narrow (UCS-2) Python 2 builds the \U escapes are stored
# as surrogate pairs - confirm the character class still means what is
# intended there.
p1 = re.compile(u'[^\x09\x0A\x0D\u0020-\uD7FF'
                u'\uE000-\uFFFD\U00010000-\U0010FFFF]', re.U)

p2 = re.compile(u'[^\u0020-\uD7FF\x09\x0A\x0D'
                u'\uE000-\uFFFD\U00010000-\U0010FFFF]', re.U)

p3 = re.compile(p1.pattern + u'+', re.U)

p4 = re.compile(p2.pattern + u'+', re.U)


# --- Strategy 1: a single regex substitution over the whole string ---

def regsub_p1(s):
    """Return s with every match of p1 removed."""
    return p1.sub(u'', s)


def regsub_p2(s):
    """Return s with every match of p2 removed."""
    return p2.sub(u'', s)


def regsub_p3(s):
    """Return s with every match of p3 (runs of invalid chars) removed."""
    return p3.sub(u'', s)


def regsub_p4(s):
    """Return s with every match of p4 (runs of invalid chars) removed."""
    return p4.sub(u'', s)


# --- Strategy 2: test each character against a regex, generator form ---

def genexp_p1(s):
    """Return the characters of s that p1 does not match, joined."""
    return u''.join(c for c in s if not p1.match(c))


def genexp_p2(s):
    """Return the characters of s that p2 does not match, joined."""
    return u''.join(c for c in s if not p2.match(c))


def genexp_p3(s):
    """Return the characters of s that p3 does not match, joined."""
    return u''.join(c for c in s if not p3.match(c))


def genexp_p4(s):
    """Return the characters of s that p4 does not match, joined."""
    return u''.join(c for c in s if not p4.match(c))


# --- Strategy 3: test each character against a regex, filter() form ---

def filter_p1(s):
    """Return the characters of s that p1 does not match, via filter()."""
    return u''.join(filter(lambda c: not p1.match(c), s))


def filter_p2(s):
    """Return the characters of s that p2 does not match, via filter()."""
    return u''.join(filter(lambda c: not p2.match(c), s))


def filter_p3(s):
    """Return the characters of s that p3 does not match, via filter()."""
    return u''.join(filter(lambda c: not p3.match(c), s))


def filter_p4(s):
    """Return the characters of s that p4 does not match, via filter()."""
    return u''.join(filter(lambda c: not p4.match(c), s))


# --- Strategy 4: hand-written predicates, true for a valid XML 1.0 char ---
# Each predicate takes a single character c and returns a true value when c
# is allowed in XML 1.0.  The variants differ in how the test is spelled.
# NOTE(review): f1-f4 build their set() on every call; hoisting it to module
# level would likely make those variants faster.

def f1(c):
    """Code-point comparison, control-character set tested first."""
    i = ord(c)
    return (i in set([0x09, 0x0A, 0x0D])
            or 0x0020 <= i <= 0xD7FF
            or 0xE000 <= i <= 0xFFFD
            or 0x00010000 <= i <= 0x0010FFFF)


def f2(c):
    """Code-point comparison, the large U+0020-U+D7FF range tested first."""
    i = ord(c)
    return (0x0020 <= i <= 0xD7FF
            or i in set([0x09, 0x0A, 0x0D])
            or 0xE000 <= i <= 0xFFFD
            or 0x00010000 <= i <= 0x0010FFFF)


def f3(c):
    """Character comparison, the large U+0020-U+D7FF range tested first."""
    return (u'\u0020' <= c <= u'\uD7FF'
            or c in set([u'\x09', u'\x0A', u'\x0D'])
            or u'\uE000' <= c <= u'\uFFFD'
            or u'\U00010000' <= c <= u'\U0010FFFF')


def f4(c):
    """Character comparison, control-character set tested first."""
    return (c in set([u'\x09', u'\x0A', u'\x0D'])
            or u'\u0020' <= c <= u'\uD7FF'
            or u'\uE000' <= c <= u'\uFFFD'
            or u'\U00010000' <= c <= u'\U0010FFFF')


def f5(c):
    """Character comparison, explicit == tests for the controls first."""
    return (c == u'\x09'
            or c == u'\x0A'
            or c == u'\x0D'
            or u'\u0020' <= c <= u'\uD7FF'
            or u'\uE000' <= c <= u'\uFFFD'
            or u'\U00010000' <= c <= u'\U0010FFFF')


def f6(c):
    """Character comparison, large range first, then explicit == tests."""
    return (u'\u0020' <= c <= u'\uD7FF'
            or c == u'\x09'
            or c == u'\x0A'
            or c == u'\x0D'
            or u'\uE000' <= c <= u'\uFFFD'
            or u'\U00010000' <= c <= u'\U0010FFFF')


# Lookup data for f7/f8: classify all 256 Latin-1 characters once with f1.
every_8bit = u''.join(unichr(i) for i in range(256))
valid_8bit = u''.join(c for c in every_8bit if f1(c))
invalid_8bit = u''.join(c for c in every_8bit if not f1(c))
invalid_8bit_iso88591 = invalid_8bit.encode('iso-8859-1')
# Table mapping every invalid Latin-1 byte to NUL; valid bytes map to
# themselves.
translator = _maketrans(invalid_8bit_iso88591,
                        b'\x00' * len(invalid_8bit_iso88591))


def f7(c):
    """Byte translation table for U+0000-U+00FF, range tests above that.

    An invalid Latin-1 character translates to NUL, whose ord() is 0 and
    therefore false; a valid one yields its non-zero code point.
    """
    return ((c <= u'\xff' and
             ord(c.encode('iso-8859-1').translate(translator)))
            # U+0100-U+D7FF is valid XML; the original predicate omitted
            # this range and so stripped most non-Latin-1 text.
            or u'\u0100' <= c <= u'\uD7FF'
            or u'\uE000' <= c <= u'\uFFFD'
            or u'\U00010000' <= c <= u'\U0010FFFF')


def f8(c):
    """Byte deletion for U+0000-U+00FF, range tests above that.

    An invalid Latin-1 character is deleted, leaving an empty (false) byte
    string; a valid one is returned unchanged (true).
    """
    return ((c <= u'\xff' and
             c.encode('iso-8859-1').translate(None, invalid_8bit_iso88591))
            # U+0100-U+D7FF is valid XML; the original predicate omitted
            # this range and so stripped most non-Latin-1 text.
            or u'\u0100' <= c <= u'\uD7FF'
            or u'\uE000' <= c <= u'\uFFFD'
            or u'\U00010000' <= c <= u'\U0010FFFF')


def genexp_f1(s):
    """Return the characters of s accepted by f1, joined."""
    return u''.join(c for c in s if f1(c))


def genexp_f2(s):
    """Return the characters of s accepted by f2, joined."""
    return u''.join(c for c in s if f2(c))


def genexp_f3(s):
    """Return the characters of s accepted by f3, joined."""
    return u''.join(c for c in s if f3(c))


def genexp_f4(s):
    """Return the characters of s accepted by f4, joined."""
    return u''.join(c for c in s if f4(c))


def genexp_f5(s):
    """Return the characters of s accepted by f5, joined."""
    return u''.join(c for c in s if f5(c))


def genexp_f6(s):
    """Return the characters of s accepted by f6, joined."""
    return u''.join(c for c in s if f6(c))


def genexp_f7(s):
    """Return the characters of s accepted by f7, joined."""
    return u''.join(c for c in s if f7(c))


def genexp_f8(s):
    """Return the characters of s accepted by f8, joined."""
    return u''.join(c for c in s if f8(c))


def main():
    """Time every stripping function and report whether its output is right.

    Prints one line per function: its name, the best of three timings of
    100000 calls on the sample text, and True/False for the correctness
    check.
    """
    sample_in = (u'Lorem ipsum dolor sit amet\x00, consectetur adipisicing '
                 u'elit, \tsed \rdo eiusmod tempor incididunt \x13ut labore '
                 u'et dolore magna \xf7aliqua.\ufffe')
    expected_out = (u'Lorem ipsum dolor sit amet, consectetur adipisicing '
                    u'elit, \tsed \rdo eiusmod tempor incididunt ut labore '
                    u'et dolore magna \xf7aliqua.')
    # Correctness-only input (not timed): the timing sample contains no
    # valid character above U+00FF, so it cannot detect a predicate that
    # wrongly drops them.
    extra_in = u'\u0100\u4e2d\ud7ff\ue000\ufffd\uffff\x0b'
    extra_out = u'\u0100\u4e2d\ud7ff\ue000\ufffd'

    candidates = [
        regsub_p1, regsub_p2, regsub_p3, regsub_p4,
        genexp_p1, genexp_p2, genexp_p3, genexp_p4,
        filter_p1, filter_p2, filter_p3, filter_p4,
        genexp_f1, genexp_f2, genexp_f3, genexp_f4,
        genexp_f5, genexp_f6, genexp_f7, genexp_f8,
    ]
    write = sys.stdout.write
    for func in candidates:
        t = timeit.Timer(r'%s(%s)' % (func.__name__, repr(sample_in)),
                         'from __main__ import %s' % (func.__name__,))
        # Written piecewise so the name appears before the timing runs.
        write('%s ' % func.__name__)
        write('%s ' % min(t.repeat(3, 100000)))
        write('%s\n' % (func(sample_in) == expected_out
                        and func(extra_in) == extra_out))


if __name__ == '__main__':
    main()

# -- http://mail.python.org/mailman/listinfo/python-list