Vinzenz Feenstra has uploaded a new change for review. Change subject: agent: Reworking the data filtering for the guest agent ......................................................................
agent: Reworking the data filtering for the guest agent This patchset is based on http://gerrit.ovirt.org/#/c/16652 We now properly filter out invalid Unicode characters and characters that are not permitted in XML, which would otherwise cause VDSM to build invalid XML data. Change-Id: I3a4347350d791f4df0b26b1cf81d0ca4f9656981 Signed-off-by: Vinzenz Feenstra <[email protected]> --- M ovirt-guest-agent/VirtIoChannel.py 1 file changed, 33 insertions(+), 18 deletions(-) git pull ssh://gerrit.ovirt.org:29418/ovirt-guest-agent refs/changes/80/16880/1 diff --git a/ovirt-guest-agent/VirtIoChannel.py b/ovirt-guest-agent/VirtIoChannel.py index 7df833d..dd68668 100644 --- a/ovirt-guest-agent/VirtIoChannel.py +++ b/ovirt-guest-agent/VirtIoChannel.py @@ -20,6 +20,7 @@ import platform import time import locale +import unicodedata # avoid pep8 warnings @@ -32,12 +33,13 @@ return simplejson json = import_json() +__REPLACEMENT_CHAR = u'\ufffd' # Set taken from http://www.w3.org/TR/xml11/#NT-RestrictedChar -__RESTRICTED_CHARS = set(range(8 + 1)).union( - set(range(0xB, 0xC + 1))).union( - set(range(0xE, 0x1F + 1))).union( - set(range(0x7F, 0x84 + 1))).union( - set(range(0x86, 0x9F + 1))) +__RESTRICTED_CHARS = set(range(8 + 1))\ + .union(set(range(0xB, 0xC + 1)))\ + .union(set(range(0xE, 0x1F + 1)))\ + .union(set(range(0x7F, 0x84 + 1)))\ + .union(set(range(0x86, 0x9F + 1))) def _string_check(str): @@ -54,22 +56,40 @@ except UnicodeError: # unrepresentable string return unicode() - return str + return unicode(str) def _filter_xml_chars(u): """ - Filter out restricted xml chars from unicode string. Not using - Python's xmlcharrefreplace because it accepts '\x01', which - the spec frown upon. + The set of characters allowed in XML documents is described in + http://www.w3.org/TR/xml11/#charsets + + "Char" is defined as any unicode character except the surrogate blocks, + \ufffe and \uffff.
+ "RestrictedChar" is defined as the code points in __RESTRICTED_CHARS above + + It's a little hard to follow, but the upshot is that an XML document must + contain only characters in Char that are not in RestrictedChar. + + Note that Python's xmlcharrefreplace option is not relevant here - + that's about handling characters which can't be encoded in a given charset + encoding, not which aren't permitted in XML. """ - def mask_restricted(c): - if ord(c) in __RESTRICTED_CHARS: - return '?' + def filter_xml_char(c): + if ord(c) > 0x10ffff: + return __REPLACEMENT_CHAR # Outside Unicode range + elif unicodedata.category(c) == 'Cs': + return __REPLACEMENT_CHAR # Surrogate pair code point + elif ord(c) == 0xFFFE or ord(c) == 0xFFFF: + return __REPLACEMENT_CHAR # Specifically excluded code points + elif ord(c) in __RESTRICTED_CHARS: + return __REPLACEMENT_CHAR else: return c + if not isinstance(u, unicode): + raise TypeError - return ''.join(mask_restricted(c) for c in u) + return ''.join(filter_xml_char(c) for c in u) def _filter_object(obj): @@ -152,11 +172,6 @@ args['__name__'] = name args = _filter_object(args) message = (json.dumps(args) + '\n').encode('utf8') - filtered_message = _filter_xml_chars(message) - # Sanity check only, on purpose we're throwing away the string - # to ensure we've produced a decodable utf-8 string after filtering - filtered_message.decode('utf-8') - message = filtered_message while len(message) > 0: if self.is_windows: written = self._vport.write(message) -- To view, visit http://gerrit.ovirt.org/16880 To unsubscribe, visit http://gerrit.ovirt.org/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I3a4347350d791f4df0b26b1cf81d0ca4f9656981 Gerrit-PatchSet: 1 Gerrit-Project: ovirt-guest-agent Gerrit-Branch: master Gerrit-Owner: Vinzenz Feenstra <[email protected]> _______________________________________________ Engine-patches mailing list [email protected] http://lists.ovirt.org/mailman/listinfo/engine-patches
