URL: https://github.com/freeipa/freeipa/pull/459 Author: tiran Title: #459: [WIP] Faster JSON encoder/decoder Action: synchronized
To pull the PR as Git branch: git remote add ghfreeipa https://github.com/freeipa/freeipa git fetch ghfreeipa pull/459/head:pr459 git checkout pr459
From e685e106dbcfb54d1651c97d6a07a17c3417127f Mon Sep 17 00:00:00 2001 From: Christian Heimes <chei...@redhat.com> Date: Mon, 13 Feb 2017 09:46:39 +0100 Subject: [PATCH 1/4] Faster JSON encoder/decoder Improve performance of FreeIPA's JSON serializer and deserializer. * Don't indent and sort keys. Both options trigger a slow path in Python's json package. Without indention and sorting, encoding mostly happens in optimized C code. * Replace O(n) type checks with O(1) type lookup and eliminate the use of isinstance(). * Check each client capability only once for every conversion. * Use decoder's obj_hook feature to traverse the object tree once and to eliminate calls to isinstance(). Closes: https://fedorahosted.org/freeipa/ticket/6655 Signed-off-by: Christian Heimes <chei...@redhat.com> --- ipalib/rpc.py | 211 +++++++++++++++++++++++++++++++------------------ ipaserver/rpcserver.py | 7 +- 2 files changed, 134 insertions(+), 84 deletions(-) diff --git a/ipalib/rpc.py b/ipalib/rpc.py index 7d9f6ec..6cad397 100644 --- a/ipalib/rpc.py +++ b/ipalib/rpc.py @@ -51,7 +51,7 @@ from ipalib.backend import Connectible from ipalib.constants import LDAP_GENERALIZED_TIME_FORMAT from ipalib.errors import (public_errors, UnknownError, NetworkError, - KerberosError, XMLRPCMarshallError, JSONError, ConversionError) + KerberosError, XMLRPCMarshallError, JSONError) from ipalib import errors, capabilities from ipalib.request import context, Connection from ipapython.ipa_log_manager import root_logger @@ -274,67 +274,140 @@ def xml_dumps(params, version, methodname=None, methodresponse=False, ) -def json_encode_binary(val, version): - ''' - JSON cannot encode binary values. We encode binary values in Python str - objects and text in Python unicode objects. In order to allow a binary - object to be passed through JSON we base64 encode it thus converting it to - text which JSON can transport. To assure we recognize the value is a base64 - encoded representation of the original binary value and not confuse it with - other text we convert the binary value to a dict in this form: - - {'__base64__' : base64_encoding_of_binary_value} - - This modification of the original input value cannot be done "in place" as - one might first assume (e.g. replacing any binary items in a container - (e.g. list, tuple, dict) with the base64 dict because the container might be - an immutable object (i.e. a tuple). Therefore this function returns a copy - of any container objects it encounters with tuples replaced by lists. This - is O.K. because the JSON encoding will map both lists and tuples to JSON - arrays. - ''' - - if isinstance(val, dict): - new_dict = {} - for k, v in val.items(): - new_dict[k] = json_encode_binary(v, version) - return new_dict - elif isinstance(val, (list, tuple)): - new_list = [json_encode_binary(v, version) for v in val] - return new_list - elif isinstance(val, bytes): - encoded = base64.b64encode(val) - if not six.PY2: - encoded = encoded.decode('ascii') - return {'__base64__': encoded} - elif isinstance(val, Decimal): - return unicode(val) - elif isinstance(val, DN): - return str(val) - elif isinstance(val, datetime.datetime): - if capabilities.client_has_capability(version, 'datetime_values'): +class _JSONConverter(dict): + __slots__ = ('version', '_cap_datetime', '_cap_dnsname') + + _identity = object() + + def __init__(self, version, _identity=_identity): + super(_JSONConverter, self).__init__() + self.version = version + self._cap_datetime = None + self._cap_dnsname = None + self.update({ + unicode: _identity, + bool: _identity, + type(None): _identity, + float: _identity, + Decimal: unicode, + DN: str, + Principal: unicode, + DNSName: self._enc_dnsname, + datetime.datetime: self._enc_datetime, + bytes: self._enc_bytes, + list: self._enc_list, + tuple: self._enc_list, + dict: self._enc_dict, + }) + # int, long + for t in six.integer_types: + self[t] = _identity + + def __missing__(self, typ): + # walk MRO to find best match + for c in typ.__mro__: + if c in self: + self[typ] = self[c] + return self[c] + # use issubclass to check for registered ABCs + for c in self: + if issubclass(typ, c): + self[typ] = self[c] + return self[c] + raise TypeError(typ) + + def convert(self, obj, _identity=_identity): + # obj.__class__ is twice as fast as type(obj) + func = self[obj.__class__] + return obj if func is _identity else func(obj) + + def _enc_datetime(self, val): + cap = self._cap_datetime + if cap is None: + cap = capabilities.client_has_capability(self.version, + 'datetime_values') + self._cap_datetime = cap + if cap: return {'__datetime__': val.strftime(LDAP_GENERALIZED_TIME_FORMAT)} else: return val.strftime(LDAP_GENERALIZED_TIME_FORMAT) - elif isinstance(val, DNSName): - if capabilities.client_has_capability(version, 'dns_name_values'): + + def _enc_dnsname(self, val): + cap = self._cap_dnsname + if cap is None: + cap = capabilities.client_has_capability(self.version, + 'dns_name_values') + self._cap_dnsname = cap + if cap: return {'__dns_name__': unicode(val)} else: return unicode(val) - elif isinstance(val, Principal): - return unicode(val) + + def _enc_bytes(self, val): + encoded = base64.b64encode(val) + if not six.PY2: + encoded = encoded.decode('ascii') + return {'__base64__': encoded} + + def _enc_list(self, val, _identity=_identity): + result = [] + append = result.append + for v in val: + func = self[v.__class__] + append(v if func is _identity else func(v)) + return result + + def _enc_dict(self, val, _identity=_identity, _iteritems=six.iteritems): + result = {} + for k, v in _iteritems(val): + func = self[v.__class__] + result[k] = v if func is _identity else func(v) + return result + + +def json_encode_binary(val, version): + """ + JSON cannot encode binary values. We encode binary values in Python str + objects and text in Python unicode objects. In order to allow a binary + object to be passed through JSON we base64 encode it thus converting it to + text which JSON can transport. To assure we recognize the value is a base64 + encoded representation of the original binary value and not confuse it with + other text we convert the binary value to a dict in this form: + + {'__base64__' : base64_encoding_of_binary_value} + + This modification of the original input value cannot be done "in place" as + one might first assume (e.g. replacing any binary items in a container + (e.g. list, tuple, dict) with the base64 dict because the container might + be an immutable object (i.e. a tuple). Therefore this function returns a + copy of any container objects it encounters with tuples replaced by lists. + This is O.K. because the JSON encoding will map both lists and tuples to + JSON arrays. + """ + result = _JSONConverter(version).convert(val) + return json.dumps(result) + + +def _ipa_obj_hook(dct): + if '__base64__' in dct: + return base64.b64decode(dct['__base64__']) + elif '__datetime__' in dct: + return datetime.datetime.strptime(dct['__datetime__'], + LDAP_GENERALIZED_TIME_FORMAT) + elif '__dns_name__' in dct: + return DNSName(dct['__dns_name__']) else: - return val + return dct def json_decode_binary(val): - ''' + """ JSON cannot transport binary data. In order to transport binary data we convert binary data to a form like this: - {'__base64__' : base64_encoding_of_binary_value} + {'__base64__' : base64_encoding_of_binary_value} - see json_encode_binary() + see json_encode_binary() After JSON had decoded the JSON stream back into a Python object we must recursively scan the object looking for any dicts which might represent @@ -345,31 +418,11 @@ def json_decode_binary(val): don't modify objects in place because of side effects which may be dangerous. Thus we elect to spend a few more cycles and avoid the possibility of unintended side effects in favor of robustness. - ''' + """ + if isinstance(val, bytes): + val = val.decode('utf-8') - if isinstance(val, dict): - if '__base64__' in val: - return base64.b64decode(val['__base64__']) - elif '__datetime__' in val: - return datetime.datetime.strptime(val['__datetime__'], - LDAP_GENERALIZED_TIME_FORMAT) - elif '__dns_name__' in val: - return DNSName(val['__dns_name__']) - else: - return dict((k, json_decode_binary(v)) for k, v in val.items()) - elif isinstance(val, list): - return tuple(json_decode_binary(v) for v in val) - else: - if isinstance(val, bytes): - try: - return val.decode('utf-8') - except UnicodeDecodeError: - raise ConversionError( - name=val, - error='incorrect type' - ) - else: - return val + return json.loads(val, object_hook=_ipa_obj_hook) def decode_fault(e, encoding='UTF-8'): @@ -1091,27 +1144,27 @@ def __request(self, name, args): payload = json_encode_binary(payload, version) if self.__verbose >= 2: - root_logger.info('Request: %s', - json.dumps(payload, sort_keys=True, indent=4)) + root_logger.info( + 'Request: %s', + json.dumps(json.loads(payload), sort_keys=True, indent=4) + ) response = self.__transport.request( self.__host, self.__handler, - json.dumps(payload).encode('utf-8'), + payload.encode('utf-8'), verbose=self.__verbose >= 3, ) try: - response = json_decode_binary( - json.loads(response.decode('utf-8'))) + response = json_decode_binary(response) except ValueError as e: raise JSONError(error=str(e)) if self.__verbose >= 2: root_logger.info( 'Response: %s', - json.dumps(json_encode_binary(response, version), - sort_keys=True, indent=4) + json.dumps(response, sort_keys=True, indent=4) ) error = response.get('error') if error: diff --git a/ipaserver/rpcserver.py b/ipaserver/rpcserver.py index 45550fb..c5fc308 100644 --- a/ipaserver/rpcserver.py +++ b/ipaserver/rpcserver.py @@ -26,7 +26,6 @@ from xml.sax.saxutils import escape import os import datetime -import json import traceback import gssapi import time @@ -489,13 +488,12 @@ def marshal(self, result, error, _id=None, principal=unicode(principal), version=unicode(VERSION), ) - response = json_encode_binary(response, version) - dump = json.dumps(response, sort_keys=True, indent=4) + dump = json_encode_binary(response, version) return dump.encode('utf-8') def unmarshal(self, data): try: - d = json.loads(data) + d = json_decode_binary(data) except ValueError as e: raise JSONError(error=e) if not isinstance(d, dict): @@ -504,7 +502,6 @@ def unmarshal(self, data): raise JSONError(error=_('Request is missing "method"')) if 'params' not in d: raise JSONError(error=_('Request is missing "params"')) - d = json_decode_binary(d) method = d['method'] params = d['params'] _id = d.get('id') From 5c819e554266700ca155ab40d057de7941c9fd17 Mon Sep 17 00:00:00 2001 From: Christian Heimes <chei...@redhat.com> Date: Mon, 13 Feb 2017 10:46:36 +0100 Subject: [PATCH 2/4] Convert list to tuples Some tests assume that JSON deserializier returns tuples instead of lists. I don't think it is necessary but let's pass the tests for now. Signed-off-by: Christian Heimes <chei...@redhat.com> --- ipalib/rpc.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ipalib/rpc.py b/ipalib/rpc.py index 6cad397..753d7e1 100644 --- a/ipalib/rpc.py +++ b/ipalib/rpc.py @@ -388,7 +388,7 @@ def json_encode_binary(val, version): return json.dumps(result) -def _ipa_obj_hook(dct): +def _ipa_obj_hook(dct, _iteritems=six.iteritems, _list=list): if '__base64__' in dct: return base64.b64decode(dct['__base64__']) elif '__datetime__' in dct: @@ -397,6 +397,10 @@ def _ipa_obj_hook(dct): elif '__dns_name__' in dct: return DNSName(dct['__dns_name__']) else: + # XXX tests assume tuples. Is this really necessary? + for k, v in _iteritems(dct): + if v.__class__ is _list: + dct[k] = tuple(v) return dct From fa32e3ff1cebe5c9ab450dd16bc258e85710d258 Mon Sep 17 00:00:00 2001 From: Christian Heimes <chei...@redhat.com> Date: Mon, 13 Feb 2017 19:09:14 +0100 Subject: [PATCH 3/4] Pretty print JSON in debug mode (debug level >= 2) Signed-off-by: Christian Heimes <chei...@redhat.com> --- ipalib/rpc.py | 100 ++++++++++++++++++++++++++++--------------------- ipaserver/rpcserver.py | 4 +- 2 files changed, 60 insertions(+), 44 deletions(-) diff --git a/ipalib/rpc.py b/ipalib/rpc.py index 753d7e1..91c5aee 100644 --- a/ipalib/rpc.py +++ b/ipalib/rpc.py @@ -274,13 +274,38 @@ def xml_dumps(params, version, methodname=None, methodresponse=False, ) -class _JSONConverter(dict): +class _JSONPrimer(dict): + """Fast JSON primer and pre-converter + + Prepare a data structure for JSON serialization. In an ideal world, priming + could be handled by the default hook of json.dumps(). Unfortunately the + hook treats Python 2 str as text while FreeIPA considers str as bytes. + + The primer uses a couple of tricks to archive maximum performance: + + * O(1) type look instead of O(n) chain of costly isinstance() calls + * __missing__ and __mro__ with caching to handle subclasses + * inlined code with minor code duplication + * function default arguments to turn global into local lookups + * on-demand lookup of client capabilities with cache + + Depending on the client version number, the primer converts: + + * bytes -> {'__base64__': b64encode} + * datetime -> {'__datetime__': LDAP_GENERALIZED_TIME} + * DNSName -> {'__dns_name__': unicode} + + The _ipa_obj_hook() functions unserializes the marked JSON objects to + bytes, datetime and DNSName. + + :see: _ipa_obj_hook + """ __slots__ = ('version', '_cap_datetime', '_cap_dnsname') _identity = object() def __init__(self, version, _identity=_identity): - super(_JSONConverter, self).__init__() + super(_JSONPrimer, self).__init__() self.version = version self._cap_datetime = None self._cap_dnsname = None @@ -365,30 +390,28 @@ def _enc_dict(self, val, _identity=_identity, _iteritems=six.iteritems): return result -def json_encode_binary(val, version): - """ - JSON cannot encode binary values. We encode binary values in Python str - objects and text in Python unicode objects. In order to allow a binary - object to be passed through JSON we base64 encode it thus converting it to - text which JSON can transport. To assure we recognize the value is a base64 - encoded representation of the original binary value and not confuse it with - other text we convert the binary value to a dict in this form: - - {'__base64__' : base64_encoding_of_binary_value} - - This modification of the original input value cannot be done "in place" as - one might first assume (e.g. replacing any binary items in a container - (e.g. list, tuple, dict) with the base64 dict because the container might - be an immutable object (i.e. a tuple). Therefore this function returns a - copy of any container objects it encounters with tuples replaced by lists. - This is O.K. because the JSON encoding will map both lists and tuples to - JSON arrays. +def json_encode_binary(val, version, pretty_print=False): + """Serialize a Python object structure to JSON + + :param object val: Python object structure + :param str version: client version + :param bool pretty_print: indent and sort JSON (warning: slow!) + :return: text + :note: pretty printing triggers a slow path in Python's JSON module. Only + use pretty_print in debug mode. """ - result = _JSONConverter(version).convert(val) - return json.dumps(result) + result = _JSONPrimer(version).convert(val) + if pretty_print: + return json.dumps(result, indent=4, sort_keys=True) + else: + return json.dumps(result) def _ipa_obj_hook(dct, _iteritems=six.iteritems, _list=list): + """JSON object hook + + :see: _JSONPrimer + """ if '__base64__' in dct: return base64.b64decode(dct['__base64__']) elif '__datetime__' in dct: @@ -405,23 +428,12 @@ def _ipa_obj_hook(dct, _iteritems=six.iteritems, _list=list): def json_decode_binary(val): - """ - JSON cannot transport binary data. In order to transport binary data we - convert binary data to a form like this: - - {'__base64__' : base64_encoding_of_binary_value} - - see json_encode_binary() - - After JSON had decoded the JSON stream back into a Python object we must - recursively scan the object looking for any dicts which might represent - binary values and replace the dict containing the base64 encoding of the - binary value with the decoded binary value. Unlike the encoding problem - where the input might consist of immutable object, all JSON decoded - container are mutable so the conversion could be done in place. However we - don't modify objects in place because of side effects which may be - dangerous. Thus we elect to spend a few more cycles and avoid the - possibility of unintended side effects in favor of robustness. + """Convert serialized JSON string back to Python data structure + + :param val: JSON string + :type val: str, bytes + :return: Python data structure + :see: _ipa_obj_hook, _JSONPrimer """ if isinstance(val, bytes): val = val.decode('utf-8') @@ -1143,14 +1155,16 @@ def __init__(self, uri, transport, encoding, verbose, allow_none): self._ServerProxy__transport = transport def __request(self, name, args): + print_json = self.__verbose >= 2 payload = {'method': unicode(name), 'params': args, 'id': 0} version = args[1].get('version', VERSION_WITHOUT_CAPABILITIES) - payload = json_encode_binary(payload, version) + payload = json_encode_binary( + payload, version, pretty_print=print_json) - if self.__verbose >= 2: + if print_json: root_logger.info( 'Request: %s', - json.dumps(json.loads(payload), sort_keys=True, indent=4) + payload ) response = self.__transport.request( @@ -1165,7 +1179,7 @@ def __request(self, name, args): except ValueError as e: raise JSONError(error=str(e)) - if self.__verbose >= 2: + if print_json: root_logger.info( 'Response: %s', json.dumps(response, sort_keys=True, indent=4) diff --git a/ipaserver/rpcserver.py b/ipaserver/rpcserver.py index c5fc308..5ff55a5 100644 --- a/ipaserver/rpcserver.py +++ b/ipaserver/rpcserver.py @@ -488,7 +488,9 @@ def marshal(self, result, error, _id=None, principal=unicode(principal), version=unicode(VERSION), ) - dump = json_encode_binary(response, version) + dump = json_encode_binary( + response, version, pretty_print=self.api.env.debug >= 2 + ) return dump.encode('utf-8') def unmarshal(self, data): From cf9a9f3d75782243d276706b5326283751f01789 Mon Sep 17 00:00:00 2001 From: Christian Heimes <chei...@redhat.com> Date: Tue, 14 Feb 2017 09:33:49 +0100 Subject: [PATCH 4/4] Fix test, nested lists are no longer converted to nested tuples --- ipatests/test_ipaserver/test_rpcserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipatests/test_ipaserver/test_rpcserver.py b/ipatests/test_ipaserver/test_rpcserver.py index 7ee94d3..d8f6015 100644 --- a/ipatests/test_ipaserver/test_rpcserver.py +++ b/ipatests/test_ipaserver/test_rpcserver.py @@ -257,7 +257,7 @@ def test_unmarshal(self): assert unicode(e.error) == 'params[1] (aka options) must be a dict' # Test with valid values: - args = (u'jdoe', ) + args = [u'jdoe'] options = dict(givenname=u'John', sn='Doe') d = dict(method=u'user_add', params=(args, options), id=18) assert o.unmarshal(json.dumps(d)) == (u'user_add', args, options, 18)
-- Manage your subscription for the Freeipa-devel mailing list: https://www.redhat.com/mailman/listinfo/freeipa-devel Contribute to FreeIPA: http://www.freeipa.org/page/Contribute/Code