URL: https://github.com/freeipa/freeipa/pull/459
Author: tiran
 Title: #459: Faster JSON encoder/decoder
Action: synchronized

To pull the PR as Git branch:
git remote add ghfreeipa https://github.com/freeipa/freeipa
git fetch ghfreeipa pull/459/head:pr459
git checkout pr459
From cace1ed0605bd763ea99077938eda2b6c4ed57a1 Mon Sep 17 00:00:00 2001
From: Christian Heimes <chei...@redhat.com>
Date: Mon, 13 Feb 2017 09:46:39 +0100
Subject: [PATCH 1/5] Faster JSON encoder/decoder

Improve performance of FreeIPA's JSON serializer and deserializer.

* Don't indent and sort keys. Both options trigger a slow path in
  Python's json package. Without indentation and sorting, encoding
  mostly happens in optimized C code (a rough timing sketch follows this
  list).
* Replace O(n) type checks with O(1) type lookup and eliminate
  the use of isinstance().
* Check each client capability only once for every conversion.
* Use the decoder's object_hook feature to traverse the object tree once
  and to eliminate calls to isinstance() (a standalone sketch of the full
  encode/decode pattern follows this patch).
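
A rough, standalone timing sketch (not part of the patch; the payload shape
and iteration count are made up, and absolute numbers vary by machine and
Python version) illustrating why indent/sort_keys are avoided on the hot path:

    import json
    import timeit

    payload = {'result': [{'uid': ['user%d' % i], 'memberof': ['g1', 'g2']}
                          for i in range(1000)]}

    fast = timeit.timeit(lambda: json.dumps(payload), number=100)
    slow = timeit.timeit(lambda: json.dumps(payload, sort_keys=True, indent=4),
                         number=100)
    print('plain dumps:       %.3fs' % fast)
    print('sorted + indented: %.3fs' % slow)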

Closes: https://fedorahosted.org/freeipa/ticket/6655
Signed-off-by: Christian Heimes <chei...@redhat.com>
---
 ipalib/rpc.py          | 211 +++++++++++++++++++++++++++++++------------------
 ipaserver/rpcserver.py |   8 +-
 2 files changed, 135 insertions(+), 84 deletions(-)

diff --git a/ipalib/rpc.py b/ipalib/rpc.py
index 31ed64e..d8207dc 100644
--- a/ipalib/rpc.py
+++ b/ipalib/rpc.py
@@ -51,7 +51,7 @@
 from ipalib.backend import Connectible
 from ipalib.constants import LDAP_GENERALIZED_TIME_FORMAT
 from ipalib.errors import (public_errors, UnknownError, NetworkError,
-    KerberosError, XMLRPCMarshallError, JSONError, ConversionError)
+    KerberosError, XMLRPCMarshallError, JSONError)
 from ipalib import errors, capabilities
 from ipalib.request import context, Connection
 from ipapython.ipa_log_manager import root_logger
@@ -274,67 +274,140 @@ def xml_dumps(params, version, methodname=None, methodresponse=False,
     )
 
 
-def json_encode_binary(val, version):
-    '''
-   JSON cannot encode binary values. We encode binary values in Python str
-   objects and text in Python unicode objects. In order to allow a binary
-   object to be passed through JSON we base64 encode it thus converting it to
-   text which JSON can transport. To assure we recognize the value is a base64
-   encoded representation of the original binary value and not confuse it with
-   other text we convert the binary value to a dict in this form:
-
-   {'__base64__' : base64_encoding_of_binary_value}
-
-   This modification of the original input value cannot be done "in place" as
-   one might first assume (e.g. replacing any binary items in a container
-   (e.g. list, tuple, dict) with the base64 dict because the container might be
-   an immutable object (i.e. a tuple). Therefore this function returns a copy
-   of any container objects it encounters with tuples replaced by lists. This
-   is O.K. because the JSON encoding will map both lists and tuples to JSON
-   arrays.
-   '''
-
-    if isinstance(val, dict):
-        new_dict = {}
-        for k, v in val.items():
-            new_dict[k] = json_encode_binary(v, version)
-        return new_dict
-    elif isinstance(val, (list, tuple)):
-        new_list = [json_encode_binary(v, version) for v in val]
-        return new_list
-    elif isinstance(val, bytes):
-        encoded = base64.b64encode(val)
-        if not six.PY2:
-            encoded = encoded.decode('ascii')
-        return {'__base64__': encoded}
-    elif isinstance(val, Decimal):
-        return unicode(val)
-    elif isinstance(val, DN):
-        return str(val)
-    elif isinstance(val, datetime.datetime):
-        if capabilities.client_has_capability(version, 'datetime_values'):
+class _JSONConverter(dict):
+    __slots__ = ('version', '_cap_datetime', '_cap_dnsname')
+
+    _identity = object()
+
+    def __init__(self, version, _identity=_identity):
+        super(_JSONConverter, self).__init__()
+        self.version = version
+        self._cap_datetime = None
+        self._cap_dnsname = None
+        self.update({
+            unicode: _identity,
+            bool: _identity,
+            type(None): _identity,
+            float: _identity,
+            Decimal: unicode,
+            DN: str,
+            Principal: unicode,
+            DNSName: self._enc_dnsname,
+            datetime.datetime: self._enc_datetime,
+            bytes: self._enc_bytes,
+            list: self._enc_list,
+            tuple: self._enc_list,
+            dict: self._enc_dict,
+        })
+        # int, long
+        for t in six.integer_types:
+            self[t] = _identity
+
+    def __missing__(self, typ):
+        # walk MRO to find best match
+        for c in typ.__mro__:
+            if c in self:
+                self[typ] = self[c]
+                return self[c]
+        # use issubclass to check for registered ABCs
+        for c in self:
+            if issubclass(typ, c):
+                self[typ] = self[c]
+                return self[c]
+        raise TypeError(typ)
+
+    def convert(self, obj, _identity=_identity):
+        # obj.__class__ is twice as fast as type(obj)
+        func = self[obj.__class__]
+        return obj if func is _identity else func(obj)
+
+    def _enc_datetime(self, val):
+        cap = self._cap_datetime
+        if cap is None:
+            cap = capabilities.client_has_capability(self.version,
+                                                     'datetime_values')
+            self._cap_datetime = cap
+        if cap:
             return {'__datetime__': val.strftime(LDAP_GENERALIZED_TIME_FORMAT)}
         else:
             return val.strftime(LDAP_GENERALIZED_TIME_FORMAT)
-    elif isinstance(val, DNSName):
-        if capabilities.client_has_capability(version, 'dns_name_values'):
+
+    def _enc_dnsname(self, val):
+        cap = self._cap_dnsname
+        if cap is None:
+            cap = capabilities.client_has_capability(self.version,
+                                                     'dns_name_values')
+            self._cap_dnsname = cap
+        if cap:
             return {'__dns_name__': unicode(val)}
         else:
             return unicode(val)
-    elif isinstance(val, Principal):
-        return unicode(val)
+
+    def _enc_bytes(self, val):
+        encoded = base64.b64encode(val)
+        if not six.PY2:
+            encoded = encoded.decode('ascii')
+        return {'__base64__': encoded}
+
+    def _enc_list(self, val, _identity=_identity):
+        result = []
+        append = result.append
+        for v in val:
+            func = self[v.__class__]
+            append(v if func is _identity else func(v))
+        return result
+
+    def _enc_dict(self, val, _identity=_identity, _iteritems=six.iteritems):
+        result = {}
+        for k, v in _iteritems(val):
+            func = self[v.__class__]
+            result[k] = v if func is _identity else func(v)
+        return result
+
+
+def json_encode_binary(val, version):
+    """
+    JSON cannot encode binary values. We encode binary values in Python str
+    objects and text in Python unicode objects. In order to allow a binary
+    object to be passed through JSON we base64 encode it thus converting it to
+    text which JSON can transport. To assure we recognize the value is a base64
+    encoded representation of the original binary value and not confuse it with
+    other text we convert the binary value to a dict in this form:
+
+    {'__base64__' : base64_encoding_of_binary_value}
+
+    This modification of the original input value cannot be done "in place" as
+    one might first assume (e.g. replacing any binary items in a container
+    (e.g. list, tuple, dict) with the base64 dict because the container might
+    be an immutable object (i.e. a tuple). Therefore this function returns a
+    copy of any container objects it encounters with tuples replaced by lists.
+    This is O.K. because the JSON encoding will map both lists and tuples to
+    JSON arrays.
+    """
+    result = _JSONConverter(version).convert(val)
+    return json.dumps(result)
+
+
+def _ipa_obj_hook(dct):
+    if '__base64__' in dct:
+        return base64.b64decode(dct['__base64__'])
+    elif '__datetime__' in dct:
+        return datetime.datetime.strptime(dct['__datetime__'],
+                                          LDAP_GENERALIZED_TIME_FORMAT)
+    elif '__dns_name__' in dct:
+        return DNSName(dct['__dns_name__'])
     else:
-        return val
+        return dct
 
 
 def json_decode_binary(val):
-    '''
+    """
     JSON cannot transport binary data. In order to transport binary data we
     convert binary data to a form like this:
 
-   {'__base64__' : base64_encoding_of_binary_value}
+    {'__base64__' : base64_encoding_of_binary_value}
 
-   see json_encode_binary()
+    see json_encode_binary()
 
     After JSON had decoded the JSON stream back into a Python object we must
     recursively scan the object looking for any dicts which might represent
@@ -345,31 +418,11 @@ def json_decode_binary(val):
     don't modify objects in place because of side effects which may be
     dangerous. Thus we elect to spend a few more cycles and avoid the
     possibility of unintended side effects in favor of robustness.
-    '''
+    """
+    if isinstance(val, bytes):
+        val = val.decode('utf-8')
 
-    if isinstance(val, dict):
-        if '__base64__' in val:
-            return base64.b64decode(val['__base64__'])
-        elif '__datetime__' in val:
-            return datetime.datetime.strptime(val['__datetime__'],
-                                              LDAP_GENERALIZED_TIME_FORMAT)
-        elif '__dns_name__' in val:
-            return DNSName(val['__dns_name__'])
-        else:
-            return dict((k, json_decode_binary(v)) for k, v in val.items())
-    elif isinstance(val, list):
-        return tuple(json_decode_binary(v) for v in val)
-    else:
-        if isinstance(val, bytes):
-            try:
-                return val.decode('utf-8')
-            except UnicodeDecodeError:
-                raise ConversionError(
-                    name=val,
-                    error='incorrect type'
-                )
-        else:
-            return val
+    return json.loads(val, object_hook=_ipa_obj_hook)
 
 
 def decode_fault(e, encoding='UTF-8'):
@@ -1105,27 +1158,27 @@ def __request(self, name, args):
         payload = json_encode_binary(payload, version)
 
         if self.__verbose >= 2:
-            root_logger.info('Request: %s',
-                             json.dumps(payload, sort_keys=True, indent=4))
+            root_logger.info(
+                'Request: %s',
+                json.dumps(json.loads(payload), sort_keys=True, indent=4)
+            )
 
         response = self.__transport.request(
             self.__host,
             self.__handler,
-            json.dumps(payload).encode('utf-8'),
+            payload.encode('utf-8'),
             verbose=self.__verbose >= 3,
         )
 
         try:
-            response = json_decode_binary(
-                json.loads(response.decode('utf-8')))
+            response = json_decode_binary(response)
         except ValueError as e:
             raise JSONError(error=str(e))
 
         if self.__verbose >= 2:
             root_logger.info(
                 'Response: %s',
-                json.dumps(json_encode_binary(response, version),
-                           sort_keys=True, indent=4)
+                json.dumps(response, sort_keys=True, indent=4)
             )
         error = response.get('error')
         if error:
diff --git a/ipaserver/rpcserver.py b/ipaserver/rpcserver.py
index 357e836..10b4a6d 100644
--- a/ipaserver/rpcserver.py
+++ b/ipaserver/rpcserver.py
@@ -25,8 +25,8 @@
 
 from xml.sax.saxutils import escape
 import os
-import json
 import traceback
+
 import gssapi
 import requests
 
@@ -483,13 +483,12 @@ def marshal(self, result, error, _id=None,
             principal=unicode(principal),
             version=unicode(VERSION),
         )
-        response = json_encode_binary(response, version)
-        dump = json.dumps(response, sort_keys=True, indent=4)
+        dump = json_encode_binary(response, version)
         return dump.encode('utf-8')
 
     def unmarshal(self, data):
         try:
-            d = json.loads(data)
+            d = json_decode_binary(data)
         except ValueError as e:
             raise JSONError(error=e)
         if not isinstance(d, dict):
@@ -498,7 +497,6 @@ def unmarshal(self, data):
             raise JSONError(error=_('Request is missing "method"'))
         if 'params' not in d:
             raise JSONError(error=_('Request is missing "params"'))
-        d = json_decode_binary(d)
         method = d['method']
         params = d['params']
         _id = d.get('id')
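
A standalone sketch (illustrative names, Python 3 semantics for brevity; not
the patch's actual classes) of the pattern patch 1 combines: an O(1)
type-dispatch dict whose __missing__ walks the MRO and caches the hit, plus
json.loads(object_hook=...) so the decoded tree is traversed exactly once:

    import base64
    import json

    class TypeDispatch(dict):
        """Map a value's class to a converter; cache subclass hits via MRO."""
        def __missing__(self, typ):
            for cls in typ.__mro__:
                if cls in self:
                    self[typ] = self[cls]  # cache for the next lookup
                    return self[cls]
            raise TypeError(typ)

    def prime(obj, table):
        # one O(1) dict lookup instead of a chain of isinstance() calls
        return table[obj.__class__](obj)

    table = TypeDispatch({
        str: lambda v: v,
        int: lambda v: v,
        bytes: lambda v: {'__base64__': base64.b64encode(v).decode('ascii')},
        list: lambda v: [prime(i, table) for i in v],
        dict: lambda v: {k: prime(i, table) for k, i in v.items()},
    })

    def obj_hook(dct):
        # json.loads calls this once per decoded JSON object, so no second
        # tree traversal and no isinstance() checks are needed afterwards
        if '__base64__' in dct:
            return base64.b64decode(dct['__base64__'])
        return dct

    wire = json.dumps(prime({'krbkey': b'\x00\x01', 'uid': ['jdoe']}, table))
    roundtrip = json.loads(wire, object_hook=obj_hook)
    assert roundtrip == {'krbkey': b'\x00\x01', 'uid': ['jdoe']}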

From 8bb297177e07f86bb72be4121e05920837e6eb97 Mon Sep 17 00:00:00 2001
From: Christian Heimes <chei...@redhat.com>
Date: Mon, 13 Feb 2017 10:46:36 +0100
Subject: [PATCH 2/5] Convert list to tuples

Some tests assume that the JSON deserializer returns tuples instead of
lists. I don't think this is necessary, but let's keep the tests passing
for now. A short sketch below shows which lists are actually affected.
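
A small illustration (a sketch, not the patch code verbatim) of the scope of
this conversion: json's object_hook only sees JSON objects, so only list
values stored in a dict become tuples, while lists nested directly inside
lists stay lists; this is also why a test is adjusted in a later patch:

    import json

    def hook(dct):
        # same idea as the hook below: turn list values of a dict into tuples
        for key, value in dct.items():
            if value.__class__ is list:
                dct[key] = tuple(value)
        return dct

    data = json.loads('{"flat": [1, 2], "nested": [[1, 2], [3]]}',
                      object_hook=hook)
    assert data['flat'] == (1, 2)           # dict value: converted
    assert data['nested'] == ([1, 2], [3])  # inner lists: untouched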

Signed-off-by: Christian Heimes <chei...@redhat.com>
---
 ipalib/rpc.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/ipalib/rpc.py b/ipalib/rpc.py
index d8207dc..8e9d1e3 100644
--- a/ipalib/rpc.py
+++ b/ipalib/rpc.py
@@ -388,7 +388,7 @@ def json_encode_binary(val, version):
     return json.dumps(result)
 
 
-def _ipa_obj_hook(dct):
+def _ipa_obj_hook(dct, _iteritems=six.iteritems, _list=list):
     if '__base64__' in dct:
         return base64.b64decode(dct['__base64__'])
     elif '__datetime__' in dct:
@@ -397,6 +397,10 @@ def _ipa_obj_hook(dct):
     elif '__dns_name__' in dct:
         return DNSName(dct['__dns_name__'])
     else:
+        # XXX tests assume tuples. Is this really necessary?
+        for k, v in _iteritems(dct):
+            if v.__class__ is _list:
+                dct[k] = tuple(v)
         return dct
 
 

From 2296ed4d526725685dae542c21b94f1eed663982 Mon Sep 17 00:00:00 2001
From: Christian Heimes <chei...@redhat.com>
Date: Mon, 13 Feb 2017 19:09:14 +0100
Subject: [PATCH 3/5] Pretty print JSON in debug mode (debug level >= 2)
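
A hypothetical usage sketch of the new flag (the pretty_print parameter is
added in the diff below; the payload and version string here are made up and
the import assumes a FreeIPA environment):

    from ipalib.rpc import json_encode_binary

    payload = {'method': u'ping', 'params': ([], {})}

    wire = json_encode_binary(payload, u'2.164')                      # fast path
    debug = json_encode_binary(payload, u'2.164', pretty_print=True)  # slow path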

Signed-off-by: Christian Heimes <chei...@redhat.com>
---
 ipalib/rpc.py          | 100 ++++++++++++++++++++++++++++---------------------
 ipaserver/rpcserver.py |   4 +-
 2 files changed, 60 insertions(+), 44 deletions(-)

diff --git a/ipalib/rpc.py b/ipalib/rpc.py
index 8e9d1e3..b4e5ecb 100644
--- a/ipalib/rpc.py
+++ b/ipalib/rpc.py
@@ -274,13 +274,38 @@ def xml_dumps(params, version, methodname=None, methodresponse=False,
     )
 
 
-class _JSONConverter(dict):
+class _JSONPrimer(dict):
+    """Fast JSON primer and pre-converter
+
+    Prepare a data structure for JSON serialization. In an ideal world, priming
+    could be handled by the default hook of json.dumps(). Unfortunately the
+    hook treats Python 2 str as text while FreeIPA considers str as bytes.
+
+    The primer uses a couple of tricks to achieve maximum performance:
+
+    * O(1) type lookup instead of O(n) chain of costly isinstance() calls
+    * __missing__ and __mro__ with caching to handle subclasses
+    * inlined code with minor code duplication
+    * function default arguments to turn global into local lookups
+    * on-demand lookup of client capabilities with cache
+
+    Depending on the client version number, the primer converts:
+
+    * bytes -> {'__base64__': b64encode}
+    * datetime -> {'__datetime__': LDAP_GENERALIZED_TIME}
+    * DNSName -> {'__dns_name__': unicode}
+
+    The _ipa_obj_hook() function deserializes the marked JSON objects back to
+    bytes, datetime and DNSName.
+
+    :see: _ipa_obj_hook
+    """
     __slots__ = ('version', '_cap_datetime', '_cap_dnsname')
 
     _identity = object()
 
     def __init__(self, version, _identity=_identity):
-        super(_JSONConverter, self).__init__()
+        super(_JSONPrimer, self).__init__()
         self.version = version
         self._cap_datetime = None
         self._cap_dnsname = None
@@ -365,30 +390,28 @@ def _enc_dict(self, val, _identity=_identity, _iteritems=six.iteritems):
         return result
 
 
-def json_encode_binary(val, version):
-    """
-    JSON cannot encode binary values. We encode binary values in Python str
-    objects and text in Python unicode objects. In order to allow a binary
-    object to be passed through JSON we base64 encode it thus converting it to
-    text which JSON can transport. To assure we recognize the value is a base64
-    encoded representation of the original binary value and not confuse it with
-    other text we convert the binary value to a dict in this form:
-
-    {'__base64__' : base64_encoding_of_binary_value}
-
-    This modification of the original input value cannot be done "in place" as
-    one might first assume (e.g. replacing any binary items in a container
-    (e.g. list, tuple, dict) with the base64 dict because the container might
-    be an immutable object (i.e. a tuple). Therefore this function returns a
-    copy of any container objects it encounters with tuples replaced by lists.
-    This is O.K. because the JSON encoding will map both lists and tuples to
-    JSON arrays.
+def json_encode_binary(val, version, pretty_print=False):
+    """Serialize a Python object structure to JSON
+
+    :param object val: Python object structure
+    :param str version: client version
+    :param bool pretty_print: indent and sort JSON (warning: slow!)
+    :return: text
+    :note: pretty printing triggers a slow path in Python's JSON module. Only
+           use pretty_print in debug mode.
     """
-    result = _JSONConverter(version).convert(val)
-    return json.dumps(result)
+    result = _JSONPrimer(version).convert(val)
+    if pretty_print:
+        return json.dumps(result, indent=4, sort_keys=True)
+    else:
+        return json.dumps(result)
 
 
 def _ipa_obj_hook(dct, _iteritems=six.iteritems, _list=list):
+    """JSON object hook
+
+    :see: _JSONPrimer
+    """
     if '__base64__' in dct:
         return base64.b64decode(dct['__base64__'])
     elif '__datetime__' in dct:
@@ -405,23 +428,12 @@ def _ipa_obj_hook(dct, _iteritems=six.iteritems, _list=list):
 
 
 def json_decode_binary(val):
-    """
-    JSON cannot transport binary data. In order to transport binary data we
-    convert binary data to a form like this:
-
-    {'__base64__' : base64_encoding_of_binary_value}
-
-    see json_encode_binary()
-
-    After JSON had decoded the JSON stream back into a Python object we must
-    recursively scan the object looking for any dicts which might represent
-    binary values and replace the dict containing the base64 encoding of the
-    binary value with the decoded binary value. Unlike the encoding problem
-    where the input might consist of immutable object, all JSON decoded
-    container are mutable so the conversion could be done in place. However we
-    don't modify objects in place because of side effects which may be
-    dangerous. Thus we elect to spend a few more cycles and avoid the
-    possibility of unintended side effects in favor of robustness.
+    """Convert serialized JSON string back to Python data structure
+
+    :param val: JSON string
+    :type val: str, bytes
+    :return: Python data structure
+    :see: _ipa_obj_hook, _JSONPrimer
     """
     if isinstance(val, bytes):
         val = val.decode('utf-8')
@@ -1157,14 +1169,16 @@ def __init__(self, uri, transport, encoding, verbose, allow_none):
         self._ServerProxy__transport = transport
 
     def __request(self, name, args):
+        print_json = self.__verbose >= 2
         payload = {'method': unicode(name), 'params': args, 'id': 0}
         version = args[1].get('version', VERSION_WITHOUT_CAPABILITIES)
-        payload = json_encode_binary(payload, version)
+        payload = json_encode_binary(
+            payload, version, pretty_print=print_json)
 
-        if self.__verbose >= 2:
+        if print_json:
             root_logger.info(
                 'Request: %s',
-                json.dumps(json.loads(payload), sort_keys=True, indent=4)
+                payload
             )
 
         response = self.__transport.request(
@@ -1179,7 +1193,7 @@ def __request(self, name, args):
         except ValueError as e:
             raise JSONError(error=str(e))
 
-        if self.__verbose >= 2:
+        if print_json:
             root_logger.info(
                 'Response: %s',
                 json.dumps(response, sort_keys=True, indent=4)
diff --git a/ipaserver/rpcserver.py b/ipaserver/rpcserver.py
index 10b4a6d..f5c520f 100644
--- a/ipaserver/rpcserver.py
+++ b/ipaserver/rpcserver.py
@@ -483,7 +483,9 @@ def marshal(self, result, error, _id=None,
             principal=unicode(principal),
             version=unicode(VERSION),
         )
-        dump = json_encode_binary(response, version)
+        dump = json_encode_binary(
+            response, version, pretty_print=self.api.env.debug >= 2
+        )
         return dump.encode('utf-8')
 
     def unmarshal(self, data):

From cb644693b809f7d0a5fae78df0255786fc07dcc0 Mon Sep 17 00:00:00 2001
From: Christian Heimes <chei...@redhat.com>
Date: Tue, 14 Feb 2017 09:33:49 +0100
Subject: [PATCH 4/5] Fix test, nested lists are no longer converted to nested
 tuples

---
 ipatests/test_ipaserver/test_rpcserver.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ipatests/test_ipaserver/test_rpcserver.py b/ipatests/test_ipaserver/test_rpcserver.py
index 7ee94d3..d8f6015 100644
--- a/ipatests/test_ipaserver/test_rpcserver.py
+++ b/ipatests/test_ipaserver/test_rpcserver.py
@@ -257,7 +257,7 @@ def test_unmarshal(self):
         assert unicode(e.error) == 'params[1] (aka options) must be a dict'
 
         # Test with valid values:
-        args = (u'jdoe', )
+        args = [u'jdoe']
         options = dict(givenname=u'John', sn='Doe')
         d = dict(method=u'user_add', params=(args, options), id=18)
         assert o.unmarshal(json.dumps(d)) == (u'user_add', args, options, 18)

From 320568c001c2600f7ff954292e93c1f66f3e9c9e Mon Sep 17 00:00:00 2001
From: Christian Heimes <chei...@redhat.com>
Date: Tue, 14 Feb 2017 13:19:19 +0100
Subject: [PATCH 5/5] Explain more performance tricks in doc string

Signed-off-by: Christian Heimes <chei...@redhat.com>
---
 ipalib/rpc.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/ipalib/rpc.py b/ipalib/rpc.py
index b4e5ecb..b91e8e2 100644
--- a/ipalib/rpc.py
+++ b/ipalib/rpc.py
@@ -285,9 +285,12 @@ class _JSONPrimer(dict):
 
     * O(1) type lookup instead of O(n) chain of costly isinstance() calls
     * __missing__ and __mro__ with caching to handle subclasses
-    * inlined code with minor code duplication
+    * inline code with minor code duplication (func lookup in enc_list/dict)
+    * avoid surplus function calls (e.g. func is _identity, obj.__class__
+      instead of type(obj))
     * function default arguments to turn global into local lookups
-    * on-demand lookup of client capabilities with cache
+    * avoid re-creation of bound method objects (e.g. result.append)
+    * on-demand lookup of client capabilities with cached values
 
     Depending on the client version number, the primer converts:
 
-- 