This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 155bf076204dcb6bf1f68a556e9243556f5e4ebb
Author: Wes McKinney <wes.mckin...@twosigma.com>
AuthorDate: Tue Nov 28 20:06:18 2017 -0500

    ARROW-1854: [Python] Use pickle to serialize numpy arrays of objects.

    **Just posting this for discussion.** See the preceding discussion on https://issues.apache.org/jira/browse/ARROW-1854.

    I think the ideal way to solve this would actually be to improve our handling of lists, which should be possible given that pickle seems to outperform us by 6x according to the benchmarks in https://issues.apache.org/jira/browse/ARROW-1854.

    Note that the implementation in this PR will not handle numpy arrays of user-defined classes because it will not fall back to cloudpickle when needed.

    cc @pcmoritz @wesm

    Author: Wes McKinney <wes.mckin...@twosigma.com>
    Author: Robert Nishihara <robertnishih...@gmail.com>

    Closes #1360 from robertnishihara/numpyobject and squashes the following commits:

    c37a0a08 [Wes McKinney] Fix flake
    51915032 [Wes McKinney] Fix post rebase
    43f2c805 [Wes McKinney] Add SerializationContext.clone method. Add pandas_serialization_context member that uses pickle for NumPy arrays with unsupported tensor types
    c9440231 [Wes McKinney] Use pickle.HIGHEST_PROTOCOL, convert to Buffer then memoryview for more memory-efficient transport
    cf719c3f [Robert Nishihara] Use pickle to serialize numpy arrays of objects.
--- python/pyarrow/__init__.py | 1 + python/pyarrow/serialization.pxi | 16 +++++++++++ python/pyarrow/serialization.py | 46 +++++++++++++++++++++--------- python/pyarrow/tests/test_serialization.py | 7 +++-- 4 files changed, 53 insertions(+), 17 deletions(-) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index bd31b21..a245fe6 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -125,6 +125,7 @@ from pyarrow.ipc import (Message, MessageReader, localfs = LocalFileSystem.get_instance() from pyarrow.serialization import (_default_serialization_context, + pandas_serialization_context, register_default_serialization_handlers) import pyarrow.types as types diff --git a/python/pyarrow/serialization.pxi b/python/pyarrow/serialization.pxi index bb266b2..faf164b 100644 --- a/python/pyarrow/serialization.pxi +++ b/python/pyarrow/serialization.pxi @@ -57,6 +57,22 @@ cdef class SerializationContext: self.custom_serializers = dict() self.custom_deserializers = dict() + def clone(self): + """ + Return copy of this SerializationContext + + Returns + ------- + clone : SerializationContext + """ + result = SerializationContext() + result.type_to_type_id = self.type_to_type_id.copy() + result.whitelisted_types = self.whitelisted_types.copy() + result.custom_serializers = self.custom_serializers.copy() + result.custom_deserializers = self.custom_deserializers.copy() + + return result + def register_type(self, type_, type_id, custom_serializer=None, custom_deserializer=None): """EXPERIMENTAL: Add type to the list of types we can serialize. 
diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py index ab25b63..08e6cce 100644 --- a/python/pyarrow/serialization.py +++ b/python/pyarrow/serialization.py @@ -22,7 +22,7 @@ import pickle import numpy as np from pyarrow import serialize_pandas, deserialize_pandas -from pyarrow.lib import _default_serialization_context +from pyarrow.lib import _default_serialization_context, frombuffer try: import cloudpickle @@ -30,6 +30,28 @@ except ImportError: cloudpickle = pickle +# ---------------------------------------------------------------------- +# Set up serialization for numpy with dtype object (primitive types are +# handled efficiently with Arrow's Tensor facilities, see +# python_to_arrow.cc) + +def _serialize_numpy_array_list(obj): + return obj.tolist(), obj.dtype.str + + +def _deserialize_numpy_array_list(data): + return np.array(data[0], dtype=np.dtype(data[1])) + + +def _serialize_numpy_array_pickle(obj): + pickled = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL) + return frombuffer(pickled) + + +def _deserialize_numpy_array_pickle(data): + return pickle.loads(memoryview(data)) + + def register_default_serialization_handlers(serialization_context): # ---------------------------------------------------------------------- @@ -80,21 +102,10 @@ def register_default_serialization_handlers(serialization_context): custom_serializer=cloudpickle.dumps, custom_deserializer=cloudpickle.loads) - # ---------------------------------------------------------------------- - # Set up serialization for numpy with dtype object (primitive types are - # handled efficiently with Arrow's Tensor facilities, see - # python_to_arrow.cc) - - def _serialize_numpy_array(obj): - return obj.tolist(), obj.dtype.str - - def _deserialize_numpy_array(data): - return np.array(data[0], dtype=np.dtype(data[1])) - serialization_context.register_type( np.ndarray, 'np.array', - custom_serializer=_serialize_numpy_array, - custom_deserializer=_deserialize_numpy_array) + 
custom_serializer=_serialize_numpy_array_list, + custom_deserializer=_deserialize_numpy_array_list) # ---------------------------------------------------------------------- # Set up serialization for pandas Series and DataFrame @@ -153,3 +164,10 @@ def register_default_serialization_handlers(serialization_context): register_default_serialization_handlers(_default_serialization_context) + +pandas_serialization_context = _default_serialization_context.clone() + +pandas_serialization_context.register_type( + np.ndarray, 'np.array', + custom_serializer=_serialize_numpy_array_pickle, + custom_deserializer=_deserialize_numpy_array_pickle) diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py index d06beea..6d85621 100644 --- a/python/pyarrow/tests/test_serialization.py +++ b/python/pyarrow/tests/test_serialization.py @@ -212,11 +212,11 @@ def make_serialization_context(): serialization_context = make_serialization_context() -def serialization_roundtrip(value, f): +def serialization_roundtrip(value, f, ctx=serialization_context): f.seek(0) - pa.serialize_to(value, f, serialization_context) + pa.serialize_to(value, f, ctx) f.seek(0) - result = pa.deserialize_from(f, None, serialization_context) + result = pa.deserialize_from(f, None, ctx) assert_equal(value, result) _check_component_roundtrip(value) @@ -249,6 +249,7 @@ def test_primitive_serialization(large_memory_map): with pa.memory_map(large_memory_map, mode="r+") as mmap: for obj in PRIMITIVE_OBJECTS: serialization_roundtrip(obj, mmap) + serialization_roundtrip(obj, mmap, pa.pandas_serialization_context) def test_serialize_to_buffer(): -- To stop receiving notification emails like this one, please contact "commits@arrow.apache.org" <commits@arrow.apache.org>.