[
https://issues.apache.org/jira/browse/ARROW-1854?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16269852#comment-16269852
]
ASF GitHub Bot commented on ARROW-1854:
---------------------------------------
wesm closed pull request #1360: ARROW-1854: [Python] Use pickle to serialize
numpy arrays of objects.
URL: https://github.com/apache/arrow/pull/1360
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index bd31b21c1..a245fe679 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -125,6 +125,7 @@
localfs = LocalFileSystem.get_instance()
from pyarrow.serialization import (_default_serialization_context,
+ pandas_serialization_context,
register_default_serialization_handlers)
import pyarrow.types as types
diff --git a/python/pyarrow/serialization.pxi b/python/pyarrow/serialization.pxi
index bb266b2f9..faf164b3e 100644
--- a/python/pyarrow/serialization.pxi
+++ b/python/pyarrow/serialization.pxi
@@ -57,6 +57,22 @@ cdef class SerializationContext:
self.custom_serializers = dict()
self.custom_deserializers = dict()
+ def clone(self):
+ """
+ Return copy of this SerializationContext
+
+ Returns
+ -------
+ clone : SerializationContext
+ """
+ result = SerializationContext()
+ result.type_to_type_id = self.type_to_type_id.copy()
+ result.whitelisted_types = self.whitelisted_types.copy()
+ result.custom_serializers = self.custom_serializers.copy()
+ result.custom_deserializers = self.custom_deserializers.copy()
+
+ return result
+
def register_type(self, type_, type_id,
custom_serializer=None, custom_deserializer=None):
"""EXPERIMENTAL: Add type to the list of types we can serialize.
diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py
index ab25b63d5..08e6cce75 100644
--- a/python/pyarrow/serialization.py
+++ b/python/pyarrow/serialization.py
@@ -22,7 +22,7 @@
import numpy as np
from pyarrow import serialize_pandas, deserialize_pandas
-from pyarrow.lib import _default_serialization_context
+from pyarrow.lib import _default_serialization_context, frombuffer
try:
import cloudpickle
@@ -30,6 +30,28 @@
cloudpickle = pickle
+# ----------------------------------------------------------------------
+# Set up serialization for numpy with dtype object (primitive types are
+# handled efficiently with Arrow's Tensor facilities, see
+# python_to_arrow.cc)
+
+def _serialize_numpy_array_list(obj):
+ return obj.tolist(), obj.dtype.str
+
+
+def _deserialize_numpy_array_list(data):
+ return np.array(data[0], dtype=np.dtype(data[1]))
+
+
+def _serialize_numpy_array_pickle(obj):
+ pickled = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
+ return frombuffer(pickled)
+
+
+def _deserialize_numpy_array_pickle(data):
+ return pickle.loads(memoryview(data))
+
+
def register_default_serialization_handlers(serialization_context):
# ----------------------------------------------------------------------
@@ -80,21 +102,10 @@ def _deserialize_default_dict(data):
custom_serializer=cloudpickle.dumps,
custom_deserializer=cloudpickle.loads)
- # ----------------------------------------------------------------------
- # Set up serialization for numpy with dtype object (primitive types are
- # handled efficiently with Arrow's Tensor facilities, see
- # python_to_arrow.cc)
-
- def _serialize_numpy_array(obj):
- return obj.tolist(), obj.dtype.str
-
- def _deserialize_numpy_array(data):
- return np.array(data[0], dtype=np.dtype(data[1]))
-
serialization_context.register_type(
np.ndarray, 'np.array',
- custom_serializer=_serialize_numpy_array,
- custom_deserializer=_deserialize_numpy_array)
+ custom_serializer=_serialize_numpy_array_list,
+ custom_deserializer=_deserialize_numpy_array_list)
# ----------------------------------------------------------------------
# Set up serialization for pandas Series and DataFrame
@@ -153,3 +164,10 @@ def _deserialize_torch_tensor(data):
register_default_serialization_handlers(_default_serialization_context)
+
+pandas_serialization_context = _default_serialization_context.clone()
+
+pandas_serialization_context.register_type(
+ np.ndarray, 'np.array',
+ custom_serializer=_serialize_numpy_array_pickle,
+ custom_deserializer=_deserialize_numpy_array_pickle)
diff --git a/python/pyarrow/tests/test_serialization.py
b/python/pyarrow/tests/test_serialization.py
index d06beeac9..6d85621d4 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -212,11 +212,11 @@ def make_serialization_context():
serialization_context = make_serialization_context()
-def serialization_roundtrip(value, f):
+def serialization_roundtrip(value, f, ctx=serialization_context):
f.seek(0)
- pa.serialize_to(value, f, serialization_context)
+ pa.serialize_to(value, f, ctx)
f.seek(0)
- result = pa.deserialize_from(f, None, serialization_context)
+ result = pa.deserialize_from(f, None, ctx)
assert_equal(value, result)
_check_component_roundtrip(value)
@@ -249,6 +249,7 @@ def test_primitive_serialization(large_memory_map):
with pa.memory_map(large_memory_map, mode="r+") as mmap:
for obj in PRIMITIVE_OBJECTS:
serialization_roundtrip(obj, mmap)
+ serialization_roundtrip(obj, mmap, pa.pandas_serialization_context)
def test_serialize_to_buffer():
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> [Python] Improve performance of serializing object dtype ndarrays
> -----------------------------------------------------------------
>
> Key: ARROW-1854
> URL: https://issues.apache.org/jira/browse/ARROW-1854
> Project: Apache Arrow
> Issue Type: Improvement
> Components: Python
> Reporter: Wes McKinney
> Assignee: Wes McKinney
> Labels: pull-request-available
> Fix For: 0.8.0
>
>
> I haven't looked carefully at the hot path for this, but I would expect these
> statements to have roughly the same performance (offloading the ndarray
> serialization to pickle)
> {code}
> In [1]: import pickle
> In [2]: import numpy as np
> In [3]: import pyarrow as pa
> In [4]: arr = np.array(['foo', 'bar', None] * 100000, dtype=object)
> In [5]: timeit serialized = pa.serialize(arr).to_buffer()
> 10 loops, best of 3: 27.1 ms per loop
> In [6]: timeit pickled = pickle.dumps(arr)
> 100 loops, best of 3: 6.03 ms per loop
> {code}
> [~robertnishihara] [~pcmoritz] I encountered this while working on
> ARROW-1783, but it can likely be resolved independently
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)