This is an automated email from the ASF dual-hosted git repository. robertnishihara pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
     new 685147c  ARROW-2451: [Python] Handle non-object arrays more efficiently in custom serializer.
685147c is described below

commit 685147c47194fa11a349790fe8a54b34841b3ef8
Author: Robert Nishihara <robertnishih...@gmail.com>
AuthorDate: Thu Apr 12 15:38:46 2018 -0700

    ARROW-2451: [Python] Handle non-object arrays more efficiently in custom serializer.

    **Before this PR**
    ```python
    import numpy as np
    import pyarrow as pa

    x = np.array(10 ** 8 * [True, False])
    %time serialized_obj = pa.serialize(x).to_buffer() # 10.4s
    %time new_x = pa.deserialize(serialized_obj) # 8.94s

    x = np.array([str(i) for i in range(10 ** 7)])
    %time serialized_obj = pa.serialize(x).to_buffer() # 2.24s
    %time new_x = pa.deserialize(serialized_obj) # 2.03s
    ```

    **After this PR**
    ```python
    import numpy as np
    import pyarrow as pa

    x = np.array(10 ** 8 * [True, False])
    %time serialized_obj = pa.serialize(x).to_buffer() # 117ms
    %time new_x = pa.deserialize(serialized_obj) # 265us

    x = np.array([str(i) for i in range(10 ** 7)])
    %time serialized_obj = pa.serialize(x).to_buffer() # 174ms
    %time new_x = pa.deserialize(serialized_obj) # 13.2ms
    ```

    cc @devin-petersohn @mitar @pcmoritz

    Author: Robert Nishihara <robertnishih...@gmail.com>

    Closes #1887 from robertnishihara/pyarrowbools and squashes the following commits:

    831afea <Robert Nishihara> Handle non-object arrays more efficiently in custom serializer.
---
 python/pyarrow/serialization.py            | 15 +++++++++++++--
 python/pyarrow/tests/test_serialization.py |  5 ++++-
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py
index 6c8df35..b3ef625 100644
--- a/python/pyarrow/serialization.py
+++ b/python/pyarrow/serialization.py
@@ -37,11 +37,22 @@ except ImportError:
 # python_to_arrow.cc)
 
 def _serialize_numpy_array_list(obj):
-    return obj.tolist(), obj.dtype.str
+    if obj.dtype.str != '|O':
+        # Make the array c_contiguous if necessary so that we can call change
+        # the view.
+        if not obj.flags.c_contiguous:
+            obj = np.ascontiguousarray(obj)
+        return obj.view('uint8'), obj.dtype.str
+    else:
+        return obj.tolist(), obj.dtype.str
 
 
 def _deserialize_numpy_array_list(data):
-    return np.array(data[0], dtype=np.dtype(data[1]))
+    if data[1] != '|O':
+        assert data[0].dtype == np.uint8
+        return data[0].view(data[1])
+    else:
+        return np.array(data[0], dtype=np.dtype(data[1]))
 
 
 def _pickle_to_buffer(x):
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index 16b477b..ba32330 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -315,9 +315,12 @@ def test_default_dict_serialization(large_buffer):
 
 def test_numpy_serialization(large_buffer):
     for t in ["bool", "int8", "uint8", "int16", "uint16", "int32",
-              "uint32", "float16", "float32", "float64"]:
+              "uint32", "float16", "float32", "float64", "<U1", "<U2", "<U3",
+              "<U4", "|S1", "|S2", "|S3", "|S4", "|O"]:
         obj = np.random.randint(0, 10, size=(100, 100)).astype(t)
         serialization_roundtrip(obj, large_buffer)
+        obj = obj[1:99, 10:90]
+        serialization_roundtrip(obj, large_buffer)
 
 
 def test_datetime_serialization(large_buffer):

-- 
To stop receiving notification emails like this one, please contact
robertnishih...@apache.org.