This is an automated email from the ASF dual-hosted git repository.

robertnishihara pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 685147c  ARROW-2451: [Python] Handle non-object arrays more efficiently in custom serializer.
685147c is described below

commit 685147c47194fa11a349790fe8a54b34841b3ef8
Author: Robert Nishihara <robertnishih...@gmail.com>
AuthorDate: Thu Apr 12 15:38:46 2018 -0700

    ARROW-2451: [Python] Handle non-object arrays more efficiently in custom serializer.
    
    **Before this PR**
    
    ```python
    import numpy as np
    import pyarrow as pa
    
    x = np.array(10 ** 8 * [True, False])
    %time serialized_obj = pa.serialize(x).to_buffer()  # 10.4s
    %time new_x = pa.deserialize(serialized_obj)  # 8.94s
    
    x = np.array([str(i) for i in range(10 ** 7)])
    %time serialized_obj = pa.serialize(x).to_buffer()  # 2.24s
    %time new_x = pa.deserialize(serialized_obj)  # 2.03s
    ```
    
    **After this PR**
    
    ```python
    import numpy as np
    import pyarrow as pa
    
    x = np.array(10 ** 8 * [True, False])
    %time serialized_obj = pa.serialize(x).to_buffer()  # 117ms
    %time new_x = pa.deserialize(serialized_obj)  # 265us
    
    x = np.array([str(i) for i in range(10 ** 7)])
    %time serialized_obj = pa.serialize(x).to_buffer()  # 174ms
    %time new_x = pa.deserialize(serialized_obj)  # 13.2ms
    ```
    
    cc @devin-petersohn @mitar @pcmoritz
    
    Author: Robert Nishihara <robertnishih...@gmail.com>
    
    Closes #1887 from robertnishihara/pyarrowbools and squashes the following commits:
    
    831afea <Robert Nishihara> Handle non-object arrays more efficiently in custom serializer.
---
 python/pyarrow/serialization.py            | 15 +++++++++++++--
 python/pyarrow/tests/test_serialization.py |  5 ++++-
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py
index 6c8df35..b3ef625 100644
--- a/python/pyarrow/serialization.py
+++ b/python/pyarrow/serialization.py
@@ -37,11 +37,22 @@ except ImportError:
 # python_to_arrow.cc)
 
 def _serialize_numpy_array_list(obj):
-    return obj.tolist(), obj.dtype.str
+    if obj.dtype.str != '|O':
+        # Make the array c_contiguous if necessary so that we can change
+        # the view.
+        if not obj.flags.c_contiguous:
+            obj = np.ascontiguousarray(obj)
+        return obj.view('uint8'), obj.dtype.str
+    else:
+        return obj.tolist(), obj.dtype.str
 
 
 def _deserialize_numpy_array_list(data):
-    return np.array(data[0], dtype=np.dtype(data[1]))
+    if data[1] != '|O':
+        assert data[0].dtype == np.uint8
+        return data[0].view(data[1])
+    else:
+        return np.array(data[0], dtype=np.dtype(data[1]))
 
 
 def _pickle_to_buffer(x):
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index 16b477b..ba32330 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -315,9 +315,12 @@ def test_default_dict_serialization(large_buffer):
 
 def test_numpy_serialization(large_buffer):
     for t in ["bool", "int8", "uint8", "int16", "uint16", "int32",
-              "uint32", "float16", "float32", "float64"]:
+              "uint32", "float16", "float32", "float64", "<U1", "<U2", "<U3",
+              "<U4", "|S1", "|S2", "|S3", "|S4", "|O"]:
         obj = np.random.randint(0, 10, size=(100, 100)).astype(t)
         serialization_roundtrip(obj, large_buffer)
+        obj = obj[1:99, 10:90]
+        serialization_roundtrip(obj, large_buffer)
 
 
 def test_datetime_serialization(large_buffer):

-- 
To stop receiving notification emails like this one, please contact
robertnishih...@apache.org.

Reply via email to