[ 
https://issues.apache.org/jira/browse/ARROW-2451?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16436413#comment-16436413
 ] 

ASF GitHub Bot commented on ARROW-2451:
---------------------------------------

robertnishihara closed pull request #1887: ARROW-2451: [Python] Handle non-object arrays more efficiently in custom serializer.
URL: https://github.com/apache/arrow/pull/1887
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py
index 6c8df350bf..b3ef625de0 100644
--- a/python/pyarrow/serialization.py
+++ b/python/pyarrow/serialization.py
@@ -37,11 +37,22 @@
 # python_to_arrow.cc)
 
 def _serialize_numpy_array_list(obj):
-    return obj.tolist(), obj.dtype.str
+    if obj.dtype.str != '|O':
+        # Make the array c_contiguous if necessary so that we can call change
+        # the view.
+        if not obj.flags.c_contiguous:
+            obj = np.ascontiguousarray(obj)
+        return obj.view('uint8'), obj.dtype.str
+    else:
+        return obj.tolist(), obj.dtype.str
 
 
 def _deserialize_numpy_array_list(data):
-    return np.array(data[0], dtype=np.dtype(data[1]))
+    if data[1] != '|O':
+        assert data[0].dtype == np.uint8
+        return data[0].view(data[1])
+    else:
+        return np.array(data[0], dtype=np.dtype(data[1]))
 
 
 def _pickle_to_buffer(x):
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index 16b477bc99..ba32330b61 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -315,9 +315,12 @@ def test_default_dict_serialization(large_buffer):
 
 def test_numpy_serialization(large_buffer):
     for t in ["bool", "int8", "uint8", "int16", "uint16", "int32",
-              "uint32", "float16", "float32", "float64"]:
+              "uint32", "float16", "float32", "float64", "<U1", "<U2", "<U3",
+              "<U4", "|S1", "|S2", "|S3", "|S4", "|O"]:
         obj = np.random.randint(0, 10, size=(100, 100)).astype(t)
         serialization_roundtrip(obj, large_buffer)
+        obj = obj[1:99, 10:90]
+        serialization_roundtrip(obj, large_buffer)
 
 
 def test_datetime_serialization(large_buffer):


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> Handle more dtypes efficiently in custom numpy array serializer.
> ----------------------------------------------------------------
>
>                 Key: ARROW-2451
>                 URL: https://issues.apache.org/jira/browse/ARROW-2451
>             Project: Apache Arrow
>          Issue Type: Improvement
>          Components: Python
>            Reporter: Robert Nishihara
>            Assignee: Robert Nishihara
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: JS-0.4.0
>
>
> Right now certain dtypes like bool or fixed length strings are serialized as 
> lists, which is inefficient. We can handle these more efficiently by casting 
> them to uint8 and saving the original dtype as additional data.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to