[ https://issues.apache.org/jira/browse/ARROW-2121?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16369640#comment-16369640 ]

ASF GitHub Bot commented on ARROW-2121:
---------------------------------------

wesm closed pull request #1581: ARROW-2121: [Python] Handle object arrays 
directly in pandas serializer.
URL: https://github.com/apache/arrow/pull/1581
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/python/README-benchmarks.md b/python/README-benchmarks.md
index 3fecb35cb..60fa88f4a 100644
--- a/python/README-benchmarks.md
+++ b/python/README-benchmarks.md
@@ -41,8 +41,6 @@ First you have to install ASV's development version:
 pip install git+https://github.com/airspeed-velocity/asv.git
 ```
 
-<!--- TODO remove the above once 
https://github.com/airspeed-velocity/asv/pull/611 is merged -->
-
 Then you need to set up a few environment variables:
 
 ```shell
diff --git a/python/benchmarks/convert_pandas.py 
b/python/benchmarks/convert_pandas.py
index c4a7a59cb..244b3dcc8 100644
--- a/python/benchmarks/convert_pandas.py
+++ b/python/benchmarks/convert_pandas.py
@@ -48,3 +48,23 @@ def setup(self, n, dtype):
 
     def time_to_series(self, n, dtype):
         self.arrow_data.to_pandas()
+
+
+class ZeroCopyPandasRead(object):
+
+    def setup(self):
+        # Transpose to make column-major
+        values = np.random.randn(10, 100000)
+
+        df = pd.DataFrame(values.T)
+        ctx = pa.default_serialization_context()
+
+        self.serialized = ctx.serialize(df)
+        self.as_buffer = self.serialized.to_buffer()
+        self.as_components = self.serialized.to_components()
+
+    def time_deserialize_from_buffer(self):
+        pa.deserialize(self.as_buffer)
+
+    def time_deserialize_from_components(self):
+        pa.deserialize_components(self.as_components)
diff --git a/python/doc/source/ipc.rst b/python/doc/source/ipc.rst
index 9bf93ffe8..bce8b1ed1 100644
--- a/python/doc/source/ipc.rst
+++ b/python/doc/source/ipc.rst
@@ -317,9 +317,8 @@ An object can be reconstructed from its component-based 
representation using
 Serializing pandas Objects
 --------------------------
 
-We provide a serialization context that has optimized handling of pandas
-objects like ``DataFrame`` and ``Series``. This can be created with
-``pyarrow.pandas_serialization_context()``. Combined with component-based
+The default serialization context has optimized handling of pandas
+objects like ``DataFrame`` and ``Series``. Combined with component-based
 serialization above, this enables zero-copy transport of pandas DataFrame
 objects not containing any Python objects:
 
@@ -327,7 +326,7 @@ objects not containing any Python objects:
 
    import pandas as pd
    df = pd.DataFrame({'a': [1, 2, 3, 4, 5]})
-   context = pa.pandas_serialization_context()
+   context = pa.default_serialization_context()
    serialized_df = context.serialize(df)
    df_components = serialized_df.to_components()
    original_df = context.deserialize_components(df_components)
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index d95954ed3..15a37ca10 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -125,7 +125,6 @@
 localfs = LocalFileSystem.get_instance()
 
 from pyarrow.serialization import (default_serialization_context,
-                                   pandas_serialization_context,
                                    register_default_serialization_handlers,
                                    register_torch_serialization_handlers)
 
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index e8fa83fe7..6d4bf5e78 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -27,7 +27,7 @@
 import six
 
 import pyarrow as pa
-from pyarrow.compat import PY2, zip_longest  # noqa
+from pyarrow.compat import builtin_pickle, PY2, zip_longest  # noqa
 
 
 def infer_dtype(column):
@@ -424,11 +424,19 @@ def dataframe_to_serialized_dict(frame):
             block_data.update(dictionary=values.categories,
                               ordered=values.ordered)
             values = values.codes
-
         block_data.update(
             placement=block.mgr_locs.as_array,
             block=values
         )
+
+        # If we are dealing with an object array, pickle it instead. Note that
+        # we do not use isinstance here because _int.CategoricalBlock is a
+        # subclass of _int.ObjectBlock.
+        if type(block) == _int.ObjectBlock:
+            block_data['object'] = None
+            block_data['block'] = builtin_pickle.dumps(
+                values, protocol=builtin_pickle.HIGHEST_PROTOCOL)
+
         blocks.append(block_data)
 
     return {
@@ -463,6 +471,9 @@ def _reconstruct_block(item):
         block = _int.make_block(block_arr, placement=placement,
                                 klass=_int.DatetimeTZBlock,
                                 dtype=dtype)
+    elif 'object' in item:
+        block = _int.make_block(builtin_pickle.loads(block_arr),
+                                placement=placement, klass=_int.ObjectBlock)
     else:
         block = _int.make_block(block_arr, placement=placement)
 
diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py
index c8b72b748..bdf753579 100644
--- a/python/pyarrow/serialization.py
+++ b/python/pyarrow/serialization.py
@@ -57,10 +57,6 @@ def _load_pickle_from_buffer(data):
         return builtin_pickle.loads(as_memoryview)
 
 
-_serialize_numpy_array_pickle = _pickle_to_buffer
-_deserialize_numpy_array_pickle = _load_pickle_from_buffer
-
-
 # ----------------------------------------------------------------------
 # pandas-specific serialization matters
 
@@ -190,11 +186,3 @@ def default_serialization_context():
 
 
 register_default_serialization_handlers(_default_serialization_context)
-
-
-def pandas_serialization_context():
-    context = default_serialization_context()
-    context.register_type(np.ndarray, 'np.array',
-                          custom_serializer=_serialize_numpy_array_pickle,
-                          custom_deserializer=_deserialize_numpy_array_pickle)
-    return context
diff --git a/python/pyarrow/tests/test_convert_pandas.py 
b/python/pyarrow/tests/test_convert_pandas.py
index 95137ffb2..f7718f06a 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1613,7 +1613,7 @@ def _fully_loaded_dataframe_example():
 
 
 def _check_serialize_components_roundtrip(df):
-    ctx = pa.pandas_serialization_context()
+    ctx = pa.default_serialization_context()
 
     components = ctx.serialize(df).to_components()
     deserialized = ctx.deserialize_components(components)
diff --git a/python/pyarrow/tests/test_serialization.py 
b/python/pyarrow/tests/test_serialization.py
index 20c195a4b..0917172d2 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -284,8 +284,6 @@ def custom_deserializer(serialized_obj):
 def test_primitive_serialization(large_buffer):
     for obj in PRIMITIVE_OBJECTS:
         serialization_roundtrip(obj, large_buffer)
-        serialization_roundtrip(obj, large_buffer,
-                                pa.pandas_serialization_context())
 
 
 def test_serialize_to_buffer():


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> Consider special casing object arrays in pandas serializers.
> ------------------------------------------------------------
>
>                 Key: ARROW-2121
>                 URL: https://issues.apache.org/jira/browse/ARROW-2121
>             Project: Apache Arrow
>          Issue Type: Improvement
>          Components: Python
>            Reporter: Robert Nishihara
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.9.0
>
>




--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to