This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new fd972f0  ARROW-2121: [Python] Handle object arrays directly in pandas 
serializer.
fd972f0 is described below

commit fd972f0e552ec4b85c4a0e3d8a358cbff4580fc6
Author: Robert Nishihara <robertnishih...@gmail.com>
AuthorDate: Mon Feb 19 20:55:20 2018 -0500

    ARROW-2121: [Python] Handle object arrays directly in pandas serializer.
    
    The goal here is to get the best of both the `pandas_serialization_context` 
(speed at serializing pandas dataframes containing strings and other objects) 
and the `default_serialization_context` (correctly serializing a large class of 
numpy object arrays).
    
    This PR sort of messes up the function 
`pa.pandas_compat.dataframe_to_serialized_dict`. Is that function just a helper 
function for implementing the custom pandas serializers? Or is it intended to 
be used in other places?
    
    TODO in this PR (assuming you think this approach is reasonable):
    
    - [x] remove `pandas_serialization_context`
    - [x] make sure this code path is tested
    - [x] double check performance/behavior
    
    cc @wesm @pcmoritz @devin-petersohn
    
    Author: Robert Nishihara <robertnishih...@gmail.com>
    Author: Wes McKinney <wes.mckin...@twosigma.com>
    
    Closes #1581 from robertnishihara/pandasserialization and squashes the 
following commits:
    
    c551ed6c [Wes McKinney] Add benchmark for zero-copy pandas deserialization
    e12cd721 [Robert Nishihara] Use highest protocol.
    6e3eceb6 [Robert Nishihara] Fix bug.
    31b3cbee [Robert Nishihara] Fixes.
    0fc9ecfd [Robert Nishihara] Update documentation and testing.
    b796fbdb [Robert Nishihara] Handle object arrays directly in pandas 
serializer.
---
 python/README-benchmarks.md                 |  2 --
 python/benchmarks/convert_pandas.py         | 20 ++++++++++++++++++++
 python/doc/source/ipc.rst                   |  7 +++----
 python/pyarrow/__init__.py                  |  1 -
 python/pyarrow/pandas_compat.py             | 15 +++++++++++++--
 python/pyarrow/serialization.py             | 12 ------------
 python/pyarrow/tests/test_convert_pandas.py |  2 +-
 python/pyarrow/tests/test_serialization.py  |  2 --
 8 files changed, 37 insertions(+), 24 deletions(-)

diff --git a/python/README-benchmarks.md b/python/README-benchmarks.md
index 3fecb35..60fa88f 100644
--- a/python/README-benchmarks.md
+++ b/python/README-benchmarks.md
@@ -41,8 +41,6 @@ First you have to install ASV's development version:
 pip install git+https://github.com/airspeed-velocity/asv.git
 ```
 
-<!--- TODO remove the above once 
https://github.com/airspeed-velocity/asv/pull/611 is merged -->
-
 Then you need to set up a few environment variables:
 
 ```shell
diff --git a/python/benchmarks/convert_pandas.py 
b/python/benchmarks/convert_pandas.py
index c4a7a59..244b3dc 100644
--- a/python/benchmarks/convert_pandas.py
+++ b/python/benchmarks/convert_pandas.py
@@ -48,3 +48,23 @@ class PandasConversionsFromArrow(PandasConversionsBase):
 
     def time_to_series(self, n, dtype):
         self.arrow_data.to_pandas()
+
+
+class ZeroCopyPandasRead(object):
+
+    def setup(self):
+        # Transpose to make column-major
+        values = np.random.randn(10, 100000)
+
+        df = pd.DataFrame(values.T)
+        ctx = pa.default_serialization_context()
+
+        self.serialized = ctx.serialize(df)
+        self.as_buffer = self.serialized.to_buffer()
+        self.as_components = self.serialized.to_components()
+
+    def time_deserialize_from_buffer(self):
+        pa.deserialize(self.as_buffer)
+
+    def time_deserialize_from_components(self):
+        pa.deserialize_components(self.as_components)
diff --git a/python/doc/source/ipc.rst b/python/doc/source/ipc.rst
index 9bf93ff..bce8b1e 100644
--- a/python/doc/source/ipc.rst
+++ b/python/doc/source/ipc.rst
@@ -317,9 +317,8 @@ An object can be reconstructed from its component-based 
representation using
 Serializing pandas Objects
 --------------------------
 
-We provide a serialization context that has optimized handling of pandas
-objects like ``DataFrame`` and ``Series``. This can be created with
-``pyarrow.pandas_serialization_context()``. Combined with component-based
+The default serialization context has optimized handling of pandas
+objects like ``DataFrame`` and ``Series``. Combined with component-based
 serialization above, this enables zero-copy transport of pandas DataFrame
 objects not containing any Python objects:
 
@@ -327,7 +326,7 @@ objects not containing any Python objects:
 
    import pandas as pd
    df = pd.DataFrame({'a': [1, 2, 3, 4, 5]})
-   context = pa.pandas_serialization_context()
+   context = pa.default_serialization_context()
    serialized_df = context.serialize(df)
    df_components = serialized_df.to_components()
    original_df = context.deserialize_components(df_components)
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index d95954e..15a37ca 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -125,7 +125,6 @@ from pyarrow.ipc import (Message, MessageReader,
 localfs = LocalFileSystem.get_instance()
 
 from pyarrow.serialization import (default_serialization_context,
-                                   pandas_serialization_context,
                                    register_default_serialization_handlers,
                                    register_torch_serialization_handlers)
 
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index e8fa83f..6d4bf5e 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -27,7 +27,7 @@ import pandas as pd
 import six
 
 import pyarrow as pa
-from pyarrow.compat import PY2, zip_longest  # noqa
+from pyarrow.compat import builtin_pickle, PY2, zip_longest  # noqa
 
 
 def infer_dtype(column):
@@ -424,11 +424,19 @@ def dataframe_to_serialized_dict(frame):
             block_data.update(dictionary=values.categories,
                               ordered=values.ordered)
             values = values.codes
-
         block_data.update(
             placement=block.mgr_locs.as_array,
             block=values
         )
+
+        # If we are dealing with an object array, pickle it instead. Note that
+        # we do not use isinstance here because _int.CategoricalBlock is a
+        # subclass of _int.ObjectBlock.
+        if type(block) == _int.ObjectBlock:
+            block_data['object'] = None
+            block_data['block'] = builtin_pickle.dumps(
+                values, protocol=builtin_pickle.HIGHEST_PROTOCOL)
+
         blocks.append(block_data)
 
     return {
@@ -463,6 +471,9 @@ def _reconstruct_block(item):
         block = _int.make_block(block_arr, placement=placement,
                                 klass=_int.DatetimeTZBlock,
                                 dtype=dtype)
+    elif 'object' in item:
+        block = _int.make_block(builtin_pickle.loads(block_arr),
+                                placement=placement, klass=_int.ObjectBlock)
     else:
         block = _int.make_block(block_arr, placement=placement)
 
diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py
index c8b72b7..bdf7535 100644
--- a/python/pyarrow/serialization.py
+++ b/python/pyarrow/serialization.py
@@ -57,10 +57,6 @@ def _load_pickle_from_buffer(data):
         return builtin_pickle.loads(as_memoryview)
 
 
-_serialize_numpy_array_pickle = _pickle_to_buffer
-_deserialize_numpy_array_pickle = _load_pickle_from_buffer
-
-
 # ----------------------------------------------------------------------
 # pandas-specific serialization matters
 
@@ -190,11 +186,3 @@ def default_serialization_context():
 
 
 register_default_serialization_handlers(_default_serialization_context)
-
-
-def pandas_serialization_context():
-    context = default_serialization_context()
-    context.register_type(np.ndarray, 'np.array',
-                          custom_serializer=_serialize_numpy_array_pickle,
-                          custom_deserializer=_deserialize_numpy_array_pickle)
-    return context
diff --git a/python/pyarrow/tests/test_convert_pandas.py 
b/python/pyarrow/tests/test_convert_pandas.py
index 95137ff..f7718f0 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1613,7 +1613,7 @@ def _fully_loaded_dataframe_example():
 
 
 def _check_serialize_components_roundtrip(df):
-    ctx = pa.pandas_serialization_context()
+    ctx = pa.default_serialization_context()
 
     components = ctx.serialize(df).to_components()
     deserialized = ctx.deserialize_components(components)
diff --git a/python/pyarrow/tests/test_serialization.py 
b/python/pyarrow/tests/test_serialization.py
index 20c195a..0917172 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -284,8 +284,6 @@ def test_clone():
 def test_primitive_serialization(large_buffer):
     for obj in PRIMITIVE_OBJECTS:
         serialization_roundtrip(obj, large_buffer)
-        serialization_roundtrip(obj, large_buffer,
-                                pa.pandas_serialization_context())
 
 
 def test_serialize_to_buffer():

-- 
To stop receiving notification emails like this one, please contact
w...@apache.org.

Reply via email to