[ 
https://issues.apache.org/jira/browse/ARROW-1924?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16294527#comment-16294527
 ] 

ASF GitHub Bot commented on ARROW-1924:
---------------------------------------

wesm closed pull request #1420: ARROW-1924: [Python] Bring back pickle=True 
option for serialization
URL: https://github.com/apache/arrow/pull/1420
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/python/pyarrow/serialization.pxi b/python/pyarrow/serialization.pxi
index cbc5e3b84..d95d582fe 100644
--- a/python/pyarrow/serialization.pxi
+++ b/python/pyarrow/serialization.pxi
@@ -47,6 +47,7 @@ cdef class SerializationContext:
     cdef:
         object type_to_type_id
         object whitelisted_types
+        object types_to_pickle
         object custom_serializers
         object custom_deserializers
 
@@ -54,6 +55,7 @@ cdef class SerializationContext:
         # Types with special serialization handlers
         self.type_to_type_id = dict()
         self.whitelisted_types = dict()
+        self.types_to_pickle = set()
         self.custom_serializers = dict()
         self.custom_deserializers = dict()
 
@@ -73,7 +75,7 @@ cdef class SerializationContext:
 
         return result
 
-    def register_type(self, type_, type_id,
+    def register_type(self, type_, type_id, pickle=False,
                       custom_serializer=None, custom_deserializer=None):
         """EXPERIMENTAL: Add type to the list of types we can serialize.
 
@@ -83,6 +85,9 @@ cdef class SerializationContext:
             The type that we can serialize.
         type_id : bytes
             A string of bytes used to identify the type.
+        pickle : bool
+            True if the serialization should be done with pickle.
+            False if it should be done efficiently with Arrow.
         custom_serializer : callable
             This argument is optional, but can be provided to
             serialize objects of the class in a particular way.
@@ -92,6 +97,8 @@ cdef class SerializationContext:
         """
         self.type_to_type_id[type_] = type_id
         self.whitelisted_types[type_id] = type_
+        if pickle:
+            self.types_to_pickle.add(type_id)
         if custom_serializer is not None:
             self.custom_serializers[type_id] = custom_serializer
             self.custom_deserializers[type_id] = custom_deserializer
@@ -111,7 +118,9 @@ cdef class SerializationContext:
 
         # use the closest match to type(obj)
         type_id = self.type_to_type_id[type_]
-        if type_id in self.custom_serializers:
+        if type_id in self.types_to_pickle:
+            serialized_obj = {"data": pickle.dumps(obj), "pickle": True}
+        elif type_id in self.custom_serializers:
             serialized_obj = {"data": self.custom_serializers[type_id](obj)}
         else:
             if is_named_tuple(type_):
@@ -132,6 +141,7 @@ cdef class SerializationContext:
             # The object was pickled, so unpickle it.
             obj = pickle.loads(serialized_obj["data"])
         else:
+            assert type_id not in self.types_to_pickle
             if type_id not in self.whitelisted_types:
                 msg = "Type ID " + str(type_id) + " not registered in " \
                       "deserialization callback"
diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py
index 3059dfc1b..689ec15d3 100644
--- a/python/pyarrow/serialization.py
+++ b/python/pyarrow/serialization.py
@@ -104,12 +104,9 @@ def _deserialize_default_dict(data):
 
     serialization_context.register_type(
         type(lambda: 0), "function",
-        custom_serializer=cloudpickle.dumps,
-        custom_deserializer=cloudpickle.loads)
+        pickle=True)
 
-    serialization_context.register_type(type, "type",
-                                        custom_serializer=cloudpickle.dumps,
-                                        custom_deserializer=cloudpickle.loads)
+    serialization_context.register_type(type, "type", pickle=True)
 
     serialization_context.register_type(
         np.ndarray, 'np.array',
diff --git a/python/pyarrow/tests/test_serialization.py 
b/python/pyarrow/tests/test_serialization.py
index 6d85621d4..f245dc299 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -21,9 +21,9 @@
 
 from collections import namedtuple, OrderedDict, defaultdict
 import datetime
+import os
 import string
 import sys
-import pickle
 
 import pyarrow as pa
 import numpy as np
@@ -198,9 +198,7 @@ def make_serialization_context():
     context.register_type(Baz, "Baz")
     context.register_type(Qux, "Quz")
     context.register_type(SubQux, "SubQux")
-    context.register_type(SubQuxPickle, "SubQuxPickle",
-                          custom_serializer=pickle.dumps,
-                          custom_deserializer=pickle.loads)
+    context.register_type(SubQuxPickle, "SubQuxPickle", pickle=True)
     context.register_type(Exception, "Exception")
     context.register_type(CustomError, "CustomError")
     context.register_type(Point, "Point")
@@ -519,3 +517,27 @@ def test_serialize_to_components_invalid_cases():
 
     with pytest.raises(pa.ArrowException):
         pa.deserialize_components(components)
+
+
[email protected](os.name == 'nt', reason="deserialize_regex not pickleable")
+def test_deserialize_in_different_process():
+    from multiprocessing import Process, Queue
+    import re
+
+    regex = re.compile(r"\d+\.\d*")
+
+    serialization_context = pa.SerializationContext()
+    serialization_context.register_type(type(regex), "Regex", pickle=True)
+
+    serialized = pa.serialize(regex, serialization_context)
+    serialized_bytes = serialized.to_buffer().to_pybytes()
+
+    def deserialize_regex(serialized, q):
+        import pyarrow as pa
+        q.put(pa.deserialize(serialized))
+
+    q = Queue()
+    p = Process(target=deserialize_regex, args=(serialized_bytes, q))
+    p.start()
+    assert q.get().pattern == regex.pattern
+    p.join()


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> [Python] Bring back pickle=True option for serialization
> --------------------------------------------------------
>
>                 Key: ARROW-1924
>                 URL: https://issues.apache.org/jira/browse/ARROW-1924
>             Project: Apache Arrow
>          Issue Type: Bug
>            Reporter: Philipp Moritz
>              Labels: pull-request-available
>             Fix For: 0.9.0
>
>
> We need to revert https://issues.apache.org/jira/browse/ARROW-1758
> The reason is that the semantics for pickle=True are slightly different from 
> just using (cloud-)pickle as the custom serializer:
> If pickle=True is used, the object can be deserialized in any process, even 
> if a deserializer for that type_id has not been registered in that process. 
> On the other hand, if (cloud-)pickle is used as a custom serializer, the 
> object can only be deserialized if pyarrow has the type_id registered and can 
> call the deserializer.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to