[
https://issues.apache.org/jira/browse/ARROW-1924?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16294527#comment-16294527
]
ASF GitHub Bot commented on ARROW-1924:
---------------------------------------
wesm closed pull request #1420: ARROW-1924: [Python] Bring back pickle=True
option for serialization
URL: https://github.com/apache/arrow/pull/1420
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git a/python/pyarrow/serialization.pxi b/python/pyarrow/serialization.pxi
index cbc5e3b84..d95d582fe 100644
--- a/python/pyarrow/serialization.pxi
+++ b/python/pyarrow/serialization.pxi
@@ -47,6 +47,7 @@ cdef class SerializationContext:
cdef:
object type_to_type_id
object whitelisted_types
+ object types_to_pickle
object custom_serializers
object custom_deserializers
@@ -54,6 +55,7 @@ cdef class SerializationContext:
# Types with special serialization handlers
self.type_to_type_id = dict()
self.whitelisted_types = dict()
+ self.types_to_pickle = set()
self.custom_serializers = dict()
self.custom_deserializers = dict()
@@ -73,7 +75,7 @@ cdef class SerializationContext:
return result
- def register_type(self, type_, type_id,
+ def register_type(self, type_, type_id, pickle=False,
custom_serializer=None, custom_deserializer=None):
"""EXPERIMENTAL: Add type to the list of types we can serialize.
@@ -83,6 +85,9 @@ cdef class SerializationContext:
The type that we can serialize.
type_id : bytes
A string of bytes used to identify the type.
+ pickle : bool
+ True if the serialization should be done with pickle.
+ False if it should be done efficiently with Arrow.
custom_serializer : callable
This argument is optional, but can be provided to
serialize objects of the class in a particular way.
@@ -92,6 +97,8 @@ cdef class SerializationContext:
"""
self.type_to_type_id[type_] = type_id
self.whitelisted_types[type_id] = type_
+ if pickle:
+ self.types_to_pickle.add(type_id)
if custom_serializer is not None:
self.custom_serializers[type_id] = custom_serializer
self.custom_deserializers[type_id] = custom_deserializer
@@ -111,7 +118,9 @@ cdef class SerializationContext:
# use the closest match to type(obj)
type_id = self.type_to_type_id[type_]
- if type_id in self.custom_serializers:
+ if type_id in self.types_to_pickle:
+ serialized_obj = {"data": pickle.dumps(obj), "pickle": True}
+ elif type_id in self.custom_serializers:
serialized_obj = {"data": self.custom_serializers[type_id](obj)}
else:
if is_named_tuple(type_):
@@ -132,6 +141,7 @@ cdef class SerializationContext:
# The object was pickled, so unpickle it.
obj = pickle.loads(serialized_obj["data"])
else:
+ assert type_id not in self.types_to_pickle
if type_id not in self.whitelisted_types:
msg = "Type ID " + str(type_id) + " not registered in " \
"deserialization callback"
diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py
index 3059dfc1b..689ec15d3 100644
--- a/python/pyarrow/serialization.py
+++ b/python/pyarrow/serialization.py
@@ -104,12 +104,9 @@ def _deserialize_default_dict(data):
serialization_context.register_type(
type(lambda: 0), "function",
- custom_serializer=cloudpickle.dumps,
- custom_deserializer=cloudpickle.loads)
+ pickle=True)
- serialization_context.register_type(type, "type",
- custom_serializer=cloudpickle.dumps,
- custom_deserializer=cloudpickle.loads)
+ serialization_context.register_type(type, "type", pickle=True)
serialization_context.register_type(
np.ndarray, 'np.array',
diff --git a/python/pyarrow/tests/test_serialization.py
b/python/pyarrow/tests/test_serialization.py
index 6d85621d4..f245dc299 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -21,9 +21,9 @@
from collections import namedtuple, OrderedDict, defaultdict
import datetime
+import os
import string
import sys
-import pickle
import pyarrow as pa
import numpy as np
@@ -198,9 +198,7 @@ def make_serialization_context():
context.register_type(Baz, "Baz")
context.register_type(Qux, "Quz")
context.register_type(SubQux, "SubQux")
- context.register_type(SubQuxPickle, "SubQuxPickle",
- custom_serializer=pickle.dumps,
- custom_deserializer=pickle.loads)
+ context.register_type(SubQuxPickle, "SubQuxPickle", pickle=True)
context.register_type(Exception, "Exception")
context.register_type(CustomError, "CustomError")
context.register_type(Point, "Point")
@@ -519,3 +517,27 @@ def test_serialize_to_components_invalid_cases():
with pytest.raises(pa.ArrowException):
pa.deserialize_components(components)
+
+
[email protected](os.name == 'nt', reason="deserialize_regex not pickleable")
+def test_deserialize_in_different_process():
+ from multiprocessing import Process, Queue
+ import re
+
+ regex = re.compile(r"\d+\.\d*")
+
+ serialization_context = pa.SerializationContext()
+ serialization_context.register_type(type(regex), "Regex", pickle=True)
+
+ serialized = pa.serialize(regex, serialization_context)
+ serialized_bytes = serialized.to_buffer().to_pybytes()
+
+ def deserialize_regex(serialized, q):
+ import pyarrow as pa
+ q.put(pa.deserialize(serialized))
+
+ q = Queue()
+ p = Process(target=deserialize_regex, args=(serialized_bytes, q))
+ p.start()
+ assert q.get().pattern == regex.pattern
+ p.join()
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> [Python] Bring back pickle=True option for serialization
> --------------------------------------------------------
>
> Key: ARROW-1924
> URL: https://issues.apache.org/jira/browse/ARROW-1924
> Project: Apache Arrow
> Issue Type: Bug
> Reporter: Philipp Moritz
> Labels: pull-request-available
> Fix For: 0.9.0
>
>
> We need to revert https://issues.apache.org/jira/browse/ARROW-1758
> The reason is that the semantics for pickle=True are slightly different from
> just using (cloud-)pickle as the custom serializer:
> If pickle=True is used, the object can be deserialized in any process, even
> if a deserializer for that type_id has not been registered in that process.
> On the other hand, if (cloud-)pickle is used as a custom serializer, the
> object can only be deserialized if pyarrow has the type_id registered and can
> call the deserializer.
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)