This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 175b2a2fe2 GH-37254: [Python] Parametrize all pickling tests to use
both the pickle and cloudpickle modules (#37255)
175b2a2fe2 is described below
commit 175b2a2fe2d6e35237c36666478800f62e3f6947
Author: Dane Pitkin <[email protected]>
AuthorDate: Wed Aug 23 22:12:55 2023 -0400
GH-37254: [Python] Parametrize all pickling tests to use both the pickle
and cloudpickle modules (#37255)
### Rationale for this change
Cloudpickle was not tested in most parts of the pyarrow test suite.
Improving this coverage will make the Cython 3.0.0 upgrade cleaner as
cloudpickle was failing in a few places where the default pickle module was
not. This has been verified using Cython 0.29.36.
### What changes are included in this PR?
* `__reduce__` methods that need to pass kwargs have been changed from
classmethod to staticmethod
* All pytests that pickle objects are parameterized to use both `pickle`
and `cloudpickle`
### Are these changes tested?
Yes, pytests run successfully with Cython 0.29.36
### Are there any user-facing changes?
No.
* Closes: #37254
Authored-by: Dane Pitkin <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
python/pyarrow/_dataset_parquet.pyx | 12 ++-
python/pyarrow/_fs.pyx | 8 +-
python/pyarrow/_gcsfs.pyx | 13 ++-
python/pyarrow/_hdfs.pyx | 11 ++-
python/pyarrow/_s3fs.pyx | 11 ++-
python/pyarrow/scalar.pxi | 6 +-
python/pyarrow/tests/conftest.py | 111 +++++++++++++++++----
python/pyarrow/tests/parquet/test_dataset.py | 25 +----
python/pyarrow/tests/test_array.py | 35 +++----
python/pyarrow/tests/test_compute.py | 13 ++-
python/pyarrow/tests/test_csv.py | 30 +++---
python/pyarrow/tests/test_dataset.py | 139 ++++++++++++++-------------
python/pyarrow/tests/test_extension_type.py | 39 ++++----
python/pyarrow/tests/test_fs.py | 117 +++++++++++-----------
python/pyarrow/tests/test_hdfs.py | 14 ++-
python/pyarrow/tests/test_io.py | 23 +++--
python/pyarrow/tests/test_json.py | 17 ++--
python/pyarrow/tests/test_scalars.py | 23 +++--
python/pyarrow/tests/test_schema.py | 7 +-
python/pyarrow/tests/test_table.py | 13 ++-
python/pyarrow/tests/test_types.py | 13 ++-
21 files changed, 372 insertions(+), 308 deletions(-)
diff --git a/python/pyarrow/_dataset_parquet.pyx
b/python/pyarrow/_dataset_parquet.pyx
index 4de396f4f5..79bd270ce5 100644
--- a/python/pyarrow/_dataset_parquet.pyx
+++ b/python/pyarrow/_dataset_parquet.pyx
@@ -19,6 +19,7 @@
"""Dataset support for Parquet file format."""
+from cython cimport binding
from cython.operator cimport dereference as deref
import os
@@ -770,9 +771,12 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
other.thrift_container_size_limit)
return attrs == other_attrs
- @classmethod
- def _reconstruct(cls, kwargs):
- return cls(**kwargs)
+ @staticmethod
+ @binding(True) # Required for Cython < 3
+ def _reconstruct(kwargs):
+ # __reduce__ doesn't allow passing named arguments directly to the
+ # reconstructor, hence this wrapper.
+ return ParquetFragmentScanOptions(**kwargs)
def __reduce__(self):
kwargs = dict(
@@ -782,7 +786,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
thrift_string_size_limit=self.thrift_string_size_limit,
thrift_container_size_limit=self.thrift_container_size_limit,
)
- return type(self)._reconstruct, (kwargs,)
+ return ParquetFragmentScanOptions._reconstruct, (kwargs,)
cdef class ParquetFactoryOptions(_Weakrefable):
diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx
index dbd7ebe5e4..ef8db31bfc 100644
--- a/python/pyarrow/_fs.pyx
+++ b/python/pyarrow/_fs.pyx
@@ -18,6 +18,7 @@
# cython: language_level = 3
from cpython.datetime cimport datetime, PyDateTime_DateTime
+from cython cimport binding
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow_python cimport PyDateTime_to_TimePoint
@@ -1106,11 +1107,12 @@ cdef class LocalFileSystem(FileSystem):
FileSystem.init(self, c_fs)
self.localfs = <CLocalFileSystem*> c_fs.get()
- @classmethod
- def _reconstruct(cls, kwargs):
+ @staticmethod
+ @binding(True) # Required for cython < 3
+ def _reconstruct(kwargs):
# __reduce__ doesn't allow passing named arguments directly to the
# reconstructor, hence this wrapper.
- return cls(**kwargs)
+ return LocalFileSystem(**kwargs)
def __reduce__(self):
cdef CLocalFileSystemOptions opts = self.localfs.options()
diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx
index 1fda08232c..5e69413cea 100644
--- a/python/pyarrow/_gcsfs.pyx
+++ b/python/pyarrow/_gcsfs.pyx
@@ -17,6 +17,8 @@
# cython: language_level = 3
+from cython cimport binding
+
from pyarrow.lib cimport (pyarrow_wrap_metadata,
pyarrow_unwrap_metadata)
from pyarrow.lib import frombytes, tobytes, ensure_metadata
@@ -154,10 +156,6 @@ cdef class GcsFileSystem(FileSystem):
FileSystem.init(self, wrapped)
self.gcsfs = <CGcsFileSystem*> wrapped.get()
- @classmethod
- def _reconstruct(cls, kwargs):
- return cls(**kwargs)
-
def _expiration_datetime_from_options(self):
expiration_ns = TimePoint_to_ns(
self.gcsfs.options().credentials.expiration())
@@ -165,6 +163,13 @@ cdef class GcsFileSystem(FileSystem):
return None
return datetime.fromtimestamp(expiration_ns / 1.0e9, timezone.utc)
+ @staticmethod
+ @binding(True) # Required for cython < 3
+ def _reconstruct(kwargs):
+ # __reduce__ doesn't allow passing named arguments directly to the
+ # reconstructor, hence this wrapper.
+ return GcsFileSystem(**kwargs)
+
def __reduce__(self):
cdef CGcsOptions opts = self.gcsfs.options()
service_account = frombytes(opts.credentials.target_service_account())
diff --git a/python/pyarrow/_hdfs.pyx b/python/pyarrow/_hdfs.pyx
index a0cef25105..c426337a12 100644
--- a/python/pyarrow/_hdfs.pyx
+++ b/python/pyarrow/_hdfs.pyx
@@ -17,6 +17,8 @@
# cython: language_level = 3
+from cython cimport binding
+
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_fs cimport *
@@ -134,9 +136,12 @@ replication=1)``
self.init(<shared_ptr[CFileSystem]> wrapped)
return self
- @classmethod
- def _reconstruct(cls, kwargs):
- return cls(**kwargs)
+ @staticmethod
+ @binding(True) # Required for cython < 3
+ def _reconstruct(kwargs):
+ # __reduce__ doesn't allow passing named arguments directly to the
+ # reconstructor, hence this wrapper.
+ return HadoopFileSystem(**kwargs)
def __reduce__(self):
cdef CHdfsOptions opts = self.hdfs.options()
diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx
index 51c248d147..ab45171369 100644
--- a/python/pyarrow/_s3fs.pyx
+++ b/python/pyarrow/_s3fs.pyx
@@ -17,6 +17,8 @@
# cython: language_level = 3
+from cython cimport binding
+
from pyarrow.lib cimport (check_status, pyarrow_wrap_metadata,
pyarrow_unwrap_metadata)
from pyarrow.lib import frombytes, tobytes, KeyValueMetadata
@@ -388,9 +390,12 @@ cdef class S3FileSystem(FileSystem):
FileSystem.init(self, wrapped)
self.s3fs = <CS3FileSystem*> wrapped.get()
- @classmethod
- def _reconstruct(cls, kwargs):
- return cls(**kwargs)
+ @staticmethod
+ @binding(True) # Required for cython < 3
+ def _reconstruct(kwargs):
+ # __reduce__ doesn't allow passing named arguments directly to the
+ # reconstructor, hence this wrapper.
+ return S3FileSystem(**kwargs)
def __reduce__(self):
cdef CS3Options opts = self.s3fs.options()
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index e19807ba56..e07949c675 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -16,6 +16,7 @@
# under the License.
import collections
+from cython cimport binding
cdef class Scalar(_Weakrefable):
@@ -836,8 +837,9 @@ cdef class DictionaryScalar(Scalar):
Concrete class for dictionary-encoded scalars.
"""
- @classmethod
- def _reconstruct(cls, type, is_valid, index, dictionary):
+ @staticmethod
+ @binding(True) # Required for cython < 3
+ def _reconstruct(type, is_valid, index, dictionary):
cdef:
CDictionaryScalarIndexAndDictionary value
shared_ptr[CDictionaryScalar] wrapped
diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index e6d87217ec..241ae4814a 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -15,13 +15,16 @@
# specific language governing permissions and limitations
# under the License.
+import functools
import os
import pathlib
import subprocess
import sys
-from tempfile import TemporaryDirectory
+import time
+import urllib.request
import pytest
+from pytest_lazyfixture import lazy_fixture
import hypothesis as h
from ..conftest import groups, defaults
@@ -146,8 +149,49 @@ def s3_connection():
return host, port, access_key, secret_key
+def retry(attempts=3, delay=1.0, max_delay=None, backoff=1):
+ """
+ Retry decorator
+
+ Parameters
+ ----------
+ attempts : int, default 3
+ The number of attempts.
+ delay : float, default 1
+ Initial delay in seconds.
+ max_delay : float, optional
+ The max delay between attempts.
+ backoff : float, default 1
+ The multiplier to delay after each attempt.
+ """
+ def decorate(func):
+ @functools.wraps(func)
+ def wrapper(*args, **kwargs):
+ remaining_attempts = attempts
+ curr_delay = delay
+ while remaining_attempts > 0:
+ try:
+ return func(*args, **kwargs)
+ except Exception as err:
+ remaining_attempts -= 1
+ last_exception = err
+ curr_delay *= backoff
+ if max_delay:
+ curr_delay = min(curr_delay, max_delay)
+ time.sleep(curr_delay)
+ raise last_exception
+ return wrapper
+ return decorate
+
+
@pytest.fixture(scope='session')
-def s3_server(s3_connection):
+def s3_server(s3_connection, tmpdir_factory):
+ @retry(attempts=5, delay=0.1, backoff=2)
+ def minio_server_health_check(address):
+ resp = urllib.request.urlopen(f"http://{address}/minio/health/cluster")
+ assert resp.getcode() == 200
+
+ tmpdir = tmpdir_factory.getbasetemp()
host, port, access_key, secret_key = s3_connection
address = '{}:{}'.format(host, port)
@@ -157,24 +201,26 @@ def s3_server(s3_connection):
'MINIO_SECRET_KEY': secret_key
})
- with TemporaryDirectory() as tempdir:
- args = ['minio', '--compat', 'server', '--quiet', '--address',
- address, tempdir]
- proc = None
- try:
- proc = subprocess.Popen(args, env=env)
- except OSError:
- pytest.skip('`minio` command cannot be located')
- else:
- yield {
- 'connection': s3_connection,
- 'process': proc,
- 'tempdir': tempdir
- }
- finally:
- if proc is not None:
- proc.kill()
- proc.wait()
+ args = ['minio', '--compat', 'server', '--quiet', '--address',
+ address, tmpdir]
+ proc = None
+ try:
+ proc = subprocess.Popen(args, env=env)
+ except OSError:
+ pytest.skip('`minio` command cannot be located')
+ else:
+ # Wait for the server to startup before yielding
+ minio_server_health_check(address)
+
+ yield {
+ 'connection': s3_connection,
+ 'process': proc,
+ 'tempdir': tmpdir
+ }
+ finally:
+ if proc is not None:
+ proc.kill()
+ proc.wait()
@pytest.fixture(scope='session')
@@ -202,3 +248,28 @@ def gcs_server():
if proc is not None:
proc.kill()
proc.wait()
+
+
[email protected](
+ params=[
+ lazy_fixture('builtin_pickle'),
+ lazy_fixture('cloudpickle')
+ ],
+ scope='session'
+)
+def pickle_module(request):
+ return request.param
+
+
[email protected](scope='session')
+def builtin_pickle():
+ import pickle
+ return pickle
+
+
[email protected](scope='session')
+def cloudpickle():
+ cp = pytest.importorskip('cloudpickle')
+ if 'HIGHEST_PROTOCOL' not in cp.__dict__:
+ cp.HIGHEST_PROTOCOL = cp.DEFAULT_PROTOCOL
+ return cp
diff --git a/python/pyarrow/tests/parquet/test_dataset.py
b/python/pyarrow/tests/parquet/test_dataset.py
index 3e6ff49265..be27c71b81 100644
--- a/python/pyarrow/tests/parquet/test_dataset.py
+++ b/python/pyarrow/tests/parquet/test_dataset.py
@@ -1533,10 +1533,13 @@ def _make_dataset_for_pickling(tempdir,
use_legacy_dataset=False, N=100):
return dataset
-def _assert_dataset_is_picklable(dataset, pickler, use_legacy_dataset=False):
[email protected]
+@parametrize_legacy_dataset
+def test_pickle_dataset(tempdir, datadir, use_legacy_dataset, pickle_module):
def is_pickleable(obj):
- return obj == pickler.loads(pickler.dumps(obj))
+ return obj == pickle_module.loads(pickle_module.dumps(obj))
+ dataset = _make_dataset_for_pickling(tempdir, use_legacy_dataset)
assert is_pickleable(dataset)
if use_legacy_dataset:
with pytest.warns(FutureWarning):
@@ -1555,24 +1558,6 @@ def _assert_dataset_is_picklable(dataset, pickler,
use_legacy_dataset=False):
assert is_pickleable(metadata.row_group(i))
[email protected]
-@parametrize_legacy_dataset
-def test_builtin_pickle_dataset(tempdir, datadir, use_legacy_dataset):
- import pickle
- dataset = _make_dataset_for_pickling(tempdir, use_legacy_dataset)
- _assert_dataset_is_picklable(
- dataset, pickler=pickle, use_legacy_dataset=use_legacy_dataset)
-
-
[email protected]
-@parametrize_legacy_dataset
-def test_cloudpickle_dataset(tempdir, datadir, use_legacy_dataset):
- cp = pytest.importorskip('cloudpickle')
- dataset = _make_dataset_for_pickling(tempdir, use_legacy_dataset)
- _assert_dataset_is_picklable(
- dataset, pickler=cp, use_legacy_dataset=use_legacy_dataset)
-
-
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_partitioned_dataset(tempdir, use_legacy_dataset):
diff --git a/python/pyarrow/tests/test_array.py
b/python/pyarrow/tests/test_array.py
index 0546830a66..fca094b519 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -21,7 +21,6 @@ import decimal
import hypothesis as h
import hypothesis.strategies as st
import itertools
-import pickle
import pytest
import struct
import subprocess
@@ -29,10 +28,6 @@ import sys
import weakref
import numpy as np
-try:
- import pickle5
-except ImportError:
- pickle5 = None
import pyarrow as pa
import pyarrow.tests.strategies as past
@@ -2007,21 +2002,21 @@ pickle_test_parametrize = pytest.mark.parametrize(
@pickle_test_parametrize
-def test_array_pickle(data, typ):
+def test_array_pickle(data, typ, pickle_module):
# Allocate here so that we don't have any Arrow data allocated.
# This is needed to ensure that allocator tests can be reliable.
array = pa.array(data, type=typ)
- for proto in range(0, pickle.HIGHEST_PROTOCOL + 1):
- result = pickle.loads(pickle.dumps(array, proto))
+ for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1):
+ result = pickle_module.loads(pickle_module.dumps(array, proto))
assert array.equals(result)
-def test_array_pickle_dictionary():
+def test_array_pickle_dictionary(pickle_module):
# not included in the above as dictionary array cannot be created with
# the pa.array function
array = pa.DictionaryArray.from_arrays([0, 1, 2, 0, 1], ['a', 'b', 'c'])
- for proto in range(0, pickle.HIGHEST_PROTOCOL + 1):
- result = pickle.loads(pickle.dumps(array, proto))
+ for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1):
+ result = pickle_module.loads(pickle_module.dumps(array, proto))
assert array.equals(result)
@@ -2031,27 +2026,23 @@ def test_array_pickle_dictionary():
size=st.integers(min_value=0, max_value=10)
)
)
-def test_pickling(arr):
- data = pickle.dumps(arr)
- restored = pickle.loads(data)
+def test_pickling(pickle_module, arr):
+ data = pickle_module.dumps(arr)
+ restored = pickle_module.loads(data)
assert arr.equals(restored)
@pickle_test_parametrize
-def test_array_pickle5(data, typ):
+def test_array_pickle_protocol5(data, typ, pickle_module):
# Test zero-copy pickling with protocol 5 (PEP 574)
- picklemod = pickle5 or pickle
- if pickle5 is None and picklemod.HIGHEST_PROTOCOL < 5:
- pytest.skip("need pickle5 package or Python 3.8+")
-
array = pa.array(data, type=typ)
addresses = [buf.address if buf is not None else 0
for buf in array.buffers()]
- for proto in range(5, pickle.HIGHEST_PROTOCOL + 1):
+ for proto in range(5, pickle_module.HIGHEST_PROTOCOL + 1):
buffers = []
- pickled = picklemod.dumps(array, proto, buffer_callback=buffers.append)
- result = picklemod.loads(pickled, buffers=buffers)
+ pickled = pickle_module.dumps(array, proto,
buffer_callback=buffers.append)
+ result = pickle_module.loads(pickled, buffers=buffers)
assert array.equals(result)
result_addresses = [buf.address if buf is not None else 0
diff --git a/python/pyarrow/tests/test_compute.py
b/python/pyarrow/tests/test_compute.py
index 6ee54ca4b1..152cae1be1 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -23,7 +23,6 @@ import inspect
import itertools
import math
import os
-import pickle
import pytest
import random
import sys
@@ -278,18 +277,18 @@ def test_call_function_with_memory_pool():
assert result3.equals(expected)
-def test_pickle_functions():
+def test_pickle_functions(pickle_module):
# Pickle registered functions
for name in pc.list_functions():
func = pc.get_function(name)
- reconstructed = pickle.loads(pickle.dumps(func))
+ reconstructed = pickle_module.loads(pickle_module.dumps(func))
assert type(reconstructed) is type(func)
assert reconstructed.name == func.name
assert reconstructed.arity == func.arity
assert reconstructed.num_kernels == func.num_kernels
-def test_pickle_global_functions():
+def test_pickle_global_functions(pickle_module):
# Pickle global wrappers (manual or automatic) of registered functions
for name in pc.list_functions():
try:
@@ -297,7 +296,7 @@ def test_pickle_global_functions():
except AttributeError:
# hash_aggregate functions are not exported as callables.
continue
- reconstructed = pickle.loads(pickle.dumps(func))
+ reconstructed = pickle_module.loads(pickle_module.dumps(func))
assert reconstructed is func
@@ -3366,10 +3365,10 @@ def create_sample_expressions():
# Tests the Arrow-specific serialization mechanism
-def test_expression_serialization_arrow():
+def test_expression_serialization_arrow(pickle_module):
for expr in create_sample_expressions()["all"]:
assert isinstance(expr, pc.Expression)
- restored = pickle.loads(pickle.dumps(expr))
+ restored = pickle_module.loads(pickle_module.dumps(expr))
assert expr.equals(restored)
diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
index 81c31d98ac..afc5380b75 100644
--- a/python/pyarrow/tests/test_csv.py
+++ b/python/pyarrow/tests/test_csv.py
@@ -24,7 +24,6 @@ import gzip
import io
import itertools
import os
-import pickle
import select
import shutil
import signal
@@ -102,10 +101,10 @@ def check_options_class(cls, **attr_values):
# The various options classes need to be picklable for dataset
-def check_options_class_pickling(cls, **attr_values):
+def check_options_class_pickling(cls, pickler, **attr_values):
opts = cls(**attr_values)
- new_opts = pickle.loads(pickle.dumps(opts,
- protocol=pickle.HIGHEST_PROTOCOL))
+ new_opts = pickler.loads(pickler.dumps(opts,
+ protocol=pickler.HIGHEST_PROTOCOL))
for name, value in attr_values.items():
assert getattr(new_opts, name) == value
@@ -128,7 +127,7 @@ class InvalidRowHandler:
other.result != self.result)
-def test_read_options():
+def test_read_options(pickle_module):
cls = ReadOptions
opts = cls()
@@ -139,7 +138,8 @@ def test_read_options():
encoding=['utf8', 'utf16'],
skip_rows_after_names=[0, 27])
- check_options_class_pickling(cls, use_threads=True,
+ check_options_class_pickling(cls, pickler=pickle_module,
+ use_threads=True,
skip_rows=3,
column_names=["ab", "cd"],
autogenerate_column_names=False,
@@ -182,7 +182,7 @@ def test_read_options():
opts.validate()
-def test_parse_options():
+def test_parse_options(pickle_module):
cls = ParseOptions
skip_handler = InvalidRowHandler('skip')
@@ -194,7 +194,8 @@ def test_parse_options():
ignore_empty_lines=[True, False],
invalid_row_handler=[None, skip_handler])
- check_options_class_pickling(cls, delimiter='x',
+ check_options_class_pickling(cls, pickler=pickle_module,
+ delimiter='x',
escape_char='y',
quote_char=False,
double_quote=False,
@@ -241,7 +242,7 @@ def test_parse_options():
opts.validate()
-def test_convert_options():
+def test_convert_options(pickle_module):
cls = ConvertOptions
opts = cls()
@@ -256,7 +257,8 @@ def test_convert_options():
timestamp_parsers=[[], [ISO8601, '%y-%m']])
check_options_class_pickling(
- cls, check_utf8=False,
+ cls, pickler=pickle_module,
+ check_utf8=False,
strings_can_be_null=True,
quoted_strings_can_be_null=False,
decimal_point=',',
@@ -622,7 +624,7 @@ class BaseTestCSV(abc.ABC):
read_options=read_options,
convert_options=convert_options)
- def test_invalid_row_handler(self):
+ def test_invalid_row_handler(self, pickle_module):
rows = b"a,b\nc\nd,e\nf,g,h\ni,j\n"
parse_opts = ParseOptions()
with pytest.raises(
@@ -657,7 +659,7 @@ class BaseTestCSV(abc.ABC):
# Test ser/de
parse_opts.invalid_row_handler = InvalidRowHandler('skip')
- parse_opts = pickle.loads(pickle.dumps(parse_opts))
+ parse_opts = pickle_module.loads(pickle_module.dumps(parse_opts))
table = self.read_bytes(rows, parse_options=parse_opts)
assert table.to_pydict() == {
@@ -1792,13 +1794,13 @@ class BaseStreamingCSVRead(BaseTestCSV):
assert reader.read_next_batch()
-class TestSerialStreamingCSVRead(BaseStreamingCSVRead, unittest.TestCase):
+class TestSerialStreamingCSVRead(BaseStreamingCSVRead):
@property
def use_threads(self):
return False
-class TestThreadedStreamingCSVRead(BaseStreamingCSVRead, unittest.TestCase):
+class TestThreadedStreamingCSVRead(BaseStreamingCSVRead):
@property
def use_threads(self):
return True
diff --git a/python/pyarrow/tests/test_dataset.py
b/python/pyarrow/tests/test_dataset.py
index f92317c0f2..b8a0c38089 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -20,7 +20,6 @@ import os
import posixpath
import datetime
import pathlib
-import pickle
import sys
import textwrap
import tempfile
@@ -700,7 +699,7 @@ def test_partitioning():
assert load_back_table.equals(table)
-def test_partitioning_pickling():
+def test_partitioning_pickling(pickle_module):
schema = pa.schema([
pa.field('i64', pa.int64()),
pa.field('f64', pa.float64())
@@ -715,7 +714,7 @@ def test_partitioning_pickling():
]
for part in parts:
- assert pickle.loads(pickle.dumps(part)) == part
+ assert pickle_module.loads(pickle_module.dumps(part)) == part
def test_expression_arithmetic_operators():
@@ -818,7 +817,7 @@ def test_parquet_scan_options():
assert opts5 != opts1
-def test_file_format_pickling():
+def test_file_format_pickling(pickle_module):
formats = [
ds.IpcFileFormat(),
ds.CsvFileFormat(),
@@ -854,10 +853,10 @@ def test_file_format_pickling():
])
for file_format in formats:
- assert pickle.loads(pickle.dumps(file_format)) == file_format
+ assert pickle_module.loads(pickle_module.dumps(file_format)) ==
file_format
-def test_fragment_scan_options_pickling():
+def test_fragment_scan_options_pickling(pickle_module):
options = [
ds.CsvFragmentScanOptions(),
ds.CsvFragmentScanOptions(
@@ -879,7 +878,7 @@ def test_fragment_scan_options_pickling():
])
for option in options:
- assert pickle.loads(pickle.dumps(option)) == option
+ assert pickle_module.loads(pickle_module.dumps(option)) == option
@pytest.mark.parametrize('paths_or_selector', [
@@ -982,7 +981,7 @@ def test_make_fragment(multisourcefs):
assert row_group_fragment.row_groups == [0]
-def test_make_csv_fragment_from_buffer(dataset_reader):
+def test_make_csv_fragment_from_buffer(dataset_reader, pickle_module):
content = textwrap.dedent("""
alpha,num,animal
a,12,dog
@@ -1003,11 +1002,11 @@ def test_make_csv_fragment_from_buffer(dataset_reader):
names=['alpha', 'num', 'animal'])
assert dataset_reader.to_table(fragment).equals(expected)
- pickled = pickle.loads(pickle.dumps(fragment))
+ pickled = pickle_module.loads(pickle_module.dumps(fragment))
assert dataset_reader.to_table(pickled).equals(fragment.to_table())
-def test_make_json_fragment_from_buffer(dataset_reader):
+def test_make_json_fragment_from_buffer(dataset_reader, pickle_module):
content = '{"alpha" : "a", "num": 12, "animal" : "dog"}\n' + \
'{"alpha" : "b", "num": 11, "animal" : "cat"}\n' + \
'{"alpha" : "c", "num": 10, "animal" : "rabbit"}\n'
@@ -1025,12 +1024,12 @@ def test_make_json_fragment_from_buffer(dataset_reader):
names=['alpha', 'num', 'animal'])
assert dataset_reader.to_table(fragment).equals(expected)
- pickled = pickle.loads(pickle.dumps(fragment))
+ pickled = pickle_module.loads(pickle_module.dumps(fragment))
assert dataset_reader.to_table(pickled).equals(fragment.to_table())
@pytest.mark.parquet
-def test_make_parquet_fragment_from_buffer(dataset_reader):
+def test_make_parquet_fragment_from_buffer(dataset_reader, pickle_module):
arrays = [
pa.array(['a', 'b', 'c']),
pa.array([12, 11, 10]),
@@ -1063,7 +1062,7 @@ def
test_make_parquet_fragment_from_buffer(dataset_reader):
fragment = format_.make_fragment(buffer)
assert dataset_reader.to_table(fragment).equals(table)
- pickled = pickle.loads(pickle.dumps(fragment))
+ pickled = pickle_module.loads(pickle_module.dumps(fragment))
assert dataset_reader.to_table(pickled).equals(table)
@@ -1140,7 +1139,7 @@ def test_fragments_implicit_cast(tempdir):
@pytest.mark.parquet
@pytest.mark.filterwarnings(
"ignore:Passing 'use_legacy_dataset=True':FutureWarning")
-def test_fragments_reconstruct(tempdir, dataset_reader):
+def test_fragments_reconstruct(tempdir, dataset_reader, pickle_module):
table, dataset = _create_dataset_for_fragments(tempdir)
def assert_yields_projected(fragment, row_slice,
@@ -1157,7 +1156,7 @@ def test_fragments_reconstruct(tempdir, dataset_reader):
parquet_format = fragment.format
# test pickle roundtrip
- pickled_fragment = pickle.loads(pickle.dumps(fragment))
+ pickled_fragment = pickle_module.loads(pickle_module.dumps(fragment))
assert dataset_reader.to_table(
pickled_fragment) == dataset_reader.to_table(fragment)
@@ -1272,7 +1271,7 @@ def test_fragments_parquet_row_groups_dictionary(tempdir,
dataset_reader):
@pytest.mark.parquet
@pytest.mark.filterwarnings(
"ignore:Passing 'use_legacy_dataset=True':FutureWarning")
-def test_fragments_parquet_ensure_metadata(tempdir, open_logging_fs):
+def test_fragments_parquet_ensure_metadata(tempdir, open_logging_fs,
pickle_module):
fs, assert_opens = open_logging_fs
_, dataset = _create_dataset_for_fragments(
tempdir, chunk_size=2, filesystem=fs
@@ -1304,7 +1303,7 @@ def test_fragments_parquet_ensure_metadata(tempdir,
open_logging_fs):
assert row_group.statistics is not None
# pickling preserves row group ids
- pickled_fragment = pickle.loads(pickle.dumps(new_fragment))
+ pickled_fragment = pickle_module.loads(pickle_module.dumps(new_fragment))
with assert_opens([fragment.path]):
assert pickled_fragment.row_groups == [0, 1]
row_group = pickled_fragment.row_groups[0]
@@ -1314,7 +1313,7 @@ def test_fragments_parquet_ensure_metadata(tempdir,
open_logging_fs):
@pytest.mark.pandas
@pytest.mark.parquet
-def test_fragments_parquet_pickle_no_metadata(tempdir, open_logging_fs):
+def test_fragments_parquet_pickle_no_metadata(tempdir, open_logging_fs,
pickle_module):
# https://issues.apache.org/jira/browse/ARROW-15796
fs, assert_opens = open_logging_fs
_, dataset = _create_dataset_for_fragments(tempdir, filesystem=fs)
@@ -1323,7 +1322,7 @@ def test_fragments_parquet_pickle_no_metadata(tempdir,
open_logging_fs):
# second fragment hasn't yet loaded the metadata,
# and pickling it also should not read the metadata
with assert_opens([]):
- pickled_fragment = pickle.loads(pickle.dumps(fragment))
+ pickled_fragment = pickle_module.loads(pickle_module.dumps(fragment))
# then accessing the row group info reads the metadata
with assert_opens([pickled_fragment.path]):
@@ -1487,7 +1486,8 @@ def test_fragments_parquet_row_groups_predicate(tempdir):
@pytest.mark.parquet
@pytest.mark.filterwarnings(
"ignore:Passing 'use_legacy_dataset=True':FutureWarning")
-def test_fragments_parquet_row_groups_reconstruct(tempdir, dataset_reader):
+def test_fragments_parquet_row_groups_reconstruct(tempdir, dataset_reader,
+ pickle_module):
table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2)
fragment = list(dataset.get_fragments())[0]
@@ -1495,7 +1495,7 @@ def
test_fragments_parquet_row_groups_reconstruct(tempdir, dataset_reader):
row_group_fragments = list(fragment.split_by_row_group())
# test pickle roundtrip
- pickled_fragment = pickle.loads(pickle.dumps(fragment))
+ pickled_fragment = pickle_module.loads(pickle_module.dumps(fragment))
assert dataset_reader.to_table(
pickled_fragment) == dataset_reader.to_table(fragment)
@@ -1644,14 +1644,14 @@ def test_fragments_repr(tempdir, dataset):
@pytest.mark.parquet
@pytest.mark.parametrize(
- "pickled", [lambda x: x, lambda x: pickle.loads(pickle.dumps(x))])
-def test_partitioning_factory(mockfs, pickled):
+ "pickled", [lambda x, m: x, lambda x, m: m.loads(m.dumps(x))])
+def test_partitioning_factory(mockfs, pickled, pickle_module):
paths_or_selector = fs.FileSelector('subdir', recursive=True)
format = ds.ParquetFileFormat()
options = ds.FileSystemFactoryOptions('subdir')
partitioning_factory = ds.DirectoryPartitioning.discover(['group', 'key'])
- partitioning_factory = pickled(partitioning_factory)
+ partitioning_factory = pickled(partitioning_factory, pickle_module)
assert isinstance(partitioning_factory, ds.PartitioningFactory)
options.partitioning_factory = partitioning_factory
@@ -1678,15 +1678,16 @@ def test_partitioning_factory(mockfs, pickled):
@pytest.mark.parquet
@pytest.mark.parametrize('infer_dictionary', [False, True])
@pytest.mark.parametrize(
- "pickled", [lambda x: x, lambda x: pickle.loads(pickle.dumps(x))])
-def test_partitioning_factory_dictionary(mockfs, infer_dictionary, pickled):
+ "pickled", [lambda x, m: x, lambda x, m: m.loads(m.dumps(x))])
+def test_partitioning_factory_dictionary(mockfs, infer_dictionary, pickled,
+ pickle_module):
paths_or_selector = fs.FileSelector('subdir', recursive=True)
format = ds.ParquetFileFormat()
options = ds.FileSystemFactoryOptions('subdir')
partitioning_factory = ds.DirectoryPartitioning.discover(
['group', 'key'], infer_dictionary=infer_dictionary)
- options.partitioning_factory = pickled(partitioning_factory)
+ options.partitioning_factory = pickled(partitioning_factory, pickle_module)
factory = ds.FileSystemDatasetFactory(
mockfs, paths_or_selector, format, options)
@@ -1711,8 +1712,8 @@ def test_partitioning_factory_dictionary(mockfs,
infer_dictionary, pickled):
@pytest.mark.parametrize(
- "pickled", [lambda x: x, lambda x: pickle.loads(pickle.dumps(x))])
-def test_partitioning_factory_segment_encoding(pickled):
+ "pickled", [lambda x, m: x, lambda x, m: m.loads(m.dumps(x))])
+def test_partitioning_factory_segment_encoding(pickled, pickle_module):
mockfs = fs._MockFileSystem()
format = ds.IpcFileFormat()
schema = pa.schema([("i64", pa.int64())])
@@ -1737,7 +1738,7 @@ def test_partitioning_factory_segment_encoding(pickled):
options = ds.FileSystemFactoryOptions("directory")
partitioning_factory = ds.DirectoryPartitioning.discover(
schema=partition_schema)
- options.partitioning_factory = pickled(partitioning_factory)
+ options.partitioning_factory = pickled(partitioning_factory, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
inferred_schema = factory.inspect()
assert inferred_schema == full_schema
@@ -1748,7 +1749,7 @@ def test_partitioning_factory_segment_encoding(pickled):
partitioning_factory = ds.DirectoryPartitioning.discover(
["date", "string"], segment_encoding="none")
- options.partitioning_factory = pickled(partitioning_factory)
+ options.partitioning_factory = pickled(partitioning_factory, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
fragments = list(factory.finish().get_fragments())
assert fragments[0].partition_expression.equals(
@@ -1757,7 +1758,7 @@ def test_partitioning_factory_segment_encoding(pickled):
partitioning = ds.DirectoryPartitioning(
string_partition_schema, segment_encoding="none")
- options.partitioning = pickled(partitioning)
+ options.partitioning = pickled(partitioning, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
fragments = list(factory.finish().get_fragments())
assert fragments[0].partition_expression.equals(
@@ -1766,7 +1767,7 @@ def test_partitioning_factory_segment_encoding(pickled):
partitioning_factory = ds.DirectoryPartitioning.discover(
schema=partition_schema, segment_encoding="none")
- options.partitioning_factory = pickled(partitioning_factory)
+ options.partitioning_factory = pickled(partitioning_factory, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
with pytest.raises(pa.ArrowInvalid,
match="Could not cast segments for partition field"):
@@ -1777,7 +1778,7 @@ def test_partitioning_factory_segment_encoding(pickled):
options = ds.FileSystemFactoryOptions("hive")
partitioning_factory = ds.HivePartitioning.discover(
schema=partition_schema)
- options.partitioning_factory = pickled(partitioning_factory)
+ options.partitioning_factory = pickled(partitioning_factory, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
inferred_schema = factory.inspect()
assert inferred_schema == full_schema
@@ -1788,7 +1789,7 @@ def test_partitioning_factory_segment_encoding(pickled):
partitioning_factory = ds.HivePartitioning.discover(
segment_encoding="none")
- options.partitioning_factory = pickled(partitioning_factory)
+ options.partitioning_factory = pickled(partitioning_factory, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
fragments = list(factory.finish().get_fragments())
assert fragments[0].partition_expression.equals(
@@ -1805,7 +1806,7 @@ def test_partitioning_factory_segment_encoding(pickled):
partitioning_factory = ds.HivePartitioning.discover(
schema=partition_schema, segment_encoding="none")
- options.partitioning_factory = pickled(partitioning_factory)
+ options.partitioning_factory = pickled(partitioning_factory, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
with pytest.raises(pa.ArrowInvalid,
match="Could not cast segments for partition field"):
@@ -1813,8 +1814,8 @@ def test_partitioning_factory_segment_encoding(pickled):
@pytest.mark.parametrize(
- "pickled", [lambda x: x, lambda x: pickle.loads(pickle.dumps(x))])
-def test_partitioning_factory_hive_segment_encoding_key_encoded(pickled):
+ "pickled", [lambda x, m: x, lambda x, m: m.loads(m.dumps(x))])
+def test_partitioning_factory_hive_segment_encoding_key_encoded(pickled,
+                                                                pickle_module):
mockfs = fs._MockFileSystem()
format = ds.IpcFileFormat()
schema = pa.schema([("i64", pa.int64())])
@@ -1845,7 +1846,7 @@ def
test_partitioning_factory_hive_segment_encoding_key_encoded(pickled):
options = ds.FileSystemFactoryOptions("hive")
partitioning_factory = ds.HivePartitioning.discover(
schema=partition_schema)
- options.partitioning_factory = pickled(partitioning_factory)
+ options.partitioning_factory = pickled(partitioning_factory, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
inferred_schema = factory.inspect()
assert inferred_schema == full_schema
@@ -1856,7 +1857,7 @@ def
test_partitioning_factory_hive_segment_encoding_key_encoded(pickled):
partitioning_factory = ds.HivePartitioning.discover(
segment_encoding="uri")
- options.partitioning_factory = pickled(partitioning_factory)
+ options.partitioning_factory = pickled(partitioning_factory, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
fragments = list(factory.finish().get_fragments())
assert fragments[0].partition_expression.equals(
@@ -1865,7 +1866,7 @@ def
test_partitioning_factory_hive_segment_encoding_key_encoded(pickled):
partitioning = ds.HivePartitioning(
string_partition_schema, segment_encoding="uri")
- options.partitioning = pickled(partitioning)
+ options.partitioning = pickled(partitioning, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
fragments = list(factory.finish().get_fragments())
assert fragments[0].partition_expression.equals(
@@ -1874,7 +1875,7 @@ def
test_partitioning_factory_hive_segment_encoding_key_encoded(pickled):
partitioning_factory = ds.HivePartitioning.discover(
segment_encoding="none")
- options.partitioning_factory = pickled(partitioning_factory)
+ options.partitioning_factory = pickled(partitioning_factory, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
fragments = list(factory.finish().get_fragments())
assert fragments[0].partition_expression.equals(
@@ -1883,7 +1884,7 @@ def
test_partitioning_factory_hive_segment_encoding_key_encoded(pickled):
partitioning = ds.HivePartitioning(
string_partition_schema_en, segment_encoding="none")
- options.partitioning = pickled(partitioning)
+ options.partitioning = pickled(partitioning, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
fragments = list(factory.finish().get_fragments())
assert fragments[0].partition_expression.equals(
@@ -1892,7 +1893,7 @@ def
test_partitioning_factory_hive_segment_encoding_key_encoded(pickled):
partitioning_factory = ds.HivePartitioning.discover(
schema=partition_schema_en, segment_encoding="none")
- options.partitioning_factory = pickled(partitioning_factory)
+ options.partitioning_factory = pickled(partitioning_factory, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
with pytest.raises(pa.ArrowInvalid,
match="Could not cast segments for partition field"):
@@ -2087,14 +2088,14 @@ def _create_directory_of_files(base_dir):
return (table1, table2), (path1, path2)
-def _check_dataset(dataset, table, dataset_reader):
+def _check_dataset(dataset, table, dataset_reader, pickler):
# also test that pickle roundtrip keeps the functionality
- for d in [dataset, pickle.loads(pickle.dumps(dataset))]:
+ for d in [dataset, pickler.loads(pickler.dumps(dataset))]:
assert dataset.schema.equals(table.schema)
assert dataset_reader.to_table(dataset).equals(table)
-def _check_dataset_from_path(path, table, dataset_reader, **kwargs):
+def _check_dataset_from_path(path, table, dataset_reader, pickler, **kwargs):
# pathlib object
assert isinstance(path, pathlib.Path)
@@ -2102,39 +2103,39 @@ def _check_dataset_from_path(path, table,
dataset_reader, **kwargs):
for p in [path, str(path), [path], [str(path)]]:
dataset = ds.dataset(path, **kwargs)
assert isinstance(dataset, ds.FileSystemDataset)
- _check_dataset(dataset, table, dataset_reader)
+ _check_dataset(dataset, table, dataset_reader, pickler)
# relative string path
with change_cwd(path.parent):
dataset = ds.dataset(path.name, **kwargs)
assert isinstance(dataset, ds.FileSystemDataset)
- _check_dataset(dataset, table, dataset_reader)
+ _check_dataset(dataset, table, dataset_reader, pickler)
@pytest.mark.parquet
-def test_open_dataset_single_file(tempdir, dataset_reader):
+def test_open_dataset_single_file(tempdir, dataset_reader, pickle_module):
table, path = _create_single_file(tempdir)
- _check_dataset_from_path(path, table, dataset_reader)
+ _check_dataset_from_path(path, table, dataset_reader, pickle_module)
@pytest.mark.parquet
-def test_deterministic_row_order(tempdir, dataset_reader):
+def test_deterministic_row_order(tempdir, dataset_reader, pickle_module):
# ARROW-8447 Ensure that dataset.to_table (and Scanner::ToTable) returns a
# deterministic row ordering. This is achieved by constructing a single
# parquet file with one row per RowGroup.
table, path = _create_single_file(tempdir, row_group_size=1)
- _check_dataset_from_path(path, table, dataset_reader)
+ _check_dataset_from_path(path, table, dataset_reader, pickle_module)
@pytest.mark.parquet
-def test_open_dataset_directory(tempdir, dataset_reader):
+def test_open_dataset_directory(tempdir, dataset_reader, pickle_module):
tables, _ = _create_directory_of_files(tempdir)
table = pa.concat_tables(tables)
- _check_dataset_from_path(tempdir, table, dataset_reader)
+ _check_dataset_from_path(tempdir, table, dataset_reader, pickle_module)
@pytest.mark.parquet
-def test_open_dataset_list_of_files(tempdir, dataset_reader):
+def test_open_dataset_list_of_files(tempdir, dataset_reader, pickle_module):
tables, (path1, path2) = _create_directory_of_files(tempdir)
table = pa.concat_tables(tables)
@@ -2143,7 +2144,7 @@ def test_open_dataset_list_of_files(tempdir,
dataset_reader):
ds.dataset([str(path1), str(path2)])
]
datasets += [
- pickle.loads(pickle.dumps(d)) for d in datasets
+ pickle_module.loads(pickle_module.dumps(d)) for d in datasets
]
for dataset in datasets:
@@ -2173,7 +2174,7 @@ def test_open_dataset_filesystem_fspath(tempdir):
@pytest.mark.parquet
-def test_construct_from_single_file(tempdir, dataset_reader):
+def test_construct_from_single_file(tempdir, dataset_reader, pickle_module):
directory = tempdir / 'single-file'
directory.mkdir()
table, path = _create_single_file(directory)
@@ -2186,14 +2187,14 @@ def test_construct_from_single_file(tempdir,
dataset_reader):
# instantiate from a single file with prefixed filesystem URI
d3 = ds.dataset(str(relative_path), filesystem=_filesystem_uri(directory))
# pickle roundtrip
- d4 = pickle.loads(pickle.dumps(d1))
+ d4 = pickle_module.loads(pickle_module.dumps(d1))
assert dataset_reader.to_table(d1) == dataset_reader.to_table(
d2) == dataset_reader.to_table(d3) == dataset_reader.to_table(d4)
@pytest.mark.parquet
-def test_construct_from_single_directory(tempdir, dataset_reader):
+def test_construct_from_single_directory(tempdir, dataset_reader,
+                                         pickle_module):
directory = tempdir / 'single-directory'
directory.mkdir()
tables, paths = _create_directory_of_files(directory)
@@ -2208,7 +2209,7 @@ def test_construct_from_single_directory(tempdir,
dataset_reader):
# test pickle roundtrip
for d in [d1, d2, d3]:
- restored = pickle.loads(pickle.dumps(d))
+ restored = pickle_module.loads(pickle_module.dumps(d))
assert dataset_reader.to_table(restored) == t1
@@ -2403,12 +2404,12 @@ def _create_partitioned_dataset(basedir):
@pytest.mark.parquet
-def test_open_dataset_partitioned_directory(tempdir, dataset_reader):
+def test_open_dataset_partitioned_directory(tempdir, dataset_reader,
+                                            pickle_module):
full_table, path = _create_partitioned_dataset(tempdir)
# no partitioning specified, just read all individual files
table = full_table.select(['a', 'b'])
- _check_dataset_from_path(path, table, dataset_reader)
+ _check_dataset_from_path(path, table, dataset_reader, pickle_module)
# specify partition scheme with discovery
dataset = ds.dataset(
@@ -2470,14 +2471,14 @@ def test_open_dataset_unsupported_format(tempdir):
@pytest.mark.parquet
-def test_open_union_dataset(tempdir, dataset_reader):
+def test_open_union_dataset(tempdir, dataset_reader, pickle_module):
_, path = _create_single_file(tempdir)
dataset = ds.dataset(path)
union = ds.dataset([dataset, dataset])
assert isinstance(union, ds.UnionDataset)
- pickled = pickle.loads(pickle.dumps(union))
+ pickled = pickle_module.loads(pickle_module.dumps(union))
assert dataset_reader.to_table(pickled) == dataset_reader.to_table(union)
@@ -2568,7 +2569,7 @@ def test_partition_discovery(
@pytest.mark.pandas
-def test_dataset_partitioned_dictionary_type_reconstruct(tempdir):
+def test_dataset_partitioned_dictionary_type_reconstruct(tempdir,
+                                                         pickle_module):
# https://issues.apache.org/jira/browse/ARROW-11400
table = pa.table({'part': np.repeat(['A', 'B'], 5), 'col': range(10)})
part = ds.partitioning(table.select(['part']).schema, flavor="hive")
@@ -2586,10 +2587,10 @@ def
test_dataset_partitioned_dictionary_type_reconstruct(tempdir):
assert fragment.to_table(schema=dataset.schema).equals(expected[:5])
part_expr = fragment.partition_expression
- restored = pickle.loads(pickle.dumps(dataset))
+ restored = pickle_module.loads(pickle_module.dumps(dataset))
assert restored.to_table().equals(expected)
- restored = pickle.loads(pickle.dumps(fragment))
+ restored = pickle_module.loads(pickle_module.dumps(fragment))
assert restored.to_table(schema=dataset.schema).equals(expected[:5])
# to_pandas call triggers computation of the actual dictionary values
assert restored.to_table(schema=dataset.schema).to_pandas().equals(
@@ -3789,7 +3790,7 @@ def test_dataset_preserved_partitioning(tempdir):
assert isinstance(dataset.partitioning, ds.DirectoryPartitioning)
# TODO(GH-34884) partitioning attribute not preserved in pickling
# dataset_ = ds.dataset(path)
- # for dataset in [dataset_, pickle.loads(pickle.dumps(dataset_))]:
+    # for dataset in [dataset_,
+    #                 pickle_module.loads(pickle_module.dumps(dataset_))]:
# assert isinstance(dataset.partitioning, ds.DirectoryPartitioning)
# through discovery, with hive partitioning but not specified
diff --git a/python/pyarrow/tests/test_extension_type.py
b/python/pyarrow/tests/test_extension_type.py
index 973aa29c75..1eb7d5fa76 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -16,7 +16,6 @@
# under the License.
import os
-import pickle
import shutil
import subprocess
import weakref
@@ -217,12 +216,12 @@ def test_ext_type_as_py():
assert result.as_py() == expected
-def test_uuid_type_pickle():
- for proto in range(0, pickle.HIGHEST_PROTOCOL + 1):
+def test_uuid_type_pickle(pickle_module):
+ for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1):
ty = UuidType()
- ser = pickle.dumps(ty, protocol=proto)
+ ser = pickle_module.dumps(ty, protocol=proto)
del ty
- ty = pickle.loads(ser)
+ ty = pickle_module.loads(ser)
wr = weakref.ref(ty)
assert ty.extension_name == "arrow.py_extension_type"
del ty
@@ -410,14 +409,14 @@ def test_ext_scalar_from_storage():
assert s.value == pa.scalar(b"0123456789abcdef", ty.storage_type)
-def test_ext_array_pickling():
- for proto in range(0, pickle.HIGHEST_PROTOCOL + 1):
+def test_ext_array_pickling(pickle_module):
+ for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1):
ty = ParamExtType(3)
storage = pa.array([b"foo", b"bar"], type=pa.binary(3))
arr = pa.ExtensionArray.from_storage(ty, storage)
- ser = pickle.dumps(arr, protocol=proto)
+ ser = pickle_module.dumps(arr, protocol=proto)
del ty, storage, arr
- arr = pickle.loads(ser)
+ arr = pickle_module.loads(ser)
arr.validate()
assert isinstance(arr, pa.ExtensionArray)
assert arr.type == ParamExtType(3)
@@ -867,23 +866,23 @@ def test_generic_ext_type_equality():
assert not period_type == period_type3
-def test_generic_ext_type_pickling(registered_period_type):
+def test_generic_ext_type_pickling(registered_period_type, pickle_module):
# GH-36038
- for proto in range(0, pickle.HIGHEST_PROTOCOL + 1):
+ for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1):
period_type, _ = registered_period_type
- ser = pickle.dumps(period_type, protocol=proto)
- period_type_pickled = pickle.loads(ser)
+ ser = pickle_module.dumps(period_type, protocol=proto)
+ period_type_pickled = pickle_module.loads(ser)
assert period_type == period_type_pickled
-def test_generic_ext_array_pickling(registered_period_type):
- for proto in range(0, pickle.HIGHEST_PROTOCOL + 1):
+def test_generic_ext_array_pickling(registered_period_type, pickle_module):
+ for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1):
period_type, _ = registered_period_type
storage = pa.array([1, 2, 3, 4], pa.int64())
arr = pa.ExtensionArray.from_storage(period_type, storage)
- ser = pickle.dumps(arr, protocol=proto)
+ ser = pickle_module.dumps(arr, protocol=proto)
del storage, arr
- arr = pickle.loads(ser)
+ arr = pickle_module.loads(ser)
arr.validate()
assert isinstance(arr, pa.ExtensionArray)
assert arr.type == period_type
@@ -1338,17 +1337,17 @@ def
test_extension_to_pandas_storage_type(registered_period_type):
assert isinstance(result["ext"].dtype, pd.ArrowDtype)
-def test_tensor_type_is_picklable():
+def test_tensor_type_is_picklable(pickle_module):
# GH-35599
expected_type = pa.fixed_shape_tensor(pa.int32(), (2, 2))
- result = pickle.loads(pickle.dumps(expected_type))
+ result = pickle_module.loads(pickle_module.dumps(expected_type))
assert result == expected_type
arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]]
storage = pa.array(arr, pa.list_(pa.int32(), 4))
expected_arr = pa.ExtensionArray.from_storage(expected_type, storage)
- result = pickle.loads(pickle.dumps(expected_arr))
+ result = pickle_module.loads(pickle_module.dumps(expected_arr))
assert result == expected_arr
diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py
index b64680c87c..8135f70f69 100644
--- a/python/pyarrow/tests/test_fs.py
+++ b/python/pyarrow/tests/test_fs.py
@@ -19,7 +19,6 @@ from datetime import datetime, timezone, timedelta
import gzip
import os
import pathlib
-import pickle
import subprocess
import sys
@@ -559,17 +558,17 @@ def test_subtree_filesystem():
' base_fs=<pyarrow._fs.LocalFileSystem')
-def test_filesystem_pickling(fs):
+def test_filesystem_pickling(fs, pickle_module):
if fs.type_name.split('::')[-1] == 'mock':
pytest.xfail(reason='MockFileSystem is not serializable')
- serialized = pickle.dumps(fs)
- restored = pickle.loads(serialized)
+ serialized = pickle_module.dumps(fs)
+ restored = pickle_module.loads(serialized)
assert isinstance(restored, FileSystem)
assert restored.equals(fs)
-def test_filesystem_is_functional_after_pickling(fs, pathfn):
+def test_filesystem_is_functional_after_pickling(fs, pathfn, pickle_module):
if fs.type_name.split('::')[-1] == 'mock':
pytest.xfail(reason='MockFileSystem is not serializable')
skip_fsspec_s3fs(fs)
@@ -584,7 +583,7 @@ def test_filesystem_is_functional_after_pickling(fs,
pathfn):
with fs.open_output_stream(c) as fp:
fp.write(b'test')
- restored = pickle.loads(pickle.dumps(fs))
+ restored = pickle_module.loads(pickle_module.dumps(fs))
aaa_info, bb_info, c_info = restored.get_file_info([aaa, bb, c])
assert aaa_info.type == FileType.Directory
assert bb_info.type == FileType.File
@@ -1060,7 +1059,7 @@ def test_mockfs_mtime_roundtrip(mockfs):
@pytest.mark.gcs
-def test_gcs_options():
+def test_gcs_options(pickle_module):
from pyarrow.fs import GcsFileSystem
dt = datetime.now()
fs = GcsFileSystem(access_token='abc',
@@ -1072,20 +1071,20 @@ def test_gcs_options():
assert isinstance(fs, GcsFileSystem)
assert fs.default_bucket_location == 'us-west2'
assert fs.project_id == 'test-project-id'
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
fs = GcsFileSystem()
assert isinstance(fs, GcsFileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
fs = GcsFileSystem(anonymous=True)
assert isinstance(fs, GcsFileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
fs = GcsFileSystem(default_metadata={"ACL": "authenticated-read",
"Content-Type": "text/plain"})
assert isinstance(fs, GcsFileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
with pytest.raises(ValueError):
GcsFileSystem(access_token='access')
@@ -1098,7 +1097,7 @@ def test_gcs_options():
@pytest.mark.s3
-def test_s3_options():
+def test_s3_options(pickle_module):
from pyarrow.fs import (AwsDefaultS3RetryStrategy,
AwsStandardS3RetryStrategy, S3FileSystem,
S3RetryStrategy)
@@ -1108,12 +1107,12 @@ def test_s3_options():
scheme='https', endpoint_override='localhost:8999')
assert isinstance(fs, S3FileSystem)
assert fs.region == 'us-east-2'
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
fs = S3FileSystem(role_arn='role', session_name='session',
external_id='id', load_frequency=100)
assert isinstance(fs, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
# Note that the retry strategy won't survive pickling for now
fs = S3FileSystem(
@@ -1126,35 +1125,35 @@ def test_s3_options():
fs2 = S3FileSystem(role_arn='role')
assert isinstance(fs2, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs2)) == fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) == fs2
assert fs2 != fs
fs = S3FileSystem(anonymous=True)
assert isinstance(fs, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
fs = S3FileSystem(background_writes=True)
assert isinstance(fs, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
fs2 = S3FileSystem(background_writes=True,
default_metadata={"ACL": "authenticated-read",
"Content-Type": "text/plain"})
assert isinstance(fs2, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs2)) == fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) == fs2
assert fs2 != fs
fs = S3FileSystem(allow_bucket_creation=True, allow_bucket_deletion=True)
assert isinstance(fs, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
fs = S3FileSystem(request_timeout=0.5, connect_timeout=0.25)
assert isinstance(fs, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
fs2 = S3FileSystem(request_timeout=0.25, connect_timeout=0.5)
assert isinstance(fs2, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs2)) == fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) == fs2
assert fs2 != fs
with pytest.raises(ValueError):
@@ -1182,7 +1181,7 @@ def test_s3_options():
@pytest.mark.s3
-def test_s3_proxy_options(monkeypatch):
+def test_s3_proxy_options(monkeypatch, pickle_module):
from pyarrow.fs import S3FileSystem
# The following two are equivalent:
@@ -1195,111 +1194,111 @@ def test_s3_proxy_options(monkeypatch):
# Check dict case for 'proxy_options'
fs = S3FileSystem(proxy_options=proxy_opts_1_dict)
assert isinstance(fs, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
fs = S3FileSystem(proxy_options=proxy_opts_2_dict)
assert isinstance(fs, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
# Check str case for 'proxy_options'
fs = S3FileSystem(proxy_options=proxy_opts_1_str)
assert isinstance(fs, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
fs = S3FileSystem(proxy_options=proxy_opts_2_str)
assert isinstance(fs, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
# Check that two FSs using the same proxy_options dict are equal
fs1 = S3FileSystem(proxy_options=proxy_opts_1_dict)
fs2 = S3FileSystem(proxy_options=proxy_opts_1_dict)
assert fs1 == fs2
- assert pickle.loads(pickle.dumps(fs1)) == fs2
- assert pickle.loads(pickle.dumps(fs2)) == fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) == fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) == fs1
fs1 = S3FileSystem(proxy_options=proxy_opts_2_dict)
fs2 = S3FileSystem(proxy_options=proxy_opts_2_dict)
assert fs1 == fs2
- assert pickle.loads(pickle.dumps(fs1)) == fs2
- assert pickle.loads(pickle.dumps(fs2)) == fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) == fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) == fs1
# Check that two FSs using the same proxy_options str are equal
fs1 = S3FileSystem(proxy_options=proxy_opts_1_str)
fs2 = S3FileSystem(proxy_options=proxy_opts_1_str)
assert fs1 == fs2
- assert pickle.loads(pickle.dumps(fs1)) == fs2
- assert pickle.loads(pickle.dumps(fs2)) == fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) == fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) == fs1
fs1 = S3FileSystem(proxy_options=proxy_opts_2_str)
fs2 = S3FileSystem(proxy_options=proxy_opts_2_str)
assert fs1 == fs2
- assert pickle.loads(pickle.dumps(fs1)) == fs2
- assert pickle.loads(pickle.dumps(fs2)) == fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) == fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) == fs1
# Check that two FSs using equivalent proxy_options
# (one dict, one str) are equal
fs1 = S3FileSystem(proxy_options=proxy_opts_1_dict)
fs2 = S3FileSystem(proxy_options=proxy_opts_1_str)
assert fs1 == fs2
- assert pickle.loads(pickle.dumps(fs1)) == fs2
- assert pickle.loads(pickle.dumps(fs2)) == fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) == fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) == fs1
fs1 = S3FileSystem(proxy_options=proxy_opts_2_dict)
fs2 = S3FileSystem(proxy_options=proxy_opts_2_str)
assert fs1 == fs2
- assert pickle.loads(pickle.dumps(fs1)) == fs2
- assert pickle.loads(pickle.dumps(fs2)) == fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) == fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) == fs1
# Check that two FSs using nonequivalent proxy_options are not equal
fs1 = S3FileSystem(proxy_options=proxy_opts_1_dict)
fs2 = S3FileSystem(proxy_options=proxy_opts_2_dict)
assert fs1 != fs2
- assert pickle.loads(pickle.dumps(fs1)) != fs2
- assert pickle.loads(pickle.dumps(fs2)) != fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) != fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) != fs1
fs1 = S3FileSystem(proxy_options=proxy_opts_1_dict)
fs2 = S3FileSystem(proxy_options=proxy_opts_2_str)
assert fs1 != fs2
- assert pickle.loads(pickle.dumps(fs1)) != fs2
- assert pickle.loads(pickle.dumps(fs2)) != fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) != fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) != fs1
fs1 = S3FileSystem(proxy_options=proxy_opts_1_str)
fs2 = S3FileSystem(proxy_options=proxy_opts_2_dict)
assert fs1 != fs2
- assert pickle.loads(pickle.dumps(fs1)) != fs2
- assert pickle.loads(pickle.dumps(fs2)) != fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) != fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) != fs1
fs1 = S3FileSystem(proxy_options=proxy_opts_1_str)
fs2 = S3FileSystem(proxy_options=proxy_opts_2_str)
assert fs1 != fs2
- assert pickle.loads(pickle.dumps(fs1)) != fs2
- assert pickle.loads(pickle.dumps(fs2)) != fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) != fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) != fs1
# Check that two FSs (one using proxy_options and the other not)
# are not equal
fs1 = S3FileSystem(proxy_options=proxy_opts_1_dict)
fs2 = S3FileSystem()
assert fs1 != fs2
- assert pickle.loads(pickle.dumps(fs1)) != fs2
- assert pickle.loads(pickle.dumps(fs2)) != fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) != fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) != fs1
fs1 = S3FileSystem(proxy_options=proxy_opts_1_str)
fs2 = S3FileSystem()
assert fs1 != fs2
- assert pickle.loads(pickle.dumps(fs1)) != fs2
- assert pickle.loads(pickle.dumps(fs2)) != fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) != fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) != fs1
fs1 = S3FileSystem(proxy_options=proxy_opts_2_dict)
fs2 = S3FileSystem()
assert fs1 != fs2
- assert pickle.loads(pickle.dumps(fs1)) != fs2
- assert pickle.loads(pickle.dumps(fs2)) != fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) != fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) != fs1
fs1 = S3FileSystem(proxy_options=proxy_opts_2_str)
fs2 = S3FileSystem()
assert fs1 != fs2
- assert pickle.loads(pickle.dumps(fs1)) != fs2
- assert pickle.loads(pickle.dumps(fs2)) != fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) != fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) != fs1
# Only dict and str are supported
with pytest.raises(TypeError):
@@ -1347,7 +1346,7 @@ def test_s3fs_wrong_region():
@pytest.mark.hdfs
-def test_hdfs_options(hdfs_connection):
+def test_hdfs_options(hdfs_connection, pickle_module):
from pyarrow.fs import HadoopFileSystem
if not pa.have_libhdfs():
pytest.skip('Cannot locate libhdfs')
@@ -1405,7 +1404,7 @@ def test_hdfs_options(hdfs_connection):
for fs in [hdfs1, hdfs2, hdfs3, hdfs4, hdfs5, hdfs6, hdfs7, hdfs8,
hdfs9, hdfs10, hdfs11]:
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
host, port, user = hdfs_connection
@@ -1524,12 +1523,12 @@ def test_py_filesystem_equality():
assert fs1 != object()
-def test_py_filesystem_pickling():
+def test_py_filesystem_pickling(pickle_module):
handler = DummyHandler()
fs = PyFileSystem(handler)
- serialized = pickle.dumps(fs)
- restored = pickle.loads(serialized)
+ serialized = pickle_module.dumps(fs)
+ restored = pickle_module.loads(serialized)
assert isinstance(restored, FileSystem)
assert restored == fs
assert restored.handler == handler
diff --git a/python/pyarrow/tests/test_hdfs.py
b/python/pyarrow/tests/test_hdfs.py
index 1f5f10f7d9..511dbf9a1c 100644
--- a/python/pyarrow/tests/test_hdfs.py
+++ b/python/pyarrow/tests/test_hdfs.py
@@ -16,9 +16,7 @@
# under the License.
import os
-import pickle
import random
-import unittest
from io import BytesIO
from os.path import join as pjoin
@@ -81,20 +79,20 @@ class HdfsTestCases:
return full_path
@classmethod
- def setUpClass(cls):
+ def setup_class(cls):
cls.check_driver()
cls.hdfs = hdfs_test_client()
cls.tmp_path = '/tmp/pyarrow-test-{}'.format(random.randint(0, 1000))
cls.hdfs.mkdir(cls.tmp_path)
@classmethod
- def tearDownClass(cls):
+ def teardown_class(cls):
cls.hdfs.delete(cls.tmp_path, recursive=True)
cls.hdfs.close()
- def test_pickle(self):
- s = pickle.dumps(self.hdfs)
- h2 = pickle.loads(s)
+ def test_pickle(self, pickle_module):
+ s = pickle_module.dumps(self.hdfs)
+ h2 = pickle_module.loads(s)
assert h2.is_open
assert h2.host == self.hdfs.host
assert h2.port == self.hdfs.port
@@ -392,7 +390,7 @@ class HdfsTestCases:
tmpdir, filesystem=self.hdfs)
-class TestLibHdfs(HdfsTestCases, unittest.TestCase):
+class TestLibHdfs(HdfsTestCases):
@classmethod
def check_driver(cls):
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index a6488d70df..0c9e591ccd 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -24,7 +24,6 @@ import gzip
import math
import os
import pathlib
-import pickle
import pytest
import sys
import tempfile
@@ -372,17 +371,17 @@ def test_python_file_closing():
# Buffers
-def check_buffer_pickling(buf):
+def check_buffer_pickling(buf, pickler):
# Check that buffer survives a pickle roundtrip
- for protocol in range(0, pickle.HIGHEST_PROTOCOL + 1):
- result = pickle.loads(pickle.dumps(buf, protocol=protocol))
+ for protocol in range(0, pickler.HIGHEST_PROTOCOL + 1):
+ result = pickler.loads(pickler.dumps(buf, protocol=protocol))
assert len(result) == len(buf)
assert memoryview(result) == memoryview(buf)
assert result.to_pybytes() == buf.to_pybytes()
assert result.is_mutable == buf.is_mutable
-def test_buffer_bytes():
+def test_buffer_bytes(pickle_module):
val = b'some data'
buf = pa.py_buffer(val)
@@ -393,10 +392,10 @@ def test_buffer_bytes():
result = buf.to_pybytes()
assert result == val
- check_buffer_pickling(buf)
+ check_buffer_pickling(buf, pickle_module)
-def test_buffer_null_data():
+def test_buffer_null_data(pickle_module):
null_buff = pa.foreign_buffer(address=0, size=0)
assert null_buff.to_pybytes() == b""
assert null_buff.address == 0
@@ -406,10 +405,10 @@ def test_buffer_null_data():
assert m.tobytes() == b""
assert pa.py_buffer(m).address != 0
- check_buffer_pickling(null_buff)
+ check_buffer_pickling(null_buff, pickle_module)
-def test_buffer_memoryview():
+def test_buffer_memoryview(pickle_module):
val = b'some data'
buf = pa.py_buffer(val)
@@ -420,10 +419,10 @@ def test_buffer_memoryview():
result = memoryview(buf)
assert result == val
- check_buffer_pickling(buf)
+ check_buffer_pickling(buf, pickle_module)
-def test_buffer_bytearray():
+def test_buffer_bytearray(pickle_module):
val = bytearray(b'some data')
buf = pa.py_buffer(val)
@@ -434,7 +433,7 @@ def test_buffer_bytearray():
result = bytearray(buf)
assert result == val
- check_buffer_pickling(buf)
+ check_buffer_pickling(buf, pickle_module)
def test_buffer_invalid():
diff --git a/python/pyarrow/tests/test_json.py
b/python/pyarrow/tests/test_json.py
index f37019ade5..be83f891a2 100644
--- a/python/pyarrow/tests/test_json.py
+++ b/python/pyarrow/tests/test_json.py
@@ -20,7 +20,6 @@ from decimal import Decimal
import io
import itertools
import json
-import pickle
import string
import unittest
@@ -53,15 +52,15 @@ def make_random_json(num_cols=2, num_rows=10, linesep='\r\n'):
return data, expected
-def check_options_class_pickling(cls, **attr_values):
+def check_options_class_pickling(cls, pickler, **attr_values):
opts = cls(**attr_values)
- new_opts = pickle.loads(pickle.dumps(opts,
- protocol=pickle.HIGHEST_PROTOCOL))
+ new_opts = pickler.loads(pickler.dumps(opts,
+ protocol=pickler.HIGHEST_PROTOCOL))
for name, value in attr_values.items():
assert getattr(new_opts, name) == value
-def test_read_options():
+def test_read_options(pickle_module):
cls = ReadOptions
opts = cls()
@@ -77,11 +76,12 @@ def test_read_options():
assert opts.block_size == 1234
assert opts.use_threads is False
- check_options_class_pickling(cls, block_size=1234,
+ check_options_class_pickling(cls, pickler=pickle_module,
+ block_size=1234,
use_threads=False)
-def test_parse_options():
+def test_parse_options(pickle_module):
cls = ParseOptions
opts = cls()
assert opts.newlines_in_values is False
@@ -102,7 +102,8 @@ def test_parse_options():
with pytest.raises(ValueError):
opts.unexpected_field_behavior = "invalid-value"
- check_options_class_pickling(cls, explicit_schema=schema,
+ check_options_class_pickling(cls, pickler=pickle_module,
+ explicit_schema=schema,
newlines_in_values=False,
unexpected_field_behavior="ignore")
diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py
index d52edce6d9..5f6c8c813f 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -17,7 +17,6 @@
import datetime
import decimal
-import pickle
import pytest
import sys
import weakref
@@ -68,7 +67,7 @@ from pyarrow.tests import util
({'a': 1, 'b': [1, 2]}, None, pa.StructScalar),
([('a', 1), ('b', 2)], pa.map_(pa.string(), pa.int8()), pa.MapScalar),
])
-def test_basics(value, ty, klass):
+def test_basics(value, ty, klass, pickle_module):
s = pa.scalar(value, type=ty)
s.validate()
s.validate(full=True)
@@ -87,7 +86,7 @@ def test_basics(value, ty, klass):
assert s != pa.scalar(value, type=ty)
# test pickle roundtrip
- restored = pickle.loads(pickle.dumps(s))
+ restored = pickle_module.loads(pickle_module.dumps(s))
assert s.equals(restored)
# test that scalars are weak-referenceable
@@ -110,7 +109,7 @@ def test_null_singleton():
pa.NullScalar()
-def test_nulls():
+def test_nulls(pickle_module):
null = pa.scalar(None)
assert null is pa.NA
assert null.as_py() is None
@@ -126,7 +125,7 @@ def test_nulls():
assert v.as_py() is None
# test pickle roundtrip
- restored = pickle.loads(pickle.dumps(null))
+ restored = pickle_module.loads(pickle_module.dumps(null))
assert restored.equals(null)
# test that scalars are weak-referenceable
@@ -683,7 +682,7 @@ def test_struct_duplicate_fields():
s.as_py()
-def test_map():
+def test_map(pickle_module):
ty = pa.map_(pa.string(), pa.int8())
v = [('a', 1), ('b', 2)]
s = pa.scalar(v, type=ty)
@@ -713,11 +712,11 @@ def test_map():
with pytest.raises(IndexError):
s[2]
- restored = pickle.loads(pickle.dumps(s))
+ restored = pickle_module.loads(pickle_module.dumps(s))
assert restored.equals(s)
-def test_dictionary():
+def test_dictionary(pickle_module):
indices = pa.array([2, None, 1, 2, 0, None])
dictionary = pa.array(['foo', 'bar', 'baz'])
@@ -733,7 +732,7 @@ def test_dictionary():
assert s.index.equals(i)
assert s.dictionary.equals(dictionary)
- restored = pickle.loads(pickle.dumps(s))
+ restored = pickle_module.loads(pickle_module.dumps(s))
assert restored.equals(s)
@@ -758,7 +757,7 @@ def test_run_end_encoded():
pa.scalar(1, pa.run_end_encoded(pa.int64(), pa.int64()))
-def test_union():
+def test_union(pickle_module):
# sparse
arr = pa.UnionArray.from_sparse(
pa.array([0, 0, 1, 1], type=pa.int8()),
@@ -773,7 +772,7 @@ def test_union():
assert s.type.equals(arr.type)
assert s.is_valid is True
with pytest.raises(pa.ArrowNotImplementedError):
- pickle.loads(pickle.dumps(s))
+ pickle_module.loads(pickle_module.dumps(s))
assert arr[0].type_code == 0
assert arr[0].as_py() == "a"
@@ -799,7 +798,7 @@ def test_union():
assert s.type.equals(arr.type)
assert s.is_valid is True
with pytest.raises(pa.ArrowNotImplementedError):
- pickle.loads(pickle.dumps(s))
+ pickle_module.loads(pickle_module.dumps(s))
assert arr[0].type_code == 0
assert arr[0].as_py() == b'a'
diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py
index 2c2f9547f2..e28e0ac445 100644
--- a/python/pyarrow/tests/test_schema.py
+++ b/python/pyarrow/tests/test_schema.py
@@ -16,7 +16,6 @@
# under the License.
from collections import OrderedDict
-import pickle
import sys
import weakref
@@ -585,7 +584,7 @@ two: int32""")
assert repr(sch) == expected
-def test_type_schema_pickling():
+def test_type_schema_pickling(pickle_module):
cases = [
pa.int8(),
pa.string(),
@@ -621,7 +620,7 @@ def test_type_schema_pickling():
]
for val in cases:
- roundtripped = pickle.loads(pickle.dumps(val))
+ roundtripped = pickle_module.loads(pickle_module.dumps(val))
assert val == roundtripped
fields = []
@@ -632,7 +631,7 @@ def test_type_schema_pickling():
fields.append(pa.field('_f{}'.format(i), f))
schema = pa.schema(fields, metadata={b'foo': b'bar'})
- roundtripped = pickle.loads(pickle.dumps(schema))
+ roundtripped = pickle_module.loads(pickle_module.dumps(schema))
assert schema == roundtripped
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index e28256e91f..f93c6bbc2c 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -17,7 +17,6 @@
from collections import OrderedDict
from collections.abc import Iterable
-import pickle
import sys
import weakref
@@ -301,14 +300,14 @@ def test_chunked_array_equals():
pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))
]
)
-def test_chunked_array_pickle(data, typ):
+def test_chunked_array_pickle(data, typ, pickle_module):
arrays = []
while data:
arrays.append(pa.array(data[:2], type=typ))
data = data[2:]
array = pa.chunked_array(arrays, type=typ)
array.validate()
- result = pickle.loads(pickle.dumps(array))
+ result = pickle_module.loads(pickle_module.dumps(array))
result.validate()
assert result.equals(array)
@@ -663,7 +662,7 @@ def test_recordbatch_empty_metadata():
assert batch.schema.metadata is None
-def test_recordbatch_pickle():
+def test_recordbatch_pickle(pickle_module):
data = [
pa.array(range(5), type='int8'),
pa.array([-10, -5, 0, 5, 10], type='float32')
@@ -675,7 +674,7 @@ def test_recordbatch_pickle():
schema = pa.schema(fields, metadata={b'foo': b'bar'})
batch = pa.record_batch(data, schema=schema)
- result = pickle.loads(pickle.dumps(batch))
+ result = pickle_module.loads(pickle_module.dumps(batch))
assert result.equals(batch)
assert result.schema == schema
@@ -1022,7 +1021,7 @@ def test_table_from_lists():
assert result.equals(expected)
-def test_table_pickle():
+def test_table_pickle(pickle_module):
data = [
pa.chunked_array([[1, 2], [3, 4]], type=pa.uint32()),
pa.chunked_array([["some", "strings", None, ""]], type=pa.string()),
@@ -1032,7 +1031,7 @@ def test_table_pickle():
metadata={b'foo': b'bar'})
table = pa.Table.from_arrays(data, schema=schema)
- result = pickle.loads(pickle.dumps(table))
+ result = pickle_module.loads(pickle_module.dumps(table))
result.validate()
assert result.equals(table)
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index 568702d8e8..f3b6001003 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -21,7 +21,6 @@ from functools import partial
import datetime
import sys
-import pickle
import pytest
import hypothesis as h
import hypothesis.strategies as st
@@ -790,10 +789,10 @@ def test_types_hashable():
assert in_dict[type_] == i
-def test_types_picklable():
+def test_types_picklable(pickle_module):
for ty in get_many_types():
- data = pickle.dumps(ty)
- assert pickle.loads(data) == ty
+ data = pickle_module.dumps(ty)
+ assert pickle_module.loads(data) == ty
def test_types_weakref():
@@ -1193,9 +1192,9 @@ def test_is_boolean_value():
@h.example(
pa.field(name='', type=pa.null(), metadata={'0': '', '': ''})
)
-def test_pickling(field):
- data = pickle.dumps(field)
- assert pickle.loads(data) == field
+def test_pickling(pickle_module, field):
+ data = pickle_module.dumps(field)
+ assert pickle_module.loads(data) == field
@h.given(