This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 175b2a2fe2 GH-37254: [Python] Parametrize all pickling tests to use
both the pickle and cloudpickle modules (#37255)
175b2a2fe2 is described below
commit 175b2a2fe2d6e35237c36666478800f62e3f6947
Author: Dane Pitkin <[email protected]>
AuthorDate: Wed Aug 23 22:12:55 2023 -0400
GH-37254: [Python] Parametrize all pickling tests to use both the pickle
and cloudpickle modules (#37255)
### Rationale for this change
Cloudpickle was not tested in most parts of the pyarrow test suite.
Improving this coverage will make the Cython 3.0.0 upgrade cleaner as
cloudpickle was failing in a few places where the default pickle module was
not. This has been verified using Cython 0.29.36.
### What changes are included in this PR?
* `__reduce__` methods that need to pass kwargs have been changed from
classmethod to staticmethod
* All pytests that pickle objects are parameterized to use both `pickle`
and `cloudpickle`
### Are these changes tested?
Yes, pytests run successfully with Cython 0.29.36
### Are there any user-facing changes?
No.
* Closes: #37254
Authored-by: Dane Pitkin <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
python/pyarrow/_dataset_parquet.pyx | 12 ++-
python/pyarrow/_fs.pyx | 8 +-
python/pyarrow/_gcsfs.pyx | 13 ++-
python/pyarrow/_hdfs.pyx | 11 ++-
python/pyarrow/_s3fs.pyx | 11 ++-
python/pyarrow/scalar.pxi | 6 +-
python/pyarrow/tests/conftest.py | 111 +++++++++++++++++----
python/pyarrow/tests/parquet/test_dataset.py | 25 +----
python/pyarrow/tests/test_array.py | 35 +++----
python/pyarrow/tests/test_compute.py | 13 ++-
python/pyarrow/tests/test_csv.py | 30 +++---
python/pyarrow/tests/test_dataset.py | 139 ++++++++++++++-------------
python/pyarrow/tests/test_extension_type.py | 39 ++++----
python/pyarrow/tests/test_fs.py | 117 +++++++++++-----------
python/pyarrow/tests/test_hdfs.py | 14 ++-
python/pyarrow/tests/test_io.py | 23 +++--
python/pyarrow/tests/test_json.py | 17 ++--
python/pyarrow/tests/test_scalars.py | 23 +++--
python/pyarrow/tests/test_schema.py | 7 +-
python/pyarrow/tests/test_table.py | 13 ++-
python/pyarrow/tests/test_types.py | 13 ++-
21 files changed, 372 insertions(+), 308 deletions(-)
diff --git a/python/pyarrow/_dataset_parquet.pyx
b/python/pyarrow/_dataset_parquet.pyx
index 4de396f4f5..79bd270ce5 100644
--- a/python/pyarrow/_dataset_parquet.pyx
+++ b/python/pyarrow/_dataset_parquet.pyx
@@ -19,6 +19,7 @@
"""Dataset support for Parquet file format."""
+from cython cimport binding
from cython.operator cimport dereference as deref
import os
@@ -770,9 +771,12 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
other.thrift_container_size_limit)
return attrs == other_attrs
- @classmethod
- def _reconstruct(cls, kwargs):
- return cls(**kwargs)
+ @staticmethod
+ @binding(True) # Required for Cython < 3
+ def _reconstruct(kwargs):
+ # __reduce__ doesn't allow passing named arguments directly to the
+ # reconstructor, hence this wrapper.
+ return ParquetFragmentScanOptions(**kwargs)
def __reduce__(self):
kwargs = dict(
@@ -782,7 +786,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
thrift_string_size_limit=self.thrift_string_size_limit,
thrift_container_size_limit=self.thrift_container_size_limit,
)
- return type(self)._reconstruct, (kwargs,)
+ return ParquetFragmentScanOptions._reconstruct, (kwargs,)
cdef class ParquetFactoryOptions(_Weakrefable):
diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx
index dbd7ebe5e4..ef8db31bfc 100644
--- a/python/pyarrow/_fs.pyx
+++ b/python/pyarrow/_fs.pyx
@@ -18,6 +18,7 @@
# cython: language_level = 3
from cpython.datetime cimport datetime, PyDateTime_DateTime
+from cython cimport binding
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow_python cimport PyDateTime_to_TimePoint
@@ -1106,11 +1107,12 @@ cdef class LocalFileSystem(FileSystem):
FileSystem.init(self, c_fs)
self.localfs = <CLocalFileSystem*> c_fs.get()
- @classmethod
- def _reconstruct(cls, kwargs):
+ @staticmethod
+ @binding(True) # Required for cython < 3
+ def _reconstruct(kwargs):
# __reduce__ doesn't allow passing named arguments directly to the
# reconstructor, hence this wrapper.
- return cls(**kwargs)
+ return LocalFileSystem(**kwargs)
def __reduce__(self):
cdef CLocalFileSystemOptions opts = self.localfs.options()
diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx
index 1fda08232c..5e69413cea 100644
--- a/python/pyarrow/_gcsfs.pyx
+++ b/python/pyarrow/_gcsfs.pyx
@@ -17,6 +17,8 @@
# cython: language_level = 3
+from cython cimport binding
+
from pyarrow.lib cimport (pyarrow_wrap_metadata,
pyarrow_unwrap_metadata)
from pyarrow.lib import frombytes, tobytes, ensure_metadata
@@ -154,10 +156,6 @@ cdef class GcsFileSystem(FileSystem):
FileSystem.init(self, wrapped)
self.gcsfs = <CGcsFileSystem*> wrapped.get()
- @classmethod
- def _reconstruct(cls, kwargs):
- return cls(**kwargs)
-
def _expiration_datetime_from_options(self):
expiration_ns = TimePoint_to_ns(
self.gcsfs.options().credentials.expiration())
@@ -165,6 +163,13 @@ cdef class GcsFileSystem(FileSystem):
return None
return datetime.fromtimestamp(expiration_ns / 1.0e9, timezone.utc)
+ @staticmethod
+ @binding(True) # Required for cython < 3
+ def _reconstruct(kwargs):
+ # __reduce__ doesn't allow passing named arguments directly to the
+ # reconstructor, hence this wrapper.
+ return GcsFileSystem(**kwargs)
+
def __reduce__(self):
cdef CGcsOptions opts = self.gcsfs.options()
service_account = frombytes(opts.credentials.target_service_account())
diff --git a/python/pyarrow/_hdfs.pyx b/python/pyarrow/_hdfs.pyx
index a0cef25105..c426337a12 100644
--- a/python/pyarrow/_hdfs.pyx
+++ b/python/pyarrow/_hdfs.pyx
@@ -17,6 +17,8 @@
# cython: language_level = 3
+from cython cimport binding
+
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_fs cimport *
@@ -134,9 +136,12 @@ replication=1)``
self.init(<shared_ptr[CFileSystem]> wrapped)
return self
- @classmethod
- def _reconstruct(cls, kwargs):
- return cls(**kwargs)
+ @staticmethod
+ @binding(True) # Required for cython < 3
+ def _reconstruct(kwargs):
+ # __reduce__ doesn't allow passing named arguments directly to the
+ # reconstructor, hence this wrapper.
+ return HadoopFileSystem(**kwargs)
def __reduce__(self):
cdef CHdfsOptions opts = self.hdfs.options()
diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx
index 51c248d147..ab45171369 100644
--- a/python/pyarrow/_s3fs.pyx
+++ b/python/pyarrow/_s3fs.pyx
@@ -17,6 +17,8 @@
# cython: language_level = 3
+from cython cimport binding
+
from pyarrow.lib cimport (check_status, pyarrow_wrap_metadata,
pyarrow_unwrap_metadata)
from pyarrow.lib import frombytes, tobytes, KeyValueMetadata
@@ -388,9 +390,12 @@ cdef class S3FileSystem(FileSystem):
FileSystem.init(self, wrapped)
self.s3fs = <CS3FileSystem*> wrapped.get()
- @classmethod
- def _reconstruct(cls, kwargs):
- return cls(**kwargs)
+ @staticmethod
+ @binding(True) # Required for cython < 3
+ def _reconstruct(kwargs):
+ # __reduce__ doesn't allow passing named arguments directly to the
+ # reconstructor, hence this wrapper.
+ return S3FileSystem(**kwargs)
def __reduce__(self):
cdef CS3Options opts = self.s3fs.options()
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index e19807ba56..e07949c675 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -16,6 +16,7 @@
# under the License.
import collections
+from cython cimport binding
cdef class Scalar(_Weakrefable):
@@ -836,8 +837,9 @@ cdef class DictionaryScalar(Scalar):
Concrete class for dictionary-encoded scalars.
"""
- @classmethod
- def _reconstruct(cls, type, is_valid, index, dictionary):
+ @staticmethod
+ @binding(True) # Required for cython < 3
+ def _reconstruct(type, is_valid, index, dictionary):
cdef:
CDictionaryScalarIndexAndDictionary value
shared_ptr[CDictionaryScalar] wrapped
diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index e6d87217ec..241ae4814a 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -15,13 +15,16 @@
# specific language governing permissions and limitations
# under the License.
+import functools
import os
import pathlib
import subprocess
import sys
-from tempfile import TemporaryDirectory
+import time
+import urllib.request
import pytest
+from pytest_lazyfixture import lazy_fixture
import hypothesis as h
from ..conftest import groups, defaults
@@ -146,8 +149,49 @@ def s3_connection():
return host, port, access_key, secret_key
+def retry(attempts=3, delay=1.0, max_delay=None, backoff=1):
+ """
+ Retry decorator
+
+ Parameters
+ ----------
+ attempts : int, default 3
+ The number of attempts.
+ delay : float, default 1
+ Initial delay in seconds.
+ max_delay : float, optional
+ The max delay between attempts.
+ backoff : float, default 1
+ The multiplier to delay after each attempt.
+ """
+ def decorate(func):
+ @functools.wraps(func)
+ def wrapper(*args, **kwargs):
+ remaining_attempts = attempts
+ curr_delay = delay
+ while remaining_attempts > 0:
+ try:
+ return func(*args, **kwargs)
+ except Exception as err:
+ remaining_attempts -= 1
+ last_exception = err
+ curr_delay *= backoff
+ if max_delay:
+ curr_delay = min(curr_delay, max_delay)
+ time.sleep(curr_delay)
+ raise last_exception
+ return wrapper
+ return decorate
+
+
@pytest.fixture(scope='session')
-def s3_server(s3_connection):
+def s3_server(s3_connection, tmpdir_factory):
+ @retry(attempts=5, delay=0.1, backoff=2)
+ def minio_server_health_check(address):
+ resp = urllib.request.urlopen(f"http://{address}/minio/health/cluster")
+ assert resp.getcode() == 200
+
+ tmpdir = tmpdir_factory.getbasetemp()
host, port, access_key, secret_key = s3_connection
address = '{}:{}'.format(host, port)
@@ -157,24 +201,26 @@ def s3_server(s3_connection):
'MINIO_SECRET_KEY': secret_key
})
- with TemporaryDirectory() as tempdir:
- args = ['minio', '--compat', 'server', '--quiet', '--address',
- address, tempdir]
- proc = None
- try:
- proc = subprocess.Popen(args, env=env)
- except OSError:
- pytest.skip('`minio` command cannot be located')
- else:
- yield {
- 'connection': s3_connection,
- 'process': proc,
- 'tempdir': tempdir
- }
- finally:
- if proc is not None:
- proc.kill()
- proc.wait()
+ args = ['minio', '--compat', 'server', '--quiet', '--address',
+ address, tmpdir]
+ proc = None
+ try:
+ proc = subprocess.Popen(args, env=env)
+ except OSError:
+ pytest.skip('`minio` command cannot be located')
+ else:
+ # Wait for the server to startup before yielding
+ minio_server_health_check(address)
+
+ yield {
+ 'connection': s3_connection,
+ 'process': proc,
+ 'tempdir': tmpdir
+ }
+ finally:
+ if proc is not None:
+ proc.kill()
+ proc.wait()
@pytest.fixture(scope='session')
@@ -202,3 +248,28 @@ def gcs_server():
if proc is not None:
proc.kill()
proc.wait()
+
+
[email protected](
+ params=[
+ lazy_fixture('builtin_pickle'),
+ lazy_fixture('cloudpickle')
+ ],
+ scope='session'
+)
+def pickle_module(request):
+ return request.param
+
+
[email protected](scope='session')
+def builtin_pickle():
+ import pickle
+ return pickle
+
+
[email protected](scope='session')
+def cloudpickle():
+ cp = pytest.importorskip('cloudpickle')
+ if 'HIGHEST_PROTOCOL' not in cp.__dict__:
+ cp.HIGHEST_PROTOCOL = cp.DEFAULT_PROTOCOL
+ return cp
diff --git a/python/pyarrow/tests/parquet/test_dataset.py
b/python/pyarrow/tests/parquet/test_dataset.py
index 3e6ff49265..be27c71b81 100644
--- a/python/pyarrow/tests/parquet/test_dataset.py
+++ b/python/pyarrow/tests/parquet/test_dataset.py
@@ -1533,10 +1533,13 @@ def _make_dataset_for_pickling(tempdir,
use_legacy_dataset=False, N=100):
return dataset
-def _assert_dataset_is_picklable(dataset, pickler, use_legacy_dataset=False):
[email protected]
+@parametrize_legacy_dataset
+def test_pickle_dataset(tempdir, datadir, use_legacy_dataset, pickle_module):
def is_pickleable(obj):
- return obj == pickler.loads(pickler.dumps(obj))
+ return obj == pickle_module.loads(pickle_module.dumps(obj))
+ dataset = _make_dataset_for_pickling(tempdir, use_legacy_dataset)
assert is_pickleable(dataset)
if use_legacy_dataset:
with pytest.warns(FutureWarning):
@@ -1555,24 +1558,6 @@ def _assert_dataset_is_picklable(dataset, pickler,
use_legacy_dataset=False):
assert is_pickleable(metadata.row_group(i))
[email protected]
-@parametrize_legacy_dataset
-def test_builtin_pickle_dataset(tempdir, datadir, use_legacy_dataset):
- import pickle
- dataset = _make_dataset_for_pickling(tempdir, use_legacy_dataset)
- _assert_dataset_is_picklable(
- dataset, pickler=pickle, use_legacy_dataset=use_legacy_dataset)
-
-
[email protected]
-@parametrize_legacy_dataset
-def test_cloudpickle_dataset(tempdir, datadir, use_legacy_dataset):
- cp = pytest.importorskip('cloudpickle')
- dataset = _make_dataset_for_pickling(tempdir, use_legacy_dataset)
- _assert_dataset_is_picklable(
- dataset, pickler=cp, use_legacy_dataset=use_legacy_dataset)
-
-
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_partitioned_dataset(tempdir, use_legacy_dataset):
diff --git a/python/pyarrow/tests/test_array.py
b/python/pyarrow/tests/test_array.py
index 0546830a66..fca094b519 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -21,7 +21,6 @@ import decimal
import hypothesis as h
import hypothesis.strategies as st
import itertools
-import pickle
import pytest
import struct
import subprocess
@@ -29,10 +28,6 @@ import sys
import weakref
import numpy as np
-try:
- import pickle5
-except ImportError:
- pickle5 = None
import pyarrow as pa
import pyarrow.tests.strategies as past
@@ -2007,21 +2002,21 @@ pickle_test_parametrize = pytest.mark.parametrize(
@pickle_test_parametrize
-def test_array_pickle(data, typ):
+def test_array_pickle(data, typ, pickle_module):
# Allocate here so that we don't have any Arrow data allocated.
# This is needed to ensure that allocator tests can be reliable.
array = pa.array(data, type=typ)
- for proto in range(0, pickle.HIGHEST_PROTOCOL + 1):
- result = pickle.loads(pickle.dumps(array, proto))
+ for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1):
+ result = pickle_module.loads(pickle_module.dumps(array, proto))
assert array.equals(result)
-def test_array_pickle_dictionary():
+def test_array_pickle_dictionary(pickle_module):
# not included in the above as dictionary array cannot be created with
# the pa.array function
array = pa.DictionaryArray.from_arrays([0, 1, 2, 0, 1], ['a', 'b', 'c'])
- for proto in range(0, pickle.HIGHEST_PROTOCOL + 1):
- result = pickle.loads(pickle.dumps(array, proto))
+ for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1):
+ result = pickle_module.loads(pickle_module.dumps(array, proto))
assert array.equals(result)
@@ -2031,27 +2026,23 @@ def test_array_pickle_dictionary():
size=st.integers(min_value=0, max_value=10)
)
)
-def test_pickling(arr):
- data = pickle.dumps(arr)
- restored = pickle.loads(data)
+def test_pickling(pickle_module, arr):
+ data = pickle_module.dumps(arr)
+ restored = pickle_module.loads(data)
assert arr.equals(restored)
@pickle_test_parametrize
-def test_array_pickle5(data, typ):
+def test_array_pickle_protocol5(data, typ, pickle_module):
# Test zero-copy pickling with protocol 5 (PEP 574)
- picklemod = pickle5 or pickle
- if pickle5 is None and picklemod.HIGHEST_PROTOCOL < 5:
- pytest.skip("need pickle5 package or Python 3.8+")
-
array = pa.array(data, type=typ)
addresses = [buf.address if buf is not None else 0
for buf in array.buffers()]
- for proto in range(5, pickle.HIGHEST_PROTOCOL + 1):
+ for proto in range(5, pickle_module.HIGHEST_PROTOCOL + 1):
buffers = []
- pickled = picklemod.dumps(array, proto, buffer_callback=buffers.append)
- result = picklemod.loads(pickled, buffers=buffers)
+ pickled = pickle_module.dumps(array, proto,
buffer_callback=buffers.append)
+ result = pickle_module.loads(pickled, buffers=buffers)
assert array.equals(result)
result_addresses = [buf.address if buf is not None else 0
diff --git a/python/pyarrow/tests/test_compute.py
b/python/pyarrow/tests/test_compute.py
index 6ee54ca4b1..152cae1be1 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -23,7 +23,6 @@ import inspect
import itertools
import math
import os
-import pickle
import pytest
import random
import sys
@@ -278,18 +277,18 @@ def test_call_function_with_memory_pool():
assert result3.equals(expected)
-def test_pickle_functions():
+def test_pickle_functions(pickle_module):
# Pickle registered functions
for name in pc.list_functions():
func = pc.get_function(name)
- reconstructed = pickle.loads(pickle.dumps(func))
+ reconstructed = pickle_module.loads(pickle_module.dumps(func))
assert type(reconstructed) is type(func)
assert reconstructed.name == func.name
assert reconstructed.arity == func.arity
assert reconstructed.num_kernels == func.num_kernels
-def test_pickle_global_functions():
+def test_pickle_global_functions(pickle_module):
# Pickle global wrappers (manual or automatic) of registered functions
for name in pc.list_functions():
try:
@@ -297,7 +296,7 @@ def test_pickle_global_functions():
except AttributeError:
# hash_aggregate functions are not exported as callables.
continue
- reconstructed = pickle.loads(pickle.dumps(func))
+ reconstructed = pickle_module.loads(pickle_module.dumps(func))
assert reconstructed is func
@@ -3366,10 +3365,10 @@ def create_sample_expressions():
# Tests the Arrow-specific serialization mechanism
-def test_expression_serialization_arrow():
+def test_expression_serialization_arrow(pickle_module):
for expr in create_sample_expressions()["all"]:
assert isinstance(expr, pc.Expression)
- restored = pickle.loads(pickle.dumps(expr))
+ restored = pickle_module.loads(pickle_module.dumps(expr))
assert expr.equals(restored)
diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
index 81c31d98ac..afc5380b75 100644
--- a/python/pyarrow/tests/test_csv.py
+++ b/python/pyarrow/tests/test_csv.py
@@ -24,7 +24,6 @@ import gzip
import io
import itertools
import os
-import pickle
import select
import shutil
import signal
@@ -102,10 +101,10 @@ def check_options_class(cls, **attr_values):
# The various options classes need to be picklable for dataset
-def check_options_class_pickling(cls, **attr_values):
+def check_options_class_pickling(cls, pickler, **attr_values):
opts = cls(**attr_values)
- new_opts = pickle.loads(pickle.dumps(opts,
- protocol=pickle.HIGHEST_PROTOCOL))
+ new_opts = pickler.loads(pickler.dumps(opts,
+ protocol=pickler.HIGHEST_PROTOCOL))
for name, value in attr_values.items():
assert getattr(new_opts, name) == value
@@ -128,7 +127,7 @@ class InvalidRowHandler:
other.result != self.result)
-def test_read_options():
+def test_read_options(pickle_module):
cls = ReadOptions
opts = cls()
@@ -139,7 +138,8 @@ def test_read_options():
encoding=['utf8', 'utf16'],
skip_rows_after_names=[0, 27])
- check_options_class_pickling(cls, use_threads=True,
+ check_options_class_pickling(cls, pickler=pickle_module,
+ use_threads=True,
skip_rows=3,
column_names=["ab", "cd"],
autogenerate_column_names=False,
@@ -182,7 +182,7 @@ def test_read_options():
opts.validate()
-def test_parse_options():
+def test_parse_options(pickle_module):
cls = ParseOptions
skip_handler = InvalidRowHandler('skip')
@@ -194,7 +194,8 @@ def test_parse_options():
ignore_empty_lines=[True, False],
invalid_row_handler=[None, skip_handler])
- check_options_class_pickling(cls, delimiter='x',
+ check_options_class_pickling(cls, pickler=pickle_module,
+ delimiter='x',
escape_char='y',
quote_char=False,
double_quote=False,
@@ -241,7 +242,7 @@ def test_parse_options():
opts.validate()
-def test_convert_options():
+def test_convert_options(pickle_module):
cls = ConvertOptions
opts = cls()
@@ -256,7 +257,8 @@ def test_convert_options():
timestamp_parsers=[[], [ISO8601, '%y-%m']])
check_options_class_pickling(
- cls, check_utf8=False,
+ cls, pickler=pickle_module,
+ check_utf8=False,
strings_can_be_null=True,
quoted_strings_can_be_null=False,
decimal_point=',',
@@ -622,7 +624,7 @@ class BaseTestCSV(abc.ABC):
read_options=read_options,
convert_options=convert_options)
- def test_invalid_row_handler(self):
+ def test_invalid_row_handler(self, pickle_module):
rows = b"a,b\nc\nd,e\nf,g,h\ni,j\n"
parse_opts = ParseOptions()
with pytest.raises(
@@ -657,7 +659,7 @@ class BaseTestCSV(abc.ABC):
# Test ser/de
parse_opts.invalid_row_handler = InvalidRowHandler('skip')
- parse_opts = pickle.loads(pickle.dumps(parse_opts))
+ parse_opts = pickle_module.loads(pickle_module.dumps(parse_opts))
table = self.read_bytes(rows, parse_options=parse_opts)
assert table.to_pydict() == {
@@ -1792,13 +1794,13 @@ class BaseStreamingCSVRead(BaseTestCSV):
assert reader.read_next_batch()
-class TestSerialStreamingCSVRead(BaseStreamingCSVRead, unittest.TestCase):
+class TestSerialStreamingCSVRead(BaseStreamingCSVRead):
@property
def use_threads(self):
return False
-class TestThreadedStreamingCSVRead(BaseStreamingCSVRead, unittest.TestCase):
+class TestThreadedStreamingCSVRead(BaseStreamingCSVRead):
@property
def use_threads(self):
return True
diff --git a/python/pyarrow/tests/test_dataset.py
b/python/pyarrow/tests/test_dataset.py
index f92317c0f2..b8a0c38089 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -20,7 +20,6 @@ import os
import posixpath
import datetime
import pathlib
-import pickle
import sys
import textwrap
import tempfile
@@ -700,7 +699,7 @@ def test_partitioning():
assert load_back_table.equals(table)
-def test_partitioning_pickling():
+def test_partitioning_pickling(pickle_module):
schema = pa.schema([
pa.field('i64', pa.int64()),
pa.field('f64', pa.float64())
@@ -715,7 +714,7 @@ def test_partitioning_pickling():
]
for part in parts:
- assert pickle.loads(pickle.dumps(part)) == part
+ assert pickle_module.loads(pickle_module.dumps(part)) == part
def test_expression_arithmetic_operators():
@@ -818,7 +817,7 @@ def test_parquet_scan_options():
assert opts5 != opts1
-def test_file_format_pickling():
+def test_file_format_pickling(pickle_module):
formats = [
ds.IpcFileFormat(),
ds.CsvFileFormat(),
@@ -854,10 +853,10 @@ def test_file_format_pickling():
])
for file_format in formats:
- assert pickle.loads(pickle.dumps(file_format)) == file_format
+ assert pickle_module.loads(pickle_module.dumps(file_format)) ==
file_format
-def test_fragment_scan_options_pickling():
+def test_fragment_scan_options_pickling(pickle_module):
options = [
ds.CsvFragmentScanOptions(),
ds.CsvFragmentScanOptions(
@@ -879,7 +878,7 @@ def test_fragment_scan_options_pickling():
])
for option in options:
- assert pickle.loads(pickle.dumps(option)) == option
+ assert pickle_module.loads(pickle_module.dumps(option)) == option
@pytest.mark.parametrize('paths_or_selector', [
@@ -982,7 +981,7 @@ def test_make_fragment(multisourcefs):
assert row_group_fragment.row_groups == [0]
-def test_make_csv_fragment_from_buffer(dataset_reader):
+def test_make_csv_fragment_from_buffer(dataset_reader, pickle_module):
content = textwrap.dedent("""
alpha,num,animal
a,12,dog
@@ -1003,11 +1002,11 @@ def test_make_csv_fragment_from_buffer(dataset_reader):
names=['alpha', 'num', 'animal'])
assert dataset_reader.to_table(fragment).equals(expected)
- pickled = pickle.loads(pickle.dumps(fragment))
+ pickled = pickle_module.loads(pickle_module.dumps(fragment))
assert dataset_reader.to_table(pickled).equals(fragment.to_table())
-def test_make_json_fragment_from_buffer(dataset_reader):
+def test_make_json_fragment_from_buffer(dataset_reader, pickle_module):
content = '{"alpha" : "a", "num": 12, "animal" : "dog"}\n' + \
'{"alpha" : "b", "num": 11, "animal" : "cat"}\n' + \
'{"alpha" : "c", "num": 10, "animal" : "rabbit"}\n'
@@ -1025,12 +1024,12 @@ def test_make_json_fragment_from_buffer(dataset_reader):
names=['alpha', 'num', 'animal'])
assert dataset_reader.to_table(fragment).equals(expected)
- pickled = pickle.loads(pickle.dumps(fragment))
+ pickled = pickle_module.loads(pickle_module.dumps(fragment))
assert dataset_reader.to_table(pickled).equals(fragment.to_table())
@pytest.mark.parquet
-def test_make_parquet_fragment_from_buffer(dataset_reader):
+def test_make_parquet_fragment_from_buffer(dataset_reader, pickle_module):
arrays = [
pa.array(['a', 'b', 'c']),
pa.array([12, 11, 10]),
@@ -1063,7 +1062,7 @@ def
test_make_parquet_fragment_from_buffer(dataset_reader):
fragment = format_.make_fragment(buffer)
assert dataset_reader.to_table(fragment).equals(table)
- pickled = pickle.loads(pickle.dumps(fragment))
+ pickled = pickle_module.loads(pickle_module.dumps(fragment))
assert dataset_reader.to_table(pickled).equals(table)
@@ -1140,7 +1139,7 @@ def test_fragments_implicit_cast(tempdir):
@pytest.mark.parquet
@pytest.mark.filterwarnings(
"ignore:Passing 'use_legacy_dataset=True':FutureWarning")
-def test_fragments_reconstruct(tempdir, dataset_reader):
+def test_fragments_reconstruct(tempdir, dataset_reader, pickle_module):
table, dataset = _create_dataset_for_fragments(tempdir)
def assert_yields_projected(fragment, row_slice,
@@ -1157,7 +1156,7 @@ def test_fragments_reconstruct(tempdir, dataset_reader):
parquet_format = fragment.format
# test pickle roundtrip
- pickled_fragment = pickle.loads(pickle.dumps(fragment))
+ pickled_fragment = pickle_module.loads(pickle_module.dumps(fragment))
assert dataset_reader.to_table(
pickled_fragment) == dataset_reader.to_table(fragment)
@@ -1272,7 +1271,7 @@ def test_fragments_parquet_row_groups_dictionary(tempdir,
dataset_reader):
@pytest.mark.parquet
@pytest.mark.filterwarnings(
"ignore:Passing 'use_legacy_dataset=True':FutureWarning")
-def test_fragments_parquet_ensure_metadata(tempdir, open_logging_fs):
+def test_fragments_parquet_ensure_metadata(tempdir, open_logging_fs,
pickle_module):
fs, assert_opens = open_logging_fs
_, dataset = _create_dataset_for_fragments(
tempdir, chunk_size=2, filesystem=fs
@@ -1304,7 +1303,7 @@ def test_fragments_parquet_ensure_metadata(tempdir,
open_logging_fs):
assert row_group.statistics is not None
# pickling preserves row group ids
- pickled_fragment = pickle.loads(pickle.dumps(new_fragment))
+ pickled_fragment = pickle_module.loads(pickle_module.dumps(new_fragment))
with assert_opens([fragment.path]):
assert pickled_fragment.row_groups == [0, 1]
row_group = pickled_fragment.row_groups[0]
@@ -1314,7 +1313,7 @@ def test_fragments_parquet_ensure_metadata(tempdir,
open_logging_fs):
@pytest.mark.pandas
@pytest.mark.parquet
-def test_fragments_parquet_pickle_no_metadata(tempdir, open_logging_fs):
+def test_fragments_parquet_pickle_no_metadata(tempdir, open_logging_fs,
pickle_module):
# https://issues.apache.org/jira/browse/ARROW-15796
fs, assert_opens = open_logging_fs
_, dataset = _create_dataset_for_fragments(tempdir, filesystem=fs)
@@ -1323,7 +1322,7 @@ def test_fragments_parquet_pickle_no_metadata(tempdir,
open_logging_fs):
# second fragment hasn't yet loaded the metadata,
# and pickling it also should not read the metadata
with assert_opens([]):
- pickled_fragment = pickle.loads(pickle.dumps(fragment))
+ pickled_fragment = pickle_module.loads(pickle_module.dumps(fragment))
# then accessing the row group info reads the metadata
with assert_opens([pickled_fragment.path]):
@@ -1487,7 +1486,8 @@ def test_fragments_parquet_row_groups_predicate(tempdir):
@pytest.mark.parquet
@pytest.mark.filterwarnings(
"ignore:Passing 'use_legacy_dataset=True':FutureWarning")
-def test_fragments_parquet_row_groups_reconstruct(tempdir, dataset_reader):
+def test_fragments_parquet_row_groups_reconstruct(tempdir, dataset_reader,
+ pickle_module):
table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2)
fragment = list(dataset.get_fragments())[0]
@@ -1495,7 +1495,7 @@ def
test_fragments_parquet_row_groups_reconstruct(tempdir, dataset_reader):
row_group_fragments = list(fragment.split_by_row_group())
# test pickle roundtrip
- pickled_fragment = pickle.loads(pickle.dumps(fragment))
+ pickled_fragment = pickle_module.loads(pickle_module.dumps(fragment))
assert dataset_reader.to_table(
pickled_fragment) == dataset_reader.to_table(fragment)
@@ -1644,14 +1644,14 @@ def test_fragments_repr(tempdir, dataset):
@pytest.mark.parquet
@pytest.mark.parametrize(
- "pickled", [lambda x: x, lambda x: pickle.loads(pickle.dumps(x))])
-def test_partitioning_factory(mockfs, pickled):
+ "pickled", [lambda x, m: x, lambda x, m: m.loads(m.dumps(x))])
+def test_partitioning_factory(mockfs, pickled, pickle_module):
paths_or_selector = fs.FileSelector('subdir', recursive=True)
format = ds.ParquetFileFormat()
options = ds.FileSystemFactoryOptions('subdir')
partitioning_factory = ds.DirectoryPartitioning.discover(['group', 'key'])
- partitioning_factory = pickled(partitioning_factory)
+ partitioning_factory = pickled(partitioning_factory, pickle_module)
assert isinstance(partitioning_factory, ds.PartitioningFactory)
options.partitioning_factory = partitioning_factory
@@ -1678,15 +1678,16 @@ def test_partitioning_factory(mockfs, pickled):
@pytest.mark.parquet
@pytest.mark.parametrize('infer_dictionary', [False, True])
@pytest.mark.parametrize(
- "pickled", [lambda x: x, lambda x: pickle.loads(pickle.dumps(x))])
-def test_partitioning_factory_dictionary(mockfs, infer_dictionary, pickled):
+ "pickled", [lambda x, m: x, lambda x, m: m.loads(m.dumps(x))])
+def test_partitioning_factory_dictionary(mockfs, infer_dictionary, pickled,
+ pickle_module):
paths_or_selector = fs.FileSelector('subdir', recursive=True)
format = ds.ParquetFileFormat()
options = ds.FileSystemFactoryOptions('subdir')
partitioning_factory = ds.DirectoryPartitioning.discover(
['group', 'key'], infer_dictionary=infer_dictionary)
- options.partitioning_factory = pickled(partitioning_factory)
+ options.partitioning_factory = pickled(partitioning_factory, pickle_module)
factory = ds.FileSystemDatasetFactory(
mockfs, paths_or_selector, format, options)
@@ -1711,8 +1712,8 @@ def test_partitioning_factory_dictionary(mockfs,
infer_dictionary, pickled):
@pytest.mark.parametrize(
- "pickled", [lambda x: x, lambda x: pickle.loads(pickle.dumps(x))])
-def test_partitioning_factory_segment_encoding(pickled):
+ "pickled", [lambda x, m: x, lambda x, m: m.loads(m.dumps(x))])
+def test_partitioning_factory_segment_encoding(pickled, pickle_module):
mockfs = fs._MockFileSystem()
format = ds.IpcFileFormat()
schema = pa.schema([("i64", pa.int64())])
@@ -1737,7 +1738,7 @@ def test_partitioning_factory_segment_encoding(pickled):
options = ds.FileSystemFactoryOptions("directory")
partitioning_factory = ds.DirectoryPartitioning.discover(
schema=partition_schema)
- options.partitioning_factory = pickled(partitioning_factory)
+ options.partitioning_factory = pickled(partitioning_factory, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
inferred_schema = factory.inspect()
assert inferred_schema == full_schema
@@ -1748,7 +1749,7 @@ def test_partitioning_factory_segment_encoding(pickled):
partitioning_factory = ds.DirectoryPartitioning.discover(
["date", "string"], segment_encoding="none")
- options.partitioning_factory = pickled(partitioning_factory)
+ options.partitioning_factory = pickled(partitioning_factory, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
fragments = list(factory.finish().get_fragments())
assert fragments[0].partition_expression.equals(
@@ -1757,7 +1758,7 @@ def test_partitioning_factory_segment_encoding(pickled):
partitioning = ds.DirectoryPartitioning(
string_partition_schema, segment_encoding="none")
- options.partitioning = pickled(partitioning)
+ options.partitioning = pickled(partitioning, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
fragments = list(factory.finish().get_fragments())
assert fragments[0].partition_expression.equals(
@@ -1766,7 +1767,7 @@ def test_partitioning_factory_segment_encoding(pickled):
partitioning_factory = ds.DirectoryPartitioning.discover(
schema=partition_schema, segment_encoding="none")
- options.partitioning_factory = pickled(partitioning_factory)
+ options.partitioning_factory = pickled(partitioning_factory, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
with pytest.raises(pa.ArrowInvalid,
match="Could not cast segments for partition field"):
@@ -1777,7 +1778,7 @@ def test_partitioning_factory_segment_encoding(pickled):
options = ds.FileSystemFactoryOptions("hive")
partitioning_factory = ds.HivePartitioning.discover(
schema=partition_schema)
- options.partitioning_factory = pickled(partitioning_factory)
+ options.partitioning_factory = pickled(partitioning_factory, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
inferred_schema = factory.inspect()
assert inferred_schema == full_schema
@@ -1788,7 +1789,7 @@ def test_partitioning_factory_segment_encoding(pickled):
partitioning_factory = ds.HivePartitioning.discover(
segment_encoding="none")
- options.partitioning_factory = pickled(partitioning_factory)
+ options.partitioning_factory = pickled(partitioning_factory, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
fragments = list(factory.finish().get_fragments())
assert fragments[0].partition_expression.equals(
@@ -1805,7 +1806,7 @@ def test_partitioning_factory_segment_encoding(pickled):
partitioning_factory = ds.HivePartitioning.discover(
schema=partition_schema, segment_encoding="none")
- options.partitioning_factory = pickled(partitioning_factory)
+ options.partitioning_factory = pickled(partitioning_factory, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
with pytest.raises(pa.ArrowInvalid,
match="Could not cast segments for partition field"):
@@ -1813,8 +1814,8 @@ def test_partitioning_factory_segment_encoding(pickled):
@pytest.mark.parametrize(
- "pickled", [lambda x: x, lambda x: pickle.loads(pickle.dumps(x))])
-def test_partitioning_factory_hive_segment_encoding_key_encoded(pickled):
+ "pickled", [lambda x, m: x, lambda x, m: m.loads(m.dumps(x))])
+def test_partitioning_factory_hive_segment_encoding_key_encoded(pickled,
+                                                                pickle_module):
mockfs = fs._MockFileSystem()
format = ds.IpcFileFormat()
schema = pa.schema([("i64", pa.int64())])
@@ -1845,7 +1846,7 @@ def
test_partitioning_factory_hive_segment_encoding_key_encoded(pickled):
options = ds.FileSystemFactoryOptions("hive")
partitioning_factory = ds.HivePartitioning.discover(
schema=partition_schema)
- options.partitioning_factory = pickled(partitioning_factory)
+ options.partitioning_factory = pickled(partitioning_factory, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
inferred_schema = factory.inspect()
assert inferred_schema == full_schema
@@ -1856,7 +1857,7 @@ def
test_partitioning_factory_hive_segment_encoding_key_encoded(pickled):
partitioning_factory = ds.HivePartitioning.discover(
segment_encoding="uri")
- options.partitioning_factory = pickled(partitioning_factory)
+ options.partitioning_factory = pickled(partitioning_factory, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
fragments = list(factory.finish().get_fragments())
assert fragments[0].partition_expression.equals(
@@ -1865,7 +1866,7 @@ def
test_partitioning_factory_hive_segment_encoding_key_encoded(pickled):
partitioning = ds.HivePartitioning(
string_partition_schema, segment_encoding="uri")
- options.partitioning = pickled(partitioning)
+ options.partitioning = pickled(partitioning, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
fragments = list(factory.finish().get_fragments())
assert fragments[0].partition_expression.equals(
@@ -1874,7 +1875,7 @@ def
test_partitioning_factory_hive_segment_encoding_key_encoded(pickled):
partitioning_factory = ds.HivePartitioning.discover(
segment_encoding="none")
- options.partitioning_factory = pickled(partitioning_factory)
+ options.partitioning_factory = pickled(partitioning_factory, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
fragments = list(factory.finish().get_fragments())
assert fragments[0].partition_expression.equals(
@@ -1883,7 +1884,7 @@ def
test_partitioning_factory_hive_segment_encoding_key_encoded(pickled):
partitioning = ds.HivePartitioning(
string_partition_schema_en, segment_encoding="none")
- options.partitioning = pickled(partitioning)
+ options.partitioning = pickled(partitioning, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
fragments = list(factory.finish().get_fragments())
assert fragments[0].partition_expression.equals(
@@ -1892,7 +1893,7 @@ def
test_partitioning_factory_hive_segment_encoding_key_encoded(pickled):
partitioning_factory = ds.HivePartitioning.discover(
schema=partition_schema_en, segment_encoding="none")
- options.partitioning_factory = pickled(partitioning_factory)
+ options.partitioning_factory = pickled(partitioning_factory, pickle_module)
factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
with pytest.raises(pa.ArrowInvalid,
match="Could not cast segments for partition field"):
@@ -2087,14 +2088,14 @@ def _create_directory_of_files(base_dir):
return (table1, table2), (path1, path2)
-def _check_dataset(dataset, table, dataset_reader):
+def _check_dataset(dataset, table, dataset_reader, pickler):
# also test that pickle roundtrip keeps the functionality
- for d in [dataset, pickle.loads(pickle.dumps(dataset))]:
+ for d in [dataset, pickler.loads(pickler.dumps(dataset))]:
assert dataset.schema.equals(table.schema)
assert dataset_reader.to_table(dataset).equals(table)
-def _check_dataset_from_path(path, table, dataset_reader, **kwargs):
+def _check_dataset_from_path(path, table, dataset_reader, pickler, **kwargs):
# pathlib object
assert isinstance(path, pathlib.Path)
@@ -2102,39 +2103,39 @@ def _check_dataset_from_path(path, table,
dataset_reader, **kwargs):
for p in [path, str(path), [path], [str(path)]]:
dataset = ds.dataset(path, **kwargs)
assert isinstance(dataset, ds.FileSystemDataset)
- _check_dataset(dataset, table, dataset_reader)
+ _check_dataset(dataset, table, dataset_reader, pickler)
# relative string path
with change_cwd(path.parent):
dataset = ds.dataset(path.name, **kwargs)
assert isinstance(dataset, ds.FileSystemDataset)
- _check_dataset(dataset, table, dataset_reader)
+ _check_dataset(dataset, table, dataset_reader, pickler)
@pytest.mark.parquet
-def test_open_dataset_single_file(tempdir, dataset_reader):
+def test_open_dataset_single_file(tempdir, dataset_reader, pickle_module):
table, path = _create_single_file(tempdir)
- _check_dataset_from_path(path, table, dataset_reader)
+ _check_dataset_from_path(path, table, dataset_reader, pickle_module)
@pytest.mark.parquet
-def test_deterministic_row_order(tempdir, dataset_reader):
+def test_deterministic_row_order(tempdir, dataset_reader, pickle_module):
# ARROW-8447 Ensure that dataset.to_table (and Scanner::ToTable) returns a
# deterministic row ordering. This is achieved by constructing a single
# parquet file with one row per RowGroup.
table, path = _create_single_file(tempdir, row_group_size=1)
- _check_dataset_from_path(path, table, dataset_reader)
+ _check_dataset_from_path(path, table, dataset_reader, pickle_module)
@pytest.mark.parquet
-def test_open_dataset_directory(tempdir, dataset_reader):
+def test_open_dataset_directory(tempdir, dataset_reader, pickle_module):
tables, _ = _create_directory_of_files(tempdir)
table = pa.concat_tables(tables)
- _check_dataset_from_path(tempdir, table, dataset_reader)
+ _check_dataset_from_path(tempdir, table, dataset_reader, pickle_module)
@pytest.mark.parquet
-def test_open_dataset_list_of_files(tempdir, dataset_reader):
+def test_open_dataset_list_of_files(tempdir, dataset_reader, pickle_module):
tables, (path1, path2) = _create_directory_of_files(tempdir)
table = pa.concat_tables(tables)
@@ -2143,7 +2144,7 @@ def test_open_dataset_list_of_files(tempdir,
dataset_reader):
ds.dataset([str(path1), str(path2)])
]
datasets += [
- pickle.loads(pickle.dumps(d)) for d in datasets
+ pickle_module.loads(pickle_module.dumps(d)) for d in datasets
]
for dataset in datasets:
@@ -2173,7 +2174,7 @@ def test_open_dataset_filesystem_fspath(tempdir):
@pytest.mark.parquet
-def test_construct_from_single_file(tempdir, dataset_reader):
+def test_construct_from_single_file(tempdir, dataset_reader, pickle_module):
directory = tempdir / 'single-file'
directory.mkdir()
table, path = _create_single_file(directory)
@@ -2186,14 +2187,14 @@ def test_construct_from_single_file(tempdir,
dataset_reader):
# instantiate from a single file with prefixed filesystem URI
d3 = ds.dataset(str(relative_path), filesystem=_filesystem_uri(directory))
# pickle roundtrip
- d4 = pickle.loads(pickle.dumps(d1))
+ d4 = pickle_module.loads(pickle_module.dumps(d1))
assert dataset_reader.to_table(d1) == dataset_reader.to_table(
d2) == dataset_reader.to_table(d3) == dataset_reader.to_table(d4)
@pytest.mark.parquet
-def test_construct_from_single_directory(tempdir, dataset_reader):
+def test_construct_from_single_directory(tempdir, dataset_reader,
+                                         pickle_module):
directory = tempdir / 'single-directory'
directory.mkdir()
tables, paths = _create_directory_of_files(directory)
@@ -2208,7 +2209,7 @@ def test_construct_from_single_directory(tempdir,
dataset_reader):
# test pickle roundtrip
for d in [d1, d2, d3]:
- restored = pickle.loads(pickle.dumps(d))
+ restored = pickle_module.loads(pickle_module.dumps(d))
assert dataset_reader.to_table(restored) == t1
@@ -2403,12 +2404,12 @@ def _create_partitioned_dataset(basedir):
@pytest.mark.parquet
-def test_open_dataset_partitioned_directory(tempdir, dataset_reader):
+def test_open_dataset_partitioned_directory(tempdir, dataset_reader,
+                                            pickle_module):
full_table, path = _create_partitioned_dataset(tempdir)
# no partitioning specified, just read all individual files
table = full_table.select(['a', 'b'])
- _check_dataset_from_path(path, table, dataset_reader)
+ _check_dataset_from_path(path, table, dataset_reader, pickle_module)
# specify partition scheme with discovery
dataset = ds.dataset(
@@ -2470,14 +2471,14 @@ def test_open_dataset_unsupported_format(tempdir):
@pytest.mark.parquet
-def test_open_union_dataset(tempdir, dataset_reader):
+def test_open_union_dataset(tempdir, dataset_reader, pickle_module):
_, path = _create_single_file(tempdir)
dataset = ds.dataset(path)
union = ds.dataset([dataset, dataset])
assert isinstance(union, ds.UnionDataset)
- pickled = pickle.loads(pickle.dumps(union))
+ pickled = pickle_module.loads(pickle_module.dumps(union))
assert dataset_reader.to_table(pickled) == dataset_reader.to_table(union)
@@ -2568,7 +2569,7 @@ def test_partition_discovery(
@pytest.mark.pandas
-def test_dataset_partitioned_dictionary_type_reconstruct(tempdir):
+def test_dataset_partitioned_dictionary_type_reconstruct(tempdir,
+                                                         pickle_module):
# https://issues.apache.org/jira/browse/ARROW-11400
table = pa.table({'part': np.repeat(['A', 'B'], 5), 'col': range(10)})
part = ds.partitioning(table.select(['part']).schema, flavor="hive")
@@ -2586,10 +2587,10 @@ def
test_dataset_partitioned_dictionary_type_reconstruct(tempdir):
assert fragment.to_table(schema=dataset.schema).equals(expected[:5])
part_expr = fragment.partition_expression
- restored = pickle.loads(pickle.dumps(dataset))
+ restored = pickle_module.loads(pickle_module.dumps(dataset))
assert restored.to_table().equals(expected)
- restored = pickle.loads(pickle.dumps(fragment))
+ restored = pickle_module.loads(pickle_module.dumps(fragment))
assert restored.to_table(schema=dataset.schema).equals(expected[:5])
# to_pandas call triggers computation of the actual dictionary values
assert restored.to_table(schema=dataset.schema).to_pandas().equals(
@@ -3789,7 +3790,7 @@ def test_dataset_preserved_partitioning(tempdir):
assert isinstance(dataset.partitioning, ds.DirectoryPartitioning)
# TODO(GH-34884) partitioning attribute not preserved in pickling
# dataset_ = ds.dataset(path)
- # for dataset in [dataset_, pickle.loads(pickle.dumps(dataset_))]:
+    # for dataset in [dataset_,
+    #                 pickle_module.loads(pickle_module.dumps(dataset_))]:
# assert isinstance(dataset.partitioning, ds.DirectoryPartitioning)
# through discovery, with hive partitioning but not specified
diff --git a/python/pyarrow/tests/test_extension_type.py
b/python/pyarrow/tests/test_extension_type.py
index 973aa29c75..1eb7d5fa76 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -16,7 +16,6 @@
# under the License.
import os
-import pickle
import shutil
import subprocess
import weakref
@@ -217,12 +216,12 @@ def test_ext_type_as_py():
assert result.as_py() == expected
-def test_uuid_type_pickle():
- for proto in range(0, pickle.HIGHEST_PROTOCOL + 1):
+def test_uuid_type_pickle(pickle_module):
+ for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1):
ty = UuidType()
- ser = pickle.dumps(ty, protocol=proto)
+ ser = pickle_module.dumps(ty, protocol=proto)
del ty
- ty = pickle.loads(ser)
+ ty = pickle_module.loads(ser)
wr = weakref.ref(ty)
assert ty.extension_name == "arrow.py_extension_type"
del ty
@@ -410,14 +409,14 @@ def test_ext_scalar_from_storage():
assert s.value == pa.scalar(b"0123456789abcdef", ty.storage_type)
-def test_ext_array_pickling():
- for proto in range(0, pickle.HIGHEST_PROTOCOL + 1):
+def test_ext_array_pickling(pickle_module):
+ for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1):
ty = ParamExtType(3)
storage = pa.array([b"foo", b"bar"], type=pa.binary(3))
arr = pa.ExtensionArray.from_storage(ty, storage)
- ser = pickle.dumps(arr, protocol=proto)
+ ser = pickle_module.dumps(arr, protocol=proto)
del ty, storage, arr
- arr = pickle.loads(ser)
+ arr = pickle_module.loads(ser)
arr.validate()
assert isinstance(arr, pa.ExtensionArray)
assert arr.type == ParamExtType(3)
@@ -867,23 +866,23 @@ def test_generic_ext_type_equality():
assert not period_type == period_type3
-def test_generic_ext_type_pickling(registered_period_type):
+def test_generic_ext_type_pickling(registered_period_type, pickle_module):
# GH-36038
- for proto in range(0, pickle.HIGHEST_PROTOCOL + 1):
+ for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1):
period_type, _ = registered_period_type
- ser = pickle.dumps(period_type, protocol=proto)
- period_type_pickled = pickle.loads(ser)
+ ser = pickle_module.dumps(period_type, protocol=proto)
+ period_type_pickled = pickle_module.loads(ser)
assert period_type == period_type_pickled
-def test_generic_ext_array_pickling(registered_period_type):
- for proto in range(0, pickle.HIGHEST_PROTOCOL + 1):
+def test_generic_ext_array_pickling(registered_period_type, pickle_module):
+ for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1):
period_type, _ = registered_period_type
storage = pa.array([1, 2, 3, 4], pa.int64())
arr = pa.ExtensionArray.from_storage(period_type, storage)
- ser = pickle.dumps(arr, protocol=proto)
+ ser = pickle_module.dumps(arr, protocol=proto)
del storage, arr
- arr = pickle.loads(ser)
+ arr = pickle_module.loads(ser)
arr.validate()
assert isinstance(arr, pa.ExtensionArray)
assert arr.type == period_type
@@ -1338,17 +1337,17 @@ def
test_extension_to_pandas_storage_type(registered_period_type):
assert isinstance(result["ext"].dtype, pd.ArrowDtype)
-def test_tensor_type_is_picklable():
+def test_tensor_type_is_picklable(pickle_module):
# GH-35599
expected_type = pa.fixed_shape_tensor(pa.int32(), (2, 2))
- result = pickle.loads(pickle.dumps(expected_type))
+ result = pickle_module.loads(pickle_module.dumps(expected_type))
assert result == expected_type
arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]]
storage = pa.array(arr, pa.list_(pa.int32(), 4))
expected_arr = pa.ExtensionArray.from_storage(expected_type, storage)
- result = pickle.loads(pickle.dumps(expected_arr))
+ result = pickle_module.loads(pickle_module.dumps(expected_arr))
assert result == expected_arr
diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py
index b64680c87c..8135f70f69 100644
--- a/python/pyarrow/tests/test_fs.py
+++ b/python/pyarrow/tests/test_fs.py
@@ -19,7 +19,6 @@ from datetime import datetime, timezone, timedelta
import gzip
import os
import pathlib
-import pickle
import subprocess
import sys
@@ -559,17 +558,17 @@ def test_subtree_filesystem():
' base_fs=<pyarrow._fs.LocalFileSystem')
-def test_filesystem_pickling(fs):
+def test_filesystem_pickling(fs, pickle_module):
if fs.type_name.split('::')[-1] == 'mock':
pytest.xfail(reason='MockFileSystem is not serializable')
- serialized = pickle.dumps(fs)
- restored = pickle.loads(serialized)
+ serialized = pickle_module.dumps(fs)
+ restored = pickle_module.loads(serialized)
assert isinstance(restored, FileSystem)
assert restored.equals(fs)
-def test_filesystem_is_functional_after_pickling(fs, pathfn):
+def test_filesystem_is_functional_after_pickling(fs, pathfn, pickle_module):
if fs.type_name.split('::')[-1] == 'mock':
pytest.xfail(reason='MockFileSystem is not serializable')
skip_fsspec_s3fs(fs)
@@ -584,7 +583,7 @@ def test_filesystem_is_functional_after_pickling(fs,
pathfn):
with fs.open_output_stream(c) as fp:
fp.write(b'test')
- restored = pickle.loads(pickle.dumps(fs))
+ restored = pickle_module.loads(pickle_module.dumps(fs))
aaa_info, bb_info, c_info = restored.get_file_info([aaa, bb, c])
assert aaa_info.type == FileType.Directory
assert bb_info.type == FileType.File
@@ -1060,7 +1059,7 @@ def test_mockfs_mtime_roundtrip(mockfs):
@pytest.mark.gcs
-def test_gcs_options():
+def test_gcs_options(pickle_module):
from pyarrow.fs import GcsFileSystem
dt = datetime.now()
fs = GcsFileSystem(access_token='abc',
@@ -1072,20 +1071,20 @@ def test_gcs_options():
assert isinstance(fs, GcsFileSystem)
assert fs.default_bucket_location == 'us-west2'
assert fs.project_id == 'test-project-id'
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
fs = GcsFileSystem()
assert isinstance(fs, GcsFileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
fs = GcsFileSystem(anonymous=True)
assert isinstance(fs, GcsFileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
fs = GcsFileSystem(default_metadata={"ACL": "authenticated-read",
"Content-Type": "text/plain"})
assert isinstance(fs, GcsFileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
with pytest.raises(ValueError):
GcsFileSystem(access_token='access')
@@ -1098,7 +1097,7 @@ def test_gcs_options():
@pytest.mark.s3
-def test_s3_options():
+def test_s3_options(pickle_module):
from pyarrow.fs import (AwsDefaultS3RetryStrategy,
AwsStandardS3RetryStrategy, S3FileSystem,
S3RetryStrategy)
@@ -1108,12 +1107,12 @@ def test_s3_options():
scheme='https', endpoint_override='localhost:8999')
assert isinstance(fs, S3FileSystem)
assert fs.region == 'us-east-2'
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
fs = S3FileSystem(role_arn='role', session_name='session',
external_id='id', load_frequency=100)
assert isinstance(fs, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
# Note that the retry strategy won't survive pickling for now
fs = S3FileSystem(
@@ -1126,35 +1125,35 @@ def test_s3_options():
fs2 = S3FileSystem(role_arn='role')
assert isinstance(fs2, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs2)) == fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) == fs2
assert fs2 != fs
fs = S3FileSystem(anonymous=True)
assert isinstance(fs, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
fs = S3FileSystem(background_writes=True)
assert isinstance(fs, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
fs2 = S3FileSystem(background_writes=True,
default_metadata={"ACL": "authenticated-read",
"Content-Type": "text/plain"})
assert isinstance(fs2, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs2)) == fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) == fs2
assert fs2 != fs
fs = S3FileSystem(allow_bucket_creation=True, allow_bucket_deletion=True)
assert isinstance(fs, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
fs = S3FileSystem(request_timeout=0.5, connect_timeout=0.25)
assert isinstance(fs, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
fs2 = S3FileSystem(request_timeout=0.25, connect_timeout=0.5)
assert isinstance(fs2, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs2)) == fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) == fs2
assert fs2 != fs
with pytest.raises(ValueError):
@@ -1182,7 +1181,7 @@ def test_s3_options():
@pytest.mark.s3
-def test_s3_proxy_options(monkeypatch):
+def test_s3_proxy_options(monkeypatch, pickle_module):
from pyarrow.fs import S3FileSystem
# The following two are equivalent:
@@ -1195,111 +1194,111 @@ def test_s3_proxy_options(monkeypatch):
# Check dict case for 'proxy_options'
fs = S3FileSystem(proxy_options=proxy_opts_1_dict)
assert isinstance(fs, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
fs = S3FileSystem(proxy_options=proxy_opts_2_dict)
assert isinstance(fs, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
# Check str case for 'proxy_options'
fs = S3FileSystem(proxy_options=proxy_opts_1_str)
assert isinstance(fs, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
fs = S3FileSystem(proxy_options=proxy_opts_2_str)
assert isinstance(fs, S3FileSystem)
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
# Check that two FSs using the same proxy_options dict are equal
fs1 = S3FileSystem(proxy_options=proxy_opts_1_dict)
fs2 = S3FileSystem(proxy_options=proxy_opts_1_dict)
assert fs1 == fs2
- assert pickle.loads(pickle.dumps(fs1)) == fs2
- assert pickle.loads(pickle.dumps(fs2)) == fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) == fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) == fs1
fs1 = S3FileSystem(proxy_options=proxy_opts_2_dict)
fs2 = S3FileSystem(proxy_options=proxy_opts_2_dict)
assert fs1 == fs2
- assert pickle.loads(pickle.dumps(fs1)) == fs2
- assert pickle.loads(pickle.dumps(fs2)) == fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) == fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) == fs1
# Check that two FSs using the same proxy_options str are equal
fs1 = S3FileSystem(proxy_options=proxy_opts_1_str)
fs2 = S3FileSystem(proxy_options=proxy_opts_1_str)
assert fs1 == fs2
- assert pickle.loads(pickle.dumps(fs1)) == fs2
- assert pickle.loads(pickle.dumps(fs2)) == fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) == fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) == fs1
fs1 = S3FileSystem(proxy_options=proxy_opts_2_str)
fs2 = S3FileSystem(proxy_options=proxy_opts_2_str)
assert fs1 == fs2
- assert pickle.loads(pickle.dumps(fs1)) == fs2
- assert pickle.loads(pickle.dumps(fs2)) == fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) == fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) == fs1
# Check that two FSs using equivalent proxy_options
# (one dict, one str) are equal
fs1 = S3FileSystem(proxy_options=proxy_opts_1_dict)
fs2 = S3FileSystem(proxy_options=proxy_opts_1_str)
assert fs1 == fs2
- assert pickle.loads(pickle.dumps(fs1)) == fs2
- assert pickle.loads(pickle.dumps(fs2)) == fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) == fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) == fs1
fs1 = S3FileSystem(proxy_options=proxy_opts_2_dict)
fs2 = S3FileSystem(proxy_options=proxy_opts_2_str)
assert fs1 == fs2
- assert pickle.loads(pickle.dumps(fs1)) == fs2
- assert pickle.loads(pickle.dumps(fs2)) == fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) == fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) == fs1
# Check that two FSs using nonequivalent proxy_options are not equal
fs1 = S3FileSystem(proxy_options=proxy_opts_1_dict)
fs2 = S3FileSystem(proxy_options=proxy_opts_2_dict)
assert fs1 != fs2
- assert pickle.loads(pickle.dumps(fs1)) != fs2
- assert pickle.loads(pickle.dumps(fs2)) != fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) != fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) != fs1
fs1 = S3FileSystem(proxy_options=proxy_opts_1_dict)
fs2 = S3FileSystem(proxy_options=proxy_opts_2_str)
assert fs1 != fs2
- assert pickle.loads(pickle.dumps(fs1)) != fs2
- assert pickle.loads(pickle.dumps(fs2)) != fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) != fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) != fs1
fs1 = S3FileSystem(proxy_options=proxy_opts_1_str)
fs2 = S3FileSystem(proxy_options=proxy_opts_2_dict)
assert fs1 != fs2
- assert pickle.loads(pickle.dumps(fs1)) != fs2
- assert pickle.loads(pickle.dumps(fs2)) != fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) != fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) != fs1
fs1 = S3FileSystem(proxy_options=proxy_opts_1_str)
fs2 = S3FileSystem(proxy_options=proxy_opts_2_str)
assert fs1 != fs2
- assert pickle.loads(pickle.dumps(fs1)) != fs2
- assert pickle.loads(pickle.dumps(fs2)) != fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) != fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) != fs1
# Check that two FSs (one using proxy_options and the other not)
# are not equal
fs1 = S3FileSystem(proxy_options=proxy_opts_1_dict)
fs2 = S3FileSystem()
assert fs1 != fs2
- assert pickle.loads(pickle.dumps(fs1)) != fs2
- assert pickle.loads(pickle.dumps(fs2)) != fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) != fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) != fs1
fs1 = S3FileSystem(proxy_options=proxy_opts_1_str)
fs2 = S3FileSystem()
assert fs1 != fs2
- assert pickle.loads(pickle.dumps(fs1)) != fs2
- assert pickle.loads(pickle.dumps(fs2)) != fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) != fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) != fs1
fs1 = S3FileSystem(proxy_options=proxy_opts_2_dict)
fs2 = S3FileSystem()
assert fs1 != fs2
- assert pickle.loads(pickle.dumps(fs1)) != fs2
- assert pickle.loads(pickle.dumps(fs2)) != fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) != fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) != fs1
fs1 = S3FileSystem(proxy_options=proxy_opts_2_str)
fs2 = S3FileSystem()
assert fs1 != fs2
- assert pickle.loads(pickle.dumps(fs1)) != fs2
- assert pickle.loads(pickle.dumps(fs2)) != fs1
+ assert pickle_module.loads(pickle_module.dumps(fs1)) != fs2
+ assert pickle_module.loads(pickle_module.dumps(fs2)) != fs1
# Only dict and str are supported
with pytest.raises(TypeError):
@@ -1347,7 +1346,7 @@ def test_s3fs_wrong_region():
@pytest.mark.hdfs
-def test_hdfs_options(hdfs_connection):
+def test_hdfs_options(hdfs_connection, pickle_module):
from pyarrow.fs import HadoopFileSystem
if not pa.have_libhdfs():
pytest.skip('Cannot locate libhdfs')
@@ -1405,7 +1404,7 @@ def test_hdfs_options(hdfs_connection):
for fs in [hdfs1, hdfs2, hdfs3, hdfs4, hdfs5, hdfs6, hdfs7, hdfs8,
hdfs9, hdfs10, hdfs11]:
- assert pickle.loads(pickle.dumps(fs)) == fs
+ assert pickle_module.loads(pickle_module.dumps(fs)) == fs
host, port, user = hdfs_connection
@@ -1524,12 +1523,12 @@ def test_py_filesystem_equality():
assert fs1 != object()
-def test_py_filesystem_pickling():
+def test_py_filesystem_pickling(pickle_module):
handler = DummyHandler()
fs = PyFileSystem(handler)
- serialized = pickle.dumps(fs)
- restored = pickle.loads(serialized)
+ serialized = pickle_module.dumps(fs)
+ restored = pickle_module.loads(serialized)
assert isinstance(restored, FileSystem)
assert restored == fs
assert restored.handler == handler
diff --git a/python/pyarrow/tests/test_hdfs.py
b/python/pyarrow/tests/test_hdfs.py
index 1f5f10f7d9..511dbf9a1c 100644
--- a/python/pyarrow/tests/test_hdfs.py
+++ b/python/pyarrow/tests/test_hdfs.py
@@ -16,9 +16,7 @@
# under the License.
import os
-import pickle
import random
-import unittest
from io import BytesIO
from os.path import join as pjoin
@@ -81,20 +79,20 @@ class HdfsTestCases:
return full_path
@classmethod
- def setUpClass(cls):
+ def setup_class(cls):
cls.check_driver()
cls.hdfs = hdfs_test_client()
cls.tmp_path = '/tmp/pyarrow-test-{}'.format(random.randint(0, 1000))
cls.hdfs.mkdir(cls.tmp_path)
@classmethod
- def tearDownClass(cls):
+ def teardown_class(cls):
cls.hdfs.delete(cls.tmp_path, recursive=True)
cls.hdfs.close()
- def test_pickle(self):
- s = pickle.dumps(self.hdfs)
- h2 = pickle.loads(s)
+ def test_pickle(self, pickle_module):
+ s = pickle_module.dumps(self.hdfs)
+ h2 = pickle_module.loads(s)
assert h2.is_open
assert h2.host == self.hdfs.host
assert h2.port == self.hdfs.port
@@ -392,7 +390,7 @@ class HdfsTestCases:
tmpdir, filesystem=self.hdfs)
-class TestLibHdfs(HdfsTestCases, unittest.TestCase):
+class TestLibHdfs(HdfsTestCases):
@classmethod
def check_driver(cls):
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index a6488d70df..0c9e591ccd 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -24,7 +24,6 @@ import gzip
import math
import os
import pathlib
-import pickle
import pytest
import sys
import tempfile
@@ -372,17 +371,17 @@ def test_python_file_closing():
# Buffers
-def check_buffer_pickling(buf):
+def check_buffer_pickling(buf, pickler):
# Check that buffer survives a pickle roundtrip
- for protocol in range(0, pickle.HIGHEST_PROTOCOL + 1):
- result = pickle.loads(pickle.dumps(buf, protocol=protocol))
+ for protocol in range(0, pickler.HIGHEST_PROTOCOL + 1):
+ result = pickler.loads(pickler.dumps(buf, protocol=protocol))
assert len(result) == len(buf)
assert memoryview(result) == memoryview(buf)
assert result.to_pybytes() == buf.to_pybytes()
assert result.is_mutable == buf.is_mutable
-def test_buffer_bytes():
+def test_buffer_bytes(pickle_module):
val = b'some data'
buf = pa.py_buffer(val)
@@ -393,10 +392,10 @@ def test_buffer_bytes():
result = buf.to_pybytes()
assert result == val
- check_buffer_pickling(buf)
+ check_buffer_pickling(buf, pickle_module)
-def test_buffer_null_data():
+def test_buffer_null_data(pickle_module):
null_buff = pa.foreign_buffer(address=0, size=0)
assert null_buff.to_pybytes() == b""
assert null_buff.address == 0
@@ -406,10 +405,10 @@ def test_buffer_null_data():
assert m.tobytes() == b""
assert pa.py_buffer(m).address != 0
- check_buffer_pickling(null_buff)
+ check_buffer_pickling(null_buff, pickle_module)
-def test_buffer_memoryview():
+def test_buffer_memoryview(pickle_module):
val = b'some data'
buf = pa.py_buffer(val)
@@ -420,10 +419,10 @@ def test_buffer_memoryview():
result = memoryview(buf)
assert result == val
- check_buffer_pickling(buf)
+ check_buffer_pickling(buf, pickle_module)
-def test_buffer_bytearray():
+def test_buffer_bytearray(pickle_module):
val = bytearray(b'some data')
buf = pa.py_buffer(val)
@@ -434,7 +433,7 @@ def test_buffer_bytearray():
result = bytearray(buf)
assert result == val
- check_buffer_pickling(buf)
+ check_buffer_pickling(buf, pickle_module)
def test_buffer_invalid():
diff --git a/python/pyarrow/tests/test_json.py
b/python/pyarrow/tests/test_json.py
index f37019ade5..be83f891a2 100644
--- a/python/pyarrow/tests/test_json.py
+++ b/python/pyarrow/tests/test_json.py
@@ -20,7 +20,6 @@ from decimal import Decimal
import io
import itertools
import json
-import pickle
import string
import unittest
@@ -53,15 +52,15 @@ def make_random_json(num_cols=2, num_rows=10, linesep='\r\n'):
return data, expected
-def check_options_class_pickling(cls, **attr_values):
+def check_options_class_pickling(cls, pickler, **attr_values):
opts = cls(**attr_values)
- new_opts = pickle.loads(pickle.dumps(opts,
- protocol=pickle.HIGHEST_PROTOCOL))
+ new_opts = pickler.loads(pickler.dumps(opts,
+ protocol=pickler.HIGHEST_PROTOCOL))
for name, value in attr_values.items():
assert getattr(new_opts, name) == value
-def test_read_options():
+def test_read_options(pickle_module):
cls = ReadOptions
opts = cls()
@@ -77,11 +76,12 @@ def test_read_options():
assert opts.block_size == 1234
assert opts.use_threads is False
- check_options_class_pickling(cls, block_size=1234,
+ check_options_class_pickling(cls, pickler=pickle_module,
+ block_size=1234,
use_threads=False)
-def test_parse_options():
+def test_parse_options(pickle_module):
cls = ParseOptions
opts = cls()
assert opts.newlines_in_values is False
@@ -102,7 +102,8 @@ def test_parse_options():
with pytest.raises(ValueError):
opts.unexpected_field_behavior = "invalid-value"
- check_options_class_pickling(cls, explicit_schema=schema,
+ check_options_class_pickling(cls, pickler=pickle_module,
+ explicit_schema=schema,
newlines_in_values=False,
unexpected_field_behavior="ignore")
diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py
index d52edce6d9..5f6c8c813f 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -17,7 +17,6 @@
import datetime
import decimal
-import pickle
import pytest
import sys
import weakref
@@ -68,7 +67,7 @@ from pyarrow.tests import util
({'a': 1, 'b': [1, 2]}, None, pa.StructScalar),
([('a', 1), ('b', 2)], pa.map_(pa.string(), pa.int8()), pa.MapScalar),
])
-def test_basics(value, ty, klass):
+def test_basics(value, ty, klass, pickle_module):
s = pa.scalar(value, type=ty)
s.validate()
s.validate(full=True)
@@ -87,7 +86,7 @@ def test_basics(value, ty, klass):
assert s != pa.scalar(value, type=ty)
# test pickle roundtrip
- restored = pickle.loads(pickle.dumps(s))
+ restored = pickle_module.loads(pickle_module.dumps(s))
assert s.equals(restored)
# test that scalars are weak-referenceable
@@ -110,7 +109,7 @@ def test_null_singleton():
pa.NullScalar()
-def test_nulls():
+def test_nulls(pickle_module):
null = pa.scalar(None)
assert null is pa.NA
assert null.as_py() is None
@@ -126,7 +125,7 @@ def test_nulls():
assert v.as_py() is None
# test pickle roundtrip
- restored = pickle.loads(pickle.dumps(null))
+ restored = pickle_module.loads(pickle_module.dumps(null))
assert restored.equals(null)
# test that scalars are weak-referenceable
@@ -683,7 +682,7 @@ def test_struct_duplicate_fields():
s.as_py()
-def test_map():
+def test_map(pickle_module):
ty = pa.map_(pa.string(), pa.int8())
v = [('a', 1), ('b', 2)]
s = pa.scalar(v, type=ty)
@@ -713,11 +712,11 @@ def test_map():
with pytest.raises(IndexError):
s[2]
- restored = pickle.loads(pickle.dumps(s))
+ restored = pickle_module.loads(pickle_module.dumps(s))
assert restored.equals(s)
-def test_dictionary():
+def test_dictionary(pickle_module):
indices = pa.array([2, None, 1, 2, 0, None])
dictionary = pa.array(['foo', 'bar', 'baz'])
@@ -733,7 +732,7 @@ def test_dictionary():
assert s.index.equals(i)
assert s.dictionary.equals(dictionary)
- restored = pickle.loads(pickle.dumps(s))
+ restored = pickle_module.loads(pickle_module.dumps(s))
assert restored.equals(s)
@@ -758,7 +757,7 @@ def test_run_end_encoded():
pa.scalar(1, pa.run_end_encoded(pa.int64(), pa.int64()))
-def test_union():
+def test_union(pickle_module):
# sparse
arr = pa.UnionArray.from_sparse(
pa.array([0, 0, 1, 1], type=pa.int8()),
@@ -773,7 +772,7 @@ def test_union():
assert s.type.equals(arr.type)
assert s.is_valid is True
with pytest.raises(pa.ArrowNotImplementedError):
- pickle.loads(pickle.dumps(s))
+ pickle_module.loads(pickle_module.dumps(s))
assert arr[0].type_code == 0
assert arr[0].as_py() == "a"
@@ -799,7 +798,7 @@ def test_union():
assert s.type.equals(arr.type)
assert s.is_valid is True
with pytest.raises(pa.ArrowNotImplementedError):
- pickle.loads(pickle.dumps(s))
+ pickle_module.loads(pickle_module.dumps(s))
assert arr[0].type_code == 0
assert arr[0].as_py() == b'a'
diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py
index 2c2f9547f2..e28e0ac445 100644
--- a/python/pyarrow/tests/test_schema.py
+++ b/python/pyarrow/tests/test_schema.py
@@ -16,7 +16,6 @@
# under the License.
from collections import OrderedDict
-import pickle
import sys
import weakref
@@ -585,7 +584,7 @@ two: int32""")
assert repr(sch) == expected
-def test_type_schema_pickling():
+def test_type_schema_pickling(pickle_module):
cases = [
pa.int8(),
pa.string(),
@@ -621,7 +620,7 @@ def test_type_schema_pickling():
]
for val in cases:
- roundtripped = pickle.loads(pickle.dumps(val))
+ roundtripped = pickle_module.loads(pickle_module.dumps(val))
assert val == roundtripped
fields = []
@@ -632,7 +631,7 @@ def test_type_schema_pickling():
fields.append(pa.field('_f{}'.format(i), f))
schema = pa.schema(fields, metadata={b'foo': b'bar'})
- roundtripped = pickle.loads(pickle.dumps(schema))
+ roundtripped = pickle_module.loads(pickle_module.dumps(schema))
assert schema == roundtripped
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index e28256e91f..f93c6bbc2c 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -17,7 +17,6 @@
from collections import OrderedDict
from collections.abc import Iterable
-import pickle
import sys
import weakref
@@ -301,14 +300,14 @@ def test_chunked_array_equals():
pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))
]
)
-def test_chunked_array_pickle(data, typ):
+def test_chunked_array_pickle(data, typ, pickle_module):
arrays = []
while data:
arrays.append(pa.array(data[:2], type=typ))
data = data[2:]
array = pa.chunked_array(arrays, type=typ)
array.validate()
- result = pickle.loads(pickle.dumps(array))
+ result = pickle_module.loads(pickle_module.dumps(array))
result.validate()
assert result.equals(array)
@@ -663,7 +662,7 @@ def test_recordbatch_empty_metadata():
assert batch.schema.metadata is None
-def test_recordbatch_pickle():
+def test_recordbatch_pickle(pickle_module):
data = [
pa.array(range(5), type='int8'),
pa.array([-10, -5, 0, 5, 10], type='float32')
@@ -675,7 +674,7 @@ def test_recordbatch_pickle():
schema = pa.schema(fields, metadata={b'foo': b'bar'})
batch = pa.record_batch(data, schema=schema)
- result = pickle.loads(pickle.dumps(batch))
+ result = pickle_module.loads(pickle_module.dumps(batch))
assert result.equals(batch)
assert result.schema == schema
@@ -1022,7 +1021,7 @@ def test_table_from_lists():
assert result.equals(expected)
-def test_table_pickle():
+def test_table_pickle(pickle_module):
data = [
pa.chunked_array([[1, 2], [3, 4]], type=pa.uint32()),
pa.chunked_array([["some", "strings", None, ""]], type=pa.string()),
@@ -1032,7 +1031,7 @@ def test_table_pickle():
metadata={b'foo': b'bar'})
table = pa.Table.from_arrays(data, schema=schema)
- result = pickle.loads(pickle.dumps(table))
+ result = pickle_module.loads(pickle_module.dumps(table))
result.validate()
assert result.equals(table)
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index 568702d8e8..f3b6001003 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -21,7 +21,6 @@ from functools import partial
import datetime
import sys
-import pickle
import pytest
import hypothesis as h
import hypothesis.strategies as st
@@ -790,10 +789,10 @@ def test_types_hashable():
assert in_dict[type_] == i
-def test_types_picklable():
+def test_types_picklable(pickle_module):
for ty in get_many_types():
- data = pickle.dumps(ty)
- assert pickle.loads(data) == ty
+ data = pickle_module.dumps(ty)
+ assert pickle_module.loads(data) == ty
def test_types_weakref():
@@ -1193,9 +1192,9 @@ def test_is_boolean_value():
@h.example(
pa.field(name='', type=pa.null(), metadata={'0': '', '': ''})
)
-def test_pickling(field):
- data = pickle.dumps(field)
- assert pickle.loads(data) == field
+def test_pickling(pickle_module, field):
+ data = pickle_module.dumps(field)
+ assert pickle_module.loads(data) == field
@h.given(