This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 9ab9532a20 GH-25118: [Python] Make NumPy an optional runtime
dependency (#41904)
9ab9532a20 is described below
commit 9ab9532a208d5632b0f8b5035a207235b5e6b828
Author: Raúl Cumplido <[email protected]>
AuthorDate: Mon Sep 2 16:35:26 2024 +0200
GH-25118: [Python] Make NumPy an optional runtime dependency (#41904)
### Rationale for this change
Users should be able to import and run pyarrow without requiring NumPy.
### What changes are included in this PR?
If NumPy is not present, pyarrow can still be imported and its NumPy-independent functionality can be used.
A new CI job has been created to run some basic tests without numpy.
### Are these changes tested?
Yes via CI.
### Are there any user-facing changes?
Yes, NumPy can be removed from the user installation and pyarrow
functionality that does not depend on NumPy still works.
* GitHub Issue: #25118
Lead-authored-by: Raúl Cumplido <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
.github/workflows/python.yml | 6 +
docker-compose.yml | 32 +++++
python/CMakeLists.txt | 4 +-
python/pyarrow/_compute.pyx | 16 ++-
python/pyarrow/array.pxi | 5 +
python/pyarrow/builder.pxi | 14 +-
python/pyarrow/conftest.py | 13 +-
python/pyarrow/includes/libarrow_python.pxd | 2 +-
python/pyarrow/lib.pyx | 12 +-
python/pyarrow/pandas_compat.py | 79 ++++++-----
python/pyarrow/src/arrow/python/inference.cc | 4 +-
python/pyarrow/src/arrow/python/iterators.h | 6 +-
.../src/arrow/python/{init.cc => numpy_init.cc} | 13 +-
.../src/arrow/python/{init.h => numpy_init.h} | 5 +-
python/pyarrow/src/arrow/python/numpy_internal.h | 19 ++-
python/pyarrow/src/arrow/python/python_test.cc | 2 +-
python/pyarrow/src/arrow/python/python_to_arrow.cc | 11 +-
python/pyarrow/table.pxi | 3 +
python/pyarrow/tensor.pxi | 15 ++
python/pyarrow/tests/conftest.py | 1 +
.../pyarrow/tests/interchange/test_conversion.py | 35 +++--
.../tests/interchange/test_interchange_spec.py | 33 +++--
python/pyarrow/tests/parquet/common.py | 5 +-
python/pyarrow/tests/parquet/test_basic.py | 5 +-
python/pyarrow/tests/parquet/test_data_types.py | 13 +-
python/pyarrow/tests/parquet/test_dataset.py | 5 +-
python/pyarrow/tests/parquet/test_datetime.py | 5 +-
python/pyarrow/tests/parquet/test_metadata.py | 7 +-
python/pyarrow/tests/parquet/test_pandas.py | 5 +-
python/pyarrow/tests/strategies.py | 10 +-
python/pyarrow/tests/test_adhoc_memory_leak.py | 5 +-
python/pyarrow/tests/test_array.py | 100 +++++++++++--
python/pyarrow/tests/test_builder.py | 11 +-
python/pyarrow/tests/test_compute.py | 85 ++++++-----
python/pyarrow/tests/test_convert_builtin.py | 155 +++++++++++++--------
python/pyarrow/tests/test_cpp_internals.py | 8 ++
python/pyarrow/tests/test_csv.py | 44 +++++-
python/pyarrow/tests/test_cuda.py | 5 +-
python/pyarrow/tests/test_cuda_numba_interop.py | 5 +-
python/pyarrow/tests/test_cython.py | 4 +
python/pyarrow/tests/test_dataset.py | 55 ++++----
python/pyarrow/tests/test_dataset_encryption.py | 7 +-
python/pyarrow/tests/test_dlpack.py | 46 +++---
python/pyarrow/tests/test_extension_type.py | 77 ++++++----
python/pyarrow/tests/test_feather.py | 10 +-
python/pyarrow/tests/test_flight.py | 6 +-
python/pyarrow/tests/test_io.py | 38 +++--
python/pyarrow/tests/test_ipc.py | 10 +-
python/pyarrow/tests/test_json.py | 8 +-
python/pyarrow/tests/test_pandas.py | 62 +++++----
python/pyarrow/tests/test_scalars.py | 59 ++++++--
python/pyarrow/tests/test_schema.py | 6 +-
python/pyarrow/tests/test_sparse_tensor.py | 5 +-
python/pyarrow/tests/test_strategies.py | 5 +
python/pyarrow/tests/test_substrait.py | 2 +
python/pyarrow/tests/test_table.py | 29 +++-
python/pyarrow/tests/test_tensor.py | 5 +-
python/pyarrow/tests/test_types.py | 16 ++-
python/pyarrow/tests/test_udf.py | 13 +-
python/pyarrow/tests/test_without_numpy.py | 58 ++++++++
python/pyarrow/tests/util.py | 19 +--
python/pyarrow/types.pxi | 85 ++++++-----
62 files changed, 1008 insertions(+), 420 deletions(-)
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 854d792f31..90d3a50af3 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -59,6 +59,7 @@ jobs:
- conda-python-3.9-nopandas
- conda-python-3.8-pandas-1.0
- conda-python-3.10-pandas-latest
+ - conda-python-3.10-no-numpy
include:
- name: conda-python-docs
cache: conda-python-3.9
@@ -83,6 +84,11 @@ jobs:
title: AMD64 Conda Python 3.10 Pandas latest
python: "3.10"
pandas: latest
+ - name: conda-python-3.10-no-numpy
+ cache: conda-python-3.10
+ image: conda-python-no-numpy
+ title: AMD64 Conda Python 3.10 without NumPy
+ python: "3.10"
env:
PYTHON: ${{ matrix.python || 3.8 }}
UBUNTU: ${{ matrix.ubuntu || 20.04 }}
diff --git a/docker-compose.yml b/docker-compose.yml
index 3045cf015b..97d6e1158e 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -126,6 +126,7 @@ x-hierarchy:
- conda-python-hdfs
- conda-python-java-integration
- conda-python-jpype
+ - conda-python-no-numpy
- conda-python-spark
- conda-python-substrait
- conda-verify-rc
@@ -1258,6 +1259,37 @@ services:
volumes: *conda-volumes
command: *python-conda-command
+ conda-python-no-numpy:
+ # Usage:
+ # docker-compose build conda
+ # docker-compose build conda-cpp
+ # docker-compose build conda-python
+ # docker-compose build conda-python-no-numpy
+ # docker-compose run --rm conda-python-no-numpy
+ image: ${REPO}:${ARCH}-conda-python-${PYTHON}-no-numpy
+ build:
+ context: .
+ dockerfile: ci/docker/conda-python.dockerfile
+ cache_from:
+ - ${REPO}:${ARCH}-conda-python-${PYTHON}
+ args:
+ repo: ${REPO}
+ arch: ${ARCH}
+ python: ${PYTHON}
+ shm_size: *shm-size
+ environment:
+ <<: [*common, *ccache, *sccache]
+ PARQUET_REQUIRE_ENCRYPTION: # inherit
+ HYPOTHESIS_PROFILE: # inherit
+ PYARROW_TEST_HYPOTHESIS: # inherit
+ volumes: *conda-volumes
+ command:
+ ["
+ /arrow/ci/scripts/cpp_build.sh /arrow /build &&
+ /arrow/ci/scripts/python_build.sh /arrow /build &&
+ mamba uninstall -y numpy &&
+ /arrow/ci/scripts/python_test.sh /arrow"]
+
conda-python-docs:
# Usage:
# archery docker run conda-python-docs
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 1a18b2b173..eda4ff4ca5 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -339,17 +339,17 @@ set(PYARROW_CPP_SRCS
${PYARROW_CPP_SOURCE_DIR}/gdb.cc
${PYARROW_CPP_SOURCE_DIR}/helpers.cc
${PYARROW_CPP_SOURCE_DIR}/inference.cc
- ${PYARROW_CPP_SOURCE_DIR}/init.cc
${PYARROW_CPP_SOURCE_DIR}/io.cc
${PYARROW_CPP_SOURCE_DIR}/ipc.cc
${PYARROW_CPP_SOURCE_DIR}/numpy_convert.cc
+ ${PYARROW_CPP_SOURCE_DIR}/numpy_init.cc
${PYARROW_CPP_SOURCE_DIR}/numpy_to_arrow.cc
${PYARROW_CPP_SOURCE_DIR}/python_test.cc
${PYARROW_CPP_SOURCE_DIR}/python_to_arrow.cc
${PYARROW_CPP_SOURCE_DIR}/pyarrow.cc
${PYARROW_CPP_SOURCE_DIR}/serialize.cc
${PYARROW_CPP_SOURCE_DIR}/udf.cc)
-set_source_files_properties(${PYARROW_CPP_SOURCE_DIR}/init.cc
+set_source_files_properties(${PYARROW_CPP_SOURCE_DIR}/numpy_init.cc
PROPERTIES SKIP_PRECOMPILE_HEADERS ON
SKIP_UNITY_BUILD_INCLUSION ON)
diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx
index 0e860eaf4c..d39120934d 100644
--- a/python/pyarrow/_compute.pyx
+++ b/python/pyarrow/_compute.pyx
@@ -33,7 +33,10 @@ from pyarrow.util import _DEPR_MSG
from libcpp cimport bool as c_bool
import inspect
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
import warnings
@@ -43,6 +46,11 @@ _substrait_msg = (
)
+SUPPORTED_INPUT_ARR_TYPES = (list, tuple)
+if np is not None:
+ SUPPORTED_INPUT_ARR_TYPES += (np.ndarray, )
+
+
def _pas():
global __pas
if __pas is None:
@@ -473,7 +481,7 @@ cdef class MetaFunction(Function):
cdef _pack_compute_args(object values, vector[CDatum]* out):
for val in values:
- if isinstance(val, (list, np.ndarray)):
+ if isinstance(val, SUPPORTED_INPUT_ARR_TYPES):
val = lib.asarray(val)
if isinstance(val, Array):
@@ -2189,7 +2197,7 @@ class QuantileOptions(_QuantileOptions):
def __init__(self, q=0.5, *, interpolation="linear", skip_nulls=True,
min_count=0):
- if not isinstance(q, (list, tuple, np.ndarray)):
+ if not isinstance(q, SUPPORTED_INPUT_ARR_TYPES):
q = [q]
self._set_options(q, interpolation, skip_nulls, min_count)
@@ -2222,7 +2230,7 @@ class TDigestOptions(_TDigestOptions):
def __init__(self, q=0.5, *, delta=100, buffer_size=500, skip_nulls=True,
min_count=0):
- if not isinstance(q, (list, tuple, np.ndarray)):
+ if not isinstance(q, SUPPORTED_INPUT_ARR_TYPES):
q = [q]
self._set_options(q, delta, buffer_size, skip_nulls, min_count)
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 1587de0e6b..93c4429759 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -50,6 +50,8 @@ cdef _sequence_to_array(object sequence, object mask, object
size,
cdef inline _is_array_like(obj):
+ if np is None:
+ return False
if isinstance(obj, np.ndarray):
return True
return pandas_api._have_pandas_internal() and pandas_api.is_array_like(obj)
@@ -1608,6 +1610,9 @@ cdef class Array(_PandasConvertible):
"""
self._assert_cpu()
+ if np is None:
+ raise ImportError(
+ "Cannot return a numpy.ndarray if NumPy is not present")
cdef:
PyObject* out
PandasOptions c_options
diff --git a/python/pyarrow/builder.pxi b/python/pyarrow/builder.pxi
index 2af39e2c58..fbab5bbdb5 100644
--- a/python/pyarrow/builder.pxi
+++ b/python/pyarrow/builder.pxi
@@ -15,6 +15,8 @@
# specific language governing permissions and limitations
# under the License.
+import math
+
cdef class StringBuilder(_Weakrefable):
"""
@@ -42,10 +44,10 @@ cdef class StringBuilder(_Weakrefable):
value : string/bytes or np.nan/None
The value to append to the string array builder.
"""
- if value is None or value is np.nan:
- self.builder.get().AppendNull()
- elif isinstance(value, (bytes, str)):
+ if isinstance(value, (bytes, str)):
self.builder.get().Append(tobytes(value))
+ elif value is None or math.isnan(value):
+ self.builder.get().AppendNull()
else:
raise TypeError('StringBuilder only accepts string objects')
@@ -108,10 +110,10 @@ cdef class StringViewBuilder(_Weakrefable):
value : string/bytes or np.nan/None
The value to append to the string array builder.
"""
- if value is None or value is np.nan:
- self.builder.get().AppendNull()
- elif isinstance(value, (bytes, str)):
+ if isinstance(value, (bytes, str)):
self.builder.get().Append(tobytes(value))
+ elif value is None or math.isnan(value):
+ self.builder.get().AppendNull()
else:
raise TypeError('StringViewBuilder only accepts string objects')
diff --git a/python/pyarrow/conftest.py b/python/pyarrow/conftest.py
index 29c850c142..10a2e72f92 100644
--- a/python/pyarrow/conftest.py
+++ b/python/pyarrow/conftest.py
@@ -25,7 +25,6 @@ from pyarrow.lib import is_threading_enabled
from pyarrow.tests.util import windows_has_tzdata
import sys
-import numpy as np
groups = [
'acero',
@@ -46,6 +45,8 @@ groups = [
'lz4',
'memory_leak',
'nopandas',
+ 'nonumpy',
+ 'numpy',
'orc',
'pandas',
'parquet',
@@ -81,6 +82,8 @@ defaults = {
'lz4': Codec.is_available('lz4'),
'memory_leak': False,
'nopandas': False,
+ 'nonumpy': False,
+ 'numpy': False,
'orc': False,
'pandas': False,
'parquet': False,
@@ -158,6 +161,12 @@ try:
except ImportError:
defaults['nopandas'] = True
+try:
+ import numpy # noqa
+ defaults['numpy'] = True
+except ImportError:
+ defaults['nonumpy'] = True
+
try:
import pyarrow.parquet # noqa
defaults['parquet'] = True
@@ -327,6 +336,7 @@ def unary_agg_func_fixture():
Register a unary aggregate function (mean)
"""
from pyarrow import compute as pc
+ import numpy as np
def func(ctx, x):
return pa.scalar(np.nanmean(x))
@@ -352,6 +362,7 @@ def varargs_agg_func_fixture():
Register a unary aggregate function
"""
from pyarrow import compute as pc
+ import numpy as np
def func(ctx, *args):
sum = 0.0
diff --git a/python/pyarrow/includes/libarrow_python.pxd
b/python/pyarrow/includes/libarrow_python.pxd
index 9fcc97aaf0..96725c9c38 100644
--- a/python/pyarrow/includes/libarrow_python.pxd
+++ b/python/pyarrow/includes/libarrow_python.pxd
@@ -248,7 +248,7 @@ cdef extern from "arrow/python/api.h" namespace
"arrow::py::internal" nogil:
CResult[PyObject*] StringToTzinfo(c_string)
-cdef extern from "arrow/python/init.h":
+cdef extern from "arrow/python/numpy_init.h" namespace "arrow::py":
int arrow_init_numpy() except -1
diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index c72841c299..6b82eb6566 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -21,7 +21,10 @@
import datetime
import decimal as _pydecimal
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
import os
import sys
@@ -32,8 +35,11 @@ from pyarrow.includes.common cimport PyObject_to_object
cimport pyarrow.includes.libarrow_python as libarrow_python
cimport cpython as cp
-# Initialize NumPy C API
-arrow_init_numpy()
+
+# Initialize NumPy C API only if numpy was able to be imported
+if np is not None:
+ arrow_init_numpy()
+
# Initialize PyArrow C++ API
# (used from some of our C++ code, see e.g. ARROW-5260)
import_pyarrow()
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index fcccf564fc..7fbde36bc2 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -30,13 +30,17 @@ import operator
import re
import warnings
-import numpy as np
-
+try:
+ import numpy as np
+except ImportError:
+ np = None
import pyarrow as pa
from pyarrow.lib import _pandas_api, frombytes, is_threading_enabled # noqa
_logical_type_map = {}
+_numpy_logical_type_map = {}
+_pandas_logical_type_map = {}
def get_logical_type_map():
@@ -85,27 +89,32 @@ def get_logical_type(arrow_type):
return 'object'
-_numpy_logical_type_map = {
- np.bool_: 'bool',
- np.int8: 'int8',
- np.int16: 'int16',
- np.int32: 'int32',
- np.int64: 'int64',
- np.uint8: 'uint8',
- np.uint16: 'uint16',
- np.uint32: 'uint32',
- np.uint64: 'uint64',
- np.float32: 'float32',
- np.float64: 'float64',
- 'datetime64[D]': 'date',
- np.str_: 'string',
- np.bytes_: 'bytes',
-}
+def get_numpy_logical_type_map():
+ global _numpy_logical_type_map
+ if not _numpy_logical_type_map:
+ _numpy_logical_type_map.update({
+ np.bool_: 'bool',
+ np.int8: 'int8',
+ np.int16: 'int16',
+ np.int32: 'int32',
+ np.int64: 'int64',
+ np.uint8: 'uint8',
+ np.uint16: 'uint16',
+ np.uint32: 'uint32',
+ np.uint64: 'uint64',
+ np.float32: 'float32',
+ np.float64: 'float64',
+ 'datetime64[D]': 'date',
+ np.str_: 'string',
+ np.bytes_: 'bytes',
+ })
+ return _numpy_logical_type_map
def get_logical_type_from_numpy(pandas_collection):
+ numpy_logical_type_map = get_numpy_logical_type_map()
try:
- return _numpy_logical_type_map[pandas_collection.dtype.type]
+ return numpy_logical_type_map[pandas_collection.dtype.type]
except KeyError:
if hasattr(pandas_collection.dtype, 'tz'):
return 'datetimetz'
@@ -1023,18 +1032,23 @@ def _is_generated_index_name(name):
return re.match(pattern, name) is not None
-_pandas_logical_type_map = {
- 'date': 'datetime64[D]',
- 'datetime': 'datetime64[ns]',
- 'datetimetz': 'datetime64[ns]',
- 'unicode': np.str_,
- 'bytes': np.bytes_,
- 'string': np.str_,
- 'integer': np.int64,
- 'floating': np.float64,
- 'decimal': np.object_,
- 'empty': np.object_,
-}
+def get_pandas_logical_type_map():
+ global _pandas_logical_type_map
+
+ if not _pandas_logical_type_map:
+ _pandas_logical_type_map.update({
+ 'date': 'datetime64[D]',
+ 'datetime': 'datetime64[ns]',
+ 'datetimetz': 'datetime64[ns]',
+ 'unicode': np.str_,
+ 'bytes': np.bytes_,
+ 'string': np.str_,
+ 'integer': np.int64,
+ 'floating': np.float64,
+ 'decimal': np.object_,
+ 'empty': np.object_,
+ })
+ return _pandas_logical_type_map
def _pandas_type_to_numpy_type(pandas_type):
@@ -1050,8 +1064,9 @@ def _pandas_type_to_numpy_type(pandas_type):
dtype : np.dtype
The dtype that corresponds to `pandas_type`.
"""
+ pandas_logical_type_map = get_pandas_logical_type_map()
try:
- return _pandas_logical_type_map[pandas_type]
+ return pandas_logical_type_map[pandas_type]
except KeyError:
if 'mixed' in pandas_type:
# catching 'mixed', 'mixed-integer' and 'mixed-integer-float'
diff --git a/python/pyarrow/src/arrow/python/inference.cc
b/python/pyarrow/src/arrow/python/inference.cc
index 10116f9afa..1aa7915ba1 100644
--- a/python/pyarrow/src/arrow/python/inference.cc
+++ b/python/pyarrow/src/arrow/python/inference.cc
@@ -395,11 +395,11 @@ class TypeInferrer {
*keep_going = make_unions_;
} else if (arrow::py::is_scalar(obj)) {
RETURN_NOT_OK(VisitArrowScalar(obj, keep_going));
- } else if (PyArray_CheckAnyScalarExact(obj)) {
+ } else if (has_numpy() && PyArray_CheckAnyScalarExact(obj)) {
RETURN_NOT_OK(VisitDType(PyArray_DescrFromScalar(obj), keep_going));
} else if (PySet_Check(obj) || (Py_TYPE(obj) == &PyDictValues_Type)) {
RETURN_NOT_OK(VisitSet(obj, keep_going));
- } else if (PyArray_Check(obj)) {
+ } else if (has_numpy() && PyArray_Check(obj)) {
RETURN_NOT_OK(VisitNdarray(obj, keep_going));
} else if (PyDict_Check(obj)) {
RETURN_NOT_OK(VisitDict(obj));
diff --git a/python/pyarrow/src/arrow/python/iterators.h
b/python/pyarrow/src/arrow/python/iterators.h
index 7b31962dac..8512276848 100644
--- a/python/pyarrow/src/arrow/python/iterators.h
+++ b/python/pyarrow/src/arrow/python/iterators.h
@@ -22,6 +22,7 @@
#include "arrow/array/array_primitive.h"
#include "arrow/python/common.h"
+#include "arrow/python/numpy_init.h"
#include "arrow/python/numpy_internal.h"
namespace arrow {
@@ -44,7 +45,7 @@ inline Status VisitSequenceGeneric(PyObject* obj, int64_t
offset, VisitorFunc&&
// VisitorFunc may set to false to terminate iteration
bool keep_going = true;
- if (PyArray_Check(obj)) {
+ if (has_numpy() && PyArray_Check(obj)) {
PyArrayObject* arr_obj = reinterpret_cast<PyArrayObject*>(obj);
if (PyArray_NDIM(arr_obj) != 1) {
return Status::Invalid("Only 1D arrays accepted");
@@ -64,6 +65,7 @@ inline Status VisitSequenceGeneric(PyObject* obj, int64_t
offset, VisitorFunc&&
// This code path is inefficient: callers should implement dedicated
// logic for non-object arrays.
}
+
if (PySequence_Check(obj)) {
if (PyList_Check(obj) || PyTuple_Check(obj)) {
// Use fast item access
@@ -101,7 +103,7 @@ inline Status VisitSequence(PyObject* obj, int64_t offset,
VisitorFunc&& func) {
template <class VisitorFunc>
inline Status VisitSequenceMasked(PyObject* obj, PyObject* mo, int64_t offset,
VisitorFunc&& func) {
- if (PyArray_Check(mo)) {
+ if (has_numpy() && PyArray_Check(mo)) {
PyArrayObject* mask = reinterpret_cast<PyArrayObject*>(mo);
if (PyArray_NDIM(mask) != 1) {
return Status::Invalid("Mask must be 1D array");
diff --git a/python/pyarrow/src/arrow/python/init.cc
b/python/pyarrow/src/arrow/python/numpy_init.cc
similarity index 78%
rename from python/pyarrow/src/arrow/python/init.cc
rename to python/pyarrow/src/arrow/python/numpy_init.cc
index dba293bbe2..96e2c7b7cc 100644
--- a/python/pyarrow/src/arrow/python/init.cc
+++ b/python/pyarrow/src/arrow/python/numpy_init.cc
@@ -18,7 +18,16 @@
// Trigger the array import (inversion of NO_IMPORT_ARRAY)
#define NUMPY_IMPORT_ARRAY
-#include "arrow/python/init.h"
+#include "arrow/python/numpy_init.h"
#include "arrow/python/numpy_interop.h"
-int arrow_init_numpy() { return arrow::py::import_numpy(); }
+namespace arrow::py {
+bool numpy_imported = false;
+
+int arrow_init_numpy() {
+ numpy_imported = true;
+ return arrow::py::import_numpy();
+}
+
+bool has_numpy() { return numpy_imported; }
+} // namespace arrow::py
diff --git a/python/pyarrow/src/arrow/python/init.h
b/python/pyarrow/src/arrow/python/numpy_init.h
similarity index 93%
rename from python/pyarrow/src/arrow/python/init.h
rename to python/pyarrow/src/arrow/python/numpy_init.h
index 2e6c954862..36c544c1b5 100644
--- a/python/pyarrow/src/arrow/python/init.h
+++ b/python/pyarrow/src/arrow/python/numpy_init.h
@@ -20,7 +20,8 @@
#include "arrow/python/platform.h"
#include "arrow/python/visibility.h"
-extern "C" {
+namespace arrow::py {
ARROW_PYTHON_EXPORT
int arrow_init_numpy();
-}
+bool has_numpy();
+} // namespace arrow::py
diff --git a/python/pyarrow/src/arrow/python/numpy_internal.h
b/python/pyarrow/src/arrow/python/numpy_internal.h
index b9b632f9f9..0b4d0be00e 100644
--- a/python/pyarrow/src/arrow/python/numpy_internal.h
+++ b/python/pyarrow/src/arrow/python/numpy_internal.h
@@ -19,6 +19,7 @@
#pragma once
+#include "arrow/python/numpy_init.h"
#include "arrow/python/numpy_interop.h"
#include "arrow/status.h"
@@ -155,15 +156,27 @@ inline Status VisitNumpyArrayInline(PyArrayObject* arr,
VISITOR* visitor) {
namespace internal {
inline bool PyFloatScalar_Check(PyObject* obj) {
- return PyFloat_Check(obj) || PyArray_IsScalar(obj, Floating);
+ if (has_numpy()) {
+ return PyFloat_Check(obj) || PyArray_IsScalar(obj, Floating);
+ } else {
+ return PyFloat_Check(obj);
+ }
}
inline bool PyIntScalar_Check(PyObject* obj) {
- return PyLong_Check(obj) || PyArray_IsScalar(obj, Integer);
+ if (has_numpy()) {
+ return PyLong_Check(obj) || PyArray_IsScalar(obj, Integer);
+ } else {
+ return PyLong_Check(obj);
+ }
}
inline bool PyBoolScalar_Check(PyObject* obj) {
- return PyBool_Check(obj) || PyArray_IsScalar(obj, Bool);
+ if (has_numpy()) {
+ return PyBool_Check(obj) || PyArray_IsScalar(obj, Bool);
+ } else {
+ return PyBool_Check(obj);
+ }
}
static inline PyArray_Descr* GetSafeNumPyDtype(int type) {
diff --git a/python/pyarrow/src/arrow/python/python_test.cc
b/python/pyarrow/src/arrow/python/python_test.cc
index 746bf41091..eea6bf9459 100644
--- a/python/pyarrow/src/arrow/python/python_test.cc
+++ b/python/pyarrow/src/arrow/python/python_test.cc
@@ -870,7 +870,7 @@ std::vector<TestCase> GetCppTestCases() {
TestInferAllLeadingZerosExponentialNotationPositive},
{"test_infer_all_leading_zeros_exponential_notation_negative",
TestInferAllLeadingZerosExponentialNotationNegative},
- {"test_object_block_write_fails", TestObjectBlockWriteFails},
+ {"test_object_block_write_fails_pandas_convert",
TestObjectBlockWriteFails},
{"test_mixed_type_fails", TestMixedTypeFails},
{"test_from_python_decimal_rescale_not_truncateable",
TestFromPythonDecimalRescaleNotTruncateable},
diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc
b/python/pyarrow/src/arrow/python/python_to_arrow.cc
index ce9e15c894..e7195e9907 100644
--- a/python/pyarrow/src/arrow/python/python_to_arrow.cc
+++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -202,7 +202,7 @@ class PyValue {
return true;
} else if (obj == Py_False) {
return false;
- } else if (PyArray_IsScalar(obj, Bool)) {
+ } else if (has_numpy() && PyArray_IsScalar(obj, Bool)) {
return reinterpret_cast<PyBoolScalarObject*>(obj)->obval == NPY_TRUE;
} else {
return internal::InvalidValue(obj, "tried to convert to boolean");
@@ -385,7 +385,7 @@ class PyValue {
default:
return Status::UnknownError("Invalid time unit");
}
- } else if (PyArray_CheckAnyScalarExact(obj)) {
+ } else if (has_numpy() && PyArray_CheckAnyScalarExact(obj)) {
// validate that the numpy scalar has np.datetime64 dtype
ARROW_ASSIGN_OR_RAISE(auto numpy_type, NumPyScalarToArrowDataType(obj));
if (!numpy_type->Equals(*type)) {
@@ -464,7 +464,7 @@ class PyValue {
default:
return Status::UnknownError("Invalid time unit");
}
- } else if (PyArray_CheckAnyScalarExact(obj)) {
+ } else if (has_numpy() && PyArray_CheckAnyScalarExact(obj)) {
// validate that the numpy scalar has np.datetime64 dtype
ARROW_ASSIGN_OR_RAISE(auto numpy_type, NumPyScalarToArrowDataType(obj));
if (!numpy_type->Equals(*type)) {
@@ -664,7 +664,7 @@ class PyPrimitiveConverter<
ARROW_ASSIGN_OR_RAISE(
auto converted, PyValue::Convert(this->primitive_type_,
this->options_, value));
// Numpy NaT sentinels can be checked after the conversion
- if (PyArray_CheckAnyScalarExact(value) &&
+ if (has_numpy() && PyArray_CheckAnyScalarExact(value) &&
PyValue::IsNaT(this->primitive_type_, converted)) {
this->primitive_builder_->UnsafeAppendNull();
} else {
@@ -804,8 +804,7 @@ class PyListConverter : public ListConverter<T,
PyConverter, PyConverterTrait> {
if (PyValue::IsNull(this->options_, value)) {
return this->list_builder_->AppendNull();
}
-
- if (PyArray_Check(value)) {
+ if (has_numpy() && PyArray_Check(value)) {
RETURN_NOT_OK(AppendNdarray(value));
} else if (PySequence_Check(value)) {
RETURN_NOT_OK(AppendSequence(value));
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 6d34c71c9d..fff47373cb 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -495,6 +495,9 @@ cdef class ChunkedArray(_PandasConvertible):
>>> n_legs.to_numpy()
array([ 2, 2, 4, 4, 5, 100])
"""
+ if np is None:
+ raise ImportError(
+ "Cannot return a numpy.ndarray if NumPy is not present")
if zero_copy_only:
raise ValueError(
"zero_copy_only must be False for pyarrow.ChunkedArray.to_numpy"
diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi
index 6fb4fc99d7..3e0c63c18f 100644
--- a/python/pyarrow/tensor.pxi
+++ b/python/pyarrow/tensor.pxi
@@ -107,6 +107,9 @@ strides: {0.strides}""".format(self)
array([[ 2, 2, 4],
[ 4, 5, 100]], dtype=int32)
"""
+ if np is None:
+ raise ImportError(
+ "Cannot return a numpy.ndarray if NumPy is not present")
cdef PyObject* out
check_status(TensorToNdarray(self.sp_tensor, self, &out))
@@ -478,6 +481,9 @@ shape: {0.shape}""".format(self)
"""
Convert arrow::SparseCOOTensor to numpy.ndarrays with zero copy.
"""
+ if np is None:
+ raise ImportError(
+ "Cannot return a numpy.ndarray if NumPy is not present")
cdef PyObject* out_data
cdef PyObject* out_coords
@@ -743,6 +749,9 @@ shape: {0.shape}""".format(self)
"""
Convert arrow::SparseCSRMatrix to numpy.ndarrays with zero copy.
"""
+ if np is None:
+ raise ImportError(
+ "Cannot return a numpy.ndarray if NumPy is not present")
cdef PyObject* out_data
cdef PyObject* out_indptr
cdef PyObject* out_indices
@@ -981,6 +990,9 @@ shape: {0.shape}""".format(self)
"""
Convert arrow::SparseCSCMatrix to numpy.ndarrays with zero copy
"""
+ if np is None:
+ raise ImportError(
+ "Cannot return a numpy.ndarray if NumPy is not present")
cdef PyObject* out_data
cdef PyObject* out_indptr
cdef PyObject* out_indices
@@ -1216,6 +1228,9 @@ shape: {0.shape}""".format(self)
"""
Convert arrow::SparseCSFTensor to numpy.ndarrays with zero copy
"""
+ if np is None:
+ raise ImportError(
+ "Cannot return a numpy.ndarray if NumPy is not present")
cdef PyObject* out_data
cdef PyObject* out_indptr
cdef PyObject* out_indices
diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index 7a222cec8a..0b82696d0a 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -25,6 +25,7 @@ import urllib.request
import pytest
import hypothesis as h
+
from ..conftest import groups, defaults
from pyarrow import set_timezone_db_path
diff --git a/python/pyarrow/tests/interchange/test_conversion.py
b/python/pyarrow/tests/interchange/test_conversion.py
index 6d91bad57c..50da6693af 100644
--- a/python/pyarrow/tests/interchange/test_conversion.py
+++ b/python/pyarrow/tests/interchange/test_conversion.py
@@ -16,11 +16,15 @@
# under the License.
from datetime import datetime as dt
-import numpy as np
import pyarrow as pa
from pyarrow.vendored.version import Version
import pytest
+try:
+ import numpy as np
+except ImportError:
+ np = None
+
import pyarrow.interchange as pi
from pyarrow.interchange.column import (
_PyArrowColumn,
@@ -107,13 +111,13 @@ def test_offset_of_sliced_array():
"int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
)
@pytest.mark.parametrize(
- "float, np_float", [
+ "float, np_float_str", [
# (pa.float16(), np.float16), #not supported by pandas
- (pa.float32(), np.float32),
- (pa.float64(), np.float64)
+ (pa.float32(), "float32"),
+ (pa.float64(), "float64")
]
)
-def test_pandas_roundtrip(uint, int, float, np_float):
+def test_pandas_roundtrip(uint, int, float, np_float_str):
if Version(pd.__version__) < Version("1.5.0"):
pytest.skip("__dataframe__ added to pandas in 1.5.0")
@@ -122,7 +126,7 @@ def test_pandas_roundtrip(uint, int, float, np_float):
{
"a": pa.array(arr, type=uint),
"b": pa.array(arr, type=int),
- "c": pa.array(np.array(arr, dtype=np_float), type=float),
+ "c": pa.array(np.array(arr, dtype=np.dtype(np_float_str)),
type=float),
"d": [True, False, True],
}
)
@@ -326,13 +330,13 @@ def test_pandas_roundtrip_datetime(unit):
@pytest.mark.pandas
@pytest.mark.parametrize(
- "np_float", [np.float32, np.float64]
+ "np_float_str", ["float32", "float64"]
)
-def test_pandas_to_pyarrow_with_missing(np_float):
+def test_pandas_to_pyarrow_with_missing(np_float_str):
if Version(pd.__version__) < Version("1.5.0"):
pytest.skip("__dataframe__ added to pandas in 1.5.0")
- np_array = np.array([0, np.nan, 2], dtype=np_float)
+ np_array = np.array([0, np.nan, 2], dtype=np.dtype(np_float_str))
datetime_array = [None, dt(2007, 7, 14), dt(2007, 7, 15)]
df = pd.DataFrame({
# float, ColumnNullType.USE_NAN
@@ -364,6 +368,7 @@ def test_pandas_to_pyarrow_float16_with_missing():
pi.from_dataframe(df)
+@pytest.mark.numpy
@pytest.mark.parametrize(
"uint", [pa.uint8(), pa.uint16(), pa.uint32()]
)
@@ -371,16 +376,16 @@ def test_pandas_to_pyarrow_float16_with_missing():
"int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
)
@pytest.mark.parametrize(
- "float, np_float", [
- (pa.float16(), np.float16),
- (pa.float32(), np.float32),
- (pa.float64(), np.float64)
+ "float, np_float_str", [
+ (pa.float16(), "float16"),
+ (pa.float32(), "float32"),
+ (pa.float64(), "float64")
]
)
@pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns'])
@pytest.mark.parametrize("tz", ['America/New_York', '+07:30', '-04:30'])
@pytest.mark.parametrize("offset, length", [(0, 3), (0, 2), (1, 2), (2, 1)])
-def test_pyarrow_roundtrip(uint, int, float, np_float,
+def test_pyarrow_roundtrip(uint, int, float, np_float_str,
unit, tz, offset, length):
from datetime import datetime as dt
@@ -391,7 +396,7 @@ def test_pyarrow_roundtrip(uint, int, float, np_float,
{
"a": pa.array(arr, type=uint),
"b": pa.array(arr, type=int),
- "c": pa.array(np.array(arr, dtype=np_float),
+ "c": pa.array(np.array(arr, dtype=np.dtype(np_float_str)),
type=float, from_pandas=True),
"d": [True, False, True],
"e": [True, False, None],
diff --git a/python/pyarrow/tests/interchange/test_interchange_spec.py
b/python/pyarrow/tests/interchange/test_interchange_spec.py
index 826089652b..d060f7842c 100644
--- a/python/pyarrow/tests/interchange/test_interchange_spec.py
+++ b/python/pyarrow/tests/interchange/test_interchange_spec.py
@@ -19,10 +19,13 @@ import ctypes
import hypothesis as h
import hypothesis.strategies as st
-import numpy as np
+import pytest
+try:
+ import numpy as np
+except ImportError:
+ np = None
import pyarrow as pa
import pyarrow.tests.strategies as past
-import pytest
all_types = st.deferred(
@@ -39,6 +42,7 @@ all_types = st.deferred(
# datetime is tested in test_extra.py
# dictionary is tested in test_categorical()
+@pytest.mark.numpy
@h.given(past.arrays(all_types, size=3))
def test_dtypes(arr):
table = pa.table([arr], names=["a"])
@@ -51,6 +55,7 @@ def test_dtypes(arr):
assert df.get_column(0).offset == 0
+@pytest.mark.numpy
@pytest.mark.parametrize(
"uint, uint_bw",
[
@@ -68,17 +73,17 @@ def test_dtypes(arr):
]
)
@pytest.mark.parametrize(
- "float, float_bw, np_float", [
- (pa.float16(), 16, np.float16),
- (pa.float32(), 32, np.float32),
- (pa.float64(), 64, np.float64)
+ "float, float_bw, np_float_str", [
+ (pa.float16(), 16, "float16"),
+ (pa.float32(), 32, "float32"),
+ (pa.float64(), 64, "float64")
]
)
@pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns'])
@pytest.mark.parametrize("tz", ['', 'America/New_York', '+07:30', '-04:30'])
@pytest.mark.parametrize("use_batch", [False, True])
def test_mixed_dtypes(uint, uint_bw, int, int_bw,
- float, float_bw, np_float, unit, tz,
+ float, float_bw, np_float_str, unit, tz,
use_batch):
from datetime import datetime as dt
arr = [1, 2, 3]
@@ -87,7 +92,7 @@ def test_mixed_dtypes(uint, uint_bw, int, int_bw,
{
"a": pa.array(arr, type=uint),
"b": pa.array(arr, type=int),
- "c": pa.array(np.array(arr, dtype=np_float), type=float),
+ "c": pa.array(np.array(arr, dtype=np.dtype(np_float_str)),
type=float),
"d": [True, False, True],
"e": ["a", "", "c"],
"f": pa.array(dt_arr, type=pa.timestamp(unit, tz=tz))
@@ -200,16 +205,16 @@ def test_column_get_chunks(use_batch, size, n_chunks):
"int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
)
@pytest.mark.parametrize(
- "float, np_float", [
- (pa.float16(), np.float16),
- (pa.float32(), np.float32),
- (pa.float64(), np.float64)
+ "float, np_float_str", [
+ (pa.float16(), "float16"),
+ (pa.float32(), "float32"),
+ (pa.float64(), "float64")
]
)
@pytest.mark.parametrize("use_batch", [False, True])
-def test_get_columns(uint, int, float, np_float, use_batch):
+def test_get_columns(uint, int, float, np_float_str, use_batch):
arr = [[1, 2, 3], [4, 5]]
- arr_float = np.array([1, 2, 3, 4, 5], dtype=np_float)
+ arr_float = np.array([1, 2, 3, 4, 5], dtype=np.dtype(np_float_str))
table = pa.table(
{
"a": pa.chunked_array(arr, type=uint),
diff --git a/python/pyarrow/tests/parquet/common.py
b/python/pyarrow/tests/parquet/common.py
index b4a57ba0b1..fd6ad94fbd 100644
--- a/python/pyarrow/tests/parquet/common.py
+++ b/python/pyarrow/tests/parquet/common.py
@@ -17,7 +17,10 @@
import io
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
import pyarrow as pa
from pyarrow.tests import util
diff --git a/python/pyarrow/tests/parquet/test_basic.py
b/python/pyarrow/tests/parquet/test_basic.py
index 194af7415e..6496aa9909 100644
--- a/python/pyarrow/tests/parquet/test_basic.py
+++ b/python/pyarrow/tests/parquet/test_basic.py
@@ -22,7 +22,6 @@ import warnings
from shutil import copytree
from decimal import Decimal
-import numpy as np
import pytest
import pyarrow as pa
@@ -47,6 +46,10 @@ try:
except ImportError:
pd = tm = None
+try:
+ import numpy as np
+except ImportError:
+ np = None
# Marks all of the tests in this module
# Ignore these with pytest ... -m 'not parquet'
diff --git a/python/pyarrow/tests/parquet/test_data_types.py
b/python/pyarrow/tests/parquet/test_data_types.py
index e6b66b0042..79dd969482 100644
--- a/python/pyarrow/tests/parquet/test_data_types.py
+++ b/python/pyarrow/tests/parquet/test_data_types.py
@@ -17,8 +17,12 @@
import decimal
import io
+import random
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
import pytest
import pyarrow as pa
@@ -173,6 +177,7 @@ def test_direct_read_dictionary_subfield():
assert result[0].num_chunks == 1
+@pytest.mark.numpy
def test_dictionary_array_automatically_read():
# ARROW-3246
@@ -334,10 +339,10 @@ def test_column_of_lists(tempdir):
def test_large_list_records():
# This was fixed in PARQUET-1100
- list_lengths = np.random.randint(0, 500, size=50)
- list_lengths[::10] = 0
+ list_lengths = [random.randint(0, 500) for _ in range(50)]
+ list_lengths[::10] = [0, 0, 0, 0, 0]
- list_values = [list(map(int, np.random.randint(0, 100, size=x)))
+ list_values = [list(map(int, [random.randint(0, 100) for _ in range(x)]))
if i % 8 else None
for i, x in enumerate(list_lengths)]
diff --git a/python/pyarrow/tests/parquet/test_dataset.py
b/python/pyarrow/tests/parquet/test_dataset.py
index 47e608a140..f68f1aa9cd 100644
--- a/python/pyarrow/tests/parquet/test_dataset.py
+++ b/python/pyarrow/tests/parquet/test_dataset.py
@@ -20,7 +20,10 @@ import inspect
import os
import pathlib
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
import pytest
import unittest.mock as mock
diff --git a/python/pyarrow/tests/parquet/test_datetime.py
b/python/pyarrow/tests/parquet/test_datetime.py
index 08fb109832..b89fd97cb9 100644
--- a/python/pyarrow/tests/parquet/test_datetime.py
+++ b/python/pyarrow/tests/parquet/test_datetime.py
@@ -19,7 +19,10 @@ import datetime
import io
import warnings
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
import pytest
import pyarrow as pa
diff --git a/python/pyarrow/tests/parquet/test_metadata.py
b/python/pyarrow/tests/parquet/test_metadata.py
index c29213ebc3..14ce9bbfcd 100644
--- a/python/pyarrow/tests/parquet/test_metadata.py
+++ b/python/pyarrow/tests/parquet/test_metadata.py
@@ -20,7 +20,10 @@ import decimal
from collections import OrderedDict
import io
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
import pytest
import pyarrow as pa
@@ -584,7 +587,7 @@ def test_table_large_metadata():
my_schema = pa.schema([pa.field('f0', 'double')],
metadata={'large': 'x' * 10000000})
- table = pa.table([np.arange(10)], schema=my_schema)
+ table = pa.table([range(10)], schema=my_schema)
_check_roundtrip(table)
diff --git a/python/pyarrow/tests/parquet/test_pandas.py
b/python/pyarrow/tests/parquet/test_pandas.py
index b5913bf5c6..2ea2f46873 100644
--- a/python/pyarrow/tests/parquet/test_pandas.py
+++ b/python/pyarrow/tests/parquet/test_pandas.py
@@ -18,7 +18,10 @@
import io
import json
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
import pytest
import pyarrow as pa
diff --git a/python/pyarrow/tests/strategies.py
b/python/pyarrow/tests/strategies.py
index db0aa13971..7a1b31a4d9 100644
--- a/python/pyarrow/tests/strategies.py
+++ b/python/pyarrow/tests/strategies.py
@@ -21,7 +21,10 @@ import sys
import pytest
import hypothesis as h
import hypothesis.strategies as st
-import hypothesis.extra.numpy as npst
+try:
+ import hypothesis.extra.numpy as npst
+except ImportError:
+ npst = None
try:
import hypothesis.extra.pytz as tzst
except ImportError:
@@ -35,7 +38,10 @@ if sys.platform == 'win32':
import tzdata # noqa:F401
except ImportError:
zoneinfo = None
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
import pyarrow as pa
diff --git a/python/pyarrow/tests/test_adhoc_memory_leak.py
b/python/pyarrow/tests/test_adhoc_memory_leak.py
index cd381cf427..76a766984d 100644
--- a/python/pyarrow/tests/test_adhoc_memory_leak.py
+++ b/python/pyarrow/tests/test_adhoc_memory_leak.py
@@ -17,7 +17,10 @@
import pytest
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
import pyarrow as pa
import pyarrow.tests.util as test_util
diff --git a/python/pyarrow/tests/test_array.py
b/python/pyarrow/tests/test_array.py
index c44ec3f8e1..4160d64829 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -27,7 +27,10 @@ import subprocess
import sys
import weakref
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
import pyarrow as pa
import pyarrow.tests.strategies as past
@@ -157,6 +160,7 @@ def test_binary_total_values_length():
assert large_arr.slice(1, 3).total_values_length == 11
+@pytest.mark.numpy
def test_to_numpy_zero_copy():
arr = pa.array(range(10))
@@ -176,6 +180,7 @@ def test_to_numpy_zero_copy():
np.testing.assert_array_equal(np_arr, expected)
+@pytest.mark.numpy
def test_chunked_array_to_numpy_zero_copy():
elements = [[2, 2, 4], [4, 5, 100]]
@@ -191,6 +196,7 @@ def test_chunked_array_to_numpy_zero_copy():
np.testing.assert_array_equal(np_arr, expected)
+@pytest.mark.numpy
def test_to_numpy_unsupported_types():
# ARROW-2871: Some primitive types are not yet supported in to_numpy
bool_arr = pa.array([True, False, True])
@@ -217,6 +223,7 @@ def test_to_numpy_unsupported_types():
arr.to_numpy()
+@pytest.mark.numpy
def test_to_numpy_writable():
arr = pa.array(range(10))
np_arr = arr.to_numpy()
@@ -234,6 +241,7 @@ def test_to_numpy_writable():
arr.to_numpy(zero_copy_only=True, writable=True)
+@pytest.mark.numpy
@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
@pytest.mark.parametrize('tz', [None, "UTC"])
def test_to_numpy_datetime64(unit, tz):
@@ -243,6 +251,7 @@ def test_to_numpy_datetime64(unit, tz):
np.testing.assert_array_equal(np_arr, expected)
+@pytest.mark.numpy
@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
def test_to_numpy_timedelta64(unit):
arr = pa.array([1, 2, 3], pa.duration(unit))
@@ -251,6 +260,7 @@ def test_to_numpy_timedelta64(unit):
np.testing.assert_array_equal(np_arr, expected)
+@pytest.mark.numpy
def test_to_numpy_dictionary():
# ARROW-7591
arr = pa.array(["a", "b", "a"]).dictionary_encode()
@@ -427,6 +437,11 @@ def test_array_getitem():
with pytest.raises(IndexError):
arr[idx]
+
+@pytest.mark.numpy
+def test_array_getitem_numpy_scalars():
+ arr = pa.array(range(10, 15))
+ lst = arr.to_pylist()
# check that numpy scalars are supported
for idx in range(-len(arr), len(arr)):
assert arr[np.int32(idx)].as_py() == lst[idx]
@@ -469,9 +484,11 @@ def test_array_slice():
res.validate()
expected = arr.to_pylist()[start:stop]
assert res.to_pylist() == expected
- assert res.to_numpy().tolist() == expected
+ if np is not None:
+ assert res.to_numpy().tolist() == expected
+@pytest.mark.numpy
def test_array_slice_negative_step():
# ARROW-2714
np_arr = np.arange(20)
@@ -542,6 +559,7 @@ def test_struct_array_slice():
{'a': 5, 'b': 6.5}]
+@pytest.mark.numpy
def test_array_factory_invalid_type():
class MyObject:
@@ -552,6 +570,7 @@ def test_array_factory_invalid_type():
pa.array(arr)
+@pytest.mark.numpy
def test_array_ref_to_ndarray_base():
arr = np.array([1, 2, 3])
@@ -576,6 +595,7 @@ def test_array_eq():
assert (arr1 == None) is False # noqa: E711
+@pytest.mark.numpy
def test_array_from_buffers():
values_buf = pa.py_buffer(np.int16([4, 5, 6, 7]))
nulls_buf = pa.py_buffer(np.uint8([0b00001101]))
@@ -773,6 +793,7 @@ def test_dictionary_from_buffers(offset):
assert a[offset:] == b
+@pytest.mark.numpy
def test_dictionary_from_numpy():
indices = np.repeat([0, 1, 2], 2)
dictionary = np.array(['foo', 'bar', 'baz'], dtype=object)
@@ -795,6 +816,7 @@ def test_dictionary_from_numpy():
assert d2[i].as_py() == dictionary[indices[i]]
+@pytest.mark.numpy
def test_dictionary_to_numpy():
expected = pa.array(
["foo", "bar", None, "foo"]
@@ -865,6 +887,7 @@ def test_dictionary_to_numpy():
)
+@pytest.mark.numpy
def test_dictionary_from_boxed_arrays():
indices = np.repeat([0, 1, 2], 2)
dictionary = np.array(['foo', 'bar', 'baz'], dtype=object)
@@ -910,6 +933,7 @@ def test_dictionary_indices():
arr.indices.validate(full=True)
+@pytest.mark.numpy
@pytest.mark.parametrize(('list_array_type', 'list_type_factory'),
[(pa.ListArray, pa.list_),
(pa.LargeListArray, pa.large_list)])
@@ -1052,6 +1076,7 @@ def test_map_from_dict():
assert tup_arr.equals(dict_arr)
+@pytest.mark.numpy
def test_map_from_arrays():
offsets_arr = np.array([0, 2, 5, 8], dtype='i4')
offsets = pa.array(offsets_arr, type='int32')
@@ -1472,6 +1497,7 @@ def _check_cast_case(case, *, safe=True,
check_array_construction=True):
assert in_arr.equals(expected)
+@pytest.mark.numpy
def test_cast_integers_safe():
safe_cases = [
(np.array([0, 1, 2, 3], dtype='i1'), 'int8',
@@ -1558,6 +1584,7 @@ def test_chunked_array_data_warns():
assert isinstance(res, pa.ChunkedArray)
+@pytest.mark.numpy
def test_cast_integers_unsafe():
# We let NumPy do the unsafe casting.
# Note that NEP50 in the NumPy spec no longer allows
@@ -1578,6 +1605,7 @@ def test_cast_integers_unsafe():
_check_cast_case(case, safe=False)
+@pytest.mark.numpy
def test_floating_point_truncate_safe():
safe_cases = [
(np.array([1.0, 2.0, 3.0], dtype='float32'), 'float32',
@@ -1591,6 +1619,7 @@ def test_floating_point_truncate_safe():
_check_cast_case(case, safe=True)
+@pytest.mark.numpy
def test_floating_point_truncate_unsafe():
unsafe_cases = [
(np.array([1.1, 2.2, 3.3], dtype='float32'), 'float32',
@@ -1635,6 +1664,7 @@ def test_decimal_to_int_safe():
_check_cast_case(case, safe=True)
+@pytest.mark.numpy
def test_decimal_to_int_value_out_of_bounds():
out_of_bounds_cases = [
(
@@ -1735,6 +1765,7 @@ def test_decimal_to_decimal():
result = arr.cast(pa.decimal128(5, 2))
+@pytest.mark.numpy
def test_safe_cast_nan_to_int_raises():
arr = pa.array([np.nan, 1.])
@@ -1742,6 +1773,7 @@ def test_safe_cast_nan_to_int_raises():
arr.cast(pa.int64(), safe=True)
+@pytest.mark.numpy
def test_cast_signed_to_unsigned():
safe_cases = [
(np.array([0, 1, 2, 3], dtype='i1'), pa.uint8(),
@@ -1992,6 +2024,7 @@ def test_dictionary_decode():
assert result.equals(expected)
+@pytest.mark.numpy
def test_cast_time32_to_int():
arr = pa.array(np.array([0, 1, 2], dtype='int32'),
type=pa.time32('s'))
@@ -2001,6 +2034,7 @@ def test_cast_time32_to_int():
assert result.equals(expected)
+@pytest.mark.numpy
def test_cast_time64_to_int():
arr = pa.array(np.array([0, 1, 2], dtype='int64'),
type=pa.time64('us'))
@@ -2010,6 +2044,7 @@ def test_cast_time64_to_int():
assert result.equals(expected)
+@pytest.mark.numpy
def test_cast_timestamp_to_int():
arr = pa.array(np.array([0, 1, 2], dtype='int64'),
type=pa.timestamp('us'))
@@ -2035,6 +2070,7 @@ def test_cast_date32_to_int():
assert result2.equals(arr)
+@pytest.mark.numpy
def test_cast_duration_to_int():
arr = pa.array(np.array([0, 1, 2], dtype='int64'),
type=pa.duration('us'))
@@ -2044,6 +2080,7 @@ def test_cast_duration_to_int():
assert result.equals(expected)
+@pytest.mark.numpy
def test_cast_binary_to_utf8():
binary_arr = pa.array([b'foo', b'bar', b'baz'], type=pa.binary())
utf8_arr = binary_arr.cast(pa.utf8())
@@ -2064,6 +2101,7 @@ def test_cast_binary_to_utf8():
assert casted.null_count == 1
+@pytest.mark.numpy
def test_cast_date64_to_int():
arr = pa.array(np.array([0, 1, 2], dtype='int64'),
type=pa.date64())
@@ -2146,6 +2184,7 @@ def test_array_pickle_dictionary(pickle_module):
assert array.equals(result)
+@pytest.mark.numpy
@h.settings(suppress_health_check=(h.HealthCheck.too_slow,))
@h.given(
past.arrays(
@@ -2177,9 +2216,9 @@ def test_array_pickle_protocol5(data, typ, pickle_module):
assert result_addresses == addresses
-@pytest.mark.parametrize(
- 'narr',
- [
+@pytest.mark.numpy
+def test_to_numpy_roundtrip():
+ for narr in [
np.arange(10, dtype=np.int64),
np.arange(10, dtype=np.int32),
np.arange(10, dtype=np.int16),
@@ -2191,23 +2230,23 @@ def test_array_pickle_protocol5(data, typ,
pickle_module):
np.arange(10, dtype=np.float64),
np.arange(10, dtype=np.float32),
np.arange(10, dtype=np.float16),
- ]
-)
-def test_to_numpy_roundtrip(narr):
- arr = pa.array(narr)
- assert narr.dtype == arr.to_numpy().dtype
- np.testing.assert_array_equal(narr, arr.to_numpy())
- np.testing.assert_array_equal(narr[:6], arr[:6].to_numpy())
- np.testing.assert_array_equal(narr[2:], arr[2:].to_numpy())
- np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy())
+ ]:
+ arr = pa.array(narr)
+ assert narr.dtype == arr.to_numpy().dtype
+ np.testing.assert_array_equal(narr, arr.to_numpy())
+ np.testing.assert_array_equal(narr[:6], arr[:6].to_numpy())
+ np.testing.assert_array_equal(narr[2:], arr[2:].to_numpy())
+ np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy())
+@pytest.mark.numpy
def test_array_uint64_from_py_over_range():
arr = pa.array([2 ** 63], type=pa.uint64())
expected = pa.array(np.array([2 ** 63], dtype='u8'))
assert arr.equals(expected)
+@pytest.mark.numpy
def test_array_conversions_no_sentinel_values():
arr = np.array([1, 2, 3, 4], dtype='int8')
refcount = sys.getrefcount(arr)
@@ -2249,6 +2288,7 @@ def test_time32_time64_from_integer():
assert result.equals(expected)
+@pytest.mark.numpy
def test_binary_string_pandas_null_sentinels():
# ARROW-6227
def _check_case(ty):
@@ -2259,6 +2299,7 @@ def test_binary_string_pandas_null_sentinels():
_check_case('utf8')
+@pytest.mark.numpy
def test_pandas_null_sentinels_raise_error():
# ARROW-6227
cases = [
@@ -2299,6 +2340,7 @@ def test_pandas_null_sentinels_index():
assert result.equals(expected)
+@pytest.mark.numpy
def test_array_roundtrip_from_numpy_datetimeD():
arr = np.array([None, datetime.date(2017, 4, 4)], dtype='datetime64[D]')
@@ -2319,6 +2361,7 @@ def test_array_from_naive_datetimes():
assert arr.type == pa.timestamp('us', tz=None)
+@pytest.mark.numpy
@pytest.mark.parametrize(('dtype', 'type'), [
('datetime64[s]', pa.timestamp('s')),
('datetime64[ms]', pa.timestamp('ms')),
@@ -2342,6 +2385,7 @@ def test_array_from_numpy_datetime(dtype, type):
assert arr.equals(expected)
+@pytest.mark.numpy
def test_array_from_different_numpy_datetime_units_raises():
data = [
None,
@@ -2356,6 +2400,7 @@ def
test_array_from_different_numpy_datetime_units_raises():
pa.array(data)
+@pytest.mark.numpy
@pytest.mark.parametrize('unit', ['ns', 'us', 'ms', 's'])
def test_array_from_list_of_timestamps(unit):
n = np.datetime64('NaT', unit)
@@ -2370,6 +2415,7 @@ def test_array_from_list_of_timestamps(unit):
assert a1[0] == a2[0]
+@pytest.mark.numpy
def test_array_from_timestamp_with_generic_unit():
n = np.datetime64('NaT')
x = np.datetime64('2017-01-01 01:01:01.111111111')
@@ -2380,6 +2426,7 @@ def test_array_from_timestamp_with_generic_unit():
pa.array([n, x, y])
+@pytest.mark.numpy
@pytest.mark.parametrize(('dtype', 'type'), [
('timedelta64[s]', pa.duration('s')),
('timedelta64[ms]', pa.duration('ms')),
@@ -2408,6 +2455,7 @@ def test_array_from_numpy_timedelta(dtype, type):
assert arr.to_pylist() == data
+@pytest.mark.numpy
def test_array_from_numpy_timedelta_incorrect_unit():
# generic (no unit)
td = np.timedelta64(1)
@@ -2423,6 +2471,7 @@ def test_array_from_numpy_timedelta_incorrect_unit():
pa.array(data)
+@pytest.mark.numpy
def test_array_from_numpy_ascii():
arr = np.array(['abcde', 'abc', ''], dtype='|S5')
@@ -2567,6 +2616,7 @@ def test_interval_array_from_dateoffset():
assert list(actual_list[0]) == expected_from_pandas
+@pytest.mark.numpy
def test_array_from_numpy_unicode():
dtypes = ['<U5', '>U5']
@@ -2599,12 +2649,14 @@ def test_array_from_numpy_unicode():
assert arrow_arr.equals(expected)
+@pytest.mark.numpy
def test_array_string_from_non_string():
# ARROW-5682 - when converting to string raise on non string-like dtype
with pytest.raises(TypeError):
pa.array(np.array([1, 2, 3]), type=pa.string())
+@pytest.mark.numpy
def test_array_string_from_all_null():
# ARROW-5682
vals = np.array([None, None], dtype=object)
@@ -2619,6 +2671,7 @@ def test_array_string_from_all_null():
assert arr.null_count == 2
+@pytest.mark.numpy
def test_array_from_masked():
ma = np.ma.array([1, 2, 3, 4], dtype='int64',
mask=[False, False, True, False])
@@ -2630,6 +2683,7 @@ def test_array_from_masked():
pa.array(ma, mask=np.array([True, False, False, False]))
+@pytest.mark.numpy
def test_array_from_shrunken_masked():
ma = np.ma.array([0], dtype='int64')
result = pa.array(ma)
@@ -2637,6 +2691,7 @@ def test_array_from_shrunken_masked():
assert expected.equals(result)
+@pytest.mark.numpy
def test_array_from_invalid_dim_raises():
msg = "only handle 1-dimensional arrays"
arr2d = np.array([[1, 2, 3], [4, 5, 6]])
@@ -2648,6 +2703,7 @@ def test_array_from_invalid_dim_raises():
pa.array(arr0d)
+@pytest.mark.numpy
def test_array_from_strided_bool():
# ARROW-6325
arr = np.ones((3, 2), dtype=bool)
@@ -2659,6 +2715,7 @@ def test_array_from_strided_bool():
assert result.equals(expected)
+@pytest.mark.numpy
def test_array_from_strided():
pydata = [
([b"ab", b"cd", b"ef"], (pa.binary(), pa.binary(2))),
@@ -2683,6 +2740,7 @@ def test_boolean_true_count_false_count():
assert arr.false_count == 1000
+@pytest.mark.numpy
def test_buffers_primitive():
a = pa.array([1, 2, None, 4], type=pa.int16())
buffers = a.buffers()
@@ -2755,6 +2813,7 @@ def test_buffers_nested():
assert struct.unpack('4xh', values) == (43,)
+@pytest.mark.numpy
def test_total_buffer_size():
a = pa.array(np.array([4, 5, 6], dtype='int64'))
assert a.nbytes == 8 * 3
@@ -3153,6 +3212,7 @@ def test_nested_dictionary_array():
assert dict_arr2.to_pylist() == ['a', 'b', 'a', 'b', 'a']
+@pytest.mark.numpy
def test_array_from_numpy_str_utf8():
# ARROW-3890 -- in Python 3, NPY_UNICODE arrays are produced, but in Python
# 2 they are NPY_STRING (binary), so we must do UTF-8 validation
@@ -3179,6 +3239,7 @@ def test_array_from_numpy_str_utf8():
pa.array(vec, pa.string(), mask=np.array([False]))
+@pytest.mark.numpy
@pytest.mark.slow
@pytest.mark.large_memory
def test_numpy_binary_overflow_to_chunked():
@@ -3237,6 +3298,7 @@ def test_list_child_overflow_to_chunked():
assert len(arr.chunk(1)) == 1
+@pytest.mark.numpy
def test_infer_type_masked():
# ARROW-5208
ty = pa.infer_type(['foo', 'bar', None, 2],
@@ -3252,6 +3314,7 @@ def test_infer_type_masked():
assert pa.infer_type([], mask=[]) == pa.null()
+@pytest.mark.numpy
def test_array_masked():
# ARROW-5208
arr = pa.array([4, None, 4, 3.],
@@ -3264,6 +3327,7 @@ def test_array_masked():
assert arr.type == pa.int64()
+@pytest.mark.numpy
def test_array_supported_masks():
# ARROW-13883
arr = pa.array([4, None, 4, 3.],
@@ -3322,6 +3386,7 @@ def test_array_supported_pandas_masks():
assert arr.to_pylist() == [None, 1]
+@pytest.mark.numpy
def test_binary_array_masked():
# ARROW-12431
masked_basic = pa.array([b'\x05'], type=pa.binary(1),
@@ -3354,6 +3419,7 @@ def test_binary_array_masked():
assert ([b'aaa', b'bbb', b'ccc']*10) == arrow_array.to_pylist()
+@pytest.mark.numpy
def test_binary_array_strided():
# Masked
nparray = np.array([b"ab", b"cd", b"ef"])
@@ -3367,6 +3433,7 @@ def test_binary_array_strided():
assert [b"ab", b"ef"] == arrow_array.to_pylist()
+@pytest.mark.numpy
def test_array_invalid_mask_raises():
# ARROW-10742
cases = [
@@ -3400,6 +3467,7 @@ def test_array_from_large_pyints():
pa.array([int(2 ** 63)])
+@pytest.mark.numpy
def test_numpy_array_protocol():
# test the __array__ method on pyarrow.Array
arr = pa.array([1, 2, 3])
@@ -3446,6 +3514,7 @@ def test_numpy_array_protocol():
assert result.dtype == "float64"
+@pytest.mark.numpy
def test_array_protocol():
class MyArray:
@@ -3769,6 +3838,7 @@ def test_run_end_encoded_from_buffers():
1, offset, children)
+@pytest.mark.numpy
def test_run_end_encoded_from_array_with_type():
run_ends = [1, 3, 6]
values = [1, 2, 3]
@@ -3808,6 +3878,7 @@ def test_run_end_encoded_from_array_with_type():
assert result.equals(expected)
+@pytest.mark.numpy
def test_run_end_encoded_to_numpy():
arr = [1, 2, 2, 3, 3, 3]
ree_array = pa.array(arr, pa.run_end_encoded(pa.int32(), pa.int64()))
@@ -4023,6 +4094,7 @@ def test_list_view_slice(list_view_type):
assert sliced_array[0].as_py() == sliced_array.values[i:j].to_pylist() ==
[4]
+@pytest.mark.numpy
@pytest.mark.parametrize('numpy_native_dtype', ['u2', 'i4', 'f8'])
def test_swapped_byte_order_fails(numpy_native_dtype):
# ARROW-39129
diff --git a/python/pyarrow/tests/test_builder.py
b/python/pyarrow/tests/test_builder.py
index abc8a0013d..9187a19b5f 100644
--- a/python/pyarrow/tests/test_builder.py
+++ b/python/pyarrow/tests/test_builder.py
@@ -15,10 +15,9 @@
# specific language governing permissions and limitations
# under the License.
+import math
import weakref
-import numpy as np
-
import pyarrow as pa
from pyarrow.lib import StringBuilder, StringViewBuilder
@@ -35,7 +34,7 @@ def test_string_builder_append():
sbuilder = StringBuilder()
sbuilder.append(b"a byte string")
sbuilder.append("a string")
- sbuilder.append(np.nan)
+ sbuilder.append(math.nan)
sbuilder.append(None)
assert len(sbuilder) == 4
assert sbuilder.null_count == 2
@@ -50,7 +49,7 @@ def test_string_builder_append():
def test_string_builder_append_values():
sbuilder = StringBuilder()
- sbuilder.append_values([np.nan, None, "text", None, "other text"])
+ sbuilder.append_values([math.nan, None, "text", None, "other text"])
assert sbuilder.null_count == 3
arr = sbuilder.finish()
assert arr.null_count == 3
@@ -60,7 +59,7 @@ def test_string_builder_append_values():
def test_string_builder_append_after_finish():
sbuilder = StringBuilder()
- sbuilder.append_values([np.nan, None, "text", None, "other text"])
+ sbuilder.append_values([math.nan, None, "text", None, "other text"])
arr = sbuilder.finish()
sbuilder.append("No effect")
expected = [None, None, "text", None, "other text"]
@@ -72,7 +71,7 @@ def test_string_view_builder():
builder.append(b"a byte string")
builder.append("a string")
builder.append("a longer not-inlined string")
- builder.append(np.nan)
+ builder.append(math.nan)
builder.append_values([None, "text"])
assert len(builder) == 6
assert builder.null_count == 2
diff --git a/python/pyarrow/tests/test_compute.py
b/python/pyarrow/tests/test_compute.py
index 64fe7f1deb..d4307cd24f 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -28,7 +28,10 @@ import random
import sys
import textwrap
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
try:
import pandas as pd
@@ -44,27 +47,6 @@ try:
except ImportError:
pas = None
-all_array_types = [
- ('bool', [True, False, False, True, True]),
- ('uint8', np.arange(5)),
- ('int8', np.arange(5)),
- ('uint16', np.arange(5)),
- ('int16', np.arange(5)),
- ('uint32', np.arange(5)),
- ('int32', np.arange(5)),
- ('uint64', np.arange(5, 10)),
- ('int64', np.arange(5, 10)),
- ('float', np.arange(0, 0.5, 0.1)),
- ('double', np.arange(0, 0.5, 0.1)),
- ('string', ['a', 'b', None, 'ddd', 'ee']),
- ('binary', [b'a', b'b', b'c', b'ddd', b'ee']),
- (pa.binary(3), [b'abc', b'bcd', b'cde', b'def', b'efg']),
- (pa.list_(pa.int8()), [[1, 2], [3, 4], [5, 6], None, [9, 16]]),
- (pa.large_list(pa.int16()), [[1], [2, 3, 4], [5, 6], None, [9, 16]]),
- (pa.struct([('a', pa.int8()), ('b', pa.int8())]), [
- {'a': 1, 'b': 2}, None, {'a': 3, 'b': 4}, None, {'a': 5, 'b': 6}]),
-]
-
exported_functions = [
func for (name, func) in sorted(pc.__dict__.items())
if hasattr(func, '__arrow_compute_function__')]
@@ -87,6 +69,28 @@ numerical_arrow_types = [
]
+all_array_types = [
+ ('bool', [True, False, False, True, True]),
+ ('uint8', range(5)),
+ ('int8', range(5)),
+ ('uint16', range(5)),
+ ('int16', range(5)),
+ ('uint32', range(5)),
+ ('int32', range(5)),
+ ('uint64', range(5, 10)),
+ ('int64', range(5, 10)),
+ ('float', [0, 0.1, 0.2, 0.3, 0.4]),
+ ('double', [0, 0.1, 0.2, 0.3, 0.4]),
+ ('string', ['a', 'b', None, 'ddd', 'ee']),
+ ('binary', [b'a', b'b', b'c', b'ddd', b'ee']),
+ (pa.binary(3), [b'abc', b'bcd', b'cde', b'def', b'efg']),
+ (pa.list_(pa.int8()), [[1, 2], [3, 4], [5, 6], None, [9, 16]]),
+ (pa.large_list(pa.int16()), [[1], [2, 3, 4], [5, 6], None, [9, 16]]),
+ (pa.struct([('a', pa.int8()), ('b', pa.int8())]), [
+ {'a': 1, 'b': 2}, None, {'a': 3, 'b': 4}, None, {'a': 5, 'b': 6}]),
+]
+
+
def test_exported_functions():
# Check that all exported concrete functions can be called with
# the right number of arguments.
@@ -263,6 +267,7 @@ def test_get_function_hash_aggregate():
pc.HashAggregateKernel, 1)
+@pytest.mark.numpy
def test_call_function_with_memory_pool():
arr = pa.array(["foo", "bar", "baz"])
indices = np.array([2, 2, 1])
@@ -1172,7 +1177,7 @@ def test_take_on_chunked_array():
]
])
- indices = np.array([0, 5, 1, 6, 9, 2])
+ indices = pa.array([0, 5, 1, 6, 9, 2])
result = arr.take(indices)
expected = pa.chunked_array([["a", "f", "b", "g", "j", "c"]])
assert result.equals(expected)
@@ -1304,12 +1309,6 @@ def test_filter(ty, values):
result.validate()
assert result.equals(pa.array([values[0], values[3], None], type=ty))
- # same test with different array type
- mask = np.array([True, False, False, True, None])
- result = arr.filter(mask, null_selection_behavior='drop')
- result.validate()
- assert result.equals(pa.array([values[0], values[3]], type=ty))
-
# non-boolean dtype
mask = pa.array([0, 1, 0, 1, 0])
with pytest.raises(NotImplementedError):
@@ -1321,6 +1320,17 @@ def test_filter(ty, values):
arr.filter(mask)
+@pytest.mark.numpy
+@pytest.mark.parametrize(('ty', 'values'), all_array_types)
+def test_filter_numpy_array_mask(ty, values):
+ arr = pa.array(values, type=ty)
+ # same test as test_filter with different array type
+ mask = np.array([True, False, False, True, None])
+ result = arr.filter(mask, null_selection_behavior='drop')
+ result.validate()
+ assert result.equals(pa.array([values[0], values[3]], type=ty))
+
+
def test_filter_chunked_array():
arr = pa.chunked_array([["a", None], ["c", "d", "e"]])
expected_drop = pa.chunked_array([["a"], ["e"]])
@@ -1586,9 +1596,11 @@ def test_round_to_integer(ty):
for round_mode, expected in rmode_and_expected.items():
options = RoundOptions(round_mode=round_mode)
result = round(values, options=options)
- np.testing.assert_array_equal(result, pa.array(expected))
+ expected_array = pa.array(expected, type=pa.float64())
+ assert expected_array.equals(result)
+@pytest.mark.numpy
def test_round():
values = [320, 3.5, 3.075, 4.5, -3.212, -35.1234, -3.045, None]
ndigits_and_expected = {
@@ -1607,6 +1619,7 @@ def test_round():
assert pc.round(values, ndigits, "half_towards_infinity") == result
+@pytest.mark.numpy
def test_round_to_multiple():
values = [320, 3.5, 3.075, 4.5, -3.212, -35.1234, -3.045, None]
multiple_and_expected = {
@@ -1670,7 +1683,7 @@ def test_is_null():
expected = pa.chunked_array([[True, True], [True, False]])
assert result.equals(expected)
- arr = pa.array([1, 2, 3, None, np.nan])
+ arr = pa.array([1, 2, 3, None, float("nan")])
result = arr.is_null()
expected = pa.array([False, False, False, True, False])
assert result.equals(expected)
@@ -1681,7 +1694,7 @@ def test_is_null():
def test_is_nan():
- arr = pa.array([1, 2, 3, None, np.nan])
+ arr = pa.array([1, 2, 3, None, float("nan")])
result = arr.is_nan()
expected = pa.array([False, False, False, None, True])
assert result.equals(expected)
@@ -1986,6 +1999,7 @@ def check_cast_float_to_decimal(float_ty, float_val,
decimal_ty, decimal_ctx,
# Cannot test float32 as case generators above assume float64
+@pytest.mark.numpy
@pytest.mark.parametrize('float_ty', [pa.float64()], ids=str)
@pytest.mark.parametrize('decimal_ty', decimal_type_traits,
ids=lambda v: v.name)
@@ -2003,6 +2017,7 @@ def test_cast_float_to_decimal(float_ty, decimal_ty,
case_generator):
ctx, decimal_ty.max_precision)
+@pytest.mark.numpy
@pytest.mark.parametrize('float_ty', [pa.float32(), pa.float64()], ids=str)
@pytest.mark.parametrize('decimal_traits', decimal_type_traits,
ids=lambda v: v.name)
@@ -2908,6 +2923,7 @@ def test_min_max_element_wise():
assert result == pa.array([1, 2, None])
+@pytest.mark.numpy
@pytest.mark.parametrize('start', (1.25, 10.5, -10.5))
@pytest.mark.parametrize('skip_nulls', (True, False))
def test_cumulative_sum(start, skip_nulls):
@@ -2962,6 +2978,7 @@ def test_cumulative_sum(start, skip_nulls):
pc.cumulative_sum([1, 2, 3], start=strt)
+@pytest.mark.numpy
@pytest.mark.parametrize('start', (1.25, 10.5, -10.5))
@pytest.mark.parametrize('skip_nulls', (True, False))
def test_cumulative_prod(start, skip_nulls):
@@ -3016,6 +3033,7 @@ def test_cumulative_prod(start, skip_nulls):
pc.cumulative_prod([1, 2, 3], start=strt)
+@pytest.mark.numpy
@pytest.mark.parametrize('start', (0.5, 3.5, 6.5))
@pytest.mark.parametrize('skip_nulls', (True, False))
def test_cumulative_max(start, skip_nulls):
@@ -3073,6 +3091,7 @@ def test_cumulative_max(start, skip_nulls):
pc.cumulative_max([1, 2, 3], start=strt)
+@pytest.mark.numpy
@pytest.mark.parametrize('start', (0.5, 3.5, 6.5))
@pytest.mark.parametrize('skip_nulls', (True, False))
def test_cumulative_min(start, skip_nulls):
@@ -3407,6 +3426,7 @@ def create_sample_expressions():
# Tests the Arrow-specific serialization mechanism
+@pytest.mark.numpy
def test_expression_serialization_arrow(pickle_module):
for expr in create_sample_expressions()["all"]:
assert isinstance(expr, pc.Expression)
@@ -3414,6 +3434,7 @@ def test_expression_serialization_arrow(pickle_module):
assert expr.equals(restored)
+@pytest.mark.numpy
@pytest.mark.substrait
def test_expression_serialization_substrait():
diff --git a/python/pyarrow/tests/test_convert_builtin.py
b/python/pyarrow/tests/test_convert_builtin.py
index 6140163a8e..c3589877e6 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -23,8 +23,11 @@ import math
import re
import hypothesis as h
-import numpy as np
import pytest
+try:
+ import numpy as np
+except ImportError:
+ np = None
from pyarrow.pandas_compat import _pandas_api # noqa
import pyarrow as pa
@@ -32,17 +35,17 @@ import pyarrow.tests.strategies as past
int_type_pairs = [
- (np.int8, pa.int8()),
- (np.int16, pa.int16()),
- (np.int32, pa.int32()),
- (np.int64, pa.int64()),
- (np.uint8, pa.uint8()),
- (np.uint16, pa.uint16()),
- (np.uint32, pa.uint32()),
- (np.uint64, pa.uint64())]
+ ("int8", pa.int8()),
+ ("int16", pa.int16()),
+ ("int32", pa.int32()),
+ ("int64", pa.int64()),
+ ("uint8", pa.uint8()),
+ ("uint16", pa.uint16()),
+ ("uint32", pa.uint32()),
+ ("uint64", pa.uint64())]
-np_int_types, pa_int_types = zip(*int_type_pairs)
+np_str_int_types, pa_int_types = zip(*int_type_pairs)
class StrangeIterable:
@@ -174,7 +177,9 @@ def _as_set(xs):
return set(xs)
-SEQUENCE_TYPES = [_as_list, _as_tuple, _as_numpy_array]
+SEQUENCE_TYPES = [_as_list, _as_tuple]
+if np is not None:
+ SEQUENCE_TYPES.append(_as_numpy_array)
ITERABLE_TYPES = [_as_set, _as_dict_values] + SEQUENCE_TYPES
COLLECTIONS_TYPES = [_as_deque] + ITERABLE_TYPES
@@ -217,6 +222,7 @@ def test_sequence_boolean(seq):
assert arr.to_pylist() == expected
+@pytest.mark.numpy
@parametrize_with_sequence_types
def test_sequence_numpy_boolean(seq):
expected = [np.bool_(True), None, np.bool_(False), None]
@@ -225,6 +231,7 @@ def test_sequence_numpy_boolean(seq):
assert arr.to_pylist() == [True, None, False, None]
+@pytest.mark.numpy
@parametrize_with_sequence_types
def test_sequence_mixed_numpy_python_bools(seq):
values = np.array([True, False])
@@ -278,11 +285,14 @@ def test_list_with_non_list(seq):
@parametrize_with_sequence_types
+@pytest.mark.parametrize(
+ "inner_seq", SEQUENCE_TYPES
+)
@pytest.mark.parametrize("factory", [
pa.list_, pa.large_list, pa.list_view, pa.large_list_view])
-def test_nested_arrays(seq, factory):
- arr = pa.array(seq([np.array([], dtype=np.int64),
- np.array([1, 2], dtype=np.int64), None]),
+def test_nested_arrays(seq, inner_seq, factory):
+ arr = pa.array(seq([inner_seq([]),
+ inner_seq([1, 2]), None]),
type=factory(pa.int64()))
assert len(arr) == 3
assert arr.null_count == 1
@@ -290,6 +300,7 @@ def test_nested_arrays(seq, factory):
assert arr.to_pylist() == [[], [1, 2], None]
+@pytest.mark.numpy
@parametrize_with_sequence_types
def test_nested_fixed_size_list(seq):
# sequence of lists
@@ -334,10 +345,12 @@ def test_sequence_all_none(seq):
assert arr.to_pylist() == [None, None]
+@pytest.mark.numpy
@parametrize_with_sequence_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_integer(seq, np_scalar_pa_type):
- np_scalar, pa_type = np_scalar_pa_type
+ np_str_scalar, pa_type = np_scalar_pa_type
+ np_scalar = getattr(np, np_str_scalar)
expected = [1, None, 3, None,
np.iinfo(np_scalar).min, np.iinfo(np_scalar).max]
arr = pa.array(seq(expected), type=pa_type)
@@ -347,12 +360,12 @@ def test_sequence_integer(seq, np_scalar_pa_type):
assert arr.to_pylist() == expected
+@pytest.mark.numpy
@parametrize_with_collections_types
-@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
-def test_sequence_integer_np_nan(seq, np_scalar_pa_type):
+@pytest.mark.parametrize("pa_type", pa_int_types)
+def test_sequence_integer_np_nan(seq, pa_type):
# ARROW-2806: numpy.nan is a double value and thus should produce
# a double array.
- _, pa_type = np_scalar_pa_type
with pytest.raises(ValueError):
pa.array(seq([np.nan]), type=pa_type, from_pandas=False)
@@ -364,12 +377,12 @@ def test_sequence_integer_np_nan(seq, np_scalar_pa_type):
assert arr.to_pylist() == expected
+@pytest.mark.numpy
@parametrize_with_sequence_types
-@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
-def test_sequence_integer_nested_np_nan(seq, np_scalar_pa_type):
+@pytest.mark.parametrize("pa_type", pa_int_types)
+def test_sequence_integer_nested_np_nan(seq, pa_type):
# ARROW-2806: numpy.nan is a double value and thus should produce
# a double array.
- _, pa_type = np_scalar_pa_type
with pytest.raises(ValueError):
pa.array(seq([[np.nan]]), type=pa.list_(pa_type), from_pandas=False)
@@ -391,10 +404,12 @@ def test_sequence_integer_inferred(seq):
assert arr.to_pylist() == expected
+@pytest.mark.numpy
@parametrize_with_sequence_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_numpy_integer(seq, np_scalar_pa_type):
- np_scalar, pa_type = np_scalar_pa_type
+ np_str_scalar, pa_type = np_scalar_pa_type
+ np_scalar = getattr(np, np_str_scalar)
expected = [np_scalar(1), None, np_scalar(3), None,
np_scalar(np.iinfo(np_scalar).min),
np_scalar(np.iinfo(np_scalar).max)]
@@ -405,10 +420,12 @@ def test_sequence_numpy_integer(seq, np_scalar_pa_type):
assert arr.to_pylist() == expected
+@pytest.mark.numpy
@parametrize_with_sequence_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_numpy_integer_inferred(seq, np_scalar_pa_type):
- np_scalar, pa_type = np_scalar_pa_type
+ np_str_scalar, pa_type = np_scalar_pa_type
+ np_scalar = getattr(np, np_str_scalar)
expected = [np_scalar(1), None, np_scalar(3), None]
expected += [np_scalar(np.iinfo(np_scalar).min),
np_scalar(np.iinfo(np_scalar).max)]
@@ -434,6 +451,7 @@ def test_broken_integers(seq):
pa.array(seq(data), type=pa.int64())
+@pytest.mark.numpy
def test_numpy_scalars_mixed_type():
# ARROW-4324
data = [np.int32(10), np.float32(0.5)]
@@ -448,6 +466,7 @@ def test_numpy_scalars_mixed_type():
assert arr.equals(expected)
+@pytest.mark.numpy
@pytest.mark.xfail(reason="Type inference for uint64 not implemented",
raises=OverflowError)
def test_uint64_max_convert():
@@ -491,7 +510,7 @@ def test_integer_from_string_error(seq, typ):
def test_convert_with_mask():
data = [1, 2, 3, 4, 5]
- mask = np.array([False, True, False, False, True])
+ mask = [False, True, False, False, True]
result = pa.array(data, mask=mask)
expected = pa.array([1, None, 3, 4, None])
@@ -559,6 +578,7 @@ def test_double_integer_coerce_representable_range():
pa.array(invalid_values2)
+@pytest.mark.numpy
def test_float32_integer_coerce_representable_range():
f32 = np.float32
valid_values = [f32(1.5), 1 << 24, -(1 << 24)]
@@ -587,14 +607,16 @@ def test_mixed_sequence_errors():
pa.array([1.5, 'foo'])
+@pytest.mark.numpy
@parametrize_with_sequence_types
-@pytest.mark.parametrize("np_scalar,pa_type", [
- (np.float16, pa.float16()),
- (np.float32, pa.float32()),
- (np.float64, pa.float64())
+@pytest.mark.parametrize("np_str_scalar,pa_type", [
+ ("float16", pa.float16()),
+ ("float32", pa.float32()),
+ ("float64", pa.float64())
])
@pytest.mark.parametrize("from_pandas", [True, False])
-def test_sequence_numpy_double(seq, np_scalar, pa_type, from_pandas):
+def test_sequence_numpy_double(seq, np_str_scalar, pa_type, from_pandas):
+ np_scalar = getattr(np, np_str_scalar)
data = [np_scalar(1.5), np_scalar(1), None, np_scalar(2.5), None, np.nan]
arr = pa.array(seq(data), from_pandas=from_pandas)
assert len(arr) == 6
@@ -616,27 +638,29 @@ def test_sequence_numpy_double(seq, np_scalar, pa_type,
from_pandas):
assert np.isnan(arr.to_pylist()[5])
+@pytest.mark.numpy
@pytest.mark.parametrize("from_pandas", [True, False])
-@pytest.mark.parametrize("inner_seq", [np.array, list])
-def test_ndarray_nested_numpy_double(from_pandas, inner_seq):
+def test_ndarray_nested_numpy_double(from_pandas):
# ARROW-2806
- data = np.array([
- inner_seq([1., 2.]),
- inner_seq([1., 2., 3.]),
- inner_seq([np.nan]),
- None
- ], dtype=object)
- arr = pa.array(data, from_pandas=from_pandas)
- assert len(arr) == 4
- assert arr.null_count == 1
- assert arr.type == pa.list_(pa.float64())
- if from_pandas:
- assert arr.to_pylist() == [[1.0, 2.0], [1.0, 2.0, 3.0], [None], None]
- else:
- np.testing.assert_equal(arr.to_pylist(),
- [[1., 2.], [1., 2., 3.], [np.nan], None])
+ for inner_seq in (np.array, list):
+ data = np.array([
+ inner_seq([1., 2.]),
+ inner_seq([1., 2., 3.]),
+ inner_seq([np.nan]),
+ None
+ ], dtype=object)
+ arr = pa.array(data, from_pandas=from_pandas)
+ assert len(arr) == 4
+ assert arr.null_count == 1
+ assert arr.type == pa.list_(pa.float64())
+ if from_pandas:
+ assert arr.to_pylist() == [[1.0, 2.0], [1.0, 2.0, 3.0], [None],
None]
+ else:
+ np.testing.assert_equal(arr.to_pylist(),
+ [[1., 2.], [1., 2., 3.], [np.nan], None])
+@pytest.mark.numpy
def test_nested_ndarray_in_object_array():
# ARROW-4350
arr = np.empty(2, dtype=object)
@@ -664,6 +688,7 @@ def test_nested_ndarray_in_object_array():
assert result.to_pylist() == [[[1], [2]], [[1], [2]]]
+@pytest.mark.numpy
@pytest.mark.xfail(reason=("Type inference for multidimensional ndarray "
"not yet implemented"),
raises=AssertionError)
@@ -682,6 +707,7 @@ def test_multidimensional_ndarray_as_nested_list():
assert result.equals(expected)
+@pytest.mark.numpy
@pytest.mark.parametrize(('data', 'value_type'), [
([True, False], pa.bool_()),
([None, None], pa.null()),
@@ -711,6 +737,7 @@ def test_list_array_from_object_ndarray(data, value_type):
assert arr.to_pylist() == [data]
+@pytest.mark.numpy
@pytest.mark.parametrize(('data', 'value_type'), [
([[1, 2], [3]], pa.list_(pa.int64())),
([[1, 2], [3, 4]], pa.list_(pa.int64(), 2)),
@@ -730,13 +757,14 @@ def test_array_ignore_nan_from_pandas():
# See ARROW-4324, this reverts logic that was introduced in
# ARROW-2240
with pytest.raises(ValueError):
- pa.array([np.nan, 'str'])
+ pa.array([float("nan"), 'str'])
- arr = pa.array([np.nan, 'str'], from_pandas=True)
+ arr = pa.array([float("nan"), 'str'], from_pandas=True)
expected = pa.array([None, 'str'])
assert arr.equals(expected)
+@pytest.mark.numpy
def test_nested_ndarray_different_dtypes():
data = [
np.array([1, 2, 3], dtype='int64'),
@@ -1238,6 +1266,7 @@ def test_sequence_timestamp_out_of_bounds_nanosecond():
assert arr.to_pylist()[0] == datetime.datetime(2262, 4, 12)
+@pytest.mark.numpy
def test_sequence_numpy_timestamp():
data = [
np.datetime64(datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)),
@@ -1407,14 +1436,25 @@ def test_sequence_timestamp_from_int_with_unit():
pa.array([1, CustomClass()], type=ty)
-@pytest.mark.parametrize('np_scalar', [True, False])
-def test_sequence_duration(np_scalar):
+def test_sequence_duration():
td1 = datetime.timedelta(2, 3601, 1)
td2 = datetime.timedelta(1, 100, 1000)
- if np_scalar:
- data = [np.timedelta64(td1), None, np.timedelta64(td2)]
- else:
- data = [td1, None, td2]
+ data = [td1, None, td2]
+
+ arr = pa.array(data)
+ assert len(arr) == 3
+ assert arr.type == pa.duration('us')
+ assert arr.null_count == 1
+ assert arr[0].as_py() == td1
+ assert arr[1].as_py() is None
+ assert arr[2].as_py() == td2
+
+
+@pytest.mark.numpy
+def test_sequence_duration_np_scalar():
+ td1 = datetime.timedelta(2, 3601, 1)
+ td2 = datetime.timedelta(1, 100, 1000)
+ data = [np.timedelta64(td1), None, np.timedelta64(td2)]
arr = pa.array(data)
assert len(arr) == 3
@@ -1480,6 +1520,7 @@ def
test_sequence_duration_nested_lists_with_explicit_type(factory):
assert arr.to_pylist() == data
+@pytest.mark.numpy
def test_sequence_duration_nested_lists_numpy():
td1 = datetime.timedelta(1, 1, 1000)
td2 = datetime.timedelta(1, 100)
@@ -1769,6 +1810,7 @@ def test_struct_from_dicts_bytes_keys():
]
+@pytest.mark.numpy
def test_struct_from_tuples():
ty = pa.struct([pa.field('a', pa.int32()),
pa.field('b', pa.string()),
@@ -1915,6 +1957,7 @@ def test_struct_from_mixed_sequence():
pa.array(data, type=ty)
+@pytest.mark.numpy
def test_struct_from_dicts_inference():
expected_type = pa.struct([pa.field('a', pa.int64()),
pa.field('b', pa.string()),
@@ -1992,7 +2035,7 @@ def test_structarray_from_arrays_coerce():
def test_decimal_array_with_none_and_nan():
- values = [decimal.Decimal('1.234'), None, np.nan, decimal.Decimal('nan')]
+ values = [decimal.Decimal('1.234'), None, float("nan"),
decimal.Decimal('nan')]
with pytest.raises(TypeError):
# ARROW-6227: Without from_pandas=True, NaN is considered a float
@@ -2215,6 +2258,7 @@ def
test_roundtrip_nanosecond_resolution_pandas_temporal_objects():
]
+@pytest.mark.numpy
@h.given(past.all_arrays)
def test_array_to_pylist_roundtrip(arr):
seq = arr.to_pylist()
@@ -2498,6 +2542,7 @@ def test_array_accepts_pyarrow_scalar(seq, data,
scalar_data, value_type):
assert expect.equals(result)
+@pytest.mark.numpy
@parametrize_with_collections_types
def test_array_accepts_pyarrow_scalar_errors(seq):
sequence = seq([pa.scalar(1), pa.scalar("a"), pa.scalar(3.0)])
diff --git a/python/pyarrow/tests/test_cpp_internals.py
b/python/pyarrow/tests/test_cpp_internals.py
index 83800b77f8..7508d8f0b9 100644
--- a/python/pyarrow/tests/test_cpp_internals.py
+++ b/python/pyarrow/tests/test_cpp_internals.py
@@ -18,6 +18,8 @@
import os.path
from os.path import join as pjoin
+import pytest
+
from pyarrow._pyarrow_cpp_tests import get_cpp_tests
@@ -26,10 +28,16 @@ def inject_cpp_tests(ns):
Inject C++ tests as Python functions into namespace `ns` (a dict).
"""
for case in get_cpp_tests():
+
def wrapper(case=case):
case()
wrapper.__name__ = wrapper.__qualname__ = case.name
wrapper.__module__ = ns['__name__']
+ # Add numpy or pandas marks if the test requires it
+ if 'numpy' in case.name:
+ wrapper = pytest.mark.numpy(wrapper)
+ elif 'pandas' in case.name:
+ wrapper = pytest.mark.pandas(wrapper)
ns[case.name] = wrapper
diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
index 112129d960..dcf96f68c4 100644
--- a/python/pyarrow/tests/test_csv.py
+++ b/python/pyarrow/tests/test_csv.py
@@ -24,6 +24,7 @@ import gzip
import io
import itertools
import os
+import random
import select
import shutil
import signal
@@ -36,8 +37,6 @@ import weakref
import pytest
-import numpy as np
-
import pyarrow as pa
from pyarrow.csv import (
open_csv, read_csv, ReadOptions, ParseOptions, ConvertOptions, ISO8601,
@@ -54,18 +53,32 @@ def generate_col_names():
yield first + second
+def split_rows(arr, num_cols, num_rows):
+ # Split a num_cols x num_rows array into rows
+ for i in range(0, num_rows * num_cols, num_cols):
+ yield arr[i:i + num_cols]
+
+
+def split_columns(arr, num_cols, num_rows):
+ # Split a num_cols x num_rows array into columns
+ for i in range(0, num_cols):
+ yield arr[i::num_cols]
+
+
def make_random_csv(num_cols=2, num_rows=10, linesep='\r\n', write_names=True):
- arr = np.random.RandomState(42).randint(0, 1000, size=(num_cols, num_rows))
+ rnd = random.Random(42)
+ arr = [rnd.randint(0, 1000) for _ in range(num_cols * num_rows)]
csv = io.StringIO()
col_names = list(itertools.islice(generate_col_names(), num_cols))
if write_names:
csv.write(",".join(col_names))
csv.write(linesep)
- for row in arr.T:
+ for row in split_rows(arr, num_cols, num_rows):
csv.write(",".join(map(str, row)))
csv.write(linesep)
csv = csv.getvalue().encode()
- columns = [pa.array(a, type=pa.int64()) for a in arr]
+ columns = [pa.array(row, type=pa.int64())
+ for row in split_columns(arr, num_cols, num_rows)]
expected = pa.Table.from_arrays(columns, col_names)
return csv, expected
@@ -127,6 +140,25 @@ class InvalidRowHandler:
other.result != self.result)
+def test_split_rows_and_columns_utility():
+ num_cols = 5
+ num_rows = 2
+ arr = [x for x in range(1, 11)]
+ rows = list(split_rows(arr, num_cols, num_rows))
+ assert rows == [
+ [1, 2, 3, 4, 5],
+ [6, 7, 8, 9, 10]
+ ]
+ columns = list(split_columns(arr, num_cols, num_rows))
+ assert columns == [
+ [1, 6],
+ [2, 7],
+ [3, 8],
+ [4, 9],
+ [5, 10]
+ ]
+
+
def test_read_options(pickle_module):
cls = ReadOptions
opts = cls()
@@ -520,6 +552,7 @@ class BaseTestCSV(abc.ABC):
assert (values[opts.skip_rows + opts.skip_rows_after_names:] ==
table_dict[name])
+ @pytest.mark.numpy
def test_row_number_offset_in_errors(self):
# Row numbers are only correctly counted in serial reads
def format_msg(msg_format, row, *args):
@@ -1802,6 +1835,7 @@ class BaseStreamingCSVRead(BaseTestCSV):
with pytest.raises(StopIteration):
assert reader.read_next_batch()
+ @pytest.mark.numpy
def test_skip_rows_after_names(self):
super().test_skip_rows_after_names()
diff --git a/python/pyarrow/tests/test_cuda.py
b/python/pyarrow/tests/test_cuda.py
index d55be651b1..a71fa03650 100644
--- a/python/pyarrow/tests/test_cuda.py
+++ b/python/pyarrow/tests/test_cuda.py
@@ -26,7 +26,10 @@ import sysconfig
import pytest
import pyarrow as pa
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ pytestmark = pytest.mark.numpy
cuda = pytest.importorskip("pyarrow.cuda")
diff --git a/python/pyarrow/tests/test_cuda_numba_interop.py
b/python/pyarrow/tests/test_cuda_numba_interop.py
index ff1722d278..876f3c7f76 100644
--- a/python/pyarrow/tests/test_cuda_numba_interop.py
+++ b/python/pyarrow/tests/test_cuda_numba_interop.py
@@ -17,7 +17,10 @@
import pytest
import pyarrow as pa
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ pytestmark = pytest.mark.numpy
dtypes = ['uint8', 'int16', 'float32']
cuda = pytest.importorskip("pyarrow.cuda")
diff --git a/python/pyarrow/tests/test_cython.py
b/python/pyarrow/tests/test_cython.py
index 0eeae5d65f..937d927f83 100644
--- a/python/pyarrow/tests/test_cython.py
+++ b/python/pyarrow/tests/test_cython.py
@@ -80,6 +80,9 @@ def check_cython_example_module(mod):
mod.cast_scalar(scal, pa.list_(pa.int64()))
+# NumPy is still a required build dependency. It is present in our
+# headers and is required to build for the cython tests.
+@pytest.mark.numpy
@pytest.mark.cython
def test_cython_api(tmpdir):
"""
@@ -162,6 +165,7 @@ def test_cython_api(tmpdir):
env=subprocess_env)
+@pytest.mark.numpy
@pytest.mark.cython
def test_visit_strings(tmpdir):
with tmpdir.as_cwd():
diff --git a/python/pyarrow/tests/test_dataset.py
b/python/pyarrow/tests/test_dataset.py
index 3b0284bcb7..276cd2e78d 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -20,6 +20,7 @@ import datetime
import os
import pathlib
import posixpath
+import random
import sys
import tempfile
import textwrap
@@ -28,7 +29,10 @@ import time
from shutil import copytree
from urllib.parse import quote
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
import pytest
import pyarrow as pa
@@ -684,8 +688,8 @@ def test_partitioning():
# test partitioning roundtrip
table = pa.table([
- pa.array(range(20)), pa.array(np.random.randn(20)),
- pa.array(np.repeat(['a', 'b'], 10))],
+ pa.array(range(20)), pa.array(random.random() for _ in range(20)),
+ pa.array(['a'] * 10 + ['b'] * 10)],
names=["f1", "f2", "part"]
)
partitioning_schema = pa.schema([("part", pa.string())])
@@ -2494,7 +2498,7 @@ def _create_partitioned_dataset(basedir):
pq.write_table(table.slice(3*i, 3), part / "test.parquet")
full_table = table.append_column(
- "part", pa.array(np.repeat([0, 1, 2], 3), type=pa.int32()))
+ "part", pa.array([0] * 3 + [1] * 3 + [2] * 3, type=pa.int32()))
return full_table, path
@@ -2532,7 +2536,7 @@ def test_open_dataset_partitioned_directory(tempdir,
dataset_reader, pickle_modu
result = dataset.to_table()
expected = table.append_column(
- "part", pa.array(np.repeat([0, 1, 2], 3), type=pa.int8()))
+ "part", pa.array([0] * 3 + [1] * 3 + [2] * 3, type=pa.int8()))
assert result.equals(expected)
@@ -3567,7 +3571,7 @@ def _create_parquet_dataset_simple(root_path):
metadata_collector = []
for i in range(4):
- table = pa.table({'f1': [i] * 10, 'f2': np.random.randn(10)})
+ table = pa.table({'f1': [i] * 10, 'f2': [random.random() for _ in
range(10)]})
pq.write_to_dataset(
table, str(root_path), metadata_collector=metadata_collector
)
@@ -4255,7 +4259,7 @@ def test_write_dataset_existing_data(tempdir):
def _generate_random_int_array(size=4, min=1, max=10):
- return np.random.randint(min, max, size)
+ return [random.randint(min, max) for _ in range(size)]
def _generate_data_and_columns(num_of_columns, num_of_records):
@@ -4513,8 +4517,8 @@ def test_write_dataset_use_threads(tempdir):
def test_write_table(tempdir):
table = pa.table([
- pa.array(range(20)), pa.array(np.random.randn(20)),
- pa.array(np.repeat(['a', 'b'], 10))
+ pa.array(range(20)), pa.array(random.random() for _ in range(20)),
+ pa.array(['a'] * 10 + ['b'] * 10)
], names=["f1", "f2", "part"])
base_dir = tempdir / 'single'
@@ -4560,8 +4564,8 @@ def test_write_table(tempdir):
def test_write_table_multiple_fragments(tempdir):
table = pa.table([
- pa.array(range(10)), pa.array(np.random.randn(10)),
- pa.array(np.repeat(['a', 'b'], 5))
+ pa.array(range(10)), pa.array(random.random() for _ in range(10)),
+ pa.array(['a'] * 5 + ['b'] * 5)
], names=["f1", "f2", "part"])
table = pa.concat_tables([table]*2)
@@ -4596,8 +4600,8 @@ def test_write_table_multiple_fragments(tempdir):
def test_write_iterable(tempdir):
table = pa.table([
- pa.array(range(20)), pa.array(np.random.randn(20)),
- pa.array(np.repeat(['a', 'b'], 10))
+ pa.array(range(20)), pa.array(random.random() for _ in range(20)),
+ pa.array(['a'] * 10 + ['b'] * 10)
], names=["f1", "f2", "part"])
base_dir = tempdir / 'inmemory_iterable'
@@ -4618,8 +4622,8 @@ def test_write_iterable(tempdir):
def test_write_scanner(tempdir, dataset_reader):
table = pa.table([
- pa.array(range(20)), pa.array(np.random.randn(20)),
- pa.array(np.repeat(['a', 'b'], 10))
+ pa.array(range(20)), pa.array(random.random() for _ in range(20)),
+ pa.array(['a'] * 10 + ['b'] * 10)
], names=["f1", "f2", "part"])
dataset = ds.dataset(table)
@@ -4647,7 +4651,7 @@ def test_write_table_partitioned_dict(tempdir):
# specifying the dictionary values explicitly
table = pa.table([
pa.array(range(20)),
- pa.array(np.repeat(['a', 'b'], 10)).dictionary_encode(),
+ pa.array(['a'] * 10 + ['b'] * 10).dictionary_encode(),
], names=['col', 'part'])
partitioning = ds.partitioning(table.select(["part"]).schema)
@@ -4666,6 +4670,7 @@ def test_write_table_partitioned_dict(tempdir):
assert result.equals(table)
+@pytest.mark.numpy
@pytest.mark.parquet
def test_write_dataset_parquet(tempdir):
table = pa.table([
@@ -4712,8 +4717,8 @@ def test_write_dataset_parquet(tempdir):
def test_write_dataset_csv(tempdir):
table = pa.table([
- pa.array(range(20)), pa.array(np.random.randn(20)),
- pa.array(np.repeat(['a', 'b'], 10))
+ pa.array(range(20)), pa.array(random.random() for _ in range(20)),
+ pa.array(['a'] * 10 + ['b'] * 10)
], names=["f1", "f2", "chr1"])
base_dir = tempdir / 'csv_dataset'
@@ -4739,8 +4744,8 @@ def test_write_dataset_csv(tempdir):
@pytest.mark.parquet
def test_write_dataset_parquet_file_visitor(tempdir):
table = pa.table([
- pa.array(range(20)), pa.array(np.random.randn(20)),
- pa.array(np.repeat(['a', 'b'], 10))
+ pa.array(range(20)), pa.array(random.random() for _ in range(20)),
+ pa.array(['a'] * 10 + ['b'] * 10)
], names=["f1", "f2", "part"])
visitor_called = False
@@ -4763,7 +4768,7 @@ def test_partition_dataset_parquet_file_visitor(tempdir):
f1_vals = [item for chunk in range(4) for item in [chunk] * 10]
f2_vals = [item*10 for chunk in range(4) for item in [chunk] * 10]
table = pa.table({'f1': f1_vals, 'f2': f2_vals,
- 'part': np.repeat(['a', 'b'], 20)})
+ 'part': ['a'] * 20 + ['b'] * 20})
root_path = tempdir / 'partitioned'
partitioning = ds.partitioning(
@@ -4841,8 +4846,8 @@ def test_write_dataset_s3(s3_example_simple):
)
table = pa.table([
- pa.array(range(20)), pa.array(np.random.randn(20)),
- pa.array(np.repeat(['a', 'b'], 10))],
+ pa.array(range(20)), pa.array(random.random() for _ in range(20)),
+ pa.array(['a'] * 10 + ['b'] * 10)],
names=["f1", "f2", "part"]
)
part = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive")
@@ -4918,8 +4923,8 @@ def test_write_dataset_s3_put_only(s3_server):
_configure_s3_limited_user(s3_server, _minio_put_only_policy)
table = pa.table([
- pa.array(range(20)), pa.array(np.random.randn(20)),
- pa.array(np.repeat(['a', 'b'], 10))],
+ pa.array(range(20)), pa.array(random.random() for _ in range(20)),
+ pa.array(['a']*10 + ['b'] * 10)],
names=["f1", "f2", "part"]
)
part = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive")
diff --git a/python/pyarrow/tests/test_dataset_encryption.py
b/python/pyarrow/tests/test_dataset_encryption.py
index 0d8b4a152a..eb79121b1c 100644
--- a/python/pyarrow/tests/test_dataset_encryption.py
+++ b/python/pyarrow/tests/test_dataset_encryption.py
@@ -17,7 +17,7 @@
import base64
from datetime import timedelta
-import numpy as np
+import random
import pyarrow.fs as fs
import pyarrow as pa
@@ -187,7 +187,10 @@ def test_large_row_encryption_decryption():
row_count = 2**15 + 1
table = pa.Table.from_arrays(
- [pa.array(np.random.rand(row_count), type=pa.float32())], names=["foo"]
+ [pa.array(
+ [random.random() for _ in range(row_count)],
+ type=pa.float32()
+ )], names=["foo"]
)
kms_config = pe.KmsConnectionConfig()
diff --git a/python/pyarrow/tests/test_dlpack.py
b/python/pyarrow/tests/test_dlpack.py
index 7cf3f4acdb..a18accb1e2 100644
--- a/python/pyarrow/tests/test_dlpack.py
+++ b/python/pyarrow/tests/test_dlpack.py
@@ -19,12 +19,20 @@ import ctypes
from functools import wraps
import pytest
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
import pyarrow as pa
from pyarrow.vendored.version import Version
+# Marks all of the tests in this module
+# Ignore these with pytest ... -m 'not numpy'
+pytestmark = pytest.mark.numpy
+
+
def PyCapsule_IsValid(capsule, name):
return ctypes.pythonapi.PyCapsule_IsValid(ctypes.py_object(capsule), name)
== 1
@@ -52,45 +60,45 @@ def check_bytes_allocated(f):
@check_bytes_allocated
@pytest.mark.parametrize(
- ('value_type', 'np_type'),
+ ('value_type', 'np_type_str'),
[
- (pa.uint8(), np.uint8),
- (pa.uint16(), np.uint16),
- (pa.uint32(), np.uint32),
- (pa.uint64(), np.uint64),
- (pa.int8(), np.int8),
- (pa.int16(), np.int16),
- (pa.int32(), np.int32),
- (pa.int64(), np.int64),
- (pa.float16(), np.float16),
- (pa.float32(), np.float32),
- (pa.float64(), np.float64),
+ (pa.uint8(), "uint8"),
+ (pa.uint16(), "uint16"),
+ (pa.uint32(), "uint32"),
+ (pa.uint64(), "uint64"),
+ (pa.int8(), "int8"),
+ (pa.int16(), "int16"),
+ (pa.int32(), "int32"),
+ (pa.int64(), "int64"),
+ (pa.float16(), "float16"),
+ (pa.float32(), "float32"),
+ (pa.float64(), "float64"),
]
)
-def test_dlpack(value_type, np_type):
+def test_dlpack(value_type, np_type_str):
if Version(np.__version__) < Version("1.24.0"):
pytest.skip("No dlpack support in numpy versions older than 1.22.0, "
"strict keyword in assert_array_equal added in numpy version "
"1.24.0")
- expected = np.array([1, 2, 3], dtype=np_type)
+ expected = np.array([1, 2, 3], dtype=np.dtype(np_type_str))
arr = pa.array(expected, type=value_type)
check_dlpack_export(arr, expected)
arr_sliced = arr.slice(1, 1)
- expected = np.array([2], dtype=np_type)
+ expected = np.array([2], dtype=np.dtype(np_type_str))
check_dlpack_export(arr_sliced, expected)
arr_sliced = arr.slice(0, 1)
- expected = np.array([1], dtype=np_type)
+ expected = np.array([1], dtype=np.dtype(np_type_str))
check_dlpack_export(arr_sliced, expected)
arr_sliced = arr.slice(1)
- expected = np.array([2, 3], dtype=np_type)
+ expected = np.array([2, 3], dtype=np.dtype(np_type_str))
check_dlpack_export(arr_sliced, expected)
arr_zero = pa.array([], type=value_type)
- expected = np.array([], dtype=np_type)
+ expected = np.array([], dtype=np.dtype(np_type_str))
check_dlpack_export(arr_zero, expected)
diff --git a/python/pyarrow/tests/test_extension_type.py
b/python/pyarrow/tests/test_extension_type.py
index aacbd2cb6e..b74eca75bd 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -23,12 +23,15 @@ import weakref
from uuid import uuid4, UUID
import sys
-import numpy as np
+import pytest
+try:
+ import numpy as np
+except ImportError:
+ np = None
+
import pyarrow as pa
from pyarrow.vendored.version import Version
-import pytest
-
@contextlib.contextmanager
def registered_extension_type(ext_type):
@@ -562,6 +565,7 @@ def test_ext_array_pickling(pickle_module):
assert arr.storage.to_pylist() == [b"foo", b"bar"]
+@pytest.mark.numpy
def test_ext_array_conversion_to_numpy():
storage1 = pa.array([1, 2, 3], type=pa.int64())
storage2 = pa.array([b"123", b"456", b"789"], type=pa.binary(3))
@@ -619,6 +623,7 @@ def struct_w_ext_data():
return [sarr1, sarr2]
+@pytest.mark.numpy
def test_struct_w_ext_array_to_numpy(struct_w_ext_data):
# ARROW-15291
# Check that we don't segfault when trying to build
@@ -1233,6 +1238,7 @@ def test_parquet_extension_nested_in_extension(tmpdir):
assert table == orig_table
+@pytest.mark.numpy
def test_to_numpy():
period_type = PeriodType('D')
storage = pa.array([1, 2, 3, 4], pa.int64())
@@ -1285,7 +1291,11 @@ def test_empty_take():
(["cat", "dog", "horse"], LabelType)
))
@pytest.mark.parametrize(
- "into", ["to_numpy", pytest.param("to_pandas", marks=pytest.mark.pandas)])
+ "into", [
+ pytest.param("to_numpy", marks=pytest.mark.numpy),
+ pytest.param("to_pandas", marks=pytest.mark.pandas)
+ ]
+)
def test_extension_array_to_numpy_pandas(data, ty, into):
storage = pa.array(data)
ext_arr = pa.ExtensionArray.from_storage(ty(), storage)
@@ -1301,6 +1311,7 @@ def test_extension_array_to_numpy_pandas(data, ty, into):
assert np.array_equal(result, expected)
+@pytest.mark.numpy
def test_array_constructor():
ext_type = IntegerType()
storage = pa.array([1, 2, 3], type=pa.int64())
@@ -1333,6 +1344,7 @@ def test_array_constructor_from_pandas():
assert result.equals(expected)
+@pytest.mark.numpy
@pytest.mark.cython
def test_cpp_extension_in_python(tmpdir):
from .test_cython import (
@@ -1430,38 +1442,45 @@ def test_tensor_type():
assert tensor_type.permutation is None
-@pytest.mark.parametrize("value_type", (np.int8(), np.int64(), np.float32()))
-def test_tensor_class_methods(value_type):
+@pytest.mark.numpy
+@pytest.mark.parametrize("np_type_str", ("int8", "int64", "float32"))
+def test_tensor_class_methods(np_type_str):
from numpy.lib.stride_tricks import as_strided
- arrow_type = pa.from_numpy_dtype(value_type)
+ arrow_type = pa.from_numpy_dtype(np.dtype(np_type_str))
tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 3])
storage = pa.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
pa.list_(arrow_type, 6))
arr = pa.ExtensionArray.from_storage(tensor_type, storage)
expected = np.array(
- [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=value_type)
+ [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
+ dtype=np.dtype(np_type_str)
+ )
np.testing.assert_array_equal(arr.to_tensor(), expected)
np.testing.assert_array_equal(arr.to_numpy_ndarray(), expected)
- expected = np.array([[[7, 8, 9], [10, 11, 12]]], dtype=value_type)
+ expected = np.array([[[7, 8, 9], [10, 11, 12]]],
dtype=np.dtype(np_type_str))
result = arr[1:].to_numpy_ndarray()
np.testing.assert_array_equal(result, expected)
values = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]
- flat_arr = np.array(values[0], dtype=value_type)
- bw = value_type.itemsize
+ flat_arr = np.array(values[0], dtype=np.dtype(np_type_str))
+ bw = np.dtype(np_type_str).itemsize
storage = pa.array(values, pa.list_(arrow_type, 12))
tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0,
1, 2])
result = pa.ExtensionArray.from_storage(tensor_type, storage)
expected = np.array(
- [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]],
dtype=value_type)
+ [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]],
+ dtype=np.dtype(np_type_str)
+ )
np.testing.assert_array_equal(result.to_numpy_ndarray(), expected)
result = flat_arr.reshape(1, 2, 3, 2)
expected = np.array(
- [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]],
dtype=value_type)
+ [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]],
+ dtype=np.dtype(np_type_str)
+ )
np.testing.assert_array_equal(result, expected)
tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0,
2, 1])
@@ -1482,25 +1501,27 @@ def test_tensor_class_methods(value_type):
assert result.to_tensor().strides == (12 * bw, 1 * bw, 6 * bw, 2 * bw)
-@pytest.mark.parametrize("value_type", (np.int8(), np.int64(), np.float32()))
-def test_tensor_array_from_numpy(value_type):
+@pytest.mark.numpy
+@pytest.mark.parametrize("np_type_str", ("int8", "int64", "float32"))
+def test_tensor_array_from_numpy(np_type_str):
from numpy.lib.stride_tricks import as_strided
- arrow_type = pa.from_numpy_dtype(value_type)
+ arrow_type = pa.from_numpy_dtype(np.dtype(np_type_str))
arr = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
- dtype=value_type, order="C")
+ dtype=np.dtype(np_type_str), order="C")
tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
assert isinstance(tensor_array_from_numpy.type, pa.FixedShapeTensorType)
assert tensor_array_from_numpy.type.value_type == arrow_type
assert tensor_array_from_numpy.type.shape == [2, 3]
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
- dtype=value_type, order="F")
+ dtype=np.dtype(np_type_str), order="F")
with pytest.raises(ValueError, match="First stride needs to be largest"):
pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
- flat_arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
dtype=value_type)
- bw = value_type.itemsize
+ flat_arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
+ dtype=np.dtype(np_type_str))
+ bw = np.dtype(np_type_str).itemsize
arr = flat_arr.reshape(1, 3, 4)
tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
@@ -1518,23 +1539,26 @@ def test_tensor_array_from_numpy(value_type):
arr = flat_arr.reshape(1, 2, 3, 2)
result = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
expected = np.array(
- [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]], dtype=value_type)
+ [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]],
+ dtype=np.dtype(np_type_str)
+ )
np.testing.assert_array_equal(result.to_numpy_ndarray(), expected)
- arr = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], dtype=value_type)
+ arr = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
+ dtype=np.dtype(np_type_str))
expected = arr[1:]
result =
pa.FixedShapeTensorArray.from_numpy_ndarray(arr)[1:].to_numpy_ndarray()
np.testing.assert_array_equal(result, expected)
- arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=value_type)
+ arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
dtype=np.dtype(np_type_str))
with pytest.raises(ValueError, match="Cannot convert 1D array or scalar to
fixed"):
pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
- arr = np.array(1, dtype=value_type)
+ arr = np.array(1, dtype=np.dtype(np_type_str))
with pytest.raises(ValueError, match="Cannot convert 1D array or scalar to
fixed"):
pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
- arr = np.array([], dtype=value_type)
+ arr = np.array([], dtype=np.dtype(np_type_str))
with pytest.raises(ValueError, match="Cannot convert 1D array or scalar to
fixed"):
pa.FixedShapeTensorArray.from_numpy_ndarray(arr.reshape((0)))
@@ -1546,6 +1570,7 @@ def test_tensor_array_from_numpy(value_type):
pa.FixedShapeTensorArray.from_numpy_ndarray(arr.reshape((3, 0, 2)))
+@pytest.mark.numpy
@pytest.mark.parametrize("tensor_type", (
pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]),
pa.fixed_shape_tensor(pa.int8(), [2, 2, 3], permutation=[0, 2, 1]),
@@ -1801,6 +1826,7 @@ def test_bool8_to_bool_conversion():
assert bool_arr.cast(pa.bool8()) == canonical_bool8_arr
+@pytest.mark.numpy
def test_bool8_to_numpy_conversion():
arr = pa.ExtensionArray.from_storage(
pa.bool8(),
@@ -1841,6 +1867,7 @@ def test_bool8_to_numpy_conversion():
assert arr_to_np_writable.ctypes.data != arr_no_nulls.buffers()[1].address
+@pytest.mark.numpy
def test_bool8_from_numpy_conversion():
np_arr_no_nulls = np.array([True, False, True, True], dtype=np.bool_)
canonical_bool8_arr_no_nulls = pa.ExtensionArray.from_storage(
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index 0064006489..18c8cd5b65 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -23,7 +23,10 @@ import pytest
import hypothesis as h
import hypothesis.strategies as st
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
import pyarrow as pa
import pyarrow.tests.strategies as past
@@ -135,6 +138,7 @@ def _assert_error_on_write(df, exc, path=None, version=2):
pytest.raises(exc, f)
+@pytest.mark.numpy
def test_dataset(version):
num_values = (100, 100)
num_files = 5
@@ -354,6 +358,7 @@ def test_buffer_bounds_error(version):
_check_arrow_roundtrip(table)
+@pytest.mark.numpy
def test_boolean_object_nulls(version):
repeats = 100
table = pa.Table.from_arrays(
@@ -540,6 +545,7 @@ def test_read_columns(version):
columns=['boo', 'woo'])
+@pytest.mark.numpy
def test_overwritten_file(version):
path = random_path()
TEST_FILES.append(path)
@@ -675,6 +681,7 @@ def test_v2_compression_options():
write_feather(df, buf, compression='snappy')
+@pytest.mark.numpy
def test_v2_lz4_default_compression():
# ARROW-8750: Make sure that the compression=None option selects lz4 if
# it's available
@@ -807,6 +814,7 @@ def test_nested_types(compression):
_check_arrow_roundtrip(table, compression=compression)
+@pytest.mark.numpy
@h.given(past.all_tables, st.sampled_from(["uncompressed", "lz4", "zstd"]))
def test_roundtrip(table, compression):
_check_arrow_roundtrip(table, compression=compression)
diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py
index 832c6a2dbd..029a2695b9 100644
--- a/python/pyarrow/tests/test_flight.py
+++ b/python/pyarrow/tests/test_flight.py
@@ -28,7 +28,10 @@ import time
import traceback
import json
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
import pytest
import pyarrow as pa
@@ -1588,6 +1591,7 @@ def test_flight_do_put_metadata():
assert idx == server_idx
+@pytest.mark.numpy
def test_flight_do_put_limit():
"""Try a simple do_put call with a size limit."""
large_batch = pa.RecordBatch.from_arrays([
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index ef499a3a8d..e2df1b1c46 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -25,11 +25,15 @@ import math
import os
import pathlib
import pytest
+import random
import sys
import tempfile
import weakref
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
from pyarrow.util import guid
from pyarrow import Codec
@@ -464,6 +468,7 @@ def test_buffer_hex(val, expected_hex_buffer):
assert buf.hex() == expected_hex_buffer
+@pytest.mark.numpy
def test_buffer_to_numpy():
# Make sure creating a numpy array from an arrow buffer works
byte_array = bytearray(20)
@@ -476,6 +481,7 @@ def test_buffer_to_numpy():
assert array.base == buf
+@pytest.mark.numpy
def test_buffer_from_numpy():
# C-contiguous
arr = np.arange(12, dtype=np.int8).reshape((3, 4))
@@ -493,6 +499,7 @@ def test_buffer_from_numpy():
buf = pa.py_buffer(arr.T[::2])
+@pytest.mark.numpy
def test_buffer_address():
b1 = b'some data!'
b2 = bytearray(b1)
@@ -513,6 +520,7 @@ def test_buffer_address():
assert buf.address == arr.ctypes.data
+@pytest.mark.numpy
def test_buffer_equals():
# Buffer.equals() returns true iff the buffers have the same contents
def eq(a, b):
@@ -624,6 +632,7 @@ def test_buffer_hashing():
hash(pa.py_buffer(b'123'))
+@pytest.mark.numpy
def test_buffer_protocol_respects_immutability():
# ARROW-3228; NumPy's frombuffer ctor determines whether a buffer-like
# object is mutable by first attempting to get a mutable buffer using
@@ -635,6 +644,7 @@ def test_buffer_protocol_respects_immutability():
assert not numpy_ref.flags.writeable
+@pytest.mark.numpy
def test_foreign_buffer():
obj = np.array([1, 2], dtype=np.int32)
addr = obj.__array_interface__["data"][0]
@@ -669,6 +679,7 @@ def test_allocate_buffer_resizable():
assert buf.size == 200
+@pytest.mark.numpy
def test_non_cpu_buffer(pickle_module):
cuda = pytest.importorskip("pyarrow.cuda")
ctx = cuda.Context(0)
@@ -798,6 +809,7 @@ def test_cache_options_pickling(pickle_module):
assert pickle_module.loads(pickle_module.dumps(option)) == option
+@pytest.mark.numpy
@pytest.mark.parametrize("compression", [
pytest.param(
"bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError)
@@ -838,6 +850,7 @@ def test_compress_decompress(compression):
pa.decompress(compressed_bytes, codec=compression)
+@pytest.mark.numpy
@pytest.mark.parametrize("compression", [
pytest.param(
"bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError)
@@ -996,6 +1009,7 @@ def test_buffer_protocol_ref_counting():
assert refcount_before == sys.getrefcount(val)
+@pytest.mark.numpy
def test_nativefile_write_memoryview():
f = pa.BufferOutputStream()
data = b'ok'
@@ -1058,8 +1072,8 @@ def test_mock_output_stream():
@pytest.fixture
def sample_disk_data(request, tmpdir):
SIZE = 4096
- arr = np.random.randint(0, 256, size=SIZE).astype('u1')
- data = arr.tobytes()[:SIZE]
+ arr = [random.randint(0, 255) for _ in range(SIZE)]
+ data = bytes(arr[:SIZE])
path = os.path.join(str(tmpdir), guid())
@@ -1146,8 +1160,8 @@ def test_memory_map_writer(tmpdir):
if sys.platform == "emscripten":
pytest.xfail("Multiple memory maps to same file don't work on
emscripten")
SIZE = 4096
- arr = np.random.randint(0, 256, size=SIZE).astype('u1')
- data = arr.tobytes()[:SIZE]
+ arr = [random.randint(0, 255) for _ in range(SIZE)]
+ data = bytes(arr[:SIZE])
path = os.path.join(str(tmpdir), guid())
with open(path, 'wb') as f:
@@ -1187,9 +1201,9 @@ def test_memory_map_writer(tmpdir):
def test_memory_map_resize(tmpdir):
SIZE = 4096
- arr = np.random.randint(0, 256, size=SIZE).astype(np.uint8)
- data1 = arr.tobytes()[:(SIZE // 2)]
- data2 = arr.tobytes()[(SIZE // 2):]
+ arr = [random.randint(0, 255) for _ in range(SIZE)]
+ data1 = bytes(arr[:(SIZE // 2)])
+ data2 = bytes(arr[(SIZE // 2):])
path = os.path.join(str(tmpdir), guid())
@@ -1202,7 +1216,7 @@ def test_memory_map_resize(tmpdir):
mmap.close()
with open(path, 'rb') as f:
- assert f.read() == arr.tobytes()
+ assert f.read() == bytes(arr[:SIZE])
def test_memory_zero_length(tmpdir):
@@ -1241,8 +1255,8 @@ def test_memory_map_deref_remove(tmpdir):
def test_os_file_writer(tmpdir):
SIZE = 4096
- arr = np.random.randint(0, 256, size=SIZE).astype('u1')
- data = arr.tobytes()[:SIZE]
+ arr = [random.randint(0, 255) for _ in range(SIZE)]
+ data = bytes(arr[:SIZE])
path = os.path.join(str(tmpdir), guid())
with open(path, 'wb') as f:
@@ -1523,6 +1537,7 @@ def test_buffered_input_stream_detach_non_seekable():
raw.seek(2)
+@pytest.mark.numpy
def test_buffered_output_stream():
np_buf = np.zeros(100, dtype=np.int8) # zero-initialized buffer
buf = pa.py_buffer(np_buf)
@@ -1540,6 +1555,7 @@ def test_buffered_output_stream():
assert np_buf[:10].tobytes() == b'123456789\0'
+@pytest.mark.numpy
def test_buffered_output_stream_detach():
np_buf = np.zeros(100, dtype=np.int8) # zero-initialized buffer
buf = pa.py_buffer(np_buf)
diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py
index 1e5242efe4..4be5792a92 100644
--- a/python/pyarrow/tests/test_ipc.py
+++ b/python/pyarrow/tests/test_ipc.py
@@ -20,11 +20,15 @@ import datetime
import io
import pathlib
import pytest
+import random
import socket
import threading
import weakref
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
import pyarrow as pa
from pyarrow.tests.util import changed_environ, invoke_script
@@ -59,7 +63,7 @@ class IpcFixture:
batches = []
for i in range(num_batches):
batch = pa.record_batch(
- [np.random.randn(nrows),
+ [[random.random() for _ in range(nrows)],
['foo', None, 'bar', 'bazbaz', 'qux']],
schema=schema)
batches.append(batch)
@@ -422,7 +426,7 @@ def test_stream_simple_roundtrip(stream_fixture, use_legacy_ipc_format):
@pytest.mark.zstd
def test_compression_roundtrip():
sink = io.BytesIO()
- values = np.random.randint(0, 3, 10000)
+ values = [random.randint(0, 3) for _ in range(10000)]
table = pa.Table.from_arrays([values], names=["values"])
options = pa.ipc.IpcWriteOptions(compression='zstd')
diff --git a/python/pyarrow/tests/test_json.py b/python/pyarrow/tests/test_json.py
index a0a6174266..3bb4440e89 100644
--- a/python/pyarrow/tests/test_json.py
+++ b/python/pyarrow/tests/test_json.py
@@ -23,7 +23,10 @@ import json
import string
import unittest
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
import pytest
import pyarrow as pa
@@ -297,6 +300,7 @@ class BaseTestJSONRead:
match="JSON parse error: unexpected field"):
self.read_bytes(rows, parse_options=opts)
+ @pytest.mark.numpy
def test_small_random_json(self):
data, expected = make_random_json(num_cols=2, num_rows=10)
table = self.read_bytes(data)
@@ -304,6 +308,7 @@ class BaseTestJSONRead:
assert table.equals(expected)
assert table.to_pydict() == expected.to_pydict()
+ @pytest.mark.numpy
def test_load_large_json(self):
data, expected = make_random_json(num_cols=2, num_rows=100100)
# set block size is 10MB
@@ -312,6 +317,7 @@ class BaseTestJSONRead:
assert table.num_rows == 100100
assert expected.num_rows == 100100
+ @pytest.mark.numpy
def test_stress_block_sizes(self):
# Test a number of small block sizes to stress block stitching
data_base, expected = make_random_json(num_cols=2, num_rows=100)
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 208812c3ac..178a073ed5 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -27,9 +27,18 @@ from datetime import date, datetime, time, timedelta, timezone
import hypothesis as h
import hypothesis.strategies as st
-import numpy as np
-import numpy.testing as npt
import pytest
+try:
+ import numpy as np
+ import numpy.testing as npt
+ try:
+ _np_VisibleDeprecationWarning = np.VisibleDeprecationWarning
+ except AttributeError:
+ from numpy.exceptions import (
+ VisibleDeprecationWarning as _np_VisibleDeprecationWarning
+ )
+except ImportError:
+ np = None
from pyarrow.pandas_compat import get_logical_type, _pandas_api
from pyarrow.tests.util import invoke_script, random_ascii, rands
@@ -51,14 +60,6 @@ except ImportError:
pass
-try:
- _np_VisibleDeprecationWarning = np.VisibleDeprecationWarning
-except AttributeError:
- from numpy.exceptions import (
- VisibleDeprecationWarning as _np_VisibleDeprecationWarning
- )
-
-
# Marks all of the tests in this module
pytestmark = pytest.mark.pandas
@@ -1202,9 +1203,11 @@ class TestConvertDateTimeLikeTypes:
@pytest.mark.parametrize('mask', [
None,
- np.array([True, False, False, True, False, False]),
+ [True, False, False, True, False, False],
])
def test_pandas_datetime_to_date64(self, mask):
+ if mask:
+ mask = np.array(mask)
s = pd.to_datetime([
'2018-05-10T00:00:00',
'2018-05-11T00:00:00',
@@ -1608,7 +1611,8 @@ class TestConvertDateTimeLikeTypes:
assert pa.Array.from_pandas(expected).equals(result)
@pytest.mark.skipif(
- Version('1.16.0') <= Version(np.__version__) < Version('1.16.1'),
+ np is not None and Version('1.16.0') <= Version(
+ np.__version__) < Version('1.16.1'),
reason='Until numpy/numpy#12745 is resolved')
def test_fixed_offset_timezone(self):
df = pd.DataFrame({
@@ -2921,23 +2925,23 @@ class TestConvertMisc:
"""
type_pairs = [
- (np.int8, pa.int8()),
- (np.int16, pa.int16()),
- (np.int32, pa.int32()),
- (np.int64, pa.int64()),
- (np.uint8, pa.uint8()),
- (np.uint16, pa.uint16()),
- (np.uint32, pa.uint32()),
- (np.uint64, pa.uint64()),
- (np.float16, pa.float16()),
- (np.float32, pa.float32()),
- (np.float64, pa.float64()),
+ ("int8", pa.int8()),
+ ("int16", pa.int16()),
+ ("int32", pa.int32()),
+ ("int64", pa.int64()),
+ ("uint8", pa.uint8()),
+ ("uint16", pa.uint16()),
+ ("uint32", pa.uint32()),
+ ("uint64", pa.uint64()),
+ ("float16", pa.float16()),
+ ("float32", pa.float32()),
+ ("float64", pa.float64()),
# XXX unsupported
# (np.dtype([('a', 'i2')]), pa.struct([pa.field('a', pa.int16())])),
- (np.object_, pa.string()),
- (np.object_, pa.binary()),
- (np.object_, pa.binary(10)),
- (np.object_, pa.list_(pa.int64())),
+ ("object", pa.string()),
+ ("object", pa.binary()),
+ ("object", pa.binary(10)),
+ ("object", pa.list_(pa.int64())),
]
def test_all_none_objects(self):
@@ -2950,8 +2954,8 @@ class TestConvertMisc:
_check_pandas_roundtrip(df)
def test_empty_arrays(self):
- for dtype, pa_type in self.type_pairs:
- arr = np.array([], dtype=dtype)
+ for dtype_str, pa_type in self.type_pairs:
+ arr = np.array([], dtype=np.dtype(dtype_str))
_check_array_roundtrip(arr, type=pa_type)
def test_non_threaded_conversion(self):
diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py
index bc50697e1b..3f4a53c473 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -20,7 +20,10 @@ import decimal
import pytest
import weakref
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
import pyarrow as pa
import pyarrow.compute as pc
@@ -40,7 +43,6 @@ import pyarrow.compute as pc
(1, pa.int64(), pa.Int64Scalar),
(1, pa.uint64(), pa.UInt64Scalar),
(1.0, None, pa.DoubleScalar),
- (np.float16(1.0), pa.float16(), pa.HalfFloatScalar),
(1.0, pa.float32(), pa.FloatScalar),
(decimal.Decimal("1.123"), None, pa.Decimal128Scalar),
(decimal.Decimal("1.1234567890123456789012345678901234567890"),
@@ -98,6 +100,40 @@ def test_basics(value, ty, klass, pickle_module):
assert wr() is None
+# This test is a copy of test_basics but only for float16 (HalfFloatScalar)
+# which currently requires a numpy scalar to create it. The test collection
+# fails if numpy is used on the parametrization when not present.
+@pytest.mark.numpy
+def test_basics_np_required(pickle_module):
+ value, ty, klass = np.float16(1.0), pa.float16(), pa.HalfFloatScalar
+ s = pa.scalar(value, type=ty)
+ s.validate()
+ s.validate(full=True)
+ assert isinstance(s, klass)
+ assert s.as_py() == value
+ assert s == pa.scalar(value, type=ty)
+ assert s != value
+ assert s != "else"
+ assert hash(s) == hash(s)
+ assert s.is_valid is True
+ assert s != None # noqa: E711
+
+ s = pa.scalar(None, type=s.type)
+ assert s.is_valid is False
+ assert s.as_py() is None
+ assert s != pa.scalar(value, type=ty)
+
+ # test pickle roundtrip
+ restored = pickle_module.loads(pickle_module.dumps(s))
+ assert s.equals(restored)
+
+ # test that scalars are weak-referenceable
+ wr = weakref.ref(s)
+ assert wr() is not None
+ del s
+ assert wr() is None
+
+
def test_invalid_scalar():
s = pc.cast(pa.scalar(b"\xff"), pa.string(), safe=False)
s.validate()
@@ -202,14 +238,15 @@ def test_numerics():
assert str(s) == "1.5"
assert s.as_py() == 1.5
- # float16
- s = pa.scalar(np.float16(0.5), type='float16')
- assert isinstance(s, pa.HalfFloatScalar)
- # on numpy2 repr(np.float16(0.5)) == "np.float16(0.5)"
- # on numpy1 repr(np.float16(0.5)) == "0.5"
- assert repr(s) == f"<pyarrow.HalfFloatScalar: {np.float16(0.5)!r}>"
- assert str(s) == "0.5"
- assert s.as_py() == 0.5
+ if np is not None:
+ # float16
+ s = pa.scalar(np.float16(0.5), type='float16')
+ assert isinstance(s, pa.HalfFloatScalar)
+ # on numpy2 repr(np.float16(0.5)) == "np.float16(0.5)"
+ # on numpy1 repr(np.float16(0.5)) == "0.5"
+ assert repr(s) == f"<pyarrow.HalfFloatScalar: {np.float16(0.5)!r}>"
+ assert str(s) == "0.5"
+ assert s.as_py() == 0.5
def test_decimal128():
@@ -434,6 +471,7 @@ def test_timestamp_fixed_offset_print():
assert str(arr[0]) == "1970-01-01 02:00:00+02:00"
+@pytest.mark.numpy
def test_duration():
arr = np.array([0, 3600000000000], dtype='timedelta64[ns]')
@@ -559,6 +597,7 @@ def test_list(ty, klass):
s[2]
+@pytest.mark.numpy
@pytest.mark.parametrize('ty', [
pa.list_(pa.int64()),
pa.large_list(pa.int64()),
diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py
index 1b05c58384..bdcb6c2b42 100644
--- a/python/pyarrow/tests/test_schema.py
+++ b/python/pyarrow/tests/test_schema.py
@@ -20,7 +20,10 @@ import sys
import weakref
import pytest
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
import pyarrow as pa
import pyarrow.tests.util as test_util
@@ -185,6 +188,7 @@ def test_time_types():
pa.time64('s')
+@pytest.mark.numpy
def test_from_numpy_dtype():
cases = [
(np.dtype('bool'), pa.bool_()),
diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py
index aa7da0a742..7ba9e2b3e1 100644
--- a/python/pyarrow/tests/test_sparse_tensor.py
+++ b/python/pyarrow/tests/test_sparse_tensor.py
@@ -19,7 +19,10 @@ import pytest
import sys
import weakref
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ pytestmark = pytest.mark.numpy
import pyarrow as pa
try:
diff --git a/python/pyarrow/tests/test_strategies.py b/python/pyarrow/tests/test_strategies.py
index 14fc949928..da50bcda52 100644
--- a/python/pyarrow/tests/test_strategies.py
+++ b/python/pyarrow/tests/test_strategies.py
@@ -17,6 +17,8 @@
import hypothesis as h
+import pytest
+
import pyarrow as pa
import pyarrow.tests.strategies as past
@@ -36,11 +38,13 @@ def test_schemas(schema):
assert isinstance(schema, pa.lib.Schema)
+@pytest.mark.numpy
@h.given(past.all_arrays)
def test_arrays(array):
assert isinstance(array, pa.lib.Array)
+@pytest.mark.numpy
@h.given(past.arrays(past.primitive_types, nullable=False))
def test_array_nullability(array):
assert array.null_count == 0
@@ -56,6 +60,7 @@ def test_record_batches(record_bath):
assert isinstance(record_bath, pa.lib.RecordBatch)
+@pytest.mark.numpy
@h.given(past.all_tables)
def test_tables(table):
assert isinstance(table, pa.lib.Table)
diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py
index 40700e4741..01d468cd9e 100644
--- a/python/pyarrow/tests/test_substrait.py
+++ b/python/pyarrow/tests/test_substrait.py
@@ -608,6 +608,7 @@ def test_output_field_names(use_threads):
assert res_tb == expected
+@pytest.mark.numpy
def test_scalar_aggregate_udf_basic(varargs_agg_func_fixture):
test_table = pa.Table.from_pydict(
@@ -756,6 +757,7 @@ def test_scalar_aggregate_udf_basic(varargs_agg_func_fixture):
assert res_tb == expected_tb
+@pytest.mark.numpy
def test_hash_aggregate_udf_basic(varargs_agg_func_fixture):
test_table = pa.Table.from_pydict(
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index cd38909edf..3b60cff2d8 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -20,7 +20,10 @@ from collections.abc import Iterable
import sys
import weakref
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
import pytest
import pyarrow as pa
import pyarrow.compute as pc
@@ -125,6 +128,7 @@ def test_chunked_array_can_combine_chunks_with_no_chunks():
).combine_chunks() == pa.array([], type=pa.bool_())
+@pytest.mark.numpy
def test_chunked_array_to_numpy():
data = pa.chunked_array([
[1, 2, 3],
@@ -173,6 +177,7 @@ def test_chunked_array_str():
]"""
+@pytest.mark.numpy
def test_chunked_array_getitem():
data = [
pa.array([1, 2, 3]),
@@ -972,12 +977,14 @@ def check_tensors(tensor, expected_tensor, type, size):
assert tensor.strides == expected_tensor.strides
-@pytest.mark.parametrize('typ', [
- np.uint8, np.uint16, np.uint32, np.uint64,
- np.int8, np.int16, np.int32, np.int64,
- np.float32, np.float64,
+@pytest.mark.numpy
+@pytest.mark.parametrize('typ_str', [
+ "uint8", "uint16", "uint32", "uint64",
+ "int8", "int16", "int32", "int64",
+ "float32", "float64",
])
-def test_recordbatch_to_tensor_uniform_type(typ):
+def test_recordbatch_to_tensor_uniform_type(typ_str):
+ typ = np.dtype(typ_str)
arr1 = [1, 2, 3, 4, 5, 6, 7, 8, 9]
arr2 = [10, 20, 30, 40, 50, 60, 70, 80, 90]
arr3 = [100, 100, 100, 100, 100, 100, 100, 100, 100]
@@ -1031,6 +1038,7 @@ def test_recordbatch_to_tensor_uniform_type(typ):
check_tensors(result, expected, pa.from_numpy_dtype(typ), 15)
+@pytest.mark.numpy
def test_recordbatch_to_tensor_uniform_float_16():
arr1 = [1, 2, 3, 4, 5, 6, 7, 8, 9]
arr2 = [10, 20, 30, 40, 50, 60, 70, 80, 90]
@@ -1054,6 +1062,7 @@ def test_recordbatch_to_tensor_uniform_float_16():
check_tensors(result, expected, pa.float16(), 27)
+@pytest.mark.numpy
def test_recordbatch_to_tensor_mixed_type():
# uint16 + int16 = int32
arr1 = [1, 2, 3, 4, 5, 6, 7, 8, 9]
@@ -1105,6 +1114,7 @@ def test_recordbatch_to_tensor_mixed_type():
assert result.strides == expected.strides
+@pytest.mark.numpy
def test_recordbatch_to_tensor_unsupported_mixed_type_with_float16():
arr1 = [1, 2, 3, 4, 5, 6, 7, 8, 9]
arr2 = [10, 20, 30, 40, 50, 60, 70, 80, 90]
@@ -1124,6 +1134,7 @@ def test_recordbatch_to_tensor_unsupported_mixed_type_with_float16():
batch.to_tensor()
+@pytest.mark.numpy
def test_recordbatch_to_tensor_nan():
arr1 = [1, 2, 3, 4, np.nan, 6, 7, 8, 9]
arr2 = [10, 20, 30, 40, 50, 60, 70, np.nan, 90]
@@ -1144,6 +1155,7 @@ def test_recordbatch_to_tensor_nan():
assert result.strides == expected.strides
+@pytest.mark.numpy
def test_recordbatch_to_tensor_null():
arr1 = [1, 2, 3, 4, None, 6, 7, 8, 9]
arr2 = [10, 20, 30, 40, 50, 60, 70, None, 90]
@@ -1204,6 +1216,7 @@ def test_recordbatch_to_tensor_null():
assert result.strides == expected.strides
+@pytest.mark.numpy
def test_recordbatch_to_tensor_empty():
batch = pa.RecordBatch.from_arrays(
[
@@ -1295,6 +1308,7 @@ def test_slice_zero_length_table():
table.to_pandas()
+@pytest.mark.numpy
def test_recordbatchlist_schema_equals():
a1 = np.array([1], dtype='uint32')
a2 = np.array([4.0, 5.0], dtype='float64')
@@ -2130,6 +2144,7 @@ def test_table_unsafe_casting(cls):
assert casted_table.equals(expected_table)
+@pytest.mark.numpy
def test_invalid_table_construct():
array = np.array([0, 1], dtype=np.uint8)
u8 = pa.uint8()
@@ -3287,6 +3302,7 @@ def test_table_sort_by(cls):
assert sorted_tab_dict["b"] == ["foo", "car", "bar", "foobar"]
+@pytest.mark.numpy
@pytest.mark.parametrize("constructor", [pa.table, pa.record_batch])
def test_numpy_asarray(constructor):
table = constructor([[1, 2, 3], [4.0, 5.0, 6.0]], names=["a", "b"])
@@ -3319,6 +3335,7 @@ def test_numpy_asarray(constructor):
assert result.dtype == "int32"
+@pytest.mark.numpy
@pytest.mark.parametrize("constructor", [pa.table, pa.record_batch])
def test_numpy_array_protocol(constructor):
table = constructor([[1, 2, 3], [4.0, 5.0, 6.0]], names=["a", "b"])
diff --git a/python/pyarrow/tests/test_tensor.py b/python/pyarrow/tests/test_tensor.py
index 29c6de65b1..debb106628 100644
--- a/python/pyarrow/tests/test_tensor.py
+++ b/python/pyarrow/tests/test_tensor.py
@@ -21,7 +21,10 @@ import pytest
import warnings
import weakref
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ pytestmark = pytest.mark.numpy
import pyarrow as pa
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index d673f95652..cc680939ac 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -30,7 +30,10 @@ except ImportError:
tzst = None
import weakref
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
import pyarrow as pa
import pyarrow.types as types
import pyarrow.tests.strategies as past
@@ -1265,14 +1268,16 @@ def test_field_modified_copies():
def test_is_integer_value():
assert pa.types.is_integer_value(1)
- assert pa.types.is_integer_value(np.int64(1))
+ if np is not None:
+ assert pa.types.is_integer_value(np.int64(1))
assert not pa.types.is_integer_value('1')
def test_is_float_value():
assert not pa.types.is_float_value(1)
assert pa.types.is_float_value(1.)
- assert pa.types.is_float_value(np.float64(1))
+ if np is not None:
+ assert pa.types.is_float_value(np.float64(1))
assert not pa.types.is_float_value('1.0')
@@ -1280,8 +1285,9 @@ def test_is_boolean_value():
assert not pa.types.is_boolean_value(1)
assert pa.types.is_boolean_value(True)
assert pa.types.is_boolean_value(False)
- assert pa.types.is_boolean_value(np.bool_(True))
- assert pa.types.is_boolean_value(np.bool_(False))
+ if np is not None:
+ assert pa.types.is_boolean_value(np.bool_(True))
+ assert pa.types.is_boolean_value(np.bool_(False))
@h.settings(suppress_health_check=(h.HealthCheck.too_slow,))
diff --git a/python/pyarrow/tests/test_udf.py b/python/pyarrow/tests/test_udf.py
index 22fefbbb58..93004a3061 100644
--- a/python/pyarrow/tests/test_udf.py
+++ b/python/pyarrow/tests/test_udf.py
@@ -18,7 +18,10 @@
import pytest
-import numpy as np
+try:
+ import numpy as np
+except ImportError:
+ np = None
import pyarrow as pa
from pyarrow import compute as pc
@@ -749,6 +752,7 @@ def test_udt_datasource1_exception():
_test_datasource1_udt(datasource1_exception)
+@pytest.mark.numpy
def test_scalar_agg_basic(unary_agg_func_fixture):
arr = pa.array([10.0, 20.0, 30.0, 40.0, 50.0], pa.float64())
result = pc.call_function("mean_udf", [arr])
@@ -756,6 +760,7 @@ def test_scalar_agg_basic(unary_agg_func_fixture):
assert result == expected
+@pytest.mark.numpy
def test_scalar_agg_empty(unary_agg_func_fixture):
empty = pa.array([], pa.float64())
@@ -775,6 +780,7 @@ def test_scalar_agg_wrong_output_type(wrong_output_type_agg_func_fixture):
pc.call_function("y=wrong_output_type(x)", [arr])
+@pytest.mark.numpy
def test_scalar_agg_varargs(varargs_agg_func_fixture):
arr1 = pa.array([10, 20, 30, 40, 50], pa.int64())
arr2 = pa.array([1.0, 2.0, 3.0, 4.0, 5.0], pa.float64())
@@ -786,6 +792,7 @@ def test_scalar_agg_varargs(varargs_agg_func_fixture):
assert result == expected
+@pytest.mark.numpy
def test_scalar_agg_exception(exception_agg_func_fixture):
arr = pa.array([10, 20, 30, 40, 50, 60], pa.int64())
@@ -793,6 +800,7 @@ def test_scalar_agg_exception(exception_agg_func_fixture):
pc.call_function("y=exception_len(x)", [arr])
+@pytest.mark.numpy
def test_hash_agg_basic(unary_agg_func_fixture):
arr1 = pa.array([10.0, 20.0, 30.0, 40.0, 50.0], pa.float64())
arr2 = pa.array([4, 2, 1, 2, 1], pa.int32())
@@ -811,6 +819,7 @@ def test_hash_agg_basic(unary_agg_func_fixture):
assert result.sort_by('id') == expected.sort_by('id')
+@pytest.mark.numpy
def test_hash_agg_empty(unary_agg_func_fixture):
arr1 = pa.array([], pa.float64())
arr2 = pa.array([], pa.int32())
@@ -841,6 +850,7 @@ def test_hash_agg_wrong_output_type(wrong_output_type_agg_func_fixture):
table.group_by("id").aggregate([("value", "y=wrong_output_type(x)")])
+@pytest.mark.numpy
def test_hash_agg_exception(exception_agg_func_fixture):
arr1 = pa.array([10, 20, 30, 40, 50], pa.int64())
arr2 = pa.array([4, 2, 1, 2, 1], pa.int32())
@@ -850,6 +860,7 @@ def test_hash_agg_exception(exception_agg_func_fixture):
table.group_by("id").aggregate([("value", "y=exception_len(x)")])
+@pytest.mark.numpy
def test_hash_agg_random(sum_agg_func_fixture):
"""Test hash aggregate udf with randomly sampled data"""
diff --git a/python/pyarrow/tests/test_without_numpy.py b/python/pyarrow/tests/test_without_numpy.py
new file mode 100644
index 0000000000..55c12602ce
--- /dev/null
+++ b/python/pyarrow/tests/test_without_numpy.py
@@ -0,0 +1,58 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+
+import pyarrow as pa
+
+# Marks all of the tests in this module
+# Ignore these with pytest ... -m 'not nonumpy'
+pytestmark = pytest.mark.nonumpy
+
+
+def test_array_to_np():
+ arr = pa.array(range(10))
+
+ msg = "Cannot return a numpy.ndarray if NumPy is not present"
+
+ with pytest.raises(ImportError, match=msg):
+ arr.to_numpy()
+
+
+def test_chunked_array_to_np():
+ data = pa.chunked_array([
+ [1, 2, 3],
+ [4, 5, 6],
+ []
+ ])
+ msg = "Cannot return a numpy.ndarray if NumPy is not present"
+
+ with pytest.raises(ImportError, match=msg):
+ data.to_numpy()
+
+
+def test_tensor_to_np():
+ tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2])
+ arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]]
+ storage = pa.array(arr, pa.list_(pa.int32(), 4))
+ tensor_array = pa.ExtensionArray.from_storage(tensor_type, storage)
+
+ tensor = tensor_array.to_tensor()
+ msg = "Cannot return a numpy.ndarray if NumPy is not present"
+
+ with pytest.raises(ImportError, match=msg):
+ tensor.to_numpy()
diff --git a/python/pyarrow/tests/util.py b/python/pyarrow/tests/util.py
index 638eee9807..aa6dd21f80 100644
--- a/python/pyarrow/tests/util.py
+++ b/python/pyarrow/tests/util.py
@@ -22,7 +22,6 @@ Utility functions for testing
import contextlib
import decimal
import gc
-import numpy as np
import os
import random
import re
@@ -110,27 +109,15 @@ def randdecimal(precision, scale):
def random_ascii(length):
- return bytes(np.random.randint(65, 123, size=length, dtype='i1'))
+ return bytes([random.randint(65, 122) for i in range(length)])
def rands(nchars):
"""
Generate one random string.
"""
- RANDS_CHARS = np.array(
- list(string.ascii_letters + string.digits), dtype=(np.str_, 1))
- return "".join(np.random.choice(RANDS_CHARS, nchars))
-
-
-def make_dataframe():
- import pandas as pd
-
- N = 30
- df = pd.DataFrame(
- {col: np.random.randn(N) for col in string.ascii_uppercase[:4]},
- index=pd.Index([rands(10) for _ in range(N)])
- )
- return df
+ RANDS_CHARS = list(string.ascii_letters + string.digits)
+ return "".join(random.choice(RANDS_CHARS) for i in range(nchars))
def memory_leak_check(f, metric='rss', threshold=1 << 17, iterations=10,
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index f83ecc3aa4..a46caff1f2 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -33,42 +33,50 @@ from cython import sizeof
# These are imprecise because the type (in pandas 0.x) depends on the presence
# of nulls
-cdef dict _pandas_type_map = {
- _Type_NA: np.object_, # NaNs
- _Type_BOOL: np.bool_,
- _Type_INT8: np.int8,
- _Type_INT16: np.int16,
- _Type_INT32: np.int32,
- _Type_INT64: np.int64,
- _Type_UINT8: np.uint8,
- _Type_UINT16: np.uint16,
- _Type_UINT32: np.uint32,
- _Type_UINT64: np.uint64,
- _Type_HALF_FLOAT: np.float16,
- _Type_FLOAT: np.float32,
- _Type_DOUBLE: np.float64,
- # Pandas does not support [D]ay, so default to [ms] for date32
- _Type_DATE32: np.dtype('datetime64[ms]'),
- _Type_DATE64: np.dtype('datetime64[ms]'),
- _Type_TIMESTAMP: {
- 's': np.dtype('datetime64[s]'),
- 'ms': np.dtype('datetime64[ms]'),
- 'us': np.dtype('datetime64[us]'),
- 'ns': np.dtype('datetime64[ns]'),
- },
- _Type_DURATION: {
- 's': np.dtype('timedelta64[s]'),
- 'ms': np.dtype('timedelta64[ms]'),
- 'us': np.dtype('timedelta64[us]'),
- 'ns': np.dtype('timedelta64[ns]'),
- },
- _Type_BINARY: np.object_,
- _Type_FIXED_SIZE_BINARY: np.object_,
- _Type_STRING: np.object_,
- _Type_LIST: np.object_,
- _Type_MAP: np.object_,
- _Type_DECIMAL128: np.object_,
-}
+# Starts out empty; _get_pandas_type_map() fills it in on its first call, so
+# no np.* attribute is evaluated while this module is being imported.
+cdef dict _pandas_type_map = {}
+
+
+def _get_pandas_type_map():
+ # Return the module-level Arrow-type-id -> NumPy-type map, filling it in
+ # on the first call. Later calls find the dict non-empty and return it
+ # as-is, so the np.* lookups below run at most once.
+ global _pandas_type_map
+ if not _pandas_type_map:
+ _pandas_type_map.update({
+ _Type_NA: np.object_, # NaNs
+ _Type_BOOL: np.bool_,
+ _Type_INT8: np.int8,
+ _Type_INT16: np.int16,
+ _Type_INT32: np.int32,
+ _Type_INT64: np.int64,
+ _Type_UINT8: np.uint8,
+ _Type_UINT16: np.uint16,
+ _Type_UINT32: np.uint32,
+ _Type_UINT64: np.uint64,
+ _Type_HALF_FLOAT: np.float16,
+ _Type_FLOAT: np.float32,
+ _Type_DOUBLE: np.float64,
+ # Pandas does not support [D]ay, so default to [ms] for date32
+ _Type_DATE32: np.dtype('datetime64[ms]'),
+ _Type_DATE64: np.dtype('datetime64[ms]'),
+ # Unit-dependent entries: a nested dict from unit string to dtype.
+ _Type_TIMESTAMP: {
+ 's': np.dtype('datetime64[s]'),
+ 'ms': np.dtype('datetime64[ms]'),
+ 'us': np.dtype('datetime64[us]'),
+ 'ns': np.dtype('datetime64[ns]'),
+ },
+ _Type_DURATION: {
+ 's': np.dtype('timedelta64[s]'),
+ 'ms': np.dtype('timedelta64[ms]'),
+ 'us': np.dtype('timedelta64[us]'),
+ 'ns': np.dtype('timedelta64[ns]'),
+ },
+ _Type_BINARY: np.object_,
+ _Type_FIXED_SIZE_BINARY: np.object_,
+ _Type_STRING: np.object_,
+ _Type_LIST: np.object_,
+ _Type_MAP: np.object_,
+ _Type_DECIMAL128: np.object_,
+ })
+ return _pandas_type_map
+
cdef dict _pep3118_type_map = {
_Type_INT8: b'b',
@@ -149,14 +157,15 @@ def _is_primitive(Type type):
def _get_pandas_type(arrow_type, coerce_to_ns=False):
cdef Type type_id = arrow_type.id
- if type_id not in _pandas_type_map:
+ cdef dict pandas_type_map = _get_pandas_type_map()
+ if type_id not in pandas_type_map:
return None
if coerce_to_ns:
# ARROW-3789: Coerce date/timestamp types to datetime64[ns]
if type_id == _Type_DURATION:
return np.dtype('timedelta64[ns]')
return np.dtype('datetime64[ns]')
- pandas_type = _pandas_type_map[type_id]
+ pandas_type = pandas_type_map[type_id]
if isinstance(pandas_type, dict):
unit = getattr(arrow_type, 'unit', None)
pandas_type = pandas_type.get(unit, None)