This is an automated email from the ASF dual-hosted git repository. raulcd pushed a commit to branch maint-15.0.x in repository https://gitbox.apache.org/repos/asf/arrow.git
commit fc586a091be82a19bb5b7f9f0a23ae7bb4e36ea6 Author: Antoine Pitrou <[email protected]> AuthorDate: Wed Jan 17 11:26:37 2024 +0100 GH-39599: [Python] Avoid leaking references to Numpy dtypes (#39636) ### Rationale for this change `PyArray_DescrFromScalar` returns a new reference, so we should be careful to decref it when we don't use it anymore. ### Are these changes tested? No. ### Are there any user-facing changes? No. * Closes: #39599 Authored-by: Antoine Pitrou <[email protected]> Signed-off-by: Joris Van den Bossche <[email protected]> --- python/pyarrow/array.pxi | 3 +- python/pyarrow/includes/libarrow_python.pxd | 2 +- python/pyarrow/src/arrow/python/inference.cc | 5 +- python/pyarrow/src/arrow/python/numpy_convert.cc | 77 ++++++++++------------ python/pyarrow/src/arrow/python/numpy_convert.h | 6 +- python/pyarrow/src/arrow/python/numpy_to_arrow.cc | 11 ++-- python/pyarrow/src/arrow/python/python_to_arrow.cc | 6 +- python/pyarrow/types.pxi | 6 +- 8 files changed, 48 insertions(+), 68 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 5c2d22aef1..1416f5f434 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -66,8 +66,7 @@ cdef shared_ptr[CDataType] _ndarray_to_type(object values, dtype = values.dtype if type is None and dtype != object: - with nogil: - check_status(NumPyDtypeToArrow(dtype, &c_type)) + c_type = GetResultValue(NumPyDtypeToArrow(dtype)) if type is not None: c_type = type.sp_type diff --git a/python/pyarrow/includes/libarrow_python.pxd b/python/pyarrow/includes/libarrow_python.pxd index e3179062a1..906f0b7d28 100644 --- a/python/pyarrow/includes/libarrow_python.pxd +++ b/python/pyarrow/includes/libarrow_python.pxd @@ -73,7 +73,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: object obj, object mask, const PyConversionOptions& options, CMemoryPool* pool) - CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type) + CResult[shared_ptr[CDataType]] NumPyDtypeToArrow(object dtype) CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo, c_bool from_pandas, diff --git a/python/pyarrow/src/arrow/python/inference.cc b/python/pyarrow/src/arrow/python/inference.cc index 9537aec574..10116f9afa 100644 --- a/python/pyarrow/src/arrow/python/inference.cc +++ b/python/pyarrow/src/arrow/python/inference.cc @@ -468,10 +468,7 @@ class TypeInferrer { if (numpy_dtype_count_ > 0) { // All NumPy scalars and Nones/nulls if (numpy_dtype_count_ + none_count_ == total_count_) { - std::shared_ptr<DataType> type; - RETURN_NOT_OK(NumPyDtypeToArrow(numpy_unifier_.current_dtype(), &type)); - *out = type; - return Status::OK(); + return NumPyDtypeToArrow(numpy_unifier_.current_dtype()).Value(out); } // The "bad path": data contains a mix of NumPy scalars and diff --git a/python/pyarrow/src/arrow/python/numpy_convert.cc b/python/pyarrow/src/arrow/python/numpy_convert.cc index 4970680764..dfee88c092 100644 --- a/python/pyarrow/src/arrow/python/numpy_convert.cc +++ b/python/pyarrow/src/arrow/python/numpy_convert.cc @@ -59,12 +59,11 @@ NumPyBuffer::~NumPyBuffer() { #define TO_ARROW_TYPE_CASE(NPY_NAME, FACTORY) \ case NPY_##NPY_NAME: \ - *out = FACTORY(); \ - break; + return FACTORY(); namespace { -Status GetTensorType(PyObject* dtype, std::shared_ptr<DataType>* out) { +Result<std::shared_ptr<DataType>> GetTensorType(PyObject* dtype) { if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) { return Status::TypeError("Did not pass numpy.dtype object"); } @@ -84,11 +83,8 @@ Status GetTensorType(PyObject* dtype, std::shared_ptr<DataType>* out) { TO_ARROW_TYPE_CASE(FLOAT16, float16); TO_ARROW_TYPE_CASE(FLOAT32, float32); TO_ARROW_TYPE_CASE(FLOAT64, float64); - default: { - return Status::NotImplemented("Unsupported numpy type ", descr->type_num); - } } - return Status::OK(); + return Status::NotImplemented("Unsupported numpy type ", descr->type_num); } Status GetNumPyType(const DataType& type, int* type_num) { @@ -120,15 +116,21 @@ Status GetNumPyType(const DataType& type, int* type_num) { } // namespace -Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr<DataType>* out) { +Result<std::shared_ptr<DataType>> NumPyScalarToArrowDataType(PyObject* scalar) { + PyArray_Descr* descr = PyArray_DescrFromScalar(scalar); + OwnedRef descr_ref(reinterpret_cast<PyObject*>(descr)); + return NumPyDtypeToArrow(descr); +} + +Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyObject* dtype) { if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) { return Status::TypeError("Did not pass numpy.dtype object"); } PyArray_Descr* descr = reinterpret_cast<PyArray_Descr*>(dtype); - return NumPyDtypeToArrow(descr, out); + return NumPyDtypeToArrow(descr); } -Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out) { +Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyArray_Descr* descr) { int type_num = fix_numpy_type_num(descr->type_num); switch (type_num) { @@ -151,20 +153,15 @@ Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out) { reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata); switch (date_dtype->meta.base) { case NPY_FR_s: - *out = timestamp(TimeUnit::SECOND); - break; + return timestamp(TimeUnit::SECOND); case NPY_FR_ms: - *out = timestamp(TimeUnit::MILLI); - break; + return timestamp(TimeUnit::MILLI); case NPY_FR_us: - *out = timestamp(TimeUnit::MICRO); - break; + return timestamp(TimeUnit::MICRO); case NPY_FR_ns: - *out = timestamp(TimeUnit::NANO); - break; + return timestamp(TimeUnit::NANO); case NPY_FR_D: - *out = date32(); - break; + return date32(); case NPY_FR_GENERIC: return Status::NotImplemented("Unbound or generic datetime64 time unit"); default: @@ -176,29 +173,22 @@ Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out) { reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata); switch (timedelta_dtype->meta.base) { case NPY_FR_s: - *out = duration(TimeUnit::SECOND); - break; + return duration(TimeUnit::SECOND); case NPY_FR_ms: - *out = duration(TimeUnit::MILLI); - break; + return duration(TimeUnit::MILLI); case NPY_FR_us: - *out = duration(TimeUnit::MICRO); - break; + return duration(TimeUnit::MICRO); case NPY_FR_ns: - *out = duration(TimeUnit::NANO); - break; + return duration(TimeUnit::NANO); case NPY_FR_GENERIC: return Status::NotImplemented("Unbound or generic timedelta64 time unit"); default: return Status::NotImplemented("Unsupported timedelta64 time unit"); } } break; - default: { - return Status::NotImplemented("Unsupported numpy type ", descr->type_num); - } } - return Status::OK(); + return Status::NotImplemented("Unsupported numpy type ", descr->type_num); } #undef TO_ARROW_TYPE_CASE @@ -230,9 +220,8 @@ Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, strides[i] = array_strides[i]; } - std::shared_ptr<DataType> type; - RETURN_NOT_OK( - GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)), &type)); + ARROW_ASSIGN_OR_RAISE( + auto type, GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)))); *out = std::make_shared<Tensor>(type, data, shape, strides, dim_names); return Status::OK(); } @@ -435,9 +424,9 @@ Status NdarraysToSparseCOOTensor(MemoryPool* pool, PyObject* data_ao, PyObject* PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao); std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao); - std::shared_ptr<DataType> type_data; - RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)), - &type_data)); + ARROW_ASSIGN_OR_RAISE( + auto type_data, + GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)))); std::shared_ptr<Tensor> coords; RETURN_NOT_OK(NdarrayToTensor(pool, coords_ao, {}, &coords)); @@ -462,9 +451,9 @@ Status NdarraysToSparseCSXMatrix(MemoryPool* pool, PyObject* data_ao, PyObject* PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao); std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao); - std::shared_ptr<DataType> type_data; - RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)), - &type_data)); + ARROW_ASSIGN_OR_RAISE( + auto type_data, + GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)))); std::shared_ptr<Tensor> indptr, indices; RETURN_NOT_OK(NdarrayToTensor(pool, indptr_ao, {}, &indptr)); @@ -491,9 +480,9 @@ Status NdarraysToSparseCSFTensor(MemoryPool* pool, PyObject* data_ao, PyObject* const int ndim = static_cast<const int>(shape.size()); PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao); std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao); - std::shared_ptr<DataType> type_data; - RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)), - &type_data)); + ARROW_ASSIGN_OR_RAISE( + auto type_data, + GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)))); std::vector<std::shared_ptr<Tensor>> indptr(ndim - 1); std::vector<std::shared_ptr<Tensor>> indices(ndim); diff --git a/python/pyarrow/src/arrow/python/numpy_convert.h b/python/pyarrow/src/arrow/python/numpy_convert.h index 10451077a2..2d1086e135 100644 --- a/python/pyarrow/src/arrow/python/numpy_convert.h +++ b/python/pyarrow/src/arrow/python/numpy_convert.h @@ -49,9 +49,11 @@ class ARROW_PYTHON_EXPORT NumPyBuffer : public Buffer { }; ARROW_PYTHON_EXPORT -Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr<DataType>* out); +Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyObject* dtype); ARROW_PYTHON_EXPORT -Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out); +Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyArray_Descr* descr); +ARROW_PYTHON_EXPORT +Result<std::shared_ptr<DataType>> NumPyScalarToArrowDataType(PyObject* scalar); ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, const std::vector<std::string>& dim_names, diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc index 2727ce32f4..8903df31be 100644 --- a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc @@ -462,8 +462,7 @@ template <typename ArrowType> inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) { RETURN_NOT_OK(PrepareInputData<ArrowType>(data)); - std::shared_ptr<DataType> input_type; - RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type)); + ARROW_ASSIGN_OR_RAISE(auto input_type, NumPyDtypeToArrow(dtype_)); if (!input_type->Equals(*type_)) { RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, type_, @@ -490,7 +489,7 @@ inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* d Status s = StaticCastBuffer<int64_t, int32_t>(**data, length_, pool_, data); RETURN_NOT_OK(s); } else { - RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type)); + ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_)); if (!input_type->Equals(*type_)) { // The null bitmap was already computed in VisitNative() RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, @@ -498,7 +497,7 @@ inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* d } } } else { - RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type)); + ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_)); if (!input_type->Equals(*type_)) { RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, type_, cast_options_, pool_, data)); @@ -531,7 +530,7 @@ inline Status NumPyConverter::ConvertData<Date64Type>(std::shared_ptr<Buffer>* d } *data = std::move(result); } else { - RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type)); + ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_)); if (!input_type->Equals(*type_)) { // The null bitmap was already computed in VisitNative() RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, @@ -539,7 +538,7 @@ inline Status NumPyConverter::ConvertData<Date64Type>(std::shared_ptr<Buffer>* d } } } else { - RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type)); + ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_)); if (!input_type->Equals(*type_)) { RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, type_, cast_options_, pool_, data)); diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 23b92598e3..d1d94ac17a 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -386,8 +386,7 @@ class PyValue { } } else if (PyArray_CheckAnyScalarExact(obj)) { // validate that the numpy scalar has np.datetime64 dtype - std::shared_ptr<DataType> numpy_type; - RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type)); + ARROW_ASSIGN_OR_RAISE(auto numpy_type, NumPyScalarToArrowDataType(obj)); if (!numpy_type->Equals(*type)) { return Status::NotImplemented("Expected np.datetime64 but got: ", numpy_type->ToString()); @@ -466,8 +465,7 @@ class PyValue { } } else if (PyArray_CheckAnyScalarExact(obj)) { // validate that the numpy scalar has np.datetime64 dtype - std::shared_ptr<DataType> numpy_type; - RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type)); + ARROW_ASSIGN_OR_RAISE(auto numpy_type, NumPyScalarToArrowDataType(obj)); if (!numpy_type->Equals(*type)) { return Status::NotImplemented("Expected np.timedelta64 but got: ", numpy_type->ToString()); diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 912ee39f7d..b6dc53d633 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -5140,12 +5140,8 @@ def from_numpy_dtype(object dtype): >>> pa.from_numpy_dtype(np.str_) DataType(string) """ - cdef shared_ptr[CDataType] c_type dtype = np.dtype(dtype) - with nogil: - check_status(NumPyDtypeToArrow(dtype, &c_type)) - - return pyarrow_wrap_data_type(c_type) + return pyarrow_wrap_data_type(GetResultValue(NumPyDtypeToArrow(dtype))) def is_boolean_value(object obj):
