This is an automated email from the ASF dual-hosted git repository.

raulcd pushed a commit to branch maint-15.0.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit fc586a091be82a19bb5b7f9f0a23ae7bb4e36ea6
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Jan 17 11:26:37 2024 +0100

    GH-39599: [Python] Avoid leaking references to Numpy dtypes (#39636)
    
    ### Rationale for this change
    
    `PyArray_DescrFromScalar` returns a new reference, so we should be careful 
to decref it when we don't use it anymore.
    
    ### Are these changes tested?
    
    No.
    
    ### Are there any user-facing changes?
    
    No.
    * Closes: #39599
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Joris Van den Bossche <[email protected]>
---
 python/pyarrow/array.pxi                           |  3 +-
 python/pyarrow/includes/libarrow_python.pxd        |  2 +-
 python/pyarrow/src/arrow/python/inference.cc       |  5 +-
 python/pyarrow/src/arrow/python/numpy_convert.cc   | 77 ++++++++++------------
 python/pyarrow/src/arrow/python/numpy_convert.h    |  6 +-
 python/pyarrow/src/arrow/python/numpy_to_arrow.cc  | 11 ++--
 python/pyarrow/src/arrow/python/python_to_arrow.cc |  6 +-
 python/pyarrow/types.pxi                           |  6 +-
 8 files changed, 48 insertions(+), 68 deletions(-)

diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 5c2d22aef1..1416f5f434 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -66,8 +66,7 @@ cdef shared_ptr[CDataType] _ndarray_to_type(object values,
     dtype = values.dtype
 
     if type is None and dtype != object:
-        with nogil:
-            check_status(NumPyDtypeToArrow(dtype, &c_type))
+        c_type = GetResultValue(NumPyDtypeToArrow(dtype))
 
     if type is not None:
         c_type = type.sp_type
diff --git a/python/pyarrow/includes/libarrow_python.pxd 
b/python/pyarrow/includes/libarrow_python.pxd
index e3179062a1..906f0b7d28 100644
--- a/python/pyarrow/includes/libarrow_python.pxd
+++ b/python/pyarrow/includes/libarrow_python.pxd
@@ -73,7 +73,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" 
nogil:
         object obj, object mask, const PyConversionOptions& options,
         CMemoryPool* pool)
 
-    CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type)
+    CResult[shared_ptr[CDataType]] NumPyDtypeToArrow(object dtype)
 
     CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo,
                            c_bool from_pandas,
diff --git a/python/pyarrow/src/arrow/python/inference.cc 
b/python/pyarrow/src/arrow/python/inference.cc
index 9537aec574..10116f9afa 100644
--- a/python/pyarrow/src/arrow/python/inference.cc
+++ b/python/pyarrow/src/arrow/python/inference.cc
@@ -468,10 +468,7 @@ class TypeInferrer {
     if (numpy_dtype_count_ > 0) {
       // All NumPy scalars and Nones/nulls
       if (numpy_dtype_count_ + none_count_ == total_count_) {
-        std::shared_ptr<DataType> type;
-        RETURN_NOT_OK(NumPyDtypeToArrow(numpy_unifier_.current_dtype(), 
&type));
-        *out = type;
-        return Status::OK();
+        return NumPyDtypeToArrow(numpy_unifier_.current_dtype()).Value(out);
       }
 
       // The "bad path": data contains a mix of NumPy scalars and
diff --git a/python/pyarrow/src/arrow/python/numpy_convert.cc 
b/python/pyarrow/src/arrow/python/numpy_convert.cc
index 4970680764..dfee88c092 100644
--- a/python/pyarrow/src/arrow/python/numpy_convert.cc
+++ b/python/pyarrow/src/arrow/python/numpy_convert.cc
@@ -59,12 +59,11 @@ NumPyBuffer::~NumPyBuffer() {
 
 #define TO_ARROW_TYPE_CASE(NPY_NAME, FACTORY) \
   case NPY_##NPY_NAME:                        \
-    *out = FACTORY();                         \
-    break;
+    return FACTORY();
 
 namespace {
 
-Status GetTensorType(PyObject* dtype, std::shared_ptr<DataType>* out) {
+Result<std::shared_ptr<DataType>> GetTensorType(PyObject* dtype) {
   if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) {
     return Status::TypeError("Did not pass numpy.dtype object");
   }
@@ -84,11 +83,8 @@ Status GetTensorType(PyObject* dtype, 
std::shared_ptr<DataType>* out) {
     TO_ARROW_TYPE_CASE(FLOAT16, float16);
     TO_ARROW_TYPE_CASE(FLOAT32, float32);
     TO_ARROW_TYPE_CASE(FLOAT64, float64);
-    default: {
-      return Status::NotImplemented("Unsupported numpy type ", 
descr->type_num);
-    }
   }
-  return Status::OK();
+  return Status::NotImplemented("Unsupported numpy type ", descr->type_num);
 }
 
 Status GetNumPyType(const DataType& type, int* type_num) {
@@ -120,15 +116,21 @@ Status GetNumPyType(const DataType& type, int* type_num) {
 
 }  // namespace
 
-Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr<DataType>* out) {
+Result<std::shared_ptr<DataType>> NumPyScalarToArrowDataType(PyObject* scalar) 
{
+  PyArray_Descr* descr = PyArray_DescrFromScalar(scalar);
+  OwnedRef descr_ref(reinterpret_cast<PyObject*>(descr));
+  return NumPyDtypeToArrow(descr);
+}
+
+Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyObject* dtype) {
   if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) {
     return Status::TypeError("Did not pass numpy.dtype object");
   }
   PyArray_Descr* descr = reinterpret_cast<PyArray_Descr*>(dtype);
-  return NumPyDtypeToArrow(descr, out);
+  return NumPyDtypeToArrow(descr);
 }
 
-Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out) 
{
+Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyArray_Descr* descr) {
   int type_num = fix_numpy_type_num(descr->type_num);
 
   switch (type_num) {
@@ -151,20 +153,15 @@ Status NumPyDtypeToArrow(PyArray_Descr* descr, 
std::shared_ptr<DataType>* out) {
           reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
       switch (date_dtype->meta.base) {
         case NPY_FR_s:
-          *out = timestamp(TimeUnit::SECOND);
-          break;
+          return timestamp(TimeUnit::SECOND);
         case NPY_FR_ms:
-          *out = timestamp(TimeUnit::MILLI);
-          break;
+          return timestamp(TimeUnit::MILLI);
         case NPY_FR_us:
-          *out = timestamp(TimeUnit::MICRO);
-          break;
+          return timestamp(TimeUnit::MICRO);
         case NPY_FR_ns:
-          *out = timestamp(TimeUnit::NANO);
-          break;
+          return timestamp(TimeUnit::NANO);
         case NPY_FR_D:
-          *out = date32();
-          break;
+          return date32();
         case NPY_FR_GENERIC:
           return Status::NotImplemented("Unbound or generic datetime64 time 
unit");
         default:
@@ -176,29 +173,22 @@ Status NumPyDtypeToArrow(PyArray_Descr* descr, 
std::shared_ptr<DataType>* out) {
           reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
       switch (timedelta_dtype->meta.base) {
         case NPY_FR_s:
-          *out = duration(TimeUnit::SECOND);
-          break;
+          return duration(TimeUnit::SECOND);
         case NPY_FR_ms:
-          *out = duration(TimeUnit::MILLI);
-          break;
+          return duration(TimeUnit::MILLI);
         case NPY_FR_us:
-          *out = duration(TimeUnit::MICRO);
-          break;
+          return duration(TimeUnit::MICRO);
         case NPY_FR_ns:
-          *out = duration(TimeUnit::NANO);
-          break;
+          return duration(TimeUnit::NANO);
         case NPY_FR_GENERIC:
           return Status::NotImplemented("Unbound or generic timedelta64 time 
unit");
         default:
           return Status::NotImplemented("Unsupported timedelta64 time unit");
       }
     } break;
-    default: {
-      return Status::NotImplemented("Unsupported numpy type ", 
descr->type_num);
-    }
   }
 
-  return Status::OK();
+  return Status::NotImplemented("Unsupported numpy type ", descr->type_num);
 }
 
 #undef TO_ARROW_TYPE_CASE
@@ -230,9 +220,8 @@ Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
     strides[i] = array_strides[i];
   }
 
-  std::shared_ptr<DataType> type;
-  RETURN_NOT_OK(
-      GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)), 
&type));
+  ARROW_ASSIGN_OR_RAISE(
+      auto type, 
GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray))));
   *out = std::make_shared<Tensor>(type, data, shape, strides, dim_names);
   return Status::OK();
 }
@@ -435,9 +424,9 @@ Status NdarraysToSparseCOOTensor(MemoryPool* pool, 
PyObject* data_ao, PyObject*
 
   PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
   std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
-  std::shared_ptr<DataType> type_data;
-  
RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)),
-                              &type_data));
+  ARROW_ASSIGN_OR_RAISE(
+      auto type_data,
+      GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data))));
 
   std::shared_ptr<Tensor> coords;
   RETURN_NOT_OK(NdarrayToTensor(pool, coords_ao, {}, &coords));
@@ -462,9 +451,9 @@ Status NdarraysToSparseCSXMatrix(MemoryPool* pool, 
PyObject* data_ao, PyObject*
 
   PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
   std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
-  std::shared_ptr<DataType> type_data;
-  
RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)),
-                              &type_data));
+  ARROW_ASSIGN_OR_RAISE(
+      auto type_data,
+      GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data))));
 
   std::shared_ptr<Tensor> indptr, indices;
   RETURN_NOT_OK(NdarrayToTensor(pool, indptr_ao, {}, &indptr));
@@ -491,9 +480,9 @@ Status NdarraysToSparseCSFTensor(MemoryPool* pool, 
PyObject* data_ao, PyObject*
   const int ndim = static_cast<const int>(shape.size());
   PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
   std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
-  std::shared_ptr<DataType> type_data;
-  
RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)),
-                              &type_data));
+  ARROW_ASSIGN_OR_RAISE(
+      auto type_data,
+      GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data))));
 
   std::vector<std::shared_ptr<Tensor>> indptr(ndim - 1);
   std::vector<std::shared_ptr<Tensor>> indices(ndim);
diff --git a/python/pyarrow/src/arrow/python/numpy_convert.h 
b/python/pyarrow/src/arrow/python/numpy_convert.h
index 10451077a2..2d1086e135 100644
--- a/python/pyarrow/src/arrow/python/numpy_convert.h
+++ b/python/pyarrow/src/arrow/python/numpy_convert.h
@@ -49,9 +49,11 @@ class ARROW_PYTHON_EXPORT NumPyBuffer : public Buffer {
 };
 
 ARROW_PYTHON_EXPORT
-Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr<DataType>* out);
+Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyObject* dtype);
 ARROW_PYTHON_EXPORT
-Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out);
+Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyArray_Descr* descr);
+ARROW_PYTHON_EXPORT
+Result<std::shared_ptr<DataType>> NumPyScalarToArrowDataType(PyObject* scalar);
 
 ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
                                            const std::vector<std::string>& 
dim_names,
diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc 
b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc
index 2727ce32f4..8903df31be 100644
--- a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc
+++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc
@@ -462,8 +462,7 @@ template <typename ArrowType>
 inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
   RETURN_NOT_OK(PrepareInputData<ArrowType>(data));
 
-  std::shared_ptr<DataType> input_type;
-  RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), 
&input_type));
+  ARROW_ASSIGN_OR_RAISE(auto input_type, NumPyDtypeToArrow(dtype_));
 
   if (!input_type->Equals(*type_)) {
     RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, 
null_count_, type_,
@@ -490,7 +489,7 @@ inline Status 
NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* d
       Status s = StaticCastBuffer<int64_t, int32_t>(**data, length_, pool_, 
data);
       RETURN_NOT_OK(s);
     } else {
-      RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), 
&input_type));
+      ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
       if (!input_type->Equals(*type_)) {
         // The null bitmap was already computed in VisitNative()
         RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, 
null_count_,
@@ -498,7 +497,7 @@ inline Status 
NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* d
       }
     }
   } else {
-    RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), 
&input_type));
+    ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
     if (!input_type->Equals(*type_)) {
       RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, 
null_count_,
                                type_, cast_options_, pool_, data));
@@ -531,7 +530,7 @@ inline Status 
NumPyConverter::ConvertData<Date64Type>(std::shared_ptr<Buffer>* d
       }
       *data = std::move(result);
     } else {
-      RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), 
&input_type));
+      ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
       if (!input_type->Equals(*type_)) {
         // The null bitmap was already computed in VisitNative()
         RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, 
null_count_,
@@ -539,7 +538,7 @@ inline Status 
NumPyConverter::ConvertData<Date64Type>(std::shared_ptr<Buffer>* d
       }
     }
   } else {
-    RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), 
&input_type));
+    ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
     if (!input_type->Equals(*type_)) {
       RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, 
null_count_,
                                type_, cast_options_, pool_, data));
diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc 
b/python/pyarrow/src/arrow/python/python_to_arrow.cc
index 23b92598e3..d1d94ac17a 100644
--- a/python/pyarrow/src/arrow/python/python_to_arrow.cc
+++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -386,8 +386,7 @@ class PyValue {
       }
     } else if (PyArray_CheckAnyScalarExact(obj)) {
       // validate that the numpy scalar has np.datetime64 dtype
-      std::shared_ptr<DataType> numpy_type;
-      RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), 
&numpy_type));
+      ARROW_ASSIGN_OR_RAISE(auto numpy_type, NumPyScalarToArrowDataType(obj));
       if (!numpy_type->Equals(*type)) {
         return Status::NotImplemented("Expected np.datetime64 but got: ",
                                       numpy_type->ToString());
@@ -466,8 +465,7 @@ class PyValue {
       }
     } else if (PyArray_CheckAnyScalarExact(obj)) {
       // validate that the numpy scalar has np.datetime64 dtype
-      std::shared_ptr<DataType> numpy_type;
-      RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), 
&numpy_type));
+      ARROW_ASSIGN_OR_RAISE(auto numpy_type, NumPyScalarToArrowDataType(obj));
       if (!numpy_type->Equals(*type)) {
         return Status::NotImplemented("Expected np.timedelta64 but got: ",
                                       numpy_type->ToString());
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 912ee39f7d..b6dc53d633 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -5140,12 +5140,8 @@ def from_numpy_dtype(object dtype):
     >>> pa.from_numpy_dtype(np.str_)
     DataType(string)
     """
-    cdef shared_ptr[CDataType] c_type
     dtype = np.dtype(dtype)
-    with nogil:
-        check_status(NumPyDtypeToArrow(dtype, &c_type))
-
-    return pyarrow_wrap_data_type(c_type)
+    return pyarrow_wrap_data_type(GetResultValue(NumPyDtypeToArrow(dtype)))
 
 
 def is_boolean_value(object obj):

Reply via email to