This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new bdbf630  ARROW-4582: [Python/C++] Acquire the GIL on Py_INCREF
bdbf630 is described below

commit bdbf630041668b5503d15fb0a3bb0e1300c6f240
Author: Korn, Uwe <[email protected]>
AuthorDate: Fri Feb 15 09:44:45 2019 -0600

    ARROW-4582: [Python/C++] Acquire the GIL on Py_INCREF
    
    Minimal reproducing example:
    
    ```
    import dask
    import pandas as pd
    import pyarrow as pa
    import numpy as np
    
    def segfault_me(df):
        pa.Table.from_pandas(df, nthreads=1)
    
    while True:
        df = pd.DataFrame(
            {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": 
np.arange(10, 20)}
        )
        dask.compute([
            dask.delayed(segfault_me)(df),
            dask.delayed(segfault_me)(df),
            dask.delayed(segfault_me)(df),
            dask.delayed(segfault_me)(df),
            dask.delayed(segfault_me)(df),
        ])
    ```
    
    Segfaults are more likely when run under AddressSanitizer or on an otherwise slow 
system with many cores. It is important that the same df is always passed into 
the functions.
    
    The issue was that the reference count of the underlying NumPy array was 
increased at the same time by multiple threads. The decrease then happened with 
the GIL held, so the array was sometimes destroyed while still in use.
    
    Author: Korn, Uwe <[email protected]>
    
    Closes #3655 from xhochy/ARROW-4582 and squashes the following commits:
    
    7f9838da5 <Korn, Uwe> docker-compose run clang-format
    3d6e5eeb3 <Korn, Uwe> ARROW-4582:  Acquire the GIL on Py_INCREF
---
 cpp/src/arrow/python/numpy_convert.cc | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/cpp/src/arrow/python/numpy_convert.cc 
b/cpp/src/arrow/python/numpy_convert.cc
index c73e0bc..02ce0b6 100644
--- a/cpp/src/arrow/python/numpy_convert.cc
+++ b/cpp/src/arrow/python/numpy_convert.cc
@@ -46,6 +46,7 @@ bool is_contiguous(PyObject* array) {
 }
 
 NumPyBuffer::NumPyBuffer(PyObject* ao) : Buffer(nullptr, 0) {
+  PyAcquireGIL lock;
   arr_ = ao;
   Py_INCREF(ao);
 
@@ -187,8 +188,6 @@ Status NumPyDtypeToArrow(PyArray_Descr* descr, 
std::shared_ptr<DataType>* out) {
 #undef TO_ARROW_TYPE_CASE
 
 Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, 
std::shared_ptr<Tensor>* out) {
-  PyAcquireGIL lock;
-
   if (!PyArray_Check(ao)) {
     return Status::TypeError("Did not pass ndarray object");
   }
@@ -199,25 +198,29 @@ Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, 
std::shared_ptr<Tensor>*
 
   int ndim = PyArray_NDIM(ndarray);
 
+  // This is also holding the GIL, so don't already draw it.
   std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(ao);
   std::vector<int64_t> shape(ndim);
   std::vector<int64_t> strides(ndim);
 
-  npy_intp* array_strides = PyArray_STRIDES(ndarray);
-  npy_intp* array_shape = PyArray_SHAPE(ndarray);
-  for (int i = 0; i < ndim; ++i) {
-    if (array_strides[i] < 0) {
-      return Status::Invalid("Negative ndarray strides not supported");
+  {
+    PyAcquireGIL lock;
+    npy_intp* array_strides = PyArray_STRIDES(ndarray);
+    npy_intp* array_shape = PyArray_SHAPE(ndarray);
+    for (int i = 0; i < ndim; ++i) {
+      if (array_strides[i] < 0) {
+        return Status::Invalid("Negative ndarray strides not supported");
+      }
+      shape[i] = array_shape[i];
+      strides[i] = array_strides[i];
     }
-    shape[i] = array_shape[i];
-    strides[i] = array_strides[i];
-  }
 
-  std::shared_ptr<DataType> type;
-  RETURN_NOT_OK(
-      GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)), 
&type));
-  *out = std::make_shared<Tensor>(type, data, shape, strides);
-  return Status::OK();
+    std::shared_ptr<DataType> type;
+    RETURN_NOT_OK(
+        GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)), 
&type));
+    *out = std::make_shared<Tensor>(type, data, shape, strides);
+    return Status::OK();
+  }
 }
 
 Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor, PyObject* base,

Reply via email to