This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new bdbf630 ARROW-4582: [Python/C++] Acquire the GIL on Py_INCREF
bdbf630 is described below
commit bdbf630041668b5503d15fb0a3bb0e1300c6f240
Author: Korn, Uwe <[email protected]>
AuthorDate: Fri Feb 15 09:44:45 2019 -0600
ARROW-4582: [Python/C++] Acquire the GIL on Py_INCREF
Minimal reproducing example:
```
import dask
import pandas as pd
import pyarrow as pa
import numpy as np
def segfault_me(df):
pa.Table.from_pandas(df, nthreads=1)
while True:
df = pd.DataFrame(
{"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET":
np.arange(10, 20)}
)
dask.compute([
dask.delayed(segfault_me)(df),
dask.delayed(segfault_me)(df),
dask.delayed(segfault_me)(df),
dask.delayed(segfault_me)(df),
dask.delayed(segfault_me)(df),
])
```
Segfaults are more likely when run under AddressSanitizer or on an otherwise
slow system with many cores. It is important that the same df is always passed
into the functions.
The issue was that the reference count of the underlying NumPy array was
increased at the same time by multiple threads. The decrease then happened
while holding the GIL, so the array was sometimes destroyed while still in use.
Author: Korn, Uwe <[email protected]>
Closes #3655 from xhochy/ARROW-4582 and squashes the following commits:
7f9838da5 <Korn, Uwe> docker-compose run clang-format
3d6e5eeb3 <Korn, Uwe> ARROW-4582: Acquire the GIL on Py_INCREF
---
cpp/src/arrow/python/numpy_convert.cc | 33 ++++++++++++++++++---------------
1 file changed, 18 insertions(+), 15 deletions(-)
diff --git a/cpp/src/arrow/python/numpy_convert.cc
b/cpp/src/arrow/python/numpy_convert.cc
index c73e0bc..02ce0b6 100644
--- a/cpp/src/arrow/python/numpy_convert.cc
+++ b/cpp/src/arrow/python/numpy_convert.cc
@@ -46,6 +46,7 @@ bool is_contiguous(PyObject* array) {
}
NumPyBuffer::NumPyBuffer(PyObject* ao) : Buffer(nullptr, 0) {
+ PyAcquireGIL lock;
arr_ = ao;
Py_INCREF(ao);
@@ -187,8 +188,6 @@ Status NumPyDtypeToArrow(PyArray_Descr* descr,
std::shared_ptr<DataType>* out) {
#undef TO_ARROW_TYPE_CASE
Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
std::shared_ptr<Tensor>* out) {
- PyAcquireGIL lock;
-
if (!PyArray_Check(ao)) {
return Status::TypeError("Did not pass ndarray object");
}
@@ -199,25 +198,29 @@ Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
std::shared_ptr<Tensor>*
int ndim = PyArray_NDIM(ndarray);
+ // This is also holding the GIL, so don't already draw it.
std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(ao);
std::vector<int64_t> shape(ndim);
std::vector<int64_t> strides(ndim);
- npy_intp* array_strides = PyArray_STRIDES(ndarray);
- npy_intp* array_shape = PyArray_SHAPE(ndarray);
- for (int i = 0; i < ndim; ++i) {
- if (array_strides[i] < 0) {
- return Status::Invalid("Negative ndarray strides not supported");
+ {
+ PyAcquireGIL lock;
+ npy_intp* array_strides = PyArray_STRIDES(ndarray);
+ npy_intp* array_shape = PyArray_SHAPE(ndarray);
+ for (int i = 0; i < ndim; ++i) {
+ if (array_strides[i] < 0) {
+ return Status::Invalid("Negative ndarray strides not supported");
+ }
+ shape[i] = array_shape[i];
+ strides[i] = array_strides[i];
}
- shape[i] = array_shape[i];
- strides[i] = array_strides[i];
- }
- std::shared_ptr<DataType> type;
- RETURN_NOT_OK(
- GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)),
&type));
- *out = std::make_shared<Tensor>(type, data, shape, strides);
- return Status::OK();
+ std::shared_ptr<DataType> type;
+ RETURN_NOT_OK(
+ GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)),
&type));
+ *out = std::make_shared<Tensor>(type, data, shape, strides);
+ return Status::OK();
+ }
}
Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor, PyObject* base,