This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 2f99cf8583 GH-44046: [Python] Fix threading issues with borrowed refs 
and pandas (#44047)
2f99cf8583 is described below

commit 2f99cf85835c05e0d3d23d06592e6d2f6322aede
Author: Lysandros Nikolaou <[email protected]>
AuthorDate: Thu Sep 12 18:26:46 2024 +0300

    GH-44046: [Python] Fix threading issues with borrowed refs and pandas 
(#44047)
    
    
    
    ### Rationale for this change
    
    Fix threading bugs that could leads to races under the free-threaded build.
    
    ### What changes are included in this PR?
    
    - Use `PySequence_ITEM` instead of the `Fast` variant on lists under the 
free-threaded build.
    - Use `std::once_flag` to make sure that `pandas` staic data only gets 
initialized once.
    
    ### Are these changes tested?
    
    Yes.
    
    ### Are there any user-facing changes?
    
    No.
    
    * GitHub Issue: #44046
    
    Lead-authored-by: Lysandros Nikolaou <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 python/pyarrow/src/arrow/python/helpers.cc       | 38 ++++++++++++++++++------
 python/pyarrow/src/arrow/python/iterators.h      |  4 +++
 python/pyarrow/src/arrow/python/numpy_convert.cc | 12 ++++++++
 3 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/python/pyarrow/src/arrow/python/helpers.cc 
b/python/pyarrow/src/arrow/python/helpers.cc
index 18302e6fe0..ca89ebe9d8 100644
--- a/python/pyarrow/src/arrow/python/helpers.cc
+++ b/python/pyarrow/src/arrow/python/helpers.cc
@@ -22,6 +22,7 @@
 
 #include <cmath>
 #include <limits>
+#include <mutex>
 #include <sstream>
 #include <type_traits>
 
@@ -292,7 +293,15 @@ bool PyFloat_IsNaN(PyObject* obj) {
 
 namespace {
 
+// This needs a conditional, because using std::once_flag could introduce
+// a deadlock when the GIL is enabled. See
+// 
https://github.com/apache/arrow/commit/f69061935e92e36e25bb891177ca8bc4f463b272 
for
+// more info.
+#ifdef Py_GIL_DISABLED
+static std::once_flag pandas_static_initialized;
+#else
 static bool pandas_static_initialized = false;
+#endif
 
 // Once initialized, these variables hold borrowed references to Pandas static 
data.
 // We should not use OwnedRef here because Python destructors would be
@@ -304,15 +313,7 @@ static PyObject* pandas_Timestamp = nullptr;
 static PyTypeObject* pandas_NaTType = nullptr;
 static PyObject* pandas_DateOffset = nullptr;
 
-}  // namespace
-
-void InitPandasStaticData() {
-  // NOTE: This is called with the GIL held.  We needn't (and shouldn't,
-  // to avoid deadlocks) use an additional C++ lock (ARROW-10519).
-  if (pandas_static_initialized) {
-    return;
-  }
-
+void GetPandasStaticSymbols() {
   OwnedRef pandas;
 
   // Import pandas
@@ -321,11 +322,14 @@ void InitPandasStaticData() {
     return;
   }
 
+#ifndef Py_GIL_DISABLED
   // Since ImportModule can release the GIL, another thread could have
   // already initialized the static data.
   if (pandas_static_initialized) {
     return;
   }
+#endif
+
   OwnedRef ref;
 
   // set NaT sentinel and its type
@@ -355,9 +359,25 @@ void InitPandasStaticData() {
   if (ImportFromModule(pandas.obj(), "DateOffset", &ref).ok()) {
     pandas_DateOffset = ref.obj();
   }
+}
+
+}  // namespace
 
+#ifdef Py_GIL_DISABLED
+void InitPandasStaticData() {
+  std::call_once(pandas_static_initialized, GetPandasStaticSymbols);
+}
+#else
+void InitPandasStaticData() {
+  // NOTE: This is called with the GIL held.  We needn't (and shouldn't,
+  // to avoid deadlocks) use an additional C++ lock (ARROW-10519).
+  if (pandas_static_initialized) {
+    return;
+  }
+  GetPandasStaticSymbols();
   pandas_static_initialized = true;
 }
+#endif
 
 bool PandasObjectIsNull(PyObject* obj) {
   if (!MayHaveNaN(obj)) {
diff --git a/python/pyarrow/src/arrow/python/iterators.h 
b/python/pyarrow/src/arrow/python/iterators.h
index 8512276848..dd467f6ac4 100644
--- a/python/pyarrow/src/arrow/python/iterators.h
+++ b/python/pyarrow/src/arrow/python/iterators.h
@@ -67,7 +67,11 @@ inline Status VisitSequenceGeneric(PyObject* obj, int64_t 
offset, VisitorFunc&&
   }
 
   if (PySequence_Check(obj)) {
+#ifdef Py_GIL_DISABLED
+    if (PyTuple_Check(obj)) {
+#else
     if (PyList_Check(obj) || PyTuple_Check(obj)) {
+#endif
       // Use fast item access
       const Py_ssize_t size = PySequence_Fast_GET_SIZE(obj);
       for (Py_ssize_t i = offset; keep_going && i < size; ++i) {
diff --git a/python/pyarrow/src/arrow/python/numpy_convert.cc 
b/python/pyarrow/src/arrow/python/numpy_convert.cc
index 5fd2cb511f..4113cc67d2 100644
--- a/python/pyarrow/src/arrow/python/numpy_convert.cc
+++ b/python/pyarrow/src/arrow/python/numpy_convert.cc
@@ -488,7 +488,13 @@ Status NdarraysToSparseCSFTensor(MemoryPool* pool, 
PyObject* data_ao, PyObject*
   std::vector<std::shared_ptr<Tensor>> indices(ndim);
 
   for (int i = 0; i < ndim - 1; ++i) {
+#ifdef Py_GIL_DISABLED
+    PyObject* item = PySequence_ITEM(indptr_ao, i);
+    RETURN_IF_PYERROR();
+    OwnedRef item_ref(item);
+#else
     PyObject* item = PySequence_Fast_GET_ITEM(indptr_ao, i);
+#endif
     if (!PyArray_Check(item)) {
       return Status::TypeError("Did not pass ndarray object for indptr");
     }
@@ -497,7 +503,13 @@ Status NdarraysToSparseCSFTensor(MemoryPool* pool, 
PyObject* data_ao, PyObject*
   }
 
   for (int i = 0; i < ndim; ++i) {
+#ifdef Py_GIL_DISABLED
+    PyObject* item = PySequence_ITEM(indices_ao, i);
+    RETURN_IF_PYERROR();
+    OwnedRef item_ref(item);
+#else
     PyObject* item = PySequence_Fast_GET_ITEM(indices_ao, i);
+#endif
     if (!PyArray_Check(item)) {
       return Status::TypeError("Did not pass ndarray object for indices");
     }

Reply via email to