This is an automated email from the ASF dual-hosted git repository.

pitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 7339a2995d GH-49272: [C++][CI] Fix intermittent segfault in 
arrow-json-test with MinGW (#49462)
7339a2995d is described below

commit 7339a2995daa3cd90f3c015a459a59fb65bc5c12
Author: Kumar Vanshaj <[email protected]>
AuthorDate: Thu May 7 18:18:29 2026 +0530

    GH-49272: [C++][CI] Fix intermittent segfault in arrow-json-test with MinGW 
(#49462)
    
    ### Rationale for this change
    
    The `arrow-json-test` intermittently segfaults on AMD64 Windows MinGW CI 
(both CLANG64 and MINGW64 environments), causing false CI failures. The crash 
occurs at 0.03 seconds into test execution during 
`ReaderTest.MultipleChunksParallel`. See #49272.
    
    Example failing runs:
    - 
https://github.com/apache/arrow/actions/runs/21956381694/job/63422678894?pr=49262
    - 
https://github.com/apache/arrow/actions/runs/21981450495/job/63504813441?pr=49259
    
    The root cause is MinGW's `__emutls` implementation for C++ `thread_local`, 
which has known race conditions during thread creation (upstream GCC bug 
[#78605](https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78605)). When 
`ThreadPool::LaunchWorkersUnlocked` creates a new worker thread, the thread 
immediately writes to the `thread_local` `current_thread_pool_` variable. If 
`__emutls` hasn't finished initializing TLS for the new thread, this 
dereferences an invalid pointer, causing a segfault.
    
    ### What changes are included in this PR?
    
    **Replace `thread_local` with native Win32 TLS API on MinGW 
(`thread_pool.cc`):** Uses `TlsAlloc`/`TlsGetValue`/`TlsSetValue` via Arrow's 
`windows_compatibility.h` instead of `thread_local` on MinGW to bypass the 
buggy `__emutls` emulation. Non-MinGW platforms are unchanged. `GetLastError()` 
is preserved around `TlsGetValue()` to avoid clobbering the caller's Windows 
error state.
    
    **Add `MultipleChunksParallelRegression` test (`reader_test.cc`):** 
Exercises the parallel JSON reading path to catch the original crash in CI. Run 
manually to stress-test: `while build/debug/arrow-json-test 
--gtest_filter=ReaderTest.MultipleChunksParallelRegression; do :; done`
    
    **Add `OwnsThisThreadPreservesLastError` test (`thread_pool_test.cc`):** 
Verifies that `OwnsThisThread()` does not clobber `GetLastError()` on Windows.
    
    ### Are these changes tested?
    
    Yes. A new regression test (`ReaderTest.MultipleChunksParallelRegression`) 
exercises the affected parallel JSON reading path. The existing 
`ReaderTest.MultipleChunksParallel` test also covers the affected code path.
    
    ### Are there any user-facing changes?
    
    No.
    
    **This PR contains a "Critical Fix".** The changes fix a bug that causes a 
crash (segfault) in `arrow-json-test` on MinGW Windows CI due to a race 
condition in MinGW's `__emutls` thread-local storage emulation during thread 
pool worker creation.
    
    * GitHub Issue: #49272
    
    Lead-authored-by: Kumar Vanshaj <[email protected]>
    Co-authored-by: vanshaj2023 <[email protected]>
    Co-authored-by: vanshaj2023 <[email protected]>
    Co-authored-by: vanshaj2023 <[email protected]>
    Co-authored-by: tadeja <[email protected]>
    Co-authored-by: Rossi Sun <[email protected]>
    Co-authored-by: Copilot <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/json/reader_test.cc      | 22 ++++++++++++
 cpp/src/arrow/util/thread_pool.cc      | 62 ++++++++++++++++++++++++++++++++--
 cpp/src/arrow/util/thread_pool_test.cc | 29 ++++++++++++++++
 3 files changed, 111 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/json/reader_test.cc 
b/cpp/src/arrow/json/reader_test.cc
index a52626413d..98bdac96c2 100644
--- a/cpp/src/arrow/json/reader_test.cc
+++ b/cpp/src/arrow/json/reader_test.cc
@@ -290,6 +290,28 @@ TEST(ReaderTest, MultipleChunksParallel) {
   AssertTablesEqual(*serial, *threaded);
 }
 
+// Regression test for intermittent threading crashes on MinGW.
+// Run this test multiple times manually to stress-test:
+//   while build/debug/arrow-json-test
+//       --gtest_filter=ReaderTest.MultipleChunksParallelRegression; do :; done
+// See https://github.com/apache/arrow/issues/49272
+TEST(ReaderTest, MultipleChunksParallelRegression) {
+  int64_t count = 1 << 10;
+  ReadOptions read_options;
+  read_options.block_size = static_cast<int>(count / 2);
+  read_options.use_threads = true;
+  ParseOptions parse_options;
+
+  std::string json;
+  for (int64_t i = 0; i < count; ++i) {
+    json += "{\"a\":" + std::to_string(i) + "}\n";
+  }
+
+  ASSERT_OK_AND_ASSIGN(auto table,
+                       ReadToTable(std::move(json), read_options, 
parse_options));
+  ASSERT_EQ(table->num_rows(), count);
+}
+
 TEST(ReaderTest, ListArrayWithFewValues) {
   // ARROW-7647
   ParseOptions parse_options;
diff --git a/cpp/src/arrow/util/thread_pool.cc 
b/cpp/src/arrow/util/thread_pool.cc
index bf107006f8..4fbce97c2f 100644
--- a/cpp/src/arrow/util/thread_pool.cc
+++ b/cpp/src/arrow/util/thread_pool.cc
@@ -31,6 +31,7 @@
 #include "arrow/util/io_util.h"
 #include "arrow/util/logging_internal.h"
 #include "arrow/util/mutex.h"
+#include "arrow/util/windows_compatibility.h"
 
 #include "arrow/util/tracing_internal.h"
 
@@ -630,9 +631,66 @@ void ThreadPool::CollectFinishedWorkersUnlocked() {
   state_->finished_workers_.clear();
 }
 
+// MinGW's __emutls implementation for C++ thread_local has known race 
conditions
+// during thread creation. When a new worker thread is spawned and immediately
+// writes to a thread_local variable (here: current_thread_pool_), __emutls may
+// not have finished initializing TLS for that thread, causing it to 
dereference
+// a stale/invalid pointer and segfault. This is a known upstream GCC/MinGW 
bug:
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78605
+// The crash surfaces specifically in arrow-json-test 
(ReaderTest.MultipleChunksParallel)
+// because that test creates a fresh ThreadPool and immediately dispatches 
work,
+// hitting the narrow startup race before __emutls can initialize. Other tests
+// that rely on the already-warmed global thread pool do not trigger this 
window.
+// Use native Win32 TLS (TlsAlloc/TlsGetValue/TlsSetValue) to bypass __emutls.
+// See also: https://github.com/apache/arrow/issues/49272
+#  ifdef __MINGW32__
+
+namespace {
+DWORD GetPoolTlsIndex() {
+  static DWORD index = [] {
+    DWORD i = TlsAlloc();
+    if (i == TLS_OUT_OF_INDEXES) {
+      ARROW_LOG(FATAL) << "TlsAlloc failed for thread pool TLS: "
+                       << WinErrorMessage(GetLastError());
+    }
+    return i;
+  }();
+  return index;
+}
+}  // namespace
+
+static ThreadPool* GetCurrentThreadPool() {
+  // Preserve the caller's last-error value while also detecting TLS failures.
+  DWORD original_error = GetLastError();
+  // Ensure a successful TlsGetValue() leaves GetLastError() == 0.
+  SetLastError(0);
+  auto* pool = static_cast<ThreadPool*>(TlsGetValue(GetPoolTlsIndex()));
+  DWORD tls_error = GetLastError();
+  if (tls_error != 0) {
+    // No need to restore original_error here: ARROW_LOG(FATAL) aborts the 
process.
+    ARROW_LOG(FATAL) << "TlsGetValue failed for thread pool TLS: "
+                     << WinErrorMessage(tls_error);
+  }
+  // Restore the caller's last-error value.
+  SetLastError(original_error);
+  return pool;
+}
+
+static void SetCurrentThreadPool(ThreadPool* pool) {
+  BOOL ok = TlsSetValue(GetPoolTlsIndex(), pool);
+  if (!ok) {
+    ARROW_LOG(FATAL) << "TlsSetValue failed for thread pool TLS: "
+                     << WinErrorMessage(GetLastError());
+  }
+}
+#  else
 thread_local ThreadPool* current_thread_pool_ = nullptr;
 
-bool ThreadPool::OwnsThisThread() { return current_thread_pool_ == this; }
+static ThreadPool* GetCurrentThreadPool() { return current_thread_pool_; }
+static void SetCurrentThreadPool(ThreadPool* pool) { current_thread_pool_ = 
pool; }
+#  endif
+
+bool ThreadPool::OwnsThisThread() { return GetCurrentThreadPool() == this; }
 
 void ThreadPool::LaunchWorkersUnlocked(int threads) {
   std::shared_ptr<State> state = sp_state_;
@@ -641,7 +699,7 @@ void ThreadPool::LaunchWorkersUnlocked(int threads) {
     state_->workers_.emplace_back();
     auto it = --(state_->workers_.end());
     *it = std::thread([this, state, it] {
-      current_thread_pool_ = this;
+      SetCurrentThreadPool(this);
       WorkerLoop(state, it);
     });
   }
diff --git a/cpp/src/arrow/util/thread_pool_test.cc 
b/cpp/src/arrow/util/thread_pool_test.cc
index 45441fa321..c1391c8be8 100644
--- a/cpp/src/arrow/util/thread_pool_test.cc
+++ b/cpp/src/arrow/util/thread_pool_test.cc
@@ -44,6 +44,7 @@
 #include "arrow/util/macros.h"
 #include "arrow/util/test_common.h"
 #include "arrow/util/thread_pool.h"
+#include "arrow/util/windows_compatibility.h"
 
 namespace arrow {
 namespace internal {
@@ -660,6 +661,34 @@ TEST_F(TestThreadPool, OwnsCurrentThread) {
   ASSERT_FALSE(one_failed);
 }
 
+#ifdef _WIN32
+TEST_F(TestThreadPool, OwnsThisThreadPreservesLastError) {
+#  ifndef ARROW_ENABLE_THREADING
+  GTEST_SKIP() << "Test requires threading support";
+#  endif
+  auto pool = this->MakeThreadPool(4);
+
+  // Verify from outside the pool: OwnsThisThread() must not clobber
+  // the calling thread's last-error state.
+  ::SetLastError(ERROR_FILE_NOT_FOUND);
+  ASSERT_FALSE(pool->OwnsThisThread());
+  ASSERT_EQ(::GetLastError(), static_cast<DWORD>(ERROR_FILE_NOT_FOUND));
+
+  // Verify from inside a pool thread.
+  std::atomic<bool> error_preserved{true};
+  ASSERT_OK(pool->Spawn([&] {
+    ::SetLastError(ERROR_ACCESS_DENIED);
+    ASSERT_TRUE(pool->OwnsThisThread());
+    if (::GetLastError() != ERROR_ACCESS_DENIED) {
+      error_preserved = false;
+    }
+  }));
+
+  ASSERT_OK(pool->Shutdown());
+  ASSERT_TRUE(error_preserved.load());
+}
+#endif
+
 TEST_F(TestThreadPool, StressSpawnThreaded) {
 #ifndef ARROW_ENABLE_THREADING
   GTEST_SKIP() << "Test requires threading support";

Reply via email to