Antoine Pitrou created ARROW-17124:
--------------------------------------
Summary: [C++] Data race between future signalling and destruction
Key: ARROW-17124
URL: https://issues.apache.org/jira/browse/ARROW-17124
Project: Apache Arrow
Issue Type: Bug
Components: C++
Reporter: Antoine Pitrou
I just ran into this sporadic ThreadSanitizer error:
{code}
WARNING: ThreadSanitizer: data race (pid=636020)
Write of size 8 at 0x7b2c000017d0 by main thread:
#0 pthread_cond_destroy
../../../../libsanitizer/tsan/tsan_interceptors_posix.cpp:1208
(libtsan.so.0+0x31c14)
#1 arrow::ConcreteFutureImpl::~ConcreteFutureImpl()
/home/antoine/arrow/dev/cpp/src/arrow/util/future.cc:211
(libarrow.so.900+0xa70b62)
#2 arrow::ConcreteFutureImpl::~ConcreteFutureImpl()
/home/antoine/arrow/dev/cpp/src/arrow/util/future.cc:211
(libarrow.so.900+0xa70ba0)
#3 std::default_delete<arrow::FutureImpl>::operator()(arrow::FutureImpl*)
const
/home/antoine/miniconda3/envs/pyarrow/x86_64-conda-linux-gnu/include/c++/10.3.0/bits/unique_ptr.h:85
(arrow-dataset-file-test+0x584a1)
#4 std::_Sp_counted_deleter<arrow::FutureImpl*,
std::default_delete<arrow::FutureImpl>, std::allocator<void>,
(__gnu_cxx::_Lock_policy)2>::_M_dispose()
/home/antoine/miniconda3/envs/pyarrow/x86_64-conda-linux-gnu/include/c++/10.3.0/bits/shared_ptr_base.h:474
(arrow-dataset-file-test+0xa9638)
#5 std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release() <null>
(libarrow.so.900+0x2e1158)
#6 std::__shared_count<(__gnu_cxx::_Lock_policy)2>::~__shared_count()
<null> (libarrow.so.900+0x2dc6ed)
#7 std::__shared_ptr<arrow::FutureImpl,
(__gnu_cxx::_Lock_policy)2>::~__shared_ptr() <null> (libarrow.so.900+0x978fee)
#8 std::shared_ptr<arrow::FutureImpl>::~shared_ptr() <null>
(libarrow.so.900+0x97901c)
#9 arrow::Future<arrow::internal::Empty>::~Future() <null>
(libarrow.so.900+0x97904a)
#10 ~ExecPlanImpl
/home/antoine/arrow/dev/cpp/src/arrow/compute/exec/exec_plan.cc:52
(libarrow.so.900+0xe8160b)
#11 ~ExecPlanImpl
/home/antoine/arrow/dev/cpp/src/arrow/compute/exec/exec_plan.cc:58
(libarrow.so.900+0xe8166e)
#12 _M_dispose
/home/antoine/miniconda3/envs/pyarrow/x86_64-conda-linux-gnu/include/c++/10.3.0/bits/shared_ptr_base.h:380
(libarrow.so.900+0xea6c2a)
#13 std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release() <null>
(libarrow_dataset.so.900+0x7bd10)
#14 std::__shared_count<(__gnu_cxx::_Lock_policy)2>::~__shared_count()
/home/antoine/miniconda3/envs/pyarrow/x86_64-conda-linux-gnu/include/c++/10.3.0/bits/shared_ptr_base.h:733
(libarrow_dataset.so.900+0x77ad9)
#15 std::__shared_ptr<arrow::compute::ExecPlan,
(__gnu_cxx::_Lock_policy)2>::~__shared_ptr()
/home/antoine/miniconda3/envs/pyarrow/x86_64-conda-linux-gnu/include/c++/10.3.0/bits/shared_ptr_base.h:1183
(libarrow_dataset.so.900+0xd3dfc)
#16 std::shared_ptr<arrow::compute::ExecPlan>::~shared_ptr()
/home/antoine/miniconda3/envs/pyarrow/x86_64-conda-linux-gnu/include/c++/10.3.0/bits/shared_ptr.h:121
(libarrow_dataset.so.900+0xd3e2a)
#17
arrow::dataset::FileSystemDataset::Write(arrow::dataset::FileSystemDatasetWriteOptions
const&, std::shared_ptr<arrow::dataset::Scanner>)
/home/antoine/arrow/dev/cpp/src/arrow/dataset/file_base.cc:398
(libarrow_dataset.so.900+0xd49ca)
#18 arrow::dataset::TestFileSystemDataset_WriteProjected_Test::TestBody()
/home/antoine/arrow/dev/cpp/src/arrow/dataset/file_test.cc:330
(arrow-dataset-file-test+0x2e382)
#19 void
testing::internal::HandleExceptionsInMethodIfSupported<testing::Test,
void>(testing::Test*, void (testing::Test::*)(), char const*) <null>
(libgtest.so.1.11.0+0x5bd3d)
Previous read of size 8 at 0x7b2c000017d0 by thread T3:
#0 pthread_cond_broadcast
../../../../libsanitizer/tsan/tsan_interceptors_posix.cpp:1201
(libtsan.so.0+0x31b51)
#1 arrow::ConcreteFutureImpl::DoMarkFinishedOrFailed(arrow::FutureState)
/home/antoine/arrow/dev/cpp/src/arrow/util/future.cc:343
(libarrow.so.900+0xa6bee0)
#2 arrow::ConcreteFutureImpl::DoMarkFinished()
/home/antoine/arrow/dev/cpp/src/arrow/util/future.cc:232
(libarrow.so.900+0xa6b0f4)
#3 arrow::FutureImpl::MarkFinished()
/home/antoine/arrow/dev/cpp/src/arrow/util/future.cc:409
(libarrow.so.900+0xa6c83f)
#4
arrow::Future<arrow::internal::Empty>::DoMarkFinished(arrow::Result<arrow::internal::Empty>)
/home/antoine/arrow/dev/cpp/src/arrow/util/future.h:725
(libarrow.so.900+0x9cbf81)
#5 void
arrow::Future<arrow::internal::Empty>::MarkFinished<arrow::internal::Empty,
void>(arrow::Status) /home/antoine/arrow/dev/cpp/src/arrow/util/future.h:476
(libarrow.so.900+0x9c921c)
#6 operator()
/home/antoine/arrow/dev/cpp/src/arrow/compute/exec/exec_plan.cc:192
(libarrow.so.900+0xe82ee6)
#7 operator() /home/antoine/arrow/dev/cpp/src/arrow/util/future.h:522
(libarrow.so.900+0xea70a3)
{code}
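The trace shows pthread_cond_broadcast() in DoMarkFinishedOrFailed (thread T3)
racing with pthread_cond_destroy() in ~ConcreteFutureImpl (main thread).
I think the fix is simply to signal the condition variable while the mutex is
still locked (which might be slightly worse performance-wise).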
--
This message was sent by Atlassian Jira
(v8.20.10#820010)