This is an automated email from the ASF dual-hosted git repository.
achennaka pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new d14130142 KUDU-2439 address the issue up to some extent
d14130142 is described below
commit d1413014255df62a6e38550552a8596644aa0033
Author: Alexey Serbin <[email protected]>
AuthorDate: Wed Aug 6 20:09:28 2025 -0700
KUDU-2439 address the issue up to some extent
This changelist adds a work-around for the issue described in KUDU-2439.
It's based on the recently added provisions for calling OPENSSL_cleanup
explicitly with the library of versions 1.1.1 and newer.
Yes, it's a stop-gap, but it's better than nothing.
As for testing, I verified that the issue is gone after seeing its
manifestation with the frequency of about 1 in 30 runs of the kudu CLI
tool with RELEASE bits on RHEL8.8 x86_64 and RHEL9.2 x86_64 without
the patch. I ran the following for reproduction, changing 100rep
to 1000rep for verificaion runs:
./bin/kudu-tool-test --gtest_repeat=100
Without the patch, I saw core files left by the kudu CLI binary during
every 100rep run, where many of the core files would have stack traces
attributable to the JIRA item. With this patch, not a single crash
has been observed and no core files have been generated after many
100rep and a few 1000rep test runs with Kudu RELEASE bits.
NOTE: the issue no longer manifests itself in RELEASE builds,
but there is still something that makes kudu CLI sometimes crash
on exit in DEBUG builds, but the stack trace doesn't involve any
OpenSSL symbols: instead, it has mostly libtcmalloc symbols
and also involves __do_global_dtors_aux() from librocksdb
Change-Id: I472bb6b5a4edf2a1a03c0e3cdcd64e743b7e1e1f
Reviewed-on: http://gerrit.cloudera.org:8080/23262
Tested-by: Alexey Serbin <[email protected]>
Reviewed-by: Abhishek Chennaka <[email protected]>
---
src/kudu/rpc/messenger.cc | 8 ++++++++
src/kudu/rpc/messenger.h | 19 +++++++++++++------
src/kudu/util/before_and_after_main.h | 25 +++++++++++++++++++++++++
3 files changed, 46 insertions(+), 6 deletions(-)
diff --git a/src/kudu/rpc/messenger.cc b/src/kudu/rpc/messenger.cc
index c8a056b1a..c4532d41a 100644
--- a/src/kudu/rpc/messenger.cc
+++ b/src/kudu/rpc/messenger.cc
@@ -151,6 +151,8 @@ Status MessengerBuilder::Build(shared_ptr<Messenger>* msgr)
{
return Status::OK();
}
+std::atomic<uint32_t> Messenger::kInstanceCount_ = 0;
+
// See comment on Messenger::retain_self_ member.
void Messenger::AllExternalReferencesDropped() {
// The last external ref may have been dropped in the context of a task
@@ -393,6 +395,7 @@ Messenger::Messenger(const MessengerBuilder& bld)
reuseport_(bld.reuseport_),
acceptor_listen_backlog_(bld.acceptor_listen_backlog_),
retain_self_(this) {
+ kInstanceCount_.fetch_add(1, std::memory_order_release);
for (int i = 0; i < bld.num_reactors_; i++) {
reactors_.push_back(new Reactor(retain_self_, i, bld));
}
@@ -406,9 +409,14 @@ Messenger::Messenger(const MessengerBuilder& bld)
.Build(&server_negotiation_pool_));
}
+uint32_t Messenger::GetInstanceCount() {
+ return kInstanceCount_.load(std::memory_order_acquire);
+}
+
Messenger::~Messenger() {
CHECK_EQ(state_, kClosing) << "Should have already shut down";
STLDeleteElements(&reactors_);
+ kInstanceCount_.fetch_sub(1, std::memory_order_release);
}
Reactor* Messenger::RemoteToReactor(const Sockaddr& remote) const {
diff --git a/src/kudu/rpc/messenger.h b/src/kudu/rpc/messenger.h
index c7fe866d7..c11542b2c 100644
--- a/src/kudu/rpc/messenger.h
+++ b/src/kudu/rpc/messenger.h
@@ -16,6 +16,7 @@
// under the License.
#pragma once
+#include <atomic>
#include <cstdint>
#include <functional>
#include <memory>
@@ -318,16 +319,15 @@ class MessengerBuilder {
// one as a singleton, and then make calls using Proxy objects.
//
// See rpc-test.cc and rpc-bench.cc for example usages.
-class Messenger {
+class Messenger final {
public:
- friend class MessengerBuilder;
- friend class Proxy;
- friend class Reactor;
- friend class ReactorThread;
typedef std::vector<std::shared_ptr<AcceptorPool> > acceptor_vec_t;
typedef std::unordered_map<std::string, scoped_refptr<RpcService> >
RpcServicesMap;
- static const uint64_t UNKNOWN_CALL_ID = 0;
+ static constexpr const uint64_t UNKNOWN_CALL_ID = 0;
+
+ // TODO(KUDU-2439): remove this stop-gap method once the issue is addressed
+ static uint32_t GetInstanceCount();
~Messenger();
@@ -473,6 +473,10 @@ class Messenger {
const scoped_refptr<RpcService> rpc_service(const std::string& service_name)
const;
private:
+ friend class MessengerBuilder;
+ friend class Proxy;
+ friend class Reactor;
+ friend class ReactorThread;
FRIEND_TEST(TestRpc, TestConnectionKeepalive);
FRIEND_TEST(TestRpc, TestConnectionAlwaysKeepalive);
FRIEND_TEST(TestRpc, TestClientConnectionsMetrics);
@@ -500,6 +504,9 @@ class Messenger {
// any references. See 'retain_self_' for more info.
void AllExternalReferencesDropped();
+ // TODO(KUDU-2439): remove this stop-gap field once the issue is addressed
+ static std::atomic<uint32_t> kInstanceCount_;
+
// Get the total number of currently pending connections across all the RPC
// endpoints this messenger is bound to. This utility method returns -1
// if the information on the listened socket's backlog cannot be retrieved
diff --git a/src/kudu/util/before_and_after_main.h
b/src/kudu/util/before_and_after_main.h
index c83924f77..cf77c428b 100644
--- a/src/kudu/util/before_and_after_main.h
+++ b/src/kudu/util/before_and_after_main.h
@@ -76,6 +76,31 @@ static void module_init_openssl() {
}
static void module_fini_openssl() {
+#if !defined(KUDU_TEST_MAIN)
+ // This is a stop-gap to work around KUDU-2439. For the kudu CLI and
+ // kudu-{master,tserver} binaries, wait a bit for all Messenger instances
+ // to be shut down. Otherwise, if the asynchronous Messenger's shutdown
+ // process is still in progress, it might issue calls to the OpenSSL's
runtime
+ // when destructing the Messenger::tls_context_ field. Meanwhile, the OpenSSL
+ // library could be shut down already or in the process of doing so,
+ // and a data race like that would often end up in undefined behavior or
+ // a crash with SIGSEGV/SIGFPE/SIGABRT. Introducing a bit of latency into
+ // the kudu-{master,tserver} shutdown isn't a bit deal. Also, in the vast
+ // majority of the kudu CLI's use cases, it's better to incur an extra second
+ // of latency compared with a crash and inability to tell whether the tool
+ // succeeded or not by analyzing its exit code.
+ if (const auto mc = kudu::rpc::Messenger::GetInstanceCount(); mc != 0) {
+ RAW_VLOG(2, "waiting for %d Messengers to shut down", mc);
+ const auto deadline = kudu::MonoTime::Now() +
kudu::MonoDelta::FromSeconds(1);
+ while (kudu::MonoTime::Now() < deadline) {
+ SleepFor(kudu::MonoDelta::FromMilliseconds(50));
+ if (kudu::rpc::Messenger::GetInstanceCount() == 0) {
+ break;
+ }
+ }
+ }
+#endif // #if !defined(KUDU_TEST_MAIN) ...
+
// Call OPENSSL_cleanup() to release resources and clean up the global state
// of the library: it's applicable to OpenSSL 1.1.1 and newer versions.
// At this point, tcmalloc must still be operational.