Mostafa Mokhtar created IMPALA-6787:
---------------------------------------
Summary: On large secure clusters the connection setup thread
becomes a bottleneck at warmup and causes occasional timeout failures
Key: IMPALA-6787
URL: https://issues.apache.org/jira/browse/IMPALA-6787
Project: IMPALA
Issue Type: Bug
Components: Distributed Exec
Affects Versions: Impala 2.12.0
Reporter: Mostafa Mokhtar
On clusters with 200+ nodes, a single thread is not sufficient and ends up being a
bottleneck for a while, which appears to cause queries to fail with:
{code}
I0401 20:20:55.032140 1806361 thrift-util.cc:123] TSocket::open() connect()
<Host: va1007.foo.com Port: 22000>Connection timed out
I0401 20:20:55.032346 1806361 thrift-client.cc:78] Couldn't open transport for
va1007.foo.com:22000 (connect() failed: Connection timed out)
I0401 20:20:55.032364 1806361 thrift-client.cc:94] Unable to connect to
va1007.foo.com:22000
{code}
{code}
// Only using one thread here is sufficient for performance, and it avoids potential
// thread safety issues with the thrift code called in SetupConnection.
constexpr int CONNECTION_SETUP_POOL_SIZE = 1;
// New - this is the thread pool used to process the internal accept queue.
ThreadPool<shared_ptr<TTransport>> connection_setup_pool("setup-server",
"setup-worker",
CONNECTION_SETUP_POOL_SIZE, FLAGS_accepted_cnxn_queue_depth,
[this](int tid, const shared_ptr<TTransport>& item) {
this->SetupConnection(item);
});
{code}
{code}
#0 0x00007fd927de8e20 in krb5int_MD5Update () from /lib64/libk5crypto.so.3
#1 0x00007fd927de7bca in k5_md5_hash () from /lib64/libk5crypto.so.3
#2 0x00007fd927e01e32 in krb5int_hmac_keyblock () from /lib64/libk5crypto.so.3
#3 0x00007fd927dfc448 in usage_key.isra.2 () from /lib64/libk5crypto.so.3
#4 0x00007fd927dfc9fc in krb5int_arcfour_decrypt () from
/lib64/libk5crypto.so.3
#5 0x00007fd927df97e4 in krb5_k_decrypt () from /lib64/libk5crypto.so.3
#6 0x00007fd927df98bd in krb5_c_decrypt () from /lib64/libk5crypto.so.3
#7 0x00007fd9297191fb in rd_req_decoded_opt () from /lib64/libkrb5.so.3
#8 0x00007fd92971a1da in krb5_rd_req_decoded () from /lib64/libkrb5.so.3
#9 0x00007fd9282371df in kg_accept_krb5 () from /lib64/libgssapi_krb5.so.2
#10 0x00007fd9282388ca in krb5_gss_accept_sec_context_ext () from
/lib64/libgssapi_krb5.so.2
#11 0x00007fd928238a29 in krb5_gss_accept_sec_context () from
/lib64/libgssapi_krb5.so.2
#12 0x00007fd92822607a in gss_accept_sec_context () from
/lib64/libgssapi_krb5.so.2
#13 0x00007fd92653aedc in gssapi_server_mech_step () from
/usr/lib64/sasl2/libgssapiv2.so
#14 0x00007fd92bc27b9b in sasl_server_step () from /lib64/libsasl2.so.3
#15 0x0000000000caf3b1 in
sasl::TSaslServer::evaluateChallengeOrResponse(unsigned char const*, unsigned
int, unsigned int*) ()
#16 0x0000000000cb3040 in
apache::thrift::transport::TSaslTransport::doSaslNegotiation() ()
#17 0x0000000000cb1488 in
apache::thrift::transport::TSaslServerTransport::Factory::getTransport(boost::shared_ptr<apache::thrift::transport::TTransport>)
()
#18 0x0000000000b143c7 in
apache::thrift::server::TAcceptQueueServer::SetupConnection(boost::shared_ptr<apache::thrift::transport::TTransport>)
()
#19 0x0000000000b14eb2 in
boost::detail::function::void_function_obj_invoker2<apache::thrift::server::TAcceptQueueServer::serve()::{lambda(int,
boost::shared_ptr<apache::thrift::transport::TTransport> const&)#1}, void,
int, boost::shared_ptr<apache::thrift::transport::TTransport>
const&>::invoke(boost::detail::function::function_buffer&, int,
boost::shared_ptr<apache::thrift::transport::TTransport> const&) ()
#20 0x0000000000b17d79 in
impala::ThreadPool<boost::shared_ptr<apache::thrift::transport::TTransport>
>::WorkerThread(int) ()
#21 0x0000000000d6049f in impala::Thread::SuperviseThread(std::string const&,
std::string const&, boost::function<void ()>, impala::ThreadDebugInfo const*,
impala::Promise<long>*) ()
#22 0x0000000000d60c9a in boost::detail::thread_data<boost::_bi::bind_t<void,
void (*)(std::string const&, std::string const&, boost::function<void ()>,
impala::ThreadDebugInfo const*, impala::Promise<long>*),
boost::_bi::list5<boost::_bi::value<std::string>,
boost::_bi::value<std::string>, boost::_bi::value<boost::function<void ()> >,
boost::_bi::value<impala::ThreadDebugInfo*>,
boost::_bi::value<impala::Promise<long>*> > > >::run() ()
#23 0x00000000012d794a in thread_proxy ()
#24 0x00007fd928c7ddc5 in start_thread () from /lib64/libpthread.so.0
#25 0x00007fd9289aaced in clone () from /lib64/libc.so.6
{code}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)