[
https://issues.apache.org/jira/browse/IMPALA-11653?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17621353#comment-17621353
]
Qifan Chen edited comment on IMPALA-11653 at 10/20/22 8:17 PM:
---------------------------------------------------------------
Before the very first communication with the client, impalad needs to setup the
connection by calling SetupConnection(). It is in this setup method that the
above "super long wait" can happen.
{code:java}
(gdb) bt
#0 0x00007f3b972ee20d in poll () from ./lib64/libc.so.6
#1 0x0000000002dcd5bc in apache::thrift::transport::TSocket::read(unsigned
char*, unsigned int) ()
#2 0x0000000002dd1803 in unsigned int
apache::thrift::transport::readAll<apache::thrift::transport::TSocket>(apache::thrift::transport::TSocket&,
unsigned char*, unsigned int) ()
#3 0x0000000001330cc9 in readAll (len=5, buf=0x7f3277ea4f8b "",
this=<optimized out>) at
../../../toolchain/toolchain-packages-gcc7.5.0/thrift-0.9.3-p8/include/thrift/transport/TTransport.h:121
#4 apache::thrift::transport::TSaslTransport::receiveSaslMessage
(this=this@entry=0x278a96b0, status=status@entry=0x7f3277ea500c,
length=length@entry=0x7f3277ea5008) at TSaslTransport.cpp:259
#5 0x000000000132db14 in
apache::thrift::transport::TSaslServerTransport::handleSaslStartMessage
(this=0x278a96b0) at TSaslServerTransport.cpp:95
#6 0x0000000001330e33 in
apache::thrift::transport::TSaslTransport::doSaslNegotiation (this=0x278a96b0)
at TSaslTransport.cpp:81
#7 0x000000000132e723 in open (this=0x12e29750) at
../../../toolchain/toolchain-packages-gcc7.5.0/thrift-0.9.3-p8/include/thrift/transport/TBufferTransports.h:218.
void open() { transport_->open(); }
132 void TSaslTransport::open() {
133 // Only client should open the underlying transport.
134 if (isClient_ && !transport_->isOpen()) {
135 transport_->open();
136 }
137
138 // Start the SASL negotiation protocol.
139 doSaslNegotiation();
140 }
#8 apache::thrift::transport::TSaslServerTransport::Factory::getTransport
(this=0xf825a70, trans=...) at TSaslServerTransport.cpp:173
#9 0x00000000010cd49d in
apache::thrift::server::TAcceptQueueServer::SetupConnection (this=0x174270c0,
entry=...) at TAcceptQueueServer.cpp:233
#10 0x00000000010cef4d in operator()
{code}
{code:java}
130 std::shared_ptr<TTransport> TSaslServerTransport::Factory::getTransport(
131 std::shared_ptr<TTransport> trans) {
132 // Thrift servers use both an input and an output transport to
communicate with
133 // clients. In principal, these can be different, but for SASL clients we
require them
134 // to be the same so that the authentication state is identical for
communication in
135 // both directions. In order to do this, we share the same TTransport
object for both
136 // input and output set in TAcceptQueueServer::SetupConnection.
137 std::shared_ptr<TBufferedTransport> ret_transport;
138 std::shared_ptr<TTransport> wrapped(
139 new TSaslServerTransport(serverDefinitionMap_, trans));
140 // Set socket timeouts to prevent TSaslServerTransport->open from
blocking the server
141 // from accepting new connections if a read/write blocks during the
handshake
142 TSocket* socket = static_cast<TSocket*>(trans.get());
143 socket->setRecvTimeout(FLAGS_sasl_connect_tcp_timeout_ms); <==
5min timeout for read() calls invoked indirectly at line 147: open()
144 socket->setSendTimeout(FLAGS_sasl_connect_tcp_timeout_ms);
145 ret_transport.reset(new TBufferedTransport(wrapped,
146
impala::ThriftServer::BufferedTransportFactory::DEFAULT_BUFFER_SIZE_BYTES));
147 ret_transport.get()->open();
148 // Reset socket timeout back to zero, so idle clients do not timeout
149 socket->setRecvTimeout(0);
150 socket->setSendTimeout(0);
151 return ret_transport;
152 }
153
154 }}}
155 #endif
transport/TSaslServerTransport.cpp
{code}
was (Author: sql_forever):
Before the very first communication with the client, impalad needs to setup the
connection by calling SetupConnection(). It is in this setup method that the
above "infinite wait" can happen.
{code:java}
(gdb) bt
#0 0x00007f3b972ee20d in poll () from ./lib64/libc.so.6
#1 0x0000000002dcd5bc in apache::thrift::transport::TSocket::read(unsigned
char*, unsigned int) ()
#2 0x0000000002dd1803 in unsigned int
apache::thrift::transport::readAll<apache::thrift::transport::TSocket>(apache::thrift::transport::TSocket&,
unsigned char*, unsigned int) ()
#3 0x0000000001330cc9 in readAll (len=5, buf=0x7f3277ea4f8b "",
this=<optimized out>) at
../../../toolchain/toolchain-packages-gcc7.5.0/thrift-0.9.3-p8/include/thrift/transport/TTransport.h:121
#4 apache::thrift::transport::TSaslTransport::receiveSaslMessage
(this=this@entry=0x278a96b0, status=status@entry=0x7f3277ea500c,
length=length@entry=0x7f3277ea5008) at TSaslTransport.cpp:259
#5 0x000000000132db14 in
apache::thrift::transport::TSaslServerTransport::handleSaslStartMessage
(this=0x278a96b0) at TSaslServerTransport.cpp:95
#6 0x0000000001330e33 in
apache::thrift::transport::TSaslTransport::doSaslNegotiation (this=0x278a96b0)
at TSaslTransport.cpp:81
#7 0x000000000132e723 in open (this=0x12e29750) at
../../../toolchain/toolchain-packages-gcc7.5.0/thrift-0.9.3-p8/include/thrift/transport/TBufferTransports.h:218.
void open() { transport_->open(); }
132 void TSaslTransport::open() {
133 // Only client should open the underlying transport.
134 if (isClient_ && !transport_->isOpen()) {
135 transport_->open();
136 }
137
138 // Start the SASL negotiation protocol.
139 doSaslNegotiation();
140 }
#8 apache::thrift::transport::TSaslServerTransport::Factory::getTransport
(this=0xf825a70, trans=...) at TSaslServerTransport.cpp:173
#9 0x00000000010cd49d in
apache::thrift::server::TAcceptQueueServer::SetupConnection (this=0x174270c0,
entry=...) at TAcceptQueueServer.cpp:233
#10 0x00000000010cef4d in operator()
{code}
{code:java}
130 std::shared_ptr<TTransport> TSaslServerTransport::Factory::getTransport(
131 std::shared_ptr<TTransport> trans) {
132 // Thrift servers use both an input and an output transport to
communicate with
133 // clients. In principal, these can be different, but for SASL clients we
require them
134 // to be the same so that the authentication state is identical for
communication in
135 // both directions. In order to do this, we share the same TTransport
object for both
136 // input and output set in TAcceptQueueServer::SetupConnection.
137 std::shared_ptr<TBufferedTransport> ret_transport;
138 std::shared_ptr<TTransport> wrapped(
139 new TSaslServerTransport(serverDefinitionMap_, trans));
140 // Set socket timeouts to prevent TSaslServerTransport->open from
blocking the server
141 // from accepting new connections if a read/write blocks during the
handshake
142 TSocket* socket = static_cast<TSocket*>(trans.get());
143 socket->setRecvTimeout(FLAGS_sasl_connect_tcp_timeout_ms); <==
5min timeout for read() calls invoked indirectly at line 147: open()
144 socket->setSendTimeout(FLAGS_sasl_connect_tcp_timeout_ms);
145 ret_transport.reset(new TBufferedTransport(wrapped,
146
impala::ThriftServer::BufferedTransportFactory::DEFAULT_BUFFER_SIZE_BYTES));
147 ret_transport.get()->open();
148 // Reset socket timeout back to zero, so idle clients do not timeout
149 socket->setRecvTimeout(0);
150 socket->setSendTimeout(0);
151 return ret_transport;
152 }
153
154 }}}
155 #endif
transport/TSaslServerTransport.cpp
{code}
> Identify and time out connections that are not from a supported Impala client
> more eagerly
> ------------------------------------------------------------------------------------------
>
> Key: IMPALA-11653
> URL: https://issues.apache.org/jira/browse/IMPALA-11653
> Project: IMPALA
> Issue Type: Improvement
> Affects Versions: Impala 4.1.0
> Reporter: Vincent Tran
> Assignee: Qifan Chen
> Priority: Major
> Attachments: simple_tcp_client.py
>
>
> When a tcp client opens a connection to an Impala client interface (hs2 or
> beeswax), the connection is accepted immediately after the 3-way handshake
> (SYN, SYN-ACK, ACK) and is queued forĀ
> *TAcceptQueueServer::SetupConnection()*. However, if the client sends
> nothing else, the ImpalaServer will block in
> *apache::thrift::transport::TSocket::read()* until the client sends a RST/FIN
> or until *sasl_connect_tcp_timeout_ms* elapses (which is by default, 5
> minutes).
> The connection setup thread stack trace can be observed below during this
> period.
> {noformat}
> (gdb) bt
> #0 0x00007f3b972ee20d in poll () from ./lib64/libc.so.6
> #1 0x0000000002dcd5bc in apache::thrift::transport::TSocket::read(unsigned
> char*, unsigned int) ()
> #2 0x0000000002dd1803 in unsigned int
> apache::thrift::transport::readAll<apache::thrift::transport::TSocket>(apache::thrift::transport::TSocket&,
> unsigned char*, unsigned int) ()
> #3 0x0000000001330cc9 in readAll (len=5, buf=0x7f3277ea4f8b "",
> this=<optimized out>) at
> ../../../toolchain/toolchain-packages-gcc7.5.0/thrift-0.9.3-p8/include/thrift/transport/TTransport.h:121
> #4 apache::thrift::transport::TSaslTransport::receiveSaslMessage
> (this=this@entry=0x278a96b0, status=status@entry=0x7f3277ea500c,
> length=length@entry=0x7f3277ea5008) at TSaslTransport.cpp:259
> #5 0x000000000132db14 in
> apache::thrift::transport::TSaslServerTransport::handleSaslStartMessage
> (this=0x278a96b0) at TSaslServerTransport.cpp:95
> #6 0x0000000001330e33 in
> apache::thrift::transport::TSaslTransport::doSaslNegotiation
> (this=0x278a96b0) at TSaslTransport.cpp:81
> #7 0x000000000132e723 in open (this=0x12e29750) at
> ../../../toolchain/toolchain-packages-gcc7.5.0/thrift-0.9.3-p8/include/thrift/transport/TBufferTransports.h:218
> #8 apache::thrift::transport::TSaslServerTransport::Factory::getTransport
> (this=0xf825a70, trans=...) at TSaslServerTransport.cpp:173
> #9 0x00000000010cd49d in
> apache::thrift::server::TAcceptQueueServer::SetupConnection (this=0x174270c0,
> entry=...) at TAcceptQueueServer.cpp:233
> #10 0x00000000010cef4d in operator() (tid=<optimized out>, item=...,
> __closure=<optimized out>) at TAcceptQueueServer.cpp:323
> #11
> boost::detail::function::void_function_obj_invoker2<apache::thrift::server::TAcceptQueueServer::serve()::<lambda(int,
> const boost::shared_ptr<apache::thrift::server::TAcceptQueueEntry>&)>, void,
> int, const
> boost::shared_ptr<apache::thrift::server::TAcceptQueueEntry>&>::invoke(boost::detail::function::function_buffer
> &, int, const boost::shared_ptr<apache::thrift::server::TAcceptQueueEntry>
> &) (function_obj_ptr=..., a0=<optimized out>, a1=...)
> at
> ../../../toolchain/toolchain-packages-gcc7.5.0/boost-1.61.0-p2/include/boost/function/function_template.hpp:159
> #12 0x00000000010d3e59 in operator() (a1=..., a0=1, this=0x7f3279ea9510) at
> ../../../toolchain/toolchain-packages-gcc7.5.0/boost-1.61.0-p2/include/boost/function/function_template.hpp:770
> #13
> impala::ThreadPool<boost::shared_ptr<apache::thrift::server::TAcceptQueueEntry>
> >::WorkerThread (this=0x7f3279ea94c0, thread_id=1) at
> ../util/thread-pool.h:166
> #14 0x000000000144f8f2 in operator() (this=0x7f3277ea5b40) at
> ../../../toolchain/toolchain-packages-gcc7.5.0/boost-1.61.0-p2/include/boost/function/function_template.hpp:770
> #15 impala::Thread::SuperviseThread(std::__cxx11::basic_string<char,
> std::char_traits<char>, std::allocator<char> > const&,
> std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>
> > const&, boost::function<void ()>, impala::ThreadDebugInfo const*,
> impala::Promise<long, (impala::PromiseMode)0>*) (name=..., category=...,
> functor=..., parent_thread_info=<optimized out>,
> thread_started=0x7f3279ea9110) at thread.cc:360
> #16 0x0000000001450d6b in operator()<void (*)(const
> std::__cxx11::basic_string<char>&, const std::__cxx11::basic_string<char>&,
> boost::function<void()>, const impala::ThreadDebugInfo*, impala::Promise<long
> int>*), boost::_bi::list0> (a=<synthetic pointer>,
> f=@0x1417ccf8: 0x144f5f0
> <impala::Thread::SuperviseThread(std::__cxx11::basic_string<char,
> std::char_traits<char>, std::allocator<char> > const&,
> std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>
> > const&, boost::function<void ()>, impala::ThreadDebugInfo const*,
> impala::Promise<long, (impala::PromiseMode)0>*)>, this=0x1417cd00) at
> ../../../toolchain/toolchain-packages-gcc7.5.0/boost-1.61.0-p2/include/boost/bind/bind.hpp:531
> #17 operator() (this=0x1417ccf8) at
> ../../../toolchain/toolchain-packages-gcc7.5.0/boost-1.61.0-p2/include/boost/bind/bind.hpp:1222
> #18 boost::detail::thread_data<boost::_bi::bind_t<void, void
> (*)(std::__cxx11::basic_string<char, std::char_traits<char>,
> std::allocator<char> > const&, std::__cxx11::basic_string<char,
> std::char_traits<char>, std::allocator<char> > const&, boost::function<void
> ()>, impala::ThreadDebugInfo const*, impala::Promise<long,
> (impala::PromiseMode)0>*),
> boost::_bi::list5<boost::_bi::value<std::__cxx11::basic_string<char,
> std::char_traits<char>, std::allocator<char> > >,
> boost::_bi::value<std::__cxx11::basic_string<char, std::char_traits<char>,
> std::allocator<char> > >, boost::_bi::value<boost::function<void ()> >,
> boost::_bi::value<impala::ThreadDebugInfo*>,
> boost::_bi::value<impala::Promise<long, (impala::PromiseMode)0>*> > >
> >::run() (this=0x1417cb40)
> at
> ../../../toolchain/toolchain-packages-gcc7.5.0/boost-1.61.0-p2/include/boost/thread/detail/thread.hpp:116
> #19 0x0000000001ca17c2 in thread_proxy ()
> #20 0x00007f3b9a7f2dd5 in start_thread () from ./lib64/libpthread.so.0
> #21 0x00007f3b972f8ead in clone () from ./lib64/libc.so.6
> {noformat}
> As discussed in IMPALA-7638, we need to distinguish between a client that is
> slow to finish sasl negotiation (due to slow Kerberos negotiation) from a
> client that is never going to do anything beyond the tcp handshake and time
> these out much sooner than the *sasl_connect_tcp_timeout_ms* duration.
> The logging pattern below captures some instances of these abnormal
> connections
> {noformat}
> I1007 12:16:07.636166 185038 TAcceptQueueServer.cpp:227] TAcceptQueueServer:
> hiveserver2-frontend started connection setup for client <Host: x.x.x.x Port:
> 32878>
> I1007 12:21:06.634896 185038 thrift-util.cc:96] TAcceptQueueServer:
> hiveserver2-frontend connection setup failed for client <Host: x.x.x.x Port:
> 32878>. Caught TException: No more data to read.
> ====
> I1007 12:16:23.488011 185039 TAcceptQueueServer.cpp:227] TAcceptQueueServer:
> hiveserver2-frontend started connection setup for client <Host: x.x.x.x Port:
> 33934>
> I1007 12:21:22.488610 185039 thrift-util.cc:96] TAcceptQueueServer:
> hiveserver2-frontend connection setup failed for client <Host: x.x.x.x Port:
> 33934>. Caught TException: No more data to read.
> {noformat}
> This instance of ImpalaServer was running with
> *accepted_cnxn_setup_thread_pool_size*=2. That means that both of these
> threads (185038 &185039) were tied up during this 5-minute window. Subsequent
> incoming client connections to the HS2 interface will be in the accept queue
> until one of these threads frees up. If more of those accepted connections
> do not start sasl negotiation, the problem will snowball.
> Attached a simple tcp client [^simple_tcp_client.py] that when called more
> than once in quick succession, will block port 21050 and cause otherwise
> legitimate connections from Impala-supported clients to wait in the "accept"
> queue for at least 5 minutes.
> {noformat}
> # python simple_tcp_client.py &
> [1] 19986
> 2022-10-12 11:28:16 INFO Created a tcp client
> 2022-10-12 11:28:16 INFO Connecting to: c908086-2.vpc.cloudera.com:21050
> 2022-10-12 11:28:16 INFO Client1 connected
> 2022-10-12 11:28:16 INFO Sleeping for 5 minutes
> # python simple_tcp_client.py &
> [2] 19989
> 2022-10-12 11:28:20 INFO Created a tcp client
> 2022-10-12 11:28:20 INFO Connecting to: c908086-2.vpc.cloudera.com:21050
> 2022-10-12 11:28:20 INFO Client1 connected
> 2022-10-12 11:28:20 INFO Sleeping for 5 minutes
> {noformat}
> {noformat}
> # impala-shell -i c908086-2.vpc.cloudera.com -d default -k --protocol=hs2
> Starting Impala Shell using Kerberos authentication
> Using service name 'impala'
> Socket error None: timed out
> ***********************************************************************************
> Welcome to the Impala shell.
> (Impala Shell v3.4.0-SNAPSHOT (a1dfdfd) built on Sun Aug 21 10:10:08 UTC 2022)
> When pretty-printing is disabled, you can use the '--output_delimiter' flag
> to set
> the delimiter for fields in the same row. The default is '\t'.
> ***********************************************************************************
> [Not connected] >
> {noformat}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]