Repository: kudu Updated Branches: refs/heads/master 2453a6731 -> ead37caf3
rpc: show outbound call state in /rpcz dump Recently I've been looking at a stress cluster that is exhibiting lots of consensus request timeouts (eg KUDU-1788). It seems that many of the requests are timing out while the call is still in the process of being transferred, or in some cases not even sent yet. However, that wasn't obvious and took a lot of spelunking to figure out what was going on. This adds a new state to the OutboundCall state machine for 'SENDING', which is entered when we first start sending the request over the socket. As before, it transitions to 'SENT' once the request has been completely transferred. The state at the time of the timeout is now also put into the Status::TimedOut error string. It's a little bit ugly, but should be very useful to see when an issue is client-side or network-related rather than a server-side call timeout. Change-Id: Id52bc627a25be87a73b4b75941d7dcc2cf95eaba Reviewed-on: http://gerrit.cloudera.org:8080/5371 Tested-by: Kudu Jenkins Reviewed-by: Mike Percy <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/ead37caf Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/ead37caf Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/ead37caf Branch: refs/heads/master Commit: ead37caf3e6fc2b01346198176c21a19d2cf6224 Parents: 2453a67 Author: Todd Lipcon <[email protected]> Authored: Tue Dec 6 12:33:37 2016 +0800 Committer: Mike Percy <[email protected]> Committed: Wed Dec 7 16:02:05 2016 +0000 ---------------------------------------------------------------------- src/kudu/rpc/connection.cc | 2 ++ src/kudu/rpc/outbound_call.cc | 47 ++++++++++++++++++++++++++++--- src/kudu/rpc/outbound_call.h | 15 ++++++---- src/kudu/rpc/rpc_introspection.proto | 16 +++++++++++ 4 files changed, 71 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/ead37caf/src/kudu/rpc/connection.cc ---------------------------------------------------------------------- diff --git a/src/kudu/rpc/connection.cc b/src/kudu/rpc/connection.cc index d5c3080..8ce98a1 100644 --- a/src/kudu/rpc/connection.cc +++ b/src/kudu/rpc/connection.cc @@ -590,6 +590,8 @@ void Connection::WriteHandler(ev::io &watcher, int revents) { delete transfer; continue; } + + car->call->SetSending(); } } http://git-wip-us.apache.org/repos/asf/kudu/blob/ead37caf/src/kudu/rpc/outbound_call.cc ---------------------------------------------------------------------- diff --git a/src/kudu/rpc/outbound_call.cc b/src/kudu/rpc/outbound_call.cc index a26cbec..6952cc3 100644 --- a/src/kudu/rpc/outbound_call.cc +++ b/src/kudu/rpc/outbound_call.cc @@ -130,6 +130,8 @@ string OutboundCall::StateName(State state) { return "READY"; case ON_OUTBOUND_QUEUE: return "ON_OUTBOUND_QUEUE"; + case SENDING: + return "SENDING"; case SENT: return "SENT"; case TIMED_OUT: @@ -162,11 +164,16 @@ void OutboundCall::set_state_unlocked(State new_state) { case ON_OUTBOUND_QUEUE: DCHECK_EQ(state_, READY); break; + case SENDING: + // Allow SENDING to be set idempotently so we don't have to specifically check + // whether the state is transitioning in the RPC code. + DCHECK(state_ == ON_OUTBOUND_QUEUE || state_ == SENDING); + break; case SENT: - DCHECK_EQ(state_, ON_OUTBOUND_QUEUE); + DCHECK_EQ(state_, SENDING); break; case TIMED_OUT: - DCHECK(state_ == SENT || state_ == ON_OUTBOUND_QUEUE); + DCHECK(state_ == SENT || state_ == ON_OUTBOUND_QUEUE || state_ == SENDING); break; case FINISHED_SUCCESS: DCHECK_EQ(state_, SENT); @@ -233,6 +240,10 @@ void OutboundCall::SetQueued() { set_state(ON_OUTBOUND_QUEUE); } +void OutboundCall::SetSending() { + set_state(SENDING); +} + void OutboundCall::SetSent() { set_state(SENT); @@ -271,10 +282,11 @@ void OutboundCall::SetTimedOut() { { std::lock_guard<simple_spinlock> l(lock_); status_ = Status::TimedOut(Substitute( - "$0 RPC to $1 timed out after $2", + "$0 RPC to $1 timed out after $2 ($3)", remote_method_.method_name(), conn_id_.remote().ToString(), - timeout.ToString())); + timeout.ToString(), + StateName(state_))); set_state_unlocked(TIMED_OUT); } CallCallback(); @@ -289,6 +301,7 @@ bool OutboundCall::IsFinished() const { std::lock_guard<simple_spinlock> l(lock_); switch (state_) { case READY: + case SENDING: case ON_OUTBOUND_QUEUE: case SENT: return false; @@ -312,6 +325,32 @@ void OutboundCall::DumpPB(const DumpRunningRpcsRequestPB& req, resp->mutable_header()->CopyFrom(header_); resp->set_micros_elapsed( (MonoTime::Now() - start_time_).ToMicroseconds()); + + switch (state_) { + case READY: + // Don't bother setting a state for "READY" since we don't expose a call + // until it's at least on the queue of a connection. + break; + case ON_OUTBOUND_QUEUE: + resp->set_state(RpcCallInProgressPB::ON_OUTBOUND_QUEUE); + break; + case SENDING: + resp->set_state(RpcCallInProgressPB::SENDING); + break; + case SENT: + resp->set_state(RpcCallInProgressPB::SENT); + break; + case TIMED_OUT: + resp->set_state(RpcCallInProgressPB::TIMED_OUT); + break; + case FINISHED_ERROR: + resp->set_state(RpcCallInProgressPB::FINISHED_ERROR); + break; + case FINISHED_SUCCESS: + resp->set_state(RpcCallInProgressPB::FINISHED_SUCCESS); + break; + } + } /// http://git-wip-us.apache.org/repos/asf/kudu/blob/ead37caf/src/kudu/rpc/outbound_call.h ---------------------------------------------------------------------- diff --git a/src/kudu/rpc/outbound_call.h b/src/kudu/rpc/outbound_call.h index 45d02b2..bd5163e 100644 --- a/src/kudu/rpc/outbound_call.h +++ b/src/kudu/rpc/outbound_call.h @@ -172,6 +172,10 @@ class OutboundCall { // Callback after the call has been put on the outbound connection queue. void SetQueued(); + // Update the call state to show that the request has started being sent + // on the socket. + void SetSending(); + // Update the call state to show that the request has been sent. void SetSent(); @@ -229,11 +233,12 @@ class OutboundCall { // and OutboundCall::StateName(State state) as well. enum State { READY = 0, - ON_OUTBOUND_QUEUE = 1, - SENT = 2, - TIMED_OUT = 3, - FINISHED_ERROR = 4, - FINISHED_SUCCESS = 5 + ON_OUTBOUND_QUEUE, + SENDING, + SENT, + TIMED_OUT, + FINISHED_ERROR, + FINISHED_SUCCESS }; static std::string StateName(State state); http://git-wip-us.apache.org/repos/asf/kudu/blob/ead37caf/src/kudu/rpc/rpc_introspection.proto ---------------------------------------------------------------------- diff --git a/src/kudu/rpc/rpc_introspection.proto b/src/kudu/rpc/rpc_introspection.proto index 7718569..9c41bed 100644 --- a/src/kudu/rpc/rpc_introspection.proto +++ b/src/kudu/rpc/rpc_introspection.proto @@ -28,6 +28,22 @@ message RpcCallInProgressPB { required RequestHeader header = 1; optional string trace_buffer = 2; optional uint64 micros_elapsed = 3; + + enum State { + UNKNOWN = 999; + + // States for OutboundCall + ON_OUTBOUND_QUEUE = 1; + SENDING = 2; + SENT = 3; + TIMED_OUT = 4; + FINISHED_ERROR = 5; + FINISHED_SUCCESS = 6; + + // TODO(todd): add states for InboundCall + } + + optional State state = 4; } message RpcConnectionPB {
