IMPALA-6238: Enhance TErrorCode::DATASTREAM_SENDER_TIMEOUT message

This change augments the message of TErrorCode::DATASTREAM_SENDER_TIMEOUT to include the source (sender) address when KRPC is enabled. The source address is not readily available in Thrift, so it is left empty on that path. The new message also includes the destination plan node id, to disambiguate the case where there are multiple exchange nodes in a fragment instance.
Testing done: Confirmed the error message by testing with following options: "--stress_datastream_recvr_delay_ms=90000 datastream_sender_timeout_ms=1000" Change-Id: Ie3e83773fe6feda057296e7d5544690aa9271fa0 Reviewed-on: http://gerrit.cloudera.org:8080/8751 Reviewed-by: Michael Ho <k...@cloudera.com> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/impala/repo Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/e4a2f5d2 Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/e4a2f5d2 Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/e4a2f5d2 Branch: refs/heads/master Commit: e4a2f5d2123508dbd9281980a395d4f9e1851dd7 Parents: 66704f9 Author: Michael Ho <k...@cloudera.com> Authored: Sat Dec 2 19:49:48 2017 -0800 Committer: Impala Public Jenkins <impala-public-jenk...@gerrit.cloudera.org> Committed: Tue Dec 5 02:09:21 2017 +0000 ---------------------------------------------------------------------- be/src/runtime/data-stream-mgr.cc | 6 ++++-- be/src/runtime/data-stream-sender.cc | 3 ++- be/src/runtime/krpc-data-stream-mgr.cc | 5 +++-- be/src/runtime/krpc-data-stream-sender.cc | 3 ++- common/thrift/generate_error_codes.py | 4 ++-- 5 files changed, 13 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/impala/blob/e4a2f5d2/be/src/runtime/data-stream-mgr.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/data-stream-mgr.cc b/be/src/runtime/data-stream-mgr.cc index 503cd29..93c524e 100644 --- a/be/src/runtime/data-stream-mgr.cc +++ b/be/src/runtime/data-stream-mgr.cc @@ -186,7 +186,8 @@ Status DataStreamMgr::AddData(const TUniqueId& fragment_instance_id, // FindRecvrOrWait() timed out, which is unexpected and suggests a query setup error; // we return DATASTREAM_SENDER_TIMEOUT to trigger tear-down of the query. 
if (already_unregistered) return Status::OK(); - ErrorMsg msg(TErrorCode::DATASTREAM_SENDER_TIMEOUT, PrintId(fragment_instance_id)); + ErrorMsg msg(TErrorCode::DATASTREAM_SENDER_TIMEOUT, "", PrintId(fragment_instance_id), + dest_node_id); VLOG_QUERY << "DataStreamMgr::AddData(): " << msg.msg(); return Status::Expected(msg); } @@ -210,7 +211,8 @@ Status DataStreamMgr::CloseSender(const TUniqueId& fragment_instance_id, // Was not able to notify the receiver that this was the end of stream. Notify the // sender that this failed so that they can take appropriate action (i.e. failing // the query). - ErrorMsg msg(TErrorCode::DATASTREAM_SENDER_TIMEOUT, PrintId(fragment_instance_id)); + ErrorMsg msg(TErrorCode::DATASTREAM_SENDER_TIMEOUT, "", + PrintId(fragment_instance_id), dest_node_id); VLOG_QUERY << "DataStreamMgr::CloseSender(): " << msg.msg(); status = Status::Expected(msg); } http://git-wip-us.apache.org/repos/asf/impala/blob/e4a2f5d2/be/src/runtime/data-stream-sender.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/data-stream-sender.cc b/be/src/runtime/data-stream-sender.cc index cf749cd..c76e626 100644 --- a/be/src/runtime/data-stream-sender.cc +++ b/be/src/runtime/data-stream-sender.cc @@ -275,7 +275,8 @@ Status DataStreamSender::Channel::SendCurrentBatch() { Status DataStreamSender::Channel::GetSendStatus() { WaitForRpc(); if (!rpc_status_.ok()) { - LOG(ERROR) << "channel send status: " << rpc_status_.GetDetail(); + LOG(ERROR) << "channel send to " << TNetworkAddressToString(address_) << " failed: " + << rpc_status_.GetDetail(); } return rpc_status_; } http://git-wip-us.apache.org/repos/asf/impala/blob/e4a2f5d2/be/src/runtime/krpc-data-stream-mgr.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/krpc-data-stream-mgr.cc b/be/src/runtime/krpc-data-stream-mgr.cc index 7c36191..348b9ab 100644 --- a/be/src/runtime/krpc-data-stream-mgr.cc +++ 
b/be/src/runtime/krpc-data-stream-mgr.cc @@ -334,8 +334,9 @@ void KrpcDataStreamMgr::RespondToTimedOutSender(const std::unique_ptr<ContextTyp TUniqueId finst_id; finst_id.__set_lo(request->dest_fragment_instance_id().lo()); finst_id.__set_hi(request->dest_fragment_instance_id().hi()); - - ErrorMsg msg(TErrorCode::DATASTREAM_SENDER_TIMEOUT, PrintId(finst_id)); + string remote_addr = Substitute(" $0", ctx->rpc_context->remote_address().host()); + ErrorMsg msg(TErrorCode::DATASTREAM_SENDER_TIMEOUT, remote_addr, PrintId(finst_id), + ctx->request->dest_node_id()); VLOG_QUERY << msg.msg(); Status::Expected(msg).ToProto(ctx->response->mutable_status()); ctx->rpc_context->RespondSuccess(); http://git-wip-us.apache.org/repos/asf/impala/blob/e4a2f5d2/be/src/runtime/krpc-data-stream-sender.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/krpc-data-stream-sender.cc b/be/src/runtime/krpc-data-stream-sender.cc index 32e20cd..0c2a295 100644 --- a/be/src/runtime/krpc-data-stream-sender.cc +++ b/be/src/runtime/krpc-data-stream-sender.cc @@ -331,7 +331,8 @@ Status KrpcDataStreamSender::Channel::WaitForRpc(std::unique_lock<SpinLock>* loc DCHECK(!rpc_in_flight_); if (UNLIKELY(!rpc_status_.ok())) { - LOG(ERROR) << "channel send status: " << rpc_status_.GetDetail(); + LOG(ERROR) << "channel send to " << TNetworkAddressToString(address_) << " failed: " + << rpc_status_.GetDetail(); return rpc_status_; } return Status::OK(); http://git-wip-us.apache.org/repos/asf/impala/blob/e4a2f5d2/common/thrift/generate_error_codes.py ---------------------------------------------------------------------- diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py index bf1953e..b137b5e 100755 --- a/common/thrift/generate_error_codes.py +++ b/common/thrift/generate_error_codes.py @@ -224,8 +224,8 @@ error_codes = ( ("COMPRESSED_FILE_TRUNCATED", 70, "Unexpected end of compressed file. File may be truncated. 
file=$0"), - ("DATASTREAM_SENDER_TIMEOUT", 71, "Sender timed out waiting for receiver fragment " - "instance: $0"), + ("DATASTREAM_SENDER_TIMEOUT", 71, "Sender$0 timed out waiting for receiver fragment " + "instance: $1, dest node: $2"), ("KUDU_IMPALA_TYPE_MISSING", 72, "Kudu type $0 is not available in Impala."),