This is an automated email from the ASF dual-hosted git repository. tmarshall pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit a47700ed790c2415e52a85e40063bed53a7cb9e8 Author: Vihang Karajgaonkar <[email protected]> AuthorDate: Fri Feb 19 15:30:07 2021 -0800 IMPALA-10450: Catalogd crashes due to exception in ThriftDebugString This patch adds a wrapper around ThriftDebugString method provided in the Thrift library. The thrift's method can throw exceptions like (bad_alloc or TProtocolException) when the object cannot be serialized into a string representation. This exception is not caught on the catalogd side and it crashes the catalogd. The error was specifically seen in the catalogd's debug UI which provides a way to display a Table object. An exception thrown when rendering the table on the UI would have crashed the catalogd before the patch. In order to simulate this crash a new debug action called EXCEPTION was added. A new custom cluster test was added which simulates a exception thrown in this method and makes sure that fetching the table from catalogd's debug UI does not crash the catalogd. Tests: 1. Added a new custom cluster test which reproduces the crash. 2. Created a large table which has ~270K partitions and reduced the memory of the catalogd to 16GB. This configuration throws bad_alloc exception in the ThriftDebugString method and crashes the catalogd. After the patch the crash is averted and we see a error message on the debug UI instead. I also looped around the catalog web UI call for more than an hour to see if there are any other stability issues. I could not see any problems. Change-Id: I42cee6186a3d5bacc1117bae5961ac60ac9f7a66 Reviewed-on: http://gerrit.cloudera.org:8080/17110 Reviewed-by: Vihang Karajgaonkar <[email protected]> Tested-by: Vihang Karajgaonkar <[email protected]> --- be/src/catalog/catalog-server.cc | 13 +++--- be/src/util/debug-util.cc | 14 ++++++ be/src/util/debug-util.h | 8 ++++ be/src/util/thrift-debug-util.h | 41 +++++++++++++++++ .../test_thrift_debug_string_exception.py | 51 ++++++++++++++++++++++ 5 files changed, 121 insertions(+), 6 deletions(-) diff --git a/be/src/catalog/catalog-server.cc b/be/src/catalog/catalog-server.cc index e14c982..9731dd3 100644 --- a/be/src/catalog/catalog-server.cc +++ b/be/src/catalog/catalog-server.cc @@ -32,6 +32,7 @@ #include "util/event-metrics.h" #include "util/logging-support.h" #include "util/metrics.h" +#include "util/thrift-debug-util.h" #include "util/webserver.h" #include "common/names.h" @@ -88,7 +89,6 @@ DEFINE_int64(topic_update_tbl_max_wait_time_ms, 500, "Maximum time " "table lock. A value of 0 disables the timeout based locking which means topic " "update thread will always block until table lock is acquired."); - DECLARE_string(debug_actions); DECLARE_string(state_store_host); DECLARE_int32(state_store_subscriber_port); @@ -186,7 +186,7 @@ class CatalogServiceThriftIf : public CatalogServiceIf { Status status = catalog_server_->catalog()->GetCatalogObject(req.object_desc, &resp.catalog_object); if (!status.ok()) LOG(ERROR) << status.GetDetail(); - VLOG_RPC << "GetCatalogObject(): response=" << ThriftDebugString(resp); + VLOG_RPC << "GetCatalogObject(): response=" << ThriftDebugStringNoThrow(resp); } void GetPartialCatalogObject(TGetPartialCatalogObjectResponse& resp, @@ -204,7 +204,7 @@ class CatalogServiceThriftIf : public CatalogServiceIf { TStatus thrift_status; status.ToThrift(&thrift_status); resp.__set_status(thrift_status); - VLOG_RPC << "GetPartialCatalogObject(): response=" << ThriftDebugString(resp); + VLOG_RPC << "GetPartialCatalogObject(): response=" << ThriftDebugStringNoThrow(resp); } void GetPartitionStats(TGetPartitionStatsResponse& resp, @@ -215,7 +215,7 @@ class CatalogServiceThriftIf : public CatalogServiceIf { TStatus thrift_status; status.ToThrift(&thrift_status); resp.__set_status(thrift_status); - VLOG_RPC << "GetPartitionStats(): response=" << ThriftDebugString(resp); + VLOG_RPC << "GetPartitionStats(): response=" << ThriftDebugStringNoThrow(resp); } // Prioritizes the loading of metadata for one or more catalog objects. Currently only @@ -662,8 +662,9 @@ void CatalogServer::CatalogObjectsUrlCallback(const Webserver::WebRequest& req, // Get the object and dump its contents. TCatalogObject result; if (status.ok()) status = catalog_->GetCatalogObject(request, &result); - if (status.ok()) { - Value debug_string(ThriftDebugString(result).c_str(), document->GetAllocator()); + if(status.ok()) { + Value debug_string( + ThriftDebugStringNoThrow(result).c_str(), document->GetAllocator()); document->AddMember("thrift_string", debug_string, document->GetAllocator()); } } diff --git a/be/src/util/debug-util.cc b/be/src/util/debug-util.cc index 1ca8fd6..de43741 100644 --- a/be/src/util/debug-util.cc +++ b/be/src/util/debug-util.cc @@ -426,6 +426,20 @@ Status DebugActionImpl( ImpaladMetrics::DEBUG_ACTION_NUM_FAIL->Increment(1l); } return Status(TErrorCode::INTERNAL_ERROR, error_msg); + } else if (iequals(cmd, "EXCEPTION")) { + //EXCEPTION@<exception_type> + if (tokens.size() != 2) { + return Status(Substitute(ERROR_MSG, components[0], action_str, + "expected EXCEPTION@<exception_type>")); + } + static const auto end = EXCEPTION_STR_MAP.end(); + auto it = EXCEPTION_STR_MAP.find(tokens[1]); + if (it != end) { + it->second(); + } else { + return Status( + Substitute(ERROR_MSG, components[0], action_str, "Invalid exception type")); + } } else { DCHECK(false) << "Invalid debug action"; return Status(Substitute(ERROR_MSG, components[0], action_str, "invalid command")); diff --git a/be/src/util/debug-util.h b/be/src/util/debug-util.h index e477acb..0165093 100644 --- a/be/src/util/debug-util.h +++ b/be/src/util/debug-util.h @@ -21,6 +21,7 @@ #include <vector> #include <thrift/protocol/TDebugProtocol.h> +#include <thrift/Thrift.h> #include "common/compiler-util.h" #include "common/config.h" @@ -190,6 +191,13 @@ static inline void DebugActionNoFail( DebugActionNoFail(query_options.debug_action, label); } +/// Map of exception string to the exception throwing function which is used when +/// executing the EXCEPTION debug action. +static const std::unordered_map<std::string,std::function<void()>> EXCEPTION_STR_MAP { + {"exception", [](){ throw std::exception(); }}, + {"bad_alloc", [](){ throw std::bad_alloc(); }}, + {"TException", [](){ throw apache::thrift::TException(); }} +}; // FILE_CHECKs are conditions that we expect to be true but could fail due to a malformed // input file. They differentiate these cases from DCHECKs, which indicate conditions that // are true unless there's a bug in Impala. We would ideally always return a bad Status diff --git a/be/src/util/thrift-debug-util.h b/be/src/util/thrift-debug-util.h new file mode 100644 index 0000000..8169837 --- /dev/null +++ b/be/src/util/thrift-debug-util.h @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once +#include <string> +#include <thrift/protocol/TDebugProtocol.h> +#include "util/debug-util.h" + +DECLARE_string(debug_actions); + +namespace impala { + + /// This method is a wrapper over Thrift's ThriftDebugString. It catches any exception + /// that might be thrown by the underlying Thrift method. Typically, we use this method + /// when we suspect the ThriftStruct could be a large object like a HdfsTable whose + /// string representation could have significantly larger memory requirements than the + /// binary representation. + template<typename ThriftStruct> + std::string ThriftDebugStringNoThrow(ThriftStruct ts) noexcept { + try { + Status status = DebugAction(FLAGS_debug_actions, "THRIFT_DEBUG_STRING"); + return apache::thrift::ThriftDebugString(ts); + } catch (std::exception& e) { + return "Unexpected exception received: " + std::string(e.what()); + } + } +} diff --git a/tests/custom_cluster/test_thrift_debug_string_exception.py b/tests/custom_cluster/test_thrift_debug_string_exception.py new file mode 100644 index 0000000..af03abd --- /dev/null +++ b/tests/custom_cluster/test_thrift_debug_string_exception.py @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from tests.common.custom_cluster_test_suite import CustomClusterTestSuite + + +class TestThriftDebugStringExceptions(CustomClusterTestSuite): + """Regression tests for IMPALA-10450""" + + @CustomClusterTestSuite.with_args( + catalogd_args="--debug_actions=THRIFT_DEBUG_STRING:EXCEPTION@bad_alloc") + def test_thrift_debug_str_bad_alloc(self): + """The test executes a API call to get a catalog object from the debug UI and makes + sure that catalogd does not crash if the ThriftDebugString throws bad_alloc + exception.""" + obj = self._get_catalog_object() + assert "Unexpected exception received" in obj + + @CustomClusterTestSuite.with_args( + catalogd_args="--debug_actions=THRIFT_DEBUG_STRING:EXCEPTION@TException") + def test_thrift_debug_str_texception(self): + """The test executes a API call to get a catalog object from the debug UI and makes + sure that catalogd does not crash if the ThriftDebugString throws a TException.""" + obj = self._get_catalog_object() + assert "Unexpected exception received" in obj + + @CustomClusterTestSuite.with_args() + def test_thrift_debug_str(self): + """Sanity test which executes API call to get a catalog object and make sure that + it does not return a error message under normal circumstances.""" + obj = self._get_catalog_object() + assert "Unexpected exception received" not in obj + + def _get_catalog_object(self): + """ Return the catalog object of functional.alltypes serialized to string. """ + return self.cluster.catalogd.service.read_debug_webpage( + "catalog_object?object_type=TABLE&object_name=functional.alltypes")
