[kudu-admin-test] improvements on error reporting This changelist introduces improvements on logging in case if an assertion triggers in the rebalancer-related scenarios.
I found it's hard to troubleshoot the flakiness reported lately in the RebalancerAndSingleReplicaTablets.SingleReplicasStayOrMove scenario. With hindsight, it turned out that improved logging on assertions would help a lot. Change-Id: Ibfa74d5a78e89bf7ff1a0e914384999466460145 Reviewed-on: http://gerrit.cloudera.org:8080/11046 Reviewed-by: Andrew Wong <aw...@cloudera.com> Tested-by: Alexey Serbin <aser...@cloudera.com> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/660ee80b Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/660ee80b Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/660ee80b Branch: refs/heads/master Commit: 660ee80bb4c0f81cb6f0a08365a7e2abd0a3e409 Parents: 7e733a7 Author: Alexey Serbin <aser...@cloudera.com> Authored: Tue Jul 24 20:20:00 2018 -0700 Committer: Alexey Serbin <aser...@cloudera.com> Committed: Mon Jul 30 15:06:44 2018 +0000 ---------------------------------------------------------------------- src/kudu/tools/kudu-admin-test.cc | 80 ++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 33 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/660ee80b/src/kudu/tools/kudu-admin-test.cc ---------------------------------------------------------------------- diff --git a/src/kudu/tools/kudu-admin-test.cc b/src/kudu/tools/kudu-admin-test.cc index 21bae3b..0408e35 100644 --- a/src/kudu/tools/kudu-admin-test.cc +++ b/src/kudu/tools/kudu-admin-test.cc @@ -48,7 +48,6 @@ #include "kudu/gutil/map-util.h" #include "kudu/gutil/strings/split.h" #include "kudu/gutil/strings/substitute.h" -#include "kudu/gutil/strings/util.h" #include "kudu/integration-tests/cluster_itest_util.h" #include "kudu/integration-tests/cluster_verifier.h" #include "kudu/integration-tests/test_workload.h" @@ -105,6 +104,7 @@ using std::atomic; using std::back_inserter; using std::copy; using std::deque; +using std::endl; using std::ostringstream; using std::string; using std::thread; @@ -1331,6 +1331,14 @@ TEST_F(AdminCliTest, TestListTablesDetail) { } } +static string ToolRunInfo(const Status& s, const string& out, const string& err) { + ostringstream str; + str << s.ToString() << endl; + str << "stdout: " << out << endl; + str << "stderr: " << err << endl; + return str.str(); +} + // Make sure the rebalancer doesn't start if a tablet server is down. class RebalanceStartCriteriaTest : public AdminCliTest, @@ -1359,13 +1367,14 @@ TEST_P(RebalanceStartCriteriaTest, TabletServerIsDown) { ts->Shutdown(); } + string out; string err; Status s = RunKuduTool({ "cluster", "rebalance", cluster_->master()->bound_rpc_addr().ToString() - }, nullptr, &err); - ASSERT_TRUE(s.IsRuntimeError()) << s.ToString(); + }, &out, &err); + ASSERT_TRUE(s.IsRuntimeError()) << ToolRunInfo(s, out, err); const auto err_msg_pattern = Substitute( "Illegal state: tablet server .* \\($0\\): " "unacceptable health status UNAVAILABLE", @@ -1501,7 +1510,7 @@ TEST_P(RebalanceParamTest, Rebalance) { string out; string err; const Status s = RunKuduTool(tool_args, &out, &err); - ASSERT_TRUE(s.ok()) << s.ToString() << ":" << err; + ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err); ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced") << "stderr: " << err; } @@ -1512,7 +1521,7 @@ TEST_P(RebalanceParamTest, Rebalance) { string out; string err; const Status s = RunKuduTool(tool_args, &out, &err); - ASSERT_TRUE(s.ok()) << s.ToString() << ":" << err; + ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err); ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced (moved 0 replicas)") << "stderr: " << err; @@ -1532,7 +1541,7 @@ TEST_P(RebalanceParamTest, Rebalance) { string out; string err; const Status s = RunKuduTool(tool_args, &out, &err); - ASSERT_TRUE(s.ok()) << s.ToString() << ":" << err; + ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err); ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced") << "stderr: " << err; // The cluster was un-balanced, so many replicas should have been moved. @@ -1570,6 +1579,8 @@ class RebalancingTest : } protected: + static const char* const kExitOnSignalStr; + void Prepare(const vector<string>& extra_tserver_flags = {}, const vector<string>& extra_master_flags = {}) { copy(extra_tserver_flags.begin(), extra_tserver_flags.end(), @@ -1616,6 +1627,7 @@ class RebalancingTest : vector<string> tserver_flags_; vector<string> master_flags_; }; +const char* const RebalancingTest::kExitOnSignalStr = "kudu: process exited on signal"; // Make sure the rebalancer is able to do its job if running concurrently // with DDL activity on the cluster. @@ -1781,7 +1793,7 @@ TEST_P(DDLDuringRebalancingTest, TablesCreatedAndDeletedDuringRebalancing) { string out; string err; const auto s = RunKuduTool(tool_args, &out, &err); - ASSERT_TRUE(s.ok()) << s.ToString() << ": " << err; + ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err); ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced") << "stderr: " << err; } @@ -1792,7 +1804,7 @@ TEST_P(DDLDuringRebalancingTest, TablesCreatedAndDeletedDuringRebalancing) { string out; string err; const auto s = RunKuduTool(tool_args, &out, &err); - ASSERT_TRUE(s.ok()) << s.ToString() << ": " << err; + ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err); ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced (moved 0 replicas)") << "stderr: " << err; @@ -1829,21 +1841,18 @@ TEST_P(ConcurrentRebalancersTest, TwoConcurrentRebalancers) { }; const auto runner_func = [&]() { + string out; string err; - const auto s = RunKuduTool(tool_args, nullptr, &err); - - ostringstream os; - os << "rebalancer status: " << s.ToString(); - // One might expect a bad status returned: e.g., due to some race so - // the rebalancer didn't able to make progress for more than - // --max_staleness_interval_sec, etc. + const auto s = RunKuduTool(tool_args, &out, &err); if (!s.ok()) { - os << " : " << err; + // One might expect a bad status returned: e.g., due to some race so + // the rebalancer didn't able to make progress for more than + // --max_staleness_interval_sec, etc. + LOG(INFO) << "rebalancer run info: " << ToolRunInfo(s, out, err); } - LOG(INFO) << os.str(); // Should not exit on a signal: not expecting SIGSEGV, SIGABRT, etc. - return !MatchPattern(err, "*kudu: process exited on signal*"); + return s.ToString().find(kExitOnSignalStr) == string::npos; }; CountDownLatch start_synchronizer(1); @@ -1868,7 +1877,7 @@ TEST_P(ConcurrentRebalancersTest, TwoConcurrentRebalancers) { string out; string err; const auto s = RunKuduTool(tool_args, &out, &err); - ASSERT_TRUE(s.ok()) << s.ToString() << ": " << err; + ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err); ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced") << "stderr: " << err; } @@ -1881,7 +1890,7 @@ TEST_P(ConcurrentRebalancersTest, TwoConcurrentRebalancers) { string out; string err; const auto s = RunKuduTool(tool_args, &out, &err); - ASSERT_TRUE(s.ok()) << s.ToString() << ": " << err; + ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err); ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced (moved 0 replicas)") << "stderr: " << err; @@ -1921,13 +1930,14 @@ TEST_P(TserverGoesDownDuringRebalancingTest, TserverDown) { // Pre-condition: 'kudu cluster ksck' should be happy with the cluster state // shortly after initial setup. ASSERT_EVENTUALLY([&]() { + string out; string err; const auto s = RunKuduTool({ "cluster", "ksck", cluster_->master()->bound_rpc_addr().ToString(), - }, nullptr, &err); - ASSERT_TRUE(s.ok()) << "stderr: " << err; + }, &out, &err); + ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err); }); Random r(SeedRandom()); @@ -1960,13 +1970,13 @@ TEST_P(TserverGoesDownDuringRebalancingTest, TserverDown) { // server goes down. "--max_moves_per_server=1", }, &out, &err); - ASSERT_TRUE(s.IsRuntimeError()) << s.ToString(); + ASSERT_TRUE(s.IsRuntimeError()) << ToolRunInfo(s, out, err); + + // The rebalancer tool should not crash. + ASSERT_STR_NOT_CONTAINS(s.ToString(), kExitOnSignalStr); ASSERT_STR_MATCHES( err, "Illegal state: tablet server .* \\(.*\\): " "unacceptable health status UNAVAILABLE"); - - // The rebalancer tool should not crash. - ASSERT_STR_NOT_CONTAINS(err, "kudu: process exited on signal"); } run = false; @@ -2011,9 +2021,10 @@ TEST_P(TserverAddedDuringRebalancingTest, TserverStarts) { atomic<bool> run(true); thread runner([&]() { while (run) { + string out; string err; - const auto s = RunKuduTool(tool_args, nullptr, &err); - CHECK(s.ok()) << s.ToString() << "stderr: " << err; + const auto s = RunKuduTool(tool_args, &out, &err); + CHECK(s.ok()) << ToolRunInfo(s, out, err); } }); auto runner_cleanup = MakeScopedCleanup([&]() { @@ -2037,7 +2048,7 @@ TEST_P(TserverAddedDuringRebalancingTest, TserverStarts) { string out; string err; const auto s = RunKuduTool(tool_args, &out, &err); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err); ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced (moved 0 replicas)") << "stderr: " << err; @@ -2125,17 +2136,20 @@ TEST_P(RebalancerAndSingleReplicaTablets, SingleReplicasStayOrMove) { cluster_->master()->bound_rpc_addr().ToString(), Substitute("--move_single_replicas=$0", move_single_replica), }, &out, &err); - ASSERT_TRUE(s.ok()) << s.ToString() << ":" << err; - ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced"); + ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err); + ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced") + << "stderr: " << err; if (move_single_replica == "enabled" || (move_single_replica == "auto" && is_343_scheme)) { // Should move appropriate replicas of single-replica tablets. ASSERT_STR_NOT_CONTAINS(out, - "rebalancing is complete: cluster is balanced (moved 0 replicas)"); + "rebalancing is complete: cluster is balanced (moved 0 replicas)") + << "stderr: " << err; ASSERT_STR_NOT_CONTAINS(err, "has single replica, skipping"); } else { ASSERT_STR_CONTAINS(out, - "rebalancing is complete: cluster is balanced (moved 0 replicas)"); + "rebalancing is complete: cluster is balanced (moved 0 replicas)") + << "stderr: " << err; ASSERT_STR_MATCHES(err, "tablet .* of table '.*' (.*) has single replica, skipping"); }