This is an automated email from the ASF dual-hosted git repository. alexey pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git
commit d267b8b22191c0bb219a9a6a33c1dbecf3894dbe Author: Alexey Serbin <[email protected]> AuthorDate: Sun May 19 23:13:54 2019 -0700 [tool-test] add scenario for KUDU-2819 regressions Added a test scenario to check how ksck works in case of collecting information from many tablet servers in a Kudu cluster. This is to cover regressions for a race condition that existed in FetchInfoFromTabletServers() some time ago before it was fixed with d17f9fce6 (along with bringing other improvements). One manifestation of the race condition is described in KUDU-2819. With this test, I verified that the version prior to the above-mentioned patch d17f9fce6 was susceptible to the race as reported by sanitizers: ThreadSanitizer (100% failure rate): http://dist-test.cloudera.org/job?job_id=aserbin.1558485044.66856 AddressSanitizer (~1 out of 20 failed), reproducing double-free, heap-buffer-overflow, memory leaks and friends: http://dist-test.cloudera.org/job?job_id=aserbin.1558484756.62807 Also, I verified that by commenting out the guard that protects concurrent calls of std::vector::emplace_back() on 'warning_messages' container in FetchInfoFromTabletServers() method (src/kudu/tools/ksck.cc), both sanitizers were able to detect the race: ThreadSanitizer (100% failure rate): http://dist-test.cloudera.org/job?job_id=aserbin.1558485274.73242 AddressSanitizer (~1 out of 20 failed): http://dist-test.cloudera.org/job?job_id=aserbin.1558485217.70253 Change-Id: I16cf69b6f7d2fb59014df26601dfc30e124a52ee Reviewed-on: http://gerrit.cloudera.org:8080/13380 Tested-by: Alexey Serbin <[email protected]> Reviewed-by: Adar Dembo <[email protected]> --- src/kudu/tools/kudu-tool-test.cc | 41 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/kudu/tools/kudu-tool-test.cc b/src/kudu/tools/kudu-tool-test.cc index ed0bc68..9d445fb 100644 --- a/src/kudu/tools/kudu-tool-test.cc +++ b/src/kudu/tools/kudu-tool-test.cc @@ -4560,6 +4560,47 @@ TEST_F(ToolTest, 
TestGetFlags) { } } +// This is a synthetic test to provide coverage for regressions of KUDU-2819. +TEST_F(ToolTest, TabletServersWithUnusualFlags) { + // Run many tablet servers: it helps in detection of races, if any. +#if defined(THREAD_SANITIZER) + // In case of TSAN builds, it takes too long to wait for the start up of too + // many tablet servers. + static constexpr int kNumTabletServers = 32; +#else + // Run as many tablet servers in external minicluster as possible. + static constexpr int kNumTabletServers = 62; +#endif + ExternalMiniClusterOptions opts; + opts.num_tablet_servers = kNumTabletServers; + NO_FATALS(StartExternalMiniCluster(std::move(opts))); + + // Leave only one tablet server running, shutdown all others. + for (size_t i = 1; i < kNumTabletServers; ++i) { + cluster_->tablet_server(i)->Shutdown(); + } + + // The 'cluster ksck' tool should report on unavailable tablet servers. + const string& master_addr = cluster_->master()->bound_rpc_addr().ToString(); + { + string err; + Status s = RunActionStderrString( + Substitute("cluster ksck $0", master_addr), &err); + ASSERT_TRUE(s.IsRuntimeError()) << s.ToString(); + ASSERT_STR_CONTAINS(err, "Runtime error: ksck discovered errors"); + } + + // The 'cluster rebalance' tool should bail and report an error due to + // unavailability of tablet servers in the cluster. + { + string err; + Status s = RunActionStderrString( + Substitute("cluster rebalance $0", master_addr), &err); + ASSERT_TRUE(s.IsRuntimeError()) << s.ToString(); + ASSERT_STR_CONTAINS(err, "unacceptable health status UNAVAILABLE"); + } +} + TEST_F(ToolTest, TestParseStacks) { const string kDataPath = JoinPathSegments(GetTestExecutableDirectory(), "testdata/sample-diagnostics-log.txt");
