This is an automated email from the ASF dual-hosted git repository. alexey pushed a commit to branch branch-1.11.x in repository https://gitbox.apache.org/repos/asf/kudu.git
commit a8733419fd43c488586172f71cab0892581146e8 Author: triplesheep <[email protected]> AuthorDate: Fri Nov 1 10:29:20 2019 +0800 KUDU-2987 Intra location rebalance crashes in special case. The crash manifested itself in cases where a Kudu cluster had a location that didn't host even a single replica of a tablet. Change-Id: Iea39472e55178fca688b390249432a0f7fefaaba Reviewed-on: http://gerrit.cloudera.org:8080/14608 Tested-by: Alexey Serbin <[email protected]> Reviewed-by: Alexey Serbin <[email protected]> (cherry picked from commit 0fea1cb8ede852a87efc422b394ffe8d1e89bc6c) Reviewed-on: http://gerrit.cloudera.org:8080/14686 Tested-by: Kudu Jenkins Reviewed-by: Grant Henke <[email protected]> --- src/kudu/tools/rebalancer_tool-test.cc | 119 ++++++++++++++++++++++++++++++++- src/kudu/tools/rebalancer_tool.cc | 4 +- 2 files changed, 120 insertions(+), 3 deletions(-) diff --git a/src/kudu/tools/rebalancer_tool-test.cc b/src/kudu/tools/rebalancer_tool-test.cc index 8a14ca0..18063ee 100644 --- a/src/kudu/tools/rebalancer_tool-test.cc +++ b/src/kudu/tools/rebalancer_tool-test.cc @@ -473,6 +473,7 @@ class RebalancingTest : public tserver::TabletServerIntegrationTestBase { tserver_unresponsive_ms_, created_tables_names)); } else { ASSERT_OK(CreateTablesExcludingLocations(empty_locations, + kTableNamePattern, created_tables_names)); } } @@ -481,8 +482,13 @@ class RebalancingTest : public tserver::TabletServerIntegrationTestBase { // tablet servers in the specified locations. This is similar to // CreateUnbalancedTables() but the set of tablet servers to avoid is defined // by the set of the specified locations. + // + // Note: 'table_name_pattern' is used to describe the table names created by this + // function. Each call to this function can create different batch of tables with + // this parameter set. Status CreateTablesExcludingLocations( const unordered_set<string>& excluded_locations, + const string& table_name_pattern = kTableNamePattern, vector<string>* table_names = nullptr) { // Shutdown all tablet servers in the specified locations so no tablet // replicas would be hosted by those servers. @@ -504,7 +510,7 @@ class RebalancingTest : public tserver::TabletServerIntegrationTestBase { // are available. SleepFor(MonoDelta::FromMilliseconds(5 * tserver_unresponsive_ms_ / 4)); RETURN_NOT_OK(CreateTables(cluster_.get(), client_.get(), schema_, - kTableNamePattern, num_tables_, rep_factor_, + table_name_pattern, num_tables_, rep_factor_, table_names)); // Start tablet servers at the excluded locations. if (!excluded_locations.empty()) { @@ -519,6 +525,44 @@ class RebalancingTest : public tserver::TabletServerIntegrationTestBase { return Status::OK(); } + // Create tables placing their tablet replicas elsewhere but not at the specified + // number of tservers in the specified locations. + Status CreateTablesExcludingTserversInLocations( + const unordered_map<string, int>& excluded_tserver_by_location, + const string& table_name_pattern = kTableNamePattern, + vector<string>* table_names = nullptr) { + unordered_map<string, int> excluded_tservers = excluded_tserver_by_location; + vector<string> excluded_tserver_uuids; + // Shutdown some tablet servers in the specified locations so no tablet + // replicas would be hosted by those servers. + if (!excluded_tservers.empty()) { + for (const auto& elem : tablet_servers_) { + auto* ts = elem.second; + if (ContainsKey(excluded_tservers, ts->location) + && excluded_tservers[ts->location] > 0) { + excluded_tserver_uuids.push_back(ts->uuid()); + --excluded_tservers[ts->location]; + cluster_->tablet_server_by_uuid(ts->uuid())->Shutdown(); + } + } + } + + for (const auto& elem : excluded_tservers) { + CHECK_EQ(0, elem.second); + } + // Wait for the catalog manager to understand that not all tablet servers + // are available. + SleepFor(MonoDelta::FromMilliseconds(5 * tserver_unresponsive_ms_ / 4)); + RETURN_NOT_OK(CreateTables(cluster_.get(), client_.get(), schema_, + table_name_pattern, num_tables_, rep_factor_, + table_names)); + // Start tablet servers at the excluded locations. + for (const auto& uuid : excluded_tserver_uuids) { + RETURN_NOT_OK(cluster_->tablet_server_by_uuid(uuid)->Restart()); + } + return Status::OK(); + } + // When the rebalancer starts moving replicas, ksck detects corruption // (that's why RuntimeError), seeing affected tables as non-healthy // with data state of corresponding tablets as TABLET_DATA_COPYING. If using @@ -1668,5 +1712,78 @@ TEST_P(LocationAwareRebalancingParamTest, Rebalance) { } } +// Basic intra location rebalance tests. +class IntraLocationRebalancingBasicTest : public RebalancingTest { +public: + IntraLocationRebalancingBasicTest() + : RebalancingTest(/*num_tables=*/ 1, + /*rep_factor=*/ 3, + /*num_tservers=*/ 11) { + } + + bool is_343_scheme() const override { + return true; + } +}; + +TEST_F(IntraLocationRebalancingBasicTest, LocationsWithEmptyTabletServers) { + if (!AllowSlowTests()) { + LOG(WARNING) << "test is skipped; set KUDU_ALLOW_SLOW_TESTS=1 to run"; + return; + } + const string first_table_name_pattern = "rebalance_test_first_table_$0"; + const string second_table_name_pattern = "rebalance_test_second_table_$0"; + const LocationInfo location_info = { { "/A", 3 }, { "/B", 3 }, { "/C", 3 }, + { "/D", 2 } }; + const vector<string>& extra_master_flags = { + "--master_client_location_assignment_enabled=false", + }; + + copy(extra_master_flags.begin(), extra_master_flags.end(), + back_inserter(master_flags_)); + + FLAGS_num_tablet_servers = num_tservers_; + FLAGS_num_replicas = rep_factor_; + NO_FATALS(BuildAndStart(tserver_flags_, master_flags_, location_info, + /*create_table=*/ false)); + + // Create special intra location imbalance in location "/D". Location "/D" + // should only contain tablet replicas of one table. + // + // Note: there are 3 tablets in each table and the replica factor is 3, + // so it's 9 tablet replicas per table. + // + // 1. Create table0 with tablet replicas in locations /A, /B and /C. Because + // the per table location load skew is not greater than 1, it won't trigger + // cross location rebalance, so no replica of table0 will be moved. + ASSERT_OK(CreateTablesExcludingLocations({"/D"}, first_table_name_pattern)); + // 2. Create table1 in { "/A", 3 }, { "/B", 3 }, { "/C", 3 }, {"/D", 1 }. + // There is an imbalance in location "/D" of table1. + ASSERT_OK(CreateTablesExcludingTserversInLocations({ {"/D", 1 } }, + second_table_name_pattern)); + + const vector<string> tool_args = { + "cluster", + "rebalance", + cluster_->master()->bound_rpc_addr().ToString(), + }; + + // Before the fix added in this changelist the rebalancer would simply + // crash while running. See KUDU-2987 for detail informations. + { + string out; + string err; + const Status s = RunKuduTool(tool_args, &out, &err); + ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err); + ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced") + << "stderr: " << err; + + // Intra location rebalancing in location "/D" should move one replica + // to another tserver, so there is at least one move. + ASSERT_STR_NOT_CONTAINS(out, "(moved 0 replicas)"); + ASSERT_STR_CONTAINS(out, "Placement policy violations:\n none\n"); + } +} + } // namespace tools } // namespace kudu diff --git a/src/kudu/tools/rebalancer_tool.cc b/src/kudu/tools/rebalancer_tool.cc index 60231e1..cfaaa25 100644 --- a/src/kudu/tools/rebalancer_tool.cc +++ b/src/kudu/tools/rebalancer_tool.cc @@ -286,9 +286,9 @@ Status RebalancerTool::KsckResultsToClusterRawInfo( } if (!replicas_at_location.empty()) { table_ids_at_location.insert(summary.table_id); + tablet_summaries.push_back(summary); + tablet_summaries.back().replicas = std::move(replicas_at_location); } - tablet_summaries.push_back(summary); - tablet_summaries.back().replicas = std::move(replicas_at_location); } for (const auto& summary : ksck_info.cluster_status.table_summaries) {
