[master] update placement logic for RF % 2 == 0 Updated the logic of the replica placement in master to properly handle even replication factors. Added corresponding unit tests as well.
It's possible to configure Kudu to allow creation of tables with an even replication factor. I think it's easier to implement the handling of those cases instead of handling possible inconsistencies in the rebalancer and other tools and adding extra paragraphs into release notes. Change-Id: I4259f805090dd350f31bf3bf2b6477898214ece4 Reviewed-on: http://gerrit.cloudera.org:8080/11763 Tested-by: Kudu Jenkins Reviewed-by: Will Berkeley <wdberke...@gmail.com> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/2c5c9d06 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/2c5c9d06 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/2c5c9d06 Branch: refs/heads/master Commit: 2c5c9d06bce6897d78f31178ad6a437d7c48f29b Parents: be64560 Author: Alexey Serbin <aser...@cloudera.com> Authored: Tue Oct 23 17:54:05 2018 -0700 Committer: Alexey Serbin <aser...@cloudera.com> Committed: Mon Oct 29 18:32:12 2018 +0000 ---------------------------------------------------------------------- src/kudu/master/placement_policy-test.cc | 116 ++++++++++++++++++++++++++ src/kudu/master/placement_policy.cc | 21 ++++- src/kudu/master/placement_policy.h | 5 +- 3 files changed, 137 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/2c5c9d06/src/kudu/master/placement_policy-test.cc ---------------------------------------------------------------------- diff --git a/src/kudu/master/placement_policy-test.cc b/src/kudu/master/placement_policy-test.cc index 951bcd0..541d0eb 100644 --- a/src/kudu/master/placement_policy-test.cc +++ b/src/kudu/master/placement_policy-test.cc @@ -769,5 +769,121 @@ TEST_F(PlacementPolicyTest, PlaceExtraTablet_L5_TS16_RF5) { EXPECT_GT(1500, placement_stats["B_ts3"]); } +// Even RF case: edge cases with 2 locations. 
+TEST_F(PlacementPolicyTest, PlaceTabletReplicasEmptyCluster_L2_EvenRFEdgeCase) { + const vector<LocationInfo> cluster_info = { + { "A", { { "A_ts0", 10 }, { "A_ts1", 10 }, { "A_ts2", 10 }, } }, + { "B", { { "B_ts0", 0 }, { "B_ts1", 0 }, { "B_ts2", 1 }, { "B_ts3", 10 }, } }, + }; + ASSERT_OK(Prepare(cluster_info)); + + const auto& all = descriptors(); + PlacementPolicy policy(all, rng()); + + { + static constexpr auto num_replicas = 2; + TSDescriptorVector result; + ASSERT_OK(policy.PlaceTabletReplicas(num_replicas, &result)); + ASSERT_EQ(num_replicas, result.size()); + TSDescriptorsMap m; + ASSERT_OK(TSDescriptorVectorToMap(result, &m)); + ASSERT_EQ(1, m.count("A_ts0") + m.count("A_ts1") + m.count("A_ts2")); + ASSERT_EQ(1, m.count("B_ts0") + m.count("B_ts1") + m.count("B_ts2")); + } + { + static constexpr auto num_replicas = 4; + TSDescriptorVector result; + ASSERT_OK(policy.PlaceTabletReplicas(num_replicas, &result)); + ASSERT_EQ(num_replicas, result.size()); + TSDescriptorsMap m; + ASSERT_OK(TSDescriptorVectorToMap(result, &m)); + ASSERT_EQ(2, m.count("A_ts0") + m.count("A_ts1") + m.count("A_ts2")); + ASSERT_EQ(2, m.count("B_ts0") + m.count("B_ts1") + m.count("B_ts2")); + } + { + static constexpr auto num_replicas = 6; + TSDescriptorVector result; + ASSERT_OK(policy.PlaceTabletReplicas(num_replicas, &result)); + ASSERT_EQ(num_replicas, result.size()); + TSDescriptorsMap m; + ASSERT_OK(TSDescriptorVectorToMap(result, &m)); + ASSERT_EQ(1, m.count("A_ts0")); + ASSERT_EQ(1, m.count("A_ts1")); + ASSERT_EQ(1, m.count("A_ts2")); + ASSERT_EQ(1, m.count("B_ts0")); + ASSERT_EQ(1, m.count("B_ts1")); + ASSERT_EQ(1, m.count("B_ts2") + m.count("B_ts3")); + } +} + +// Even RF case: place tablet replicas into a cluster with 3 locations. 
+TEST_F(PlacementPolicyTest, PlaceTabletReplicasEmptyCluster_L3_EvenRF) { + const vector<LocationInfo> cluster_info = { + { "A", { { "A_ts0", 10 }, { "A_ts1", 10 }, { "A_ts2", 10 }, } }, + { "B", { { "B_ts0", 0 }, { "B_ts1", 0 }, { "B_ts2", 10 }, } }, + { "C", { { "C_ts0", 0 }, { "C_ts1", 0 }, { "C_ts2", 10 }, } }, + }; + ASSERT_OK(Prepare(cluster_info)); + + const auto& all = descriptors(); + PlacementPolicy policy(all, rng()); + + { + static constexpr auto num_replicas = 2; + TSDescriptorVector result; + ASSERT_OK(policy.PlaceTabletReplicas(num_replicas, &result)); + ASSERT_EQ(num_replicas, result.size()); + TSDescriptorsMap m; + ASSERT_OK(TSDescriptorVectorToMap(result, &m)); + // Two locations are to have one replica, one location to have none. + ASSERT_GE(1, m.count("A_ts0") + m.count("A_ts1") + m.count("A_ts2")); + ASSERT_GE(1, m.count("B_ts0") + m.count("B_ts1")); + ASSERT_GE(1, m.count("C_ts0") + m.count("C_ts1")); + } + { + static constexpr auto num_replicas = 4; + TSDescriptorVector result; + ASSERT_OK(policy.PlaceTabletReplicas(num_replicas, &result)); + ASSERT_EQ(num_replicas, result.size()); + TSDescriptorsMap m; + ASSERT_OK(TSDescriptorVectorToMap(result, &m)); + // One location is to have two replicas, the rest are to have just one. 
+ ASSERT_LE(1, m.count("A_ts0") + m.count("A_ts1") + m.count("A_ts2")); + ASSERT_GE(2, m.count("A_ts0") + m.count("A_ts1") + m.count("A_ts2")); + ASSERT_LE(1, m.count("B_ts0") + m.count("B_ts1")); + ASSERT_GE(2, m.count("B_ts0") + m.count("B_ts1")); + ASSERT_LE(1, m.count("C_ts0") + m.count("C_ts1")); + ASSERT_GE(2, m.count("C_ts0") + m.count("C_ts1")); + } + { + static constexpr auto num_replicas = 6; + TSDescriptorVector result; + ASSERT_OK(policy.PlaceTabletReplicas(num_replicas, &result)); + ASSERT_EQ(num_replicas, result.size()); + TSDescriptorsMap m; + ASSERT_OK(TSDescriptorVectorToMap(result, &m)); + ASSERT_EQ(2, m.count("A_ts0") + m.count("A_ts1") + m.count("A_ts2")); + ASSERT_EQ(1, m.count("B_ts0")); + ASSERT_EQ(1, m.count("B_ts1")); + ASSERT_EQ(1, m.count("C_ts0")); + ASSERT_EQ(1, m.count("C_ts1")); + } + { + static constexpr auto num_replicas = 8; + TSDescriptorVector result; + ASSERT_OK(policy.PlaceTabletReplicas(num_replicas, &result)); + ASSERT_EQ(num_replicas, result.size()); + TSDescriptorsMap m; + ASSERT_OK(TSDescriptorVectorToMap(result, &m)); + ASSERT_EQ(2, m.count("A_ts0") + m.count("A_ts1") + m.count("A_ts2")); + ASSERT_EQ(1, m.count("B_ts0")); + ASSERT_EQ(1, m.count("B_ts1")); + ASSERT_EQ(1, m.count("B_ts2")); + ASSERT_EQ(1, m.count("C_ts0")); + ASSERT_EQ(1, m.count("C_ts1")); + ASSERT_EQ(1, m.count("C_ts2")); + } +} + } // namespace master } // namespace kudu http://git-wip-us.apache.org/repos/asf/kudu/blob/2c5c9d06/src/kudu/master/placement_policy.cc ---------------------------------------------------------------------- diff --git a/src/kudu/master/placement_policy.cc b/src/kudu/master/placement_policy.cc index 45f9ef7..5b0b0d6 100644 --- a/src/kudu/master/placement_policy.cc +++ b/src/kudu/master/placement_policy.cc @@ -299,6 +299,8 @@ Status PlacementPolicy::SelectLocation( string* location) const { DCHECK(location); + const auto num_locations = ltd_.size(); + // A pair of the location-per-load maps. 
The idea is to get a group to select // the best location based on the load, while not placing the majority of // replicas in same location, if possible. Using multimap (but not @@ -317,11 +319,24 @@ // per tablet server. continue; } - if (location_replicas_num + 1 > num_replicas / 2) { + // When placing the replicas of a tablet, it's necessary to take into + // account the number of available locations, since the maximum number + // of replicas per non-overflow location depends on that. For example, + // in case of 2 locations the best placement for 4 replicas would be + // (2 + 2), while in case of 4 and more locations that's (1 + 1 + 1 + 1). + // Similarly, in case of 2 locations and 6 replicas, the best placement + // is (3 + 3), while for 3 locations that's (2 + 2 + 2). + if ((num_locations == 2 && num_replicas % 2 == 0 && + location_replicas_num + 1 > num_replicas / 2) || + (num_locations > 2 && + location_replicas_num + 1 >= (num_replicas + 1) / 2)) { // If possible, avoid placing the majority of the tablet's replicas // into a single location even if load-based criterion would favor that. - // So, if placing one extra replica will add up to the majority, place - // this location into the overflow group. + // Prefer such a distribution of replicas that will keep the majority + // of replicas alive if any single location fails. So, if placing one + // extra replica would add up to the majority in case of odd replication + // factor or add up to the half of all replicas in case of even + // replication factor, place this location into the overflow group. 
location_per_load_overflow.emplace( GetLocationLoad(location, locations_info), location); continue; http://git-wip-us.apache.org/repos/asf/kudu/blob/2c5c9d06/src/kudu/master/placement_policy.h ---------------------------------------------------------------------- diff --git a/src/kudu/master/placement_policy.h b/src/kudu/master/placement_policy.h index b645818..924c0d1 100644 --- a/src/kudu/master/placement_policy.h +++ b/src/kudu/master/placement_policy.h @@ -113,7 +113,7 @@ class PlacementPolicy { const ReplicaLocationsInfo& locations_info) const; // Select locations to place the given number of replicas ('nreplicas') for - // a new tablet. The locations are be chosen according to the placement + // a new tablet. The locations are chosen according to the placement // policies. // // TODO (aserbin): add the reference to the document once it's in the repo. @@ -135,7 +135,8 @@ class PlacementPolicy { // Select location for next replica of a tablet with the specified replication // factor. In essence, the algorithm picks the least loaded location, - // making sure no location contains the majority of the replicas. + // making sure no location contains the majority of replicas of the tablet, + // if possible. // // Parameters: // 'num_replicas' The total number of tablet replicas to place.