Repository: kudu Updated Branches: refs/heads/branch-1.7.x 155e9efb4 -> f4383d572
[linked_list-test] fix flake with the 3-4-3 scheme The LinkedListTest.TestLoadWhileOneServerDownAndVerify scenario became flaky when running with --seconds_to_run set to about 800 or more once the 3-4-3 replica management scheme became the default one. Those cases (--seconds_to_run=X, X >= 800) are special because the stopped replica falls behind the WAL segment GC threshold when running with the linked list input data, so the system automatically replaces the failed replica. In case of the 3-4-3 scheme, the newly added replica is added as a non-voter. The WaitForServersToAgree() looks only at the OpId indices, not distinguishing between voter and non-voter replicas. However, the verification phase of the scenario assumes the only replica left alive is a voter replica. Prior to this fix, the scenario didn't account for the case when the replica at the restarted tablet server was still a non-voter, and in most cases the remaining 2 of the 3 tservers were shut down before the newly added replica was promoted. As a result, the latter replica was left a non-voter and the written data could not be read back from it. This patch adds a step to verify that all 3 replicas are registered as voters with the master(s) before shutting down the tservers hosting the 2 source replicas. The scenario is now stable when running with --seconds_to_run=900. 
Change-Id: I132206371e2935f1e0f39e9eacad866fde22c5b8 Reviewed-on: http://gerrit.cloudera.org:8080/9795 Tested-by: Alexey Serbin <aser...@cloudera.com> Reviewed-by: Mike Percy <mpe...@apache.org> (cherry picked from commit a781710264da43a1938aff2fffd112342efd41c9) Reviewed-on: http://gerrit.cloudera.org:8080/9811 Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/f4383d57 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/f4383d57 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/f4383d57 Branch: refs/heads/branch-1.7.x Commit: f4383d57280428e54ef494244e2d466255989a69 Parents: 155e9ef Author: Alexey Serbin <aser...@cloudera.com> Authored: Fri Mar 23 19:40:54 2018 -0700 Committer: Alexey Serbin <aser...@cloudera.com> Committed: Tue Mar 27 00:29:31 2018 +0000 ---------------------------------------------------------------------- src/kudu/integration-tests/linked_list-test.cc | 45 ++++++++++++++------- 1 file changed, 31 insertions(+), 14 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/f4383d57/src/kudu/integration-tests/linked_list-test.cc ---------------------------------------------------------------------- diff --git a/src/kudu/integration-tests/linked_list-test.cc b/src/kudu/integration-tests/linked_list-test.cc index b12eb14..cd15545 100644 --- a/src/kudu/integration-tests/linked_list-test.cc +++ b/src/kudu/integration-tests/linked_list-test.cc @@ -50,6 +50,7 @@ #include "kudu/integration-tests/cluster_itest_util.h" #include "kudu/integration-tests/linked_list-test-util.h" #include "kudu/integration-tests/ts_itest-base.h" +#include "kudu/master/master.pb.h" #include "kudu/mini-cluster/external_mini_cluster.h" #include "kudu/mini-cluster/mini_cluster.h" #include "kudu/tserver/tablet_server-test-base.h" @@ -76,6 +77,10 @@ DEFINE_bool(stress_wal_gc, false, using kudu::client::sp::shared_ptr; using 
kudu::cluster::ClusterNodes; using kudu::itest::TServerDetails; +using kudu::itest::WAIT_FOR_LEADER; +using kudu::itest::WaitForReplicasReportedToMaster; +using kudu::itest::WaitForServersToAgree; +using kudu::master::VOTER_REPLICA; using std::string; using std::vector; @@ -288,30 +293,42 @@ TEST_F(LinkedListTest, TestLoadWhileOneServerDownAndVerify) { FLAGS_num_tablet_servers = 3; FLAGS_num_tablets = 1; - ASSERT_NO_FATAL_FAILURE(BuildAndStart()); + const auto run_time = MonoDelta::FromSeconds(FLAGS_seconds_to_run); + const auto wait_time = run_time; + + NO_FATALS(BuildAndStart()); // Load the data with one of the three servers down. cluster_->tablet_server(0)->Shutdown(); - int64_t written = 0; - ASSERT_OK(tester_->LoadLinkedList(MonoDelta::FromSeconds(FLAGS_seconds_to_run), - FLAGS_num_snapshots, - &written)); + int64_t written; + ASSERT_OK(tester_->LoadLinkedList(run_time, FLAGS_num_snapshots, &written)); // Start back up the server that missed all of the data being loaded. It should be // able to stream the data back from the other server which is still up. ASSERT_OK(cluster_->tablet_server(0)->Restart()); - // We'll give the tablets 5 seconds to start up regardless of how long we - // inserted for. This prevents flakiness in TSAN builds in particular. - const int kBaseTimeToWaitSecs = 5; - const int kWaitTime = FLAGS_seconds_to_run + kBaseTimeToWaitSecs; string tablet_id = tablet_replicas_.begin()->first; - ASSERT_NO_FATAL_FAILURE(WaitForServersToAgree( - MonoDelta::FromSeconds(kWaitTime), - tablet_servers_, - tablet_id, - written / FLAGS_num_chains)); + + // When running for longer times (like --seconds_to_run=800), the replica + // at the shutdown tservers falls behind the WAL segment GC threshold. In case + // of the 3-4-3 replica management scheme, that leads to evicting the former + // replica and replacing it with a non-voter one. 
The WaitForServersToAgree() + // below doesn't take into account that a non-voter replica, even caught up + // with the leader, first needs to be promoted to voter before commencing the + // verification phase. So, before checking for the minimum required operaiton + // index, let's first make sure that the replica at the restarted tserver + // is a voter. + bool has_leader; + master::TabletLocationsPB tablet_locations; + ASSERT_OK(WaitForReplicasReportedToMaster( + cluster_->master_proxy(), FLAGS_num_tablet_servers, tablet_id, wait_time, + WAIT_FOR_LEADER, VOTER_REPLICA, &has_leader, &tablet_locations)); + + // All right, having the necessary number of voter replicas, make sure all + // replicas are up-to-date in terms of OpId index. + ASSERT_OK(WaitForServersToAgree(wait_time, tablet_servers_, tablet_id, + written / FLAGS_num_chains)); cluster_->tablet_server(1)->Shutdown(); cluster_->tablet_server(2)->Shutdown();