This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new 5e895c6  [master] make automatic AddMaster resilient to leadership 
changes
5e895c6 is described below

commit 5e895c642d28a1c5e3ee0845bb0775fe2e106d29
Author: Andrew Wong <[email protected]>
AuthorDate: Thu Jul 15 16:41:43 2021 -0700

    [master] make automatic AddMaster resilient to leadership changes
    
    AutoAddMasterTest.TestAddWithOnGoingDdl would previously fail with the
    new master shutting down with the following error:
    
    Remote error: Failed to perform AddMaster RPC: Illegal state: Failed 
initiating master Raft ChangeConfig request, error: unknown: Leader has not yet 
committed an operation in its own term
    
    This patch addresses this by catching errors commonly seen in the Raft
    layer and retrying, as we might do for DML operations.
    
    Without this patch, the test failed ~10% of the time in debug mode with
    --stress_cpu_threads=3. With the patch, the test only fails ~1% of the
    time, which will be addressed in a later patch.
    
    This patch also addresses some leftover feedback from
    7e66534d0e62fb850bf300d52da4a0a76889f4b8 regarding verification of
    masters in the presence of DNS failures. I didn't add a new test for
    this because it turned out to not change the behavior of things -- the
    new master would still fail upon attempting to resolve replicas.
    
    Change-Id: Ie38453c6fc41ce98c59c010902e2d9fe9db62dee
    Reviewed-on: http://gerrit.cloudera.org:8080/17691
    Tested-by: Kudu Jenkins
    Reviewed-by: Bankim Bhavsar <[email protected]>
    Reviewed-by: Alexey Serbin <[email protected]>
---
 src/kudu/master/dynamic_multi_master-test.cc | 21 +++++++++++++++------
 src/kudu/master/master.cc                    |  8 ++++----
 src/kudu/master/master_runner.cc             | 10 ++++++++--
 src/kudu/master/master_service.cc            | 11 +++++++++++
 4 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/src/kudu/master/dynamic_multi_master-test.cc 
b/src/kudu/master/dynamic_multi_master-test.cc
index e6d345d..d1e473c 100644
--- a/src/kudu/master/dynamic_multi_master-test.cc
+++ b/src/kudu/master/dynamic_multi_master-test.cc
@@ -1435,6 +1435,8 @@ class AutoAddMasterTest : public KuduTest {
   unique_ptr<ExternalMiniCluster> cluster_;
 };
 
+constexpr const int64_t kShortRetryIntervalSecs = 1;
+
 // Test that nothing goes wrong when starting up masters but the entire cluster
 // isn't fully healthy. The auto-add checks should still run, but should be
 // inconsequential if they fail because the entire cluster isn't healthy.
@@ -1514,13 +1516,20 @@ TEST_F(AutoAddMasterTest, 
TestFailWithoutReplicatingAddMaster) {
     ASSERT_OK(cluster_->SetFlag(cluster_->master(i),
                                 "follower_reject_update_consensus_requests", 
"true"));
   }
-  // Upon starting, the master will attempt to add itself, but fail to do so
-  // and exit early.
-  ASSERT_OK(cluster_->AddMaster());
+  // Upon starting, the master will attempt to add itself, but fail to do so,
+  // even after several attempts.
+  ASSERT_OK(cluster_->AddMaster({ 
Substitute("--master_auto_join_retry_interval_secs=$0",
+                                             kShortRetryIntervalSecs) }));
   auto* new_master = cluster_->master(args_.orig_num_masters);
-  ASSERT_EVENTUALLY([&] {
-    ASSERT_FALSE(new_master->IsProcessAlive());
-  });
+  SleepFor(MonoDelta::FromSeconds(5 * kShortRetryIntervalSecs));
+
+  // The new master should still be around, but not be added as part of the
+  // Raft group.
+  ASSERT_TRUE(new_master->IsProcessAlive());
+  Status s = VerifyVoterMastersForCluster(cluster_->num_masters(), nullptr, 
cluster_.get());
+  ASSERT_FALSE(s.ok());
+  ASSERT_STR_CONTAINS(s.ToString(), "expected 3 masters but got 2");
+
   // Since nothing was successfully replicated, it shouldn't be a problem to
   // start up again and re-add.
   new_master->Shutdown();
diff --git a/src/kudu/master/master.cc b/src/kudu/master/master.cc
index 8ed0929..a9b9538 100644
--- a/src/kudu/master/master.cc
+++ b/src/kudu/master/master.cc
@@ -524,10 +524,10 @@ Status Master::AddMaster(const HostPort& hp, 
rpc::RpcContext* rpc) {
     const auto& config = consensus->CommittedConfig();
     DCHECK_EQ(1, config.peers_size());
     if (!config.peers(0).has_last_known_addr()) {
-      return Status::IllegalState("'last_known_addr' field in single master 
Raft configuration not "
-                                  "set. Please restart master with 
--master_addresses flag set "
-                                  "to the single master which will populate 
the 'last_known_addr' "
-                                  "field.");
+      return Status::InvalidArgument("'last_known_addr' field in single master 
Raft configuration "
+                                     "not set. Please restart master with 
--master_addresses flag "
+                                     "set to the single master which will 
populate the "
+                                     "'last_known_addr' field.");
     }
   }
 
diff --git a/src/kudu/master/master_runner.cc b/src/kudu/master/master_runner.cc
index 77bb386..f4c194a 100644
--- a/src/kudu/master/master_runner.cc
+++ b/src/kudu/master/master_runner.cc
@@ -167,14 +167,20 @@ Status VerifyMastersGetHostPorts(const vector<HostPort>& 
master_addrs,
   int64_t committed_config_index = -1;
   for (const auto& hp : master_addrs) {
     Sockaddr master_addr;
-    RETURN_NOT_OK(SockaddrFromHostPort(hp, &master_addr));
+    Status s = SockaddrFromHostPort(hp, &master_addr);
+    if (!s.ok()) {
+      LOG(INFO) << Substitute("Error resolving master address for $0: $1",
+                              hp.ToString(), s.ToString());
+      *needs_retry = true;
+      return Status::OK();
+    }
 
     // First, get the UUID of the remote master.
     GetMasterRegistrationRequestPB reg_req;
     GetMasterRegistrationResponsePB reg_resp;
     RpcController reg_rpc;
     MasterServiceProxy proxy(messenger, master_addr, master_addr.host());
-    Status s = proxy.GetMasterRegistration(reg_req, &reg_resp, &reg_rpc);
+    s = proxy.GetMasterRegistration(reg_req, &reg_resp, &reg_rpc);
     if (!s.ok() || reg_resp.has_error()) {
       LOG(INFO) << Substitute("Error getting master registration for $0: $1, 
$2",
                               master_addr.ToString(), s.ToString(),
diff --git a/src/kudu/master/master_service.cc 
b/src/kudu/master/master_service.cc
index 8993dc0..dcfb8c5 100644
--- a/src/kudu/master/master_service.cc
+++ b/src/kudu/master/master_service.cc
@@ -275,6 +275,17 @@ void MasterServiceImpl::AddMaster(const 
AddMasterRequestPB* req,
       rpc->RespondSuccess();
       return;
     }
+    // Several errors may show up in the Raft layer, usually indicating a lack
+    // of or change in leadership, or an otherwise transient issue. Catch these
+    // and return a retriable error.
+    if (s.IsIllegalState() || s.IsServiceUnavailable() || s.IsAborted()) {
+      LOG(WARNING) << Substitute("AddMaster $0 failed with retriable error: 
$1",
+                                 hp.ToString(), s.ToString());
+      StatusToPB(s, resp->mutable_error()->mutable_status());
+      resp->mutable_error()->set_code(MasterErrorPB::NOT_THE_LEADER);
+      rpc->RespondSuccess();
+      return;
+    }
     LOG(ERROR) << Substitute("Failed adding master $0. $1", hp.ToString(), 
s.ToString());
     rpc->RespondFailure(s);
     return;

Reply via email to