Repository: kudu
Updated Branches:
  refs/heads/master 475920a21 -> 97831eadf


Make ExternalDaemon::StartProcess() handle fault injection

Occasionally starting a kudu daemon fails becuase of fault injection.
We recently fixed this error manifesting as SIGPIPE, but we didn't
fix the underlying cause: under load, tests doing fault
injection might fail before StartProcess() completes making
this method return a non-OK() status.

This patch makes sure that if the exit status is an expected one (85)
we still return as if starting the process had succeeded. Subsequent
calls to WaitForInjectedCrash() will return Status::OK();

To make sure this works, this patch includes a test that enables
a "fake" fault early in the startup process and makes sure that
both WaitForInjectedCrash and StartProcess() return OK() in
spite of it.

Change-Id: I6046e34a321de3e324e20e3d63249e4073712447
Reviewed-on: http://gerrit.cloudera.org:8080/6211
Reviewed-by: Todd Lipcon <[email protected]>
Reviewed-by: Mike Percy <[email protected]>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/f7fa057b
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/f7fa057b
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/f7fa057b

Branch: refs/heads/master
Commit: f7fa057bd0527b19c6e100d412efe851be68b1db
Parents: 475920a
Author: David Alves <[email protected]>
Authored: Wed Mar 1 13:49:21 2017 -0800
Committer: David Ribeiro Alves <[email protected]>
Committed: Tue Mar 7 02:06:31 2017 +0000

----------------------------------------------------------------------
 .../integration-tests/external_mini_cluster-test.cc  | 15 +++++++++++++++
 src/kudu/integration-tests/external_mini_cluster.cc  | 14 +++++++++++---
 src/kudu/integration-tests/external_mini_cluster.h   |  1 +
 src/kudu/tserver/tablet_server_main.cc               | 10 ++++++++++
 4 files changed, 37 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/f7fa057b/src/kudu/integration-tests/external_mini_cluster-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/external_mini_cluster-test.cc 
b/src/kudu/integration-tests/external_mini_cluster-test.cc
index 5b3405c..56fa7f3 100644
--- a/src/kudu/integration-tests/external_mini_cluster-test.cc
+++ b/src/kudu/integration-tests/external_mini_cluster-test.cc
@@ -193,6 +193,21 @@ TEST_P(ExternalMiniClusterTest, TestBasicOperation) {
                         "No Kerberos credentials|"
                         ".*No such file or directory)");
   }
+
+  // Test that if we inject a fault into a tablet server's boot process
+  // ExternalTabletServer::Restart() still returns OK, even if the tablet 
server crashed.
+  ts->Shutdown();
+  ts->mutable_flags()->push_back("--fault_before_start=1.0");
+  ASSERT_OK(ts->Restart());
+  ASSERT_FALSE(ts->IsProcessAlive());
+  // Since the process should have already crashed, waiting for an injected 
crash with no
+  // timeout should still return OK.
+  ASSERT_OK(ts->WaitForInjectedCrash(MonoDelta::FromSeconds(0)));
+  ts->mutable_flags()->pop_back();
+  ts->Shutdown();
+  ASSERT_OK(ts->Restart());
+  ASSERT_TRUE(ts->IsProcessAlive());
+
   cluster.Shutdown();
 }
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/f7fa057b/src/kudu/integration-tests/external_mini_cluster.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/external_mini_cluster.cc 
b/src/kudu/integration-tests/external_mini_cluster.cc
index a9b5bab..e82b6ec 100644
--- a/src/kudu/integration-tests/external_mini_cluster.cc
+++ b/src/kudu/integration-tests/external_mini_cluster.cc
@@ -705,11 +705,20 @@ Status ExternalDaemon::StartProcess(const vector<string>& 
user_flags) {
       break;
     }
     SleepFor(MonoDelta::FromMilliseconds(10));
-    Status s = p->WaitNoBlock();
+    int wait_status;
+    Status s = p->WaitNoBlock(&wait_status);
     if (s.IsTimedOut()) {
       // The process is still running.
       continue;
     }
+
+    // If the process exited with expected exit status we need to still swap() 
the process
+    // and exit as if it had succeeded.
+    if (WIFEXITED(wait_status) && WEXITSTATUS(wait_status) == 
fault_injection::kExitStatus) {
+      process_.swap(p);
+      return Status::OK();
+    }
+
     RETURN_NOT_OK_PREPEND(s, Substitute("Failed waiting on $0", exe_));
     string exit_info;
     RETURN_NOT_OK(p->GetExitStatus(nullptr, &exit_info));
@@ -1152,8 +1161,7 @@ Status ExternalTabletServer::Restart() {
                                bound_http_.host()));
   }
   flags.push_back("--tserver_master_addrs=" + master_addrs_);
-  RETURN_NOT_OK(StartProcess(flags));
-  return Status::OK();
+  return StartProcess(flags);
 }
 
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/f7fa057b/src/kudu/integration-tests/external_mini_cluster.h
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/external_mini_cluster.h 
b/src/kudu/integration-tests/external_mini_cluster.h
index 57fdba1..c696f5d 100644
--- a/src/kudu/integration-tests/external_mini_cluster.h
+++ b/src/kudu/integration-tests/external_mini_cluster.h
@@ -433,6 +433,7 @@ class ExternalDaemon : public 
RefCountedThreadSafe<ExternalDaemon> {
   friend class RefCountedThreadSafe<ExternalDaemon>;
   virtual ~ExternalDaemon();
 
+  // Starts a process with the given flags.
   Status StartProcess(const std::vector<std::string>& user_flags);
 
   // Wait for the process to exit, and then call 'wait_status_predicate'

http://git-wip-us.apache.org/repos/asf/kudu/blob/f7fa057b/src/kudu/tserver/tablet_server_main.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_server_main.cc 
b/src/kudu/tserver/tablet_server_main.cc
index 759fc82..5123f0f 100644
--- a/src/kudu/tserver/tablet_server_main.cc
+++ b/src/kudu/tserver/tablet_server_main.cc
@@ -20,7 +20,9 @@
 
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/tserver/tablet_server.h"
+#include "kudu/util/fault_injection.h"
 #include "kudu/util/flags.h"
+#include "kudu/util/flag_tags.h"
 #include "kudu/util/init.h"
 #include "kudu/util/logging.h"
 #include "kudu/util/version_info.h"
@@ -31,6 +33,12 @@ DECLARE_string(rpc_bind_addresses);
 DECLARE_int32(rpc_num_service_threads);
 DECLARE_int32(webserver_port);
 
+DEFINE_double(fault_before_start, 0.0,
+              "Fake fault flag that always causes a crash on startup. "
+              "Used to test the test infrastructure. Should never be set 
outside of tests.");
+TAG_FLAG(fault_before_start, hidden);
+TAG_FLAG(fault_before_start, unsafe);
+
 namespace kudu {
 namespace tserver {
 
@@ -63,6 +71,8 @@ static int TabletServerMain(int argc, char** argv) {
   LOG(INFO) << "Initializing tablet server...";
   CHECK_OK(server.Init());
 
+  MAYBE_FAULT(FLAGS_fault_before_start);
+
   LOG(INFO) << "Starting tablet server...";
   CHECK_OK(server.Start());
 

Reply via email to