Repository: kudu Updated Branches: refs/heads/master 475920a21 -> 97831eadf
Make ExternalDaemon::StartProcess() handle fault injection Occasionally starting a kudu daemon fails becuase of fault injection. We recently fixed this error manifesting as SIGPIPE, but we didn't fix the underlying cause: under load, tests doing fault injection might fail before StartProcess() completes making this method return a non-OK() status. This patch makes sure that if the exit status is an expected one (85) we still return as if starting the process had succeeded. Subsequent calls to WaitForInjectedCrash() will return Status::OK(); To make sure this works, this patch includes a test that enables a "fake" fault early in the startup process and makes sure that both WaitForInjectedCrash and StartProcess() return OK() in spite of it. Change-Id: I6046e34a321de3e324e20e3d63249e4073712447 Reviewed-on: http://gerrit.cloudera.org:8080/6211 Reviewed-by: Todd Lipcon <[email protected]> Reviewed-by: Mike Percy <[email protected]> Tested-by: Kudu Jenkins Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/f7fa057b Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/f7fa057b Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/f7fa057b Branch: refs/heads/master Commit: f7fa057bd0527b19c6e100d412efe851be68b1db Parents: 475920a Author: David Alves <[email protected]> Authored: Wed Mar 1 13:49:21 2017 -0800 Committer: David Ribeiro Alves <[email protected]> Committed: Tue Mar 7 02:06:31 2017 +0000 ---------------------------------------------------------------------- .../integration-tests/external_mini_cluster-test.cc | 15 +++++++++++++++ src/kudu/integration-tests/external_mini_cluster.cc | 14 +++++++++++--- src/kudu/integration-tests/external_mini_cluster.h | 1 + src/kudu/tserver/tablet_server_main.cc | 10 ++++++++++ 4 files changed, 37 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/f7fa057b/src/kudu/integration-tests/external_mini_cluster-test.cc ---------------------------------------------------------------------- diff --git a/src/kudu/integration-tests/external_mini_cluster-test.cc b/src/kudu/integration-tests/external_mini_cluster-test.cc index 5b3405c..56fa7f3 100644 --- a/src/kudu/integration-tests/external_mini_cluster-test.cc +++ b/src/kudu/integration-tests/external_mini_cluster-test.cc @@ -193,6 +193,21 @@ TEST_P(ExternalMiniClusterTest, TestBasicOperation) { "No Kerberos credentials|" ".*No such file or directory)"); } + + // Test that if we inject a fault into a tablet server's boot process + // ExternalTabletServer::Restart() still returns OK, even if the tablet server crashed. + ts->Shutdown(); + ts->mutable_flags()->push_back("--fault_before_start=1.0"); + ASSERT_OK(ts->Restart()); + ASSERT_FALSE(ts->IsProcessAlive()); + // Since the process should have already crashed, waiting for an injected crash with no + // timeout should still return OK. + ASSERT_OK(ts->WaitForInjectedCrash(MonoDelta::FromSeconds(0))); + ts->mutable_flags()->pop_back(); + ts->Shutdown(); + ASSERT_OK(ts->Restart()); + ASSERT_TRUE(ts->IsProcessAlive()); + cluster.Shutdown(); } http://git-wip-us.apache.org/repos/asf/kudu/blob/f7fa057b/src/kudu/integration-tests/external_mini_cluster.cc ---------------------------------------------------------------------- diff --git a/src/kudu/integration-tests/external_mini_cluster.cc b/src/kudu/integration-tests/external_mini_cluster.cc index a9b5bab..e82b6ec 100644 --- a/src/kudu/integration-tests/external_mini_cluster.cc +++ b/src/kudu/integration-tests/external_mini_cluster.cc @@ -705,11 +705,20 @@ Status ExternalDaemon::StartProcess(const vector<string>& user_flags) { break; } SleepFor(MonoDelta::FromMilliseconds(10)); - Status s = p->WaitNoBlock(); + int wait_status; + Status s = p->WaitNoBlock(&wait_status); if (s.IsTimedOut()) { // The process is still running. continue; } + + // If the process exited with expected exit status we need to still swap() the process + // and exit as if it had succeeded. + if (WIFEXITED(wait_status) && WEXITSTATUS(wait_status) == fault_injection::kExitStatus) { + process_.swap(p); + return Status::OK(); + } + RETURN_NOT_OK_PREPEND(s, Substitute("Failed waiting on $0", exe_)); string exit_info; RETURN_NOT_OK(p->GetExitStatus(nullptr, &exit_info)); @@ -1152,8 +1161,7 @@ Status ExternalTabletServer::Restart() { bound_http_.host())); } flags.push_back("--tserver_master_addrs=" + master_addrs_); - RETURN_NOT_OK(StartProcess(flags)); - return Status::OK(); + return StartProcess(flags); } http://git-wip-us.apache.org/repos/asf/kudu/blob/f7fa057b/src/kudu/integration-tests/external_mini_cluster.h ---------------------------------------------------------------------- diff --git a/src/kudu/integration-tests/external_mini_cluster.h b/src/kudu/integration-tests/external_mini_cluster.h index 57fdba1..c696f5d 100644 --- a/src/kudu/integration-tests/external_mini_cluster.h +++ b/src/kudu/integration-tests/external_mini_cluster.h @@ -433,6 +433,7 @@ class ExternalDaemon : public RefCountedThreadSafe<ExternalDaemon> { friend class RefCountedThreadSafe<ExternalDaemon>; virtual ~ExternalDaemon(); + // Starts a process with the given flags. Status StartProcess(const std::vector<std::string>& user_flags); // Wait for the process to exit, and then call 'wait_status_predicate' http://git-wip-us.apache.org/repos/asf/kudu/blob/f7fa057b/src/kudu/tserver/tablet_server_main.cc ---------------------------------------------------------------------- diff --git a/src/kudu/tserver/tablet_server_main.cc b/src/kudu/tserver/tablet_server_main.cc index 759fc82..5123f0f 100644 --- a/src/kudu/tserver/tablet_server_main.cc +++ b/src/kudu/tserver/tablet_server_main.cc @@ -20,7 +20,9 @@ #include "kudu/gutil/strings/substitute.h" #include "kudu/tserver/tablet_server.h" +#include "kudu/util/fault_injection.h" #include "kudu/util/flags.h" +#include "kudu/util/flag_tags.h" #include "kudu/util/init.h" #include "kudu/util/logging.h" #include "kudu/util/version_info.h" @@ -31,6 +33,12 @@ DECLARE_string(rpc_bind_addresses); DECLARE_int32(rpc_num_service_threads); DECLARE_int32(webserver_port); +DEFINE_double(fault_before_start, 0.0, + "Fake fault flag that always causes a crash on startup. " + "Used to test the test infrastructure. Should never be set outside of tests."); +TAG_FLAG(fault_before_start, hidden); +TAG_FLAG(fault_before_start, unsafe); + namespace kudu { namespace tserver { @@ -63,6 +71,8 @@ static int TabletServerMain(int argc, char** argv) { LOG(INFO) << "Initializing tablet server..."; CHECK_OK(server.Init()); + MAYBE_FAULT(FLAGS_fault_before_start); + LOG(INFO) << "Starting tablet server..."; CHECK_OK(server.Start());
