Repository: mesos Updated Branches: refs/heads/master 12d61403a -> d9f85d7f7
Updated scheduler driver to exponentially backoff during registration retries. Review: https://reviews.apache.org/r/27315 Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/d9f85d7f Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/d9f85d7f Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/d9f85d7f Branch: refs/heads/master Commit: d9f85d7f7209cf24a4893156253909343ff12504 Parents: 12d6140 Author: Vinod Kone <[email protected]> Authored: Thu Nov 6 18:19:11 2014 -0800 Committer: Vinod Kone <[email protected]> Committed: Thu Nov 6 18:19:13 2014 -0800 ---------------------------------------------------------------------- src/Makefile.am | 3 ++ src/sched/constants.cpp | 36 ++++++++++++++++++++ src/sched/constants.hpp | 40 ++++++++++++++++++++++ src/sched/flags.hpp | 55 ++++++++++++++++++++++++++++++ src/sched/sched.cpp | 58 +++++++++++++++++++++++++++++--- src/slave/slave.cpp | 18 +++++----- src/slave/slave.hpp | 2 +- src/tests/fault_tolerance_tests.cpp | 7 ++-- 8 files changed, 200 insertions(+), 19 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/Makefile.am ---------------------------------------------------------------------- diff --git a/src/Makefile.am b/src/Makefile.am index 9ab3b9c..443554a 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -279,6 +279,7 @@ libmesos_no_3rdparty_la_SOURCES = \ master/registrar.cpp \ master/repairer.cpp \ module/manager.cpp \ + sched/constants.cpp \ sched/sched.cpp \ scheduler/scheduler.cpp \ slave/constants.cpp \ @@ -431,6 +432,8 @@ libmesos_no_3rdparty_la_SOURCES += \ module/authenticator.hpp \ module/isolator.hpp \ module/manager.hpp \ + sched/constants.hpp \ + sched/flags.hpp \ slave/constants.hpp \ slave/flags.hpp \ slave/gc.hpp \ http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/sched/constants.cpp ---------------------------------------------------------------------- diff --git a/src/sched/constants.cpp b/src/sched/constants.cpp new file mode 100644 index 0000000..44ccfbe --- /dev/null +++ b/src/sched/constants.cpp @@ -0,0 +1,36 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "sched/constants.hpp" + +namespace mesos { +namespace internal { +namespace scheduler { + +// NOTE: The default backoff factor for the scheduler (2s) is +// different from the slave (1s) because the scheduler driver doesn't +// do an initial backoff for the very first attempt unlike the slave. +// TODO(vinod): Once we fix the scheduler driver to do initial backoff +// we can change the default to 1s. +const Duration REGISTRATION_BACKOFF_FACTOR = Seconds(2); + +const Duration REGISTRATION_RETRY_INTERVAL_MAX = Minutes(1); + +} // namespace scheduler { +} // namespace internal { +} // namespace mesos { http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/sched/constants.hpp ---------------------------------------------------------------------- diff --git a/src/sched/constants.hpp b/src/sched/constants.hpp new file mode 100644 index 0000000..63707a8 --- /dev/null +++ b/src/sched/constants.hpp @@ -0,0 +1,40 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __SCHED_CONSTANTS_HPP__ +#define __SCHED_CONSTANTS_HPP__ + +#include <stout/duration.hpp> + +namespace mesos { +namespace internal { +namespace scheduler { + +// Default backoff interval used by the scheduler driver to wait +// before registration. +extern const Duration REGISTRATION_BACKOFF_FACTOR; + +// The maximum interval the scheduler driver waits before retrying +// registration. +extern const Duration REGISTRATION_RETRY_INTERVAL_MAX; + +} // namespace scheduler { +} // namespace internal { +} // namespace mesos { + +#endif // __SCHED_CONSTANTS_HPP__ http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/sched/flags.hpp ---------------------------------------------------------------------- diff --git a/src/sched/flags.hpp b/src/sched/flags.hpp new file mode 100644 index 0000000..62a634b --- /dev/null +++ b/src/sched/flags.hpp @@ -0,0 +1,55 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __SCHED_FLAGS_HPP__ +#define __SCHED_FLAGS_HPP__ + +#include <stout/flags.hpp> + +#include "logging/flags.hpp" + +#include "sched/constants.hpp" + +namespace mesos { +namespace internal { +namespace scheduler { + +class Flags : public logging::Flags +{ +public: + Flags() + { + add(&Flags::registration_backoff_factor, + "registration_backoff_factor", + "Scheduler driver (re-)registration retries are exponentially backed\n" + "off based on 'b', the registration backoff factor (e.g., 1st retry\n" + "uses a random value between [0, b], 2nd retry between [0, b * 2^1],\n" + "3rd retry between [0, b * 2^2]...) up to a maximum of (framework\n" + "failover timeout/10, if failover timeout is specified) or " + + stringify(REGISTRATION_RETRY_INTERVAL_MAX) + ", whichever is smaller", + REGISTRATION_BACKOFF_FACTOR); + } + + Duration registration_backoff_factor; +}; + +} // namespace scheduler { +} // namespace internal { +} // namespace mesos { + +#endif // __SCHED_FLAGS_HPP__ http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/sched/sched.cpp ---------------------------------------------------------------------- diff --git a/src/sched/sched.cpp b/src/sched/sched.cpp index e5f828d..8ca0526 100644 --- a/src/sched/sched.cpp +++ b/src/sched/sched.cpp @@ -64,6 +64,7 @@ #include "common/lock.hpp" #include "common/type_utils.hpp" +#include "local/flags.hpp" #include "local/local.hpp" #include "logging/flags.hpp" @@ -73,6 +74,9 @@ #include "messages/messages.hpp" +#include "sched/constants.hpp" +#include "sched/flags.hpp" + using namespace mesos; using namespace mesos::internal; using namespace mesos::internal::master; @@ -105,6 +109,7 @@ public: const Option<Credential>& _credential, const string& schedulerId, MasterDetector* _detector, + const internal::scheduler::Flags& _flags, pthread_mutex_t* _mutex, pthread_cond_t* _cond) // We use a UUID here to ensure that the master can reliably @@ -128,6 +133,7 @@ public: connected(false), aborted(false), detector(_detector), + flags(_flags), credential(_credential), authenticatee(NULL), authenticating(None()), @@ -235,13 +241,19 @@ protected: if (credential.isSome()) { // Authenticate with the master. + // TODO(vinod): Do a backoff for authentication similar to what + // we do for registration. authenticate(); } else { // Proceed with registration without authentication. LOG(INFO) << "No credentials provided." << " Attempting to register without authentication"; - doReliableRegistration(); + // TODO(vinod): Similar to the slave add a random delay to the + // first registration attempt too. This needs fixing tests + // that expect scheduler to register even with clock paused + // (e.g., rate limiting tests). + doReliableRegistration(flags.registration_backoff_factor); } } else { // In this case, we don't actually invoke Scheduler::error @@ -359,7 +371,7 @@ protected: authenticated = true; authenticating = None(); - doReliableRegistration(); // Proceed with registration. + doReliableRegistration(flags.registration_backoff_factor); } void authenticationTimeout(Future<bool> future) @@ -463,7 +475,7 @@ protected: VLOG(1) << "Scheduler::reregistered took " << stopwatch.elapsed(); } - void doReliableRegistration() + void doReliableRegistration(Duration maxBackoff) { if (connected || master.isNone()) { return; @@ -488,7 +500,29 @@ protected: send(master.get(), message); } - delay(Seconds(1), self(), &Self::doReliableRegistration); + // Bound the maximum backoff by 'REGISTRATION_RETRY_INTERVAL_MAX'. + maxBackoff = + std::min(maxBackoff, scheduler::REGISTRATION_RETRY_INTERVAL_MAX); + + // If failover timeout is present, bound the maximum backoff + // by 1/10th of the failover timeout. + if (framework.has_failover_timeout()) { + Try<Duration> duration = Duration::create(framework.failover_timeout()); + if (duration.isSome()) { + maxBackoff = std::min(maxBackoff, duration.get() / 10); + } + } + + // Determine the delay for next attempt by picking a random + // duration between 0 and 'maxBackoff'. + // TODO(vinod): Use random numbers from <random> header. + Duration delay = maxBackoff * ((double) ::random() / RAND_MAX); + + VLOG(1) << "Will retry registration in " << delay << " if necessary"; + + // Backoff. + process::delay( + delay, self(), &Self::doReliableRegistration, maxBackoff * 2); } void resourceOffers( @@ -1021,6 +1055,8 @@ private: MasterDetector* detector; + const internal::scheduler::Flags flags; + hashmap<OfferID, hashmap<SlaveID, UPID> > savedOffers; hashmap<SlaveID, UPID> savedSlavePids; @@ -1051,7 +1087,6 @@ void MesosSchedulerDriver::initialize() { // we'll probably want a way to load master::Flags and slave::Flags // as well. local::Flags flags; - Try<Nothing> load = flags.load("MESOS_"); if (load.isError()) { @@ -1116,6 +1151,7 @@ void MesosSchedulerDriver::initialize() { url = pid.isSome() ? static_cast<string>(pid.get()) : master; } + // Implementation of C++ API. // // Notes: @@ -1230,6 +1266,16 @@ Status MesosSchedulerDriver::start() detector = detector_.get(); } + // Load scheduler flags. + internal::scheduler::Flags flags; + Try<Nothing> load = flags.load("MESOS_"); + + if (load.isError()) { + status = DRIVER_ABORTED; + scheduler->error(this, load.error()); + return status; + } + CHECK(process == NULL); if (credential == NULL) { @@ -1240,6 +1286,7 @@ Status MesosSchedulerDriver::start() None(), schedulerId, detector, + flags, &mutex, &cond); } else { @@ -1251,6 +1298,7 @@ Status MesosSchedulerDriver::start() cred, schedulerId, detector, + flags, &mutex, &cond); } http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/slave/slave.cpp ---------------------------------------------------------------------- diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp index dbfd1a8..81e0c4b 100644 --- a/src/slave/slave.cpp +++ b/src/slave/slave.cpp @@ -906,7 +906,7 @@ void Slave::reregistered( } -void Slave::doReliableRegistration(const Duration& duration) +void Slave::doReliableRegistration(Duration maxBackoff) { if (master.isNone()) { LOG(INFO) << "Skipping registration because no master present"; @@ -1040,19 +1040,17 @@ void Slave::doReliableRegistration(const Duration& duration) send(master.get(), message); } - // Retry registration if necessary. - Duration next = std::min( - duration * ((double) ::random() / RAND_MAX), - REGISTER_RETRY_INTERVAL_MAX); + // Bound the maximum backoff by 'REGISTER_RETRY_INTERVAL_MAX'. + maxBackoff = std::min(maxBackoff, REGISTER_RETRY_INTERVAL_MAX); - Duration duration_ = std::min( - duration * 2, - REGISTER_RETRY_INTERVAL_MAX); + // Determine the delay for next attempt by picking a random + // duration between 0 and 'maxBackoff'. + Duration delay = maxBackoff * ((double) ::random() / RAND_MAX); - VLOG(1) << "Will retry registration in " << next << " if necessary"; + VLOG(1) << "Will retry registration in " << delay << " if necessary"; // Backoff. - delay(next, self(), &Slave::doReliableRegistration, duration_); + process::delay(delay, self(), &Slave::doReliableRegistration, maxBackoff * 2); } http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/slave/slave.hpp ---------------------------------------------------------------------- diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp index 5b082fc..72bbec9 100644 --- a/src/slave/slave.hpp +++ b/src/slave/slave.hpp @@ -104,7 +104,7 @@ public: const SlaveID& slaveId, const std::vector<ReconcileTasksMessage>& reconciliations); - void doReliableRegistration(const Duration& duration); + void doReliableRegistration(Duration maxBackoff); // Made 'virtual' for Slave mocking. virtual void runTask( http://git-wip-us.apache.org/repos/asf/mesos/blob/d9f85d7f/src/tests/fault_tolerance_tests.cpp ---------------------------------------------------------------------- diff --git a/src/tests/fault_tolerance_tests.cpp b/src/tests/fault_tolerance_tests.cpp index 372c4fd..5baeda6 100644 --- a/src/tests/fault_tolerance_tests.cpp +++ b/src/tests/fault_tolerance_tests.cpp @@ -40,6 +40,8 @@ #include "master/allocator.hpp" #include "master/master.hpp" +#include "sched/constants.hpp" + #include "slave/constants.hpp" #include "slave/slave.hpp" @@ -803,7 +805,7 @@ TEST_F(FaultToleranceTest, SchedulerFailoverRetriedReregistration) AWAIT_READY(reregistrationMessage); // Trigger the re-registration retry. - Clock::advance(Seconds(1)); + Clock::advance(internal::scheduler::REGISTRATION_BACKOFF_FACTOR); AWAIT_READY(sched2Registered); @@ -856,9 +858,8 @@ TEST_F(FaultToleranceTest, FrameworkReliableRegistration) AWAIT_READY(frameworkRegisteredMessage); - // TODO(benh): Pull out constant from SchedulerProcess. Clock::pause(); - Clock::advance(Seconds(1)); + Clock::advance(internal::scheduler::REGISTRATION_BACKOFF_FACTOR); AWAIT_READY(registered); // Ensures registered message is received.
