[mesos] branch master updated: Added tests for 'volume/csi' isolator recovery.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new a8059a7 Added tests for 'volume/csi' isolator recovery. a8059a7 is described below commit a8059a78473774e3d95e8e908f360ee5e9aadd0d Author: Greg Mann AuthorDate: Fri Sep 4 10:39:10 2020 -0700 Added tests for 'volume/csi' isolator recovery. Review: https://reviews.apache.org/r/72806/ --- .../containerizer/volume_csi_isolator_tests.cpp| 360 + 1 file changed, 360 insertions(+) diff --git a/src/tests/containerizer/volume_csi_isolator_tests.cpp b/src/tests/containerizer/volume_csi_isolator_tests.cpp index dafb0b7..d51d3c9 100644 --- a/src/tests/containerizer/volume_csi_isolator_tests.cpp +++ b/src/tests/containerizer/volume_csi_isolator_tests.cpp @@ -1117,6 +1117,366 @@ TEST_P(VolumeCSIIsolatorTest, ROOT_UnmanagedPlugin) AWAIT_READY(finishedUpdate); } + +// When the agent fails over while a CSI volume is mounted to a container, the +// agent should recover the volume state so that the volume can be successfully +// unpublished after agent recovery is complete. +TEST_P(VolumeCSIIsolatorTest, ROOT_INTERNET_CURL_UnpublishAfterAgentFailover) +{ + createCsiPluginConfig(Bytes(0), TEST_VOLUME_ID + ":1MB"); + + Try> master = StartMaster(); + ASSERT_SOME(master); + + Future slaveRegisteredMessage = +FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _); + + Owned detector = master.get()->createDetector(); + + slave::Flags agentFlags = CreateSlaveFlags(); + + Fetcher fetcher(agentFlags); + + // Use a consistent ID across agent restart so that the executor can register. 
+ string processId = process::ID::generate("slave"); + + SlaveOptions agentOptions = SlaveOptions(detector.get()) +.withId(processId) +.withFlags(agentFlags); + + Try> agent = StartSlave(agentOptions); + ASSERT_SOME(agent); + + AWAIT_READY(slaveRegisteredMessage); + + auto scheduler = std::make_shared(); + + v1::FrameworkInfo frameworkInfo = v1::DEFAULT_FRAMEWORK_INFO; + frameworkInfo.set_checkpoint(true); + + EXPECT_CALL(*scheduler, connected(_)) +.WillOnce(v1::scheduler::SendSubscribe(frameworkInfo)); + + Future subscribed; + EXPECT_CALL(*scheduler, subscribed(_, _)) +.WillOnce(FutureArg<1>()); + + Future offers; + EXPECT_CALL(*scheduler, offers(_, _)) +.WillOnce(FutureArg<1>()); + + EXPECT_CALL(*scheduler, heartbeat(_)) +.WillRepeatedly(Return()); // Ignore heartbeats. + + v1::scheduler::TestMesos mesos( + master.get()->pid, + ContentType::PROTOBUF, + scheduler); + + AWAIT_READY(subscribed); + v1::FrameworkID frameworkId(subscribed->framework_id()); + + v1::Resources resources = +v1::Resources::parse("cpus:0.1;mem:32;disk:32").get(); + + v1::ExecutorInfo executorInfo = v1::createExecutorInfo( + v1::DEFAULT_EXECUTOR_ID, + None(), + resources, + v1::ExecutorInfo::DEFAULT, + frameworkId); + + AWAIT_READY(offers); + ASSERT_FALSE(offers->offers().empty()); + + v1::Offer offer = offers->offers(0); + const v1::AgentID& agentId = offer.agent_id(); + + // Run a command which will loop until a file disappears. This allows us to + // terminate the task after agent failover. 
+ Try taskCommand = strings::format( + "touch %s && while [ -e %s ]; do : sleep 0.01 ; done", + TEST_CONTAINER_PATH + TEST_OUTPUT_FILE, + TEST_CONTAINER_PATH + TEST_OUTPUT_FILE); + + v1::TaskInfo taskInfo = v1::createTask(agentId, resources, taskCommand.get()); + + taskInfo.mutable_container()->CopyFrom(v1::createContainerInfo( + "alpine", + {v1::createVolumeCsi( + TEST_CSI_PLUGIN_TYPE, + TEST_VOLUME_ID, + TEST_CONTAINER_PATH, + mesos::v1::Volume::Source::CSIVolume::VolumeCapability +::AccessMode::SINGLE_NODE_WRITER, + false)})); + + Future startingUpdate; + Future runningUpdate; + Future finishedUpdate; + + testing::Sequence taskSequence; + EXPECT_CALL( + *scheduler, + update(_, TaskStatusUpdateStateEq(v1::TASK_STARTING))) +.InSequence(taskSequence) +.WillOnce(DoAll( +FutureArg<1>(), +v1::scheduler::SendAcknowledge(frameworkId, agentId))) +.WillRepeatedly(v1::scheduler::SendAcknowledge(frameworkId, agentId)); + + EXPECT_CALL( + *scheduler, + update(_, TaskStatusUpdateStateEq(v1::TASK_RUNNING))) +.InSequence(taskSequence) +.WillOnce(DoAll( +FutureArg<1>(), +v1::scheduler::SendAcknowledge(frameworkId, agentId))) +.WillRepeatedly(v1::scheduler::SendAcknowledge(frameworkId, agentId)); + + EXPECT_CALL( + *scheduler, + update(_, TaskStatusUpdateStateEq(v1::TASK_FINISHED))) +.InSequence(taskSequence) +.WillOnce(DoAll(
[mesos] branch master updated (2bf7f5d -> fc22984)
This is an automated email from the ASF dual-hosted git repository. grag pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git. from 2bf7f5d Added a test of UPDATE_FRAMEWORK with the same FrameworkInfo. new a3fe939 Updated the test CSI plugin for CSI server testing. new f0ce0f1 Added a test helper for CSI volumes. new fc22984 Added tests for the 'volume/csi' isolator. The 3 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: src/Makefile.am|1 + src/examples/test_csi_plugin.cpp | 94 +- src/tests/CMakeLists.txt |1 + src/tests/cluster.cpp |2 +- .../containerizer/volume_csi_isolator_tests.cpp| 1122 src/tests/mesos.hpp| 76 ++ 6 files changed, 1275 insertions(+), 21 deletions(-) create mode 100644 src/tests/containerizer/volume_csi_isolator_tests.cpp
[mesos] 03/03: Added tests for the 'volume/csi' isolator.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit fc22984de558302029a8cad0655e375653208448 Author: Greg Mann AuthorDate: Thu Sep 3 12:06:38 2020 -0700 Added tests for the 'volume/csi' isolator. Review: https://reviews.apache.org/r/72728/ --- src/Makefile.am|1 + src/tests/CMakeLists.txt |1 + src/tests/cluster.cpp |2 +- .../containerizer/volume_csi_isolator_tests.cpp| 1122 4 files changed, 1125 insertions(+), 1 deletion(-) diff --git a/src/Makefile.am b/src/Makefile.am index 673ea6c..c2da4e9 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -2873,6 +2873,7 @@ mesos_tests_SOURCES += \ tests/containerizer/runtime_isolator_tests.cpp \ tests/containerizer/sched_tests.cpp \ tests/containerizer/setns_test_helper.cpp\ + tests/containerizer/volume_csi_isolator_tests.cpp\ tests/containerizer/volume_host_path_isolator_tests.cpp \ tests/containerizer/volume_image_isolator_tests.cpp \ tests/containerizer/volume_secret_isolator_tests.cpp diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index 6b420d0..6beb74e 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -247,6 +247,7 @@ if (LINUX) containerizer/rootfs.cpp containerizer/runtime_isolator_tests.cpp containerizer/sched_tests.cpp +containerizer/volume_csi_isolator_tests.cpp containerizer/volume_host_path_isolator_tests.cpp containerizer/volume_image_isolator_tests.cpp containerizer/volume_secret_isolator_tests.cpp) diff --git a/src/tests/cluster.cpp b/src/tests/cluster.cpp index 3c86855..d547cbb 100644 --- a/src/tests/cluster.cpp +++ b/src/tests/cluster.cpp @@ -537,7 +537,7 @@ Try> Slave::create( const process::http::URL agentUrl( scheme, process::address().ip, -flags.port, +process::address().port, processId + "/api/v1"); Try> _csiServer = slave::CSIServer::create( diff --git a/src/tests/containerizer/volume_csi_isolator_tests.cpp 
b/src/tests/containerizer/volume_csi_isolator_tests.cpp new file mode 100644 index 000..dafb0b7 --- /dev/null +++ b/src/tests/containerizer/volume_csi_isolator_tests.cpp @@ -0,0 +1,1122 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include + +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#ifdef USE_SSL_SOCKET +#include "authentication/executor/jwt_secret_generator.hpp" +#endif // USE_SSL_SOCKET + +#include "csi/paths.hpp" + +#include "master/flags.hpp" + +#include "slave/csi_server.hpp" +#include "slave/flags.hpp" +#include "slave/paths.hpp" + +#include "slave/containerizer/fetcher.hpp" + +#include "slave/containerizer/mesos/containerizer.hpp" +#include "slave/containerizer/mesos/paths.hpp" + +#include "tests/environment.hpp" +#include "tests/mesos.hpp" + +#ifdef USE_SSL_SOCKET +using mesos::authentication::executor::JWTSecretGenerator; +#endif // USE_SSL_SOCKET + +using mesos::internal::slave::CSIServer; +using mesos::internal::slave::Fetcher; +using mesos::internal::slave::MesosContainerizer; + +using 
mesos::internal::slave::containerizer::paths::getContainerPid; + +using mesos::master::detector::MasterDetector; + +using process::Clock; +using process::Future; +using process::Owned; + +using std::list; +using std::string; +using std::vector; + +using testing::AllOf; +using testing::AnyOf; +using testing::DoAll; + +namespace mesos { +namespace internal { +namespace tests { + +const string TEST_CONTAINER_PATH = "volume-container-path/"; +const string TEST_CSI_PLUGIN_TYPE = "org.apa
[mesos] 02/03: Added a test helper for CSI volumes.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit f0ce0f1d8601228f16efbb98420693af42b19d43 Author: Greg Mann AuthorDate: Thu Sep 3 12:06:34 2020 -0700 Added a test helper for CSI volumes. Review: https://reviews.apache.org/r/72805/ --- src/tests/mesos.hpp | 76 + 1 file changed, 76 insertions(+) diff --git a/src/tests/mesos.hpp b/src/tests/mesos.hpp index 8f89d7c..49abfc2 100644 --- a/src/tests/mesos.hpp +++ b/src/tests/mesos.hpp @@ -853,6 +853,66 @@ inline TVolume createVolumeFromDockerImage( } +template +inline TVolume createVolumeCsi( +const std::string& pluginName, +const std::string volumeId, +const std::string& containerPath, +const typename TVolume::Source::CSIVolume::VolumeCapability + ::AccessMode::Mode mode, +bool readonly) +{ + TVolume volume; + volume.set_container_path(containerPath); + + typename TVolume::Source* source = volume.mutable_source(); + source->set_type(TVolume::Source::CSI_VOLUME); + source->mutable_csi_volume()->set_plugin_name(pluginName); + + typename TVolume::Source::CSIVolume::StaticProvisioning* staticInfo = +source->mutable_csi_volume()->mutable_static_provisioning(); + + staticInfo->set_volume_id(volumeId); + staticInfo->set_readonly(readonly); + staticInfo->mutable_volume_capability()->mutable_mount(); + staticInfo->mutable_volume_capability() +->mutable_access_mode()->set_mode(mode); + + typedef typename TVolume::Source::CSIVolume::VolumeCapability::AccessMode +CSIAccessMode; + + // Set the top-level `mode` field of the volume based on the values of the + // CSI access mode and the `readonly` field. 
+ typename TVolume::Mode mesosMode; + + switch (mode) { +case CSIAccessMode::SINGLE_NODE_WRITER: +case CSIAccessMode::MULTI_NODE_SINGLE_WRITER: +case CSIAccessMode::MULTI_NODE_MULTI_WRITER: { + if (readonly) { +mesosMode = TVolume::RO; + } else { +mesosMode = TVolume::RW; + } + + break; +} + +case CSIAccessMode::SINGLE_NODE_READER_ONLY: +case CSIAccessMode::MULTI_NODE_READER_ONLY: +default: { + mesosMode = TVolume::RO; + + break; +} + } + + volume.set_mode(mesosMode); + + return volume; +} + + template inline TNetworkInfo createNetworkInfo( const std::string& networkName) @@ -1745,6 +1805,14 @@ inline Volume createVolumeFromDockerImage(Args&&... args) template +inline Volume createVolumeCsi(Args&&... args) +{ + return common::createVolumeCsi( + std::forward(args)...); +} + + +template inline NetworkInfo createNetworkInfo(Args&&... args) { return common::createNetworkInfo(std::forward(args)...); @@ -2035,6 +2103,14 @@ inline mesos::v1::Volume createVolumeFromDockerImage(Args&&... args) template +inline mesos::v1::Volume createVolumeCsi(Args&&... args) +{ + return common::createVolumeCsi( + std::forward(args)...); +} + + +template inline mesos::v1::NetworkInfo createNetworkInfo(Args&&... args) { return common::createNetworkInfo(
[mesos] 01/03: Updated the test CSI plugin for CSI server testing.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit a3fe939616fe13f34bd3555d613a0e1323730424 Author: Greg Mann AuthorDate: Thu Sep 3 12:06:31 2020 -0700 Updated the test CSI plugin for CSI server testing. This patch adds additional configuration flags to the test CSI plugin which are necessary in order to test the agent's CSI server. Review: https://reviews.apache.org/r/72727/ --- src/examples/test_csi_plugin.cpp | 94 +++- 1 file changed, 74 insertions(+), 20 deletions(-) diff --git a/src/examples/test_csi_plugin.cpp b/src/examples/test_csi_plugin.cpp index 214a3ee..e878bd6 100644 --- a/src/examples/test_csi_plugin.cpp +++ b/src/examples/test_csi_plugin.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -96,8 +97,6 @@ using grpc::ServerContext; using grpc::Status; using grpc::WriteOptions; -using mesos::csi::VolumeInfo; - using process::grpc::StatusError; using VolumeCapability = mesos::Volume::Source::CSIVolume::VolumeCapability; @@ -159,6 +158,12 @@ public: "If a volume with the same name already exists, the pair will be\n" "ignored. 
(Example: 'volume1:1GB;volume2:2GB')"); +add(::volume_id_path, +"volume_id_path", +"When set to true, this flag causes the volume ID of all volumes to\n" +"be set to the volume's path.", +true); + add(::forward, "forward", "If set, the plugin forwards all requests to the specified Unix\n" @@ -172,10 +177,20 @@ public: Option create_parameters; Option volume_metadata; Option volumes; + bool volume_id_path; Option forward; }; +struct VolumeInfo +{ + Bytes capacity; + string id; + string path; + google::protobuf::Map context; +}; + + class TestCSIPlugin : public csi::v0::Identity::Service, public csi::v0::Controller::Service, @@ -192,13 +207,15 @@ public: const Bytes& _availableCapacity, const hashmap& _createParameters, const hashmap& _volumeMetadata, - const hashmap& _volumes) + const hashmap& _volumes, + bool _volumeIdPath) : apiVersion(_apiVersion), endpoint(_endpoint), workDir(_workDir), availableCapacity(_availableCapacity), createParameters(_createParameters.begin(), _createParameters.end()), - volumeMetadata(_volumeMetadata.begin(), _volumeMetadata.end()) + volumeMetadata(_volumeMetadata.begin(), _volumeMetadata.end()), + volumeIdPath(_volumeIdPath) { // Construct the default mount volume capability. defaultVolumeCapability.mutable_mount(); @@ -212,8 +229,9 @@ public: // TODO(jieyu): Consider not using CHECKs here. Try> paths = fs::list(path::join(workDir, "*-*")); foreach (const string& path, CHECK_NOTERROR(paths)) { - volumes.put(path, CHECK_NOTERROR(parseVolumePath(path))); - usedCapacity += volumes.at(path).capacity; + Try createdVolume = CHECK_NOTERROR(parseVolumePath(path)); + volumes.put(createdVolume->id, createdVolume.get()); + usedCapacity += createdVolume->capacity; } // Create preprovisioned volumes if they have not existed yet. 
@@ -229,10 +247,11 @@ public: continue; } - VolumeInfo volumeInfo{ -capacity, getVolumePath(capacity, name), volumeMetadata}; - Try mkdir = os::mkdir(volumeInfo.id); + VolumeInfo volumeInfo = +createVolumeInfo(capacity, name, volumeMetadata); + + Try mkdir = os::mkdir(volumeInfo.path); CHECK_SOME(mkdir) << "Failed to create directory for preprovisioned volume '" << name << "': " << mkdir.error(); @@ -428,6 +447,14 @@ private: Try parseVolumePath(const string& dir); Option findVolumeByName(const string& name); + // Creates a volume info with the specified name based on the + // value of the `volume_id_path` flag. + VolumeInfo createVolumeInfo( + const Bytes& _capacity, + const string& name, + const google::protobuf::Map context); + + Try createVolume( const string& name, const Bytes& requiredBytes, @@ -494,6 +521,7 @@ private: Map createParameters; Map volumeMetadata; hashmap volumes; + bool volumeIdPath; }; @@ -1299,14 +1327,19 @@ Try TestCSIPlugin::parseVolumePath(const string& dir) << "Cannot reconstruct volume path '" << dir << "' from volume name '" << name.get() << "' and capacity " << capacity.get(); - return VolumeInfo{capacity.get(), dir, volumeMetadata}; + const string volumeId = volumeIdPath ? dir : name.get(); + + re
[mesos] branch master updated: Fixed broken authorization in the CSI server.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new a1bfa74 Fixed broken authorization in the CSI server. a1bfa74 is described below commit a1bfa749e594bd8d9eb008ea4d90e6811f5f7e07 Author: Greg Mann AuthorDate: Mon Aug 31 13:02:18 2020 -0700 Fixed broken authorization in the CSI server. The CSI server must use a principal when authenticating which contains a claim that allows the authorizer to implicitly approve requests from the CSI server to the agent's HTTP API. Review: https://reviews.apache.org/r/72816/ --- src/slave/csi_server.cpp | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/slave/csi_server.cpp b/src/slave/csi_server.cpp index 3f29a81..14fa866 100644 --- a/src/slave/csi_server.cpp +++ b/src/slave/csi_server.cpp @@ -73,6 +73,8 @@ namespace mesos { namespace internal { namespace slave { +constexpr char DEFAULT_CSI_CONTAINER_PREFIX[] = "mesos-internal-csi-"; + static VolumeState createVolumeState( const Volume::Source::CSIVolume::StaticProvisioning& volume); @@ -232,7 +234,7 @@ Try CSIServerProcess::initializePlugin(const Option& name) rootDir, info, extractServices(info), - "org-apache-mesos-internal-", + DEFAULT_CSI_CONTAINER_PREFIX, authToken, plugin.runtime, )); @@ -317,7 +319,9 @@ Future CSIServerProcess::start(const SlaveID& _agentId) // The contents of this principal are arbitrary. We choose to avoid a // principal with a 'value' string so that we do not unintentionally collide // with another real principal with restricted permissions. -Principal principal(Option::none(), {{"key", "csi-server"}}); +Principal principal( +Option::none(), +{{"cid_prefix", DEFAULT_CSI_CONTAINER_PREFIX}}); result = secretGenerator->generate(principal) .then(defer(self(), [=](const Secret& secret) -> Future {
[mesos] branch master updated: Fixed a bug in CSI server initialization.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new 22e Fixed a bug in CSI server initialization. 22e is described below commit 22e50813597edd2cbb0304823ca56e5f2d25 Author: Greg Mann AuthorDate: Mon Aug 24 17:51:17 2020 -0700 Fixed a bug in CSI server initialization. Previously, the CSI server would initialize the service managers before the auth token was generated, meaning that requests made by the service managers to an agent which requires HTTP authentication would fail. This patch changes the order of initialization so that the service managers will be initialized with a valid auth token when necessary. Review: https://reviews.apache.org/r/72799/ --- src/slave/csi_server.cpp | 71 ++-- 1 file changed, 39 insertions(+), 32 deletions(-) diff --git a/src/slave/csi_server.cpp b/src/slave/csi_server.cpp index 0ffe020..3f29a81 100644 --- a/src/slave/csi_server.cpp +++ b/src/slave/csi_server.cpp @@ -311,44 +311,51 @@ Future CSIServerProcess::start(const SlaveID& _agentId) agentId = _agentId; - // Load all CSI plugin configurations found. - Try init = initializePlugin(); - if (init.isError()) { -return Failure( -"CSI server failed to initialize CSI plugins: " + init.error()); - } - - if (!secretGenerator) { -return Nothing(); + Future result = Nothing(); + + if (secretGenerator) { +// The contents of this principal are arbitrary. We choose to avoid a +// principal with a 'value' string so that we do not unintentionally collide +// with another real principal with restricted permissions. 
+Principal principal(Option::none(), {{"key", "csi-server"}}); + +result = secretGenerator->generate(principal) + .then(defer(self(), [=](const Secret& secret) -> Future { +Option error = common::validation::validateSecret(secret); +if (error.isSome()) { + return Failure( + "CSI server failed to validate generated secret: " + + error->message); +} + +if (secret.type() != Secret::VALUE) { + return Failure( + "CSI server expecting generated secret to be of VALUE type " + "instead of " + stringify(secret.type()) + " type; " + + "only VALUE type secrets are supported at this time"); +} + +CHECK(secret.has_value()); + +authToken = secret.value().data(); + +return Nothing(); +})); } - // The contents of this principal are arbitrary. We choose to avoid a - // principal with a 'value' string so that we do not unintentionally collide - // with another real principal with restricted permissions. - Principal principal(Option::none(), {{"key", "csi-server"}}); - - return secretGenerator->generate(principal) -.then([=](const Secret& secret) -> Future { - Option error = common::validation::validateSecret(secret); - if (error.isSome()) { + return result +.then(defer(self(), [=]() -> Future { + // Load all CSI plugin configurations found. + // NOTE: `initializePlugin()` requires that the `authToken` has already + // been set, so the order of these continuations matters. + Try init = initializePlugin(); + if (init.isError()) { return Failure( -"CSI server failed to validate generated secret: " + -error->message); +"CSI server failed to initialize CSI plugins: " + init.error()); } - if (secret.type() != Secret::VALUE) { -return Failure( -"CSI server expecting generated secret to be of VALUE type " -"instead of " + stringify(secret.type()) + " type; " + -"only VALUE type secrets are supported at this time"); - } - - CHECK(secret.has_value()); - - authToken = secret.value().data(); - return Nothing(); - }); +})); }
[mesos] 01/03: Fixed a bug in CSI volume manager initialization.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 68b481085fb82b475e108b9aa39935a8d7729983 Author: Greg Mann AuthorDate: Thu Aug 20 19:26:48 2020 -0700 Fixed a bug in CSI volume manager initialization. Previously, the volume managers would assume that they could make CONTROLLER_SERVICE calls during plugin initialization, regardless of whether or not the plugin provides that service. Review: https://reviews.apache.org/r/72726/ --- src/csi/v0_volume_manager.cpp | 2 +- src/csi/v1_volume_manager.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/csi/v0_volume_manager.cpp b/src/csi/v0_volume_manager.cpp index 42a23ba..8ba6100 100644 --- a/src/csi/v0_volume_manager.cpp +++ b/src/csi/v0_volume_manager.cpp @@ -648,7 +648,7 @@ Future VolumeManagerProcess::prepareServices() vector> futures; foreach (const Service& service, services) { futures.push_back(call( -CONTROLLER_SERVICE, ::getPluginInfo, GetPluginInfoRequest()) +service, ::getPluginInfo, GetPluginInfoRequest()) .onReady([service](const GetPluginInfoResponse& response) { LOG(INFO) << service << " loaded: " << stringify(response); })); diff --git a/src/csi/v1_volume_manager.cpp b/src/csi/v1_volume_manager.cpp index c05265c..1a1b97c 100644 --- a/src/csi/v1_volume_manager.cpp +++ b/src/csi/v1_volume_manager.cpp @@ -669,7 +669,7 @@ Future VolumeManagerProcess::prepareServices() vector> futures; foreach (const Service& service, services) { futures.push_back(call( -CONTROLLER_SERVICE, ::getPluginInfo, GetPluginInfoRequest()) +service, ::getPluginInfo, GetPluginInfoRequest()) .onReady([service](const GetPluginInfoResponse& response) { LOG(INFO) << service << " loaded: " << stringify(response); }));
[mesos] 03/03: Initialized plugins lazily in the CSI server.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 4ff51041df860dbcc2247ef47a0596e5132da190 Author: Greg Mann AuthorDate: Thu Aug 20 19:27:23 2020 -0700 Initialized plugins lazily in the CSI server. Review: https://reviews.apache.org/r/72779/ --- src/slave/csi_server.cpp | 403 +-- src/slave/csi_server.hpp | 8 +- 2 files changed, 253 insertions(+), 158 deletions(-) diff --git a/src/slave/csi_server.cpp b/src/slave/csi_server.cpp index 2ba4f22..0ffe020 100644 --- a/src/slave/csi_server.cpp +++ b/src/slave/csi_server.cpp @@ -19,6 +19,7 @@ #include #include +#include #include @@ -58,6 +59,7 @@ using mesos::csi::state::VolumeState; using process::Failure; using process::Future; using process::Owned; +using process::Promise; using process::grpc::client::Runtime; @@ -85,17 +87,17 @@ public: CSIServerProcess( const process::http::URL& _agentUrl, const string& _rootDir, + const string& _pluginConfigDir, SecretGenerator* _secretGenerator, - SecretResolver* _secretResolver, - hashmap _pluginConfigs) + SecretResolver* _secretResolver) : process::ProcessBase(process::ID::generate("csi-server")), agentUrl(_agentUrl), rootDir(_rootDir), + pluginConfigDir(_pluginConfigDir), secretGenerator(_secretGenerator), - secretResolver(_secretResolver), - pluginConfigs(_pluginConfigs) {} + secretResolver(_secretResolver) {} - Future start(); + Future start(const SlaveID& _agentId); Future publishVolume(const Volume::Source::CSIVolume& volume); @@ -106,73 +108,125 @@ public: private: struct CSIPlugin { -CSIPlugin(const string& metricsPrefix) : metrics(metricsPrefix) {} +CSIPlugin( +const CSIPluginInfo& _info, +const string& metricsPrefix) + : info(_info), +metrics(metricsPrefix) {} CSIPluginInfo info; Owned serviceManager; Owned volumeManager; Runtime runtime; csi::Metrics metrics; + +// CSI plugins are initialized lazily. 
When a publish/unpublish call is +// received for a plugin which is not yet initialized, this promise is used +// to perform the call after initialization is complete. +Promise initialized; }; + // Attempts to load configuration for a plugin with the specified name and + // then initializes the plugin. If no name is specified, then all + // configurations found in the plugin config directory are loaded. + Try initializePlugin(const Option& name = None()); + // Contains the plugins loaded by the server. The key of this map is the // plugin name. hashmap plugins; const process::http::URL agentUrl; + Option agentId; const string rootDir; + const string pluginConfigDir; SecretGenerator* secretGenerator; SecretResolver* secretResolver; Option authToken; - hashmap pluginConfigs; - Option agentId; }; -Future CSIServerProcess::start() +Try CSIServerProcess::initializePlugin(const Option& name) { - Future result = Nothing(); + if (name.isSome()) { +CHECK(!plugins.contains(name.get())); + } - // The contents of this principal are arbitrary. We choose to avoid a - // principal with a 'value' string so that we do not unintentionally collide - // with another real principal with restricted permissions. - Principal principal(Option::none(), {{"key", "csi-server"}}); + Try> entries = os::ls(pluginConfigDir); + if (entries.isError()) { +return Error( +"Unable to list the CSI plugin configuration directory '" + +pluginConfigDir + "': " + entries.error()); + } + + // We are either looking for one specific plugin (if `name` is SOME), or we + // are loading all configs we find (if `name` is NONE). First, we populate + // `pluginConfigs` with one or more valid configurations. Then, we will + // initialize the plugin(s) based on the configuration(s) found. + hashmap pluginConfigs; + + foreach (const string& entry, entries.get()) { +const string path = path::join(pluginConfigDir, entry); + +// Ignore directory entries. 
+if (os::stat::isdir(path)) { + continue; +} + +Try read = os::read(path); +if (read.isError()) { + // In case of an error we log and skip to the next entry. + LOG(ERROR) << "Failed to read CSI plugin configuration file '" + << path << "': " << read.error(); + + continue; +} + +Try json = JSON::parse(read.get()); +if (json.isError()) { + return Error("JSON parse of '" + path + "' failed: " + json.error()); +} + +Try parse = ::protobuf::parse(json.get()); +if (parse.isError()) { + retu
[mesos] branch master updated (f284314 -> 4ff5104)
This is an automated email from the ASF dual-hosted git repository. grag pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git. from f284314 Added a scheduler API test for a valid offer constraints update. new 68b4810 Fixed a bug in CSI volume manager initialization. new 5ed30db Added the CSI server to the Mesos agent. new 4ff5104 Initialized plugins lazily in the CSI server. The 3 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: src/csi/v0_volume_manager.cpp | 2 +- src/csi/v1_volume_manager.cpp | 2 +- src/local/local.cpp | 1 + src/slave/csi_server.cpp | 403 ++ src/slave/csi_server.hpp | 8 +- src/slave/main.cpp| 101 +++ src/slave/slave.cpp | 18 ++ src/slave/slave.hpp | 3 + src/tests/cluster.cpp | 128 ++ src/tests/cluster.hpp | 3 + src/tests/mesos.cpp | 1 + src/tests/mesos.hpp | 9 + src/tests/mock_slave.cpp | 7 + src/tests/mock_slave.hpp | 3 + 14 files changed, 463 insertions(+), 226 deletions(-)
[mesos] 02/03: Added the CSI server to the Mesos agent.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 5ed30db48785007e35805886a024ebb8a61a7037 Author: Greg Mann AuthorDate: Thu Aug 20 19:27:02 2020 -0700 Added the CSI server to the Mesos agent. This patch adds a CSI server to the Mesos agent in both the agent binary and in tests. Review: https://reviews.apache.org/r/72761/ --- src/local/local.cpp | 1 + src/slave/main.cpp | 101 ++--- src/slave/slave.cpp | 18 +++ src/slave/slave.hpp | 3 ++ src/tests/cluster.cpp| 128 ++- src/tests/cluster.hpp| 3 ++ src/tests/mesos.cpp | 1 + src/tests/mesos.hpp | 9 src/tests/mock_slave.cpp | 7 +++ src/tests/mock_slave.hpp | 3 ++ 10 files changed, 208 insertions(+), 66 deletions(-) diff --git a/src/local/local.cpp b/src/local/local.cpp index 8950570..9535399 100644 --- a/src/local/local.cpp +++ b/src/local/local.cpp @@ -535,6 +535,7 @@ PID launch(const Flags& flags, Allocator* _allocator) secretGenerators->back(), nullptr, nullptr, +nullptr, #ifndef __WINDOWS__ None(), #endif // __WINDOWS__ diff --git a/src/slave/main.cpp b/src/slave/main.cpp index 0aa2cc9..84b813c 100644 --- a/src/slave/main.cpp +++ b/src/slave/main.cpp @@ -37,6 +37,8 @@ #include #include +#include + #include #include #include @@ -84,6 +86,7 @@ #include "module/manager.hpp" #include "slave/constants.hpp" +#include "slave/csi_server.hpp" #include "slave/gc.hpp" #include "slave/slave.hpp" #include "slave/task_status_update_manager.hpp" @@ -111,6 +114,8 @@ using mesos::Authorizer; using mesos::SecretResolver; using mesos::SlaveInfo; +using net::IP; + using process::Owned; using process::firewall::DisabledEndpointsFirewallRule; @@ -528,6 +533,69 @@ int main(int argc, char** argv) << futureTracker.error(); } + SecretGenerator* secretGenerator = nullptr; + +#ifdef USE_SSL_SOCKET + if (flags.jwt_secret_key.isSome()) { +Try jwtSecretKey = os::read(flags.jwt_secret_key.get()); +if (jwtSecretKey.isError()) { 
+ EXIT(EXIT_FAILURE) << "Failed to read the file specified by " + << "--jwt_secret_key"; +} + +// TODO(greggomann): Factor the following code out into a common helper, +// since we also do this when loading credentials. +Try permissions = + os::permissions(flags.jwt_secret_key.get()); +if (permissions.isError()) { + LOG(WARNING) << "Failed to stat jwt secret key file '" + << flags.jwt_secret_key.get() + << "': " << permissions.error(); +} else if (permissions->others.rwx) { + LOG(WARNING) << "Permissions on executor secret key file '" + << flags.jwt_secret_key.get() + << "' are too open; it is recommended that your" + << " key file is NOT accessible by others"; +} + +secretGenerator = new JWTSecretGenerator(jwtSecretKey.get()); + } +#endif // USE_SSL_SOCKET + + // The agent will hold ownership of the CSI server, but we also pass a pointer + // to it into the containerizer for use by the 'volume/csi' isolator. + Owned csiServer; + + if (flags.csi_plugin_config_dir.isSome()) { +// Initialize the CSI server, which manages any configured CSI plugins. 
+string scheme = "http"; + +#ifdef USE_SSL_SOCKET +if (process::network::openssl::flags().enabled) { + scheme = "https"; +} +#endif + +const process::http::URL agentUrl( +scheme, +process::address().ip, +process::address().port, +id + "/api/v1"); + +Try> csiServer_ = CSIServer::create( +flags, +agentUrl, +secretGenerator, +secretResolver.get()); + +if (csiServer_.isError()) { + EXIT(EXIT_FAILURE) +<< "Failed to initialize the CSI server: " << csiServer_.error(); +} + +csiServer = std::move(csiServer_.get()); + } + Try containerizer = Containerizer::create( flags, false, @@ -535,7 +603,8 @@ int main(int argc, char** argv) gc, secretResolver.get(), volumeGidManager, - futureTracker.get()); + futureTracker.get(), + csiServer.get()); if (containerizer.isError()) { EXIT(EXIT_FAILURE) @@ -608,35 +677,6 @@ int main(int argc, char** argv) << qosController.error(); } - SecretGenerator* secretGenerator = nullptr; - -#ifdef USE_SSL_SOCKET - if (flags.jwt_secret_key.isSome()) { -Try
[mesos] branch master updated (c78dc33 -> fe0cd02)
This is an automated email from the ASF dual-hosted git repository. grag pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git. from c78dc33 Added interface for the CSI server. new 38ba191 Added support for secrets to the CSI volume managers. new fe0cd02 Added implementation of the CSI server. The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: src/CMakeLists.txt| 1 + src/Makefile.am | 2 + src/csi/state.proto | 6 + src/csi/v0_volume_manager.cpp | 103 +++- src/csi/v0_volume_manager.hpp | 5 +- src/csi/v0_volume_manager_process.hpp | 13 +- src/csi/v1_volume_manager.cpp | 96 ++- src/csi/v1_volume_manager.hpp | 5 +- src/csi/v1_volume_manager_process.hpp | 13 +- src/csi/volume_manager.cpp| 21 +- src/csi/volume_manager.hpp| 5 +- src/slave/csi_server.cpp | 455 ++ src/slave/csi_server.hpp | 8 +- 13 files changed, 711 insertions(+), 22 deletions(-) create mode 100644 src/slave/csi_server.cpp
[mesos] 02/02: Added implementation of the CSI server.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit fe0cd02a0697a4c4fcf5087fcafd6729beec0b41 Author: Greg Mann AuthorDate: Mon Aug 10 20:11:50 2020 -0700 Added implementation of the CSI server. Review: https://reviews.apache.org/r/72716/ --- src/CMakeLists.txt | 1 + src/Makefile.am | 2 + src/slave/csi_server.cpp | 455 +++ src/slave/csi_server.hpp | 8 +- 4 files changed, 465 insertions(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4e15e3d..c60d98a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -150,6 +150,7 @@ set(AGENT_SRC slave/constants.cpp slave/container_daemon.cpp slave/container_logger.cpp + slave/csi_server.cpp slave/flags.cpp slave/gc.cpp slave/http.cpp diff --git a/src/Makefile.am b/src/Makefile.am index 447db32..49dab4b 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1208,6 +1208,8 @@ libmesos_no_3rdparty_la_SOURCES += \ slave/container_daemon.hpp \ slave/container_daemon_process.hpp \ slave/container_logger.cpp \ + slave/csi_server.cpp \ + slave/csi_server.hpp \ slave/container_loggers/sandbox.cpp \ slave/container_loggers/sandbox.hpp \ slave/containerizer/composing.cpp\ diff --git a/src/slave/csi_server.cpp b/src/slave/csi_server.cpp new file mode 100644 index 000..a9a3995 --- /dev/null +++ b/src/slave/csi_server.cpp @@ -0,0 +1,455 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "common/validation.hpp" + +#include "csi/metrics.hpp" +#include "csi/paths.hpp" +#include "csi/service_manager.hpp" +#include "csi/volume_manager.hpp" + +#include "slave/csi_server.hpp" +#include "slave/flags.hpp" +#include "slave/paths.hpp" + +using mesos::csi::ServiceManager; +using mesos::csi::VolumeManager; + +using mesos::csi::state::VolumeState; + +using process::Failure; +using process::Future; +using process::Owned; + +using process::grpc::client::Runtime; + +using process::http::authentication::Principal; + +using std::list; +using std::string; +using std::vector; + +namespace mesos { +namespace internal { +namespace slave { + +static VolumeState createVolumeState( +const Volume::Source::CSIVolume::StaticProvisioning& volume); + + +static hashset extractServices( +const CSIPluginInfo& plugin); + + +class CSIServerProcess : public process::Process +{ +public: + CSIServerProcess( + const process::http::URL& _agentUrl, + const string& _rootDir, + SecretGenerator* _secretGenerator, + SecretResolver* _secretResolver, + hashmap _pluginConfigs) +: process::ProcessBase(process::ID::generate("csi-server")), + agentUrl(_agentUrl), + rootDir(_rootDir), + secretGenerator(_secretGenerator), + secretResolver(_secretResolver), + pluginConfigs(_pluginConfigs) {} + + Future start(); + + Future publishVolume(const Volume::Source::CSIVolume& 
volume); + + Future unpublishVolume( + const string& pluginName, + const string& volumeId); + +private: + struct CSIPlugin + { +CSIPlugin(const string& metricsPrefix) : metrics(metricsPrefix) {} + +CSIPluginInfo info; +Owned serviceManager; +Owned volumeManager; +Runtime runtime; +csi::Metrics metrics; + }; + + // Contains the plugins loaded by the server. The key of this map is the + // plugin nam
[mesos] 02/02: Added implementation of the CSI server.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit fe0cd02a0697a4c4fcf5087fcafd6729beec0b41 Author: Greg Mann AuthorDate: Mon Aug 10 20:11:50 2020 -0700 Added implementation of the CSI server. Review: https://reviews.apache.org/r/72716/ --- src/CMakeLists.txt | 1 + src/Makefile.am | 2 + src/slave/csi_server.cpp | 455 +++ src/slave/csi_server.hpp | 8 +- 4 files changed, 465 insertions(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4e15e3d..c60d98a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -150,6 +150,7 @@ set(AGENT_SRC slave/constants.cpp slave/container_daemon.cpp slave/container_logger.cpp + slave/csi_server.cpp slave/flags.cpp slave/gc.cpp slave/http.cpp diff --git a/src/Makefile.am b/src/Makefile.am index 447db32..49dab4b 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1208,6 +1208,8 @@ libmesos_no_3rdparty_la_SOURCES += \ slave/container_daemon.hpp \ slave/container_daemon_process.hpp \ slave/container_logger.cpp \ + slave/csi_server.cpp \ + slave/csi_server.hpp \ slave/container_loggers/sandbox.cpp \ slave/container_loggers/sandbox.hpp \ slave/containerizer/composing.cpp\ diff --git a/src/slave/csi_server.cpp b/src/slave/csi_server.cpp new file mode 100644 index 000..a9a3995 --- /dev/null +++ b/src/slave/csi_server.cpp @@ -0,0 +1,455 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "common/validation.hpp" + +#include "csi/metrics.hpp" +#include "csi/paths.hpp" +#include "csi/service_manager.hpp" +#include "csi/volume_manager.hpp" + +#include "slave/csi_server.hpp" +#include "slave/flags.hpp" +#include "slave/paths.hpp" + +using mesos::csi::ServiceManager; +using mesos::csi::VolumeManager; + +using mesos::csi::state::VolumeState; + +using process::Failure; +using process::Future; +using process::Owned; + +using process::grpc::client::Runtime; + +using process::http::authentication::Principal; + +using std::list; +using std::string; +using std::vector; + +namespace mesos { +namespace internal { +namespace slave { + +static VolumeState createVolumeState( +const Volume::Source::CSIVolume::StaticProvisioning& volume); + + +static hashset extractServices( +const CSIPluginInfo& plugin); + + +class CSIServerProcess : public process::Process +{ +public: + CSIServerProcess( + const process::http::URL& _agentUrl, + const string& _rootDir, + SecretGenerator* _secretGenerator, + SecretResolver* _secretResolver, + hashmap _pluginConfigs) +: process::ProcessBase(process::ID::generate("csi-server")), + agentUrl(_agentUrl), + rootDir(_rootDir), + secretGenerator(_secretGenerator), + secretResolver(_secretResolver), + pluginConfigs(_pluginConfigs) {} + + Future start(); + + Future publishVolume(const Volume::Source::CSIVolume& 
volume); + + Future unpublishVolume( + const string& pluginName, + const string& volumeId); + +private: + struct CSIPlugin + { +CSIPlugin(const string& metricsPrefix) : metrics(metricsPrefix) {} + +CSIPluginInfo info; +Owned serviceManager; +Owned volumeManager; +Runtime runtime; +csi::Metrics metrics; + }; + + // Contains the plugins loaded by the server. The key of this map is the + // plugin nam
[mesos] 01/02: Added support for secrets to the CSI volume managers.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 38ba19127ddb48244f7c6c699e3c41e5ea12b594 Author: Greg Mann AuthorDate: Mon Aug 10 20:26:26 2020 -0700 Added support for secrets to the CSI volume managers. Review: https://reviews.apache.org/r/72732/ --- src/csi/state.proto | 6 ++ src/csi/v0_volume_manager.cpp | 103 +++--- src/csi/v0_volume_manager.hpp | 5 +- src/csi/v0_volume_manager_process.hpp | 13 - src/csi/v1_volume_manager.cpp | 96 +-- src/csi/v1_volume_manager.hpp | 5 +- src/csi/v1_volume_manager_process.hpp | 13 - src/csi/volume_manager.cpp| 21 ++- src/csi/volume_manager.hpp| 5 +- 9 files changed, 246 insertions(+), 21 deletions(-) diff --git a/src/csi/state.proto b/src/csi/state.proto index 836e30c..630e4f5 100644 --- a/src/csi/state.proto +++ b/src/csi/state.proto @@ -78,4 +78,10 @@ message VolumeState { // Indicates that the volume must be mounted read-only. bool readonly = 9; + + // Secrets to be included in `NodeStageVolumeRequest`. + map node_stage_secrets = 10; + + // Secrets to be included in `NodePublishVolumeRequest`. + map node_publish_secrets = 11; } diff --git a/src/csi/v0_volume_manager.cpp b/src/csi/v0_volume_manager.cpp index 89a6da5..9e840a7 100644 --- a/src/csi/v0_volume_manager.cpp +++ b/src/csi/v0_volume_manager.cpp @@ -21,6 +21,8 @@ #include #include +#include + #include #include #include @@ -81,14 +83,16 @@ VolumeManagerProcess::VolumeManagerProcess( const hashset _services, const Runtime& _runtime, ServiceManager* _serviceManager, -Metrics* _metrics) +Metrics* _metrics, +SecretResolver* _secretResolver) : ProcessBase(process::ID::generate("csi-v0-volume-manager")), rootDir(_rootDir), info(_info), services(_services), runtime(_runtime), serviceManager(_serviceManager), -metrics(_metrics) +metrics(_metrics), +secretResolver(_secretResolver) { // This should have been validated in `VolumeManager::create`. 
CHECK(!services.empty()) @@ -961,8 +965,33 @@ Future VolumeManagerProcess::_publishVolume(const string& volumeId) request.set_staging_target_path(stagingPath); } - return call(NODE_SERVICE, ::nodePublishVolume, std::move(request)) -.then(defer(self(), [this, volumeId, targetPath] { + Future rpcResult; + + if (!volumeState.node_publish_secrets().empty()) { +rpcResult = resolveSecrets(volumeState.node_publish_secrets()) + .then(process::defer( + self(), + [this, request](const Map& secrets) { +NodePublishVolumeRequest request_(request); +*request_.mutable_node_publish_secrets() = secrets; + +return call( +NODE_SERVICE, +::nodePublishVolume, +std::move(request_)); + })); + } else { +rpcResult = + call(NODE_SERVICE, ::nodePublishVolume, std::move(request)); + } + + return rpcResult +.then(process::defer(self(), [this, volumeId, targetPath]() +-> Future { + if (!os::exists(targetPath)) { +return Failure("Target path '" + targetPath + "' not created"); + } + CHECK(volumes.contains(volumeId)); VolumeState& volumeState = volumes.at(volumeId).state; @@ -1042,7 +1071,25 @@ Future VolumeManagerProcess::__publishVolume(const string& volumeId) evolve(volumeState.volume_capability()); *request.mutable_volume_attributes() = volumeState.volume_context(); - return call(NODE_SERVICE, ::nodeStageVolume, std::move(request)) + Future rpcResult; + + if (!volumeState.node_stage_secrets().empty()) { +rpcResult = resolveSecrets(volumeState.node_stage_secrets()) + .then([=](const Map& secrets) { +NodeStageVolumeRequest request_(request); +*request_.mutable_node_stage_secrets() = secrets; + +return call( +NODE_SERVICE, +::nodeStageVolume, +std::move(request_)); + }); + } else { +rpcResult = + call(NODE_SERVICE, ::nodeStageVolume, std::move(request)); + } + + return rpcResult .then(process::defer(self(), [this, volumeId] { CHECK(volumes.contains(volumeId)); VolumeState& volumeState = volumes.at(volumeId).state; @@ -1236,20 +1283,62 @@ void VolumeManagerProcess::removeVolume(const string& 
volumeId) } +Future> VolumeManagerProcess::resolveSecrets( +const Map& secrets) +{ + if (!secretResolver) { +return Failure( +"CSI volume included secrets but the agent was not initialized with " +"a secret resolver"); + } + + // This `futures` is used below with `process::colle
[mesos] 01/02: Added support for secrets to the CSI volume managers.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 38ba19127ddb48244f7c6c699e3c41e5ea12b594 Author: Greg Mann AuthorDate: Mon Aug 10 20:26:26 2020 -0700 Added support for secrets to the CSI volume managers. Review: https://reviews.apache.org/r/72732/ --- src/csi/state.proto | 6 ++ src/csi/v0_volume_manager.cpp | 103 +++--- src/csi/v0_volume_manager.hpp | 5 +- src/csi/v0_volume_manager_process.hpp | 13 - src/csi/v1_volume_manager.cpp | 96 +-- src/csi/v1_volume_manager.hpp | 5 +- src/csi/v1_volume_manager_process.hpp | 13 - src/csi/volume_manager.cpp| 21 ++- src/csi/volume_manager.hpp| 5 +- 9 files changed, 246 insertions(+), 21 deletions(-) diff --git a/src/csi/state.proto b/src/csi/state.proto index 836e30c..630e4f5 100644 --- a/src/csi/state.proto +++ b/src/csi/state.proto @@ -78,4 +78,10 @@ message VolumeState { // Indicates that the volume must be mounted read-only. bool readonly = 9; + + // Secrets to be included in `NodeStageVolumeRequest`. + map node_stage_secrets = 10; + + // Secrets to be included in `NodePublishVolumeRequest`. + map node_publish_secrets = 11; } diff --git a/src/csi/v0_volume_manager.cpp b/src/csi/v0_volume_manager.cpp index 89a6da5..9e840a7 100644 --- a/src/csi/v0_volume_manager.cpp +++ b/src/csi/v0_volume_manager.cpp @@ -21,6 +21,8 @@ #include #include +#include + #include #include #include @@ -81,14 +83,16 @@ VolumeManagerProcess::VolumeManagerProcess( const hashset _services, const Runtime& _runtime, ServiceManager* _serviceManager, -Metrics* _metrics) +Metrics* _metrics, +SecretResolver* _secretResolver) : ProcessBase(process::ID::generate("csi-v0-volume-manager")), rootDir(_rootDir), info(_info), services(_services), runtime(_runtime), serviceManager(_serviceManager), -metrics(_metrics) +metrics(_metrics), +secretResolver(_secretResolver) { // This should have been validated in `VolumeManager::create`. 
CHECK(!services.empty()) @@ -961,8 +965,33 @@ Future VolumeManagerProcess::_publishVolume(const string& volumeId) request.set_staging_target_path(stagingPath); } - return call(NODE_SERVICE, ::nodePublishVolume, std::move(request)) -.then(defer(self(), [this, volumeId, targetPath] { + Future rpcResult; + + if (!volumeState.node_publish_secrets().empty()) { +rpcResult = resolveSecrets(volumeState.node_publish_secrets()) + .then(process::defer( + self(), + [this, request](const Map& secrets) { +NodePublishVolumeRequest request_(request); +*request_.mutable_node_publish_secrets() = secrets; + +return call( +NODE_SERVICE, +::nodePublishVolume, +std::move(request_)); + })); + } else { +rpcResult = + call(NODE_SERVICE, ::nodePublishVolume, std::move(request)); + } + + return rpcResult +.then(process::defer(self(), [this, volumeId, targetPath]() +-> Future { + if (!os::exists(targetPath)) { +return Failure("Target path '" + targetPath + "' not created"); + } + CHECK(volumes.contains(volumeId)); VolumeState& volumeState = volumes.at(volumeId).state; @@ -1042,7 +1071,25 @@ Future VolumeManagerProcess::__publishVolume(const string& volumeId) evolve(volumeState.volume_capability()); *request.mutable_volume_attributes() = volumeState.volume_context(); - return call(NODE_SERVICE, ::nodeStageVolume, std::move(request)) + Future rpcResult; + + if (!volumeState.node_stage_secrets().empty()) { +rpcResult = resolveSecrets(volumeState.node_stage_secrets()) + .then([=](const Map& secrets) { +NodeStageVolumeRequest request_(request); +*request_.mutable_node_stage_secrets() = secrets; + +return call( +NODE_SERVICE, +::nodeStageVolume, +std::move(request_)); + }); + } else { +rpcResult = + call(NODE_SERVICE, ::nodeStageVolume, std::move(request)); + } + + return rpcResult .then(process::defer(self(), [this, volumeId] { CHECK(volumes.contains(volumeId)); VolumeState& volumeState = volumes.at(volumeId).state; @@ -1236,20 +1283,62 @@ void VolumeManagerProcess::removeVolume(const string& 
volumeId) } +Future> VolumeManagerProcess::resolveSecrets( +const Map& secrets) +{ + if (!secretResolver) { +return Failure( +"CSI volume included secrets but the agent was not initialized with " +"a secret resolver"); + } + + // This `futures` is used below with `process::colle
[mesos] branch master updated (c78dc33 -> fe0cd02)
This is an automated email from the ASF dual-hosted git repository. grag pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git. from c78dc33 Added interface for the CSI server. new 38ba191 Added support for secrets to the CSI volume managers. new fe0cd02 Added implementation of the CSI server. The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: src/CMakeLists.txt| 1 + src/Makefile.am | 2 + src/csi/state.proto | 6 + src/csi/v0_volume_manager.cpp | 103 +++- src/csi/v0_volume_manager.hpp | 5 +- src/csi/v0_volume_manager_process.hpp | 13 +- src/csi/v1_volume_manager.cpp | 96 ++- src/csi/v1_volume_manager.hpp | 5 +- src/csi/v1_volume_manager_process.hpp | 13 +- src/csi/volume_manager.cpp| 21 +- src/csi/volume_manager.hpp| 5 +- src/slave/csi_server.cpp | 455 ++ src/slave/csi_server.hpp | 8 +- 13 files changed, 711 insertions(+), 22 deletions(-) create mode 100644 src/slave/csi_server.cpp
[mesos] branch master updated (d2c84d1 -> c78dc33)
This is an automated email from the ASF dual-hosted git repository. grag pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git. from d2c84d1 Improved CSI service manager to support unmanaged CSI plugins. new 8d54518 Enabled pre-provisioned volumes in the volume manager. new c63797c Set the readonly field in the CSI volume manager. new c78dc33 Added interface for the CSI server. The 3 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: src/csi/state.proto | 8 +++ src/csi/v0_volume_manager.cpp | 99 +++ src/csi/v0_volume_manager.hpp | 5 +- src/csi/v0_volume_manager_process.hpp | 8 ++- src/csi/v1_volume_manager.cpp | 99 +++ src/csi/v1_volume_manager.hpp | 5 +- src/csi/v1_volume_manager_process.hpp | 8 ++- src/csi/volume_manager.hpp| 18 +-- src/slave/csi_server.hpp | 90 +++ 9 files changed, 287 insertions(+), 53 deletions(-) create mode 100644 src/slave/csi_server.hpp
[mesos] 03/03: Added interface for the CSI server.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit c78dc333fc893a43d40dc33299a61987198a6ea9 Author: Greg Mann AuthorDate: Mon Aug 3 10:11:57 2020 -0700 Added interface for the CSI server. This component will hold objects associated with CSI plugins running on the agent. Review: https://reviews.apache.org/r/72707/ --- src/slave/csi_server.hpp | 90 1 file changed, 90 insertions(+) diff --git a/src/slave/csi_server.hpp b/src/slave/csi_server.hpp new file mode 100644 index 000..17882e1 --- /dev/null +++ b/src/slave/csi_server.hpp @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef __SLAVE_CSI_SERVER_HPP__ +#define __SLAVE_CSI_SERVER_HPP__ + +#include + +#include + +#include + +#include +#include +#include + +#include +#include + +#include "csi/service_manager.hpp" +#include "csi/volume_manager.hpp" + +#include "slave/flags.hpp" + +namespace mesos { +namespace internal { +namespace slave { + +class CSIServerProcess; + +// A CSI server is a collection of volume managers and associated service +// managers. 
This object can be instantiated and held by the Mesos agent to +// manage a collection of CSI plugins and proxy calls to them. +class CSIServer +{ +public: + ~CSIServer(); + + static Try> create( + const Flags& flags, + const process::http::URL& agentUrl, + SecretGenerator* secretGenerator); + + // Starts the CSI server. Any `publishVolume()` or `unpublishVolume()` calls + // which were made previously will be executed after this method is called. + // Returns a future which is satisfied once initialization is complete. + process::Future start(); + + // Publish a CSI volume to this agent. If the `start()` method has not yet + // been called, then the publishing of this volume will not be completed until + // the CSI server is started. + // Returns the target path at which the volume has been published. + process::Future publishVolume( + const Volume::Source::CSIVolume& volume); + + // Unpublishes a CSI volume from this agent. If the `start()` method has not + // yet been called, then the unpublishing of this volume will not be completed + // until the CSI server is started. + process::Future unpublishVolume( + const std::string& pluginName, + const std::string& volumeId); + +private: + CSIServer( + const process::http::URL& agentUrl, + const std::string& csiRootDir, + SecretGenerator* secretGenerator, + const hashmap& csiPluginConfigs); + + process::Owned process; +}; + +} // namespace slave { +} // namespace internal { +} // namespace mesos { + +#endif // __SLAVE_CSI_SERVER_HPP__
[mesos] 02/03: Set the readonly field in the CSI volume manager.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit c63797cd80a1d7d3f91d6a7eb1574e1e8ef063fb Author: Greg Mann AuthorDate: Mon Aug 3 10:11:54 2020 -0700 Set the readonly field in the CSI volume manager. This patch introduces a new `readonly` field in the CSI `VolumeState` message and passes it through when publishing volumes. This will allow us to set this field appropriately when publishing pre-provisioned volumes. Review: https://reviews.apache.org/r/72715/ --- src/csi/state.proto | 3 +++ src/csi/v0_volume_manager.cpp | 4 ++-- src/csi/v1_volume_manager.cpp | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/csi/state.proto b/src/csi/state.proto index af0ef1c..836e30c 100644 --- a/src/csi/state.proto +++ b/src/csi/state.proto @@ -75,4 +75,7 @@ message VolumeState { // pre-provisioned by some other means and then attached to the node using a // CSI plugin. bool pre_provisioned = 8; + + // Indicates that the volume must be mounted read-only. 
+ bool readonly = 9; } diff --git a/src/csi/v0_volume_manager.cpp b/src/csi/v0_volume_manager.cpp index 5368440..89a6da5 100644 --- a/src/csi/v0_volume_manager.cpp +++ b/src/csi/v0_volume_manager.cpp @@ -822,7 +822,7 @@ Future VolumeManagerProcess::_attachVolume(const string& volumeId) request.set_node_id(CHECK_NOTNONE(nodeId)); *request.mutable_volume_capability() = evolve(volumeState.volume_capability()); - request.set_readonly(false); + request.set_readonly(volumeState.readonly()); *request.mutable_volume_attributes() = volumeState.volume_context(); return call( @@ -950,7 +950,7 @@ Future VolumeManagerProcess::_publishVolume(const string& volumeId) request.set_target_path(targetPath); *request.mutable_volume_capability() = evolve(volumeState.volume_capability()); - request.set_readonly(false); + request.set_readonly(volumeState.readonly()); *request.mutable_volume_attributes() = volumeState.volume_context(); if (nodeCapabilities->stageUnstageVolume) { diff --git a/src/csi/v1_volume_manager.cpp b/src/csi/v1_volume_manager.cpp index 7eae638..5178b2f 100644 --- a/src/csi/v1_volume_manager.cpp +++ b/src/csi/v1_volume_manager.cpp @@ -844,7 +844,7 @@ Future VolumeManagerProcess::_attachVolume(const string& volumeId) request.set_node_id(CHECK_NOTNONE(nodeId)); *request.mutable_volume_capability() = evolve(volumeState.volume_capability()); - request.set_readonly(false); + request.set_readonly(volumeState.readonly()); *request.mutable_volume_context() = volumeState.volume_context(); return call( @@ -976,7 +976,7 @@ Future VolumeManagerProcess::_publishVolume(const string& volumeId) request.set_target_path(targetPath); *request.mutable_volume_capability() = evolve(volumeState.volume_capability()); - request.set_readonly(false); + request.set_readonly(volumeState.readonly()); *request.mutable_volume_context() = volumeState.volume_context(); if (nodeCapabilities->stageUnstageVolume) {
[mesos] 01/03: Enabled pre-provisioned volumes in the volume manager.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 8d545180294ae705f7b8a2fe1578653107ede540 Author: Greg Mann AuthorDate: Mon Aug 3 10:11:50 2020 -0700 Enabled pre-provisioned volumes in the volume manager. This patch makes it possible to publish CSI volumes on an agent which were pre-provisioned out of band. Review: https://reviews.apache.org/r/72681/ --- src/csi/state.proto | 5 ++ src/csi/v0_volume_manager.cpp | 95 +++ src/csi/v0_volume_manager.hpp | 5 +- src/csi/v0_volume_manager_process.hpp | 8 ++- src/csi/v1_volume_manager.cpp | 95 +++ src/csi/v1_volume_manager.hpp | 5 +- src/csi/v1_volume_manager_process.hpp | 8 ++- src/csi/volume_manager.hpp| 18 +-- 8 files changed, 190 insertions(+), 49 deletions(-) diff --git a/src/csi/state.proto b/src/csi/state.proto index 28ad5ef..af0ef1c 100644 --- a/src/csi/state.proto +++ b/src/csi/state.proto @@ -70,4 +70,9 @@ message VolumeState { // hence needs cleanup. If set, the resource provider MUST transition the // volume to `PUBLISHED` state during recovery. bool node_publish_required = 7; + + // Indicates that the volume was not created by a CSI plugin, but rather was + // pre-provisioned by some other means and then attached to the node using a + // CSI plugin. 
+ bool pre_provisioned = 8; } diff --git a/src/csi/v0_volume_manager.cpp b/src/csi/v0_volume_manager.cpp index b383598..5368440 100644 --- a/src/csi/v0_volume_manager.cpp +++ b/src/csi/v0_volume_manager.cpp @@ -452,8 +452,29 @@ Future VolumeManagerProcess::detachVolume(const string& volumeId) } -Future VolumeManagerProcess::publishVolume(const string& volumeId) +Future VolumeManagerProcess::publishVolume( +const string& volumeId, +const Option& volumeState) { + if (volumeState.isSome()) { +if (!volumeState->pre_provisioned()) { + return Failure( + "Cannot specify volume state when publishing a volume unless that" + " volume is pre-provisioned"); +} + +if (volumeState->state() != VolumeState::VOL_READY && +volumeState->state() != VolumeState::NODE_READY) { + return Failure( + "Cannot specify volume state when publishing a volume unless that" + " volume is in either the VOL_READY or NODE_READY state"); +} + +// This must be an untracked volume. Track it now before we continue. +volumes.put(volumeId, VolumeState(volumeState.get())); +checkpointVolumeState(volumeId); + } + if (!volumes.contains(volumeId)) { return Failure("Cannot publish unknown volume '" + volumeId + "'"); } @@ -728,16 +749,7 @@ Future VolumeManagerProcess::_deleteVolume(const std::string& volumeId) // the future returned by the sequence ready as well. 
return __deleteVolume(volumeId) .then(process::defer(self(), [this, volumeId](bool deleted) { - volumes.erase(volumeId); - - const string volumePath = -paths::getVolumePath(rootDir, info.type(), info.name(), volumeId); - - Try rmdir = os::rmdir(volumePath); - CHECK_SOME(rmdir) << "Failed to remove checkpointed volume state at '" -<< volumePath << "': " << rmdir.error(); - - garbageCollectMountPath(volumeId); + removeVolume(volumeId); return deleted; })); @@ -1051,6 +1063,13 @@ Future VolumeManagerProcess::_unpublishVolume(const string& volumeId) if (volumeState.state() == VolumeState::NODE_READY) { CHECK(volumeState.boot_id().empty()); + +if (volumeState.pre_provisioned()) { + // Since this volume was pre-provisioned, it has reached the end of its + // lifecycle. Remove it now. + removeVolume(volumeId); +} + return Nothing(); } @@ -1063,9 +1082,16 @@ Future VolumeManagerProcess::_unpublishVolume(const string& volumeId) } if (!nodeCapabilities->stageUnstageVolume) { -// Since this is a no-op, no need to checkpoint here. -volumeState.set_state(VolumeState::NODE_READY); -volumeState.clear_boot_id(); +if (volumeState.pre_provisioned()) { + // Since this volume was pre-provisioned, it has reached the end of its + // lifecycle. Remove it now. + removeVolume(volumeId); +} else { + // Since this is a no-op, no need to checkpoint here. + volumeState.set_state(VolumeState::NODE_READY); + volumeState.clear_boot_id(); +} + return Nothing(); } @@ -1091,13 +1117,20 @@ Future VolumeManagerProcess::_unpublishVolume(const string& volumeId) request.set_staging_target_path(stagingPath); return call(NODE_SERVICE, ::nodeUnstageVol
[mesos] branch master updated: Fixed an example in the documentation.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new 8e74d18 Fixed an example in the documentation. 8e74d18 is described below commit 8e74d18c1a34fe3d5a9f553552c6e9b66411a575 Author: Greg Mann AuthorDate: Thu Jul 16 09:10:30 2020 -0700 Fixed an example in the documentation. --- docs/operator-http-api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/operator-http-api.md b/docs/operator-http-api.md index 585e94c..8b0fbdc 100644 --- a/docs/operator-http-api.md +++ b/docs/operator-http-api.md @@ -2675,7 +2675,7 @@ Accept: application/json "agent_id": { "value": "3192b9d1-db71-4699-ae25-e28dfbf42de1" }, -"max_grace_period": "10mins", +"max_grace_period": {"seconds": 600}, "mark_gone": false } }
[mesos] annotated tag 1.7.3 created (now 20756d0)
This is an automated email from the ASF dual-hosted git repository. grag pushed a change to annotated tag 1.7.3 in repository https://gitbox.apache.org/repos/asf/mesos.git. at 20756d0 (tag) tagging 684c41947fe47ae9fc3020598e26368fb8863eea (tag) length 159 bytes by Greg Mann on Tue May 19 12:29:49 2020 -0700 - Log - Tagging Mesos 1.7.3 --- No new revisions were added by this update.
svn commit: r39700 - in /release/mesos/1.7.3: ./ mesos-1.7.3.tar.gz mesos-1.7.3.tar.gz.asc mesos-1.7.3.tar.gz.sha512
Author: grag Date: Tue May 19 19:29:49 2020 New Revision: 39700 Log: Adding mesos-1.7.3. Added: release/mesos/1.7.3/ release/mesos/1.7.3/mesos-1.7.3.tar.gz (with props) release/mesos/1.7.3/mesos-1.7.3.tar.gz.asc release/mesos/1.7.3/mesos-1.7.3.tar.gz.sha512 Added: release/mesos/1.7.3/mesos-1.7.3.tar.gz == Binary file - no diff available. Propchange: release/mesos/1.7.3/mesos-1.7.3.tar.gz -- svn:mime-type = application/octet-stream Added: release/mesos/1.7.3/mesos-1.7.3.tar.gz.asc == --- release/mesos/1.7.3/mesos-1.7.3.tar.gz.asc (added) +++ release/mesos/1.7.3/mesos-1.7.3.tar.gz.asc Tue May 19 19:29:49 2020 @@ -0,0 +1,17 @@ +-BEGIN PGP SIGNATURE- +Version: GnuPG v2.0.22 (GNU/Linux) + +iQIcBAABAgAGBQJesFMkAAoJEEPsLr7bUMMbMQQQAIDWLNTTHHrSiwgvkOYRa+Hy +U8J5cHUJiFVy+s/DHApFjwZoU299uqtxk+zn6shesMT7sEC3razKYDfvsa7+F6dK +mDQUcRw7DKVXxPt40QhM/eZjX2UnaamC7vbpXt8R2I6JUDN4mEkQi4v+tXeKBueh +d0owEhpLtMVtvruBcQKUXZQU4v9PTe+vSUitWIGdc3JmtqK+Ocw1okCwRLBGbYs3 +lpOUdKAt0cGrHk3uBwurVFY/draVDdAE7gIOdmwCKXzwZ1f58bSEa/YFbmFrsIEx +vI2nQGG5om4Gt3RaecwbONZUJoObZZtWdPY6ebQLaMp0PlI35lWTM5s80zRr38nB +r3NcSIfEscUEMnqzQ/fBGlMip17M+iE5J2JiMmre4jcIbiDk2n+14QjR2D2ehPdN +JdVJZ07Z8PtWV/kRI+9UK6rfvE+FrdnmiCcZaFWo3lEy/L0FArFSnBgPV50/4i/Z +UnPl3klDaSmXlXTk9d7arMXxEBGXmEuYWzne2dqEOaB3VCURTBhFg3t0n7XrHW7e +PXHqOx4dxgTtrfssRcLBlWhhQ6aDZ6sIVUou9YAiOv1zKjuiekoBT5Mhj4q/LTAR +zVcuN/zVT4wv+F8URU48WAsymGrsmzWEORvowcvBsKEADpmi050vd4c4+e1dQQ8i +E/dXz1wYgN8B6Be2Qz8S +=+QoT +-END PGP SIGNATURE- Added: release/mesos/1.7.3/mesos-1.7.3.tar.gz.sha512 == --- release/mesos/1.7.3/mesos-1.7.3.tar.gz.sha512 (added) +++ release/mesos/1.7.3/mesos-1.7.3.tar.gz.sha512 Tue May 19 19:29:49 2020 @@ -0,0 +1 @@ +6ecea4edcd49e364549f5e8d5728644964cf87fc6a6d0431693efeb94b5c970ad663a0e8279694f4e51408c0ea91aebd4ae08ba5b880b460e4708309d9503bd9 mesos-1.7.3.tar.gz
[mesos] 01/02: Fixed the java bindings in the cmake build.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 630dbf7486bfe315472fbcd8d0087f75cd9f3786 Author: Andriy Kornatskyy AuthorDate: Fri May 8 14:10:39 2020 -0700 Fixed the java bindings in the cmake build. This closes #360 --- src/CMakeLists.txt | 3 +-- src/java/CMakeLists.txt | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 96cd867..810acbf 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -627,8 +627,7 @@ target_link_libraries( sasl2 zookeeper mesos-protobufs - $<$,$>:nvml> - $<$:mesos-java>) + $<$,$>:nvml>) if (NOT WIN32) target_link_libraries(mesos PUBLIC leveldb) diff --git a/src/java/CMakeLists.txt b/src/java/CMakeLists.txt index 29422e9..81eb9b5 100644 --- a/src/java/CMakeLists.txt +++ b/src/java/CMakeLists.txt @@ -116,7 +116,8 @@ add_custom_command( add_library(mesos-java ${JAVA_SRC} ${JAVA_H}) target_link_libraries( - mesos-java + mesos-java PUBLIC + mesos mesos-protobufs process zookeeper
[mesos] branch master updated (48922e0 -> 07cd355)
This is an automated email from the ASF dual-hosted git repository. grag pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git. from 48922e0 Updated CHANGELOG for 1.10.0. new 630dbf7 Fixed the java bindings in the cmake build. new 07cd355 Added ability to specify a root dir for boost and curl with cmake. The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: 3rdparty/CMakeLists.txt | 61 +--- cmake/CompilationConfigure.cmake | 14 + src/CMakeLists.txt | 3 +- src/java/CMakeLists.txt | 3 +- 4 files changed, 56 insertions(+), 25 deletions(-)
[mesos] 02/02: Added ability to specify a root dir for boost and curl with cmake.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 07cd355f90892a897e0b210a8cd0eda6103ae6c9 Author: Andriy Kornatskyy AuthorDate: Fri May 8 14:11:10 2020 -0700 Added ability to specify a root dir for boost and curl with cmake. This closes #361 --- 3rdparty/CMakeLists.txt | 61 +--- cmake/CompilationConfigure.cmake | 14 + 2 files changed, 53 insertions(+), 22 deletions(-) diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index 119813e..7b84f12 100644 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -194,29 +194,34 @@ endfunction() # Boost: C++ Libraries. # http://www.boost.org ### -EXTERNAL(boost ${BOOST_VERSION} ${CMAKE_CURRENT_BINARY_DIR}) -add_library(boost INTERFACE) -add_dependencies(boost ${BOOST_TARGET}) -if (CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES Clang) - # Headers including Boost 1.65.0 fail to compile with GCC 7.2 and - # CLang 3.6 without `-Wno-unused-local-typedefs`. - # TODO(andschwa): Remove this when Boost has a resolution. - target_compile_options(boost INTERFACE -Wno-unused-local-typedefs) -endif () -target_include_directories(boost INTERFACE ${BOOST_ROOT}) +if ("${BOOST_ROOT_DIR}" STREQUAL "") + EXTERNAL(boost ${BOOST_VERSION} ${CMAKE_CURRENT_BINARY_DIR}) + add_library(boost INTERFACE) + add_dependencies(boost ${BOOST_TARGET}) + if (CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES Clang) +# Headers including Boost 1.65.0 fail to compile with GCC 7.2 and +# CLang 3.6 without `-Wno-unused-local-typedefs`. +# TODO(andschwa): Remove this when Boost has a resolution. +target_compile_options(boost INTERFACE -Wno-unused-local-typedefs) + endif () + target_include_directories(boost INTERFACE ${BOOST_ROOT}) -# Patch Boost to avoid repeated "Unknown compiler warnings" on Windows. 
-PATCH_CMD(BOOST_PATCH_CMD boost-${BOOST_VERSION}.patch) + # Patch Boost to avoid repeated "Unknown compiler warnings" on Windows. + PATCH_CMD(BOOST_PATCH_CMD boost-${BOOST_VERSION}.patch) -ExternalProject_Add( - ${BOOST_TARGET} - PREFIX${BOOST_CMAKE_ROOT} - PATCH_COMMAND ${BOOST_PATCH_CMD} - CONFIGURE_COMMAND ${CMAKE_NOOP} - BUILD_COMMAND ${CMAKE_NOOP} - INSTALL_COMMAND ${CMAKE_NOOP} - URL ${BOOST_URL} - URL_HASH ${BOOST_HASH}) + ExternalProject_Add( +${BOOST_TARGET} +PREFIX${BOOST_CMAKE_ROOT} +PATCH_COMMAND ${BOOST_PATCH_CMD} +CONFIGURE_COMMAND ${CMAKE_NOOP} +BUILD_COMMAND ${CMAKE_NOOP} +INSTALL_COMMAND ${CMAKE_NOOP} +URL ${BOOST_URL} +URL_HASH ${BOOST_HASH}) +else () + add_library(boost INTERFACE) + target_include_directories(boost INTERFACE ${BOOST_ROOT_DIR}/include) +endif () # moodycamel::ConcurrentQueue: An industrial-strength lock-free queue. @@ -861,7 +866,19 @@ if (WIN32) COMMAND ${CMAKE_COMMAND} -E copy $ ${CMAKE_BINARY_DIR}/src/curl.exe DEPENDEES build) else () - find_package(CURL REQUIRED) + if ("${CURL_ROOT_DIR}" STREQUAL "") +find_package(CURL REQUIRED) + else () +set(POSSIBLE_CURL_INCLUDE_DIRS ${CURL_ROOT_DIR}/include) +set(POSSIBLE_CURL_LIB_DIRS ${CURL_ROOT_DIR}/lib) + +set(CURL_LIBRARY_NAMES curl) + +FIND_PACKAGE_HELPER(CURL curl/curl.h) +SET(CURL_INCLUDE_DIRS ${CURL_INCLUDE_DIR}) +SET(CURL_LIBRARIES ${CURL_LIBS}) + endif () + add_library(libcurl SHARED IMPORTED) set_target_properties( diff --git a/cmake/CompilationConfigure.cmake b/cmake/CompilationConfigure.cmake index f9511fc..af1a8b5 100644 --- a/cmake/CompilationConfigure.cmake +++ b/cmake/CompilationConfigure.cmake @@ -103,6 +103,20 @@ if (ENABLE_LIBEVENT) "Specify the path to libevent, e.g. \"C:\\libevent-Win64\".") endif() +set( + BOOST_ROOT_DIR + "" + CACHE STRING + "Specify the path to boost.") + +if (NOT WIN32) + set( +CURL_ROOT_DIR +"" +CACHE STRING +"Specify the path to libcurl.") +endif() + option( UNBUNDLED_LEVELDB "Build with an installed leveldb version instead of the bundled."
[mesos] branch 1.9.x updated: Added MESOS-10118 to the 1.9.1 CHANGELOG.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch 1.9.x in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/1.9.x by this push: new 30b7aae Added MESOS-10118 to the 1.9.1 CHANGELOG. 30b7aae is described below commit 30b7aae987c3534af87453e884e1a87841f3a72c Author: Greg Mann AuthorDate: Wed May 6 18:07:36 2020 -0700 Added MESOS-10118 to the 1.9.1 CHANGELOG. --- CHANGELOG | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG b/CHANGELOG index d07f6ce..59ccef6 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -14,6 +14,7 @@ Release Notes - Mesos - Version 1.9.1 (WIP) * [MESOS-10041] - Libprocess SSL verification can leak memory. * [MESOS-10094] - Master's agent draining VLOG prints incorrect task counts. * [MESOS-10096] - Reactivating a draining agent leaves the agent in draining state. + * [MESOS-10118] - Agent incorrectly handles draining when empty. ** Improvement * [MESOS-9889] - Master CPU high due to unexpected foreachkey behaviour in Master::__reregisterSlave.
[mesos] branch master updated: Added MESOS-10118 to the 1.9.1 CHANGELOG.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new 8682b5d Added MESOS-10118 to the 1.9.1 CHANGELOG. 8682b5d is described below commit 8682b5ddf8b773beffe8bf0428c9350d6ae59412 Author: Greg Mann AuthorDate: Wed May 6 18:07:36 2020 -0700 Added MESOS-10118 to the 1.9.1 CHANGELOG. --- CHANGELOG | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG b/CHANGELOG index a115101..f43ab8d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -22,6 +22,7 @@ Release Notes - Mesos - Version 1.9.1 (WIP) * [MESOS-10041] - Libprocess SSL verification can leak memory. * [MESOS-10094] - Master's agent draining VLOG prints incorrect task counts. * [MESOS-10096] - Reactivating a draining agent leaves the agent in draining state. + * [MESOS-10118] - Agent incorrectly handles draining when empty. ** Improvement * [MESOS-9889] - Master CPU high due to unexpected foreachkey behaviour in Master::__reregisterSlave.
[mesos] branch 1.9.x updated: Fixed a bug in the agent's draining handler.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch 1.9.x in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/1.9.x by this push: new 35912a2 Fixed a bug in the agent's draining handler. 35912a2 is described below commit 35912a22081e88ba243d2b690667dff6a90c51d0 Author: Greg Mann AuthorDate: Wed May 6 16:35:19 2020 -0700 Fixed a bug in the agent's draining handler. Previously, when the agent had no tasks or operations and received a `DrainSlaveMessage`, it would checkpoint the `DrainConfig` to disk, implicitly placing it into a "draining" state indefinitely. This patch updates the agent's handler to avoid checkpointing anything to disk in this case. The `SlaveTest.DrainInfoInAPIOutputs` test is also removed and its functionality is moved into the test `SlaveTest.DrainAgentKillsRunningTask`. The running task in the latter test allows us to verify agent API outputs both before and after the task's terminal update is acknowledged. Review: https://reviews.apache.org/r/72368/ --- src/slave/slave.cpp | 12 +++ src/tests/slave_tests.cpp | 215 +- 2 files changed, 127 insertions(+), 100 deletions(-) diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp index 23d2ddd..7110ff4 100644 --- a/src/slave/slave.cpp +++ b/src/slave/slave.cpp @@ -998,6 +998,18 @@ void Slave::drain( const UPID& from, DrainSlaveMessage&& drainSlaveMessage) { + if (operations.empty() && frameworks.empty()) { +LOG(INFO) + << "Received DrainConfig " << drainSlaveMessage.config() + << (drainConfig.isSome() + ? 
"; previously stored DrainConfig " + stringify(*drainConfig) + : "") + << "; agent has no stored frameworks, tasks, or operations," + " so draining is already complete"; + +return; + } + hashmap> pendingTaskIds; foreachvalue (Framework* framework, frameworks) { foreachvalue (const auto& taskMap, framework->pendingTasks) { diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp index c147bfc..335a1c4 100644 --- a/src/tests/slave_tests.cpp +++ b/src/tests/slave_tests.cpp @@ -11928,97 +11928,8 @@ TEST_F( } -// When the agent receives a `DrainSlaveMessage` from the master, the agent's -// drain info should be visible in the agent's API output. -TEST_F(SlaveTest, DrainInfoInAPIOutputs) -{ - Clock::pause(); - - const int GRACE_PERIOD_NANOS = 100; - - Try> master = StartMaster(); - ASSERT_SOME(master); - - Future slaveRegisteredMessage = -FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _); - - StandaloneMasterDetector detector(master.get()->pid); - - slave::Flags slaveFlags = CreateSlaveFlags(); - - Try> slave = StartSlave(, slaveFlags); - ASSERT_SOME(slave); - - Clock::advance(slaveFlags.registration_backoff_factor); - - AWAIT_READY(slaveRegisteredMessage); - - // Simulate the master sending a `DrainSlaveMessage` to the agent. 
- DurationInfo maxGracePeriod; - maxGracePeriod.set_nanoseconds(GRACE_PERIOD_NANOS); - - DrainConfig drainConfig; - drainConfig.set_mark_gone(true); - drainConfig.mutable_max_grace_period()->CopyFrom(maxGracePeriod); - - DrainSlaveMessage drainSlaveMessage; - drainSlaveMessage.mutable_config()->CopyFrom(drainConfig); - - process::post(master.get()->pid, slave.get()->pid, drainSlaveMessage); - - Clock::settle(); - - { -v1::agent::Call call; -call.set_type(v1::agent::Call::GET_AGENT); - -const ContentType contentType = ContentType::PROTOBUF; - -process::http::Headers headers = createBasicAuthHeaders(DEFAULT_CREDENTIAL); -headers["Accept"] = stringify(contentType); - -Future httpResponse = - process::http::post( - slave.get()->pid, - "api/v1", - headers, - serialize(contentType, call), - stringify(contentType)); - -AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, httpResponse); - -Future responseMessage = - deserialize(contentType, httpResponse->body); - -AWAIT_READY(responseMessage); -ASSERT_TRUE(responseMessage->IsInitialized()); -ASSERT_EQ(v1::agent::Response::GET_AGENT, responseMessage->type()); -ASSERT_TRUE(responseMessage->get_agent().has_drain_config()); -EXPECT_EQ( -drainConfig, -devolve(responseMessage->get_agent().drain_config())); - } - - { -Future response = process::http::get( -slave.get()->pid, -"state", -None(), -createBasicAuthHeaders(DEFAULT_CREDENTIAL)); - -AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response); -AWAIT_EXPECT_RESPONSE_HEADER_EQ(APPLICATION_JSON, "Content-Type", response); - -Try state
[mesos] branch master updated (ff9f5cc -> 06cc8ac)
This is an automated email from the ASF dual-hosted git repository. grag pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git. from ff9f5cc Fixed a bug in the agent's draining handler. add 5e77c94 Made the scheduler resources a top-level item in the documentation. add 1949f52 Added intro docs on running workloads. add 06cc8ac Updated existing documentation for resource limits. No new revisions were added by this update. Summary of changes: docs/home.md| 16 +- docs/nested-container-and-task-group.md | 91 docs/running-workloads.md | 255 docs/scheduler-http-api.md | 93 ++-- docs/upgrades.md| 5 + 5 files changed, 385 insertions(+), 75 deletions(-) create mode 100644 docs/running-workloads.md
[mesos] branch master updated: Fixed a bug in the agent's draining handler.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new ff9f5cc Fixed a bug in the agent's draining handler. ff9f5cc is described below commit ff9f5cc796f6a99302d94de121726cd7b5988f11 Author: Greg Mann AuthorDate: Wed May 6 16:35:19 2020 -0700 Fixed a bug in the agent's draining handler. Previously, when the agent had no tasks or operations and received a `DrainSlaveMessage`, it would checkpoint the `DrainConfig` to disk, implicitly placing it into a "draining" state indefinitely. This patch updates the agent's handler to avoid checkpointing anything to disk in this case. The `SlaveTest.DrainInfoInAPIOutputs` test is also removed and its functionality is moved into the test `SlaveTest.DrainAgentKillsRunningTask`. The running task in the latter test allows us to verify agent API outputs both before and after the task's terminal update is acknowledged. Review: https://reviews.apache.org/r/72368/ --- src/slave/slave.cpp | 12 +++ src/tests/slave_tests.cpp | 215 +- 2 files changed, 127 insertions(+), 100 deletions(-) diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp index 1a32c81..c828d99 100644 --- a/src/slave/slave.cpp +++ b/src/slave/slave.cpp @@ -1074,6 +1074,18 @@ void Slave::drain( const UPID& from, DrainSlaveMessage&& drainSlaveMessage) { + if (operations.empty() && frameworks.empty()) { +LOG(INFO) + << "Received DrainConfig " << drainSlaveMessage.config() + << (drainConfig.isSome() + ? 
"; previously stored DrainConfig " + stringify(*drainConfig) + : "") + << "; agent has no stored frameworks, tasks, or operations," + " so draining is already complete"; + +return; + } + hashmap> pendingTaskIds; foreachvalue (Framework* framework, frameworks) { foreachvalue (const auto& taskMap, framework->pendingTasks) { diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp index 5ad04b2..b46e561 100644 --- a/src/tests/slave_tests.cpp +++ b/src/tests/slave_tests.cpp @@ -12089,97 +12089,8 @@ TEST_F( } -// When the agent receives a `DrainSlaveMessage` from the master, the agent's -// drain info should be visible in the agent's API output. -TEST_F(SlaveTest, DrainInfoInAPIOutputs) -{ - Clock::pause(); - - const int GRACE_PERIOD_NANOS = 100; - - Try> master = StartMaster(); - ASSERT_SOME(master); - - Future slaveRegisteredMessage = -FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _); - - StandaloneMasterDetector detector(master.get()->pid); - - slave::Flags slaveFlags = CreateSlaveFlags(); - - Try> slave = StartSlave(, slaveFlags); - ASSERT_SOME(slave); - - Clock::advance(slaveFlags.registration_backoff_factor); - - AWAIT_READY(slaveRegisteredMessage); - - // Simulate the master sending a `DrainSlaveMessage` to the agent. 
- DurationInfo maxGracePeriod; - maxGracePeriod.set_nanoseconds(GRACE_PERIOD_NANOS); - - DrainConfig drainConfig; - drainConfig.set_mark_gone(true); - drainConfig.mutable_max_grace_period()->CopyFrom(maxGracePeriod); - - DrainSlaveMessage drainSlaveMessage; - drainSlaveMessage.mutable_config()->CopyFrom(drainConfig); - - process::post(master.get()->pid, slave.get()->pid, drainSlaveMessage); - - Clock::settle(); - - { -v1::agent::Call call; -call.set_type(v1::agent::Call::GET_AGENT); - -const ContentType contentType = ContentType::PROTOBUF; - -process::http::Headers headers = createBasicAuthHeaders(DEFAULT_CREDENTIAL); -headers["Accept"] = stringify(contentType); - -Future httpResponse = - process::http::post( - slave.get()->pid, - "api/v1", - headers, - serialize(contentType, call), - stringify(contentType)); - -AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, httpResponse); - -Future responseMessage = - deserialize(contentType, httpResponse->body); - -AWAIT_READY(responseMessage); -ASSERT_TRUE(responseMessage->IsInitialized()); -ASSERT_EQ(v1::agent::Response::GET_AGENT, responseMessage->type()); -ASSERT_TRUE(responseMessage->get_agent().has_drain_config()); -EXPECT_EQ( -drainConfig, -devolve(responseMessage->get_agent().drain_config())); - } - - { -Future response = process::http::get( -slave.get()->pid, -"state", -None(), -createBasicAuthHeaders(DEFAULT_CREDENTIAL)); - -AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response); -AWAIT_EXPECT_RESPONSE_HEADER_EQ(APPLICATION_JSON, "Content-Type", response); - -Try state
svn commit: r39283 - in /dev/mesos/1.7.3-rc1: ./ mesos-1.7.3.tar.gz mesos-1.7.3.tar.gz.asc mesos-1.7.3.tar.gz.sha512
Author: grag Date: Mon May 4 17:39:21 2020 New Revision: 39283 Log: Adding mesos-1.7.3-rc1. Added: dev/mesos/1.7.3-rc1/ dev/mesos/1.7.3-rc1/mesos-1.7.3.tar.gz (with props) dev/mesos/1.7.3-rc1/mesos-1.7.3.tar.gz.asc dev/mesos/1.7.3-rc1/mesos-1.7.3.tar.gz.sha512 Added: dev/mesos/1.7.3-rc1/mesos-1.7.3.tar.gz == Binary file - no diff available. Propchange: dev/mesos/1.7.3-rc1/mesos-1.7.3.tar.gz -- svn:mime-type = application/octet-stream Added: dev/mesos/1.7.3-rc1/mesos-1.7.3.tar.gz.asc == --- dev/mesos/1.7.3-rc1/mesos-1.7.3.tar.gz.asc (added) +++ dev/mesos/1.7.3-rc1/mesos-1.7.3.tar.gz.asc Mon May 4 17:39:21 2020 @@ -0,0 +1,17 @@ +-BEGIN PGP SIGNATURE- +Version: GnuPG v2.0.22 (GNU/Linux) + +iQIcBAABAgAGBQJesFMkAAoJEEPsLr7bUMMbMQQQAIDWLNTTHHrSiwgvkOYRa+Hy +U8J5cHUJiFVy+s/DHApFjwZoU299uqtxk+zn6shesMT7sEC3razKYDfvsa7+F6dK +mDQUcRw7DKVXxPt40QhM/eZjX2UnaamC7vbpXt8R2I6JUDN4mEkQi4v+tXeKBueh +d0owEhpLtMVtvruBcQKUXZQU4v9PTe+vSUitWIGdc3JmtqK+Ocw1okCwRLBGbYs3 +lpOUdKAt0cGrHk3uBwurVFY/draVDdAE7gIOdmwCKXzwZ1f58bSEa/YFbmFrsIEx +vI2nQGG5om4Gt3RaecwbONZUJoObZZtWdPY6ebQLaMp0PlI35lWTM5s80zRr38nB +r3NcSIfEscUEMnqzQ/fBGlMip17M+iE5J2JiMmre4jcIbiDk2n+14QjR2D2ehPdN +JdVJZ07Z8PtWV/kRI+9UK6rfvE+FrdnmiCcZaFWo3lEy/L0FArFSnBgPV50/4i/Z +UnPl3klDaSmXlXTk9d7arMXxEBGXmEuYWzne2dqEOaB3VCURTBhFg3t0n7XrHW7e +PXHqOx4dxgTtrfssRcLBlWhhQ6aDZ6sIVUou9YAiOv1zKjuiekoBT5Mhj4q/LTAR +zVcuN/zVT4wv+F8URU48WAsymGrsmzWEORvowcvBsKEADpmi050vd4c4+e1dQQ8i +E/dXz1wYgN8B6Be2Qz8S +=+QoT +-END PGP SIGNATURE- Added: dev/mesos/1.7.3-rc1/mesos-1.7.3.tar.gz.sha512 == --- dev/mesos/1.7.3-rc1/mesos-1.7.3.tar.gz.sha512 (added) +++ dev/mesos/1.7.3-rc1/mesos-1.7.3.tar.gz.sha512 Mon May 4 17:39:21 2020 @@ -0,0 +1 @@ +6ecea4edcd49e364549f5e8d5728644964cf87fc6a6d0431693efeb94b5c970ad663a0e8279694f4e51408c0ea91aebd4ae08ba5b880b460e4708309d9503bd9 mesos-1.7.3.tar.gz
[mesos] annotated tag 1.7.3-rc1 created (now 684c419)
This is an automated email from the ASF dual-hosted git repository. grag pushed a change to annotated tag 1.7.3-rc1 in repository https://gitbox.apache.org/repos/asf/mesos.git. at 684c419 (tag) tagging 5f617044c969ebcfca281d043a2474c1a6b39f23 (commit) replaces 1.7.2 by Greg Mann on Mon May 4 10:06:56 2020 -0700 - Log - Tagging Mesos 1.7.3-rc1. --- No new revisions were added by this update.
[mesos] branch 1.7.x updated: Prepared the 1.7.3 CHANGELOG for release.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch 1.7.x in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/1.7.x by this push: new 5f61704 Prepared the 1.7.3 CHANGELOG for release. 5f61704 is described below commit 5f617044c969ebcfca281d043a2474c1a6b39f23 Author: Greg Mann AuthorDate: Tue Apr 28 23:12:14 2020 -0700 Prepared the 1.7.3 CHANGELOG for release. --- CHANGELOG | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index d2e3a19..64921a6 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,4 @@ -Release Notes - Mesos - Version 1.7.3 (WIP) +Release Notes - Mesos - Version 1.7.3 --- * This is a bug fix release.
[mesos] branch master updated: Added missing issues to the 1.7.3 CHANGELOG.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new db83570 Added missing issues to the 1.7.3 CHANGELOG. db83570 is described below commit db83570b13ad82f467ae6dfc5642cc8da2f7a8fc Author: Greg Mann AuthorDate: Tue Apr 28 22:52:02 2020 -0700 Added missing issues to the 1.7.3 CHANGELOG. --- CHANGELOG | 9 + 1 file changed, 9 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 8df75fb..a115101 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -731,12 +731,15 @@ Release Notes - Mesos - Version 1.7.3 (WIP) ** Bug * [MESOS-8467] - Destroyed executors might be used after `Slave::publishResource()`. + * [MESOS-8537] - Default executor doesn't wait for status updates to be ack'd before shutting down. * [MESOS-9124] - Agent reconfiguration can cause master to unsuppress on scheduler's behalf. * [MESOS-9507] - Agent could not recover due to empty docker volume checkpointed files. * [MESOS-9529] - `/proc` should be remounted even if a nested container set `share_pid_namespace` to true. * [MESOS-9549] - nvidia/cuda 10 does not work on GPU isolator. * [MESOS-9564] - Logrotate container logger lets tasks execute arbitrary commands in the Mesos agent's namespace. * [MESOS-9568] - SLRP does not clean up mount directories for destroyed MOUNT disks. + * [MESOS-9581] - Mesos package naming appears to be undeterministic. + * [MESOS-9590] - Mesos CI sometimes, incorrectly, overwrites already-pushed mesos master nightly images with new images built from non-master branches. * [MESOS-9607] - Removing a resource provider with consumers breaks resource publishing. * [MESOS-9610] - Fetcher vulnerability - escaping from sandbox. * [MESOS-9616] - `Filters.refuse_seconds` declines resources not in offers. 
@@ -752,6 +755,7 @@ Release Notes - Mesos - Version 1.7.3 (WIP) * [MESOS-9787] - Log slow SSL (TLS) peer reverse DNS lookup. * [MESOS-9803] - Memory leak caused by an infinite chain of futures in `UriDiskProfileAdaptor`. * [MESOS-9836] - Docker containerizer overwrites `/mesos/slave` cgroups. + * [MESOS-9847] - Docker executor doesn't wait for status updates to be ack'd before shutting down. * [MESOS-9852] - Slow memory growth in master due to deferred deletion of offer filters and timers. * [MESOS-9856] - REVIVE call with specified role(s) clears filters for all roles of a framework. * [MESOS-9868] - NetworkInfo from the agent /state endpoint is not correct. @@ -762,13 +766,18 @@ Release Notes - Mesos - Version 1.7.3 (WIP) * [MESOS-9925] - Default executor takes a couple of seconds to start and subscribe Mesos agent. * [MESOS-9964] - Support destroying UCR containers in provisioning state. * [MESOS-9966] - Agent crashes when trying to destroy orphaned nested container if root container is orphaned as well. + * [MESOS-9968] - WWWAuthenticate header parsing fails when commas are in (quoted) realm. * [MESOS-10007] - Command executor can miss exit status for short-lived commands due to double-reaping. * [MESOS-10015] - updateAllocation() can stall the allocator with a huge number of reservations on an agent. + * [MESOS-10018] - Duplicate tasks if agent partitioned during maintenance down. + * [MESOS-10084] - Detecting whether executor is generated for command task should work when the launcher_dir changes. + * [MESOS-10092] - Cannot pull image from docker registry which does not reply with 'scope'/'service' in WWW-Authenticate header. ** Improvements * [MESOS-8880] - Add minimum capabilities in the master. * [MESOS-9159] - Support Foreign URLs in docker registry puller. * [MESOS-9540] - Support `DESTROY_DISK` on preprovisioned CSI volumes. + * [MESOS-9545] - Marking an unreachable agent as gone should transition the tasks to terminal state. 
* [MESOS-9675] - Docker Manifest V2 Schema2 Support. * [MESOS-9704] - Support docker manifest v2s2 config GC. * [MESOS-9759] - Log required quota headroom and available quota headroom in the allocator.
[mesos] branch 1.7.x updated: Added missing issues to the 1.7.3 CHANGELOG.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch 1.7.x in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/1.7.x by this push: new 1ae15d9 Added missing issues to the 1.7.3 CHANGELOG. 1ae15d9 is described below commit 1ae15d9f30fbce4cb2e4789af5abcd2cf309493b Author: Greg Mann AuthorDate: Tue Apr 28 22:52:02 2020 -0700 Added missing issues to the 1.7.3 CHANGELOG. --- CHANGELOG | 9 + 1 file changed, 9 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index b5334b0..d2e3a19 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,12 +4,15 @@ Release Notes - Mesos - Version 1.7.3 (WIP) ** Bug * [MESOS-8467] - Destroyed executors might be used after `Slave::publishResource()`. + * [MESOS-8537] - Default executor doesn't wait for status updates to be ack'd before shutting down. * [MESOS-9124] - Agent reconfiguration can cause master to unsuppress on scheduler's behalf. * [MESOS-9507] - Agent could not recover due to empty docker volume checkpointed files. * [MESOS-9529] - `/proc` should be remounted even if a nested container set `share_pid_namespace` to true. * [MESOS-9549] - nvidia/cuda 10 does not work on GPU isolator. * [MESOS-9564] - Logrotate container logger lets tasks execute arbitrary commands in the Mesos agent's namespace. * [MESOS-9568] - SLRP does not clean up mount directories for destroyed MOUNT disks. + * [MESOS-9581] - Mesos package naming appears to be undeterministic. + * [MESOS-9590] - Mesos CI sometimes, incorrectly, overwrites already-pushed mesos master nightly images with new images built from non-master branches. * [MESOS-9607] - Removing a resource provider with consumers breaks resource publishing. * [MESOS-9610] - Fetcher vulnerability - escaping from sandbox. * [MESOS-9616] - `Filters.refuse_seconds` declines resources not in offers. 
@@ -25,6 +28,7 @@ Release Notes - Mesos - Version 1.7.3 (WIP) * [MESOS-9787] - Log slow SSL (TLS) peer reverse DNS lookup. * [MESOS-9803] - Memory leak caused by an infinite chain of futures in `UriDiskProfileAdaptor`. * [MESOS-9836] - Docker containerizer overwrites `/mesos/slave` cgroups. + * [MESOS-9847] - Docker executor doesn't wait for status updates to be ack'd before shutting down. * [MESOS-9852] - Slow memory growth in master due to deferred deletion of offer filters and timers. * [MESOS-9856] - REVIVE call with specified role(s) clears filters for all roles of a framework. * [MESOS-9868] - NetworkInfo from the agent /state endpoint is not correct. @@ -35,13 +39,18 @@ Release Notes - Mesos - Version 1.7.3 (WIP) * [MESOS-9925] - Default executor takes a couple of seconds to start and subscribe Mesos agent. * [MESOS-9964] - Support destroying UCR containers in provisioning state. * [MESOS-9966] - Agent crashes when trying to destroy orphaned nested container if root container is orphaned as well. + * [MESOS-9968] - WWWAuthenticate header parsing fails when commas are in (quoted) realm. * [MESOS-10007] - Command executor can miss exit status for short-lived commands due to double-reaping. * [MESOS-10015] - updateAllocation() can stall the allocator with a huge number of reservations on an agent. + * [MESOS-10018] - Duplicate tasks if agent partitioned during maintenance down. + * [MESOS-10084] - Detecting whether executor is generated for command task should work when the launcher_dir changes. + * [MESOS-10092] - Cannot pull image from docker registry which does not reply with 'scope'/'service' in WWW-Authenticate header. ** Improvements * [MESOS-8880] - Add minimum capabilities in the master. * [MESOS-9159] - Support Foreign URLs in docker registry puller. * [MESOS-9540] - Support `DESTROY_DISK` on preprovisioned CSI volumes. + * [MESOS-9545] - Marking an unreachable agent as gone should transition the tasks to terminal state. 
* [MESOS-9675] - Docker Manifest V2 Schema2 Support. * [MESOS-9704] - Support docker manifest v2s2 config GC. * [MESOS-9759] - Log required quota headroom and available quota headroom in the allocator.
[mesos] branch master updated: Updated executor API docs to include the domain socket.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new ae7b90d Updated executor API docs to include the domain socket. ae7b90d is described below commit ae7b90d74a61d81d8b716a67cac921d6542a305a Author: Greg Mann AuthorDate: Tue Apr 28 15:44:21 2020 -0700 Updated executor API docs to include the domain socket. Review: https://reviews.apache.org/r/72413/ --- docs/executor-http-api.md | 97 +-- 1 file changed, 68 insertions(+), 29 deletions(-) diff --git a/docs/executor-http-api.md b/docs/executor-http-api.md index 4af4cd4..fba58d9 100644 --- a/docs/executor-http-api.md +++ b/docs/executor-http-api.md @@ -7,48 +7,87 @@ layout: documentation A Mesos executor can be built in two different ways: -1. By using the `ExecutorDriver` C++ interface. The `ExecutorDriver` handles the -details of communicating with the Mesos agent. Executor developers implement -custom executor logic by registering callbacks with the `ExecutorDriver` for -significant events, such as when a new task launch request is received. Because -the `ExecutorDriver` interface is written in C++, this typically requires that -executor developers either use C++ or use a C++ binding to their language of -choice (e.g., JNI when using JVM-based languages). - -2. By using the new HTTP API. This allows Mesos executors to be developed -without using C++ or a native client library; instead, a custom executor -interacts with the Mesos agent via HTTP requests, as described below. Although -it is theoretically possible to use the HTTP executor API "directly" (e.g., by -using a generic HTTP library), most executor developers should use a library for -their language of choice that manages the details of the HTTP API; see the -document on [HTTP API client libraries](api-client-libraries.md) for a list. 
- -The v1 Executor HTTP API was introduced in Mesos 0.28.0. As of Mesos 1.0, it is -considered stable and is the recommended way to develop new Mesos executors. +1. By using the HTTP API. This allows Mesos executors to be developed without +using C++ or a native client library; instead, a custom executor interacts with +the Mesos agent via HTTP requests, as described below. Although it is +theoretically possible to use the HTTP executor API "directly" (e.g., by using a +generic HTTP library), most executor developers should use a library for their +language of choice that manages the details of the HTTP API; see the document on +[HTTP API client libraries](api-client-libraries.md) for a list. This is the +recommended way to develop new Mesos executors. + +2. By using the deprecated `ExecutorDriver` C++ interface. While this interface +is still supported, note that new features are usually not added to it. The +`ExecutorDriver` handles the details of communicating with the Mesos agent. +Executor developers implement custom executor logic by registering callbacks +with the `ExecutorDriver` for significant events, such as when a new task launch +request is received. Because the `ExecutorDriver` interface is written in C++, +this typically requires that executor developers either use C++ or use a C++ +binding to their language of choice (e.g., JNI when using JVM-based languages). ## Overview -The executor interacts with Mesos via the [/api/v1/executor](endpoints/slave/api/v1/executor.md) agent endpoint. We refer to this endpoint with its suffix "/executor" in the rest of this document. This endpoint accepts HTTP POST requests with data encoded as JSON (Content-Type: application/json) or binary Protobuf (Content-Type: application/x-protobuf). The first request that the executor sends to "/executor" endpoint is called SUBSCRIBE and results in a streaming response ("200 OK" stat [...] 
- -**Executors are expected to keep the subscription connection open as long as possible (barring network errors, agent process restarts, software bugs, etc.) and incrementally process the response.** HTTP client libraries that can only parse the response after the connection is closed cannot be used. For the encoding used, please refer to **Events** section below. - -All subsequent (non-`SUBSCRIBE`) requests to the "/executor" endpoint (see details below in **Calls** section) must be sent using a different connection than the one used for subscription. The agent responds to these HTTP POST requests with "202 Accepted" status codes (or, for unsuccessful requests, with 4xx or 5xx status codes; details in later sections). The "202 Accepted" response means that a request has been accepted for processing, not that the processing of the request has been co [...] +The executor interacts with Mesos via the [/api/v1/executor] +(endpoin
[mesos] branch master updated: Fixed libevent SSL socket shutdown race condition.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new e8793b2 Fixed libevent SSL socket shutdown race condition. e8793b2 is described below commit e8793b2ca92524c76f96c11e4ca52f41f9a8d414 Author: Greg Mann AuthorDate: Mon Apr 13 13:41:08 2020 -0700 Fixed libevent SSL socket shutdown race condition. This fixes an issue where the functions `shutdown()` and `event_callback()` race to access the bufferevent held by our libevent SSL socket implementation, leading to a CHECK failure. This race resulted in MESOS-10111, where multiple rapid changes in ZK membership led to one master re-linking to another multiple times in RECONNECT mode. This causes `shutdown()` to be called on the existing socket while it's attempting a connection, at which point a failure to connect can produce the CHECK failure. Review: https://reviews.apache.org/r/72354/ --- 3rdparty/libprocess/src/posix/libevent/libevent_ssl_socket.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/3rdparty/libprocess/src/posix/libevent/libevent_ssl_socket.cpp b/3rdparty/libprocess/src/posix/libevent/libevent_ssl_socket.cpp index dcb6d8e..864802d 100644 --- a/3rdparty/libprocess/src/posix/libevent/libevent_ssl_socket.cpp +++ b/3rdparty/libprocess/src/posix/libevent/libevent_ssl_socket.cpp @@ -190,7 +190,9 @@ Try LibeventSSLSocketImpl::shutdown(int how) CHECK(__in_event_loop__); CHECK(self); -CHECK_NOTNULL(self->bev); +if (self->bev == nullptr) { + return; +} synchronized (self->bev) { Owned request;
[mesos] 01/03: Added agent-side validation for resource limits.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit ad612599df9e866b2a622baa4cdb659ece5c4574 Author: Greg Mann AuthorDate: Mon Apr 6 15:16:33 2020 -0700 Added agent-side validation for resource limits. This prevents tasks from being launched on agents which would not be capable of enforcing the specified limits. Review: https://reviews.apache.org/r/72297/ --- src/slave/slave.cpp | 64 +-- src/slave/slave.hpp | 3 ++ src/tests/master_tests.cpp| 10 +- src/tests/master_validation_tests.cpp | 20 +-- 4 files changed, 91 insertions(+), 6 deletions(-) diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp index 6a48023..1a32c81 100644 --- a/src/slave/slave.cpp +++ b/src/slave/slave.cpp @@ -2168,6 +2168,34 @@ void Slave::runTask( } +Option Slave::validateResourceLimitsAndIsolators( +const vector& tasks) +{ + foreach (const TaskInfo& task, tasks) { +if (!(task.has_container() && + task.container().type() == ContainerInfo::DOCKER)) { + if (task.limits().count("cpus") && + !(strings::contains(flags.isolation, "cgroups/cpu") || +strings::contains(flags.isolation, "cgroups/all"))) { +return Error( +"CPU limits can only be set on tasks launched in Mesos containers" +" when the agent has loaded the 'cgroups/cpu' isolator"); + } + + if (task.limits().count("mem") && + !(strings::contains(flags.isolation, "cgroups/mem") || +strings::contains(flags.isolation, "cgroups/all"))) { +return Error( +"Memory limits can only be set on tasks launched in Mesos" +" containers when the agent has loaded the 'cgroups/mem' isolator"); + } +} + } + + return None(); +} + + void Slave::run( const FrameworkInfo& frameworkInfo, ExecutorInfo executorInfo, @@ -2320,6 +2348,40 @@ void Slave::run( } } + CHECK_NOTNULL(framework); + + Option error = validateResourceLimitsAndIsolators(tasks); + if (error.isSome()) { +// We report TASK_DROPPED to the framework because the task was +// never 
launched. For non-partition-aware frameworks, we report +// TASK_LOST for backward compatibility. +mesos::TaskState taskState = TASK_DROPPED; +if (!protobuf::frameworkHasCapability( +frameworkInfo, FrameworkInfo::Capability::PARTITION_AWARE)) { + taskState = TASK_LOST; +} + +foreach (const TaskInfo& _task, tasks) { + const StatusUpdate update = protobuf::createStatusUpdate( + frameworkId, + info.id(), + _task.task_id(), + taskState, + TaskStatus::SOURCE_SLAVE, + id::UUID::random(), + error->message, + TaskStatus::REASON_GC_ERROR); + + statusUpdate(update, UPID()); +} + +if (framework->idle()) { + removeFramework(framework); +} + +return; + } + const ExecutorID& executorId = executorInfo.executor_id(); if (HookManager::hooksAvailable()) { @@ -2342,8 +2404,6 @@ void Slave::run( } } - CHECK_NOTNULL(framework); - // Track the pending task / task group to ensure the framework is // not removed and the framework and top level executor directories // are not scheduled for deletion before '_run()' is called. diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp index d7e65e0..2cf45c6 100644 --- a/src/slave/slave.hpp +++ b/src/slave/slave.hpp @@ -158,6 +158,9 @@ public: const process::UPID& from, RunTaskMessage&& runTaskMessage); + Option validateResourceLimitsAndIsolators( + const std::vector& tasks); + // Made 'virtual' for Slave mocking. virtual void runTask( const process::UPID& from, diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp index 8cc8d20..785e5d5 100644 --- a/src/tests/master_tests.cpp +++ b/src/tests/master_tests.cpp @@ -4330,7 +4330,15 @@ TEST_F(MasterTest, TasksEndpoint) TestContainerizer containerizer(); Owned detector = master.get()->createDetector(); - Try> slave = StartSlave(detector.get(), ); + + // We must enable the CPU and memory isolators on the agent so that it can + // accept resource limits. 
+ slave::Flags flags = CreateSlaveFlags(); + flags.isolation = "cgroups/cpu,cgroups/mem"; + + Try<Owned<cluster::Slave>> slave = +StartSlave(detector.get(), &containerizer, flags); + ASSERT_SOME(slave); MockScheduler sched; diff --git a/src/tests/master_validation_tests.cpp b/src/tests/master_validation_tests.cpp index 9efca42..816635a 100644 --- a/src/tests/master_validation_tests.cpp +++ b/src/tests/master_validation_tests.cpp @@
[mesos] 03/03: Updated CFS tests to avoid checking CPU usage.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 924a6776ca2e474cc65fcb1d59938a0ff6ad46c8 Author: Greg Mann AuthorDate: Mon Apr 6 15:16:54 2020 -0700 Updated CFS tests to avoid checking CPU usage. This verification is error-prone and leads to flaky test failures. It is sufficient to verify that the cgroup values are set correctly. We will avoid going so far as to confirm that the kernel's scheduler is honoring those values. Review: https://reviews.apache.org/r/72309/ --- src/tests/containerizer/cgroups_isolator_tests.cpp | 59 ++ 1 file changed, 4 insertions(+), 55 deletions(-) diff --git a/src/tests/containerizer/cgroups_isolator_tests.cpp b/src/tests/containerizer/cgroups_isolator_tests.cpp index f4425f0..57158ae 100644 --- a/src/tests/containerizer/cgroups_isolator_tests.cpp +++ b/src/tests/containerizer/cgroups_isolator_tests.cpp @@ -386,8 +386,7 @@ TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_RevocableCpu) // This test verifies that a task launched with 0.5 cpu and 32MB memory as its // resource requests (but no resource limits specified) will have its CPU and -// memory's soft & hard limits and OOM score adjustment set correctly, and it -// cannot consume more cpu time than its CFS quota. +// memory's soft & hard limits and OOM score adjustment set correctly. TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_CFS_CommandTaskNoLimits) { Try> master = StartMaster(); @@ -436,17 +435,6 @@ TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_CFS_CommandTaskNoLimits) AWAIT_READY(offers); ASSERT_FALSE(offers->empty()); - // Generate random numbers to max out a single core. We'll run this - // for 0.5 seconds of wall time so it should consume approximately - // 300 ms of total cpu time when limited to 0.6 cpu. We use - // /dev/urandom to prevent blocking on Linux when there's - // insufficient entropy. 
- string command = -"cat /dev/urandom > /dev/null & " -"export MESOS_TEST_PID=$! && " -"sleep 0.5 && " -"kill $MESOS_TEST_PID"; - // We will launch a task with 0.5 cpu and 32MB memory, and the command // executor will be given 0.1 cpu (`DEFAULT_EXECUTOR_CPUS`) and 32MB // memory (DEFAULT_EXECUTOR_MEM) by default, so we need 0.6 cpu and 64MB @@ -462,7 +450,7 @@ TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_CFS_CommandTaskNoLimits) TaskInfo task = createTask( offers.get()[0].slave_id(), Resources::parse("cpus:0.5;mem:32").get(), - command); + SLEEP_COMMAND(1000)); Future statusStarting; Future statusRunning; @@ -527,20 +515,6 @@ TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_CFS_CommandTaskNoLimits) Try oomScoreAdj = numify(strings::trim(read.get())); ASSERT_SOME_EQ(0, oomScoreAdj); - Future usage = containerizer->usage(containerId); - AWAIT_READY(usage); - - // Expect that no more than 400 ms of cpu time has been consumed. We - // also check that at least 50 ms of cpu time has been consumed so - // this test will fail if the host system is very heavily loaded. - // This behavior is correct because under such conditions we aren't - // actually testing the CFS cpu limiter. - double cpuTime = usage->cpus_system_time_secs() + - usage->cpus_user_time_secs(); - - EXPECT_GE(0.4, cpuTime); - EXPECT_LE(0.05, cpuTime); - driver.stop(); driver.join(); } @@ -548,7 +522,7 @@ TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_CFS_CommandTaskNoLimits) // This test verifies that a task launched with resource limits specified // will have its CPU and memory's soft & hard limits and OOM score adjustment -// set correctly, and it cannot consume more cpu time than its CFS quota. +// set correctly. TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_CFS_CommandTaskLimits) { Try> master = StartMaster(); @@ -603,17 +577,6 @@ TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_CFS_CommandTaskLimits) AWAIT_READY(offers); ASSERT_FALSE(offers->empty()); - // Generate random numbers to max out a single core. 
We'll run - // this for 0.5 seconds of wall time so it should consume - // approximately 300 ms of total cpu time when limited to 0.6 - // cpu. We use /dev/urandom to prevent blocking on Linux when - // there's insufficient entropy. - string command = -"cat /dev/urandom > /dev/null & " -"export MESOS_TEST_PID=$! && " -"sleep 0.5 && " -"kill $MESOS_TEST_PID"; - // Launch a task with 0.2 cpu request, 0.5 cpu limit, half of // host total memory - `DEFAULT_EXECUTOR_MEM` as memory request // and half of host total memory as memory limit. @@ -632,7 +595,7 @@ TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_CFS_CommandTaskLimits) TaskInfo task = createTask( offers.get()[0].slave
[mesos] 02/03: Sent appropriate task status reason when task over memory request.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit be90edd31a1833c5ed706b39f3a5547ae8153dd2 Author: Greg Mann AuthorDate: Mon Apr 6 15:16:45 2020 -0700 Sent appropriate task status reason when task over memory request. Review: https://reviews.apache.org/r/72305/ --- src/common/protobuf_utils.cpp | 3 ++- .../mesos/isolators/cgroups/subsystems/memory.cpp | 24 +- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/common/protobuf_utils.cpp b/src/common/protobuf_utils.cpp index 723d85a..8d1d5c4 100644 --- a/src/common/protobuf_utils.cpp +++ b/src/common/protobuf_utils.cpp @@ -254,7 +254,8 @@ StatusUpdate createStatusUpdate( CHECK( reason.get() == TaskStatus::REASON_CONTAINER_LIMITATION || reason.get() == TaskStatus::REASON_CONTAINER_LIMITATION_DISK || -reason.get() == TaskStatus::REASON_CONTAINER_LIMITATION_MEMORY) +reason.get() == TaskStatus::REASON_CONTAINER_LIMITATION_MEMORY || +reason.get() == TaskStatus::REASON_CONTAINER_MEMORY_REQUEST_EXCEEDED) << reason.get(); status->mutable_limitation()->mutable_resources()->CopyFrom( diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp index 15f87ba..60c7a89 100644 --- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp +++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp @@ -699,11 +699,33 @@ void MemorySubsystemProcess::oomWaited( ? (double) usage->bytes() / Bytes::MEGABYTES : 0), "*").get(); + TaskStatus::Reason reason = TaskStatus::REASON_CONTAINER_LIMITATION_MEMORY; + + // If the container has a hard limit set higher than the soft limit, then + // check if the memory usage is above the soft limit but less than the hard + // limit. 
If so, we send a task status reason to the scheduler which indicates + // that this container was preferentially OOM-killed because it exceeded its + // memory request without hitting its memory limit. + Try softLimit = +cgroups::memory::soft_limit_in_bytes(hierarchy, cgroup); + + if (softLimit.isError()) { +LOG(ERROR) << "Failed to read 'memory.soft_limit_in_bytes': " + << softLimit.error(); + } else if (softLimit.get() < limit.get()) { +if (!usage.isError() && +!limit.isError() && +usage.get() > softLimit.get() && +usage.get() < limit.get()) { + reason = TaskStatus::REASON_CONTAINER_MEMORY_REQUEST_EXCEEDED; +} + } + infos[containerId]->limitation.set( protobuf::slave::createContainerLimitation( mem, message.str(), - TaskStatus::REASON_CONTAINER_LIMITATION_MEMORY)); + reason)); }
[mesos] branch master updated (92f8768 -> 924a677)
This is an automated email from the ASF dual-hosted git repository. grag pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git. from 92f8768 Fixed build of tests broken in 9bd3ea4665402943af070e64327e8d7dc341e301. new ad61259 Added agent-side validation for resource limits. new be90edd Sent appropriate task status reason when task over memory request. new 924a677 Updated CFS tests to avoid checking CPU usage. The 3 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: src/common/protobuf_utils.cpp | 3 +- .../mesos/isolators/cgroups/subsystems/memory.cpp | 24 +++- src/slave/slave.cpp| 64 +- src/slave/slave.hpp| 3 + src/tests/containerizer/cgroups_isolator_tests.cpp | 59 ++-- src/tests/master_tests.cpp | 10 +++- src/tests/master_validation_tests.cpp | 20 ++- 7 files changed, 120 insertions(+), 63 deletions(-)
[mesos] branch master updated: Added resource limits to the web UI.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new f8a3dd3 Added resource limits to the web UI. f8a3dd3 is described below commit f8a3dd334934094ec44e07fa350f958d218bc78f Author: Greg Mann AuthorDate: Tue Mar 31 21:55:42 2020 -0700 Added resource limits to the web UI. Review: https://reviews.apache.org/r/72269/ --- src/webui/app/agents/agent-executor.html | 82 +--- 1 file changed, 53 insertions(+), 29 deletions(-) diff --git a/src/webui/app/agents/agent-executor.html b/src/webui/app/agents/agent-executor.html index 7ec56c3..d8a5250 100644 --- a/src/webui/app/agents/agent-executor.html +++ b/src/webui/app/agents/agent-executor.html @@ -106,13 +106,19 @@ class="table table-striped table-bordered table-condensed"> - ID - Name - Role - CPUs - GPUs - Mem - Disk + ID + Name + Role + CPUs + Mem + GPUs (allocated) + Disk (allocated) + + + Allocated + Limit + Allocated + Limit @@ -121,8 +127,10 @@ {{queued_task.name}} {{queued_task.role}} {{queued_task.resources.cpus | number}} - {{queued_task.resources.gpus | number}} + {{queued_task.limits.cpus | number}} {{queued_task.resources.mem * (1024 * 1024) | dataSize}} + {{queued_task.limits.mem * (1024 * 1024) | dataSize}} + {{queued_task.resources.gpus | number}} {{queued_task.resources.disk * (1024 * 1024) | dataSize}} @@ -132,16 +140,22 @@ class="table table-striped table-bordered table-condensed"> - ID - Name - Role - State - Health - CPUs (allocated) - GPUs (allocated) - Mem (allocated) - Disk (allocated) - + ID + Name + Role + State + Health + CPUs + Mem + GPUs (allocated) + Disk (allocated) + + + + Allocated + Limit + Allocated + Limit @@ -152,8 +166,10 @@ {{task.state}} {{task.healthy | taskHealth}} {{task.resources.cpus | number}} - {{task.resources.gpus | number}} + {{task.limits.cpus | number}} {{task.resources.mem * (1024 * 1024) | 
dataSize}} + {{task.limits.mem * (1024 * 1024) | dataSize}} + {{task.resources.gpus | number}} {{task.resources.disk * (1024 * 1024) | dataSize}} - ID - Name - Role - State - CPUs (allocated) - GPUs (allocated) - Mem (allocated) - Disk (allocated) - + ID + Name + Role + State + CPUs + Mem + GPUs (allocated) + Disk (allocated) + + + + Allocated + Limit + Allocated + Limit @@ -187,8 +209,10 @@ {{completed_task.role}} {{completed_task.state}} {{completed_task.resources.cpus | number}} - {{completed_task.resources.gpus | number}} + {{completed_task.limits.cpus | number}} {{completed_task.resources.mem * (1024 * 1024) | dataSize}} + {{completed_task.limits.mem * (1024 * 1024) | dataSize}} + {{completed_task.resources.gpus | number}} {{completed_task.resources.disk * (1024 * 1024) | dataSize}}
[mesos] branch master updated: Added resource limits to v0 endpoint results.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new 77f5165 Added resource limits to v0 endpoint results. 77f5165 is described below commit 77f5165fd2f942a89063803a7be5465d35774b10 Author: Greg Mann AuthorDate: Tue Mar 24 21:13:44 2020 -0700 Added resource limits to v0 endpoint results. Review: https://reviews.apache.org/r/72262/ --- src/common/http.cpp | 31 +++ src/common/http.hpp | 1 + src/tests/common/http_tests.cpp | 7 +++ src/tests/master_tests.cpp | 14 -- 4 files changed, 51 insertions(+), 2 deletions(-) diff --git a/src/common/http.cpp b/src/common/http.cpp index 3dd77dc..093d837 100644 --- a/src/common/http.cpp +++ b/src/common/http.cpp @@ -633,6 +633,10 @@ JSON::Object model(const Task& task) object.values["state"] = TaskState_Name(task.state()); object.values["resources"] = model(task.resources()); + if (!task.limits().empty()) { +object.values["limits"] = model(task.limits()); + } + if (task.has_user()) { object.values["user"] = task.user(); } @@ -785,6 +789,18 @@ JSON::Object model(const FileInfo& fileInfo) return file; } + +JSON::Object model(const google::protobuf::Map& map) +{ + JSON::Object result, scalar; + + foreach (auto item, map) { +result.values[item.first] = item.second.value(); + } + + return result; +} + } // namespace internal { @@ -1082,6 +1098,17 @@ void json( } +// Used to include resource limits in JSON output. 
+void json( +JSON::ObjectWriter* writer, +const google::protobuf::Map& map) +{ + foreach (auto item, map) { +writer->field(item.first, item.second.value()); + } +} + + void json(JSON::ObjectWriter* writer, const Task& task) { writer->field("id", task.task_id().value()); @@ -1092,6 +1119,10 @@ void json(JSON::ObjectWriter* writer, const Task& task) writer->field("state", TaskState_Name(task.state())); writer->field("resources", task.resources()); + if (!task.limits().empty()) { +writer->field("limits", task.limits()); + } + // Tasks are not allowed to mix resources allocated to // different roles, see MESOS-6636. writer->field("role", task.resources().begin()->allocation_info().role()); diff --git a/src/common/http.hpp b/src/common/http.hpp index 02633e1..9d5b8ed 100644 --- a/src/common/http.hpp +++ b/src/common/http.hpp @@ -211,6 +211,7 @@ JSON::Object model(const ExecutorInfo& executorInfo); JSON::Array model(const Labels& labels); JSON::Object model(const Task& task); JSON::Object model(const FileInfo& fileInfo); +JSON::Object model(const google::protobuf::Map& map); void json(JSON::ObjectWriter* writer, const Task& task); diff --git a/src/tests/common/http_tests.cpp b/src/tests/common/http_tests.cpp index 5f36527..12dcf67 100644 --- a/src/tests/common/http_tests.cpp +++ b/src/tests/common/http_tests.cpp @@ -91,6 +91,8 @@ TEST(HTTPTest, ModelTask) taskInfo.mutable_command()->set_value("echo hello"); taskInfo.mutable_command()->set_user("user1"); taskInfo.mutable_discovery()->CopyFrom(discovery); + (*taskInfo.mutable_limits())["cpus"].set_value(1.0); + (*taskInfo.mutable_limits())["mem"].set_value(32); Task task = createTask(taskInfo, state, frameworkId); task.add_statuses()->CopyFrom(statuses[0]); @@ -110,6 +112,11 @@ TEST(HTTPTest, ModelTask) "\"gpus\":0," "\"mem\":0" " }," + " \"limits\":" + " {" + "\"cpus\": 1.0," + "\"mem\": 32" + " }," " \"slave_id\":\"s\"," " \"state\":\"TASK_RUNNING\"," " \"statuses\":" diff --git a/src/tests/master_tests.cpp 
b/src/tests/master_tests.cpp index 617abfa..8cc8d20 100644 --- a/src/tests/master_tests.cpp +++ b/src/tests/master_tests.cpp @@ -4357,6 +4357,8 @@ TEST_F(MasterTest, TasksEndpoint) task1.mutable_slave_id()->MergeFrom(offer->slave_id()); task1.mutable_resources()->MergeFrom( Resources::parse("cpus:0.1;mem:12").get()); + (*task1.mutable_limits())["cpus"].set_value(0.5); + (*task1.mutable_limits())["mem"].set_value(64); task1.mutable_executor()->MergeFrom(DEFAULT_EXECUTOR_INFO); TaskInfo task2; @@ -4418,7 +4420,11 @@ TEST_F(MasterTest, TasksEndpoint) "\"framework_id\":\"" + frameworkId->valu
[mesos] branch master updated: Moved containerizer utils in CMakeLists.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new 12e5e87 Moved containerizer utils in CMakeLists. 12e5e87 is described below commit 12e5e870c38681bfc0455960f89a41127dac3daf Author: Qian Zhang AuthorDate: Tue Mar 24 10:44:39 2020 -0700 Moved containerizer utils in CMakeLists. This is to ensure the function `calculateOOMScoreAdj()` can be resolved on Windows. Review: https://reviews.apache.org/r/72263/ --- src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5133550..96cd867 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -176,6 +176,7 @@ set(AGENT_SRC slave/containerizer/mesos/launcher_tracker.cpp slave/containerizer/mesos/mount.cpp slave/containerizer/mesos/paths.cpp + slave/containerizer/mesos/utils.cpp slave/containerizer/mesos/io/switchboard.cpp slave/containerizer/mesos/isolators/environment_secret.cpp slave/containerizer/mesos/isolators/filesystem/posix.cpp @@ -188,7 +189,6 @@ set(AGENT_SRC if (NOT WIN32) list(APPEND AGENT_SRC -slave/containerizer/mesos/utils.cpp slave/containerizer/mesos/isolators/docker/volume/driver.cpp slave/containerizer/mesos/isolators/docker/volume/paths.cpp slave/containerizer/mesos/isolators/network/cni/paths.cpp
[mesos] 05/05: Updated the comment for the 'share_cgroups' field.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit eb455e74ab8f31b93f7f5e87dc1829e05701e411 Author: Greg Mann AuthorDate: Fri Mar 20 10:35:39 2020 -0700 Updated the comment for the 'share_cgroups' field. Review: https://reviews.apache.org/r/72250/ --- include/mesos/mesos.proto| 22 +++--- include/mesos/v1/mesos.proto | 22 +++--- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/include/mesos/mesos.proto b/include/mesos/mesos.proto index 6dba47e..9412ed7 100644 --- a/include/mesos/mesos.proto +++ b/include/mesos/mesos.proto @@ -3353,17 +3353,17 @@ message LinuxInfo { // If set as 'true', the container will share the cgroups from its parent // container, otherwise it will have its own cgroups created. Please note: - // 1. This field should be only used for the task containers in a task group - //(i.e., the 1st level nested containers). It will be ignored for the - //executor containers (i.e., the top-level containers) since the executor - //container will always have its own cgroups created, and it will also be - //ignored for the nested containers under the 1st nested container (e.g., - //the debug container running as 2nd level nested container) since those - //containers should always share cgroups from its parent container. - // 2. The value of this field should be same for all the tasks launched by a - //single executor. - // 3. It is not allowed to set resource limits for the task which has this - //field set as true. + // 1. For tasks in a task group launched via the LAUNCH_GROUP operation, + //this field may be set to 'true' or 'false'. Resource limits may only be + //set for tasks in a task group when this field is set to 'false'. + // 2. For tasks launched via the LAUNCH operation, this field may only be set + //to 'true', and in this case resource limits may be set on these tasks. + // 3. 
For containers launched via the agent's LAUNCH_NESTED_CONTAINER_SESSION + //call, this field must be set to 'true'. + // 4. For executor containers, this field may only be set to 'false'. + // 5. All tasks under a single executor must share the same value of this + //field, if it is set. Note that this means that all tasks within a single + //task group must set this field to the same value. optional bool share_cgroups = 8 [default = true]; } diff --git a/include/mesos/v1/mesos.proto b/include/mesos/v1/mesos.proto index e96f51f..194c42c 100644 --- a/include/mesos/v1/mesos.proto +++ b/include/mesos/v1/mesos.proto @@ -3342,17 +3342,17 @@ message LinuxInfo { // If set as 'true', the container will share the cgroups from its parent // container, otherwise it will have its own cgroups created. Please note: - // 1. This field should be only used for the task containers in a task group - //(i.e., the 1st level nested containers). It will be ignored for the - //executor containers (i.e., the top-level containers) since the executor - //container will always have its own cgroups created, and it will also be - //ignored for the nested containers under the 1st nested container (e.g., - //the debug container running as 2nd level nested container) since those - //containers should always share cgroups from its parent container. - // 2. The value of this field should be same for all the tasks launched by a - //single executor. - // 3. It is not allowed to set resource limits for the task which has this - //field set as true. + // 1. For tasks in a task group launched via the LAUNCH_GROUP operation, + //this field may be set to 'true' or 'false'. Resource limits may only be + //set for tasks in a task group when this field is set to 'false'. + // 2. For tasks launched via the LAUNCH operation, this field may only be set + //to 'true', and in this case resource limits may be set on these tasks. + // 3. 
For containers launched via the agent's LAUNCH_NESTED_CONTAINER_SESSION + //call, this field must be set to 'true'. + // 4. For executor containers, this field may only be set to 'false'. + // 5. All tasks under a single executor must share the same value of this + //field, if it is set. Note that this means that all tasks within a single + //task group must set this field to the same value. optional bool share_cgroups = 8 [default = true]; }
[mesos] branch master updated (9ab68cb -> eb455e7)
This is an automated email from the ASF dual-hosted git repository. grag pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git. from 9ab68cb Updated default executor to call the `LaunchContainer` agent API. new 504af58 Added master validation for task resource limits and shared cgroups. new 72e78f0 Added tests for master validation of limits and shared cgroups. new 1088dd3 Added agent validation for shared cgroups. new 74c3550 Added tests for agent validation of shared cgroups. new eb455e7 Updated the comment for the 'share_cgroups' field. The 5 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: include/mesos/mesos.proto | 22 +- include/mesos/v1/mesos.proto | 22 +- src/master/validation.cpp | 230 +- src/slave/validation.cpp | 116 +++-- src/tests/master_validation_tests.cpp | 771 ++ src/tests/slave_validation_tests.cpp | 143 ++- 6 files changed, 1246 insertions(+), 58 deletions(-)
[mesos] 03/05: Added agent validation for shared cgroups.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 1088dd3b77eb903718b3df8064d5d1d6c379f25b Author: Greg Mann AuthorDate: Fri Mar 20 10:35:38 2020 -0700 Added agent validation for shared cgroups. Review: https://reviews.apache.org/r/72221/ --- src/slave/validation.cpp | 116 +++ 1 file changed, 87 insertions(+), 29 deletions(-) diff --git a/src/slave/validation.cpp b/src/slave/validation.cpp index efb2e0c..25e9fbd 100644 --- a/src/slave/validation.cpp +++ b/src/slave/validation.cpp @@ -171,8 +171,11 @@ Option validate( return Error("Expecting 'launch_nested_container' to be present"); } + const mesos::agent::Call::LaunchNestedContainer& launch = +call.launch_nested_container(); + Option error = validation::container::validateContainerId( - call.launch_nested_container().container_id()); + launch.container_id()); if (error.isSome()) { return Error("'launch_nested_container.container_id' is invalid" @@ -181,27 +184,36 @@ Option validate( // The parent `ContainerID` is required, so that we know // which container to place it underneath. 
- if (!call.launch_nested_container().container_id().has_parent()) { + if (!launch.container_id().has_parent()) { return Error("Expecting 'launch_nested_container.container_id.parent'" " to be present"); } - if (call.launch_nested_container().has_command()) { -error = common::validation::validateCommandInfo( -call.launch_nested_container().command()); + if (launch.has_command()) { +error = common::validation::validateCommandInfo(launch.command()); if (error.isSome()) { return Error("'launch_nested_container.command' is invalid" ": " + error->message); } } - if (call.launch_nested_container().has_container()) { -error = common::validation::validateContainerInfo( -call.launch_nested_container().container()); + if (launch.has_container()) { +error = common::validation::validateContainerInfo(launch.container()); if (error.isSome()) { return Error("'launch_nested_container.container' is invalid" ": " + error->message); } + +if (launch.container().has_linux_info() && +launch.container().linux_info().has_share_cgroups() && +!launch.container().linux_info().share_cgroups() && +launch.container_id().has_parent() && +launch.container_id().parent().has_parent()) { +return Error( +"'launch_nested_container' is invalid: containers nested at " +"the second level or greater cannot set 'share_cgroups' to " +"'false'"); +} } return None(); @@ -279,8 +291,11 @@ Option validate( "Expecting 'launch_nested_container_session' to be present"); } + const mesos::agent::Call::LaunchNestedContainerSession& launch = +call.launch_nested_container_session(); + Option error = validation::container::validateContainerId( - call.launch_nested_container_session().container_id()); + launch.container_id()); if (error.isSome()) { return Error("'launch_nested_container_session.container_id' is invalid" @@ -289,28 +304,34 @@ Option validate( // The parent `ContainerID` is required, so that we know // which container to place it underneath. 
- if (!call.launch_nested_container_session().container_id().has_parent()) { + if (!launch.container_id().has_parent()) { return Error( "Expecting 'launch_nested_container_session.container_id.parent'" " to be present"); } - if (call.launch_nested_container_session().has_command()) { -error = common::validation::validateCommandInfo( -call.launch_nested_container_session().command()); + if (launch.has_command()) { +error = common::validation::validateCommandInfo(launch.command()); if (error.isSome()) { return Error("'launch_nested_container_session.command' is invalid" ": " + error->message); } } - if (call.launch_nested_container_session().has_container()) { -error = common::validation::validateContainerInfo( -call.launch_nested_container_session().container()); + if (launch.has_container()) { +error = c
[mesos] 02/05: Added tests for master validation of limits and shared cgroups.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 72e78f02f08116c0d4c4a825144b078cfbee5052 Author: Greg Mann AuthorDate: Fri Mar 20 10:35:37 2020 -0700 Added tests for master validation of limits and shared cgroups. Review: https://reviews.apache.org/r/72217/ --- src/tests/master_validation_tests.cpp | 771 ++ 1 file changed, 771 insertions(+) diff --git a/src/tests/master_validation_tests.cpp b/src/tests/master_validation_tests.cpp index 8d5e74e..9efca42 100644 --- a/src/tests/master_validation_tests.cpp +++ b/src/tests/master_validation_tests.cpp @@ -3486,6 +3486,214 @@ TEST_F(TaskValidationTest, TaskSettingDockerParameterName) driver.join(); } + +TEST_F(TaskValidationTest, ResourceLimitLessThanRequest) +{ + Try> master = StartMaster(); + ASSERT_SOME(master); + + Owned detector = master.get()->createDetector(); + Try> slave = StartSlave(detector.get()); + ASSERT_SOME(slave); + + MockScheduler sched; + MesosSchedulerDriver driver( + , DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL); + + EXPECT_CALL(sched, registered(, _, _)); + + Future> offers; + EXPECT_CALL(sched, resourceOffers(, _)) +.WillOnce(FutureArg<1>()) +.WillRepeatedly(Return()); // Ignore subsequent offers. 
+ + driver.start(); + + AWAIT_READY(offers); + ASSERT_FALSE(offers->empty()); + + Future status; + EXPECT_CALL(sched, statusUpdate(, _)) +.WillOnce(FutureArg<1>()); + + Map limits; + limits["cpus"].set_value(0.01); + + TaskInfo task = createTask( + offers->at(0), + "exit 0", + None(), + "test-task", + id::UUID::random().toString(), + limits); + + driver.launchTasks(offers->at(0).id(), {task}); + + AWAIT_READY(status); + EXPECT_EQ(TASK_ERROR, status->state()); + EXPECT_TRUE(strings::contains( + status->message(), + "The cpu limit must be greater than or equal to the cpu request")); + + driver.stop(); + driver.join(); +} + + +TEST_F(TaskValidationTest, LimitOtherThanCpuOrMem) +{ + Try> master = StartMaster(); + ASSERT_SOME(master); + + Owned detector = master.get()->createDetector(); + Try> slave = StartSlave(detector.get()); + ASSERT_SOME(slave); + + MockScheduler sched; + MesosSchedulerDriver driver( + , DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL); + + EXPECT_CALL(sched, registered(, _, _)); + + Future> offers; + EXPECT_CALL(sched, resourceOffers(, _)) +.WillOnce(FutureArg<1>()) +.WillRepeatedly(Return()); // Ignore subsequent offers. 
+ + driver.start(); + + AWAIT_READY(offers); + ASSERT_FALSE(offers->empty()); + + Future status; + EXPECT_CALL(sched, statusUpdate(, _)) +.WillOnce(FutureArg<1>()); + + Map limits; + limits["disk"].set_value(128); + + TaskInfo task = createTask( + offers->at(0), + "exit 0", + None(), + "test-task", + id::UUID::random().toString(), + limits); + + driver.launchTasks(offers->at(0).id(), {task}); + + AWAIT_READY(status); + EXPECT_EQ(TASK_ERROR, status->state()); + EXPECT_TRUE(strings::contains( + status->message(), + "Only cpus and mem may be included in a task's resource limits")); + + driver.stop(); + driver.join(); +} + + +TEST_F(TaskValidationTest, NestedCgroupInLaunchOperation) +{ + Try> master = StartMaster(); + ASSERT_SOME(master); + + Owned detector = master.get()->createDetector(); + Try> slave = StartSlave(detector.get()); + ASSERT_SOME(slave); + + MockScheduler sched; + MesosSchedulerDriver driver( + , DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL); + + EXPECT_CALL(sched, registered(, _, _)); + + Future> offers; + EXPECT_CALL(sched, resourceOffers(, _)) +.WillOnce(FutureArg<1>()) +.WillRepeatedly(Return()); // Ignore subsequent offers. + + driver.start(); + + AWAIT_READY(offers); + ASSERT_FALSE(offers->empty()); + + Future status; + EXPECT_CALL(sched, statusUpdate(, _)) +.WillOnce(FutureArg<1>()); + + TaskInfo task = createTask(offers->at(0), "exit 0"); + + task.mutable_container()->set_type(ContainerInfo::MESOS); + task.mutable_container()->mutable_linux_info()->set_share_cgroups(false); + + driver.launchTasks(offers->at(0).id(), {task}); + + AWAIT_READY(status); + EXPECT_EQ(TASK_ERROR, status->state()); + EXPECT_TRUE(strings::contains( + status->message(), + "Only tasks in a task group may have 'share_cgroups' set to 'false'")); + + driver.stop(); + driver.join(); +} + + +TEST_F(TaskValidationTest, SharedCgroupOnExecutor) +{ + Try> master = StartMaster(); + ASSERT_SOME(master); + + Owned detector = master.get()->cr
[mesos] 04/05: Added tests for agent validation of shared cgroups.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 74c355060304da4287d286ed775df072b6101816 Author: Greg Mann AuthorDate: Fri Mar 20 10:35:38 2020 -0700 Added tests for agent validation of shared cgroups. This patch adds validation for shared cgroups when specified via the agent APIs. In doing so, a new validation test is added for the agent's LaunchContainer API, since this was previously missing. Review: https://reviews.apache.org/r/7/ --- src/tests/slave_validation_tests.cpp | 143 ++- 1 file changed, 142 insertions(+), 1 deletion(-) diff --git a/src/tests/slave_validation_tests.cpp b/src/tests/slave_validation_tests.cpp index 25019cc..f6203ee 100644 --- a/src/tests/slave_validation_tests.cpp +++ b/src/tests/slave_validation_tests.cpp @@ -495,8 +495,24 @@ TEST(AgentCallValidationTest, LaunchNestedContainerSession) "variable 'ENV_VAR_KEY' of type 'VALUE' must have a value set", error->message); - // Test the valid case. + // Container with 'share_cgroups' set to 'false', which is not allowed for + // nested container sessions. variable->set_value("env_var_value"); + + launch->mutable_container()->set_type(ContainerInfo::MESOS); + launch->mutable_container()->mutable_linux_info()->set_share_cgroups(false); + + error = validation::agent::call::validate(call); + EXPECT_SOME(error); + EXPECT_TRUE(strings::contains( + error->message, + "'launch_nested_container_session.container.linux_info' is invalid: " + "'share_cgroups' cannot be set to 'false' for nested container " + "sessions")); + + // Test the valid case. + launch->mutable_container()->mutable_linux_info()->set_share_cgroups(true); + error = validation::agent::call::validate(call); EXPECT_NONE(error); @@ -512,6 +528,131 @@ TEST(AgentCallValidationTest, LaunchNestedContainerSession) } +TEST(AgentCallValidationTest, LaunchContainer) +{ + // Missing `launch_container`. 
+ agent::Call call; + call.set_type(agent::Call::LAUNCH_CONTAINER); + + Option error = validation::agent::call::validate(call); + EXPECT_SOME(error); + + // `container_id` is not valid. + ContainerID badContainerId; + badContainerId.set_value("no spaces allowed"); + + agent::Call::LaunchContainer* launch = call.mutable_launch_container(); + + launch->mutable_container_id()->CopyFrom(badContainerId); + + error = validation::agent::call::validate(call); + EXPECT_SOME(error); + + // Invalid `command.environment`. Set an invalid environment variable to check + // that the common validation code for the command's environment is being + // executed. + ContainerID containerId; + containerId.set_value(id::UUID::random().toString()); + + launch->mutable_container_id()->CopyFrom(containerId); + + launch->mutable_command()->CopyFrom(createCommandInfo("exit 0")); + + Environment::Variable* variable = launch +->mutable_command() +->mutable_environment() +->mutable_variables() +->Add(); + variable->set_name("ENV_VAR_KEY"); + variable->set_type(mesos::Environment::Variable::VALUE); + + error = validation::agent::call::validate(call); + EXPECT_SOME(error); + EXPECT_EQ( + "'launch_container.command' is invalid: Environment variable " + "'ENV_VAR_KEY' of type 'VALUE' must have a value set", + error->message); + + // Invalid resources. + variable->set_value("env_var_value"); + + Resource cpus; + cpus.set_type(Value::SCALAR); + cpus.set_name("cpus"); + cpus.mutable_scalar()->set_value(-0.1); + + launch->add_resources()->CopyFrom(cpus); + + error = validation::agent::call::validate(call); + EXPECT_SOME(error); + EXPECT_EQ( + "Invalid resources: Resource 'cpus:-0.1' is invalid: " + "Invalid scalar resource: value <= 0", + error->message); + + // Invalid 'ContainerInfo'. 
+ launch->clear_resources(); + cpus.mutable_scalar()->set_value(0.1); + launch->add_resources()->CopyFrom(cpus); + + launch->mutable_container()->set_type(ContainerInfo::DOCKER); + + error = validation::agent::call::validate(call); + EXPECT_SOME(error); + EXPECT_EQ( + "'launch_container.container' is invalid: DockerInfo 'docker' is not set " + "for DOCKER typed ContainerInfo", + error->message); + + // Container with 'share_cgroups' set to 'true', which is not allowed for + // containers with no parent. + launch->mutable_container()->set_type(ContainerInfo::MESOS); + launch->mutable_container()->mutable_linux_info()->set_share_cgroups(true); + + error = validation::agent::call::validate(call
[mesos] 01/05: Added master validation for task resource limits and shared cgroups.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 504af58dda64b73581b0398c83952421aea64d39 Author: Greg Mann AuthorDate: Fri Mar 20 10:35:37 2020 -0700 Added master validation for task resource limits and shared cgroups. Review: https://reviews.apache.org/r/72216/ --- src/master/validation.cpp | 230 -- 1 file changed, 224 insertions(+), 6 deletions(-) diff --git a/src/master/validation.cpp b/src/master/validation.cpp index 084f281..5b1bcb5 100644 --- a/src/master/validation.cpp +++ b/src/master/validation.cpp @@ -17,6 +17,7 @@ #include "master/validation.hpp" #include +#include #include #include #include @@ -29,6 +30,7 @@ #include #include +#include #include #include @@ -45,6 +47,8 @@ #include "master/master.hpp" +using process::Owned; + using process::http::authentication::Principal; using std::pair; @@ -1540,6 +1544,77 @@ Option validateContainerInfo(const TaskInfo& task) } +Option validateResourceLimits( +const TaskInfo& task, +Slave* slave) +{ + auto limits = task.limits(); + + if (!limits.empty()) { +if (!slave->capabilities.taskResourceLimits) { + return Error("Agent is not capable of handling task resource limits"); +} + +// Ensure that only "cpus" and "mem" are included. 
+const size_t cpuCount = limits.count("cpus"); +const size_t memCount = limits.count("mem"); + +if (limits.size() > cpuCount + memCount) { + return Error( + "Only cpus and mem may be included in a task's resource limits"); +} + +if (cpuCount) { + Option taskCpus = Resources(task.resources()).cpus(); + if (taskCpus.isNone()) { +return Error( +"When a CPU limit is specified, a CPU request must also be " +"specified"); + } + + if (limits.at("cpus").value() < taskCpus.get()) { +return Error( +"The cpu limit must be greater than or equal to the cpu request"); + } +} + +if (memCount) { + Option taskMem = Resources(task.resources()).mem(); + if (taskMem.isNone()) { +return Error( +"When a memory limit is specified, a memory request must also be " +"specified"); + } + + if (!std::isinf(limits.at("mem").value()) && + Bytes(limits.at("mem").value(), Bytes::MEGABYTES) < taskMem.get()) { +return Error( +"The memory limit must be greater" +" than or equal to the memory request"); + } +} + } + + return None(); +} + + +// This validation function should only be executed for tasks which are launched +// via the LAUNCH operation, not the LAUNCH_GROUP operation. +Option validateShareCgroups(const TaskInfo& task) +{ + if (task.has_container() && + task.container().has_linux_info() && + task.container().linux_info().has_share_cgroups() && + !task.container().linux_info().share_cgroups()) { +return Error( +"Only tasks in a task group may have 'share_cgroups' set to 'false'"); + } + + return None(); +} + + // Validates task specific fields except its executor (if it exists). 
Option validateTask( const TaskInfo& task, @@ -1561,7 +1636,8 @@ Option validateTask( lambda::bind(internal::validateHealthCheck, task), lambda::bind(internal::validateResources, task), lambda::bind(internal::validateCommandInfo, task), -lambda::bind(internal::validateContainerInfo, task) +lambda::bind(internal::validateContainerInfo, task), +lambda::bind(internal::validateResourceLimits, task, slave) }; foreach (const lambda::function()>& validator, validators) { @@ -1659,6 +1735,15 @@ Option validateExecutor( << "in future releases."; } +if (executor.has_container() && +executor.container().has_linux_info() && +executor.container().linux_info().has_share_cgroups() && +executor.container().linux_info().share_cgroups()) { + return Error( + "The 'share_cgroups' field cannot be set to 'true'" + " on executor containers"); +} + if (!slave->hasExecutor(framework->id(), task.executor().executor_id())) { total += executorResources; } @@ -1698,7 +1783,8 @@ Option validate( vector()>> validators = { lambda::bind(internal::validateTask, task, framework, slave), -lambda::bind(internal::validateExecutor, task, framework, slave, offered) +lambda::bind(internal::validateExecutor, task, framework, slave, offered), +l
[mesos] branch master updated (4d9013d -> 59ba377)
This is an automated email from the ASF dual-hosted git repository. grag pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git. from 4d9013d Added test for removal of ObjectApprovers of disconnected framework. new f445e3a Added the 'TASK_RESOURCE_LIMITS' agent capability. new 59ba377 Cleaned up agent capability validation and associated docs. The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: docs/configuration/agent.md | 7 +-- docs/upgrades.md | 5 + include/mesos/mesos.proto | 4 include/mesos/v1/mesos.proto | 4 src/common/protobuf_utils.cpp | 3 ++- src/common/protobuf_utils.hpp | 7 +++ src/slave/constants.cpp | 1 + src/slave/flags.cpp | 26 ++ src/tests/master_tests.cpp| 3 ++- src/tests/slave_tests.cpp | 3 ++- 10 files changed, 42 insertions(+), 21 deletions(-)
[mesos] 02/02: Cleaned up agent capability validation and associated docs.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 59ba377616d10c248e4f0607a71ecf6658084e59 Author: Greg Mann AuthorDate: Tue Mar 3 06:03:58 2020 -0800 Cleaned up agent capability validation and associated docs. Review: https://reviews.apache.org/r/72087/ --- docs/configuration/agent.md | 4 +++- src/slave/flags.cpp | 20 +--- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/docs/configuration/agent.md b/docs/configuration/agent.md index 1498df4..01ffa38 100644 --- a/docs/configuration/agent.md +++ b/docs/configuration/agent.md @@ -93,7 +93,8 @@ Example: JSON representation of agent features to whitelist. We always require 'MULTI_ROLE', 'HIERARCHICAL_ROLE', 'RESERVATION_REFINEMENT', -'AGENT_OPERATION_FEEDBACK', 'AGENT_DRAINING', and 'TASK_RESOURCE_LIMITS'. +'AGENT_OPERATION_FEEDBACK', 'RESOURCE_PROVIDER', 'AGENT_DRAINING', and +'TASK_RESOURCE_LIMITS'. Example: @@ -103,6 +104,7 @@ Example: {"type": "HIERARCHICAL_ROLE"}, {"type": "RESERVATION_REFINEMENT"}, {"type": "AGENT_OPERATION_FEEDBACK"}, +{"type": "RESOURCE_PROVIDER"}, {"type": "AGENT_DRAINING"}, {"type": "TASK_RESOURCE_LIMITS"} ] diff --git a/src/slave/flags.cpp b/src/slave/flags.cpp index 5966436..2f88b90 100644 --- a/src/slave/flags.cpp +++ b/src/slave/flags.cpp @@ -812,7 +812,7 @@ mesos::internal::slave::Flags::Flags() "agent_features", "JSON representation of agent features to whitelist. 
We always require\n" "'MULTI_ROLE', 'HIERARCHICAL_ROLE', 'RESERVATION_REFINEMENT',\n" - "'AGENT_OPERATION_FEEDBACK', 'AGENT_DRAINING', and\n" + "'AGENT_OPERATION_FEEDBACK', 'RESOURCE_PROVIDER', 'AGENT_DRAINING', and\n" "'TASK_RESOURCE_LIMITS'.\n" "\n" "Example:\n" @@ -822,6 +822,7 @@ mesos::internal::slave::Flags::Flags() "{\"type\": \"HIERARCHICAL_ROLE\"},\n" "{\"type\": \"RESERVATION_REFINEMENT\"},\n" "{\"type\": \"AGENT_OPERATION_FEEDBACK\"},\n" + "{\"type\": \"RESOURCE_PROVIDER\"},\n" "{\"type\": \"AGENT_DRAINING\"},\n" "{\"type\": \"TASK_RESOURCE_LIMITS\"}\n" "]\n" @@ -836,25 +837,14 @@ mesos::internal::slave::Flags::Flags() !capabilities.hierarchicalRole || !capabilities.reservationRefinement || !capabilities.agentOperationFeedback || + !capabilities.resourceProvider || !capabilities.agentDraining || !capabilities.taskResourceLimits) { return Error( "At least the following agent features need to be enabled:" " MULTI_ROLE, HIERARCHICAL_ROLE, RESERVATION_REFINEMENT," -" AGENT_OPERATION_FEEDBACK, AGENT_DRAINING, and" -" TASK_RESOURCE_LIMITS"); - } - - if (capabilities.resizeVolume && !capabilities.resourceProvider) { -return Error( -"RESIZE_VOLUME feature requires RESOURCE_PROVIDER feature"); - } - - if (capabilities.agentOperationFeedback && - !capabilities.resourceProvider) { -return Error( -"AGENT_OPERATION_FEEDBACK feature" -" requires RESOURCE_PROVIDER feature"); +" AGENT_OPERATION_FEEDBACK, RESOURCE_PROVIDER, AGENT_DRAINING," +" and TASK_RESOURCE_LIMITS"); } }
[mesos] 01/02: Added the 'TASK_RESOURCE_LIMITS' agent capability.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit f445e3aea44b4060292fa5e029dbb2c19e219c25 Author: Greg Mann AuthorDate: Tue Mar 3 06:03:57 2020 -0800 Added the 'TASK_RESOURCE_LIMITS' agent capability. This capability will be used by the master to detect whether or not an agent can handle task resource limits. Review: https://reviews.apache.org/r/71991/ --- docs/configuration/agent.md | 5 +++-- docs/upgrades.md | 5 + include/mesos/mesos.proto | 4 include/mesos/v1/mesos.proto | 4 src/common/protobuf_utils.cpp | 3 ++- src/common/protobuf_utils.hpp | 7 +++ src/slave/constants.cpp | 1 + src/slave/flags.cpp | 12 src/tests/master_tests.cpp| 3 ++- src/tests/slave_tests.cpp | 3 ++- 10 files changed, 38 insertions(+), 9 deletions(-) diff --git a/docs/configuration/agent.md b/docs/configuration/agent.md index 0e703d8..1498df4 100644 --- a/docs/configuration/agent.md +++ b/docs/configuration/agent.md @@ -93,7 +93,7 @@ Example: JSON representation of agent features to whitelist. We always require 'MULTI_ROLE', 'HIERARCHICAL_ROLE', 'RESERVATION_REFINEMENT', -'AGENT_OPERATION_FEEDBACK', and 'AGENT_DRAINING'. +'AGENT_OPERATION_FEEDBACK', 'AGENT_DRAINING', and 'TASK_RESOURCE_LIMITS'. Example: @@ -103,7 +103,8 @@ Example: {"type": "HIERARCHICAL_ROLE"}, {"type": "RESERVATION_REFINEMENT"}, {"type": "AGENT_OPERATION_FEEDBACK"}, -{"type": "AGENT_DRAINING"} +{"type": "AGENT_DRAINING"}, +{"type": "TASK_RESOURCE_LIMITS"} ] } diff --git a/docs/upgrades.md b/docs/upgrades.md index afd9dbb..1e73e3d 100644 --- a/docs/upgrades.md +++ b/docs/upgrades.md @@ -54,6 +54,7 @@ We categorize the changes as follows: + C agent_features @@ -558,6 +559,10 @@ We categorize the changes as follows: The canonical name for the environment variable `LIBPROCESS_SSL_REQUIRE_CERT` was changed to `LIBPROCESS_SSL_REQUIRE_CLIENT_CERT`. 
The old names will continue to work as before, but operators are encouraged to update their configuration to reduce confusion. + + +* The Mesos agent now requires the new `TASK_RESOURCE_LIMITS` feature. This capability is set by default, but if the `--agent_features` flag is specified explicitly, `TASK_RESOURCE_LIMITS` must be included. + ## Upgrading from 1.8.x to 1.9.x ## diff --git a/include/mesos/mesos.proto b/include/mesos/mesos.proto index d0aed5a..40c45de 100644 --- a/include/mesos/mesos.proto +++ b/include/mesos/mesos.proto @@ -1050,6 +1050,10 @@ message SlaveInfo { // This expresses the ability for the agent to automatically drain tasks // in preparation for operator maintenance. This capability is required. AGENT_DRAINING = 7; + + // This expresses the ability for the agent to launch tasks which specify + // resource limits for CPU and/or memory. + TASK_RESOURCE_LIMITS = 8; } // Enum fields should be optional, see: MESOS-4997. diff --git a/include/mesos/v1/mesos.proto b/include/mesos/v1/mesos.proto index 06c4816..6387636 100644 --- a/include/mesos/v1/mesos.proto +++ b/include/mesos/v1/mesos.proto @@ -1038,6 +1038,10 @@ message AgentInfo { // This expresses the ability for the agent to automatically drain tasks // in preparation for operator maintenance. This capability is required. AGENT_DRAINING = 7; + + // This expresses the ability for the agent to launch tasks which specify + // resource limits for CPU and/or memory. + TASK_RESOURCE_LIMITS = 8; } // Enum fields should be optional, see: MESOS-4997. 
diff --git a/src/common/protobuf_utils.cpp b/src/common/protobuf_utils.cpp index 7fe4a44..b3057be 100644 --- a/src/common/protobuf_utils.cpp +++ b/src/common/protobuf_utils.cpp @@ -1140,7 +1140,8 @@ bool operator==(const Capabilities& left, const Capabilities& right) left.resourceProvider == right.resourceProvider && left.resizeVolume == right.resizeVolume && left.agentOperationFeedback == right.agentOperationFeedback && - left.agentDraining == right.agentDraining; + left.agentDraining == right.agentDraining && + left.taskResourceLimits == right.taskResourceLimits; } diff --git a/src/common/protobuf_utils.hpp b/src/common/protobuf_utils.hpp index 3852f59..0558249 100644 --- a/src/common/protobuf_utils.hpp +++ b/src/common/protobuf_utils.hpp @@ -361,6 +361,9 @@ struct Capabilities case SlaveInfo::Capability::AGENT_DRAINING: agentDraining = true; break; +case SlaveInfo::Capability::TASK_RESOURCE_LIMITS: +
[mesos] branch master updated (1dd099f -> 4bb7ef9)
This is an automated email from the ASF dual-hosted git repository. grag pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git. from 1dd099f Removed dead code from v1 API serialization changes. new e258059 SSL Socket: Moved accept callback logic into protected function. new 599b9e8 Reverted SSL Socket guard against downgrade. new 4bb7ef9 SSL Socket: Added downgrade support. The 3 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: 3rdparty/libprocess/src/openssl.cpp| 6 - 3rdparty/libprocess/src/ssl/openssl_socket.cpp | 247 - 3rdparty/libprocess/src/ssl/openssl_socket.hpp | 6 + 3rdparty/libprocess/src/tests/ssl_tests.cpp| 3 - 4 files changed, 168 insertions(+), 94 deletions(-)
[mesos] 03/03: SSL Socket: Added downgrade support.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 4bb7ef93a9555b4c40efa4a1000b560e58ac9858 Author: Joseph Wu AuthorDate: Wed Feb 26 17:14:59 2020 +0100 SSL Socket: Added downgrade support. This adds downgrade support, in the same fashion that the Libevent SSL socket does (and copies a good chunk of the code from there too). To account for Windows not having `io::poll`, a slight hack is taken to check for readable bytes. Review: https://reviews.apache.org/r/72017/ --- 3rdparty/libprocess/src/ssl/openssl_socket.cpp | 66 ++ 1 file changed, 66 insertions(+) diff --git a/3rdparty/libprocess/src/ssl/openssl_socket.cpp b/3rdparty/libprocess/src/ssl/openssl_socket.cpp index 43909f0..a2ec0a3 100644 --- a/3rdparty/libprocess/src/ssl/openssl_socket.cpp +++ b/3rdparty/libprocess/src/ssl/openssl_socket.cpp @@ -573,6 +573,72 @@ Future> OpenSSLSocketImpl::accept() return Break(); } + // If we support downgrading the connection, first wait for this + // socket to become readable. We will then MSG_PEEK it to test + // whether we want to dispatch as SSL or non-SSL. + if (openssl::flags().support_downgrade) { +#ifdef __WINDOWS__ +// Since there is no `io::poll` on Windows, we instead make +// a 0-byte read, which will only return once there is something +// to read. +return io::read(socket->get(), nullptr, 0) +#else +return io::poll(socket->get(), process::io::READ) +#endif // __WINDOWS__ + .then([weak_self, socket]() -> Future> { +std::shared_ptr self(weak_self.lock()); + +if (self == nullptr) { + return Break(); +} + +char data[6]; + +// Try to peek the first 6 bytes of the message. +ssize_t size = ::recv(socket->get(), data, 6, MSG_PEEK); + +// Based on the function 'ssl23_get_client_hello' in openssl, we +// test whether to dispatch to the SSL or non-SSL based accept +// based on the following rules: +// 1. If there are fewer than 3 bytes: non-SSL. +// 2. 
If the 1st bit of the 1st byte is set AND the 3rd byte +// is equal to SSL2_MT_CLIENT_HELLO: SSL. +// 3. If the 1st byte is equal to SSL3_RT_HANDSHAKE AND the +// 2nd byte is equal to SSL3_VERSION_MAJOR and the 6th byte +// is equal to SSL3_MT_CLIENT_HELLO: SSL. +// 4. Otherwise: non-SSL. + +// For an ascii based protocol to falsely get dispatched to SSL +// it needs to: +// 1. Start with an invalid ascii character (0x80). +// 2. OR have the first 2 characters be a SYN followed by ETX, +// and then the 6th character be SOH. +// These conditions clearly do not constitute valid HTTP +// requests, and are unlikely to collide with other existing +// protocols. + +bool ssl = false; // Default to rule 4. + +if (size < 2) { // Rule 1. + ssl = false; +} else if ((data[0] & 0x80) && + data[2] == SSL2_MT_CLIENT_HELLO) { // Rule 2. + ssl = true; +} else if (data[0] == SSL3_RT_HANDSHAKE && + data[1] == SSL3_VERSION_MAJOR && + data[5] == SSL3_MT_CLIENT_HELLO) { // Rule 3. + ssl = true; +} + +if (ssl) { + return self->handle_accept_callback(socket); +} else { + self->accept_queue.put(socket); + return Continue(); +} + }); + } + return self->handle_accept_callback(socket); });
[mesos] 02/03: Reverted SSL Socket guard against downgrade.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 599b9e83c6c0659e12b0bccaf7c610b70158737c Author: Joseph Wu AuthorDate: Wed Feb 26 17:14:55 2020 +0100 Reverted SSL Socket guard against downgrade. This reverts commit 34bac34419ebec8441e69d3a5684381468352399. Review: https://reviews.apache.org/r/72016/ --- 3rdparty/libprocess/src/openssl.cpp | 6 -- 3rdparty/libprocess/src/tests/ssl_tests.cpp | 3 --- 2 files changed, 9 deletions(-) diff --git a/3rdparty/libprocess/src/openssl.cpp b/3rdparty/libprocess/src/openssl.cpp index b2dd2fe..ec7d6e8 100644 --- a/3rdparty/libprocess/src/openssl.cpp +++ b/3rdparty/libprocess/src/openssl.cpp @@ -550,14 +550,8 @@ void reinitialize() // Notify users of the 'SSL_SUPPORT_DOWNGRADE' flag that this // setting allows insecure connections. if (ssl_flags->support_downgrade) { -#ifdef USE_LIBEVENT LOG(WARNING) << "Failed SSL connections will be downgraded to a non-SSL socket"; -#else -EXIT(EXIT_FAILURE) - << "Non-libevent SSL sockets do not support downgrade yet," - << " see MESOS-10073"; -#endif // USE_LIBEVENT } // TODO(bevers): Remove the deprecated names for these flags after an diff --git a/3rdparty/libprocess/src/tests/ssl_tests.cpp b/3rdparty/libprocess/src/tests/ssl_tests.cpp index a6563fb..3f1d103 100644 --- a/3rdparty/libprocess/src/tests/ssl_tests.cpp +++ b/3rdparty/libprocess/src/tests/ssl_tests.cpp @@ -483,8 +483,6 @@ TEST_F(SSLTest, ECDHESupport) } -// TODO(josephw): Support downgrades on the native OpenSSL socket (MESOS-10073). -#ifdef USE_LIBEVENT // Ensure we can communicate between a POLL based socket and an SSL // socket if 'SSL_SUPPORT_DOWNGRADE' is enabled. TEST_F(SSLTest, ValidDowngrade) @@ -583,7 +581,6 @@ TEST_F(SSLTest, ValidDowngradeEachProtocol) AWAIT_ASSERT_READY(await_subprocess(client.get(), 0)); } } -#endif // USE_LIBEVENT // For each protocol: ensure we CANNOT communicate between a POLL
[mesos] 01/03: SSL Socket: Moved accept callback logic into protected function.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit e258059d49779a02b6267a9d0829bd244b882ed5 Author: Joseph Wu AuthorDate: Wed Feb 26 17:14:50 2020 +0100 SSL Socket: Moved accept callback logic into protected function. To support SSL downgrades, this logic will need to be called from two potential callsites. Also fixes a slight typo in a comment within the moved code. Review: https://reviews.apache.org/r/72014/ --- 3rdparty/libprocess/src/ssl/openssl_socket.cpp | 185 + 3rdparty/libprocess/src/ssl/openssl_socket.hpp | 6 + 2 files changed, 104 insertions(+), 87 deletions(-) diff --git a/3rdparty/libprocess/src/ssl/openssl_socket.cpp b/3rdparty/libprocess/src/ssl/openssl_socket.cpp index 74f9fe2..43909f0 100644 --- a/3rdparty/libprocess/src/ssl/openssl_socket.cpp +++ b/3rdparty/libprocess/src/ssl/openssl_socket.cpp @@ -573,93 +573,7 @@ Future> OpenSSLSocketImpl::accept() return Break(); } - // Wrap this new socket up into our SSL wrapper class by releasing - // the FD and creating a new OpenSSLSocketImpl object with the FD. - const std::shared_ptr ssl_socket = -std::make_shared(socket->release()); - - // Set up SSL object. - SSL* accept_ssl = SSL_new(openssl::context()); - if (accept_ssl == nullptr) { -self->accept_queue.put(Failure("Accept failed, SSL_new")); -return Continue(); - } - - Try peer_address = network::peer(ssl_socket->get()); - if (!peer_address.isSome()) { -SSL_free(accept_ssl); -self->accept_queue.put( -Failure("Could not determine peer IP for connection")); -return Continue(); - } - - // NOTE: Right now, `openssl::configure_socket` does not do anything - // in server mode, but we still pass the correct peer address to - // enable modules to implement application-level logic in the future. 
- Try configured = openssl::configure_socket( - accept_ssl, Mode::SERVER, peer_address.get(), None()); - - if (configured.isError()) { -SSL_free(accept_ssl); -self->accept_queue.put( -Failure("Could not configure socket: " + configured.error())); -return Continue(); - } - - // Set the SSL context in server mode. - SSL_set_accept_state(accept_ssl); - - // Pass ownership of `accept_ssl` to the newly accepted socket, - // and wtart the SSL handshake. When the SSL handshake completes, - // the listening socket will place the result (failure or success) - // onto the listening socket's `accept_queue`. - // - // TODO(josephw): Add a timeout to catch/close incoming sockets which - // never finish the SSL handshake. - ssl_socket->set_ssl_and_do_handshake(accept_ssl) -.onAny([weak_self, ssl_socket](Future result) { - std::shared_ptr self(weak_self.lock()); - - if (self == nullptr) { -return; - } - - if (result.isFailed()) { -self->accept_queue.put(Failure(result.failure())); -return; - } - - // For verification purposes, we need to grab the address (again). - Try address = network::address(ssl_socket->get()); - if (address.isError()) { -self->accept_queue.put( -Failure("Failed to get address: " + address.error())); -return; - } - - Try inet_address = -network::convert(address.get()); - - Try verify = openssl::verify( - ssl_socket->ssl, - Mode::SERVER, - None(), - inet_address.isSome() -? Some(inet_address->ip) -: Option::none()); - - if (verify.isError()) { -VLOG(1) << "Failed accept, verification error: " -<< verify.error(); - -self->accept_queue.put(Failure(verify.error())); -return; - } - - self->accept_queue.put(ssl_socket); -}); - - return Continue(); + return self->handle_accept_callback(socket); }); accept_loop_started.done(); @@ -735,6 +649,103 @@ Try OpenSSLSocketImpl::shutdown(int how) } +Future> OpenSSLSocketImpl::handle_accept_callback( +const std::shared_ptr& socket) +{ + // Wrap this new socket up into our SSL wrapper class by releasing + // the FD and creat
[mesos] branch master updated: Removed remaining domain socket code from the Windows build.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new 65e18be Removed remaining domain socket code from the Windows build. 65e18be is described below commit 65e18bef2c5ff356ef74bac9aa79b128c5b186d9 Author: Greg Mann AuthorDate: Tue Feb 11 10:35:10 2020 -0800 Removed remaining domain socket code from the Windows build. These changes are needed to get the tests to run. Review: https://reviews.apache.org/r/72114/ --- src/slave/flags.hpp | 2 ++ src/slave/slave.cpp | 2 ++ src/tests/command_executor_tests.cpp | 2 ++ 3 files changed, 6 insertions(+) diff --git a/src/slave/flags.hpp b/src/slave/flags.hpp index 838aaee..c3ff887 100644 --- a/src/slave/flags.hpp +++ b/src/slave/flags.hpp @@ -182,8 +182,10 @@ public: #ifdef USE_SSL_SOCKET bool authenticate_http_executors; #endif // USE_SSL_SOCKET +#ifndef __WINDOWS__ bool http_executor_domain_sockets; Option domain_socket_location; +#endif // __WINDOWS__ Option http_credentials; Option hooks; Option secret_resolver; diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp index 75bf595..cce275a 100644 --- a/src/slave/slave.cpp +++ b/src/slave/slave.cpp @@ -11176,12 +11176,14 @@ map executorEnvironment( environment["MESOS_HTTP_COMMAND_EXECUTOR"] = flags.http_command_executor ? "1" : "0"; +#ifndef __WINDOWS__ if (flags.http_executor_domain_sockets) { // If `http_executor_domain_sockets` is true, the location should have // been set either by the user or automatically during agent startup. CHECK(flags.domain_socket_location.isSome()); environment["MESOS_DOMAIN_SOCKET"] = *flags.domain_socket_location; } +#endif // __WINDOWS__ // Set executor's shutdown grace period. If set, the customized value // from `ExecutorInfo` overrides the default from agent flags. 
diff --git a/src/tests/command_executor_tests.cpp b/src/tests/command_executor_tests.cpp index 73f8006..4118a52 100644 --- a/src/tests/command_executor_tests.cpp +++ b/src/tests/command_executor_tests.cpp @@ -496,6 +496,7 @@ TEST_P(CommandExecutorTest, AllocationRoleEnvironmentVariable) } +#ifndef __WINDOWS__ // This test checks that the command executor can communicate // with the agent using unix domain sockets, when the necessary // flags are set on the agent. @@ -572,6 +573,7 @@ TEST_P(CommandExecutorTest, ExecutorDomainSockets) driver.stop(); driver.join(); } +#endif // __WINDOWS__ class HTTPCommandExecutorTest
[mesos] branch master updated: Added a new task status reason.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new 42e9b88 Added a new task status reason. 42e9b88 is described below commit 42e9b889900be9934dbb5ffc06504b21a3d8c206 Author: Greg Mann AuthorDate: Thu Jan 16 18:40:50 2020 -0800 Added a new task status reason. The new reason will be sent to frameworks when one of their tasks is OOM-killed on an agent while the task is exceeding its memory request. Review: https://reviews.apache.org/r/71935/ --- include/mesos/mesos.proto| 1 + include/mesos/v1/mesos.proto | 1 + 2 files changed, 2 insertions(+) diff --git a/include/mesos/mesos.proto b/include/mesos/mesos.proto index b0f5905..d0aed5a 100644 --- a/include/mesos/mesos.proto +++ b/include/mesos/mesos.proto @@ -2636,6 +2636,7 @@ message TaskStatus { REASON_CONTAINER_LIMITATION = 19; REASON_CONTAINER_LIMITATION_DISK = 20; REASON_CONTAINER_LIMITATION_MEMORY = 8; +REASON_CONTAINER_MEMORY_REQUEST_EXCEEDED = 35; REASON_CONTAINER_PREEMPTED = 17; REASON_CONTAINER_UPDATE_FAILED = 22; REASON_MAX_COMPLETION_TIME_REACHED = 33; diff --git a/include/mesos/v1/mesos.proto b/include/mesos/v1/mesos.proto index 53a7b9b..06c4816 100644 --- a/include/mesos/v1/mesos.proto +++ b/include/mesos/v1/mesos.proto @@ -2625,6 +2625,7 @@ message TaskStatus { REASON_CONTAINER_LIMITATION = 19; REASON_CONTAINER_LIMITATION_DISK = 20; REASON_CONTAINER_LIMITATION_MEMORY = 8; +REASON_CONTAINER_MEMORY_REQUEST_EXCEEDED = 35; REASON_CONTAINER_PREEMPTED = 17; REASON_CONTAINER_UPDATE_FAILED = 22; REASON_MAX_COMPLETION_TIME_REACHED = 33;
[mesos] 02/02: Added MESOS-10041 to the 1.9.1 CHANGELOG.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch 1.9.x in repository https://gitbox.apache.org/repos/asf/mesos.git commit 95d34e7f30c6ca8a074bd2af0359ed96310adee3 Author: Greg Mann AuthorDate: Fri Nov 22 15:19:30 2019 -0800 Added MESOS-10041 to the 1.9.1 CHANGELOG. --- CHANGELOG | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG b/CHANGELOG index 398f479..08e8944 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -10,6 +10,7 @@ Release Notes - Mesos - Version 1.9.1 (WIP) * [MESOS-10007] - Command executor can miss exit status for short-lived commands due to double-reaping. * [MESOS-10008] - Very large quota values can crash master. * [MESOS-10015] - updateAllocation() can stall the allocator with a huge number of reservations on an agent. + * [MESOS-10041] - Libprocess SSL verification can leak memory. ** Improvement * [MESOS-9889] - Master CPU high due to unexpected foreachkey behaviour in Master::__reregisterSlave.
[mesos] branch 1.9.x updated (c313168 -> 95d34e7)
This is an automated email from the ASF dual-hosted git repository. grag pushed a change to branch 1.9.x in repository https://gitbox.apache.org/repos/asf/mesos.git. from c313168 Garbage-collected lost tasks which are reported as running again. new a687e71 Fixed memory leak in openssl verification function. new 95d34e7 Added MESOS-10041 to the 1.9.1 CHANGELOG. The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: 3rdparty/libprocess/src/openssl.cpp | 18 +- CHANGELOG | 1 + 2 files changed, 6 insertions(+), 13 deletions(-)
[mesos] 01/02: Fixed memory leak in openssl verification function.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch 1.9.x in repository https://gitbox.apache.org/repos/asf/mesos.git commit a687e71790151b5b078322e825ade349cc6922dc Author: Benno Evers AuthorDate: Fri Nov 22 12:00:43 2019 -0800 Fixed memory leak in openssl verification function. When the hostname validation scheme was set to 'openssl', the `openssl::verify()` function would return without freeing a previously allocated `X509*` object. To fix the leak, a long-standing TODO to switch to RAII-based memory management for the certificate was resolved. Review: https://reviews.apache.org/r/71805/ --- 3rdparty/libprocess/src/openssl.cpp | 18 +- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/3rdparty/libprocess/src/openssl.cpp b/3rdparty/libprocess/src/openssl.cpp index a81d13f..7eb5822 100644 --- a/3rdparty/libprocess/src/openssl.cpp +++ b/3rdparty/libprocess/src/openssl.cpp @@ -841,8 +841,9 @@ Try<Nothing> verify( } // The X509 object must be freed if this call succeeds. - // TODO(jmlvanre): handle this better. How about RAII? - X509* cert = SSL_get_peer_certificate(ssl); + std::unique_ptr<X509, decltype(&X509_free)> cert( + SSL_get_peer_certificate(ssl), + X509_free); // NOTE: Even without this check, the OpenSSL handshake will not complete // when connecting to servers that do not present a certificate, unless an @@ -852,7 +853,6 @@ Try<Nothing> verify( } if (SSL_get_verify_result(ssl) != X509_V_OK) { -X509_free(cert); return Error("Could not verify peer certificate"); } @@ -896,7 +896,6 @@ Try<Nothing> verify( } if (!ssl_flags->verify_ipadd && peer_hostname.isNone()) { -X509_free(cert); return ssl_flags->require_client_cert ? Error("Cannot verify peer certificate: peer hostname unknown") : Try<Nothing>(Nothing()); @@ -908,7 +907,7 @@ Try<Nothing> verify( // physical host. 
STACK_OF(GENERAL_NAME)* san_names = reinterpret_cast<STACK_OF(GENERAL_NAME)*>(X509_get_ext_d2i( -reinterpret_cast<X509*>(cert), +cert.get(), NID_subject_alt_name, nullptr, nullptr)); @@ -931,7 +930,6 @@ Try<Nothing> verify( const size_t length = ASN1_STRING_length(current_name->d.dNSName); if (length != dns_name.length()) { sk_GENERAL_NAME_pop_free(san_names, GENERAL_NAME_free); - X509_free(cert); return Error( "X509 certificate malformed: " "embedded NUL character in DNS name"); @@ -941,7 +939,6 @@ Try<Nothing> verify( // Compare expected hostname with the DNS name. if (peer_hostname.get() == dns_name) { sk_GENERAL_NAME_pop_free(san_names, GENERAL_NAME_free); -X509_free(cert); VLOG(2) << "dNSName match found for " << peer_hostname.get(); @@ -966,7 +963,6 @@ Try<Nothing> verify( if (ip.get() == ip_add) { sk_GENERAL_NAME_pop_free(san_names, GENERAL_NAME_free); -X509_free(cert); VLOG(2) << "iPAddress match found for " << ip.get(); @@ -985,7 +981,7 @@ Try<Nothing> verify( if (peer_hostname.isSome()) { // If we still haven't verified the hostname, try doing it via // the certificate subject name. -X509_NAME* name = X509_get_subject_name(cert); +X509_NAME* name = X509_get_subject_name(cert.get()); if (name != nullptr) { char text[MAXHOSTNAMELEN] {}; @@ -998,7 +994,6 @@ Try<Nothing> verify( VLOG(2) << "Matching common name: " << text; if (peer_hostname.get() != text) { - X509_free(cert); return Error( "Presented Certificate Name: " + stringify(text) + " does not match peer hostname name: " + peer_hostname.get()); @@ -1006,15 +1001,12 @@ Try<Nothing> verify( VLOG(2) << "Common name match found for " << peer_hostname.get(); -X509_free(cert); return Nothing(); } } } // If we still haven't exited, we haven't verified it, and we give up. - X509_free(cert); - std::vector<std::string> details; if (peer_hostname.isSome()) {
[mesos] branch master updated: Added MESOS-10041 to the 1.9.1 CHANGELOG.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new d10a33a Added MESOS-10041 to the 1.9.1 CHANGELOG. d10a33a is described below commit d10a33acc426dda9e34db995f16450faf898bb3b Author: Greg Mann AuthorDate: Fri Nov 22 15:19:30 2019 -0800 Added MESOS-10041 to the 1.9.1 CHANGELOG. --- CHANGELOG | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG b/CHANGELOG index cf6311a..21d21d3 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -17,6 +17,7 @@ Release Notes - Mesos - Version 1.9.1 (WIP) * [MESOS-10007] - Command executor can miss exit status for short-lived commands due to double-reaping. * [MESOS-10008] - Very large quota values can crash master. * [MESOS-10015] - updateAllocation() can stall the allocator with a huge number of reservations on an agent. + * [MESOS-10041] - Libprocess SSL verification can leak memory. ** Improvement * [MESOS-9889] - Master CPU high due to unexpected foreachkey behaviour in Master::__reregisterSlave.
[mesos] branch master updated: Fixed memory leak in openssl verification function.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new e52d0d1 Fixed memory leak in openssl verification function. e52d0d1 is described below commit e52d0d1f25a91f9940bea4329eb5359373ee0ed0 Author: Benno Evers AuthorDate: Fri Nov 22 12:00:43 2019 -0800 Fixed memory leak in openssl verification function. When the hostname validation scheme was set to 'openssl', the `openssl::verify()` function would return without freeing a previously allocated `X509*` object. To fix the leak, a long-standing TODO to switch to RAII-based memory management for the certificate was resolved. Review: https://reviews.apache.org/r/71805/ --- 3rdparty/libprocess/src/openssl.cpp | 18 +- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/3rdparty/libprocess/src/openssl.cpp b/3rdparty/libprocess/src/openssl.cpp index bd05866..8aab5ac 100644 --- a/3rdparty/libprocess/src/openssl.cpp +++ b/3rdparty/libprocess/src/openssl.cpp @@ -841,8 +841,9 @@ Try<Nothing> verify( } // The X509 object must be freed if this call succeeds. - // TODO(jmlvanre): handle this better. How about RAII? - X509* cert = SSL_get_peer_certificate(ssl); + std::unique_ptr<X509, decltype(&X509_free)> cert( + SSL_get_peer_certificate(ssl), + X509_free); // NOTE: Even without this check, the OpenSSL handshake will not complete // when connecting to servers that do not present a certificate, unless an @@ -852,7 +853,6 @@ Try<Nothing> verify( } if (SSL_get_verify_result(ssl) != X509_V_OK) { -X509_free(cert); return Error("Could not verify peer certificate"); } @@ -896,7 +896,6 @@ Try<Nothing> verify( } if (!ssl_flags->verify_ipadd && peer_hostname.isNone()) { -X509_free(cert); return ssl_flags->require_client_cert ? 
Error("Cannot verify peer certificate: peer hostname unknown") : Try<Nothing>(Nothing()); @@ -908,7 +907,7 @@ Try<Nothing> verify( // physical host. 
STACK_OF(GENERAL_NAME)* san_names = reinterpret_cast<STACK_OF(GENERAL_NAME)*>(X509_get_ext_d2i( -reinterpret_cast<X509*>(cert), +cert.get(), NID_subject_alt_name, nullptr, nullptr)); @@ -931,7 +930,6 @@ Try<Nothing> verify( const size_t length = ASN1_STRING_length(current_name->d.dNSName); if (length != dns_name.length()) { sk_GENERAL_NAME_pop_free(san_names, GENERAL_NAME_free); - X509_free(cert); return Error( "X509 certificate malformed: " "embedded NUL character in DNS name"); @@ -941,7 +939,6 @@ Try<Nothing> verify( // Compare expected hostname with the DNS name. if (peer_hostname.get() == dns_name) { sk_GENERAL_NAME_pop_free(san_names, GENERAL_NAME_free); -X509_free(cert); VLOG(2) << "dNSName match found for " << peer_hostname.get(); @@ -966,7 +963,6 @@ Try<Nothing> verify( if (ip.get() == ip_add) { sk_GENERAL_NAME_pop_free(san_names, GENERAL_NAME_free); -X509_free(cert); VLOG(2) << "iPAddress match found for " << ip.get(); @@ -985,7 +981,7 @@ Try<Nothing> verify( if (peer_hostname.isSome()) { // If we still haven't verified the hostname, try doing it via // the certificate subject name. -X509_NAME* name = X509_get_subject_name(cert); +X509_NAME* name = X509_get_subject_name(cert.get()); if (name != nullptr) { char text[MAXHOSTNAMELEN] {}; @@ -998,7 +994,6 @@ Try<Nothing> verify( VLOG(2) << "Matching common name: " << text; if (peer_hostname.get() != text) { - X509_free(cert); return Error( "Presented Certificate Name: " + stringify(text) + " does not match peer hostname name: " + peer_hostname.get()); @@ -1006,15 +1001,12 @@ Try<Nothing> verify( VLOG(2) << "Common name match found for " << peer_hostname.get(); -X509_free(cert); return Nothing(); } } } // If we still haven't exited, we haven't verified it, and we give up. - X509_free(cert); - std::vector<std::string> details; if (peer_hostname.isSome()) {
[mesos] branch 1.9.x updated: Added MESOS-9965 to the 1.9.1 CHANGELOG.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch 1.9.x in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/1.9.x by this push: new c115fb0 Added MESOS-9965 to the 1.9.1 CHANGELOG. c115fb0 is described below commit c115fb0e4842f5c1211c7464d38a7a67994f08ae Author: Greg Mann AuthorDate: Thu Sep 12 19:57:35 2019 -0700 Added MESOS-9965 to the 1.9.1 CHANGELOG. --- CHANGELOG | 8 1 file changed, 8 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 876c303..98bbaa0 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,11 @@ +Release Notes - Mesos - Version 1.9.1 (WIP) +--- +* This is a bug fix release. + +** Bug + * [MESOS-9965] - Agent should not send `TASK_GONE_BY_OPERATOR` if the framework is not partition aware. + + Release Notes - Mesos - Version 1.9.0 - This release contains the following highlights:
[mesos] branch master updated: Added MESOS-9965 to the 1.9.1 CHANGELOG.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new 2e26323 Added MESOS-9965 to the 1.9.1 CHANGELOG. 2e26323 is described below commit 2e26323eb98305e531050b62421ea97c63c1b79b Author: Greg Mann AuthorDate: Thu Sep 12 19:57:35 2019 -0700 Added MESOS-9965 to the 1.9.1 CHANGELOG. --- CHANGELOG | 8 1 file changed, 8 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 876c303..98bbaa0 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,11 @@ +Release Notes - Mesos - Version 1.9.1 (WIP) +--- +* This is a bug fix release. + +** Bug + * [MESOS-9965] - Agent should not send `TASK_GONE_BY_OPERATOR` if the framework is not partition aware. + + Release Notes - Mesos - Version 1.9.0 - This release contains the following highlights:
[mesos] branch 1.9.x updated: Fixed a bug for non-partition-aware schedulers.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch 1.9.x in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/1.9.x by this push: new d8520b0 Fixed a bug for non-partition-aware schedulers. d8520b0 is described below commit d8520b0b4bf52fd27be45817934e2af1b871c399 Author: Greg Mann AuthorDate: Thu Sep 12 16:33:20 2019 -0700 Fixed a bug for non-partition-aware schedulers. Previously, the agent would send task status updates with the state TASK_GONE_BY_OPERATOR to all schedulers when an agent was drained with the `mark_gone` parameter set to `true`. This patch updates this code to ensure that TASK_GONE_BY_OPERATOR is only sent to partition-aware schedulers. Review: https://reviews.apache.org/r/71480/ --- src/slave/slave.cpp | 69 --- src/tests/slave_tests.cpp | 20 +++--- 2 files changed, 52 insertions(+), 37 deletions(-) diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp index 4e93656..96890d3 100644 --- a/src/slave/slave.cpp +++ b/src/slave/slave.cpp @@ -5773,40 +5773,6 @@ void Slave::statusUpdate(StatusUpdate update, const Option& pid) update.mutable_status()->set_source( pid == UPID() ? TaskStatus::SOURCE_SLAVE : TaskStatus::SOURCE_EXECUTOR); - // If the agent is draining we provide additional - // information for KILLING or KILLED states. - if (drainConfig.isSome()) { -switch (update.status().state()) { - case TASK_STAGING: - case TASK_STARTING: - case TASK_RUNNING: - case TASK_FAILED: - case TASK_FINISHED: - case TASK_ERROR: - case TASK_LOST: - case TASK_DROPPED: - case TASK_UNREACHABLE: - case TASK_GONE: - case TASK_GONE_BY_OPERATOR: - case TASK_UNKNOWN: { -break; - } - case TASK_KILLING: - case TASK_KILLED: { -// We unconditionally overwrite any previous reason to provide a -// consistent signal that this task went away during draining. 
-update.mutable_status()->set_reason(TaskStatus::REASON_SLAVE_DRAINING); - -// If the draining marks the agent as gone report tasks as -// gone by operator. -if (drainConfig->mark_gone()) { - update.mutable_status()->set_state(TASK_GONE_BY_OPERATOR); -} -break; - } -} - } - // Set TaskStatus.executor_id if not already set; overwrite existing // value if already set. if (update.has_executor_id()) { @@ -5843,6 +5809,41 @@ void Slave::statusUpdate(StatusUpdate update, const Option& pid) return; } + // If the agent is draining we provide additional + // information for KILLING or KILLED states. + if (drainConfig.isSome()) { +switch (update.status().state()) { + case TASK_STAGING: + case TASK_STARTING: + case TASK_RUNNING: + case TASK_FAILED: + case TASK_FINISHED: + case TASK_ERROR: + case TASK_LOST: + case TASK_DROPPED: + case TASK_UNREACHABLE: + case TASK_GONE: + case TASK_GONE_BY_OPERATOR: + case TASK_UNKNOWN: { +break; + } + case TASK_KILLING: + case TASK_KILLED: { +// We unconditionally overwrite any previous reason to provide a +// consistent signal that this task went away during draining. +update.mutable_status()->set_reason(TaskStatus::REASON_SLAVE_DRAINING); + +// If the draining marks the agent as gone report tasks as +// gone by operator. +if (drainConfig->mark_gone() && +framework->capabilities.partitionAware) { + update.mutable_status()->set_state(TASK_GONE_BY_OPERATOR); +} +break; + } +} + } + if (HookManager::hooksAvailable()) { // Even though the hook(s) return a TaskStatus, we only use two fields: // container_status and labels. Remaining fields are discarded. 
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp index 02b65a9..c147bfc 100644 --- a/src/tests/slave_tests.cpp +++ b/src/tests/slave_tests.cpp @@ -12040,10 +12040,16 @@ TEST_F(SlaveTest, DrainAgentKillsRunningTask) AWAIT_READY(updateSlaveMessage); + // Set the partition-aware capability to ensure that the terminal update state + // is TASK_GONE_BY_OPERATOR, since we will set `mark_gone = true`. + v1::FrameworkInfo frameworkInfo = v1::DEFAULT_FRAMEWORK_INFO; + frameworkInfo.add_capabilities()->set_type( + v1::FrameworkInfo::Capability::PARTITION_AWARE); + auto scheduler = std::make_shared(); EXPECT_CALL(*scheduler, connected(_)) -.WillOnce(v1::scheduler::SendSubscribe(v1::DEFAULT_FRAMEWORK_INFO)); +.WillOnce(v1::scheduler::SendSubscribe(frameworkInfo)); Future subscribed; EXPECT_CALL(*scheduler, subscribed(_, _)) @@ -12160,10 +12166,16 @@ TEST_F(SlaveTest, DrainAgentKillsQueu
[mesos] branch master updated: Fixed a bug for non-partition-aware schedulers.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new 8e1a512 Fixed a bug for non-partition-aware schedulers. 8e1a512 is described below commit 8e1a51207304589a6521cff3540e0705fe1533ff Author: Greg Mann AuthorDate: Thu Sep 12 16:33:20 2019 -0700 Fixed a bug for non-partition-aware schedulers. Previously, the agent would send task status updates with the state TASK_GONE_BY_OPERATOR to all schedulers when an agent was drained with the `mark_gone` parameter set to `true`. This patch updates this code to ensure that TASK_GONE_BY_OPERATOR is only sent to partition-aware schedulers. Review: https://reviews.apache.org/r/71480/ --- src/slave/slave.cpp | 69 --- src/tests/slave_tests.cpp | 20 +++--- 2 files changed, 52 insertions(+), 37 deletions(-) diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp index 4e93656..96890d3 100644 --- a/src/slave/slave.cpp +++ b/src/slave/slave.cpp @@ -5773,40 +5773,6 @@ void Slave::statusUpdate(StatusUpdate update, const Option& pid) update.mutable_status()->set_source( pid == UPID() ? TaskStatus::SOURCE_SLAVE : TaskStatus::SOURCE_EXECUTOR); - // If the agent is draining we provide additional - // information for KILLING or KILLED states. - if (drainConfig.isSome()) { -switch (update.status().state()) { - case TASK_STAGING: - case TASK_STARTING: - case TASK_RUNNING: - case TASK_FAILED: - case TASK_FINISHED: - case TASK_ERROR: - case TASK_LOST: - case TASK_DROPPED: - case TASK_UNREACHABLE: - case TASK_GONE: - case TASK_GONE_BY_OPERATOR: - case TASK_UNKNOWN: { -break; - } - case TASK_KILLING: - case TASK_KILLED: { -// We unconditionally overwrite any previous reason to provide a -// consistent signal that this task went away during draining. 
-update.mutable_status()->set_reason(TaskStatus::REASON_SLAVE_DRAINING); - -// If the draining marks the agent as gone report tasks as -// gone by operator. -if (drainConfig->mark_gone()) { - update.mutable_status()->set_state(TASK_GONE_BY_OPERATOR); -} -break; - } -} - } - // Set TaskStatus.executor_id if not already set; overwrite existing // value if already set. if (update.has_executor_id()) { @@ -5843,6 +5809,41 @@ void Slave::statusUpdate(StatusUpdate update, const Option& pid) return; } + // If the agent is draining we provide additional + // information for KILLING or KILLED states. + if (drainConfig.isSome()) { +switch (update.status().state()) { + case TASK_STAGING: + case TASK_STARTING: + case TASK_RUNNING: + case TASK_FAILED: + case TASK_FINISHED: + case TASK_ERROR: + case TASK_LOST: + case TASK_DROPPED: + case TASK_UNREACHABLE: + case TASK_GONE: + case TASK_GONE_BY_OPERATOR: + case TASK_UNKNOWN: { +break; + } + case TASK_KILLING: + case TASK_KILLED: { +// We unconditionally overwrite any previous reason to provide a +// consistent signal that this task went away during draining. +update.mutable_status()->set_reason(TaskStatus::REASON_SLAVE_DRAINING); + +// If the draining marks the agent as gone report tasks as +// gone by operator. +if (drainConfig->mark_gone() && +framework->capabilities.partitionAware) { + update.mutable_status()->set_state(TASK_GONE_BY_OPERATOR); +} +break; + } +} + } + if (HookManager::hooksAvailable()) { // Even though the hook(s) return a TaskStatus, we only use two fields: // container_status and labels. Remaining fields are discarded. 
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp index 02b65a9..c147bfc 100644 --- a/src/tests/slave_tests.cpp +++ b/src/tests/slave_tests.cpp @@ -12040,10 +12040,16 @@ TEST_F(SlaveTest, DrainAgentKillsRunningTask) AWAIT_READY(updateSlaveMessage); + // Set the partition-aware capability to ensure that the terminal update state + // is TASK_GONE_BY_OPERATOR, since we will set `mark_gone = true`. + v1::FrameworkInfo frameworkInfo = v1::DEFAULT_FRAMEWORK_INFO; + frameworkInfo.add_capabilities()->set_type( + v1::FrameworkInfo::Capability::PARTITION_AWARE); + auto scheduler = std::make_shared(); EXPECT_CALL(*scheduler, connected(_)) -.WillOnce(v1::scheduler::SendSubscribe(v1::DEFAULT_FRAMEWORK_INFO)); +.WillOnce(v1::scheduler::SendSubscribe(frameworkInfo)); Future subscribed; EXPECT_CALL(*scheduler, subscribed(_, _)) @@ -12160,10 +12166,16 @@ TEST_F(SlaveTest, DrainAgentKillsQueu
[mesos] branch master updated: Added documentation about standalone containers.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new b24ab0a Added documentation about standalone containers. b24ab0a is described below commit b24ab0a97e0043cb4a06624f7593dec9e15f5661 Author: Joseph Wu AuthorDate: Mon Sep 9 10:37:57 2019 -0700 Added documentation about standalone containers. This outlines some of the differences to expect from this new type of container and shows some example API calls. Review: https://reviews.apache.org/r/65112/ --- docs/csi.md | 2 +- docs/home.md | 1 + docs/standalone-containers.md | 202 ++ 3 files changed, 204 insertions(+), 1 deletion(-) diff --git a/docs/csi.md b/docs/csi.md index 4a83581..aa03175 100644 --- a/docs/csi.md +++ b/docs/csi.md @@ -80,7 +80,7 @@ More details about SLRP can be found in the following [section](#storage-local-r CSI plugins are long-running [gRPC](https://grpc.io/) services, like daemons. Those CSI plugins are packaged as containers, and are launched by SLRPs using -the [standalone containers](standalone-container.md) API from the agent. +the [standalone containers](standalone-containers.md) API from the agent. Standalone containers can be launched without any tasks or executors. They use the same isolation mechanism provided by the agent for task and executor containers. 
diff --git a/docs/home.md b/docs/home.md index ad19919..4e62b4a 100644 --- a/docs/home.md +++ b/docs/home.md @@ -59,6 +59,7 @@ layout: documentation * [Container Sandboxes](sandbox.md) * [Container Volumes](container-volume.md) * [Nested Container and Task Group (Pod)](nested-container-and-task-group.md) +* [Standalone Containers](standalone-containers.md) ## Networking * [Networking Overview](networking.md) diff --git a/docs/standalone-containers.md b/docs/standalone-containers.md new file mode 100644 index 000..1e1e306 --- /dev/null +++ b/docs/standalone-containers.md @@ -0,0 +1,202 @@ +--- +title: Apache Mesos - Standalone Containers +layout: documentation +--- + +# Standalone Containers + +Traditionally, launching a container in a Mesos cluster involves +communication between multiple components: + +``` + Container(s) + +---+ ++ +---+ +--+ + | Framework | <-> | Master | <-> | Agent | <-> | Executor | + +---+ ++ +---+ | `->Task | + ^ +--+ + | +---+ +--+ + +--> | Agent | <-> | Executor | + | +---+ | `->Task | +... +--+ +``` + +Mesos 1.5 introduced "Standalone Containers", which provide an alternate +path for launching containers with a reduced scope and feature set: + +``` + +---++--+ + Operator API <-> | Agent | -> | Standalone Container | + +---++--+ +``` + +**NOTE:** Agents currently require a connection to a Mesos master in +order to accept any Operator API calls. This limitation is not necessary +and may be fixed in future. + +**NOTE:** Standalone containers only apply to the Mesos containerizer. +For standalone docker containers, use docker directly. + +As hinted by the diagrams, standalone containers are launched on single +Agents, rather than cluster-wide. This document describes the major +differences between normal containers and standalone containers; and +provides some examples of how to use the new Operator APIs. 
+ + +## Launching a Standalone Container + +Because standalone containers are launched directly on Mesos Agents, +these containers do not participate in the Mesos Master's offer cycle. +This means standalone containers can be launched regardless of resource +allocation and can potentially overcommit the Mesos Agent, but cannot +use reserved resources. + +An Operator API call might look like this: + +``` +LAUNCH_CONTAINER HTTP Request (JSON): + +POST /api/v1 HTTP/1.1 + +Host: agenthost:5051 +Content-Type: application/json + +{ + "type": "LAUNCH_CONTAINER", + "launch_container": { +"container_id": { + "value": "my-standalone-container-id" +}, +"command": { + "value": "sleep 100" +}, +"resources": [ + { +"name": "cpus", +"scalar": { "value": 2.0 }, +"type": "SCALAR" + }, + { +"name": "mem", +
[mesos] 03/03: Fixed formatting in the upgrade docs.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch 1.9.x in repository https://gitbox.apache.org/repos/asf/mesos.git commit 5e79a584e6ec3e9e2f96e8bf418411df9dafac2e Author: Greg Mann AuthorDate: Fri Aug 30 11:48:18 2019 -0300 Fixed formatting in the upgrade docs. --- docs/upgrades.md | 13 +++-- 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/upgrades.md b/docs/upgrades.md index d36a9a4..d745752 100644 --- a/docs/upgrades.md +++ b/docs/upgrades.md @@ -522,21 +522,22 @@ We categorize the changes as follows: - * A new `DRAINING` state has been added to Mesos agents. Once an agent is draining, all tasks running on that agent are gracefully -killed and no offers for that agent are sent to schedulers, preventing the launching of new tasks. -Operators can put an agent into `DRAINING` state by using the `DRAIN_AGENT` operator API call. -See [`docs/maintenance`](maintenance.md) for details. +* A new `DRAINING` state has been added to Mesos agents. Once an agent is draining, all tasks running on that agent are gracefully + killed and no offers for that agent are sent to schedulers, preventing the launching of new tasks. + Operators can put an agent into `DRAINING` state by using the `DRAIN_AGENT` operator API call. + See [`docs/maintenance`](maintenance.md) for details. + * The Mesos agent now requires the new `AGENT_DRAINING` feature. This capability is set by default, but if the `--agent_features` flag is specified explicitly, `AGENT_DRAINING` must be included. - * A new [`linux/nnp`](isolators/linux-nnp.md) isolator has been added. The isolator supports setting of the `no_new_privs` bit in the container, preventing tasks from acquiring additional privileges. +* A new [`linux/nnp`](isolators/linux-nnp.md) isolator has been added. The isolator supports setting of the `no_new_privs` bit in the container, preventing tasks from acquiring additional privileges. 
- * A new [`--docker_ignore_runtime`](configuration/agent.md#docker_ignore_runtime) flag has been added. This causes the agent to ignore any runtime configuration present in Docker images. +* A new [`--docker_ignore_runtime`](configuration/agent.md#docker_ignore_runtime) flag has been added. This causes the agent to ignore any runtime configuration present in Docker images.
[mesos] branch 1.9.x updated (091f193 -> 5e79a58)
This is an automated email from the ASF dual-hosted git repository. grag pushed a change to branch 1.9.x in repository https://gitbox.apache.org/repos/asf/mesos.git. from 091f193 Updated docs for the AGENT_DRAINING capability. new 9aa7dab Removed experimental warning from UPDATE_QUOTA call. new 499a571 Updated upgrades.md to note quota limits changes. new 5e79a58 Fixed formatting in the upgrade docs. The 3 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: docs/upgrades.md | 34 -- include/mesos/master/master.proto| 4 include/mesos/v1/master/master.proto | 4 3 files changed, 28 insertions(+), 14 deletions(-)
[mesos] 01/03: Removed experimental warning from UPDATE_QUOTA call.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch 1.9.x in repository https://gitbox.apache.org/repos/asf/mesos.git commit 9aa7dab8061292d22ac870a2cf8856dca50e31d2 Author: Benjamin Mahler AuthorDate: Thu Aug 29 12:59:01 2019 -0400 Removed experimental warning from UPDATE_QUOTA call. This is now a fully functional feature and is released in 1.9. Review: https://reviews.apache.org/r/71411 --- include/mesos/master/master.proto| 4 include/mesos/v1/master/master.proto | 4 2 files changed, 8 deletions(-) diff --git a/include/mesos/master/master.proto b/include/mesos/master/master.proto index 8386bd6..315809c 100644 --- a/include/mesos/master/master.proto +++ b/include/mesos/master/master.proto @@ -274,10 +274,6 @@ message Call { required SlaveID slave_id = 1; } - // EXPERIMENTAL DO NOT USE. - // - // This feature is not implementation complete. - // // Updates quota given the provided quota configurations, these configurations // are applied in an all-or-nothing manner. message UpdateQuota { diff --git a/include/mesos/v1/master/master.proto b/include/mesos/v1/master/master.proto index 893162d..5c99112 100644 --- a/include/mesos/v1/master/master.proto +++ b/include/mesos/v1/master/master.proto @@ -275,10 +275,6 @@ message Call { required AgentID agent_id = 1; } - // EXPERIMENTAL DO NOT USE. - // - // This feature is not implementation complete. - // // Updates quota given the provided quota configurations, these configurations // are applied in an all-or-nothing manner. message UpdateQuota {
[mesos] 02/03: Updated upgrades.md to note quota limits changes.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch 1.9.x in repository https://gitbox.apache.org/repos/asf/mesos.git commit 499a57188518e6d541f95e0d10ac264e6f83a735 Author: Benjamin Mahler AuthorDate: Thu Aug 29 12:59:54 2019 -0400 Updated upgrades.md to note quota limits changes. In particular: * UPDATE_QUOTA replaces the old SET_QUOTA and REMOVE_QUOTA calls. * Quota guarantees are still functional, but deprecated in preparation for optimistic offers. Review: https://reviews.apache.org/r/71412 --- docs/upgrades.md | 21 + 1 file changed, 21 insertions(+) diff --git a/docs/upgrades.md b/docs/upgrades.md index 0345e22..d36a9a4 100644 --- a/docs/upgrades.md +++ b/docs/upgrades.md @@ -48,6 +48,7 @@ We categorize the changes as follows: + A Quota Limits A Linux NNP isolator A hostname_validation_scheme C TLS certificate verification behaviour @@ -78,6 +79,10 @@ We categorize the changes as follows: + D SET_QUOTA and REMOVE QUOTA deprecated +in favor of UPDATE_QUOTA + D Quota guarantees deprecated in favor +of using quota limits @@ -516,6 +521,7 @@ We categorize the changes as follows: ## Upgrading from 1.8.x to 1.9.x ## + * A new `DRAINING` state has been added to Mesos agents. Once an agent is draining, all tasks running on that agent are gracefully killed and no offers for that agent are sent to schedulers, preventing the launching of new tasks. Operators can put an agent into `DRAINING` state by using the `DRAIN_AGENT` operator API call. @@ -525,17 +531,21 @@ We categorize the changes as follows: * The Mesos agent now requires the new `AGENT_DRAINING` feature. This capability is set by default, but if the `--agent_features` flag is specified explicitly, `AGENT_DRAINING` must be included. + * A new [`linux/nnp`](isolators/linux-nnp.md) isolator has been added. The isolator supports setting of the `no_new_privs` bit in the container, preventing tasks from acquiring additional privileges. 
+ * A new [`--docker_ignore_runtime`](configuration/agent.md#docker_ignore_runtime) flag has been added. This causes the agent to ignore any runtime configuration present in Docker images. + * A new libprocess TLS flag `--hostname_validation_scheme` along with the corresponding environment variable `LIBPROCESS_SSL_HOSTNAME_VALIDATION_SCHEME` has been added. Using this flag, users can configure the way libprocess performs hostname validation for TLS connections. See [`docs/ssl`](ssl.md) for details. + * The semantics of the libprocess environment variables `LIBPROCESS_SSL_VERIFY_CERT` and `LIBPROCESS_SSL_REQUIRE_CERT` have been slightly updated such that the former now only applies to client-mode and the latter only to server-mode connections. As part of this re-adjustment, the following two changes have been introduced that might require changes for operators running Mesos in unusual TLS configurations. @@ -548,8 +558,19 @@ We categorize the changes as follows: the `LIBPROCESS_SSL_REQUIRE_CERT` option is set to true. + * The Mesos containerizer now supports configurable IPC namespace and /dev/shm. Containers can be configured to have a private IPC namespace and /dev/shm or share them from its parent via the field `LinuxInfo.ipc_mode`, and the size of its private /dev/shm is also configurable via the field `LinuxInfo.shm_size`. Operators can control whether it is allowed to share host's IPC namespace and /dev/shm with top level containers via the agent flag `--disallow_sharing_agent_ipc_namespace`, and s [...] + + +* The `SET_QUOTA` and `REMOVE_QUOTA` master calls are deprecated in favor of a new `UPDATE_QUOTA` master call. + + + +* Prior to Mesos 1.9, the quota-related APIs only exposed quota "guarantees" which ensured a minimum amount of resources would be available to a role. Setting guarantees also set implicit quota limits. In Mesos 1.9+, quota limits are now exposed directly. + * Quota guarantees are now deprecated in favor of using only quota limits.
Enforcement of quota guarantees required that Mesos holds back enough resources to meet all of the unsatisfied quota guarantees. Since Mesos is moving towards an optimistic offer model (to improve multi-role / multi-scheduler scalability, see MESOS-1607), it will no longer be possible to enforce quota guarantees by holding back resources. In such a model, quota limits are simple to enforce, but quota guaran [...] + * For these reasons, quota guarantees, while still functional in Mesos 1.9, are now deprecated. A combination of limits and priority-based preemption will be simpler in an optimistic offer model. + ## Upgrading from 1.7.x to 1.8.x ##
[mesos] branch master updated: Fixed formatting in the upgrade docs.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new e5bf1b6 Fixed formatting in the upgrade docs. e5bf1b6 is described below commit e5bf1b61ec140f70ad90d522dc37a4ea82554221 Author: Greg Mann AuthorDate: Fri Aug 30 11:48:18 2019 -0300 Fixed formatting in the upgrade docs. --- docs/upgrades.md | 13 +++-- 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/upgrades.md b/docs/upgrades.md index d36a9a4..d745752 100644 --- a/docs/upgrades.md +++ b/docs/upgrades.md @@ -522,21 +522,22 @@ We categorize the changes as follows: - * A new `DRAINING` state has been added to Mesos agents. Once an agent is draining, all tasks running on that agent are gracefully -killed and no offers for that agent are sent to schedulers, preventing the launching of new tasks. -Operators can put an agent into `DRAINING` state by using the `DRAIN_AGENT` operator API call. -See [`docs/maintenance`](maintenance.md) for details. +* A new `DRAINING` state has been added to Mesos agents. Once an agent is draining, all tasks running on that agent are gracefully + killed and no offers for that agent are sent to schedulers, preventing the launching of new tasks. + Operators can put an agent into `DRAINING` state by using the `DRAIN_AGENT` operator API call. + See [`docs/maintenance`](maintenance.md) for details. + * The Mesos agent now requires the new `AGENT_DRAINING` feature. This capability is set by default, but if the `--agent_features` flag is specified explicitly, `AGENT_DRAINING` must be included. - * A new [`linux/nnp`](isolators/linux-nnp.md) isolator has been added. The isolator supports setting of the `no_new_privs` bit in the container, preventing tasks from acquiring additional privileges. +* A new [`linux/nnp`](isolators/linux-nnp.md) isolator has been added. 
The isolator supports setting of the `no_new_privs` bit in the container, preventing tasks from acquiring additional privileges. - * A new [`--docker_ignore_runtime`](configuration/agent.md#docker_ignore_runtime) flag has been added. This causes the agent to ignore any runtime configuration present in Docker images. +* A new [`--docker_ignore_runtime`](configuration/agent.md#docker_ignore_runtime) flag has been added. This causes the agent to ignore any runtime configuration present in Docker images.
[mesos] branch 1.9.x updated: Updated docs for the AGENT_DRAINING capability.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch 1.9.x in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/1.9.x by this push: new 091f193 Updated docs for the AGENT_DRAINING capability. 091f193 is described below commit 091f193685142e5187a4f84d55ed1e28ac34749c Author: Greg Mann AuthorDate: Fri Aug 30 10:23:10 2019 -0300 Updated docs for the AGENT_DRAINING capability. Review: https://reviews.apache.org/r/71405/ --- docs/configuration/agent.md | 7 --- docs/upgrades.md| 4 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/docs/configuration/agent.md b/docs/configuration/agent.md index 760d22b..91e38c2 100644 --- a/docs/configuration/agent.md +++ b/docs/configuration/agent.md @@ -92,8 +92,8 @@ Example: JSON representation of agent features to whitelist. We always require -'MULTI_ROLE', 'HIERARCHICAL_ROLE', 'RESERVATION_REFINEMENT', and -'AGENT_OPERATION_FEEDBACK'. +'MULTI_ROLE', 'HIERARCHICAL_ROLE', 'RESERVATION_REFINEMENT', +'AGENT_OPERATION_FEEDBACK', and 'AGENT_DRAINING'. Example: @@ -102,7 +102,8 @@ Example: {"type": "MULTI_ROLE"}, {"type": "HIERARCHICAL_ROLE"}, {"type": "RESERVATION_REFINEMENT"}, -{"type": "AGENT_OPERATION_FEEDBACK"} +{"type": "AGENT_OPERATION_FEEDBACK"}, +{"type": "AGENT_DRAINING"} ] } diff --git a/docs/upgrades.md b/docs/upgrades.md index ded4a8d..0345e22 100644 --- a/docs/upgrades.md +++ b/docs/upgrades.md @@ -61,6 +61,7 @@ We categorize the changes as follows: A docker_ignore_runtime A disallow_sharing_agent_ipc_namespace A default_container_shm_size + C agent_features @@ -520,6 +521,9 @@ We categorize the changes as follows: Operators can put an agent into `DRAINING` state by using the `DRAIN_AGENT` operator API call. See [`docs/maintenance`](maintenance.md) for details. + +* The Mesos agent now requires the new `AGENT_DRAINING` feature. 
This capability is set by default, but if the `--agent_features` flag is specified explicitly, `AGENT_DRAINING` must be included. + * A new [`linux/nnp`](isolators/linux-nnp.md) isolator has been added. The isolator supports setting of the `no_new_privs` bit in the container, preventing tasks from acquiring additional privileges.
[mesos] branch master updated: Updated docs for the AGENT_DRAINING capability.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new 93963c4 Updated docs for the AGENT_DRAINING capability. 93963c4 is described below commit 93963c46b803f59554e12afe9c66605600c7f5b8 Author: Greg Mann AuthorDate: Fri Aug 30 10:23:10 2019 -0300 Updated docs for the AGENT_DRAINING capability. Review: https://reviews.apache.org/r/71405/ --- docs/configuration/agent.md | 7 --- docs/upgrades.md| 4 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/docs/configuration/agent.md b/docs/configuration/agent.md index 760d22b..91e38c2 100644 --- a/docs/configuration/agent.md +++ b/docs/configuration/agent.md @@ -92,8 +92,8 @@ Example: JSON representation of agent features to whitelist. We always require -'MULTI_ROLE', 'HIERARCHICAL_ROLE', 'RESERVATION_REFINEMENT', and -'AGENT_OPERATION_FEEDBACK'. +'MULTI_ROLE', 'HIERARCHICAL_ROLE', 'RESERVATION_REFINEMENT', +'AGENT_OPERATION_FEEDBACK', and 'AGENT_DRAINING'. Example: @@ -102,7 +102,8 @@ Example: {"type": "MULTI_ROLE"}, {"type": "HIERARCHICAL_ROLE"}, {"type": "RESERVATION_REFINEMENT"}, -{"type": "AGENT_OPERATION_FEEDBACK"} +{"type": "AGENT_OPERATION_FEEDBACK"}, +{"type": "AGENT_DRAINING"} ] } diff --git a/docs/upgrades.md b/docs/upgrades.md index 31f4a19..d36a9a4 100644 --- a/docs/upgrades.md +++ b/docs/upgrades.md @@ -62,6 +62,7 @@ We categorize the changes as follows: A docker_ignore_runtime A disallow_sharing_agent_ipc_namespace A default_container_shm_size + C agent_features @@ -526,6 +527,9 @@ We categorize the changes as follows: Operators can put an agent into `DRAINING` state by using the `DRAIN_AGENT` operator API call. See [`docs/maintenance`](maintenance.md) for details. + +* The Mesos agent now requires the new `AGENT_DRAINING` feature. 
This capability is set by default, but if the `--agent_features` flag is specified explicitly, `AGENT_DRAINING` must be included. + * A new [`linux/nnp`](isolators/linux-nnp.md) isolator has been added. The isolator supports setting of the `no_new_privs` bit in the container, preventing tasks from acquiring additional privileges.
[mesos] 01/02: Fixed a memory leak in the master's 'removeTask()' helper.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch 1.6.x in repository https://gitbox.apache.org/repos/asf/mesos.git commit 6a9cee7999be0a3a4f89d21ec58947fe90c01eeb Author: Greg Mann AuthorDate: Tue Apr 23 22:25:21 2019 -0700 Fixed a memory leak in the master's 'removeTask()' helper. Previously, all removed tasks were added to the `slaves.unreachableTasks` map. This patch adds a conditional so that removed tasks are only added to that structure when they are being marked unreachable. Review: https://reviews.apache.org/r/70518/ --- src/master/master.cpp | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/master/master.cpp b/src/master/master.cpp index 66e8e92..3b58964 100644 --- a/src/master/master.cpp +++ b/src/master/master.cpp @@ -11035,7 +11035,10 @@ void Master::removeTask(Task* task, bool unreachable) << " on agent " << *slave; } - slaves.unreachableTasks[slave->id].put(task->framework_id(), task->task_id()); + if (unreachable) { +slaves.unreachableTasks[slave->id].put( +task->framework_id(), task->task_id()); + } // Remove from framework. Framework* framework = getFramework(task->framework_id());
[mesos] branch 1.6.x updated (23020e1 -> c6da50d)
This is an automated email from the ASF dual-hosted git repository. grag pushed a change to branch 1.6.x in repository https://gitbox.apache.org/repos/asf/mesos.git. from 23020e1 Fixed a compilation error on clang build. new 6a9cee7 Fixed a memory leak in the master's 'removeTask()' helper. new c6da50d Transitioned tasks when an unreachable agent is marked as gone. The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: src/master/http.cpp | 10 +-- src/master/master.cpp | 105 +++--- src/master/master.hpp | 2 +- src/tests/api_tests.cpp | 196 4 files changed, 293 insertions(+), 20 deletions(-)
[mesos] 02/02: Transitioned tasks when an unreachable agent is marked as gone.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch 1.6.x in repository https://gitbox.apache.org/repos/asf/mesos.git commit c6da50d10511a1046b8d4bc563dc3ccee875 Author: Greg Mann AuthorDate: Tue Apr 23 22:25:29 2019 -0700 Transitioned tasks when an unreachable agent is marked as gone. This patch updates the master code responsible for marking agents as gone to properly transition tasks on agents which were previously marked as unreachable. Review: https://reviews.apache.org/r/70519/ --- src/master/http.cpp | 10 +-- src/master/master.cpp | 100 +--- src/master/master.hpp | 2 +- src/tests/api_tests.cpp | 196 4 files changed, 289 insertions(+), 19 deletions(-) diff --git a/src/master/http.cpp b/src/master/http.cpp index 0492b97..103b7f5 100644 --- a/src/master/http.cpp +++ b/src/master/http.cpp @@ -5225,15 +5225,7 @@ Future Master::Http::_markAgentGone(const SlaveID& slaveId) const << registrarResult.failure(); } -Slave* slave = master->slaves.registered.get(slaveId); - -// This can happen if the agent that is being marked as -// gone is not currently registered (unreachable/recovered). 
-if (slave == nullptr) { - return; -} - -master->markGone(slave, goneTime); +master->markGone(slaveId, goneTime); })); return gone.then([]() -> Future { diff --git a/src/master/master.cpp b/src/master/master.cpp index 3b58964..804de69 100644 --- a/src/master/master.cpp +++ b/src/master/master.cpp @@ -8881,20 +8881,102 @@ void Master::_markUnreachable( } -void Master::markGone(Slave* slave, const TimeInfo& goneTime) +void Master::markGone(const SlaveID& slaveId, const TimeInfo& goneTime) { - CHECK_NOTNULL(slave); - CHECK(slaves.markingGone.contains(slave->info.id())); - slaves.markingGone.erase(slave->info.id()); + CHECK(slaves.markingGone.contains(slaveId)); + + slaves.markingGone.erase(slaveId); + + slaves.gone[slaveId] = goneTime; + + const string message = "Agent has been marked gone"; + + Slave* slave = slaves.registered.get(slaveId); - slaves.gone[slave->id] = goneTime; + // If the `Slave` struct does not exist, then the agent + // must be either recovered or unreachable. + if (slave == nullptr) { +CHECK(slaves.recovered.contains(slaveId) || + slaves.unreachable.contains(slaveId)); + +// When a recovered agent is marked gone, we have no task metadata to use in +// order to send task status updates. We could retain this agent ID and send +// updates upon reregistration but do not currently do this. See MESOS-9739. +if (slaves.recovered.contains(slaveId)) { + return; +} + +slaves.unreachable.erase(slaveId); + +// TODO(vinod): Consider moving these tasks into `completedTasks` by +// transitioning them to a terminal state and sending status updates. +// But it's not clear what this state should be. If a framework +// reconciles these tasks after this point it would get `TASK_UNKNOWN` +// which seems appropriate but we don't keep tasks in this state in-memory. 
+if (slaves.unreachableTasks.contains(slaveId)) { + foreachkey (const FrameworkID& frameworkId, + slaves.unreachableTasks.at(slaveId)) { +Framework* framework = getFramework(frameworkId); +if (framework == nullptr) { + continue; +} + +TaskState newTaskState = TASK_GONE_BY_OPERATOR; +TaskStatus::Reason newTaskReason = + TaskStatus::REASON_SLAVE_REMOVED_BY_OPERATOR; + +if (!framework->capabilities.partitionAware) { + newTaskState = TASK_LOST; + newTaskReason = TaskStatus::REASON_SLAVE_REMOVED; +} + +foreach (const TaskID& taskId, + slaves.unreachableTasks.at(slaveId).get(frameworkId)) { + if (framework->unreachableTasks.contains(taskId)) { +const Owned& task = framework->unreachableTasks.at(taskId); + +const StatusUpdate& update = protobuf::createStatusUpdate( +task->framework_id(), +task->slave_id(), +task->task_id(), +newTaskState, +TaskStatus::SOURCE_MASTER, +None(), +message, +newTaskReason, +(task->has_executor_id() + ? Option(task->executor_id()) + : None())); + +updateTask(task.get(), update); + +if (!framework->connected()) { + LOG(WARNING) << "Dropping update " << update + << " for disconnected " + << " framework " << frameworkId; +} else {
[mesos] 02/02: Transitioned tasks when an unreachable agent is marked as gone.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch 1.7.x in repository https://gitbox.apache.org/repos/asf/mesos.git commit 61f1155675bd3bc5312e0501ea6182d2ee7434af Author: Greg Mann AuthorDate: Tue Apr 23 22:25:29 2019 -0700 Transitioned tasks when an unreachable agent is marked as gone. This patch updates the master code responsible for marking agents as gone to properly transition tasks on agents which were previously marked as unreachable. Review: https://reviews.apache.org/r/70519/ --- src/master/http.cpp | 10 +-- src/master/master.cpp | 100 +--- src/master/master.hpp | 2 +- src/tests/api_tests.cpp | 196 4 files changed, 289 insertions(+), 19 deletions(-) diff --git a/src/master/http.cpp b/src/master/http.cpp index e2773ed..30dddc1 100644 --- a/src/master/http.cpp +++ b/src/master/http.cpp @@ -5331,15 +5331,7 @@ Future Master::Http::_markAgentGone(const SlaveID& slaveId) const << registrarResult.failure(); } -Slave* slave = master->slaves.registered.get(slaveId); - -// This can happen if the agent that is being marked as -// gone is not currently registered (unreachable/recovered). 
-if (slave == nullptr) { - return; -} - -master->markGone(slave, goneTime); +master->markGone(slaveId, goneTime); })); return gone.then([]() -> Future { diff --git a/src/master/master.cpp b/src/master/master.cpp index 08a5133..1a95b69 100644 --- a/src/master/master.cpp +++ b/src/master/master.cpp @@ -8968,20 +8968,102 @@ void Master::_markUnreachable( } -void Master::markGone(Slave* slave, const TimeInfo& goneTime) +void Master::markGone(const SlaveID& slaveId, const TimeInfo& goneTime) { - CHECK_NOTNULL(slave); - CHECK(slaves.markingGone.contains(slave->info.id())); - slaves.markingGone.erase(slave->info.id()); + CHECK(slaves.markingGone.contains(slaveId)); + + slaves.markingGone.erase(slaveId); + + slaves.gone[slaveId] = goneTime; + + const string message = "Agent has been marked gone"; + + Slave* slave = slaves.registered.get(slaveId); - slaves.gone[slave->id] = goneTime; + // If the `Slave` struct does not exist, then the agent + // must be either recovered or unreachable. + if (slave == nullptr) { +CHECK(slaves.recovered.contains(slaveId) || + slaves.unreachable.contains(slaveId)); + +// When a recovered agent is marked gone, we have no task metadata to use in +// order to send task status updates. We could retain this agent ID and send +// updates upon reregistration but do not currently do this. See MESOS-9739. +if (slaves.recovered.contains(slaveId)) { + return; +} + +slaves.unreachable.erase(slaveId); + +// TODO(vinod): Consider moving these tasks into `completedTasks` by +// transitioning them to a terminal state and sending status updates. +// But it's not clear what this state should be. If a framework +// reconciles these tasks after this point it would get `TASK_UNKNOWN` +// which seems appropriate but we don't keep tasks in this state in-memory. 
+if (slaves.unreachableTasks.contains(slaveId)) { + foreachkey (const FrameworkID& frameworkId, + slaves.unreachableTasks.at(slaveId)) { +Framework* framework = getFramework(frameworkId); +if (framework == nullptr) { + continue; +} + +TaskState newTaskState = TASK_GONE_BY_OPERATOR; +TaskStatus::Reason newTaskReason = + TaskStatus::REASON_SLAVE_REMOVED_BY_OPERATOR; + +if (!framework->capabilities.partitionAware) { + newTaskState = TASK_LOST; + newTaskReason = TaskStatus::REASON_SLAVE_REMOVED; +} + +foreach (const TaskID& taskId, + slaves.unreachableTasks.at(slaveId).get(frameworkId)) { + if (framework->unreachableTasks.contains(taskId)) { +const Owned& task = framework->unreachableTasks.at(taskId); + +const StatusUpdate& update = protobuf::createStatusUpdate( +task->framework_id(), +task->slave_id(), +task->task_id(), +newTaskState, +TaskStatus::SOURCE_MASTER, +None(), +message, +newTaskReason, +(task->has_executor_id() + ? Option(task->executor_id()) + : None())); + +updateTask(task.get(), update); + +if (!framework->connected()) { + LOG(WARNING) << "Dropping update " << update + << " for disconnected " + << " framework " << frameworkId; +} else {
[mesos] 01/02: Fixed a memory leak in the master's 'removeTask()' helper.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch 1.7.x in repository https://gitbox.apache.org/repos/asf/mesos.git commit 0c5e78bc26653d26a03b08b82923ea517de46fc0 Author: Greg Mann AuthorDate: Tue Apr 23 22:25:21 2019 -0700 Fixed a memory leak in the master's 'removeTask()' helper. Previously, all removed tasks were added to the `slaves.unreachableTasks` map. This patch adds a conditional so that removed tasks are only added to that structure when they are being marked unreachable. Review: https://reviews.apache.org/r/70518/ --- src/master/master.cpp | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/master/master.cpp b/src/master/master.cpp index 3f0c8c0..08a5133 100644 --- a/src/master/master.cpp +++ b/src/master/master.cpp @@ -11194,7 +11194,10 @@ void Master::removeTask(Task* task, bool unreachable) << " on agent " << *slave; } - slaves.unreachableTasks[slave->id].put(task->framework_id(), task->task_id()); + if (unreachable) { +slaves.unreachableTasks[slave->id].put( +task->framework_id(), task->task_id()); + } // Remove from framework. Framework* framework = getFramework(task->framework_id());
[mesos] 01/02: Fixed a memory leak in the master's 'removeTask()' helper.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch 1.8.x in repository https://gitbox.apache.org/repos/asf/mesos.git commit 6f90cc334701fad10e721312cd4cbd0690e1c6ec Author: Greg Mann AuthorDate: Tue Apr 23 22:25:21 2019 -0700 Fixed a memory leak in the master's 'removeTask()' helper. Previously, all removed tasks were added to the `slaves.unreachableTasks` map. This patch adds a conditional so that removed tasks are only added to that structure when they are being marked unreachable. Review: https://reviews.apache.org/r/70518/ --- src/master/master.cpp | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/master/master.cpp b/src/master/master.cpp index 5488b7b..9730e65 100644 --- a/src/master/master.cpp +++ b/src/master/master.cpp @@ -11780,7 +11780,10 @@ void Master::removeTask(Task* task, bool unreachable) << " on agent " << *slave; } - slaves.unreachableTasks[slave->id].put(task->framework_id(), task->task_id()); + if (unreachable) { +slaves.unreachableTasks[slave->id].put( +task->framework_id(), task->task_id()); + } // Remove from framework. Framework* framework = getFramework(task->framework_id());
[mesos] 02/02: Transitioned tasks when an unreachable agent is marked as gone.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch 1.8.x in repository https://gitbox.apache.org/repos/asf/mesos.git commit 13e4cd1c42ae88094f14d6b05cfb9832d4494193 Author: Greg Mann AuthorDate: Tue Apr 23 22:25:29 2019 -0700 Transitioned tasks when an unreachable agent is marked as gone. This patch updates the master code responsible for marking agents as gone to properly transition tasks on agents which were previously marked as unreachable. Review: https://reviews.apache.org/r/70519/ --- src/master/http.cpp | 10 +-- src/master/master.cpp | 100 +--- src/master/master.hpp | 2 +- src/tests/api_tests.cpp | 196 4 files changed, 289 insertions(+), 19 deletions(-) diff --git a/src/master/http.cpp b/src/master/http.cpp index e7a92d0..765bbf1 100644 --- a/src/master/http.cpp +++ b/src/master/http.cpp @@ -4171,15 +4171,7 @@ Future Master::Http::_markAgentGone(const SlaveID& slaveId) const << registrarResult.failure(); } -Slave* slave = master->slaves.registered.get(slaveId); - -// This can happen if the agent that is being marked as -// gone is not currently registered (unreachable/recovered). 
-if (slave == nullptr) { - return; -} - -master->markGone(slave, goneTime); +master->markGone(slaveId, goneTime); })); return gone.then([]() -> Future { diff --git a/src/master/master.cpp b/src/master/master.cpp index 9730e65..c9b0a38 100644 --- a/src/master/master.cpp +++ b/src/master/master.cpp @@ -9246,18 +9246,100 @@ void Master::_markUnreachable( } -void Master::markGone(Slave* slave, const TimeInfo& goneTime) +void Master::markGone(const SlaveID& slaveId, const TimeInfo& goneTime) { - CHECK_NOTNULL(slave); - CHECK(slaves.markingGone.contains(slave->info.id())); - slaves.markingGone.erase(slave->info.id()); + CHECK(slaves.markingGone.contains(slaveId)); + + slaves.markingGone.erase(slaveId); + + slaves.gone[slaveId] = goneTime; + + const string message = "Agent has been marked gone"; + + Slave* slave = slaves.registered.get(slaveId); - slaves.gone[slave->id] = goneTime; + // If the `Slave` struct does not exist, then the agent + // must be either recovered or unreachable. + if (slave == nullptr) { +CHECK(slaves.recovered.contains(slaveId) || + slaves.unreachable.contains(slaveId)); + +// When a recovered agent is marked gone, we have no task metadata to use in +// order to send task status updates. We could retain this agent ID and send +// updates upon reregistration but do not currently do this. See MESOS-9739. +if (slaves.recovered.contains(slaveId)) { + return; +} + +slaves.unreachable.erase(slaveId); + +// TODO(vinod): Consider moving these tasks into `completedTasks` by +// transitioning them to a terminal state and sending status updates. +// But it's not clear what this state should be. If a framework +// reconciles these tasks after this point it would get `TASK_UNKNOWN` +// which seems appropriate but we don't keep tasks in this state in-memory. 
+if (slaves.unreachableTasks.contains(slaveId)) { + foreachkey (const FrameworkID& frameworkId, + slaves.unreachableTasks.at(slaveId)) { +Framework* framework = getFramework(frameworkId); +if (framework == nullptr) { + continue; +} + +TaskState newTaskState = TASK_GONE_BY_OPERATOR; +TaskStatus::Reason newTaskReason = + TaskStatus::REASON_SLAVE_REMOVED_BY_OPERATOR; + +if (!framework->capabilities.partitionAware) { + newTaskState = TASK_LOST; + newTaskReason = TaskStatus::REASON_SLAVE_REMOVED; +} + +foreach (const TaskID& taskId, + slaves.unreachableTasks.at(slaveId).get(frameworkId)) { + if (framework->unreachableTasks.contains(taskId)) { +const Owned& task = framework->unreachableTasks.at(taskId); + +const StatusUpdate& update = protobuf::createStatusUpdate( +task->framework_id(), +task->slave_id(), +task->task_id(), +newTaskState, +TaskStatus::SOURCE_MASTER, +None(), +message, +newTaskReason, +(task->has_executor_id() + ? Option(task->executor_id()) + : None())); + +updateTask(task.get(), update); + +if (!framework->connected()) { + LOG(WARNING) << "Dropping update " << update + << " for disconnected " + << " framework " << frameworkId; +} else {
[mesos] branch 1.8.x updated (35bfd8a -> 13e4cd1)
This is an automated email from the ASF dual-hosted git repository. grag pushed a change to branch 1.8.x in repository https://gitbox.apache.org/repos/asf/mesos.git. from 35bfd8a Added MESOS-9925 to 1.8.2 CHANGELOG. new 6f90cc3 Fixed a memory leak in the master's 'removeTask()' helper. new 13e4cd1 Transitioned tasks when an unreachable agent is marked as gone. The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: src/master/http.cpp | 10 +-- src/master/master.cpp | 105 +++--- src/master/master.hpp | 2 +- src/tests/api_tests.cpp | 196 4 files changed, 293 insertions(+), 20 deletions(-)
[mesos] branch master updated: Updated maintenance docs to include agent draining.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new e6edb0a Updated maintenance docs to include agent draining. e6edb0a is described below commit e6edb0a1cf9fd2f8ab7fcb5c291b7a14389118a6 Author: Greg Mann AuthorDate: Tue Aug 6 09:25:09 2019 +0200 Updated maintenance docs to include agent draining. Review: https://reviews.apache.org/r/71219/ --- docs/maintenance.md | 137 +- docs/operator-http-api.md | 85 2 files changed, 219 insertions(+), 3 deletions(-) diff --git a/docs/maintenance.md b/docs/maintenance.md index bec69e0..10eeea1 100644 --- a/docs/maintenance.md +++ b/docs/maintenance.md @@ -1,9 +1,9 @@ --- -title: Apache Mesos - Maintenance Primitives +title: Apache Mesos - Performing Maintenance layout: documentation --- -# Maintenance Primitives +# Performing Node Maintenance in a Mesos Cluster Operators regularly need to perform maintenance tasks on machines that comprise a Mesos cluster. Most Mesos upgrades can be done without affecting running @@ -14,6 +14,137 @@ For example: * Kernel upgrades * Agent upgrades (e.g., adjusting agent attributes or resources) +Before performing maintenance on an agent node in a Mesos cluster, it is +typically desirable to gracefully migrate tasks away from the node beforehand in +order to minimize service disruption when the machine is taken down. 
Mesos +provides several ways to accomplish this migration: + +* Automatic agent draining, which does not explicitly require cooperation from + schedulers +* Manual node draining, which allows operators to exercise precise control over + the task draining process +* Maintenance primitives, which permit complex coordination but do require that + schedulers react to the maintenance-related messages that they receive + +# Automatic Node Draining + +Node draining was added to provide a simple method for operators to drain tasks +from nodes on which they plan to perform maintenance, without requiring that +schedulers implement support for any maintenance-specific messages. + +Initiating draining will cause all tasks on the target agent node to receive a +kill event immediately, assuming the agent is currently reachable. If the agent +is unreachable, initiation of the kill event will be delayed until the agent is +reachable by the master again. When the tasks receive a kill event, a SIGTERM +signal will be sent to the task to begin the killing process. Depending on the +particular task's behavior, this signal may be sufficient to terminate it. Some +tasks may use this signal to begin the process of graceful termination, which +may take some time. After some delay, a SIGKILL signal will be sent to the task, +which forcefully terminates the task if it is still running. The delay between +the SIGTERM and SIGKILL signals is determined by the length of the task's kill +grace period. If no grace period is set for the task, a default value of several +seconds will be used. + +## Initiating Draining on a Node + +To begin draining an agent, issue the operator API [`DRAIN_AGENT` +call](operator-http-api.md#drain_agent) to the master: + +$ curl -X POST -d '{"type": "DRAIN_AGENT", "drain_agent": {"agent_id": {"value": ""}}}' masterhost:5050/api/v1 + +This will immediately begin the process of killing all tasks on the agent. Once +draining has begun, it cannot be cancelled. 
To monitor the progress of the +draining process, you can inspect the state of the agent via the master operator +API [`GET_STATE`](operator-http-api.md#get_state) or +[`GET_AGENTS`](operator-http-api.md#get_agents) calls: + +$ curl -X POST -d '{"type": "GET_AGENTS"}' masterhost:5050/api/v1 + +Locate the relevant agent and inspect its `drain_info.state` field. While +draining, the state will be `DRAINING`. When all tasks on the agent have +terminated, all their terminal status updates have been acknowledged by the +schedulers, and all offer operations on the agent have finished, draining is +complete and the agent's drain state will transition to `DRAINED`. At this +point, the node may be taken down for maintenance. + +## Options for Automatic Node Draining + +You may set an upper bound on the kill grace period of draining tasks by +specifying the `max_grace_period` option when draining: + +$ curl -X POST -d '{"type": "DRAIN_AGENT", "drain_agent": {"agent_id": {"value": ""}, "max_grace_period": "10mins"}}' masterhost:5050/api/v1 + +In cases where you know that the node being drained will not return after +draining is complete, and you would like it to be automatically permanently +removed fr
[mesos] branch master updated: Added documentation for GET_OPERATIONS calls.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new 00bb0b6 Added documentation for GET_OPERATIONS calls. 00bb0b6 is described below commit 00bb0b6d6abe7700a5adab0bdaf7e91767a2db19 Author: Greg Mann AuthorDate: Tue Jul 30 10:58:36 2019 +0200 Added documentation for GET_OPERATIONS calls. Review: https://reviews.apache.org/r/71199 --- docs/operator-http-api.md | 174 ++ 1 file changed, 174 insertions(+) diff --git a/docs/operator-http-api.md b/docs/operator-http-api.md index 1167838..ddf29e9 100644 --- a/docs/operator-http-api.md +++ b/docs/operator-http-api.md @@ -1661,6 +1661,93 @@ Content-Type: application/json ``` +### GET_OPERATIONS + +Returns a list of all offer operations throughout the cluster, not including +`LAUNCH` or `LAUNCH_GROUP` operations which can be retrieved with `GET_TASKS`. + +``` +GET_OPERATIONS HTTP Request (JSON): + +POST /api/v1 HTTP/1.1 + +Host: masterhost:5050 +Content-Type: application/json +Accept: application/json + +{ + "type": "GET_OPERATIONS" +} + + +GET_OPERATIONS HTTP Response (JSON): + +HTTP/1.1 200 OK + +Content-Type: application/json + +{ + "type": "GET_OPERATIONS", + "get_operations": { +"operations": [ + { +"framework_id": {"value": "74bddcbc-4a02-4d64-b291-aed52032055f-"}, +"agent_id": {"value": "18083noa-j287-dan4-9qx6-l02b84nksb7z-0021"}, +"info": { + "type": "CREATE_DISK", + "id": {"value": "n2j8nksj-9827-82bk-nd7u-83hbchu7whdk-9978"}, + "create_disk": { +"source": { + "provider_id": {"value": "837hfmi2-u2u7-19pp-1884-812i8f02828j-0030"}, + "name": "disk", + "type": "SCALAR", + "scalar": {"value": 1024.0}, + "role": "storage-role-1", + "allocation_info": {"role": "storage-role-1"}, + "reservation": { +"type": "DYNAMIC", +"role": "storage-role-1", +"principal": "storage-service" + }, + "reservations": [{ +"type": "DYNAMIC", +"role": 
"storage-role-1", +"principal": "storage-service" + }], + "disk": { +"source": { + "type": "RAW", + "vendor": "nas-service", + "id": "vol-19827509", + "profile": "fast-volume" +} + } +}, +"target_type": "MOUNT" + } +}, +"latest_status": { + "operation_id": {"value": "n2j8nksj-9827-82bk-nd7u-83hbchu7whdk-9978"}, + "state": "OPERATION_PENDING", + "uuid": {"value": "28987843-j288-1k0s-l29n-837ybzmo18tj-nv73"}, + "agent_id": {"value": "18083noa-j287-dan4-9qx6-l02b84nksb7z-0021"}, + "resource_provider_id": {"value": "837hfmi2-u2u7-19pp-1884-812i8f02828j-0030"} +}, +"statuses": [{ + "operation_id": {"value": "n2j8nksj-9827-82bk-nd7u-83hbchu7whdk-9978"}, + "state": "OPERATION_PENDING", + "uuid": {"value": "28987843-j288-1k0s-l29n-837ybzmo18tj-nv73"}, + "agent_id": {"value": "18083noa-j287-dan4-9qx6-l02b84nksb7z-0021"}, + "resource_provider_id": {"value": "837hfmi2-u2u7-19pp-1884-812i8f02828j-0030"} +}], +"uuid": {"value": "nsj27802-jd82-jd19-jd38-837jdfnoqfij-u284"} + } +] + } +} + +``` + ### GET_WEIGHTS This call retrieves the information about role weights. @@ -3590,6 +3677,93 @@ Content-Type: application/json ``` +### GET_OPERATIONS + +Returns a list of all offer operations known to the agent, not including +`LAUNCH` or `LAUNCH_GROUP` operations which can be retrieved with `GET_TASKS`. + +``` +GET_OPERATIONS HTTP Request (JSON): + +POST /api/v1 HTTP/1.1
[mesos] branch master updated (4b15fbd -> 7e160a3)
This is an automated email from the ASF dual-hosted git repository. grag pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git. from 4b15fbd Exposed agent drain information in the webui. new a1e4a9a Moved the Docker executor declaration into a header. new 4cbda17 Enabled the Docker executor to accept kill policy overrides. new 7e160a3 Added test to verify that Docker executor can override kill policy. The 3 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: src/CMakeLists.txt | 4 +- src/Makefile.am| 3 +- src/docker/CMakeLists.txt | 20 -- src/docker/executor.cpp| 388 +++-- src/docker/executor.hpp| 58 +++ src/exec/exec.cpp | 22 +- src/internal/evolve.cpp| 6 + src/internal/evolve.hpp| 1 + src/launcher/CMakeLists.txt| 5 + src/launcher/docker_executor.cpp | 266 ++ .../containerizer/docker_containerizer_tests.cpp | 172 + 11 files changed, 651 insertions(+), 294 deletions(-) delete mode 100644 src/docker/CMakeLists.txt create mode 100644 src/launcher/docker_executor.cpp
[mesos] 03/03: Added test to verify that Docker executor can override kill policy.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 7e160a36918ad73f79c05cb53a48b7424958e497 Author: Greg Mann AuthorDate: Thu Jul 25 12:17:45 2019 -0700 Added test to verify that Docker executor can override kill policy. This adds a test which verifies that when a scheduler attempts to override a task's default kill policy, the Docker executor will honor that override. Review: https://reviews.apache.org/r/71035/ --- src/internal/evolve.cpp| 6 + src/internal/evolve.hpp| 1 + .../containerizer/docker_containerizer_tests.cpp | 172 + 3 files changed, 179 insertions(+) diff --git a/src/internal/evolve.cpp b/src/internal/evolve.cpp index 81de15e..c5e4151 100644 --- a/src/internal/evolve.cpp +++ b/src/internal/evolve.cpp @@ -86,6 +86,12 @@ v1::AgentInfo evolve(const SlaveInfo& slaveInfo) } +v1::ContainerInfo evolve(const ContainerInfo& containerInfo) +{ + return evolve(containerInfo); +} + + v1::DomainInfo evolve(const DomainInfo& domainInfo) { return evolve(domainInfo); diff --git a/src/internal/evolve.hpp b/src/internal/evolve.hpp index ffbb342..e4e3ab4 100644 --- a/src/internal/evolve.hpp +++ b/src/internal/evolve.hpp @@ -62,6 +62,7 @@ namespace internal { // Helpers for evolving types between versions. Please add as necessary! 
v1::AgentID evolve(const SlaveID& slaveId); v1::AgentInfo evolve(const SlaveInfo& slaveInfo); +v1::ContainerInfo evolve(const ContainerInfo& containerInfo); v1::DomainInfo evolve(const DomainInfo& domainInfo); v1::DrainInfo evolve(const DrainInfo& drainInfo); v1::ExecutorID evolve(const ExecutorID& executorId); diff --git a/src/tests/containerizer/docker_containerizer_tests.cpp b/src/tests/containerizer/docker_containerizer_tests.cpp index a621758..3d932a5 100644 --- a/src/tests/containerizer/docker_containerizer_tests.cpp +++ b/src/tests/containerizer/docker_containerizer_tests.cpp @@ -20,6 +20,8 @@ #include +#include + #include #include #include @@ -5240,6 +5242,176 @@ TEST_F(HungDockerTest, ROOT_DOCKER_InspectHungDuringPull) driver.join(); } + +// This test is disabled on windows due to the bash-specific +// command used in the task below. +TEST_F_TEMP_DISABLED_ON_WINDOWS( +DockerContainerizerTest, ROOT_DOCKER_OverrideKillPolicy) +{ + Try> master = StartMaster(); + ASSERT_SOME(master); + + MockDocker* mockDocker = +new MockDocker(tests::flags.docker, tests::flags.docker_socket); + + Shared docker(mockDocker); + + slave::Flags flags = CreateSlaveFlags(); + + Fetcher fetcher(flags); + + Try logger = +ContainerLogger::create(flags.container_logger); + + ASSERT_SOME(logger); + + Future slaveRegisteredMessage = +FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _); + + MockDockerContainerizer dockerContainerizer( + flags, + , + Owned(logger.get()), + docker); + + Owned detector = master.get()->createDetector(); + + Try> slave = +StartSlave(detector.get(), , flags); + ASSERT_SOME(slave); + + AWAIT_READY(slaveRegisteredMessage); + + auto scheduler = std::make_shared(); + + EXPECT_CALL(*scheduler, connected(_)) +.WillOnce(v1::scheduler::SendSubscribe(v1::DEFAULT_FRAMEWORK_INFO)); + + Future subscribed; + EXPECT_CALL(*scheduler, subscribed(_, _)) +.WillOnce(FutureArg<1>()); + + Future offers; + EXPECT_CALL(*scheduler, offers(_, _)) +.WillOnce(FutureArg<1>()) 
+.WillRepeatedly(Return()); + + EXPECT_CALL(*scheduler, heartbeat(_)) +.WillRepeatedly(Return()); // Ignore heartbeats. + + v1::scheduler::TestMesos mesos( + master.get()->pid, + ContentType::PROTOBUF, + scheduler); + + AWAIT_READY(subscribed); + v1::FrameworkID frameworkId(subscribed->framework_id()); + + AWAIT_READY(offers); + ASSERT_FALSE(offers->offers().empty()); + + const v1::Offer& offer = offers->offers(0); + const v1::AgentID& agentId = offer.agent_id(); + + Try parsed = +v1::Resources::parse("cpus:0.1;mem:32;disk:32"); + + ASSERT_SOME(parsed); + + v1::Resources resources = parsed.get(); + + // Create a task which ignores SIGTERM so that we can detect + // when the task receives SIGKILL. + v1::TaskInfo taskInfo = v1::createTask( + agentId, + resources, + "trap \"echo 'SIGTERM received'\" SIGTERM; sleep 99"); + + // TODO(tnachen): Use local image to test if possible. + taskInfo.mutable_container()->CopyFrom( + evolve(createDockerInfo(DOCKER_TEST_IMAGE))); + + { +// Set a long grace period on the task's kill policy so that we +// can detect if the override is effective. +mesos::v1::DurationInfo gracePeriod; +gracePeriod.set_nanoseconds(Minutes(10).ns()); + +mesos::v1::KillPolicy killPolicy;
[mesos] 01/03: Moved the Docker executor declaration into a header.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit a1e4a9aa1d6f2dee9fd56432122c2fa6a35edb77 Author: Greg Mann AuthorDate: Thu Jul 25 12:17:41 2019 -0700 Moved the Docker executor declaration into a header. This moves the declaration of the Docker executor into the Docker executor header file and moves the code for the Docker executor binary into a new launcher implementation file. This change will enable the Mesos executor driver implementation to make use of the `DockerExecutor` symbol. Review: https://reviews.apache.org/r/71033/ --- src/CMakeLists.txt | 4 +- src/Makefile.am | 3 +- src/docker/CMakeLists.txt| 20 --- src/docker/executor.cpp | 348 +-- src/docker/executor.hpp | 53 ++ src/launcher/CMakeLists.txt | 5 + src/launcher/docker_executor.cpp | 266 ++ 7 files changed, 409 insertions(+), 290 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c455ed6..218a75e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -263,7 +263,8 @@ set(CSI_SRC csi/volume_manager.cpp) set(DOCKER_SRC - docker/docker.cpp) + docker/docker.cpp + docker/executor.cpp) if (NOT WIN32) list(APPEND DOCKER_SRC @@ -644,7 +645,6 @@ endif () ## add_subdirectory(checks) add_subdirectory(cli) -add_subdirectory(docker) add_subdirectory(examples) add_subdirectory(launcher) add_subdirectory(local) diff --git a/src/Makefile.am b/src/Makefile.am index 46c66f1..697ab10 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1092,6 +1092,7 @@ libmesos_no_3rdparty_la_SOURCES += \ docker/docker.cpp\ docker/docker.hpp\ docker/executor.hpp \ + docker/executor.cpp \ docker/spec.cpp \ examples/flags.hpp \ examples/test_anonymous_module.hpp \ @@ -1818,7 +1819,7 @@ mesos_usage_CPPFLAGS = $(MESOS_CPPFLAGS) mesos_usage_LDADD = libmesos.la $(LDADD) pkglibexec_PROGRAMS += mesos-docker-executor -mesos_docker_executor_SOURCES = docker/executor.cpp 
+mesos_docker_executor_SOURCES = launcher/docker_executor.cpp mesos_docker_executor_CPPFLAGS = $(MESOS_CPPFLAGS) mesos_docker_executor_LDADD = libmesos.la $(LDADD) diff --git a/src/docker/CMakeLists.txt b/src/docker/CMakeLists.txt deleted file mode 100644 index 1196664..000 --- a/src/docker/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# THE DOCKER EXECUTOR EXECUTABLE. 
-# -add_executable(mesos-docker-executor executor.cpp) -target_link_libraries(mesos-docker-executor PRIVATE mesos) diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp index f638e4b..de8216f 100644 --- a/src/docker/executor.cpp +++ b/src/docker/executor.cpp @@ -856,291 +856,105 @@ private: }; -class DockerExecutor : public Executor +DockerExecutor::DockerExecutor( +const Owned& docker, +const string& container, +const string& sandboxDirectory, +const string& mappedDirectory, +const Duration& shutdownGracePeriod, +const string& launcherDir, +const map& taskEnvironment, +const Option& defaultContainerDNS, +bool cgroupsEnableCfs) { -public: - DockerExecutor( - const Owned& docker, - const string& container, - const string& sandboxDirectory, - const string& mappedDirectory, - const Duration& shutdownGracePeriod, - const string& launcherDir, - const map& taskEnvironment, - const Option& defaultContainerDNS, - bool cgroupsEnableCfs) - { -process = Owned(new D
[mesos] 02/03: Enabled the Docker executor to accept kill policy overrides.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 4cbda17f3667e2b0713a1f1663e50819a076680b Author: Greg Mann AuthorDate: Thu Jul 25 12:17:43 2019 -0700 Enabled the Docker executor to accept kill policy overrides. This adds a new `killTask()` overload to the Docker executor and updates the Mesos executor driver to call into that overload when the loaded executor is the Docker executor. This allows the executor driver to pass the kill policy override, when present, into the Docker executor. Review: https://reviews.apache.org/r/71034/ --- src/docker/executor.cpp | 48 src/docker/executor.hpp | 5 + src/exec/exec.cpp | 22 ++ 3 files changed, 67 insertions(+), 8 deletions(-) diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp index de8216f..132f42b 100644 --- a/src/docker/executor.cpp +++ b/src/docker/executor.cpp @@ -396,15 +396,31 @@ public: defer(self(), ::launchHealthCheck, containerName, task)); } - void killTask(ExecutorDriver* driver, const TaskID& taskId) + void killTask( + ExecutorDriver* driver, + const TaskID& taskId, + const Option& killPolicyOverride = None()) { -LOG(INFO) << "Received killTask for task " << taskId.value(); +string overrideMessage = ""; +if (killPolicyOverride.isSome() && killPolicyOverride->has_grace_period()) { + Duration gracePeriodDuration = +Nanoseconds(killPolicyOverride->grace_period().nanoseconds()); + + overrideMessage = +" with grace period override of " + stringify(gracePeriodDuration); +} + +LOG(INFO) << "Received killTask" << overrideMessage + << " for task " << taskId.value(); // Using shutdown grace period as a default is backwards compatible // with the `stop_timeout` flag, deprecated in 1.0. 
Duration gracePeriod = shutdownGracePeriod; -if (killPolicy.isSome() && killPolicy->has_grace_period()) { +if (killPolicyOverride.isSome() && killPolicyOverride->has_grace_period()) { + gracePeriod = +Nanoseconds(killPolicyOverride->grace_period().nanoseconds()); +} else if (killPolicy.isSome() && killPolicy->has_grace_period()) { gracePeriod = Nanoseconds(killPolicy->grace_period().nanoseconds()); } @@ -929,7 +945,12 @@ void DockerExecutor::launchTask(ExecutorDriver* driver, const TaskInfo& task) void DockerExecutor::killTask(ExecutorDriver* driver, const TaskID& taskId) { - dispatch(process.get(), ::killTask, driver, taskId); + // Need to disambiguate overloaded function. + void (DockerExecutorProcess::*killTaskMethod)( + ExecutorDriver*, const TaskID&, const Option&) += ::killTask; + + process::dispatch(process.get(), killTaskMethod, driver, taskId, None()); } @@ -955,6 +976,25 @@ void DockerExecutor::error(ExecutorDriver* driver, const string& data) dispatch(process.get(), ::error, driver, data); } + +void DockerExecutor::killTask( +ExecutorDriver* driver, +const TaskID& taskId, +const Option& killPolicyOverride) +{ + // Need to disambiguate overloaded function. 
+ void (DockerExecutorProcess::*killTaskMethod)( + ExecutorDriver*, const TaskID&, const Option&) += ::killTask; + + process::dispatch( + process.get(), + killTaskMethod, + driver, + taskId, + killPolicyOverride); +} + } // namespace docker { } // namespace internal { } // namespace mesos { diff --git a/src/docker/executor.hpp b/src/docker/executor.hpp index dfb8ad0..768c2e1 100644 --- a/src/docker/executor.hpp +++ b/src/docker/executor.hpp @@ -151,6 +151,11 @@ public: void error(ExecutorDriver* driver, const std::string& data) override; + void killTask( + ExecutorDriver* driver, + const TaskID& taskId, + const Option& killPolicyOverride); + private: process::Owned process; }; diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp index c0fa3b6..67e082e 100644 --- a/src/exec/exec.cpp +++ b/src/exec/exec.cpp @@ -47,6 +47,8 @@ #include "common/protobuf_utils.hpp" +#include "docker/executor.hpp" + #include "logging/flags.hpp" #include "logging/logging.hpp" @@ -183,8 +185,7 @@ public: ::task); install( -::killTask, -::task_id); +::killTask); install( ::statusUpdateAcknowledgement, @@ -339,8 +340,10 @@ protected: VLOG(1) << "Executor::launchTask took " << stopwatch.elapsed(); } - void killTask(const TaskID& taskId) + void killTask(KillTaskMessage&& killTa
[mesos] branch master updated: Exposed agent drain information in the webui.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new 4b15fbd Exposed agent drain information in the webui. 4b15fbd is described below commit 4b15fbdde14eed3a6dbd5c95d271bc26eb7216e2 Author: Benjamin Bannier AuthorDate: Thu Jul 25 09:35:40 2019 -0700 Exposed agent drain information in the webui. Review: https://reviews.apache.org/r/71081/ --- src/webui/app/agents/agent.html | 10 ++ src/webui/app/agents/agents.html | 2 ++ src/webui/app/controllers.js | 11 +++ 3 files changed, 23 insertions(+) diff --git a/src/webui/app/agents/agent.html b/src/webui/app/agents/agent.html index 6d50bfd..25d233b 100644 --- a/src/webui/app/agents/agent.html +++ b/src/webui/app/agents/agent.html @@ -56,6 +56,16 @@ + +Draining + + Mark gone: + {{agent.drain_config.mark_gone}} + Max. grace period: + {{agent.drain_config.max_grace_period.nanoseconds / 10}} seconds + + + Tasks diff --git a/src/webui/app/agents/agents.html b/src/webui/app/agents/agents.html index 98712c6..0c6d330 100644 --- a/src/webui/app/agents/agents.html +++ b/src/webui/app/agents/agents.html @@ -13,6 +13,7 @@ ID Host + State CPUs (Allocated / Total) GPUs (Allocated / Total) Mem (Allocated / Total) @@ -34,6 +35,7 @@ {{agent.hostname}} +{{agent.state}} {{agent.used_resources.cpus | number}} / {{agent.resources.cpus | number}} diff --git a/src/webui/app/controllers.js b/src/webui/app/controllers.js index 66cd32e..725230f 100644 --- a/src/webui/app/controllers.js +++ b/src/webui/app/controllers.js @@ -198,6 +198,17 @@ $scope.unreachable_agents = $scope.state.unreachable_slaves; _.each($scope.state.slaves, function(agent) { + // Calculate the agent "state" from activation and drain state. + if (!agent.deactivated) { +agent.state = "Active"; + } else if (agent.drain_info) { +// Transform the drain state so only the first letter is capitalized. 
+var s = agent.drain_info.state; +agent.state = s.charAt(0).toUpperCase() + s.slice(1).toLowerCase(); + } else { +agent.state = "Deactivated"; + } + $scope.agents[agent.id] = agent; $scope.total_cpus += agent.resources.cpus; $scope.total_gpus += agent.resources.gpus;
[mesos] branch master updated: Fixed devolution of 'max_grace_period' field in DRAIN_AGENT call.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git The following commit(s) were added to refs/heads/master by this push: new ff8c9a9 Fixed devolution of 'max_grace_period' field in DRAIN_AGENT call. ff8c9a9 is described below commit ff8c9a96be6ae1ee47faf9d5b80a518dfb4a3db0 Author: Greg Mann AuthorDate: Mon Jul 22 18:48:56 2019 -0700 Fixed devolution of 'max_grace_period' field in DRAIN_AGENT call. Review: https://reviews.apache.org/r/71140 --- src/internal/devolve.cpp | 14 +- src/tests/api_tests.cpp | 2 ++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/internal/devolve.cpp b/src/internal/devolve.cpp index 2809c25..4527c52 100644 --- a/src/internal/devolve.cpp +++ b/src/internal/devolve.cpp @@ -280,7 +280,19 @@ mesos::agent::Response devolve(const v1::agent::Response& response) mesos::master::Call devolve(const v1::master::Call& call) { - return devolve(call); + mesos::master::Call _call = devolve(call); + + // The `google.protobuf.Duration` field in the `DrainAgent` call does not get + // devolved automatically with the templated helper, so we devolve it + // explicitly here. 
+ if (call.type() == v1::master::Call::DRAIN_AGENT && + call.has_drain_agent() && + call.drain_agent().has_max_grace_period()) { +*_call.mutable_drain_agent()->mutable_max_grace_period() = + devolve(call.drain_agent().max_grace_period()); + } + + return _call; } } // namespace internal { diff --git a/src/tests/api_tests.cpp b/src/tests/api_tests.cpp index 3479ed3..641eb15 100644 --- a/src/tests/api_tests.cpp +++ b/src/tests/api_tests.cpp @@ -5588,6 +5588,7 @@ TEST_P(MasterAPITest, DrainAgent) { v1::master::Call::DrainAgent drainAgent; drainAgent.mutable_agent_id()->CopyFrom(agentId); +drainAgent.mutable_max_grace_period()->set_seconds(0); v1::master::Call call; call.set_type(v1::master::Call::DRAIN_AGENT); @@ -5605,6 +5606,7 @@ TEST_P(MasterAPITest, DrainAgent) mesos::v1::DrainInfo drainInfo; drainInfo.set_state(mesos::v1::DRAINED); drainInfo.mutable_config()->set_mark_gone(false); + drainInfo.mutable_config()->mutable_max_grace_period()->set_nanoseconds(0); // Ensure that the agent's drain info is reflected in the master's // GET_AGENTS response.
[mesos] 05/14: Updated an equality operator.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 27f0cd3519bafaf058e8347d482475e776d494e1 Author: Greg Mann AuthorDate: Mon Jul 15 10:25:47 2019 -0700 Updated an equality operator. This patch updates the equality operator for the `Task` message to include two missing conditions. An equality operator for `HealthCheck` is also added to make this possible. Review: https://reviews.apache.org/r/70900/ --- include/mesos/type_utils.hpp | 1 + src/common/type_utils.cpp| 10 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/mesos/type_utils.hpp b/include/mesos/type_utils.hpp index ed9190b..b9e6164 100644 --- a/include/mesos/type_utils.hpp +++ b/include/mesos/type_utils.hpp @@ -62,6 +62,7 @@ bool operator==( bool operator==(const DiscoveryInfo& left, const DiscoveryInfo& right); bool operator==(const Environment& left, const Environment& right); bool operator==(const ExecutorInfo& left, const ExecutorInfo& right); +bool operator==(const HealthCheck& left, const HealthCheck& right); bool operator==(const Label& left, const Label& right); bool operator==(const Labels& left, const Labels& right); bool operator==(const MasterInfo& left, const MasterInfo& right); diff --git a/src/common/type_utils.cpp b/src/common/type_utils.cpp index a7eb0e9..16d6657 100644 --- a/src/common/type_utils.cpp +++ b/src/common/type_utils.cpp @@ -400,6 +400,12 @@ bool operator!=(const ExecutorInfo& left, const ExecutorInfo& right) } +bool operator==(const HealthCheck& left, const HealthCheck& right) +{ + return google::protobuf::util::MessageDifferencer::Equals(left, right); +} + + bool operator==(const MasterInfo& left, const MasterInfo& right) { return left.id() == right.id() && @@ -575,7 +581,9 @@ bool operator==(const Task& left, const Task& right) left.status_update_uuid() == right.status_update_uuid() && left.labels() == right.labels() && 
left.discovery() == right.discovery() && -left.user() == right.user(); +left.user() == right.user() && +left.container() == right.container() && +left.health_check() == right.health_check(); }
[mesos] branch master updated (a32fd27 -> c076c8c)
This is an automated email from the ASF dual-hosted git repository. grag pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git. from a32fd27 Updated 3 unit tests by changing IO switchboard to local mode. new 3c959eb Added minimal agent handler for 'DrainSlaveMessage'. new 7d08b66 Added the DrainConfig to agent API outputs. new 04d25af Added test for DrainConfig in agent API outputs. new ef19f29 Refactored the agent's task-killing code. new 27f0cd3 Updated an equality operator. new 3bb8287 Added kill policy to the 'Task' message. new e1c7985 Killed all tasks on the agent when draining. new 505928a Added tests for task killing when draining the agent. new 1a32b31 Fixed pid checkpointing for `TestContainerizer`. new 54fb43e Added recovery of agent drain information. new 1889268 Adjusted task status updates during draining. new a7044bd Changed agent to fail task launches received during draining. new 654faf9 Cleared agent drain state when draining is finished. new c076c8c Added test for agent to leave draining state on its own. The 14 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: include/mesos/agent/agent.proto| 2 + include/mesos/mesos.proto | 4 + include/mesos/type_utils.hpp | 8 + include/mesos/v1/agent/agent.proto | 2 + include/mesos/v1/mesos.proto | 4 + src/common/protobuf_utils.cpp | 4 + src/common/type_utils.cpp | 17 +- src/slave/http.cpp | 11 + src/slave/paths.cpp| 9 + src/slave/paths.hpp| 6 + src/slave/slave.cpp| 366 ++ src/slave/slave.hpp| 31 +- src/slave/state.cpp| 16 + src/slave/state.hpp| 3 + src/tests/containerizer.cpp| 12 + src/tests/slave_tests.cpp | 756 + 16 files changed, 1173 insertions(+), 78 deletions(-)
[mesos] 08/14: Added tests for task killing when draining the agent.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 505928a3f51555bd3e45f2fc9787fdf890b28bfb Author: Greg Mann AuthorDate: Mon Jul 15 10:25:56 2019 -0700 Added tests for task killing when draining the agent. Review: https://reviews.apache.org/r/70904/ --- src/tests/slave_tests.cpp | 335 ++ 1 file changed, 335 insertions(+) diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp index 8098a1a..147967d 100644 --- a/src/tests/slave_tests.cpp +++ b/src/tests/slave_tests.cpp @@ -94,6 +94,8 @@ #include "tests/resources_utils.hpp" #include "tests/utils.hpp" +#include "tests/containerizer/mock_containerizer.hpp" + using namespace mesos::internal::slave; #ifdef USE_SSL_SOCKET @@ -11881,6 +11883,339 @@ TEST_F(SlaveTest, DrainInfoInAPIOutputs) } } + +// When an agent receives a `DrainSlaveMessage`, it should kill running tasks. +TEST_F(SlaveTest, DrainAgentKillsRunningTask) +{ + Clock::pause(); + + Try> master = StartMaster(); + ASSERT_SOME(master); + + Future updateSlaveMessage = +FUTURE_PROTOBUF(UpdateSlaveMessage(), _, _); + + StandaloneMasterDetector detector(master.get()->pid); + + slave::Flags slaveFlags = CreateSlaveFlags(); + + Try> slave = StartSlave(, slaveFlags); + ASSERT_SOME(slave); + + Clock::advance(slaveFlags.registration_backoff_factor); + + AWAIT_READY(updateSlaveMessage); + + auto scheduler = std::make_shared(); + + EXPECT_CALL(*scheduler, connected(_)) +.WillOnce(v1::scheduler::SendSubscribe(v1::DEFAULT_FRAMEWORK_INFO)); + + Future subscribed; + EXPECT_CALL(*scheduler, subscribed(_, _)) +.WillOnce(FutureArg<1>()); + + EXPECT_CALL(*scheduler, heartbeat(_)) +.WillRepeatedly(Return()); // Ignore heartbeats. + + Future offers; + EXPECT_CALL(*scheduler, offers(_, _)) +.WillOnce(FutureArg<1>()) +.WillRepeatedly(Return()); // Ignore subsequent offers. 
+ + v1::scheduler::TestMesos mesos( + master.get()->pid, + ContentType::PROTOBUF, + scheduler); + + AWAIT_READY(subscribed); + + v1::FrameworkID frameworkId(subscribed->framework_id()); + + AWAIT_READY(offers); + ASSERT_FALSE(offers->offers().empty()); + + const v1::Offer& offer = offers->offers(0); + const v1::AgentID& agentId = offer.agent_id(); + + Future startingUpdate; + Future runningUpdate; + EXPECT_CALL(*scheduler, update(_, _)) +.WillOnce(DoAll( +FutureArg<1>(), +v1::scheduler::SendAcknowledge(frameworkId, agentId))) +.WillOnce(DoAll( +FutureArg<1>(), +v1::scheduler::SendAcknowledge(frameworkId, agentId))); + + v1::Resources resources = +v1::Resources::parse("cpus:0.1;mem:32;disk:32").get(); + + v1::TaskInfo taskInfo = +v1::createTask(agentId, resources, SLEEP_COMMAND(1000)); + + v1::Offer::Operation launch = v1::LAUNCH({taskInfo}); + + mesos.send( + v1::createCallAccept( + frameworkId, + offer, + {launch})); + + AWAIT_READY(startingUpdate); + EXPECT_EQ(v1::TASK_STARTING, startingUpdate->status().state()); + + AWAIT_READY(runningUpdate); + EXPECT_EQ(v1::TASK_RUNNING, runningUpdate->status().state()); + + Future killedUpdate; + EXPECT_CALL(*scheduler, update(_, _)) +.WillOnce(FutureArg<1>()); + + // Simulate the master sending a `DrainSlaveMessage` to the agent. + + // Immediately kill the task forcefully. + DurationInfo maxGracePeriod; + maxGracePeriod.set_nanoseconds(0); + + DrainConfig drainConfig; + drainConfig.set_mark_gone(true); + drainConfig.mutable_max_grace_period()->CopyFrom(maxGracePeriod); + + DrainSlaveMessage drainSlaveMessage; + drainSlaveMessage.mutable_config()->CopyFrom(drainConfig); + + process::post(master.get()->pid, slave.get()->pid, drainSlaveMessage); + + AWAIT_READY(killedUpdate); + + EXPECT_EQ(v1::TASK_KILLED, killedUpdate->status().state()); +} + + +// When the agent receives a `DrainSlaveMessage`, it should kill queued tasks. 
+TEST_F(SlaveTest, DrainAgentKillsQueuedTask) +{ + Clock::pause(); + + Try> master = StartMaster(); + ASSERT_SOME(master); + + Future updateSlaveMessage = +FUTURE_PROTOBUF(UpdateSlaveMessage(), _, _); + + MockContainerizer mockContainerizer; + StandaloneMasterDetector detector(master.get()->pid); + slave::Flags slaveFlags = CreateSlaveFlags(); + + EXPECT_CALL(mockContainerizer, recover(_)) +.WillOnce(Return(Nothing())); + + EXPECT_CALL(mockContainerizer, containers()) +.WillOnce(Return(hashset())); + + Try> slave = StartSlave( + , + , + slaveFlags); + ASSERT_SOME(slave); + + Clock::advance(slaveFlags.registration_backoff_factor); + + AWAIT_READY(updateSlaveMessage); + + auto scheduler = std::make_shared(); + + EXPECT_CALL(*scheduler, connected(_)) +.
[mesos] 02/14: Added the DrainConfig to agent API outputs.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 7d08b667e446840dc31538d9d40705e3d8fb12a0 Author: Greg Mann AuthorDate: Mon Jul 15 10:25:35 2019 -0700 Added the DrainConfig to agent API outputs. Review: https://reviews.apache.org/r/70835/ --- include/mesos/agent/agent.proto| 2 ++ include/mesos/v1/agent/agent.proto | 2 ++ src/slave/http.cpp | 11 +++ 3 files changed, 15 insertions(+) diff --git a/include/mesos/agent/agent.proto b/include/mesos/agent/agent.proto index 83eb7bb..3cb622d 100644 --- a/include/mesos/agent/agent.proto +++ b/include/mesos/agent/agent.proto @@ -569,6 +569,8 @@ message Response { // Contains the agent's information. message GetAgent { optional SlaveInfo slave_info = 1; + +optional DrainConfig drain_config = 2; } // Lists information about all resource providers known to the agent diff --git a/include/mesos/v1/agent/agent.proto b/include/mesos/v1/agent/agent.proto index f6574cb..4324ad6 100644 --- a/include/mesos/v1/agent/agent.proto +++ b/include/mesos/v1/agent/agent.proto @@ -569,6 +569,8 @@ message Response { // Contains the agent's information. 
message GetAgent { optional AgentInfo agent_info = 1; + +optional DrainConfig drain_config = 2; } // Lists information about all resource providers known to the agent diff --git a/src/slave/http.cpp b/src/slave/http.cpp index 69e6d74..321dca7 100644 --- a/src/slave/http.cpp +++ b/src/slave/http.cpp @@ -1331,6 +1331,12 @@ Future Http::state( writer->field("domain", slave->info.domain()); } +if (slave->drainConfig.isSome()) { + writer->field( + "drain_config", + JSON::Protobuf(slave->drainConfig.get())); +} + const Resources& totalResources = slave->totalResources; writer->field("resources", totalResources); @@ -1842,6 +1848,11 @@ Future Http::getAgent( response.mutable_get_agent()->mutable_slave_info()->CopyFrom(slave->info); + if (slave->drainConfig.isSome()) { +response.mutable_get_agent()->mutable_drain_config()->CopyFrom( +slave->drainConfig.get()); + } + return OK(serialize(acceptType, evolve(response)), stringify(acceptType)); }
[mesos] 07/14: Killed all tasks on the agent when draining.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit e1c7985e96d84693f3e41d3a50da5f5ea11b6cd8 Author: Greg Mann AuthorDate: Mon Jul 15 10:25:51 2019 -0700 Killed all tasks on the agent when draining. This patch updates the agent's `DrainSlaveMessage` handler to kill all tasks on the agent when the message is received. Review: https://reviews.apache.org/r/70903/ --- include/mesos/type_utils.hpp | 6 + src/slave/slave.cpp | 62 2 files changed, 68 insertions(+) diff --git a/include/mesos/type_utils.hpp b/include/mesos/type_utils.hpp index 2fd8a62..98a2995 100644 --- a/include/mesos/type_utils.hpp +++ b/include/mesos/type_utils.hpp @@ -338,6 +338,12 @@ inline bool operator<(const ContainerID& left, const ContainerID& right) } +inline bool operator<(const DurationInfo& left, const DurationInfo& right) +{ + return left.nanoseconds() < right.nanoseconds(); +} + + inline bool operator<(const ExecutorID& left, const ExecutorID& right) { return left.value() < right.value(); diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp index 741c1f6..19b4769 100644 --- a/src/slave/slave.cpp +++ b/src/slave/slave.cpp @@ -999,6 +999,68 @@ void Slave::drain( << "Failed to checkpoint DrainConfig"; drainConfig = drainSlaveMessage.config(); + + const Option<DurationInfo> maxGracePeriod = +drainConfig->has_max_grace_period() + ? drainConfig->max_grace_period() + : Option<DurationInfo>::none(); + + auto calculateKillPolicy = +[&](const Option<KillPolicy>& killPolicy) -> Option<KillPolicy> { + if (maxGracePeriod.isNone()) { +return None(); + } + + KillPolicy killPolicyOverride; + killPolicyOverride.mutable_grace_period()->CopyFrom(maxGracePeriod.get()); + + // Task kill policy is not set or unknown. + if (killPolicy.isNone() || !killPolicy->has_grace_period()) { +return killPolicyOverride; + } + + // Task kill policy is greater than the override. 
+ if (maxGracePeriod.get() < killPolicy->grace_period()) { +return killPolicyOverride; + } + + return None(); +}; + + // Frameworks may be removed within `kill()` or `killPendingTask()` below, + // so we must copy them and their members before looping. + foreachvalue (Framework* framework, utils::copy(frameworks)) { +typedef hashmap<TaskID, TaskInfo> TaskMap; +foreachvalue (const TaskMap& tasks, utils::copy(framework->pendingTasks)) { + foreachvalue (const TaskInfo& task, tasks) { +killPendingTask(framework->id(), framework, task.task_id()); + } +} + +foreachvalue (Executor* executor, utils::copy(framework->executors)) { + foreachvalue (Task* task, executor->launchedTasks) { +kill(framework->id(), + framework, + executor, + task->task_id(), + calculateKillPolicy( +task->has_kill_policy() + ? task->kill_policy() + : Option<KillPolicy>::none())); + } + + foreachvalue (const TaskInfo& task, utils::copy(executor->queuedTasks)) { +kill(framework->id(), + framework, + executor, + task.task_id(), + calculateKillPolicy( +task.has_kill_policy() + ? task.kill_policy() + : Option<KillPolicy>::none())); + } +} + } }
[mesos] 13/14: Cleared agent drain state when draining is finished.
This is an automated email from the ASF dual-hosted git repository. grag pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git commit 654faf9244b0016f8a17623aca7812923b3a313a Author: Benjamin Bannier AuthorDate: Mon Jul 15 10:26:23 2019 -0700 Cleared agent drain state when draining is finished. Once a draining agent has neither frameworks with pending tasks nor any executors with either queued or launched tasks it has finished draining. This patch adds handling of that case which clears both the in-memory and persisted drain configuration. Review: https://reviews.apache.org/r/70959/ --- src/slave/slave.cpp | 31 +++ src/slave/slave.hpp | 4 2 files changed, 35 insertions(+) diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp index eecd71e..2477975 100644 --- a/src/slave/slave.cpp +++ b/src/slave/slave.cpp @@ -7067,6 +7067,8 @@ void Slave::removeFramework(Framework* framework) // Pass ownership of the framework pointer. completedFrameworks.set(framework->id(), Owned<Framework>(framework)); + updateDrainStatus(); + if (state == TERMINATING && frameworks.empty()) { terminate(self()); } @@ -8944,6 +8946,8 @@ void Slave::removeOperation(Operation* operation) checkpointResourceState( totalResources.filter(mesos::needCheckpointing), false); + + updateDrainStatus(); } @@ -9768,6 +9772,33 @@ void Slave::initializeResourceProviderManager( } +void Slave::updateDrainStatus() +{ + if (drainConfig.isNone()) { +return; + } + + bool drained = operations.empty() && frameworks.empty(); + + if (!drained) { +return; + } + + LOG(INFO) << "Agent finished draining"; + + const string drainConfigPath = paths::getDrainConfigPath(metaDir, info.id()); + + Try<Nothing> rm = os::rm(drainConfigPath); + + if (rm.isError()) { +EXIT(EXIT_FAILURE) << "Could not remove persisted drain configuration " + << "'" << drainConfigPath << "': " << rm.error(); + } + + drainConfig = None(); +} + + Framework::Framework( Slave* _slave, const Flags& slaveFlags, diff --git a/src/slave/slave.hpp 
b/src/slave/slave.hpp index 58bdd2a..58a5608 100644 --- a/src/slave/slave.hpp +++ b/src/slave/slave.hpp @@ -910,6 +910,10 @@ private: // If the agent is currently draining, contains the configuration used to // drain the agent. If NONE, the agent is not currently draining. Option<DrainConfig> drainConfig; + + // Check whether draining is finished and possibly remove + // both in-memory and persisted drain configuration. + void updateDrainStatus(); };