[mesos] branch master updated: Added tests for 'volume/csi' isolator recovery.

2020-09-04 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new a8059a7  Added tests for 'volume/csi' isolator recovery.
a8059a7 is described below

commit a8059a78473774e3d95e8e908f360ee5e9aadd0d
Author: Greg Mann 
AuthorDate: Fri Sep 4 10:39:10 2020 -0700

Added tests for 'volume/csi' isolator recovery.

Review: https://reviews.apache.org/r/72806/
---
 .../containerizer/volume_csi_isolator_tests.cpp| 360 +
 1 file changed, 360 insertions(+)

diff --git a/src/tests/containerizer/volume_csi_isolator_tests.cpp b/src/tests/containerizer/volume_csi_isolator_tests.cpp
index dafb0b7..d51d3c9 100644
--- a/src/tests/containerizer/volume_csi_isolator_tests.cpp
+++ b/src/tests/containerizer/volume_csi_isolator_tests.cpp
@@ -1117,6 +1117,366 @@ TEST_P(VolumeCSIIsolatorTest, ROOT_UnmanagedPlugin)
   AWAIT_READY(finishedUpdate);
 }
 
+
+// When the agent fails over while a CSI volume is mounted to a container, the
+// agent should recover the volume state so that the volume can be successfully
+// unpublished after agent recovery is complete.
+TEST_P(VolumeCSIIsolatorTest, ROOT_INTERNET_CURL_UnpublishAfterAgentFailover)
+{
+  createCsiPluginConfig(Bytes(0), TEST_VOLUME_ID + ":1MB");
+
+  Try> master = StartMaster();
+  ASSERT_SOME(master);
+
+  Future slaveRegisteredMessage =
+FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _);
+
+  Owned detector = master.get()->createDetector();
+
+  slave::Flags agentFlags = CreateSlaveFlags();
+
+  Fetcher fetcher(agentFlags);
+
+  // Use a consistent ID across agent restart so that the executor can register.
+  string processId = process::ID::generate("slave");
+
+  SlaveOptions agentOptions = SlaveOptions(detector.get())
+.withId(processId)
+.withFlags(agentFlags);
+
+  Try> agent = StartSlave(agentOptions);
+  ASSERT_SOME(agent);
+
+  AWAIT_READY(slaveRegisteredMessage);
+
+  auto scheduler = std::make_shared();
+
+  v1::FrameworkInfo frameworkInfo = v1::DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.set_checkpoint(true);
+
+  EXPECT_CALL(*scheduler, connected(_))
+.WillOnce(v1::scheduler::SendSubscribe(frameworkInfo));
+
+  Future subscribed;
+  EXPECT_CALL(*scheduler, subscribed(_, _))
+.WillOnce(FutureArg<1>());
+
+  Future offers;
+  EXPECT_CALL(*scheduler, offers(_, _))
+.WillOnce(FutureArg<1>());
+
+  EXPECT_CALL(*scheduler, heartbeat(_))
+.WillRepeatedly(Return()); // Ignore heartbeats.
+
+  v1::scheduler::TestMesos mesos(
+  master.get()->pid,
+  ContentType::PROTOBUF,
+  scheduler);
+
+  AWAIT_READY(subscribed);
+  v1::FrameworkID frameworkId(subscribed->framework_id());
+
+  v1::Resources resources =
+v1::Resources::parse("cpus:0.1;mem:32;disk:32").get();
+
+  v1::ExecutorInfo executorInfo = v1::createExecutorInfo(
+  v1::DEFAULT_EXECUTOR_ID,
+  None(),
+  resources,
+  v1::ExecutorInfo::DEFAULT,
+  frameworkId);
+
+  AWAIT_READY(offers);
+  ASSERT_FALSE(offers->offers().empty());
+
+  v1::Offer offer = offers->offers(0);
+  const v1::AgentID& agentId = offer.agent_id();
+
+  // Run a command which will loop until a file disappears. This allows us to
+  // terminate the task after agent failover.
+  Try taskCommand = strings::format(
+  "touch %s && while [ -e %s ]; do : sleep 0.01 ; done",
+  TEST_CONTAINER_PATH + TEST_OUTPUT_FILE,
+  TEST_CONTAINER_PATH + TEST_OUTPUT_FILE);
+
+  v1::TaskInfo taskInfo = v1::createTask(agentId, resources, 
taskCommand.get());
+
+  taskInfo.mutable_container()->CopyFrom(v1::createContainerInfo(
+  "alpine",
+  {v1::createVolumeCsi(
+  TEST_CSI_PLUGIN_TYPE,
+  TEST_VOLUME_ID,
+  TEST_CONTAINER_PATH,
+  mesos::v1::Volume::Source::CSIVolume::VolumeCapability
+::AccessMode::SINGLE_NODE_WRITER,
+  false)}));
+
+  Future startingUpdate;
+  Future runningUpdate;
+  Future finishedUpdate;
+
+  testing::Sequence taskSequence;
+  EXPECT_CALL(
+  *scheduler,
+  update(_, TaskStatusUpdateStateEq(v1::TASK_STARTING)))
+.InSequence(taskSequence)
+.WillOnce(DoAll(
+FutureArg<1>(),
+v1::scheduler::SendAcknowledge(frameworkId, agentId)))
+.WillRepeatedly(v1::scheduler::SendAcknowledge(frameworkId, agentId));
+
+  EXPECT_CALL(
+  *scheduler,
+  update(_, TaskStatusUpdateStateEq(v1::TASK_RUNNING)))
+.InSequence(taskSequence)
+.WillOnce(DoAll(
+FutureArg<1>(),
+v1::scheduler::SendAcknowledge(frameworkId, agentId)))
+.WillRepeatedly(v1::scheduler::SendAcknowledge(frameworkId, agentId));
+
+  EXPECT_CALL(
+  *scheduler,
+  update(_, TaskStatusUpdateStateEq(v1::TASK_FINISHED)))
+.InSequence(taskSequence)
+.WillOnce(DoAll(

[mesos] branch master updated (2bf7f5d -> fc22984)

2020-09-03 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from 2bf7f5d  Added a test of UPDATE_FRAMEWORK with the same FrameworkInfo.
 new a3fe939  Updated the test CSI plugin for CSI server testing.
 new f0ce0f1  Added a test helper for CSI volumes.
 new fc22984  Added tests for the 'volume/csi' isolator.

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/Makefile.am|1 +
 src/examples/test_csi_plugin.cpp   |   94 +-
 src/tests/CMakeLists.txt   |1 +
 src/tests/cluster.cpp  |2 +-
 .../containerizer/volume_csi_isolator_tests.cpp| 1122 
 src/tests/mesos.hpp|   76 ++
 6 files changed, 1275 insertions(+), 21 deletions(-)
 create mode 100644 src/tests/containerizer/volume_csi_isolator_tests.cpp



[mesos] 03/03: Added tests for the 'volume/csi' isolator.

2020-09-03 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit fc22984de558302029a8cad0655e375653208448
Author: Greg Mann 
AuthorDate: Thu Sep 3 12:06:38 2020 -0700

Added tests for the 'volume/csi' isolator.

Review: https://reviews.apache.org/r/72728/
---
 src/Makefile.am|1 +
 src/tests/CMakeLists.txt   |1 +
 src/tests/cluster.cpp  |2 +-
 .../containerizer/volume_csi_isolator_tests.cpp| 1122 
 4 files changed, 1125 insertions(+), 1 deletion(-)

diff --git a/src/Makefile.am b/src/Makefile.am
index 673ea6c..c2da4e9 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -2873,6 +2873,7 @@ mesos_tests_SOURCES +=
\
   tests/containerizer/runtime_isolator_tests.cpp   \
   tests/containerizer/sched_tests.cpp  \
   tests/containerizer/setns_test_helper.cpp\
+  tests/containerizer/volume_csi_isolator_tests.cpp\
   tests/containerizer/volume_host_path_isolator_tests.cpp  \
   tests/containerizer/volume_image_isolator_tests.cpp  \
   tests/containerizer/volume_secret_isolator_tests.cpp
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
index 6b420d0..6beb74e 100644
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -247,6 +247,7 @@ if (LINUX)
 containerizer/rootfs.cpp
 containerizer/runtime_isolator_tests.cpp
 containerizer/sched_tests.cpp
+containerizer/volume_csi_isolator_tests.cpp
 containerizer/volume_host_path_isolator_tests.cpp
 containerizer/volume_image_isolator_tests.cpp
 containerizer/volume_secret_isolator_tests.cpp)
diff --git a/src/tests/cluster.cpp b/src/tests/cluster.cpp
index 3c86855..d547cbb 100644
--- a/src/tests/cluster.cpp
+++ b/src/tests/cluster.cpp
@@ -537,7 +537,7 @@ Try> Slave::create(
 const process::http::URL agentUrl(
 scheme,
 process::address().ip,
-flags.port,
+process::address().port,
 processId + "/api/v1");
 
 Try> _csiServer = slave::CSIServer::create(
diff --git a/src/tests/containerizer/volume_csi_isolator_tests.cpp b/src/tests/containerizer/volume_csi_isolator_tests.cpp
new file mode 100644
index 000..dafb0b7
--- /dev/null
+++ b/src/tests/containerizer/volume_csi_isolator_tests.cpp
@@ -0,0 +1,1122 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+
+#include 
+#include 
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#ifdef USE_SSL_SOCKET
+#include "authentication/executor/jwt_secret_generator.hpp"
+#endif // USE_SSL_SOCKET
+
+#include "csi/paths.hpp"
+
+#include "master/flags.hpp"
+
+#include "slave/csi_server.hpp"
+#include "slave/flags.hpp"
+#include "slave/paths.hpp"
+
+#include "slave/containerizer/fetcher.hpp"
+
+#include "slave/containerizer/mesos/containerizer.hpp"
+#include "slave/containerizer/mesos/paths.hpp"
+
+#include "tests/environment.hpp"
+#include "tests/mesos.hpp"
+
+#ifdef USE_SSL_SOCKET
+using mesos::authentication::executor::JWTSecretGenerator;
+#endif // USE_SSL_SOCKET
+
+using mesos::internal::slave::CSIServer;
+using mesos::internal::slave::Fetcher;
+using mesos::internal::slave::MesosContainerizer;
+
+using mesos::internal::slave::containerizer::paths::getContainerPid;
+
+using mesos::master::detector::MasterDetector;
+
+using process::Clock;
+using process::Future;
+using process::Owned;
+
+using std::list;
+using std::string;
+using std::vector;
+
+using testing::AllOf;
+using testing::AnyOf;
+using testing::DoAll;
+
+namespace mesos {
+namespace internal {
+namespace tests {
+
+const string TEST_CONTAINER_PATH = "volume-container-path/";
+const string TEST_CSI_PLUGIN_TYPE = "org.apa

[mesos] 02/03: Added a test helper for CSI volumes.

2020-09-03 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit f0ce0f1d8601228f16efbb98420693af42b19d43
Author: Greg Mann 
AuthorDate: Thu Sep 3 12:06:34 2020 -0700

Added a test helper for CSI volumes.

Review: https://reviews.apache.org/r/72805/
---
 src/tests/mesos.hpp | 76 +
 1 file changed, 76 insertions(+)

diff --git a/src/tests/mesos.hpp b/src/tests/mesos.hpp
index 8f89d7c..49abfc2 100644
--- a/src/tests/mesos.hpp
+++ b/src/tests/mesos.hpp
@@ -853,6 +853,66 @@ inline TVolume createVolumeFromDockerImage(
 }
 
 
+template 
+inline TVolume createVolumeCsi(
+const std::string& pluginName,
+const std::string volumeId,
+const std::string& containerPath,
+const typename TVolume::Source::CSIVolume::VolumeCapability
+  ::AccessMode::Mode mode,
+bool readonly)
+{
+  TVolume volume;
+  volume.set_container_path(containerPath);
+
+  typename TVolume::Source* source = volume.mutable_source();
+  source->set_type(TVolume::Source::CSI_VOLUME);
+  source->mutable_csi_volume()->set_plugin_name(pluginName);
+
+  typename TVolume::Source::CSIVolume::StaticProvisioning* staticInfo =
+source->mutable_csi_volume()->mutable_static_provisioning();
+
+  staticInfo->set_volume_id(volumeId);
+  staticInfo->set_readonly(readonly);
+  staticInfo->mutable_volume_capability()->mutable_mount();
+  staticInfo->mutable_volume_capability()
+->mutable_access_mode()->set_mode(mode);
+
+  typedef typename TVolume::Source::CSIVolume::VolumeCapability::AccessMode
+CSIAccessMode;
+
+  // Set the top-level `mode` field of the volume based on the values of the
+  // CSI access mode and the `readonly` field.
+  typename TVolume::Mode mesosMode;
+
+  switch (mode) {
+case CSIAccessMode::SINGLE_NODE_WRITER:
+case CSIAccessMode::MULTI_NODE_SINGLE_WRITER:
+case CSIAccessMode::MULTI_NODE_MULTI_WRITER: {
+  if (readonly) {
+mesosMode = TVolume::RO;
+  } else {
+mesosMode = TVolume::RW;
+  }
+
+  break;
+}
+
+case CSIAccessMode::SINGLE_NODE_READER_ONLY:
+case CSIAccessMode::MULTI_NODE_READER_ONLY:
+default: {
+  mesosMode = TVolume::RO;
+
+  break;
+}
+  }
+
+  volume.set_mode(mesosMode);
+
+  return volume;
+}
+
+
 template 
 inline TNetworkInfo createNetworkInfo(
 const std::string& networkName)
@@ -1745,6 +1805,14 @@ inline Volume createVolumeFromDockerImage(Args&&... args)
 
 
 template 
+inline Volume createVolumeCsi(Args&&... args)
+{
+  return common::createVolumeCsi(
+  std::forward(args)...);
+}
+
+
+template 
 inline NetworkInfo createNetworkInfo(Args&&... args)
 {
   return common::createNetworkInfo(std::forward(args)...);
@@ -2035,6 +2103,14 @@ inline mesos::v1::Volume createVolumeFromDockerImage(Args&&... args)
 
 
 template 
+inline mesos::v1::Volume createVolumeCsi(Args&&... args)
+{
+  return common::createVolumeCsi(
+  std::forward(args)...);
+}
+
+
+template 
 inline mesos::v1::NetworkInfo createNetworkInfo(Args&&... args)
 {
   return common::createNetworkInfo(



[mesos] 01/03: Updated the test CSI plugin for CSI server testing.

2020-09-03 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit a3fe939616fe13f34bd3555d613a0e1323730424
Author: Greg Mann 
AuthorDate: Thu Sep 3 12:06:31 2020 -0700

Updated the test CSI plugin for CSI server testing.

This patch adds additional configuration flags to the
test CSI plugin which are necessary in order to test
the agent's CSI server.

Review: https://reviews.apache.org/r/72727/
---
 src/examples/test_csi_plugin.cpp | 94 +++-
 1 file changed, 74 insertions(+), 20 deletions(-)

diff --git a/src/examples/test_csi_plugin.cpp b/src/examples/test_csi_plugin.cpp
index 214a3ee..e878bd6 100644
--- a/src/examples/test_csi_plugin.cpp
+++ b/src/examples/test_csi_plugin.cpp
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -96,8 +97,6 @@ using grpc::ServerContext;
 using grpc::Status;
 using grpc::WriteOptions;
 
-using mesos::csi::VolumeInfo;
-
 using process::grpc::StatusError;
 
 using VolumeCapability = mesos::Volume::Source::CSIVolume::VolumeCapability;
@@ -159,6 +158,12 @@ public:
 "If a volume with the same name already exists, the pair will be\n"
 "ignored. (Example: 'volume1:1GB;volume2:2GB')");
 
+add(::volume_id_path,
+"volume_id_path",
+"When set to true, this flag causes the volume ID of all volumes to\n"
+"be set to the volume's path.",
+true);
+
 add(::forward,
 "forward",
 "If set, the plugin forwards all requests to the specified Unix\n"
@@ -172,10 +177,20 @@ public:
   Option create_parameters;
   Option volume_metadata;
   Option volumes;
+  bool volume_id_path;
   Option forward;
 };
 
 
+struct VolumeInfo
+{
+  Bytes capacity;
+  string id;
+  string path;
+  google::protobuf::Map context;
+};
+
+
 class TestCSIPlugin
   : public csi::v0::Identity::Service,
 public csi::v0::Controller::Service,
@@ -192,13 +207,15 @@ public:
   const Bytes& _availableCapacity,
   const hashmap& _createParameters,
   const hashmap& _volumeMetadata,
-  const hashmap& _volumes)
+  const hashmap& _volumes,
+  bool _volumeIdPath)
 : apiVersion(_apiVersion),
   endpoint(_endpoint),
   workDir(_workDir),
   availableCapacity(_availableCapacity),
   createParameters(_createParameters.begin(), _createParameters.end()),
-  volumeMetadata(_volumeMetadata.begin(), _volumeMetadata.end())
+  volumeMetadata(_volumeMetadata.begin(), _volumeMetadata.end()),
+  volumeIdPath(_volumeIdPath)
   {
 // Construct the default mount volume capability.
 defaultVolumeCapability.mutable_mount();
@@ -212,8 +229,9 @@ public:
 // TODO(jieyu): Consider not using CHECKs here.
 Try> paths = fs::list(path::join(workDir, "*-*"));
 foreach (const string& path, CHECK_NOTERROR(paths)) {
-  volumes.put(path, CHECK_NOTERROR(parseVolumePath(path)));
-  usedCapacity += volumes.at(path).capacity;
+  Try createdVolume = CHECK_NOTERROR(parseVolumePath(path));
+  volumes.put(createdVolume->id, createdVolume.get());
+  usedCapacity += createdVolume->capacity;
 }
 
 // Create preprovisioned volumes if they have not existed yet.
@@ -229,10 +247,11 @@ public:
 continue;
   }
 
-  VolumeInfo volumeInfo{
-capacity, getVolumePath(capacity, name), volumeMetadata};
 
-  Try mkdir = os::mkdir(volumeInfo.id);
+  VolumeInfo volumeInfo =
+createVolumeInfo(capacity, name, volumeMetadata);
+
+  Try mkdir = os::mkdir(volumeInfo.path);
   CHECK_SOME(mkdir)
 << "Failed to create directory for preprovisioned volume '" << name
 << "': " << mkdir.error();
@@ -428,6 +447,14 @@ private:
   Try parseVolumePath(const string& dir);
   Option findVolumeByName(const string& name);
 
+  // Creates a volume info with the specified name based on the
+  // value of the `volume_id_path` flag.
+  VolumeInfo createVolumeInfo(
+  const Bytes& _capacity,
+  const string& name,
+  const google::protobuf::Map context);
+
+
   Try createVolume(
   const string& name,
   const Bytes& requiredBytes,
@@ -494,6 +521,7 @@ private:
   Map createParameters;
   Map volumeMetadata;
   hashmap volumes;
+  bool volumeIdPath;
 };
 
 
@@ -1299,14 +1327,19 @@ Try TestCSIPlugin::parseVolumePath(const string& dir)
 << "Cannot reconstruct volume path '" << dir << "' from volume name '"
 << name.get() << "' and capacity " << capacity.get();
 
-  return VolumeInfo{capacity.get(), dir, volumeMetadata};
+  const string volumeId = volumeIdPath ? dir : name.get();
+
+  re

[mesos] branch master updated: Fixed broken authorization in the CSI server.

2020-08-31 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new a1bfa74  Fixed broken authorization in the CSI server.
a1bfa74 is described below

commit a1bfa749e594bd8d9eb008ea4d90e6811f5f7e07
Author: Greg Mann 
AuthorDate: Mon Aug 31 13:02:18 2020 -0700

Fixed broken authorization in the CSI server.

The CSI server must use a principal when authenticating
which contains a claim that allows the authorizer to
implicitly approve requests from the CSI server to the
agent's HTTP API.

Review: https://reviews.apache.org/r/72816/
---
 src/slave/csi_server.cpp | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/slave/csi_server.cpp b/src/slave/csi_server.cpp
index 3f29a81..14fa866 100644
--- a/src/slave/csi_server.cpp
+++ b/src/slave/csi_server.cpp
@@ -73,6 +73,8 @@ namespace mesos {
 namespace internal {
 namespace slave {
 
+constexpr char DEFAULT_CSI_CONTAINER_PREFIX[] = "mesos-internal-csi-";
+
 static VolumeState createVolumeState(
 const Volume::Source::CSIVolume::StaticProvisioning& volume);
 
@@ -232,7 +234,7 @@ Try CSIServerProcess::initializePlugin(const Option& name)
   rootDir,
   info,
   extractServices(info),
-  "org-apache-mesos-internal-",
+  DEFAULT_CSI_CONTAINER_PREFIX,
   authToken,
   plugin.runtime,
   ));
@@ -317,7 +319,9 @@ Future CSIServerProcess::start(const SlaveID& _agentId)
 // The contents of this principal are arbitrary. We choose to avoid a
 // principal with a 'value' string so that we do not unintentionally collide
 // with another real principal with restricted permissions.
-Principal principal(Option::none(), {{"key", "csi-server"}});
+Principal principal(
+Option::none(),
+{{"cid_prefix", DEFAULT_CSI_CONTAINER_PREFIX}});
 
 result = secretGenerator->generate(principal)
   .then(defer(self(), [=](const Secret& secret) -> Future {



[mesos] branch master updated: Fixed a bug in CSI server initialization.

2020-08-24 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new 22e  Fixed a bug in CSI server initialization.
22e is described below

commit 22e50813597edd2cbb0304823ca56e5f2d25
Author: Greg Mann 
AuthorDate: Mon Aug 24 17:51:17 2020 -0700

Fixed a bug in CSI server initialization.

Previously, the CSI server would initialize the service
managers before the auth token was generated, meaning
that requests made by the service managers to an agent
which requires HTTP authentication would fail.

This patch changes the order of initialization so that
the service managers will be initialized with a valid
auth token when necessary.

Review: https://reviews.apache.org/r/72799/
---
 src/slave/csi_server.cpp | 71 ++--
 1 file changed, 39 insertions(+), 32 deletions(-)

diff --git a/src/slave/csi_server.cpp b/src/slave/csi_server.cpp
index 0ffe020..3f29a81 100644
--- a/src/slave/csi_server.cpp
+++ b/src/slave/csi_server.cpp
@@ -311,44 +311,51 @@ Future CSIServerProcess::start(const SlaveID& _agentId)
 
   agentId = _agentId;
 
-  // Load all CSI plugin configurations found.
-  Try init = initializePlugin();
-  if (init.isError()) {
-return Failure(
-"CSI server failed to initialize CSI plugins: " + init.error());
-  }
-
-  if (!secretGenerator) {
-return Nothing();
+  Future result = Nothing();
+
+  if (secretGenerator) {
+// The contents of this principal are arbitrary. We choose to avoid a
+// principal with a 'value' string so that we do not unintentionally collide
+// with another real principal with restricted permissions.
+Principal principal(Option::none(), {{"key", "csi-server"}});
+
+result = secretGenerator->generate(principal)
+  .then(defer(self(), [=](const Secret& secret) -> Future {
+Option error = common::validation::validateSecret(secret);
+if (error.isSome()) {
+  return Failure(
+  "CSI server failed to validate generated secret: " +
+  error->message);
+}
+
+if (secret.type() != Secret::VALUE) {
+  return Failure(
+  "CSI server expecting generated secret to be of VALUE type "
+  "instead of " + stringify(secret.type()) + " type; " +
+  "only VALUE type secrets are supported at this time");
+}
+
+CHECK(secret.has_value());
+
+authToken = secret.value().data();
+
+return Nothing();
+}));
   }
 
-  // The contents of this principal are arbitrary. We choose to avoid a
-  // principal with a 'value' string so that we do not unintentionally collide
-  // with another real principal with restricted permissions.
-  Principal principal(Option::none(), {{"key", "csi-server"}});
-
-  return secretGenerator->generate(principal)
-.then([=](const Secret& secret) -> Future {
-  Option error = common::validation::validateSecret(secret);
-  if (error.isSome()) {
+  return result
+.then(defer(self(), [=]() -> Future {
+  // Load all CSI plugin configurations found.
+  // NOTE: `initializePlugin()` requires that the `authToken` has already
+  // been set, so the order of these continuations matters.
+  Try init = initializePlugin();
+  if (init.isError()) {
 return Failure(
-"CSI server failed to validate generated secret: " +
-error->message);
+"CSI server failed to initialize CSI plugins: " + init.error());
   }
 
-  if (secret.type() != Secret::VALUE) {
-return Failure(
-"CSI server expecting generated secret to be of VALUE type "
-"instead of " + stringify(secret.type()) + " type; " +
-"only VALUE type secrets are supported at this time");
-  }
-
-  CHECK(secret.has_value());
-
-  authToken = secret.value().data();
-
   return Nothing();
-  });
+}));
 }
 
 



[mesos] 01/03: Fixed a bug in CSI volume manager initialization.

2020-08-21 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 68b481085fb82b475e108b9aa39935a8d7729983
Author: Greg Mann 
AuthorDate: Thu Aug 20 19:26:48 2020 -0700

Fixed a bug in CSI volume manager initialization.

Previously, the volume managers would assume that they could
make CONTROLLER_SERVICE calls during plugin initialization,
regardless of whether or not the plugin provides that service.

Review: https://reviews.apache.org/r/72726/
---
 src/csi/v0_volume_manager.cpp | 2 +-
 src/csi/v1_volume_manager.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/csi/v0_volume_manager.cpp b/src/csi/v0_volume_manager.cpp
index 42a23ba..8ba6100 100644
--- a/src/csi/v0_volume_manager.cpp
+++ b/src/csi/v0_volume_manager.cpp
@@ -648,7 +648,7 @@ Future VolumeManagerProcess::prepareServices()
   vector> futures;
   foreach (const Service& service, services) {
 futures.push_back(call(
-CONTROLLER_SERVICE, ::getPluginInfo, GetPluginInfoRequest())
+service, ::getPluginInfo, GetPluginInfoRequest())
   .onReady([service](const GetPluginInfoResponse& response) {
 LOG(INFO) << service << " loaded: " << stringify(response);
   }));
diff --git a/src/csi/v1_volume_manager.cpp b/src/csi/v1_volume_manager.cpp
index c05265c..1a1b97c 100644
--- a/src/csi/v1_volume_manager.cpp
+++ b/src/csi/v1_volume_manager.cpp
@@ -669,7 +669,7 @@ Future VolumeManagerProcess::prepareServices()
   vector> futures;
   foreach (const Service& service, services) {
 futures.push_back(call(
-CONTROLLER_SERVICE, ::getPluginInfo, GetPluginInfoRequest())
+service, ::getPluginInfo, GetPluginInfoRequest())
   .onReady([service](const GetPluginInfoResponse& response) {
 LOG(INFO) << service << " loaded: " << stringify(response);
   }));



[mesos] 03/03: Initialized plugins lazily in the CSI server.

2020-08-21 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 4ff51041df860dbcc2247ef47a0596e5132da190
Author: Greg Mann 
AuthorDate: Thu Aug 20 19:27:23 2020 -0700

Initialized plugins lazily in the CSI server.

Review: https://reviews.apache.org/r/72779/
---
 src/slave/csi_server.cpp | 403 +--
 src/slave/csi_server.hpp |   8 +-
 2 files changed, 253 insertions(+), 158 deletions(-)

diff --git a/src/slave/csi_server.cpp b/src/slave/csi_server.cpp
index 2ba4f22..0ffe020 100644
--- a/src/slave/csi_server.cpp
+++ b/src/slave/csi_server.cpp
@@ -19,6 +19,7 @@
 #include 
 
 #include 
+#include 
 
 #include 
 
@@ -58,6 +59,7 @@ using mesos::csi::state::VolumeState;
 using process::Failure;
 using process::Future;
 using process::Owned;
+using process::Promise;
 
 using process::grpc::client::Runtime;
 
@@ -85,17 +87,17 @@ public:
   CSIServerProcess(
   const process::http::URL& _agentUrl,
   const string& _rootDir,
+  const string& _pluginConfigDir,
   SecretGenerator* _secretGenerator,
-  SecretResolver* _secretResolver,
-  hashmap _pluginConfigs)
+  SecretResolver* _secretResolver)
 : process::ProcessBase(process::ID::generate("csi-server")),
   agentUrl(_agentUrl),
   rootDir(_rootDir),
+  pluginConfigDir(_pluginConfigDir),
   secretGenerator(_secretGenerator),
-  secretResolver(_secretResolver),
-  pluginConfigs(_pluginConfigs) {}
+  secretResolver(_secretResolver) {}
 
-  Future start();
+  Future start(const SlaveID& _agentId);
 
   Future publishVolume(const Volume::Source::CSIVolume& volume);
 
@@ -106,73 +108,125 @@ public:
 private:
   struct CSIPlugin
   {
-CSIPlugin(const string& metricsPrefix) : metrics(metricsPrefix) {}
+CSIPlugin(
+const CSIPluginInfo& _info,
+const string& metricsPrefix)
+  : info(_info),
+metrics(metricsPrefix) {}
 
 CSIPluginInfo info;
 Owned serviceManager;
 Owned volumeManager;
 Runtime runtime;
 csi::Metrics metrics;
+
+// CSI plugins are initialized lazily. When a publish/unpublish call is
+// received for a plugin which is not yet initialized, this promise is used
+// to perform the call after initialization is complete.
+Promise initialized;
   };
 
+  // Attempts to load configuration for a plugin with the specified name and
+  // then initializes the plugin. If no name is specified, then all
+  // configurations found in the plugin config directory are loaded.
+  Try initializePlugin(const Option& name = None());
+
   // Contains the plugins loaded by the server. The key of this map is the
   // plugin name.
   hashmap plugins;
 
   const process::http::URL agentUrl;
+  Option agentId;
   const string rootDir;
+  const string pluginConfigDir;
   SecretGenerator* secretGenerator;
   SecretResolver* secretResolver;
   Option authToken;
-  hashmap pluginConfigs;
-  Option agentId;
 };
 
 
-Future CSIServerProcess::start()
+Try CSIServerProcess::initializePlugin(const Option& name)
 {
-  Future result = Nothing();
+  if (name.isSome()) {
+CHECK(!plugins.contains(name.get()));
+  }
 
-  // The contents of this principal are arbitrary. We choose to avoid a
-  // principal with a 'value' string so that we do not unintentionally collide
-  // with another real principal with restricted permissions.
-  Principal principal(Option::none(), {{"key", "csi-server"}});
+  Try> entries = os::ls(pluginConfigDir);
+  if (entries.isError()) {
+return Error(
+"Unable to list the CSI plugin configuration directory '" +
+pluginConfigDir + "': " + entries.error());
+  }
+
+  // We are either looking for one specific plugin (if `name` is SOME), or we
+  // are loading all configs we find (if `name` is NONE). First, we populate
+  // `pluginConfigs` with one or more valid configurations. Then, we will
+  // initialize the plugin(s) based on the configuration(s) found.
+  hashmap pluginConfigs;
+
+  foreach (const string& entry, entries.get()) {
+const string path = path::join(pluginConfigDir, entry);
+
+// Ignore directory entries.
+if (os::stat::isdir(path)) {
+  continue;
+}
+
+Try read = os::read(path);
+if (read.isError()) {
+  // In case of an error we log and skip to the next entry.
+  LOG(ERROR) << "Failed to read CSI plugin configuration file '"
+ << path << "': " << read.error();
+
+  continue;
+}
+
+Try json = JSON::parse(read.get());
+if (json.isError()) {
+  return Error("JSON parse of '" + path + "' failed: " + json.error());
+}
+
+Try parse = ::protobuf::parse(json.get());
+if (parse.isError()) {
+  retu

[mesos] branch master updated (f284314 -> 4ff5104)

2020-08-21 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from f284314  Added a scheduler API test for a valid offer constraints update.
 new 68b4810  Fixed a bug in CSI volume manager initialization.
 new 5ed30db  Added the CSI server to the Mesos agent.
 new 4ff5104  Initialized plugins lazily in the CSI server.

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/csi/v0_volume_manager.cpp |   2 +-
 src/csi/v1_volume_manager.cpp |   2 +-
 src/local/local.cpp   |   1 +
 src/slave/csi_server.cpp  | 403 ++
 src/slave/csi_server.hpp  |   8 +-
 src/slave/main.cpp| 101 +++
 src/slave/slave.cpp   |  18 ++
 src/slave/slave.hpp   |   3 +
 src/tests/cluster.cpp | 128 ++
 src/tests/cluster.hpp |   3 +
 src/tests/mesos.cpp   |   1 +
 src/tests/mesos.hpp   |   9 +
 src/tests/mock_slave.cpp  |   7 +
 src/tests/mock_slave.hpp  |   3 +
 14 files changed, 463 insertions(+), 226 deletions(-)



[mesos] 02/03: Added the CSI server to the Mesos agent.

2020-08-21 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 5ed30db48785007e35805886a024ebb8a61a7037
Author: Greg Mann 
AuthorDate: Thu Aug 20 19:27:02 2020 -0700

Added the CSI server to the Mesos agent.

This patch adds a CSI server to the Mesos agent in both
the agent binary and in tests.

Review: https://reviews.apache.org/r/72761/
---
 src/local/local.cpp  |   1 +
 src/slave/main.cpp   | 101 ++---
 src/slave/slave.cpp  |  18 +++
 src/slave/slave.hpp  |   3 ++
 src/tests/cluster.cpp| 128 ++-
 src/tests/cluster.hpp|   3 ++
 src/tests/mesos.cpp  |   1 +
 src/tests/mesos.hpp  |   9 
 src/tests/mock_slave.cpp |   7 +++
 src/tests/mock_slave.hpp |   3 ++
 10 files changed, 208 insertions(+), 66 deletions(-)

diff --git a/src/local/local.cpp b/src/local/local.cpp
index 8950570..9535399 100644
--- a/src/local/local.cpp
+++ b/src/local/local.cpp
@@ -535,6 +535,7 @@ PID launch(const Flags& flags, Allocator* _allocator)
 secretGenerators->back(),
 nullptr,
 nullptr,
+nullptr,
 #ifndef __WINDOWS__
 None(),
 #endif // __WINDOWS__
diff --git a/src/slave/main.cpp b/src/slave/main.cpp
index 0aa2cc9..84b813c 100644
--- a/src/slave/main.cpp
+++ b/src/slave/main.cpp
@@ -37,6 +37,8 @@
 #include 
 #include 
 
+#include 
+
 #include 
 #include 
 #include 
@@ -84,6 +86,7 @@
 #include "module/manager.hpp"
 
 #include "slave/constants.hpp"
+#include "slave/csi_server.hpp"
 #include "slave/gc.hpp"
 #include "slave/slave.hpp"
 #include "slave/task_status_update_manager.hpp"
@@ -111,6 +114,8 @@ using mesos::Authorizer;
 using mesos::SecretResolver;
 using mesos::SlaveInfo;
 
+using net::IP;
+
 using process::Owned;
 
 using process::firewall::DisabledEndpointsFirewallRule;
@@ -528,6 +533,69 @@ int main(int argc, char** argv)
<< futureTracker.error();
   }
 
+  SecretGenerator* secretGenerator = nullptr;
+
+#ifdef USE_SSL_SOCKET
+  if (flags.jwt_secret_key.isSome()) {
+Try jwtSecretKey = os::read(flags.jwt_secret_key.get());
+if (jwtSecretKey.isError()) {
+  EXIT(EXIT_FAILURE) << "Failed to read the file specified by "
+ << "--jwt_secret_key";
+}
+
+// TODO(greggomann): Factor the following code out into a common helper,
+// since we also do this when loading credentials.
+Try permissions =
+  os::permissions(flags.jwt_secret_key.get());
+if (permissions.isError()) {
+  LOG(WARNING) << "Failed to stat jwt secret key file '"
+   << flags.jwt_secret_key.get()
+   << "': " << permissions.error();
+} else if (permissions->others.rwx) {
+  LOG(WARNING) << "Permissions on executor secret key file '"
+   << flags.jwt_secret_key.get()
+   << "' are too open; it is recommended that your"
+   << " key file is NOT accessible by others";
+}
+
+secretGenerator = new JWTSecretGenerator(jwtSecretKey.get());
+  }
+#endif // USE_SSL_SOCKET
+
+  // The agent will hold ownership of the CSI server, but we also pass a pointer
+  // to it into the containerizer for use by the 'volume/csi' isolator.
+  Owned csiServer;
+
+  if (flags.csi_plugin_config_dir.isSome()) {
+// Initialize the CSI server, which manages any configured CSI plugins.
+string scheme = "http";
+
+#ifdef USE_SSL_SOCKET
+if (process::network::openssl::flags().enabled) {
+  scheme = "https";
+}
+#endif
+
+const process::http::URL agentUrl(
+scheme,
+process::address().ip,
+process::address().port,
+id + "/api/v1");
+
+Try> csiServer_ = CSIServer::create(
+flags,
+agentUrl,
+secretGenerator,
+secretResolver.get());
+
+if (csiServer_.isError()) {
+  EXIT(EXIT_FAILURE)
+<< "Failed to initialize the CSI server: " << csiServer_.error();
+}
+
+csiServer = std::move(csiServer_.get());
+  }
+
   Try containerizer = Containerizer::create(
   flags,
   false,
@@ -535,7 +603,8 @@ int main(int argc, char** argv)
   gc,
   secretResolver.get(),
   volumeGidManager,
-  futureTracker.get());
+  futureTracker.get(),
+  csiServer.get());
 
   if (containerizer.isError()) {
 EXIT(EXIT_FAILURE)
@@ -608,35 +677,6 @@ int main(int argc, char** argv)
<< qosController.error();
   }
 
-  SecretGenerator* secretGenerator = nullptr;
-
-#ifdef USE_SSL_SOCKET
-  if (flags.jwt_secret_key.isSome()) {
-Try

[mesos] branch master updated (c78dc33 -> fe0cd02)

2020-08-10 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from c78dc33  Added interface for the CSI server.
 new 38ba191  Added support for secrets to the CSI volume managers.
 new fe0cd02  Added implementation of the CSI server.

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/CMakeLists.txt|   1 +
 src/Makefile.am   |   2 +
 src/csi/state.proto   |   6 +
 src/csi/v0_volume_manager.cpp | 103 +++-
 src/csi/v0_volume_manager.hpp |   5 +-
 src/csi/v0_volume_manager_process.hpp |  13 +-
 src/csi/v1_volume_manager.cpp |  96 ++-
 src/csi/v1_volume_manager.hpp |   5 +-
 src/csi/v1_volume_manager_process.hpp |  13 +-
 src/csi/volume_manager.cpp|  21 +-
 src/csi/volume_manager.hpp|   5 +-
 src/slave/csi_server.cpp  | 455 ++
 src/slave/csi_server.hpp  |   8 +-
 13 files changed, 711 insertions(+), 22 deletions(-)
 create mode 100644 src/slave/csi_server.cpp



[mesos] 02/02: Added implementation of the CSI server.

2020-08-10 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit fe0cd02a0697a4c4fcf5087fcafd6729beec0b41
Author: Greg Mann 
AuthorDate: Mon Aug 10 20:11:50 2020 -0700

Added implementation of the CSI server.

Review: https://reviews.apache.org/r/72716/
---
 src/CMakeLists.txt   |   1 +
 src/Makefile.am  |   2 +
 src/slave/csi_server.cpp | 455 +++
 src/slave/csi_server.hpp |   8 +-
 4 files changed, 465 insertions(+), 1 deletion(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 4e15e3d..c60d98a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -150,6 +150,7 @@ set(AGENT_SRC
   slave/constants.cpp
   slave/container_daemon.cpp
   slave/container_logger.cpp
+  slave/csi_server.cpp
   slave/flags.cpp
   slave/gc.cpp
   slave/http.cpp
diff --git a/src/Makefile.am b/src/Makefile.am
index 447db32..49dab4b 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1208,6 +1208,8 @@ libmesos_no_3rdparty_la_SOURCES += \
   slave/container_daemon.hpp   \
   slave/container_daemon_process.hpp   \
   slave/container_logger.cpp   \
+  slave/csi_server.cpp \
+  slave/csi_server.hpp \
   slave/container_loggers/sandbox.cpp  \
   slave/container_loggers/sandbox.hpp  \
   slave/containerizer/composing.cpp\
diff --git a/src/slave/csi_server.cpp b/src/slave/csi_server.cpp
new file mode 100644
index 000..a9a3995
--- /dev/null
+++ b/src/slave/csi_server.cpp
@@ -0,0 +1,455 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "common/validation.hpp"
+
+#include "csi/metrics.hpp"
+#include "csi/paths.hpp"
+#include "csi/service_manager.hpp"
+#include "csi/volume_manager.hpp"
+
+#include "slave/csi_server.hpp"
+#include "slave/flags.hpp"
+#include "slave/paths.hpp"
+
+using mesos::csi::ServiceManager;
+using mesos::csi::VolumeManager;
+
+using mesos::csi::state::VolumeState;
+
+using process::Failure;
+using process::Future;
+using process::Owned;
+
+using process::grpc::client::Runtime;
+
+using process::http::authentication::Principal;
+
+using std::list;
+using std::string;
+using std::vector;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+static VolumeState createVolumeState(
+const Volume::Source::CSIVolume::StaticProvisioning& volume);
+
+
+static hashset extractServices(
+const CSIPluginInfo& plugin);
+
+
+class CSIServerProcess : public process::Process
+{
+public:
+  CSIServerProcess(
+  const process::http::URL& _agentUrl,
+  const string& _rootDir,
+  SecretGenerator* _secretGenerator,
+  SecretResolver* _secretResolver,
+  hashmap _pluginConfigs)
+: process::ProcessBase(process::ID::generate("csi-server")),
+  agentUrl(_agentUrl),
+  rootDir(_rootDir),
+  secretGenerator(_secretGenerator),
+  secretResolver(_secretResolver),
+  pluginConfigs(_pluginConfigs) {}
+
+  Future start();
+
+  Future publishVolume(const Volume::Source::CSIVolume& volume);
+
+  Future unpublishVolume(
+  const string& pluginName,
+  const string& volumeId);
+
+private:
+  struct CSIPlugin
+  {
+CSIPlugin(const string& metricsPrefix) : metrics(metricsPrefix) {}
+
+CSIPluginInfo info;
+Owned serviceManager;
+Owned volumeManager;
+Runtime runtime;
+csi::Metrics metrics;
+  };
+
+  // Contains the plugins loaded by the server. The key of this map is the
+  // plugin nam

[mesos] 02/02: Added implementation of the CSI server.

2020-08-10 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit fe0cd02a0697a4c4fcf5087fcafd6729beec0b41
Author: Greg Mann 
AuthorDate: Mon Aug 10 20:11:50 2020 -0700

Added implementation of the CSI server.

Review: https://reviews.apache.org/r/72716/
---
 src/CMakeLists.txt   |   1 +
 src/Makefile.am  |   2 +
 src/slave/csi_server.cpp | 455 +++
 src/slave/csi_server.hpp |   8 +-
 4 files changed, 465 insertions(+), 1 deletion(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 4e15e3d..c60d98a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -150,6 +150,7 @@ set(AGENT_SRC
   slave/constants.cpp
   slave/container_daemon.cpp
   slave/container_logger.cpp
+  slave/csi_server.cpp
   slave/flags.cpp
   slave/gc.cpp
   slave/http.cpp
diff --git a/src/Makefile.am b/src/Makefile.am
index 447db32..49dab4b 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1208,6 +1208,8 @@ libmesos_no_3rdparty_la_SOURCES += \
   slave/container_daemon.hpp   \
   slave/container_daemon_process.hpp   \
   slave/container_logger.cpp   \
+  slave/csi_server.cpp \
+  slave/csi_server.hpp \
   slave/container_loggers/sandbox.cpp  \
   slave/container_loggers/sandbox.hpp  \
   slave/containerizer/composing.cpp\
diff --git a/src/slave/csi_server.cpp b/src/slave/csi_server.cpp
new file mode 100644
index 000..a9a3995
--- /dev/null
+++ b/src/slave/csi_server.cpp
@@ -0,0 +1,455 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "common/validation.hpp"
+
+#include "csi/metrics.hpp"
+#include "csi/paths.hpp"
+#include "csi/service_manager.hpp"
+#include "csi/volume_manager.hpp"
+
+#include "slave/csi_server.hpp"
+#include "slave/flags.hpp"
+#include "slave/paths.hpp"
+
+using mesos::csi::ServiceManager;
+using mesos::csi::VolumeManager;
+
+using mesos::csi::state::VolumeState;
+
+using process::Failure;
+using process::Future;
+using process::Owned;
+
+using process::grpc::client::Runtime;
+
+using process::http::authentication::Principal;
+
+using std::list;
+using std::string;
+using std::vector;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+static VolumeState createVolumeState(
+const Volume::Source::CSIVolume::StaticProvisioning& volume);
+
+
+static hashset extractServices(
+const CSIPluginInfo& plugin);
+
+
+class CSIServerProcess : public process::Process
+{
+public:
+  CSIServerProcess(
+  const process::http::URL& _agentUrl,
+  const string& _rootDir,
+  SecretGenerator* _secretGenerator,
+  SecretResolver* _secretResolver,
+  hashmap _pluginConfigs)
+: process::ProcessBase(process::ID::generate("csi-server")),
+  agentUrl(_agentUrl),
+  rootDir(_rootDir),
+  secretGenerator(_secretGenerator),
+  secretResolver(_secretResolver),
+  pluginConfigs(_pluginConfigs) {}
+
+  Future start();
+
+  Future publishVolume(const Volume::Source::CSIVolume& volume);
+
+  Future unpublishVolume(
+  const string& pluginName,
+  const string& volumeId);
+
+private:
+  struct CSIPlugin
+  {
+CSIPlugin(const string& metricsPrefix) : metrics(metricsPrefix) {}
+
+CSIPluginInfo info;
+Owned serviceManager;
+Owned volumeManager;
+Runtime runtime;
+csi::Metrics metrics;
+  };
+
+  // Contains the plugins loaded by the server. The key of this map is the
+  // plugin nam

[mesos] 01/02: Added support for secrets to the CSI volume managers.

2020-08-10 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 38ba19127ddb48244f7c6c699e3c41e5ea12b594
Author: Greg Mann 
AuthorDate: Mon Aug 10 20:26:26 2020 -0700

Added support for secrets to the CSI volume managers.

Review: https://reviews.apache.org/r/72732/
---
 src/csi/state.proto   |   6 ++
 src/csi/v0_volume_manager.cpp | 103 +++---
 src/csi/v0_volume_manager.hpp |   5 +-
 src/csi/v0_volume_manager_process.hpp |  13 -
 src/csi/v1_volume_manager.cpp |  96 +--
 src/csi/v1_volume_manager.hpp |   5 +-
 src/csi/v1_volume_manager_process.hpp |  13 -
 src/csi/volume_manager.cpp|  21 ++-
 src/csi/volume_manager.hpp|   5 +-
 9 files changed, 246 insertions(+), 21 deletions(-)

diff --git a/src/csi/state.proto b/src/csi/state.proto
index 836e30c..630e4f5 100644
--- a/src/csi/state.proto
+++ b/src/csi/state.proto
@@ -78,4 +78,10 @@ message VolumeState {
 
   // Indicates that the volume must be mounted read-only.
   bool readonly = 9;
+
+  // Secrets to be included in `NodeStageVolumeRequest`.
+  map node_stage_secrets = 10;
+
+  // Secrets to be included in `NodePublishVolumeRequest`.
+  map node_publish_secrets = 11;
 }
diff --git a/src/csi/v0_volume_manager.cpp b/src/csi/v0_volume_manager.cpp
index 89a6da5..9e840a7 100644
--- a/src/csi/v0_volume_manager.cpp
+++ b/src/csi/v0_volume_manager.cpp
@@ -21,6 +21,8 @@
 #include 
 #include 
 
+#include 
+
 #include 
 #include 
 #include 
@@ -81,14 +83,16 @@ VolumeManagerProcess::VolumeManagerProcess(
 const hashset _services,
 const Runtime& _runtime,
 ServiceManager* _serviceManager,
-Metrics* _metrics)
+Metrics* _metrics,
+SecretResolver* _secretResolver)
   : ProcessBase(process::ID::generate("csi-v0-volume-manager")),
 rootDir(_rootDir),
 info(_info),
 services(_services),
 runtime(_runtime),
 serviceManager(_serviceManager),
-metrics(_metrics)
+metrics(_metrics),
+secretResolver(_secretResolver)
 {
   // This should have been validated in `VolumeManager::create`.
   CHECK(!services.empty())
@@ -961,8 +965,33 @@ Future VolumeManagerProcess::_publishVolume(const string& volumeId)
 request.set_staging_target_path(stagingPath);
   }
 
-  return call(NODE_SERVICE, ::nodePublishVolume, std::move(request))
-.then(defer(self(), [this, volumeId, targetPath] {
+  Future rpcResult;
+
+  if (!volumeState.node_publish_secrets().empty()) {
+rpcResult = resolveSecrets(volumeState.node_publish_secrets())
+  .then(process::defer(
+  self(),
+  [this, request](const Map& secrets) {
+NodePublishVolumeRequest request_(request);
+*request_.mutable_node_publish_secrets() = secrets;
+
+return call(
+NODE_SERVICE,
+::nodePublishVolume,
+std::move(request_));
+  }));
+  } else {
+rpcResult =
+  call(NODE_SERVICE, ::nodePublishVolume, std::move(request));
+  }
+
+  return rpcResult
+.then(process::defer(self(), [this, volumeId, targetPath]()
+-> Future {
+  if (!os::exists(targetPath)) {
+return Failure("Target path '" + targetPath + "' not created");
+  }
+
   CHECK(volumes.contains(volumeId));
   VolumeState& volumeState = volumes.at(volumeId).state;
 
@@ -1042,7 +1071,25 @@ Future VolumeManagerProcess::__publishVolume(const string& volumeId)
 evolve(volumeState.volume_capability());
   *request.mutable_volume_attributes() = volumeState.volume_context();
 
-  return call(NODE_SERVICE, ::nodeStageVolume, std::move(request))
+  Future rpcResult;
+
+  if (!volumeState.node_stage_secrets().empty()) {
+rpcResult = resolveSecrets(volumeState.node_stage_secrets())
+  .then([=](const Map& secrets) {
+NodeStageVolumeRequest request_(request);
+*request_.mutable_node_stage_secrets() = secrets;
+
+return call(
+NODE_SERVICE,
+::nodeStageVolume,
+std::move(request_));
+  });
+  } else {
+rpcResult =
+  call(NODE_SERVICE, ::nodeStageVolume, std::move(request));
+  }
+
+  return rpcResult
 .then(process::defer(self(), [this, volumeId] {
   CHECK(volumes.contains(volumeId));
   VolumeState& volumeState = volumes.at(volumeId).state;
@@ -1236,20 +1283,62 @@ void VolumeManagerProcess::removeVolume(const string& volumeId)
 }
 
 
+Future> VolumeManagerProcess::resolveSecrets(
+const Map& secrets)
+{
+  if (!secretResolver) {
+return Failure(
+"CSI volume included secrets but the agent was not initialized with "
+"a secret resolver");
+  }
+
+  // This `futures` is used below with `process::colle

[mesos] 01/02: Added support for secrets to the CSI volume managers.

2020-08-10 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 38ba19127ddb48244f7c6c699e3c41e5ea12b594
Author: Greg Mann 
AuthorDate: Mon Aug 10 20:26:26 2020 -0700

Added support for secrets to the CSI volume managers.

Review: https://reviews.apache.org/r/72732/
---
 src/csi/state.proto   |   6 ++
 src/csi/v0_volume_manager.cpp | 103 +++---
 src/csi/v0_volume_manager.hpp |   5 +-
 src/csi/v0_volume_manager_process.hpp |  13 -
 src/csi/v1_volume_manager.cpp |  96 +--
 src/csi/v1_volume_manager.hpp |   5 +-
 src/csi/v1_volume_manager_process.hpp |  13 -
 src/csi/volume_manager.cpp|  21 ++-
 src/csi/volume_manager.hpp|   5 +-
 9 files changed, 246 insertions(+), 21 deletions(-)

diff --git a/src/csi/state.proto b/src/csi/state.proto
index 836e30c..630e4f5 100644
--- a/src/csi/state.proto
+++ b/src/csi/state.proto
@@ -78,4 +78,10 @@ message VolumeState {
 
   // Indicates that the volume must be mounted read-only.
   bool readonly = 9;
+
+  // Secrets to be included in `NodeStageVolumeRequest`.
+  map node_stage_secrets = 10;
+
+  // Secrets to be included in `NodePublishVolumeRequest`.
+  map node_publish_secrets = 11;
 }
diff --git a/src/csi/v0_volume_manager.cpp b/src/csi/v0_volume_manager.cpp
index 89a6da5..9e840a7 100644
--- a/src/csi/v0_volume_manager.cpp
+++ b/src/csi/v0_volume_manager.cpp
@@ -21,6 +21,8 @@
 #include 
 #include 
 
+#include 
+
 #include 
 #include 
 #include 
@@ -81,14 +83,16 @@ VolumeManagerProcess::VolumeManagerProcess(
 const hashset _services,
 const Runtime& _runtime,
 ServiceManager* _serviceManager,
-Metrics* _metrics)
+Metrics* _metrics,
+SecretResolver* _secretResolver)
   : ProcessBase(process::ID::generate("csi-v0-volume-manager")),
 rootDir(_rootDir),
 info(_info),
 services(_services),
 runtime(_runtime),
 serviceManager(_serviceManager),
-metrics(_metrics)
+metrics(_metrics),
+secretResolver(_secretResolver)
 {
   // This should have been validated in `VolumeManager::create`.
   CHECK(!services.empty())
@@ -961,8 +965,33 @@ Future VolumeManagerProcess::_publishVolume(const string& volumeId)
 request.set_staging_target_path(stagingPath);
   }
 
-  return call(NODE_SERVICE, ::nodePublishVolume, std::move(request))
-.then(defer(self(), [this, volumeId, targetPath] {
+  Future rpcResult;
+
+  if (!volumeState.node_publish_secrets().empty()) {
+rpcResult = resolveSecrets(volumeState.node_publish_secrets())
+  .then(process::defer(
+  self(),
+  [this, request](const Map& secrets) {
+NodePublishVolumeRequest request_(request);
+*request_.mutable_node_publish_secrets() = secrets;
+
+return call(
+NODE_SERVICE,
+::nodePublishVolume,
+std::move(request_));
+  }));
+  } else {
+rpcResult =
+  call(NODE_SERVICE, ::nodePublishVolume, std::move(request));
+  }
+
+  return rpcResult
+.then(process::defer(self(), [this, volumeId, targetPath]()
+-> Future {
+  if (!os::exists(targetPath)) {
+return Failure("Target path '" + targetPath + "' not created");
+  }
+
   CHECK(volumes.contains(volumeId));
   VolumeState& volumeState = volumes.at(volumeId).state;
 
@@ -1042,7 +1071,25 @@ Future VolumeManagerProcess::__publishVolume(const string& volumeId)
 evolve(volumeState.volume_capability());
   *request.mutable_volume_attributes() = volumeState.volume_context();
 
-  return call(NODE_SERVICE, ::nodeStageVolume, std::move(request))
+  Future rpcResult;
+
+  if (!volumeState.node_stage_secrets().empty()) {
+rpcResult = resolveSecrets(volumeState.node_stage_secrets())
+  .then([=](const Map& secrets) {
+NodeStageVolumeRequest request_(request);
+*request_.mutable_node_stage_secrets() = secrets;
+
+return call(
+NODE_SERVICE,
+::nodeStageVolume,
+std::move(request_));
+  });
+  } else {
+rpcResult =
+  call(NODE_SERVICE, ::nodeStageVolume, std::move(request));
+  }
+
+  return rpcResult
 .then(process::defer(self(), [this, volumeId] {
   CHECK(volumes.contains(volumeId));
   VolumeState& volumeState = volumes.at(volumeId).state;
@@ -1236,20 +1283,62 @@ void VolumeManagerProcess::removeVolume(const string& volumeId)
 }
 
 
+Future> VolumeManagerProcess::resolveSecrets(
+const Map& secrets)
+{
+  if (!secretResolver) {
+return Failure(
+"CSI volume included secrets but the agent was not initialized with "
+"a secret resolver");
+  }
+
+  // This `futures` is used below with `process::colle

[mesos] branch master updated (c78dc33 -> fe0cd02)

2020-08-10 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from c78dc33  Added interface for the CSI server.
 new 38ba191  Added support for secrets to the CSI volume managers.
 new fe0cd02  Added implementation of the CSI server.

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/CMakeLists.txt|   1 +
 src/Makefile.am   |   2 +
 src/csi/state.proto   |   6 +
 src/csi/v0_volume_manager.cpp | 103 +++-
 src/csi/v0_volume_manager.hpp |   5 +-
 src/csi/v0_volume_manager_process.hpp |  13 +-
 src/csi/v1_volume_manager.cpp |  96 ++-
 src/csi/v1_volume_manager.hpp |   5 +-
 src/csi/v1_volume_manager_process.hpp |  13 +-
 src/csi/volume_manager.cpp|  21 +-
 src/csi/volume_manager.hpp|   5 +-
 src/slave/csi_server.cpp  | 455 ++
 src/slave/csi_server.hpp  |   8 +-
 13 files changed, 711 insertions(+), 22 deletions(-)
 create mode 100644 src/slave/csi_server.cpp



[mesos] branch master updated (d2c84d1 -> c78dc33)

2020-08-03 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from d2c84d1  Improved CSI service manager to support unmanaged CSI plugins.
 new 8d54518  Enabled pre-provisioned volumes in the volume manager.
 new c63797c  Set the readonly field in the CSI volume manager.
 new c78dc33  Added interface for the CSI server.

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/csi/state.proto   |  8 +++
 src/csi/v0_volume_manager.cpp | 99 +++
 src/csi/v0_volume_manager.hpp |  5 +-
 src/csi/v0_volume_manager_process.hpp |  8 ++-
 src/csi/v1_volume_manager.cpp | 99 +++
 src/csi/v1_volume_manager.hpp |  5 +-
 src/csi/v1_volume_manager_process.hpp |  8 ++-
 src/csi/volume_manager.hpp| 18 +--
 src/slave/csi_server.hpp  | 90 +++
 9 files changed, 287 insertions(+), 53 deletions(-)
 create mode 100644 src/slave/csi_server.hpp



[mesos] 03/03: Added interface for the CSI server.

2020-08-03 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit c78dc333fc893a43d40dc33299a61987198a6ea9
Author: Greg Mann 
AuthorDate: Mon Aug 3 10:11:57 2020 -0700

Added interface for the CSI server.

This component will hold objects associated with CSI plugins
running on the agent.

Review: https://reviews.apache.org/r/72707/
---
 src/slave/csi_server.hpp | 90 
 1 file changed, 90 insertions(+)

diff --git a/src/slave/csi_server.hpp b/src/slave/csi_server.hpp
new file mode 100644
index 000..17882e1
--- /dev/null
+++ b/src/slave/csi_server.hpp
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef __SLAVE_CSI_SERVER_HPP__
+#define __SLAVE_CSI_SERVER_HPP__
+
+#include 
+
+#include 
+
+#include 
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include "csi/service_manager.hpp"
+#include "csi/volume_manager.hpp"
+
+#include "slave/flags.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+class CSIServerProcess;
+
+// A CSI server is a collection of volume managers and associated service
+// managers. This object can be instantiated and held by the Mesos agent to
+// manage a collection of CSI plugins and proxy calls to them.
+class CSIServer
+{
+public:
+  ~CSIServer();
+
+  static Try> create(
+  const Flags& flags,
+  const process::http::URL& agentUrl,
+  SecretGenerator* secretGenerator);
+
+  // Starts the CSI server. Any `publishVolume()` or `unpublishVolume()` calls
+  // which were made previously will be executed after this method is called.
+  // Returns a future which is satisfied once initialization is complete.
+  process::Future start();
+
+  // Publish a CSI volume to this agent. If the `start()` method has not yet
+  // been called, then the publishing of this volume will not be completed until
+  // the CSI server is started.
+  // Returns the target path at which the volume has been published.
+  process::Future publishVolume(
+  const Volume::Source::CSIVolume& volume);
+
+  // Unpublishes a CSI volume from this agent. If the `start()` method has not
+  // yet been called, then the unpublishing of this volume will not be completed
+  // until the CSI server is started.
+  process::Future unpublishVolume(
+  const std::string& pluginName,
+  const std::string& volumeId);
+
+private:
+  CSIServer(
+  const process::http::URL& agentUrl,
+  const std::string& csiRootDir,
+  SecretGenerator* secretGenerator,
+  const hashmap& csiPluginConfigs);
+
+  process::Owned process;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __SLAVE_CSI_SERVER_HPP__



[mesos] 02/03: Set the readonly field in the CSI volume manager.

2020-08-03 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit c63797cd80a1d7d3f91d6a7eb1574e1e8ef063fb
Author: Greg Mann 
AuthorDate: Mon Aug 3 10:11:54 2020 -0700

Set the readonly field in the CSI volume manager.

This patch introduces a new `readonly` field in the CSI
`VolumeState` message and passes it through when publishing
volumes. This will allow us to set this field appropriately
when publishing pre-provisioned volumes.

Review: https://reviews.apache.org/r/72715/
---
 src/csi/state.proto   | 3 +++
 src/csi/v0_volume_manager.cpp | 4 ++--
 src/csi/v1_volume_manager.cpp | 4 ++--
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/csi/state.proto b/src/csi/state.proto
index af0ef1c..836e30c 100644
--- a/src/csi/state.proto
+++ b/src/csi/state.proto
@@ -75,4 +75,7 @@ message VolumeState {
   // pre-provisioned by some other means and then attached to the node using a
   // CSI plugin.
   bool pre_provisioned = 8;
+
+  // Indicates that the volume must be mounted read-only.
+  bool readonly = 9;
 }
diff --git a/src/csi/v0_volume_manager.cpp b/src/csi/v0_volume_manager.cpp
index 5368440..89a6da5 100644
--- a/src/csi/v0_volume_manager.cpp
+++ b/src/csi/v0_volume_manager.cpp
@@ -822,7 +822,7 @@ Future VolumeManagerProcess::_attachVolume(const string& volumeId)
   request.set_node_id(CHECK_NOTNONE(nodeId));
   *request.mutable_volume_capability() =
 evolve(volumeState.volume_capability());
-  request.set_readonly(false);
+  request.set_readonly(volumeState.readonly());
   *request.mutable_volume_attributes() = volumeState.volume_context();
 
   return call(
@@ -950,7 +950,7 @@ Future VolumeManagerProcess::_publishVolume(const string& volumeId)
   request.set_target_path(targetPath);
   *request.mutable_volume_capability() =
 evolve(volumeState.volume_capability());
-  request.set_readonly(false);
+  request.set_readonly(volumeState.readonly());
   *request.mutable_volume_attributes() = volumeState.volume_context();
 
   if (nodeCapabilities->stageUnstageVolume) {
diff --git a/src/csi/v1_volume_manager.cpp b/src/csi/v1_volume_manager.cpp
index 7eae638..5178b2f 100644
--- a/src/csi/v1_volume_manager.cpp
+++ b/src/csi/v1_volume_manager.cpp
@@ -844,7 +844,7 @@ Future VolumeManagerProcess::_attachVolume(const string& volumeId)
   request.set_node_id(CHECK_NOTNONE(nodeId));
   *request.mutable_volume_capability() =
 evolve(volumeState.volume_capability());
-  request.set_readonly(false);
+  request.set_readonly(volumeState.readonly());
   *request.mutable_volume_context() = volumeState.volume_context();
 
   return call(
@@ -976,7 +976,7 @@ Future VolumeManagerProcess::_publishVolume(const string& volumeId)
   request.set_target_path(targetPath);
   *request.mutable_volume_capability() =
 evolve(volumeState.volume_capability());
-  request.set_readonly(false);
+  request.set_readonly(volumeState.readonly());
   *request.mutable_volume_context() = volumeState.volume_context();
 
   if (nodeCapabilities->stageUnstageVolume) {



[mesos] 01/03: Enabled pre-provisioned volumes in the volume manager.

2020-08-03 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 8d545180294ae705f7b8a2fe1578653107ede540
Author: Greg Mann 
AuthorDate: Mon Aug 3 10:11:50 2020 -0700

Enabled pre-provisioned volumes in the volume manager.

This patch makes it possible to publish CSI volumes on an
agent which were pre-provisioned out of band.

Review: https://reviews.apache.org/r/72681/
---
 src/csi/state.proto   |  5 ++
 src/csi/v0_volume_manager.cpp | 95 +++
 src/csi/v0_volume_manager.hpp |  5 +-
 src/csi/v0_volume_manager_process.hpp |  8 ++-
 src/csi/v1_volume_manager.cpp | 95 +++
 src/csi/v1_volume_manager.hpp |  5 +-
 src/csi/v1_volume_manager_process.hpp |  8 ++-
 src/csi/volume_manager.hpp| 18 +--
 8 files changed, 190 insertions(+), 49 deletions(-)

diff --git a/src/csi/state.proto b/src/csi/state.proto
index 28ad5ef..af0ef1c 100644
--- a/src/csi/state.proto
+++ b/src/csi/state.proto
@@ -70,4 +70,9 @@ message VolumeState {
   // hence needs cleanup. If set, the resource provider MUST transition the
   // volume to `PUBLISHED` state during recovery.
   bool node_publish_required = 7;
+
+  // Indicates that the volume was not created by a CSI plugin, but rather was
+  // pre-provisioned by some other means and then attached to the node using a
+  // CSI plugin.
+  bool pre_provisioned = 8;
 }
diff --git a/src/csi/v0_volume_manager.cpp b/src/csi/v0_volume_manager.cpp
index b383598..5368440 100644
--- a/src/csi/v0_volume_manager.cpp
+++ b/src/csi/v0_volume_manager.cpp
@@ -452,8 +452,29 @@ Future VolumeManagerProcess::detachVolume(const 
string& volumeId)
 }
 
 
-Future VolumeManagerProcess::publishVolume(const string& volumeId)
+Future VolumeManagerProcess::publishVolume(
+const string& volumeId,
+const Option& volumeState)
 {
+  if (volumeState.isSome()) {
+if (!volumeState->pre_provisioned()) {
+  return Failure(
+  "Cannot specify volume state when publishing a volume unless that"
+  " volume is pre-provisioned");
+}
+
+if (volumeState->state() != VolumeState::VOL_READY &&
+volumeState->state() != VolumeState::NODE_READY) {
+  return Failure(
+  "Cannot specify volume state when publishing a volume unless that"
+  " volume is in either the VOL_READY or NODE_READY state");
+}
+
+// This must be an untracked volume. Track it now before we continue.
+volumes.put(volumeId, VolumeState(volumeState.get()));
+checkpointVolumeState(volumeId);
+  }
+
   if (!volumes.contains(volumeId)) {
 return Failure("Cannot publish unknown volume '" + volumeId + "'");
   }
@@ -728,16 +749,7 @@ Future VolumeManagerProcess::_deleteVolume(const 
std::string& volumeId)
   // the future returned by the sequence ready as well.
   return __deleteVolume(volumeId)
 .then(process::defer(self(), [this, volumeId](bool deleted) {
-  volumes.erase(volumeId);
-
-  const string volumePath =
-paths::getVolumePath(rootDir, info.type(), info.name(), volumeId);
-
-  Try rmdir = os::rmdir(volumePath);
-  CHECK_SOME(rmdir) << "Failed to remove checkpointed volume state at '"
-<< volumePath << "': " << rmdir.error();
-
-  garbageCollectMountPath(volumeId);
+  removeVolume(volumeId);
 
   return deleted;
 }));
@@ -1051,6 +1063,13 @@ Future 
VolumeManagerProcess::_unpublishVolume(const string& volumeId)
 
   if (volumeState.state() == VolumeState::NODE_READY) {
 CHECK(volumeState.boot_id().empty());
+
+if (volumeState.pre_provisioned()) {
+  // Since this volume was pre-provisioned, it has reached the end of its
+  // lifecycle. Remove it now.
+  removeVolume(volumeId);
+}
+
 return Nothing();
   }
 
@@ -1063,9 +1082,16 @@ Future 
VolumeManagerProcess::_unpublishVolume(const string& volumeId)
   }
 
   if (!nodeCapabilities->stageUnstageVolume) {
-// Since this is a no-op, no need to checkpoint here.
-volumeState.set_state(VolumeState::NODE_READY);
-volumeState.clear_boot_id();
+if (volumeState.pre_provisioned()) {
+  // Since this volume was pre-provisioned, it has reached the end of its
+  // lifecycle. Remove it now.
+  removeVolume(volumeId);
+} else {
+  // Since this is a no-op, no need to checkpoint here.
+  volumeState.set_state(VolumeState::NODE_READY);
+  volumeState.clear_boot_id();
+}
+
 return Nothing();
   }
 
@@ -1091,13 +1117,20 @@ Future 
VolumeManagerProcess::_unpublishVolume(const string& volumeId)
   request.set_staging_target_path(stagingPath);
 
   return call(NODE_SERVICE, ::nodeUnstageVol

[mesos] branch master updated: Fixed an example in the documentation.

2020-07-16 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new 8e74d18  Fixed an example in the documentation.
8e74d18 is described below

commit 8e74d18c1a34fe3d5a9f553552c6e9b66411a575
Author: Greg Mann 
AuthorDate: Thu Jul 16 09:10:30 2020 -0700

Fixed an example in the documentation.
---
 docs/operator-http-api.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/operator-http-api.md b/docs/operator-http-api.md
index 585e94c..8b0fbdc 100644
--- a/docs/operator-http-api.md
+++ b/docs/operator-http-api.md
@@ -2675,7 +2675,7 @@ Accept: application/json
 "agent_id": {
   "value": "3192b9d1-db71-4699-ae25-e28dfbf42de1"
 },
-"max_grace_period": "10mins",
+"max_grace_period": {"seconds": 600},
 "mark_gone": false
   }
 }



[mesos] annotated tag 1.7.3 created (now 20756d0)

2020-05-19 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a change to annotated tag 1.7.3
in repository https://gitbox.apache.org/repos/asf/mesos.git.


  at 20756d0  (tag)
 tagging 684c41947fe47ae9fc3020598e26368fb8863eea (tag)
  length 159 bytes
  by Greg Mann
  on Tue May 19 12:29:49 2020 -0700

- Log -
Tagging Mesos 1.7.3
---

No new revisions were added by this update.



svn commit: r39700 - in /release/mesos/1.7.3: ./ mesos-1.7.3.tar.gz mesos-1.7.3.tar.gz.asc mesos-1.7.3.tar.gz.sha512

2020-05-19 Thread grag
Author: grag
Date: Tue May 19 19:29:49 2020
New Revision: 39700

Log:
Adding mesos-1.7.3.

Added:
release/mesos/1.7.3/
release/mesos/1.7.3/mesos-1.7.3.tar.gz   (with props)
release/mesos/1.7.3/mesos-1.7.3.tar.gz.asc
release/mesos/1.7.3/mesos-1.7.3.tar.gz.sha512

Added: release/mesos/1.7.3/mesos-1.7.3.tar.gz
==
Binary file - no diff available.

Propchange: release/mesos/1.7.3/mesos-1.7.3.tar.gz
--
svn:mime-type = application/octet-stream

Added: release/mesos/1.7.3/mesos-1.7.3.tar.gz.asc
==
--- release/mesos/1.7.3/mesos-1.7.3.tar.gz.asc (added)
+++ release/mesos/1.7.3/mesos-1.7.3.tar.gz.asc Tue May 19 19:29:49 2020
@@ -0,0 +1,17 @@
+-BEGIN PGP SIGNATURE-
+Version: GnuPG v2.0.22 (GNU/Linux)
+
+iQIcBAABAgAGBQJesFMkAAoJEEPsLr7bUMMbMQQQAIDWLNTTHHrSiwgvkOYRa+Hy
+U8J5cHUJiFVy+s/DHApFjwZoU299uqtxk+zn6shesMT7sEC3razKYDfvsa7+F6dK
+mDQUcRw7DKVXxPt40QhM/eZjX2UnaamC7vbpXt8R2I6JUDN4mEkQi4v+tXeKBueh
+d0owEhpLtMVtvruBcQKUXZQU4v9PTe+vSUitWIGdc3JmtqK+Ocw1okCwRLBGbYs3
+lpOUdKAt0cGrHk3uBwurVFY/draVDdAE7gIOdmwCKXzwZ1f58bSEa/YFbmFrsIEx
+vI2nQGG5om4Gt3RaecwbONZUJoObZZtWdPY6ebQLaMp0PlI35lWTM5s80zRr38nB
+r3NcSIfEscUEMnqzQ/fBGlMip17M+iE5J2JiMmre4jcIbiDk2n+14QjR2D2ehPdN
+JdVJZ07Z8PtWV/kRI+9UK6rfvE+FrdnmiCcZaFWo3lEy/L0FArFSnBgPV50/4i/Z
+UnPl3klDaSmXlXTk9d7arMXxEBGXmEuYWzne2dqEOaB3VCURTBhFg3t0n7XrHW7e
+PXHqOx4dxgTtrfssRcLBlWhhQ6aDZ6sIVUou9YAiOv1zKjuiekoBT5Mhj4q/LTAR
+zVcuN/zVT4wv+F8URU48WAsymGrsmzWEORvowcvBsKEADpmi050vd4c4+e1dQQ8i
+E/dXz1wYgN8B6Be2Qz8S
+=+QoT
+-END PGP SIGNATURE-

Added: release/mesos/1.7.3/mesos-1.7.3.tar.gz.sha512
==
--- release/mesos/1.7.3/mesos-1.7.3.tar.gz.sha512 (added)
+++ release/mesos/1.7.3/mesos-1.7.3.tar.gz.sha512 Tue May 19 19:29:49 2020
@@ -0,0 +1 @@
+6ecea4edcd49e364549f5e8d5728644964cf87fc6a6d0431693efeb94b5c970ad663a0e8279694f4e51408c0ea91aebd4ae08ba5b880b460e4708309d9503bd9
  mesos-1.7.3.tar.gz




[mesos] 01/02: Fixed the java bindings in the cmake build.

2020-05-11 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 630dbf7486bfe315472fbcd8d0087f75cd9f3786
Author: Andriy Kornatskyy 
AuthorDate: Fri May 8 14:10:39 2020 -0700

Fixed the java bindings in the cmake build.

This closes #360
---
 src/CMakeLists.txt  | 3 +--
 src/java/CMakeLists.txt | 3 ++-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 96cd867..810acbf 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -627,8 +627,7 @@ target_link_libraries(
   sasl2
   zookeeper
   mesos-protobufs
-  $<$,$>:nvml>
-  $<$:mesos-java>)
+  $<$,$>:nvml>)
 
 if (NOT WIN32)
   target_link_libraries(mesos PUBLIC leveldb)
diff --git a/src/java/CMakeLists.txt b/src/java/CMakeLists.txt
index 29422e9..81eb9b5 100644
--- a/src/java/CMakeLists.txt
+++ b/src/java/CMakeLists.txt
@@ -116,7 +116,8 @@ add_custom_command(
 add_library(mesos-java ${JAVA_SRC} ${JAVA_H})
 
 target_link_libraries(
-  mesos-java
+  mesos-java PUBLIC
+  mesos
   mesos-protobufs
   process
   zookeeper



[mesos] branch master updated (48922e0 -> 07cd355)

2020-05-11 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from 48922e0  Updated CHANGELOG for 1.10.0.
 new 630dbf7  Fixed the java bindings in the cmake build.
 new 07cd355  Added ability to specify a root dir for boost and curl with 
cmake.

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 3rdparty/CMakeLists.txt  | 61 +---
 cmake/CompilationConfigure.cmake | 14 +
 src/CMakeLists.txt   |  3 +-
 src/java/CMakeLists.txt  |  3 +-
 4 files changed, 56 insertions(+), 25 deletions(-)



[mesos] 02/02: Added ability to specify a root dir for boost and curl with cmake.

2020-05-11 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 07cd355f90892a897e0b210a8cd0eda6103ae6c9
Author: Andriy Kornatskyy 
AuthorDate: Fri May 8 14:11:10 2020 -0700

Added ability to specify a root dir for boost and curl with cmake.

This closes #361
---
 3rdparty/CMakeLists.txt  | 61 +---
 cmake/CompilationConfigure.cmake | 14 +
 2 files changed, 53 insertions(+), 22 deletions(-)

diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt
index 119813e..7b84f12 100644
--- a/3rdparty/CMakeLists.txt
+++ b/3rdparty/CMakeLists.txt
@@ -194,29 +194,34 @@ endfunction()
 # Boost: C++ Libraries.
 # http://www.boost.org
 ###
-EXTERNAL(boost ${BOOST_VERSION} ${CMAKE_CURRENT_BINARY_DIR})
-add_library(boost INTERFACE)
-add_dependencies(boost ${BOOST_TARGET})
-if (CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES Clang)
-  # Headers including Boost 1.65.0 fail to compile with GCC 7.2 and
-  # CLang 3.6 without `-Wno-unused-local-typedefs`.
-  # TODO(andschwa): Remove this when Boost has a resolution.
-  target_compile_options(boost INTERFACE -Wno-unused-local-typedefs)
-endif ()
-target_include_directories(boost INTERFACE ${BOOST_ROOT})
+if ("${BOOST_ROOT_DIR}" STREQUAL "")
+  EXTERNAL(boost ${BOOST_VERSION} ${CMAKE_CURRENT_BINARY_DIR})
+  add_library(boost INTERFACE)
+  add_dependencies(boost ${BOOST_TARGET})
+  if (CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES Clang)
+# Headers including Boost 1.65.0 fail to compile with GCC 7.2 and
+# CLang 3.6 without `-Wno-unused-local-typedefs`.
+# TODO(andschwa): Remove this when Boost has a resolution.
+target_compile_options(boost INTERFACE -Wno-unused-local-typedefs)
+  endif ()
+  target_include_directories(boost INTERFACE ${BOOST_ROOT})
 
-# Patch Boost to avoid repeated "Unknown compiler warnings" on Windows.
-PATCH_CMD(BOOST_PATCH_CMD boost-${BOOST_VERSION}.patch)
+  # Patch Boost to avoid repeated "Unknown compiler warnings" on Windows.
+  PATCH_CMD(BOOST_PATCH_CMD boost-${BOOST_VERSION}.patch)
 
-ExternalProject_Add(
-  ${BOOST_TARGET}
-  PREFIX${BOOST_CMAKE_ROOT}
-  PATCH_COMMAND ${BOOST_PATCH_CMD}
-  CONFIGURE_COMMAND ${CMAKE_NOOP}
-  BUILD_COMMAND ${CMAKE_NOOP}
-  INSTALL_COMMAND   ${CMAKE_NOOP}
-  URL   ${BOOST_URL}
-  URL_HASH  ${BOOST_HASH})
+  ExternalProject_Add(
+${BOOST_TARGET}
+PREFIX${BOOST_CMAKE_ROOT}
+PATCH_COMMAND ${BOOST_PATCH_CMD}
+CONFIGURE_COMMAND ${CMAKE_NOOP}
+BUILD_COMMAND ${CMAKE_NOOP}
+INSTALL_COMMAND   ${CMAKE_NOOP}
+URL   ${BOOST_URL}
+URL_HASH  ${BOOST_HASH})
+else ()
+  add_library(boost INTERFACE)
+  target_include_directories(boost INTERFACE ${BOOST_ROOT_DIR}/include)
+endif ()
 
 
 # moodycamel::ConcurrentQueue: An industrial-strength lock-free queue.
@@ -861,7 +866,19 @@ if (WIN32)
 COMMAND ${CMAKE_COMMAND} -E copy $ 
${CMAKE_BINARY_DIR}/src/curl.exe
 DEPENDEES build)
 else ()
-  find_package(CURL REQUIRED)
+  if ("${CURL_ROOT_DIR}" STREQUAL "")
+find_package(CURL REQUIRED)
+  else ()
+set(POSSIBLE_CURL_INCLUDE_DIRS ${CURL_ROOT_DIR}/include)
+set(POSSIBLE_CURL_LIB_DIRS ${CURL_ROOT_DIR}/lib)
+
+set(CURL_LIBRARY_NAMES curl)
+
+FIND_PACKAGE_HELPER(CURL curl/curl.h)
+SET(CURL_INCLUDE_DIRS ${CURL_INCLUDE_DIR})
+SET(CURL_LIBRARIES ${CURL_LIBS})
+  endif ()
+
   add_library(libcurl SHARED IMPORTED)
 
   set_target_properties(
diff --git a/cmake/CompilationConfigure.cmake b/cmake/CompilationConfigure.cmake
index f9511fc..af1a8b5 100644
--- a/cmake/CompilationConfigure.cmake
+++ b/cmake/CompilationConfigure.cmake
@@ -103,6 +103,20 @@ if (ENABLE_LIBEVENT)
 "Specify the path to libevent, e.g. \"C:\\libevent-Win64\".")
 endif()
 
+set(
+  BOOST_ROOT_DIR
+  ""
+  CACHE STRING
+  "Specify the path to boost.")
+
+if (NOT WIN32)
+  set(
+CURL_ROOT_DIR
+""
+CACHE STRING
+"Specify the path to libcurl.")
+endif()
+
 option(
   UNBUNDLED_LEVELDB
   "Build with an installed leveldb version instead of the bundled."



[mesos] branch 1.9.x updated: Added MESOS-10118 to the 1.9.1 CHANGELOG.

2020-05-06 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch 1.9.x
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/1.9.x by this push:
 new 30b7aae  Added MESOS-10118 to the 1.9.1 CHANGELOG.
30b7aae is described below

commit 30b7aae987c3534af87453e884e1a87841f3a72c
Author: Greg Mann 
AuthorDate: Wed May 6 18:07:36 2020 -0700

Added MESOS-10118 to the 1.9.1 CHANGELOG.
---
 CHANGELOG | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG b/CHANGELOG
index d07f6ce..59ccef6 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -14,6 +14,7 @@ Release Notes - Mesos - Version 1.9.1 (WIP)
   * [MESOS-10041] - Libprocess SSL verification can leak memory.
   * [MESOS-10094] - Master's agent draining VLOG prints incorrect task counts.
   * [MESOS-10096] - Reactivating a draining agent leaves the agent in draining 
state.
+  * [MESOS-10118] - Agent incorrectly handles draining when empty.
 
 ** Improvement
   * [MESOS-9889] - Master CPU high due to unexpected foreachkey behaviour in 
Master::__reregisterSlave.



[mesos] branch master updated: Added MESOS-10118 to the 1.9.1 CHANGELOG.

2020-05-06 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new 8682b5d  Added MESOS-10118 to the 1.9.1 CHANGELOG.
8682b5d is described below

commit 8682b5ddf8b773beffe8bf0428c9350d6ae59412
Author: Greg Mann 
AuthorDate: Wed May 6 18:07:36 2020 -0700

Added MESOS-10118 to the 1.9.1 CHANGELOG.
---
 CHANGELOG | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG b/CHANGELOG
index a115101..f43ab8d 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -22,6 +22,7 @@ Release Notes - Mesos - Version 1.9.1 (WIP)
   * [MESOS-10041] - Libprocess SSL verification can leak memory.
   * [MESOS-10094] - Master's agent draining VLOG prints incorrect task counts.
   * [MESOS-10096] - Reactivating a draining agent leaves the agent in draining 
state.
+  * [MESOS-10118] - Agent incorrectly handles draining when empty.
 
 ** Improvement
   * [MESOS-9889] - Master CPU high due to unexpected foreachkey behaviour in 
Master::__reregisterSlave.



[mesos] branch 1.9.x updated: Fixed a bug in the agent's draining handler.

2020-05-06 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch 1.9.x
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/1.9.x by this push:
 new 35912a2  Fixed a bug in the agent's draining handler.
35912a2 is described below

commit 35912a22081e88ba243d2b690667dff6a90c51d0
Author: Greg Mann 
AuthorDate: Wed May 6 16:35:19 2020 -0700

Fixed a bug in the agent's draining handler.

Previously, when the agent had no tasks or operations and
received a `DrainSlaveMessage`, it would checkpoint the
`DrainConfig` to disk, implicitly placing it into a "draining"
state indefinitely. This patch updates the agent's handler to
avoid checkpointing anything to disk in this case.

The `SlaveTest.DrainInfoInAPIOutputs` test is also removed
and its functionality is moved into the test
`SlaveTest.DrainAgentKillsRunningTask`. The running task in
the latter test allows us to verify agent API outputs both
before and after the task's terminal update is acknowledged.

Review: https://reviews.apache.org/r/72368/
---
 src/slave/slave.cpp   |  12 +++
 src/tests/slave_tests.cpp | 215 +-
 2 files changed, 127 insertions(+), 100 deletions(-)

diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 23d2ddd..7110ff4 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -998,6 +998,18 @@ void Slave::drain(
 const UPID& from,
 DrainSlaveMessage&& drainSlaveMessage)
 {
+  if (operations.empty() && frameworks.empty()) {
+LOG(INFO)
+  << "Received DrainConfig " << drainSlaveMessage.config()
+  << (drainConfig.isSome()
+  ? "; previously stored DrainConfig " + stringify(*drainConfig)
+  : "")
+  << "; agent has no stored frameworks, tasks, or operations,"
+ " so draining is already complete";
+
+return;
+  }
+
   hashmap> pendingTaskIds;
   foreachvalue (Framework* framework, frameworks) {
 foreachvalue (const auto& taskMap, framework->pendingTasks) {
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index c147bfc..335a1c4 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -11928,97 +11928,8 @@ TEST_F(
 }
 
 
-// When the agent receives a `DrainSlaveMessage` from the master, the agent's
-// drain info should be visible in the agent's API output.
-TEST_F(SlaveTest, DrainInfoInAPIOutputs)
-{
-  Clock::pause();
-
-  const int GRACE_PERIOD_NANOS = 100;
-
-  Try> master = StartMaster();
-  ASSERT_SOME(master);
-
-  Future slaveRegisteredMessage =
-FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _);
-
-  StandaloneMasterDetector detector(master.get()->pid);
-
-  slave::Flags slaveFlags = CreateSlaveFlags();
-
-  Try> slave = StartSlave(, slaveFlags);
-  ASSERT_SOME(slave);
-
-  Clock::advance(slaveFlags.registration_backoff_factor);
-
-  AWAIT_READY(slaveRegisteredMessage);
-
-  // Simulate the master sending a `DrainSlaveMessage` to the agent.
-  DurationInfo maxGracePeriod;
-  maxGracePeriod.set_nanoseconds(GRACE_PERIOD_NANOS);
-
-  DrainConfig drainConfig;
-  drainConfig.set_mark_gone(true);
-  drainConfig.mutable_max_grace_period()->CopyFrom(maxGracePeriod);
-
-  DrainSlaveMessage drainSlaveMessage;
-  drainSlaveMessage.mutable_config()->CopyFrom(drainConfig);
-
-  process::post(master.get()->pid, slave.get()->pid, drainSlaveMessage);
-
-  Clock::settle();
-
-  {
-v1::agent::Call call;
-call.set_type(v1::agent::Call::GET_AGENT);
-
-const ContentType contentType = ContentType::PROTOBUF;
-
-process::http::Headers headers = 
createBasicAuthHeaders(DEFAULT_CREDENTIAL);
-headers["Accept"] = stringify(contentType);
-
-Future httpResponse =
-  process::http::post(
-  slave.get()->pid,
-  "api/v1",
-  headers,
-  serialize(contentType, call),
-  stringify(contentType));
-
-AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, httpResponse);
-
-Future responseMessage =
-  deserialize(contentType, httpResponse->body);
-
-AWAIT_READY(responseMessage);
-ASSERT_TRUE(responseMessage->IsInitialized());
-ASSERT_EQ(v1::agent::Response::GET_AGENT, responseMessage->type());
-ASSERT_TRUE(responseMessage->get_agent().has_drain_config());
-EXPECT_EQ(
-drainConfig,
-devolve(responseMessage->get_agent().drain_config()));
-  }
-
-  {
-Future response = process::http::get(
-slave.get()->pid,
-"state",
-None(),
-createBasicAuthHeaders(DEFAULT_CREDENTIAL));
-
-AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
-AWAIT_EXPECT_RESPONSE_HEADER_EQ(APPLICATION_JSON, "Content-Type", 
response);
-
-Try state

[mesos] branch master updated (ff9f5cc -> 06cc8ac)

2020-05-06 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from ff9f5cc  Fixed a bug in the agent's draining handler.
 add 5e77c94  Made the scheduler resources a top-level item in the 
documentation.
 add 1949f52  Added intro docs on running workloads.
 add 06cc8ac  Updated existing documentation for resource limits.

No new revisions were added by this update.

Summary of changes:
 docs/home.md|  16 +-
 docs/nested-container-and-task-group.md |  91 
 docs/running-workloads.md   | 255 
 docs/scheduler-http-api.md  |  93 ++--
 docs/upgrades.md|   5 +
 5 files changed, 385 insertions(+), 75 deletions(-)
 create mode 100644 docs/running-workloads.md



[mesos] branch master updated: Fixed a bug in the agent's draining handler.

2020-05-06 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new ff9f5cc  Fixed a bug in the agent's draining handler.
ff9f5cc is described below

commit ff9f5cc796f6a99302d94de121726cd7b5988f11
Author: Greg Mann 
AuthorDate: Wed May 6 16:35:19 2020 -0700

Fixed a bug in the agent's draining handler.

Previously, when the agent had no tasks or operations and
received a `DrainSlaveMessage`, it would checkpoint the
`DrainConfig` to disk, implicitly placing it into a "draining"
state indefinitely. This patch updates the agent's handler to
avoid checkpointing anything to disk in this case.

The `SlaveTest.DrainInfoInAPIOutputs` test is also removed
and its functionality is moved into the test
`SlaveTest.DrainAgentKillsRunningTask`. The running task in
the latter test allows us to verify agent API outputs both
before and after the task's terminal update is acknowledged.

Review: https://reviews.apache.org/r/72368/
---
 src/slave/slave.cpp   |  12 +++
 src/tests/slave_tests.cpp | 215 +-
 2 files changed, 127 insertions(+), 100 deletions(-)

diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 1a32c81..c828d99 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -1074,6 +1074,18 @@ void Slave::drain(
 const UPID& from,
 DrainSlaveMessage&& drainSlaveMessage)
 {
+  if (operations.empty() && frameworks.empty()) {
+LOG(INFO)
+  << "Received DrainConfig " << drainSlaveMessage.config()
+  << (drainConfig.isSome()
+  ? "; previously stored DrainConfig " + stringify(*drainConfig)
+  : "")
+  << "; agent has no stored frameworks, tasks, or operations,"
+ " so draining is already complete";
+
+return;
+  }
+
   hashmap> pendingTaskIds;
   foreachvalue (Framework* framework, frameworks) {
 foreachvalue (const auto& taskMap, framework->pendingTasks) {
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index 5ad04b2..b46e561 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -12089,97 +12089,8 @@ TEST_F(
 }
 
 
-// When the agent receives a `DrainSlaveMessage` from the master, the agent's
-// drain info should be visible in the agent's API output.
-TEST_F(SlaveTest, DrainInfoInAPIOutputs)
-{
-  Clock::pause();
-
-  const int GRACE_PERIOD_NANOS = 100;
-
-  Try> master = StartMaster();
-  ASSERT_SOME(master);
-
-  Future slaveRegisteredMessage =
-FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _);
-
-  StandaloneMasterDetector detector(master.get()->pid);
-
-  slave::Flags slaveFlags = CreateSlaveFlags();
-
-  Try> slave = StartSlave(, slaveFlags);
-  ASSERT_SOME(slave);
-
-  Clock::advance(slaveFlags.registration_backoff_factor);
-
-  AWAIT_READY(slaveRegisteredMessage);
-
-  // Simulate the master sending a `DrainSlaveMessage` to the agent.
-  DurationInfo maxGracePeriod;
-  maxGracePeriod.set_nanoseconds(GRACE_PERIOD_NANOS);
-
-  DrainConfig drainConfig;
-  drainConfig.set_mark_gone(true);
-  drainConfig.mutable_max_grace_period()->CopyFrom(maxGracePeriod);
-
-  DrainSlaveMessage drainSlaveMessage;
-  drainSlaveMessage.mutable_config()->CopyFrom(drainConfig);
-
-  process::post(master.get()->pid, slave.get()->pid, drainSlaveMessage);
-
-  Clock::settle();
-
-  {
-v1::agent::Call call;
-call.set_type(v1::agent::Call::GET_AGENT);
-
-const ContentType contentType = ContentType::PROTOBUF;
-
-process::http::Headers headers = 
createBasicAuthHeaders(DEFAULT_CREDENTIAL);
-headers["Accept"] = stringify(contentType);
-
-Future httpResponse =
-  process::http::post(
-  slave.get()->pid,
-  "api/v1",
-  headers,
-  serialize(contentType, call),
-  stringify(contentType));
-
-AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, httpResponse);
-
-Future responseMessage =
-  deserialize(contentType, httpResponse->body);
-
-AWAIT_READY(responseMessage);
-ASSERT_TRUE(responseMessage->IsInitialized());
-ASSERT_EQ(v1::agent::Response::GET_AGENT, responseMessage->type());
-ASSERT_TRUE(responseMessage->get_agent().has_drain_config());
-EXPECT_EQ(
-drainConfig,
-devolve(responseMessage->get_agent().drain_config()));
-  }
-
-  {
-Future response = process::http::get(
-slave.get()->pid,
-"state",
-None(),
-createBasicAuthHeaders(DEFAULT_CREDENTIAL));
-
-AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
-AWAIT_EXPECT_RESPONSE_HEADER_EQ(APPLICATION_JSON, "Content-Type", 
response);
-
-Try state

svn commit: r39283 - in /dev/mesos/1.7.3-rc1: ./ mesos-1.7.3.tar.gz mesos-1.7.3.tar.gz.asc mesos-1.7.3.tar.gz.sha512

2020-05-04 Thread grag
Author: grag
Date: Mon May  4 17:39:21 2020
New Revision: 39283

Log:
Adding mesos-1.7.3-rc1.

Added:
dev/mesos/1.7.3-rc1/
dev/mesos/1.7.3-rc1/mesos-1.7.3.tar.gz   (with props)
dev/mesos/1.7.3-rc1/mesos-1.7.3.tar.gz.asc
dev/mesos/1.7.3-rc1/mesos-1.7.3.tar.gz.sha512

Added: dev/mesos/1.7.3-rc1/mesos-1.7.3.tar.gz
==
Binary file - no diff available.

Propchange: dev/mesos/1.7.3-rc1/mesos-1.7.3.tar.gz
--
svn:mime-type = application/octet-stream

Added: dev/mesos/1.7.3-rc1/mesos-1.7.3.tar.gz.asc
==
--- dev/mesos/1.7.3-rc1/mesos-1.7.3.tar.gz.asc (added)
+++ dev/mesos/1.7.3-rc1/mesos-1.7.3.tar.gz.asc Mon May  4 17:39:21 2020
@@ -0,0 +1,17 @@
+-BEGIN PGP SIGNATURE-
+Version: GnuPG v2.0.22 (GNU/Linux)
+
+iQIcBAABAgAGBQJesFMkAAoJEEPsLr7bUMMbMQQQAIDWLNTTHHrSiwgvkOYRa+Hy
+U8J5cHUJiFVy+s/DHApFjwZoU299uqtxk+zn6shesMT7sEC3razKYDfvsa7+F6dK
+mDQUcRw7DKVXxPt40QhM/eZjX2UnaamC7vbpXt8R2I6JUDN4mEkQi4v+tXeKBueh
+d0owEhpLtMVtvruBcQKUXZQU4v9PTe+vSUitWIGdc3JmtqK+Ocw1okCwRLBGbYs3
+lpOUdKAt0cGrHk3uBwurVFY/draVDdAE7gIOdmwCKXzwZ1f58bSEa/YFbmFrsIEx
+vI2nQGG5om4Gt3RaecwbONZUJoObZZtWdPY6ebQLaMp0PlI35lWTM5s80zRr38nB
+r3NcSIfEscUEMnqzQ/fBGlMip17M+iE5J2JiMmre4jcIbiDk2n+14QjR2D2ehPdN
+JdVJZ07Z8PtWV/kRI+9UK6rfvE+FrdnmiCcZaFWo3lEy/L0FArFSnBgPV50/4i/Z
+UnPl3klDaSmXlXTk9d7arMXxEBGXmEuYWzne2dqEOaB3VCURTBhFg3t0n7XrHW7e
+PXHqOx4dxgTtrfssRcLBlWhhQ6aDZ6sIVUou9YAiOv1zKjuiekoBT5Mhj4q/LTAR
+zVcuN/zVT4wv+F8URU48WAsymGrsmzWEORvowcvBsKEADpmi050vd4c4+e1dQQ8i
+E/dXz1wYgN8B6Be2Qz8S
+=+QoT
+-END PGP SIGNATURE-

Added: dev/mesos/1.7.3-rc1/mesos-1.7.3.tar.gz.sha512
==
--- dev/mesos/1.7.3-rc1/mesos-1.7.3.tar.gz.sha512 (added)
+++ dev/mesos/1.7.3-rc1/mesos-1.7.3.tar.gz.sha512 Mon May  4 17:39:21 2020
@@ -0,0 +1 @@
+6ecea4edcd49e364549f5e8d5728644964cf87fc6a6d0431693efeb94b5c970ad663a0e8279694f4e51408c0ea91aebd4ae08ba5b880b460e4708309d9503bd9
  mesos-1.7.3.tar.gz




[mesos] annotated tag 1.7.3-rc1 created (now 684c419)

2020-05-04 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a change to annotated tag 1.7.3-rc1
in repository https://gitbox.apache.org/repos/asf/mesos.git.


  at 684c419  (tag)
 tagging 5f617044c969ebcfca281d043a2474c1a6b39f23 (commit)
 replaces 1.7.2
  by Greg Mann
  on Mon May 4 10:06:56 2020 -0700

- Log -
Tagging Mesos 1.7.3-rc1.
---

No new revisions were added by this update.



[mesos] branch 1.7.x updated: Prepared the 1.7.3 CHANGELOG for release.

2020-04-29 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch 1.7.x
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/1.7.x by this push:
 new 5f61704  Prepared the 1.7.3 CHANGELOG for release.
5f61704 is described below

commit 5f617044c969ebcfca281d043a2474c1a6b39f23
Author: Greg Mann 
AuthorDate: Tue Apr 28 23:12:14 2020 -0700

Prepared the 1.7.3 CHANGELOG for release.
---
 CHANGELOG | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG b/CHANGELOG
index d2e3a19..64921a6 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,4 @@
-Release Notes - Mesos - Version 1.7.3 (WIP)
+Release Notes - Mesos - Version 1.7.3
 ---
 * This is a bug fix release.
 



[mesos] branch master updated: Added missing issues to the 1.7.3 CHANGELOG.

2020-04-28 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new db83570  Added missing issues to the 1.7.3 CHANGELOG.
db83570 is described below

commit db83570b13ad82f467ae6dfc5642cc8da2f7a8fc
Author: Greg Mann 
AuthorDate: Tue Apr 28 22:52:02 2020 -0700

Added missing issues to the 1.7.3 CHANGELOG.
---
 CHANGELOG | 9 +
 1 file changed, 9 insertions(+)

diff --git a/CHANGELOG b/CHANGELOG
index 8df75fb..a115101 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -731,12 +731,15 @@ Release Notes - Mesos - Version 1.7.3 (WIP)
 
 ** Bug
   * [MESOS-8467] - Destroyed executors might be used after 
`Slave::publishResource()`.
+  * [MESOS-8537] - Default executor doesn't wait for status updates to be 
ack'd before shutting down.
   * [MESOS-9124] - Agent reconfiguration can cause master to unsuppress on 
scheduler's behalf.
   * [MESOS-9507] - Agent could not recover due to empty docker volume 
checkpointed files.
   * [MESOS-9529] - `/proc` should be remounted even if a nested container set 
`share_pid_namespace` to true.
   * [MESOS-9549] - nvidia/cuda 10 does not work on GPU isolator.
   * [MESOS-9564] - Logrotate container logger lets tasks execute arbitrary 
commands in the Mesos agent's namespace.
   * [MESOS-9568] - SLRP does not clean up mount directories for destroyed 
MOUNT disks.
+  * [MESOS-9581] - Mesos package naming appears to be undeterministic.
+  * [MESOS-9590] - Mesos CI sometimes, incorrectly, overwrites already-pushed 
mesos master nightly images with new images built from non-master branches.
   * [MESOS-9607] - Removing a resource provider with consumers breaks resource 
publishing.
   * [MESOS-9610] - Fetcher vulnerability - escaping from sandbox.
   * [MESOS-9616] - `Filters.refuse_seconds` declines resources not in offers.
@@ -752,6 +755,7 @@ Release Notes - Mesos - Version 1.7.3 (WIP)
   * [MESOS-9787] - Log slow SSL (TLS) peer reverse DNS lookup.
   * [MESOS-9803] - Memory leak caused by an infinite chain of futures in 
`UriDiskProfileAdaptor`.
   * [MESOS-9836] - Docker containerizer overwrites `/mesos/slave` cgroups.
+  * [MESOS-9847] - Docker executor doesn't wait for status updates to be ack'd 
before shutting down.
   * [MESOS-9852] - Slow memory growth in master due to deferred deletion of 
offer filters and timers.
   * [MESOS-9856] - REVIVE call with specified role(s) clears filters for all 
roles of a framework.
   * [MESOS-9868] - NetworkInfo from the agent /state endpoint is not correct.
@@ -762,13 +766,18 @@ Release Notes - Mesos - Version 1.7.3 (WIP)
   * [MESOS-9925] - Default executor takes a couple of seconds to start and 
subscribe Mesos agent.
   * [MESOS-9964] - Support destroying UCR containers in provisioning state.
   * [MESOS-9966] - Agent crashes when trying to destroy orphaned nested 
container if root container is orphaned as well.
+  * [MESOS-9968] - WWWAuthenticate header parsing fails when commas are in 
(quoted) realm.
   * [MESOS-10007] - Command executor can miss exit status for short-lived 
commands due to double-reaping.
   * [MESOS-10015] - updateAllocation() can stall the allocator with a huge 
number of reservations on an agent.
+  * [MESOS-10018] - Duplicate tasks if agent partitioned during maintenance 
down.
+  * [MESOS-10084] - Detecting whether executor is generated for command task 
should work when the launcher_dir changes.
+  * [MESOS-10092] - Cannot pull image from docker registry which does not 
reply with 'scope'/'service' in WWW-Authenticate header.
 
 ** Improvements
   * [MESOS-8880] - Add minimum capabilities in the master.
   * [MESOS-9159] - Support Foreign URLs in docker registry puller.
   * [MESOS-9540] - Support `DESTROY_DISK` on preprovisioned CSI volumes.
+  * [MESOS-9545] - Marking an unreachable agent as gone should transition the 
tasks to terminal state.
   * [MESOS-9675] - Docker Manifest V2 Schema2 Support.
   * [MESOS-9704] - Support docker manifest v2s2 config GC.
   * [MESOS-9759] - Log required quota headroom and available quota headroom in 
the allocator.



[mesos] branch 1.7.x updated: Added missing issues to the 1.7.3 CHANGELOG.

2020-04-28 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch 1.7.x
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/1.7.x by this push:
 new 1ae15d9  Added missing issues to the 1.7.3 CHANGELOG.
1ae15d9 is described below

commit 1ae15d9f30fbce4cb2e4789af5abcd2cf309493b
Author: Greg Mann 
AuthorDate: Tue Apr 28 22:52:02 2020 -0700

Added missing issues to the 1.7.3 CHANGELOG.
---
 CHANGELOG | 9 +
 1 file changed, 9 insertions(+)

diff --git a/CHANGELOG b/CHANGELOG
index b5334b0..d2e3a19 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,12 +4,15 @@ Release Notes - Mesos - Version 1.7.3 (WIP)
 
 ** Bug
   * [MESOS-8467] - Destroyed executors might be used after 
`Slave::publishResource()`.
+  * [MESOS-8537] - Default executor doesn't wait for status updates to be 
ack'd before shutting down.
   * [MESOS-9124] - Agent reconfiguration can cause master to unsuppress on 
scheduler's behalf.
   * [MESOS-9507] - Agent could not recover due to empty docker volume 
checkpointed files.
   * [MESOS-9529] - `/proc` should be remounted even if a nested container set 
`share_pid_namespace` to true.
   * [MESOS-9549] - nvidia/cuda 10 does not work on GPU isolator.
   * [MESOS-9564] - Logrotate container logger lets tasks execute arbitrary 
commands in the Mesos agent's namespace.
   * [MESOS-9568] - SLRP does not clean up mount directories for destroyed 
MOUNT disks.
+  * [MESOS-9581] - Mesos package naming appears to be undeterministic.
+  * [MESOS-9590] - Mesos CI sometimes, incorrectly, overwrites already-pushed 
mesos master nightly images with new images built from non-master branches.
   * [MESOS-9607] - Removing a resource provider with consumers breaks resource 
publishing.
   * [MESOS-9610] - Fetcher vulnerability - escaping from sandbox.
   * [MESOS-9616] - `Filters.refuse_seconds` declines resources not in offers.
@@ -25,6 +28,7 @@ Release Notes - Mesos - Version 1.7.3 (WIP)
   * [MESOS-9787] - Log slow SSL (TLS) peer reverse DNS lookup.
   * [MESOS-9803] - Memory leak caused by an infinite chain of futures in 
`UriDiskProfileAdaptor`.
   * [MESOS-9836] - Docker containerizer overwrites `/mesos/slave` cgroups.
+  * [MESOS-9847] - Docker executor doesn't wait for status updates to be ack'd 
before shutting down.
   * [MESOS-9852] - Slow memory growth in master due to deferred deletion of 
offer filters and timers.
   * [MESOS-9856] - REVIVE call with specified role(s) clears filters for all 
roles of a framework.
   * [MESOS-9868] - NetworkInfo from the agent /state endpoint is not correct.
@@ -35,13 +39,18 @@ Release Notes - Mesos - Version 1.7.3 (WIP)
   * [MESOS-9925] - Default executor takes a couple of seconds to start and 
subscribe Mesos agent.
   * [MESOS-9964] - Support destroying UCR containers in provisioning state.
   * [MESOS-9966] - Agent crashes when trying to destroy orphaned nested 
container if root container is orphaned as well.
+  * [MESOS-9968] - WWWAuthenticate header parsing fails when commas are in 
(quoted) realm.
   * [MESOS-10007] - Command executor can miss exit status for short-lived 
commands due to double-reaping.
   * [MESOS-10015] - updateAllocation() can stall the allocator with a huge 
number of reservations on an agent.
+  * [MESOS-10018] - Duplicate tasks if agent partitioned during maintenance 
down.
+  * [MESOS-10084] - Detecting whether executor is generated for command task 
should work when the launcher_dir changes.
+  * [MESOS-10092] - Cannot pull image from docker registry which does not 
reply with 'scope'/'service' in WWW-Authenticate header.
 
 ** Improvements
   * [MESOS-8880] - Add minimum capabilities in the master.
   * [MESOS-9159] - Support Foreign URLs in docker registry puller.
   * [MESOS-9540] - Support `DESTROY_DISK` on preprovisioned CSI volumes.
+  * [MESOS-9545] - Marking an unreachable agent as gone should transition the 
tasks to terminal state.
   * [MESOS-9675] - Docker Manifest V2 Schema2 Support.
   * [MESOS-9704] - Support docker manifest v2s2 config GC.
   * [MESOS-9759] - Log required quota headroom and available quota headroom in 
the allocator.



[mesos] branch master updated: Updated executor API docs to include the domain socket.

2020-04-28 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new ae7b90d  Updated executor API docs to include the domain socket.
ae7b90d is described below

commit ae7b90d74a61d81d8b716a67cac921d6542a305a
Author: Greg Mann 
AuthorDate: Tue Apr 28 15:44:21 2020 -0700

Updated executor API docs to include the domain socket.

Review: https://reviews.apache.org/r/72413/
---
 docs/executor-http-api.md | 97 +--
 1 file changed, 68 insertions(+), 29 deletions(-)

diff --git a/docs/executor-http-api.md b/docs/executor-http-api.md
index 4af4cd4..fba58d9 100644
--- a/docs/executor-http-api.md
+++ b/docs/executor-http-api.md
@@ -7,48 +7,87 @@ layout: documentation
 
 A Mesos executor can be built in two different ways:
 
-1. By using the `ExecutorDriver` C++ interface. The `ExecutorDriver` handles the
-details of communicating with the Mesos agent. Executor developers implement
-custom executor logic by registering callbacks with the `ExecutorDriver` for
-significant events, such as when a new task launch request is received. Because
-the `ExecutorDriver` interface is written in C++, this typically requires that
-executor developers either use C++ or use a C++ binding to their language of
-choice (e.g., JNI when using JVM-based languages).
-
-2. By using the new HTTP API. This allows Mesos executors to be developed
-without using C++ or a native client library; instead, a custom executor
-interacts with the Mesos agent via HTTP requests, as described below. Although
-it is theoretically possible to use the HTTP executor API "directly" (e.g., by
-using a generic HTTP library), most executor developers should use a library for
-their language of choice that manages the details of the HTTP API; see the
-document on [HTTP API client libraries](api-client-libraries.md) for a list.
-
-The v1 Executor HTTP API was introduced in Mesos 0.28.0. As of Mesos 1.0, it is
-considered stable and is the recommended way to develop new Mesos executors.
+1. By using the HTTP API. This allows Mesos executors to be developed without
+using C++ or a native client library; instead, a custom executor interacts with
+the Mesos agent via HTTP requests, as described below. Although it is
+theoretically possible to use the HTTP executor API "directly" (e.g., by using a
+generic HTTP library), most executor developers should use a library for their
+language of choice that manages the details of the HTTP API; see the document on
+[HTTP API client libraries](api-client-libraries.md) for a list. This is the
+recommended way to develop new Mesos executors.
+
+2. By using the deprecated `ExecutorDriver` C++ interface. While this interface
+is still supported, note that new features are usually not added to it. The
+`ExecutorDriver` handles the details of communicating with the Mesos agent.
+Executor developers implement custom executor logic by registering callbacks
+with the `ExecutorDriver` for significant events, such as when a new task launch
+request is received. Because the `ExecutorDriver` interface is written in C++,
+this typically requires that executor developers either use C++ or use a C++
+binding to their language of choice (e.g., JNI when using JVM-based languages).
 
 
 ## Overview
 
-The executor interacts with Mesos via the 
[/api/v1/executor](endpoints/slave/api/v1/executor.md) agent endpoint. We refer 
to this endpoint with its suffix "/executor" in the rest of this document. This 
endpoint accepts HTTP POST requests with data encoded as JSON (Content-Type: 
application/json) or binary Protobuf (Content-Type: application/x-protobuf). 
The first request that the executor sends to "/executor" endpoint is called 
SUBSCRIBE and results in a streaming response ("200 OK" stat [...]
-
-**Executors are expected to keep the subscription connection open as long as 
possible (barring network errors, agent process restarts, software bugs, etc.) 
and incrementally process the response.** HTTP client libraries that can only 
parse the response after the connection is closed cannot be used. For the 
encoding used, please refer to **Events** section below.
-
-All subsequent (non-`SUBSCRIBE`) requests to the "/executor" endpoint (see 
details below in **Calls** section) must be sent using a different connection 
than the one used for subscription. The agent responds to these HTTP POST 
requests with "202 Accepted" status codes (or, for unsuccessful requests, with 
4xx or 5xx status codes; details in later sections). The "202 Accepted" 
response means that a request has been accepted for processing, not that the 
processing of the request has been co [...]
+The executor interacts with Mesos via the [/api/v1/executor]
+(endpoin

[mesos] branch master updated: Fixed libevent SSL socket shutdown race condition.

2020-04-13 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new e8793b2  Fixed libevent SSL socket shutdown race condition.
e8793b2 is described below

commit e8793b2ca92524c76f96c11e4ca52f41f9a8d414
Author: Greg Mann 
AuthorDate: Mon Apr 13 13:41:08 2020 -0700

Fixed libevent SSL socket shutdown race condition.

This fixes an issue where the functions `shutdown()` and
`event_callback()` race to access the bufferevent held by
our libevent SSL socket implementation, leading to a
CHECK failure.

This race resulted in MESOS-10111, where multiple rapid
changes in ZK membership led to one master re-linking to
another multiple times in RECONNECT mode. This causes
`shutdown()` to be called on the existing socket while
it's attempting a connection, at which point a failure to
connect can produce the CHECK failure.

Review: https://reviews.apache.org/r/72354/
---
 3rdparty/libprocess/src/posix/libevent/libevent_ssl_socket.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/3rdparty/libprocess/src/posix/libevent/libevent_ssl_socket.cpp b/3rdparty/libprocess/src/posix/libevent/libevent_ssl_socket.cpp
index dcb6d8e..864802d 100644
--- a/3rdparty/libprocess/src/posix/libevent/libevent_ssl_socket.cpp
+++ b/3rdparty/libprocess/src/posix/libevent/libevent_ssl_socket.cpp
@@ -190,7 +190,9 @@ Try LibeventSSLSocketImpl::shutdown(int how)
 CHECK(__in_event_loop__);
 CHECK(self);
 
-CHECK_NOTNULL(self->bev);
+if (self->bev == nullptr) {
+  return;
+}
 
 synchronized (self->bev) {
   Owned request;



[mesos] 01/03: Added agent-side validation for resource limits.

2020-04-06 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit ad612599df9e866b2a622baa4cdb659ece5c4574
Author: Greg Mann 
AuthorDate: Mon Apr 6 15:16:33 2020 -0700

Added agent-side validation for resource limits.

This prevents tasks from being launched on agents which would
not be capable of enforcing the specified limits.

Review: https://reviews.apache.org/r/72297/
---
 src/slave/slave.cpp   | 64 +--
 src/slave/slave.hpp   |  3 ++
 src/tests/master_tests.cpp| 10 +-
 src/tests/master_validation_tests.cpp | 20 +--
 4 files changed, 91 insertions(+), 6 deletions(-)

diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 6a48023..1a32c81 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -2168,6 +2168,34 @@ void Slave::runTask(
 }
 
 
+Option Slave::validateResourceLimitsAndIsolators(
+const vector& tasks)
+{
+  foreach (const TaskInfo& task, tasks) {
+if (!(task.has_container() &&
+  task.container().type() == ContainerInfo::DOCKER)) {
+  if (task.limits().count("cpus") &&
+  !(strings::contains(flags.isolation, "cgroups/cpu") ||
+strings::contains(flags.isolation, "cgroups/all"))) {
+return Error(
+"CPU limits can only be set on tasks launched in Mesos containers"
+" when the agent has loaded the 'cgroups/cpu' isolator");
+  }
+
+  if (task.limits().count("mem") &&
+  !(strings::contains(flags.isolation, "cgroups/mem") ||
+strings::contains(flags.isolation, "cgroups/all"))) {
+return Error(
+"Memory limits can only be set on tasks launched in Mesos"
+" containers when the agent has loaded the 'cgroups/mem' isolator");
+  }
+}
+  }
+
+  return None();
+}
+
+
 void Slave::run(
 const FrameworkInfo& frameworkInfo,
 ExecutorInfo executorInfo,
@@ -2320,6 +2348,40 @@ void Slave::run(
 }
   }
 
+  CHECK_NOTNULL(framework);
+
+  Option error = validateResourceLimitsAndIsolators(tasks);
+  if (error.isSome()) {
+// We report TASK_DROPPED to the framework because the task was
+// never launched. For non-partition-aware frameworks, we report
+// TASK_LOST for backward compatibility.
+mesos::TaskState taskState = TASK_DROPPED;
+if (!protobuf::frameworkHasCapability(
+frameworkInfo, FrameworkInfo::Capability::PARTITION_AWARE)) {
+  taskState = TASK_LOST;
+}
+
+foreach (const TaskInfo& _task, tasks) {
+  const StatusUpdate update = protobuf::createStatusUpdate(
+  frameworkId,
+  info.id(),
+  _task.task_id(),
+  taskState,
+  TaskStatus::SOURCE_SLAVE,
+  id::UUID::random(),
+  error->message,
+  TaskStatus::REASON_GC_ERROR);
+
+  statusUpdate(update, UPID());
+}
+
+if (framework->idle()) {
+  removeFramework(framework);
+}
+
+return;
+  }
+
   const ExecutorID& executorId = executorInfo.executor_id();
 
   if (HookManager::hooksAvailable()) {
@@ -2342,8 +2404,6 @@ void Slave::run(
 }
   }
 
-  CHECK_NOTNULL(framework);
-
   // Track the pending task / task group to ensure the framework is
   // not removed and the framework and top level executor directories
   // are not scheduled for deletion before '_run()' is called.
diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp
index d7e65e0..2cf45c6 100644
--- a/src/slave/slave.hpp
+++ b/src/slave/slave.hpp
@@ -158,6 +158,9 @@ public:
   const process::UPID& from,
   RunTaskMessage&& runTaskMessage);
 
+  Option validateResourceLimitsAndIsolators(
+  const std::vector& tasks);
+
   // Made 'virtual' for Slave mocking.
   virtual void runTask(
   const process::UPID& from,
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index 8cc8d20..785e5d5 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -4330,7 +4330,15 @@ TEST_F(MasterTest, TasksEndpoint)
   TestContainerizer containerizer();
 
   Owned detector = master.get()->createDetector();
-  Try> slave = StartSlave(detector.get(), 
);
+
+  // We must enable the CPU and memory isolators on the agent so that it can
+  // accept resource limits.
+  slave::Flags flags = CreateSlaveFlags();
+  flags.isolation = "cgroups/cpu,cgroups/mem";
+
+  Try> slave =
+StartSlave(detector.get(), , flags);
+
   ASSERT_SOME(slave);
 
   MockScheduler sched;
diff --git a/src/tests/master_validation_tests.cpp b/src/tests/master_validation_tests.cpp
index 9efca42..816635a 100644
--- a/src/tests/master_validation_tests.cpp
+++ b/src/tests/master_validation_tests.cpp
@@ 

[mesos] 03/03: Updated CFS tests to avoid checking CPU usage.

2020-04-06 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 924a6776ca2e474cc65fcb1d59938a0ff6ad46c8
Author: Greg Mann 
AuthorDate: Mon Apr 6 15:16:54 2020 -0700

Updated CFS tests to avoid checking CPU usage.

This verification is error-prone and leads to flaky test
failures. It is sufficient to verify that the cgroup values
are set correctly. We will avoid going so far as to confirm
that the kernel's scheduler is honoring those values.

Review: https://reviews.apache.org/r/72309/
---
 src/tests/containerizer/cgroups_isolator_tests.cpp | 59 ++
 1 file changed, 4 insertions(+), 55 deletions(-)

diff --git a/src/tests/containerizer/cgroups_isolator_tests.cpp b/src/tests/containerizer/cgroups_isolator_tests.cpp
index f4425f0..57158ae 100644
--- a/src/tests/containerizer/cgroups_isolator_tests.cpp
+++ b/src/tests/containerizer/cgroups_isolator_tests.cpp
@@ -386,8 +386,7 @@ TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_RevocableCpu)
 
 // This test verifies that a task launched with 0.5 cpu and 32MB memory as its
 // resource requests (but no resource limits specified) will have its CPU and
-// memory's soft & hard limits and OOM score adjustment set correctly, and it
-// cannot consume more cpu time than its CFS quota.
+// memory's soft & hard limits and OOM score adjustment set correctly.
 TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_CFS_CommandTaskNoLimits)
 {
   Try> master = StartMaster();
@@ -436,17 +435,6 @@ TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_CFS_CommandTaskNoLimits)
   AWAIT_READY(offers);
   ASSERT_FALSE(offers->empty());
 
-  // Generate random numbers to max out a single core. We'll run this
-  // for 0.5 seconds of wall time so it should consume approximately
-  // 300 ms of total cpu time when limited to 0.6 cpu. We use
-  // /dev/urandom to prevent blocking on Linux when there's
-  // insufficient entropy.
-  string command =
-"cat /dev/urandom > /dev/null & "
-"export MESOS_TEST_PID=$! && "
-"sleep 0.5 && "
-"kill $MESOS_TEST_PID";
-
   // We will launch a task with 0.5 cpu and 32MB memory, and the command
   // executor will be given 0.1 cpu (`DEFAULT_EXECUTOR_CPUS`) and 32MB
   // memory (DEFAULT_EXECUTOR_MEM) by default, so we need 0.6 cpu and 64MB
@@ -462,7 +450,7 @@ TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_CFS_CommandTaskNoLimits)
   TaskInfo task = createTask(
   offers.get()[0].slave_id(),
   Resources::parse("cpus:0.5;mem:32").get(),
-  command);
+  SLEEP_COMMAND(1000));
 
   Future statusStarting;
   Future statusRunning;
@@ -527,20 +515,6 @@ TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_CFS_CommandTaskNoLimits)
   Try oomScoreAdj = numify(strings::trim(read.get()));
   ASSERT_SOME_EQ(0, oomScoreAdj);
 
-  Future usage = containerizer->usage(containerId);
-  AWAIT_READY(usage);
-
-  // Expect that no more than 400 ms of cpu time has been consumed. We
-  // also check that at least 50 ms of cpu time has been consumed so
-  // this test will fail if the host system is very heavily loaded.
-  // This behavior is correct because under such conditions we aren't
-  // actually testing the CFS cpu limiter.
-  double cpuTime = usage->cpus_system_time_secs() +
-   usage->cpus_user_time_secs();
-
-  EXPECT_GE(0.4, cpuTime);
-  EXPECT_LE(0.05, cpuTime);
-
   driver.stop();
   driver.join();
 }
@@ -548,7 +522,7 @@ TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_CFS_CommandTaskNoLimits)
 
 // This test verifies that a task launched with resource limits specified
 // will have its CPU and memory's soft & hard limits and OOM score adjustment
-// set correctly, and it cannot consume more cpu time than its CFS quota.
+// set correctly.
 TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_CFS_CommandTaskLimits)
 {
   Try> master = StartMaster();
@@ -603,17 +577,6 @@ TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_CFS_CommandTaskLimits)
   AWAIT_READY(offers);
   ASSERT_FALSE(offers->empty());
 
-  // Generate random numbers to max out a single core. We'll run
-  // this for 0.5 seconds of wall time so it should consume
-  // approximately 300 ms of total cpu time when limited to 0.6
-  // cpu. We use /dev/urandom to prevent blocking on Linux when
-  // there's insufficient entropy.
-  string command =
-"cat /dev/urandom > /dev/null & "
-"export MESOS_TEST_PID=$! && "
-"sleep 0.5 && "
-"kill $MESOS_TEST_PID";
-
   // Launch a task with 0.2 cpu request, 0.5 cpu limit, half of
   // host total memory - `DEFAULT_EXECUTOR_MEM` as memory request
   // and half of host total memory as memory limit.
@@ -632,7 +595,7 @@ TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_CFS_CommandTaskLimits)
   TaskInfo task = createTask(
   offers.get()[0].slave

[mesos] 02/03: Sent appropriate task status reason when task over memory request.

2020-04-06 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit be90edd31a1833c5ed706b39f3a5547ae8153dd2
Author: Greg Mann 
AuthorDate: Mon Apr 6 15:16:45 2020 -0700

Sent appropriate task status reason when task over memory request.

Review: https://reviews.apache.org/r/72305/
---
 src/common/protobuf_utils.cpp  |  3 ++-
 .../mesos/isolators/cgroups/subsystems/memory.cpp  | 24 +-
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/src/common/protobuf_utils.cpp b/src/common/protobuf_utils.cpp
index 723d85a..8d1d5c4 100644
--- a/src/common/protobuf_utils.cpp
+++ b/src/common/protobuf_utils.cpp
@@ -254,7 +254,8 @@ StatusUpdate createStatusUpdate(
 CHECK(
 reason.get() == TaskStatus::REASON_CONTAINER_LIMITATION ||
 reason.get() == TaskStatus::REASON_CONTAINER_LIMITATION_DISK ||
-reason.get() == TaskStatus::REASON_CONTAINER_LIMITATION_MEMORY)
+reason.get() == TaskStatus::REASON_CONTAINER_LIMITATION_MEMORY ||
+reason.get() == TaskStatus::REASON_CONTAINER_MEMORY_REQUEST_EXCEEDED)
   << reason.get();
 
 status->mutable_limitation()->mutable_resources()->CopyFrom(
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp
index 15f87ba..60c7a89 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp
@@ -699,11 +699,33 @@ void MemorySubsystemProcess::oomWaited(
 ? (double) usage->bytes() / Bytes::MEGABYTES : 0),
   "*").get();
 
+  TaskStatus::Reason reason = TaskStatus::REASON_CONTAINER_LIMITATION_MEMORY;
+
+  // If the container has a hard limit set higher than the soft limit, then
+  // check if the memory usage is above the soft limit but less than the hard
+  // limit. If so, we send a task status reason to the scheduler which indicates
+  // that this container was preferentially OOM-killed because it exceeded its
+  // memory request without hitting its memory limit.
+  Try softLimit =
+cgroups::memory::soft_limit_in_bytes(hierarchy, cgroup);
+
+  if (softLimit.isError()) {
+LOG(ERROR) << "Failed to read 'memory.soft_limit_in_bytes': "
+   << softLimit.error();
+  } else if (softLimit.get() < limit.get()) {
+if (!usage.isError() &&
+!limit.isError() &&
+usage.get() > softLimit.get() &&
+usage.get() < limit.get()) {
+  reason = TaskStatus::REASON_CONTAINER_MEMORY_REQUEST_EXCEEDED;
+}
+  }
+
   infos[containerId]->limitation.set(
   protobuf::slave::createContainerLimitation(
   mem,
   message.str(),
-  TaskStatus::REASON_CONTAINER_LIMITATION_MEMORY));
+  reason));
 }
 
 



[mesos] branch master updated (92f8768 -> 924a677)

2020-04-06 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from 92f8768  Fixed build of tests broken in 
9bd3ea4665402943af070e64327e8d7dc341e301.
 new ad61259  Added agent-side validation for resource limits.
 new be90edd  Sent appropriate task status reason when task over memory 
request.
 new 924a677  Updated CFS tests to avoid checking CPU usage.

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/common/protobuf_utils.cpp  |  3 +-
 .../mesos/isolators/cgroups/subsystems/memory.cpp  | 24 +++-
 src/slave/slave.cpp| 64 +-
 src/slave/slave.hpp|  3 +
 src/tests/containerizer/cgroups_isolator_tests.cpp | 59 ++--
 src/tests/master_tests.cpp | 10 +++-
 src/tests/master_validation_tests.cpp  | 20 ++-
 7 files changed, 120 insertions(+), 63 deletions(-)



[mesos] branch master updated: Added resource limits to the web UI.

2020-03-31 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new f8a3dd3  Added resource limits to the web UI.
f8a3dd3 is described below

commit f8a3dd334934094ec44e07fa350f958d218bc78f
Author: Greg Mann 
AuthorDate: Tue Mar 31 21:55:42 2020 -0700

Added resource limits to the web UI.

Review: https://reviews.apache.org/r/72269/
---
 src/webui/app/agents/agent-executor.html | 82 +---
 1 file changed, 53 insertions(+), 29 deletions(-)

diff --git a/src/webui/app/agents/agent-executor.html b/src/webui/app/agents/agent-executor.html
index 7ec56c3..d8a5250 100644
--- a/src/webui/app/agents/agent-executor.html
+++ b/src/webui/app/agents/agent-executor.html
@@ -106,13 +106,19 @@
   class="table table-striped table-bordered table-condensed">
   
 
-  ID
-  Name
-  Role
-  CPUs
-  GPUs
-  Mem
-  Disk
+  ID
+  Name
+  Role
+  CPUs
+  Mem
+  GPUs (allocated)
+  Disk (allocated)
+
+
+  Allocated
+  Limit
+  Allocated
+  Limit
 
   
   
@@ -121,8 +127,10 @@
   {{queued_task.name}}
   {{queued_task.role}}
   {{queued_task.resources.cpus | number}}
-  {{queued_task.resources.gpus | number}}
+  {{queued_task.limits.cpus | number}}
   {{queued_task.resources.mem * (1024 * 1024) | dataSize}}
+  {{queued_task.limits.mem * (1024 * 1024) | dataSize}}
+  {{queued_task.resources.gpus | number}}
   {{queued_task.resources.disk * (1024 * 1024) | dataSize}}
 
   
@@ -132,16 +140,22 @@
   class="table table-striped table-bordered table-condensed">
   
 
-  ID
-  Name
-  Role
-  State
-  Health
-  CPUs (allocated)
-  GPUs (allocated)
-  Mem (allocated)
-  Disk (allocated)
-  
+  ID
+  Name
+  Role
+  State
+  Health
+  CPUs
+  Mem
+  GPUs (allocated)
+  Disk (allocated)
+  
+
+
+  Allocated
+  Limit
+  Allocated
+  Limit
 
   
   
@@ -152,8 +166,10 @@
   {{task.state}}
   {{task.healthy | taskHealth}}
   {{task.resources.cpus | number}}
-  {{task.resources.gpus | number}}
+  {{task.limits.cpus | number}}
   {{task.resources.mem * (1024 * 1024) | dataSize}}
+  {{task.limits.mem * (1024 * 1024) | dataSize}}
+  {{task.resources.gpus | number}}
   {{task.resources.disk * (1024 * 1024) | dataSize}}
   
 
   
 
-  ID
-  Name
-  Role
-  State
-  CPUs (allocated)
-  GPUs (allocated)
-  Mem (allocated)
-  Disk (allocated)
-  
+  ID
+  Name
+  Role
+  State
+  CPUs
+  Mem
+  GPUs (allocated)
+  Disk (allocated)
+  
+
+
+  Allocated
+  Limit
+  Allocated
+  Limit
 
   
   
@@ -187,8 +209,10 @@
   {{completed_task.role}}
   {{completed_task.state}}
   {{completed_task.resources.cpus | number}}
-  {{completed_task.resources.gpus | number}}
+  {{completed_task.limits.cpus | number}}
   {{completed_task.resources.mem * (1024 * 1024) | dataSize}}
+  {{completed_task.limits.mem * (1024 * 1024) | dataSize}}
+  {{completed_task.resources.gpus | number}}
   {{completed_task.resources.disk * (1024 * 1024) | dataSize}}
   
 

[mesos] branch master updated: Added resource limits to v0 endpoint results.

2020-03-24 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new 77f5165  Added resource limits to v0 endpoint results.
77f5165 is described below

commit 77f5165fd2f942a89063803a7be5465d35774b10
Author: Greg Mann 
AuthorDate: Tue Mar 24 21:13:44 2020 -0700

Added resource limits to v0 endpoint results.

Review: https://reviews.apache.org/r/72262/
---
 src/common/http.cpp | 31 +++
 src/common/http.hpp |  1 +
 src/tests/common/http_tests.cpp |  7 +++
 src/tests/master_tests.cpp  | 14 --
 4 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/src/common/http.cpp b/src/common/http.cpp
index 3dd77dc..093d837 100644
--- a/src/common/http.cpp
+++ b/src/common/http.cpp
@@ -633,6 +633,10 @@ JSON::Object model(const Task& task)
   object.values["state"] = TaskState_Name(task.state());
   object.values["resources"] = model(task.resources());
 
+  if (!task.limits().empty()) {
+object.values["limits"] = model(task.limits());
+  }
+
   if (task.has_user()) {
 object.values["user"] = task.user();
   }
@@ -785,6 +789,18 @@ JSON::Object model(const FileInfo& fileInfo)
   return file;
 }
 
+
+JSON::Object model(const google::protobuf::Map& map)
+{
+  JSON::Object result, scalar;
+
+  foreach (auto item, map) {
+result.values[item.first] = item.second.value();
+  }
+
+  return result;
+}
+
 }  // namespace internal {
 
 
@@ -1082,6 +1098,17 @@ void json(
 }
 
 
+// Used to include resource limits in JSON output.
+void json(
+JSON::ObjectWriter* writer,
+const google::protobuf::Map& map)
+{
+  foreach (auto item, map) {
+writer->field(item.first, item.second.value());
+  }
+}
+
+
 void json(JSON::ObjectWriter* writer, const Task& task)
 {
   writer->field("id", task.task_id().value());
@@ -1092,6 +1119,10 @@ void json(JSON::ObjectWriter* writer, const Task& task)
   writer->field("state", TaskState_Name(task.state()));
   writer->field("resources", task.resources());
 
+  if (!task.limits().empty()) {
+writer->field("limits", task.limits());
+  }
+
   // Tasks are not allowed to mix resources allocated to
   // different roles, see MESOS-6636.
   writer->field("role", task.resources().begin()->allocation_info().role());
diff --git a/src/common/http.hpp b/src/common/http.hpp
index 02633e1..9d5b8ed 100644
--- a/src/common/http.hpp
+++ b/src/common/http.hpp
@@ -211,6 +211,7 @@ JSON::Object model(const ExecutorInfo& executorInfo);
 JSON::Array model(const Labels& labels);
 JSON::Object model(const Task& task);
 JSON::Object model(const FileInfo& fileInfo);
+JSON::Object model(const google::protobuf::Map& map);
 
 void json(JSON::ObjectWriter* writer, const Task& task);
 
diff --git a/src/tests/common/http_tests.cpp b/src/tests/common/http_tests.cpp
index 5f36527..12dcf67 100644
--- a/src/tests/common/http_tests.cpp
+++ b/src/tests/common/http_tests.cpp
@@ -91,6 +91,8 @@ TEST(HTTPTest, ModelTask)
   taskInfo.mutable_command()->set_value("echo hello");
   taskInfo.mutable_command()->set_user("user1");
   taskInfo.mutable_discovery()->CopyFrom(discovery);
+  (*taskInfo.mutable_limits())["cpus"].set_value(1.0);
+  (*taskInfo.mutable_limits())["mem"].set_value(32);
 
   Task task = createTask(taskInfo, state, frameworkId);
   task.add_statuses()->CopyFrom(statuses[0]);
@@ -110,6 +112,11 @@ TEST(HTTPTest, ModelTask)
   "\"gpus\":0,"
   "\"mem\":0"
   "  },"
+  "  \"limits\":"
+  "  {"
+  "\"cpus\": 1.0,"
+  "\"mem\": 32"
+  "  },"
   "  \"slave_id\":\"s\","
   "  \"state\":\"TASK_RUNNING\","
   "  \"statuses\":"
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index 617abfa..8cc8d20 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -4357,6 +4357,8 @@ TEST_F(MasterTest, TasksEndpoint)
   task1.mutable_slave_id()->MergeFrom(offer->slave_id());
   task1.mutable_resources()->MergeFrom(
   Resources::parse("cpus:0.1;mem:12").get());
+  (*task1.mutable_limits())["cpus"].set_value(0.5);
+  (*task1.mutable_limits())["mem"].set_value(64);
   task1.mutable_executor()->MergeFrom(DEFAULT_EXECUTOR_INFO);
 
   TaskInfo task2;
@@ -4418,7 +4420,11 @@ TEST_F(MasterTest, TasksEndpoint)
 "\"framework_id\":\"" + frameworkId->valu

[mesos] branch master updated: Moved containerizer utils in CMakeLists.

2020-03-24 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new 12e5e87  Moved containerizer utils in CMakeLists.
12e5e87 is described below

commit 12e5e870c38681bfc0455960f89a41127dac3daf
Author: Qian Zhang 
AuthorDate: Tue Mar 24 10:44:39 2020 -0700

Moved containerizer utils in CMakeLists.

This is to ensure the function `calculateOOMScoreAdj()` can be resolved
on Windows.

Review: https://reviews.apache.org/r/72263/
---
 src/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 5133550..96cd867 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -176,6 +176,7 @@ set(AGENT_SRC
   slave/containerizer/mesos/launcher_tracker.cpp
   slave/containerizer/mesos/mount.cpp
   slave/containerizer/mesos/paths.cpp
+  slave/containerizer/mesos/utils.cpp
   slave/containerizer/mesos/io/switchboard.cpp
   slave/containerizer/mesos/isolators/environment_secret.cpp
   slave/containerizer/mesos/isolators/filesystem/posix.cpp
@@ -188,7 +189,6 @@ set(AGENT_SRC
 
 if (NOT WIN32)
   list(APPEND AGENT_SRC
-slave/containerizer/mesos/utils.cpp
 slave/containerizer/mesos/isolators/docker/volume/driver.cpp
 slave/containerizer/mesos/isolators/docker/volume/paths.cpp
 slave/containerizer/mesos/isolators/network/cni/paths.cpp



[mesos] 05/05: Updated the comment for the 'share_cgroups' field.

2020-03-20 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit eb455e74ab8f31b93f7f5e87dc1829e05701e411
Author: Greg Mann 
AuthorDate: Fri Mar 20 10:35:39 2020 -0700

Updated the comment for the 'share_cgroups' field.

Review: https://reviews.apache.org/r/72250/
---
 include/mesos/mesos.proto| 22 +++---
 include/mesos/v1/mesos.proto | 22 +++---
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/include/mesos/mesos.proto b/include/mesos/mesos.proto
index 6dba47e..9412ed7 100644
--- a/include/mesos/mesos.proto
+++ b/include/mesos/mesos.proto
@@ -3353,17 +3353,17 @@ message LinuxInfo {
 
   // If set as 'true', the container will share the cgroups from its parent
   // container, otherwise it will have its own cgroups created. Please note:
-  // 1. This field should be only used for the task containers in a task group
-  //(i.e., the 1st level nested containers). It will be ignored for the
-  //executor containers (i.e., the top-level containers) since the executor
-  //container will always have its own cgroups created, and it will also be
-  //ignored for the nested containers under the 1st nested container (e.g.,
-  //the debug container running as 2nd level nested container) since those
-  //containers should always share cgroups from its parent container.
-  // 2. The value of this field should be same for all the tasks launched by a
-  //single executor.
-  // 3. It is not allowed to set resource limits for the task which has this
-  //field set as true.
+  // 1. For tasks in a task group launched via the LAUNCH_GROUP operation,
+  //this field may be set to 'true' or 'false'. Resource limits may only be
+  //set for tasks in a task group when this field is set to 'false'.
+  // 2. For tasks launched via the LAUNCH operation, this field may only be set
+  //to 'true', and in this case resource limits may be set on these tasks.
+  // 3. For containers launched via the agent's LAUNCH_NESTED_CONTAINER_SESSION
+  //call, this field must be set to 'true'.
+  // 4. For executor containers, this field may only be set to 'false'.
+  // 5. All tasks under a single executor must share the same value of this
+  //field, if it is set. Note that this means that all tasks within a single
+  //task group must set this field to the same value.
   optional bool share_cgroups = 8 [default = true];
 }
 
diff --git a/include/mesos/v1/mesos.proto b/include/mesos/v1/mesos.proto
index e96f51f..194c42c 100644
--- a/include/mesos/v1/mesos.proto
+++ b/include/mesos/v1/mesos.proto
@@ -3342,17 +3342,17 @@ message LinuxInfo {
 
   // If set as 'true', the container will share the cgroups from its parent
   // container, otherwise it will have its own cgroups created. Please note:
-  // 1. This field should be only used for the task containers in a task group
-  //(i.e., the 1st level nested containers). It will be ignored for the
-  //executor containers (i.e., the top-level containers) since the executor
-  //container will always have its own cgroups created, and it will also be
-  //ignored for the nested containers under the 1st nested container (e.g.,
-  //the debug container running as 2nd level nested container) since those
-  //containers should always share cgroups from its parent container.
-  // 2. The value of this field should be same for all the tasks launched by a
-  //single executor.
-  // 3. It is not allowed to set resource limits for the task which has this
-  //field set as true.
+  // 1. For tasks in a task group launched via the LAUNCH_GROUP operation,
+  //this field may be set to 'true' or 'false'. Resource limits may only be
+  //set for tasks in a task group when this field is set to 'false'.
+  // 2. For tasks launched via the LAUNCH operation, this field may only be set
+  //to 'true', and in this case resource limits may be set on these tasks.
+  // 3. For containers launched via the agent's LAUNCH_NESTED_CONTAINER_SESSION
+  //call, this field must be set to 'true'.
+  // 4. For executor containers, this field may only be set to 'false'.
+  // 5. All tasks under a single executor must share the same value of this
+  //field, if it is set. Note that this means that all tasks within a single
+  //task group must set this field to the same value.
   optional bool share_cgroups = 8 [default = true];
 }
 



[mesos] branch master updated (9ab68cb -> eb455e7)

2020-03-20 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from 9ab68cb  Updated default executor to call the `LaunchContainer` agent API.
 new 504af58  Added master validation for task resource limits and shared cgroups.
 new 72e78f0  Added tests for master validation of limits and shared cgroups.
 new 1088dd3  Added agent validation for shared cgroups.
 new 74c3550  Added tests for agent validation of shared cgroups.
 new eb455e7  Updated the comment for the 'share_cgroups' field.

The 5 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 include/mesos/mesos.proto |  22 +-
 include/mesos/v1/mesos.proto  |  22 +-
 src/master/validation.cpp | 230 +-
 src/slave/validation.cpp  | 116 +++--
 src/tests/master_validation_tests.cpp | 771 ++
 src/tests/slave_validation_tests.cpp  | 143 ++-
 6 files changed, 1246 insertions(+), 58 deletions(-)



[mesos] 03/05: Added agent validation for shared cgroups.

2020-03-20 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 1088dd3b77eb903718b3df8064d5d1d6c379f25b
Author: Greg Mann 
AuthorDate: Fri Mar 20 10:35:38 2020 -0700

Added agent validation for shared cgroups.

Review: https://reviews.apache.org/r/72221/
---
 src/slave/validation.cpp | 116 +++
 1 file changed, 87 insertions(+), 29 deletions(-)

diff --git a/src/slave/validation.cpp b/src/slave/validation.cpp
index efb2e0c..25e9fbd 100644
--- a/src/slave/validation.cpp
+++ b/src/slave/validation.cpp
@@ -171,8 +171,11 @@ Option validate(
 return Error("Expecting 'launch_nested_container' to be present");
   }
 
+  const mesos::agent::Call::LaunchNestedContainer& launch =
+call.launch_nested_container();
+
   Option error = validation::container::validateContainerId(
-  call.launch_nested_container().container_id());
+  launch.container_id());
 
   if (error.isSome()) {
 return Error("'launch_nested_container.container_id' is invalid"
@@ -181,27 +184,36 @@ Option validate(
 
   // The parent `ContainerID` is required, so that we know
   // which container to place it underneath.
-  if (!call.launch_nested_container().container_id().has_parent()) {
+  if (!launch.container_id().has_parent()) {
 return Error("Expecting 'launch_nested_container.container_id.parent'"
  " to be present");
   }
 
-  if (call.launch_nested_container().has_command()) {
-error = common::validation::validateCommandInfo(
-call.launch_nested_container().command());
+  if (launch.has_command()) {
+error = common::validation::validateCommandInfo(launch.command());
 if (error.isSome()) {
   return Error("'launch_nested_container.command' is invalid"
": " + error->message);
 }
   }
 
-  if (call.launch_nested_container().has_container()) {
-error = common::validation::validateContainerInfo(
-call.launch_nested_container().container());
+  if (launch.has_container()) {
+error = common::validation::validateContainerInfo(launch.container());
 if (error.isSome()) {
   return Error("'launch_nested_container.container' is invalid"
": " + error->message);
 }
+
+if (launch.container().has_linux_info() &&
+launch.container().linux_info().has_share_cgroups() &&
+!launch.container().linux_info().share_cgroups() &&
+launch.container_id().has_parent() &&
+launch.container_id().parent().has_parent()) {
+return Error(
+"'launch_nested_container' is invalid: containers nested at "
+"the second level or greater cannot set 'share_cgroups' to "
+"'false'");
+}
   }
 
   return None();
@@ -279,8 +291,11 @@ Option validate(
 "Expecting 'launch_nested_container_session' to be present");
   }
 
+  const mesos::agent::Call::LaunchNestedContainerSession& launch =
+call.launch_nested_container_session();
+
   Option error = validation::container::validateContainerId(
-  call.launch_nested_container_session().container_id());
+  launch.container_id());
 
   if (error.isSome()) {
 return Error("'launch_nested_container_session.container_id' is 
invalid"
@@ -289,28 +304,34 @@ Option validate(
 
   // The parent `ContainerID` is required, so that we know
   // which container to place it underneath.
-  if (!call.launch_nested_container_session().container_id().has_parent()) 
{
+  if (!launch.container_id().has_parent()) {
 return Error(
 "Expecting 'launch_nested_container_session.container_id.parent'"
 " to be present");
   }
 
-  if (call.launch_nested_container_session().has_command()) {
-error = common::validation::validateCommandInfo(
-call.launch_nested_container_session().command());
+  if (launch.has_command()) {
+error = common::validation::validateCommandInfo(launch.command());
 if (error.isSome()) {
   return Error("'launch_nested_container_session.command' is invalid"
": " + error->message);
 }
   }
 
-  if (call.launch_nested_container_session().has_container()) {
-error = common::validation::validateContainerInfo(
-call.launch_nested_container_session().container());
+  if (launch.has_container()) {
+error = c

[mesos] 02/05: Added tests for master validation of limits and shared cgroups.

2020-03-20 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 72e78f02f08116c0d4c4a825144b078cfbee5052
Author: Greg Mann 
AuthorDate: Fri Mar 20 10:35:37 2020 -0700

Added tests for master validation of limits and shared cgroups.

Review: https://reviews.apache.org/r/72217/
---
 src/tests/master_validation_tests.cpp | 771 ++
 1 file changed, 771 insertions(+)

diff --git a/src/tests/master_validation_tests.cpp b/src/tests/master_validation_tests.cpp
index 8d5e74e..9efca42 100644
--- a/src/tests/master_validation_tests.cpp
+++ b/src/tests/master_validation_tests.cpp
@@ -3486,6 +3486,214 @@ TEST_F(TaskValidationTest, 
TaskSettingDockerParameterName)
   driver.join();
 }
 
+
+TEST_F(TaskValidationTest, ResourceLimitLessThanRequest)
+{
+  Try> master = StartMaster();
+  ASSERT_SOME(master);
+
+  Owned detector = master.get()->createDetector();
+  Try> slave = StartSlave(detector.get());
+  ASSERT_SOME(slave);
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+  , DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched, registered(, _, _));
+
+  Future> offers;
+  EXPECT_CALL(sched, resourceOffers(, _))
+.WillOnce(FutureArg<1>())
+.WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  driver.start();
+
+  AWAIT_READY(offers);
+  ASSERT_FALSE(offers->empty());
+
+  Future status;
+  EXPECT_CALL(sched, statusUpdate(, _))
+.WillOnce(FutureArg<1>());
+
+  Map limits;
+  limits["cpus"].set_value(0.01);
+
+  TaskInfo task = createTask(
+  offers->at(0),
+  "exit 0",
+  None(),
+  "test-task",
+  id::UUID::random().toString(),
+  limits);
+
+  driver.launchTasks(offers->at(0).id(), {task});
+
+  AWAIT_READY(status);
+  EXPECT_EQ(TASK_ERROR, status->state());
+  EXPECT_TRUE(strings::contains(
+  status->message(),
+  "The cpu limit must be greater than or equal to the cpu request"));
+
+  driver.stop();
+  driver.join();
+}
+
+
+TEST_F(TaskValidationTest, LimitOtherThanCpuOrMem)
+{
+  Try> master = StartMaster();
+  ASSERT_SOME(master);
+
+  Owned detector = master.get()->createDetector();
+  Try> slave = StartSlave(detector.get());
+  ASSERT_SOME(slave);
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+  , DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched, registered(, _, _));
+
+  Future> offers;
+  EXPECT_CALL(sched, resourceOffers(, _))
+.WillOnce(FutureArg<1>())
+.WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  driver.start();
+
+  AWAIT_READY(offers);
+  ASSERT_FALSE(offers->empty());
+
+  Future status;
+  EXPECT_CALL(sched, statusUpdate(, _))
+.WillOnce(FutureArg<1>());
+
+  Map limits;
+  limits["disk"].set_value(128);
+
+  TaskInfo task = createTask(
+  offers->at(0),
+  "exit 0",
+  None(),
+  "test-task",
+  id::UUID::random().toString(),
+  limits);
+
+  driver.launchTasks(offers->at(0).id(), {task});
+
+  AWAIT_READY(status);
+  EXPECT_EQ(TASK_ERROR, status->state());
+  EXPECT_TRUE(strings::contains(
+  status->message(),
+  "Only cpus and mem may be included in a task's resource limits"));
+
+  driver.stop();
+  driver.join();
+}
+
+
+TEST_F(TaskValidationTest, NestedCgroupInLaunchOperation)
+{
+  Try> master = StartMaster();
+  ASSERT_SOME(master);
+
+  Owned detector = master.get()->createDetector();
+  Try> slave = StartSlave(detector.get());
+  ASSERT_SOME(slave);
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+  , DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched, registered(, _, _));
+
+  Future> offers;
+  EXPECT_CALL(sched, resourceOffers(, _))
+.WillOnce(FutureArg<1>())
+.WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  driver.start();
+
+  AWAIT_READY(offers);
+  ASSERT_FALSE(offers->empty());
+
+  Future status;
+  EXPECT_CALL(sched, statusUpdate(, _))
+.WillOnce(FutureArg<1>());
+
+  TaskInfo task = createTask(offers->at(0), "exit 0");
+
+  task.mutable_container()->set_type(ContainerInfo::MESOS);
+  task.mutable_container()->mutable_linux_info()->set_share_cgroups(false);
+
+  driver.launchTasks(offers->at(0).id(), {task});
+
+  AWAIT_READY(status);
+  EXPECT_EQ(TASK_ERROR, status->state());
+  EXPECT_TRUE(strings::contains(
+  status->message(),
+  "Only tasks in a task group may have 'share_cgroups' set to 'false'"));
+
+  driver.stop();
+  driver.join();
+}
+
+
+TEST_F(TaskValidationTest, SharedCgroupOnExecutor)
+{
+  Try> master = StartMaster();
+  ASSERT_SOME(master);
+
+  Owned detector = master.get()->cr

[mesos] 04/05: Added tests for agent validation of shared cgroups.

2020-03-20 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 74c355060304da4287d286ed775df072b6101816
Author: Greg Mann 
AuthorDate: Fri Mar 20 10:35:38 2020 -0700

Added tests for agent validation of shared cgroups.

This patch adds validation for shared cgroups when specified
via the agent APIs. In doing so, a new validation test is
added for the agent's LaunchContainer API, since this was
previously missing.

Review: https://reviews.apache.org/r/7/
---
 src/tests/slave_validation_tests.cpp | 143 ++-
 1 file changed, 142 insertions(+), 1 deletion(-)

diff --git a/src/tests/slave_validation_tests.cpp b/src/tests/slave_validation_tests.cpp
index 25019cc..f6203ee 100644
--- a/src/tests/slave_validation_tests.cpp
+++ b/src/tests/slave_validation_tests.cpp
@@ -495,8 +495,24 @@ TEST(AgentCallValidationTest, LaunchNestedContainerSession)
   "variable 'ENV_VAR_KEY' of type 'VALUE' must have a value set",
   error->message);
 
-  // Test the valid case.
+  // Container with 'share_cgroups' set to 'false', which is not allowed for
+  // nested container sessions.
   variable->set_value("env_var_value");
+
+  launch->mutable_container()->set_type(ContainerInfo::MESOS);
+  launch->mutable_container()->mutable_linux_info()->set_share_cgroups(false);
+
+  error = validation::agent::call::validate(call);
+  EXPECT_SOME(error);
+  EXPECT_TRUE(strings::contains(
+  error->message,
+  "'launch_nested_container_session.container.linux_info' is invalid: "
+  "'share_cgroups' cannot be set to 'false' for nested container "
+  "sessions"));
+
+  // Test the valid case.
+  launch->mutable_container()->mutable_linux_info()->set_share_cgroups(true);
+
   error = validation::agent::call::validate(call);
   EXPECT_NONE(error);
 
@@ -512,6 +528,131 @@ TEST(AgentCallValidationTest, 
LaunchNestedContainerSession)
 }
 
 
+TEST(AgentCallValidationTest, LaunchContainer)
+{
+  // Missing `launch_container`.
+  agent::Call call;
+  call.set_type(agent::Call::LAUNCH_CONTAINER);
+
+  Option error = validation::agent::call::validate(call);
+  EXPECT_SOME(error);
+
+  // `container_id` is not valid.
+  ContainerID badContainerId;
+  badContainerId.set_value("no spaces allowed");
+
+  agent::Call::LaunchContainer* launch = call.mutable_launch_container();
+
+  launch->mutable_container_id()->CopyFrom(badContainerId);
+
+  error = validation::agent::call::validate(call);
+  EXPECT_SOME(error);
+
+  // Invalid `command.environment`. Set an invalid environment variable to check
+  // that the common validation code for the command's environment is being
+  // executed.
+  ContainerID containerId;
+  containerId.set_value(id::UUID::random().toString());
+
+  launch->mutable_container_id()->CopyFrom(containerId);
+
+  launch->mutable_command()->CopyFrom(createCommandInfo("exit 0"));
+
+  Environment::Variable* variable = launch
+->mutable_command()
+->mutable_environment()
+->mutable_variables()
+->Add();
+  variable->set_name("ENV_VAR_KEY");
+  variable->set_type(mesos::Environment::Variable::VALUE);
+
+  error = validation::agent::call::validate(call);
+  EXPECT_SOME(error);
+  EXPECT_EQ(
+  "'launch_container.command' is invalid: Environment variable "
+  "'ENV_VAR_KEY' of type 'VALUE' must have a value set",
+  error->message);
+
+  // Invalid resources.
+  variable->set_value("env_var_value");
+
+  Resource cpus;
+  cpus.set_type(Value::SCALAR);
+  cpus.set_name("cpus");
+  cpus.mutable_scalar()->set_value(-0.1);
+
+  launch->add_resources()->CopyFrom(cpus);
+
+  error = validation::agent::call::validate(call);
+  EXPECT_SOME(error);
+  EXPECT_EQ(
+  "Invalid resources: Resource 'cpus:-0.1' is invalid: "
+  "Invalid scalar resource: value <= 0",
+  error->message);
+
+  // Invalid 'ContainerInfo'.
+  launch->clear_resources();
+  cpus.mutable_scalar()->set_value(0.1);
+  launch->add_resources()->CopyFrom(cpus);
+
+  launch->mutable_container()->set_type(ContainerInfo::DOCKER);
+
+  error = validation::agent::call::validate(call);
+  EXPECT_SOME(error);
+  EXPECT_EQ(
+  "'launch_container.container' is invalid: DockerInfo 'docker' is not set 
"
+  "for DOCKER typed ContainerInfo",
+  error->message);
+
+  // Container with 'share_cgroups' set to 'true', which is not allowed for
+  // containers with no parent.
+  launch->mutable_container()->set_type(ContainerInfo::MESOS);
+  launch->mutable_container()->mutable_linux_info()->set_share_cgroups(true);
+
+  error = validation::agent::call::validate(call

[mesos] 01/05: Added master validation for task resource limits and shared cgroups.

2020-03-20 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 504af58dda64b73581b0398c83952421aea64d39
Author: Greg Mann 
AuthorDate: Fri Mar 20 10:35:37 2020 -0700

Added master validation for task resource limits and shared cgroups.

Review: https://reviews.apache.org/r/72216/
---
 src/master/validation.cpp | 230 --
 1 file changed, 224 insertions(+), 6 deletions(-)

diff --git a/src/master/validation.cpp b/src/master/validation.cpp
index 084f281..5b1bcb5 100644
--- a/src/master/validation.cpp
+++ b/src/master/validation.cpp
@@ -17,6 +17,7 @@
 #include "master/validation.hpp"
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -29,6 +30,7 @@
 #include 
 
 #include 
+#include 
 
 #include 
 #include 
@@ -45,6 +47,8 @@
 
 #include "master/master.hpp"
 
+using process::Owned;
+
 using process::http::authentication::Principal;
 
 using std::pair;
@@ -1540,6 +1544,77 @@ Option validateContainerInfo(const TaskInfo& task)
 }
 
 
+Option validateResourceLimits(
+const TaskInfo& task,
+Slave* slave)
+{
+  auto limits = task.limits();
+
+  if (!limits.empty()) {
+if (!slave->capabilities.taskResourceLimits) {
+  return Error("Agent is not capable of handling task resource limits");
+}
+
+// Ensure that only "cpus" and "mem" are included.
+const size_t cpuCount = limits.count("cpus");
+const size_t memCount = limits.count("mem");
+
+if (limits.size() > cpuCount + memCount) {
+  return Error(
+  "Only cpus and mem may be included in a task's resource limits");
+}
+
+if (cpuCount) {
+  Option taskCpus = Resources(task.resources()).cpus();
+  if (taskCpus.isNone()) {
+return Error(
+"When a CPU limit is specified, a CPU request must also be "
+"specified");
+  }
+
+  if (limits.at("cpus").value() < taskCpus.get()) {
+return Error(
+"The cpu limit must be greater than or equal to the cpu request");
+  }
+}
+
+if (memCount) {
+  Option taskMem = Resources(task.resources()).mem();
+  if (taskMem.isNone()) {
+return Error(
+"When a memory limit is specified, a memory request must also be "
+"specified");
+  }
+
+  if (!std::isinf(limits.at("mem").value()) &&
+  Bytes(limits.at("mem").value(), Bytes::MEGABYTES) < taskMem.get()) {
+return Error(
+"The memory limit must be greater"
+" than or equal to the memory request");
+  }
+}
+  }
+
+  return None();
+}
+
+
+// This validation function should only be executed for tasks which are launched
+// via the LAUNCH operation, not the LAUNCH_GROUP operation.
+Option validateShareCgroups(const TaskInfo& task)
+{
+  if (task.has_container() &&
+  task.container().has_linux_info() &&
+  task.container().linux_info().has_share_cgroups() &&
+  !task.container().linux_info().share_cgroups()) {
+return Error(
+"Only tasks in a task group may have 'share_cgroups' set to 'false'");
+  }
+
+  return None();
+}
+
+
 // Validates task specific fields except its executor (if it exists).
 Option validateTask(
 const TaskInfo& task,
@@ -1561,7 +1636,8 @@ Option validateTask(
 lambda::bind(internal::validateHealthCheck, task),
 lambda::bind(internal::validateResources, task),
 lambda::bind(internal::validateCommandInfo, task),
-lambda::bind(internal::validateContainerInfo, task)
+lambda::bind(internal::validateContainerInfo, task),
+lambda::bind(internal::validateResourceLimits, task, slave)
   };
 
   foreach (const lambda::function()>& validator, validators) {
@@ -1659,6 +1735,15 @@ Option validateExecutor(
 << "in future releases.";
 }
 
+if (executor.has_container() &&
+executor.container().has_linux_info() &&
+executor.container().linux_info().has_share_cgroups() &&
+executor.container().linux_info().share_cgroups()) {
+  return Error(
+  "The 'share_cgroups' field cannot be set to 'true'"
+  " on executor containers");
+}
+
 if (!slave->hasExecutor(framework->id(), task.executor().executor_id())) {
   total += executorResources;
 }
@@ -1698,7 +1783,8 @@ Option validate(
 
   vector()>> validators = {
 lambda::bind(internal::validateTask, task, framework, slave),
-lambda::bind(internal::validateExecutor, task, framework, slave, offered)
+lambda::bind(internal::validateExecutor, task, framework, slave, offered),
+l

[mesos] branch master updated (4d9013d -> 59ba377)

2020-03-03 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from 4d9013d  Added test for removal of ObjectApprovers of disconnected framework.
 new f445e3a  Added the 'TASK_RESOURCE_LIMITS' agent capability.
 new 59ba377  Cleaned up agent capability validation and associated docs.

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 docs/configuration/agent.md   |  7 +--
 docs/upgrades.md  |  5 +
 include/mesos/mesos.proto |  4 
 include/mesos/v1/mesos.proto  |  4 
 src/common/protobuf_utils.cpp |  3 ++-
 src/common/protobuf_utils.hpp |  7 +++
 src/slave/constants.cpp   |  1 +
 src/slave/flags.cpp   | 26 ++
 src/tests/master_tests.cpp|  3 ++-
 src/tests/slave_tests.cpp |  3 ++-
 10 files changed, 42 insertions(+), 21 deletions(-)



[mesos] 02/02: Cleaned up agent capability validation and associated docs.

2020-03-03 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 59ba377616d10c248e4f0607a71ecf6658084e59
Author: Greg Mann 
AuthorDate: Tue Mar 3 06:03:58 2020 -0800

Cleaned up agent capability validation and associated docs.

Review: https://reviews.apache.org/r/72087/
---
 docs/configuration/agent.md |  4 +++-
 src/slave/flags.cpp | 20 +---
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/docs/configuration/agent.md b/docs/configuration/agent.md
index 1498df4..01ffa38 100644
--- a/docs/configuration/agent.md
+++ b/docs/configuration/agent.md
@@ -93,7 +93,8 @@ Example:
   
 JSON representation of agent features to whitelist. We always require
 'MULTI_ROLE', 'HIERARCHICAL_ROLE', 'RESERVATION_REFINEMENT',
-'AGENT_OPERATION_FEEDBACK', 'AGENT_DRAINING', and 'TASK_RESOURCE_LIMITS'.
+'AGENT_OPERATION_FEEDBACK', 'RESOURCE_PROVIDER', 'AGENT_DRAINING', and
+'TASK_RESOURCE_LIMITS'.
 
 Example:
 
@@ -103,6 +104,7 @@ Example:
 {"type": "HIERARCHICAL_ROLE"},
 {"type": "RESERVATION_REFINEMENT"},
 {"type": "AGENT_OPERATION_FEEDBACK"},
+{"type": "RESOURCE_PROVIDER"},
 {"type": "AGENT_DRAINING"},
 {"type": "TASK_RESOURCE_LIMITS"}
 ]
diff --git a/src/slave/flags.cpp b/src/slave/flags.cpp
index 5966436..2f88b90 100644
--- a/src/slave/flags.cpp
+++ b/src/slave/flags.cpp
@@ -812,7 +812,7 @@ mesos::internal::slave::Flags::Flags()
   "agent_features",
   "JSON representation of agent features to whitelist. We always require\n"
   "'MULTI_ROLE', 'HIERARCHICAL_ROLE', 'RESERVATION_REFINEMENT',\n"
-  "'AGENT_OPERATION_FEEDBACK', 'AGENT_DRAINING', and\n"
+  "'AGENT_OPERATION_FEEDBACK', 'RESOURCE_PROVIDER', 'AGENT_DRAINING', 
and\n"
   "'TASK_RESOURCE_LIMITS'.\n"
   "\n"
   "Example:\n"
@@ -822,6 +822,7 @@ mesos::internal::slave::Flags::Flags()
   "{\"type\": \"HIERARCHICAL_ROLE\"},\n"
   "{\"type\": \"RESERVATION_REFINEMENT\"},\n"
   "{\"type\": \"AGENT_OPERATION_FEEDBACK\"},\n"
+  "{\"type\": \"RESOURCE_PROVIDER\"},\n"
   "{\"type\": \"AGENT_DRAINING\"},\n"
   "{\"type\": \"TASK_RESOURCE_LIMITS\"}\n"
   "]\n"
@@ -836,25 +837,14 @@ mesos::internal::slave::Flags::Flags()
   !capabilities.hierarchicalRole ||
   !capabilities.reservationRefinement ||
   !capabilities.agentOperationFeedback ||
+  !capabilities.resourceProvider ||
   !capabilities.agentDraining ||
   !capabilities.taskResourceLimits) {
 return Error(
 "At least the following agent features need to be enabled:"
 " MULTI_ROLE, HIERARCHICAL_ROLE, RESERVATION_REFINEMENT,"
-" AGENT_OPERATION_FEEDBACK, AGENT_DRAINING, and"
-" TASK_RESOURCE_LIMITS");
-  }
-
-  if (capabilities.resizeVolume && !capabilities.resourceProvider) {
-return Error(
-"RESIZE_VOLUME feature requires RESOURCE_PROVIDER feature");
-  }
-
-  if (capabilities.agentOperationFeedback &&
-  !capabilities.resourceProvider) {
-return Error(
-"AGENT_OPERATION_FEEDBACK feature"
-" requires RESOURCE_PROVIDER feature");
+" AGENT_OPERATION_FEEDBACK, RESOURCE_PROVIDER, AGENT_DRAINING,"
+" and TASK_RESOURCE_LIMITS");
   }
 }
 



[mesos] 01/02: Added the 'TASK_RESOURCE_LIMITS' agent capability.

2020-03-03 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit f445e3aea44b4060292fa5e029dbb2c19e219c25
Author: Greg Mann 
AuthorDate: Tue Mar 3 06:03:57 2020 -0800

Added the 'TASK_RESOURCE_LIMITS' agent capability.

This capability will be used by the master to detect whether
or not an agent can handle task resource limits.

Review: https://reviews.apache.org/r/71991/
---
 docs/configuration/agent.md   |  5 +++--
 docs/upgrades.md  |  5 +
 include/mesos/mesos.proto |  4 
 include/mesos/v1/mesos.proto  |  4 
 src/common/protobuf_utils.cpp |  3 ++-
 src/common/protobuf_utils.hpp |  7 +++
 src/slave/constants.cpp   |  1 +
 src/slave/flags.cpp   | 12 
 src/tests/master_tests.cpp|  3 ++-
 src/tests/slave_tests.cpp |  3 ++-
 10 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/docs/configuration/agent.md b/docs/configuration/agent.md
index 0e703d8..1498df4 100644
--- a/docs/configuration/agent.md
+++ b/docs/configuration/agent.md
@@ -93,7 +93,7 @@ Example:
   
 JSON representation of agent features to whitelist. We always require
 'MULTI_ROLE', 'HIERARCHICAL_ROLE', 'RESERVATION_REFINEMENT',
-'AGENT_OPERATION_FEEDBACK', and 'AGENT_DRAINING'.
+'AGENT_OPERATION_FEEDBACK', 'AGENT_DRAINING', and 'TASK_RESOURCE_LIMITS'.
 
 Example:
 
@@ -103,7 +103,8 @@ Example:
 {"type": "HIERARCHICAL_ROLE"},
 {"type": "RESERVATION_REFINEMENT"},
 {"type": "AGENT_OPERATION_FEEDBACK"},
-{"type": "AGENT_DRAINING"}
+{"type": "AGENT_DRAINING"},
+{"type": "TASK_RESOURCE_LIMITS"}
 ]
 }
 
diff --git a/docs/upgrades.md b/docs/upgrades.md
index afd9dbb..1e73e3d 100644
--- a/docs/upgrades.md
+++ b/docs/upgrades.md
@@ -54,6 +54,7 @@ We categorize the changes as follows:
 
   
 
+  C agent_features
 
   
 
@@ -558,6 +559,10 @@ We categorize the changes as follows:
   The canonical name for the environment variable 
`LIBPROCESS_SSL_REQUIRE_CERT` was changed to 
`LIBPROCESS_SSL_REQUIRE_CLIENT_CERT`.
   The old names will continue to work as before, but operators are encouraged 
to update their configuration to reduce confusion.
 
+
+
+* The Mesos agent now requires the new `TASK_RESOURCE_LIMITS` feature. This capability is set by default, but if the `--agent_features` flag is specified explicitly, `TASK_RESOURCE_LIMITS` must be included.
+
 ## Upgrading from 1.8.x to 1.9.x ##
 
 
diff --git a/include/mesos/mesos.proto b/include/mesos/mesos.proto
index d0aed5a..40c45de 100644
--- a/include/mesos/mesos.proto
+++ b/include/mesos/mesos.proto
@@ -1050,6 +1050,10 @@ message SlaveInfo {
   // This expresses the ability for the agent to automatically drain tasks
   // in preparation for operator maintenance. This capability is required.
   AGENT_DRAINING = 7;
+
+  // This expresses the ability for the agent to launch tasks which specify
+  // resource limits for CPU and/or memory.
+  TASK_RESOURCE_LIMITS = 8;
 }
 
 // Enum fields should be optional, see: MESOS-4997.
diff --git a/include/mesos/v1/mesos.proto b/include/mesos/v1/mesos.proto
index 06c4816..6387636 100644
--- a/include/mesos/v1/mesos.proto
+++ b/include/mesos/v1/mesos.proto
@@ -1038,6 +1038,10 @@ message AgentInfo {
   // This expresses the ability for the agent to automatically drain tasks
   // in preparation for operator maintenance. This capability is required.
   AGENT_DRAINING = 7;
+
+  // This expresses the ability for the agent to launch tasks which specify
+  // resource limits for CPU and/or memory.
+  TASK_RESOURCE_LIMITS = 8;
 }
 
 // Enum fields should be optional, see: MESOS-4997.
diff --git a/src/common/protobuf_utils.cpp b/src/common/protobuf_utils.cpp
index 7fe4a44..b3057be 100644
--- a/src/common/protobuf_utils.cpp
+++ b/src/common/protobuf_utils.cpp
@@ -1140,7 +1140,8 @@ bool operator==(const Capabilities& left, const 
Capabilities& right)
  left.resourceProvider == right.resourceProvider &&
  left.resizeVolume == right.resizeVolume &&
  left.agentOperationFeedback == right.agentOperationFeedback &&
- left.agentDraining == right.agentDraining;
+ left.agentDraining == right.agentDraining &&
+ left.taskResourceLimits == right.taskResourceLimits;
 }
 
 
diff --git a/src/common/protobuf_utils.hpp b/src/common/protobuf_utils.hpp
index 3852f59..0558249 100644
--- a/src/common/protobuf_utils.hpp
+++ b/src/common/protobuf_utils.hpp
@@ -361,6 +361,9 @@ struct Capabilities
 case SlaveInfo::Capability::AGENT_DRAINING:
   agentDraining = true;
   break;
+case SlaveInfo::Capability::TASK_RESOURCE_LIMITS:
+  

[mesos] branch master updated (1dd099f -> 4bb7ef9)

2020-02-26 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from 1dd099f  Removed dead code from v1 API serialization changes.
 new e258059  SSL Socket: Moved accept callback logic into protected 
function.
 new 599b9e8  Reverted SSL Socket guard against downgrade.
 new 4bb7ef9  SSL Socket: Added downgrade support.

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 3rdparty/libprocess/src/openssl.cpp|   6 -
 3rdparty/libprocess/src/ssl/openssl_socket.cpp | 247 -
 3rdparty/libprocess/src/ssl/openssl_socket.hpp |   6 +
 3rdparty/libprocess/src/tests/ssl_tests.cpp|   3 -
 4 files changed, 168 insertions(+), 94 deletions(-)



[mesos] 03/03: SSL Socket: Added downgrade support.

2020-02-26 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 4bb7ef93a9555b4c40efa4a1000b560e58ac9858
Author: Joseph Wu 
AuthorDate: Wed Feb 26 17:14:59 2020 +0100

SSL Socket: Added downgrade support.

This adds downgrade support, in the same fashion that the
Libevent SSL socket does (and copies a good chunk of the code
from there too).  To account for Windows not having `io::poll`,
a slight hack is taken to check for readable bytes.

Review: https://reviews.apache.org/r/72017/
---
 3rdparty/libprocess/src/ssl/openssl_socket.cpp | 66 ++
 1 file changed, 66 insertions(+)

diff --git a/3rdparty/libprocess/src/ssl/openssl_socket.cpp 
b/3rdparty/libprocess/src/ssl/openssl_socket.cpp
index 43909f0..a2ec0a3 100644
--- a/3rdparty/libprocess/src/ssl/openssl_socket.cpp
+++ b/3rdparty/libprocess/src/ssl/openssl_socket.cpp
@@ -573,6 +573,72 @@ Future> 
OpenSSLSocketImpl::accept()
 return Break();
   }
 
+  // If we support downgrading the connection, first wait for this
+  // socket to become readable. We will then MSG_PEEK it to test
+  // whether we want to dispatch as SSL or non-SSL.
+  if (openssl::flags().support_downgrade) {
+#ifdef __WINDOWS__
+// Since there is no `io::poll` on Windows, we instead make
+// a 0-byte read, which will only return once there is something
+// to read.
+return io::read(socket->get(), nullptr, 0)
+#else
+return io::poll(socket->get(), process::io::READ)
+#endif // __WINDOWS__
+  .then([weak_self, socket]() -> Future> {
+std::shared_ptr self(weak_self.lock());
+
+if (self == nullptr) {
+  return Break();
+}
+
+char data[6];
+
+// Try to peek the first 6 bytes of the message.
+ssize_t size = ::recv(socket->get(), data, 6, MSG_PEEK);
+
+// Based on the function 'ssl23_get_client_hello' in openssl, 
we
+// test whether to dispatch to the SSL or non-SSL based accept
+// based on the following rules:
+//   1. If there are fewer than 3 bytes: non-SSL.
+//   2. If the 1st bit of the 1st byte is set AND the 3rd byte
+//  is equal to SSL2_MT_CLIENT_HELLO: SSL.
+//   3. If the 1st byte is equal to SSL3_RT_HANDSHAKE AND the
+//  2nd byte is equal to SSL3_VERSION_MAJOR and the 6th 
byte
+//  is equal to SSL3_MT_CLIENT_HELLO: SSL.
+//   4. Otherwise: non-SSL.
+
+// For an ascii based protocol to falsely get dispatched to SSL
+// it needs to:
+//   1. Start with an invalid ascii character (0x80).
+//   2. OR have the first 2 characters be a SYN followed by 
ETX,
+//  and then the 6th character be SOH.
+// These conditions clearly do not constitute valid HTTP
+// requests, and are unlikely to collide with other existing
+// protocols.
+
+bool ssl = false; // Default to rule 4.
+
+if (size < 2) { // Rule 1.
+  ssl = false;
+} else if ((data[0] & 0x80) &&
+   data[2] == SSL2_MT_CLIENT_HELLO) { // Rule 2.
+  ssl = true;
+} else if (data[0] == SSL3_RT_HANDSHAKE &&
+   data[1] == SSL3_VERSION_MAJOR &&
+   data[5] == SSL3_MT_CLIENT_HELLO) { // Rule 3.
+  ssl = true;
+}
+
+if (ssl) {
+  return self->handle_accept_callback(socket);
+} else {
+  self->accept_queue.put(socket);
+  return Continue();
+}
+  });
+  }
+
   return self->handle_accept_callback(socket);
 });
 



[mesos] 02/03: Reverted SSL Socket guard against downgrade.

2020-02-26 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 599b9e83c6c0659e12b0bccaf7c610b70158737c
Author: Joseph Wu 
AuthorDate: Wed Feb 26 17:14:55 2020 +0100

Reverted SSL Socket guard against downgrade.

This reverts commit 34bac34419ebec8441e69d3a5684381468352399.

Review: https://reviews.apache.org/r/72016/
---
 3rdparty/libprocess/src/openssl.cpp | 6 --
 3rdparty/libprocess/src/tests/ssl_tests.cpp | 3 ---
 2 files changed, 9 deletions(-)

diff --git a/3rdparty/libprocess/src/openssl.cpp 
b/3rdparty/libprocess/src/openssl.cpp
index b2dd2fe..ec7d6e8 100644
--- a/3rdparty/libprocess/src/openssl.cpp
+++ b/3rdparty/libprocess/src/openssl.cpp
@@ -550,14 +550,8 @@ void reinitialize()
   // Notify users of the 'SSL_SUPPORT_DOWNGRADE' flag that this
   // setting allows insecure connections.
   if (ssl_flags->support_downgrade) {
-#ifdef USE_LIBEVENT
 LOG(WARNING) <<
   "Failed SSL connections will be downgraded to a non-SSL socket";
-#else
-EXIT(EXIT_FAILURE)
-  << "Non-libevent SSL sockets do not support downgrade yet,"
-  << " see MESOS-10073";
-#endif // USE_LIBEVENT
   }
 
   // TODO(bevers): Remove the deprecated names for these flags after an
diff --git a/3rdparty/libprocess/src/tests/ssl_tests.cpp 
b/3rdparty/libprocess/src/tests/ssl_tests.cpp
index a6563fb..3f1d103 100644
--- a/3rdparty/libprocess/src/tests/ssl_tests.cpp
+++ b/3rdparty/libprocess/src/tests/ssl_tests.cpp
@@ -483,8 +483,6 @@ TEST_F(SSLTest, ECDHESupport)
 }
 
 
-// TODO(josephw): Support downgrades on the native OpenSSL socket 
(MESOS-10073).
-#ifdef USE_LIBEVENT
 // Ensure we can communicate between a POLL based socket and an SSL
 // socket if 'SSL_SUPPORT_DOWNGRADE' is enabled.
 TEST_F(SSLTest, ValidDowngrade)
@@ -583,7 +581,6 @@ TEST_F(SSLTest, ValidDowngradeEachProtocol)
 AWAIT_ASSERT_READY(await_subprocess(client.get(), 0));
   }
 }
-#endif // USE_LIBEVENT
 
 
 // For each protocol: ensure we CANNOT communicate between a POLL



[mesos] 01/03: SSL Socket: Moved accept callback logic into protected function.

2020-02-26 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit e258059d49779a02b6267a9d0829bd244b882ed5
Author: Joseph Wu 
AuthorDate: Wed Feb 26 17:14:50 2020 +0100

SSL Socket: Moved accept callback logic into protected function.

To support SSL downgrades, this logic will need to be called
from two potential callsites.

Also fixes a slight typo in a comment within the moved code.

Review: https://reviews.apache.org/r/72014/
---
 3rdparty/libprocess/src/ssl/openssl_socket.cpp | 185 +
 3rdparty/libprocess/src/ssl/openssl_socket.hpp |   6 +
 2 files changed, 104 insertions(+), 87 deletions(-)

diff --git a/3rdparty/libprocess/src/ssl/openssl_socket.cpp 
b/3rdparty/libprocess/src/ssl/openssl_socket.cpp
index 74f9fe2..43909f0 100644
--- a/3rdparty/libprocess/src/ssl/openssl_socket.cpp
+++ b/3rdparty/libprocess/src/ssl/openssl_socket.cpp
@@ -573,93 +573,7 @@ Future> 
OpenSSLSocketImpl::accept()
 return Break();
   }
 
-  // Wrap this new socket up into our SSL wrapper class by releasing
-  // the FD and creating a new OpenSSLSocketImpl object with the FD.
-  const std::shared_ptr ssl_socket =
-std::make_shared(socket->release());
-
-  // Set up SSL object.
-  SSL* accept_ssl = SSL_new(openssl::context());
-  if (accept_ssl == nullptr) {
-self->accept_queue.put(Failure("Accept failed, SSL_new"));
-return Continue();
-  }
-
-  Try peer_address = network::peer(ssl_socket->get());
-  if (!peer_address.isSome()) {
-SSL_free(accept_ssl);
-self->accept_queue.put(
-Failure("Could not determine peer IP for connection"));
-return Continue();
-  }
-
-  // NOTE: Right now, `openssl::configure_socket` does not do anything
-  // in server mode, but we still pass the correct peer address to
-  // enable modules to implement application-level logic in the future.
-  Try configured = openssl::configure_socket(
-  accept_ssl, Mode::SERVER, peer_address.get(), None());
-
-  if (configured.isError()) {
-SSL_free(accept_ssl);
-self->accept_queue.put(
-Failure("Could not configure socket: " + configured.error()));
-return Continue();
-  }
-
-  // Set the SSL context in server mode.
-  SSL_set_accept_state(accept_ssl);
-
-  // Pass ownership of `accept_ssl` to the newly accepted socket,
-  // and wtart the SSL handshake. When the SSL handshake completes,
-  // the listening socket will place the result (failure or success)
-  // onto the listening socket's `accept_queue`.
-  //
-  // TODO(josephw): Add a timeout to catch/close incoming sockets which
-  // never finish the SSL handshake.
-  ssl_socket->set_ssl_and_do_handshake(accept_ssl)
-.onAny([weak_self, ssl_socket](Future result) {
-  std::shared_ptr self(weak_self.lock());
-
-  if (self == nullptr) {
-return;
-  }
-
-  if (result.isFailed()) {
-self->accept_queue.put(Failure(result.failure()));
-return;
-  }
-
-  // For verification purposes, we need to grab the address 
(again).
-  Try address = network::address(ssl_socket->get());
-  if (address.isError()) {
-self->accept_queue.put(
-Failure("Failed to get address: " + address.error()));
-return;
-  }
-
-  Try inet_address =
-network::convert(address.get());
-
-  Try verify = openssl::verify(
-  ssl_socket->ssl,
-  Mode::SERVER,
-  None(),
-  inet_address.isSome()
-? Some(inet_address->ip)
-: Option::none());
-
-  if (verify.isError()) {
-VLOG(1) << "Failed accept, verification error: "
-<< verify.error();
-
-self->accept_queue.put(Failure(verify.error()));
-return;
-  }
-
-  self->accept_queue.put(ssl_socket);
-});
-
-  return Continue();
+  return self->handle_accept_callback(socket);
 });
 
 accept_loop_started.done();
@@ -735,6 +649,103 @@ Try OpenSSLSocketImpl::shutdown(int 
how)
 }
 
 
+Future> OpenSSLSocketImpl::handle_accept_callback(
+const std::shared_ptr& socket)
+{
+  // Wrap this new socket up into our SSL wrapper class by releasing
+  // the FD and creat

[mesos] branch master updated: Removed remaining domain socket code from the Windows build.

2020-02-11 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new 65e18be  Removed remaining domain socket code from the Windows build.
65e18be is described below

commit 65e18bef2c5ff356ef74bac9aa79b128c5b186d9
Author: Greg Mann 
AuthorDate: Tue Feb 11 10:35:10 2020 -0800

Removed remaining domain socket code from the Windows build.

These changes are needed to get the tests to run.

Review: https://reviews.apache.org/r/72114/
---
 src/slave/flags.hpp  | 2 ++
 src/slave/slave.cpp  | 2 ++
 src/tests/command_executor_tests.cpp | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/src/slave/flags.hpp b/src/slave/flags.hpp
index 838aaee..c3ff887 100644
--- a/src/slave/flags.hpp
+++ b/src/slave/flags.hpp
@@ -182,8 +182,10 @@ public:
 #ifdef USE_SSL_SOCKET
   bool authenticate_http_executors;
 #endif // USE_SSL_SOCKET
+#ifndef __WINDOWS__
   bool http_executor_domain_sockets;
   Option domain_socket_location;
+#endif // __WINDOWS__
   Option http_credentials;
   Option hooks;
   Option secret_resolver;
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 75bf595..cce275a 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -11176,12 +11176,14 @@ map executorEnvironment(
   environment["MESOS_HTTP_COMMAND_EXECUTOR"] =
 flags.http_command_executor ? "1" : "0";
 
+#ifndef __WINDOWS__
   if (flags.http_executor_domain_sockets) {
 // If `http_executor_domain_sockets` is true, the location should have
 // been set either by the user or automatically during agent startup.
 CHECK(flags.domain_socket_location.isSome());
 environment["MESOS_DOMAIN_SOCKET"] = *flags.domain_socket_location;
   }
+#endif // __WINDOWS__
 
   // Set executor's shutdown grace period. If set, the customized value
   // from `ExecutorInfo` overrides the default from agent flags.
diff --git a/src/tests/command_executor_tests.cpp 
b/src/tests/command_executor_tests.cpp
index 73f8006..4118a52 100644
--- a/src/tests/command_executor_tests.cpp
+++ b/src/tests/command_executor_tests.cpp
@@ -496,6 +496,7 @@ TEST_P(CommandExecutorTest, 
AllocationRoleEnvironmentVariable)
 }
 
 
+#ifndef __WINDOWS__
 // This test checks that the command executor can communicate
 // with the agent using unix domain sockets, when the necessary
 // flags are set on the agent.
@@ -572,6 +573,7 @@ TEST_P(CommandExecutorTest, ExecutorDomainSockets)
   driver.stop();
   driver.join();
 }
+#endif // __WINDOWS__
 
 
 class HTTPCommandExecutorTest



[mesos] branch master updated: Added a new task status reason.

2020-01-16 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new 42e9b88  Added a new task status reason.
42e9b88 is described below

commit 42e9b889900be9934dbb5ffc06504b21a3d8c206
Author: Greg Mann 
AuthorDate: Thu Jan 16 18:40:50 2020 -0800

Added a new task status reason.

The new reason will be sent to frameworks when one of their tasks is
OOM-killed on an agent while the task is exceeding its memory request.

Review: https://reviews.apache.org/r/71935/
---
 include/mesos/mesos.proto| 1 +
 include/mesos/v1/mesos.proto | 1 +
 2 files changed, 2 insertions(+)

diff --git a/include/mesos/mesos.proto b/include/mesos/mesos.proto
index b0f5905..d0aed5a 100644
--- a/include/mesos/mesos.proto
+++ b/include/mesos/mesos.proto
@@ -2636,6 +2636,7 @@ message TaskStatus {
 REASON_CONTAINER_LIMITATION = 19;
 REASON_CONTAINER_LIMITATION_DISK = 20;
 REASON_CONTAINER_LIMITATION_MEMORY = 8;
+REASON_CONTAINER_MEMORY_REQUEST_EXCEEDED = 35;
 REASON_CONTAINER_PREEMPTED = 17;
 REASON_CONTAINER_UPDATE_FAILED = 22;
 REASON_MAX_COMPLETION_TIME_REACHED = 33;
diff --git a/include/mesos/v1/mesos.proto b/include/mesos/v1/mesos.proto
index 53a7b9b..06c4816 100644
--- a/include/mesos/v1/mesos.proto
+++ b/include/mesos/v1/mesos.proto
@@ -2625,6 +2625,7 @@ message TaskStatus {
 REASON_CONTAINER_LIMITATION = 19;
 REASON_CONTAINER_LIMITATION_DISK = 20;
 REASON_CONTAINER_LIMITATION_MEMORY = 8;
+REASON_CONTAINER_MEMORY_REQUEST_EXCEEDED = 35;
 REASON_CONTAINER_PREEMPTED = 17;
 REASON_CONTAINER_UPDATE_FAILED = 22;
 REASON_MAX_COMPLETION_TIME_REACHED = 33;



[mesos] 02/02: Added MESOS-10041 to the 1.9.1 CHANGELOG.

2019-11-22 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch 1.9.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 95d34e7f30c6ca8a074bd2af0359ed96310adee3
Author: Greg Mann 
AuthorDate: Fri Nov 22 15:19:30 2019 -0800

Added MESOS-10041 to the 1.9.1 CHANGELOG.
---
 CHANGELOG | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG b/CHANGELOG
index 398f479..08e8944 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -10,6 +10,7 @@ Release Notes - Mesos - Version 1.9.1 (WIP)
   * [MESOS-10007] - Command executor can miss exit status for short-lived commands due to double-reaping.
   * [MESOS-10008] - Very large quota values can crash master.
   * [MESOS-10015] - updateAllocation() can stall the allocator with a huge number of reservations on an agent.
+  * [MESOS-10041] - Libprocess SSL verification can leak memory.
 
 ** Improvement
   * [MESOS-9889] - Master CPU high due to unexpected foreachkey behaviour in 
Master::__reregisterSlave.



[mesos] branch 1.9.x updated (c313168 -> 95d34e7)

2019-11-22 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a change to branch 1.9.x
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from c313168  Garbage-collected lost tasks which are reported as running 
again.
 new a687e71  Fixed memory leak in openssl verification function.
 new 95d34e7  Added MESOS-10041 to the 1.9.1 CHANGELOG.

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 3rdparty/libprocess/src/openssl.cpp | 18 +-
 CHANGELOG   |  1 +
 2 files changed, 6 insertions(+), 13 deletions(-)



[mesos] 01/02: Fixed memory leak in openssl verification function.

2019-11-22 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch 1.9.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit a687e71790151b5b078322e825ade349cc6922dc
Author: Benno Evers 
AuthorDate: Fri Nov 22 12:00:43 2019 -0800

Fixed memory leak in openssl verification function.

When the hostname validation scheme was set to 'openssl',
the `openssl::verify()` function would return without
freeing a previously allocated `X509*` object.

To fix the leak, a long-standing TODO to switch to
RAII-based memory management for the certificate was
resolved.

Review: https://reviews.apache.org/r/71805/
---
 3rdparty/libprocess/src/openssl.cpp | 18 +-
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/3rdparty/libprocess/src/openssl.cpp 
b/3rdparty/libprocess/src/openssl.cpp
index a81d13f..7eb5822 100644
--- a/3rdparty/libprocess/src/openssl.cpp
+++ b/3rdparty/libprocess/src/openssl.cpp
@@ -841,8 +841,9 @@ Try verify(
   }
 
   // The X509 object must be freed if this call succeeds.
-  // TODO(jmlvanre): handle this better. How about RAII?
-  X509* cert = SSL_get_peer_certificate(ssl);
+  std::unique_ptr cert(
+  SSL_get_peer_certificate(ssl),
+  X509_free);
 
   // NOTE: Even without this check, the OpenSSL handshake will not complete
   // when connecting to servers that do not present a certificate, unless an
@@ -852,7 +853,6 @@ Try verify(
   }
 
   if (SSL_get_verify_result(ssl) != X509_V_OK) {
-X509_free(cert);
 return Error("Could not verify peer certificate");
   }
 
@@ -896,7 +896,6 @@ Try verify(
   }
 
   if (!ssl_flags->verify_ipadd && peer_hostname.isNone()) {
-X509_free(cert);
 return ssl_flags->require_client_cert
   ? Error("Cannot verify peer certificate: peer hostname unknown")
   : Try(Nothing());
@@ -908,7 +907,7 @@ Try verify(
   // physical host.
   STACK_OF(GENERAL_NAME)* san_names =
 reinterpret_cast(X509_get_ext_d2i(
-reinterpret_cast(cert),
+cert.get(),
 NID_subject_alt_name,
 nullptr,
 nullptr));
@@ -931,7 +930,6 @@ Try verify(
 const size_t length = ASN1_STRING_length(current_name->d.dNSName);
 if (length != dns_name.length()) {
   sk_GENERAL_NAME_pop_free(san_names, GENERAL_NAME_free);
-  X509_free(cert);
   return Error(
   "X509 certificate malformed: "
   "embedded NUL character in DNS name");
@@ -941,7 +939,6 @@ Try verify(
   // Compare expected hostname with the DNS name.
   if (peer_hostname.get() == dns_name) {
 sk_GENERAL_NAME_pop_free(san_names, GENERAL_NAME_free);
-X509_free(cert);
 
 VLOG(2) << "dNSName match found for " << peer_hostname.get();
 
@@ -966,7 +963,6 @@ Try verify(
 
   if (ip.get() == ip_add) {
 sk_GENERAL_NAME_pop_free(san_names, GENERAL_NAME_free);
-X509_free(cert);
 
 VLOG(2) << "iPAddress match found for " << ip.get();
 
@@ -985,7 +981,7 @@ Try verify(
   if (peer_hostname.isSome()) {
 // If we still haven't verified the hostname, try doing it via
 // the certificate subject name.
-X509_NAME* name = X509_get_subject_name(cert);
+X509_NAME* name = X509_get_subject_name(cert.get());
 
 if (name != nullptr) {
   char text[MAXHOSTNAMELEN] {};
@@ -998,7 +994,6 @@ Try verify(
 VLOG(2) << "Matching common name: " << text;
 
 if (peer_hostname.get() != text) {
-  X509_free(cert);
   return Error(
 "Presented Certificate Name: " + stringify(text) +
 " does not match peer hostname name: " + peer_hostname.get());
@@ -1006,15 +1001,12 @@ Try verify(
 
 VLOG(2) << "Common name match found for " << peer_hostname.get();
 
-X509_free(cert);
 return Nothing();
   }
 }
   }
 
   // If we still haven't exited, we haven't verified it, and we give up.
-  X509_free(cert);
-
   std::vector details;
 
   if (peer_hostname.isSome()) {



[mesos] branch master updated: Added MESOS-10041 to the 1.9.1 CHANGELOG.

2019-11-22 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new d10a33a  Added MESOS-10041 to the 1.9.1 CHANGELOG.
d10a33a is described below

commit d10a33acc426dda9e34db995f16450faf898bb3b
Author: Greg Mann 
AuthorDate: Fri Nov 22 15:19:30 2019 -0800

Added MESOS-10041 to the 1.9.1 CHANGELOG.
---
 CHANGELOG | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG b/CHANGELOG
index cf6311a..21d21d3 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -17,6 +17,7 @@ Release Notes - Mesos - Version 1.9.1 (WIP)
   * [MESOS-10007] - Command executor can miss exit status for short-lived commands due to double-reaping.
   * [MESOS-10008] - Very large quota values can crash master.
   * [MESOS-10015] - updateAllocation() can stall the allocator with a huge number of reservations on an agent.
+  * [MESOS-10041] - Libprocess SSL verification can leak memory.
 
 ** Improvement
   * [MESOS-9889] - Master CPU high due to unexpected foreachkey behaviour in 
Master::__reregisterSlave.



[mesos] branch master updated: Fixed memory leak in openssl verification function.

2019-11-22 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new e52d0d1  Fixed memory leak in openssl verification function.
e52d0d1 is described below

commit e52d0d1f25a91f9940bea4329eb5359373ee0ed0
Author: Benno Evers 
AuthorDate: Fri Nov 22 12:00:43 2019 -0800

Fixed memory leak in openssl verification function.

When the hostname validation scheme was set to 'openssl',
the `openssl::verify()` function would return without
freeing a previously allocated `X509*` object.

To fix the leak, a long-standing TODO to switch to
RAII-based memory management for the certificate was
resolved.

Review: https://reviews.apache.org/r/71805/
---
 3rdparty/libprocess/src/openssl.cpp | 18 +-
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/3rdparty/libprocess/src/openssl.cpp 
b/3rdparty/libprocess/src/openssl.cpp
index bd05866..8aab5ac 100644
--- a/3rdparty/libprocess/src/openssl.cpp
+++ b/3rdparty/libprocess/src/openssl.cpp
@@ -841,8 +841,9 @@ Try verify(
   }
 
   // The X509 object must be freed if this call succeeds.
-  // TODO(jmlvanre): handle this better. How about RAII?
-  X509* cert = SSL_get_peer_certificate(ssl);
+  std::unique_ptr cert(
+  SSL_get_peer_certificate(ssl),
+  X509_free);
 
   // NOTE: Even without this check, the OpenSSL handshake will not complete
   // when connecting to servers that do not present a certificate, unless an
@@ -852,7 +853,6 @@ Try verify(
   }
 
   if (SSL_get_verify_result(ssl) != X509_V_OK) {
-X509_free(cert);
 return Error("Could not verify peer certificate");
   }
 
@@ -896,7 +896,6 @@ Try verify(
   }
 
   if (!ssl_flags->verify_ipadd && peer_hostname.isNone()) {
-X509_free(cert);
 return ssl_flags->require_client_cert
   ? Error("Cannot verify peer certificate: peer hostname unknown")
   : Try(Nothing());
@@ -908,7 +907,7 @@ Try verify(
   // physical host.
   STACK_OF(GENERAL_NAME)* san_names =
 reinterpret_cast(X509_get_ext_d2i(
-reinterpret_cast(cert),
+cert.get(),
 NID_subject_alt_name,
 nullptr,
 nullptr));
@@ -931,7 +930,6 @@ Try verify(
 const size_t length = ASN1_STRING_length(current_name->d.dNSName);
 if (length != dns_name.length()) {
   sk_GENERAL_NAME_pop_free(san_names, GENERAL_NAME_free);
-  X509_free(cert);
   return Error(
   "X509 certificate malformed: "
   "embedded NUL character in DNS name");
@@ -941,7 +939,6 @@ Try verify(
   // Compare expected hostname with the DNS name.
   if (peer_hostname.get() == dns_name) {
 sk_GENERAL_NAME_pop_free(san_names, GENERAL_NAME_free);
-X509_free(cert);
 
 VLOG(2) << "dNSName match found for " << peer_hostname.get();
 
@@ -966,7 +963,6 @@ Try verify(
 
   if (ip.get() == ip_add) {
 sk_GENERAL_NAME_pop_free(san_names, GENERAL_NAME_free);
-X509_free(cert);
 
 VLOG(2) << "iPAddress match found for " << ip.get();
 
@@ -985,7 +981,7 @@ Try verify(
   if (peer_hostname.isSome()) {
 // If we still haven't verified the hostname, try doing it via
 // the certificate subject name.
-X509_NAME* name = X509_get_subject_name(cert);
+X509_NAME* name = X509_get_subject_name(cert.get());
 
 if (name != nullptr) {
   char text[MAXHOSTNAMELEN] {};
@@ -998,7 +994,6 @@ Try verify(
 VLOG(2) << "Matching common name: " << text;
 
 if (peer_hostname.get() != text) {
-  X509_free(cert);
   return Error(
 "Presented Certificate Name: " + stringify(text) +
 " does not match peer hostname name: " + peer_hostname.get());
@@ -1006,15 +1001,12 @@ Try verify(
 
 VLOG(2) << "Common name match found for " << peer_hostname.get();
 
-X509_free(cert);
 return Nothing();
   }
 }
   }
 
   // If we still haven't exited, we haven't verified it, and we give up.
-  X509_free(cert);
-
   std::vector details;
 
   if (peer_hostname.isSome()) {



[mesos] branch 1.9.x updated: Added MESOS-9965 to the 1.9.1 CHANGELOG.

2019-09-12 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch 1.9.x
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/1.9.x by this push:
 new c115fb0  Added MESOS-9965 to the 1.9.1 CHANGELOG.
c115fb0 is described below

commit c115fb0e4842f5c1211c7464d38a7a67994f08ae
Author: Greg Mann 
AuthorDate: Thu Sep 12 19:57:35 2019 -0700

Added MESOS-9965 to the 1.9.1 CHANGELOG.
---
 CHANGELOG | 8 
 1 file changed, 8 insertions(+)

diff --git a/CHANGELOG b/CHANGELOG
index 876c303..98bbaa0 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,11 @@
+Release Notes - Mesos - Version 1.9.1 (WIP)
+---
+* This is a bug fix release.
+
+** Bug
+  * [MESOS-9965] - Agent should not send `TASK_GONE_BY_OPERATOR` if the framework is not partition aware.
+
+
 Release Notes - Mesos - Version 1.9.0
 -
 This release contains the following highlights:



[mesos] branch master updated: Added MESOS-9965 to the 1.9.1 CHANGELOG.

2019-09-12 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new 2e26323  Added MESOS-9965 to the 1.9.1 CHANGELOG.
2e26323 is described below

commit 2e26323eb98305e531050b62421ea97c63c1b79b
Author: Greg Mann 
AuthorDate: Thu Sep 12 19:57:35 2019 -0700

Added MESOS-9965 to the 1.9.1 CHANGELOG.
---
 CHANGELOG | 8 
 1 file changed, 8 insertions(+)

diff --git a/CHANGELOG b/CHANGELOG
index 876c303..98bbaa0 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,11 @@
+Release Notes - Mesos - Version 1.9.1 (WIP)
+---
+* This is a bug fix release.
+
+** Bug
+  * [MESOS-9965] - Agent should not send `TASK_GONE_BY_OPERATOR` if the framework is not partition aware.
+
+
 Release Notes - Mesos - Version 1.9.0
 -
 This release contains the following highlights:



[mesos] branch 1.9.x updated: Fixed a bug for non-partition-aware schedulers.

2019-09-12 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch 1.9.x
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/1.9.x by this push:
 new d8520b0  Fixed a bug for non-partition-aware schedulers.
d8520b0 is described below

commit d8520b0b4bf52fd27be45817934e2af1b871c399
Author: Greg Mann 
AuthorDate: Thu Sep 12 16:33:20 2019 -0700

Fixed a bug for non-partition-aware schedulers.

Previously, the agent would send task status updates with the state
TASK_GONE_BY_OPERATOR to all schedulers when an agent was drained
with the `mark_gone` parameter set to `true`.

This patch updates this code to ensure that TASK_GONE_BY_OPERATOR
is only sent to partition-aware schedulers.

Review: https://reviews.apache.org/r/71480/
---
 src/slave/slave.cpp   | 69 ---
 src/tests/slave_tests.cpp | 20 +++---
 2 files changed, 52 insertions(+), 37 deletions(-)

diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 4e93656..96890d3 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -5773,40 +5773,6 @@ void Slave::statusUpdate(StatusUpdate update, const 
Option<UPID>& pid)
   update.mutable_status()->set_source(
   pid == UPID() ? TaskStatus::SOURCE_SLAVE : TaskStatus::SOURCE_EXECUTOR);
 
-  // If the agent is draining we provide additional
-  // information for KILLING or KILLED states.
-  if (drainConfig.isSome()) {
-switch (update.status().state()) {
-  case TASK_STAGING:
-  case TASK_STARTING:
-  case TASK_RUNNING:
-  case TASK_FAILED:
-  case TASK_FINISHED:
-  case TASK_ERROR:
-  case TASK_LOST:
-  case TASK_DROPPED:
-  case TASK_UNREACHABLE:
-  case TASK_GONE:
-  case TASK_GONE_BY_OPERATOR:
-  case TASK_UNKNOWN: {
-break;
-  }
-  case TASK_KILLING:
-  case TASK_KILLED: {
-// We unconditionally overwrite any previous reason to provide a
-// consistent signal that this task went away during draining.
-update.mutable_status()->set_reason(TaskStatus::REASON_SLAVE_DRAINING);
-
-// If the draining marks the agent as gone report tasks as
-// gone by operator.
-if (drainConfig->mark_gone()) {
-  update.mutable_status()->set_state(TASK_GONE_BY_OPERATOR);
-}
-break;
-  }
-}
-  }
-
   // Set TaskStatus.executor_id if not already set; overwrite existing
   // value if already set.
   if (update.has_executor_id()) {
@@ -5843,6 +5809,41 @@ void Slave::statusUpdate(StatusUpdate update, const 
Option<UPID>& pid)
 return;
   }
 
+  // If the agent is draining we provide additional
+  // information for KILLING or KILLED states.
+  if (drainConfig.isSome()) {
+switch (update.status().state()) {
+  case TASK_STAGING:
+  case TASK_STARTING:
+  case TASK_RUNNING:
+  case TASK_FAILED:
+  case TASK_FINISHED:
+  case TASK_ERROR:
+  case TASK_LOST:
+  case TASK_DROPPED:
+  case TASK_UNREACHABLE:
+  case TASK_GONE:
+  case TASK_GONE_BY_OPERATOR:
+  case TASK_UNKNOWN: {
+break;
+  }
+  case TASK_KILLING:
+  case TASK_KILLED: {
+// We unconditionally overwrite any previous reason to provide a
+// consistent signal that this task went away during draining.
+update.mutable_status()->set_reason(TaskStatus::REASON_SLAVE_DRAINING);
+
+// If the draining marks the agent as gone report tasks as
+// gone by operator.
+if (drainConfig->mark_gone() &&
+framework->capabilities.partitionAware) {
+  update.mutable_status()->set_state(TASK_GONE_BY_OPERATOR);
+}
+break;
+  }
+}
+  }
+
   if (HookManager::hooksAvailable()) {
 // Even though the hook(s) return a TaskStatus, we only use two fields:
 // container_status and labels. Remaining fields are discarded.
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index 02b65a9..c147bfc 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -12040,10 +12040,16 @@ TEST_F(SlaveTest, DrainAgentKillsRunningTask)
 
   AWAIT_READY(updateSlaveMessage);
 
+  // Set the partition-aware capability to ensure that the terminal update 
state
+  // is TASK_GONE_BY_OPERATOR, since we will set `mark_gone = true`.
+  v1::FrameworkInfo frameworkInfo = v1::DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.add_capabilities()->set_type(
+  v1::FrameworkInfo::Capability::PARTITION_AWARE);
+
   auto scheduler = std::make_shared();
 
   EXPECT_CALL(*scheduler, connected(_))
-.WillOnce(v1::scheduler::SendSubscribe(v1::DEFAULT_FRAMEWORK_INFO));
+.WillOnce(v1::scheduler::SendSubscribe(frameworkInfo));
 
   Future subscribed;
   EXPECT_CALL(*scheduler, subscribed(_, _))
@@ -12160,10 +12166,16 @@ TEST_F(SlaveTest, DrainAgentKillsQueu

[mesos] branch master updated: Fixed a bug for non-partition-aware schedulers.

2019-09-12 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new 8e1a512  Fixed a bug for non-partition-aware schedulers.
8e1a512 is described below

commit 8e1a51207304589a6521cff3540e0705fe1533ff
Author: Greg Mann 
AuthorDate: Thu Sep 12 16:33:20 2019 -0700

Fixed a bug for non-partition-aware schedulers.

Previously, the agent would send task status updates with the state
TASK_GONE_BY_OPERATOR to all schedulers when an agent was drained
with the `mark_gone` parameter set to `true`.

This patch updates this code to ensure that TASK_GONE_BY_OPERATOR
is only sent to partition-aware schedulers.

Review: https://reviews.apache.org/r/71480/
---
 src/slave/slave.cpp   | 69 ---
 src/tests/slave_tests.cpp | 20 +++---
 2 files changed, 52 insertions(+), 37 deletions(-)

diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 4e93656..96890d3 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -5773,40 +5773,6 @@ void Slave::statusUpdate(StatusUpdate update, const 
Option<UPID>& pid)
   update.mutable_status()->set_source(
   pid == UPID() ? TaskStatus::SOURCE_SLAVE : TaskStatus::SOURCE_EXECUTOR);
 
-  // If the agent is draining we provide additional
-  // information for KILLING or KILLED states.
-  if (drainConfig.isSome()) {
-switch (update.status().state()) {
-  case TASK_STAGING:
-  case TASK_STARTING:
-  case TASK_RUNNING:
-  case TASK_FAILED:
-  case TASK_FINISHED:
-  case TASK_ERROR:
-  case TASK_LOST:
-  case TASK_DROPPED:
-  case TASK_UNREACHABLE:
-  case TASK_GONE:
-  case TASK_GONE_BY_OPERATOR:
-  case TASK_UNKNOWN: {
-break;
-  }
-  case TASK_KILLING:
-  case TASK_KILLED: {
-// We unconditionally overwrite any previous reason to provide a
-// consistent signal that this task went away during draining.
-update.mutable_status()->set_reason(TaskStatus::REASON_SLAVE_DRAINING);
-
-// If the draining marks the agent as gone report tasks as
-// gone by operator.
-if (drainConfig->mark_gone()) {
-  update.mutable_status()->set_state(TASK_GONE_BY_OPERATOR);
-}
-break;
-  }
-}
-  }
-
   // Set TaskStatus.executor_id if not already set; overwrite existing
   // value if already set.
   if (update.has_executor_id()) {
@@ -5843,6 +5809,41 @@ void Slave::statusUpdate(StatusUpdate update, const 
Option<UPID>& pid)
 return;
   }
 
+  // If the agent is draining we provide additional
+  // information for KILLING or KILLED states.
+  if (drainConfig.isSome()) {
+switch (update.status().state()) {
+  case TASK_STAGING:
+  case TASK_STARTING:
+  case TASK_RUNNING:
+  case TASK_FAILED:
+  case TASK_FINISHED:
+  case TASK_ERROR:
+  case TASK_LOST:
+  case TASK_DROPPED:
+  case TASK_UNREACHABLE:
+  case TASK_GONE:
+  case TASK_GONE_BY_OPERATOR:
+  case TASK_UNKNOWN: {
+break;
+  }
+  case TASK_KILLING:
+  case TASK_KILLED: {
+// We unconditionally overwrite any previous reason to provide a
+// consistent signal that this task went away during draining.
+update.mutable_status()->set_reason(TaskStatus::REASON_SLAVE_DRAINING);
+
+// If the draining marks the agent as gone report tasks as
+// gone by operator.
+if (drainConfig->mark_gone() &&
+framework->capabilities.partitionAware) {
+  update.mutable_status()->set_state(TASK_GONE_BY_OPERATOR);
+}
+break;
+  }
+}
+  }
+
   if (HookManager::hooksAvailable()) {
 // Even though the hook(s) return a TaskStatus, we only use two fields:
 // container_status and labels. Remaining fields are discarded.
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index 02b65a9..c147bfc 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -12040,10 +12040,16 @@ TEST_F(SlaveTest, DrainAgentKillsRunningTask)
 
   AWAIT_READY(updateSlaveMessage);
 
+  // Set the partition-aware capability to ensure that the terminal update 
state
+  // is TASK_GONE_BY_OPERATOR, since we will set `mark_gone = true`.
+  v1::FrameworkInfo frameworkInfo = v1::DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.add_capabilities()->set_type(
+  v1::FrameworkInfo::Capability::PARTITION_AWARE);
+
   auto scheduler = std::make_shared();
 
   EXPECT_CALL(*scheduler, connected(_))
-.WillOnce(v1::scheduler::SendSubscribe(v1::DEFAULT_FRAMEWORK_INFO));
+.WillOnce(v1::scheduler::SendSubscribe(frameworkInfo));
 
   Future subscribed;
   EXPECT_CALL(*scheduler, subscribed(_, _))
@@ -12160,10 +12166,16 @@ TEST_F(SlaveTest, DrainAgentKillsQueu

[mesos] branch master updated: Added documentation about standalone containers.

2019-09-09 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new b24ab0a  Added documentation about standalone containers.
b24ab0a is described below

commit b24ab0a97e0043cb4a06624f7593dec9e15f5661
Author: Joseph Wu 
AuthorDate: Mon Sep 9 10:37:57 2019 -0700

Added documentation about standalone containers.

This outlines some of the differences to expect from this
new type of container and shows some example API calls.

Review: https://reviews.apache.org/r/65112/
---
 docs/csi.md   |   2 +-
 docs/home.md  |   1 +
 docs/standalone-containers.md | 202 ++
 3 files changed, 204 insertions(+), 1 deletion(-)

diff --git a/docs/csi.md b/docs/csi.md
index 4a83581..aa03175 100644
--- a/docs/csi.md
+++ b/docs/csi.md
@@ -80,7 +80,7 @@ More details about SLRP can be found in the following 
[section](#storage-local-r
 
 CSI plugins are long-running [gRPC](https://grpc.io/) services, like daemons.
 Those CSI plugins are packaged as containers, and are launched by SLRPs using
-the [standalone containers](standalone-container.md) API from the agent.
+the [standalone containers](standalone-containers.md) API from the agent.
 Standalone containers can be launched without any tasks or executors. They use
 the same isolation mechanism provided by the agent for task and executor
 containers.
diff --git a/docs/home.md b/docs/home.md
index ad19919..4e62b4a 100644
--- a/docs/home.md
+++ b/docs/home.md
@@ -59,6 +59,7 @@ layout: documentation
 * [Container Sandboxes](sandbox.md)
 * [Container Volumes](container-volume.md)
 * [Nested Container and Task Group (Pod)](nested-container-and-task-group.md)
+* [Standalone Containers](standalone-containers.md)
 
 ## Networking
 * [Networking Overview](networking.md)
diff --git a/docs/standalone-containers.md b/docs/standalone-containers.md
new file mode 100644
index 0000000..1e1e306
--- /dev/null
+++ b/docs/standalone-containers.md
@@ -0,0 +1,202 @@
+---
+title: Apache Mesos - Standalone Containers
+layout: documentation
+---
+
+# Standalone Containers
+
+Traditionally, launching a container in a Mesos cluster involves
+communication between multiple components:
+
+```
+                                                 Container(s)
+  +-----------+     +--------+     +-------+     +----------+
+  | Framework | <-> | Master | <-> | Agent | <-> | Executor |
+  +-----------+     +--------+     +-------+     |  `->Task |
+                         ^                       +----------+
+                         |         +-------+     +----------+
+                         +------>  | Agent | <-> | Executor |
+                         |         +-------+     |  `->Task |
+                        ...                      +----------+
+```
+
+Mesos 1.5 introduced "Standalone Containers", which provide an alternate
+path for launching containers with a reduced scope and feature set:
+
+```
+                   +-------+    +----------------------+
+  Operator API <-> | Agent | -> | Standalone Container |
+                   +-------+    +----------------------+
+```
+
+**NOTE:** Agents currently require a connection to a Mesos master in
+order to accept any Operator API calls.  This limitation is not necessary
+and may be fixed in the future.
+
+**NOTE:** Standalone containers only apply to the Mesos containerizer.
+For standalone docker containers, use docker directly.
+
+As hinted by the diagrams, standalone containers are launched on single
+Agents, rather than cluster-wide.  This document describes the major
+differences between normal containers and standalone containers; and
+provides some examples of how to use the new Operator APIs.
+
+
+## Launching a Standalone Container
+
+Because standalone containers are launched directly on Mesos Agents,
+these containers do not participate in the Mesos Master's offer cycle.
+This means standalone containers can be launched regardless of resource
+allocation and can potentially overcommit the Mesos Agent, but cannot
+use reserved resources.
+
+An Operator API call might look like this:
+
+```
+LAUNCH_CONTAINER HTTP Request (JSON):
+
+POST /api/v1  HTTP/1.1
+
+Host: agenthost:5051
+Content-Type: application/json
+
+{
+  "type": "LAUNCH_CONTAINER",
+  "launch_container": {
+"container_id": {
+  "value": "my-standalone-container-id"
+},
+"command": {
+  "value": "sleep 100"
+},
+"resources": [
+  {
+"name": "cpus",
+"scalar": { "value": 2.0 },
+"type": "SCALAR"
+  },
+  {
+"name": "mem",
+ 

[mesos] 03/03: Fixed formatting in the upgrade docs.

2019-08-30 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch 1.9.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 5e79a584e6ec3e9e2f96e8bf418411df9dafac2e
Author: Greg Mann 
AuthorDate: Fri Aug 30 11:48:18 2019 -0300

Fixed formatting in the upgrade docs.
---
 docs/upgrades.md | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/docs/upgrades.md b/docs/upgrades.md
index d36a9a4..d745752 100644
--- a/docs/upgrades.md
+++ b/docs/upgrades.md
@@ -522,21 +522,22 @@ We categorize the changes as follows:
 
 
 
-  * A new `DRAINING` state has been added to Mesos agents. Once an agent is 
draining, all tasks running on that agent are gracefully
-killed and no offers for that agent are sent to schedulers, preventing the 
launching of new tasks.
-Operators can put an agent into `DRAINING` state by using the 
`DRAIN_AGENT` operator API call.
-See [`docs/maintenance`](maintenance.md) for details.
+* A new `DRAINING` state has been added to Mesos agents. Once an agent is 
draining, all tasks running on that agent are gracefully
+  killed and no offers for that agent are sent to schedulers, preventing the 
launching of new tasks.
+  Operators can put an agent into `DRAINING` state by using the `DRAIN_AGENT` 
operator API call.
+  See [`docs/maintenance`](maintenance.md) for details.
 
 
+
 * The Mesos agent now requires the new `AGENT_DRAINING` feature. This 
capability is set by default, but if the `--agent_features` flag is specified 
explicitly, `AGENT_DRAINING` must be included.
 
 
 
-  * A new [`linux/nnp`](isolators/linux-nnp.md) isolator has been added. The 
isolator supports setting of the `no_new_privs` bit in the container, 
preventing tasks from acquiring additional privileges.
+* A new [`linux/nnp`](isolators/linux-nnp.md) isolator has been added. The 
isolator supports setting of the `no_new_privs` bit in the container, 
preventing tasks from acquiring additional privileges.
 
 
 
-  * A new 
[`--docker_ignore_runtime`](configuration/agent.md#docker_ignore_runtime) flag 
has been added. This causes the agent to ignore any runtime configuration 
present in Docker images.
+* A new 
[`--docker_ignore_runtime`](configuration/agent.md#docker_ignore_runtime) flag 
has been added. This causes the agent to ignore any runtime configuration 
present in Docker images.
 
 
 



[mesos] branch 1.9.x updated (091f193 -> 5e79a58)

2019-08-30 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a change to branch 1.9.x
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from 091f193  Updated docs for the AGENT_DRAINING capability.
 new 9aa7dab  Removed experimental warning from UPDATE_QUOTA call.
 new 499a571  Updated upgrades.md to note quota limits changes.
 new 5e79a58  Fixed formatting in the upgrade docs.

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 docs/upgrades.md | 34 --
 include/mesos/master/master.proto|  4 
 include/mesos/v1/master/master.proto |  4 
 3 files changed, 28 insertions(+), 14 deletions(-)



[mesos] 01/03: Removed experimental warning from UPDATE_QUOTA call.

2019-08-30 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch 1.9.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 9aa7dab8061292d22ac870a2cf8856dca50e31d2
Author: Benjamin Mahler 
AuthorDate: Thu Aug 29 12:59:01 2019 -0400

Removed experimental warning from UPDATE_QUOTA call.

This is now a fully functional feature and is released in 1.9.

Review: https://reviews.apache.org/r/71411
---
 include/mesos/master/master.proto| 4 
 include/mesos/v1/master/master.proto | 4 
 2 files changed, 8 deletions(-)

diff --git a/include/mesos/master/master.proto 
b/include/mesos/master/master.proto
index 8386bd6..315809c 100644
--- a/include/mesos/master/master.proto
+++ b/include/mesos/master/master.proto
@@ -274,10 +274,6 @@ message Call {
 required SlaveID slave_id = 1;
   }
 
-  // EXPERIMENTAL DO NOT USE.
-  //
-  // This feature is not implementation complete.
-  //
   // Updates quota given the provided quota configurations, these 
configurations
   // are applied in an all-or-nothing manner.
   message UpdateQuota {
diff --git a/include/mesos/v1/master/master.proto 
b/include/mesos/v1/master/master.proto
index 893162d..5c99112 100644
--- a/include/mesos/v1/master/master.proto
+++ b/include/mesos/v1/master/master.proto
@@ -275,10 +275,6 @@ message Call {
 required AgentID agent_id = 1;
   }
 
-  // EXPERIMENTAL DO NOT USE.
-  //
-  // This feature is not implementation complete.
-  //
   // Updates quota given the provided quota configurations, these 
configurations
   // are applied in an all-or-nothing manner.
   message UpdateQuota {



[mesos] 02/03: Updated upgrades.md to note quota limits changes.

2019-08-30 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch 1.9.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 499a57188518e6d541f95e0d10ac264e6f83a735
Author: Benjamin Mahler 
AuthorDate: Thu Aug 29 12:59:54 2019 -0400

Updated upgrades.md to note quota limits changes.

In particular:

  * UPDATE_QUOTA replaces the old SET_QUOTA and REMOVE_QUOTA calls.

  * Quota guarantees are still functional, but deprecated in
preparation for optimistic offers.

Review: https://reviews.apache.org/r/71412
---
 docs/upgrades.md | 21 +
 1 file changed, 21 insertions(+)

diff --git a/docs/upgrades.md b/docs/upgrades.md
index 0345e22..d36a9a4 100644
--- a/docs/upgrades.md
+++ b/docs/upgrades.md
@@ -48,6 +48,7 @@ We categorize the changes as follows:
 
   
 
+  A Quota Limits
   A Linux NNP isolator
   A hostname_validation_scheme
   C TLS certificate 
verification behaviour
@@ -78,6 +79,10 @@ We categorize the changes as follows:
 
   
 
+  D SET_QUOTA and REMOVE_QUOTA deprecated
+in favor of UPDATE_QUOTA
+  D Quota guarantees deprecated in 
favor
+of using quota limits
 
   
 
@@ -516,6 +521,7 @@ We categorize the changes as follows:
 ## Upgrading from 1.8.x to 1.9.x ##
 
 
+
   * A new `DRAINING` state has been added to Mesos agents. Once an agent is 
draining, all tasks running on that agent are gracefully
 killed and no offers for that agent are sent to schedulers, preventing the 
launching of new tasks.
 Operators can put an agent into `DRAINING` state by using the 
`DRAIN_AGENT` operator API call.
@@ -525,17 +531,21 @@ We categorize the changes as follows:
 * The Mesos agent now requires the new `AGENT_DRAINING` feature. This 
capability is set by default, but if the `--agent_features` flag is specified 
explicitly, `AGENT_DRAINING` must be included.
 
 
+
   * A new [`linux/nnp`](isolators/linux-nnp.md) isolator has been added. The 
isolator supports setting of the `no_new_privs` bit in the container, 
preventing tasks from acquiring additional privileges.
 
 
+
   * A new 
[`--docker_ignore_runtime`](configuration/agent.md#docker_ignore_runtime) flag 
has been added. This causes the agent to ignore any runtime configuration 
present in Docker images.
 
 
+
 * A new libprocess TLS flag `--hostname_validation_scheme` along with the 
corresponding environment variable `LIBPROCESS_SSL_HOSTNAME_VALIDATION_SCHEME`
   has been added. Using this flag, users can configure the way libprocess 
performs hostname validation for TLS connections.
   See [`docs/ssl`](ssl.md) for details.
 
 
+
 * The semantics of the libprocess environment variables 
`LIBPROCESS_SSL_VERIFY_CERT` and `LIBPROCESS_SSL_REQUIRE_CERT` have been 
slightly updated such that
   the former now only applies to client-mode and the latter only to 
server-mode connections. As part of this re-adjustment, the following two 
changes have
   been introduced that might require changes for operators running Mesos in 
unusual TLS configurations.
@@ -548,8 +558,19 @@ We categorize the changes as follows:
 the `LIBPROCESS_SSL_REQUIRE_CERT` option is set to true.
 
 
+
 * The Mesos containerizer now supports configurable IPC namespace and 
/dev/shm. Container can be configured to have a private IPC namespace and 
/dev/shm or share them from its parent via the field `LinuxInfo.ipc_mode`, and 
the size of its private /dev/shm is also configurable via the field 
`LinuxInfo.shm_size`. Operators can control whether it is allowed to share 
host's IPC namespace and /dev/shm with top level containers via the agent flag 
`--disallow_sharing_agent_ipc_namespace`, and s [...]
 
+
+
+* The `SET_QUOTA` and `REMOVE_QUOTA` master calls are deprecated in favor of a 
new `UPDATE_QUOTA` master call.
+
+
+
+* Prior to Mesos 1.9, the quota related APIs only exposed quota "guarantees" 
which ensured a minimum amount of resources would be available to a role. 
Setting guarantees also set implicit quota limits. In Mesos 1.9+, quota limits 
are now exposed directly.
+  * Quota guarantees are now deprecated in favor of using only quota limits. 
Enforcement of quota guarantees required that Mesos holds back enough resources 
to meet all of the unsatisfied quota guarantees. Since Mesos is moving towards 
an optimistic offer model (to improve multi-role / multi-scheduler 
scalability, see MESOS-1607), it will become no longer possible to enforce 
quota guarantees by holding back resources. In such a model, quota limits are 
simple to enforce, but quota guaran [...]
+  * For these reasons, quota guarantees, while still functional in Mesos 1.9, 
are now deprecated. A combination of limits and priority based preemption will 
be simpler in an optimistic offer model.
+
 ## Upgrading from 1.7.x to 1.8.x ##
 
 



[mesos] branch master updated: Fixed formatting in the upgrade docs.

2019-08-30 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new e5bf1b6  Fixed formatting in the upgrade docs.
e5bf1b6 is described below

commit e5bf1b61ec140f70ad90d522dc37a4ea82554221
Author: Greg Mann 
AuthorDate: Fri Aug 30 11:48:18 2019 -0300

Fixed formatting in the upgrade docs.
---
 docs/upgrades.md | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/docs/upgrades.md b/docs/upgrades.md
index d36a9a4..d745752 100644
--- a/docs/upgrades.md
+++ b/docs/upgrades.md
@@ -522,21 +522,22 @@ We categorize the changes as follows:
 
 
 
-  * A new `DRAINING` state has been added to Mesos agents. Once an agent is 
draining, all tasks running on that agent are gracefully
-killed and no offers for that agent are sent to schedulers, preventing the 
launching of new tasks.
-Operators can put an agent into `DRAINING` state by using the 
`DRAIN_AGENT` operator API call.
-See [`docs/maintenance`](maintenance.md) for details.
+* A new `DRAINING` state has been added to Mesos agents. Once an agent is 
draining, all tasks running on that agent are gracefully
+  killed and no offers for that agent are sent to schedulers, preventing the 
launching of new tasks.
+  Operators can put an agent into `DRAINING` state by using the `DRAIN_AGENT` 
operator API call.
+  See [`docs/maintenance`](maintenance.md) for details.
 
 
+
 * The Mesos agent now requires the new `AGENT_DRAINING` feature. This 
capability is set by default, but if the `--agent_features` flag is specified 
explicitly, `AGENT_DRAINING` must be included.
 
 
 
-  * A new [`linux/nnp`](isolators/linux-nnp.md) isolator has been added. The 
isolator supports setting of the `no_new_privs` bit in the container, 
preventing tasks from acquiring additional privileges.
+* A new [`linux/nnp`](isolators/linux-nnp.md) isolator has been added. The 
isolator supports setting of the `no_new_privs` bit in the container, 
preventing tasks from acquiring additional privileges.
 
 
 
-  * A new 
[`--docker_ignore_runtime`](configuration/agent.md#docker_ignore_runtime) flag 
has been added. This causes the agent to ignore any runtime configuration 
present in Docker images.
+* A new 
[`--docker_ignore_runtime`](configuration/agent.md#docker_ignore_runtime) flag 
has been added. This causes the agent to ignore any runtime configuration 
present in Docker images.
 
 
 



[mesos] branch 1.9.x updated: Updated docs for the AGENT_DRAINING capability.

2019-08-30 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch 1.9.x
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/1.9.x by this push:
 new 091f193  Updated docs for the AGENT_DRAINING capability.
091f193 is described below

commit 091f193685142e5187a4f84d55ed1e28ac34749c
Author: Greg Mann 
AuthorDate: Fri Aug 30 10:23:10 2019 -0300

Updated docs for the AGENT_DRAINING capability.

Review: https://reviews.apache.org/r/71405/
---
 docs/configuration/agent.md | 7 ---
 docs/upgrades.md| 4 
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/docs/configuration/agent.md b/docs/configuration/agent.md
index 760d22b..91e38c2 100644
--- a/docs/configuration/agent.md
+++ b/docs/configuration/agent.md
@@ -92,8 +92,8 @@ Example:
   
   
 JSON representation of agent features to whitelist. We always require
-'MULTI_ROLE', 'HIERARCHICAL_ROLE', 'RESERVATION_REFINEMENT', and
-'AGENT_OPERATION_FEEDBACK'.
+'MULTI_ROLE', 'HIERARCHICAL_ROLE', 'RESERVATION_REFINEMENT',
+'AGENT_OPERATION_FEEDBACK', and 'AGENT_DRAINING'.
 
 Example:
 
@@ -102,7 +102,8 @@ Example:
 {"type": "MULTI_ROLE"},
 {"type": "HIERARCHICAL_ROLE"},
 {"type": "RESERVATION_REFINEMENT"},
-{"type": "AGENT_OPERATION_FEEDBACK"}
+{"type": "AGENT_OPERATION_FEEDBACK"},
+{"type": "AGENT_DRAINING"}
 ]
 }
 
diff --git a/docs/upgrades.md b/docs/upgrades.md
index ded4a8d..0345e22 100644
--- a/docs/upgrades.md
+++ b/docs/upgrades.md
@@ -61,6 +61,7 @@ We categorize the changes as follows:
   A docker_ignore_runtime
   A disallow_sharing_agent_ipc_namespace
   A default_container_shm_size
+  C agent_features
 
   
 
@@ -520,6 +521,9 @@ We categorize the changes as follows:
 Operators can put an agent into `DRAINING` state by using the 
`DRAIN_AGENT` operator API call.
 See [`docs/maintenance`](maintenance.md) for details.
 
+
+* The Mesos agent now requires the new `AGENT_DRAINING` feature. This 
capability is set by default, but if the `--agent_features` flag is specified 
explicitly, `AGENT_DRAINING` must be included.
+
 
   * A new [`linux/nnp`](isolators/linux-nnp.md) isolator has been added. The 
isolator supports setting of the `no_new_privs` bit in the container, 
preventing tasks from acquiring additional privileges.
 



[mesos] branch master updated: Updated docs for the AGENT_DRAINING capability.

2019-08-30 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new 93963c4  Updated docs for the AGENT_DRAINING capability.
93963c4 is described below

commit 93963c46b803f59554e12afe9c66605600c7f5b8
Author: Greg Mann 
AuthorDate: Fri Aug 30 10:23:10 2019 -0300

Updated docs for the AGENT_DRAINING capability.

Review: https://reviews.apache.org/r/71405/
---
 docs/configuration/agent.md | 7 ---
 docs/upgrades.md| 4 
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/docs/configuration/agent.md b/docs/configuration/agent.md
index 760d22b..91e38c2 100644
--- a/docs/configuration/agent.md
+++ b/docs/configuration/agent.md
@@ -92,8 +92,8 @@ Example:
   
   
 JSON representation of agent features to whitelist. We always require
-'MULTI_ROLE', 'HIERARCHICAL_ROLE', 'RESERVATION_REFINEMENT', and
-'AGENT_OPERATION_FEEDBACK'.
+'MULTI_ROLE', 'HIERARCHICAL_ROLE', 'RESERVATION_REFINEMENT',
+'AGENT_OPERATION_FEEDBACK', and 'AGENT_DRAINING'.
 
 Example:
 
@@ -102,7 +102,8 @@ Example:
 {"type": "MULTI_ROLE"},
 {"type": "HIERARCHICAL_ROLE"},
 {"type": "RESERVATION_REFINEMENT"},
-{"type": "AGENT_OPERATION_FEEDBACK"}
+{"type": "AGENT_OPERATION_FEEDBACK"},
+{"type": "AGENT_DRAINING"}
 ]
 }
 
diff --git a/docs/upgrades.md b/docs/upgrades.md
index 31f4a19..d36a9a4 100644
--- a/docs/upgrades.md
+++ b/docs/upgrades.md
@@ -62,6 +62,7 @@ We categorize the changes as follows:
   A docker_ignore_runtime
   A disallow_sharing_agent_ipc_namespace
   A default_container_shm_size
+  C agent_features
 
   
 
@@ -526,6 +527,9 @@ We categorize the changes as follows:
 Operators can put an agent into `DRAINING` state by using the 
`DRAIN_AGENT` operator API call.
 See [`docs/maintenance`](maintenance.md) for details.
 
+
+* The Mesos agent now requires the new `AGENT_DRAINING` feature. This 
capability is set by default, but if the `--agent_features` flag is specified 
explicitly, `AGENT_DRAINING` must be included.
+
 
 
   * A new [`linux/nnp`](isolators/linux-nnp.md) isolator has been added. The 
isolator supports setting of the `no_new_privs` bit in the container, 
preventing tasks from acquiring additional privileges.



[mesos] 01/02: Fixed a memory leak in the master's 'removeTask()' helper.

2019-08-20 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch 1.6.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 6a9cee7999be0a3a4f89d21ec58947fe90c01eeb
Author: Greg Mann 
AuthorDate: Tue Apr 23 22:25:21 2019 -0700

Fixed a memory leak in the master's 'removeTask()' helper.

Previously, all removed tasks were added to the
`slaves.unreachableTasks` map. This patch adds a conditional
so that removed tasks are only added to that structure when
they are being marked unreachable.

Review: https://reviews.apache.org/r/70518/
---
 src/master/master.cpp | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/master/master.cpp b/src/master/master.cpp
index 66e8e92..3b58964 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -11035,7 +11035,10 @@ void Master::removeTask(Task* task, bool unreachable)
   << " on agent " << *slave;
   }
 
-  slaves.unreachableTasks[slave->id].put(task->framework_id(), 
task->task_id());
+  if (unreachable) {
+slaves.unreachableTasks[slave->id].put(
+task->framework_id(), task->task_id());
+  }
 
   // Remove from framework.
   Framework* framework = getFramework(task->framework_id());



[mesos] branch 1.6.x updated (23020e1 -> c6da50d)

2019-08-20 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a change to branch 1.6.x
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from 23020e1  Fixed a compilation error on clang build.
 new 6a9cee7  Fixed a memory leak in the master's 'removeTask()' helper.
 new c6da50d  Transitioned tasks when an unreachable agent is marked as 
gone.

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/master/http.cpp |  10 +--
 src/master/master.cpp   | 105 +++---
 src/master/master.hpp   |   2 +-
 src/tests/api_tests.cpp | 196 
 4 files changed, 293 insertions(+), 20 deletions(-)



[mesos] 02/02: Transitioned tasks when an unreachable agent is marked as gone.

2019-08-20 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch 1.6.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit c6da50d10511a1046b8d4bc563dc3ccee875
Author: Greg Mann 
AuthorDate: Tue Apr 23 22:25:29 2019 -0700

Transitioned tasks when an unreachable agent is marked as gone.

This patch updates the master code responsible for marking
agents as gone to properly transition tasks on agents which
were previously marked as unreachable.

Review: https://reviews.apache.org/r/70519/
---
 src/master/http.cpp |  10 +--
 src/master/master.cpp   | 100 +---
 src/master/master.hpp   |   2 +-
 src/tests/api_tests.cpp | 196 
 4 files changed, 289 insertions(+), 19 deletions(-)

diff --git a/src/master/http.cpp b/src/master/http.cpp
index 0492b97..103b7f5 100644
--- a/src/master/http.cpp
+++ b/src/master/http.cpp
@@ -5225,15 +5225,7 @@ Future Master::Http::_markAgentGone(const 
SlaveID& slaveId) const
  << registrarResult.failure();
 }
 
-Slave* slave = master->slaves.registered.get(slaveId);
-
-// This can happen if the agent that is being marked as
-// gone is not currently registered (unreachable/recovered).
-if (slave == nullptr) {
-  return;
-}
-
-master->markGone(slave, goneTime);
+master->markGone(slaveId, goneTime);
   }));
 
   return gone.then([]() -> Future {
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 3b58964..804de69 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -8881,20 +8881,102 @@ void Master::_markUnreachable(
 }
 
 
-void Master::markGone(Slave* slave, const TimeInfo& goneTime)
+void Master::markGone(const SlaveID& slaveId, const TimeInfo& goneTime)
 {
-  CHECK_NOTNULL(slave);
-  CHECK(slaves.markingGone.contains(slave->info.id()));
-  slaves.markingGone.erase(slave->info.id());
+  CHECK(slaves.markingGone.contains(slaveId));
+
+  slaves.markingGone.erase(slaveId);
+
+  slaves.gone[slaveId] = goneTime;
+
+  const string message = "Agent has been marked gone";
+
+  Slave* slave = slaves.registered.get(slaveId);
 
-  slaves.gone[slave->id] = goneTime;
+  // If the `Slave` struct does not exist, then the agent
+  // must be either recovered or unreachable.
+  if (slave == nullptr) {
+CHECK(slaves.recovered.contains(slaveId) ||
+  slaves.unreachable.contains(slaveId));
+
+// When a recovered agent is marked gone, we have no task metadata to use 
in
+// order to send task status updates. We could retain this agent ID and 
send
+// updates upon reregistration but do not currently do this. See 
MESOS-9739.
+if (slaves.recovered.contains(slaveId)) {
+  return;
+}
+
+slaves.unreachable.erase(slaveId);
+
+// TODO(vinod): Consider moving these tasks into `completedTasks` by
+// transitioning them to a terminal state and sending status updates.
+// But it's not clear what this state should be. If a framework
+// reconciles these tasks after this point it would get `TASK_UNKNOWN`
+// which seems appropriate but we don't keep tasks in this state in-memory.
+if (slaves.unreachableTasks.contains(slaveId)) {
+  foreachkey (const FrameworkID& frameworkId,
+  slaves.unreachableTasks.at(slaveId)) {
+Framework* framework = getFramework(frameworkId);
+if (framework == nullptr) {
+  continue;
+}
+
+TaskState newTaskState = TASK_GONE_BY_OPERATOR;
+TaskStatus::Reason newTaskReason =
+  TaskStatus::REASON_SLAVE_REMOVED_BY_OPERATOR;
+
+if (!framework->capabilities.partitionAware) {
+  newTaskState = TASK_LOST;
+  newTaskReason = TaskStatus::REASON_SLAVE_REMOVED;
+}
+
+foreach (const TaskID& taskId,
+ slaves.unreachableTasks.at(slaveId).get(frameworkId)) {
+  if (framework->unreachableTasks.contains(taskId)) {
+const Owned& task = framework->unreachableTasks.at(taskId);
+
+const StatusUpdate& update = protobuf::createStatusUpdate(
+task->framework_id(),
+task->slave_id(),
+task->task_id(),
+newTaskState,
+TaskStatus::SOURCE_MASTER,
+None(),
+message,
+newTaskReason,
+(task->has_executor_id()
+   ? Option(task->executor_id())
+   : None()));
+
+updateTask(task.get(), update);
+
+if (!framework->connected()) {
+  LOG(WARNING) << "Dropping update " << update
+   << " for disconnected "
+   << " framework " << frameworkId;
+} else {

[mesos] 02/02: Transitioned tasks when an unreachable agent is marked as gone.

2019-08-20 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch 1.7.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 61f1155675bd3bc5312e0501ea6182d2ee7434af
Author: Greg Mann 
AuthorDate: Tue Apr 23 22:25:29 2019 -0700

Transitioned tasks when an unreachable agent is marked as gone.

This patch updates the master code responsible for marking
agents as gone to properly transition tasks on agents which
were previously marked as unreachable.

Review: https://reviews.apache.org/r/70519/
---
 src/master/http.cpp |  10 +--
 src/master/master.cpp   | 100 +---
 src/master/master.hpp   |   2 +-
 src/tests/api_tests.cpp | 196 
 4 files changed, 289 insertions(+), 19 deletions(-)

diff --git a/src/master/http.cpp b/src/master/http.cpp
index e2773ed..30dddc1 100644
--- a/src/master/http.cpp
+++ b/src/master/http.cpp
@@ -5331,15 +5331,7 @@ Future Master::Http::_markAgentGone(const 
SlaveID& slaveId) const
  << registrarResult.failure();
 }
 
-Slave* slave = master->slaves.registered.get(slaveId);
-
-// This can happen if the agent that is being marked as
-// gone is not currently registered (unreachable/recovered).
-if (slave == nullptr) {
-  return;
-}
-
-master->markGone(slave, goneTime);
+master->markGone(slaveId, goneTime);
   }));
 
   return gone.then([]() -> Future {
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 08a5133..1a95b69 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -8968,20 +8968,102 @@ void Master::_markUnreachable(
 }
 
 
-void Master::markGone(Slave* slave, const TimeInfo& goneTime)
+void Master::markGone(const SlaveID& slaveId, const TimeInfo& goneTime)
 {
-  CHECK_NOTNULL(slave);
-  CHECK(slaves.markingGone.contains(slave->info.id()));
-  slaves.markingGone.erase(slave->info.id());
+  CHECK(slaves.markingGone.contains(slaveId));
+
+  slaves.markingGone.erase(slaveId);
+
+  slaves.gone[slaveId] = goneTime;
+
+  const string message = "Agent has been marked gone";
+
+  Slave* slave = slaves.registered.get(slaveId);
 
-  slaves.gone[slave->id] = goneTime;
+  // If the `Slave` struct does not exist, then the agent
+  // must be either recovered or unreachable.
+  if (slave == nullptr) {
+CHECK(slaves.recovered.contains(slaveId) ||
+  slaves.unreachable.contains(slaveId));
+
+// When a recovered agent is marked gone, we have no task metadata to use 
in
+// order to send task status updates. We could retain this agent ID and 
send
+// updates upon reregistration but do not currently do this. See 
MESOS-9739.
+if (slaves.recovered.contains(slaveId)) {
+  return;
+}
+
+slaves.unreachable.erase(slaveId);
+
+// TODO(vinod): Consider moving these tasks into `completedTasks` by
+// transitioning them to a terminal state and sending status updates.
+// But it's not clear what this state should be. If a framework
+// reconciles these tasks after this point it would get `TASK_UNKNOWN`
+// which seems appropriate but we don't keep tasks in this state in-memory.
+if (slaves.unreachableTasks.contains(slaveId)) {
+  foreachkey (const FrameworkID& frameworkId,
+  slaves.unreachableTasks.at(slaveId)) {
+Framework* framework = getFramework(frameworkId);
+if (framework == nullptr) {
+  continue;
+}
+
+TaskState newTaskState = TASK_GONE_BY_OPERATOR;
+TaskStatus::Reason newTaskReason =
+  TaskStatus::REASON_SLAVE_REMOVED_BY_OPERATOR;
+
+if (!framework->capabilities.partitionAware) {
+  newTaskState = TASK_LOST;
+  newTaskReason = TaskStatus::REASON_SLAVE_REMOVED;
+}
+
+foreach (const TaskID& taskId,
+ slaves.unreachableTasks.at(slaveId).get(frameworkId)) {
+  if (framework->unreachableTasks.contains(taskId)) {
+const Owned& task = framework->unreachableTasks.at(taskId);
+
+const StatusUpdate& update = protobuf::createStatusUpdate(
+task->framework_id(),
+task->slave_id(),
+task->task_id(),
+newTaskState,
+TaskStatus::SOURCE_MASTER,
+None(),
+message,
+newTaskReason,
+(task->has_executor_id()
+   ? Option(task->executor_id())
+   : None()));
+
+updateTask(task.get(), update);
+
+if (!framework->connected()) {
+  LOG(WARNING) << "Dropping update " << update
+   << " for disconnected "
+   << " framework " << frameworkId;
+} else {

[mesos] 01/02: Fixed a memory leak in the master's 'removeTask()' helper.

2019-08-20 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch 1.7.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 0c5e78bc26653d26a03b08b82923ea517de46fc0
Author: Greg Mann 
AuthorDate: Tue Apr 23 22:25:21 2019 -0700

Fixed a memory leak in the master's 'removeTask()' helper.

Previously, all removed tasks were added to the
`slaves.unreachableTasks` map. This patch adds a conditional
so that removed tasks are only added to that structure when
they are being marked unreachable.

Review: https://reviews.apache.org/r/70518/
---
 src/master/master.cpp | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/master/master.cpp b/src/master/master.cpp
index 3f0c8c0..08a5133 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -11194,7 +11194,10 @@ void Master::removeTask(Task* task, bool unreachable)
   << " on agent " << *slave;
   }
 
-  slaves.unreachableTasks[slave->id].put(task->framework_id(), 
task->task_id());
+  if (unreachable) {
+slaves.unreachableTasks[slave->id].put(
+task->framework_id(), task->task_id());
+  }
 
   // Remove from framework.
   Framework* framework = getFramework(task->framework_id());



[mesos] 01/02: Fixed a memory leak in the master's 'removeTask()' helper.

2019-08-20 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch 1.8.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 6f90cc334701fad10e721312cd4cbd0690e1c6ec
Author: Greg Mann 
AuthorDate: Tue Apr 23 22:25:21 2019 -0700

Fixed a memory leak in the master's 'removeTask()' helper.

Previously, all removed tasks were added to the
`slaves.unreachableTasks` map. This patch adds a conditional
so that removed tasks are only added to that structure when
they are being marked unreachable.

Review: https://reviews.apache.org/r/70518/
---
 src/master/master.cpp | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/master/master.cpp b/src/master/master.cpp
index 5488b7b..9730e65 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -11780,7 +11780,10 @@ void Master::removeTask(Task* task, bool unreachable)
   << " on agent " << *slave;
   }
 
-  slaves.unreachableTasks[slave->id].put(task->framework_id(), 
task->task_id());
+  if (unreachable) {
+slaves.unreachableTasks[slave->id].put(
+task->framework_id(), task->task_id());
+  }
 
   // Remove from framework.
   Framework* framework = getFramework(task->framework_id());



[mesos] 02/02: Transitioned tasks when an unreachable agent is marked as gone.

2019-08-20 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch 1.8.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 13e4cd1c42ae88094f14d6b05cfb9832d4494193
Author: Greg Mann 
AuthorDate: Tue Apr 23 22:25:29 2019 -0700

Transitioned tasks when an unreachable agent is marked as gone.

This patch updates the master code responsible for marking
agents as gone to properly transition tasks on agents which
were previously marked as unreachable.

Review: https://reviews.apache.org/r/70519/
---
 src/master/http.cpp |  10 +--
 src/master/master.cpp   | 100 +---
 src/master/master.hpp   |   2 +-
 src/tests/api_tests.cpp | 196 
 4 files changed, 289 insertions(+), 19 deletions(-)

diff --git a/src/master/http.cpp b/src/master/http.cpp
index e7a92d0..765bbf1 100644
--- a/src/master/http.cpp
+++ b/src/master/http.cpp
@@ -4171,15 +4171,7 @@ Future Master::Http::_markAgentGone(const 
SlaveID& slaveId) const
  << registrarResult.failure();
 }
 
-Slave* slave = master->slaves.registered.get(slaveId);
-
-// This can happen if the agent that is being marked as
-// gone is not currently registered (unreachable/recovered).
-if (slave == nullptr) {
-  return;
-}
-
-master->markGone(slave, goneTime);
+master->markGone(slaveId, goneTime);
   }));
 
   return gone.then([]() -> Future {
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 9730e65..c9b0a38 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -9246,18 +9246,100 @@ void Master::_markUnreachable(
 }
 
 
-void Master::markGone(Slave* slave, const TimeInfo& goneTime)
+void Master::markGone(const SlaveID& slaveId, const TimeInfo& goneTime)
 {
-  CHECK_NOTNULL(slave);
-  CHECK(slaves.markingGone.contains(slave->info.id()));
-  slaves.markingGone.erase(slave->info.id());
+  CHECK(slaves.markingGone.contains(slaveId));
+
+  slaves.markingGone.erase(slaveId);
+
+  slaves.gone[slaveId] = goneTime;
+
+  const string message = "Agent has been marked gone";
+
+  Slave* slave = slaves.registered.get(slaveId);
 
-  slaves.gone[slave->id] = goneTime;
+  // If the `Slave` struct does not exist, then the agent
+  // must be either recovered or unreachable.
+  if (slave == nullptr) {
+CHECK(slaves.recovered.contains(slaveId) ||
+  slaves.unreachable.contains(slaveId));
+
+// When a recovered agent is marked gone, we have no task metadata to use 
in
+// order to send task status updates. We could retain this agent ID and 
send
+// updates upon reregistration but do not currently do this. See 
MESOS-9739.
+if (slaves.recovered.contains(slaveId)) {
+  return;
+}
+
+slaves.unreachable.erase(slaveId);
+
+// TODO(vinod): Consider moving these tasks into `completedTasks` by
+// transitioning them to a terminal state and sending status updates.
+// But it's not clear what this state should be. If a framework
+// reconciles these tasks after this point it would get `TASK_UNKNOWN`
+// which seems appropriate but we don't keep tasks in this state in-memory.
+if (slaves.unreachableTasks.contains(slaveId)) {
+  foreachkey (const FrameworkID& frameworkId,
+  slaves.unreachableTasks.at(slaveId)) {
+Framework* framework = getFramework(frameworkId);
+if (framework == nullptr) {
+  continue;
+}
+
+TaskState newTaskState = TASK_GONE_BY_OPERATOR;
+TaskStatus::Reason newTaskReason =
+  TaskStatus::REASON_SLAVE_REMOVED_BY_OPERATOR;
+
+if (!framework->capabilities.partitionAware) {
+  newTaskState = TASK_LOST;
+  newTaskReason = TaskStatus::REASON_SLAVE_REMOVED;
+}
+
+foreach (const TaskID& taskId,
+ slaves.unreachableTasks.at(slaveId).get(frameworkId)) {
+  if (framework->unreachableTasks.contains(taskId)) {
+const Owned& task = framework->unreachableTasks.at(taskId);
+
+const StatusUpdate& update = protobuf::createStatusUpdate(
+task->framework_id(),
+task->slave_id(),
+task->task_id(),
+newTaskState,
+TaskStatus::SOURCE_MASTER,
+None(),
+message,
+newTaskReason,
+(task->has_executor_id()
+   ? Option(task->executor_id())
+   : None()));
+
+updateTask(task.get(), update);
+
+if (!framework->connected()) {
+  LOG(WARNING) << "Dropping update " << update
+   << " for disconnected "
+   << " framework " << frameworkId;
+} else {

[mesos] branch 1.8.x updated (35bfd8a -> 13e4cd1)

2019-08-20 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a change to branch 1.8.x
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from 35bfd8a  Added MESOS-9925 to 1.8.2 CHANGELOG.
 new 6f90cc3  Fixed a memory leak in the master's 'removeTask()' helper.
 new 13e4cd1  Transitioned tasks when an unreachable agent is marked as 
gone.

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/master/http.cpp |  10 +--
 src/master/master.cpp   | 105 +++---
 src/master/master.hpp   |   2 +-
 src/tests/api_tests.cpp | 196 
 4 files changed, 293 insertions(+), 20 deletions(-)



[mesos] branch master updated: Updated maintenance docs to include agent draining.

2019-08-06 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new e6edb0a  Updated maintenance docs to include agent draining.
e6edb0a is described below

commit e6edb0a1cf9fd2f8ab7fcb5c291b7a14389118a6
Author: Greg Mann 
AuthorDate: Tue Aug 6 09:25:09 2019 +0200

Updated maintenance docs to include agent draining.

Review: https://reviews.apache.org/r/71219/
---
 docs/maintenance.md   | 137 +-
 docs/operator-http-api.md |  85 
 2 files changed, 219 insertions(+), 3 deletions(-)

diff --git a/docs/maintenance.md b/docs/maintenance.md
index bec69e0..10eeea1 100644
--- a/docs/maintenance.md
+++ b/docs/maintenance.md
@@ -1,9 +1,9 @@
 ---
-title: Apache Mesos - Maintenance Primitives
+title: Apache Mesos - Performing Maintenance
 layout: documentation
 ---
 
-# Maintenance Primitives
+# Performing Node Maintenance in a Mesos Cluster
 
 Operators regularly need to perform maintenance tasks on machines that comprise
 a Mesos cluster.  Most Mesos upgrades can be done without affecting running
@@ -14,6 +14,137 @@ For example:
 * Kernel upgrades
 * Agent upgrades (e.g., adjusting agent attributes or resources)
 
+Before performing maintenance on an agent node in a Mesos cluster, it is
+typically desirable to gracefully migrate tasks away from the node beforehand 
in
+order to minimize service disruption when the machine is taken down. Mesos
+provides several ways to accomplish this migration:
+
+* Automatic agent draining, which does not explicitly require cooperation from
+  schedulers
+* Manual node draining, which allows operators to exercise precise control over
+  the task draining process
+* Maintenance primitives, which permit complex coordination but do require that
+  schedulers react to the maintenance-related messages that they receive
+
+# Automatic Node Draining
+
+Node draining was added to provide a simple method for operators to drain tasks
+from nodes on which they plan to perform maintenance, without requiring that
+schedulers implement support for any maintenance-specific messages.
+
+Initiating draining will cause all tasks on the target agent node to receive a
+kill event immediately, assuming the agent is currently reachable. If the agent
+is unreachable, initiation of the kill event will be delayed until the agent is
+reachable by the master again. When a task receives a kill event, a SIGTERM
+signal will be sent to that task to begin the killing process. Depending on the
+particular task's behavior, this signal may be sufficient to terminate it. Some
+tasks may use this signal to begin the process of graceful termination, which
+may take some time. After some delay, a SIGKILL signal will be sent to the 
task,
+which forcefully terminates the task if it is still running. The delay between
+the SIGTERM and SIGKILL signals is determined by the length of the task's kill
+grace period. If no grace period is set for the task, a default value of 
several
+seconds will be used.
+
+## Initiating Draining on a Node
+
+To begin draining an agent, issue the operator API [`DRAIN_AGENT`
+call](operator-http-api.md#drain_agent) to the master:
+
+$ curl -X POST -d '{"type": "DRAIN_AGENT", "drain_agent": {"agent_id": 
{"value": "<mesos-agent-id>"}}}' masterhost:5050/api/v1
+
+This will immediately begin the process of killing all tasks on the agent. Once
+draining has begun, it cannot be cancelled. To monitor the progress of the
+draining process, you can inspect the state of the agent via the master 
operator
+API [`GET_STATE`](operator-http-api.md#get_state) or
+[`GET_AGENTS`](operator-http-api.md#get_agents) calls:
+
+$ curl -X POST -d '{"type": "GET_AGENTS"}' masterhost:5050/api/v1
+
+Locate the relevant agent and inspect its `drain_info.state` field. While
+draining, the state will be `DRAINING`. When all tasks on the agent have
+terminated, all their terminal status updates have been acknowledged by the
+schedulers, and all offer operations on the agent have finished, draining is
+complete and the agent's drain state will transition to `DRAINED`. At this
+point, the node may be taken down for maintenance.
+
+## Options for Automatic Node Draining
+
+You may set an upper bound on the kill grace period of draining tasks by
+specifying the `max_grace_period` option when draining:
+
+$ curl -X POST -d '{"type": "DRAIN_AGENT", "drain_agent": {"agent_id": 
{"value": "<mesos-agent-id>"}, "max_grace_period": "10mins"}}' 
masterhost:5050/api/v1
+
+In cases where you know that the node being drained will not return after
+draining is complete, and you would like it to be automatically permanently
+removed fr

[mesos] branch master updated: Added documentation for GET_OPERATIONS calls.

2019-07-30 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new 00bb0b6  Added documentation for GET_OPERATIONS calls.
00bb0b6 is described below

commit 00bb0b6d6abe7700a5adab0bdaf7e91767a2db19
Author: Greg Mann 
AuthorDate: Tue Jul 30 10:58:36 2019 +0200

Added documentation for GET_OPERATIONS calls.

Review: https://reviews.apache.org/r/71199
---
 docs/operator-http-api.md | 174 ++
 1 file changed, 174 insertions(+)

diff --git a/docs/operator-http-api.md b/docs/operator-http-api.md
index 1167838..ddf29e9 100644
--- a/docs/operator-http-api.md
+++ b/docs/operator-http-api.md
@@ -1661,6 +1661,93 @@ Content-Type: application/json
 
 ```
 
+### GET_OPERATIONS
+
+Returns a list of all offer operations throughout the cluster, not including
+`LAUNCH` or `LAUNCH_GROUP` operations which can be retrieved with `GET_TASKS`.
+
+```
+GET_OPERATIONS HTTP Request (JSON):
+
+POST /api/v1  HTTP/1.1
+
+Host: masterhost:5050
+Content-Type: application/json
+Accept: application/json
+
+{
+  "type": "GET_OPERATIONS"
+}
+
+
+GET_OPERATIONS HTTP Response (JSON):
+
+HTTP/1.1 200 OK
+
+Content-Type: application/json
+
+{
+  "type": "GET_OPERATIONS",
+  "get_operations": {
+"operations": [
+  {
+"framework_id": {"value": "74bddcbc-4a02-4d64-b291-aed52032055f-"},
+"agent_id": {"value": "18083noa-j287-dan4-9qx6-l02b84nksb7z-0021"},
+"info": {
+  "type": "CREATE_DISK",
+  "id": {"value": "n2j8nksj-9827-82bk-nd7u-83hbchu7whdk-9978"},
+  "create_disk": {
+"source": {
+  "provider_id": {"value": 
"837hfmi2-u2u7-19pp-1884-812i8f02828j-0030"},
+  "name": "disk",
+  "type": "SCALAR",
+  "scalar": {"value": 1024.0},
+  "role": "storage-role-1",
+  "allocation_info": {"role": "storage-role-1"},
+  "reservation": {
+"type": "DYNAMIC",
+"role": "storage-role-1",
+"principal": "storage-service"
+  },
+  "reservations": [{
+"type": "DYNAMIC",
+"role": "storage-role-1",
+"principal": "storage-service"
+  }],
+  "disk": {
+"source": {
+  "type": "RAW",
+  "vendor": "nas-service",
+  "id": "vol-19827509",
+  "profile": "fast-volume"
+}
+  }
+},
+"target_type": "MOUNT"
+  }
+},
+"latest_status": {
+  "operation_id": {"value": 
"n2j8nksj-9827-82bk-nd7u-83hbchu7whdk-9978"},
+  "state": "OPERATION_PENDING",
+  "uuid": {"value": "28987843-j288-1k0s-l29n-837ybzmo18tj-nv73"},
+  "agent_id": {"value": "18083noa-j287-dan4-9qx6-l02b84nksb7z-0021"},
+  "resource_provider_id": {"value": 
"837hfmi2-u2u7-19pp-1884-812i8f02828j-0030"}
+},
+"statuses": [{
+  "operation_id": {"value": 
"n2j8nksj-9827-82bk-nd7u-83hbchu7whdk-9978"},
+  "state": "OPERATION_PENDING",
+  "uuid": {"value": "28987843-j288-1k0s-l29n-837ybzmo18tj-nv73"},
+  "agent_id": {"value": "18083noa-j287-dan4-9qx6-l02b84nksb7z-0021"},
+  "resource_provider_id": {"value": 
"837hfmi2-u2u7-19pp-1884-812i8f02828j-0030"}
+}],
+"uuid": {"value": "nsj27802-jd82-jd19-jd38-837jdfnoqfij-u284"}
+  }
+]
+  }
+}
+
+```
+
 ### GET_WEIGHTS
 
 This call retrieves the information about role weights.
@@ -3590,6 +3677,93 @@ Content-Type: application/json
 
 ```
 
+### GET_OPERATIONS
+
+Returns a list of all offer operations known to the agent, not including
+`LAUNCH` or `LAUNCH_GROUP` operations which can be retrieved with `GET_TASKS`.
+
+```
+GET_OPERATIONS HTTP Request (JSON):
+
+POST /api/v1  HTTP/1.1

[mesos] branch master updated (4b15fbd -> 7e160a3)

2019-07-25 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from 4b15fbd  Exposed agent drain information in the webui.
 new a1e4a9a  Moved the Docker executor declaration into a header.
 new 4cbda17  Enabled the Docker executor to accept kill policy overrides.
 new 7e160a3  Added test to verify that Docker executor can override kill 
policy.

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/CMakeLists.txt |   4 +-
 src/Makefile.am|   3 +-
 src/docker/CMakeLists.txt  |  20 --
 src/docker/executor.cpp| 388 +++--
 src/docker/executor.hpp|  58 +++
 src/exec/exec.cpp  |  22 +-
 src/internal/evolve.cpp|   6 +
 src/internal/evolve.hpp|   1 +
 src/launcher/CMakeLists.txt|   5 +
 src/launcher/docker_executor.cpp   | 266 ++
 .../containerizer/docker_containerizer_tests.cpp   | 172 +
 11 files changed, 651 insertions(+), 294 deletions(-)
 delete mode 100644 src/docker/CMakeLists.txt
 create mode 100644 src/launcher/docker_executor.cpp



[mesos] 03/03: Added test to verify that Docker executor can override kill policy.

2019-07-25 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 7e160a36918ad73f79c05cb53a48b7424958e497
Author: Greg Mann 
AuthorDate: Thu Jul 25 12:17:45 2019 -0700

Added test to verify that Docker executor can override kill policy.

This adds a test which verifies that when a scheduler attempts to
override a task's default kill policy, the Docker executor will
honor that override.

Review: https://reviews.apache.org/r/71035/
---
 src/internal/evolve.cpp|   6 +
 src/internal/evolve.hpp|   1 +
 .../containerizer/docker_containerizer_tests.cpp   | 172 +
 3 files changed, 179 insertions(+)

diff --git a/src/internal/evolve.cpp b/src/internal/evolve.cpp
index 81de15e..c5e4151 100644
--- a/src/internal/evolve.cpp
+++ b/src/internal/evolve.cpp
@@ -86,6 +86,12 @@ v1::AgentInfo evolve(const SlaveInfo& slaveInfo)
 }
 
 
+v1::ContainerInfo evolve(const ContainerInfo& containerInfo)
+{
+  return evolve(containerInfo);
+}
+
+
 v1::DomainInfo evolve(const DomainInfo& domainInfo)
 {
   return evolve(domainInfo);
diff --git a/src/internal/evolve.hpp b/src/internal/evolve.hpp
index ffbb342..e4e3ab4 100644
--- a/src/internal/evolve.hpp
+++ b/src/internal/evolve.hpp
@@ -62,6 +62,7 @@ namespace internal {
 // Helpers for evolving types between versions. Please add as necessary!
 v1::AgentID evolve(const SlaveID& slaveId);
 v1::AgentInfo evolve(const SlaveInfo& slaveInfo);
+v1::ContainerInfo evolve(const ContainerInfo& containerInfo);
 v1::DomainInfo evolve(const DomainInfo& domainInfo);
 v1::DrainInfo evolve(const DrainInfo& drainInfo);
 v1::ExecutorID evolve(const ExecutorID& executorId);
diff --git a/src/tests/containerizer/docker_containerizer_tests.cpp 
b/src/tests/containerizer/docker_containerizer_tests.cpp
index a621758..3d932a5 100644
--- a/src/tests/containerizer/docker_containerizer_tests.cpp
+++ b/src/tests/containerizer/docker_containerizer_tests.cpp
@@ -20,6 +20,8 @@
 
 #include 
 
+#include 
+
 #include 
 #include 
 #include 
@@ -5240,6 +5242,176 @@ TEST_F(HungDockerTest, 
ROOT_DOCKER_InspectHungDuringPull)
   driver.join();
 }
 
+
+// This test is disabled on windows due to the bash-specific
+// command used in the task below.
+TEST_F_TEMP_DISABLED_ON_WINDOWS(
+DockerContainerizerTest, ROOT_DOCKER_OverrideKillPolicy)
+{
+  Try> master = StartMaster();
+  ASSERT_SOME(master);
+
+  MockDocker* mockDocker =
+new MockDocker(tests::flags.docker, tests::flags.docker_socket);
+
+  Shared docker(mockDocker);
+
+  slave::Flags flags = CreateSlaveFlags();
+
+  Fetcher fetcher(flags);
+
+  Try logger =
+ContainerLogger::create(flags.container_logger);
+
+  ASSERT_SOME(logger);
+
+  Future slaveRegisteredMessage =
+FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _);
+
+  MockDockerContainerizer dockerContainerizer(
+  flags,
+  ,
+  Owned(logger.get()),
+  docker);
+
+  Owned detector = master.get()->createDetector();
+
+  Try> slave =
+StartSlave(detector.get(), , flags);
+  ASSERT_SOME(slave);
+
+  AWAIT_READY(slaveRegisteredMessage);
+
+  auto scheduler = std::make_shared();
+
+  EXPECT_CALL(*scheduler, connected(_))
+.WillOnce(v1::scheduler::SendSubscribe(v1::DEFAULT_FRAMEWORK_INFO));
+
+  Future subscribed;
+  EXPECT_CALL(*scheduler, subscribed(_, _))
+.WillOnce(FutureArg<1>());
+
+  Future offers;
+  EXPECT_CALL(*scheduler, offers(_, _))
+.WillOnce(FutureArg<1>())
+.WillRepeatedly(Return());
+
+  EXPECT_CALL(*scheduler, heartbeat(_))
+.WillRepeatedly(Return()); // Ignore heartbeats.
+
+  v1::scheduler::TestMesos mesos(
+  master.get()->pid,
+  ContentType::PROTOBUF,
+  scheduler);
+
+  AWAIT_READY(subscribed);
+  v1::FrameworkID frameworkId(subscribed->framework_id());
+
+  AWAIT_READY(offers);
+  ASSERT_FALSE(offers->offers().empty());
+
+  const v1::Offer& offer = offers->offers(0);
+  const v1::AgentID& agentId = offer.agent_id();
+
+  Try parsed =
+v1::Resources::parse("cpus:0.1;mem:32;disk:32");
+
+  ASSERT_SOME(parsed);
+
+  v1::Resources resources = parsed.get();
+
+  // Create a task which ignores SIGTERM so that we can detect
+  // when the task receives SIGKILL.
+  v1::TaskInfo taskInfo = v1::createTask(
+  agentId,
+  resources,
+  "trap \"echo 'SIGTERM received'\" SIGTERM; sleep 99");
+
+  // TODO(tnachen): Use local image to test if possible.
+  taskInfo.mutable_container()->CopyFrom(
+  evolve(createDockerInfo(DOCKER_TEST_IMAGE)));
+
+  {
+// Set a long grace period on the task's kill policy so that we
+// can detect if the override is effective.
+mesos::v1::DurationInfo gracePeriod;
+gracePeriod.set_nanoseconds(Minutes(10).ns());
+
+mesos::v1::KillPolicy killPolicy;

[mesos] 01/03: Moved the Docker executor declaration into a header.

2019-07-25 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit a1e4a9aa1d6f2dee9fd56432122c2fa6a35edb77
Author: Greg Mann 
AuthorDate: Thu Jul 25 12:17:41 2019 -0700

Moved the Docker executor declaration into a header.

This moves the declaration of the Docker executor into the
Docker executor header file and moves the code for the Docker
executor binary into a new launcher implementation file.

This change will enable the Mesos executor driver
implementation to make use of the `DockerExecutor` symbol.

Review: https://reviews.apache.org/r/71033/
---
 src/CMakeLists.txt   |   4 +-
 src/Makefile.am  |   3 +-
 src/docker/CMakeLists.txt|  20 ---
 src/docker/executor.cpp  | 348 +--
 src/docker/executor.hpp  |  53 ++
 src/launcher/CMakeLists.txt  |   5 +
 src/launcher/docker_executor.cpp | 266 ++
 7 files changed, 409 insertions(+), 290 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index c455ed6..218a75e 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -263,7 +263,8 @@ set(CSI_SRC
   csi/volume_manager.cpp)
 
 set(DOCKER_SRC
-  docker/docker.cpp)
+  docker/docker.cpp
+  docker/executor.cpp)
 
 if (NOT WIN32)
   list(APPEND DOCKER_SRC
@@ -644,7 +645,6 @@ endif ()
 ##
 add_subdirectory(checks)
 add_subdirectory(cli)
-add_subdirectory(docker)
 add_subdirectory(examples)
 add_subdirectory(launcher)
 add_subdirectory(local)
diff --git a/src/Makefile.am b/src/Makefile.am
index 46c66f1..697ab10 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1092,6 +1092,7 @@ libmesos_no_3rdparty_la_SOURCES +=
\
   docker/docker.cpp\
   docker/docker.hpp\
   docker/executor.hpp  \
+  docker/executor.cpp  \
   docker/spec.cpp  \
   examples/flags.hpp   \
   examples/test_anonymous_module.hpp   \
@@ -1818,7 +1819,7 @@ mesos_usage_CPPFLAGS = $(MESOS_CPPFLAGS)
 mesos_usage_LDADD = libmesos.la $(LDADD)
 
 pkglibexec_PROGRAMS += mesos-docker-executor
-mesos_docker_executor_SOURCES = docker/executor.cpp
+mesos_docker_executor_SOURCES = launcher/docker_executor.cpp
 mesos_docker_executor_CPPFLAGS = $(MESOS_CPPFLAGS)
 mesos_docker_executor_LDADD = libmesos.la $(LDADD)
 
diff --git a/src/docker/CMakeLists.txt b/src/docker/CMakeLists.txt
deleted file mode 100644
index 1196664..0000000
--- a/src/docker/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# THE DOCKER EXECUTOR EXECUTABLE.
-#
-add_executable(mesos-docker-executor executor.cpp)
-target_link_libraries(mesos-docker-executor PRIVATE mesos)
diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index f638e4b..de8216f 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -856,291 +856,105 @@ private:
 };
 
 
-class DockerExecutor : public Executor
+DockerExecutor::DockerExecutor(
+const Owned& docker,
+const string& container,
+const string& sandboxDirectory,
+const string& mappedDirectory,
+const Duration& shutdownGracePeriod,
+const string& launcherDir,
+const map& taskEnvironment,
+const Option& defaultContainerDNS,
+bool cgroupsEnableCfs)
 {
-public:
-  DockerExecutor(
-  const Owned& docker,
-  const string& container,
-  const string& sandboxDirectory,
-  const string& mappedDirectory,
-  const Duration& shutdownGracePeriod,
-  const string& launcherDir,
-  const map& taskEnvironment,
-  const Option& defaultContainerDNS,
-  bool cgroupsEnableCfs)
-  {
-process = Owned(new D

[mesos] 02/03: Enabled the Docker executor to accept kill policy overrides.

2019-07-25 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 4cbda17f3667e2b0713a1f1663e50819a076680b
Author: Greg Mann 
AuthorDate: Thu Jul 25 12:17:43 2019 -0700

Enabled the Docker executor to accept kill policy overrides.

This adds a new `killTask()` overload to the Docker executor
and updates the Mesos executor driver to call into that
overload when the loaded executor is the Docker executor.

This allows the executor driver to pass the kill policy
override, when present, into the Docker executor.

Review: https://reviews.apache.org/r/71034/
---
 src/docker/executor.cpp | 48 
 src/docker/executor.hpp |  5 +
 src/exec/exec.cpp   | 22 ++
 3 files changed, 67 insertions(+), 8 deletions(-)

diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index de8216f..132f42b 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -396,15 +396,31 @@ public:
 defer(self(), ::launchHealthCheck, containerName, task));
   }
 
-  void killTask(ExecutorDriver* driver, const TaskID& taskId)
+  void killTask(
+  ExecutorDriver* driver,
+  const TaskID& taskId,
+  const Option& killPolicyOverride = None())
   {
-LOG(INFO) << "Received killTask for task " << taskId.value();
+string overrideMessage = "";
+if (killPolicyOverride.isSome() && killPolicyOverride->has_grace_period()) {
+  Duration gracePeriodDuration =
+Nanoseconds(killPolicyOverride->grace_period().nanoseconds());
+
+  overrideMessage =
+" with grace period override of " + stringify(gracePeriodDuration);
+}
+
+LOG(INFO) << "Received killTask" << overrideMessage
+  << " for task " << taskId.value();
 
 // Using shutdown grace period as a default is backwards compatible
 // with the `stop_timeout` flag, deprecated in 1.0.
 Duration gracePeriod = shutdownGracePeriod;
 
-if (killPolicy.isSome() && killPolicy->has_grace_period()) {
+if (killPolicyOverride.isSome() && killPolicyOverride->has_grace_period()) {
+  gracePeriod =
+Nanoseconds(killPolicyOverride->grace_period().nanoseconds());
+} else if (killPolicy.isSome() && killPolicy->has_grace_period()) {
   gracePeriod = Nanoseconds(killPolicy->grace_period().nanoseconds());
 }
 
@@ -929,7 +945,12 @@ void DockerExecutor::launchTask(ExecutorDriver* driver, 
const TaskInfo& task)
 
 void DockerExecutor::killTask(ExecutorDriver* driver, const TaskID& taskId)
 {
-  dispatch(process.get(), ::killTask, driver, taskId);
+  // Need to disambiguate overloaded function.
+  void (DockerExecutorProcess::*killTaskMethod)(
+  ExecutorDriver*, const TaskID&, const Option&)
+= ::killTask;
+
+  process::dispatch(process.get(), killTaskMethod, driver, taskId, None());
 }
 
 
@@ -955,6 +976,25 @@ void DockerExecutor::error(ExecutorDriver* driver, const 
string& data)
   dispatch(process.get(), ::error, driver, data);
 }
 
+
+void DockerExecutor::killTask(
+ExecutorDriver* driver,
+const TaskID& taskId,
+const Option& killPolicyOverride)
+{
+  // Need to disambiguate overloaded function.
+  void (DockerExecutorProcess::*killTaskMethod)(
+  ExecutorDriver*, const TaskID&, const Option&)
+= ::killTask;
+
+  process::dispatch(
+  process.get(),
+  killTaskMethod,
+  driver,
+  taskId,
+  killPolicyOverride);
+}
+
 } // namespace docker {
 } // namespace internal {
 } // namespace mesos {
diff --git a/src/docker/executor.hpp b/src/docker/executor.hpp
index dfb8ad0..768c2e1 100644
--- a/src/docker/executor.hpp
+++ b/src/docker/executor.hpp
@@ -151,6 +151,11 @@ public:
 
   void error(ExecutorDriver* driver, const std::string& data) override;
 
+  void killTask(
+  ExecutorDriver* driver,
+  const TaskID& taskId,
+  const Option& killPolicyOverride);
+
 private:
   process::Owned process;
 };
diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp
index c0fa3b6..67e082e 100644
--- a/src/exec/exec.cpp
+++ b/src/exec/exec.cpp
@@ -47,6 +47,8 @@
 
 #include "common/protobuf_utils.hpp"
 
+#include "docker/executor.hpp"
+
 #include "logging/flags.hpp"
 #include "logging/logging.hpp"
 
@@ -183,8 +185,7 @@ public:
 ::task);
 
 install(
-::killTask,
-::task_id);
+::killTask);
 
 install(
 ::statusUpdateAcknowledgement,
@@ -339,8 +340,10 @@ protected:
 VLOG(1) << "Executor::launchTask took " << stopwatch.elapsed();
   }
 
-  void killTask(const TaskID& taskId)
+  void killTask(KillTaskMessage&& killTa

[mesos] branch master updated: Exposed agent drain information in the webui.

2019-07-25 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new 4b15fbd  Exposed agent drain information in the webui.
4b15fbd is described below

commit 4b15fbdde14eed3a6dbd5c95d271bc26eb7216e2
Author: Benjamin Bannier 
AuthorDate: Thu Jul 25 09:35:40 2019 -0700

Exposed agent drain information in the webui.

Review: https://reviews.apache.org/r/71081/
---
 src/webui/app/agents/agent.html  | 10 ++
 src/webui/app/agents/agents.html |  2 ++
 src/webui/app/controllers.js | 11 +++
 3 files changed, 23 insertions(+)

diff --git a/src/webui/app/agents/agent.html b/src/webui/app/agents/agent.html
index 6d50bfd..25d233b 100644
--- a/src/webui/app/agents/agent.html
+++ b/src/webui/app/agents/agent.html
@@ -56,6 +56,16 @@
 
   
 
+  
+Draining
+
+  Mark gone:
+  {{agent.drain_config.mark_gone}}
+  Max. grace period:
+  {{agent.drain_config.max_grace_period.nanoseconds / 1000000000}} seconds
+
+  
+
   Tasks
   
 
diff --git a/src/webui/app/agents/agents.html b/src/webui/app/agents/agents.html
index 98712c6..0c6d330 100644
--- a/src/webui/app/agents/agents.html
+++ b/src/webui/app/agents/agents.html
@@ -13,6 +13,7 @@
 
   ID
   Host
+  State
   CPUs (Allocated / Total)
   GPUs (Allocated / Total)
   Mem (Allocated / Total)
@@ -34,6 +35,7 @@
   
 
 {{agent.hostname}}
+{{agent.state}}
 
   {{agent.used_resources.cpus | number}} / {{agent.resources.cpus | 
number}}
 
diff --git a/src/webui/app/controllers.js b/src/webui/app/controllers.js
index 66cd32e..725230f 100644
--- a/src/webui/app/controllers.js
+++ b/src/webui/app/controllers.js
@@ -198,6 +198,17 @@
 $scope.unreachable_agents = $scope.state.unreachable_slaves;
 
 _.each($scope.state.slaves, function(agent) {
+  // Calculate the agent "state" from activation and drain state.
+  if (!agent.deactivated) {
+agent.state = "Active";
+  } else if (agent.drain_info) {
+// Transform the drain state so only the first letter is capitalized.
+var s = agent.drain_info.state;
+agent.state = s.charAt(0).toUpperCase() + s.slice(1).toLowerCase();
+  } else {
+agent.state = "Deactivated";
+  }
+
   $scope.agents[agent.id] = agent;
   $scope.total_cpus += agent.resources.cpus;
   $scope.total_gpus += agent.resources.gpus;



[mesos] branch master updated: Fixed devolution of 'max_grace_period' field in DRAIN_AGENT call.

2019-07-22 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
 new ff8c9a9  Fixed devolution of 'max_grace_period' field in DRAIN_AGENT call.
ff8c9a9 is described below

commit ff8c9a96be6ae1ee47faf9d5b80a518dfb4a3db0
Author: Greg Mann 
AuthorDate: Mon Jul 22 18:48:56 2019 -0700

Fixed devolution of 'max_grace_period' field in DRAIN_AGENT call.

Review: https://reviews.apache.org/r/71140
---
 src/internal/devolve.cpp | 14 +-
 src/tests/api_tests.cpp  |  2 ++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/internal/devolve.cpp b/src/internal/devolve.cpp
index 2809c25..4527c52 100644
--- a/src/internal/devolve.cpp
+++ b/src/internal/devolve.cpp
@@ -280,7 +280,19 @@ mesos::agent::Response devolve(const v1::agent::Response& 
response)
 
 mesos::master::Call devolve(const v1::master::Call& call)
 {
-  return devolve(call);
+  mesos::master::Call _call = devolve(call);
+
+  // The `google.protobuf.Duration` field in the `DrainAgent` call does not get
+  // devolved automatically with the templated helper, so we devolve it
+  // explicitly here.
+  if (call.type() == v1::master::Call::DRAIN_AGENT &&
+  call.has_drain_agent() &&
+  call.drain_agent().has_max_grace_period()) {
+*_call.mutable_drain_agent()->mutable_max_grace_period() =
+  devolve(call.drain_agent().max_grace_period());
+  }
+
+  return _call;
 }
 
 } // namespace internal {
diff --git a/src/tests/api_tests.cpp b/src/tests/api_tests.cpp
index 3479ed3..641eb15 100644
--- a/src/tests/api_tests.cpp
+++ b/src/tests/api_tests.cpp
@@ -5588,6 +5588,7 @@ TEST_P(MasterAPITest, DrainAgent)
   {
 v1::master::Call::DrainAgent drainAgent;
 drainAgent.mutable_agent_id()->CopyFrom(agentId);
+drainAgent.mutable_max_grace_period()->set_seconds(0);
 
 v1::master::Call call;
 call.set_type(v1::master::Call::DRAIN_AGENT);
@@ -5605,6 +5606,7 @@ TEST_P(MasterAPITest, DrainAgent)
   mesos::v1::DrainInfo drainInfo;
   drainInfo.set_state(mesos::v1::DRAINED);
   drainInfo.mutable_config()->set_mark_gone(false);
+  drainInfo.mutable_config()->mutable_max_grace_period()->set_nanoseconds(0);
 
   // Ensure that the agent's drain info is reflected in the master's
   // GET_AGENTS response.



[mesos] 05/14: Updated an equality operator.

2019-07-15 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 27f0cd3519bafaf058e8347d482475e776d494e1
Author: Greg Mann 
AuthorDate: Mon Jul 15 10:25:47 2019 -0700

Updated an equality operator.

This patch updates the equality operator for the `Task`
message to include two missing conditions. An equality
operator for `HealthCheck` is also added to make this
possible.

Review: https://reviews.apache.org/r/70900/
---
 include/mesos/type_utils.hpp |  1 +
 src/common/type_utils.cpp| 10 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/include/mesos/type_utils.hpp b/include/mesos/type_utils.hpp
index ed9190b..b9e6164 100644
--- a/include/mesos/type_utils.hpp
+++ b/include/mesos/type_utils.hpp
@@ -62,6 +62,7 @@ bool operator==(
 bool operator==(const DiscoveryInfo& left, const DiscoveryInfo& right);
 bool operator==(const Environment& left, const Environment& right);
 bool operator==(const ExecutorInfo& left, const ExecutorInfo& right);
+bool operator==(const HealthCheck& left, const HealthCheck& right);
 bool operator==(const Label& left, const Label& right);
 bool operator==(const Labels& left, const Labels& right);
 bool operator==(const MasterInfo& left, const MasterInfo& right);
diff --git a/src/common/type_utils.cpp b/src/common/type_utils.cpp
index a7eb0e9..16d6657 100644
--- a/src/common/type_utils.cpp
+++ b/src/common/type_utils.cpp
@@ -400,6 +400,12 @@ bool operator!=(const ExecutorInfo& left, const 
ExecutorInfo& right)
 }
 
 
+bool operator==(const HealthCheck& left, const HealthCheck& right)
+{
+  return google::protobuf::util::MessageDifferencer::Equals(left, right);
+}
+
+
 bool operator==(const MasterInfo& left, const MasterInfo& right)
 {
   return left.id() == right.id() &&
@@ -575,7 +581,9 @@ bool operator==(const Task& left, const Task& right)
 left.status_update_uuid() == right.status_update_uuid() &&
 left.labels() == right.labels() &&
 left.discovery() == right.discovery() &&
-left.user() == right.user();
+left.user() == right.user() &&
+left.container() == right.container() &&
+left.health_check() == right.health_check();
 }
 
 



[mesos] branch master updated (a32fd27 -> c076c8c)

2019-07-15 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git.


from a32fd27  Updated 3 unit tests by changing IO switchboard to local mode.
 new 3c959eb  Added minimal agent handler for 'DrainSlaveMessage'.
 new 7d08b66  Added the DrainConfig to agent API outputs.
 new 04d25af  Added test for DrainConfig in agent API outputs.
 new ef19f29  Refactored the agent's task-killing code.
 new 27f0cd3  Updated an equality operator.
 new 3bb8287  Added kill policy to the 'Task' message.
 new e1c7985  Killed all tasks on the agent when draining.
 new 505928a  Added tests for task killing when draining the agent.
 new 1a32b31  Fixed pid checkpointing for `TestContainerizer`.
 new 54fb43e  Added recovery of agent drain information.
 new 1889268  Adjusted task status updates during draining.
 new a7044bd  Changed agent to fail task launches received during draining.
 new 654faf9  Cleared agent drain state when draining is finished.
 new c076c8c  Added test for agent to leave draining state on its own.

The 14 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 include/mesos/agent/agent.proto|   2 +
 include/mesos/mesos.proto  |   4 +
 include/mesos/type_utils.hpp   |   8 +
 include/mesos/v1/agent/agent.proto |   2 +
 include/mesos/v1/mesos.proto   |   4 +
 src/common/protobuf_utils.cpp  |   4 +
 src/common/type_utils.cpp  |  17 +-
 src/slave/http.cpp |  11 +
 src/slave/paths.cpp|   9 +
 src/slave/paths.hpp|   6 +
 src/slave/slave.cpp| 366 ++
 src/slave/slave.hpp|  31 +-
 src/slave/state.cpp|  16 +
 src/slave/state.hpp|   3 +
 src/tests/containerizer.cpp|  12 +
 src/tests/slave_tests.cpp  | 756 +
 16 files changed, 1173 insertions(+), 78 deletions(-)



[mesos] 08/14: Added tests for task killing when draining the agent.

2019-07-15 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 505928a3f51555bd3e45f2fc9787fdf890b28bfb
Author: Greg Mann 
AuthorDate: Mon Jul 15 10:25:56 2019 -0700

Added tests for task killing when draining the agent.

Review: https://reviews.apache.org/r/70904/
---
 src/tests/slave_tests.cpp | 335 ++
 1 file changed, 335 insertions(+)

diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index 8098a1a..147967d 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -94,6 +94,8 @@
 #include "tests/resources_utils.hpp"
 #include "tests/utils.hpp"
 
+#include "tests/containerizer/mock_containerizer.hpp"
+
 using namespace mesos::internal::slave;
 
 #ifdef USE_SSL_SOCKET
@@ -11881,6 +11883,339 @@ TEST_F(SlaveTest, DrainInfoInAPIOutputs)
   }
 }
 
+
+// When an agent receives a `DrainSlaveMessage`, it should kill running tasks.
+TEST_F(SlaveTest, DrainAgentKillsRunningTask)
+{
+  Clock::pause();
+
+  Try> master = StartMaster();
+  ASSERT_SOME(master);
+
+  Future updateSlaveMessage =
+FUTURE_PROTOBUF(UpdateSlaveMessage(), _, _);
+
+  StandaloneMasterDetector detector(master.get()->pid);
+
+  slave::Flags slaveFlags = CreateSlaveFlags();
+
+  Try> slave = StartSlave(, slaveFlags);
+  ASSERT_SOME(slave);
+
+  Clock::advance(slaveFlags.registration_backoff_factor);
+
+  AWAIT_READY(updateSlaveMessage);
+
+  auto scheduler = std::make_shared();
+
+  EXPECT_CALL(*scheduler, connected(_))
+.WillOnce(v1::scheduler::SendSubscribe(v1::DEFAULT_FRAMEWORK_INFO));
+
+  Future subscribed;
+  EXPECT_CALL(*scheduler, subscribed(_, _))
+.WillOnce(FutureArg<1>());
+
+  EXPECT_CALL(*scheduler, heartbeat(_))
+.WillRepeatedly(Return()); // Ignore heartbeats.
+
+  Future offers;
+  EXPECT_CALL(*scheduler, offers(_, _))
+.WillOnce(FutureArg<1>())
+.WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  v1::scheduler::TestMesos mesos(
+  master.get()->pid,
+  ContentType::PROTOBUF,
+  scheduler);
+
+  AWAIT_READY(subscribed);
+
+  v1::FrameworkID frameworkId(subscribed->framework_id());
+
+  AWAIT_READY(offers);
+  ASSERT_FALSE(offers->offers().empty());
+
+  const v1::Offer& offer = offers->offers(0);
+  const v1::AgentID& agentId = offer.agent_id();
+
+  Future startingUpdate;
+  Future runningUpdate;
+  EXPECT_CALL(*scheduler, update(_, _))
+.WillOnce(DoAll(
+FutureArg<1>(),
+v1::scheduler::SendAcknowledge(frameworkId, agentId)))
+.WillOnce(DoAll(
+FutureArg<1>(),
+v1::scheduler::SendAcknowledge(frameworkId, agentId)));
+
+  v1::Resources resources =
+v1::Resources::parse("cpus:0.1;mem:32;disk:32").get();
+
+  v1::TaskInfo taskInfo =
+v1::createTask(agentId, resources, SLEEP_COMMAND(1000));
+
+  v1::Offer::Operation launch = v1::LAUNCH({taskInfo});
+
+  mesos.send(
+  v1::createCallAccept(
+  frameworkId,
+  offer,
+  {launch}));
+
+  AWAIT_READY(startingUpdate);
+  EXPECT_EQ(v1::TASK_STARTING, startingUpdate->status().state());
+
+  AWAIT_READY(runningUpdate);
+  EXPECT_EQ(v1::TASK_RUNNING, runningUpdate->status().state());
+
+  Future killedUpdate;
+  EXPECT_CALL(*scheduler, update(_, _))
+.WillOnce(FutureArg<1>());
+
+  // Simulate the master sending a `DrainSlaveMessage` to the agent.
+
+  // Immediately kill the task forcefully.
+  DurationInfo maxGracePeriod;
+  maxGracePeriod.set_nanoseconds(0);
+
+  DrainConfig drainConfig;
+  drainConfig.set_mark_gone(true);
+  drainConfig.mutable_max_grace_period()->CopyFrom(maxGracePeriod);
+
+  DrainSlaveMessage drainSlaveMessage;
+  drainSlaveMessage.mutable_config()->CopyFrom(drainConfig);
+
+  process::post(master.get()->pid, slave.get()->pid, drainSlaveMessage);
+
+  AWAIT_READY(killedUpdate);
+
+  EXPECT_EQ(v1::TASK_KILLED, killedUpdate->status().state());
+}
+
+
+// When the agent receives a `DrainSlaveMessage`, it should kill queued tasks.
+TEST_F(SlaveTest, DrainAgentKillsQueuedTask)
+{
+  Clock::pause();
+
+  Try> master = StartMaster();
+  ASSERT_SOME(master);
+
+  Future updateSlaveMessage =
+FUTURE_PROTOBUF(UpdateSlaveMessage(), _, _);
+
+  MockContainerizer mockContainerizer;
+  StandaloneMasterDetector detector(master.get()->pid);
+  slave::Flags slaveFlags = CreateSlaveFlags();
+
+  EXPECT_CALL(mockContainerizer, recover(_))
+.WillOnce(Return(Nothing()));
+
+  EXPECT_CALL(mockContainerizer, containers())
+.WillOnce(Return(hashset()));
+
+  Try> slave = StartSlave(
+  ,
+  ,
+  slaveFlags);
+  ASSERT_SOME(slave);
+
+  Clock::advance(slaveFlags.registration_backoff_factor);
+
+  AWAIT_READY(updateSlaveMessage);
+
+  auto scheduler = std::make_shared();
+
+  EXPECT_CALL(*scheduler, connected(_))
+.

[mesos] 02/14: Added the DrainConfig to agent API outputs.

2019-07-15 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 7d08b667e446840dc31538d9d40705e3d8fb12a0
Author: Greg Mann 
AuthorDate: Mon Jul 15 10:25:35 2019 -0700

Added the DrainConfig to agent API outputs.

Review: https://reviews.apache.org/r/70835/
---
 include/mesos/agent/agent.proto|  2 ++
 include/mesos/v1/agent/agent.proto |  2 ++
 src/slave/http.cpp | 11 +++
 3 files changed, 15 insertions(+)

diff --git a/include/mesos/agent/agent.proto b/include/mesos/agent/agent.proto
index 83eb7bb..3cb622d 100644
--- a/include/mesos/agent/agent.proto
+++ b/include/mesos/agent/agent.proto
@@ -569,6 +569,8 @@ message Response {
   // Contains the agent's information.
   message GetAgent {
 optional SlaveInfo slave_info = 1;
+
+optional DrainConfig drain_config = 2;
   }
 
   // Lists information about all resource providers known to the agent
diff --git a/include/mesos/v1/agent/agent.proto 
b/include/mesos/v1/agent/agent.proto
index f6574cb..4324ad6 100644
--- a/include/mesos/v1/agent/agent.proto
+++ b/include/mesos/v1/agent/agent.proto
@@ -569,6 +569,8 @@ message Response {
   // Contains the agent's information.
   message GetAgent {
 optional AgentInfo agent_info = 1;
+
+optional DrainConfig drain_config = 2;
   }
 
   // Lists information about all resource providers known to the agent
diff --git a/src/slave/http.cpp b/src/slave/http.cpp
index 69e6d74..321dca7 100644
--- a/src/slave/http.cpp
+++ b/src/slave/http.cpp
@@ -1331,6 +1331,12 @@ Future Http::state(
   writer->field("domain", slave->info.domain());
 }
 
+if (slave->drainConfig.isSome()) {
+  writer->field(
+  "drain_config",
+  JSON::Protobuf(slave->drainConfig.get()));
+}
+
 const Resources& totalResources = slave->totalResources;
 
 writer->field("resources", totalResources);
@@ -1842,6 +1848,11 @@ Future Http::getAgent(
 
   response.mutable_get_agent()->mutable_slave_info()->CopyFrom(slave->info);
 
+  if (slave->drainConfig.isSome()) {
+response.mutable_get_agent()->mutable_drain_config()->CopyFrom(
+slave->drainConfig.get());
+  }
+
   return OK(serialize(acceptType, evolve(response)),
 stringify(acceptType));
 }



[mesos] 07/14: Killed all tasks on the agent when draining.

2019-07-15 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit e1c7985e96d84693f3e41d3a50da5f5ea11b6cd8
Author: Greg Mann 
AuthorDate: Mon Jul 15 10:25:51 2019 -0700

Killed all tasks on the agent when draining.

This patch updates the agent's `DrainSlaveMessage` handler
to kill all tasks on the agent when the message is received.

Review: https://reviews.apache.org/r/70903/
---
 include/mesos/type_utils.hpp |  6 +
 src/slave/slave.cpp  | 62 
 2 files changed, 68 insertions(+)

diff --git a/include/mesos/type_utils.hpp b/include/mesos/type_utils.hpp
index 2fd8a62..98a2995 100644
--- a/include/mesos/type_utils.hpp
+++ b/include/mesos/type_utils.hpp
@@ -338,6 +338,12 @@ inline bool operator<(const ContainerID& left, const 
ContainerID& right)
 }
 
 
+inline bool operator<(const DurationInfo& left, const DurationInfo& right)
+{
+  return left.nanoseconds() < right.nanoseconds();
+}
+
+
 inline bool operator<(const ExecutorID& left, const ExecutorID& right)
 {
   return left.value() < right.value();
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 741c1f6..19b4769 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -999,6 +999,68 @@ void Slave::drain(
 << "Failed to checkpoint DrainConfig";
 
   drainConfig = drainSlaveMessage.config();
+
+  const Option maxGracePeriod =
+drainConfig->has_max_grace_period()
+  ? drainConfig->max_grace_period()
+  : Option::none();
+
+  auto calculateKillPolicy =
+[&](const Option& killPolicy) -> Option {
+  if (maxGracePeriod.isNone()) {
+return None();
+  }
+
+  KillPolicy killPolicyOverride;
+  killPolicyOverride.mutable_grace_period()->CopyFrom(maxGracePeriod.get());
+
+  // Task kill policy is not set or unknown.
+  if (killPolicy.isNone() || !killPolicy->has_grace_period()) {
+return killPolicyOverride;
+  }
+
+  // Task kill policy is greater than the override.
+  if (maxGracePeriod.get() < killPolicy->grace_period()) {
+return killPolicyOverride;
+  }
+
+  return None();
+};
+
+  // Frameworks may be removed within `kill()` or `killPendingTask()` below,
+  // so we must copy them and their members before looping.
+  foreachvalue (Framework* framework, utils::copy(frameworks)) {
+typedef hashmap TaskMap;
+foreachvalue (const TaskMap& tasks, utils::copy(framework->pendingTasks)) {
+  foreachvalue (const TaskInfo& task, tasks) {
+killPendingTask(framework->id(), framework, task.task_id());
+  }
+}
+
+foreachvalue (Executor* executor, utils::copy(framework->executors)) {
+  foreachvalue (Task* task, executor->launchedTasks) {
+kill(framework->id(),
+ framework,
+ executor,
+ task->task_id(),
+ calculateKillPolicy(
+task->has_kill_policy()
+  ? task->kill_policy()
+  : Option::none()));
+  }
+
+  foreachvalue (const TaskInfo& task, utils::copy(executor->queuedTasks)) {
+kill(framework->id(),
+ framework,
+ executor,
+ task.task_id(),
+ calculateKillPolicy(
+task.has_kill_policy()
+  ? task.kill_policy()
+  : Option::none()));
+  }
+}
+  }
 }
 
 



[mesos] 13/14: Cleared agent drain state when draining is finished.

2019-07-15 Thread grag
This is an automated email from the ASF dual-hosted git repository.

grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 654faf9244b0016f8a17623aca7812923b3a313a
Author: Benjamin Bannier 
AuthorDate: Mon Jul 15 10:26:23 2019 -0700

Cleared agent drain state when draining is finished.

Once a draining agent has neither frameworks with pending tasks nor any
executors with either queued or launched tasks, it has finished draining.
This patch adds handling of that case, which clears both the in-memory
and persisted drain configuration.

Review: https://reviews.apache.org/r/70959/
---
 src/slave/slave.cpp | 31 +++
 src/slave/slave.hpp |  4 
 2 files changed, 35 insertions(+)

diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index eecd71e..2477975 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -7067,6 +7067,8 @@ void Slave::removeFramework(Framework* framework)
   // Pass ownership of the framework pointer.
   completedFrameworks.set(framework->id(), Owned(framework));
 
+  updateDrainStatus();
+
   if (state == TERMINATING && frameworks.empty()) {
 terminate(self());
   }
@@ -8944,6 +8946,8 @@ void Slave::removeOperation(Operation* operation)
 
   checkpointResourceState(
   totalResources.filter(mesos::needCheckpointing), false);
+
+  updateDrainStatus();
 }
 
 
@@ -9768,6 +9772,33 @@ void Slave::initializeResourceProviderManager(
 }
 
 
+void Slave::updateDrainStatus()
+{
+  if (drainConfig.isNone()) {
+return;
+  }
+
+  bool drained = operations.empty() && frameworks.empty();
+
+  if (!drained) {
+return;
+  }
+
+  LOG(INFO) << "Agent finished draining";
+
+  const string drainConfigPath = paths::getDrainConfigPath(metaDir, info.id());
+
+  Try rm = os::rm(drainConfigPath);
+
+  if (rm.isError()) {
+EXIT(EXIT_FAILURE) << "Could not remove persisted drain configuration "
+   << "'" << drainConfigPath << "': " << rm.error();
+  }
+
+  drainConfig = None();
+}
+
+
 Framework::Framework(
 Slave* _slave,
 const Flags& slaveFlags,
diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp
index 58bdd2a..58a5608 100644
--- a/src/slave/slave.hpp
+++ b/src/slave/slave.hpp
@@ -910,6 +910,10 @@ private:
   // If the agent is currently draining, contains the configuration used to
   // drain the agent. If NONE, the agent is not currently draining.
   Option drainConfig;
+
+  // Check whether draining is finished and possibly remove
+  // both in-memory and persisted drain configuration.
+  void updateDrainStatus();
 };
 
 



  1   2   3   4   5   6   >