This is an automated email from the ASF dual-hosted git repository. chhsiao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git
commit c2861ddac3eb875db70fae806db67abbc555b279 Author: Chun-Hung Hsiao <[email protected]> AuthorDate: Tue Feb 19 22:08:12 2019 -0800 Added a unit test to run Tensorflow on a GPU. Test `TensorflowGpuImage` leverages the Tensorflow GPU image, which is based on the `nvidia/cuda` image, to launch a task that consumes a GPU. Review: https://reviews.apache.org/r/70017 --- .../containerizer/nvidia_gpu_isolator_tests.cpp | 110 +++++++++++++++++++++ src/tests/mesos.hpp | 11 +++ 2 files changed, 121 insertions(+) diff --git a/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp b/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp index 040453e..b75c81c 100644 --- a/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp +++ b/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp @@ -332,6 +332,116 @@ TEST_F(NvidiaGpuTest, ROOT_INTERNET_CURL_CGROUPS_NVIDIA_GPU_NvidiaDockerImage) } +// This test verifies that we can enable the Nvidia GPU isolator and launch +// tasks to run the Tensorflow GPU image, which is based on Nvidia's image that +// has a special environment variable that indicates that we need to mount a +// volume containing the Nvidia libraries and binaries. +TEST_F(NvidiaGpuTest, ROOT_INTERNET_CURL_CGROUPS_NVIDIA_GPU_TensorflowGpuImage) +{ + Try<Owned<cluster::Master>> master = StartMaster(); + ASSERT_SOME(master); + + slave::Flags flags = CreateSlaveFlags(); + flags.isolation = "docker/runtime,filesystem/linux," + "cgroups/devices,gpu/nvidia"; + flags.image_providers = "docker"; + flags.nvidia_gpu_devices = vector<unsigned int>({0u}); + flags.resources = "cpus:1;mem:128;gpus:1"; + + Owned<MasterDetector> detector = master.get()->createDetector(); + + Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), flags); + ASSERT_SOME(slave); + + // NOTE: We use the default executor (and thus v1 API) in this test to avoid + // executor registration timing out due to fetching the Tensorflow GPU image + // over a slow connection. + auto scheduler = std::make_shared<v1::MockHTTPScheduler>(); + + v1::FrameworkInfo frameworkInfo = v1::DEFAULT_FRAMEWORK_INFO; + frameworkInfo.add_capabilities()->set_type( + v1::FrameworkInfo::Capability::GPU_RESOURCES); + + EXPECT_CALL(*scheduler, connected(_)) + .WillOnce(v1::scheduler::SendSubscribe(frameworkInfo)); + + Future<v1::scheduler::Event::Subscribed> subscribed; + EXPECT_CALL(*scheduler, subscribed(_, _)) + .WillOnce(FutureArg<1>(&subscribed)); + + Future<v1::scheduler::Event::Offers> offers; + EXPECT_CALL(*scheduler, offers(_, _)) + .WillOnce(FutureArg<1>(&offers)) + .WillRepeatedly(Return()); // Ignore subsequent offers. + + EXPECT_CALL(*scheduler, heartbeat(_)) + .WillRepeatedly(Return()); // Ignore heartbeats. + + EXPECT_CALL(*scheduler, failure(_, _)) + .Times(AtMost(2)); + + v1::scheduler::TestMesos mesos( + master.get()->pid, + ContentType::PROTOBUF, + scheduler); + + AWAIT_READY(subscribed); + + const v1::FrameworkID& frameworkId = subscribed->framework_id(); + + AWAIT_READY(offers); + ASSERT_FALSE(offers->offers().empty()); + + const v1::AgentID& agentId = offers->offers(0).agent_id(); + + mesos::v1::Image image; + image.set_type(mesos::v1::Image::DOCKER); + image.mutable_docker()->set_name("tensorflow/tensorflow:latest-gpu"); + + // Launch a task requesting 1 GPU and run a simple Tensorflow program. + v1::ExecutorInfo executor = v1::createExecutorInfo( + id::UUID::random().toString(), + None(), + "cpus:0.1;mem:32;disk:32", + v1::ExecutorInfo::DEFAULT, + frameworkId); + + v1::TaskInfo task = v1::createTask( + agentId, + v1::Resources::parse("cpus:0.1;mem:32;gpus:1").get(), + "python -c '" + "import tensorflow as tf;" + "tf.enable_eager_execution();" + "print(tf.reduce_sum(tf.random_normal([1000, 1000])));" + "'"); + + mesos::v1::ContainerInfo* container = task.mutable_container(); + container->set_type(mesos::v1::ContainerInfo::MESOS); + container->mutable_mesos()->mutable_image()->CopyFrom(image); + + EXPECT_CALL(*scheduler, update(_, TaskStatusUpdateStateEq(v1::TASK_STARTING))) + .WillOnce(v1::scheduler::SendAcknowledge(frameworkId, agentId)); + + EXPECT_CALL(*scheduler, update(_, TaskStatusUpdateStateEq(v1::TASK_RUNNING))) + .WillOnce(v1::scheduler::SendAcknowledge(frameworkId, agentId)); + + Future<v1::scheduler::Event::Update> terminalStatusUpdate; + EXPECT_CALL(*scheduler, update(_, TaskStatusUpdateIsTerminalState())) + .WillOnce(DoAll( + FutureArg<1>(&terminalStatusUpdate), + v1::scheduler::SendAcknowledge(frameworkId, agentId))); + + mesos.send(v1::createCallAccept( + frameworkId, + offers->offers(0), + {v1::LAUNCH_GROUP(executor, v1::createTaskGroupInfo({task}))})); + + // We wait up to 180 seconds to download the docker image. + AWAIT_READY_FOR(terminalStatusUpdate, Seconds(180)); + EXPECT_EQ(v1::TASK_FINISHED, terminalStatusUpdate->status().state()); +} + + // This test verifies correct failure semantics when // a task requests a fractional number of GPUs. TEST_F(NvidiaGpuTest, ROOT_CGROUPS_NVIDIA_GPU_FractionalResources) diff --git a/src/tests/mesos.hpp b/src/tests/mesos.hpp index f3f1e64..1a6d826 100644 --- a/src/tests/mesos.hpp +++ b/src/tests/mesos.hpp @@ -71,6 +71,9 @@ #include "authentication/executor/jwt_secret_generator.hpp" #include "common/http.hpp" +#include "common/protobuf_utils.hpp" + +#include "internal/devolve.hpp" #include "messages/messages.hpp" // For google::protobuf::Message. @@ -3668,6 +3671,14 @@ MATCHER_P(TaskStatusUpdateStateEq, taskState, "") } +// This matcher is used to match an `Event.update.status` message whose state is +// terminal. +MATCHER(TaskStatusUpdateIsTerminalState, "") +{ + return protobuf::isTerminalState(devolve(arg.status()).state()); +} + + // This matcher is used to match the task id of // `authorization::Request.Object.TaskInfo`. MATCHER_P(AuthorizationRequestHasTaskID, taskId, "")
