This is an automated email from the ASF dual-hosted git repository. bmahler pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git
commit 554524a0e9b2137769a758a1d386b4ea035dc637 Author: Ilya Pronin <ipro...@twitter.com> AuthorDate: Mon Oct 7 14:04:08 2019 -0700 [FIX] Rate overflow in port_mapping isolator. The port mapping isolator currently uses uint32_t to store container rate limits as Bytes/s. The use of this data type is dictated by the current libnl interface, that the isolator uses to interface with netlink. The container preparation phase in the isolator involves setting up TX rate limiting HTB class using tc command that wants rate to be represented as bits/s. The conversion is performed by a simple multiplication of the rate by a literal 8. The result of such multiplication is uint32_t that can hold a maximum value of 4294967295, which is not enough to represent speeds available on modern 25G links as bits/s. This change fixes the described overflow issue. However, representing network rates as 32bit integers still presents a problem on hosts with more advanced links, e.g. 100G. This additional problem will be addressed separately with an upgrade to the recently released libnl 3.5.0, that features an updated interface with uint64_t rates. --- .../mesos/isolators/network/port_mapping.cpp | 11 +- src/tests/containerizer/port_mapping_tests.cpp | 122 +++++++++++++++++++++ 2 files changed, 129 insertions(+), 4 deletions(-) diff --git a/src/slave/containerizer/mesos/isolators/network/port_mapping.cpp b/src/slave/containerizer/mesos/isolators/network/port_mapping.cpp index 491546e5f..bb52f333e 100644 --- a/src/slave/containerizer/mesos/isolators/network/port_mapping.cpp +++ b/src/slave/containerizer/mesos/isolators/network/port_mapping.cpp @@ -5077,7 +5077,8 @@ Option<htb::cls::Config> PortMappingIsolatorProcess::egressHTBConfig( if (flags.egress_rate_limit_per_container.isSome()) { rate = flags.egress_rate_limit_per_container.get(); } else if (egressRatePerCpu.isSome()) { - rate = egressRatePerCpu.get() * floor(resources.cpus().getOrElse(0)); + rate = egressRatePerCpu.get() * + static_cast<uint64_t>(resources.cpus().getOrElse(0)); } else { return None(); } @@ -5115,7 +5116,8 @@ Option<htb::cls::Config> PortMappingIsolatorProcess::ingressHTBConfig( if (flags.ingress_rate_limit_per_container.isSome()) { rate = flags.ingress_rate_limit_per_container.get(); } else if (ingressRatePerCpu.isSome()) { - rate = ingressRatePerCpu.get() * floor(resources.cpus().getOrElse(0)); + rate = ingressRatePerCpu.get() * + static_cast<uint64_t>(resources.cpus().getOrElse(0)); } else { return None(); } @@ -5292,10 +5294,11 @@ string PortMappingIsolatorProcess::scripts(Info* info) script << "tc class add dev " << eth0 << " parent " << CONTAINER_TX_HTB_HANDLE << " classid " << CONTAINER_TX_HTB_CLASS_ID << " htb rate " - << info->egressConfig->rate * 8 << "bit"; + << static_cast<uint64_t>(info->egressConfig->rate) * 8 << "bit"; if (info->egressConfig->ceil.isSome()) { script << " ceil " - << info->egressConfig->ceil.get() * 8 << "bit"; + << static_cast<uint64_t>(info->egressConfig->ceil.get()) * 8 + << "bit"; if (info->egressConfig->burst.isSome()) { // Use bytes here because tc command borks at bits. script << " burst " << info->egressConfig->burst.get() << "b"; diff --git a/src/tests/containerizer/port_mapping_tests.cpp b/src/tests/containerizer/port_mapping_tests.cpp index ca7588723..9bc94aad6 100644 --- a/src/tests/containerizer/port_mapping_tests.cpp +++ b/src/tests/containerizer/port_mapping_tests.cpp @@ -1850,6 +1850,128 @@ TEST_F(PortMappingIsolatorTest, ROOT_ScaleEgressWithCPU) } +// Test that egress rate limit scaling works correctly with high real-world +// speeds. +TEST_F(PortMappingIsolatorTest, ROOT_ScaleEgressWithCPULarge) +{ + flags.egress_rate_limit_per_container = None(); + flags.egress_rate_per_cpu = "auto"; + + // Change available CPUs to be 64. + vector<string> resources = strings::split(flags.resources.get(), ";"); + std::replace_if( + resources.begin(), + resources.end(), + [](const string& s) {return strings::startsWith(s, "cpus:");}, + "cpus:64"); + flags.resources = strings::join(";", resources); + + const Bytes linkSpeed = 3125000000; // 25 Gbit/s + const Bytes ratePerCpu = linkSpeed / 64; + flags.network_link_speed = linkSpeed; + + const Bytes minRate = 225000000; // 1.8 Gbit/s + flags.minimum_egress_rate_limit = minRate; + + const Bytes maxRate = 2187500000; // 17.5 Gbit/s + flags.maximum_egress_rate_limit = maxRate; + + // CPU high enough to be in linear scaling region and to trigger uint32_t + // overflow of scaled rate represented as bit/s: 16 * 3125000000 / 64 = + // 781250000 B/s or 6250000000 bits/s. + Try<Resources> linearCpu = Resources::parse("cpus:16;mem:1024;disk:1024"); + ASSERT_SOME(linearCpu); + + // CPU low enough for scaled network egress to be increased to the min limit: + // 4 * 3125000000 / 64 = 195312500 B/s + Try<Resources> lowCpu = Resources::parse("cpus:4;mem:1024;disk:1024"); + ASSERT_SOME(lowCpu); + + // CPU high enough for scaled network egress to be reduced to the max limit: + // 60 * 3125000000 / 64 = 2929687500 B/s. + Try<Resources> highCpu = Resources::parse("cpus:60;mem:1024;disk:1024"); + ASSERT_SOME(highCpu); + + Try<Isolator*> isolator = PortMappingIsolatorProcess::create(flags); + ASSERT_SOME(isolator); + + Try<Launcher*> launcher = LinuxLauncher::create(flags); + ASSERT_SOME(launcher); + + ExecutorInfo executorInfo; + executorInfo.mutable_resources()->CopyFrom(linearCpu.get()); + + ContainerID containerId1; + containerId1.set_value(id::UUID::random().toString()); + + ContainerConfig containerConfig1; + containerConfig1.mutable_executor_info()->CopyFrom(executorInfo); + containerConfig1.mutable_resources()->CopyFrom(executorInfo.resources()); + + Future<Option<ContainerLaunchInfo>> launchInfo1 = + isolator.get()->prepare(containerId1, containerConfig1); + AWAIT_READY(launchInfo1); + ASSERT_SOME(launchInfo1.get()); + ASSERT_EQ(1, launchInfo1.get()->pre_exec_commands().size()); + + int pipes[2]; + ASSERT_NE(-1, ::pipe(pipes)); + + Try<pid_t> pid = launchHelper( + launcher.get(), + pipes, + containerId1, + "touch " + container1Ready + " && sleep 1000", + launchInfo1.get()); + ASSERT_SOME(pid); + + // Reap the forked child. + Future<Option<int>> status = process::reap(pid.get()); + + // Continue in the parent. + ::close(pipes[0]); + + // Isolate the forked child. + AWAIT_READY(isolator.get()->isolate(containerId1, pid.get())); + + // Signal forked child to continue. + char dummy; + ASSERT_LT(0, ::write(pipes[1], &dummy, sizeof(dummy))); + ::close(pipes[1]); + + // Wait for command to start to ensure all pre-exec scripts have + // executed. + ASSERT_TRUE(waitForFileCreation(container1Ready)); + + Result<htb::cls::Config> config = recoverHTBConfig(pid.get(), eth0, flags); + ASSERT_SOME(config); + ASSERT_EQ(ratePerCpu * 16, config->rate); + + // Reduce CPU to get to hit the min limit. + Future<Nothing> update = isolator.get()->update(containerId1, lowCpu.get()); + AWAIT_READY(update); + + config = recoverHTBConfig(pid.get(), eth0, flags); + ASSERT_SOME(config); + ASSERT_EQ(minRate, config->rate); + + // Increase CPU to hit the max limit. + update = isolator.get()->update(containerId1, highCpu.get()); + AWAIT_READY(update); + + config = recoverHTBConfig(pid.get(), eth0, flags); + ASSERT_SOME(config); + ASSERT_EQ(maxRate, config->rate); + + // Kill the container + AWAIT_READY(launcher.get()->destroy(containerId1)); + AWAIT_READY(isolator.get()->cleanup(containerId1)); + + delete launcher.get(); + delete isolator.get(); +} + + TEST_F(PortMappingIsolatorTest, ROOT_ScaleEgressWithCPUAutoConfig) { flags.egress_rate_limit_per_container = None();