This is an automated email from the ASF dual-hosted git repository.

bmahler pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 554524a0e9b2137769a758a1d386b4ea035dc637
Author: Ilya Pronin <ipro...@twitter.com>
AuthorDate: Mon Oct 7 14:04:08 2019 -0700

    [FIX] Rate overflow in port_mapping isolator.
    
    The port mapping isolator currently uses uint32_t to store container
    rate limits as Bytes/s. The use of this data type is dictated by the current
    libnl interface, that the isolator uses to interface with netlink.
    
    The container preparation phase in the isolator involves setting up TX rate
    limiting HTB class using tc command that wants rate to be represented as
    bits/s. The conversion is performed by a simple multiplication of the
    rate by a literal 8. The result of such multiplication is uint32_t that
    can hold a maximum value of 4294967295, which is not enough to represent
    speeds available on modern 25G links as bits/s.
    
    This change fixes the described overflow issue. However, representing
    network rates as 32bit integers still presents a problem on hosts with
    more advanced links, e.g. 100G. This additional problem will be addressed
    separately with an upgrade to the recently released libnl 3.5.0, that
    features an updated interface with uint64_t rates.
---
 .../mesos/isolators/network/port_mapping.cpp       |  11 +-
 src/tests/containerizer/port_mapping_tests.cpp     | 122 +++++++++++++++++++++
 2 files changed, 129 insertions(+), 4 deletions(-)

diff --git a/src/slave/containerizer/mesos/isolators/network/port_mapping.cpp 
b/src/slave/containerizer/mesos/isolators/network/port_mapping.cpp
index 491546e5f..bb52f333e 100644
--- a/src/slave/containerizer/mesos/isolators/network/port_mapping.cpp
+++ b/src/slave/containerizer/mesos/isolators/network/port_mapping.cpp
@@ -5077,7 +5077,8 @@ Option<htb::cls::Config> 
PortMappingIsolatorProcess::egressHTBConfig(
   if (flags.egress_rate_limit_per_container.isSome()) {
     rate = flags.egress_rate_limit_per_container.get();
   } else if (egressRatePerCpu.isSome()) {
-    rate = egressRatePerCpu.get() * floor(resources.cpus().getOrElse(0));
+    rate = egressRatePerCpu.get() *
+      static_cast<uint64_t>(resources.cpus().getOrElse(0));
   } else {
     return None();
   }
@@ -5115,7 +5116,8 @@ Option<htb::cls::Config> 
PortMappingIsolatorProcess::ingressHTBConfig(
   if (flags.ingress_rate_limit_per_container.isSome()) {
     rate = flags.ingress_rate_limit_per_container.get();
   } else if (ingressRatePerCpu.isSome()) {
-    rate = ingressRatePerCpu.get() * floor(resources.cpus().getOrElse(0));
+    rate = ingressRatePerCpu.get() *
+      static_cast<uint64_t>(resources.cpus().getOrElse(0));
   } else {
     return None();
   }
@@ -5292,10 +5294,11 @@ string PortMappingIsolatorProcess::scripts(Info* info)
     script << "tc class add dev " << eth0 << " parent "
            << CONTAINER_TX_HTB_HANDLE << " classid "
            << CONTAINER_TX_HTB_CLASS_ID << " htb rate "
-           << info->egressConfig->rate * 8 << "bit";
+           << static_cast<uint64_t>(info->egressConfig->rate) * 8 << "bit";
     if (info->egressConfig->ceil.isSome()) {
       script << " ceil "
-             << info->egressConfig->ceil.get() * 8 << "bit";
+             << static_cast<uint64_t>(info->egressConfig->ceil.get()) * 8
+             << "bit";
       if (info->egressConfig->burst.isSome()) {
         // Use bytes here because tc command borks at bits.
         script << " burst " << info->egressConfig->burst.get() << "b";
diff --git a/src/tests/containerizer/port_mapping_tests.cpp 
b/src/tests/containerizer/port_mapping_tests.cpp
index ca7588723..9bc94aad6 100644
--- a/src/tests/containerizer/port_mapping_tests.cpp
+++ b/src/tests/containerizer/port_mapping_tests.cpp
@@ -1850,6 +1850,128 @@ TEST_F(PortMappingIsolatorTest, ROOT_ScaleEgressWithCPU)
 }
 
 
+// Test that egress rate limit scaling works correctly with high real-world
+// speeds.
+TEST_F(PortMappingIsolatorTest, ROOT_ScaleEgressWithCPULarge)
+{
+  flags.egress_rate_limit_per_container = None();
+  flags.egress_rate_per_cpu = "auto";
+
+  // Change available CPUs to be 64.
+  vector<string> resources = strings::split(flags.resources.get(), ";");
+  std::replace_if(
+      resources.begin(),
+      resources.end(),
+      [](const string& s) {return strings::startsWith(s, "cpus:");},
+      "cpus:64");
+  flags.resources = strings::join(";", resources);
+
+  const Bytes linkSpeed = 3125000000; // 25 Gbit/s
+  const Bytes ratePerCpu = linkSpeed / 64;
+  flags.network_link_speed = linkSpeed;
+
+  const Bytes minRate = 225000000; // 1.8 Gbit/s
+  flags.minimum_egress_rate_limit = minRate;
+
+  const Bytes maxRate = 2187500000; // 17.5 Gbit/s
+  flags.maximum_egress_rate_limit = maxRate;
+
+  // CPU high enough to be in linear scaling region and to trigger uint32_t
+  // overflow of scaled rate represented as bit/s: 16 * 3125000000 / 64 =
+  // 781250000 B/s or 6250000000 bits/s.
+  Try<Resources> linearCpu = Resources::parse("cpus:16;mem:1024;disk:1024");
+  ASSERT_SOME(linearCpu);
+
+  // CPU low enough for scaled network egress to be increased to the min limit:
+  // 4 * 3125000000 / 64 = 195312500 B/s
+  Try<Resources> lowCpu = Resources::parse("cpus:4;mem:1024;disk:1024");
+  ASSERT_SOME(lowCpu);
+
+  // CPU high enough for scaled network egress to be reduced to the max limit:
+  // 60 * 3125000000 / 64 = 2929687500 B/s.
+  Try<Resources> highCpu = Resources::parse("cpus:60;mem:1024;disk:1024");
+  ASSERT_SOME(highCpu);
+
+  Try<Isolator*> isolator = PortMappingIsolatorProcess::create(flags);
+  ASSERT_SOME(isolator);
+
+  Try<Launcher*> launcher = LinuxLauncher::create(flags);
+  ASSERT_SOME(launcher);
+
+  ExecutorInfo executorInfo;
+  executorInfo.mutable_resources()->CopyFrom(linearCpu.get());
+
+  ContainerID containerId1;
+  containerId1.set_value(id::UUID::random().toString());
+
+  ContainerConfig containerConfig1;
+  containerConfig1.mutable_executor_info()->CopyFrom(executorInfo);
+  containerConfig1.mutable_resources()->CopyFrom(executorInfo.resources());
+
+  Future<Option<ContainerLaunchInfo>> launchInfo1 =
+    isolator.get()->prepare(containerId1, containerConfig1);
+  AWAIT_READY(launchInfo1);
+  ASSERT_SOME(launchInfo1.get());
+  ASSERT_EQ(1, launchInfo1.get()->pre_exec_commands().size());
+
+  int pipes[2];
+  ASSERT_NE(-1, ::pipe(pipes));
+
+  Try<pid_t> pid = launchHelper(
+      launcher.get(),
+      pipes,
+      containerId1,
+      "touch " + container1Ready + " && sleep 1000",
+      launchInfo1.get());
+  ASSERT_SOME(pid);
+
+  // Reap the forked child.
+  Future<Option<int>> status = process::reap(pid.get());
+
+  // Continue in the parent.
+  ::close(pipes[0]);
+
+  // Isolate the forked child.
+  AWAIT_READY(isolator.get()->isolate(containerId1, pid.get()));
+
+  // Signal forked child to continue.
+  char dummy;
+  ASSERT_LT(0, ::write(pipes[1], &dummy, sizeof(dummy)));
+  ::close(pipes[1]);
+
+  // Wait for command to start to ensure all pre-exec scripts have
+  // executed.
+  ASSERT_TRUE(waitForFileCreation(container1Ready));
+
+  Result<htb::cls::Config> config = recoverHTBConfig(pid.get(), eth0, flags);
+  ASSERT_SOME(config);
+  ASSERT_EQ(ratePerCpu * 16, config->rate);
+
+  // Reduce CPU to get to hit the min limit.
+  Future<Nothing> update = isolator.get()->update(containerId1, lowCpu.get());
+  AWAIT_READY(update);
+
+  config = recoverHTBConfig(pid.get(), eth0, flags);
+  ASSERT_SOME(config);
+  ASSERT_EQ(minRate, config->rate);
+
+  // Increase CPU to hit the max limit.
+  update = isolator.get()->update(containerId1, highCpu.get());
+  AWAIT_READY(update);
+
+  config = recoverHTBConfig(pid.get(), eth0, flags);
+  ASSERT_SOME(config);
+  ASSERT_EQ(maxRate, config->rate);
+
+  // Kill the container
+  AWAIT_READY(launcher.get()->destroy(containerId1));
+  AWAIT_READY(isolator.get()->cleanup(containerId1));
+
+  delete launcher.get();
+  delete isolator.get();
+}
+
+
 TEST_F(PortMappingIsolatorTest, ROOT_ScaleEgressWithCPUAutoConfig)
 {
   flags.egress_rate_limit_per_container = None();

Reply via email to