This is an automated email from the ASF dual-hosted git repository. penghui pushed a commit to branch branch-2.7 in repository https://gitbox.apache.org/repos/asf/pulsar.git
commit 05d414d642ccdec029f4deee8aafa2dbac4e4de8 Author: Michael Marshall <[email protected]> AuthorDate: Wed Feb 10 18:45:20 2021 -0700 Fix testBrokerSelectionForAntiAffinityGroup by increasing OverloadedThreshold (#9393) Fixes: #6368 Flaky-test: `AntiAffinityNamespaceGroupTest.testBrokerSelectionForAntiAffinityGroup` TestClass is flaky. The testMethod test method fails sporadically. See #6368 for example failures as well as for my analysis and detailed justification for this change. In short, by setting the `LoadBalancerBrokerOverloadedThresholdPercentage` to `100`, we remove the main edge case that allows two namespaces in the same anti-affinity group to get placed on the same broker. Note that I am assuming the following method will never return a value greater than 1, which could lead to test failure. https://github.com/apache/pulsar/blob/85f3ff4edbaa10c7894af8ad823cbce37b13829c/pulsar-common/src/main/java/org/apache/pulsar/policies/data/loadbalancer/LocalBrokerData.java#L214-L217 (cherry picked from commit 610b17d6433b0894a541421841b525741f8d5d40) --- .../loadbalance/impl/GenericBrokerHostUsageImpl.java | 19 ++++++++++++------- .../loadbalance/impl/LinuxBrokerHostUsageImpl.java | 7 ++++--- .../loadbalance/AntiAffinityNamespaceGroupTest.java | 11 +++++++++++ 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/GenericBrokerHostUsageImpl.java b/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/GenericBrokerHostUsageImpl.java index 9d3de7b..14c670c 100644 --- a/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/GenericBrokerHostUsageImpl.java +++ b/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/GenericBrokerHostUsageImpl.java @@ -52,8 +52,13 @@ public class GenericBrokerHostUsageImpl implements BrokerHostUsage { this.systemBean = (OperatingSystemMXBean) ManagementFactory.getOperatingSystemMXBean(); this.usage = new SystemResourceUsage(); this.totalCpuLimit = getTotalCpuLimit(); - executorService.scheduleAtFixedRate(this::checkCpuLoad, 0, CPU_CHECK_MILLIS, TimeUnit.MILLISECONDS); - executorService.scheduleAtFixedRate(this::doCalculateBrokerHostUsage, 0, hostUsageCheckIntervalMin, TimeUnit.MINUTES); + + // Call now to initialize values before the constructor returns + calculateBrokerHostUsage(); + executorService.scheduleAtFixedRate(this::checkCpuLoad, CPU_CHECK_MILLIS, + CPU_CHECK_MILLIS, TimeUnit.MILLISECONDS); + executorService.scheduleAtFixedRate(this::doCalculateBrokerHostUsage, hostUsageCheckIntervalMin, + hostUsageCheckIntervalMin, TimeUnit.MINUTES); } @Override @@ -61,7 +66,7 @@ public class GenericBrokerHostUsageImpl implements BrokerHostUsage { return usage; } - private void checkCpuLoad() { + private synchronized void checkCpuLoad() { cpuUsageSum += systemBean.getSystemCpuLoad(); cpuUsageCount++; } @@ -84,7 +89,10 @@ public class GenericBrokerHostUsageImpl implements BrokerHostUsage { return 100 * Runtime.getRuntime().availableProcessors(); } - private double getTotalCpuUsage() { + private synchronized double getTotalCpuUsage() { + if (cpuUsageCount == 0) { + return 0; + } double cpuUsage = cpuUsageSum / cpuUsageCount; cpuUsageSum = 0d; cpuUsageCount = 0; @@ -92,9 +100,6 @@ public class GenericBrokerHostUsageImpl implements BrokerHostUsage { } private ResourceUsage getCpuUsage() { - if (cpuUsageCount == 0) { - return new ResourceUsage(0, totalCpuLimit); - } return new ResourceUsage(getTotalCpuUsage() * totalCpuLimit, totalCpuLimit); } diff --git a/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/LinuxBrokerHostUsageImpl.java b/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/LinuxBrokerHostUsageImpl.java index 992b743..e18ff91 100644 --- a/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/LinuxBrokerHostUsageImpl.java +++ b/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/LinuxBrokerHostUsageImpl.java @@ -78,8 +78,6 @@ public class LinuxBrokerHostUsageImpl implements BrokerHostUsage { this.lastCollection = 0L; this.usage = new SystemResourceUsage(); this.overrideBrokerNicSpeedGbps = overrideBrokerNicSpeedGbps; - executorService.scheduleAtFixedRate(this::calculateBrokerHostUsage, 0, - hostUsageCheckIntervalMin, TimeUnit.MINUTES); boolean isCGroupsEnabled = false; try { @@ -87,9 +85,12 @@ public class LinuxBrokerHostUsageImpl implements BrokerHostUsage { } catch (Exception e) { log.warn("Failed to check cgroup CPU usage file: {}", e.getMessage()); } - this.isCGroupsEnabled = isCGroupsEnabled; + + // Call now to initialize values before the constructor returns calculateBrokerHostUsage(); + executorService.scheduleAtFixedRate(this::calculateBrokerHostUsage, hostUsageCheckIntervalMin, + hostUsageCheckIntervalMin, TimeUnit.MINUTES); } @Override diff --git a/pulsar-broker/src/test/java/org/apache/pulsar/broker/loadbalance/AntiAffinityNamespaceGroupTest.java b/pulsar-broker/src/test/java/org/apache/pulsar/broker/loadbalance/AntiAffinityNamespaceGroupTest.java index 6551172..2286477 100644 --- a/pulsar-broker/src/test/java/org/apache/pulsar/broker/loadbalance/AntiAffinityNamespaceGroupTest.java +++ b/pulsar-broker/src/test/java/org/apache/pulsar/broker/loadbalance/AntiAffinityNamespaceGroupTest.java @@ -113,6 +113,8 @@ public class AntiAffinityNamespaceGroupTest { config1.setFailureDomainsEnabled(true); config1.setLoadBalancerEnabled(true); config1.setAdvertisedAddress("localhost"); + // Don't want overloaded threshold to affect namespace placement + config1.setLoadBalancerBrokerOverloadedThresholdPercentage(400); createCluster(bkEnsemble.getZkClient(), config1); pulsar1 = new PulsarService(config1); pulsar1.start(); @@ -130,6 +132,8 @@ public class AntiAffinityNamespaceGroupTest { config2.setBrokerServicePort(Optional.of(0)); config2.setFailureDomainsEnabled(true); config2.setAdvertisedAddress("localhost"); + // Don't want overloaded threshold to affect namespace placement + config2.setLoadBalancerBrokerOverloadedThresholdPercentage(400); pulsar2 = new PulsarService(config2); pulsar2.start(); @@ -366,6 +370,13 @@ public class AntiAffinityNamespaceGroupTest { /** * It verifies anti-affinity with failure domain enabled with 2 brokers. * + * Note: in this class's setup method, the LoadBalancerBrokerOverloadedThresholdPercentage + * is set to 400 to ensure that the overloaded logic doesn't affect the broker selection. + * Without that configuration, two namespaces in the same anti-affinity group could + * be placed on the same broker. The CPU usage can be over 100%, and if we run with more + * than 4 cores, it could conceivably be above 400%. If this test becomes flaky again, + * look at the logs to see if there is a mention of an overloaded broker. + * * <pre> * 1. Register brokers to domain: domain-1: broker1 & domain-2: broker2 * 2. Load-Manager receives a watch and updates brokerToDomain cache with new domain data
