This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
commit 0f3a98573a0f2457213ad29b05b5231254585431 Author: yujun <[email protected]> AuthorDate: Tue Aug 15 17:27:31 2023 +0800 [improvement](tablet clone) update the capacity coefficient for calculating backend load score (#22857) Update the capacity coefficient for calculating the backend load score: 1. Add fe config entry `backend_load_capacity_coeficient` to allow setting the capacity coefficient manually; 2. Adjust calculating the capacity coefficient as below. We emphasize disk usage for calculating load score. If a BE has a high used capacity percent, we should increase its load score. So we increase the capacity coefficient with a BE's used capacity percent. But this is not enough. For example, if the tablets have a big difference in data size, then for the two BEs below, their load scores may be the same: BE A: disk usage = 60%, replica number = 2000 (it contains the big tablets) BE B: disk usage = 30%, replica number = 4000 (it contains the small tablets) But what we want is: first move some big tablets from A to B; after their disk usages are close, move some small tablets from B to A; finally both their disk usages and replica numbers are close. To achieve this, when the max difference between all BEs' disk usages >= 30%, we set the capacity coefficient to 1.0 and avoid the effect of replica num. After the disk usage difference decreases, we decrease the capacity coefficient to make replica num effective. 
--- .../main/java/org/apache/doris/common/Config.java | 14 ++++ .../apache/doris/clone/BackendLoadStatistic.java | 87 ++++++++++++++++++---- .../apache/doris/clone/LoadStatisticForTag.java | 30 ++++++-- .../org/apache/doris/clone/DiskRebalanceTest.java | 1 + 4 files changed, 111 insertions(+), 21 deletions(-) diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index 7405f8ab1b..3578c38726 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -616,6 +616,20 @@ public class Config extends ConfigBase { "The high water of disk capacity used percent. This is used for calculating load score of a backend."}) public static double capacity_used_percent_high_water = 0.75; + @ConfField(mutable = true, masterOnly = true, description = { + "负载均衡时,磁盘使用率最大差值。", + "The max diff of disk capacity used percent between BE. " + + "It is used for calculating load score of a backend."}) + public static double used_capacity_percent_max_diff = 0.30; + + @ConfField(mutable = true, masterOnly = true, description = { + "设置固定的 BE 负载分数中磁盘使用率系数。BE 负载分数会综合磁盘使用率和副本数而得。有效值范围为[0, 1]," + + "当超出此范围时,则使用其他方法自动计算此系数。", + "Sets a fixed disk usage factor in the BE load fraction. The BE load score is a combination of disk usage " + + "and replica count. The valid value range is [0, 1]. When it is out of this range, other " + + "methods are used to automatically calculate this coefficient."}) + public static double backend_load_capacity_coeficient = -1.0; + @ConfField(mutable = true, masterOnly = true, description = { "ALTER TABLE 请求的最大超时时间。设置的足够长以适应表的数据量。", "Maximal timeout of ALTER TABLE request. 
Set long enough to fit your table data size."}) diff --git a/fe/fe-core/src/main/java/org/apache/doris/clone/BackendLoadStatistic.java b/fe/fe-core/src/main/java/org/apache/doris/clone/BackendLoadStatistic.java index f1391b26d2..fdf45afe81 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/clone/BackendLoadStatistic.java +++ b/fe/fe-core/src/main/java/org/apache/doris/clone/BackendLoadStatistic.java @@ -87,11 +87,14 @@ public class BackendLoadStatistic { private Tag tag; public static class LoadScore { - public double replicaNumCoefficient = 0.5; public double capacityCoefficient = 0.5; public double score = 0.0; public static final LoadScore DUMMY = new LoadScore(); + + public double getReplicaNumCoefficient() { + return 1.0 - capacityCoefficient; + } } private Map<TStorageMedium, Long> totalCapacityMap = Maps.newHashMap(); @@ -244,24 +247,27 @@ public class BackendLoadStatistic { } public void calcScore(Map<TStorageMedium, Double> avgClusterUsedCapacityPercentMap, + Map<TStorageMedium, Double> maxUsedPercentDiffMap, Map<TStorageMedium, Double> avgClusterReplicaNumPerBackendMap) { for (TStorageMedium medium : TStorageMedium.values()) { - LoadScore loadScore = calcSore(totalUsedCapacityMap.getOrDefault(medium, 0L), + LoadScore loadScore = calcScore(totalUsedCapacityMap.getOrDefault(medium, 0L), totalCapacityMap.getOrDefault(medium, 1L), totalReplicaNumMap.getOrDefault(medium, 0L), avgClusterUsedCapacityPercentMap.getOrDefault(medium, 0.0), + maxUsedPercentDiffMap.getOrDefault(medium, 0.0), avgClusterReplicaNumPerBackendMap.getOrDefault(medium, 0.0)); loadScoreMap.put(medium, loadScore); LOG.debug("backend {}, medium: {}, capacity coefficient: {}, replica coefficient: {}, load score: {}", - beId, medium, loadScore.capacityCoefficient, loadScore.replicaNumCoefficient, loadScore.score); + beId, medium, loadScore.capacityCoefficient, loadScore.getReplicaNumCoefficient(), + loadScore.score); } } - public static LoadScore calcSore(long beUsedCapacityB, long 
beTotalCapacityB, long beTotalReplicaNum, - double avgClusterUsedCapacityPercent, double avgClusterReplicaNumPerBackend) { + public static LoadScore calcScore(long beUsedCapacityB, long beTotalCapacityB, long beTotalReplicaNum, + double avgClusterUsedCapacityPercent, double maxUsedPercentDiff, double avgClusterReplicaNumPerBackend) { double usedCapacityPercent = (beUsedCapacityB / (double) beTotalCapacityB); double capacityProportion = avgClusterUsedCapacityPercent <= 0 ? 0.0 @@ -271,22 +277,71 @@ public class BackendLoadStatistic { LoadScore loadScore = new LoadScore(); - // If this backend's capacity used percent < 50%, set capacityCoefficient to 0.5. - // Else if capacity used percent > 75%, set capacityCoefficient to 1. - // Else, capacityCoefficient changed smoothly from 0.5 to 1 with used capacity increasing - // Function: (2 * usedCapacityPercent - 0.5) - if (!Config.be_rebalancer_fuzzy_test) { - loadScore.capacityCoefficient = usedCapacityPercent < 0.5 ? 0.5 - : (usedCapacityPercent > Config.capacity_used_percent_high_water ? 1.0 - : (2 * usedCapacityPercent - 0.5)); - loadScore.replicaNumCoefficient = 1 - loadScore.capacityCoefficient; + if (Config.backend_load_capacity_coeficient >= 0.0 && Config.backend_load_capacity_coeficient <= 1.0) { + loadScore.capacityCoefficient = Config.backend_load_capacity_coeficient; + } else if (!Config.be_rebalancer_fuzzy_test) { + // Emphasize disk usage for calculating load score. + // + // thisBeCapacityCoefficient: + // If this be has a high used capacity percent, we should increase its load score. + // So we increase capacity coefficient with this be's used capacity percent. + // + // Also we emphasize the difference of disk usage between all the BEs. + // For example, if the tablets have a big difference in data size. 
+ // Then for below two BEs, their load score maybe the same: + // BE A: disk usage = 60%, replica number = 2000 (it contains the big tablets) + // BE B: disk usage = 30%, replica number = 4000 (it contains the small tablets) + // + // But what we want is: firstly move some big tablets from A to B, after their disk usages are close, + // then move some small tablets from B to A, finally both of their disk usages and replica number + // are close. + // + // allBeCapacityCoefficient is used to achieve this, when the max difference between all BE's + // disk usages >= 30%, we set the capacity cofficient to 1.0 and avoid the affect of replica num. + // After the disk usage difference decrease, then decrease the capacity cofficient. + // + double thisBeCapacityCoefficient = getSmoothCofficient(usedCapacityPercent, 0.5, + Config.capacity_used_percent_high_water); + double allBeCapacityCoefficient = getSmoothCofficient(maxUsedPercentDiff, 0.05, + Config.used_capacity_percent_max_diff); + + loadScore.capacityCoefficient = Math.max(thisBeCapacityCoefficient, allBeCapacityCoefficient); } loadScore.score = capacityProportion * loadScore.capacityCoefficient - + replicaNumProportion * loadScore.replicaNumCoefficient; + + replicaNumProportion * loadScore.getReplicaNumCoefficient(); return loadScore; } + // the return cofficient: + // 1. percent <= percentLowWatermark: cofficient = 0.5; + // 2. percentLowWatermark < percent < percentHighWatermark: cofficient linear increase from 0.5 to 1.0; + // 3. 
percent >= percentHighWatermark: cofficient = 1.0; + public static double getSmoothCofficient(double percent, double percentLowWatermark, + double percentHighWatermark) { + final double lowCofficient = 0.5; + final double highCofficient = 1.0; + + // low watermark and high watermark equal, then return 0.75 + if (Math.abs(percentHighWatermark - percentLowWatermark) < 1e-6) { + return (lowCofficient + highCofficient) / 2; + } + if (percentLowWatermark > percentHighWatermark) { + double tmp = percentLowWatermark; + percentLowWatermark = percentHighWatermark; + percentHighWatermark = tmp; + } + if (percent <= percentLowWatermark) { + return lowCofficient; + } + if (percent >= percentHighWatermark) { + return highCofficient; + } + + return lowCofficient + (highCofficient - lowCofficient) + * (percent - percentLowWatermark) / (percentHighWatermark - percentLowWatermark); + } + public BalanceStatus isFit(long tabletSize, TStorageMedium medium, List<RootPathLoadStatistic> result, boolean isSupplement) { BalanceStatus status = new BalanceStatus(ErrCode.COMMON_ERROR); @@ -444,7 +499,7 @@ public class BackendLoadStatistic { info.add(String.valueOf(totalReplicaNumMap.getOrDefault(medium, 0L))); LoadScore loadScore = loadScoreMap.getOrDefault(medium, new LoadScore()); info.add(String.valueOf(loadScore.capacityCoefficient)); - info.add(String.valueOf(loadScore.replicaNumCoefficient)); + info.add(String.valueOf(loadScore.getReplicaNumCoefficient())); info.add(String.valueOf(loadScore.score)); info.add(clazzMap.getOrDefault(medium, Classification.INIT).name()); return info; diff --git a/fe/fe-core/src/main/java/org/apache/doris/clone/LoadStatisticForTag.java b/fe/fe-core/src/main/java/org/apache/doris/clone/LoadStatisticForTag.java index 4709575c69..224399f480 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/clone/LoadStatisticForTag.java +++ b/fe/fe-core/src/main/java/org/apache/doris/clone/LoadStatisticForTag.java @@ -53,6 +53,7 @@ public class LoadStatisticForTag { 
private final Map<TStorageMedium, Long> totalCapacityMap = Maps.newHashMap(); private final Map<TStorageMedium, Long> totalUsedCapacityMap = Maps.newHashMap(); private final Map<TStorageMedium, Long> totalReplicaNumMap = Maps.newHashMap(); + private final Map<TStorageMedium, Double> maxUsedPercentDiffMap = Maps.newHashMap(); private final Map<TStorageMedium, Double> avgUsedCapacityPercentMap = Maps.newHashMap(); private final Map<TStorageMedium, Double> avgReplicaNumPercentMap = Maps.newHashMap(); private final Map<TStorageMedium, Double> avgLoadScoreMap = Maps.newHashMap(); @@ -116,10 +117,27 @@ public class LoadStatisticForTag { / (double) totalCapacityMap.getOrDefault(medium, 1L)); avgReplicaNumPercentMap.put(medium, totalReplicaNumMap.getOrDefault(medium, 0L) / (double) backendNumMap.getOrDefault(medium, 1)); + + double maxUsedPercent = -1.0; + double minUsedPercent = -1.0; + for (BackendLoadStatistic beStatistic : beLoadStatistics) { + long beTotalCapacityB = beStatistic.getTotalCapacityB(medium); + long beTotalUsedCapacityB = beStatistic.getTotalUsedCapacityB(medium); + if (beTotalCapacityB > 0) { + double beUsedPercent = ((double) beTotalUsedCapacityB) / beTotalCapacityB; + if (maxUsedPercent < 0.0 || beUsedPercent > maxUsedPercent) { + maxUsedPercent = beUsedPercent; + } + if (minUsedPercent < 0.0 || beUsedPercent < minUsedPercent) { + minUsedPercent = beUsedPercent; + } + } + } + maxUsedPercentDiffMap.put(medium, maxUsedPercent - minUsedPercent); } for (BackendLoadStatistic beStatistic : beLoadStatistics) { - beStatistic.calcScore(avgUsedCapacityPercentMap, avgReplicaNumPercentMap); + beStatistic.calcScore(avgUsedCapacityPercentMap, maxUsedPercentDiffMap, avgReplicaNumPercentMap); } // classify all backends @@ -250,13 +268,15 @@ public class LoadStatisticForTag { currentSrcBeScore = srcBeStat.getLoadScore(medium); currentDestBeScore = destBeStat.getLoadScore(medium); - LoadScore newSrcBeScore = 
BackendLoadStatistic.calcSore(srcBeStat.getTotalUsedCapacityB(medium) - tabletSize, + LoadScore newSrcBeScore = BackendLoadStatistic.calcScore(srcBeStat.getTotalUsedCapacityB(medium) - tabletSize, srcBeStat.getTotalCapacityB(medium), srcBeStat.getReplicaNum(medium) - 1, - avgUsedCapacityPercentMap.get(medium), avgReplicaNumPercentMap.get(medium)); + avgUsedCapacityPercentMap.get(medium), maxUsedPercentDiffMap.get(medium), + avgReplicaNumPercentMap.get(medium)); - LoadScore newDestBeScore = BackendLoadStatistic.calcSore(destBeStat.getTotalUsedCapacityB(medium) + tabletSize, + LoadScore newDestBeScore = BackendLoadStatistic.calcScore(destBeStat.getTotalUsedCapacityB(medium) + tabletSize, destBeStat.getTotalCapacityB(medium), destBeStat.getReplicaNum(medium) + 1, - avgUsedCapacityPercentMap.get(medium), avgReplicaNumPercentMap.get(medium)); + avgUsedCapacityPercentMap.get(medium), maxUsedPercentDiffMap.get(medium), + avgReplicaNumPercentMap.get(medium)); double currentDiff = Math.abs(currentSrcBeScore - avgLoadScoreMap.get(medium)) + Math.abs(currentDestBeScore - avgLoadScoreMap.get(medium)); diff --git a/fe/fe-core/src/test/java/org/apache/doris/clone/DiskRebalanceTest.java b/fe/fe-core/src/test/java/org/apache/doris/clone/DiskRebalanceTest.java index 0c596a6f53..9d5ffe1e6d 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/clone/DiskRebalanceTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/clone/DiskRebalanceTest.java @@ -82,6 +82,7 @@ public class DiskRebalanceTest { @Before public void setUp() throws Exception { + Config.used_capacity_percent_max_diff = 1.0; db = new Database(1, "test db"); db.setClusterName(SystemInfoService.DEFAULT_CLUSTER); new Expectations() { --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
