This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 0f3a98573a0f2457213ad29b05b5231254585431
Author: yujun <[email protected]>
AuthorDate: Tue Aug 15 17:27:31 2023 +0800

    [improvement](tablet clone)  update the capacity coefficient for calculating 
backend load score (#22857)
    
    Update the capacity coefficient used for calculating the backend load score:
    1. Add the FE config entry `backend_load_capacity_coeficient` to allow
    setting the capacity coefficient manually;
    2. Adjust how the capacity coefficient is calculated, as described below.
    
    We emphasize disk usage when calculating the load score.
    If a BE has a high used capacity percent, we should increase its load score.
    So we increase the capacity coefficient with the BE's used capacity percent.
    
    But this is not enough. For example, if the tablets differ greatly in data
    size, then the two BEs below may have the same load score:
    BE A:  disk usage = 60%,  replica number = 2000  (it contains the big
    tablets)
    BE B:  disk usage = 30%,  replica number = 4000  (it contains the small
    tablets)
    
    But what we want is: first move some big tablets from A to B; once their
    disk usages are close, move some small tablets from B to A, so that finally
    both their disk usages and their replica numbers are close.
    
    To achieve this, when the max difference between all BEs' disk usages is
    >= 30%, we set the capacity coefficient to 1.0 to remove the effect of the
    replica num. After the disk usage difference decreases, we decrease the
    capacity coefficient so that the replica num takes effect again.
---
 .../main/java/org/apache/doris/common/Config.java  | 14 ++++
 .../apache/doris/clone/BackendLoadStatistic.java   | 87 ++++++++++++++++++----
 .../apache/doris/clone/LoadStatisticForTag.java    | 30 ++++++--
 .../org/apache/doris/clone/DiskRebalanceTest.java  |  1 +
 4 files changed, 111 insertions(+), 21 deletions(-)

diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java 
b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
index 7405f8ab1b..3578c38726 100644
--- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
@@ -616,6 +616,20 @@ public class Config extends ConfigBase {
             "The high water of disk capacity used percent. This is used for 
calculating load score of a backend."})
     public static double capacity_used_percent_high_water = 0.75;
 
+    @ConfField(mutable = true, masterOnly = true, description = {
+            "负载均衡时,磁盘使用率最大差值。",
+            "The max diff of disk capacity used percent between BE. "
+                    + "It is used for calculating load score of a backend."})
+    public static double used_capacity_percent_max_diff = 0.30;
+
+    @ConfField(mutable = true, masterOnly = true, description = {
+            "设置固定的 BE 负载分数中磁盘使用率系数。BE 负载分数会综合磁盘使用率和副本数而得。有效值范围为[0, 1],"
+                    + "当超出此范围时,则使用其他方法自动计算此系数。",
+            "Sets a fixed disk usage factor in the BE load fraction. The BE 
load score is a combination of disk usage "
+                    + "and replica count. The valid value range is [0, 1]. 
When it is out of this range, other "
+                    + "methods are used to automatically calculate this 
coefficient."})
+    public static double backend_load_capacity_coeficient = -1.0;
+
     @ConfField(mutable = true, masterOnly = true, description = {
             "ALTER TABLE 请求的最大超时时间。设置的足够长以适应表的数据量。",
             "Maximal timeout of ALTER TABLE request. Set long enough to fit 
your table data size."})
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/clone/BackendLoadStatistic.java 
b/fe/fe-core/src/main/java/org/apache/doris/clone/BackendLoadStatistic.java
index f1391b26d2..fdf45afe81 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/clone/BackendLoadStatistic.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/clone/BackendLoadStatistic.java
@@ -87,11 +87,14 @@ public class BackendLoadStatistic {
     private Tag tag;
 
     public static class LoadScore {
-        public double replicaNumCoefficient = 0.5;
         public double capacityCoefficient = 0.5;
         public double score = 0.0;
 
         public static final LoadScore DUMMY = new LoadScore();
+
+        public double getReplicaNumCoefficient() {
+            return 1.0 - capacityCoefficient;
+        }
     }
 
     private Map<TStorageMedium, Long> totalCapacityMap = Maps.newHashMap();
@@ -244,24 +247,27 @@ public class BackendLoadStatistic {
     }
 
     public void calcScore(Map<TStorageMedium, Double> 
avgClusterUsedCapacityPercentMap,
+            Map<TStorageMedium, Double> maxUsedPercentDiffMap,
             Map<TStorageMedium, Double> avgClusterReplicaNumPerBackendMap) {
 
         for (TStorageMedium medium : TStorageMedium.values()) {
-            LoadScore loadScore = 
calcSore(totalUsedCapacityMap.getOrDefault(medium, 0L),
+            LoadScore loadScore = 
calcScore(totalUsedCapacityMap.getOrDefault(medium, 0L),
                     totalCapacityMap.getOrDefault(medium, 1L),
                     totalReplicaNumMap.getOrDefault(medium, 0L),
                     avgClusterUsedCapacityPercentMap.getOrDefault(medium, 0.0),
+                    maxUsedPercentDiffMap.getOrDefault(medium, 0.0),
                     avgClusterReplicaNumPerBackendMap.getOrDefault(medium, 
0.0));
 
             loadScoreMap.put(medium, loadScore);
 
             LOG.debug("backend {}, medium: {}, capacity coefficient: {}, 
replica coefficient: {}, load score: {}",
-                    beId, medium, loadScore.capacityCoefficient, 
loadScore.replicaNumCoefficient, loadScore.score);
+                    beId, medium, loadScore.capacityCoefficient, 
loadScore.getReplicaNumCoefficient(),
+                    loadScore.score);
         }
     }
 
-    public static LoadScore calcSore(long beUsedCapacityB, long 
beTotalCapacityB, long beTotalReplicaNum,
-            double avgClusterUsedCapacityPercent, double 
avgClusterReplicaNumPerBackend) {
+    public static LoadScore calcScore(long beUsedCapacityB, long 
beTotalCapacityB, long beTotalReplicaNum,
+            double avgClusterUsedCapacityPercent, double maxUsedPercentDiff, 
double avgClusterReplicaNumPerBackend) {
 
         double usedCapacityPercent = (beUsedCapacityB / (double) 
beTotalCapacityB);
         double capacityProportion = avgClusterUsedCapacityPercent <= 0 ? 0.0
@@ -271,22 +277,71 @@ public class BackendLoadStatistic {
 
         LoadScore loadScore = new LoadScore();
 
-        // If this backend's capacity used percent < 50%, set 
capacityCoefficient to 0.5.
-        // Else if capacity used percent > 75%, set capacityCoefficient to 1.
-        // Else, capacityCoefficient changed smoothly from 0.5 to 1 with used 
capacity increasing
-        // Function: (2 * usedCapacityPercent - 0.5)
-        if (!Config.be_rebalancer_fuzzy_test) {
-            loadScore.capacityCoefficient = usedCapacityPercent < 0.5 ? 0.5
-                    : (usedCapacityPercent > 
Config.capacity_used_percent_high_water ? 1.0
-                            : (2 * usedCapacityPercent - 0.5));
-            loadScore.replicaNumCoefficient = 1 - 
loadScore.capacityCoefficient;
+        if (Config.backend_load_capacity_coeficient >= 0.0 && 
Config.backend_load_capacity_coeficient <= 1.0) {
+            loadScore.capacityCoefficient = 
Config.backend_load_capacity_coeficient;
+        } else if (!Config.be_rebalancer_fuzzy_test) {
+            // Emphasize disk usage for calculating load score.
+            //
+            // thisBeCapacityCoefficient:
+            //   If this be has a high used capacity percent, we should 
increase its load score.
+            //   So we increase capacity coefficient with this be's used 
capacity percent.
+            //
+            // Also we emphasize the difference of disk usage between all the 
BEs.
+            // For example, if the tablets have a big difference in data size.
+            // Then for below two BEs, their load score maybe the same:
+            // BE A:  disk usage = 60%,  replica number = 2000  (it contains 
the big tablets)
+            // BE B:  disk usage = 30%,  replica number = 4000  (it contains 
the small tablets)
+            //
+            // But what we want is: firstly move some big tablets from A to B, 
after their disk usages are close,
+            // then move some small tablets from B to A, finally both of their 
disk usages and replica number
+            // are close.
+            //
+            // allBeCapacityCoefficient is used to achieve this, when the max 
difference between all BE's
+            // disk usages >= 30%,  we set the capacity cofficient to 1.0 and 
avoid the affect of replica num.
+            // After the disk usage difference decrease, then decrease the 
capacity cofficient.
+            //
+            double thisBeCapacityCoefficient = 
getSmoothCofficient(usedCapacityPercent, 0.5,
+                    Config.capacity_used_percent_high_water);
+            double allBeCapacityCoefficient = 
getSmoothCofficient(maxUsedPercentDiff, 0.05,
+                    Config.used_capacity_percent_max_diff);
+
+            loadScore.capacityCoefficient = 
Math.max(thisBeCapacityCoefficient, allBeCapacityCoefficient);
         }
         loadScore.score = capacityProportion * loadScore.capacityCoefficient
-                + replicaNumProportion * loadScore.replicaNumCoefficient;
+                + replicaNumProportion * loadScore.getReplicaNumCoefficient();
 
         return loadScore;
     }
 
+    // the return cofficient:
+    // 1. percent <= percentLowWatermark: cofficient = 0.5;
+    // 2. percentLowWatermark < percent < percentHighWatermark: cofficient 
linear increase from 0.5 to 1.0;
+    // 3. percent >= percentHighWatermark: cofficient = 1.0;
+    public static double getSmoothCofficient(double percent, double 
percentLowWatermark,
+            double percentHighWatermark) {
+        final double lowCofficient = 0.5;
+        final double highCofficient = 1.0;
+
+        // low watermark and high watermark equal, then return 0.75
+        if (Math.abs(percentHighWatermark - percentLowWatermark) < 1e-6) {
+            return (lowCofficient + highCofficient) / 2;
+        }
+        if (percentLowWatermark > percentHighWatermark) {
+            double tmp = percentLowWatermark;
+            percentLowWatermark = percentHighWatermark;
+            percentHighWatermark = tmp;
+        }
+        if (percent <= percentLowWatermark) {
+            return lowCofficient;
+        }
+        if (percent >= percentHighWatermark) {
+            return highCofficient;
+        }
+
+        return lowCofficient + (highCofficient - lowCofficient)
+            * (percent - percentLowWatermark) / (percentHighWatermark - 
percentLowWatermark);
+    }
+
     public BalanceStatus isFit(long tabletSize, TStorageMedium medium,
             List<RootPathLoadStatistic> result, boolean isSupplement) {
         BalanceStatus status = new BalanceStatus(ErrCode.COMMON_ERROR);
@@ -444,7 +499,7 @@ public class BackendLoadStatistic {
         info.add(String.valueOf(totalReplicaNumMap.getOrDefault(medium, 0L)));
         LoadScore loadScore = loadScoreMap.getOrDefault(medium, new 
LoadScore());
         info.add(String.valueOf(loadScore.capacityCoefficient));
-        info.add(String.valueOf(loadScore.replicaNumCoefficient));
+        info.add(String.valueOf(loadScore.getReplicaNumCoefficient()));
         info.add(String.valueOf(loadScore.score));
         info.add(clazzMap.getOrDefault(medium, Classification.INIT).name());
         return info;
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/clone/LoadStatisticForTag.java 
b/fe/fe-core/src/main/java/org/apache/doris/clone/LoadStatisticForTag.java
index 4709575c69..224399f480 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/clone/LoadStatisticForTag.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/clone/LoadStatisticForTag.java
@@ -53,6 +53,7 @@ public class LoadStatisticForTag {
     private final Map<TStorageMedium, Long> totalCapacityMap = 
Maps.newHashMap();
     private final Map<TStorageMedium, Long> totalUsedCapacityMap = 
Maps.newHashMap();
     private final Map<TStorageMedium, Long> totalReplicaNumMap = 
Maps.newHashMap();
+    private final Map<TStorageMedium, Double> maxUsedPercentDiffMap = 
Maps.newHashMap();
     private final Map<TStorageMedium, Double> avgUsedCapacityPercentMap = 
Maps.newHashMap();
     private final Map<TStorageMedium, Double> avgReplicaNumPercentMap = 
Maps.newHashMap();
     private final Map<TStorageMedium, Double> avgLoadScoreMap = 
Maps.newHashMap();
@@ -116,10 +117,27 @@ public class LoadStatisticForTag {
                     / (double) totalCapacityMap.getOrDefault(medium, 1L));
             avgReplicaNumPercentMap.put(medium, 
totalReplicaNumMap.getOrDefault(medium, 0L)
                     / (double) backendNumMap.getOrDefault(medium, 1));
+
+            double maxUsedPercent = -1.0;
+            double minUsedPercent = -1.0;
+            for (BackendLoadStatistic beStatistic : beLoadStatistics) {
+                long beTotalCapacityB = beStatistic.getTotalCapacityB(medium);
+                long beTotalUsedCapacityB = 
beStatistic.getTotalUsedCapacityB(medium);
+                if (beTotalCapacityB > 0) {
+                    double beUsedPercent = ((double) beTotalUsedCapacityB) / 
beTotalCapacityB;
+                    if (maxUsedPercent < 0.0 || beUsedPercent > 
maxUsedPercent) {
+                        maxUsedPercent = beUsedPercent;
+                    }
+                    if (minUsedPercent < 0.0 || beUsedPercent < 
minUsedPercent) {
+                        minUsedPercent = beUsedPercent;
+                    }
+                }
+            }
+            maxUsedPercentDiffMap.put(medium, maxUsedPercent - minUsedPercent);
         }
 
         for (BackendLoadStatistic beStatistic : beLoadStatistics) {
-            beStatistic.calcScore(avgUsedCapacityPercentMap, 
avgReplicaNumPercentMap);
+            beStatistic.calcScore(avgUsedCapacityPercentMap, 
maxUsedPercentDiffMap, avgReplicaNumPercentMap);
         }
 
         // classify all backends
@@ -250,13 +268,15 @@ public class LoadStatisticForTag {
         currentSrcBeScore = srcBeStat.getLoadScore(medium);
         currentDestBeScore = destBeStat.getLoadScore(medium);
 
-        LoadScore newSrcBeScore = 
BackendLoadStatistic.calcSore(srcBeStat.getTotalUsedCapacityB(medium) - 
tabletSize,
+        LoadScore newSrcBeScore = 
BackendLoadStatistic.calcScore(srcBeStat.getTotalUsedCapacityB(medium) - 
tabletSize,
                 srcBeStat.getTotalCapacityB(medium), 
srcBeStat.getReplicaNum(medium) - 1,
-                avgUsedCapacityPercentMap.get(medium), 
avgReplicaNumPercentMap.get(medium));
+                avgUsedCapacityPercentMap.get(medium), 
maxUsedPercentDiffMap.get(medium),
+                avgReplicaNumPercentMap.get(medium));
 
-        LoadScore newDestBeScore = 
BackendLoadStatistic.calcSore(destBeStat.getTotalUsedCapacityB(medium) + 
tabletSize,
+        LoadScore newDestBeScore = 
BackendLoadStatistic.calcScore(destBeStat.getTotalUsedCapacityB(medium) + 
tabletSize,
                 destBeStat.getTotalCapacityB(medium), 
destBeStat.getReplicaNum(medium) + 1,
-                avgUsedCapacityPercentMap.get(medium), 
avgReplicaNumPercentMap.get(medium));
+                avgUsedCapacityPercentMap.get(medium), 
maxUsedPercentDiffMap.get(medium),
+                avgReplicaNumPercentMap.get(medium));
 
         double currentDiff = Math.abs(currentSrcBeScore - 
avgLoadScoreMap.get(medium))
                 + Math.abs(currentDestBeScore - avgLoadScoreMap.get(medium));
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/clone/DiskRebalanceTest.java 
b/fe/fe-core/src/test/java/org/apache/doris/clone/DiskRebalanceTest.java
index 0c596a6f53..9d5ffe1e6d 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/clone/DiskRebalanceTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/clone/DiskRebalanceTest.java
@@ -82,6 +82,7 @@ public class DiskRebalanceTest {
 
     @Before
     public void setUp() throws Exception {
+        Config.used_capacity_percent_max_diff = 1.0;
         db = new Database(1, "test db");
         db.setClusterName(SystemInfoService.DEFAULT_CLUSTER);
         new Expectations() {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to