[doris] 02/06: [feature](autobucekt) Add support autobucket min buckets config Add support autobucket (#24920)

kxiao Wed, 27 Sep 2023 05:27:39 -0700

This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git


commit 78ce6b67220eda955a8e9206b5365c43e75e0806
Author: Jack Drogon <[email protected]>
AuthorDate: Wed Sep 27 08:13:53 2023 +0800

    [feature](autobucekt) Add support autobucket min buckets config Add support 
autobucket (#24920)
    
    Signed-off-by: Jack Drogon <[email protected]>
---
 docs/en/docs/advanced/autobucket.md                |  1 +
 docs/zh-CN/docs/advanced/autobucket.md             |  1 +
 .../main/java/org/apache/doris/common/Config.java  | 34 ++++++++++++++++++++++
 .../org/apache/doris/analysis/CreateTableStmt.java |  2 +-
 .../doris/clone/DynamicPartitionScheduler.java     |  2 +-
 .../apache/doris/common/util/AutoBucketUtils.java  |  5 ++++
 .../suites/autobucket/test_autobucket.groovy       | 23 +++++++++++++++
 7 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/docs/en/docs/advanced/autobucket.md 
b/docs/en/docs/advanced/autobucket.md
index bc05e3e72f6..e9fdfcdf63b 100644
--- a/docs/en/docs/advanced/autobucket.md
+++ b/docs/en/docs/advanced/autobucket.md
@@ -72,6 +72,7 @@ First, use the value of estimate_partition_size divided by 5 
(calculated as a 5-
 3. Calculation logic to get the final number of buckets.
 First calculate an intermediate value x = min(M, N, 128).
 If x < N and x < the number of BE nodes, the final bucket is y, the number of 
BE nodes; otherwise, the final bucket is x.
+4. x = max(x, autobucket_min_buckets), 
where autobucket_min_buckets is configured in Config and defaults to 1
 
 The pseudo-code representation of the above process is as follows
 
diff --git a/docs/zh-CN/docs/advanced/autobucket.md 
b/docs/zh-CN/docs/advanced/autobucket.md
index 43a1124db77..ccbceaf633d 100644
--- a/docs/zh-CN/docs/advanced/autobucket.md
+++ b/docs/zh-CN/docs/advanced/autobucket.md
@@ -73,6 +73,7 @@ properties("estimate_partition_size" = "100G")
 3. 得到最终的分桶个数计算逻辑：
 先计算一个中间值 x = min(M, N, 128)，
 如果 x < N并且x < BE节点个数，则最终分桶为 y 即 BE 节点个数；否则最终分桶数为 x
+4. x = max(x, autobucket_min_buckets), 
这里autobucket_min_buckets是在Config中配置的，默认是1
 
 上述过程伪代码表现形式为：
 
diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java 
b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
index 5f2173af7ca..386ed7cde47 100644
--- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
@@ -2125,4 +2125,38 @@ public class Config extends ConfigBase {
                     + "The larger the value, the more uniform the distribution 
of the hash algorithm, "
                     + "but it will increase the memory overhead."})
     public static int virtual_node_number = 2048;
+
+    @ConfField(description = {"控制对大表的自动ANALYZE的最小时间间隔，"
+            + "在该时间间隔内大小超过huge_table_lower_bound_size_in_bytes的表仅ANALYZE一次",
+            "This controls the minimum time interval for automatic ANALYZE on 
large tables. Within this interval,"
+                    + "tables larger than huge_table_lower_bound_size_in_bytes 
are analyzed only once."})
+    public static long huge_table_auto_analyze_interval_in_millis = 
TimeUnit.HOURS.toMillis(12);
+
+    @ConfField(description = {"定义大表的大小下界，在开启enable_auto_sample的情况下，"
+            + "大小超过该值的表将会自动通过采样收集统计信息", "This defines the lower size bound for 
large tables. "
+            + "When enable_auto_sample is enabled, tables larger than this 
value will automatically collect "
+            + "statistics through sampling"})
+    public static long huge_table_lower_bound_size_in_bytes = 5L * 1024 * 1024 
* 1024;
+
+    @ConfField(description = {"定义开启开启大表自动sample后，对大表的采样比例",
+            "This defines the number of sample percent for large tables when 
automatic sampling for"
+                    + "large tables is enabled"})
+    public static int huge_table_default_sample_rows = 4194304;
+
+    @ConfField(description = 
{"是否开启大表自动sample，开启后对于大小超过huge_table_lower_bound_size_in_bytes会自动通过采样收集"
+            + "统计信息", "Whether to enable automatic sampling for large tables, 
which, when enabled, automatically"
+            + "collects statistics through sampling for tables larger than 
'huge_table_lower_bound_size_in_bytes'"})
+    public static boolean enable_auto_sample = false;
+
+    @ConfField(description = {
+            "控制统计信息的自动触发作业执行记录的持久化行数",
+            "Determine the persist number of automatic triggered analyze job 
execution status"
+    })
+    public static long auto_analyze_job_record_count = 20000;
+
+    @ConfField(description = {
+            "Auto Buckets中最小的buckets数目",
+            "min buckets of auto bucket"
+    })
+    public static int autobucket_min_buckets = 1;
 }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateTableStmt.java 
b/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateTableStmt.java
index b1df8498b8b..fa305793f56 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateTableStmt.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateTableStmt.java
@@ -122,7 +122,7 @@ public class CreateTableStmt extends DdlStmt {
         } else {
             long partitionSize = ParseUtil
                     
.analyzeDataVolumn(newProperties.get(PropertyAnalyzer.PROPERTIES_ESTIMATE_PARTITION_SIZE));
-            
distributionDesc.setBuckets(AutoBucketUtils.getBucketsNum(partitionSize));
+            
distributionDesc.setBuckets(AutoBucketUtils.getBucketsNum(partitionSize, 
Config.autobucket_min_buckets));
         }
 
         return newProperties;
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/clone/DynamicPartitionScheduler.java
 
b/fe/fe-core/src/main/java/org/apache/doris/clone/DynamicPartitionScheduler.java
index 53f44070d93..dc03ecf8233 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/clone/DynamicPartitionScheduler.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/clone/DynamicPartitionScheduler.java
@@ -221,7 +221,7 @@ public class DynamicPartitionScheduler extends MasterDaemon 
{
 
         // plus 5 for uncompressed data
         long uncompressedPartitionSize = 
getNextPartitionSize(partitionSizeArray) * 5;
-        return AutoBucketUtils.getBucketsNum(uncompressedPartitionSize);
+        return AutoBucketUtils.getBucketsNum(uncompressedPartitionSize, 
Config.autobucket_min_buckets);
     }
 
     private ArrayList<AddPartitionClause> getAddPartitionClause(Database db, 
OlapTable olapTable,
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/common/util/AutoBucketUtils.java 
b/fe/fe-core/src/main/java/org/apache/doris/common/util/AutoBucketUtils.java
index ca935ab20e7..294250fd213 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/common/util/AutoBucketUtils.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/AutoBucketUtils.java
@@ -95,4 +95,9 @@ public class AutoBucketUtils {
         logger.debug("AutoBucketsUtil: final bucketsNum {}", bucketsNum);
         return bucketsNum;
     }
+
+    public static int getBucketsNum(long partitionSize, int minBuckets) {
+        int bucketsNum = getBucketsNum(partitionSize);
+        return Math.max(minBuckets, bucketsNum);
+    }
 }
diff --git a/regression-test/suites/autobucket/test_autobucket.groovy 
b/regression-test/suites/autobucket/test_autobucket.groovy
index ab0ae99658b..d3ba70d0df3 100644
--- a/regression-test/suites/autobucket/test_autobucket.groovy
+++ b/regression-test/suites/autobucket/test_autobucket.groovy
@@ -39,4 +39,27 @@ suite("test_autobucket") {
     assertEquals(Integer.valueOf(result.get(0).get(8)), 10)
 
     sql "drop table if exists autobucket_test"
+
+
+    sql "drop table if exists autobucket_test_min_buckets"
+    result = sql """
+        CREATE TABLE `autobucket_test_min_buckets` (
+          `user_id` largeint(40) NOT NULL
+        ) ENGINE=OLAP
+        DUPLICATE KEY(`user_id`)
+        COMMENT 'OLAP'
+        DISTRIBUTED BY HASH(`user_id`) BUCKETS AUTO
+        PROPERTIES (
+          "replication_allocation" = "tag.location.default: 1",
+          "estimate_partition_size" = "1M"
+        )
+        """
+
+    default_min_buckets = 1 // in Config.java
+    result = sql "show partitions from autobucket_test_min_buckets"
+    logger.info("${result}")
+    // XXX: buckets at pos(8), next maybe impl by sql meta
+    assertEquals(Integer.valueOf(result.get(0).get(8)), default_min_buckets)
+
+    sql "drop table if exists autobucket_test_min_buckets"
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[doris] 02/06: [feature](autobucekt) Add support autobucket min buckets config Add support autobucket (#24920)

Reply via email to