This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
commit 78ce6b67220eda955a8e9206b5365c43e75e0806 Author: Jack Drogon <[email protected]> AuthorDate: Wed Sep 27 08:13:53 2023 +0800 [feature](autobucekt) Add support autobucket min buckets config Add support autobucket (#24920) Signed-off-by: Jack Drogon <[email protected]> --- docs/en/docs/advanced/autobucket.md | 1 + docs/zh-CN/docs/advanced/autobucket.md | 1 + .../main/java/org/apache/doris/common/Config.java | 34 ++++++++++++++++++++++ .../org/apache/doris/analysis/CreateTableStmt.java | 2 +- .../doris/clone/DynamicPartitionScheduler.java | 2 +- .../apache/doris/common/util/AutoBucketUtils.java | 5 ++++ .../suites/autobucket/test_autobucket.groovy | 23 +++++++++++++++ 7 files changed, 66 insertions(+), 2 deletions(-) diff --git a/docs/en/docs/advanced/autobucket.md b/docs/en/docs/advanced/autobucket.md index bc05e3e72f6..e9fdfcdf63b 100644 --- a/docs/en/docs/advanced/autobucket.md +++ b/docs/en/docs/advanced/autobucket.md @@ -72,6 +72,7 @@ First, use the value of estimate_partition_size divided by 5 (calculated as a 5- 3. Calculation logic to get the final number of buckets. First calculate an intermediate value x = min(M, N, 128). If x < N and x < the number of BE nodes, the final bucket is y, the number of BE nodes; otherwise, the final bucket is x. +4. x = max(x, autobucket_min_buckets), where autobucket_min_buckets is set in the FE Config and defaults to 1. The pseudo-code representation of the above process is as follows diff --git a/docs/zh-CN/docs/advanced/autobucket.md b/docs/zh-CN/docs/advanced/autobucket.md index 43a1124db77..ccbceaf633d 100644 --- a/docs/zh-CN/docs/advanced/autobucket.md +++ b/docs/zh-CN/docs/advanced/autobucket.md @@ -73,6 +73,7 @@ properties("estimate_partition_size" = "100G") 3. 得到最终的分桶个数计算逻辑: 先计算一个中间值 x = min(M, N, 128), 如果 x < N并且x < BE节点个数,则最终分桶为 y 即 BE 节点个数;否则最终分桶数为 x +4. 
x = max(x, autobucket_min_buckets), 这里autobucket_min_buckets是在Config中配置的,默认是1 上述过程伪代码表现形式为: diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index 5f2173af7ca..386ed7cde47 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -2125,4 +2125,38 @@ public class Config extends ConfigBase { + "The larger the value, the more uniform the distribution of the hash algorithm, " + "but it will increase the memory overhead."}) public static int virtual_node_number = 2048; + + @ConfField(description = {"控制对大表的自动ANALYZE的最小时间间隔," + + "在该时间间隔内大小超过huge_table_lower_bound_size_in_bytes的表仅ANALYZE一次", + "This controls the minimum time interval for automatic ANALYZE on large tables. Within this interval," + + "tables larger than huge_table_lower_bound_size_in_bytes are analyzed only once."}) + public static long huge_table_auto_analyze_interval_in_millis = TimeUnit.HOURS.toMillis(12); + + @ConfField(description = {"定义大表的大小下界,在开启enable_auto_sample的情况下," + + "大小超过该值的表将会自动通过采样收集统计信息", "This defines the lower size bound for large tables. 
" + + "When enable_auto_sample is enabled, tables larger than this value will automatically collect " + + "statistics through sampling"}) + public static long huge_table_lower_bound_size_in_bytes = 5L * 1024 * 1024 * 1024; + + @ConfField(description = {"定义开启开启大表自动sample后,对大表的采样比例", + "This defines the number of sample percent for large tables when automatic sampling for" + + "large tables is enabled"}) + public static int huge_table_default_sample_rows = 4194304; + + @ConfField(description = {"是否开启大表自动sample,开启后对于大小超过huge_table_lower_bound_size_in_bytes会自动通过采样收集" + + "统计信息", "Whether to enable automatic sampling for large tables, which, when enabled, automatically" + + "collects statistics through sampling for tables larger than 'huge_table_lower_bound_size_in_bytes'"}) + public static boolean enable_auto_sample = false; + + @ConfField(description = { + "控制统计信息的自动触发作业执行记录的持久化行数", + "Determine the persist number of automatic triggered analyze job execution status" + }) + public static long auto_analyze_job_record_count = 20000; + + @ConfField(description = { + "Auto Buckets中最小的buckets数目", + "min buckets of auto bucket" + }) + public static int autobucket_min_buckets = 1; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateTableStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateTableStmt.java index b1df8498b8b..fa305793f56 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateTableStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateTableStmt.java @@ -122,7 +122,7 @@ public class CreateTableStmt extends DdlStmt { } else { long partitionSize = ParseUtil .analyzeDataVolumn(newProperties.get(PropertyAnalyzer.PROPERTIES_ESTIMATE_PARTITION_SIZE)); - distributionDesc.setBuckets(AutoBucketUtils.getBucketsNum(partitionSize)); + distributionDesc.setBuckets(AutoBucketUtils.getBucketsNum(partitionSize, Config.autobucket_min_buckets)); } return newProperties; diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/clone/DynamicPartitionScheduler.java b/fe/fe-core/src/main/java/org/apache/doris/clone/DynamicPartitionScheduler.java index 53f44070d93..dc03ecf8233 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/clone/DynamicPartitionScheduler.java +++ b/fe/fe-core/src/main/java/org/apache/doris/clone/DynamicPartitionScheduler.java @@ -221,7 +221,7 @@ public class DynamicPartitionScheduler extends MasterDaemon { // plus 5 for uncompressed data long uncompressedPartitionSize = getNextPartitionSize(partitionSizeArray) * 5; - return AutoBucketUtils.getBucketsNum(uncompressedPartitionSize); + return AutoBucketUtils.getBucketsNum(uncompressedPartitionSize, Config.autobucket_min_buckets); } private ArrayList<AddPartitionClause> getAddPartitionClause(Database db, OlapTable olapTable, diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/AutoBucketUtils.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/AutoBucketUtils.java index ca935ab20e7..294250fd213 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/util/AutoBucketUtils.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/AutoBucketUtils.java @@ -95,4 +95,9 @@ public class AutoBucketUtils { logger.debug("AutoBucketsUtil: final bucketsNum {}", bucketsNum); return bucketsNum; } + + public static int getBucketsNum(long partitionSize, int minBuckets) { + int bucketsNum = getBucketsNum(partitionSize); + return Math.max(minBuckets, bucketsNum); + } } diff --git a/regression-test/suites/autobucket/test_autobucket.groovy b/regression-test/suites/autobucket/test_autobucket.groovy index ab0ae99658b..d3ba70d0df3 100644 --- a/regression-test/suites/autobucket/test_autobucket.groovy +++ b/regression-test/suites/autobucket/test_autobucket.groovy @@ -39,4 +39,27 @@ suite("test_autobucket") { assertEquals(Integer.valueOf(result.get(0).get(8)), 10) sql "drop table if exists autobucket_test" + + + sql "drop table if exists autobucket_test_min_buckets" + 
result = sql """ + CREATE TABLE `autobucket_test_min_buckets` ( + `user_id` largeint(40) NOT NULL + ) ENGINE=OLAP + DUPLICATE KEY(`user_id`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`user_id`) BUCKETS AUTO + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "estimate_partition_size" = "1M" + ) + """ + + default_min_buckets = 1 // in Config.java + result = sql "show partitions from autobucket_test_min_buckets" + logger.info("${result}") + // XXX: buckets at pos(8), next maybe impl by sql meta + assertEquals(Integer.valueOf(result.get(0).get(8)), default_min_buckets) + + sql "drop table if exists autobucket_test_min_buckets" } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
