This is an automated email from the ASF dual-hosted git repository.
lijibing pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new 259d28407e4 [improvement](statistics)Enable estimate hive table row
count using file size. (#37218) (#37694)
259d28407e4 is described below
commit 259d28407e464acf7bfc868842396857d5ed88fd
Author: Jibing-Li <[email protected]>
AuthorDate: Fri Jul 12 13:47:27 2024 +0800
[improvement](statistics)Enable estimate hive table row count using file
size. (#37218) (#37694)
backport: https://github.com/apache/doris/pull/37218
---
.../main/java/org/apache/doris/common/Config.java | 2 +-
.../doris/datasource/hive/HMSExternalTable.java | 2 +-
.../java/org/apache/doris/qe/GlobalVariable.java | 2 +-
.../hive/test_hive_partition_column_analyze.groovy | 325 +++++++++++----------
4 files changed, 172 insertions(+), 159 deletions(-)
diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
index 1be7b871d68..995813e06e6 100644
--- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
@@ -2282,7 +2282,7 @@ public class Config extends ConfigBase {
@ConfField(mutable = true, masterOnly = false, description = {
"Hive行数估算分区采样数",
"Sample size for hive row count estimation."})
- public static int hive_stats_partition_sample_size = 3000;
+ public static int hive_stats_partition_sample_size = 30;
@ConfField(mutable = true, masterOnly = true, description = {
"启用Hive分桶表",
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java
index 03eac33ab53..76f2f6c4b39 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java
@@ -875,7 +875,7 @@ public class HMSExternalTable extends ExternalTable
implements MTMVRelatedTableI
}
int totalPartitionSize = partitionValues == null ? 1 :
partitionValues.getIdToPartitionItem().size();
- if (samplePartitionSize < totalPartitionSize) {
+ if (samplePartitionSize != 0 && samplePartitionSize <
totalPartitionSize) {
totalSize = totalSize * totalPartitionSize / samplePartitionSize;
}
return totalSize / estimatedRowSize;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/GlobalVariable.java
b/fe/fe-core/src/main/java/org/apache/doris/qe/GlobalVariable.java
index 6eac0c2b815..abe5d452a8f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/GlobalVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/GlobalVariable.java
@@ -146,7 +146,7 @@ public final class GlobalVariable {
+ "Getting file list may be a time-consuming
operation. "
+ "If you don't need to estimate the number of
rows in the table "
+ "or it affects performance, you can disable this
feature."})
- public static boolean enable_get_row_count_from_file_list = false;
+ public static boolean enable_get_row_count_from_file_list = true;
@VariableMgr.VarAttr(name = READ_ONLY, flag = VariableMgr.GLOBAL,
description = {"仅用于兼容MySQL生态,暂无实际意义",
diff --git
a/regression-test/suites/external_table_p0/hive/test_hive_partition_column_analyze.groovy
b/regression-test/suites/external_table_p0/hive/test_hive_partition_column_analyze.groovy
index 7704b68e399..9accfcaa445 100644
---
a/regression-test/suites/external_table_p0/hive/test_hive_partition_column_analyze.groovy
+++
b/regression-test/suites/external_table_p0/hive/test_hive_partition_column_analyze.groovy
@@ -18,7 +18,13 @@
suite("test_hive_partition_column_analyze",
"p0,external,hive,external_docker,external_docker_hive") {
String enabled = context.config.otherConfigs.get("enableHiveTest")
if (enabled == null || !enabled.equalsIgnoreCase("true")) {
- logger.info("diable Hive test.")
+ logger.info("disable Hive test.")
+ return;
+ }
+
+ def enable = sql """show variables like "enable_partition_analyze" """
+ if (enable.size() != 1 || enable[0][1].equalsIgnoreCase("false")) {
+ logger.info("partition analyze disabled. " + enable)
return;
}
@@ -36,161 +42,168 @@ suite("test_hive_partition_column_analyze",
"p0,external,hive,external_docker,ex
"""
logger.info("catalog " + catalog_name + " created")
- try {
- sql """set global enable_get_row_count_from_file_list=true"""
-
- sql """switch ${catalog_name};"""
- logger.info("switched to catalog " + catalog_name)
-
- sql """analyze table
${catalog_name}.partition_type.tinyint_partition (tinyint_part) with sync"""
- sql """analyze table
${catalog_name}.partition_type.smallint_partition (smallint_part) with sync"""
- sql """analyze table ${catalog_name}.partition_type.int_partition
(int_part) with sync"""
- sql """analyze table
${catalog_name}.partition_type.bigint_partition (bigint_part) with sync"""
- sql """analyze table ${catalog_name}.partition_type.char_partition
(char_part) with sync"""
- sql """analyze table
${catalog_name}.partition_type.varchar_partition (varchar_part) with sync"""
- sql """analyze table
${catalog_name}.partition_type.string_partition (string_part) with sync"""
- sql """analyze table ${catalog_name}.partition_type.date_partition
(date_part) with sync"""
- sql """analyze table
${catalog_name}.partition_type.float_partition (float_part) with sync"""
- sql """analyze table
${catalog_name}.partition_type.double_partition (double_part) with sync"""
- sql """analyze table
${catalog_name}.partition_type.decimal_partition (decimal_part) with sync"""
-
- sql """use partition_type;"""
-
- result = sql """show column stats tinyint_partition
(tinyint_part)"""
- assertEquals(result.size(), 1)
- assertEquals(result[0][0], "tinyint_part")
- assertEquals(result[0][2], "141474.0")
- assertEquals(result[0][3], "100.0")
- assertEquals(result[0][4], "0.0")
- assertEquals(result[0][5], "141474.0")
- assertEquals(result[0][6], "1.0")
- assertEquals(result[0][7], "1")
- assertEquals(result[0][8], "100")
-
- result = sql """show column stats smallint_partition
(smallint_part)"""
- assertEquals(result.size(), 1)
- assertEquals(result[0][0], "smallint_part")
- assertEquals(result[0][2], "141474.0")
- assertEquals(result[0][3], "100.0")
- assertEquals(result[0][4], "0.0")
- assertEquals(result[0][5], "282948.0")
- assertEquals(result[0][6], "2.0")
- assertEquals(result[0][7], "1")
- assertEquals(result[0][8], "100")
-
- result = sql """show column stats int_partition (int_part)"""
- assertEquals(result.size(), 1)
- assertEquals(result[0][0], "int_part")
- assertEquals(result[0][2], "141474.0")
- assertEquals(result[0][3], "100.0")
- assertEquals(result[0][4], "0.0")
- assertEquals(result[0][5], "565896.0")
- assertEquals(result[0][6], "4.0")
- assertEquals(result[0][7], "1")
- assertEquals(result[0][8], "100")
-
- result = sql """show column stats bigint_partition (bigint_part)"""
- assertEquals(result.size(), 1)
- assertEquals(result[0][0], "bigint_part")
- assertEquals(result[0][2], "141474.0")
- assertEquals(result[0][3], "100.0")
- assertEquals(result[0][4], "0.0")
- assertEquals(result[0][5], "1131792.0")
- assertEquals(result[0][6], "8.0")
- assertEquals(result[0][7], "1")
- assertEquals(result[0][8], "100")
-
- result = sql """show column stats char_partition (char_part)"""
- assertEquals(result.size(), 1)
- assertEquals(result[0][0], "char_part")
- assertEquals(result[0][2], "141474.0")
- assertEquals(result[0][3], "101.0")
- assertEquals(result[0][4], "0.0")
- assertEquals(result[0][5], "2829480.0")
- assertEquals(result[0][6], "20.0")
- assertEquals(result[0][7], "\'1 \'")
- assertEquals(result[0][8], "\'a \'")
-
- result = sql """show column stats varchar_partition
(varchar_part)"""
- assertEquals(result.size(), 1)
- assertEquals(result[0][0], "varchar_part")
- assertEquals(result[0][2], "141474.0")
- assertEquals(result[0][3], "100.0")
- assertEquals(result[0][4], "0.0")
- assertEquals(result[0][5], "271630.0")
- assertEquals(result[0][6], "1.9199994345250717")
- assertEquals(result[0][7], "\'1\'")
- assertEquals(result[0][8], "\'99\'")
-
- result = sql """show column stats string_partition (string_part)"""
- assertEquals(result.size(), 1)
- assertEquals(result[0][0], "string_part")
- assertEquals(result[0][2], "141474.0")
- assertEquals(result[0][3], "100.0")
- assertEquals(result[0][4], "0.0")
- assertEquals(result[0][5], "271630.0")
- assertEquals(result[0][6], "1.9199994345250717")
- assertEquals(result[0][7], "\'1\'")
- assertEquals(result[0][8], "\'99\'")
-
- result = sql """show column stats date_partition (date_part)"""
- assertEquals(result.size(), 1)
- assertEquals(result[0][0], "date_part")
- assertEquals(result[0][2], "141474.0")
- assertEquals(result[0][3], "100.0")
- assertEquals(result[0][4], "0.0")
- assertEquals(result[0][5], "565896.0")
- assertEquals(result[0][6], "4.0")
- assertEquals(result[0][7], "\'2001-10-12\'")
- assertEquals(result[0][8], "\'2100-10-12\'")
-
- result = sql """show column stats float_partition (float_part)"""
- assertEquals(result.size(), 1)
- assertEquals(result[0][0], "float_part")
- assertEquals(result[0][2], "141474.0")
- assertEquals(result[0][3], "100.0")
- assertEquals(result[0][4], "0.0")
- assertEquals(result[0][5], "565896.0")
- assertEquals(result[0][6], "4.0")
- assertEquals(result[0][7], "296.3103")
- assertEquals(result[0][8], "32585.627")
-
- result = sql """show column stats double_partition (double_part)"""
- assertEquals(result.size(), 1)
- assertEquals(result[0][0], "double_part")
- assertEquals(result[0][2], "141474.0")
- assertEquals(result[0][3], "100.0")
- assertEquals(result[0][4], "0.0")
- assertEquals(result[0][5], "1131792.0")
- assertEquals(result[0][6], "8.0")
- assertEquals(result[0][7], "115.14474")
- assertEquals(result[0][8], "32761.14458")
-
- result = sql """show column stats decimal_partition
(decimal_part)"""
- assertEquals(result.size(), 1)
- assertEquals(result[0][0], "decimal_part")
- assertEquals(result[0][2], "141474.0")
- assertEquals(result[0][3], "100.0")
- assertEquals(result[0][4], "0.0")
- assertEquals(result[0][5], "1131792.0")
- assertEquals(result[0][6], "8.0")
- assertEquals(result[0][7], "243.2868")
- assertEquals(result[0][8], "32527.1543")
-
- sql """analyze table
${catalog_name}.partition_type.decimal_partition (decimal_part) with sync with
sql"""
- result = sql """show column stats decimal_partition
(decimal_part)"""
- assertEquals(result.size(), 1)
- assertEquals(result[0][0], "decimal_part")
- assertEquals(result[0][2], "100000.0")
- assertEquals(result[0][3], "100.0")
- assertEquals(result[0][4], "0.0")
- assertEquals(result[0][5], "800000.0")
- assertEquals(result[0][6], "8.0")
- assertEquals(result[0][7], "243.2868")
- assertEquals(result[0][8], "32527.1543")
- } finally {
- sql """set global enable_get_row_count_from_file_list=false"""
- }
+ sql """set global enable_get_row_count_from_file_list=true"""
+ sql """switch ${catalog_name};"""
+ logger.info("switched to catalog " + catalog_name)
+
+ sql """analyze table ${catalog_name}.partition_type.tinyint_partition
(tinyint_part) with sync"""
+ sql """analyze table ${catalog_name}.partition_type.smallint_partition
(smallint_part) with sync"""
+ sql """analyze table ${catalog_name}.partition_type.int_partition
(int_part) with sync"""
+ sql """analyze table ${catalog_name}.partition_type.bigint_partition
(bigint_part) with sync"""
+ sql """analyze table ${catalog_name}.partition_type.char_partition
(char_part) with sync"""
+ sql """analyze table ${catalog_name}.partition_type.varchar_partition
(varchar_part) with sync"""
+ sql """analyze table ${catalog_name}.partition_type.string_partition
(string_part) with sync"""
+ sql """analyze table ${catalog_name}.partition_type.date_partition
(date_part) with sync"""
+ sql """analyze table ${catalog_name}.partition_type.float_partition
(float_part) with sync"""
+ sql """analyze table ${catalog_name}.partition_type.double_partition
(double_part) with sync"""
+ sql """analyze table ${catalog_name}.partition_type.decimal_partition
(decimal_part) with sync"""
+
+ sql """use partition_type;"""
+
+ result = sql """show column stats tinyint_partition (tinyint_part)"""
+ assertEquals(result.size(), 1)
+ assertEquals(result[0][0], "tinyint_part")
+ assertEquals(result[0][1], "tinyint_partition")
+ assertEquals(result[0][2], "100000.0")
+ assertEquals(result[0][3], "100.0")
+ assertEquals(result[0][4], "0.0")
+ assertEquals(result[0][5], "100000.0")
+ assertEquals(result[0][6], "1.0")
+ assertEquals(result[0][7], "1")
+ assertEquals(result[0][8], "100")
+
+ result = sql """show column stats smallint_partition (smallint_part)"""
+ assertEquals(result.size(), 1)
+ assertEquals(result[0][0], "smallint_part")
+ assertEquals(result[0][1], "smallint_partition")
+ assertEquals(result[0][2], "100000.0")
+ assertEquals(result[0][3], "100.0")
+ assertEquals(result[0][4], "0.0")
+ assertEquals(result[0][5], "200000.0")
+ assertEquals(result[0][6], "2.0")
+ assertEquals(result[0][7], "1")
+ assertEquals(result[0][8], "100")
+
+ result = sql """show column stats int_partition (int_part)"""
+ assertEquals(result.size(), 1)
+ assertEquals(result[0][0], "int_part")
+ assertEquals(result[0][1], "int_partition")
+ assertEquals(result[0][2], "100000.0")
+ assertEquals(result[0][3], "100.0")
+ assertEquals(result[0][4], "0.0")
+ assertEquals(result[0][5], "400000.0")
+ assertEquals(result[0][6], "4.0")
+ assertEquals(result[0][7], "1")
+ assertEquals(result[0][8], "100")
+
+ result = sql """show column stats bigint_partition (bigint_part)"""
+ assertEquals(result.size(), 1)
+ assertEquals(result[0][0], "bigint_part")
+ assertEquals(result[0][1], "bigint_partition")
+ assertEquals(result[0][2], "100000.0")
+ assertEquals(result[0][3], "100.0")
+ assertEquals(result[0][4], "0.0")
+ assertEquals(result[0][5], "800000.0")
+ assertEquals(result[0][6], "8.0")
+ assertEquals(result[0][7], "1")
+ assertEquals(result[0][8], "100")
+
+ result = sql """show column stats char_partition (char_part)"""
+ assertEquals(result.size(), 1)
+ assertEquals(result[0][0], "char_part")
+ assertEquals(result[0][1], "char_partition")
+ assertEquals(result[0][2], "100000.0")
+ assertEquals(result[0][3], "100.0")
+ assertEquals(result[0][4], "0.0")
+ assertEquals(result[0][5], "2000000.0")
+ assertEquals(result[0][6], "20.0")
+ assertEquals(result[0][7], "\'1 \'")
+ assertEquals(result[0][8], "\'99 \'")
+
+ result = sql """show column stats varchar_partition (varchar_part)"""
+ assertEquals(result.size(), 1)
+ assertEquals(result[0][0], "varchar_part")
+ assertEquals(result[0][1], "varchar_partition")
+ assertEquals(result[0][2], "100000.0")
+ assertEquals(result[0][3], "100.0")
+ assertEquals(result[0][4], "0.0")
+ assertEquals(result[0][5], "192000.0")
+ assertEquals(result[0][6], "1.92")
+ assertEquals(result[0][7], "\'1\'")
+ assertEquals(result[0][8], "\'99\'")
+
+ result = sql """show column stats string_partition (string_part)"""
+ assertEquals(result.size(), 1)
+ assertEquals(result[0][0], "string_part")
+ assertEquals(result[0][1], "string_partition")
+ assertEquals(result[0][2], "100000.0")
+ assertEquals(result[0][3], "100.0")
+ assertEquals(result[0][4], "0.0")
+ assertEquals(result[0][5], "192000.0")
+ assertEquals(result[0][6], "1.92")
+ assertEquals(result[0][7], "\'1\'")
+ assertEquals(result[0][8], "\'99\'")
+
+ result = sql """show column stats date_partition (date_part)"""
+ assertEquals(result.size(), 1)
+ assertEquals(result[0][0], "date_part")
+ assertEquals(result[0][1], "date_partition")
+ assertEquals(result[0][2], "100000.0")
+ assertEquals(result[0][3], "100.0")
+ assertEquals(result[0][4], "0.0")
+ assertEquals(result[0][5], "400000.0")
+ assertEquals(result[0][6], "4.0")
+ assertEquals(result[0][7], "\'2001-10-12\'")
+ assertEquals(result[0][8], "\'2100-10-12\'")
+
+ result = sql """show column stats float_partition (float_part)"""
+ assertEquals(result.size(), 1)
+ assertEquals(result[0][0], "float_part")
+ assertEquals(result[0][1], "float_partition")
+ assertEquals(result[0][2], "100000.0")
+ assertEquals(result[0][3], "100.0")
+ assertEquals(result[0][4], "0.0")
+ assertEquals(result[0][5], "400000.0")
+ assertEquals(result[0][6], "4.0")
+ assertEquals(result[0][7], "296.3103")
+ assertEquals(result[0][8], "32585.627")
+
+ result = sql """show column stats double_partition (double_part)"""
+ assertEquals(result.size(), 1)
+ assertEquals(result[0][0], "double_part")
+ assertEquals(result[0][1], "double_partition")
+ assertEquals(result[0][2], "100000.0")
+ assertEquals(result[0][3], "100.0")
+ assertEquals(result[0][4], "0.0")
+ assertEquals(result[0][5], "800000.0")
+ assertEquals(result[0][6], "8.0")
+ assertEquals(result[0][7], "115.14474")
+ assertEquals(result[0][8], "32761.14458")
+
+ result = sql """show column stats decimal_partition (decimal_part)"""
+ assertEquals(result.size(), 1)
+ assertEquals(result[0][0], "decimal_part")
+ assertEquals(result[0][1], "decimal_partition")
+ assertEquals(result[0][2], "100000.0")
+ assertEquals(result[0][3], "100.0")
+ assertEquals(result[0][4], "0.0")
+ assertEquals(result[0][5], "800000.0")
+ assertEquals(result[0][6], "8.0")
+ assertEquals(result[0][7], "243.2868")
+ assertEquals(result[0][8], "32527.1543")
+
+ sql """analyze table ${catalog_name}.partition_type.decimal_partition
(decimal_part) with sync with sql"""
+ result = sql """show column stats decimal_partition (decimal_part)"""
+ assertEquals(result.size(), 1)
+ assertEquals(result[0][0], "decimal_part")
+ assertEquals(result[0][1], "decimal_partition")
+ assertEquals(result[0][2], "100000.0")
+ assertEquals(result[0][3], "100.0")
+ assertEquals(result[0][4], "0.0")
+ assertEquals(result[0][5], "800000.0")
+ assertEquals(result[0][6], "8.0")
+ assertEquals(result[0][7], "243.2868")
+ assertEquals(result[0][8], "32527.1543")
sql """drop catalog ${catalog_name}"""
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]