This is an automated email from the ASF dual-hosted git repository.
morrysnow pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 8a8e6edba98 [enhancement](stats) Unify sample algorithm between olap
table and external table (#25472)
8a8e6edba98 is described below
commit 8a8e6edba98c0515b93a6050c407cc0cea94b2cd
Author: AKIRA <[email protected]>
AuthorDate: Wed Oct 18 14:18:40 2023 +0800
[enhancement](stats) Unify sample algorithm between olap table and external
table (#25472)
To reduce the error of NDV estimation for OLAP table columns
---
.../apache/doris/statistics/BaseAnalysisTask.java | 9 ++++++
.../apache/doris/statistics/HMSAnalysisTask.java | 7 +----
.../apache/doris/statistics/OlapAnalysisTask.java | 34 +++++++++++-----------
3 files changed, 27 insertions(+), 23 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
index 1377fe05ff8..73ad70eb0df 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
@@ -44,6 +44,15 @@ public abstract class BaseAnalysisTask {
public static final Logger LOG =
LogManager.getLogger(BaseAnalysisTask.class);
+ protected static final String NDV_MULTIPLY_THRESHOLD = "0.3";
+
+ protected static final String NDV_SAMPLE_TEMPLATE = "ROUND(COUNT(1) *
${scaleFactor}) AS row_count, "
+ + "case when NDV(`${colName}`)/count('${colName}') < "
+ + NDV_MULTIPLY_THRESHOLD
+ + " then NDV(`${colName}`) "
+ + "else NDV(`${colName}`) * ${scaleFactor} end AS ndv, "
+ ;
+
/**
* Stats stored in the column_statistics table basically has two types,
`part_id` is null which means it is
* aggregate from partition level stats, `part_id` is not null which means
it is partition level stats.
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
index df2396de034..217eb89383c 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
@@ -50,7 +50,6 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
// While doing sample analysis, the sampled ndv result will multiply a
factor (total size/sample size)
// if ndv(col)/count(col) is greater than this threshold.
- private static final String NDV_MULTIPLY_THRESHOLD = "0.3";
private static final String ANALYZE_TABLE_TEMPLATE = "INSERT INTO "
+ "${internalDB}.${columnStatTbl}"
@@ -62,11 +61,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
+ "${idxId} AS idx_id, "
+ "'${colId}' AS col_id, "
+ "NULL AS part_id, "
- + "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, "
- + "case when NDV(`${colName}`)/count('${colName}') < "
- + NDV_MULTIPLY_THRESHOLD
- + " then NDV(`${colName}`) "
- + "else NDV(`${colName}`) * ${scaleFactor} end AS ndv, "
+ + NDV_SAMPLE_TEMPLATE
+ "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) *
${scaleFactor}) AS null_count, "
+ "MIN(`${colName}`) AS min, "
+ "MAX(`${colName}`) AS max, "
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
index d14cbc0f0a1..7be6d1cc6ee 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
@@ -54,7 +54,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
// NDV should only be computed for the relevant partition.
private static final String ANALYZE_COLUMN_SQL_TEMPLATE =
INSERT_COL_STATISTICS
+ " (SELECT NDV(`${colName}`) AS ndv "
- + " FROM `${dbName}`.`${tblName}`) t2\n";
+ + " FROM `${dbName}`.`${tblName}`) t2";
private static final String COLLECT_PARTITION_STATS_SQL_TEMPLATE =
" SELECT "
@@ -73,22 +73,22 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
+ "${dataSizeFunction} AS data_size, "
+ "NOW() FROM `${dbName}`.`${tblName}` PARTITION
${partitionName}";
- private static final String SAMPLE_COLUMN_SQL_TEMPLATE = "SELECT \n"
- + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, \n"
- + "${catalogId} AS catalog_id, \n"
- + "${dbId} AS db_id, \n"
- + "${tblId} AS tbl_id, \n"
- + "${idxId} AS idx_id, \n"
- + "'${colId}' AS col_id, \n"
- + "NULL AS part_id, \n"
- + "COUNT(1) * ${scaleFactor} AS row_count, \n"
- + "NDV(`${colName}`) * ${scaleFactor} AS ndv, \n"
- + "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) *
${scaleFactor} AS null_count, \n"
- + "MIN(`${colName}`) AS min, \n"
- + "MAX(`${colName}`) AS max, \n"
- + "${dataSizeFunction} * ${scaleFactor} AS data_size, \n"
- + "NOW()\n"
- + "FROM `${dbName}`.`${tblName}`\n"
+ private static final String SAMPLE_COLUMN_SQL_TEMPLATE = "SELECT "
+ + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, "
+ + "${catalogId} AS catalog_id, "
+ + "${dbId} AS db_id, "
+ + "${tblId} AS tbl_id, "
+ + "${idxId} AS idx_id, "
+ + "'${colId}' AS col_id, "
+ + "NULL AS part_id, "
+ + "COUNT(1) * ${scaleFactor} AS row_count, "
+ + NDV_SAMPLE_TEMPLATE
+ + "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) *
${scaleFactor} AS null_count, "
+ + "MIN(`${colName}`) AS min, "
+ + "MAX(`${colName}`) AS max, "
+ + "${dataSizeFunction} * ${scaleFactor} AS data_size, "
+ + "NOW() "
+ + "FROM `${dbName}`.`${tblName}`"
+ "${tablets}";
// cache stats for each partition, it would be inserted into
column_statistics in a batch.
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]