This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new d939903753a [improvement](statistics)Use count as ndv for unique/agg olap table single key column (#27186) d939903753a is described below commit d939903753a3e39f0723f32152c06003a4bf27c4 Author: Jibing-Li <64681310+jibing...@users.noreply.github.com> AuthorDate: Mon Nov 20 15:49:08 2023 +0800 [improvement](statistics)Use count as ndv for unique/agg olap table single key column (#27186) Single key column of unique/agg olap table has the same value of count and ndv, for this kind of column, don't need to calculate ndv, simply use count as ndv. --- .../apache/doris/statistics/BaseAnalysisTask.java | 4 +-- .../apache/doris/statistics/HMSAnalysisTask.java | 1 + .../apache/doris/statistics/OlapAnalysisTask.java | 40 +++++++++++++++++++--- .../doris/statistics/OlapAnalysisTaskTest.java | 2 +- 4 files changed, 39 insertions(+), 8 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java index a278200e5c7..f3fa143b528 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java @@ -72,8 +72,8 @@ public abstract class BaseAnalysisTask { + "${idxId} AS `idx_id`, " + "'${colId}' AS `col_id`, " + "NULL AS `part_id`, " - + "ROUND(COUNT(1) * ${scaleFactor}) AS `row_count`, " - + "ROUND(NDV(`${colName}`) * ${scaleFactor}) as `ndv`, " + + "${rowCount} AS `row_count`, " + + "${ndvFunction} as `ndv`, " + "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS `null_count`, " + "${min} AS `min`, " + "${max} AS `max`, " diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java index 812bd615a69..4c12236fb40 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java @@ -145,6 +145,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask { if (distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase())) { bucketFlag = true; sb.append(LINEAR_ANALYZE_TEMPLATE); + params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})"); params.put("rowCount", "ROUND(count(1) * ${scaleFactor})"); } else { sb.append(DUJ1_ANALYZE_TEMPLATE); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java index d7037580595..97cb10c520c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java @@ -17,6 +17,7 @@ package org.apache.doris.statistics; +import org.apache.doris.catalog.KeysType; import org.apache.doris.catalog.MaterializedIndex; import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.Partition; @@ -129,21 +130,26 @@ public class OlapAnalysisTask extends BaseAnalysisTask { } StringSubstitutor stringSubstitutor = new StringSubstitutor(params); String sql; - // Single distribution column is not fit for DUJ1 estimator, use linear estimator. - Set<String> distributionColumns = tbl.getDistributionColumnNames(); - if (distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase())) { + if (useLinearAnalyzeTemplate()) { params.put("min", StatisticsUtil.quote(min)); params.put("max", StatisticsUtil.quote(max)); + // For single unique key, use count as ndv. + if (isSingleUniqueKey()) { + params.put("ndvFunction", String.valueOf(rowCount)); + } else { + params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})"); + } sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE); } else { params.put("dataSizeFunction", getDataSizeFunction(col, true)); sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE); } LOG.info("Sample for column [{}]. Total rows [{}], rows to sample [{}], scale factor [{}], " - + "limited [{}], distribute column [{}], partition column [{}], key column [{}]", + + "limited [{}], distribute column [{}], partition column [{}], key column [{}], " + + "is single unique key [{}]", col.getName(), params.get("rowCount"), rowsToSample, params.get("scaleFactor"), limitFlag, tbl.isDistributionColumn(col.getName()), - tbl.isPartitionColumn(col.getName()), col.isKey()); + tbl.isPartitionColumn(col.getName()), col.isKey(), isSingleUniqueKey()); runQuery(sql, false); } } @@ -278,4 +284,28 @@ public class OlapAnalysisTask extends BaseAnalysisTask { } return sampleRows; } + + /** + * Check if the task should use linear analyze template. + * @return True for single unique key column and single distribution column. + */ + protected boolean useLinearAnalyzeTemplate() { + if (isSingleUniqueKey()) { + return true; + } + Set<String> distributionColumns = tbl.getDistributionColumnNames(); + return distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase()); + } + + /** + * Check if the olap table has a single unique key. + * @return True if the table has a single unique/agg key. False otherwise. + */ + protected boolean isSingleUniqueKey() { + int keysNum = ((OlapTable) tbl).getKeysNum(); + KeysType keysType = ((OlapTable) tbl).getKeysType(); + return col.isKey() + && keysNum == 1 + && (keysType.equals(KeysType.UNIQUE_KEYS) || keysType.equals(KeysType.AGG_KEYS)); + } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java index 9437d2d0787..8e30519e8c4 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java @@ -218,7 +218,7 @@ public class OlapAnalysisTaskTest { @Mock public void runQuery(String sql, boolean needEncode) { Assertions.assertFalse(needEncode); - Assertions.assertEquals(" SELECT CONCAT(30001, '-', -1, '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, ROUND(COUNT(1) * 5.0) AS `row_count`, ROUND(NDV(`${colName}`) * 5.0) as `ndv`, ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 5.0) AS `null_count`, 'MQ==' AS `min`, 'Mg==' AS `max`, SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() FROM `catalogName`.`${dbName}`. [...] + Assertions.assertEquals(" SELECT CONCAT(30001, '-', -1, '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, ROUND(NDV(`${colName}`) * 5.0) as `ndv`, ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 5.0) AS `null_count`, 'MQ==' AS `min`, 'Mg==' AS `max`, SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() FROM `catalogName`.`${dbName}`.`${tblName}` limit [...] return; } }; --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org