This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new d939903753a [improvement](statistics)Use count as ndv for unique/agg 
olap table single key column (#27186)
d939903753a is described below

commit d939903753a3e39f0723f32152c06003a4bf27c4
Author: Jibing-Li <64681310+jibing...@users.noreply.github.com>
AuthorDate: Mon Nov 20 15:49:08 2023 +0800

    [improvement](statistics)Use count as ndv for unique/agg olap table single 
key column (#27186)
    
    The single key column of a unique/agg olap table has the same count and 
ndv, so for this kind of column
    there is no need to calculate ndv; simply use count as ndv.
---
 .../apache/doris/statistics/BaseAnalysisTask.java  |  4 +--
 .../apache/doris/statistics/HMSAnalysisTask.java   |  1 +
 .../apache/doris/statistics/OlapAnalysisTask.java  | 40 +++++++++++++++++++---
 .../doris/statistics/OlapAnalysisTaskTest.java     |  2 +-
 4 files changed, 39 insertions(+), 8 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
index a278200e5c7..f3fa143b528 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
@@ -72,8 +72,8 @@ public abstract class BaseAnalysisTask {
             + "${idxId} AS `idx_id`, "
             + "'${colId}' AS `col_id`, "
             + "NULL AS `part_id`, "
-            + "ROUND(COUNT(1) * ${scaleFactor}) AS `row_count`, "
-            + "ROUND(NDV(`${colName}`) * ${scaleFactor})  as `ndv`, "
+            + "${rowCount} AS `row_count`, "
+            + "${ndvFunction} as `ndv`, "
             + "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 
${scaleFactor}) AS `null_count`, "
             + "${min} AS `min`, "
             + "${max} AS `max`, "
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
index 812bd615a69..4c12236fb40 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
@@ -145,6 +145,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
             if (distributionColumns.size() == 1 && 
distributionColumns.contains(col.getName().toLowerCase())) {
                 bucketFlag = true;
                 sb.append(LINEAR_ANALYZE_TEMPLATE);
+                params.put("ndvFunction", "ROUND(NDV(`${colName}`) * 
${scaleFactor})");
                 params.put("rowCount", "ROUND(count(1) * ${scaleFactor})");
             } else {
                 sb.append(DUJ1_ANALYZE_TEMPLATE);
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
index d7037580595..97cb10c520c 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
@@ -17,6 +17,7 @@
 
 package org.apache.doris.statistics;
 
+import org.apache.doris.catalog.KeysType;
 import org.apache.doris.catalog.MaterializedIndex;
 import org.apache.doris.catalog.OlapTable;
 import org.apache.doris.catalog.Partition;
@@ -129,21 +130,26 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
             }
             StringSubstitutor stringSubstitutor = new 
StringSubstitutor(params);
             String sql;
-            // Single distribution column is not fit for DUJ1 estimator, use 
linear estimator.
-            Set<String> distributionColumns = tbl.getDistributionColumnNames();
-            if (distributionColumns.size() == 1 && 
distributionColumns.contains(col.getName().toLowerCase())) {
+            if (useLinearAnalyzeTemplate()) {
                 params.put("min", StatisticsUtil.quote(min));
                 params.put("max", StatisticsUtil.quote(max));
+                // For single unique key, use count as ndv.
+                if (isSingleUniqueKey()) {
+                    params.put("ndvFunction", String.valueOf(rowCount));
+                } else {
+                    params.put("ndvFunction", "ROUND(NDV(`${colName}`) * 
${scaleFactor})");
+                }
                 sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE);
             } else {
                 params.put("dataSizeFunction", getDataSizeFunction(col, true));
                 sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE);
             }
             LOG.info("Sample for column [{}]. Total rows [{}], rows to sample 
[{}], scale factor [{}], "
-                    + "limited [{}], distribute column [{}], partition column 
[{}], key column [{}]",
+                    + "limited [{}], distribute column [{}], partition column 
[{}], key column [{}], "
+                    + "is single unique key [{}]",
                     col.getName(), params.get("rowCount"), rowsToSample, 
params.get("scaleFactor"),
                     limitFlag, tbl.isDistributionColumn(col.getName()),
-                    tbl.isPartitionColumn(col.getName()), col.isKey());
+                    tbl.isPartitionColumn(col.getName()), col.isKey(), 
isSingleUniqueKey());
             runQuery(sql, false);
         }
     }
@@ -278,4 +284,28 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
         }
         return sampleRows;
     }
+
+    /**
+     * Check if the task should use linear analyze template.
+     * @return True for a single unique/agg key column or a single distribution 
column.
+     */
+    protected boolean useLinearAnalyzeTemplate() {
+        if (isSingleUniqueKey()) {
+            return true;
+        }
+        Set<String> distributionColumns = tbl.getDistributionColumnNames();
+        return distributionColumns.size() == 1 && 
distributionColumns.contains(col.getName().toLowerCase());
+    }
+
+    /**
+     * Check if the column is the only key column of a unique/agg olap table.
+     * @return True if the column is the sole key of a unique/agg table. False otherwise.
+     */
+    protected boolean isSingleUniqueKey() {
+        int keysNum = ((OlapTable) tbl).getKeysNum();
+        KeysType keysType = ((OlapTable) tbl).getKeysType();
+        return col.isKey()
+            && keysNum == 1
+            && (keysType.equals(KeysType.UNIQUE_KEYS) || 
keysType.equals(KeysType.AGG_KEYS));
+    }
 }
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
index 9437d2d0787..8e30519e8c4 100644
--- 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
+++ 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
@@ -218,7 +218,7 @@ public class OlapAnalysisTaskTest {
             @Mock
             public void runQuery(String sql, boolean needEncode) {
                 Assertions.assertFalse(needEncode);
-                Assertions.assertEquals(" SELECT CONCAT(30001, '-', -1, '-', 
'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 
AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, ROUND(COUNT(1) * 5.0) AS 
`row_count`, ROUND(NDV(`${colName}`) * 5.0)  as `ndv`, ROUND(SUM(CASE WHEN 
`${colName}` IS NULL THEN 1 ELSE 0 END) * 5.0) AS `null_count`, 'MQ==' AS 
`min`, 'Mg==' AS `max`, SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() 
FROM `catalogName`.`${dbName}`. [...]
+                Assertions.assertEquals(" SELECT CONCAT(30001, '-', -1, '-', 
'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 
AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, 
ROUND(NDV(`${colName}`) * 5.0) as `ndv`, ROUND(SUM(CASE WHEN `${colName}` IS 
NULL THEN 1 ELSE 0 END) * 5.0) AS `null_count`, 'MQ==' AS `min`, 'Mg==' AS 
`max`, SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() FROM 
`catalogName`.`${dbName}`.`${tblName}`  limit [...]
                 return;
             }
         };


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to