This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 1130317b91d [Improvement](statistics)Collect stats for hive partition 
column using metadata (#24853)
1130317b91d is described below

commit 1130317b91d5a83282a1cb0b144449cee95cf5e4
Author: Jibing-Li <[email protected]>
AuthorDate: Tue Oct 17 10:31:57 2023 +0800

    [Improvement](statistics)Collect stats for hive partition column using 
metadata (#24853)
    
    Hive partition columns' stats could be calculated from hive metastore data. 
Doesn't need to execute sql to get the stats.
    This PR is using hive partition metadata to collect partition column stats.
---
 docs/en/docs/query-acceleration/statistics.md      |   5 +-
 docs/zh-CN/docs/query-acceleration/statistics.md   |   5 +-
 fe/fe-core/src/main/cup/sql_parser.cup             |   7 +
 .../apache/doris/analysis/AnalyzeProperties.java   |   7 +-
 .../org/apache/doris/analysis/AnalyzeStmt.java     |   4 +
 .../doris/datasource/hive/HiveMetaStoreCache.java  |  14 +-
 .../apache/doris/external/hive/util/HiveUtil.java  |  16 ++
 .../org/apache/doris/statistics/AnalysisInfo.java  |   8 +-
 .../doris/statistics/AnalysisInfoBuilder.java      |  20 +-
 .../apache/doris/statistics/AnalysisManager.java   |   1 +
 .../apache/doris/statistics/BaseAnalysisTask.java  |  18 ++
 .../apache/doris/statistics/HMSAnalysisTask.java   | 154 +++++++++++---
 .../doris/statistics/util/StatisticsUtil.java      |   5 +
 fe/fe-core/src/main/jflex/sql_scanner.flex         |   1 +
 .../hive/test_hive_partition_column_analyze.groovy | 230 +++++++++++++++++++++
 15 files changed, 447 insertions(+), 48 deletions(-)

diff --git a/docs/en/docs/query-acceleration/statistics.md 
b/docs/en/docs/query-acceleration/statistics.md
index 127240626e6..4bb3d941b07 100644
--- a/docs/en/docs/query-acceleration/statistics.md
+++ b/docs/en/docs/query-acceleration/statistics.md
@@ -52,9 +52,9 @@ Syntax:
 
 ```SQL
 ANALYZE < TABLE | DATABASE table_name | db_name >
-    [ PARTITIONS [(*) | (partition_name [, ...]) | WITH RECENT COUNT] ]
+    [ PARTITIONS [(*) | (partition_name [, ...]) | WITH RECENT COUNT ] ]
     [ (column_name [, ...]) ]
-    [ [ WITH SYNC ] [ WITH SAMPLE PERCENT | ROWS ]]
+    [ [ WITH SYNC ] [ WITH SAMPLE PERCENT | ROWS ] [ WITH SQL ] ]
     [ PROPERTIES ("key" = "value", ...) ];
 ```
 
@@ -65,6 +65,7 @@ Where:
 - `column_name`: Specifies the target column. It must be an existing column in 
`table_name`, and multiple column names are separated by commas.
 - `sync`: Collect statistics synchronously. Returns upon completion. If not 
specified, it executes asynchronously and returns a task ID.
 - `sample percent | rows`: Collect statistics using sampling. You can specify 
either the sampling percentage or the number of sampled rows.
+- `sql`: Collect statistics for external partition column with sql. By 
default, it uses meta data for partition columns, which is faster but may 
inaccurate for row count and size. Using sql could collect the accurate stats.
 
 ### Automatic Statistics Collection
 
diff --git a/docs/zh-CN/docs/query-acceleration/statistics.md 
b/docs/zh-CN/docs/query-acceleration/statistics.md
index 9bd69f94482..fd3066995e6 100644
--- a/docs/zh-CN/docs/query-acceleration/statistics.md
+++ b/docs/zh-CN/docs/query-acceleration/statistics.md
@@ -51,9 +51,9 @@ Doris支持用户通过提交ANALYZE语句来触发统计信息的收集和更
 
 ```SQL
 ANALYZE < TABLE | DATABASE table_name | db_name > 
-    [ PARTITIONS [(*) | (partition_name [, ...]) | WITH RECENT COUNT] ]
+    [ PARTITIONS [(*) | (partition_name [, ...]) | WITH RECENT COUNT ] ]
     [ (column_name [, ...]) ]
-    [ [ WITH SYNC ] [ WITH SAMPLE PERCENT | ROWS ]]
+    [ [ WITH SYNC ] [ WITH SAMPLE PERCENT | ROWS ] [ WITH SQL ] ]
     [ PROPERTIES ("key" = "value", ...) ];
 ```
 
@@ -64,6 +64,7 @@ ANALYZE < TABLE | DATABASE table_name | db_name >
 - column_name: 指定的目标列。必须是  `table_name`  中存在的列,多个列名称用逗号分隔。
 - sync:同步收集统计信息。收集完后返回。若不指定则异步执行并返回任务 ID。
 - sample percent | rows:抽样收集统计信息。可以指定抽样比例或者抽样行数。
+- 
sql:执行sql来收集外表分区列统计信息。默认从元数据收集分区列信息,这样效率比较高但是行数和数据量大小可能不准。用户可以指定使用sql来收集,这样可以收集到准确的分区列信息。
 
 
 ### 自动收集
diff --git a/fe/fe-core/src/main/cup/sql_parser.cup 
b/fe/fe-core/src/main/cup/sql_parser.cup
index 989a28d5484..4b6c74fbe95 100644
--- a/fe/fe-core/src/main/cup/sql_parser.cup
+++ b/fe/fe-core/src/main/cup/sql_parser.cup
@@ -577,6 +577,7 @@ terminal String
     KW_SNAPSHOT,
     KW_SONAME,
     KW_SPLIT,
+    KW_SQL,
     KW_SQL_BLOCK_RULE,
     KW_START,
     KW_STARTS,    
@@ -6142,6 +6143,12 @@ with_analysis_properties ::=
              put(AnalyzeProperties.PROPERTY_FORCE_FULL, "true");
         }};
     :}
+    | KW_SQL
+    {:
+        RESULT = new HashMap<String, String>() {{
+             put(AnalyzeProperties.PROPERTY_PARTITION_COLUMN_FROM_SQL, "true");
+        }};
+    :}
 
     ;
 
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java 
b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java
index d7e639da3a5..4b5f161d2be 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java
@@ -42,8 +42,8 @@ public class AnalyzeProperties {
     public static final String PROPERTY_NUM_BUCKETS = "num.buckets";
     public static final String PROPERTY_ANALYSIS_TYPE = "analysis.type";
     public static final String PROPERTY_PERIOD_SECONDS = "period.seconds";
-
     public static final String PROPERTY_FORCE_FULL = "force.full";
+    public static final String PROPERTY_PARTITION_COLUMN_FROM_SQL = 
"partition.column.from.sql";
 
     public static final AnalyzeProperties DEFAULT_PROP = new 
AnalyzeProperties(new HashMap<String, String>() {
         {
@@ -71,6 +71,7 @@ public class AnalyzeProperties {
             .add(PROPERTY_PERIOD_SECONDS)
             .add(PROPERTY_PERIOD_CRON)
             .add(PROPERTY_FORCE_FULL)
+            .add(PROPERTY_PARTITION_COLUMN_FROM_SQL)
             .build();
 
     public AnalyzeProperties(Map<String, String> properties) {
@@ -276,6 +277,10 @@ public class AnalyzeProperties {
         return properties.containsKey(PROPERTY_SAMPLE_ROWS);
     }
 
+    public boolean usingSqlForPartitionColumn() {
+        return properties.containsKey(PROPERTY_PARTITION_COLUMN_FROM_SQL);
+    }
+
     public String toSQL() {
         StringBuilder sb = new StringBuilder();
         sb.append("PROPERTIES(");
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeStmt.java 
b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeStmt.java
index ae2c6a7ff48..2c4a51757cd 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeStmt.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeStmt.java
@@ -97,4 +97,8 @@ public class AnalyzeStmt extends StatementBase {
     public boolean forceFull() {
         return analyzeProperties.forceFull();
     }
+
+    public boolean usingSqlForPartitionColumn() {
+        return analyzeProperties.usingSqlForPartitionColumn();
+    }
 }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java
 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java
index e5a43f3bb0f..8c9b5552a72 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java
@@ -87,10 +87,7 @@ import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 
 import java.io.FileNotFoundException;
-import java.io.UnsupportedEncodingException;
 import java.net.URI;
-import java.net.URLDecoder;
-import java.nio.charset.StandardCharsets;
 import java.security.PrivilegedExceptionAction;
 import java.util.ArrayList;
 import java.util.HashMap;
@@ -298,16 +295,7 @@ public class HiveMetaStoreCache {
         Preconditions.checkState(parts.length == types.size(), partitionName + 
" vs. " + types);
         List<PartitionValue> values = 
Lists.newArrayListWithExpectedSize(types.size());
         for (String part : parts) {
-            String[] kv = part.split("=");
-            Preconditions.checkState(kv.length == 2, partitionName);
-            String partitionValue;
-            try {
-                // hive partition value maybe contains special characters like 
'=' and '/'
-                partitionValue = URLDecoder.decode(kv[1], 
StandardCharsets.UTF_8.name());
-            } catch (UnsupportedEncodingException e) {
-                // It should not be here
-                throw new RuntimeException(e);
-            }
+            String partitionValue = HiveUtil.getHivePartitionValue(part);
             values.add(new PartitionValue(partitionValue, 
HIVE_DEFAULT_PARTITION.equals(partitionValue)));
         }
         try {
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/external/hive/util/HiveUtil.java 
b/fe/fe-core/src/main/java/org/apache/doris/external/hive/util/HiveUtil.java
index 85b8034fe70..deb048b5943 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/external/hive/util/HiveUtil.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/external/hive/util/HiveUtil.java
@@ -27,6 +27,7 @@ import org.apache.doris.fs.FileSystemFactory;
 import org.apache.doris.fs.remote.BrokerFileSystem;
 import org.apache.doris.fs.remote.RemoteFileSystem;
 
+import com.google.common.base.Preconditions;
 import com.google.common.collect.Lists;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -46,8 +47,11 @@ import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 
 import java.io.IOException;
+import java.io.UnsupportedEncodingException;
 import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Method;
+import java.net.URLDecoder;
+import java.nio.charset.StandardCharsets;
 import java.util.List;
 
 /**
@@ -221,4 +225,16 @@ public final class HiveUtil {
         }
     }
 
+    public static String getHivePartitionValue(String part) {
+        String[] kv = part.split("=");
+        Preconditions.checkState(kv.length == 2, String.format("Malformed 
partition name %s", part));
+        try {
+            // hive partition value maybe contains special characters like '=' 
and '/'
+            return URLDecoder.decode(kv[1], StandardCharsets.UTF_8.name());
+        } catch (UnsupportedEncodingException e) {
+            // It should not be here
+            throw new RuntimeException(e);
+        }
+    }
+
 }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java
index 2bf06d0c2c4..9c298a6ea88 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java
@@ -179,13 +179,17 @@ public class AnalysisInfo implements Writable {
     @SerializedName("forceFull")
     public final boolean forceFull;
 
+    @SerializedName("usingSqlForPartitionColumn")
+    public final boolean usingSqlForPartitionColumn;
+
     public AnalysisInfo(long jobId, long taskId, List<Long> taskIds, long 
catalogId, long dbId, long tblId,
             Map<String, Set<String>> colToPartitions, Set<String> 
partitionNames, String colName, Long indexId,
             JobType jobType, AnalysisMode analysisMode, AnalysisMethod 
analysisMethod, AnalysisType analysisType,
             int samplePercent, long sampleRows, int maxBucketNum, long 
periodTimeInMs, String message,
             long lastExecTimeInMs, long timeCostInMs, AnalysisState state, 
ScheduleType scheduleType,
             boolean isExternalTableLevelTask, boolean partitionOnly, boolean 
samplingPartition,
-            boolean isAllPartition, long partitionCount, CronExpression 
cronExpression, boolean forceFull) {
+            boolean isAllPartition, long partitionCount, CronExpression 
cronExpression, boolean forceFull,
+            boolean usingSqlForPartitionColumn) {
         this.jobId = jobId;
         this.taskId = taskId;
         this.taskIds = taskIds;
@@ -219,6 +223,7 @@ public class AnalysisInfo implements Writable {
             this.cronExprStr = cronExpression.getCronExpression();
         }
         this.forceFull = forceFull;
+        this.usingSqlForPartitionColumn = usingSqlForPartitionColumn;
     }
 
     @Override
@@ -259,6 +264,7 @@ public class AnalysisInfo implements Writable {
             sj.add("cronExpr: " + cronExprStr);
         }
         sj.add("forceFull: " + forceFull);
+        sj.add("usingSqlForPartitionColumn: " + usingSqlForPartitionColumn);
         return sj.toString();
     }
 
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java
index b6dc7b5dd74..18cf3e30792 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java
@@ -60,6 +60,7 @@ public class AnalysisInfoBuilder {
     private long partitionCount;
     private CronExpression cronExpression;
     private boolean forceFull;
+    private boolean usingSqlForPartitionColumn;
 
     public AnalysisInfoBuilder() {
     }
@@ -95,6 +96,7 @@ public class AnalysisInfoBuilder {
         partitionCount = info.partitionCount;
         cronExpression = info.cronExpression;
         forceFull = info.forceFull;
+        usingSqlForPartitionColumn = info.usingSqlForPartitionColumn;
     }
 
     public AnalysisInfoBuilder setJobId(long jobId) {
@@ -237,12 +239,19 @@ public class AnalysisInfoBuilder {
         return this;
     }
 
-    public void setCronExpression(CronExpression cronExpression) {
+    public AnalysisInfoBuilder setCronExpression(CronExpression 
cronExpression) {
         this.cronExpression = cronExpression;
+        return this;
     }
 
-    public void setForceFull(boolean forceFull) {
+    public AnalysisInfoBuilder setForceFull(boolean forceFull) {
         this.forceFull = forceFull;
+        return this;
+    }
+
+    public AnalysisInfoBuilder setUsingSqlForPartitionColumn(boolean 
usingSqlForPartitionColumn) {
+        this.usingSqlForPartitionColumn = usingSqlForPartitionColumn;
+        return this;
     }
 
     public AnalysisInfo build() {
@@ -250,7 +259,7 @@ public class AnalysisInfoBuilder {
                 colName, indexId, jobType, analysisMode, analysisMethod, 
analysisType, samplePercent,
                 sampleRows, maxBucketNum, periodTimeInMs, message, 
lastExecTimeInMs, timeCostInMs, state, scheduleType,
                 externalTableLevelTask, partitionOnly, samplingPartition, 
isAllPartition, partitionCount,
-                cronExpression, forceFull);
+                cronExpression, forceFull, usingSqlForPartitionColumn);
     }
 
     public AnalysisInfoBuilder copy() {
@@ -281,6 +290,9 @@ public class AnalysisInfoBuilder {
                 .setSamplingPartition(samplingPartition)
                 .setPartitionOnly(partitionOnly)
                 .setAllPartition(isAllPartition)
-                .setPartitionCount(partitionCount);
+                .setPartitionCount(partitionCount)
+                .setCronExpression(cronExpression)
+                .setForceFull(forceFull)
+                .setUsingSqlForPartitionColumn(usingSqlForPartitionColumn);
     }
 }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java
index 1099fd3e1fd..01c4e3c25ba 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java
@@ -537,6 +537,7 @@ public class AnalysisManager extends Daemon implements 
Writable {
         infoBuilder.setLastExecTimeInMs(0);
         infoBuilder.setCronExpression(cronExpression);
         infoBuilder.setForceFull(stmt.forceFull());
+        
infoBuilder.setUsingSqlForPartitionColumn(stmt.usingSqlForPartitionColumn());
         if (analysisMethod == AnalysisMethod.SAMPLE) {
             infoBuilder.setSamplePercent(samplePercent);
             infoBuilder.setSampleRows(sampleRows);
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
index 04b775bcd63..1377fe05ff8 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
@@ -93,6 +93,24 @@ public abstract class BaseAnalysisTask {
             + "     ${internalDB}.${columnStatTbl}.part_id IS NOT NULL"
             + "     ) t1, \n";
 
+    protected static final String ANALYZE_PARTITION_COLUMN_TEMPLATE = "INSERT 
INTO "
+            + "${internalDB}.${columnStatTbl}"
+            + " SELECT "
+            + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, "
+            + "${catalogId} AS catalog_id, "
+            + "${dbId} AS db_id, "
+            + "${tblId} AS tbl_id, "
+            + "${idxId} AS idx_id, "
+            + "'${colId}' AS col_id, "
+            + "NULL AS part_id, "
+            + "${row_count} AS row_count, "
+            + "${ndv} AS ndv, "
+            + "${null_count} AS null_count, "
+            + "'${min}' AS min, "
+            + "'${max}' AS max, "
+            + "${data_size} AS data_size, "
+            + "NOW() ";
+
     protected AnalysisInfo info;
 
     protected CatalogIf<? extends DatabaseIf<? extends TableIf>> catalog;
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
index 7ef87ac9401..df2396de034 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
@@ -21,12 +21,15 @@ import org.apache.doris.catalog.Column;
 import org.apache.doris.catalog.Env;
 import org.apache.doris.catalog.external.HMSExternalTable;
 import org.apache.doris.common.FeConstants;
+import org.apache.doris.datasource.hive.HiveMetaStoreCache;
+import org.apache.doris.external.hive.util.HiveUtil;
 import org.apache.doris.qe.AutoCloseConnectContext;
 import org.apache.doris.qe.QueryState;
 import org.apache.doris.qe.StmtExecutor;
 import org.apache.doris.statistics.util.StatisticsUtil;
 
 import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
 import org.apache.commons.text.StringSubstitutor;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
@@ -118,7 +121,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
      * Get table row count
      */
     private void getTableStats() throws Exception {
-        Map<String, String> params = buildTableStatsParams(null);
+        Map<String, String> params = buildStatsParams(null);
         List<ResultRow> columnResult =
                 StatisticsUtil.execStatisticQuery(new StringSubstitutor(params)
                         .replace(ANALYZE_TABLE_COUNT_TEMPLATE));
@@ -132,6 +135,33 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
      * Get column statistics and insert the result to 
__internal_schema.column_statistics
      */
     private void getTableColumnStats() throws Exception {
+        if (isPartitionOnly) {
+            getPartitionNames();
+            List<String> partitionAnalysisSQLs = new ArrayList<>();
+            for (String partId : this.partitionNames) {
+                partitionAnalysisSQLs.add(generateSqlForPartition(partId));
+            }
+            execSQLs(partitionAnalysisSQLs);
+        } else {
+            if (!info.usingSqlForPartitionColumn && isPartitionColumn()) {
+                try {
+                    getPartitionColumnStats();
+                } catch (Exception e) {
+                    LOG.warn("Failed to collect stats for partition col {} 
using metadata, "
+                            + "fallback to normal collection", col.getName(), 
e);
+                    getOrdinaryColumnStats();
+                }
+            } else {
+                getOrdinaryColumnStats();
+            }
+        }
+    }
+
+    private boolean isPartitionColumn() {
+        return table.getPartitionColumns().stream().anyMatch(c -> 
c.getName().equals(col.getName()));
+    }
+
+    private void getOrdinaryColumnStats() throws Exception {
         // An example sql for a column stats:
         // INSERT INTO __internal_schema.column_statistics
         //   SELECT CONCAT(13055, '-', -1, '-', 'r_regionkey') AS id,
@@ -148,26 +178,100 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
         //   MAX(`r_regionkey`) AS max,
         //   0 AS data_size,
         //   NOW() FROM `hive`.`tpch100`.`region`
-        if (isPartitionOnly) {
-            getPartitionNames();
-            List<String> partitionAnalysisSQLs = new ArrayList<>();
-            for (String partId : this.partitionNames) {
-                partitionAnalysisSQLs.add(generateSqlForPartition(partId));
+        StringBuilder sb = new StringBuilder();
+        sb.append(ANALYZE_TABLE_TEMPLATE);
+        Map<String, String> params = buildStatsParams("NULL");
+        params.put("dataSizeFunction", getDataSizeFunction(col));
+        StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
+        String sql = stringSubstitutor.replace(sb.toString());
+        executeInsertSql(sql);
+    }
+
+    private void getPartitionColumnStats() throws Exception {
+        Set<String> partitionNames = table.getPartitionNames();
+        Set<String> ndvPartValues = Sets.newHashSet();
+        long numNulls = 0;
+        long dataSize = 0;
+        String min = null;
+        String max = null;
+        for (String names : partitionNames) {
+            // names is like "date=20230101" for one level partition
+            // and like "date=20230101/hour=12" for two level partition
+            String[] parts = names.split("/");
+            for (String part : parts) {
+                if (part.startsWith(col.getName())) {
+                    String value = HiveUtil.getHivePartitionValue(part);
+                    // HIVE_DEFAULT_PARTITION hive partition value when the 
partition name is not specified.
+                    if (value == null || value.isEmpty() || 
value.equals(HiveMetaStoreCache.HIVE_DEFAULT_PARTITION)) {
+                        numNulls += 1;
+                        continue;
+                    }
+                    ndvPartValues.add(value);
+                    dataSize += col.getType().isStringType() ? value.length() 
: col.getType().getSlotSize();
+                    min = updateMinValue(min, value);
+                    max = updateMaxValue(max, value);
+                }
+            }
+        }
+        // Estimate the row count. This value is inaccurate if the table stats 
is empty.
+        TableStatsMeta tableStatsStatus = 
Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId());
+        long count = tableStatsStatus == null ? table.estimatedRowCount() : 
tableStatsStatus.rowCount;
+        dataSize = dataSize * count / partitionNames.size();
+        numNulls = numNulls * count / partitionNames.size();
+        int ndv = ndvPartValues.size();
+
+        Map<String, String> params = buildStatsParams("NULL");
+        params.put("row_count", String.valueOf(count));
+        params.put("ndv", String.valueOf(ndv));
+        params.put("null_count", String.valueOf(numNulls));
+        params.put("min", min);
+        params.put("max", max);
+        params.put("data_size", String.valueOf(dataSize));
+        StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
+        String sql = 
stringSubstitutor.replace(ANALYZE_PARTITION_COLUMN_TEMPLATE);
+        executeInsertSql(sql);
+    }
+
+    private String updateMinValue(String currentMin, String value) {
+        if (currentMin == null) {
+            return value;
+        }
+        if (col.getType().isFixedPointType()) {
+            if (Long.parseLong(value) < Long.parseLong(currentMin)) {
+                return value;
+            } else {
+                return currentMin;
+            }
+        }
+        if (col.getType().isFloatingPointType() || col.getType().isDecimalV2() 
|| col.getType().isDecimalV3()) {
+            if (Double.parseDouble(value) < Double.parseDouble(currentMin)) {
+                return value;
+            } else {
+                return currentMin;
+            }
+        }
+        return value.compareTo(currentMin) < 0 ? value : currentMin;
+    }
+
+    private String updateMaxValue(String currentMax, String value) {
+        if (currentMax == null) {
+            return value;
+        }
+        if (col.getType().isFixedPointType()) {
+            if (Long.parseLong(value) > Long.parseLong(currentMax)) {
+                return value;
+            } else {
+                return currentMax;
+            }
+        }
+        if (col.getType().isFloatingPointType() || col.getType().isDecimalV2() 
|| col.getType().isDecimalV3()) {
+            if (Double.parseDouble(value) > Double.parseDouble(currentMax)) {
+                return value;
+            } else {
+                return currentMax;
             }
-            execSQLs(partitionAnalysisSQLs);
-        } else {
-            StringBuilder sb = new StringBuilder();
-            sb.append(ANALYZE_TABLE_TEMPLATE);
-            Map<String, String> params = buildTableStatsParams("NULL");
-            params.put("internalDB", FeConstants.INTERNAL_DB_NAME);
-            params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME);
-            params.put("colName", col.getName());
-            params.put("colId", info.colName);
-            params.put("dataSizeFunction", getDataSizeFunction(col));
-            StringSubstitutor stringSubstitutor = new 
StringSubstitutor(params);
-            String sql = stringSubstitutor.replace(sb.toString());
-            executeInsertSql(sql);
         }
+        return value.compareTo(currentMax) > 0 ? value : currentMax;
     }
 
     private void getPartitionNames() {
@@ -198,11 +302,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
                 sb.append(" and ");
             }
         }
-        Map<String, String> params = buildTableStatsParams(partId);
-        params.put("internalDB", FeConstants.INTERNAL_DB_NAME);
-        params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME);
-        params.put("colName", col.getName());
-        params.put("colId", info.colName);
+        Map<String, String> params = buildStatsParams(partId);
         params.put("dataSizeFunction", getDataSizeFunction(col));
         return new StringSubstitutor(params).replace(sb.toString());
     }
@@ -262,7 +362,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
         }
     }
 
-    private Map<String, String> buildTableStatsParams(String partId) {
+    private Map<String, String> buildStatsParams(String partId) {
         Map<String, String> commonParams = new HashMap<>();
         String id = StatisticsUtil.constructId(tbl.getId(), -1);
         if (partId == null) {
@@ -271,12 +371,16 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
             id = StatisticsUtil.constructId(id, partId);
             commonParams.put("partId", "\'" + partId + "\'");
         }
+        commonParams.put("internalDB", FeConstants.INTERNAL_DB_NAME);
+        commonParams.put("columnStatTbl", 
StatisticConstants.STATISTIC_TBL_NAME);
         commonParams.put("id", id);
         commonParams.put("catalogId", String.valueOf(catalog.getId()));
         commonParams.put("dbId", String.valueOf(db.getId()));
         commonParams.put("tblId", String.valueOf(tbl.getId()));
         commonParams.put("indexId", "-1");
         commonParams.put("idxId", "-1");
+        commonParams.put("colName", info.colName);
+        commonParams.put("colId", info.colName);
         commonParams.put("catalogName", catalog.getName());
         commonParams.put("dbName", db.getFullName());
         commonParams.put("tblName", tbl.getName());
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
index dba5075aae1..169ac3e3383 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
@@ -629,7 +629,12 @@ public class StatisticsUtil {
         }
         // Estimate row count: totalSize/estimatedRowSize
         long estimatedRowSize = 0;
+        List<Column> partitionColumns = table.getPartitionColumns();
         for (Column column : table.getFullSchema()) {
+            // Partition column shouldn't count to the row size, because it is 
not in the data file.
+            if (partitionColumns != null && partitionColumns.contains(column)) 
{
+                continue;
+            }
             estimatedRowSize += column.getDataType().getSlotSize();
         }
         if (estimatedRowSize == 0) {
diff --git a/fe/fe-core/src/main/jflex/sql_scanner.flex 
b/fe/fe-core/src/main/jflex/sql_scanner.flex
index 10fa8600ac7..d4423f7d4c8 100644
--- a/fe/fe-core/src/main/jflex/sql_scanner.flex
+++ b/fe/fe-core/src/main/jflex/sql_scanner.flex
@@ -429,6 +429,7 @@ import org.apache.doris.qe.SqlModeHelper;
         keywordMap.put("snapshot", new Integer(SqlParserSymbols.KW_SNAPSHOT));
         keywordMap.put("soname", new Integer(SqlParserSymbols.KW_SONAME));
         keywordMap.put("split", new Integer(SqlParserSymbols.KW_SPLIT));
+        keywordMap.put("sql", new Integer(SqlParserSymbols.KW_SQL));
         keywordMap.put("sql_block_rule", new 
Integer(SqlParserSymbols.KW_SQL_BLOCK_RULE));
         keywordMap.put("sample", new Integer(SqlParserSymbols.KW_SAMPLE));
         keywordMap.put("start", new Integer(SqlParserSymbols.KW_START));
diff --git 
a/regression-test/suites/external_table_p2/hive/test_hive_partition_column_analyze.groovy
 
b/regression-test/suites/external_table_p2/hive/test_hive_partition_column_analyze.groovy
new file mode 100644
index 00000000000..42aeea1524a
--- /dev/null
+++ 
b/regression-test/suites/external_table_p2/hive/test_hive_partition_column_analyze.groovy
@@ -0,0 +1,230 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_hive_partition_column_analyze", 
"p2,external,hive,external_remote,external_remote_hive") {
+    String enabled = context.config.otherConfigs.get("enableExternalHiveTest")
+    if (enabled != null && enabled.equalsIgnoreCase("true")) {
+        String extHiveHmsHost = 
context.config.otherConfigs.get("extHiveHmsHost")
+        String extHiveHmsPort = 
context.config.otherConfigs.get("extHiveHmsPort")
+        String catalog_name = "test_hive_partition_column_analyze"
+        sql """drop catalog if exists ${catalog_name};"""
+        sql """
+            create catalog if not exists ${catalog_name} properties (
+                'type'='hms',
+                'hadoop.username' = 'hadoop',
+                'hive.metastore.uris' = 
'thrift://${extHiveHmsHost}:${extHiveHmsPort}'
+            );
+        """
+        logger.info("catalog " + catalog_name + " created")
+
+        // Test analyze table without init.
+        sql """analyze table 
${catalog_name}.multi_partition.multi_partition_parquet (event_day) with sync"""
+        sql """analyze table 
${catalog_name}.multi_partition.multi_partition_orc (event_day) with sync"""
+
+        sql """switch ${catalog_name};"""
+        logger.info("switched to catalog " + catalog_name)
+        sql """use multi_partition;"""
+        def result = sql """show column stats multi_partition_parquet 
(event_day)"""
+        assertTrue(result.size() == 1)
+        assertTrue(result[0][0] == "event_day")
+        assertTrue(result[0][1] == "3.83714205E8")
+        assertTrue(result[0][2] == "99949.0")
+        assertTrue(result[0][3] == "0.0")
+        assertTrue(result[0][4] == "3.83714205E9")
+        assertTrue(result[0][5] == "10.0")
+        assertTrue(result[0][6] == "\'1749-09-24\'")
+        assertTrue(result[0][7] == "\'2023-05-26\'")
+
+        result = sql """show column stats multi_partition_orc (event_day)"""
+        assertTrue(result.size() == 1)
+        assertTrue(result[0][0] == "event_day")
+        assertTrue(result[0][1] == "1.9007155E8")
+        assertTrue(result[0][2] == "99949.0")
+        assertTrue(result[0][3] == "0.0")
+        assertTrue(result[0][4] == "1.9007155E9")
+        assertTrue(result[0][5] == "10.0")
+        assertTrue(result[0][6] == "\'1749-09-24\'")
+        assertTrue(result[0][7] == "\'2023-05-26\'")
+
+        sql """analyze table ${catalog_name}.partition_type.tinyint_partition 
(tinyint_part) with sync"""
+        sql """analyze table ${catalog_name}.partition_type.smallint_partition 
(smallint_part) with sync"""
+        sql """analyze table ${catalog_name}.partition_type.int_partition 
(int_part) with sync"""
+        sql """analyze table ${catalog_name}.partition_type.bigint_partition 
(bigint_part) with sync"""
+        sql """analyze table ${catalog_name}.partition_type.char_partition 
(char_part) with sync"""
+        sql """analyze table ${catalog_name}.partition_type.varchar_partition 
(varchar_part) with sync"""
+        sql """analyze table ${catalog_name}.partition_type.string_partition 
(string_part) with sync"""
+        sql """analyze table ${catalog_name}.partition_type.date_partition 
(date_part) with sync"""
+        sql """analyze table ${catalog_name}.partition_type.float_partition 
(float_part) with sync"""
+        sql """analyze table ${catalog_name}.partition_type.double_partition 
(double_part) with sync"""
+        sql """analyze table ${catalog_name}.partition_type.decimal_partition 
(decimal_part) with sync"""
+        sql """analyze table ${catalog_name}.partition_type.two_partition 
(part1, part2) with sync"""
+
+        sql """use partition_type;"""
+
+        result = sql """show column stats tinyint_partition (tinyint_part)"""
+        assertTrue(result.size() == 1)
+        assertTrue(result[0][0] == "tinyint_part")
+        assertTrue(result[0][1] == "141474.0")
+        assertTrue(result[0][2] == "100.0")
+        assertTrue(result[0][3] == "0.0")
+        assertTrue(result[0][4] == "141474.0")
+        assertTrue(result[0][5] == "1.0")
+        assertTrue(result[0][6] == "1")
+        assertTrue(result[0][7] == "100")
+
+        result = sql """show column stats smallint_partition (smallint_part)"""
+        assertTrue(result.size() == 1)
+        assertTrue(result[0][0] == "smallint_part")
+        assertTrue(result[0][1] == "141474.0")
+        assertTrue(result[0][2] == "100.0")
+        assertTrue(result[0][3] == "0.0")
+        assertTrue(result[0][4] == "282948.0")
+        assertTrue(result[0][5] == "2.0")
+        assertTrue(result[0][6] == "1")
+        assertTrue(result[0][7] == "100")
+
+        result = sql """show column stats int_partition (int_part)"""
+        assertTrue(result.size() == 1)
+        assertTrue(result[0][0] == "int_part")
+        assertTrue(result[0][1] == "141474.0")
+        assertTrue(result[0][2] == "100.0")
+        assertTrue(result[0][3] == "0.0")
+        assertTrue(result[0][4] == "565896.0")
+        assertTrue(result[0][5] == "4.0")
+        assertTrue(result[0][6] == "1")
+        assertTrue(result[0][7] == "100")
+
+        result = sql """show column stats bigint_partition (bigint_part)"""
+        assertTrue(result.size() == 1)
+        assertTrue(result[0][0] == "bigint_part")
+        assertTrue(result[0][1] == "141474.0")
+        assertTrue(result[0][2] == "100.0")
+        assertTrue(result[0][3] == "0.0")
+        assertTrue(result[0][4] == "1131792.0")
+        assertTrue(result[0][5] == "8.0")
+        assertTrue(result[0][6] == "1")
+        assertTrue(result[0][7] == "100")
+
+        result = sql """show column stats char_partition (char_part)"""
+        assertTrue(result.size() == 1)
+        assertTrue(result[0][0] == "char_part")
+        assertTrue(result[0][1] == "141474.0")
+        assertTrue(result[0][2] == "101.0")
+        assertTrue(result[0][3] == "0.0")
+        assertTrue(result[0][4] == "2829480.0")
+        assertTrue(result[0][5] == "20.0")
+        assertTrue(result[0][6] == "\'1                   \'")
+        assertTrue(result[0][7] == "\'a                   \'")
+
+        result = sql """show column stats varchar_partition (varchar_part)"""
+        assertTrue(result.size() == 1)
+        assertTrue(result[0][0] == "varchar_part")
+        assertTrue(result[0][1] == "141474.0")
+        assertTrue(result[0][2] == "100.0")
+        assertTrue(result[0][3] == "0.0")
+        assertTrue(result[0][4] == "271630.0")
+        assertTrue(result[0][5] == "1.9199994345250717")
+        assertTrue(result[0][6] == "\'1\'")
+        assertTrue(result[0][7] == "\'99\'")
+
+        result = sql """show column stats string_partition (string_part)"""
+        assertTrue(result.size() == 1)
+        assertTrue(result[0][0] == "string_part")
+        assertTrue(result[0][1] == "141474.0")
+        assertTrue(result[0][2] == "100.0")
+        assertTrue(result[0][3] == "0.0")
+        assertTrue(result[0][4] == "271630.0")
+        assertTrue(result[0][5] == "1.9199994345250717")
+        assertTrue(result[0][6] == "\'1\'")
+        assertTrue(result[0][7] == "\'99\'")
+
+        result = sql """show column stats date_partition (date_part)"""
+        assertTrue(result.size() == 1)
+        assertTrue(result[0][0] == "date_part")
+        assertTrue(result[0][1] == "141474.0")
+        assertTrue(result[0][2] == "100.0")
+        assertTrue(result[0][3] == "0.0")
+        assertTrue(result[0][4] == "565896.0")
+        assertTrue(result[0][5] == "4.0")
+        assertTrue(result[0][6] == "\'2001-10-12\'")
+        assertTrue(result[0][7] == "\'2100-10-12\'")
+
+        result = sql """show column stats float_partition (float_part)"""
+        assertTrue(result.size() == 1)
+        assertTrue(result[0][0] == "float_part")
+        assertTrue(result[0][1] == "117416.0")
+        assertTrue(result[0][2] == "100.0")
+        assertTrue(result[0][3] == "0.0")
+        assertTrue(result[0][4] == "469664.0")
+        assertTrue(result[0][5] == "4.0")
+        assertTrue(result[0][6] == "296.31")
+        assertTrue(result[0][7] == "32585.627")
+
+        result = sql """show column stats double_partition (double_part)"""
+        assertTrue(result.size() == 1)
+        assertTrue(result[0][0] == "double_part")
+        assertTrue(result[0][1] == "16987.0")
+        assertTrue(result[0][2] == "100.0")
+        assertTrue(result[0][3] == "0.0")
+        assertTrue(result[0][4] == "135896.0")
+        assertTrue(result[0][5] == "8.0")
+        assertTrue(result[0][6] == "115.145")
+        assertTrue(result[0][7] == "32761.145")
+
+        result = sql """show column stats decimal_partition (decimal_part)"""
+        assertTrue(result.size() == 1)
+        assertTrue(result[0][0] == "decimal_part")
+        assertTrue(result[0][1] == "141474.0")
+        assertTrue(result[0][2] == "100.0")
+        assertTrue(result[0][3] == "0.0")
+        assertTrue(result[0][4] == "1131792.0")
+        assertTrue(result[0][5] == "8.0")
+        assertTrue(result[0][6] == "243.2868")
+        assertTrue(result[0][7] == "32527.1543")
+
+        sql """analyze table ${catalog_name}.partition_type.decimal_partition 
(decimal_part) with sync with sql"""
+        result = sql """show column stats decimal_partition (decimal_part)"""
+        assertTrue(result.size() == 1)
+        assertTrue(result[0][0] == "decimal_part")
+        assertTrue(result[0][1] == "100000.0")
+        assertTrue(result[0][2] == "100.0")
+        assertTrue(result[0][3] == "0.0")
+        assertTrue(result[0][4] == "800000.0")
+        assertTrue(result[0][5] == "8.0")
+        assertTrue(result[0][6] == "243.2868")
+        assertTrue(result[0][7] == "32527.1543")
+
+        result = sql """show column stats two_partition (part1)"""
+        assertTrue(result.size() == 1)
+        assertTrue(result[0][0] == "part1")
+        assertTrue(result[0][2] == "100.0")
+        assertTrue(result[0][3] == "0.0")
+        assertTrue(result[0][6] == "1")
+        assertTrue(result[0][7] == "100")
+
+        result = sql """show column stats two_partition (part2)"""
+        assertTrue(result.size() == 1)
+        assertTrue(result[0][0] == "part2")
+        assertTrue(result[0][2] == "100.0")
+        assertTrue(result[0][3] == "0.0")
+        assertTrue(result[0][6] == "\'1\'")
+        assertTrue(result[0][7] == "\'99\'")
+
+        sql """drop catalog ${catalog_name}"""
+    }
+}
+


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to