This is an automated email from the ASF dual-hosted git repository.
lijibing pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new fdf9281e847 [improve](statistics)Support sample column skew info.
(#52048)
fdf9281e847 is described below
commit fdf9281e847d825987d0ae418a90d0a77f4ca36e
Author: James <[email protected]>
AuthorDate: Wed Jul 2 09:54:54 2025 +0800
[improve](statistics)Support sample column skew info. (#52048)
### What problem does this PR solve?
Support sample column skew info.
We collect top 3 values with their row count percentage of a column when
doing sample analyzing, and store the result as a string to
column_statistics table.
---
.../org/apache/doris/analysis/AddColumnClause.java | 4 +
.../apache/doris/analysis/ShowColumnStatsStmt.java | 2 +
.../org/apache/doris/catalog/InternalSchema.java | 2 +
.../doris/catalog/InternalSchemaInitializer.java | 28 +++-
.../plans/commands/AlterColumnStatsCommand.java | 1 +
.../apache/doris/statistics/BaseAnalysisTask.java | 142 +++++++++-------
.../org/apache/doris/statistics/ColStatsData.java | 10 +-
.../apache/doris/statistics/ColumnStatistic.java | 33 +++-
.../doris/statistics/ColumnStatisticBuilder.java | 18 +-
.../apache/doris/statistics/HMSAnalysisTask.java | 4 +-
.../apache/doris/statistics/OlapAnalysisTask.java | 5 +-
.../PartitionColumnStatisticBuilder.java | 2 +-
.../apache/doris/statistics/StatisticsCache.java | 2 +-
.../doris/statistics/StatisticsRepository.java | 8 +-
.../org/apache/doris/statistics/StatsType.java | 1 +
.../doris/statistics/util/StatisticsUtil.java | 33 ++++
.../catalog/InternalSchemaInitializerTest.java | 27 ++-
.../doris/nereids/util/HyperGraphBuilder.java | 2 +-
.../apache/doris/statistics/AnalysisJobTest.java | 2 +-
.../doris/statistics/BaseAnalysisTaskTest.java | 8 +-
.../org/apache/doris/statistics/CacheTest.java | 4 +-
.../apache/doris/statistics/ColStatsDataTest.java | 139 ++++++++--------
.../doris/statistics/HMSAnalysisTaskTest.java | 24 +--
.../doris/statistics/OlapAnalysisTaskTest.java | 59 ++++---
.../org/apache/doris/statistics/StatsMockUtil.java | 1 +
.../doris/statistics/util/StatisticsUtilTest.java | 44 +++++
.../hive/test_hive_statistic.groovy | 2 +-
regression-test/suites/statistics/test_ddl.groovy | 2 +-
.../suites/statistics/test_hot_value.groovy | 184 +++++++++++++++++++++
29 files changed, 589 insertions(+), 204 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/analysis/AddColumnClause.java
b/fe/fe-core/src/main/java/org/apache/doris/analysis/AddColumnClause.java
index 837acb5b51e..f7ae9083502 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/AddColumnClause.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/AddColumnClause.java
@@ -153,4 +153,8 @@ public class AddColumnClause extends AlterTableClause {
public String toString() {
return toSql();
}
+
+ public void setColumn(Column column) {
+ this.column = column;
+ }
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowColumnStatsStmt.java
b/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowColumnStatsStmt.java
index aa840dc452b..aec15432fbd 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowColumnStatsStmt.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowColumnStatsStmt.java
@@ -72,6 +72,7 @@ public class ShowColumnStatsStmt extends ShowStmt implements
NotFallbackInParser
.add("update_rows")
.add("last_analyze_row_count")
.add("last_analyze_version")
+ .add("hot_values")
.build();
private static final ImmutableList<String> PARTITION_COLUMN_TITLE_NAMES =
@@ -188,6 +189,7 @@ public class ShowColumnStatsStmt extends ShowStmt
implements NotFallbackInParser
row.add(String.valueOf(colStatsMeta == null ? "N/A" :
colStatsMeta.updatedRows));
row.add(String.valueOf(colStatsMeta == null ? "N/A" :
colStatsMeta.rowCount));
row.add(String.valueOf(colStatsMeta == null ? "N/A" :
colStatsMeta.tableVersion));
+ row.add(String.valueOf(colStatsMeta == null ? "N/A" :
p.second.getStringHotValues()));
result.add(row);
});
return new ShowResultSet(getMetaData(), result);
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchema.java
b/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchema.java
index 9940bedbe49..81d7e7bfc44 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchema.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchema.java
@@ -67,6 +67,8 @@ public class InternalSchema {
new ColumnDef("data_size_in_bytes",
TypeDef.create(PrimitiveType.BIGINT), ColumnNullableType.NULLABLE));
TABLE_STATS_SCHEMA.add(
new ColumnDef("update_time",
TypeDef.create(PrimitiveType.DATETIME), ColumnNullableType.NOT_NULLABLE));
+ TABLE_STATS_SCHEMA.add(
+ new ColumnDef("hot_value",
TypeDef.create(PrimitiveType.STRING), ColumnNullableType.NULLABLE));
// partition statistics table
PARTITION_STATS_SCHEMA = new ArrayList<>();
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchemaInitializer.java
b/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchemaInitializer.java
index 933272a5dbc..1444a4febaa 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchemaInitializer.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchemaInitializer.java
@@ -17,6 +17,7 @@
package org.apache.doris.catalog;
+import org.apache.doris.analysis.AddColumnClause;
import org.apache.doris.analysis.AlterClause;
import org.apache.doris.analysis.AlterTableStmt;
import org.apache.doris.analysis.ColumnDef;
@@ -62,12 +63,14 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
+import java.util.Set;
import java.util.stream.Collectors;
public class InternalSchemaInitializer extends Thread {
private static final Logger LOG =
LogManager.getLogger(InternalSchemaInitializer.class);
+ private static boolean StatsTableSchemaValid = false;
public InternalSchemaInitializer() {
super("InternalSchemaInitializer");
@@ -135,6 +138,7 @@ public class InternalSchemaInitializer extends Thread {
// IGNORE
}
}
+ StatsTableSchemaValid = true;
}
public Table findStatsTable() {
@@ -160,9 +164,25 @@ public class InternalSchemaInitializer extends Thread {
}
}
- public List<AlterClause> getModifyColumnClauses(Table table) {
+ public List<AlterClause> getModifyColumnClauses(Table table) throws
AnalysisException {
List<AlterClause> clauses = Lists.newArrayList();
- for (Column col : table.fullSchema) {
+ Set<String> currentColumnNames = table.getBaseSchema().stream()
+ .map(Column::getName)
+ .map(String::toLowerCase)
+ .collect(Collectors.toSet());
+ if
(!currentColumnNames.containsAll(InternalSchema.TABLE_STATS_SCHEMA.stream()
+ .map(ColumnDef::getName)
+ .map(String::toLowerCase)
+ .collect(Collectors.toList()))) {
+ for (ColumnDef expected : InternalSchema.TABLE_STATS_SCHEMA) {
+ if
(!currentColumnNames.contains(expected.getName().toLowerCase())) {
+ AddColumnClause addColumnClause = new
AddColumnClause(expected, null, null, null);
+ addColumnClause.setColumn(expected.toColumn());
+ clauses.add(addColumnClause);
+ }
+ }
+ }
+ for (Column col : table.getFullSchema()) {
if (col.isKey() && col.getType().isVarchar()
&& col.getType().getLength() <
StatisticConstants.MAX_NAME_LEN) {
TypeDef typeDef = new TypeDef(
@@ -422,4 +442,8 @@ public class InternalSchemaInitializer extends Thread {
}
return true;
}
+
+ public static boolean isStatsTableSchemaValid() {
+ return StatsTableSchemaValid;
+ }
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/AlterColumnStatsCommand.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/AlterColumnStatsCommand.java
index accf55f13c6..ebfe1f73f37 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/AlterColumnStatsCommand.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/AlterColumnStatsCommand.java
@@ -74,6 +74,7 @@ public class AlterColumnStatsCommand extends AlterCommand {
.add(ColumnStatistic.MIN_VALUE)
.add(ColumnStatistic.MAX_VALUE)
.add(StatsType.DATA_SIZE)
+ .add(StatsType.HOT_VALUES)
.build();
private final TableNameInfo tableNameInfo;
private final String indexName;
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
index 256309c4792..1613230bb9a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
@@ -66,62 +66,88 @@ public abstract class BaseAnalysisTask {
protected static final String FULL_ANALYZE_TEMPLATE =
"SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS `id`, "
- + " ${catalogId} AS `catalog_id`, "
- + " ${dbId} AS `db_id`, "
- + " ${tblId} AS `tbl_id`, "
- + " ${idxId} AS `idx_id`, "
- + " '${colId}' AS `col_id`, "
- + " NULL AS `part_id`, "
- + " COUNT(1) AS `row_count`, "
- + " NDV(`${colName}`) AS `ndv`, "
- + " COUNT(1) - COUNT(`${colName}`) AS `null_count`, "
- + " SUBSTRING(CAST(MIN(`${colName}`) AS STRING), 1, 1024)
AS `min`, "
- + " SUBSTRING(CAST(MAX(`${colName}`) AS STRING), 1, 1024)
AS `max`, "
- + " ${dataSizeFunction} AS `data_size`, "
- + " NOW() AS `update_time` "
- + " FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index}";
-
- protected static final String LINEAR_ANALYZE_TEMPLATE = "SELECT "
- + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS `id`, "
- + "${catalogId} AS `catalog_id`, "
- + "${dbId} AS `db_id`, "
- + "${tblId} AS `tbl_id`, "
- + "${idxId} AS `idx_id`, "
- + "'${colId}' AS `col_id`, "
- + "NULL AS `part_id`, "
- + "${rowCount} AS `row_count`, "
- + "${ndvFunction} as `ndv`, "
- + "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) *
${scaleFactor}) AS `null_count`, "
- + "SUBSTRING(CAST(${min} AS STRING), 1, 1024) AS `min`, "
- + "SUBSTRING(CAST(${max} AS STRING), 1, 1024) AS `max`, "
- + "${dataSizeFunction} * ${scaleFactor} AS `data_size`, "
- + "NOW() FROM ( "
- + "SELECT * FROM `${catalogName}`.`${dbName}`.`${tblName}`
${index} ${sampleHints} ${limit}) "
- + "as t ${preAggHint}";
-
- protected static final String DUJ1_ANALYZE_TEMPLATE = "SELECT "
- + "CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}') AS `id`, "
- + "${catalogId} AS `catalog_id`, "
- + "${dbId} AS `db_id`, "
- + "${tblId} AS `tbl_id`, "
- + "${idxId} AS `idx_id`, "
- + "'${colId}' AS `col_id`, "
- + "NULL AS `part_id`, "
- + "${rowCount} AS `row_count`, "
- + "${ndvFunction} as `ndv`, "
- + "IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0)
* ${scaleFactor} as `null_count`, "
- + "SUBSTRING(CAST(${min} AS STRING), 1, 1024) AS `min`, "
- + "SUBSTRING(CAST(${max} AS STRING), 1, 1024) AS `max`, "
- + "${dataSizeFunction} * ${scaleFactor} AS `data_size`, "
- + "NOW() "
- + "FROM ( "
- + " SELECT t0.`colValue` as `column_key`, COUNT(1) as `count`,
SUM(`len`) as `column_length` "
- + " FROM "
- + " (SELECT ${subStringColName} AS `colValue`,
LENGTH(`${colName}`) as `len` "
- + " FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index}
${sampleHints} ${limit}) as `t0`"
- + " ${preAggHint}"
- + " GROUP BY `t0`.`colValue` "
- + ") as `t1` ";
+ + "${catalogId} AS `catalog_id`, "
+ + "${dbId} AS `db_id`, "
+ + "${tblId} AS `tbl_id`, "
+ + "${idxId} AS `idx_id`, "
+ + "'${colId}' AS `col_id`, "
+ + "NULL AS `part_id`, "
+ + "COUNT(1) AS `row_count`, "
+ + "NDV(`${colName}`) AS `ndv`, "
+ + "COUNT(1) - COUNT(`${colName}`) AS `null_count`, "
+ + "SUBSTRING(CAST(MIN(`${colName}`) AS STRING), 1, 1024) AS
`min`, "
+ + "SUBSTRING(CAST(MAX(`${colName}`) AS STRING), 1, 1024) AS
`max`, "
+ + "${dataSizeFunction} AS `data_size`, "
+ + "NOW() AS `update_time`, "
+ + "null as `hot_value` "
+ + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index}";
+
+ protected static final String LINEAR_ANALYZE_TEMPLATE = "WITH cte1 AS ("
+ + "SELECT `${colName}` "
+ + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index}
${sampleHints} ${limit} ${preAggHint}), "
+ + "cte2 AS ("
+ + "SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS
`id`, "
+ + "${catalogId} AS `catalog_id`, "
+ + "${dbId} AS `db_id`, "
+ + "${tblId} AS `tbl_id`, "
+ + "${idxId} AS `idx_id`, "
+ + "'${colId}' AS `col_id`, "
+ + "NULL AS `part_id`, "
+ + "${rowCount} AS `row_count`, "
+ + "${ndvFunction} as `ndv`, "
+ + "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END)
* ${scaleFactor}) AS `null_count`, "
+ + "SUBSTRING(CAST(${min} AS STRING), 1, 1024) AS `min`, "
+ + "SUBSTRING(CAST(${max} AS STRING), 1, 1024) AS `max`, "
+ + "${dataSizeFunction} * ${scaleFactor} AS `data_size`, "
+ + "NOW() FROM cte1), "
+ + "cte3 AS ("
+ + "SELECT GROUP_CONCAT(CONCAT("
+ + "REPLACE(REPLACE(t.`column_key`, \":\", \"\\\\:\"),
\";\", \"\\\\;\"), "
+ + "\" :\", ROUND(t.`count` * 100.0 / ${rowCount2}, 2)), \"
;\") "
+ + "as `hot_value` "
+ + "FROM ("
+ + "SELECT ${subStringColName} as `hash_value`, "
+ + "MAX(`${colName}`) as `column_key`, "
+ + "COUNT(1) AS `count` "
+ + "FROM cte1 WHERE `${colName}` IS NOT NULL "
+ + "GROUP BY `hash_value` ORDER BY `count` DESC LIMIT 3) t)
"
+ + "SELECT * FROM cte2 CROSS JOIN cte3";
+
+ protected static final String DUJ1_ANALYZE_TEMPLATE = "WITH cte1 AS ("
+ + "SELECT MAX(t0.`col_value`) as `col_value`, COUNT(1) as `count`,
SUM(`len`) as `column_length` "
+ + "FROM "
+ + "(SELECT "
+ + "${subStringColName} AS `hash_value`, "
+ + "`${colName}` AS `col_value`, "
+ + "LENGTH(`${colName}`) as `len` "
+ + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index}
${sampleHints} ${limit}) as `t0` "
+ + "${preAggHint} GROUP BY `t0`.`hash_value`), "
+ + "cte2 AS ( "
+ + "SELECT CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}')
AS `id`, "
+ + "${catalogId} AS `catalog_id`, "
+ + "${dbId} AS `db_id`, "
+ + "${tblId} AS `tbl_id`, "
+ + "${idxId} AS `idx_id`, "
+ + "'${colId}' AS `col_id`, "
+ + "NULL AS `part_id`, "
+ + "${rowCount} AS `row_count`, "
+ + "${ndvFunction} as `ndv`, "
+ + "IFNULL(SUM(IF(`t1`.`col_value` IS NULL, `t1`.`count`, 0)),
0) * ${scaleFactor} as `null_count`, "
+ + "SUBSTRING(CAST(${min} AS STRING), 1, 1024) AS `min`, "
+ + "SUBSTRING(CAST(${max} AS STRING), 1, 1024) AS `max`, "
+ + "${dataSizeFunction} * ${scaleFactor} AS `data_size`, "
+ + "NOW() "
+ + "FROM cte1 t1), "
+ + "cte3 AS ("
+ + "SELECT GROUP_CONCAT(CONCAT("
+ + "REPLACE(REPLACE(t2.`col_value`, \":\", \"\\\\:\"),
\";\", \"\\\\;\"), "
+ + "\" :\", ROUND(t2.`count` * 100.0 / ${rowCount2}, 2)),
\" ;\") "
+ + "as `hot_value` "
+ + "FROM ("
+ + "SELECT `col_value`, `count` "
+ + "FROM cte1 "
+ + "WHERE `col_value` IS NOT NULL ORDER BY `count` DESC
LIMIT 3) t2) "
+ + "SELECT * FROM cte2 CROSS JOIN cte3";
protected static final String ANALYZE_PARTITION_COLUMN_TEMPLATE = " SELECT
"
+ "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS `id`, "
@@ -294,8 +320,8 @@ public abstract class BaseAnalysisTask {
protected String getNdvFunction(String totalRows) {
String n = "SUM(`t1`.`count`)"; // sample rows
- String f1 = "SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not
null, 1, 0))";
- String d = "COUNT(`t1`.`column_key`)"; // sample ndv
+ String f1 = "SUM(IF(`t1`.`count` = 1 and `t1`.`col_value` is not null,
1, 0))";
+ String d = "COUNT(`t1`.`col_value`)"; // sample ndv
// DUJ1 estimator: n*d / (n - f1 + f1*n/N)
// f1 is the count of element that appears only once in the sample.
//
(https://github.com/postgres/postgres/blob/master/src/backend/commands/analyze.c)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
index 329d39eb5ee..232ae506d71 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
@@ -66,6 +66,8 @@ public class ColStatsData {
public final long dataSizeInBytes;
@SerializedName("updateTime")
public final String updateTime;
+ @SerializedName("hotValues")
+ public final String hotValues;
@VisibleForTesting
public ColStatsData() {
@@ -77,6 +79,7 @@ public class ColStatsData {
maxLit = null;
dataSizeInBytes = 0;
updateTime = null;
+ hotValues = null;
}
public ColStatsData(StatsId statsId) {
@@ -88,6 +91,7 @@ public class ColStatsData {
maxLit = null;
dataSizeInBytes = 0;
updateTime =
LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"));
+ hotValues = null;
}
public ColStatsData(ResultRow row) {
@@ -99,10 +103,11 @@ public class ColStatsData {
this.maxLit = row.get(11);
this.dataSizeInBytes = (long)
Double.parseDouble(row.getWithDefault(12, "0"));
this.updateTime = row.get(13);
+ this.hotValues = row.get(14);
}
public ColStatsData(String id, long catalogId, long dbId, long tblId, long
idxId, String colId, String partId,
- ColumnStatistic columnStatistic) {
+ String hotValues, ColumnStatistic columnStatistic) {
this.statsId = new StatsId(id, catalogId, dbId, tblId, idxId, colId,
partId);
this.count = Math.round(columnStatistic.count);
this.ndv = Math.round(columnStatistic.ndv);
@@ -111,6 +116,7 @@ public class ColStatsData {
this.maxLit = columnStatistic.maxExpr == null ? null :
columnStatistic.maxExpr.getStringValue();
this.dataSizeInBytes = Math.round(columnStatistic.dataSize);
this.updateTime = columnStatistic.updatedTime;
+ this.hotValues = hotValues;
}
public String toSQL(boolean roundByParentheses) {
@@ -127,6 +133,7 @@ public class ColStatsData {
sj.add(maxLit == null ? "NULL" : "'" +
StatisticsUtil.escapeSQL(maxLit) + "'");
sj.add(String.valueOf(dataSizeInBytes));
sj.add(StatisticsUtil.quote(updateTime));
+ sj.add(hotValues == null ? "NULL" : "'" +
StatisticsUtil.escapeSQL(hotValues) + "'");
return sj.toString();
}
@@ -174,6 +181,7 @@ public class ColStatsData {
columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY);
}
columnStatisticBuilder.setUpdatedTime(updateTime);
+
columnStatisticBuilder.setHotValues(StatisticsUtil.getHotValues(hotValues,
col.getType()));
return columnStatisticBuilder.build();
} catch (Exception e) {
LOG.warn("Failed to convert column statistics.", e);
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
index e0e99fcdfad..ff4c127b4ef 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
@@ -34,6 +34,7 @@ import org.apache.logging.log4j.Logger;
import org.json.JSONObject;
import java.util.List;
+import java.util.Map;
import java.util.Set;
public class ColumnStatistic {
@@ -94,10 +95,13 @@ public class ColumnStatistic {
@SerializedName("updatedTime")
public final String updatedTime;
+ @SerializedName("hotValues")
+ public final Map<LiteralExpr, Float> hotValues;
+
public ColumnStatistic(double count, double ndv, ColumnStatistic original,
double avgSizeByte,
double numNulls, double dataSize, double minValue, double maxValue,
LiteralExpr minExpr, LiteralExpr maxExpr, boolean isUnKnown,
- String updatedTime) {
+ String updatedTime, Map<LiteralExpr, Float> hotValues) {
this.count = count;
this.ndv = ndv;
this.original = original;
@@ -110,6 +114,7 @@ public class ColumnStatistic {
this.maxExpr = maxExpr;
this.isUnKnown = isUnKnown;
this.updatedTime = updatedTime;
+ this.hotValues = hotValues;
}
public static ColumnStatistic fromResultRow(List<ResultRow> resultRows) {
@@ -187,6 +192,7 @@ public class ColumnStatistic {
columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY);
}
columnStatisticBuilder.setUpdatedTime(row.get(13));
+
columnStatisticBuilder.setHotValues(StatisticsUtil.getHotValues(row.get(14),
col.getType()));
return columnStatisticBuilder.build();
}
@@ -298,7 +304,7 @@ public class ColumnStatistic {
// Histogram is got by other place
public static ColumnStatistic fromJson(String statJson) throws
AnalysisException {
JSONObject stat = new JSONObject(statJson);
- Double minValue;
+ double minValue;
switch (stat.getString("MinValueType")) {
case "Infinite":
minValue = Double.NEGATIVE_INFINITY;
@@ -312,7 +318,7 @@ public class ColumnStatistic {
default:
throw new RuntimeException(String.format("Min value does not
get anytype"));
}
- Double maxValue;
+ double maxValue;
switch (stat.getString("MaxValueType")) {
case "Infinite":
maxValue = Double.POSITIVE_INFINITY;
@@ -330,7 +336,6 @@ public class ColumnStatistic {
try {
lastUpdatedTime = stat.getString("LastUpdatedTime");
} catch (Exception e) {
- LOG.warn("lastUpdateTimeIsEmpty", e.getMessage());
if (LOG.isDebugEnabled()) {
LOG.debug(e);
}
@@ -349,7 +354,8 @@ public class ColumnStatistic {
LiteralExpr.create(stat.getString("MaxExprValue"),
GsonUtils.GSON.fromJson(stat.getString("MaxExprType"),
Type.class)),
stat.getBoolean("IsUnKnown"),
- lastUpdatedTime
+ lastUpdatedTime,
+ null
);
}
@@ -395,4 +401,21 @@ public class ColumnStatistic {
.build();
}
}
+
+ public String getStringHotValues() {
+ if (hotValues == null || hotValues.isEmpty()) {
+ return null;
+ }
+ StringBuilder sb = new StringBuilder();
+ hotValues.forEach((k, v) -> {
+ sb.append(k.toString());
+ sb.append(":");
+ sb.append(v);
+ sb.append(";");
+ });
+ if (sb.length() > 0) {
+ sb.setLength(sb.length() - 1);
+ }
+ return sb.toString();
+ }
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
index 4e190ce388e..0b98a3e61f9 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
@@ -21,6 +21,8 @@ import org.apache.doris.analysis.LiteralExpr;
import org.apache.doris.nereids.trees.expressions.SlotReference;
import org.apache.doris.nereids.types.coercion.CharacterType;
+import java.util.Map;
+
public class ColumnStatisticBuilder {
private double count;
private double ndv;
@@ -37,6 +39,7 @@ public class ColumnStatisticBuilder {
private ColumnStatistic original;
private String updatedTime;
+ private Map<LiteralExpr, Float> hotValues;
public ColumnStatisticBuilder() {
}
@@ -54,6 +57,7 @@ public class ColumnStatisticBuilder {
this.isUnknown = columnStatistic.isUnKnown;
this.original = columnStatistic.original;
this.updatedTime = columnStatistic.updatedTime;
+ this.hotValues = columnStatistic.hotValues;
}
// ATTENTION: DON'T USE FOLLOWING TWO DURING STATS DERIVING EXCEPT FOR
INITIALIZATION
@@ -74,6 +78,7 @@ public class ColumnStatisticBuilder {
this.isUnknown = columnStatistic.isUnKnown;
this.original = columnStatistic.original;
this.updatedTime = columnStatistic.updatedTime;
+ this.hotValues = columnStatistic.hotValues;
}
public ColumnStatisticBuilder setNdv(double ndv) {
@@ -126,6 +131,11 @@ public class ColumnStatisticBuilder {
return this;
}
+ public ColumnStatisticBuilder setHotValues(Map<LiteralExpr, Float>
hotValues) {
+ this.hotValues = hotValues;
+ return this;
+ }
+
public double getCount() {
return count;
}
@@ -175,16 +185,20 @@ public class ColumnStatisticBuilder {
return this;
}
+ public Map<LiteralExpr, Float> getHotValues() {
+ return hotValues;
+ }
+
public ColumnStatistic build() {
dataSize = dataSize > 0 ? dataSize : Math.max((count - numNulls + 1) *
avgSizeByte, 0);
if (original == null && !isUnknown) {
original = new ColumnStatistic(count, ndv, null, avgSizeByte,
numNulls,
dataSize, minValue, maxValue, minExpr, maxExpr,
- isUnknown, updatedTime);
+ isUnknown, updatedTime, hotValues);
}
ColumnStatistic colStats = new ColumnStatistic(count, ndv, original,
avgSizeByte, numNulls,
dataSize, minValue, maxValue, minExpr, maxExpr,
- isUnknown, updatedTime);
+ isUnknown, updatedTime, hotValues);
return colStats;
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
index 06009b38177..2a80a9995cb 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
@@ -248,13 +248,15 @@ public class HMSAnalysisTask extends ExternalAnalysisTask
{
bucketFlag = true;
sb.append(LINEAR_ANALYZE_TEMPLATE);
params.put("ndvFunction", "ROUND(NDV(`${colName}`) *
${scaleFactor})");
- params.put("rowCount", "ROUND(count(1) * ${scaleFactor})");
+ params.put("rowCount", "ROUND(COUNT(1) * ${scaleFactor})");
+ params.put("rowCount2", "(SELECT COUNT(1) FROM cte1 WHERE
`${colName}` IS NOT NULL)");
} else {
sb.append(DUJ1_ANALYZE_TEMPLATE);
params.put("subStringColName", getStringTypeColName(col));
params.put("dataSizeFunction", getDataSizeFunction(col, true));
params.put("ndvFunction", getNdvFunction("ROUND(SUM(t1.count) *
${scaleFactor})"));
params.put("rowCount", "ROUND(SUM(t1.count) * ${scaleFactor})");
+ params.put("rowCount2", "(SELECT SUM(`count`) FROM cte1 WHERE
`col_value` IS NOT NULL)");
}
LOG.info("Sample for column [{}]. Scale factor [{}], "
+ "limited [{}], is distribute column [{}]",
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
index 43990f42d0b..bcb61f38e71 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
@@ -256,6 +256,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
params.put("rowCount", String.valueOf(tableRowCount));
params.put("type", col.getType().toString());
params.put("limit", "");
+ params.put("subStringColName", getStringTypeColName(col));
// For agg table and mor unique table, set PREAGGOPEN preAggHint.
if (((OlapTable) tbl).getKeysType().equals(KeysType.AGG_KEYS)
@@ -269,6 +270,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
params.put("scaleFactor", "1");
params.put("sampleHints", "");
params.put("ndvFunction", "ROUND(NDV(`${colName}`) *
${scaleFactor})");
+ params.put("rowCount2", "(SELECT COUNT(1) FROM cte1 WHERE
`${colName}` IS NOT NULL)");
scanFullTable = true;
return;
}
@@ -299,6 +301,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
}
// Set algorithm related params.
if (useLinearAnalyzeTemplate()) {
+ params.put("rowCount2", "(SELECT COUNT(1) FROM cte1 WHERE
`${colName}` IS NOT NULL)");
// For single unique key, use count as ndv.
if (isSingleUniqueKey()) {
params.put("ndvFunction", String.valueOf(tableRowCount));
@@ -308,7 +311,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
} else {
params.put("ndvFunction",
getNdvFunction(String.valueOf(tableRowCount)));
params.put("dataSizeFunction", getDataSizeFunction(col, true));
- params.put("subStringColName", getStringTypeColName(col));
+ params.put("rowCount2", "(SELECT SUM(`count`) FROM cte1 WHERE
`col_value` IS NOT NULL)");
}
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/PartitionColumnStatisticBuilder.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/PartitionColumnStatisticBuilder.java
index b1dc7cdd001..99fb28162e4 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/statistics/PartitionColumnStatisticBuilder.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/statistics/PartitionColumnStatisticBuilder.java
@@ -174,6 +174,6 @@ public class PartitionColumnStatisticBuilder {
public ColumnStatistic toColumnStatistics() {
return new ColumnStatistic(count, ndv.estimateCardinality(), null,
- avgSizeByte, numNulls, dataSize, minValue, maxValue, minExpr,
maxExpr, isUnknown, updatedTime);
+ avgSizeByte, numNulls, dataSize, minValue, maxValue, minExpr,
maxExpr, isUnknown, updatedTime, null);
}
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java
index 10a96ef13b0..59ae6fbf324 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java
@@ -241,7 +241,7 @@ public class StatisticsCache {
long retryTimes = 0;
while (!StatisticsUtil.statsTblAvailable()) {
try {
- Thread.sleep(100L);
+ Thread.sleep(1000L);
} catch (InterruptedException e) {
// IGNORE
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java
index 5ebe51f8091..0780e88714f 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java
@@ -85,7 +85,8 @@ public class StatisticsRepository {
private static final String INSERT_INTO_COLUMN_STATISTICS_FOR_ALTER =
StatisticConstants.INSERT_INTO_COLUMN_STATS_PREFIX
+ "('${id}', ${catalogId}, ${dbId}, ${tblId}, '${idxId}',"
- + "'${colId}', ${partId}, ${count}, ${ndv}, ${nullCount},
${min}, ${max}, ${dataSize}, NOW())";
+ + "'${colId}', ${partId}, ${count}, ${ndv}, ${nullCount},
${min}, ${max}, ${dataSize}, NOW(), "
+ + "${hotValues})";
private static final String DELETE_TABLE_STATISTICS_BY_COLUMN_TEMPLATE =
"DELETE FROM "
+ FeConstants.INTERNAL_DB_NAME + "." +
StatisticConstants.TABLE_STATISTIC_TBL_NAME
@@ -323,6 +324,7 @@ public class StatisticsRepository {
String min = alterColumnStatsCommand.getValue(StatsType.MIN_VALUE);
String max = alterColumnStatsCommand.getValue(StatsType.MAX_VALUE);
String dataSize =
alterColumnStatsCommand.getValue(StatsType.DATA_SIZE);
+ String hotValues =
alterColumnStatsCommand.getValue(StatsType.HOT_VALUES);
long indexId = alterColumnStatsCommand.getIndexId();
if (rowCount == null) {
throw new RuntimeException("Row count is null.");
@@ -356,6 +358,7 @@ public class StatisticsRepository {
}
}
}
+ builder.setHotValues(StatisticsUtil.getHotValues(hotValues,
column.getType()));
ColumnStatistic columnStatistic = builder.build();
Map<String, String> params = new HashMap<>();
@@ -371,6 +374,7 @@ public class StatisticsRepository {
params.put("min", min == null ? "NULL" : "'" +
StatisticsUtil.escapeSQL(min) + "'");
params.put("max", max == null ? "NULL" : "'" +
StatisticsUtil.escapeSQL(max) + "'");
params.put("dataSize", String.valueOf(columnStatistic.dataSize));
+ params.put("hotValues", "'" + StatisticsUtil.escapeSQL(hotValues) +
"'");
if (partitionIds.isEmpty()) {
// update table granularity statistics
@@ -378,7 +382,7 @@ public class StatisticsRepository {
StatisticsUtil.execUpdate(INSERT_INTO_COLUMN_STATISTICS_FOR_ALTER,
params);
ColStatsData data = new
ColStatsData(constructId(objects.table.getId(), indexId, colName),
objects.catalog.getId(), objects.db.getId(),
objects.table.getId(), indexId, colName,
- null, columnStatistic);
+ null, hotValues, columnStatistic);
Env.getCurrentEnv().getStatisticsCache().syncColStats(data);
AnalysisInfo mockedJobInfo = new AnalysisInfoBuilder()
.setTblUpdateTime(objects.table.getUpdateTime())
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsType.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsType.java
index d59f85285a6..a14e808afba 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsType.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsType.java
@@ -27,6 +27,7 @@ public enum StatsType {
MIN_VALUE("min_value"),
MAX_VALUE("max_value"),
HISTOGRAM("histogram"),
+ HOT_VALUES("hot_values"),
// only for test
UNKNOWN("unknown");
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
index 188a73ab5c8..b6565f1dbaf 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
@@ -37,6 +37,7 @@ import org.apache.doris.catalog.ArrayType;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.DatabaseIf;
import org.apache.doris.catalog.Env;
+import org.apache.doris.catalog.InternalSchemaInitializer;
import org.apache.doris.catalog.KeysType;
import org.apache.doris.catalog.MapType;
import org.apache.doris.catalog.OlapTable;
@@ -87,6 +88,7 @@ import org.apache.doris.statistics.TableStatsMeta;
import org.apache.doris.system.Frontend;
import com.google.common.base.Preconditions;
+import com.google.common.collect.Maps;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.StringSubstitutor;
@@ -111,6 +113,7 @@ import java.util.Base64;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
+import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
@@ -502,6 +505,9 @@ public class StatisticsUtil {
}
public static boolean statsTblAvailable() {
+ if (!InternalSchemaInitializer.isStatsTableSchemaValid()) {
+ return false;
+ }
String dbName = FeConstants.INTERNAL_DB_NAME;
List<OlapTable> statsTbls = new ArrayList<>();
try {
@@ -1263,4 +1269,31 @@ public class StatisticsUtil {
public static boolean canCollect() {
return enableAutoAnalyze() &&
inAnalyzeTime(LocalTime.now(TimeUtils.getTimeZone().toZoneId()));
}
+
+ /**
+ * Get the map of column literal value and its row count percentage in the
table.
+ * The stringValues is like:
+ * value1 :percent1 ;value2 :percent2 ;value3 :percent3
+ * @return Map of LiteralExpr -> percentage.
+ */
+ public static LinkedHashMap<LiteralExpr, Float> getHotValues(String
stringValues, Type type) {
+ if (stringValues == null) {
+ return null;
+ }
+ try {
+ LinkedHashMap<LiteralExpr, Float> ret = Maps.newLinkedHashMap();
+ for (String oneRow : stringValues.split(" ;")) {
+ String[] oneRowSplit = oneRow.split(" :");
+ float value = Float.parseFloat(oneRowSplit[1]);
+ String stringLiteral = oneRowSplit[0].replaceAll("\\\\:",
":").replaceAll("\\\\;", ";");
+ ret.put(readableValue(type, stringLiteral), value);
+ }
+ if (!ret.isEmpty()) {
+ return ret;
+ }
+ } catch (Exception e) {
+ LOG.info("Failed to parse hot values [{}]. {}", stringValues,
e.getMessage());
+ }
+ return null;
+ }
}
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/catalog/InternalSchemaInitializerTest.java
b/fe/fe-core/src/test/java/org/apache/doris/catalog/InternalSchemaInitializerTest.java
index c71c1e1c1b6..a05039307b5 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/catalog/InternalSchemaInitializerTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/catalog/InternalSchemaInitializerTest.java
@@ -22,33 +22,25 @@ import org.apache.doris.analysis.ColumnDef;
import org.apache.doris.analysis.ColumnPosition;
import org.apache.doris.analysis.CreateTableStmt;
import org.apache.doris.analysis.ModifyColumnClause;
+import org.apache.doris.common.AnalysisException;
import org.apache.doris.common.UserException;
-import org.apache.doris.datasource.hive.HMSExternalTable;
import org.apache.doris.plugin.audit.AuditLoader;
import org.apache.doris.statistics.StatisticConstants;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
-import mockit.Mock;
-import mockit.MockUp;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
+import org.mockito.Mockito;
import java.lang.reflect.Method;
import java.util.List;
class InternalSchemaInitializerTest {
@Test
- public void testGetModifyColumn() {
- new MockUp<HMSExternalTable>() {
- @Mock
- public HMSExternalTable.DLAType getDlaType() {
- return HMSExternalTable.DLAType.HUDI;
- }
- };
-
+ public void testGetModifyColumn() throws AnalysisException {
InternalSchemaInitializer initializer = new
InternalSchemaInitializer();
- OlapTable table = new OlapTable();
+ OlapTable table = Mockito.mock(OlapTable.class);
Column key1 = new Column("key1", ScalarType.createVarcharType(100),
true, null, false, null, "");
Column key2 = new Column("key2", ScalarType.createVarcharType(100),
true, null, true, null, "");
Column key3 = new Column("key3", ScalarType.createVarcharType(1024),
true, null, null, "");
@@ -56,6 +48,7 @@ class InternalSchemaInitializerTest {
Column key5 = new Column("key5", ScalarType.INT, true, null, null, "");
Column value1 = new Column("value1", ScalarType.INT, false, null,
null, "");
Column value2 = new Column("value2",
ScalarType.createVarcharType(100), false, null, null, "");
+ Column value3 = new Column("hot_value",
ScalarType.createVarcharType(100), false, null, null, "");
List<Column> schema = Lists.newArrayList();
schema.add(key1);
schema.add(key2);
@@ -64,15 +57,17 @@ class InternalSchemaInitializerTest {
schema.add(key5);
schema.add(value1);
schema.add(value2);
- table.fullSchema = schema;
+ schema.add(value3);
+ Mockito.when(table.getFullSchema()).thenReturn(schema);
+ Mockito.when(table.getBaseSchema()).thenReturn(schema);
List<AlterClause> modifyColumnClauses =
initializer.getModifyColumnClauses(table);
- Assertions.assertEquals(2, modifyColumnClauses.size());
- ModifyColumnClause clause1 = (ModifyColumnClause)
modifyColumnClauses.get(0);
+ Assertions.assertEquals(16, modifyColumnClauses.size());
+ ModifyColumnClause clause1 = (ModifyColumnClause)
modifyColumnClauses.get(14);
Assertions.assertEquals("key1", clause1.getColumn().getName());
Assertions.assertEquals(StatisticConstants.MAX_NAME_LEN,
clause1.getColumn().getType().getLength());
Assertions.assertFalse(clause1.getColumn().isAllowNull());
- ModifyColumnClause clause2 = (ModifyColumnClause)
modifyColumnClauses.get(1);
+ ModifyColumnClause clause2 = (ModifyColumnClause)
modifyColumnClauses.get(15);
Assertions.assertEquals("key2", clause2.getColumn().getName());
Assertions.assertEquals(StatisticConstants.MAX_NAME_LEN,
clause2.getColumn().getType().getLength());
Assertions.assertTrue(clause2.getColumn().isAllowNull());
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/nereids/util/HyperGraphBuilder.java
b/fe/fe-core/src/test/java/org/apache/doris/nereids/util/HyperGraphBuilder.java
index 815cff0f2d9..d1d0e4ff70b 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/nereids/util/HyperGraphBuilder.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/nereids/util/HyperGraphBuilder.java
@@ -370,7 +370,7 @@ public class HyperGraphBuilder {
slotIdToColumnStats.put(slot,
new ColumnStatistic(count, count, null, 1, 0, 0, 0,
count, null, null, true,
- new Date().toString()));
+ new Date().toString(), null));
}
return new Statistics(count, slotIdToColumnStats);
}
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisJobTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisJobTest.java
index 5b7b430a2cd..d3f739a18fb 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisJobTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisJobTest.java
@@ -238,7 +238,7 @@ public class AnalysisJobTest {
job.flushBuffer();
Assertions.assertEquals(0, job.queryFinished.size());
Assertions.assertEquals(0, job.buf.size());
- Assertions.assertEquals("d8c3a1696769a1862331eec30d2c1ecb",
job.stmtExecutor.getContext().getSqlHash());
+ Assertions.assertEquals("a48a05f7d2f079b74481e2909a7fa79b",
job.stmtExecutor.getContext().getSqlHash());
}
}
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
index 11a271f9b40..343407dcaa6 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
@@ -61,7 +61,7 @@ public class BaseAnalysisTaskTest {
Assertions.assertEquals("NULL", maxFunction);
String ndvFunction =
olapAnalysisTask.getNdvFunction(String.valueOf(100));
- Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(`t1`.`column_key`)
/ (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not
null, 1, 0)) + SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not null, 1,
0)) * SUM(`t1`.`count`) / 100)", ndvFunction);
+ Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(`t1`.`col_value`) /
(SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1 and `t1`.`col_value` is not null,
1, 0)) + SUM(IF(`t1`.`count` = 1 and `t1`.`col_value` is not null, 1, 0)) *
SUM(`t1`.`count`) / 100)", ndvFunction);
System.out.println(ndvFunction);
}
@@ -82,6 +82,7 @@ public class BaseAnalysisTaskTest {
values.add("max");
values.add("400");
values.add("500");
+ values.add(null);
ResultRow row = new ResultRow(values);
List<ResultRow> result = Lists.newArrayList();
result.add(row);
@@ -98,7 +99,7 @@ public class BaseAnalysisTaskTest {
} catch (Exception e) {
Assertions.assertEquals(e.getMessage(),
"ColStatsData is invalid, skip analyzing. "
- +
"('id',10000,20000,30000,0,'col',null,100,1100,300,'min','max',400,'500')");
+ +
"('id',10000,20000,30000,0,'col',null,100,1100,300,'min','max',400,'500',NULL)");
return;
}
Assertions.fail();
@@ -121,6 +122,7 @@ public class BaseAnalysisTaskTest {
values.add("max");
values.add("400");
values.add("500");
+ values.add(null);
ResultRow row = new ResultRow(values);
List<ResultRow> result = Lists.newArrayList();
result.add(row);
@@ -137,7 +139,7 @@ public class BaseAnalysisTaskTest {
} catch (Exception e) {
Assertions.assertEquals(e.getMessage(),
"ColStatsData is invalid, skip analyzing. "
- +
"('id',10000,20000,30000,0,'col',null,500,0,300,'min','max',400,'500')");
+ +
"('id',10000,20000,30000,0,'col',null,500,0,300,'min','max',400,'500',NULL)");
return;
}
Assertions.fail();
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java
index c031f27a917..e188df95f63 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java
@@ -242,7 +242,7 @@ public class CacheTest extends TestWithFeService {
return Optional.of(new ColumnStatistic(1, 2,
null, 3, 4, 5, 6, 7,
null, null, false,
- new Date().toString()));
+ new Date().toString(), null));
}
};
@@ -408,7 +408,7 @@ public class CacheTest extends TestWithFeService {
return Optional.of(new ColumnStatistic(1, 2,
null, 3, 4, 5, 6, 7,
null, null, false,
- new Date().toString()));
+ new Date().toString(), null));
}
};
columnStatistic = statisticsCache.getColumnStatistics(1, 1, 1, -1,
"col", connectContext);
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java
index b193edb2049..aa6301ed639 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java
@@ -22,10 +22,11 @@ import org.apache.doris.catalog.PrimitiveType;
import org.apache.doris.statistics.util.StatisticsUtil;
import com.google.common.collect.Lists;
-import mockit.Expectations;
-import mockit.Mocked;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
+import org.mockito.ArgumentMatchers;
+import org.mockito.MockedStatic;
+import org.mockito.Mockito;
import java.util.List;
@@ -47,6 +48,7 @@ public class ColStatsDataTest {
values.add("max");
values.add("400");
values.add("500");
+ values.add(null);
ResultRow row = new ResultRow(values);
ColStatsData data = new ColStatsData(row);
Assertions.assertEquals("id", data.statsId.id);
@@ -55,7 +57,7 @@ public class ColStatsDataTest {
Assertions.assertEquals(30000, data.statsId.tblId);
Assertions.assertEquals(0, data.statsId.idxId);
Assertions.assertEquals("col", data.statsId.colId);
- Assertions.assertEquals(null, data.statsId.partId);
+ Assertions.assertNull(data.statsId.partId);
Assertions.assertEquals(100, data.count);
Assertions.assertEquals(200, data.ndv);
Assertions.assertEquals(300, data.nullCount);
@@ -63,6 +65,7 @@ public class ColStatsDataTest {
Assertions.assertEquals("max", data.maxLit);
Assertions.assertEquals(400, data.dataSizeInBytes);
Assertions.assertEquals("500", data.updateTime);
+ Assertions.assertNull(data.hotValues);
}
@Test
@@ -82,6 +85,7 @@ public class ColStatsDataTest {
values.add(null);
values.add(null);
values.add(null);
+ values.add(null);
ResultRow row = new ResultRow(values);
ColStatsData data = new ColStatsData(row);
Assertions.assertEquals("id", data.statsId.id);
@@ -90,79 +94,83 @@ public class ColStatsDataTest {
Assertions.assertEquals(30000, data.statsId.tblId);
Assertions.assertEquals(0, data.statsId.idxId);
Assertions.assertEquals("col", data.statsId.colId);
- Assertions.assertEquals(null, data.statsId.partId);
+ Assertions.assertNull(data.statsId.partId);
Assertions.assertEquals(0, data.count);
Assertions.assertEquals(0, data.ndv);
Assertions.assertEquals(0, data.nullCount);
- Assertions.assertEquals(null, data.minLit);
- Assertions.assertEquals(null, data.maxLit);
+ Assertions.assertNull(data.minLit);
+ Assertions.assertNull(data.maxLit);
Assertions.assertEquals(0, data.dataSizeInBytes);
- Assertions.assertEquals(null, data.updateTime);
+ Assertions.assertNull(data.updateTime);
+ Assertions.assertNull(data.hotValues);
}
@Test
- public void testToColumnStatisticUnknown(@Mocked StatisticsUtil
mockedClass) {
+ public void testToColumnStatisticUnknown() {
// Test column is null
- new Expectations() {
- {
- mockedClass.findColumn(anyLong, anyLong, anyLong, anyLong,
anyString);
- result = null;
- }
- };
- List<String> values = Lists.newArrayList();
- values.add("id");
- values.add("10000");
- values.add("20000");
- values.add("30000");
- values.add("0");
- values.add("col");
- values.add(null);
- values.add("100");
- values.add("200");
- values.add("300");
- values.add("min");
- values.add("max");
- values.add("400");
- values.add("500");
- ResultRow row = new ResultRow(values);
- ColStatsData data = new ColStatsData(row);
- ColumnStatistic columnStatistic = data.toColumnStatistic();
- Assertions.assertEquals(ColumnStatistic.UNKNOWN, columnStatistic);
+ try (MockedStatic<StatisticsUtil> mocked =
Mockito.mockStatic(StatisticsUtil.class)) {
+ // 设置静态方法的行为
+ mocked.when(() ->
StatisticsUtil.findColumn(ArgumentMatchers.anyLong(),
ArgumentMatchers.anyLong(),
+ ArgumentMatchers.anyLong(), ArgumentMatchers.anyLong(),
ArgumentMatchers.anyString()))
+ .thenReturn(null);
+
+ List<String> values = Lists.newArrayList();
+ values.add("id");
+ values.add("10000");
+ values.add("20000");
+ values.add("30000");
+ values.add("0");
+ values.add("col");
+ values.add(null);
+ values.add("100");
+ values.add("200");
+ values.add("300");
+ values.add("min");
+ values.add("max");
+ values.add("400");
+ values.add("500");
+ values.add(null);
+ ResultRow row = new ResultRow(values);
+ ColStatsData data = new ColStatsData(row);
+ ColumnStatistic columnStatistic = data.toColumnStatistic();
+ Assertions.assertEquals(ColumnStatistic.UNKNOWN, columnStatistic);
+ }
}
@Test
- public void testToColumnStatisticNormal(@Mocked StatisticsUtil
mockedClass) {
- new Expectations() {
- {
- mockedClass.findColumn(anyLong, anyLong, anyLong, anyLong,
anyString);
- result = new Column("colName", PrimitiveType.STRING);
- }
- };
- List<String> values = Lists.newArrayList();
- values.add("id");
- values.add("10000");
- values.add("20000");
- values.add("30000");
- values.add("0");
- values.add("col");
- values.add(null);
- values.add("100");
- values.add("200");
- values.add("300");
- values.add("null");
- values.add("null");
- values.add("400");
- values.add("500");
- ResultRow row = new ResultRow(values);
- ColStatsData data = new ColStatsData(row);
- ColumnStatistic columnStatistic = data.toColumnStatistic();
- Assertions.assertEquals(100, columnStatistic.count);
- Assertions.assertEquals(200, columnStatistic.ndv);
- Assertions.assertEquals(300, columnStatistic.numNulls);
- Assertions.assertEquals(Double.NEGATIVE_INFINITY,
columnStatistic.minValue);
- Assertions.assertEquals(Double.POSITIVE_INFINITY,
columnStatistic.maxValue);
- Assertions.assertEquals(400, columnStatistic.dataSize);
- Assertions.assertEquals("500", columnStatistic.updatedTime);
+ public void testToColumnStatisticNormal() {
+ try (MockedStatic<StatisticsUtil> mocked =
Mockito.mockStatic(StatisticsUtil.class)) {
+ // 设置静态方法的行为
+ mocked.when(() ->
StatisticsUtil.findColumn(ArgumentMatchers.anyLong(),
ArgumentMatchers.anyLong(),
+ ArgumentMatchers.anyLong(),
ArgumentMatchers.anyLong(), ArgumentMatchers.anyString()))
+ .thenReturn(new Column("colName", PrimitiveType.STRING));
+ List<String> values = Lists.newArrayList();
+ values.add("id");
+ values.add("10000");
+ values.add("20000");
+ values.add("30000");
+ values.add("0");
+ values.add("col");
+ values.add(null);
+ values.add("100");
+ values.add("200");
+ values.add("300");
+ values.add("null");
+ values.add("null");
+ values.add("400");
+ values.add("500");
+ values.add(null);
+ ResultRow row = new ResultRow(values);
+ ColStatsData data = new ColStatsData(row);
+ ColumnStatistic columnStatistic = data.toColumnStatistic();
+ Assertions.assertEquals(100, columnStatistic.count);
+ Assertions.assertEquals(200, columnStatistic.ndv);
+ Assertions.assertEquals(300, columnStatistic.numNulls);
+ Assertions.assertEquals(Double.NEGATIVE_INFINITY,
columnStatistic.minValue);
+ Assertions.assertEquals(Double.POSITIVE_INFINITY,
columnStatistic.maxValue);
+ Assertions.assertEquals(400, columnStatistic.dataSize);
+ Assertions.assertEquals("500", columnStatistic.updatedTime);
+ }
}
@Test
@@ -193,6 +201,7 @@ public class ColStatsDataTest {
values.add("max");
values.add("400");
values.add("500");
+ values.add(null);
ResultRow row = new ResultRow(values);
ColStatsData data = new ColStatsData(row);
Assertions.assertFalse(data.isValid());
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
index 3a9cf558cc5..e870b0b3bfd 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
@@ -201,21 +201,15 @@ public class HMSAnalysisTaskTest {
new MockUp<HMSAnalysisTask>() {
@Mock
public void runQuery(String sql) {
- Assertions.assertEquals("SELECT CONCAT(30001, '-', -1, '-',
'hour') AS `id`,"
- + " 10001 AS `catalog_id`,"
- + " 20001 AS `db_id`,"
- + " 30001 AS `tbl_id`,"
- + " -1 AS `idx_id`,"
- + " 'hour' AS `col_id`,"
- + " NULL AS `part_id`,"
- + " COUNT(1) AS `row_count`,"
- + " NDV(`hour`) AS `ndv`,"
- + " COUNT(1) - COUNT(`hour`) AS `null_count`,"
- + " SUBSTRING(CAST(MIN(`hour`) AS STRING), 1,
1024) AS `min`,"
- + " SUBSTRING(CAST(MAX(`hour`) AS STRING), 1,
1024) AS `max`,"
- + " COUNT(1) * 4 AS `data_size`,"
- + " NOW() AS `update_time`"
- + " FROM `hms`.`default`.`test` ", sql);
+ Assertions.assertEquals("SELECT CONCAT(30001, '-', -1, '-',
'hour') AS `id`, "
+ + "10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS
`tbl_id`, "
+ + "-1 AS `idx_id`, 'hour' AS `col_id`, NULL AS
`part_id`, "
+ + "COUNT(1) AS `row_count`, NDV(`hour`) AS `ndv`, "
+ + "COUNT(1) - COUNT(`hour`) AS `null_count`, "
+ + "SUBSTRING(CAST(MIN(`hour`) AS STRING), 1, 1024) AS
`min`, "
+ + "SUBSTRING(CAST(MAX(`hour`) AS STRING), 1, 1024) AS
`max`, "
+ + "COUNT(1) * 4 AS `data_size`, NOW() AS
`update_time`, "
+ + "null as `hot_value` FROM `hms`.`default`.`test` ",
sql);
}
};
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
index 7503ff8d202..08bc2ff9111 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
@@ -131,23 +131,29 @@ public class OlapAnalysisTaskTest {
}
@Mock
- protected void getSampleParams(Map<String, String> params, long
tableRowCount) {}
+ void getSampleParams(Map<String, String> params, long
tableRowCount) {}
@Mock
- protected boolean useLinearAnalyzeTemplate() {
+ boolean useLinearAnalyzeTemplate() {
return true;
}
@Mock
public void runQuery(String sql) {
- Assertions.assertEquals("SELECT CONCAT(30001, '-', -1, '-',
'null') AS `id`, "
+ Assertions.assertEquals("WITH cte1 AS (SELECT `null` FROM
`catalogName`.`${dbName}`.`null` "
+ + "${sampleHints} ${limit} ), cte2 AS (SELECT
CONCAT(30001, '-', -1, '-', 'null') AS `id`, "
+ "10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS
`tbl_id`, -1 AS `idx_id`, "
- + "'null' AS `col_id`, NULL AS `part_id`, ${rowCount}
AS `row_count`, "
- + "${ndvFunction} as `ndv`, ROUND(SUM(CASE WHEN `null`
IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS `null_count`, "
- + "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`,
SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, "
- + "COUNT(1) * 4 * ${scaleFactor} AS `data_size`, NOW()
FROM "
- + "( SELECT * FROM `catalogName`.`${dbName}`.`null`
${sampleHints} ${limit}) as t ", sql);
- return;
+ + "'null' AS `col_id`, NULL AS `part_id`, ${rowCount}
AS `row_count`, ${ndvFunction} as `ndv`, "
+ + "ROUND(SUM(CASE WHEN `null` IS NULL THEN 1 ELSE 0
END) * ${scaleFactor}) AS `null_count`, "
+ + "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`, "
+ + "SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, "
+ + "COUNT(1) * 4 * ${scaleFactor} AS `data_size`, NOW()
FROM cte1), "
+ + "cte3 AS (SELECT
GROUP_CONCAT(CONCAT(REPLACE(REPLACE(t.`column_key`, "
+ + "\":\", \"\\\\:\"), \";\", \"\\\\;\"), \" :\",
ROUND(t.`count` * 100.0 / ${rowCount2}, 2)), "
+ + "\" ;\") as `hot_value` FROM (SELECT
${subStringColName} as `hash_value`, "
+ + "MAX(`null`) as `column_key`, COUNT(1) AS `count`
FROM cte1 "
+ + "WHERE `null` IS NOT NULL GROUP BY `hash_value`
ORDER BY `count` DESC LIMIT 3) t) "
+ + "SELECT * FROM cte2 CROSS JOIN cte3", sql);
}
};
@@ -166,21 +172,24 @@ public class OlapAnalysisTaskTest {
new MockUp<OlapAnalysisTask>() {
@Mock
public void runQuery(String sql) {
- Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1',
'-', 'null') AS `id`, "
- + "10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS
`tbl_id`, -1 AS `idx_id`, "
- + "'null' AS `col_id`, NULL AS `part_id`, ${rowCount}
AS `row_count`, ${ndvFunction} as `ndv`, "
- + "IFNULL(SUM(IF(`t1`.`column_key` IS NULL,
`t1`.`count`, 0)), 0) * ${scaleFactor} as `null_count`, "
- + "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`,
SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, "
- + "COUNT(1) * 4 * ${scaleFactor} AS `data_size`, NOW()
"
- + "FROM ( SELECT t0.`colValue` as `column_key`,
COUNT(1) as `count`, SUM(`len`) as `column_length` "
- + "FROM (SELECT ${subStringColName} AS
`colValue`, LENGTH(`null`) as `len` "
- + "FROM `catalogName`.`${dbName}`.`null`
${sampleHints} ${limit}) as `t0` "
- + " GROUP BY `t0`.`colValue` ) as `t1` ", sql);
- return;
- }
-
- @Mock
- protected boolean useLinearAnalyzeTemplate() {
+ Assertions.assertEquals("WITH cte1 AS (SELECT
MAX(t0.`col_value`) as `col_value`, COUNT(1) as `count`,"
+ + " SUM(`len`) as `column_length` FROM (SELECT
${subStringColName} AS `hash_value`, "
+ + "`null` AS `col_value`, LENGTH(`null`) as `len` FROM
`catalogName`.`${dbName}`.`null` "
+ + "${sampleHints} ${limit}) as `t0` GROUP BY
`t0`.`hash_value`), "
+ + "cte2 AS ( SELECT CONCAT('30001', '-', '-1', '-',
'null') AS `id`, 10001 AS `catalog_id`, "
+ + "20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS
`idx_id`, 'null' AS `col_id`, NULL AS `part_id`, "
+ + "${rowCount} AS `row_count`, ${ndvFunction} as
`ndv`, IFNULL(SUM(IF(`t1`.`col_value` "
+ + "IS NULL, `t1`.`count`, 0)), 0) * ${scaleFactor} as
`null_count`, SUBSTRING(CAST('1' "
+ + "AS STRING), 1, 1024) AS `min`, SUBSTRING(CAST('2'
AS STRING), 1, 1024) AS `max`, "
+ + "COUNT(1) * 4 * ${scaleFactor} AS `data_size`, NOW()
FROM cte1 t1), cte3 AS (SELECT "
+ + "GROUP_CONCAT(CONCAT(REPLACE(REPLACE(t2.`col_value`,
\":\", \"\\\\:\"), \";\", \"\\\\;\"), "
+ + "\" :\", ROUND(t2.`count` * 100.0 / ${rowCount2},
2)), \" ;\") as `hot_value` FROM (SELECT "
+ + "`col_value`, `count` FROM cte1 WHERE `col_value` IS
NOT NULL ORDER BY `count` DESC LIMIT 3) "
+ + "t2) SELECT * FROM cte2 CROSS JOIN cte3", sql);
+ }
+
+ @Mock
+ boolean useLinearAnalyzeTemplate() {
return false;
}
};
@@ -377,7 +386,7 @@ public class OlapAnalysisTaskTest {
task.getSampleParams(params, 1000);
Assertions.assertEquals("10.0", params.get("scaleFactor"));
Assertions.assertEquals("TABLET(1, 2)", params.get("sampleHints"));
- Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(`t1`.`column_key`)
/ (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not
null, 1, 0)) + SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not null, 1,
0)) * SUM(`t1`.`count`) / 1000)", params.get("ndvFunction"));
+ Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(`t1`.`col_value`) /
(SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1 and `t1`.`col_value` is not null,
1, 0)) + SUM(IF(`t1`.`count` = 1 and `t1`.`col_value` is not null, 1, 0)) *
SUM(`t1`.`count`) / 1000)", params.get("ndvFunction"));
Assertions.assertEquals("SUM(t1.count) * 4",
params.get("dataSizeFunction"));
Assertions.assertEquals("`${colName}`",
params.get("subStringColName"));
Assertions.assertEquals("/*+PREAGGOPEN*/", params.get("preAggHint"));
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java
index 45c78f665b0..3de70a11a52 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java
@@ -42,6 +42,7 @@ public class StatsMockUtil {
// 11
add("11");
add("12");
+ add(null);
add(String.valueOf(System.currentTimeMillis()));
}};
return new ResultRow(vals);
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/util/StatisticsUtilTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/util/StatisticsUtilTest.java
index 0467f9b89dd..15a0c65e49c 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/statistics/util/StatisticsUtilTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/statistics/util/StatisticsUtilTest.java
@@ -17,6 +17,7 @@
package org.apache.doris.statistics.util;
+import org.apache.doris.analysis.LiteralExpr;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.KeysType;
import org.apache.doris.catalog.MaterializedIndexMeta;
@@ -558,4 +559,47 @@ class StatisticsUtilTest {
Assertions.assertTrue(StatisticsUtil.canCollectColumn(column, table,
true, 1));
}
+
+ @Test
+ void testGetHotValues() {
+ String value1 = "1234 :0.33 ;222 :0.22";
+ Map<LiteralExpr, Float> hotValues =
StatisticsUtil.getHotValues(value1, Type.INT);
+ Assertions.assertEquals(2, hotValues.size());
+
+ int i = 0;
+ for (Map.Entry<LiteralExpr, Float> entry : hotValues.entrySet()) {
+ if (i == 0) {
+ Assertions.assertEquals(1234, entry.getKey().getLongValue());
+ Assertions.assertEquals("0.33", entry.getValue().toString());
+ i++;
+ } else {
+ Assertions.assertEquals(222, entry.getKey().getLongValue());
+ Assertions.assertEquals("0.22", entry.getValue().toString());
+ }
+ }
+
+ String value2 = "1234 :0.33";
+ hotValues = StatisticsUtil.getHotValues(value2, Type.INT);
+ Assertions.assertEquals(1, hotValues.size());
+
+ for (Map.Entry<LiteralExpr, Float> entry : hotValues.entrySet()) {
+ Assertions.assertEquals(1234, entry.getKey().getLongValue());
+ Assertions.assertEquals("0.33", entry.getValue().toString());
+ }
+
+ String value3 = "aabbcc\\:\\; :0.33 ; dd :0.22";
+ hotValues = StatisticsUtil.getHotValues(value3, Type.STRING);
+ Assertions.assertEquals(2, hotValues.size());
+ i = 0;
+ for (Map.Entry<LiteralExpr, Float> entry : hotValues.entrySet()) {
+ if (i == 0) {
+ Assertions.assertEquals("aabbcc:;",
entry.getKey().getStringValue());
+ Assertions.assertEquals("0.33", entry.getValue().toString());
+ i++;
+ } else {
+ Assertions.assertEquals(" dd",
entry.getKey().getStringValue());
+ Assertions.assertEquals("0.22", entry.getValue().toString());
+ }
+ }
+ }
}
diff --git
a/regression-test/suites/external_table_p0/hive/test_hive_statistic.groovy
b/regression-test/suites/external_table_p0/hive/test_hive_statistic.groovy
index 971e6bc0253..03a84e45001 100644
--- a/regression-test/suites/external_table_p0/hive/test_hive_statistic.groovy
+++ b/regression-test/suites/external_table_p0/hive/test_hive_statistic.groovy
@@ -293,7 +293,7 @@ suite("test_hive_statistic",
"p0,external,hive,external_docker,external_docker_h
def data_size_in_bytes = result[0][12]
def update_time = result[0][13]
- sql """insert into internal.__internal_schema.column_statistics values
('$id', '$catalog_id', '$db_id', '$tbl_id', '$idx_id', '$col_id', NULL, $count,
$ndv, $null_count, '', '', '$data_size_in_bytes', '$update_time')"""
+ sql """insert into internal.__internal_schema.column_statistics values
('$id', '$catalog_id', '$db_id', '$tbl_id', '$idx_id', '$col_id', NULL, $count,
$ndv, $null_count, '', '', '$data_size_in_bytes', '$update_time', null)"""
result = sql """show column stats logs1_parquet (log_time)"""
assertEquals(result.size(), 1)
diff --git a/regression-test/suites/statistics/test_ddl.groovy
b/regression-test/suites/statistics/test_ddl.groovy
index be2c82bd4ba..a5fa9203cbf 100644
--- a/regression-test/suites/statistics/test_ddl.groovy
+++ b/regression-test/suites/statistics/test_ddl.groovy
@@ -17,7 +17,7 @@
suite("test_ddl") {
sql """
- INSERT INTO `__internal_schema`.`column_statistics` VALUES
('10143--1-cd_credit_rating','0','10141','10143','-1','cd_credit_rating',NULL,1920800,4,0,'Good','Unknown',0,'2023-06-15
19:50:43'),('10143--1-cd_demo_sk-10142','0','10141','10143','-1','cd_demo_sk','10142',1920800,1916366,0,'1','1920800',0,'2023-06-15
19:50:42'),('10143--1-cd_dep_employed_count','0','10141','10143','-1','cd_dep_employed_count',NULL,1920800,7,0,'0','6',0,'2023-06-15
19:50:43'),('10143--1-cd_dep_employed_c [...]
+ INSERT INTO `__internal_schema`.`column_statistics` VALUES
('10143--1-cd_credit_rating','0','10141','10143','-1','cd_credit_rating',NULL,1920800,4,0,'Good','Unknown',0,'2023-06-15
19:50:43',null),('10143--1-cd_demo_sk-10142','0','10141','10143','-1','cd_demo_sk','10142',1920800,1916366,0,'1','1920800',0,'2023-06-15
19:50:42',null),('10143--1-cd_dep_employed_count','0','10141','10143','-1','cd_dep_employed_count',NULL,1920800,7,0,'0','6',0,'2023-06-15
19:50:43',null),('10143--1-cd_ [...]
"""
sql """
diff --git a/regression-test/suites/statistics/test_hot_value.groovy
b/regression-test/suites/statistics/test_hot_value.groovy
new file mode 100644
index 00000000000..39f59524511
--- /dev/null
+++ b/regression-test/suites/statistics/test_hot_value.groovy
@@ -0,0 +1,184 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_hot_value") {
+
+ def wait_row_count_reported = { db, table, row, column, expected ->
+ def result = sql """show frontends;"""
+ logger.info("show frontends result origin: " + result)
+ def host
+ def port
+ for (int i = 0; i < result.size(); i++) {
+ if (result[i][8] == "true") {
+ host = result[i][1]
+ port = result[i][4]
+ }
+ }
+ def tokens = context.config.jdbcUrl.split('/')
+ def url=tokens[0] + "//" + host + ":" + port
+ logger.info("Master url is " + url)
+ connect(context.config.jdbcUser, context.config.jdbcPassword, url) {
+ sql """use ${db}"""
+ result = sql """show frontends;"""
+ logger.info("show frontends result master: " + result)
+ for (int i = 0; i < 120; i++) {
+ Thread.sleep(5000)
+ result = sql """SHOW DATA FROM ${table};"""
+ logger.info("result " + result)
+ if (result[row][column] == expected) {
+ return;
+ }
+ }
+ throw new Exception("Row count report timeout.")
+ }
+
+ }
+
+ sql """drop database if exists test_hot_value"""
+ sql """create database test_hot_value"""
+ sql """use test_hot_value"""
+ sql """set global enable_auto_analyze=false"""
+
+ sql """CREATE TABLE test1 (
+ key1 int NULL,
+ value1 varchar(25) NULL
+ )ENGINE=OLAP
+ DUPLICATE KEY(`key1`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY HASH(`key1`) BUCKETS 2
+ PROPERTIES (
+ "replication_num" = "1"
+ )
+ """
+ sql """insert into test1 select number, number % 2 from
numbers("number"="10000")"""
+ sql """CREATE TABLE test2 (
+ key1 int NULL,
+ value1 varchar(25) NULL
+ )ENGINE=OLAP
+ DUPLICATE KEY(`key1`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY HASH(`key1`) BUCKETS 2
+ PROPERTIES (
+ "replication_num" = "1"
+ )
+ """
+ sql """insert into test2 select number, " : ;a" from
numbers("number"="10000")"""
+ wait_row_count_reported("test_hot_value", "test1", 0, 4, "10000")
+ wait_row_count_reported("test_hot_value", "test2", 0, 4, "10000")
+ sql """analyze table test1 with sync"""
+ def result = sql """show column stats test1(key1)"""
+ assertEquals(1, result.size())
+ assertEquals("10000.0", result[0][2])
+ assertEquals("null", result[0][17])
+ result = sql """show column stats test1(value1)"""
+ assertEquals(1, result.size())
+ assertEquals("10000.0", result[0][2])
+ assertEquals("null", result[0][17])
+ result = sql """show column cached stats test1(key1)"""
+ assertEquals(1, result.size())
+ assertEquals("10000.0", result[0][2])
+ assertEquals("null", result[0][17])
+ result = sql """show column cached stats test1(value1)"""
+ assertEquals(1, result.size())
+ assertEquals("10000.0", result[0][2])
+ assertEquals("null", result[0][17])
+
+ sql """drop stats test1"""
+ sql """analyze table test1 with sample rows 400 with sync"""
+ result = sql """show column stats test1(key1)"""
+ logger.info("result " + result)
+ assertEquals(1, result.size())
+ assertEquals("10000.0", result[0][2])
+ result = sql """show column stats test1(value1)"""
+ logger.info("result " + result)
+ assertEquals(1, result.size())
+ assertEquals("10000.0", result[0][2])
+ String[] hotValues = result[0][17].split(";")
+ assertEquals(2, hotValues.length)
+ assertTrue(hotValues[0] == "1:50.0" || hotValues[0] == "0:50.0")
+ assertTrue(hotValues[1] == "1:50.0" || hotValues[1] == "0:50.0")
+ result = sql """show column cached stats test1(value1)"""
+ logger.info("result " + result)
+ assertEquals(1, result.size())
+ assertEquals("10000.0", result[0][2])
+ hotValues = result[0][17].split(";")
+ assertEquals(2, hotValues.length)
+ assertTrue(hotValues[0] == "1:50.0" || hotValues[0] == "0:50.0")
+ assertTrue(hotValues[1] == "1:50.0" || hotValues[1] == "0:50.0")
+
+ sql """drop stats test1"""
+ sql """analyze table test1 with sample rows 40000 with sync"""
+ result = sql """show column stats test1(key1)"""
+ logger.info("result " + result)
+ assertEquals(1, result.size())
+ assertEquals("10000.0", result[0][2])
+ result = sql """show column stats test1(value1)"""
+ logger.info("result " + result)
+ assertEquals(1, result.size())
+ assertEquals("10000.0", result[0][2])
+ hotValues = result[0][17].split(";")
+ assertEquals(2, hotValues.length)
+ assertTrue(hotValues[0] == "1:50.0" || hotValues[0] == "0:50.0")
+ assertTrue(hotValues[1] == "1:50.0" || hotValues[1] == "0:50.0")
+ result = sql """show column cached stats test1(value1)"""
+ logger.info("result " + result)
+ assertEquals(1, result.size())
+ assertEquals("10000.0", result[0][2])
+ hotValues = result[0][17].split(";")
+ assertEquals(2, hotValues.length)
+ assertTrue(hotValues[0] == "1:50.0" || hotValues[0] == "0:50.0")
+ assertTrue(hotValues[1] == "1:50.0" || hotValues[1] == "0:50.0")
+
+ sql """alter table test1 modify column value1 set stats
('row_count'='5.0', 'ndv'='5.0', 'num_nulls'='0.0', 'data_size'='34.0',
'min_value'='AFRICA', 'max_value'='MIDDLE EAST', 'hot_values'='aaa :22.33');"""
+ result = sql """show column stats test1(value1)"""
+ assertEquals(1, result.size())
+ assertEquals("5.0", result[0][2])
+ assertEquals("aaa:22.33", result[0][17])
+ result = sql """show column cached stats test1(value1)"""
+ assertEquals(1, result.size())
+ assertEquals("5.0", result[0][2])
+ assertEquals("aaa:22.33", result[0][17])
+
+ sql """alter table test1 modify column value1 set stats
('row_count'='5.0', 'ndv'='5.0', 'num_nulls'='0.0', 'data_size'='34.0',
'min_value'='AFRICA', 'max_value'='MIDDLE EAST', 'hot_values'='a \\\\;a \\\\:a
:22.33');"""
+ result = sql """show column stats test1(value1)"""
+ assertEquals(1, result.size())
+ assertEquals("5.0", result[0][2])
+ assertEquals("a ;a :a:22.33", result[0][17])
+ result = sql """show column cached stats test1(value1)"""
+ assertEquals(1, result.size())
+ assertEquals("5.0", result[0][2])
+ assertEquals("a ;a :a:22.33", result[0][17])
+
+ sql """analyze table test2 with sample rows 100 with sync"""
+ result = sql """show column stats test2(value1)"""
+ assertEquals(1, result.size())
+ assertEquals(" : ;a:100.0", result[0][17])
+ result = sql """show column cached stats test2(value1)"""
+ assertEquals(1, result.size())
+ assertEquals(" : ;a:100.0", result[0][17])
+
+ sql """drop stats test2"""
+ sql """analyze table test2 with sample rows 4000000 with sync"""
+ result = sql """show column stats test2(value1)"""
+ assertEquals(1, result.size())
+ assertEquals(" : ;a:100.0", result[0][17])
+ result = sql """show column cached stats test2(value1)"""
+ assertEquals(1, result.size())
+ assertEquals(" : ;a:100.0", result[0][17])
+
+ sql """drop database if exists test_hot_value"""
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]