This is an automated email from the ASF dual-hosted git repository.
lijibing pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 177e1af868f Return UNKNOWN column stats if ndv is 0. (#31439) (#31588)
177e1af868f is described below
commit 177e1af868f118aeef1977b0b41252797125a3e5
Author: Jibing-Li <[email protected]>
AuthorDate: Thu Feb 29 16:56:26 2024 +0800
Return UNKNOWN column stats if ndv is 0. (#31439) (#31588)
---
.../org/apache/doris/statistics/ColStatsData.java | 5 ++++
.../doris/statistics/ColumnStatisticBuilder.java | 4 +--
.../statistics/ColumnStatisticsCacheLoader.java | 30 ++++++++++++++--------
.../apache/doris/statistics/StatisticsCache.java | 5 +++-
.../doris/nereids/stats/FilterEstimationTest.java | 2 ++
.../suites/statistics/analyze_stats.groovy | 17 ++++++++++++
6 files changed, 50 insertions(+), 13 deletions(-)
diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
index c90b3dd8e1d..6bbafdbe5b5 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
@@ -131,6 +131,11 @@ public class ColStatsData {
}
public ColumnStatistic toColumnStatistic() {
+ // For non-empty table, return UNKNOWN if we can't collect ndv value.
+ // Because inaccurate ndv is very misleading.
+ if (count > 0 && ndv == 0 && count != nullCount) {
+ return ColumnStatistic.UNKNOWN;
+ }
try {
ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder();
columnStatisticBuilder.setCount(count);
diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
index f8ed6a1b6ab..a512fbadbda 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
@@ -25,8 +25,8 @@ public class ColumnStatisticBuilder {
private double avgSizeByte;
private double numNulls;
private double dataSize;
- private double minValue;
- private double maxValue;
+ private double minValue = Double.NEGATIVE_INFINITY;
+ private double maxValue = Double.POSITIVE_INFINITY;
private LiteralExpr minExpr;
private LiteralExpr maxExpr;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java
index 0b66fa5e7b1..bc5fc4c10c0 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java
@@ -37,22 +37,32 @@ public class ColumnStatisticsCacheLoader extends StatisticsCacheLoader<Optional<
try {
// Load from statistics table.
columnStatistic = loadFromStatsTable(key);
- if (columnStatistic.isPresent()) {
- return columnStatistic;
- }
- // Load from data source metadata
- try {
- TableIf table = StatisticsUtil.findTable(key.catalogId, key.dbId, key.tableId);
- columnStatistic = table.getColumnStatistic(key.colName);
- } catch (Exception e) {
- LOG.debug(String.format("Exception to get column statistics by metadata. [Catalog:{}, DB:{}, Table:{}]",
- key.catalogId, key.dbId, key.tableId), e);
+ if (!columnStatistic.isPresent()) {
+ // Load from data source metadata
+ try {
+ TableIf table = StatisticsUtil.findTable(key.catalogId, key.dbId, key.tableId);
+ columnStatistic = table.getColumnStatistic(key.colName);
+ } catch (Exception e) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(String.format("Exception to get column statistics by metadata."
+ + "[Catalog:{}, DB:{}, Table:{}]",
+ key.catalogId, key.dbId, key.tableId), e);
+ }
+ }
}
} catch (Throwable t) {
LOG.warn("Failed to load stats for column [Catalog:{}, DB:{}, Table:{}, Column:{}], Reason: {}",
key.catalogId, key.dbId, key.tableId, key.colName, t.getMessage());
LOG.debug(t);
}
+ if (columnStatistic.isPresent()) {
+ // For non-empty table, return UNKNOWN if we can't collect ndv value.
+ // Because inaccurate ndv is very misleading.
+ ColumnStatistic stats = columnStatistic.get();
+ if (stats.count > 0 && stats.ndv == 0 && stats.count != stats.numNulls) {
+ columnStatistic = Optional.of(ColumnStatistic.UNKNOWN);
+ }
+ }
return columnStatistic;
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java
index 62e11f5c9d8..1826f10a38a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java
@@ -173,7 +173,10 @@ public class StatisticsCache {
String colId = statsId.colId;
final StatisticsCacheKey k =
new StatisticsCacheKey(tblId, idxId, colId);
- final ColumnStatistic c = ColumnStatistic.fromResultRow(r);
+ ColumnStatistic c = ColumnStatistic.fromResultRow(r);
+ if (c.count > 0 && c.ndv == 0 && c.count != c.numNulls) {
+ c = ColumnStatistic.UNKNOWN;
+ }
putCache(k, c);
} catch (Throwable t) {
LOG.warn("Error when preheating stats cache", t);
diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
index 66e64145901..9b37f1119a4 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
@@ -134,6 +134,8 @@ class FilterEstimationTest {
Map<Expression, ColumnStatistic> slotToColumnStat = new HashMap<>();
ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
.setNdv(500)
+ .setMaxValue(0)
+ .setMinValue(0)
.setIsUnknown(false);
slotToColumnStat.put(a, builder.build());
Statistics stat = new Statistics(1000, slotToColumnStat);
diff --git a/regression-test/suites/statistics/analyze_stats.groovy b/regression-test/suites/statistics/analyze_stats.groovy
index 35ba207b882..1fa86e52b02 100644
--- a/regression-test/suites/statistics/analyze_stats.groovy
+++ b/regression-test/suites/statistics/analyze_stats.groovy
@@ -2670,6 +2670,23 @@ PARTITION `p599` VALUES IN (599)
sql """drop stats alter_test"""
alter_result = sql """show table stats alter_test"""
assertEquals("false", alter_result[0][7])
+ sql """alter table alter_test modify column id set stats ('row_count'='100', 'ndv'='0', 'num_nulls'='0.0', 'data_size'='2.69975443E8', 'min_value'='1', 'max_value'='2');"""
+ alter_result = sql """show column stats alter_test(id)"""
+ assertEquals(1, alter_result.size())
+ alter_result = sql """show column cached stats alter_test(id)"""
+ assertEquals(0, alter_result.size())
+ alter_result = sql """show column cached stats alter_test(id)"""
+ assertEquals(0, alter_result.size())
+ sql """alter table alter_test modify column id set stats ('row_count'='100', 'ndv'='0', 'num_nulls'='100', 'data_size'='2.69975443E8', 'min_value'='1', 'max_value'='2');"""
+ alter_result = sql """show column stats alter_test(id)"""
+ assertEquals(1, alter_result.size())
+ alter_result = sql """show column cached stats alter_test(id)"""
+ assertEquals(1, alter_result.size())
+ sql """alter table alter_test modify column id set stats ('row_count'='100', 'ndv'='1', 'num_nulls'='0', 'data_size'='2.69975443E8', 'min_value'='1', 'max_value'='2');"""
+ alter_result = sql """show column stats alter_test(id)"""
+ assertEquals(1, alter_result.size())
+ alter_result = sql """show column cached stats alter_test(id)"""
+ assertEquals(1, alter_result.size())
// Test trigger type, manual default full, manual high health value, sample empty, kill job, show analyze
sql """DROP DATABASE IF EXISTS trigger"""
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]