This is an automated email from the ASF dual-hosted git repository.
morrysnow pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 7cc003ed5a3 [opt](nereids) clear min/max column stats if table is
partially analyzed (#35533)
7cc003ed5a3 is described below
commit 7cc003ed5a3982c3fc208fde78ae37ca1ed7a8a7
Author: minghong <[email protected]>
AuthorDate: Wed May 29 11:54:01 2024 +0800
[opt](nereids) clear min/max column stats if table is partially analyzed
(#35533)
cherry picked from master PR #33685
commit 3d14f663a6a30292a547fd56e557cde55593c4b6
If a user queries newly loaded data (data that has not been analyzed yet),
the optimizer may generate an inefficient plan because the newly loaded data
falls outside the min-max range recorded in the column stats.
In this PR, we ignore the min/max column stats if the table contains newly loaded (not yet analyzed) data.
---
.../doris/nereids/stats/StatsCalculator.java | 65 ++++++++++++++--------
1 file changed, 43 insertions(+), 22 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
index f9b767f40c8..d711b99655b 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
@@ -114,6 +114,7 @@ import
org.apache.doris.nereids.trees.plans.physical.PhysicalWindow;
import org.apache.doris.nereids.trees.plans.visitor.DefaultPlanVisitor;
import org.apache.doris.nereids.types.DataType;
import org.apache.doris.qe.ConnectContext;
+import org.apache.doris.statistics.AnalysisManager;
import org.apache.doris.statistics.ColumnStatistic;
import org.apache.doris.statistics.ColumnStatisticBuilder;
import org.apache.doris.statistics.Histogram;
@@ -121,8 +122,10 @@ import org.apache.doris.statistics.StatisticConstants;
import org.apache.doris.statistics.StatisticRange;
import org.apache.doris.statistics.Statistics;
import org.apache.doris.statistics.StatisticsBuilder;
+import org.apache.doris.statistics.TableStatsMeta;
import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.commons.collections.CollectionUtils;
@@ -620,10 +623,20 @@ public class StatsCalculator extends
DefaultPlanVisitor<Statistics, Void> {
// 2. Consider the influence of runtime filter
// 3. Get NDV and column data size from StatisticManger,
StatisticManager doesn't support it now.
private Statistics computeCatalogRelation(CatalogRelation catalogRelation)
{
- Set<SlotReference> slotSet =
catalogRelation.getOutput().stream().filter(SlotReference.class::isInstance)
- .map(s -> (SlotReference) s).collect(Collectors.toSet());
- Map<Expression, ColumnStatistic> columnStatisticMap = new HashMap<>();
+ List<Slot> output = catalogRelation.getOutput();
+ ImmutableSet.Builder<SlotReference> slotSetBuilder =
ImmutableSet.builderWithExpectedSize(output.size());
+ for (Slot slot : output) {
+ if (slot instanceof SlotReference) {
+ slotSetBuilder.add((SlotReference) slot);
+ }
+ }
+ Set<SlotReference> slotSet = slotSetBuilder.build();
+ Map<Expression, ColumnStatisticBuilder> columnStatisticBuilderMap =
new HashMap<>();
TableIf table = catalogRelation.getTable();
+ AnalysisManager analysisManager =
Env.getCurrentEnv().getAnalysisManager();
+ TableStatsMeta tableMeta =
analysisManager.findTableStatsStatus(table.getId());
+ // rows newly updated after last analyze
+ long deltaRowCount = tableMeta == null ? 0 :
tableMeta.updatedRows.get();
double rowCount = catalogRelation.getTable().getRowCountForNereids();
boolean hasUnknownCol = false;
long idxId = -1;
@@ -633,6 +646,10 @@ public class StatsCalculator extends
DefaultPlanVisitor<Statistics, Void> {
idxId = olapScan.getSelectedIndexId();
}
}
+ if (deltaRowCount > 0 && LOG.isDebugEnabled()) {
+ LOG.debug("{} is partially analyzed, clear min/max values in
column stats",
+ catalogRelation.getTable().getName());
+ }
for (SlotReference slotReference : slotSet) {
String colName = slotReference.getColumn().isPresent()
? slotReference.getColumn().get().getName()
@@ -649,40 +666,44 @@ public class StatsCalculator extends
DefaultPlanVisitor<Statistics, Void> {
} else {
cache = getColumnStatistic(table, colName, idxId);
}
+ ColumnStatisticBuilder colStatsBuilder = new
ColumnStatisticBuilder(cache);
if (cache.avgSizeByte <= 0) {
- cache = new ColumnStatisticBuilder(cache)
-
.setAvgSizeByte(slotReference.getColumn().get().getType().getSlotSize())
- .build();
+
colStatsBuilder.setAvgSizeByte(slotReference.getColumn().get().getType().getSlotSize());
}
if (!cache.isUnKnown) {
- rowCount = Math.max(rowCount, cache.count);
+ rowCount = Math.max(rowCount, cache.count + deltaRowCount);
} else {
hasUnknownCol = true;
}
if (ConnectContext.get() != null &&
ConnectContext.get().getSessionVariable().enableStats) {
- columnStatisticMap.put(slotReference, cache);
+ if (deltaRowCount > 0) {
+ // clear min-max to avoid error estimation
+ // for example, after yesterday data loaded, user send
query about yesterday immediately.
+ // since yesterday data are not analyzed, the max date is
before yesterday, and hence optimizer
+ // estimates the filter result is zero
+
colStatsBuilder.setMinExpr(null).setMinValue(Double.NEGATIVE_INFINITY)
+
.setMaxExpr(null).setMaxValue(Double.POSITIVE_INFINITY);
+ }
+ columnStatisticBuilderMap.put(slotReference, colStatsBuilder);
} else {
- columnStatisticMap.put(slotReference, ColumnStatistic.UNKNOWN);
+ columnStatisticBuilderMap.put(slotReference, new
ColumnStatisticBuilder(ColumnStatistic.UNKNOWN));
hasUnknownCol = true;
}
}
if (hasUnknownCol && ConnectContext.get() != null &&
ConnectContext.get().getStatementContext() != null) {
ConnectContext.get().getStatementContext().setHasUnknownColStats(true);
}
- Statistics stats = new Statistics(rowCount, columnStatisticMap);
- stats = normalizeCatalogRelationColumnStatsRowCount(stats);
- return stats;
- }
-
- private Statistics normalizeCatalogRelationColumnStatsRowCount(Statistics
stats) {
- for (Expression slot : stats.columnStatistics().keySet()) {
- ColumnStatistic colStats = stats.findColumnStatistics(slot);
- Preconditions.checkArgument(colStats != null,
- "can not find col stats for %s in table", slot.toSql());
- stats.addColumnStats(slot,
- new
ColumnStatisticBuilder(colStats).setCount(stats.getRowCount()).build());
+ return normalizeCatalogRelationColumnStatsRowCount(rowCount,
columnStatisticBuilderMap);
+ }
+
+ private Statistics normalizeCatalogRelationColumnStatsRowCount(double
rowCount,
+ Map<Expression, ColumnStatisticBuilder> columnStatisticBuilderMap)
{
+ Map<Expression, ColumnStatistic> columnStatisticMap = new HashMap<>();
+ for (Expression slot : columnStatisticBuilderMap.keySet()) {
+ columnStatisticMap.put(slot,
+
columnStatisticBuilderMap.get(slot).setCount(rowCount).build());
}
- return stats;
+ return new Statistics(rowCount, columnStatisticMap);
}
private Statistics computeTopN(TopN topN) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]