This is an automated email from the ASF dual-hosted git repository.
englefly pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 489db8125ba [improve](nereids) if a value occurs many times in a
column, take it as hot value (#55827)
489db8125ba is described below
commit 489db8125ba8e181a1de82cde9aa728b9e59028b
Author: minghong <[email protected]>
AuthorDate: Wed Sep 17 08:37:53 2025 +0800
[improve](nereids) if a value occurs many times in a column, take it as hot
value (#55827)
### What problem does this PR solve?
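In the previous PR #55472, a value qualified as a hot value for a column only
when its occurrence frequency was sufficiently higher than the average
frequency of all values (i.e., 1/ndv). However, that criterion alone is not
conducive to the use of skew join, because values in a uniformly distributed
column never exceed the average by much. Therefore, this PR also adds values
from columns with a uniform distribution but a small ndv to the collection of
hot values: a value is now hot if its frequency is at least
hot_value_threshold (default 0.1), or if it meets the previous relative
criterion, whose variable is renamed to skew_value_threshold.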
---
.../doris/nereids/stats/StatsCalculator.java | 3 +-
.../java/org/apache/doris/qe/SessionVariable.java | 35 +++++++++++++++++-----
.../doris/statistics/util/StatisticsUtil.java | 3 +-
.../doris/nereids/stats/StatsCalculatorTest.java | 2 +-
4 files changed, 32 insertions(+), 11 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
index f129ac4e9df..f0ca1f1e6ba 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
@@ -1526,7 +1526,8 @@ public class StatsCalculator extends
DefaultPlanVisitor<Statistics, Void> {
Map<Literal, Float> resultHotValues = new LinkedHashMap<>();
for (Literal hot : unionHotValues.keySet()) {
float ratio = (float) (unionHotValues.get(hot) /
unionRowCount);
- if (ratio * colStatsBuilder.getNdv() >=
SessionVariable.getHotValueThreshold()) {
+ if (ratio * colStatsBuilder.getNdv() >=
SessionVariable.getSkewValueThreshold()
+ || ratio >= SessionVariable.getHotValueThreshold()) {
resultHotValues.put(hot, ratio);
}
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index 889eb4d91c8..f9cccbbfd85 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -794,24 +794,43 @@ public class SessionVariable implements Serializable,
Writable {
}
}
- public static final String HOT_VALUE_THRESHOLD = "hot_value_threshold";
+ public static final String SKEW_VALUE_THRESHOLD = "skew_value_threshold";
- @VariableMgr.VarAttr(name = HOT_VALUE_THRESHOLD, needForward = true,
- description = {"当列中某个特定值的出现次数大于等于(rowCount/ndv)×
hotValueThreshold 时,该值即被视为热点值",
+
+ @VariableMgr.VarAttr(name = SKEW_VALUE_THRESHOLD, needForward = true,
+ description = {"当列中某个特定值的出现次数大于等于(rowCount/ndv)×
skewValueThreshold 时,该值即被视为热点值",
"When the occurrence of a value in a column is greater
than "
- + "hotValueThreshold tmies of average
occurences "
- + "(occurrences >= hotValueThreshold *
rowCount / ndv), "
+ + "skewValueThreshold tmies of average
occurences "
+ + "(occurrences >= skewValueThreshold *
rowCount / ndv), "
+ "the value is regarded as hot value"})
- private double hotValueThreshold = 10;
+ private double skewValueThreshold = 10;
+
+ public void setSkewValueThreshold(int threshold) {
+ this.skewValueThreshold = threshold;
+ }
+
+ public static double getSkewValueThreshold() {
+ if (ConnectContext.get() != null) {
+ return
ConnectContext.get().getSessionVariable().skewValueThreshold;
+ } else {
+ return
Double.parseDouble(VariableMgr.getDefaultValue(SKEW_VALUE_THRESHOLD));
+ }
+ }
+
+ public static final String HOT_VALUE_THRESHOLD = "hot_value_threshold";
+ @VariableMgr.VarAttr(name = HOT_VALUE_THRESHOLD, needForward = true,
+ description = {"hot value 在列中出现的最小比例",
+ "The minimum ratio of occurrences of a hot value in a
column"})
+ private double hotValueThreshold = 0.10d;
- public void setHotValueThreshold(int threshold) {
+ public void setHotValueThreshold(double threshold) {
this.hotValueThreshold = threshold;
}
public static double getHotValueThreshold() {
if (ConnectContext.get() != null) {
if (ConnectContext.get().getState().isInternal()) {
- return 0.0;
+ return 0.1;
} else {
return
ConnectContext.get().getSessionVariable().hotValueThreshold;
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
index 4f6cba39ab9..199dfd26987 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
@@ -1278,7 +1278,8 @@ public class StatisticsUtil {
for (String oneRow : stringValues.split(" ;")) {
String[] oneRowSplit = oneRow.split(" :");
float value = Float.parseFloat(oneRowSplit[1]);
- if (value >= avgOccurrences *
SessionVariable.getHotValueThreshold()) {
+ if (value >= avgOccurrences *
SessionVariable.getSkewValueThreshold()
+ || value >= SessionVariable.getHotValueThreshold()) {
org.apache.doris.nereids.trees.expressions.literal.StringLiteral stringLiteral =
new
org.apache.doris.nereids.trees.expressions.literal.StringLiteral(
oneRowSplit[0].replaceAll("\\\\:", ":")
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/StatsCalculatorTest.java
b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/StatsCalculatorTest.java
index 0590078de22..c5d2f2fa737 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/StatsCalculatorTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/StatsCalculatorTest.java
@@ -518,7 +518,7 @@ public class StatsCalculatorTest {
StatsCalculator calculator = new StatsCalculator(null);
Statistics outputStats = calculator.computeUnion(unionAll,
ImmutableList.of(child0Stats, child1Stats));
ColumnStatistic iaStatsOut = outputStats.findColumnStatistics(ia);
- Assertions.assertEquals(1, iaStatsOut.getHotValues().size());
+ Assertions.assertEquals(3, iaStatsOut.getHotValues().size());
Assertions.assertTrue(containsHotValue(iaStatsOut, "1"));
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]