morrySnow commented on code in PR #64618:
URL: https://github.com/apache/doris/pull/64618#discussion_r3457805903


##########
fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/InferSetOperatorDistinct.java:
##########
@@ -38,28 +44,137 @@
  * </pre>
  */
 public class InferSetOperatorDistinct extends OneRewriteRuleFactory {
+    private static final double LOWER_AGGREGATE_EFFECT_COEFFICIENT = 10000;
+    private static final double LOW_AGGREGATE_EFFECT_COEFFICIENT = 1000;
+    private static final double MEDIUM_AGGREGATE_EFFECT_COEFFICIENT = 100;
+
+    private final StatsDerive derive = new StatsDerive(false);
+
     @Override
     public Rule build() {
         return logicalSetOperation()
                 .when(operation -> operation.getQualifier() == 
Qualifier.DISTINCT)
                 .then(setOperation -> {
-                    if (setOperation.children().stream().anyMatch(child -> 
child instanceof LogicalAggregate)) {
-                        return null;
+                    ImmutableList.Builder<Plan> newChildren =
+                            
ImmutableList.builderWithExpectedSize(setOperation.arity());
+                    boolean hasNewChildren = false;
+                    for (Plan child : setOperation.children()) {
+                        if (shouldInferDistinct(child)) {
+                            newChildren.add(new LogicalAggregate<>(
+                                    ImmutableList.copyOf(child.getOutput()), 
true, child));
+                            hasNewChildren = true;
+                        } else {
+                            newChildren.add(child);
+                        }
                     }
-
-                    List<Plan> newChildren = setOperation.children().stream()
-                            .map(child -> isAgg(child) ? child
-                                    : new 
LogicalAggregate<>(ImmutableList.copyOf(child.getOutput()), true, child))
-                            .collect(ImmutableList.toImmutableList());
-                    if (newChildren.equals(setOperation.children())) {
+                    if (!hasNewChildren) {
                         return null;
                     }
-                    return setOperation.withChildren(newChildren);
+                    return setOperation.withChildren(newChildren.build());
                 }).toRule(RuleType.INFER_SET_OPERATOR_DISTINCT);
     }
 
+    private boolean shouldInferDistinct(Plan child) {
+        return !isAgg(child) && rejectNLJ(child)
+                && shouldGenerateAggregateByNdv(child, child.getOutput());
+    }
+
     private boolean isAgg(Plan plan) {
         return plan instanceof LogicalAggregate || (plan instanceof 
LogicalProject && plan.child(
                 0) instanceof LogicalAggregate);
     }
+
+    // if children exist NLJ, we can't infer distinct
+    // because NLJ could generate bitmap runtime filter. and it will execute 
failed when we do infer distinct.
+    private boolean rejectNLJ(Plan plan) {

Review Comment:
   remove this. has been fixed in #64314



##########
fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/InferSetOperatorDistinct.java:
##########
@@ -38,28 +44,137 @@
  * </pre>
  */
 public class InferSetOperatorDistinct extends OneRewriteRuleFactory {
+    private static final double LOWER_AGGREGATE_EFFECT_COEFFICIENT = 10000;
+    private static final double LOW_AGGREGATE_EFFECT_COEFFICIENT = 1000;
+    private static final double MEDIUM_AGGREGATE_EFFECT_COEFFICIENT = 100;
+
+    private final StatsDerive derive = new StatsDerive(false);
+
     @Override
     public Rule build() {
         return logicalSetOperation()
                 .when(operation -> operation.getQualifier() == 
Qualifier.DISTINCT)
                 .then(setOperation -> {
-                    if (setOperation.children().stream().anyMatch(child -> 
child instanceof LogicalAggregate)) {
-                        return null;
+                    ImmutableList.Builder<Plan> newChildren =
+                            
ImmutableList.builderWithExpectedSize(setOperation.arity());
+                    boolean hasNewChildren = false;
+                    for (Plan child : setOperation.children()) {
+                        if (shouldInferDistinct(child)) {
+                            newChildren.add(new LogicalAggregate<>(
+                                    ImmutableList.copyOf(child.getOutput()), 
true, child));
+                            hasNewChildren = true;
+                        } else {
+                            newChildren.add(child);
+                        }
                     }
-
-                    List<Plan> newChildren = setOperation.children().stream()
-                            .map(child -> isAgg(child) ? child
-                                    : new 
LogicalAggregate<>(ImmutableList.copyOf(child.getOutput()), true, child))
-                            .collect(ImmutableList.toImmutableList());
-                    if (newChildren.equals(setOperation.children())) {
+                    if (!hasNewChildren) {
                         return null;
                     }
-                    return setOperation.withChildren(newChildren);
+                    return setOperation.withChildren(newChildren.build());
                 }).toRule(RuleType.INFER_SET_OPERATOR_DISTINCT);
     }
 
+    private boolean shouldInferDistinct(Plan child) {
+        return !isAgg(child) && rejectNLJ(child)
+                && shouldGenerateAggregateByNdv(child, child.getOutput());
+    }
+
     private boolean isAgg(Plan plan) {
         return plan instanceof LogicalAggregate || (plan instanceof 
LogicalProject && plan.child(
                 0) instanceof LogicalAggregate);
     }
+
+    // if children exist NLJ, we can't infer distinct
+    // because NLJ could generate bitmap runtime filter. and it will execute 
failed when we do infer distinct.
+    private boolean rejectNLJ(Plan plan) {
+        if (plan instanceof LogicalProject) {
+            plan = plan.child(0);
+        }
+        if (plan instanceof LogicalJoin) {
+            LogicalJoin<?, ?> join = (LogicalJoin<?, ?>) plan;
+            return join.getOtherJoinConjuncts().isEmpty();
+        }
+        return true;
+    }
+
+    private boolean shouldGenerateAggregateByNdv(Plan plan, List<? extends 
NamedExpression> groupKeys) {
+        Statistics stats = plan.getStats();
+        if (stats == null) {
+            stats = plan.accept(derive, new StatsDerive.DeriveContext());
+            if (stats == null) {
+                return false;
+            }
+        }
+        if (stats.getRowCount() <= 0) {
+            return false;
+        }
+
+        List<ColumnStatistic> lower = new ArrayList<>();

Review Comment:
   low? why use comparative degree?



##########
fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/InferSetOperatorDistinct.java:
##########
@@ -38,28 +44,137 @@
  * </pre>
  */
 public class InferSetOperatorDistinct extends OneRewriteRuleFactory {
+    private static final double LOWER_AGGREGATE_EFFECT_COEFFICIENT = 10000;
+    private static final double LOW_AGGREGATE_EFFECT_COEFFICIENT = 1000;
+    private static final double MEDIUM_AGGREGATE_EFFECT_COEFFICIENT = 100;

Review Comment:
   The combination of names LOWER - LOW - MEDIUM is a bit odd.



##########
fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/InferSetOperatorDistinct.java:
##########
@@ -38,28 +44,137 @@
  * </pre>
  */
 public class InferSetOperatorDistinct extends OneRewriteRuleFactory {
+    private static final double LOWER_AGGREGATE_EFFECT_COEFFICIENT = 10000;
+    private static final double LOW_AGGREGATE_EFFECT_COEFFICIENT = 1000;
+    private static final double MEDIUM_AGGREGATE_EFFECT_COEFFICIENT = 100;
+
+    private final StatsDerive derive = new StatsDerive(false);
+
     @Override
     public Rule build() {
         return logicalSetOperation()
                 .when(operation -> operation.getQualifier() == 
Qualifier.DISTINCT)
                 .then(setOperation -> {
-                    if (setOperation.children().stream().anyMatch(child -> 
child instanceof LogicalAggregate)) {
-                        return null;
+                    ImmutableList.Builder<Plan> newChildren =
+                            
ImmutableList.builderWithExpectedSize(setOperation.arity());
+                    boolean hasNewChildren = false;
+                    for (Plan child : setOperation.children()) {
+                        if (shouldInferDistinct(child)) {
+                            newChildren.add(new LogicalAggregate<>(
+                                    ImmutableList.copyOf(child.getOutput()), 
true, child));
+                            hasNewChildren = true;
+                        } else {
+                            newChildren.add(child);
+                        }
                     }
-
-                    List<Plan> newChildren = setOperation.children().stream()
-                            .map(child -> isAgg(child) ? child
-                                    : new 
LogicalAggregate<>(ImmutableList.copyOf(child.getOutput()), true, child))
-                            .collect(ImmutableList.toImmutableList());
-                    if (newChildren.equals(setOperation.children())) {
+                    if (!hasNewChildren) {
                         return null;
                     }
-                    return setOperation.withChildren(newChildren);
+                    return setOperation.withChildren(newChildren.build());
                 }).toRule(RuleType.INFER_SET_OPERATOR_DISTINCT);
     }
 
+    private boolean shouldInferDistinct(Plan child) {
+        return !isAgg(child) && rejectNLJ(child)
+                && shouldGenerateAggregateByNdv(child, child.getOutput());
+    }
+
     private boolean isAgg(Plan plan) {
         return plan instanceof LogicalAggregate || (plan instanceof 
LogicalProject && plan.child(
                 0) instanceof LogicalAggregate);
     }
+
+    // if children exist NLJ, we can't infer distinct
+    // because NLJ could generate bitmap runtime filter. and it will execute 
failed when we do infer distinct.
+    private boolean rejectNLJ(Plan plan) {
+        if (plan instanceof LogicalProject) {
+            plan = plan.child(0);
+        }
+        if (plan instanceof LogicalJoin) {
+            LogicalJoin<?, ?> join = (LogicalJoin<?, ?>) plan;
+            return join.getOtherJoinConjuncts().isEmpty();
+        }
+        return true;
+    }
+
+    private boolean shouldGenerateAggregateByNdv(Plan plan, List<? extends 
NamedExpression> groupKeys) {
+        Statistics stats = plan.getStats();
+        if (stats == null) {
+            stats = plan.accept(derive, new StatsDerive.DeriveContext());
+            if (stats == null) {
+                return false;
+            }
+        }
+        if (stats.getRowCount() <= 0) {
+            return false;
+        }
+
+        List<ColumnStatistic> lower = new ArrayList<>();
+        List<ColumnStatistic> medium = new ArrayList<>();
+        List<ColumnStatistic> high = new ArrayList<>();
+
+        List<ColumnStatistic>[] cards = new List[] { lower, medium, high };
+
+        for (NamedExpression key : groupKeys) {
+            ColumnStatistic colStats = 
ExpressionEstimation.INSTANCE.estimate(key, stats);
+            if (colStats.isUnKnown) {
+                return false;
+            }
+            if (stats.getRowCount() * 0.9 <= colStats.ndv) {

Review Comment:
   All judgments based on statistical information should be accompanied by 
explanatory notes to clarify the reasons.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to