>From Vijay Sarathy <[email protected]>:

Vijay Sarathy has uploaded this change for review. ( 
https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17765 )


Change subject: [ASTERIXDB-3246][COMP]: CBO costing of all physical operators 
in a query plan
......................................................................

[ASTERIXDB-3246][COMP]: CBO costing of all physical operators in a query plan

Change-Id: I3196f664d716bb5b3806ec9a5a0dd5c1ea51ff62
---
M 
asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/SetAsterixPhysicalOperatorsRule.java
M 
asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/cost/CostMethods.java
M 
asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/EstimatedCostComputationVisitor.java
M 
asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/Stats.java
M 
hyracks-fullstack/algebricks/algebricks-rewriter/src/main/java/org/apache/hyracks/algebricks/rewriter/rules/SetAlgebricksPhysicalOperatorsRule.java
M 
asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/JoinEnum.java
M 
asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/EnumerateJoinsRule.java
M 
asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/cost/ICostMethods.java
8 files changed, 340 insertions(+), 24 deletions(-)



  git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb 
refs/changes/65/17765/1

diff --git 
a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/cost/CostMethods.java
 
b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/cost/CostMethods.java
index 887cc94..48f44af 100644
--- 
a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/cost/CostMethods.java
+++ 
b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/cost/CostMethods.java
@@ -22,6 +22,9 @@
 import org.apache.asterix.metadata.declared.MetadataProvider;
 import org.apache.asterix.optimizer.rules.cbo.JoinNode;
 import org.apache.hyracks.algebricks.core.algebra.base.IOptimizationContext;
+import 
org.apache.hyracks.algebricks.core.algebra.operators.logical.GroupByOperator;
+import 
org.apache.hyracks.algebricks.core.algebra.operators.logical.LimitOperator;
+import 
org.apache.hyracks.algebricks.core.algebra.operators.logical.OrderOperator;
 import 
org.apache.hyracks.algebricks.core.rewriter.base.PhysicalOptimizationConfig;

 public class CostMethods implements ICostMethods {
@@ -30,14 +33,18 @@
     protected PhysicalOptimizationConfig physOptConfig;
     protected long blockSize;
     protected long DOP;
-    protected double maxMemorySize;
+    protected double maxMemorySizeForJoin;
+    protected double maxMemorySizeForGroup;
+    protected double maxMemorySizeForSort;

     public CostMethods(IOptimizationContext context) {
         optCtx = context;
         physOptConfig = context.getPhysicalOptimizationConfig();
         blockSize = getBufferCachePageSize();
         DOP = getDOP();
-        maxMemorySize = getMaxMemorySize();
+        maxMemorySizeForJoin = getMaxMemorySizeForJoin();
+        maxMemorySizeForGroup = getMaxMemorySizeForGroup();
+        maxMemorySizeForJoin = getMaxMemorySizeForSort();
     }

     private long getBufferCacheSize() {
@@ -54,10 +61,18 @@
         return optCtx.getComputationNodeDomain().cardinality();
     }

-    public double getMaxMemorySize() {
+    public double getMaxMemorySizeForJoin() {
         return physOptConfig.getMaxFramesForJoin() * 
physOptConfig.getFrameSize();
     }

+    public double getMaxMemorySizeForGroup() {
+        return physOptConfig.getMaxFramesForGroupBy() * 
physOptConfig.getFrameSize();
+    }
+
+    public double getMaxMemorySizeForSort() {
+        return physOptConfig.getMaxFramesExternalSort() * 
physOptConfig.getFrameSize();
+    }
+
     // These cost methods are very simple and rudimentary for now. These can 
be improved by asterixdb developers as needed.
     public Cost costFullScan(JoinNode jn) {
         return new Cost(jn.computeJoinCardinality());
@@ -119,4 +134,20 @@
         JoinNode rightJn = jn.getRightJn();
         return new Cost(DOP * rightJn.computeJoinCardinality());
     }
+
+    public Cost costHashGroupBy(GroupByOperator groupByOperator) {
+        return new Cost(100.0);
+    }
+
+    public Cost costSortGroupBy(GroupByOperator groupByOperator) {
+        return new Cost(200.0);
+    }
+
+    public Cost costOrderBy(OrderOperator orderOp) {
+        return null;
+    }
+
+    public Cost costLimit(LimitOperator limitOp) {
+        return null;
+    }
 }
diff --git 
a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/cost/ICostMethods.java
 
b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/cost/ICostMethods.java
index ef4af41..2431dc1 100644
--- 
a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/cost/ICostMethods.java
+++ 
b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/cost/ICostMethods.java
@@ -20,6 +20,9 @@
 package org.apache.asterix.optimizer.cost;

 import org.apache.asterix.optimizer.rules.cbo.JoinNode;
+import 
org.apache.hyracks.algebricks.core.algebra.operators.logical.GroupByOperator;
+import 
org.apache.hyracks.algebricks.core.algebra.operators.logical.LimitOperator;
+import 
org.apache.hyracks.algebricks.core.algebra.operators.logical.OrderOperator;

 public interface ICostMethods {
     Cost costFullScan(JoinNode jn);
@@ -33,4 +36,12 @@
     Cost costIndexNLJoin(JoinNode currentJn);

     Cost costCartesianProductJoin(JoinNode currentJn);
+
+    Cost costHashGroupBy(GroupByOperator groupByOperator);
+
+    Cost costSortGroupBy(GroupByOperator groupByOperator);
+
+    Cost costOrderBy(OrderOperator orderOp);
+
+    Cost costLimit(LimitOperator limitOp);
 }
diff --git 
a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/SetAsterixPhysicalOperatorsRule.java
 
b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/SetAsterixPhysicalOperatorsRule.java
index 5ec1a7f..2377c6e 100644
--- 
a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/SetAsterixPhysicalOperatorsRule.java
+++ 
b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/SetAsterixPhysicalOperatorsRule.java
@@ -36,6 +36,10 @@
 import org.apache.asterix.metadata.functions.ExternalFunctionCompilerUtil;
 import org.apache.asterix.om.functions.BuiltinFunctions;
 import org.apache.asterix.optimizer.base.AnalysisUtil;
+import org.apache.asterix.optimizer.cost.Cost;
+import org.apache.asterix.optimizer.cost.CostMethods;
+import org.apache.asterix.optimizer.cost.ICost;
+import org.apache.asterix.optimizer.cost.ICostMethods;
 import org.apache.asterix.optimizer.rules.am.AccessMethodJobGenParams;
 import org.apache.asterix.optimizer.rules.am.BTreeJobGenParams;
 import org.apache.asterix.optimizer.rules.util.AsterixJoinUtils;
@@ -50,6 +54,7 @@
 import org.apache.hyracks.algebricks.core.algebra.base.LogicalExpressionTag;
 import org.apache.hyracks.algebricks.core.algebra.base.LogicalOperatorTag;
 import org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable;
+import org.apache.hyracks.algebricks.core.algebra.base.OperatorAnnotations;
 import 
org.apache.hyracks.algebricks.core.algebra.expressions.AbstractFunctionCallExpression;
 import 
org.apache.hyracks.algebricks.core.algebra.expressions.AggregateFunctionCallExpression;
 import 
org.apache.hyracks.algebricks.core.algebra.expressions.IMergeAggregationExpressionFactory;
@@ -73,7 +78,7 @@
 import 
org.apache.hyracks.algebricks.core.algebra.visitors.ILogicalOperatorVisitor;
 import 
org.apache.hyracks.algebricks.rewriter.rules.SetAlgebricksPhysicalOperatorsRule;

-public final class SetAsterixPhysicalOperatorsRule extends 
SetAlgebricksPhysicalOperatorsRule {
+public class SetAsterixPhysicalOperatorsRule extends 
SetAlgebricksPhysicalOperatorsRule {

     // Disable ASSIGN_BATCH physical operator if this option is set to 'false'
     public static final String REWRITE_ATTEMPT_BATCH_ASSIGN = 
"rewrite_attempt_batch_assign";
@@ -90,13 +95,32 @@
         return 
metadataProvider.getBooleanProperty(REWRITE_ATTEMPT_BATCH_ASSIGN, 
REWRITE_ATTEMPT_BATCH_ASSIGN_DEFAULT);
     }

-    private static class AsterixPhysicalOperatorFactoryVisitor extends 
AlgebricksPhysicalOperatorFactoryVisitor {
+    protected static class AsterixPhysicalOperatorFactoryVisitor extends 
AlgebricksPhysicalOperatorFactoryVisitor {

         private final boolean isBatchAssignEnabled;
+        protected ICost cost;
+        protected ICostMethods costMethods;

-        private AsterixPhysicalOperatorFactoryVisitor(IOptimizationContext 
context) {
+        protected AsterixPhysicalOperatorFactoryVisitor(IOptimizationContext 
context) {
             super(context);
             isBatchAssignEnabled = isBatchAssignEnabled(context);
+            cost = new Cost();
+            costMethods = new CostMethods(context);
+        }
+
+        protected Enum groupByAlgorithm(GroupByOperator gby, Boolean 
topLevelOp) {
+            ICost costHashGroupBy = costMethods.costHashGroupBy(gby);
+            ICost costSortGroupBy = costMethods.costSortGroupBy(gby);
+            if (gby.getNestedPlans().size() == 1 && 
gby.getNestedPlans().get(0).getRoots().size() == 1) {
+                if (costHashGroupBy.costLE(costSortGroupBy)) {
+                    gby.getAnnotations().put(OperatorAnnotations.OP_COST_LOCAL,
+                            (double) 
Math.round(costHashGroupBy.computeTotalCost() * 100) / 100);
+                    return GroupByAlgorithm.HASH_GROUP_BY;
+                }
+            }
+            gby.getAnnotations().put(OperatorAnnotations.OP_COST_LOCAL,
+                    (double) Math.round(costSortGroupBy.computeTotalCost() * 
100) / 100);
+            return GroupByAlgorithm.SORT_GROUP_BY;
         }

         @Override
diff --git 
a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/EnumerateJoinsRule.java
 
b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/EnumerateJoinsRule.java
index 4c9a10e..4989f53 100644
--- 
a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/EnumerateJoinsRule.java
+++ 
b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/EnumerateJoinsRule.java
@@ -49,13 +49,16 @@
 import 
org.apache.hyracks.algebricks.core.algebra.expressions.ConstantExpression;
 import 
org.apache.hyracks.algebricks.core.algebra.expressions.HashJoinExpressionAnnotation;
 import 
org.apache.hyracks.algebricks.core.algebra.expressions.IExpressionAnnotation;
+import 
org.apache.hyracks.algebricks.core.algebra.expressions.VariableReferenceExpression;
 import 
org.apache.hyracks.algebricks.core.algebra.functions.AlgebricksBuiltinFunctions;
 import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier;
 import 
org.apache.hyracks.algebricks.core.algebra.operators.logical.AbstractBinaryJoinOperator;
 import 
org.apache.hyracks.algebricks.core.algebra.operators.logical.AbstractLogicalOperator;
 import 
org.apache.hyracks.algebricks.core.algebra.operators.logical.AssignOperator;
 import 
org.apache.hyracks.algebricks.core.algebra.operators.logical.DataSourceScanOperator;
+import 
org.apache.hyracks.algebricks.core.algebra.operators.logical.DistinctOperator;
 import 
org.apache.hyracks.algebricks.core.algebra.operators.logical.EmptyTupleSourceOperator;
+import 
org.apache.hyracks.algebricks.core.algebra.operators.logical.GroupByOperator;
 import 
org.apache.hyracks.algebricks.core.algebra.operators.logical.SelectOperator;
 import 
org.apache.hyracks.algebricks.core.algebra.operators.logical.visitors.VariableUtilities;
 import org.apache.hyracks.algebricks.core.algebra.plan.ALogicalPlanImpl;
@@ -64,6 +67,7 @@
 import org.apache.hyracks.algebricks.core.rewriter.base.IAlgebraicRewriteRule;
 import 
org.apache.hyracks.algebricks.core.rewriter.base.PhysicalOptimizationConfig;
 import org.apache.hyracks.api.exceptions.IWarningCollector;
+import org.apache.hyracks.api.exceptions.SourceLocation;
 import org.apache.hyracks.api.exceptions.Warning;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
@@ -84,6 +88,9 @@
     List<AssignOperator> assignOps;
     List<ILogicalExpression> assignJoinExprs; // These are the join 
expressions below the assign operator.

+    // The Distinct operators for each Select or DataSourceScan operator (if 
applicable)
+    HashMap<DataSourceScanOperator, ILogicalOperator> 
dataScanOrSelectAndDistinctOps;
+
     public EnumerateJoinsRule(JoinEnum joinEnum) {
         this.joinEnum = joinEnum;
     }
@@ -118,6 +125,13 @@
             return false;
         }

+        // The Distinct operators for each Select or DataSourceScan operator 
(if applicable)
+        dataScanOrSelectAndDistinctOps = new HashMap<>();
+        // If cboMode or cboTestMode is true, identify each DistinctOp or 
GroupByOp for the corresponding DataScanOp
+        if (op.getOperatorTag() == LogicalOperatorTag.DISTRIBUTE_RESULT) {
+            getDistinctOpsForJoinNodes(op);
+        }
+
         // if this join has already been seen before, no need to apply the 
rule again
         if (context.checkIfInDontApplySet(this, op)) {
             return false;
@@ -132,7 +146,6 @@
         assignOps = new ArrayList<>();
         assignJoinExprs = new ArrayList<>();
         buildSets = new ArrayList<>();
-
         IPlanPrettyPrinter pp = context.getPrettyPrinter();
         printPlan(pp, (AbstractLogicalOperator) op, "Original Whole plan1");
         leafInputNumber = 0;
@@ -158,7 +171,8 @@
             // we need to build the smaller sets first. So we need to find 
these first.
         }
         joinEnum.initEnum((AbstractLogicalOperator) op, cboMode, cboTestMode, 
numberOfFromTerms, leafInputs, allJoinOps,
-                assignOps, outerJoinsDependencyList, buildSets, 
varLeafInputIds, context);
+                assignOps, outerJoinsDependencyList, buildSets, 
varLeafInputIds, dataScanOrSelectAndDistinctOps,
+                context);

         if (cboMode) {
             if (!doAllDataSourcesHaveSamples(leafInputs, context)) {
@@ -388,6 +402,147 @@
         }
     }

+    private List<LogicalVariable> getGroupByDistinctVarList(ILogicalOperator 
grpByDistinctOp) {
+        List<LogicalVariable> distinctVars = new ArrayList<>();
+        ILogicalExpression varRef;
+        ILogicalOperator nextOp;
+        if (grpByDistinctOp.getOperatorTag() == LogicalOperatorTag.DISTINCT) {
+            nextOp = grpByDistinctOp.getInputs().get(0).getValue();
+            if (nextOp.getOperatorTag() == LogicalOperatorTag.ASSIGN) {
+                List<Mutable<ILogicalExpression>> argList =
+                        ((AbstractFunctionCallExpression) ((AssignOperator) 
nextOp).getExpressions().get(0).getValue())
+                                .getArguments();
+                for (int i = 0; i < argList.size(); i += 2) {
+                    varRef = argList.get(i + 1).getValue();
+                    if (varRef.getExpressionTag() == 
LogicalExpressionTag.VARIABLE) {
+                        distinctVars.add(((VariableReferenceExpression) 
varRef).getVariableReference());
+                    }
+                }
+            }
+        } else if (grpByDistinctOp.getOperatorTag() == 
LogicalOperatorTag.GROUP) {
+            distinctVars = ((GroupByOperator) 
grpByDistinctOp).getGroupByVarList();
+        }
+        return distinctVars;
+    }
+
+    private boolean 
containsAllGroupByDistinctVarsInScanOp(DataSourceScanOperator scanOp,
+            ILogicalOperator grpByDistinctOp) {
+        LogicalOperatorTag tag = grpByDistinctOp.getOperatorTag();
+        if (tag == LogicalOperatorTag.GROUP || tag == 
LogicalOperatorTag.DISTINCT) {
+            List<LogicalVariable> distinctVars = 
getGroupByDistinctVarList(grpByDistinctOp);
+            if (distinctVars.size() == 0) {
+                return false; // no Variable expressions in DistinctOp or 
GroupByOp
+            }
+            List<LogicalVariable> scanVars = scanOp.getVariables();
+            List<LogicalVariable> foundDistinctVars = new ArrayList<>();
+            for (LogicalVariable scanVar : scanVars) {
+                if (distinctVars.contains(scanVar)) {
+                    foundDistinctVars.add(scanVar);
+                }
+            }
+            // discarding the variable for Dataset name or alias from scanOp
+            return ((scanVars.size() - 1) == foundDistinctVars.size());
+        }
+        return false;
+    }
+
+    private void getDistinctOpsForJoinNodes(ILogicalOperator op) {
+        if (op.getOperatorTag() != LogicalOperatorTag.DISTRIBUTE_RESULT) {
+            return;
+        }
+        ILogicalOperator grpByDistinctOp = null; // null indicates no 
DistinctOp or GroupByOp
+        DataSourceScanOperator scanOp;
+        while (true) {
+            LogicalOperatorTag tag = op.getOperatorTag();
+            if (tag == LogicalOperatorTag.DISTINCT || tag == 
LogicalOperatorTag.GROUP) {
+                grpByDistinctOp = op; // GroupByOp Variable expressions (if 
any) take over DistinctOp ones
+            } else if (tag == LogicalOperatorTag.INNERJOIN || tag == 
LogicalOperatorTag.LEFTOUTERJOIN) {
+                if (grpByDistinctOp != null) {
+                    for (int i = 0; i < op.getInputs().size(); i++) {
+                        ILogicalOperator nextOp = 
op.getInputs().get(i).getValue();
+                        getDistinctOpsForJoinNodes(nextOp, grpByDistinctOp);
+                    }
+                }
+                return;
+            } else if (tag == LogicalOperatorTag.DATASOURCESCAN) { // single 
table queries
+                scanOp = (DataSourceScanOperator) op;
+                if (grpByDistinctOp != null) {
+                    if (!containsAllGroupByDistinctVarsInScanOp(scanOp, 
grpByDistinctOp)) {
+                        // doesn't contain all PK attribute(s), so CBO can get 
estimated cardinality from samples
+                        dataScanOrSelectAndDistinctOps.put(scanOp, 
grpByDistinctOp);
+                        //                        String viewInPlan = new 
ALogicalPlanImpl(new MutableObject<>(grpByDistinctOp)).toString(); //useful 
when debugging
+                    }
+                }
+                return;
+            }
+            op = op.getInputs().get(0).getValue();
+            if (op.getOperatorTag() == LogicalOperatorTag.EMPTYTUPLESOURCE) {
+                return; // if this happens, there is nothing we can do in CBO 
code since there is no DataSourceScan
+            }
+        }
+    }
+
+    private void getDistinctOpsForJoinNodes(ILogicalOperator op, 
ILogicalOperator grpByDistinctOp) {
+        List<LogicalVariable> foundDistinctVars = new ArrayList<>();
+        ILogicalOperator selOp = null, assignOp = null;
+
+        LogicalOperatorTag tag = op.getOperatorTag();
+        // add DistinctOp to count distinct values in an attribute (except PK 
attribute(s))
+        if (tag == LogicalOperatorTag.ASSIGN || tag == 
LogicalOperatorTag.SELECT) {
+            List<LogicalVariable> distinctVars = 
getGroupByDistinctVarList(grpByDistinctOp);
+            if (distinctVars.size() == 0) {
+                return; // no Variable expressions in DistinctOp or GroupByOp
+            }
+
+            DataSourceScanOperator scanOp = null;
+            LogicalVariable assignVar;
+            while (tag != LogicalOperatorTag.EMPTYTUPLESOURCE) {
+                if (tag == LogicalOperatorTag.SELECT) {
+                    selOp = op;
+                } else if (tag == LogicalOperatorTag.ASSIGN) {
+                    assignVar = ((AssignOperator) op).getVariables().get(0);
+                    int idx = distinctVars.indexOf(assignVar);
+                    if (idx != -1 && assignOp == null) { // first 
corresponding AssignOp found
+                        assignOp = op;
+                    }
+                    if (idx != -1) { // add all Variable expressions of the 
DataSourceScanOp
+                        foundDistinctVars.add(assignVar);
+                    }
+                } else if (tag == LogicalOperatorTag.DATASOURCESCAN) {
+                    scanOp = (DataSourceScanOperator) op;
+                }
+                op = op.getInputs().get(0).getValue();
+                tag = op.getOperatorTag();
+            }
+
+            if (assignOp != null && scanOp != null) {
+                SourceLocation sourceLocation =
+                        (selOp == null) ? assignOp.getSourceLocation() : 
selOp.getSourceLocation();
+                ILogicalOperator inputOp = (selOp == null) ? assignOp : selOp;
+                List<Mutable<ILogicalExpression>> distinctExpr = new 
ArrayList<>();
+                for (LogicalVariable var : foundDistinctVars) {
+                    VariableReferenceExpression varExpr = new 
VariableReferenceExpression(var);
+                    varExpr.setSourceLocation(sourceLocation);
+                    Mutable<ILogicalExpression> vRef = new 
MutableObject<>(varExpr);
+                    distinctExpr.add(vRef);
+                }
+
+                // create a Distinct operator
+                DistinctOperator distinctOp = new 
DistinctOperator(distinctExpr);
+                distinctOp.setSourceLocation(sourceLocation);
+                distinctOp.getInputs().add(new MutableObject<>(inputOp));
+                distinctOp.setExecutionMode(inputOp.getExecutionMode());
+                dataScanOrSelectAndDistinctOps.put(scanOp, distinctOp);
+                //                String viewInPlan = new ALogicalPlanImpl(new 
MutableObject<>(distinctOp)).toString(); //useful when debugging
+            }
+        } else if (tag == LogicalOperatorTag.INNERJOIN || tag == 
LogicalOperatorTag.LEFTOUTERJOIN) {
+            for (int i = 0; i < op.getInputs().size(); i++) {
+                ILogicalOperator nextOp = op.getInputs().get(i).getValue();
+                getDistinctOpsForJoinNodes(nextOp, grpByDistinctOp);
+            }
+        }
+    }
+
     private int getLeafInputId(LogicalVariable lv) {
         if (varLeafInputIds.containsKey(lv))
             return varLeafInputIds.get(lv);
diff --git 
a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/EstimatedCostComputationVisitor.java
 
b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/EstimatedCostComputationVisitor.java
index 4d1e0fb..57abe54 100644
--- 
a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/EstimatedCostComputationVisitor.java
+++ 
b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/EstimatedCostComputationVisitor.java
@@ -91,8 +91,23 @@

     @Override
     public Pair<Double, Double> visitGroupByOperator(GroupByOperator op, 
Double arg) throws AlgebricksException {
-        // Needs more work in the cardinality estimation code to estimate 
group by cardinality and cost.
-        return annotate(this, op, arg);
+
+        double grpByCost = 0.0;
+        Pair<Double, Double> cardCost = 
op.getInputs().get(0).getValue().accept(this, arg);
+
+        for (Map.Entry<String, Object> anno : op.getAnnotations().entrySet()) {
+            if (anno.getValue() != null && 
anno.getKey().equals(OperatorAnnotations.OP_OUTPUT_CARDINALITY)) {
+                cardCost.setFirst((Double) anno.getValue());
+            }
+            if (anno.getValue() != null && 
anno.getKey().equals(OperatorAnnotations.OP_COST_LOCAL)) {
+                grpByCost = (double) anno.getValue();
+            }
+        }
+        double totalCost = cardCost.getSecond() + grpByCost;
+        op.getAnnotations().put(OperatorAnnotations.OP_COST_TOTAL, totalCost);
+        cardCost.setSecond(totalCost);
+
+        return cardCost;
     }

     @Override
diff --git 
a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/JoinEnum.java
 
b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/JoinEnum.java
index 3ce4e9f..bfbe66f 100644
--- 
a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/JoinEnum.java
+++ 
b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/JoinEnum.java
@@ -60,6 +60,7 @@
 import org.apache.hyracks.algebricks.core.algebra.base.LogicalExpressionTag;
 import org.apache.hyracks.algebricks.core.algebra.base.LogicalOperatorTag;
 import org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable;
+import org.apache.hyracks.algebricks.core.algebra.base.OperatorAnnotations;
 import 
org.apache.hyracks.algebricks.core.algebra.expressions.AbstractFunctionCallExpression;
 import 
org.apache.hyracks.algebricks.core.algebra.expressions.BroadcastExpressionAnnotation;
 import 
org.apache.hyracks.algebricks.core.algebra.expressions.ConstantExpression;
@@ -103,6 +104,7 @@
     protected JoinNode[] jnArray; // array of all join nodes
     protected int jnArraySize;
     protected List<ILogicalOperator> leafInputs;
+    protected HashMap<DataSourceScanOperator, ILogicalOperator> 
dataScanOrSelectAndDistinctOps;
     protected List<ILogicalExpression> singleDatasetPreds;
     protected List<AssignOperator> assignOps;
     List<Quadruple<Integer, Integer, JoinOperator, Integer>> 
outerJoinsDependencyList;
@@ -135,6 +137,7 @@
             List<ILogicalOperator> leafInputs, List<JoinOperator> allJoinOps, 
List<AssignOperator> assignOps,
             List<Quadruple<Integer, Integer, JoinOperator, Integer>> 
outerJoinsDependencyList,
             List<Triple<Integer, Integer, Boolean>> buildSets, 
HashMap<LogicalVariable, Integer> varLeafInputIds,
+            HashMap<DataSourceScanOperator, ILogicalOperator> 
dataScanOrSelectAndDistinctOps,
             IOptimizationContext context) throws AsterixException {
         this.singleDatasetPreds = new ArrayList<>();
         this.joinConditions = new ArrayList<>();
@@ -154,6 +157,7 @@
         this.allJoinOps = allJoinOps;
         this.buildSets = buildSets;
         this.varLeafInputIds = varLeafInputIds;
+        this.dataScanOrSelectAndDistinctOps = dataScanOrSelectAndDistinctOps;
         this.op = op;
         this.forceJoinOrderMode = getForceJoinOrderMode(context);
         this.queryPlanShape = getQueryPlanShape(context);
@@ -874,6 +878,16 @@
                     // now switch the input back.
                     parent.getInputs().get(0).setValue(scanOp);
                     jn.setCardinality(finalDatasetCard);
+
+                    ILogicalOperator grpByDistinctOp = 
this.dataScanOrSelectAndDistinctOps.get(scanOp);
+                    // This needs to be scaled up, Mehnaz will fill in correct 
value.
+                    double distinctCardinality =
+                            (grpByDistinctOp != null) ? 
stats.findDistinctValuesCardinality(grpByDistinctOp) : 0.0;
+
+                    
grpByDistinctOp.getAnnotations().put(OperatorAnnotations.OP_OUTPUT_CARDINALITY,
+                            (double) Math.round(distinctCardinality * 100) / 
100);
+                    
grpByDistinctOp.getAnnotations().put(OperatorAnnotations.OP_INPUT_CARDINALITY,
+                            (double) Math.round(finalDatasetCard * 100) / 100);
                 }
             }
             dataScanPlan = jn.addSingleDatasetPlans();
diff --git 
a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/Stats.java
 
b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/Stats.java
index d09783d..b0a9e71 100644
--- 
a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/Stats.java
+++ 
b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/Stats.java
@@ -523,6 +523,52 @@
     private void transformtoAnyInPlan(SelectOperator newSelOp) {
     }

+    protected double findDistinctValuesCardinality(ILogicalOperator 
grpByDistinctOp) throws AlgebricksException {
+        ILogicalOperator parent = 
joinEnum.findDataSourceScanOperatorParent(grpByDistinctOp);
+        DataSourceScanOperator scanOp = (DataSourceScanOperator) 
parent.getInputs().get(0).getValue();
+
+        if (scanOp == null) {
+            return 1.0; // what happens to the cards and sizes then? this may 
happen in case of in lists
+        }
+
+        Index index = findSampleIndex(scanOp, optCtx);
+        if (index == null) {
+            return 1.0;
+        }
+
+        Index.SampleIndexDetails idxDetails = (Index.SampleIndexDetails) 
index.getIndexDetails();
+        double origDatasetCard = idxDetails.getSourceCardinality();
+        // origDatasetCard must be equal to datasetCard. So we do not need 
datasetCard passed in here. VIJAY check if
+        // this parameter can be removed.
+        double sampleCard = Math.min(idxDetails.getSampleCardinalityTarget(), 
origDatasetCard);
+        if (sampleCard == 0) {
+            sampleCard = 1;
+            IWarningCollector warningCollector = optCtx.getWarningCollector();
+            if (warningCollector.shouldWarn()) {
+                warningCollector.warn(Warning.of(scanOp.getSourceLocation(),
+                        
org.apache.asterix.common.exceptions.ErrorCode.SAMPLE_HAS_ZERO_ROWS));
+            }
+        }
+
+        // replace the DataSourceScanOp with the sampling source
+        SampleDataSource sampledatasource = 
joinEnum.getSampleDataSource(scanOp);
+        DataSourceScanOperator deepCopyOfScan =
+                (DataSourceScanOperator) 
OperatorManipulationUtil.bottomUpCopyOperators(scanOp);
+        deepCopyOfScan.setDataSource(sampledatasource);
+
+        // insert this in place of the DataSourceScanOp.
+        parent.getInputs().get(0).setValue(deepCopyOfScan);
+
+        List<List<IAObject>> result = runSamplingQuery(optCtx, 
grpByDistinctOp);
+        double cardinality = ((double) ((AInt64) 
result.get(0).get(0)).getLongValue());
+        // TODO: Do cardinality estimation (Jackknife maybe?)
+
+        // switch  the scanOp back
+        parent.getInputs().get(0).setValue(scanOp);
+
+        return cardinality;
+    }
+
     protected List<List<IAObject>> runSamplingQuery(IOptimizationContext ctx, 
ILogicalOperator logOp)
             throws AlgebricksException {
         LOGGER.info("***running sample query***");
diff --git 
a/hyracks-fullstack/algebricks/algebricks-rewriter/src/main/java/org/apache/hyracks/algebricks/rewriter/rules/SetAlgebricksPhysicalOperatorsRule.java
 
b/hyracks-fullstack/algebricks/algebricks-rewriter/src/main/java/org/apache/hyracks/algebricks/rewriter/rules/SetAlgebricksPhysicalOperatorsRule.java
index 2784a6a..a81527f 100644
--- 
a/hyracks-fullstack/algebricks/algebricks-rewriter/src/main/java/org/apache/hyracks/algebricks/rewriter/rules/SetAlgebricksPhysicalOperatorsRule.java
+++ 
b/hyracks-fullstack/algebricks/algebricks-rewriter/src/main/java/org/apache/hyracks/algebricks/rewriter/rules/SetAlgebricksPhysicalOperatorsRule.java
@@ -127,6 +127,11 @@

 public class SetAlgebricksPhysicalOperatorsRule implements 
IAlgebraicRewriteRule {

+    protected enum GroupByAlgorithm {
+        HASH_GROUP_BY,
+        SORT_GROUP_BY
+    }
+
     @Override
     public boolean rewritePost(Mutable<ILogicalOperator> opRef, 
IOptimizationContext context)
             throws AlgebricksException {
@@ -144,7 +149,7 @@
         return true;
     }

-    private static void computeDefaultPhysicalOp(AbstractLogicalOperator op, 
boolean topLevelOp,
+    protected static void computeDefaultPhysicalOp(AbstractLogicalOperator op, 
boolean topLevelOp,
             ILogicalOperatorVisitor<IPhysicalOperator, Boolean> physOpFactory) 
throws AlgebricksException {
         if (op.getPhysicalOperator() == null) {
             IPhysicalOperator physOp = op.accept(physOpFactory, topLevelOp);
@@ -213,19 +218,15 @@
                 throws AlgebricksException {

             ensureAllVariables(gby.getGroupByList(), Pair::getSecond);
-
-            if (gby.getNestedPlans().size() == 1 && 
gby.getNestedPlans().get(0).getRoots().size() == 1) {
-                if (topLevelOp && 
((gby.getAnnotations().get(OperatorAnnotations.USE_HASH_GROUP_BY) == 
Boolean.TRUE)
-                        || 
(gby.getAnnotations().get(OperatorAnnotations.USE_EXTERNAL_GROUP_BY) == 
Boolean.TRUE))) {
-                    ExternalGroupByPOperator extGby = 
createExternalGroupByPOperator(gby);
-                    if (extGby != null) {
-                        return extGby;
-                    } else if (gby.getSourceLocation() != null) {
-                        IWarningCollector warningCollector = 
context.getWarningCollector();
-                        if (warningCollector.shouldWarn()) {
-                            
warningCollector.warn(Warning.of(gby.getSourceLocation(), 
ErrorCode.INAPPLICABLE_HINT,
-                                    "Group By", "hash"));
-                        }
+            if (groupByAlgorithm(gby, topLevelOp) == 
GroupByAlgorithm.HASH_GROUP_BY) {
+                ExternalGroupByPOperator extGby = 
createExternalGroupByPOperator(gby);
+                if (extGby != null) {
+                    return extGby;
+                } else if (gby.getSourceLocation() != null) {
+                    IWarningCollector warningCollector = 
context.getWarningCollector();
+                    if (warningCollector.shouldWarn()) {
+                        warningCollector.warn(
+                                Warning.of(gby.getSourceLocation(), 
ErrorCode.INAPPLICABLE_HINT, "Group By", "hash"));
                     }
                 }
             }
@@ -237,6 +238,16 @@
             }
         }

+        protected Enum groupByAlgorithm(GroupByOperator gby, Boolean 
topLevelOp) {
+            if (gby.getNestedPlans().size() == 1 && 
gby.getNestedPlans().get(0).getRoots().size() == 1) {
+                if (topLevelOp && 
((gby.getAnnotations().get(OperatorAnnotations.USE_HASH_GROUP_BY) == 
Boolean.TRUE)
+                        || 
(gby.getAnnotations().get(OperatorAnnotations.USE_EXTERNAL_GROUP_BY) == 
Boolean.TRUE))) {
+                    return GroupByAlgorithm.HASH_GROUP_BY;
+                }
+            }
+            return GroupByAlgorithm.SORT_GROUP_BY;
+        }
+
         protected ExternalGroupByPOperator 
createExternalGroupByPOperator(GroupByOperator gby)
                 throws AlgebricksException {
             boolean hasIntermediateAgg = 
generateMergeAggregationExpressions(gby);

--
To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17765
To unsubscribe, or for help writing mail filters, visit 
https://asterix-gerrit.ics.uci.edu/settings

Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Change-Id: I3196f664d716bb5b3806ec9a5a0dd5c1ea51ff62
Gerrit-Change-Number: 17765
Gerrit-PatchSet: 1
Gerrit-Owner: Vijay Sarathy <[email protected]>
Gerrit-MessageType: newchange

Reply via email to