>From <[email protected]>:

[email protected] has uploaded this change for review. ( 
https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/18231 )


Change subject: [ASTERIXDB-3376][COMP] Estimate number of distincts in join 
columns
......................................................................

[ASTERIXDB-3376][COMP] Estimate number of distincts in join columns

Change-Id: I270982f0cdb54ea7fbc9c63413bd73e2b132f7e4
---
M 
asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/Stats.java
1 file changed, 140 insertions(+), 3 deletions(-)



  git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb 
refs/changes/31/18231/1

diff --git 
a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/Stats.java
 
b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/Stats.java
index 63c67a5..6bf631e 100644
--- 
a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/Stats.java
+++ 
b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/Stats.java
@@ -53,6 +53,7 @@
 import org.apache.hyracks.algebricks.core.algebra.base.LogicalOperatorTag;
 import org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable;
 import 
org.apache.hyracks.algebricks.core.algebra.expressions.AbstractFunctionCallExpression;
+import 
org.apache.hyracks.algebricks.core.algebra.expressions.AbstractLogicalExpression;
 import 
org.apache.hyracks.algebricks.core.algebra.expressions.AggregateFunctionCallExpression;
 import 
org.apache.hyracks.algebricks.core.algebra.expressions.ConstantExpression;
 import 
org.apache.hyracks.algebricks.core.algebra.expressions.JoinProductivityAnnotation;
@@ -162,11 +163,18 @@
                 return productivity / card1;
             }
         } else {
+            ILogicalOperator Li1 = joinEnum.leafInputs.get(idx1);
+            List<List<IAObject>> result = 
runSamplingQueryDistinct(this.optCtx, Li1, exprUsedVars.get(1));
+            double estDistinctCardinalityFromSample = 
findPredicateCardinality(result, false);
+            double numDistincts = distinctEstimator2(Li1, 
estDistinctCardinalityFromSample);
+            return 1.0 / numDistincts; // this is the expected selectivity for 
joins.
+            /*
             if (card1 < card2) {
                 // we are assuming that the smaller side is the primary side 
and that the join is Pk-Fk join.
                 return 1.0 / card1;
             }
             return 1.0 / card2;
+             */
         }
     }

@@ -368,9 +376,13 @@
         while (currentOp != null && currentOp.getOperatorTag() != 
LogicalOperatorTag.EMPTYTUPLESOURCE) {
             if (currentOp.getOperatorTag().equals(LogicalOperatorTag.SELECT)) {
                 SelectOperator selOp = (SelectOperator) currentOp;
-                if (selOp != otherSelOp) {
-                    selExprs.add(selOp.getCondition().getValue());
+                if (otherSelOp == null) {
                     selOp.getCondition().setValue(ConstantExpression.TRUE);
+                } else {
+                    if (selOp != otherSelOp) {
+                        selExprs.add(selOp.getCondition().getValue());
+                        selOp.getCondition().setValue(ConstantExpression.TRUE);
+                    }
                 }
             }
             currentOp = currentOp.getInputs().get(0).getValue();
@@ -576,6 +588,81 @@
         return AnalysisUtil.runQuery(newAggOpRef, Arrays.asList(aggVar), 
newCtx, IRuleSetFactory.RuleSetKind.SAMPLING);
     }

+    protected List<List<IAObject>> 
runSamplingQueryDistinct(IOptimizationContext ctx, ILogicalOperator logOp,
+            LogicalVariable var) throws AlgebricksException {
+        LOGGER.info("***running sample query***");
+
+        IOptimizationContext newCtx = 
ctx.getOptimizationContextFactory().cloneOptimizationContext(ctx);
+
+        ILogicalOperator newLogOp = 
OperatorManipulationUtil.bottomUpCopyOperators(logOp);
+        storeSelectConditionsAndMakeThemTrue(newLogOp, null);
+
+        ILogicalOperator parent = 
joinEnum.findDataSourceScanOperatorParent(newLogOp);
+        DataSourceScanOperator scanOp = (DataSourceScanOperator) 
parent.getInputs().get(0).getValue();
+
+        if (scanOp == null) {
+            //return 1.0; // what happens to the cards and sizes then? this 
may happen in case of in lists
+        }
+
+        Index index = findSampleIndex(scanOp, optCtx);
+        if (index == null) {
+            //return 1.0;
+        }
+
+        Index.SampleIndexDetails idxDetails = (Index.SampleIndexDetails) 
index.getIndexDetails();
+        double origDatasetCard = idxDetails.getSourceCardinality();
+        double sampleCard = Math.min(idxDetails.getSampleCardinalityTarget(), 
origDatasetCard);
+        if (sampleCard == 0) {
+            sampleCard = 1;
+            IWarningCollector warningCollector = optCtx.getWarningCollector();
+            if (warningCollector.shouldWarn()) {
+                warningCollector.warn(Warning.of(scanOp.getSourceLocation(),
+                        
org.apache.asterix.common.exceptions.ErrorCode.SAMPLE_HAS_ZERO_ROWS));
+            }
+        }
+
+        // replace the dataScanSourceOperator with the sampling source
+        SampleDataSource sampledatasource = 
joinEnum.getSampleDataSource(scanOp);
+        DataSourceScanOperator deepCopyofScan =
+                (DataSourceScanOperator) 
OperatorManipulationUtil.bottomUpCopyOperators(scanOp);
+        deepCopyofScan.setDataSource(sampledatasource);
+        parent.getInputs().get(0).setValue(deepCopyofScan);
+
+        // end new code
+
+        // old code
+
+        List<Mutable<ILogicalExpression>> aggFunArgs = new ArrayList<>(1);
+        aggFunArgs.add(new MutableObject<>(ConstantExpression.TRUE));
+
+        AbstractLogicalExpression inputVarRef = new 
VariableReferenceExpression(var, newLogOp.getSourceLocation());
+        List<Mutable<ILogicalExpression>> fields = new ArrayList<>(1);
+        fields.add(new MutableObject<>(inputVarRef));
+
+        BuiltinFunctionInfo countFn = 
BuiltinFunctions.getBuiltinFunctionInfo(BuiltinFunctions.SQL_COUNT_DISTINCT);
+        AggregateFunctionCallExpression aggExpr = new 
AggregateFunctionCallExpression(countFn, false, fields);
+
+        List<Mutable<ILogicalExpression>> aggExprList = new ArrayList<>(1);
+        aggExprList.add(new MutableObject<>(aggExpr));
+
+        List<LogicalVariable> aggVarList = new ArrayList<>(1);
+        LogicalVariable aggVar = newCtx.newVar();
+        aggVarList.add(aggVar);
+
+        AggregateOperator newAggOp = new AggregateOperator(aggVarList, 
aggExprList);
+        newAggOp.getInputs().add(new MutableObject<>(newLogOp));
+
+        Mutable<ILogicalOperator> newAggOpRef = new MutableObject<>(newAggOp);
+
+        OperatorPropertiesUtil.typeOpRec(newAggOpRef, newCtx);
+        LOGGER.info("***returning from sample query***");
+
+        String viewInPlan = new ALogicalPlanImpl(newAggOpRef).toString(); 
//useful when debugging
+        LOGGER.trace("viewInPlan");
+        LOGGER.trace(viewInPlan);
+        return AnalysisUtil.runQuery(newAggOpRef, Arrays.asList(aggVar), 
newCtx, IRuleSetFactory.RuleSetKind.SAMPLING);
+    }
+
     // This one gets the cardinality and also projection sizes
     protected List<List<IAObject>> 
runSamplingQueryProjection(IOptimizationContext ctx, ILogicalOperator logOp,
             int dataset, LogicalVariable primaryKey) throws 
AlgebricksException {
@@ -778,6 +865,47 @@
         return Math.round(estDistCardinality);
     }

+    // Formula is d = D (1 - e^(-sampleCard/D))
+    double DistinctFormula(double sampleCard, double D) {
+        double a, b, c;
+
+        a = -sampleCard / D;
+        b = Math.exp(a);
+        c = 1.0 - b;
+        double x = D * c;
+        return x;
+    }
+
+    private double distinctEstimator2(ILogicalOperator logOp, double 
estDistinctCardinalityFromSample)
+            throws AlgebricksException {
+        // initialize the estimate to be the number of distinct values from 
the sample.
+
+        ILogicalOperator parent = 
joinEnum.findDataSourceScanOperatorParent(logOp);
+        DataSourceScanOperator scanOp = (DataSourceScanOperator) 
parent.getInputs().get(0).getValue();
+
+        Index index = findSampleIndex(scanOp, optCtx);
+        Index.SampleIndexDetails idxDetails = (Index.SampleIndexDetails) 
index.getIndexDetails();
+        double origDatasetCardinality = idxDetails.getSourceCardinality();
+        double sampleCard = idxDetails.getSampleCardinalityTarget();
+
+        double D, Dmin, Dmax;
+
+        Dmin = 1;
+        Dmax = origDatasetCardinality;
+        D = estDistinctCardinalityFromSample;
+        int i = 0;
+        while ((Dmin < Dmax) && i < 1000) { // just being very cautious to 
avoid infinite loops.
+            i++;
+            D = (Dmax + Dmin) / 2.0;
+            double x = DistinctFormula(sampleCard, D);
+            if (x < estDistinctCardinalityFromSample)
+                Dmin = D + 1;
+            else
+                Dmax = D - 1;
+        }
+        return D;
+    }
+
     // Use the Newton-Raphson method for distinct cardinality estimation.
     private double distinctEstimator(double estDistinctCardinalityFromSample, 
double origDatasetCardinality) {
         // initialize the estimate to be the number of distinct values from 
the sample.
@@ -858,4 +986,4 @@
         }
         return null; // no SelectOp in the query tree
     }
-}
\ No newline at end of file
+}

--
To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/18231
To unsubscribe, or for help writing mail filters, visit 
https://asterix-gerrit.ics.uci.edu/settings

Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Change-Id: I270982f0cdb54ea7fbc9c63413bd73e2b132f7e4
Gerrit-Change-Number: 18231
Gerrit-PatchSet: 1
Gerrit-Owner: [email protected]
Gerrit-MessageType: newchange

Reply via email to