>From <[email protected]>:
[email protected] has uploaded this change for review. (
https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/18231 )
Change subject: [ASTERIXDB-3376][COMP] Estimate number of distincts in join
columns
......................................................................
[ASTERIXDB-3376][COMP] Estimate number of distincts in join columns
Change-Id: I270982f0cdb54ea7fbc9c63413bd73e2b132f7e4
---
M
asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/Stats.java
1 file changed, 140 insertions(+), 3 deletions(-)
git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb
refs/changes/31/18231/1
diff --git
a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/Stats.java
b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/Stats.java
index 63c67a5..6bf631e 100644
---
a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/Stats.java
+++
b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/cbo/Stats.java
@@ -53,6 +53,7 @@
import org.apache.hyracks.algebricks.core.algebra.base.LogicalOperatorTag;
import org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable;
import
org.apache.hyracks.algebricks.core.algebra.expressions.AbstractFunctionCallExpression;
+import
org.apache.hyracks.algebricks.core.algebra.expressions.AbstractLogicalExpression;
import
org.apache.hyracks.algebricks.core.algebra.expressions.AggregateFunctionCallExpression;
import
org.apache.hyracks.algebricks.core.algebra.expressions.ConstantExpression;
import
org.apache.hyracks.algebricks.core.algebra.expressions.JoinProductivityAnnotation;
@@ -162,11 +163,18 @@
return productivity / card1;
}
} else {
+ ILogicalOperator Li1 = joinEnum.leafInputs.get(idx1);
+ List<List<IAObject>> result =
runSamplingQueryDistinct(this.optCtx, Li1, exprUsedVars.get(1));
+ double estDistinctCardinalityFromSample =
findPredicateCardinality(result, false);
+ double numDistincts = distinctEstimator2(Li1,
estDistinctCardinalityFromSample);
+ return 1.0 / numDistincts; // this is the expected selectivity for
joins.
+ /*
if (card1 < card2) {
// we are assuming that the smaller side is the primary side
and that the join is Pk-Fk join.
return 1.0 / card1;
}
return 1.0 / card2;
+ */
}
}
@@ -368,9 +376,13 @@
while (currentOp != null && currentOp.getOperatorTag() !=
LogicalOperatorTag.EMPTYTUPLESOURCE) {
if (currentOp.getOperatorTag().equals(LogicalOperatorTag.SELECT)) {
SelectOperator selOp = (SelectOperator) currentOp;
- if (selOp != otherSelOp) {
- selExprs.add(selOp.getCondition().getValue());
+ if (otherSelOp == null) {
selOp.getCondition().setValue(ConstantExpression.TRUE);
+ } else {
+ if (selOp != otherSelOp) {
+ selExprs.add(selOp.getCondition().getValue());
+ selOp.getCondition().setValue(ConstantExpression.TRUE);
+ }
}
}
currentOp = currentOp.getInputs().get(0).getValue();
@@ -576,6 +588,81 @@
return AnalysisUtil.runQuery(newAggOpRef, Arrays.asList(aggVar),
newCtx, IRuleSetFactory.RuleSetKind.SAMPLING);
}
+ protected List<List<IAObject>>
runSamplingQueryDistinct(IOptimizationContext ctx, ILogicalOperator logOp,
+ LogicalVariable var) throws AlgebricksException {
+ LOGGER.info("***running sample query***");
+
+ IOptimizationContext newCtx =
ctx.getOptimizationContextFactory().cloneOptimizationContext(ctx);
+
+ ILogicalOperator newLogOp =
OperatorManipulationUtil.bottomUpCopyOperators(logOp);
+ storeSelectConditionsAndMakeThemTrue(newLogOp, null);
+
+ ILogicalOperator parent =
joinEnum.findDataSourceScanOperatorParent(newLogOp);
+ DataSourceScanOperator scanOp = (DataSourceScanOperator)
parent.getInputs().get(0).getValue();
+
+ if (scanOp == null) {
+ //return 1.0; // what happens to the cards and sizes then? this
may happen in case of in lists
+ }
+
+ Index index = findSampleIndex(scanOp, optCtx);
+ if (index == null) {
+ //return 1.0;
+ }
+
+ Index.SampleIndexDetails idxDetails = (Index.SampleIndexDetails)
index.getIndexDetails();
+ double origDatasetCard = idxDetails.getSourceCardinality();
+ double sampleCard = Math.min(idxDetails.getSampleCardinalityTarget(),
origDatasetCard);
+ if (sampleCard == 0) {
+ sampleCard = 1;
+ IWarningCollector warningCollector = optCtx.getWarningCollector();
+ if (warningCollector.shouldWarn()) {
+ warningCollector.warn(Warning.of(scanOp.getSourceLocation(),
+
org.apache.asterix.common.exceptions.ErrorCode.SAMPLE_HAS_ZERO_ROWS));
+ }
+ }
+
+ // replace the dataScanSourceOperator with the sampling source
+ SampleDataSource sampledatasource =
joinEnum.getSampleDataSource(scanOp);
+ DataSourceScanOperator deepCopyofScan =
+ (DataSourceScanOperator)
OperatorManipulationUtil.bottomUpCopyOperators(scanOp);
+ deepCopyofScan.setDataSource(sampledatasource);
+ parent.getInputs().get(0).setValue(deepCopyofScan);
+
+ // end new code
+
+ // old code
+
+ List<Mutable<ILogicalExpression>> aggFunArgs = new ArrayList<>(1);
+ aggFunArgs.add(new MutableObject<>(ConstantExpression.TRUE));
+
+ AbstractLogicalExpression inputVarRef = new
VariableReferenceExpression(var, newLogOp.getSourceLocation());
+ List<Mutable<ILogicalExpression>> fields = new ArrayList<>(1);
+ fields.add(new MutableObject<>(inputVarRef));
+
+ BuiltinFunctionInfo countFn =
BuiltinFunctions.getBuiltinFunctionInfo(BuiltinFunctions.SQL_COUNT_DISTINCT);
+ AggregateFunctionCallExpression aggExpr = new
AggregateFunctionCallExpression(countFn, false, fields);
+
+ List<Mutable<ILogicalExpression>> aggExprList = new ArrayList<>(1);
+ aggExprList.add(new MutableObject<>(aggExpr));
+
+ List<LogicalVariable> aggVarList = new ArrayList<>(1);
+ LogicalVariable aggVar = newCtx.newVar();
+ aggVarList.add(aggVar);
+
+ AggregateOperator newAggOp = new AggregateOperator(aggVarList,
aggExprList);
+ newAggOp.getInputs().add(new MutableObject<>(newLogOp));
+
+ Mutable<ILogicalOperator> newAggOpRef = new MutableObject<>(newAggOp);
+
+ OperatorPropertiesUtil.typeOpRec(newAggOpRef, newCtx);
+ LOGGER.info("***returning from sample query***");
+
+ String viewInPlan = new ALogicalPlanImpl(newAggOpRef).toString();
//useful when debugging
+ LOGGER.trace("viewInPlan");
+ LOGGER.trace(viewInPlan);
+ return AnalysisUtil.runQuery(newAggOpRef, Arrays.asList(aggVar),
newCtx, IRuleSetFactory.RuleSetKind.SAMPLING);
+ }
+
// This one gets the cardinality and also projection sizes
protected List<List<IAObject>>
runSamplingQueryProjection(IOptimizationContext ctx, ILogicalOperator logOp,
int dataset, LogicalVariable primaryKey) throws
AlgebricksException {
@@ -778,6 +865,47 @@
return Math.round(estDistCardinality);
}
+ // Formula is d = D (1 - e^(-sampleCard/D))
+ double DistinctFormula(double sampleCard, double D) {
+ double a, b, c;
+
+ a = -sampleCard / D;
+ b = Math.exp(a);
+ c = 1.0 - b;
+ double x = D * c;
+ return x;
+ }
+
+ private double distinctEstimator2(ILogicalOperator logOp, double
estDistinctCardinalityFromSample)
+ throws AlgebricksException {
+ // initialize the estimate to be the number of distinct values from
the sample.
+
+ ILogicalOperator parent =
joinEnum.findDataSourceScanOperatorParent(logOp);
+ DataSourceScanOperator scanOp = (DataSourceScanOperator)
parent.getInputs().get(0).getValue();
+
+ Index index = findSampleIndex(scanOp, optCtx);
+ Index.SampleIndexDetails idxDetails = (Index.SampleIndexDetails)
index.getIndexDetails();
+ double origDatasetCardinality = idxDetails.getSourceCardinality();
+ double sampleCard = idxDetails.getSampleCardinalityTarget();
+
+ double D, Dmin, Dmax;
+
+ Dmin = 1;
+ Dmax = origDatasetCardinality;
+ D = estDistinctCardinalityFromSample;
+ int i = 0;
+ while ((Dmin < Dmax) && i < 1000) { // just being very cautious to
avoid infinite loops.
+ i++;
+ D = (Dmax + Dmin) / 2.0;
+ double x = DistinctFormula(sampleCard, D);
+ if (x < estDistinctCardinalityFromSample)
+ Dmin = D + 1;
+ else
+ Dmax = D - 1;
+ }
+ return D;
+ }
+
// Use the Newton-Raphson method for distinct cardinality estimation.
private double distinctEstimator(double estDistinctCardinalityFromSample,
double origDatasetCardinality) {
// initialize the estimate to be the number of distinct values from
the sample.
@@ -858,4 +986,4 @@
}
return null; // no SelectOp in the query tree
}
-}
\ No newline at end of file
+}
--
To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/18231
To unsubscribe, or for help writing mail filters, visit
https://asterix-gerrit.ics.uci.edu/settings
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Change-Id: I270982f0cdb54ea7fbc9c63413bd73e2b132f7e4
Gerrit-Change-Number: 18231
Gerrit-PatchSet: 1
Gerrit-Owner: [email protected]
Gerrit-MessageType: newchange