Repository: incubator-quickstep Updated Branches: refs/heads/estimate-num-distinct-values 2e689f126 -> 3b4ef8c9b
Updates Project: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/commit/3b4ef8c9 Tree: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/tree/3b4ef8c9 Diff: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/diff/3b4ef8c9 Branch: refs/heads/estimate-num-distinct-values Commit: 3b4ef8c9ba7cab225da77e97b7af5f9777899310 Parents: 2e689f1 Author: Jianqiao Zhu <jianq...@cs.wisc.edu> Authored: Tue Oct 4 15:47:09 2016 -0500 Committer: Jianqiao Zhu <jianq...@cs.wisc.edu> Committed: Tue Oct 4 15:47:09 2016 -0500 ---------------------------------------------------------------------- .../cost_model/StarSchemaSimpleCostModel.cpp | 146 ++++++++++--------- .../cost_model/StarSchemaSimpleCostModel.hpp | 22 ++- query_optimizer/expressions/ExpressionUtil.hpp | 19 +++ .../StarSchemaHashJoinOrderOptimization.cpp | 1 + .../StarSchemaHashJoinOrderOptimization.hpp | 4 - 5 files changed, 109 insertions(+), 83 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/3b4ef8c9/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp ---------------------------------------------------------------------- diff --git a/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp b/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp index c45d4f2..122485a 100644 --- a/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp +++ b/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp @@ -113,7 +113,9 @@ std::size_t StarSchemaSimpleCostModel::estimateCardinalityForTableReference( std::size_t StarSchemaSimpleCostModel::estimateCardinalityForSelection( const P::SelectionPtr &physical_plan) { - double selectivity = estimateSelectivity(physical_plan); + double selectivity = + estimateSelectivityForPredicate(physical_plan->filter_predicate(), + physical_plan); return static_cast<std::size_t>(estimateCardinality(physical_plan->input()) * selectivity); } @@ -135,7 +137,7 @@ std::size_t StarSchemaSimpleCostModel::estimateCardinalityForHashJoin( std::size_t right_cardinality = estimateCardinality(physical_plan->right()); double left_selectivity = estimateSelectivity(physical_plan->left()); double right_selectivity = estimateSelectivity(physical_plan->right()); - return std::min(static_cast<std::size_t>(left_cardinality * right_selectivity + 0.5), + return std::max(static_cast<std::size_t>(left_cardinality * right_selectivity + 0.5), static_cast<std::size_t>(right_cardinality * left_selectivity + 0.5)); } @@ -151,19 +153,25 @@ std::size_t StarSchemaSimpleCostModel::estimateCardinalityForAggregate( return 1; } + const std::size_t estimated_child_cardinality = estimateCardinality(physical_plan->input()); std::size_t estimated_num_groups = 1; + std::size_t max_attr_num_distinct_values = 0; for (const auto &expr : physical_plan->grouping_expressions()) { E::AttributeReferencePtr attr; if (E::SomeAttributeReference::MatchesWithConditionalCast(expr, &attr)) { - estimated_num_groups *= - std::max(1uL, estimateNumDistinctValues(attr, physical_plan->input())); + const std::size_t attr_num_distinct_values = + estimateNumDistinctValues(attr->id(), physical_plan->input()); + estimated_num_groups *= std::max(1uL, attr_num_distinct_values); + max_attr_num_distinct_values = + std::max(max_attr_num_distinct_values, attr_num_distinct_values); } else { // TODO(jianqiao): implement estimateNumDistinctValues() for expressions. - return estimateCardinality(physical_plan) /10; + estimated_num_groups *= 64uL; } } - estimated_num_groups = - std::min(estimated_num_groups, estimateCardinality(physical_plan)); + estimated_num_groups = std::max( + std::min(estimated_num_groups, estimated_child_cardinality / 10), + max_attr_num_distinct_values); return estimated_num_groups; } @@ -173,44 +181,23 @@ std::size_t StarSchemaSimpleCostModel::estimateCardinalityForWindowAggregate( } std::size_t StarSchemaSimpleCostModel::estimateNumDistinctValues( - const expressions::AttributeReferencePtr &attribute, + const expressions::ExprId attribute_id, const physical::PhysicalPtr &physical_plan) { - DCHECK(E::ContainsExpression(physical_plan->getOutputAttributes(), attribute)); + DCHECK(E::ContainsExprId(physical_plan->getOutputAttributes(), attribute_id)); P::TableReferencePtr table_reference; if (P::SomeTableReference::MatchesWithConditionalCast(physical_plan, &table_reference)) { - return getNumDistinctValues(attribute, table_reference); - } - - E::PredicatePtr filter_predicate = nullptr; - switch (physical_plan->getPhysicalType()) { - case P::PhysicalType::kSelection: - filter_predicate = - std::static_pointer_cast<const P::Selection>(physical_plan)->filter_predicate(); - case P::PhysicalType::kAggregate: - filter_predicate = - std::static_pointer_cast<const P::Aggregate>(physical_plan)->filter_predicate(); - case P::PhysicalType::kHashJoin: - filter_predicate = - std::static_pointer_cast<const P::HashJoin>(physical_plan)->residual_predicate(); - case P::PhysicalType::kNestedLoopsJoin: - filter_predicate = - std::static_pointer_cast<const P::NestedLoopsJoin>(physical_plan)->join_predicate(); - default: - break; - } - - double filter_selectivity = 1.0; - if (filter_predicate != nullptr) { - filter_selectivity = estimateSelectivityForPredicate(filter_predicate, physical_plan); + return getNumDistinctValues(attribute_id, table_reference); } + double filter_selectivity = estimateSelectivityForFilterPredicate(physical_plan); switch (physical_plan->getPhysicalType()) { case P::PhysicalType::kSelection: // Fall through case P::PhysicalType::kAggregate: { - if (E::ContainsExpression(physical_plan->getOutputAttributes(), attribute)) { - std::size_t child_num_distinct_values = - estimateNumDistinctValues(attribute, physical_plan->children()[0]); + const P::PhysicalPtr &child = physical_plan->children()[0]; + if (E::ContainsExprId(child->getOutputAttributes(), attribute_id)) { + const std::size_t child_num_distinct_values = + estimateNumDistinctValues(attribute_id, child); return static_cast<std::size_t>( child_num_distinct_values * filter_selectivity + 0.5); } @@ -219,19 +206,19 @@ std::size_t StarSchemaSimpleCostModel::estimateNumDistinctValues( case P::PhysicalType::kHashJoin: { const P::HashJoinPtr &hash_join = std::static_pointer_cast<const P::HashJoin>(physical_plan); - if (E::ContainsExpression(hash_join->left()->getOutputAttributes(), attribute)) { + if (E::ContainsExprId(hash_join->left()->getOutputAttributes(), attribute_id)) { const std::size_t left_child_num_distinct_values = - estimateNumDistinctValues(attribute, hash_join->left()); + estimateNumDistinctValues(attribute_id, hash_join->left()); const double right_child_selectivity = estimateSelectivity(hash_join->right()); return static_cast<std::size_t>( left_child_num_distinct_values * right_child_selectivity * filter_selectivity + 0.5); } - if (E::ContainsExpression(hash_join->right()->getOutputAttributes(), attribute)) { + if (E::ContainsExprId(hash_join->right()->getOutputAttributes(), attribute_id)) { const std::size_t right_child_num_distinct_values = - estimateNumDistinctValues(attribute, hash_join->left()); + estimateNumDistinctValues(attribute_id, hash_join->right()); const double left_child_selectivity = - estimateSelectivity(hash_join->right()); + estimateSelectivity(hash_join->left()); return static_cast<std::size_t>( right_child_num_distinct_values * left_child_selectivity * filter_selectivity + 0.5); } @@ -240,8 +227,7 @@ std::size_t StarSchemaSimpleCostModel::estimateNumDistinctValues( break; } - return static_cast<std::size_t>( - estimateCardinality(physical_plan) * filter_selectivity + 0.5); + return 16uL; } double StarSchemaSimpleCostModel::estimateSelectivity( @@ -291,6 +277,38 @@ double StarSchemaSimpleCostModel::estimateSelectivity( return 1.0; } +double StarSchemaSimpleCostModel::estimateSelectivityForFilterPredicate( + const physical::PhysicalPtr &physical_plan) { + E::PredicatePtr filter_predicate = nullptr; + switch (physical_plan->getPhysicalType()) { + case P::PhysicalType::kSelection: + filter_predicate = + std::static_pointer_cast<const P::Selection>(physical_plan)->filter_predicate(); + break; + case P::PhysicalType::kAggregate: + filter_predicate = + std::static_pointer_cast<const P::Aggregate>(physical_plan)->filter_predicate(); + break; + case P::PhysicalType::kHashJoin: + filter_predicate = + std::static_pointer_cast<const P::HashJoin>(physical_plan)->residual_predicate(); + break; + case P::PhysicalType::kNestedLoopsJoin: + filter_predicate = + std::static_pointer_cast<const P::NestedLoopsJoin>(physical_plan)->join_predicate(); + break; + default: + break; + } + + if (filter_predicate == nullptr) { + return 1.0; + } else { + return estimateSelectivityForPredicate(filter_predicate, physical_plan); + } +} + + double StarSchemaSimpleCostModel::estimateSelectivityForPredicate( const expressions::PredicatePtr &filter_predicate, const P::PhysicalPtr &physical_plan) { @@ -303,7 +321,7 @@ double StarSchemaSimpleCostModel::estimateSelectivityForPredicate( // Case 1 - Number of distinct values statistics available // Case 1.1 - Equality comparison: 1.0 / num_distinct_values // Case 1.2 - Otherwise: 0.1 - // Case 2 - Otherwise: 0.25 + // Case 2 - Otherwise: 0.2 const E::ComparisonExpressionPtr &comparison_expression = std::static_pointer_cast<const E::ComparisonExpression>(filter_predicate); E::AttributeReferencePtr attr; @@ -311,25 +329,26 @@ double StarSchemaSimpleCostModel::estimateSelectivityForPredicate( E::SomeScalarLiteral::Matches(comparison_expression->right())) || (E::SomeAttributeReference::MatchesWithConditionalCast(comparison_expression->right(), &attr) && E::SomeScalarLiteral::Matches(comparison_expression->left()))) { - if (comparison_expression->isEqualityComparisonPredicate()) { - for (const auto &child : physical_plan->children()) { - if (E::ContainsExpression(child->getOutputAttributes(), attr)) { - return 1.0 / estimateNumDistinctValues(attr, child); + for (const auto &child : physical_plan->children()) { + if (E::ContainsExprId(child->getOutputAttributes(), attr->id())) { + const std::size_t child_num_distinct_values = estimateNumDistinctValues(attr->id(), child); + if (comparison_expression->isEqualityComparisonPredicate()) { + return 1.0 / child_num_distinct_values; + } else { + return 1.0 / std::max(std::min(child_num_distinct_values / 100.0, 10.0), 2.0); } } - return 0.1; } + return 0.1; } - return 0.25; + return 0.5; } case E::ExpressionType::kLogicalAnd: { const E::LogicalAndPtr &logical_and = std::static_pointer_cast<const E::LogicalAnd>(filter_predicate); double selectivity = 1.0; for (const auto &predicate : logical_and->operands()) { - selectivity = - std::min(selectivity, - estimateSelectivityForPredicate(predicate, physical_plan)); + selectivity = selectivity * estimateSelectivityForPredicate(predicate, physical_plan); } return selectivity; } @@ -348,30 +367,13 @@ double StarSchemaSimpleCostModel::estimateSelectivityForPredicate( return 1.0; } -std::size_t StarSchemaSimpleCostModel::estimateDomainCardinality( - const E::AttributeReferencePtr &attribute, - const P::PhysicalPtr &physical_plan) { - DCHECK(E::ContainsExpression(physical_plan->getOutputAttributes(), attribute)); - - if (physical_plan->getPhysicalType() == P::PhysicalType::kTableReference) { - return getNumDistinctValues( - attribute, std::static_pointer_cast<const P::TableReference>(physical_plan)); - } - for (const auto &child : physical_plan->children()) { - if (E::ContainsExpression(child->getOutputAttributes(), attribute)) { - return estimateDomainCardinality(attribute, child); - } - } - return estimateCardinality(physical_plan); -} - std::size_t StarSchemaSimpleCostModel::getNumDistinctValues( - const E::AttributeReferencePtr &attribute, + const E::ExprId attribute_id, const P::TableReferencePtr &table_reference) { const CatalogRelation &relation = *table_reference->relation(); const std::vector<E::AttributeReferencePtr> &attributes = table_reference->attribute_list(); for (std::size_t i = 0; i < attributes.size(); ++i) { - if (attribute->equals(attributes[i])) { + if (attributes[i]->id() == attribute_id) { std::size_t num_distinct_values = relation.getStatistics().getNumDistinctValues(i); if (num_distinct_values > 0) { return num_distinct_values; http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/3b4ef8c9/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp ---------------------------------------------------------------------- diff --git a/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp b/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp index 17f970d..a607a4f 100644 --- a/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp +++ b/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp @@ -70,11 +70,11 @@ class StarSchemaSimpleCostModel : public CostModel { /** * @brief Estimate the number of distinct values of an attribute in a relation. * - * @param attribute The attribute to be estimated. + * @param attribute_id The expression id of the target attribute. * @param physical_plan The physical plan of the attribute's relation. * @return The estimated number of distinct values for the attribute. */ - std::size_t estimateNumDistinctValues(const expressions::AttributeReferencePtr &attribute, + std::size_t estimateNumDistinctValues(const expressions::ExprId attribute_id, const physical::PhysicalPtr &physical_plan); /** @@ -86,6 +86,17 @@ class StarSchemaSimpleCostModel : public CostModel { */ double estimateSelectivity(const physical::PhysicalPtr &physical_plan); + /** + * @brief Estimate the filter predicate's selectivity if it is present in + * the input plan's root node. + * + * @param physical_plan The input physical plan. + * @return The estimated selectivity of the filter predicate if physical_plan + * has such a filter predicate; 1.0 otherwise. + */ + double estimateSelectivityForFilterPredicate( + const physical::PhysicalPtr &physical_plan); + private: std::size_t estimateCardinalityForAggregate( const physical::AggregatePtr &physical_plan); @@ -120,15 +131,12 @@ class StarSchemaSimpleCostModel : public CostModel { const std::vector<physical::PhysicalPtr> &shared_subplans_; - // Estimate the number of distinct values in the attribute's domain. - std::size_t estimateDomainCardinality(const expressions::AttributeReferencePtr &attribute, - const physical::PhysicalPtr &physical_plan); - // Get the number of distinct values of an attribute in the table reference. // If the stat is not avaiable, simply returns the table's cardinality. - std::size_t getNumDistinctValues(const expressions::AttributeReferencePtr &attribute, + std::size_t getNumDistinctValues(const expressions::ExprId attribute_id, const physical::TableReferencePtr &table_reference); + DISALLOW_COPY_AND_ASSIGN(StarSchemaSimpleCostModel); }; http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/3b4ef8c9/query_optimizer/expressions/ExpressionUtil.hpp ---------------------------------------------------------------------- diff --git a/query_optimizer/expressions/ExpressionUtil.hpp b/query_optimizer/expressions/ExpressionUtil.hpp index e9a4067..422d5ab 100644 --- a/query_optimizer/expressions/ExpressionUtil.hpp +++ b/query_optimizer/expressions/ExpressionUtil.hpp @@ -95,6 +95,25 @@ bool ContainsExpression( } /** + * @brief Checks whether expr_id equals any expression's id in \p expressions. + * + * @param expressions A list of expressions to be checked with. + * @param expr_id The ExprId that is to be checked if it is in \p expressions. + * @return True if \p expr_id is contained by \p expressions. + */ +template <class NamedExpressionType> +bool ContainsExprId( + const std::vector<std::shared_ptr<const NamedExpressionType>> &expressions, + const ExprId expr_id) { + for (const std::shared_ptr<const NamedExpressionType> &expression : expressions) { + if (expression->id() == expr_id) { + return true; + } + } + return false; +} + +/** * @brief Checks whether \p left is a subset of \p right. * * @param left The left operand of the subset operator (i.e. the one that may be http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/3b4ef8c9/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.cpp ---------------------------------------------------------------------- diff --git a/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.cpp b/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.cpp index 946d316..0bd35d4 100644 --- a/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.cpp +++ b/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.cpp @@ -283,6 +283,7 @@ physical::PhysicalPtr StarSchemaHashJoinOrderOptimization::generatePlan( // TODO(jianqiao): Cache the estimated cardinality for each plan in cost // model to avoid duplicated estimation. second_table_info->estimated_cardinality = cost_model_->estimateCardinality(output); + second_table_info->estimated_selectivity = cost_model_->estimateSelectivity(output); second_table_info->join_attribute_pairs.insert(first_table_info->join_attribute_pairs.begin(), first_table_info->join_attribute_pairs.end()); http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/3b4ef8c9/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp ---------------------------------------------------------------------- diff --git a/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp b/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp index 4d6765c..f832f00 100644 --- a/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp +++ b/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp @@ -103,10 +103,6 @@ class StarSchemaHashJoinOrderOptimization : public Rule<physical::Physical> { if (lhs->estimated_selectivity < rhs->estimated_selectivity) { return !swapped; - } else if (lhs->estimated_cardinality < 1000u && - rhs->estimated_cardinality > 10000u && - lhs->estimated_selectivity < rhs->estimated_selectivity * 1.5) { - return !swapped; } else if (lhs->estimated_selectivity > rhs->estimated_selectivity) { return swapped; } else if (lhs->estimated_cardinality != rhs->estimated_cardinality) {