Repository: incubator-quickstep Updated Branches: refs/heads/estimate-num-distinct-values [created] 2e689f126
Initial commit Project: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/commit/2e689f12 Tree: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/tree/2e689f12 Diff: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/diff/2e689f12 Branch: refs/heads/estimate-num-distinct-values Commit: 2e689f12606b5b568addbeb3d6226779cd1bc7c8 Parents: 0820e3d Author: Jianqiao Zhu <jianq...@cs.wisc.edu> Authored: Mon Oct 3 23:58:30 2016 -0500 Committer: Jianqiao Zhu <jianq...@cs.wisc.edu> Committed: Mon Oct 3 23:58:30 2016 -0500 ---------------------------------------------------------------------- .../cost_model/StarSchemaSimpleCostModel.cpp | 223 ++++++++++++++----- .../cost_model/StarSchemaSimpleCostModel.hpp | 52 +++-- 2 files changed, 207 insertions(+), 68 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/2e689f12/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp ---------------------------------------------------------------------- diff --git a/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp b/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp index 911a765..c45d4f2 100644 --- a/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp +++ b/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp @@ -29,6 +29,7 @@ #include "query_optimizer/expressions/ComparisonExpression.hpp" #include "query_optimizer/expressions/ExprId.hpp" #include "query_optimizer/expressions/ExpressionType.hpp" +#include "query_optimizer/expressions/ExpressionUtil.hpp" #include "query_optimizer/expressions/LogicalAnd.hpp" #include "query_optimizer/expressions/LogicalOr.hpp" #include "query_optimizer/expressions/Predicate.hpp" @@ -36,6 +37,7 @@ #include "query_optimizer/physical/Aggregate.hpp" #include "query_optimizer/physical/NestedLoopsJoin.hpp" #include "query_optimizer/physical/HashJoin.hpp" +#include "query_optimizer/physical/PatternMatcher.hpp" #include "query_optimizer/physical/Physical.hpp" #include "query_optimizer/physical/PhysicalType.hpp" #include "query_optimizer/physical/Selection.hpp" @@ -85,8 +87,8 @@ std::size_t StarSchemaSimpleCostModel::estimateCardinality( shared_subplans_[shared_subplan_reference->subplan_id()]); } case P::PhysicalType::kSort: - return estimateCardinality( - std::static_pointer_cast<const P::Sort>(physical_plan)->input()); + return estimateCardinalityForSort( + std::static_pointer_cast<const P::Sort>(physical_plan)); case P::PhysicalType::kWindowAggregate: return estimateCardinalityForWindowAggregate( std::static_pointer_cast<const P::WindowAggregate>(physical_plan)); @@ -111,9 +113,15 @@ std::size_t StarSchemaSimpleCostModel::estimateCardinalityForTableReference( std::size_t StarSchemaSimpleCostModel::estimateCardinalityForSelection( const P::SelectionPtr &physical_plan) { - double selectivity = estimateSelectivityForSelection(physical_plan); - return std::max(static_cast<std::size_t>(estimateCardinality(physical_plan->input()) * selectivity), - static_cast<std::size_t>(1)); + double selectivity = estimateSelectivity(physical_plan); + return static_cast<std::size_t>(estimateCardinality(physical_plan->input()) * selectivity); +} + +std::size_t StarSchemaSimpleCostModel::estimateCardinalityForSort( + const physical::SortPtr &physical_plan) { + std::size_t cardinality = estimateCardinality( + std::static_pointer_cast<const P::Sort>(physical_plan)->input()); + return std::min(cardinality, static_cast<std::size_t>(physical_plan->limit())); } std::size_t StarSchemaSimpleCostModel::estimateCardinalityForTableGenerator( @@ -127,8 +135,8 @@ std::size_t StarSchemaSimpleCostModel::estimateCardinalityForHashJoin( std::size_t right_cardinality = estimateCardinality(physical_plan->right()); double left_selectivity = estimateSelectivity(physical_plan->left()); double right_selectivity = estimateSelectivity(physical_plan->right()); - return std::max(static_cast<std::size_t>(left_cardinality * right_selectivity) + 1, - static_cast<std::size_t>(right_cardinality * left_selectivity) + 1); + return std::min(static_cast<std::size_t>(left_cardinality * right_selectivity + 0.5), + static_cast<std::size_t>(right_cardinality * left_selectivity + 0.5)); } std::size_t StarSchemaSimpleCostModel::estimateCardinalityForNestedLoopsJoin( @@ -142,8 +150,21 @@ std::size_t StarSchemaSimpleCostModel::estimateCardinalityForAggregate( if (physical_plan->grouping_expressions().empty()) { return 1; } - return std::max(static_cast<std::size_t>(1), - estimateCardinality(physical_plan->input()) / 10); + + std::size_t estimated_num_groups = 1; + for (const auto &expr : physical_plan->grouping_expressions()) { + E::AttributeReferencePtr attr; + if (E::SomeAttributeReference::MatchesWithConditionalCast(expr, &attr)) { + estimated_num_groups *= + std::max(1uL, estimateNumDistinctValues(attr, physical_plan->input())); + } else { + // TODO(jianqiao): implement estimateNumDistinctValues() for expressions. + return estimateCardinality(physical_plan) /10; + } + } + estimated_num_groups = + std::min(estimated_num_groups, estimateCardinality(physical_plan)); + return estimated_num_groups; } std::size_t StarSchemaSimpleCostModel::estimateCardinalityForWindowAggregate( @@ -151,24 +172,107 @@ std::size_t StarSchemaSimpleCostModel::estimateCardinalityForWindowAggregate( return estimateCardinality(physical_plan->input()); } +std::size_t StarSchemaSimpleCostModel::estimateNumDistinctValues( + const expressions::AttributeReferencePtr &attribute, + const physical::PhysicalPtr &physical_plan) { + DCHECK(E::ContainsExpression(physical_plan->getOutputAttributes(), attribute)); + + P::TableReferencePtr table_reference; + if (P::SomeTableReference::MatchesWithConditionalCast(physical_plan, &table_reference)) { + return getNumDistinctValues(attribute, table_reference); + } + + E::PredicatePtr filter_predicate = nullptr; + switch (physical_plan->getPhysicalType()) { + case P::PhysicalType::kSelection: + filter_predicate = + std::static_pointer_cast<const P::Selection>(physical_plan)->filter_predicate(); + case P::PhysicalType::kAggregate: + filter_predicate = + std::static_pointer_cast<const P::Aggregate>(physical_plan)->filter_predicate(); + case P::PhysicalType::kHashJoin: + filter_predicate = + std::static_pointer_cast<const P::HashJoin>(physical_plan)->residual_predicate(); + case P::PhysicalType::kNestedLoopsJoin: + filter_predicate = + std::static_pointer_cast<const P::NestedLoopsJoin>(physical_plan)->join_predicate(); + default: + break; + } + + double filter_selectivity = 1.0; + if (filter_predicate != nullptr) { + filter_selectivity = estimateSelectivityForPredicate(filter_predicate, physical_plan); + } + + switch (physical_plan->getPhysicalType()) { + case P::PhysicalType::kSelection: // Fall through + case P::PhysicalType::kAggregate: { + if (E::ContainsExpression(physical_plan->getOutputAttributes(), attribute)) { + std::size_t child_num_distinct_values = + estimateNumDistinctValues(attribute, physical_plan->children()[0]); + return static_cast<std::size_t>( + child_num_distinct_values * filter_selectivity + 0.5); + } + break; + } + case P::PhysicalType::kHashJoin: { + const P::HashJoinPtr &hash_join = + std::static_pointer_cast<const P::HashJoin>(physical_plan); + if (E::ContainsExpression(hash_join->left()->getOutputAttributes(), attribute)) { + const std::size_t left_child_num_distinct_values = + estimateNumDistinctValues(attribute, hash_join->left()); + const double right_child_selectivity = + estimateSelectivity(hash_join->right()); + return static_cast<std::size_t>( + left_child_num_distinct_values * right_child_selectivity * filter_selectivity + 0.5); + } + if (E::ContainsExpression(hash_join->right()->getOutputAttributes(), attribute)) { + const std::size_t right_child_num_distinct_values = + estimateNumDistinctValues(attribute, hash_join->left()); + const double left_child_selectivity = + estimateSelectivity(hash_join->right()); + return static_cast<std::size_t>( + right_child_num_distinct_values * left_child_selectivity * filter_selectivity + 0.5); + } + } + default: + break; + } + + return static_cast<std::size_t>( + estimateCardinality(physical_plan) * filter_selectivity + 0.5); +} + double StarSchemaSimpleCostModel::estimateSelectivity( const physical::PhysicalPtr &physical_plan) { switch (physical_plan->getPhysicalType()) { case P::PhysicalType::kSelection: { - return estimateSelectivityForSelection( - std::static_pointer_cast<const P::Selection>(physical_plan)); + const P::SelectionPtr &selection = + std::static_pointer_cast<const P::Selection>(physical_plan); + double filter_selectivity = + estimateSelectivityForPredicate(selection->filter_predicate(), selection); + double child_selectivity = estimateSelectivity(selection->input()); + return filter_selectivity * child_selectivity; } case P::PhysicalType::kHashJoin: { const P::HashJoinPtr &hash_join = std::static_pointer_cast<const P::HashJoin>(physical_plan); - return std::min(estimateSelectivity(hash_join->left()), - estimateSelectivity(hash_join->right())); + double filter_selectivity = + estimateSelectivityForPredicate(hash_join->residual_predicate(), hash_join); + double child_selectivity = + estimateSelectivity(hash_join->left()) * estimateSelectivity(hash_join->right()); + return filter_selectivity * child_selectivity; } case P::PhysicalType::kNestedLoopsJoin: { const P::NestedLoopsJoinPtr &nested_loop_join = std::static_pointer_cast<const P::NestedLoopsJoin>(physical_plan); - return std::min(estimateSelectivity(nested_loop_join->left()), - estimateSelectivity(nested_loop_join->right())); + double filter_selectivity = + estimateSelectivityForPredicate(nested_loop_join->join_predicate(), nested_loop_join); + double child_selectivity = std::min( + estimateSelectivity(nested_loop_join->left()), + estimateSelectivity(nested_loop_join->right())); + return filter_selectivity * child_selectivity; } case P::PhysicalType::kSharedSubplanReference: { const P::SharedSubplanReferencePtr shared_subplan_reference = @@ -177,36 +281,19 @@ double StarSchemaSimpleCostModel::estimateSelectivity( shared_subplans_[shared_subplan_reference->subplan_id()]); } default: - return 1.0; + break; } -} -double StarSchemaSimpleCostModel::estimateSelectivityForSelection( - const physical::SelectionPtr &physical_plan) { - const E::PredicatePtr &filter_predicate = physical_plan->filter_predicate(); - - // If the subplan is a table reference, gather the number of distinct values - // statistics for each column (attribute). - std::unordered_map<E::ExprId, std::size_t> num_distinct_values_map; - if (physical_plan->input()->getPhysicalType() == P::PhysicalType::kTableReference) { - const P::TableReferencePtr &table_reference = - std::static_pointer_cast<const P::TableReference>(physical_plan->input()); - const CatalogRelation &relation = *table_reference->relation(); - const std::vector<E::AttributeReferencePtr> &attributes = table_reference->attribute_list(); - for (std::size_t i = 0; i < attributes.size(); ++i) { - std::size_t num_distinct_values = relation.getStatistics().getNumDistinctValues(i); - if (num_distinct_values > 0) { - num_distinct_values_map[attributes[i]->id()] = num_distinct_values; - } - } + if (physical_plan->getNumChildren() == 1) { + return estimateSelectivity(physical_plan->children()[0]); } - return estimateSelectivityForPredicate(num_distinct_values_map, filter_predicate); + return 1.0; } double StarSchemaSimpleCostModel::estimateSelectivityForPredicate( - const std::unordered_map<expressions::ExprId, std::size_t> &num_distinct_values_map, - const expressions::PredicatePtr &filter_predicate) { + const expressions::PredicatePtr &filter_predicate, + const P::PhysicalPtr &physical_plan) { if (filter_predicate == nullptr) { return 1.0; } @@ -215,10 +302,8 @@ double StarSchemaSimpleCostModel::estimateSelectivityForPredicate( case E::ExpressionType::kComparisonExpression: { // Case 1 - Number of distinct values statistics available // Case 1.1 - Equality comparison: 1.0 / num_distinct_values - // Case 1.2 - Otherwise: 5.0 / num_distinct_values - // Case 2 - Number of distinct values statistics not available - // Case 2.1 - Equality comparison: 0.1 - // Case 2.2 - Otherwise: 0.5 + // Case 1.2 - Otherwise: 0.1 + // Case 2 - Otherwise: 0.25 const E::ComparisonExpressionPtr &comparison_expression = std::static_pointer_cast<const E::ComparisonExpression>(filter_predicate); E::AttributeReferencePtr attr; @@ -226,16 +311,16 @@ double StarSchemaSimpleCostModel::estimateSelectivityForPredicate( E::SomeScalarLiteral::Matches(comparison_expression->right())) || (E::SomeAttributeReference::MatchesWithConditionalCast(comparison_expression->right(), &attr) && E::SomeScalarLiteral::Matches(comparison_expression->left()))) { - const auto it = num_distinct_values_map.find(attr->id()); - if (it != num_distinct_values_map.end() && it->second > 0) { - double unit_selectivity = 1.0 / it->second; - return comparison_expression->isEqualityComparisonPredicate() - ? unit_selectivity - : std::min(0.5, unit_selectivity * 5.0); + if (comparison_expression->isEqualityComparisonPredicate()) { + for (const auto &child : physical_plan->children()) { + if (E::ContainsExpression(child->getOutputAttributes(), attr)) { + return 1.0 / estimateNumDistinctValues(attr, child); + } + } + return 0.1; } } - - return comparison_expression->isEqualityComparisonPredicate() ? 0.1 : 0.5; + return 0.25; } case E::ExpressionType::kLogicalAnd: { const E::LogicalAndPtr &logical_and = @@ -244,7 +329,7 @@ double StarSchemaSimpleCostModel::estimateSelectivityForPredicate( for (const auto &predicate : logical_and->operands()) { selectivity = std::min(selectivity, - estimateSelectivityForPredicate(num_distinct_values_map, predicate)); + estimateSelectivityForPredicate(predicate, physical_plan)); } return selectivity; } @@ -253,7 +338,7 @@ double StarSchemaSimpleCostModel::estimateSelectivityForPredicate( std::static_pointer_cast<const E::LogicalOr>(filter_predicate); double selectivity = 0; for (const auto &predicate : logical_or->operands()) { - selectivity += estimateSelectivityForPredicate(num_distinct_values_map, predicate); + selectivity += estimateSelectivityForPredicate(predicate, physical_plan); } return std::min(selectivity, 1.0); } @@ -263,6 +348,40 @@ double StarSchemaSimpleCostModel::estimateSelectivityForPredicate( return 1.0; } +std::size_t StarSchemaSimpleCostModel::estimateDomainCardinality( + const E::AttributeReferencePtr &attribute, + const P::PhysicalPtr &physical_plan) { + DCHECK(E::ContainsExpression(physical_plan->getOutputAttributes(), attribute)); + + if (physical_plan->getPhysicalType() == P::PhysicalType::kTableReference) { + return getNumDistinctValues( + attribute, std::static_pointer_cast<const P::TableReference>(physical_plan)); + } + for (const auto &child : physical_plan->children()) { + if (E::ContainsExpression(child->getOutputAttributes(), attribute)) { + return estimateDomainCardinality(attribute, child); + } + } + return estimateCardinality(physical_plan); +} + +std::size_t StarSchemaSimpleCostModel::getNumDistinctValues( + const E::AttributeReferencePtr &attribute, + const P::TableReferencePtr &table_reference) { + const CatalogRelation &relation = *table_reference->relation(); + const std::vector<E::AttributeReferencePtr> &attributes = table_reference->attribute_list(); + for (std::size_t i = 0; i < attributes.size(); ++i) { + if (attribute->equals(attributes[i])) { + std::size_t num_distinct_values = relation.getStatistics().getNumDistinctValues(i); + if (num_distinct_values > 0) { + return num_distinct_values; + } + break; + } + } + return estimateCardinalityForTableReference(table_reference); +} + } // namespace cost } // namespace optimizer } // namespace quickstep http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/2e689f12/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp ---------------------------------------------------------------------- diff --git a/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp b/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp index 4314b92..17f970d 100644 --- a/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp +++ b/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp @@ -32,6 +32,7 @@ #include "query_optimizer/physical/HashJoin.hpp" #include "query_optimizer/physical/Physical.hpp" #include "query_optimizer/physical/Selection.hpp" +#include "query_optimizer/physical/Sort.hpp" #include "query_optimizer/physical/TableGenerator.hpp" #include "query_optimizer/physical/TableReference.hpp" #include "query_optimizer/physical/TopLevelPlan.hpp" @@ -67,6 +68,16 @@ class StarSchemaSimpleCostModel : public CostModel { const physical::PhysicalPtr &physical_plan) override; /** + * @brief Estimate the number of distinct values of an attribute in a relation. + * + * @param attribute The attribute to be estimated. + * @param physical_plan The physical plan of the attribute's relation. + * @return The estimated number of distinct values for the attribute. + */ + std::size_t estimateNumDistinctValues(const expressions::AttributeReferencePtr &attribute, + const physical::PhysicalPtr &physical_plan); + + /** * @brief Estimate the "selectivity" of a physical plan under the assumption * that it acts as a filtered dimension table in a hash join. * @@ -76,39 +87,48 @@ class StarSchemaSimpleCostModel : public CostModel { double estimateSelectivity(const physical::PhysicalPtr &physical_plan); private: - std::size_t estimateCardinalityForTopLevelPlan( - const physical::TopLevelPlanPtr &physical_plan); + std::size_t estimateCardinalityForAggregate( + const physical::AggregatePtr &physical_plan); - std::size_t estimateCardinalityForTableReference( - const physical::TableReferencePtr &physical_plan); + std::size_t estimateCardinalityForHashJoin( + const physical::HashJoinPtr &physical_plan); + + std::size_t estimateCardinalityForNestedLoopsJoin( + const physical::NestedLoopsJoinPtr &physical_plan); std::size_t estimateCardinalityForSelection( const physical::SelectionPtr &physical_plan); + std::size_t estimateCardinalityForSort( + const physical::SortPtr &physical_plan); + std::size_t estimateCardinalityForTableGenerator( const physical::TableGeneratorPtr &physical_plan); - std::size_t estimateCardinalityForHashJoin( - const physical::HashJoinPtr &physical_plan); - - std::size_t estimateCardinalityForNestedLoopsJoin( - const physical::NestedLoopsJoinPtr &physical_plan); + std::size_t estimateCardinalityForTableReference( + const physical::TableReferencePtr &physical_plan); - std::size_t estimateCardinalityForAggregate( - const physical::AggregatePtr &physical_plan); + std::size_t estimateCardinalityForTopLevelPlan( + const physical::TopLevelPlanPtr &physical_plan); std::size_t estimateCardinalityForWindowAggregate( const physical::WindowAggregatePtr &physical_plan); - double estimateSelectivityForSelection( - const physical::SelectionPtr &physical_plan); - double estimateSelectivityForPredicate( - const std::unordered_map<expressions::ExprId, std::size_t> &num_distinct_values_map, - const expressions::PredicatePtr &filter_predicate); + const expressions::PredicatePtr &filter_predicate, + const physical::PhysicalPtr &physical_plan); const std::vector<physical::PhysicalPtr> &shared_subplans_; + // Estimate the number of distinct values in the attribute's domain. + std::size_t estimateDomainCardinality(const expressions::AttributeReferencePtr &attribute, + const physical::PhysicalPtr &physical_plan); + + // Get the number of distinct values of an attribute in the table reference. + // If the stat is not avaiable, simply returns the table's cardinality. + std::size_t getNumDistinctValues(const expressions::AttributeReferencePtr &attribute, + const physical::TableReferencePtr &table_reference); + DISALLOW_COPY_AND_ASSIGN(StarSchemaSimpleCostModel); };