Repository: incubator-quickstep
Updated Branches:
  refs/heads/estimate-num-distinct-values 2e689f126 -> 3b4ef8c9b


Updates


Project: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-quickstep/commit/3b4ef8c9
Tree: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/tree/3b4ef8c9
Diff: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/diff/3b4ef8c9

Branch: refs/heads/estimate-num-distinct-values
Commit: 3b4ef8c9ba7cab225da77e97b7af5f9777899310
Parents: 2e689f1
Author: Jianqiao Zhu <jianq...@cs.wisc.edu>
Authored: Tue Oct 4 15:47:09 2016 -0500
Committer: Jianqiao Zhu <jianq...@cs.wisc.edu>
Committed: Tue Oct 4 15:47:09 2016 -0500

----------------------------------------------------------------------
 .../cost_model/StarSchemaSimpleCostModel.cpp    | 146 ++++++++++---------
 .../cost_model/StarSchemaSimpleCostModel.hpp    |  22 ++-
 query_optimizer/expressions/ExpressionUtil.hpp  |  19 +++
 .../StarSchemaHashJoinOrderOptimization.cpp     |   1 +
 .../StarSchemaHashJoinOrderOptimization.hpp     |   4 -
 5 files changed, 109 insertions(+), 83 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/3b4ef8c9/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp
----------------------------------------------------------------------
diff --git a/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp 
b/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp
index c45d4f2..122485a 100644
--- a/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp
+++ b/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp
@@ -113,7 +113,9 @@ std::size_t 
StarSchemaSimpleCostModel::estimateCardinalityForTableReference(
 
 std::size_t StarSchemaSimpleCostModel::estimateCardinalityForSelection(
     const P::SelectionPtr &physical_plan) {
-  double selectivity = estimateSelectivity(physical_plan);
+  double selectivity =
+      estimateSelectivityForPredicate(physical_plan->filter_predicate(),
+                                      physical_plan);
   return static_cast<std::size_t>(estimateCardinality(physical_plan->input()) 
* selectivity);
 }
 
@@ -135,7 +137,7 @@ std::size_t 
StarSchemaSimpleCostModel::estimateCardinalityForHashJoin(
   std::size_t right_cardinality = estimateCardinality(physical_plan->right());
   double left_selectivity = estimateSelectivity(physical_plan->left());
   double right_selectivity = estimateSelectivity(physical_plan->right());
-  return std::min(static_cast<std::size_t>(left_cardinality * 
right_selectivity + 0.5),
+  return std::max(static_cast<std::size_t>(left_cardinality * 
right_selectivity + 0.5),
                   static_cast<std::size_t>(right_cardinality * 
left_selectivity + 0.5));
 }
 
@@ -151,19 +153,25 @@ std::size_t 
StarSchemaSimpleCostModel::estimateCardinalityForAggregate(
     return 1;
   }
 
+  const std::size_t estimated_child_cardinality = 
estimateCardinality(physical_plan->input());
   std::size_t estimated_num_groups = 1;
+  std::size_t max_attr_num_distinct_values = 0;
   for (const auto &expr : physical_plan->grouping_expressions()) {
     E::AttributeReferencePtr attr;
     if (E::SomeAttributeReference::MatchesWithConditionalCast(expr, &attr)) {
-      estimated_num_groups *=
-          std::max(1uL, estimateNumDistinctValues(attr, 
physical_plan->input()));
+      const std::size_t attr_num_distinct_values =
+          estimateNumDistinctValues(attr->id(), physical_plan->input());
+      estimated_num_groups *= std::max(1uL, attr_num_distinct_values);
+      max_attr_num_distinct_values =
+          std::max(max_attr_num_distinct_values, attr_num_distinct_values);
     } else {
       // TODO(jianqiao): implement estimateNumDistinctValues() for expressions.
-      return estimateCardinality(physical_plan) /10;
+      estimated_num_groups *= 64uL;
     }
   }
-  estimated_num_groups =
-      std::min(estimated_num_groups, estimateCardinality(physical_plan));
+  estimated_num_groups = std::max(
+      std::min(estimated_num_groups, estimated_child_cardinality / 10),
+      max_attr_num_distinct_values);
   return estimated_num_groups;
 }
 
@@ -173,44 +181,23 @@ std::size_t 
StarSchemaSimpleCostModel::estimateCardinalityForWindowAggregate(
 }
 
 std::size_t StarSchemaSimpleCostModel::estimateNumDistinctValues(
-    const expressions::AttributeReferencePtr &attribute,
+    const expressions::ExprId attribute_id,
     const physical::PhysicalPtr &physical_plan) {
-  DCHECK(E::ContainsExpression(physical_plan->getOutputAttributes(), 
attribute));
+  DCHECK(E::ContainsExprId(physical_plan->getOutputAttributes(), 
attribute_id));
 
   P::TableReferencePtr table_reference;
   if (P::SomeTableReference::MatchesWithConditionalCast(physical_plan, 
&table_reference)) {
-    return getNumDistinctValues(attribute, table_reference);
-  }
-
-  E::PredicatePtr filter_predicate = nullptr;
-  switch (physical_plan->getPhysicalType()) {
-    case P::PhysicalType::kSelection:
-      filter_predicate =
-          std::static_pointer_cast<const 
P::Selection>(physical_plan)->filter_predicate();
-    case P::PhysicalType::kAggregate:
-      filter_predicate =
-          std::static_pointer_cast<const 
P::Aggregate>(physical_plan)->filter_predicate();
-    case P::PhysicalType::kHashJoin:
-      filter_predicate =
-          std::static_pointer_cast<const 
P::HashJoin>(physical_plan)->residual_predicate();
-    case P::PhysicalType::kNestedLoopsJoin:
-      filter_predicate =
-          std::static_pointer_cast<const 
P::NestedLoopsJoin>(physical_plan)->join_predicate();
-    default:
-      break;
-  }
-
-  double filter_selectivity = 1.0;
-  if (filter_predicate != nullptr) {
-    filter_selectivity = estimateSelectivityForPredicate(filter_predicate, 
physical_plan);
+    return getNumDistinctValues(attribute_id, table_reference);
   }
 
+  double filter_selectivity = 
estimateSelectivityForFilterPredicate(physical_plan);
   switch (physical_plan->getPhysicalType()) {
     case P::PhysicalType::kSelection:  // Fall through
     case P::PhysicalType::kAggregate: {
-      if (E::ContainsExpression(physical_plan->getOutputAttributes(), 
attribute)) {
-        std::size_t child_num_distinct_values =
-            estimateNumDistinctValues(attribute, physical_plan->children()[0]);
+      const P::PhysicalPtr &child = physical_plan->children()[0];
+      if (E::ContainsExprId(child->getOutputAttributes(), attribute_id)) {
+        const std::size_t child_num_distinct_values =
+            estimateNumDistinctValues(attribute_id, child);
         return static_cast<std::size_t>(
             child_num_distinct_values * filter_selectivity + 0.5);
       }
@@ -219,19 +206,19 @@ std::size_t 
StarSchemaSimpleCostModel::estimateNumDistinctValues(
     case P::PhysicalType::kHashJoin: {
       const P::HashJoinPtr &hash_join =
           std::static_pointer_cast<const P::HashJoin>(physical_plan);
-      if (E::ContainsExpression(hash_join->left()->getOutputAttributes(), 
attribute)) {
+      if (E::ContainsExprId(hash_join->left()->getOutputAttributes(), 
attribute_id)) {
         const std::size_t left_child_num_distinct_values =
-            estimateNumDistinctValues(attribute, hash_join->left());
+            estimateNumDistinctValues(attribute_id, hash_join->left());
         const double right_child_selectivity =
             estimateSelectivity(hash_join->right());
         return static_cast<std::size_t>(
             left_child_num_distinct_values * right_child_selectivity * 
filter_selectivity + 0.5);
       }
-      if (E::ContainsExpression(hash_join->right()->getOutputAttributes(), 
attribute)) {
+      if (E::ContainsExprId(hash_join->right()->getOutputAttributes(), 
attribute_id)) {
         const std::size_t right_child_num_distinct_values =
-            estimateNumDistinctValues(attribute, hash_join->left());
+            estimateNumDistinctValues(attribute_id, hash_join->right());
         const double left_child_selectivity =
-            estimateSelectivity(hash_join->right());
+            estimateSelectivity(hash_join->left());
         return static_cast<std::size_t>(
             right_child_num_distinct_values * left_child_selectivity * 
filter_selectivity + 0.5);
       }
@@ -240,8 +227,7 @@ std::size_t 
StarSchemaSimpleCostModel::estimateNumDistinctValues(
       break;
   }
 
-  return static_cast<std::size_t>(
-      estimateCardinality(physical_plan) * filter_selectivity + 0.5);
+  return 16uL;
 }
 
 double StarSchemaSimpleCostModel::estimateSelectivity(
@@ -291,6 +277,38 @@ double StarSchemaSimpleCostModel::estimateSelectivity(
   return 1.0;
 }
 
+double StarSchemaSimpleCostModel::estimateSelectivityForFilterPredicate(
+    const physical::PhysicalPtr &physical_plan) {
+  E::PredicatePtr filter_predicate = nullptr;
+  switch (physical_plan->getPhysicalType()) {
+    case P::PhysicalType::kSelection:
+      filter_predicate =
+          std::static_pointer_cast<const 
P::Selection>(physical_plan)->filter_predicate();
+      break;
+    case P::PhysicalType::kAggregate:
+      filter_predicate =
+          std::static_pointer_cast<const 
P::Aggregate>(physical_plan)->filter_predicate();
+      break;
+    case P::PhysicalType::kHashJoin:
+      filter_predicate =
+          std::static_pointer_cast<const 
P::HashJoin>(physical_plan)->residual_predicate();
+      break;
+    case P::PhysicalType::kNestedLoopsJoin:
+      filter_predicate =
+          std::static_pointer_cast<const 
P::NestedLoopsJoin>(physical_plan)->join_predicate();
+      break;
+    default:
+      break;
+  }
+
+  if (filter_predicate == nullptr) {
+    return 1.0;
+  } else {
+    return estimateSelectivityForPredicate(filter_predicate, physical_plan);
+  }
+}
+
+
 double StarSchemaSimpleCostModel::estimateSelectivityForPredicate(
     const expressions::PredicatePtr &filter_predicate,
     const P::PhysicalPtr &physical_plan) {
@@ -303,7 +321,7 @@ double 
StarSchemaSimpleCostModel::estimateSelectivityForPredicate(
       // Case 1 - Number of distinct values statistics available
       //   Case 1.1 - Equality comparison: 1.0 / num_distinct_values
       //   Case 1.2 - Otherwise: 0.1
-      // Case 2 - Otherwise: 0.25
+      // Case 2 - Otherwise: 0.2
       const E::ComparisonExpressionPtr &comparison_expression =
           std::static_pointer_cast<const 
E::ComparisonExpression>(filter_predicate);
       E::AttributeReferencePtr attr;
@@ -311,25 +329,26 @@ double 
StarSchemaSimpleCostModel::estimateSelectivityForPredicate(
            E::SomeScalarLiteral::Matches(comparison_expression->right())) ||
           
(E::SomeAttributeReference::MatchesWithConditionalCast(comparison_expression->right(),
 &attr) &&
            E::SomeScalarLiteral::Matches(comparison_expression->left()))) {
-        if (comparison_expression->isEqualityComparisonPredicate()) {
-          for (const auto &child : physical_plan->children()) {
-            if (E::ContainsExpression(child->getOutputAttributes(), attr)) {
-              return 1.0 / estimateNumDistinctValues(attr, child);
+        for (const auto &child : physical_plan->children()) {
+          if (E::ContainsExprId(child->getOutputAttributes(), attr->id())) {
+            const std::size_t child_num_distinct_values = 
estimateNumDistinctValues(attr->id(), child);
+            if (comparison_expression->isEqualityComparisonPredicate()) {
+              return 1.0 / child_num_distinct_values;
+            } else {
+              return 1.0 / std::max(std::min(child_num_distinct_values / 
100.0, 10.0), 2.0);
             }
           }
-          return 0.1;
         }
+        return 0.1;
       }
-      return 0.25;
+      return 0.5;
     }
     case E::ExpressionType::kLogicalAnd: {
       const E::LogicalAndPtr &logical_and =
           std::static_pointer_cast<const E::LogicalAnd>(filter_predicate);
       double selectivity = 1.0;
       for (const auto &predicate : logical_and->operands()) {
-        selectivity =
-            std::min(selectivity,
-                     estimateSelectivityForPredicate(predicate, 
physical_plan));
+        selectivity = selectivity * estimateSelectivityForPredicate(predicate, 
physical_plan);
       }
       return selectivity;
     }
@@ -348,30 +367,13 @@ double 
StarSchemaSimpleCostModel::estimateSelectivityForPredicate(
   return 1.0;
 }
 
-std::size_t StarSchemaSimpleCostModel::estimateDomainCardinality(
-    const E::AttributeReferencePtr &attribute,
-    const P::PhysicalPtr &physical_plan) {
-  DCHECK(E::ContainsExpression(physical_plan->getOutputAttributes(), 
attribute));
-
-  if (physical_plan->getPhysicalType() == P::PhysicalType::kTableReference) {
-    return getNumDistinctValues(
-        attribute, std::static_pointer_cast<const 
P::TableReference>(physical_plan));
-  }
-  for (const auto &child : physical_plan->children()) {
-    if (E::ContainsExpression(child->getOutputAttributes(), attribute)) {
-      return estimateDomainCardinality(attribute, child);
-    }
-  }
-  return estimateCardinality(physical_plan);
-}
-
 std::size_t StarSchemaSimpleCostModel::getNumDistinctValues(
-    const E::AttributeReferencePtr &attribute,
+    const E::ExprId attribute_id,
     const P::TableReferencePtr &table_reference) {
   const CatalogRelation &relation = *table_reference->relation();
   const std::vector<E::AttributeReferencePtr> &attributes = 
table_reference->attribute_list();
   for (std::size_t i = 0; i < attributes.size(); ++i) {
-    if (attribute->equals(attributes[i])) {
+    if (attributes[i]->id() == attribute_id) {
       std::size_t num_distinct_values = 
relation.getStatistics().getNumDistinctValues(i);
       if (num_distinct_values > 0) {
         return num_distinct_values;

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/3b4ef8c9/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp
----------------------------------------------------------------------
diff --git a/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp 
b/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp
index 17f970d..a607a4f 100644
--- a/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp
+++ b/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp
@@ -70,11 +70,11 @@ class StarSchemaSimpleCostModel : public CostModel {
   /**
    * @brief Estimate the number of distinct values of an attribute in a 
relation.
    * 
-   * @param attribute The attribute to be estimated.
+   * @param attribute_id The expression id of the target attribute.
    * @param physical_plan The physical plan of the attribute's relation.
    * @return The estimated number of distinct values for the attribute.
    */
-  std::size_t estimateNumDistinctValues(const 
expressions::AttributeReferencePtr &attribute,
+  std::size_t estimateNumDistinctValues(const expressions::ExprId attribute_id,
                                         const physical::PhysicalPtr 
&physical_plan);
 
   /**
@@ -86,6 +86,17 @@ class StarSchemaSimpleCostModel : public CostModel {
    */
   double estimateSelectivity(const physical::PhysicalPtr &physical_plan);
 
+  /**
+   * @brief Estimate the filter predicate's selectivity if it is present in
+   *        the input plan's root node.
+   *
+   * @param physical_plan The input physical plan.
+   * @return The estimated selectivity of the filter predicate if physical_plan
+   *         has such a filter predicate; 1.0 otherwise.
+   */
+  double estimateSelectivityForFilterPredicate(
+      const physical::PhysicalPtr &physical_plan);
+
  private:
   std::size_t estimateCardinalityForAggregate(
       const physical::AggregatePtr &physical_plan);
@@ -120,15 +131,12 @@ class StarSchemaSimpleCostModel : public CostModel {
 
   const std::vector<physical::PhysicalPtr> &shared_subplans_;
 
-  // Estimate the number of distinct values in the attribute's domain.
-  std::size_t estimateDomainCardinality(const 
expressions::AttributeReferencePtr &attribute,
-                                        const physical::PhysicalPtr 
&physical_plan);
-
   // Get the number of distinct values of an attribute in the table reference.
   // If the stat is not avaiable, simply returns the table's cardinality.
-  std::size_t getNumDistinctValues(const expressions::AttributeReferencePtr 
&attribute,
+  std::size_t getNumDistinctValues(const expressions::ExprId attribute_id,
                                    const physical::TableReferencePtr 
&table_reference);
 
+
   DISALLOW_COPY_AND_ASSIGN(StarSchemaSimpleCostModel);
 };
 

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/3b4ef8c9/query_optimizer/expressions/ExpressionUtil.hpp
----------------------------------------------------------------------
diff --git a/query_optimizer/expressions/ExpressionUtil.hpp 
b/query_optimizer/expressions/ExpressionUtil.hpp
index e9a4067..422d5ab 100644
--- a/query_optimizer/expressions/ExpressionUtil.hpp
+++ b/query_optimizer/expressions/ExpressionUtil.hpp
@@ -95,6 +95,25 @@ bool ContainsExpression(
 }
 
 /**
+ * @brief Checks whether expr_id equals any expression's id in \p expressions.
+ *
+ * @param expressions A list of expressions to be checked with.
+ * @param expr_id The ExprId that is to be checked if it is in \p expressions.
+ * @return True if \p expr_id is contained by \p expressions.
+ */
+template <class NamedExpressionType>
+bool ContainsExprId(
+    const std::vector<std::shared_ptr<const NamedExpressionType>> &expressions,
+    const ExprId expr_id) {
+  for (const std::shared_ptr<const NamedExpressionType> &expression : 
expressions) {
+    if (expression->id() == expr_id) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/**
  * @brief Checks whether \p left is a subset of \p right.
  *
  * @param left The left operand of the subset operator (i.e. the one that may 
be

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/3b4ef8c9/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.cpp
----------------------------------------------------------------------
diff --git a/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.cpp 
b/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.cpp
index 946d316..0bd35d4 100644
--- a/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.cpp
+++ b/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.cpp
@@ -283,6 +283,7 @@ physical::PhysicalPtr 
StarSchemaHashJoinOrderOptimization::generatePlan(
       // TODO(jianqiao): Cache the estimated cardinality for each plan in cost
       // model to avoid duplicated estimation.
       second_table_info->estimated_cardinality = 
cost_model_->estimateCardinality(output);
+      second_table_info->estimated_selectivity = 
cost_model_->estimateSelectivity(output);
 
       
second_table_info->join_attribute_pairs.insert(first_table_info->join_attribute_pairs.begin(),
                                                      
first_table_info->join_attribute_pairs.end());

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/3b4ef8c9/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp
----------------------------------------------------------------------
diff --git a/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp 
b/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp
index 4d6765c..f832f00 100644
--- a/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp
+++ b/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp
@@ -103,10 +103,6 @@ class StarSchemaHashJoinOrderOptimization : public 
Rule<physical::Physical> {
 
       if (lhs->estimated_selectivity < rhs->estimated_selectivity) {
         return !swapped;
-      } else if (lhs->estimated_cardinality < 1000u &&
-                 rhs->estimated_cardinality > 10000u &&
-                 lhs->estimated_selectivity < rhs->estimated_selectivity * 
1.5) {
-        return !swapped;
       } else if (lhs->estimated_selectivity > rhs->estimated_selectivity) {
         return swapped;
       } else if (lhs->estimated_cardinality != rhs->estimated_cardinality) {

Reply via email to