Repository: incubator-quickstep
Updated Branches:
  refs/heads/estimate-num-distinct-values [created] 2e689f126


Initial commit


Project: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-quickstep/commit/2e689f12
Tree: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/tree/2e689f12
Diff: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/diff/2e689f12

Branch: refs/heads/estimate-num-distinct-values
Commit: 2e689f12606b5b568addbeb3d6226779cd1bc7c8
Parents: 0820e3d
Author: Jianqiao Zhu <jianq...@cs.wisc.edu>
Authored: Mon Oct 3 23:58:30 2016 -0500
Committer: Jianqiao Zhu <jianq...@cs.wisc.edu>
Committed: Mon Oct 3 23:58:30 2016 -0500

----------------------------------------------------------------------
 .../cost_model/StarSchemaSimpleCostModel.cpp    | 223 ++++++++++++++-----
 .../cost_model/StarSchemaSimpleCostModel.hpp    |  52 +++--
 2 files changed, 207 insertions(+), 68 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/2e689f12/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp
----------------------------------------------------------------------
diff --git a/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp 
b/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp
index 911a765..c45d4f2 100644
--- a/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp
+++ b/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp
@@ -29,6 +29,7 @@
 #include "query_optimizer/expressions/ComparisonExpression.hpp"
 #include "query_optimizer/expressions/ExprId.hpp"
 #include "query_optimizer/expressions/ExpressionType.hpp"
+#include "query_optimizer/expressions/ExpressionUtil.hpp"
 #include "query_optimizer/expressions/LogicalAnd.hpp"
 #include "query_optimizer/expressions/LogicalOr.hpp"
 #include "query_optimizer/expressions/Predicate.hpp"
@@ -36,6 +37,7 @@
 #include "query_optimizer/physical/Aggregate.hpp"
 #include "query_optimizer/physical/NestedLoopsJoin.hpp"
 #include "query_optimizer/physical/HashJoin.hpp"
+#include "query_optimizer/physical/PatternMatcher.hpp"
 #include "query_optimizer/physical/Physical.hpp"
 #include "query_optimizer/physical/PhysicalType.hpp"
 #include "query_optimizer/physical/Selection.hpp"
@@ -85,8 +87,8 @@ std::size_t StarSchemaSimpleCostModel::estimateCardinality(
           shared_subplans_[shared_subplan_reference->subplan_id()]);
     }
     case P::PhysicalType::kSort:
-      return estimateCardinality(
-          std::static_pointer_cast<const P::Sort>(physical_plan)->input());
+      return estimateCardinalityForSort(
+          std::static_pointer_cast<const P::Sort>(physical_plan));
     case P::PhysicalType::kWindowAggregate:
       return estimateCardinalityForWindowAggregate(
           std::static_pointer_cast<const P::WindowAggregate>(physical_plan));
@@ -111,9 +113,15 @@ std::size_t 
StarSchemaSimpleCostModel::estimateCardinalityForTableReference(
 
 std::size_t StarSchemaSimpleCostModel::estimateCardinalityForSelection(
     const P::SelectionPtr &physical_plan) {
-  double selectivity = estimateSelectivityForSelection(physical_plan);
-  return 
std::max(static_cast<std::size_t>(estimateCardinality(physical_plan->input()) * 
selectivity),
-                  static_cast<std::size_t>(1));
+  double selectivity = estimateSelectivity(physical_plan);
+  return static_cast<std::size_t>(estimateCardinality(physical_plan->input()) 
* selectivity);
+}
+
+std::size_t StarSchemaSimpleCostModel::estimateCardinalityForSort(
+    const physical::SortPtr &physical_plan) {
+  std::size_t cardinality = estimateCardinality(
+      std::static_pointer_cast<const P::Sort>(physical_plan)->input());
+  return std::min(cardinality, 
static_cast<std::size_t>(physical_plan->limit()));
 }
 
 std::size_t StarSchemaSimpleCostModel::estimateCardinalityForTableGenerator(
@@ -127,8 +135,8 @@ std::size_t 
StarSchemaSimpleCostModel::estimateCardinalityForHashJoin(
   std::size_t right_cardinality = estimateCardinality(physical_plan->right());
   double left_selectivity = estimateSelectivity(physical_plan->left());
   double right_selectivity = estimateSelectivity(physical_plan->right());
-  return std::max(static_cast<std::size_t>(left_cardinality * 
right_selectivity) + 1,
-                  static_cast<std::size_t>(right_cardinality * 
left_selectivity) + 1);
+  return std::min(static_cast<std::size_t>(left_cardinality * 
right_selectivity + 0.5),
+                  static_cast<std::size_t>(right_cardinality * 
left_selectivity + 0.5));
 }
 
 std::size_t StarSchemaSimpleCostModel::estimateCardinalityForNestedLoopsJoin(
@@ -142,8 +150,21 @@ std::size_t 
StarSchemaSimpleCostModel::estimateCardinalityForAggregate(
   if (physical_plan->grouping_expressions().empty()) {
     return 1;
   }
-  return std::max(static_cast<std::size_t>(1),
-                  estimateCardinality(physical_plan->input()) / 10);
+
+  std::size_t estimated_num_groups = 1;
+  for (const auto &expr : physical_plan->grouping_expressions()) {
+    E::AttributeReferencePtr attr;
+    if (E::SomeAttributeReference::MatchesWithConditionalCast(expr, &attr)) {
+      estimated_num_groups *=
+          std::max(1uL, estimateNumDistinctValues(attr, 
physical_plan->input()));
+    } else {
+      // TODO(jianqiao): implement estimateNumDistinctValues() for expressions.
+      return estimateCardinality(physical_plan) /10;
+    }
+  }
+  estimated_num_groups =
+      std::min(estimated_num_groups, estimateCardinality(physical_plan));
+  return estimated_num_groups;
 }
 
 std::size_t StarSchemaSimpleCostModel::estimateCardinalityForWindowAggregate(
@@ -151,24 +172,107 @@ std::size_t 
StarSchemaSimpleCostModel::estimateCardinalityForWindowAggregate(
   return estimateCardinality(physical_plan->input());
 }
 
+std::size_t StarSchemaSimpleCostModel::estimateNumDistinctValues(
+    const expressions::AttributeReferencePtr &attribute,
+    const physical::PhysicalPtr &physical_plan) {
+  DCHECK(E::ContainsExpression(physical_plan->getOutputAttributes(), 
attribute));
+
+  P::TableReferencePtr table_reference;
+  if (P::SomeTableReference::MatchesWithConditionalCast(physical_plan, 
&table_reference)) {
+    return getNumDistinctValues(attribute, table_reference);
+  }
+
+  E::PredicatePtr filter_predicate = nullptr;
+  switch (physical_plan->getPhysicalType()) {
+    case P::PhysicalType::kSelection:
+      filter_predicate =
+          std::static_pointer_cast<const 
P::Selection>(physical_plan)->filter_predicate();
+    case P::PhysicalType::kAggregate:
+      filter_predicate =
+          std::static_pointer_cast<const 
P::Aggregate>(physical_plan)->filter_predicate();
+    case P::PhysicalType::kHashJoin:
+      filter_predicate =
+          std::static_pointer_cast<const 
P::HashJoin>(physical_plan)->residual_predicate();
+    case P::PhysicalType::kNestedLoopsJoin:
+      filter_predicate =
+          std::static_pointer_cast<const 
P::NestedLoopsJoin>(physical_plan)->join_predicate();
+    default:
+      break;
+  }
+
+  double filter_selectivity = 1.0;
+  if (filter_predicate != nullptr) {
+    filter_selectivity = estimateSelectivityForPredicate(filter_predicate, 
physical_plan);
+  }
+
+  switch (physical_plan->getPhysicalType()) {
+    case P::PhysicalType::kSelection:  // Fall through
+    case P::PhysicalType::kAggregate: {
+      if (E::ContainsExpression(physical_plan->getOutputAttributes(), 
attribute)) {
+        std::size_t child_num_distinct_values =
+            estimateNumDistinctValues(attribute, physical_plan->children()[0]);
+        return static_cast<std::size_t>(
+            child_num_distinct_values * filter_selectivity + 0.5);
+      }
+      break;
+    }
+    case P::PhysicalType::kHashJoin: {
+      const P::HashJoinPtr &hash_join =
+          std::static_pointer_cast<const P::HashJoin>(physical_plan);
+      if (E::ContainsExpression(hash_join->left()->getOutputAttributes(), 
attribute)) {
+        const std::size_t left_child_num_distinct_values =
+            estimateNumDistinctValues(attribute, hash_join->left());
+        const double right_child_selectivity =
+            estimateSelectivity(hash_join->right());
+        return static_cast<std::size_t>(
+            left_child_num_distinct_values * right_child_selectivity * 
filter_selectivity + 0.5);
+      }
+      if (E::ContainsExpression(hash_join->right()->getOutputAttributes(), 
attribute)) {
+        const std::size_t right_child_num_distinct_values =
+            estimateNumDistinctValues(attribute, hash_join->left());
+        const double left_child_selectivity =
+            estimateSelectivity(hash_join->right());
+        return static_cast<std::size_t>(
+            right_child_num_distinct_values * left_child_selectivity * 
filter_selectivity + 0.5);
+      }
+    }
+    default:
+      break;
+  }
+
+  return static_cast<std::size_t>(
+      estimateCardinality(physical_plan) * filter_selectivity + 0.5);
+}
+
 double StarSchemaSimpleCostModel::estimateSelectivity(
     const physical::PhysicalPtr &physical_plan) {
   switch (physical_plan->getPhysicalType()) {
     case P::PhysicalType::kSelection: {
-      return estimateSelectivityForSelection(
-          std::static_pointer_cast<const P::Selection>(physical_plan));
+      const P::SelectionPtr &selection =
+          std::static_pointer_cast<const P::Selection>(physical_plan);
+      double filter_selectivity =
+          estimateSelectivityForPredicate(selection->filter_predicate(), 
selection);
+      double child_selectivity = estimateSelectivity(selection->input());
+      return filter_selectivity * child_selectivity;
     }
     case P::PhysicalType::kHashJoin: {
       const P::HashJoinPtr &hash_join =
           std::static_pointer_cast<const P::HashJoin>(physical_plan);
-      return std::min(estimateSelectivity(hash_join->left()),
-                      estimateSelectivity(hash_join->right()));
+      double filter_selectivity =
+          estimateSelectivityForPredicate(hash_join->residual_predicate(), 
hash_join);
+      double child_selectivity =
+          estimateSelectivity(hash_join->left()) * 
estimateSelectivity(hash_join->right());
+      return filter_selectivity * child_selectivity;
     }
     case P::PhysicalType::kNestedLoopsJoin: {
       const P::NestedLoopsJoinPtr &nested_loop_join =
           std::static_pointer_cast<const P::NestedLoopsJoin>(physical_plan);
-      return std::min(estimateSelectivity(nested_loop_join->left()),
-                      estimateSelectivity(nested_loop_join->right()));
+      double filter_selectivity =
+          estimateSelectivityForPredicate(nested_loop_join->join_predicate(), 
nested_loop_join);
+      double child_selectivity = std::min(
+          estimateSelectivity(nested_loop_join->left()),
+          estimateSelectivity(nested_loop_join->right()));
+      return filter_selectivity * child_selectivity;
     }
     case P::PhysicalType::kSharedSubplanReference: {
       const P::SharedSubplanReferencePtr shared_subplan_reference =
@@ -177,36 +281,19 @@ double StarSchemaSimpleCostModel::estimateSelectivity(
           shared_subplans_[shared_subplan_reference->subplan_id()]);
     }
     default:
-      return 1.0;
+      break;
   }
-}
 
-double StarSchemaSimpleCostModel::estimateSelectivityForSelection(
-    const physical::SelectionPtr &physical_plan) {
-  const E::PredicatePtr &filter_predicate = physical_plan->filter_predicate();
-
-  // If the subplan is a table reference, gather the number of distinct values
-  // statistics for each column (attribute).
-  std::unordered_map<E::ExprId, std::size_t> num_distinct_values_map;
-  if (physical_plan->input()->getPhysicalType() == 
P::PhysicalType::kTableReference) {
-    const P::TableReferencePtr &table_reference =
-        std::static_pointer_cast<const 
P::TableReference>(physical_plan->input());
-    const CatalogRelation &relation = *table_reference->relation();
-    const std::vector<E::AttributeReferencePtr> &attributes = 
table_reference->attribute_list();
-    for (std::size_t i = 0; i < attributes.size(); ++i) {
-      std::size_t num_distinct_values = 
relation.getStatistics().getNumDistinctValues(i);
-      if (num_distinct_values > 0) {
-        num_distinct_values_map[attributes[i]->id()] = num_distinct_values;
-      }
-    }
+  if (physical_plan->getNumChildren() == 1) {
+    return estimateSelectivity(physical_plan->children()[0]);
   }
 
-  return estimateSelectivityForPredicate(num_distinct_values_map, 
filter_predicate);
+  return 1.0;
 }
 
 double StarSchemaSimpleCostModel::estimateSelectivityForPredicate(
-    const std::unordered_map<expressions::ExprId, std::size_t> 
&num_distinct_values_map,
-    const expressions::PredicatePtr &filter_predicate) {
+    const expressions::PredicatePtr &filter_predicate,
+    const P::PhysicalPtr &physical_plan) {
   if (filter_predicate == nullptr) {
     return 1.0;
   }
@@ -215,10 +302,8 @@ double 
StarSchemaSimpleCostModel::estimateSelectivityForPredicate(
     case E::ExpressionType::kComparisonExpression: {
       // Case 1 - Number of distinct values statistics available
       //   Case 1.1 - Equality comparison: 1.0 / num_distinct_values
-      //   Case 1.2 - Otherwise: 5.0 / num_distinct_values
-      // Case 2 - Number of distinct values statistics not available
-      //   Case 2.1 - Equality comparison: 0.1
-      //   Case 2.2 - Otherwise: 0.5
+      //   Case 1.2 - Otherwise: 0.1
+      // Case 2 - Otherwise: 0.25
       const E::ComparisonExpressionPtr &comparison_expression =
           std::static_pointer_cast<const 
E::ComparisonExpression>(filter_predicate);
       E::AttributeReferencePtr attr;
@@ -226,16 +311,16 @@ double 
StarSchemaSimpleCostModel::estimateSelectivityForPredicate(
            E::SomeScalarLiteral::Matches(comparison_expression->right())) ||
           
(E::SomeAttributeReference::MatchesWithConditionalCast(comparison_expression->right(),
 &attr) &&
            E::SomeScalarLiteral::Matches(comparison_expression->left()))) {
-        const auto it = num_distinct_values_map.find(attr->id());
-        if (it != num_distinct_values_map.end() && it->second > 0) {
-          double unit_selectivity = 1.0 / it->second;
-          return comparison_expression->isEqualityComparisonPredicate()
-                     ? unit_selectivity
-                     : std::min(0.5, unit_selectivity * 5.0);
+        if (comparison_expression->isEqualityComparisonPredicate()) {
+          for (const auto &child : physical_plan->children()) {
+            if (E::ContainsExpression(child->getOutputAttributes(), attr)) {
+              return 1.0 / estimateNumDistinctValues(attr, child);
+            }
+          }
+          return 0.1;
         }
       }
-
-      return comparison_expression->isEqualityComparisonPredicate() ? 0.1 : 
0.5;
+      return 0.25;
     }
     case E::ExpressionType::kLogicalAnd: {
       const E::LogicalAndPtr &logical_and =
@@ -244,7 +329,7 @@ double 
StarSchemaSimpleCostModel::estimateSelectivityForPredicate(
       for (const auto &predicate : logical_and->operands()) {
         selectivity =
             std::min(selectivity,
-                     estimateSelectivityForPredicate(num_distinct_values_map, 
predicate));
+                     estimateSelectivityForPredicate(predicate, 
physical_plan));
       }
       return selectivity;
     }
@@ -253,7 +338,7 @@ double 
StarSchemaSimpleCostModel::estimateSelectivityForPredicate(
           std::static_pointer_cast<const E::LogicalOr>(filter_predicate);
       double selectivity = 0;
       for (const auto &predicate : logical_or->operands()) {
-        selectivity += 
estimateSelectivityForPredicate(num_distinct_values_map, predicate);
+        selectivity += estimateSelectivityForPredicate(predicate, 
physical_plan);
       }
       return std::min(selectivity, 1.0);
     }
@@ -263,6 +348,40 @@ double 
StarSchemaSimpleCostModel::estimateSelectivityForPredicate(
   return 1.0;
 }
 
+std::size_t StarSchemaSimpleCostModel::estimateDomainCardinality(
+    const E::AttributeReferencePtr &attribute,
+    const P::PhysicalPtr &physical_plan) {
+  DCHECK(E::ContainsExpression(physical_plan->getOutputAttributes(), 
attribute));
+
+  if (physical_plan->getPhysicalType() == P::PhysicalType::kTableReference) {
+    return getNumDistinctValues(
+        attribute, std::static_pointer_cast<const 
P::TableReference>(physical_plan));
+  }
+  for (const auto &child : physical_plan->children()) {
+    if (E::ContainsExpression(child->getOutputAttributes(), attribute)) {
+      return estimateDomainCardinality(attribute, child);
+    }
+  }
+  return estimateCardinality(physical_plan);
+}
+
+std::size_t StarSchemaSimpleCostModel::getNumDistinctValues(
+    const E::AttributeReferencePtr &attribute,
+    const P::TableReferencePtr &table_reference) {
+  const CatalogRelation &relation = *table_reference->relation();
+  const std::vector<E::AttributeReferencePtr> &attributes = 
table_reference->attribute_list();
+  for (std::size_t i = 0; i < attributes.size(); ++i) {
+    if (attribute->equals(attributes[i])) {
+      std::size_t num_distinct_values = 
relation.getStatistics().getNumDistinctValues(i);
+      if (num_distinct_values > 0) {
+        return num_distinct_values;
+      }
+      break;
+    }
+  }
+  return estimateCardinalityForTableReference(table_reference);
+}
+
 }  // namespace cost
 }  // namespace optimizer
 }  // namespace quickstep

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/2e689f12/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp
----------------------------------------------------------------------
diff --git a/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp 
b/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp
index 4314b92..17f970d 100644
--- a/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp
+++ b/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp
@@ -32,6 +32,7 @@
 #include "query_optimizer/physical/HashJoin.hpp"
 #include "query_optimizer/physical/Physical.hpp"
 #include "query_optimizer/physical/Selection.hpp"
+#include "query_optimizer/physical/Sort.hpp"
 #include "query_optimizer/physical/TableGenerator.hpp"
 #include "query_optimizer/physical/TableReference.hpp"
 #include "query_optimizer/physical/TopLevelPlan.hpp"
@@ -67,6 +68,16 @@ class StarSchemaSimpleCostModel : public CostModel {
       const physical::PhysicalPtr &physical_plan) override;
 
   /**
+   * @brief Estimate the number of distinct values of an attribute in a 
relation.
+   * 
+   * @param attribute The attribute to be estimated.
+   * @param physical_plan The physical plan of the attribute's relation.
+   * @return The estimated number of distinct values for the attribute.
+   */
+  std::size_t estimateNumDistinctValues(const 
expressions::AttributeReferencePtr &attribute,
+                                        const physical::PhysicalPtr 
&physical_plan);
+
+  /**
    * @brief Estimate the "selectivity" of a physical plan under the assumption
    *        that it acts as a filtered dimension table in a hash join.
    *
@@ -76,39 +87,48 @@ class StarSchemaSimpleCostModel : public CostModel {
   double estimateSelectivity(const physical::PhysicalPtr &physical_plan);
 
  private:
-  std::size_t estimateCardinalityForTopLevelPlan(
-      const physical::TopLevelPlanPtr &physical_plan);
+  std::size_t estimateCardinalityForAggregate(
+      const physical::AggregatePtr &physical_plan);
 
-  std::size_t estimateCardinalityForTableReference(
-      const physical::TableReferencePtr &physical_plan);
+  std::size_t estimateCardinalityForHashJoin(
+      const physical::HashJoinPtr &physical_plan);
+
+  std::size_t estimateCardinalityForNestedLoopsJoin(
+      const physical::NestedLoopsJoinPtr &physical_plan);
 
   std::size_t estimateCardinalityForSelection(
       const physical::SelectionPtr &physical_plan);
 
+  std::size_t estimateCardinalityForSort(
+      const physical::SortPtr &physical_plan);
+
   std::size_t estimateCardinalityForTableGenerator(
       const physical::TableGeneratorPtr &physical_plan);
 
-  std::size_t estimateCardinalityForHashJoin(
-      const physical::HashJoinPtr &physical_plan);
-
-  std::size_t estimateCardinalityForNestedLoopsJoin(
-      const physical::NestedLoopsJoinPtr &physical_plan);
+  std::size_t estimateCardinalityForTableReference(
+      const physical::TableReferencePtr &physical_plan);
 
-  std::size_t estimateCardinalityForAggregate(
-      const physical::AggregatePtr &physical_plan);
+  std::size_t estimateCardinalityForTopLevelPlan(
+      const physical::TopLevelPlanPtr &physical_plan);
 
   std::size_t estimateCardinalityForWindowAggregate(
       const physical::WindowAggregatePtr &physical_plan);
 
-  double estimateSelectivityForSelection(
-      const physical::SelectionPtr &physical_plan);
-
   double estimateSelectivityForPredicate(
-      const std::unordered_map<expressions::ExprId, std::size_t> 
&num_distinct_values_map,
-      const expressions::PredicatePtr &filter_predicate);
+      const expressions::PredicatePtr &filter_predicate,
+      const physical::PhysicalPtr &physical_plan);
 
   const std::vector<physical::PhysicalPtr> &shared_subplans_;
 
+  // Estimate the number of distinct values in the attribute's domain.
+  std::size_t estimateDomainCardinality(const 
expressions::AttributeReferencePtr &attribute,
+                                        const physical::PhysicalPtr 
&physical_plan);
+
+  // Get the number of distinct values of an attribute in the table reference.
+  // If the stat is not avaiable, simply returns the table's cardinality.
+  std::size_t getNumDistinctValues(const expressions::AttributeReferencePtr 
&attribute,
+                                   const physical::TableReferencePtr 
&table_reference);
+
   DISALLOW_COPY_AND_ASSIGN(StarSchemaSimpleCostModel);
 };
 

Reply via email to