[kudu] branch master updated: [perf] Check range predicate first while evaluating Bloom filter predicate

bankim Fri, 15 May 2020 12:54:23 -0700

This is an automated email from the ASF dual-hosted git repository.

bankim pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git



The following commit(s) were added to refs/heads/master by this push:
     new 679c0b3  [perf] Check range predicate first while evaluating Bloom 
filter predicate
679c0b3 is described below

commit 679c0b37b1184a7c80f748207ae5980d1f4a6c57
Author: Bankim Bhavsar <[email protected]>
AuthorDate: Wed May 13 12:27:48 2020 -0700

    [perf] Check range predicate first while evaluating Bloom filter predicate
    
    Range predicates can be specified along with Bloom filter predicate
    for the same column. It's cheaper to check against range
    predicate and exit early if the column value is out of bounds
    compared to computing hash and then looking up the value in Bloom filter.
    
    This case is common when Impala pushes down Bloom filter
    predicate as it'll likely be accompained by min-max filter (i.e. range
    predicate) on the same column.
    
    Tests:
    Added a test case that scans against 1M column values with a range predicate
    and Bloom filter predicate. In one case, with a range predicate that yields
    no rows and other with a range predicate that yields all rows.
    
    Modified the test case to run against 100M rows on a 48 core
    Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz with 94GB of memory.
    
    Across iterations observed an improvement of 20-30% when the range predicate
    yields no rows preventing hash computation and Bloom filter lookup.
    Don't see any noticeable regression for the case where values are within
    range bounds.
    
    Without perf change:
     Counting rows with a range predicate less than the min value: real 0.953s 
user 0.001s sys 0.000s
     Counting rows with a range predicate that includes all values: real 0.767s 
user 0.001s sys 0.000s
    
     Counting rows with a range predicate less than the min value: real 0.899s 
user 0.000s sys 0.000s
     Counting rows with a range predicate that includes all values: real 0.775s 
user 0.000s sys 0.001s
    
     Counting rows with a range predicate less than the min value: real 0.983s 
user 0.000s sys 0.000s
     Counting rows with a range predicate that includes all values: real 0.832s 
user 0.001s sys 0.000s
    
    With perf change:
     Counting rows with a range predicate less than the min value: real 0.725s 
user 0.001s sys 0.000s
     Counting rows with a range predicate that includes all values: real 0.847s 
user 0.000s sys 0.000s
    
     Counting rows with a range predicate less than the min value: real 0.664s 
user 0.000s sys 0.000s
     Counting rows with a range predicate that includes all values: real 0.794s 
user 0.001s sys 0.000s
    
     Counting rows with a range predicate less than the min value: real 0.706s 
user 0.001s sys 0.000s
     Counting rows with a range predicate that includes all values: real 0.774s 
user 0.000s sys 0.000s
    
    Change-Id: I8451d6ddfe1fbdf307b3e9f2cc23a8d06e655ba3
    Reviewed-on: http://gerrit.cloudera.org:8080/15913
    Reviewed-by: Andrew Wong <[email protected]>
    Tested-by: Kudu Jenkins
---
 src/kudu/client/predicate-test.cc  | 183 +++++++++++++++++++++----------------
 src/kudu/common/column_predicate.h |  22 ++---
 2 files changed, 115 insertions(+), 90 deletions(-)

diff --git a/src/kudu/client/predicate-test.cc 
b/src/kudu/client/predicate-test.cc
index 6feb7dc..efc49f7 100644
--- a/src/kudu/client/predicate-test.cc
+++ b/src/kudu/client/predicate-test.cc
@@ -53,6 +53,7 @@
 #include "kudu/util/random_util.h"
 #include "kudu/util/slice.h"
 #include "kudu/util/status.h"
+#include "kudu/util/stopwatch.h"
 #include "kudu/util/test_macros.h"
 #include "kudu/util/test_util.h"
 
@@ -62,6 +63,7 @@ using std::string;
 using std::unique_ptr;
 using std::unordered_set;
 using std::vector;
+using strings::Substitute;
 
 namespace kudu {
 namespace client {
@@ -357,7 +359,7 @@ class PredicateTest : public KuduTest {
     ASSERT_EQ(values.size() + 1, CountRows(table, {}));
 
     for (T v : test_values) {
-      SCOPED_TRACE(strings::Substitute("test value: $0", v));
+      SCOPED_TRACE(Substitute("test value: $0", v));
 
       { // value = v
         int count = count_if(values.begin(), values.end(),
@@ -476,7 +478,7 @@ class PredicateTest : public KuduTest {
     test_values.emplace_back("a\1", 1);
 
     for (const string& v : test_values) {
-      SCOPED_TRACE(strings::Substitute("test value: '$0'", 
strings::CHexEscape(v)));
+      SCOPED_TRACE(Substitute("test value: '$0'", strings::CHexEscape(v)));
 
       { // value = v
         int count = count_if(values.begin(), values.end(),
@@ -889,7 +891,7 @@ TEST_F(PredicateTest, TestFloatPredicates) {
   ASSERT_EQ(values.size() + 1, CountRows(table, {}));
 
   for (float v : test_values) {
-    SCOPED_TRACE(strings::Substitute("test value: $0", v));
+    SCOPED_TRACE(Substitute("test value: $0", v));
 
     { // value = v
       int count = count_if(values.begin(), values.end(),
@@ -1019,7 +1021,7 @@ TEST_F(PredicateTest, TestDoublePredicates) {
   ASSERT_EQ(values.size() + 1, CountRows(table, {}));
 
   for (double v : test_values) {
-    SCOPED_TRACE(strings::Substitute("test value: $0", v));
+    SCOPED_TRACE(Substitute("test value: $0", v));
 
     { // value = v
       int count = count_if(values.begin(), values.end(),
@@ -1166,7 +1168,7 @@ TEST_F(PredicateTest, TestDecimalPredicates) {
   ASSERT_EQ(values.size() + 1, CountRows(table, {}));
 
   for (int128_t v : test_values) {
-    SCOPED_TRACE(strings::Substitute("test value: $0", v));
+    SCOPED_TRACE(Substitute("test value: $0", v));
 
     { // value = v
       int count = count_if(values.begin(), values.end(),
@@ -1276,40 +1278,39 @@ class BloomFilterPredicateTest : public PredicateTest {
  public:
   BloomFilterPredicateTest() : rand_(Random(SeedRandom())) {}
  protected:
-  // Number of values to be written to the table.
-  static constexpr int kNumAllValues = 100000;
-  // Subset of values from the table that'll be inserted in BloomFilter and 
searched against
-  // all values in the table.
-  static constexpr int kNumInclusiveValues = 10000;
-  // Values that are not present in the table.
-  static constexpr int kNumExclusiveValues = 10000;
-  // Number of false positives based on the number of values that'll be 
searched against.
-  static constexpr int kFalsePositives = kNumAllValues * 
kBloomFilterFalsePositiveProb;
-
   shared_ptr<KuduTable> table_;
   shared_ptr<KuduSession> session_;
   Random rand_;
   unordered_set<int32_t> all_values_;
   int32_t min_value_, max_value_;
   // Subset of "all_values_".
-  vector<int32_t> inclusive_values_;
+  vector<int32_t> included_values_;
   // Values that are not contained in "all_values_".
-  unordered_set<int32_t> exclusive_values_;
+  unordered_set<int32_t> excluded_values_;
+  // Number of false positives based on the number of values that'll be 
searched against.
+  int num_false_positive_values_;
 
-  void SetUp() override {
-    PredicateTest::SetUp();
+  // Initialize the members with the values that will be inserted to the table 
and Bloom filters.
+  // The table will have 'num_all_values' unique values, and we'll generate 
two Bloom filters:
+  // one with 'num_included_values' values from the table, and one with 
'num_excluded_values'
+  // values that aren't present in the table.
+  void Init(int num_all_values, int num_included_values, int 
num_excluded_values) {
+    ASSERT_LT(num_included_values, num_all_values);
+    ASSERT_LT(num_excluded_values, num_all_values);
 
     table_ = CreateAndOpenTable(KuduColumnSchema::INT32);
     session_ = CreateSession();
 
     const unordered_set<int32_t> empty_set;
-    all_values_ = CreateRandomUniqueIntegers<int32_t>(kNumAllValues, 
empty_set, &rand_);
+    all_values_ = CreateRandomUniqueIntegers<int32_t>(num_all_values, 
empty_set, &rand_);
     auto min_max_pair = std::minmax_element(all_values_.begin(), 
all_values_.end());
     min_value_ = *min_max_pair.first;
     max_value_ = *min_max_pair.second;
-    ReservoirSample(all_values_, kNumInclusiveValues, empty_set, &rand_, 
&inclusive_values_);
-    exclusive_values_ = 
CreateRandomUniqueIntegers<int32_t>(kNumExclusiveValues, all_values_,
-                                                            &rand_);
+    ReservoirSample(all_values_, num_included_values, empty_set, &rand_, 
&included_values_);
+    excluded_values_ = 
CreateRandomUniqueIntegers<int32_t>(num_excluded_values, all_values_,
+                                                           &rand_);
+    // NOLINTNEXTLINE(bugprone-narrowing-conversions)
+    num_false_positive_values_ = num_all_values * 
kBloomFilterFalsePositiveProb;
   }
 
   template<typename BloomFilterType, typename Collection>
@@ -1344,97 +1345,121 @@ class BloomFilterPredicateTest : public PredicateTest {
     ASSERT_OK(session_->Flush());
   }
 
-  // Combine supplied Bloom filter predicates that contains inclusive values
+  // Combine supplied Bloom filter predicates that contains included values
   // with Range predicate.
-  void TestWithRangePredicate(KuduPredicate* inclusive_predicate1,
-                              KuduPredicate* inclusive_predicate2) {
+  void TestWithRangePredicate(KuduPredicate* included_predicate1,
+                              KuduPredicate* included_predicate2) {
     auto* less_predicate = table_->NewComparisonPredicate("value", 
KuduPredicate::LESS,
                                                           
KuduValue::FromInt(min_value_));
-    int actual_count_less = CountRows(table_, {inclusive_predicate1, 
less_predicate });
+    int actual_count_less;
+    LOG_TIMING(INFO, Substitute("$0: Counting rows with a range predicate less 
than the min value",
+                                CURRENT_TEST_NAME())) {
+      actual_count_less = CountRows(table_, {included_predicate1, 
less_predicate});
+    }
     EXPECT_EQ(0, actual_count_less);
 
     auto* ge_predicate = table_->NewComparisonPredicate("value", 
KuduPredicate::GREATER_EQUAL,
                                                         
KuduValue::FromInt(min_value_));
     auto* le_predicate = table_->NewComparisonPredicate("value", 
KuduPredicate::LESS_EQUAL,
                                                         
KuduValue::FromInt(max_value_));
-    int actual_count_range = CountRows(table_,
-                                       { inclusive_predicate2, ge_predicate, 
le_predicate });
-    EXPECT_LE(inclusive_values_.size(), actual_count_range);
-    EXPECT_GE(inclusive_values_.size() + kFalsePositives, actual_count_range);
+    int actual_count_range;
+    LOG_TIMING(INFO, Substitute("$0: Counting rows with a range predicate that 
includes all values",
+                                CURRENT_TEST_NAME())) {
+      actual_count_range = CountRows(table_,
+                                     {included_predicate2, ge_predicate, 
le_predicate});
+    }
+    EXPECT_LE(included_values_.size(), actual_count_range);
+    EXPECT_GE(included_values_.size() + num_false_positive_values_, 
actual_count_range);
   }
 };
 
-// Though this static constexpr expression is initialized in the class 
declaration, it needs to be
-// defined because it involves some computation.
-constexpr int BloomFilterPredicateTest::kFalsePositives;
-
 TEST_F(BloomFilterPredicateTest, TestKuduBloomFilterPredicate) {
-  KuduBloomFilter* inclusive_bf = 
CreateBloomFilterWithValues(inclusive_values_);
-  KuduBloomFilter* exclusive_bf = 
CreateBloomFilterWithValues(exclusive_values_);
+  Init(100000/*num_all_values*/, 10000/*num_included_values*/, 
10000/*num_excluded_values*/);
+  KuduBloomFilter* included_bf = CreateBloomFilterWithValues(included_values_);
+  KuduBloomFilter* excluded_bf = CreateBloomFilterWithValues(excluded_values_);
 
   InsertAllValuesInTable();
 
-  vector<KuduBloomFilter*> inclusive_bf_vec = { inclusive_bf };
-  auto* inclusive_predicate =
-      table_->NewInBloomFilterPredicate("value", &inclusive_bf_vec);
-  auto* inclusive_predicate_clone1 = inclusive_predicate->Clone();
-  auto* inclusive_predicate_clone2 = inclusive_predicate->Clone();
-
-  ASSERT_TRUE(inclusive_bf_vec.empty());
-  int actual_count_inclusive = CountRows(table_, { inclusive_predicate });
-  EXPECT_LE(inclusive_values_.size(), actual_count_inclusive);
-  EXPECT_GE(inclusive_values_.size() + kFalsePositives, 
actual_count_inclusive);
-
-  vector<KuduBloomFilter*> exclusive_bf_vec = { exclusive_bf };
-  auto* exclusive_predicate =
-      table_->NewInBloomFilterPredicate("value", &exclusive_bf_vec);
-  ASSERT_TRUE(exclusive_bf_vec.empty());
-  int actual_count_exclusive = CountRows(table_, { exclusive_predicate });
-  EXPECT_LE(0, actual_count_exclusive);
-  EXPECT_GE(kFalsePositives, actual_count_exclusive);
+  vector<KuduBloomFilter*> included_bf_vec = { included_bf };
+  auto* included_predicate =
+      table_->NewInBloomFilterPredicate("value", &included_bf_vec);
+  auto* included_predicate_clone1 = included_predicate->Clone();
+  auto* included_predicate_clone2 = included_predicate->Clone();
+
+  ASSERT_TRUE(included_bf_vec.empty());
+  int actual_count_included = CountRows(table_, { included_predicate });
+  EXPECT_LE(included_values_.size(), actual_count_included);
+  EXPECT_GE(included_values_.size() + num_false_positive_values_, 
actual_count_included);
+
+  vector<KuduBloomFilter*> excluded_bf_vec = { excluded_bf };
+  auto* excluded_predicate =
+      table_->NewInBloomFilterPredicate("value", &excluded_bf_vec);
+  ASSERT_TRUE(excluded_bf_vec.empty());
+  int actual_count_excluded = CountRows(table_, { excluded_predicate });
+  EXPECT_LE(0, actual_count_excluded);
+  EXPECT_GE(num_false_positive_values_, actual_count_excluded);
 
   // Combine Range predicate with Bloom filter predicate.
-  TestWithRangePredicate(inclusive_predicate_clone1, 
inclusive_predicate_clone2);
+  TestWithRangePredicate(included_predicate_clone1, included_predicate_clone2);
 }
 
 // Same as TestKuduBloomFilterPredicate above but using the overloaded 
NewInBloomFilterPredicate()
 // client API with direct BlockBloomFilter pointer.
 TEST_F(BloomFilterPredicateTest, TestDirectBlockBloomFilterPredicate) {
-  unique_ptr<BlockBloomFilter> inclusive_bf = 
CreateDirectBloomFilterWithValues(inclusive_values_);
-  unique_ptr<BlockBloomFilter> exclusive_bf = 
CreateDirectBloomFilterWithValues(exclusive_values_);
+  Init(100000/*num_all_values*/, 10000/*num_included_values*/, 
10000/*num_excluded_values*/);
+
+  unique_ptr<BlockBloomFilter> included_bf = 
CreateDirectBloomFilterWithValues(included_values_);
+  unique_ptr<BlockBloomFilter> excluded_bf = 
CreateDirectBloomFilterWithValues(excluded_values_);
 
   InsertAllValuesInTable();
 
   auto* allocator = DefaultBlockBloomFilterBufferAllocator::GetSingleton();
   Slice allocator_slice(reinterpret_cast<const uint8_t*>(allocator), 
sizeof(*allocator));
 
-  vector<Slice> inclusive_bf_vec =
-      { Slice(reinterpret_cast<const uint8_t*>(inclusive_bf.get()), 
sizeof(*inclusive_bf)) };
-  const size_t inclusive_bf_vec_size = inclusive_bf_vec.size();
+  vector<Slice> included_bf_vec =
+      { Slice(reinterpret_cast<const uint8_t*>(included_bf.get()), 
sizeof(*included_bf)) };
+  const size_t included_bf_vec_size = included_bf_vec.size();
 
-  auto* inclusive_predicate =
-      table_->NewInBloomFilterPredicate("value", allocator_slice, 
inclusive_bf_vec);
-  auto* inclusive_predicate_clone1 = inclusive_predicate->Clone();
-  auto* inclusive_predicate_clone2 = inclusive_predicate->Clone();
+  auto* included_predicate =
+      table_->NewInBloomFilterPredicate("value", allocator_slice, 
included_bf_vec);
+  auto* included_predicate_clone1 = included_predicate->Clone();
+  auto* included_predicate_clone2 = included_predicate->Clone();
 
-  ASSERT_EQ(inclusive_bf_vec_size, inclusive_bf_vec.size());
-  int actual_count_inclusive = CountRows(table_, { inclusive_predicate });
-  EXPECT_LE(inclusive_values_.size(), actual_count_inclusive);
-  EXPECT_GE(inclusive_values_.size() + kFalsePositives, 
actual_count_inclusive);
+  ASSERT_EQ(included_bf_vec_size, included_bf_vec.size());
+  int actual_count_included = CountRows(table_, { included_predicate });
+  EXPECT_LE(included_values_.size(), actual_count_included);
+  EXPECT_GE(included_values_.size() + num_false_positive_values_, 
actual_count_included);
 
-  vector<Slice> exclusive_bf_vec =
-      { Slice(reinterpret_cast<const uint8_t*>(exclusive_bf.get()), 
sizeof(*exclusive_bf)) };
-  const size_t exclusive_bf_vec_size = exclusive_bf_vec.size();
-  auto* exclusive_predicate =
-      table_->NewInBloomFilterPredicate("value", allocator_slice, 
exclusive_bf_vec);
-  ASSERT_EQ(exclusive_bf_vec_size, exclusive_bf_vec.size());
+  vector<Slice> excluded_bf_vec =
+      { Slice(reinterpret_cast<const uint8_t*>(excluded_bf.get()), 
sizeof(*excluded_bf)) };
+  const size_t excluded_bf_vec_size = excluded_bf_vec.size();
+  auto* excluded_predicate =
+      table_->NewInBloomFilterPredicate("value", allocator_slice, 
excluded_bf_vec);
+  ASSERT_EQ(excluded_bf_vec_size, excluded_bf_vec.size());
 
-  int actual_count_exclusive = CountRows(table_, { exclusive_predicate });
-  EXPECT_LE(0, actual_count_exclusive);
-  EXPECT_GE(kFalsePositives, actual_count_exclusive);
+  int actual_count_excluded = CountRows(table_, { excluded_predicate });
+  EXPECT_LE(0, actual_count_excluded);
+  EXPECT_GE(num_false_positive_values_, actual_count_excluded);
 
   // Combine Range predicate with Bloom filter predicate.
-  TestWithRangePredicate(inclusive_predicate_clone1, 
inclusive_predicate_clone2);
+  TestWithRangePredicate(included_predicate_clone1, included_predicate_clone2);
+}
+
+// Benchmark test that combines Bloom filter predicate with range predicate.
+TEST_F(BloomFilterPredicateTest, TestKuduBloomFilterPredicateBenchmark) {
+  SKIP_IF_SLOW_NOT_ALLOWED();
+
+  // Writing large number of rows sometimes leads to write timeouts. Hence the 
test below
+  // uses 1M rows instead.
+  Init(1000000/*num_all_values*/, 10000/*num_included_values*/, 
10000/*num_excluded_values*/);
+  KuduBloomFilter* included_bf = CreateBloomFilterWithValues(included_values_);
+
+  InsertAllValuesInTable();
+  vector<KuduBloomFilter*> included_bf_vec = { included_bf };
+  auto* included_predicate = table_->NewInBloomFilterPredicate("value", 
&included_bf_vec);
+  auto* included_predicate_clone = included_predicate->Clone();
+
+  TestWithRangePredicate(included_predicate, included_predicate_clone);
 }
 
 class ParameterizedPredicateTest : public PredicateTest,
diff --git a/src/kudu/common/column_predicate.h 
b/src/kudu/common/column_predicate.h
index e8233ec..6120095 100644
--- a/src/kudu/common/column_predicate.h
+++ b/src/kudu/common/column_predicate.h
@@ -319,6 +319,16 @@ class ColumnPredicate {
   // Evaluate the bloom filter and avoid the predicate type check on a single 
cell.
   template <DataType PhysicalType>
   bool EvaluateCellForBloomFilter(const void* cell) const {
+    // Check optional lower and upper bound.
+    // Range checks are cheaper compared to checking against Bloom filter and 
hence
+    // make these checks first.
+    if (lower_ != nullptr && DataTypeTraits<PhysicalType>::Compare(cell, 
this->lower_) < 0) {
+      return false;
+    }
+    if (upper_ != nullptr && DataTypeTraits<PhysicalType>::Compare(cell, 
this->upper_) >= 0) {
+      return false;
+    }
+
     typedef typename DataTypeTraits<PhysicalType>::cpp_type cpp_type;
     size_t size = sizeof(cpp_type);
     const void* data = cell;
@@ -333,17 +343,7 @@ class ColumnPredicate {
         return false;
       }
     }
-    // Check optional lower and upper bound.
-    if (lower_ != nullptr && upper_ != nullptr) {
-      return DataTypeTraits<PhysicalType>::Compare(cell, this->upper_) < 0 &&
-             DataTypeTraits<PhysicalType>::Compare(cell, this->lower_) >= 0;
-    }
-    if (upper_ != nullptr) {
-      return DataTypeTraits<PhysicalType>::Compare(cell, this->upper_) < 0;
-    }
-    if (lower_ != nullptr) {
-      return DataTypeTraits<PhysicalType>::Compare(cell, this->lower_) >= 0;
-    }
+
     return true;
   }

[kudu] branch master updated: [perf] Check range predicate first while evaluating Bloom filter predicate

Reply via email to