This is an automated email from the ASF dual-hosted git repository.
granthenke pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 349aeaa KUDU-2846: optimize predicate evaluation for primitives
349aeaa is described below
commit 349aeaab33d33ba1ed323a6a4ff1bd6eee971d85
Author: Todd Lipcon <[email protected]>
AuthorDate: Tue Jun 11 16:02:37 2019 -0700
KUDU-2846: optimize predicate evaluation for primitives
This changes to an optimized unrolled-by-8 predicate evaluation for
primitive columns.
Performance is improved by up to 7.2x depending on the particular
predicate, type, and nullability (average around 4.8x). Branches are
reduced by about 6.5x and branch-misses by about 22x.
It's possible that hand-coded SIMD could improve on this a little bit
but likely not worth the effort.
perf-stat before:
Performance counter stats for 'build/latest/bin/column_predicate-test
--gtest_filter=*Bench*':
73905.379627 task-clock (msec) # 0.997 CPUs utilized
272,810,081,028 cycles # 3.691 GHz
938,488,388,743 instructions # 3.44 insn per cycle
148,052,698,322 branches # 2003.274 M/sec
882,311,138 branch-misses # 0.60% of all branches
perf-stat after:
Performance counter stats for 'build/latest/bin/column_predicate-test
--gtest_filter=*Bench*':
15354.077654 task-clock (msec) # 0.992 CPUs utilized
56,850,629,856 cycles # 3.703 GHz
181,599,095,960 instructions # 3.19 insn per cycle
22,496,453,160 branches # 1465.178 M/sec
38,662,626 branch-misses # 0.17% of all branches
Detailed results before:
int8 NOT NULL (c = 0) 632.1M evals/sec 4.44 cycles/eval
int8 NULL (c = 0) 515.6M evals/sec 5.48 cycles/eval
int8 NOT NULL (c >= 0) 630.8M evals/sec 4.45 cycles/eval
int8 NULL (c >= 0) 426.8M evals/sec 6.64 cycles/eval
int8 NOT NULL (c >= 0 AND c < 2) 632.6M evals/sec 4.44 cycles/eval
int8 NULL (c >= 0 AND c < 2) 384.7M evals/sec 7.38 cycles/eval
int16 NOT NULL (c = 0) 644.4M evals/sec 4.34 cycles/eval
int16 NULL (c = 0) 524.6M evals/sec 5.37 cycles/eval
int16 NOT NULL (c >= 0) 638.4M evals/sec 4.37 cycles/eval
int16 NULL (c >= 0) 458.8M evals/sec 6.17 cycles/eval
int16 NOT NULL (c >= 0 AND c < 2) 635.3M evals/sec 4.40 cycles/eval
int16 NULL (c >= 0 AND c < 2) 335.1M evals/sec 8.50 cycles/eval
int32 NOT NULL (c = 0) 645.2M evals/sec 4.34 cycles/eval
int32 NULL (c = 0) 492.6M evals/sec 5.77 cycles/eval
int32 NOT NULL (c >= 0) 608.6M evals/sec 4.64 cycles/eval
int32 NULL (c >= 0) 440.7M evals/sec 6.48 cycles/eval
int32 NOT NULL (c >= 0 AND c < 2) 637.8M evals/sec 4.43 cycles/eval
int32 NULL (c >= 0 AND c < 2) 348.0M evals/sec 8.22 cycles/eval
int64 NOT NULL (c = 0) 642.7M evals/sec 4.36 cycles/eval
int64 NULL (c = 0) 505.3M evals/sec 5.60 cycles/eval
int64 NOT NULL (c >= 0) 643.5M evals/sec 4.34 cycles/eval
int64 NULL (c >= 0) 472.8M evals/sec 6.00 cycles/eval
int64 NOT NULL (c >= 0 AND c < 2) 634.2M evals/sec 4.43 cycles/eval
int64 NULL (c >= 0 AND c < 2) 396.7M evals/sec 7.21 cycles/eval
float NOT NULL (c = 0) 604.6M evals/sec 4.63 cycles/eval
float NULL (c = 0) 406.7M evals/sec 7.05 cycles/eval
float NOT NULL (c >= 0) 545.3M evals/sec 5.20 cycles/eval
float NULL (c >= 0) 384.4M evals/sec 7.39 cycles/eval
float NOT NULL (c >= 0 AND c < 2) 583.2M evals/sec 4.80 cycles/eval
float NULL (c >= 0 AND c < 2) 312.2M evals/sec 9.12 cycles/eval
double NOT NULL (c = 0) 614.0M evals/sec 4.56 cycles/eval
double NULL (c = 0) 471.5M evals/sec 5.99 cycles/eval
double NOT NULL (c >= 0) 623.0M evals/sec 4.48 cycles/eval
double NULL (c >= 0) 379.9M evals/sec 7.47 cycles/eval
double NOT NULL (c >= 0 AND c < 2) 599.5M evals/sec 4.67 cycles/eval
double NULL (c >= 0 AND c < 2) 415.2M evals/sec 6.82 cycles/eval
Detailed results after:
int8 NOT NULL (c = 0) 3660.3M evals/sec 0.76 cycles/eval
int8 NULL (c = 0) 3657.1M evals/sec 0.76 cycles/eval
int8 NOT NULL (c >= 0) 3712.0M evals/sec 0.75 cycles/eval
int8 NULL (c >= 0) 3618.9M evals/sec 0.78 cycles/eval
int8 NOT NULL (c >= 0 AND c < 2) 1661.9M evals/sec 1.73 cycles/eval
int8 NULL (c >= 0 AND c < 2) 1663.4M evals/sec 1.77 cycles/eval
int16 NOT NULL (c = 0) 3781.4M evals/sec 0.73 cycles/eval
int16 NULL (c = 0) 3738.3M evals/sec 0.74 cycles/eval
int16 NOT NULL (c >= 0) 3672.9M evals/sec 0.76 cycles/eval
int16 NULL (c >= 0) 3767.4M evals/sec 0.75 cycles/eval
int16 NOT NULL (c >= 0 AND c < 2) 1654.3M evals/sec 1.77 cycles/eval
int16 NULL (c >= 0 AND c < 2) 1651.6M evals/sec 1.72 cycles/eval
int32 NOT NULL (c = 0) 2925.1M evals/sec 0.97 cycles/eval
int32 NULL (c = 0) 2844.4M evals/sec 0.97 cycles/eval
int32 NOT NULL (c >= 0) 2942.7M evals/sec 0.95 cycles/eval
int32 NULL (c >= 0) 2900.8M evals/sec 0.98 cycles/eval
int32 NOT NULL (c >= 0 AND c < 2) 1641.1M evals/sec 1.73 cycles/eval
int32 NULL (c >= 0 AND c < 2) 1638.8M evals/sec 1.75 cycles/eval
int64 NOT NULL (c = 0) 3878.6M evals/sec 0.71 cycles/eval
int64 NULL (c = 0) 3763.9M evals/sec 0.76 cycles/eval
int64 NOT NULL (c >= 0) 2784.4M evals/sec 1.01 cycles/eval
int64 NULL (c >= 0) 2782.6M evals/sec 1.01 cycles/eval
int64 NOT NULL (c >= 0 AND c < 2) 1671.4M evals/sec 1.71 cycles/eval
int64 NULL (c >= 0 AND c < 2) 1741.5M evals/sec 1.64 cycles/eval
float NOT NULL (c = 0) 3940.8M evals/sec 0.72 cycles/eval
float NULL (c = 0) 3820.9M evals/sec 0.72 cycles/eval
float NOT NULL (c >= 0) 4571.4M evals/sec 0.60 cycles/eval
float NULL (c >= 0) 4741.3M evals/sec 0.58 cycles/eval
float NOT NULL (c >= 0 AND c < 2) 1318.0M evals/sec 2.18 cycles/eval
float NULL (c >= 0 AND c < 2) 1262.3M evals/sec 2.28 cycles/eval
double NOT NULL (c = 0) 2813.4M evals/sec 1.01 cycles/eval
double NULL (c = 0) 2664.6M evals/sec 1.06 cycles/eval
double NOT NULL (c >= 0) 3620.8M evals/sec 0.77 cycles/eval
double NULL (c >= 0) 3657.2M evals/sec 0.76 cycles/eval
double NOT NULL (c >= 0 AND c < 2) 1248.8M evals/sec 2.30 cycles/eval
double NULL (c >= 0 AND c < 2) 1253.7M evals/sec 2.28 cycles/eval
Change-Id: I9dd062961a3cd2c892997d6aba12684e603628a1
Reviewed-on: http://gerrit.cloudera.org:8080/13591
Tested-by: Kudu Jenkins
Reviewed-by: Andrew Wong <[email protected]>
---
src/kudu/common/CMakeLists.txt | 2 +-
src/kudu/common/column_predicate-test.cc | 83 ++++++++++++++++++++++++++++++++
src/kudu/common/column_predicate.cc | 73 +++++++++++++++++++++++-----
3 files changed, 146 insertions(+), 12 deletions(-)
diff --git a/src/kudu/common/CMakeLists.txt b/src/kudu/common/CMakeLists.txt
index eb6a783..fa5c105 100644
--- a/src/kudu/common/CMakeLists.txt
+++ b/src/kudu/common/CMakeLists.txt
@@ -81,7 +81,7 @@ ADD_EXPORTABLE_LIBRARY(kudu_common
SET_KUDU_TEST_LINK_LIBS(kudu_common)
ADD_KUDU_TEST(columnblock-test)
-ADD_KUDU_TEST(column_predicate-test)
+ADD_KUDU_TEST(column_predicate-test NUM_SHARDS 4)
ADD_KUDU_TEST(encoded_key-test)
ADD_KUDU_TEST(generic_iterators-test)
ADD_KUDU_TEST(id_mapping-test)
diff --git a/src/kudu/common/column_predicate-test.cc
b/src/kudu/common/column_predicate-test.cc
index e40e1a4..946915a 100644
--- a/src/kudu/common/column_predicate-test.cc
+++ b/src/kudu/common/column_predicate-test.cc
@@ -19,23 +19,31 @@
#include <cmath>
#include <cstdint>
+#include <cstdlib>
+#include <functional>
#include <string>
#include <vector>
#include <boost/optional/optional.hpp>
#include <gflags/gflags.h>
+#include <glog/logging.h>
#include <gtest/gtest.h>
+#include "kudu/common/columnblock.h"
#include "kudu/common/common.pb.h"
+#include "kudu/common/rowblock.h"
#include "kudu/common/schema.h"
#include "kudu/common/types.h"
+#include "kudu/gutil/stringprintf.h"
#include "kudu/gutil/strings/substitute.h"
+#include "kudu/gutil/walltime.h"
#include "kudu/util/bloom_filter.h"
#include "kudu/util/hash.pb.h"
#include "kudu/util/int128.h"
#include "kudu/util/memory/arena.h"
#include "kudu/util/random.h"
#include "kudu/util/slice.h"
+#include "kudu/util/stopwatch.h"
#include "kudu/util/test_util.h"
using std::vector;
@@ -1496,4 +1504,79 @@ TEST_F(TestColumnPredicateDeathTest,
TestMergeRequiresNameAndType) {
}, "COMPARE_NAME_AND_TYPE");
}
+template<typename TypeParam>
+class ColumnPredicateBenchmark : public KuduTest {
+ protected:
+ static constexpr auto kColType = TypeParam::physical_type;
+ static constexpr int kNumRows = 1024;
+
+ ColumnPredicateBenchmark() {
+ }
+
+ void DoTest(const std::function<ColumnPredicate(const ColumnSchema&)>&
pred_factory) {
+ const int num_iters = AllowSlowTests() ? 1000000 : 100;
+ const int num_evals = num_iters * kNumRows;
+
+ for (bool nullable : {false, true}) {
+ ColumnSchema cs("c", kColType, nullable);
+ auto pred = pred_factory(cs);
+ ScopedColumnBlock<kColType> b(kNumRows);
+ for (int i = 0; i < kNumRows; i++) {
+ b[i] = rand() % 3;
+ if (nullable) {
+ b.SetCellIsNull(i, rand() % 10 == 1);
+ }
+ }
+
+ SelectionVector selvec(kNumRows);
+ int64_t tot_cycles = 0;
+ Stopwatch sw;
+ sw.start();
+ for (int i = 0; i < num_iters; i++) {
+ selvec.SetAllTrue();
+ int64_t cycles_start = CycleClock::Now();
+ pred.Evaluate(b, &selvec);
+ tot_cycles += CycleClock::Now() - cycles_start;
+ }
+ sw.stop();
+ LOG(INFO) << StringPrintf(
+ "%-6s %-10s (%s) %.1fM evals/sec\t%.2f cycles/eval",
+ TypeParam::name(), nullable ? "NULL" : "NOT NULL",
+ pred.ToString().c_str(),
+ num_evals / sw.elapsed().user_cpu_seconds() / 1000000,
+ static_cast<double>(tot_cycles) / num_evals);
+ }
+ }
+};
+
+using test_types = ::testing::Types<
+ DataTypeTraits<INT8>,
+ DataTypeTraits<INT16>,
+ DataTypeTraits<INT32>,
+ DataTypeTraits<INT64>,
+ DataTypeTraits<FLOAT>,
+ DataTypeTraits<DOUBLE>>;
+
+TYPED_TEST_CASE(ColumnPredicateBenchmark, test_types);
+
+TYPED_TEST(ColumnPredicateBenchmark, TestEquals) {
+ const typename TypeParam::cpp_type ref_val = 0;
+ ColumnPredicateBenchmark<TypeParam>::DoTest(
+ [&](const ColumnSchema& cs) { return ColumnPredicate::Equality(cs,
&ref_val); });
+}
+
+TYPED_TEST(ColumnPredicateBenchmark, TestLessThan) {
+ const typename TypeParam::cpp_type upper = 0;
+ ColumnPredicateBenchmark<TypeParam>::DoTest(
+ [&](const ColumnSchema& cs) { return ColumnPredicate::Range(cs, nullptr,
&upper); });
+}
+
+TYPED_TEST(ColumnPredicateBenchmark, TestRange) {
+ const typename TypeParam::cpp_type lower = 0;
+ const typename TypeParam::cpp_type upper = 2;
+ ColumnPredicateBenchmark<TypeParam>::DoTest(
+ [&](const ColumnSchema& cs) { return ColumnPredicate::Range(cs, &lower,
&upper); });
+}
+
+
} // namespace kudu
diff --git a/src/kudu/common/column_predicate.cc
b/src/kudu/common/column_predicate.cc
index 5f9e7be..3b665f1 100644
--- a/src/kudu/common/column_predicate.cc
+++ b/src/kudu/common/column_predicate.cc
@@ -20,6 +20,7 @@
#include <algorithm>
#include <cstring>
#include <iterator>
+#include <type_traits>
#include <boost/optional/optional.hpp>
@@ -29,6 +30,7 @@
#include "kudu/common/schema.h"
#include "kudu/common/types.h"
#include "kudu/gutil/macros.h"
+#include "kudu/gutil/port.h"
#include "kudu/gutil/strings/join.h"
#include "kudu/gutil/strings/substitute.h"
#include "kudu/util/bitmap.h"
@@ -646,20 +648,69 @@ void ColumnPredicate::MergeIntoBloomFilter(const
ColumnPredicate &other) {
}
namespace {
-template <typename P>
+
+// Optimized predicate evaluation for primitive types.
+//
+// For primitives, it's safe to evaluate a predicate even if the cell is
+// null or deselected -- it might be junk data, which means we'd get
+// a junk comparison result, but that's OK, because we can just bitwise-AND
+// the result against the null bitmap and the preexisting selection vector.
+//
+// This ends up removing most of the branches from the inner loop of
comparisons
+// and enables compilers to do SIMD optimizations.
+//
+// This technique can't safely be applied to cells like BINARY since these
+// consist of pointers, and following a junk pointer might crash the process.
+//
+// Returns the number of elements of 'cb' that were processed. This function
+// only processes multiples of 8, so if cb.nrows() is not a multiple of 8, the
+// last few elements may need to be processed by the caller.
+template <DataType PhysicalType, typename P>
+ATTRIBUTE_NOINLINE
+int ApplyPredicatePrimitive(const ColumnBlock& block, uint8_t* __restrict__
sel_bitmap, P p) {
+ using cpp_type = typename DataTypeTraits<PhysicalType>::cpp_type;
+ const cpp_type* data = reinterpret_cast<const cpp_type*>(block.data());
+ const int n_chunks = block.nrows() / 8;
+ for (int i = 0; i < n_chunks; i++) {
+ uint8_t res_8 = 0;;
+ for (int j = 0; j < 8; j++) {
+ res_8 |= p(data++) << j;
+ }
+ sel_bitmap[i] &= res_8;
+ }
+ if (block.is_nullable()) {
+ for (int i = 0; i < n_chunks; i++) {
+ sel_bitmap[i] &= block.null_bitmap()[i];
+ }
+ }
+ return n_chunks * 8;
+}
+
+
+template <DataType PhysicalType, typename P>
void ApplyPredicate(const ColumnBlock& block, SelectionVector* sel, P p) {
+ using cpp_type = typename DataTypeTraits<PhysicalType>::cpp_type;
+ int start_idx = 0;
+ if (std::is_fundamental<cpp_type>::value) {
+ start_idx = ApplyPredicatePrimitive<PhysicalType>(block,
sel->mutable_bitmap(), p);
+ if (PREDICT_TRUE(start_idx == block.nrows())) return;
+ // If we couldn't process the whole block unrolled by 8, fall through to
the
+ // remainder.
+ }
+
+ const cpp_type* data = reinterpret_cast<const cpp_type*>(block.data());
if (block.is_nullable()) {
- for (size_t i = 0; i < block.nrows(); i++) {
+ for (size_t i = start_idx; i < block.nrows(); i++) {
if (!sel->IsRowSelected(i)) continue;
- const void* cell = block.nullable_cell_ptr(i);
+ const cpp_type* cell = block.is_null(i) ? nullptr : &data[i];
if (cell == nullptr || !p(cell)) {
BitmapClear(sel->mutable_bitmap(), i);
}
}
} else {
- for (size_t i = 0; i < block.nrows(); i++) {
+ for (size_t i = start_idx; i < block.nrows(); i++) {
if (!sel->IsRowSelected(i)) continue;
- const void* cell = block.cell_ptr(i);
+ const cpp_type* cell = &data[i];
if (!p(cell)) {
BitmapClear(sel->mutable_bitmap(), i);
}
@@ -674,15 +725,15 @@ void ColumnPredicate::EvaluateForPhysicalType(const
ColumnBlock& block,
switch (predicate_type()) {
case PredicateType::Range: {
if (lower_ == nullptr) {
- ApplyPredicate(block, sel, [this] (const void* cell) {
+ ApplyPredicate<PhysicalType>(block, sel, [this] (const void* cell) {
return DataTypeTraits<PhysicalType>::Compare(cell, this->upper_) < 0;
});
} else if (upper_ == nullptr) {
- ApplyPredicate(block, sel, [this] (const void* cell) {
+ ApplyPredicate<PhysicalType>(block, sel, [this] (const void* cell) {
return DataTypeTraits<PhysicalType>::Compare(cell, this->lower_) >=
0;
});
} else {
- ApplyPredicate(block, sel, [this] (const void* cell) {
+ ApplyPredicate<PhysicalType>(block, sel, [this] (const void* cell) {
return DataTypeTraits<PhysicalType>::Compare(cell, this->upper_) < 0
&&
DataTypeTraits<PhysicalType>::Compare(cell, this->lower_) >=
0;
});
@@ -690,7 +741,7 @@ void ColumnPredicate::EvaluateForPhysicalType(const
ColumnBlock& block,
return;
};
case PredicateType::Equality: {
- ApplyPredicate(block, sel, [this] (const void* cell) {
+ ApplyPredicate<PhysicalType>(block, sel, [this] (const void* cell) {
return DataTypeTraits<PhysicalType>::Compare(cell, this->lower_) == 0;
});
return;
@@ -721,7 +772,7 @@ void ColumnPredicate::EvaluateForPhysicalType(const
ColumnBlock& block,
return;
}
case PredicateType::InList: {
- ApplyPredicate(block, sel, [this] (const void* cell) {
+ ApplyPredicate<PhysicalType>(block, sel, [this] (const void* cell) {
return std::binary_search(values_.begin(), values_.end(), cell,
[] (const void* lhs, const void* rhs) {
return
DataTypeTraits<PhysicalType>::Compare(lhs, rhs) < 0;
@@ -731,7 +782,7 @@ void ColumnPredicate::EvaluateForPhysicalType(const
ColumnBlock& block,
};
case PredicateType::None: LOG(FATAL) << "NONE predicate evaluation";
case PredicateType::InBloomFilter: {
- ApplyPredicate(block, sel, [this] (const void* cell) {
+ ApplyPredicate<PhysicalType>(block, sel, [this] (const void* cell) {
return EvaluateCell<PhysicalType>(cell);
});
return;