This is an automated email from the ASF dual-hosted git repository. todd pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git
commit e524334ff04253f38eda7ab3bc589ce76c0934ae Author: Todd Lipcon <[email protected]> AuthorDate: Wed Apr 1 15:11:58 2020 -0700 rowblock: use BMI instruction set when available for GetSelectedRows This enables a BMI variant of SelectionVector::GetSelectedRows which has a higher throughput. I disassembled the resulting hot loop as follows: BMI: L: tzcnt %rsi,%rbx or %r11d,%ebx mov %bx,(%rdx) blsr %rsi,%rsi tzcnt %rsi,%rbx or %r11d,%ebx mov %bx,0x2(%rdx) blsr %rsi,%rsi tzcnt %rsi,%rbx or %r11d,%ebx mov %bx,0x4(%rdx) add $0x6,%rdx blsr %rsi,%rsi add $0xfffffffd,%ecx jne L non-BMI: L: bsf %rsi,%rax or %r12d,%eax mov %ax,(%rdx) lea -0x1(%rsi),%rax and %rsi,%rax bsf %rax,%rsi or %r12d,%esi mov %si,0x2(%rdx) lea -0x1(%rax),%rbx and %rax,%rbx bsf %rbx,%rax or %r12d,%eax mov %ax,0x4(%rdx) add $0x6,%rdx lea -0x1(%rbx),%rsi and %rbx,%rsi add $0xfffffffd,%ecx jne L ... and then used llvm-mca on these assembly files across a few common architectures to see how many cycles were required for 100 iterations of the loop. Results are as follows: haswell non-bmi.s: Total Cycles: 606 haswell bmi.s: Total Cycles: 382 broadwell non-bmi.s: Total Cycles 606 broadwell bmi.s: Total Cycles: 382 skylake non-bmi.s: Total Cycles: 606 skylake bmi.s: Total Cycles: 307 So, on the most recent chips, this should be about a 2x improvement in this function. This function made up a few percent of overall CPU consumption in some TSBS workloads, so this patch had some small but measurable improvement on end-to-end throughput. Change-Id: I8ec74bc5db07c18d0e36de14a2343f49fc5c2859 Reviewed-on: http://gerrit.cloudera.org:8080/15635 Tested-by: Kudu Jenkins Reviewed-by: Alexey Serbin <[email protected]> --- src/kudu/common/rowblock.cc | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/src/kudu/common/rowblock.cc b/src/kudu/common/rowblock.cc index 33fb220..85bd4ca 100644 --- a/src/kudu/common/rowblock.cc +++ b/src/kudu/common/rowblock.cc @@ -23,6 +23,7 @@ #include <glog/logging.h> #include "kudu/gutil/bits.h" +#include "kudu/gutil/cpu.h" #include "kudu/gutil/port.h" #include "kudu/util/bitmap.h" @@ -76,8 +77,7 @@ void SelectionVector::ClearToSelectAtMost(size_t max_rows) { } -// TODO(todd) this is a bit faster when implemented with target "bmi" enabled. -// Consider duplicating it and doing runtime switching. +template<bool BMI> static void GetSelectedRowsInternal(const uint8_t* __restrict__ bitmap, int n_bytes, uint16_t* __restrict__ dst) { @@ -87,6 +87,18 @@ static void GetSelectedRowsInternal(const uint8_t* __restrict__ bitmap, }); } +static const bool kHasBmi = base::CPU().has_bmi(); + +#ifdef __x86_64__ +// Explicit instantiation with the BMI instruction set enabled, which +// makes this slightly faster. +template +__attribute__((target("bmi"))) +void GetSelectedRowsInternal<true>(const uint8_t* __restrict__ bitmap, + int n_bytes, + uint16_t* __restrict__ dst); +#endif + SelectedRows SelectionVector::GetSelectedRows() const { CHECK_LE(n_rows_, std::numeric_limits<uint16_t>::max()); int n_selected = CountSelected(); @@ -97,7 +109,11 @@ SelectedRows SelectionVector::GetSelectedRows() const { vector<uint16_t> selected; if (n_selected > 0) { selected.resize(n_selected); - GetSelectedRowsInternal(&bitmap_[0], n_bytes_, selected.data()); + if (kHasBmi) { + GetSelectedRowsInternal<true>(&bitmap_[0], n_bytes_, selected.data()); + } else { + GetSelectedRowsInternal<false>(&bitmap_[0], n_bytes_, selected.data()); + } } return SelectedRows(this, std::move(selected)); }
