Misc optimizations to BinaryPlainBlockDecoder Looking at a profile while running a YCSB read workload against a binary built with clang, I noticed that BinaryPlainDecoder had done a really poor job of inlining vector::push_back when building the offsets array.
This switches away from using a vector there and instead uses a simple buffer/pointer view into the buffer. I also rearranged a bit of other code and added PREDICT_FALSEs to try to get the code into a tighter loop. Change-Id: I5b5061818de36dc268cd5d4fc8553bceeca5dadd Reviewed-on: http://gerrit.cloudera.org:8080/6159 Tested-by: Kudu Jenkins Reviewed-by: David Ribeiro Alves <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/cbb07d23 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/cbb07d23 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/cbb07d23 Branch: refs/heads/master Commit: cbb07d23f18c055ff1ae659cfec19f734eeb0403 Parents: 92fe87f Author: Todd Lipcon <[email protected]> Authored: Sun Feb 26 18:51:09 2017 -0800 Committer: Todd Lipcon <[email protected]> Committed: Tue Feb 28 21:24:41 2017 +0000 ---------------------------------------------------------------------- src/kudu/cfile/binary_plain_block.cc | 42 ++++++++++++++++--------------- src/kudu/cfile/binary_plain_block.h | 25 ++++++++++++++---- 2 files changed, 42 insertions(+), 25 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/cbb07d23/src/kudu/cfile/binary_plain_block.cc ---------------------------------------------------------------------- diff --git a/src/kudu/cfile/binary_plain_block.cc b/src/kudu/cfile/binary_plain_block.cc index a9eea75..430edcb 100644 --- a/src/kudu/cfile/binary_plain_block.cc +++ b/src/kudu/cfile/binary_plain_block.cc @@ -183,48 +183,50 @@ Status BinaryPlainBlockDecoder::ParseHeader() { const uint8_t *p = data_.data() + offsets_pos; const uint8_t *limit = data_.data() + data_.size(); - offsets_.clear(); // Reserve one extra element, which we'll fill in at the end // with an offset past the last element. - offsets_.reserve(num_elems_ + 1); - + offsets_buf_.resize(sizeof(uint32_t) * (num_elems_ + 1)); + uint32_t* dst_ptr = reinterpret_cast<uint32_t*>(offsets_buf_.data()); size_t rem = num_elems_; while (rem >= 4) { - uint32_t ints[4]; - if (p + 16 < limit) { - p = coding::DecodeGroupVarInt32_SSE(p, &ints[0], &ints[1], &ints[2], &ints[3]); + if (PREDICT_TRUE(p + 16 < limit)) { + p = coding::DecodeGroupVarInt32_SSE( + p, &dst_ptr[0], &dst_ptr[1], &dst_ptr[2], &dst_ptr[3]); + + // The above function should add at most 17 (4 32-bit ints plus a selector byte) to + // 'p'. Thus, since we checked that (p + 16 < limit) above, we are guaranteed that + // (p <= limit) now. + DCHECK_LE(p, limit); } else { - p = coding::DecodeGroupVarInt32_SlowButSafe(p, &ints[0], &ints[1], &ints[2], &ints[3]); - } - if (p > limit) { - LOG(WARNING) << "bad block: " << HexDump(data_); - return Status::Corruption( - StringPrintf("unable to decode offsets in block")); + p = coding::DecodeGroupVarInt32_SlowButSafe( + p, &dst_ptr[0], &dst_ptr[1], &dst_ptr[2], &dst_ptr[3]); + if (PREDICT_FALSE(p > limit)) { + // Only need to check 'p' overrun in the slow path, because 'p' may have + // been within 16 bytes of 'limit'. + LOG(WARNING) << "bad block: " << HexDump(data_); + return Status::Corruption(StringPrintf("unable to decode offsets in block")); + } } - - offsets_.push_back(ints[0]); - offsets_.push_back(ints[1]); - offsets_.push_back(ints[2]); - offsets_.push_back(ints[3]); + dst_ptr += 4; rem -= 4; } if (rem > 0) { uint32_t ints[4]; p = coding::DecodeGroupVarInt32_SlowButSafe(p, &ints[0], &ints[1], &ints[2], &ints[3]); - if (p > limit) { + if (PREDICT_FALSE(p > limit)) { LOG(WARNING) << "bad block: " << HexDump(data_); return Status::Corruption( StringPrintf("unable to decode offsets in block")); } for (int i = 0; i < rem; i++) { - offsets_.push_back(ints[i]); + *dst_ptr++ = ints[i]; } } // Add one extra entry pointing after the last item to make the indexing easier. - offsets_.push_back(offsets_pos); + *dst_ptr++ = offsets_pos; parsed_ = true; http://git-wip-us.apache.org/repos/asf/kudu/blob/cbb07d23/src/kudu/cfile/binary_plain_block.h ---------------------------------------------------------------------- diff --git a/src/kudu/cfile/binary_plain_block.h b/src/kudu/cfile/binary_plain_block.h index 8c9cd49..2edba9d 100644 --- a/src/kudu/cfile/binary_plain_block.h +++ b/src/kudu/cfile/binary_plain_block.h @@ -122,9 +122,9 @@ class BinaryPlainBlockDecoder final : public BlockDecoder { } Slice string_at_index(size_t idx) const { - const uint32_t offset = offsets_[idx]; - uint32_t len = offsets_[idx + 1] - offset; - return Slice(&data_[offset], len); + const uint32_t str_offset = offset(idx); + uint32_t len = offset(idx + 1) - str_offset; + return Slice(&data_[str_offset], len); } // Minimum length of a header. @@ -139,13 +139,28 @@ class BinaryPlainBlockDecoder final : public BlockDecoder { template <typename CellHandler> Status HandleBatch(size_t* n, ColumnDataView* dst, CellHandler c); + // Return the offset within 'data_' where the string value with index 'idx' + // can be found. + uint32_t offset(int idx) const { + const uint8_t* p = &offsets_buf_[idx * sizeof(uint32_t)]; + uint32_t ret; + memcpy(&ret, p, sizeof(uint32_t)); + return ret; + } + Slice data_; bool parsed_; - // The parsed offsets. + // A buffer for an array of 32-bit integers for the offsets of the underlying + // strings in 'data_'. + // // This array also contains one extra offset at the end, pointing // _after_ the last entry. This makes the code much simpler. - std::vector<uint32_t> offsets_; + // + // The array is stored inside a 'faststring' instead of a vector<uint32_t> to + // avoid the overhead of calling vector::push_back -- one would think it would + // be fully inlined away, but it's actually a perf win to do this. + faststring offsets_buf_; uint32_t num_elems_; rowid_t ordinal_pos_base_;
