Misc optimizations to BinaryPlainBlockDecoder

Looking at a profile while running a YCSB read workload against a binary
built with clang, I noticed that BinaryPlainDecoder had done a really
poor job of inlining vector::push_back when building the offsets array.

This switches away from using a vector there and instead uses a simple
buffer/pointer view into the buffer. I also rearranged a bit of other
code and added PREDICT_FALSEs to try to get the code into a tighter
loop.

Change-Id: I5b5061818de36dc268cd5d4fc8553bceeca5dadd
Reviewed-on: http://gerrit.cloudera.org:8080/6159
Tested-by: Kudu Jenkins
Reviewed-by: David Ribeiro Alves <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/cbb07d23
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/cbb07d23
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/cbb07d23

Branch: refs/heads/master
Commit: cbb07d23f18c055ff1ae659cfec19f734eeb0403
Parents: 92fe87f
Author: Todd Lipcon <[email protected]>
Authored: Sun Feb 26 18:51:09 2017 -0800
Committer: Todd Lipcon <[email protected]>
Committed: Tue Feb 28 21:24:41 2017 +0000

----------------------------------------------------------------------
 src/kudu/cfile/binary_plain_block.cc | 42 ++++++++++++++++---------------
 src/kudu/cfile/binary_plain_block.h  | 25 ++++++++++++++----
 2 files changed, 42 insertions(+), 25 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/cbb07d23/src/kudu/cfile/binary_plain_block.cc
----------------------------------------------------------------------
diff --git a/src/kudu/cfile/binary_plain_block.cc 
b/src/kudu/cfile/binary_plain_block.cc
index a9eea75..430edcb 100644
--- a/src/kudu/cfile/binary_plain_block.cc
+++ b/src/kudu/cfile/binary_plain_block.cc
@@ -183,48 +183,50 @@ Status BinaryPlainBlockDecoder::ParseHeader() {
   const uint8_t *p = data_.data() + offsets_pos;
   const uint8_t *limit = data_.data() + data_.size();
 
-  offsets_.clear();
   // Reserve one extra element, which we'll fill in at the end
   // with an offset past the last element.
-  offsets_.reserve(num_elems_ + 1);
-
+  offsets_buf_.resize(sizeof(uint32_t) * (num_elems_ + 1));
+  uint32_t* dst_ptr = reinterpret_cast<uint32_t*>(offsets_buf_.data());
   size_t rem = num_elems_;
   while (rem >= 4) {
-    uint32_t ints[4];
-    if (p + 16 < limit) {
-      p = coding::DecodeGroupVarInt32_SSE(p, &ints[0], &ints[1], &ints[2], 
&ints[3]);
+    if (PREDICT_TRUE(p + 16 < limit)) {
+      p = coding::DecodeGroupVarInt32_SSE(
+          p, &dst_ptr[0], &dst_ptr[1], &dst_ptr[2], &dst_ptr[3]);
+
+      // The above function should add at most 17 (4 32-bit ints plus a 
selector byte) to
+      // 'p'. Thus, since we checked that (p + 16 < limit) above, we are 
guaranteed that
+      // (p <= limit) now.
+      DCHECK_LE(p, limit);
     } else {
-      p = coding::DecodeGroupVarInt32_SlowButSafe(p, &ints[0], &ints[1], 
&ints[2], &ints[3]);
-    }
-    if (p > limit) {
-      LOG(WARNING) << "bad block: " << HexDump(data_);
-      return Status::Corruption(
-        StringPrintf("unable to decode offsets in block"));
+      p = coding::DecodeGroupVarInt32_SlowButSafe(
+          p, &dst_ptr[0], &dst_ptr[1], &dst_ptr[2], &dst_ptr[3]);
+      if (PREDICT_FALSE(p > limit)) {
+        // Only need to check 'p' overrun in the slow path, because 'p' may 
have
+        // been within 16 bytes of 'limit'.
+        LOG(WARNING) << "bad block: " << HexDump(data_);
+        return Status::Corruption(StringPrintf("unable to decode offsets in 
block"));
+      }
     }
-
-    offsets_.push_back(ints[0]);
-    offsets_.push_back(ints[1]);
-    offsets_.push_back(ints[2]);
-    offsets_.push_back(ints[3]);
+    dst_ptr += 4;
     rem -= 4;
   }
 
   if (rem > 0) {
     uint32_t ints[4];
     p = coding::DecodeGroupVarInt32_SlowButSafe(p, &ints[0], &ints[1], 
&ints[2], &ints[3]);
-    if (p > limit) {
+    if (PREDICT_FALSE(p > limit)) {
       LOG(WARNING) << "bad block: " << HexDump(data_);
       return Status::Corruption(
         StringPrintf("unable to decode offsets in block"));
     }
 
     for (int i = 0; i < rem; i++) {
-      offsets_.push_back(ints[i]);
+      *dst_ptr++ = ints[i];
     }
   }
 
   // Add one extra entry pointing after the last item to make the indexing 
easier.
-  offsets_.push_back(offsets_pos);
+  *dst_ptr++ = offsets_pos;
 
   parsed_ = true;
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/cbb07d23/src/kudu/cfile/binary_plain_block.h
----------------------------------------------------------------------
diff --git a/src/kudu/cfile/binary_plain_block.h 
b/src/kudu/cfile/binary_plain_block.h
index 8c9cd49..2edba9d 100644
--- a/src/kudu/cfile/binary_plain_block.h
+++ b/src/kudu/cfile/binary_plain_block.h
@@ -122,9 +122,9 @@ class BinaryPlainBlockDecoder final : public BlockDecoder {
   }
 
   Slice string_at_index(size_t idx) const {
-    const uint32_t offset = offsets_[idx];
-    uint32_t len = offsets_[idx + 1] - offset;
-    return Slice(&data_[offset], len);
+    const uint32_t str_offset = offset(idx);
+    uint32_t len = offset(idx + 1) - str_offset;
+    return Slice(&data_[str_offset], len);
   }
 
   // Minimum length of a header.
@@ -139,13 +139,28 @@ class BinaryPlainBlockDecoder final : public BlockDecoder 
{
   template <typename CellHandler>
   Status HandleBatch(size_t* n, ColumnDataView* dst, CellHandler c);
 
+  // Return the offset within 'data_' where the string value with index 'idx'
+  // can be found.
+  uint32_t offset(int idx) const {
+    const uint8_t* p = &offsets_buf_[idx * sizeof(uint32_t)];
+    uint32_t ret;
+    memcpy(&ret, p, sizeof(uint32_t));
+    return ret;
+  }
+
   Slice data_;
   bool parsed_;
 
-  // The parsed offsets.
+  // A buffer for an array of 32-bit integers for the offsets of the underlying
+  // strings in 'data_'.
+  //
   // This array also contains one extra offset at the end, pointing
   // _after_ the last entry. This makes the code much simpler.
-  std::vector<uint32_t> offsets_;
+  //
+  // The array is stored inside a 'faststring' instead of a vector<uint32_t> to
+  // avoid the overhead of calling vector::push_back -- one would think it 
would
+  // be fully inlined away, but it's actually a perf win to do this.
+  faststring offsets_buf_;
 
   uint32_t num_elems_;
   rowid_t ordinal_pos_base_;

Reply via email to