This is an automated email from the ASF dual-hosted git repository.

wangbo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 6651d3b  SIMD instruction speed up the storage layer (#6089)
6651d3b is described below

commit 6651d3bf2a977692296c310e9a8aa399c07ab5e3
Author: HappenLee <[email protected]>
AuthorDate: Thu Jun 24 22:04:32 2021 -0500

    SIMD instruction speed up the storage layer (#6089)
    
    * SIMD instruction speed up the storage layer
    
    * 1. add DCHECK in power of 2 int32
    2. change vector to array to reduce the cost
---
 be/src/olap/rowset/segment_v2/binary_dict_page.cpp | 41 +++++++++++++++-------
 be/src/olap/rowset/segment_v2/binary_plain_page.h  | 34 +++++++++++++-----
 be/src/runtime/mem_pool.h                          |  2 +-
 be/src/util/bit_util.h                             |  6 ++++
 4 files changed, 61 insertions(+), 22 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp 
b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
index c5aec41..a65cdf2 100644
--- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
+++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
@@ -20,6 +20,7 @@
 #include "common/logging.h"
 #include "gutil/strings/substitute.h" // for Substitute
 #include "olap/rowset/segment_v2/bitshuffle_page.h"
+#include "runtime/mem_pool.h"
 #include "util/slice.h" // for Slice
 
 namespace doris {
@@ -238,8 +239,8 @@ Status BinaryDictPageDecoder::next_batch(size_t* n, 
ColumnBlockView* dst) {
     // dictionary encoding
     DCHECK(_parsed);
     DCHECK(_dict_decoder != nullptr) << "dict decoder pointer is nullptr";
+
     if (PREDICT_FALSE(*n == 0)) {
-        *n = 0;
         return Status::OK();
     }
     Slice* out = reinterpret_cast<Slice*>(dst->data());
@@ -248,21 +249,37 @@ Status BinaryDictPageDecoder::next_batch(size_t* n, 
ColumnBlockView* dst) {
     ColumnBlock column_block(_batch.get(), dst->column_block()->pool());
     ColumnBlockView tmp_block_view(&column_block);
     RETURN_IF_ERROR(_data_page_decoder->next_batch(n, &tmp_block_view));
-    for (int i = 0; i < *n; ++i) {
+    const auto len = *n;
+
+    size_t mem_len[len];
+    for (int i = 0; i < len; ++i) {
         int32_t codeword = *reinterpret_cast<const 
int32_t*>(column_block.cell_ptr(i));
         // get the string from the dict decoder
-        Slice element = _dict_decoder->string_at_index(codeword);
-        if (element.size > 0) {
-            char* destination = 
(char*)dst->column_block()->pool()->allocate(element.size);
-            if (destination == nullptr) {
-                return Status::MemoryAllocFailed(
-                        strings::Substitute("memory allocate failed, size:$0", 
element.size));
-            }
-            element.relocate(destination);
-        }
-        *out = element;
+        *out = _dict_decoder->string_at_index(codeword);
+        mem_len[i] = out->size;
+        out++;
+    }
+
+    // use SIMD instruction to speed up call function `RoundUpToPowerOfTwo`
+    auto mem_size = 0;
+    for (int i = 0; i < len; ++i) {
+        mem_len[i] = BitUtil::RoundUpToPowerOf2Int32(mem_len[i], 
MemPool::DEFAULT_ALIGNMENT);
+        mem_size += mem_len[i];
+    }
+
+    // allocate a batch of memory and do memcpy
+    out = reinterpret_cast<Slice*>(dst->data());
+    char* destination = (char*)dst->column_block()->pool()->allocate(mem_size);
+    if (destination == nullptr) {
+        return Status::MemoryAllocFailed(
+                strings::Substitute("memory allocate failed, size:$0", 
mem_size));
+    }
+    for (int i = 0; i < len; ++i) {
+        out->relocate(destination);
+        destination += mem_len[i];
         ++out;
     }
+
     return Status::OK();
 }
 
diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h 
b/be/src/olap/rowset/segment_v2/binary_plain_page.h
index bde3ae0..97e7fa8 100644
--- a/be/src/olap/rowset/segment_v2/binary_plain_page.h
+++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h
@@ -29,6 +29,7 @@
 #pragma once
 
 #include "common/logging.h"
+#include "gutil/strings/substitute.h"
 #include "olap/olap_common.h"
 #include "olap/rowset/segment_v2/options.h"
 #include "olap/rowset/segment_v2/page_builder.h"
@@ -193,18 +194,33 @@ public:
             *n = 0;
             return Status::OK();
         }
-        size_t max_fetch = std::min(*n, static_cast<size_t>(_num_elems - 
_cur_idx));
+        const size_t max_fetch = std::min(*n, static_cast<size_t>(_num_elems - 
_cur_idx));
 
         Slice* out = reinterpret_cast<Slice*>(dst->data());
-
+        size_t mem_len[max_fetch];
         for (size_t i = 0; i < max_fetch; i++, out++, _cur_idx++) {
-            Slice elem(string_at_index(_cur_idx));
-            out->size = elem.size;
-            if (elem.size != 0) {
-                out->data =
-                        
reinterpret_cast<char*>(dst->pool()->allocate(elem.size * sizeof(uint8_t)));
-                memcpy(out->data, elem.data, elem.size);
-            }
+            *out = string_at_index(_cur_idx);
+            mem_len[i] = out->size;
+        }
+
+        // use SIMD instruction to speed up call function `RoundUpToPowerOfTwo`
+        auto mem_size = 0;
+        for (int i = 0; i < max_fetch; ++i) {
+            mem_len[i] = BitUtil::RoundUpToPowerOf2Int32(mem_len[i], 
MemPool::DEFAULT_ALIGNMENT);
+            mem_size += mem_len[i];
+        }
+
+        // allocate a batch of memory and do memcpy
+        out = reinterpret_cast<Slice*>(dst->data());
+        char* destination = 
(char*)dst->column_block()->pool()->allocate(mem_size);
+        if (destination == nullptr) {
+            return Status::MemoryAllocFailed(
+                strings::Substitute("memory allocate failed, size:$0", 
mem_size));
+        }
+        for (int i = 0; i < max_fetch; ++i) {
+            out->relocate(destination);
+            destination += mem_len[i];
+            ++out;
         }
 
         *n = max_fetch;
diff --git a/be/src/runtime/mem_pool.h b/be/src/runtime/mem_pool.h
index 0290361..3a3750e 100644
--- a/be/src/runtime/mem_pool.h
+++ b/be/src/runtime/mem_pool.h
@@ -161,7 +161,7 @@ public:
 
     MemTracker* mem_tracker() { return mem_tracker_; }
 
-    static const int DEFAULT_ALIGNMENT = 8;
+    static constexpr int DEFAULT_ALIGNMENT = 8;
 
 private:
     friend class MemPoolTest;
diff --git a/be/src/util/bit_util.h b/be/src/util/bit_util.h
index d49e483..a4bf2ef 100644
--- a/be/src/util/bit_util.h
+++ b/be/src/util/bit_util.h
@@ -300,6 +300,12 @@ public:
         return (value + (factor - 1)) & ~(factor - 1);
     }
 
+    // speed up function compute for SIMD
+    static inline size_t RoundUpToPowerOf2Int32(size_t value, size_t factor) {
+        DCHECK((factor > 0) && ((factor & (factor - 1)) == 0));
+        return (value + (factor - 1)) & ~(factor - 1);
+    }
+
     // Returns the ceil of value/divisor
     static inline int Ceil(int value, int divisor) {
         return value / divisor + (value % divisor != 0);

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to