gaodayue commented on a change in pull request #1818: Add frame_of_reference 
page
URL: https://github.com/apache/incubator-doris/pull/1818#discussion_r325998239
 
 

 ##########
 File path: be/src/util/frame_of_reference_coding.cpp
 ##########
 @@ -0,0 +1,261 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "util/frame_of_reference_coding.h"
+
+#include <cstring>
+
+namespace doris {
+
+static inline uint8_t bits(const uint32_t v) {  // Returns how many bits are needed to represent v (0 when v == 0).
+    return v == 0 ? 0 : 32 - __builtin_clz(v);  // v == 0 is handled first: GCC documents __builtin_clz(0) as undefined
+}
+
+static inline uint32_t bits64(const uint64_t v) {  // 64-bit counterpart of bits(): bits needed to represent v (0 when v == 0).
+    return v == 0 ? 0 : 64 - __builtin_clzll(v);  // v == 0 is handled first: GCC documents __builtin_clzll(0) as undefined
+}
+
+template<typename T>  // Appends `count` values from in_data to _buffered_values; returns the input pointer advanced past them.
+const T* ForEncoder<T>::copy_value(const T *in_data, size_t count) {  // NOTE(review): no capacity check here; callers must ensure the values fit
+    memcpy(&_buffered_values[_buffered_values_num], in_data, count * 
sizeof(T));
+    _buffered_values_num += count;  // the next append starts after the values just copied
+    in_data += count;  // advances only the local pointer; callers must use the return value
+    return in_data;
+}
+
+template<typename T>  // Accepts `count` values: encodes every full frame and buffers whatever is left over.
+void ForEncoder<T>::put_batch(const T *in_data, size_t count) {
+    if (_buffered_values_num + count < ForCoding::FRAME_VALUE_NUM) {  // fast path: still less than one full frame, so only buffer
+        copy_value(in_data, count);
+        _count += count;
+        return;
+    }
+
+    // 1. padding one frame
+    size_t padding_num = ForCoding::FRAME_VALUE_NUM - _buffered_values_num;  // values needed to top the buffer up to a full frame
+    in_data = copy_value(in_data, padding_num);
+    bit_packing_one_frame_value(_buffered_values);  // encode the now-full buffer
+
+    // 2. process frame by frame
+    size_t frame_size = (count - padding_num) / ForCoding::FRAME_VALUE_NUM;  // number of whole frames left in the input
+    for(size_t i = 0; i < frame_size; i ++) {
+        // directly encode value to the bit_writer, don't buffer the value
+        _buffered_values_num = ForCoding::FRAME_VALUE_NUM;  // bit_packing_one_frame_value uses this member as the element count
+        bit_packing_one_frame_value(in_data);
+        in_data += ForCoding::FRAME_VALUE_NUM;
+    }
+
+    // 3. process remaining value
+    size_t remaining_num = (count - padding_num) % ForCoding::FRAME_VALUE_NUM;  // tail shorter than one frame
+    if (remaining_num > 0) {
+        copy_value(in_data, remaining_num);  // NOTE(review): presumably bit_packing_one_frame_value resets _buffered_values_num to 0 - not visible here, confirm
+    }
+
+    _count += count;  // add this batch to the running total
+}
+
+template<typename T>
+void ForEncoder<T>::bit_packing_one_frame_value(const T* input) {
+    _frame_offsets.push_back(_bit_writer.bits_written());
+
+    T min = input[0];
+    T max = input[0];
+    for (uint8_t i = 1; i < _buffered_values_num; ++i) {
+        if (input[i] < min) {
+            min = input[i];
+            continue;
+        }
+
+        if (input[i] > max) {
+            max = input[i];
+        }
+    }
+
+    uint32_t bit_width = 0;
+    if (sizeof(T) == 8) {
+        bit_width = bits64(static_cast<T>(max - min));
+    } else {
+        bit_width = bits(static_cast<T>(max - min));
+    }
+
+    _bit_writer.PutValue(min, sizeof(T) * ForCoding::BYTE_SIZE);
 
 Review comment:
   The problems with reading/writing everything via BitReader/BitWriter are:
   
   1. It doesn't handle endianness correctly. The storage format requires 
that fixed-length integers be written in little-endian order. On a big-endian 
machine, BitWriter doesn't write little-endian ints. Using 
`put_fixed32_le` and `decode_fixed32_le` is the way to go.
   2. It makes it difficult for us to switch to a better-performing bit-packing 
library, because of the mixed bit widths.
   3. It actually makes the code more difficult to read and write. If we always 
read/write a whole frame with a fixed bit width via BitReader/BitWriter, there is 
no need to care about bit positions any more, and the code would be much easier to 
understand and less error-prone.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to