This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 6fe6656d955 [enhancement](parquet)Optimize the performance of parquet 
reader when decode RLE_DICTIONARY encoding (#57208)
6fe6656d955 is described below

commit 6fe6656d955b5294e62de6c4454daf6d26483e2c
Author: daidai <[email protected]>
AuthorDate: Fri Oct 31 13:04:02 2025 +0800

    [enhancement](parquet)Optimize the performance of parquet reader when 
decode RLE_DICTIONARY encoding (#57208)
    
    ### What problem does this PR solve?
    Problem Summary:
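    When decoding RLE_DICTIONARY-encoded data, the parquet reader copies
    every dictionary value with memcpy, regardless of its physical type.
    However, for fixed-width primitive types (INT32, INT64, INT96, FLOAT,
    DOUBLE), a direct assignment is faster than memcpy.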
    
    In Parquet dictionary encoding, values are fetched one at a time
    through dictionary indexes rather than copied from one contiguous
    range, so each memcpy call copies only a single, very small value.
    When analyzing the
    implementation of `memcpy`, we can see that for such small sizes,
    `__builtin_memcpy` is used instead. The implementation of
    `__builtin_memcpy` essentially behaves like a series of simple
    assignments. You can observe the corresponding assembly code here:
    https://godbolt.org/z/r9Ma1ozvd.
---
 be/src/vec/exec/format/parquet/decoder.cpp         | 12 +++-
 .../format/parquet/fix_length_dict_decoder.hpp     | 65 +++++++++++++++++++---
 .../parquet/fix_length_dict_decoder_test.cpp       |  4 +-
 3 files changed, 71 insertions(+), 10 deletions(-)

diff --git a/be/src/vec/exec/format/parquet/decoder.cpp 
b/be/src/vec/exec/format/parquet/decoder.cpp
index b192bef3036..fbba7136269 100644
--- a/be/src/vec/exec/format/parquet/decoder.cpp
+++ b/be/src/vec/exec/format/parquet/decoder.cpp
@@ -63,12 +63,22 @@ Status Decoder::get_decoder(tparquet::Type::type type, 
tparquet::Encoding::type
             decoder.reset(new ByteArrayDictDecoder());
             break;
         case tparquet::Type::INT32:
+            decoder.reset(new FixLengthDictDecoder<tparquet::Type::INT32>());
+            break;
         case tparquet::Type::INT64:
+            decoder.reset(new FixLengthDictDecoder<tparquet::Type::INT64>());
+            break;
         case tparquet::Type::INT96:
+            decoder.reset(new FixLengthDictDecoder<tparquet::Type::INT96>());
+            break;
         case tparquet::Type::FLOAT:
+            decoder.reset(new FixLengthDictDecoder<tparquet::Type::FLOAT>());
+            break;
         case tparquet::Type::DOUBLE:
+            decoder.reset(new FixLengthDictDecoder<tparquet::Type::DOUBLE>());
+            break;
         case tparquet::Type::FIXED_LEN_BYTE_ARRAY:
-            decoder.reset(new FixLengthDictDecoder());
+            decoder.reset(new 
FixLengthDictDecoder<tparquet::Type::FIXED_LEN_BYTE_ARRAY>());
             break;
         default:
             return Status::InternalError("Unsupported type {}(encoding={}) in 
parquet decoder",
diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp 
b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
index c932c15b30f..2ebb8ce08d0 100644
--- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
+++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
@@ -25,8 +25,44 @@
 
 namespace doris::vectorized {
 #include "common/compile_check_begin.h"
+
+template <tparquet::Type::type type>
+struct PhysicalTypeTraits {};
+
+template <>
+struct PhysicalTypeTraits<tparquet::Type::INT32> {
+    using CppType = int32_t;
+};
+
+template <>
+struct PhysicalTypeTraits<tparquet::Type::INT64> {
+    using CppType = int64_t;
+};
+
+template <>
+struct PhysicalTypeTraits<tparquet::Type::INT96> {
+    using CppType = ParquetInt96;
+};
+
+template <>
+struct PhysicalTypeTraits<tparquet::Type::FLOAT> {
+    using CppType = float;
+};
+
+template <>
+struct PhysicalTypeTraits<tparquet::Type::DOUBLE> {
+    using CppType = double;
+};
+
+template <>
+struct PhysicalTypeTraits<tparquet::Type::FIXED_LEN_BYTE_ARRAY> {
+    using CppType = Slice;
+};
+
+template <tparquet::Type::type PhysicalType>
 class FixLengthDictDecoder final : public BaseDictDecoder {
 public:
+    using cppType = PhysicalTypeTraits<PhysicalType>::CppType;
     FixLengthDictDecoder() = default;
     ~FixLengthDictDecoder() override = default;
 
@@ -46,9 +82,12 @@ public:
         if (doris_column->is_column_dictionary() &&
             assert_cast<ColumnDictI32&>(*doris_column).dict_size() == 0) {
             std::vector<StringRef> dict_items;
+
+            char* dict_item_address = (char*)_dict.get();
             dict_items.reserve(_dict_items.size());
             for (int i = 0; i < _dict_items.size(); ++i) {
-                dict_items.emplace_back(_dict_items[i], _type_length);
+                dict_items.emplace_back(dict_item_address, _type_length);
+                dict_item_address += _type_length;
             }
             assert_cast<ColumnDictI32&>(*doris_column)
                     .insert_many_dict_data(dict_items.data(),
@@ -82,8 +121,12 @@ protected:
             switch (read_type) {
             case ColumnSelectVector::CONTENT: {
                 for (size_t i = 0; i < run_length; ++i) {
-                    memcpy(raw_data + data_index, 
_dict_items[_indexes[dict_index++]],
-                           _type_length);
+                    if constexpr (PhysicalType == 
tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
+                        auto& slice = _dict_items[_indexes[dict_index++]];
+                        memcpy(raw_data + data_index, slice.get_data(), 
_type_length);
+                    } else {
+                        *(cppType*)(raw_data + data_index) = 
_dict_items[_indexes[dict_index++]];
+                    }
                     data_index += _type_length;
                 }
                 break;
@@ -117,7 +160,11 @@ protected:
         char* dict_item_address = reinterpret_cast<char*>(_dict.get());
         _dict_items.resize(num_values);
         for (size_t i = 0; i < num_values; ++i) {
-            _dict_items[i] = dict_item_address;
+            if constexpr (PhysicalType == 
tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
+                _dict_items[i] = Slice {dict_item_address, 
(size_t)_type_length};
+            } else {
+                _dict_items[i] = *((cppType*)dict_item_address);
+            }
             dict_item_address += _type_length;
         }
         return Status::OK();
@@ -127,8 +174,10 @@ protected:
         size_t dict_items_size = _dict_items.size();
         std::vector<StringRef> dict_values;
         dict_values.reserve(dict_items_size);
+        auto* dict_item_address = (const char*)_dict.get();
         for (size_t i = 0; i < dict_items_size; ++i) {
-            dict_values.emplace_back(_dict_items[i], _type_length);
+            dict_values.emplace_back(dict_item_address, _type_length);
+            dict_item_address += _type_length;
         }
         doris_column->insert_many_strings(&dict_values[0], dict_items_size);
         return Status::OK();
@@ -139,14 +188,16 @@ protected:
         std::vector<StringRef> dict_values;
         dict_values.reserve(dict_column->size());
         const auto& data = dict_column->get_data();
+        auto* dict_item_address = (const char*)_dict.get();
+
         for (size_t i = 0; i < dict_column->size(); ++i) {
-            dict_values.emplace_back(_dict_items[data[i]], _type_length);
+            dict_values.emplace_back(dict_item_address + data[i] * 
_type_length, _type_length);
         }
         res->insert_many_strings(&dict_values[0], dict_values.size());
         return res;
     }
     // For dictionary encoding
-    std::vector<char*> _dict_items;
+    std::vector<typename PhysicalTypeTraits<PhysicalType>::CppType> 
_dict_items;
 };
 #include "common/compile_check_end.h"
 
diff --git a/be/test/vec/exec/format/parquet/fix_length_dict_decoder_test.cpp 
b/be/test/vec/exec/format/parquet/fix_length_dict_decoder_test.cpp
index 06311bfb2b8..b025cdf13cf 100644
--- a/be/test/vec/exec/format/parquet/fix_length_dict_decoder_test.cpp
+++ b/be/test/vec/exec/format/parquet/fix_length_dict_decoder_test.cpp
@@ -44,7 +44,7 @@ protected:
         ASSERT_TRUE(_decoder.set_dict(dict_data, dict_data_size, 
dict_size).ok());
     }
 
-    FixLengthDictDecoder _decoder;
+    FixLengthDictDecoder<tparquet::Type::FIXED_LEN_BYTE_ARRAY> _decoder;
     size_t _type_length;
 };
 
@@ -200,7 +200,7 @@ TEST_F(FixLengthDictDecoderTest, 
test_decode_with_filter_and_null) {
 
 // Test empty dictionary case
 TEST_F(FixLengthDictDecoderTest, test_empty_dict) {
-    FixLengthDictDecoder empty_decoder;
+    FixLengthDictDecoder<tparquet::Type::INT32> empty_decoder;
     empty_decoder.set_type_length(sizeof(int32_t));
 
     auto dict_data = make_unique_buffer<uint8_t>(0);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to