This is an automated email from the ASF dual-hosted git repository.

lihaopeng pushed a commit to branch tpc_preview3
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/tpc_preview3 by this push:
     new 1751b539114 [Performance](exec) reduce the rle decoder cost in null map
1751b539114 is described below

commit 1751b53911458f0831ab61cd7ae737f09a1968e7
Author: happenlee <[email protected]>
AuthorDate: Wed Dec 10 14:18:29 2025 +0800

    [Performance](exec) reduce the rle decoder cost in null map
---
 be/src/olap/rowset/segment_v2/column_reader.cpp | 92 +++++++++++++++----------
 be/src/olap/rowset/segment_v2/column_reader.h   |  2 +
 be/src/olap/rowset/segment_v2/parsed_page.h     | 20 +++---
 3 files changed, 69 insertions(+), 45 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 12bdad521bc..e9154e920cb 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -1635,22 +1635,27 @@ Status FileColumnIterator::_seek_to_pos_in_page(ParsedPage* page, ordinal_t offs
         return Status::OK();
     }
 
+    auto num_nulls = [this](ordinal_t start, ordinal_t end) {
+        auto null_count = 0;
+        for (auto i = start; i < end; i++) {
+            null_count += _page.null_maps[i];
+        }
+        return null_count;
+    };
     ordinal_t pos_in_data = offset_in_page;
-    if (_page.has_null) {
+    if (!_page.null_maps.empty()) {
         ordinal_t offset_in_data = 0;
         ordinal_t skips = offset_in_page;
+        auto skip_nulls = 0;
 
         if (offset_in_page > page->offset_in_page) {
+            skip_nulls = num_nulls(page->offset_in_page, offset_in_page);
             // forward, reuse null bitmap
             skips = offset_in_page - page->offset_in_page;
             offset_in_data = page->data_decoder->current_index();
         } else {
-            // rewind null bitmap, and
-            page->null_decoder = RleDecoder<bool>((const 
uint8_t*)page->null_bitmap.data,
-                                                  
cast_set<int>(page->null_bitmap.size), 1);
+            skip_nulls = num_nulls(0, offset_in_page);
         }
-
-        auto skip_nulls = page->null_decoder.Skip(skips);
         pos_in_data = offset_in_data + skips - skip_nulls;
     }
 
@@ -1663,6 +1668,17 @@ Status FileColumnIterator::next_batch_of_zone_map(size_t* n, vectorized::Mutable
     return _reader->next_batch_of_zone_map(n, dst);
 }
 
+std::pair<bool, int> FileColumnIterator::null_count(size_t nrows_to_read) {
+    bool is_null = _page.null_maps[_page.offset_in_page];
+    int i = 1;
+    for (; i < nrows_to_read; ++i) {
+        if (is_null != _page.null_maps[_page.offset_in_page + i]) {
+            break;
+        }
+    }
+    return std::make_pair(is_null, i);
+}
+
 Status FileColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr& 
dst,
                                       bool* has_null) {
     if (_reading_flag == ReadingFlag::SKIP_READING) {
@@ -1687,29 +1703,30 @@ Status FileColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr& d
         // number of rows to be read from this page
         size_t nrows_in_page = std::min(remaining, _page.remaining());
         size_t nrows_to_read = nrows_in_page;
-        if (_page.has_null) {
+        if (!_page.null_maps.empty()) {
+            auto* null_col =
+                    
vectorized::check_and_get_column<vectorized::ColumnNullable>(dst.get());
+            if (UNLIKELY(null_col == nullptr)) {
+                return Status::InternalError("unexpected column type in column 
reader");
+            }
+            auto& null_map = null_col->get_null_map_data();
+            auto nest_column = null_col->get_nested_column_ptr();
+
             while (nrows_to_read > 0) {
-                bool is_null = false;
-                size_t this_run = _page.null_decoder.GetNextRun(&is_null, 
nrows_to_read);
-                // we use num_rows only for CHECK
-                size_t num_rows = this_run;
-                if (!is_null) {
-                    RETURN_IF_ERROR(_page.data_decoder->next_batch(&num_rows, 
dst));
-                    DCHECK_EQ(this_run, num_rows);
+                bool is_null;
+                int i;
+                std::tie(is_null, i) = null_count(nrows_to_read);
+                if (is_null) {
+                    null_col->insert_many_defaults(i);
                 } else {
-                    *has_null = true;
-                    auto* null_col =
-                            
vectorized::check_and_get_column<vectorized::ColumnNullable>(dst.get());
-                    if (null_col != nullptr) {
-                        null_col->insert_many_defaults(this_run);
-                    } else {
-                        return Status::InternalError("unexpected column type 
in column reader");
-                    }
+                    null_map.resize_fill(null_map.size() + i, 0);
+                    size_t num_rows = i;
+                    RETURN_IF_ERROR(_page.data_decoder->next_batch(&num_rows, 
nest_column));
+                    DCHECK_EQ(i, num_rows);
                 }
-
-                nrows_to_read -= this_run;
-                _page.offset_in_page += this_run;
-                _current_ordinal += this_run;
+                nrows_to_read -= i;
+                _page.offset_in_page += i;
+                _current_ordinal += i;
             }
         } else {
             RETURN_IF_ERROR(_page.data_decoder->next_batch(&nrows_to_read, 
dst));
@@ -1742,7 +1759,7 @@ Status FileColumnIterator::read_by_rowids(const rowid_t* rowids, const size_t co
         // number of rows to be read from this page
         nrows_to_read = std::min(remaining, _page.remaining());
 
-        if (_page.has_null) {
+        if (!_page.null_maps.empty()) {
             size_t already_read = 0;
             while ((nrows_to_read - already_read) > 0) {
                 bool is_null = false;
@@ -1750,7 +1767,7 @@ Status FileColumnIterator::read_by_rowids(const rowid_t* rowids, const size_t co
                 if (UNLIKELY(this_run == 0)) {
                     break;
                 }
-                this_run = _page.null_decoder.GetNextRun(&is_null, this_run);
+                std::tie(is_null, this_run) = null_count(this_run);
                 size_t offset = total_read_count + already_read;
                 size_t this_read_count = 0;
                 rowid_t current_ordinal_in_page =
@@ -1764,14 +1781,15 @@ Status FileColumnIterator::read_by_rowids(const rowid_t* rowids, const size_t co
 
                 auto origin_index = _page.data_decoder->current_index();
                 if (this_read_count > 0) {
-                    if (is_null) {
-                        auto* null_col =
-                                
vectorized::check_and_get_column<vectorized::ColumnNullable>(
-                                        dst.get());
-                        if (UNLIKELY(null_col == nullptr)) {
-                            return Status::InternalError("unexpected column 
type in column reader");
-                        }
+                    auto* null_col =
+                            
vectorized::check_and_get_column<vectorized::ColumnNullable>(dst.get());
+                    if (UNLIKELY(null_col == nullptr)) {
+                        return Status::InternalError("unexpected column type 
in column reader");
+                    }
+                    auto& null_map = null_col->get_null_map_data();
+                    auto nest_column = null_col->get_nested_column_ptr();
 
+                    if (is_null) {
                         null_col->insert_many_defaults(this_read_count);
                     } else {
                         size_t read_count = this_read_count;
@@ -1781,7 +1799,9 @@ Status FileColumnIterator::read_by_rowids(const rowid_t* rowids, const size_t co
                         size_t page_start_off_in_decoder =
                                 _page.first_ordinal + _page.offset_in_page - 
origin_index;
                         RETURN_IF_ERROR(_page.data_decoder->read_by_rowids(
-                                &rowids[offset], page_start_off_in_decoder, 
&read_count, dst));
+                                &rowids[offset], page_start_off_in_decoder, 
&read_count,
+                                nest_column));
+                        null_map.resize_fill(null_map.size() + read_count, 0);
                         DCHECK_EQ(read_count, this_read_count);
                     }
                 }
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h
index 8e96e2f3faa..91d59996da0 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -435,6 +435,8 @@ public:
 
     Status next_batch_of_zone_map(size_t* n, vectorized::MutableColumnPtr& 
dst) override;
 
+    std::pair<bool, int> null_count(size_t nrows_to_read);
+
     Status read_by_rowids(const rowid_t* rowids, const size_t count,
                           vectorized::MutableColumnPtr& dst) override;
 
diff --git a/be/src/olap/rowset/segment_v2/parsed_page.h b/be/src/olap/rowset/segment_v2/parsed_page.h
index 399ec7b3604..7ef20adecfe 100644
--- a/be/src/olap/rowset/segment_v2/parsed_page.h
+++ b/be/src/olap/rowset/segment_v2/parsed_page.h
@@ -20,6 +20,7 @@
 #include <gen_cpp/segment_v2.pb.h>
 
 #include <memory>
+#include <vector>
 
 #include "common/status.h"
 #include "olap/rowset/segment_v2/binary_dict_page.h"
@@ -29,6 +30,7 @@
 #include "olap/rowset/segment_v2/page_decoder.h"
 #include "olap/rowset/segment_v2/page_handle.h"
 #include "util/rle_encoding.h"
+#include "util/slice.h"
 
 namespace doris {
 namespace segment_v2 {
@@ -46,12 +48,14 @@ struct ParsedPage {
         page->page_handle = std::move(handle);
 
         auto null_size = footer.nullmap_size();
-        page->has_null = null_size > 0;
-        page->null_bitmap = Slice(body.data + body.size - null_size, 
null_size);
-
-        if (page->has_null) {
-            page->null_decoder =
-                    RleDecoder<bool>((const uint8_t*)page->null_bitmap.data, 
null_size, 1);
+        auto null_bitmap = Slice(body.data + body.size - null_size, null_size);
+
+        if (null_size > 0) {
+            auto null_decoder = RleDecoder<bool>((const 
uint8_t*)null_bitmap.data, null_size, 1);
+            // Decode all null values into null_maps in advance
+            auto num_rows = footer.num_values();
+            page->null_maps.resize(num_rows);
+            null_decoder.get_values((bool*)page->null_maps.data(), num_rows);
         }
 
         Slice data_slice(body.data, body.size - null_size);
@@ -80,9 +84,7 @@ struct ParsedPage {
 
     PageHandle page_handle;
 
-    bool has_null;
-    Slice null_bitmap;
-    RleDecoder<bool> null_decoder;
+    std::vector<uint8_t> null_maps;
     std::unique_ptr<PageDecoder> data_decoder;
 
     // ordinal of the first value in this page


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to