This is an automated email from the ASF dual-hosted git repository.

adonisling pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new f50054f547 [Enhancement](array-type) record offsets info to speed up 
the seek performance (#12293)
f50054f547 is described below

commit f50054f54709ae7a649c34e801e20004f7d13770
Author: lihangyu <[email protected]>
AuthorDate: Wed Sep 14 22:41:54 2022 +0800

    [Enhancement](array-type) record offsets info to speed up the seek 
performance (#12293)
    
    Store the offset rather than the length in file for the data with array 
type. The new file format can improve the seek performance. Please refer to 
#12246 to get the performance report.
    
    Co-authored-by: xy720 <[email protected]>
---
 be/src/olap/column_vector.cpp                   | 10 ++++
 be/src/olap/column_vector.h                     |  7 +++
 be/src/olap/rowset/segment_v2/column_reader.cpp | 80 ++++++++++++++++++++-----
 be/src/olap/rowset/segment_v2/column_reader.h   | 46 ++++----------
 be/src/olap/rowset/segment_v2/column_writer.cpp | 34 +++++------
 be/src/olap/rowset/segment_v2/column_writer.h   |  6 +-
 be/src/olap/rowset/segment_v2/parsed_page.h     |  7 ++-
 gensrc/proto/segment_v2.proto                   |  4 +-
 8 files changed, 116 insertions(+), 78 deletions(-)

diff --git a/be/src/olap/column_vector.cpp b/be/src/olap/column_vector.cpp
index 36fd451a21..2c31fbd501 100644
--- a/be/src/olap/column_vector.cpp
+++ b/be/src/olap/column_vector.cpp
@@ -224,6 +224,16 @@ Status ArrayColumnVectorBatch::resize(size_t new_cap) {
     return Status::OK();
 }
 
+void ArrayColumnVectorBatch::put_item_ordinal(segment_v2::ordinal_t* ordinals, 
size_t start_idx,
+                                              size_t size) {
+    DCHECK(size > 0);
+    size_t first_offset = *(_offsets->scalar_cell_ptr(start_idx));
+    for (size_t i = 1; i < size; ++i) {
+        segment_v2::ordinal_t first_ordinal = ordinals[0];
+        *(_offsets->scalar_cell_ptr(start_idx + i)) = first_offset + 
(ordinals[i] - first_ordinal);
+    }
+}
+
 void ArrayColumnVectorBatch::get_offset_by_length(size_t start_idx, size_t 
size) {
     DCHECK(start_idx >= 0);
     DCHECK(start_idx + size < _offsets->capacity());
diff --git a/be/src/olap/column_vector.h b/be/src/olap/column_vector.h
index 28139fe7d2..33c59065f7 100644
--- a/be/src/olap/column_vector.h
+++ b/be/src/olap/column_vector.h
@@ -233,6 +233,13 @@ public:
      */
     void get_offset_by_length(size_t start_idx, size_t size);
 
+    // From `start_idx`, put `size` ordinals to _item_offsets
+    // Ex:
+    // original _item_offsets: 0 3 5 9; ordinals to be added: 100 105 111; 
size: 3; start_idx: 3
+    // --> _item_offsets: 0 3 5 9 (9 + 105 - 100) (9 + 111 - 100)
+    // _item_offsets becomes 0 3 5 9 14 20
+    void put_item_ordinal(segment_v2::ordinal_t* ordinals, size_t start_idx, 
size_t size);
+
     size_t get_item_size(size_t start_idx, size_t size) {
         return *(_offsets->scalar_cell_ptr(start_idx + size)) -
                *(_offsets->scalar_cell_ptr(start_idx));
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp 
b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 31d07a1560..c9dc6e2c32 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -387,7 +387,7 @@ 
ArrayFileColumnIterator::ArrayFileColumnIterator(ColumnReader* reader,
                                                  ColumnIterator* item_iterator,
                                                  ColumnIterator* null_iterator)
         : _array_reader(reader) {
-    _length_iterator.reset(offset_reader);
+    _offset_iterator.reset(offset_reader);
     _item_iterator.reset(item_iterator);
     if (_array_reader->is_nullable()) {
         _null_iterator.reset(null_iterator);
@@ -395,7 +395,7 @@ 
ArrayFileColumnIterator::ArrayFileColumnIterator(ColumnReader* reader,
 }
 
 Status ArrayFileColumnIterator::init(const ColumnIteratorOptions& opts) {
-    RETURN_IF_ERROR(_length_iterator->init(opts));
+    RETURN_IF_ERROR(_offset_iterator->init(opts));
     RETURN_IF_ERROR(_item_iterator->init(opts));
     if (_array_reader->is_nullable()) {
         RETURN_IF_ERROR(_null_iterator->init(opts));
@@ -406,22 +406,42 @@ Status ArrayFileColumnIterator::init(const 
ColumnIteratorOptions& opts) {
     return Status::OK();
 }
 
+Status ArrayFileColumnIterator::_peek_one_offset(ordinal_t* offset) {
+    if (_offset_iterator->get_current_page()->has_remaining()) {
+        PageDecoder* offset_page_decoder = 
_offset_iterator->get_current_page()->data_decoder;
+        ColumnBlock ordinal_block(_length_batch.get(), nullptr);
+        ColumnBlockView ordinal_view(&ordinal_block);
+        size_t i = 1;
+        RETURN_IF_ERROR(offset_page_decoder->peek_next_batch(&i, 
&ordinal_view)); // not null
+        DCHECK(i == 1);
+        *offset = *reinterpret_cast<uint64_t*>(_length_batch->data());
+    } else {
+        *offset = 
_offset_iterator->get_current_page()->next_array_item_ordinal;
+    }
+    return Status::OK();
+}
+
 Status ArrayFileColumnIterator::next_batch(size_t* n, ColumnBlockView* dst, 
bool* has_null) {
     ColumnBlock* array_block = dst->column_block();
     auto* array_batch = 
static_cast<ArrayColumnVectorBatch*>(array_block->vector_batch());
 
-    // 1. read n offsets
+    // 1. read n+1 offsets
+    array_batch->offsets()->resize(*n + 1);
     ColumnBlock offset_block(array_batch->offsets(), nullptr);
-    ColumnBlockView offset_view(&offset_block,
-                                dst->current_offset() + 1); // 
offset应该比collection的游标多1
+    ColumnBlockView offset_view(&offset_block);
     bool offset_has_null = false;
-    RETURN_IF_ERROR(_length_iterator->next_batch(n, &offset_view, 
&offset_has_null));
+    RETURN_IF_ERROR(_offset_iterator->next_batch(n, &offset_view, 
&offset_has_null));
     DCHECK(!offset_has_null);
 
     if (*n == 0) {
         return Status::OK();
     }
-    array_batch->get_offset_by_length(dst->current_offset(), *n);
+
+    
RETURN_IF_ERROR(_peek_one_offset(reinterpret_cast<ordinal_t*>(offset_view.data())));
+
+    size_t start_offset = dst->current_offset();
+    auto* ordinals = reinterpret_cast<ordinal_t*>(offset_block.data());
+    array_batch->put_item_ordinal(ordinals, start_offset, *n + 1);
 
     // 2. read null
     if (_array_reader->is_nullable()) {
@@ -439,7 +459,7 @@ Status ArrayFileColumnIterator::next_batch(size_t* n, 
ColumnBlockView* dst, bool
     }
 
     // read item
-    size_t item_size = array_batch->get_item_size(dst->current_offset(), *n);
+    size_t item_size = ordinals[*n] - ordinals[0];
     if (item_size >= 0) {
         bool item_has_null = false;
         ColumnVectorBatch* item_vector_batch = array_batch->elements();
@@ -466,6 +486,39 @@ Status ArrayFileColumnIterator::next_batch(size_t* n, 
ColumnBlockView* dst, bool
     return Status::OK();
 }
 
+Status ArrayFileColumnIterator::_seek_by_offsets(ordinal_t ord) {
+    // using offsets info
+    ordinal_t offset = 0;
+    RETURN_IF_ERROR(_peek_one_offset(&offset));
+    RETURN_IF_ERROR(_item_iterator->seek_to_ordinal(offset));
+    return Status::OK();
+}
+
+Status ArrayFileColumnIterator::seek_to_ordinal(ordinal_t ord) {
+    RETURN_IF_ERROR(_offset_iterator->seek_to_ordinal(ord));
+    if (_array_reader->is_nullable()) {
+        RETURN_IF_ERROR(_null_iterator->seek_to_ordinal(ord));
+    }
+    return _seek_by_offsets(ord);
+}
+
+Status ArrayFileColumnIterator::_calculate_offsets(
+        ssize_t start, vectorized::ColumnArray::ColumnOffsets& column_offsets) 
{
+    ordinal_t last_offset = 0;
+    RETURN_IF_ERROR(_peek_one_offset(&last_offset));
+
+    // calculate real offsets
+    auto& offsets_data = column_offsets.get_data();
+    ordinal_t first_offset = offsets_data[start - 1]; // -1 is valid
+    ordinal_t first_ord = offsets_data[start];
+    for (ssize_t i = start; i < offsets_data.size() - 1; ++i) {
+        offsets_data[i] = first_offset + (offsets_data[i + 1] - first_ord);
+    }
+    // last offset
+    offsets_data[offsets_data.size() - 1] = first_offset + (last_offset - 
first_ord);
+    return Status::OK();
+}
+
 Status ArrayFileColumnIterator::next_batch(size_t* n, 
vectorized::MutableColumnPtr& dst,
                                            bool* has_null) {
     const auto* column_array = 
vectorized::check_and_get_column<vectorized::ColumnArray>(
@@ -475,19 +528,16 @@ Status ArrayFileColumnIterator::next_batch(size_t* n, 
vectorized::MutableColumnP
     bool offsets_has_null = false;
     auto column_offsets_ptr = 
column_array->get_offsets_column().assume_mutable();
     ssize_t start = column_offsets_ptr->size();
-    RETURN_IF_ERROR(_length_iterator->next_batch(n, column_offsets_ptr, 
&offsets_has_null));
+    RETURN_IF_ERROR(_offset_iterator->next_batch(n, column_offsets_ptr, 
&offsets_has_null));
     if (*n == 0) {
         return Status::OK();
     }
     auto& column_offsets =
             
static_cast<vectorized::ColumnArray::ColumnOffsets&>(*column_offsets_ptr);
-    auto& offsets_data = column_offsets.get_data();
-    for (ssize_t i = start; i < offsets_data.size(); ++i) {
-        offsets_data[i] += offsets_data[i - 1]; // -1 is ok
-    }
-
+    RETURN_IF_ERROR(_calculate_offsets(start, column_offsets));
+    size_t num_items =
+            column_offsets.get_data().back() - column_offsets.get_data()[start 
- 1]; // -1 is valid
     auto column_items_ptr = column_array->get_data().assume_mutable();
-    size_t num_items = offsets_data.back() - offsets_data[start - 1];
     if (num_items > 0) {
         size_t num_read = num_items;
         bool items_has_null = false;
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h 
b/be/src/olap/rowset/segment_v2/column_reader.h
index 60b8c3ae7b..92b6572e4e 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -38,6 +38,7 @@
 #include "olap/tablet_schema.h"
 #include "util/file_cache.h"
 #include "util/once.h"
+#include "vec/columns/column_array.h" // ColumnArray
 
 namespace doris {
 
@@ -381,7 +382,7 @@ public:
                           vectorized::MutableColumnPtr& dst) override;
 
     Status seek_to_first() override {
-        RETURN_IF_ERROR(_length_iterator->seek_to_first());
+        RETURN_IF_ERROR(_offset_iterator->seek_to_first());
         RETURN_IF_ERROR(_item_iterator->seek_to_first()); // lazy???
         if (_array_reader->is_nullable()) {
             RETURN_IF_ERROR(_null_iterator->seek_to_first());
@@ -389,50 +390,23 @@ public:
         return Status::OK();
     }
 
-    Status seek_to_ordinal(ordinal_t ord) override {
-        RETURN_IF_ERROR(_length_iterator->seek_to_ordinal(ord));
-        if (_array_reader->is_nullable()) {
-            RETURN_IF_ERROR(_null_iterator->seek_to_ordinal(ord));
-        }
-
-        RETURN_IF_ERROR(_length_iterator->seek_to_page_start());
-        if (_length_iterator->get_current_ordinal() == ord) {
-            RETURN_IF_ERROR(_item_iterator->seek_to_ordinal(
-                    
_length_iterator->get_current_page()->first_array_item_ordinal));
-        } else {
-            ordinal_t start_offset_in_this_page =
-                    
_length_iterator->get_current_page()->first_array_item_ordinal;
-            ColumnBlock ordinal_block(_length_batch.get(), nullptr);
-            ordinal_t size_to_read = ord - 
_length_iterator->get_current_ordinal();
-            bool has_null = false;
-            ordinal_t item_ordinal = start_offset_in_this_page;
-            while (size_to_read > 0) {
-                size_t this_read = _length_batch->capacity() < size_to_read
-                                           ? _length_batch->capacity()
-                                           : size_to_read;
-                ColumnBlockView ordinal_view(&ordinal_block);
-                RETURN_IF_ERROR(_length_iterator->next_batch(&this_read, 
&ordinal_view, &has_null));
-                auto* ordinals = 
reinterpret_cast<uint64_t*>(_length_batch->data());
-                for (int i = 0; i < this_read; ++i) {
-                    item_ordinal += ordinals[i];
-                }
-                size_to_read -= this_read;
-            }
-            RETURN_IF_ERROR(_item_iterator->seek_to_ordinal(item_ordinal));
-        }
-        return Status::OK();
-    }
+    Status seek_to_ordinal(ordinal_t ord) override;
 
     ordinal_t get_current_ordinal() const override {
-        return _length_iterator->get_current_ordinal();
+        return _offset_iterator->get_current_ordinal();
     }
 
 private:
     ColumnReader* _array_reader;
-    std::unique_ptr<FileColumnIterator> _length_iterator;
+    std::unique_ptr<FileColumnIterator> _offset_iterator;
     std::unique_ptr<ColumnIterator> _null_iterator;
     std::unique_ptr<ColumnIterator> _item_iterator;
     std::unique_ptr<ColumnVectorBatch> _length_batch;
+
+    Status _peek_one_offset(ordinal_t* offset);
+    Status _seek_by_offsets(ordinal_t ord);
+    Status _calculate_offsets(ssize_t start,
+                              vectorized::ColumnArray::ColumnOffsets& 
column_offsets);
 };
 
 // This iterator is used to read default value column
diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp 
b/be/src/olap/rowset/segment_v2/column_writer.cpp
index ca71331165..e49f0270e3 100644
--- a/be/src/olap/rowset/segment_v2/column_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/column_writer.cpp
@@ -519,24 +519,25 @@ ArrayColumnWriter::ArrayColumnWriter(const 
ColumnWriterOptions& opts, std::uniqu
         : ColumnWriter(std::move(field), opts.meta->is_nullable()),
           _item_writer(std::move(item_writer)),
           _opts(opts) {
-    _length_writer.reset(length_writer);
+    _offset_writer.reset(length_writer);
     if (is_nullable()) {
         _null_writer.reset(null_writer);
     }
 }
 
 Status ArrayColumnWriter::init() {
-    RETURN_IF_ERROR(_length_writer->init());
+    RETURN_IF_ERROR(_offset_writer->init());
     if (is_nullable()) {
         RETURN_IF_ERROR(_null_writer->init());
     }
     RETURN_IF_ERROR(_item_writer->init());
-    _length_writer->register_flush_page_callback(this);
+    _offset_writer->register_flush_page_callback(this);
+
     return Status::OK();
 }
 
 Status ArrayColumnWriter::put_extra_info_in_page(DataPageFooterPB* footer) {
-    footer->set_first_array_item_ordinal(_current_length_page_first_ordinal);
+    footer->set_next_array_item_ordinal(_item_writer->get_next_rowid());
     return Status::OK();
 }
 
@@ -547,14 +548,12 @@ Status ArrayColumnWriter::append_data(const uint8_t** 
ptr, size_t num_rows) {
     while (remaining > 0) {
         // TODO llj: bulk write
         size_t num_written = 1;
-        auto size_ptr = col_cursor->length();
-        RETURN_IF_ERROR(_length_writer->append_data_in_current_page(
-                reinterpret_cast<uint8_t*>(&size_ptr), &num_written));
+        ordinal_t next_item_ordinal = _item_writer->get_next_rowid();
+        RETURN_IF_ERROR(_offset_writer->append_data_in_current_page(
+                reinterpret_cast<uint8_t*>(&next_item_ordinal), &num_written));
         if (num_written <
             1) { // page is full, write first item offset and update current 
length page's start ordinal
-            RETURN_IF_ERROR(_length_writer->finish_current_page());
-            _current_length_page_first_ordinal += _length_sum_in_cur_page;
-            _length_sum_in_cur_page = 0;
+            RETURN_IF_ERROR(_offset_writer->finish_current_page());
         } else {
             // write child item.
             if (_item_writer->is_nullable()) {
@@ -568,7 +567,6 @@ Status ArrayColumnWriter::append_data(const uint8_t** ptr, 
size_t num_rows) {
                 
RETURN_IF_ERROR(_item_writer->append_data(reinterpret_cast<const 
uint8_t**>(&data),
                                                           
col_cursor->length()));
             }
-            _length_sum_in_cur_page += col_cursor->length();
         }
         remaining -= num_written;
         col_cursor += num_written;
@@ -582,13 +580,13 @@ Status ArrayColumnWriter::append_data(const uint8_t** 
ptr, size_t num_rows) {
 }
 
 uint64_t ArrayColumnWriter::estimate_buffer_size() {
-    return _length_writer->estimate_buffer_size() +
+    return _offset_writer->estimate_buffer_size() +
            (is_nullable() ? _null_writer->estimate_buffer_size() : 0) +
            _item_writer->estimate_buffer_size();
 }
 
 Status ArrayColumnWriter::finish() {
-    RETURN_IF_ERROR(_length_writer->finish());
+    RETURN_IF_ERROR(_offset_writer->finish());
     if (is_nullable()) {
         RETURN_IF_ERROR(_null_writer->finish());
     }
@@ -597,7 +595,7 @@ Status ArrayColumnWriter::finish() {
 }
 
 Status ArrayColumnWriter::write_data() {
-    RETURN_IF_ERROR(_length_writer->write_data());
+    RETURN_IF_ERROR(_offset_writer->write_data());
     if (is_nullable()) {
         RETURN_IF_ERROR(_null_writer->write_data());
     }
@@ -606,7 +604,7 @@ Status ArrayColumnWriter::write_data() {
 }
 
 Status ArrayColumnWriter::write_ordinal_index() {
-    RETURN_IF_ERROR(_length_writer->write_ordinal_index());
+    RETURN_IF_ERROR(_offset_writer->write_ordinal_index());
     if (is_nullable()) {
         RETURN_IF_ERROR(_null_writer->write_ordinal_index());
     }
@@ -618,11 +616,11 @@ Status ArrayColumnWriter::write_ordinal_index() {
 
 Status ArrayColumnWriter::append_nulls(size_t num_rows) {
     size_t num_lengths = num_rows;
-    const ordinal_t zero = 0;
+    const ordinal_t offset = _item_writer->get_next_rowid();
     while (num_lengths > 0) {
         // TODO llj bulk write
-        const auto* zero_ptr = reinterpret_cast<const uint8_t*>(&zero);
-        RETURN_IF_ERROR(_length_writer->append_data(&zero_ptr, 1));
+        const auto* offset_ptr = reinterpret_cast<const uint8_t*>(&offset);
+        RETURN_IF_ERROR(_offset_writer->append_data(&offset_ptr, 1));
         --num_lengths;
     }
     return write_null_column(num_rows, true);
diff --git a/be/src/olap/rowset/segment_v2/column_writer.h 
b/be/src/olap/rowset/segment_v2/column_writer.h
index 2c3ea79781..68fef318e5 100644
--- a/be/src/olap/rowset/segment_v2/column_writer.h
+++ b/be/src/olap/rowset/segment_v2/column_writer.h
@@ -292,7 +292,7 @@ public:
         }
         return Status::OK();
     }
-    ordinal_t get_next_rowid() const override { return 
_length_writer->get_next_rowid(); }
+    ordinal_t get_next_rowid() const override { return 
_offset_writer->get_next_rowid(); }
 
 private:
     Status put_extra_info_in_page(DataPageFooterPB* header) override;
@@ -300,12 +300,10 @@ private:
     bool has_empty_items() const { return _item_writer->get_next_rowid() == 0; 
}
 
 private:
-    std::unique_ptr<ScalarColumnWriter> _length_writer;
+    std::unique_ptr<ScalarColumnWriter> _offset_writer;
     std::unique_ptr<ScalarColumnWriter> _null_writer;
     std::unique_ptr<ColumnWriter> _item_writer;
     ColumnWriterOptions _opts;
-    ordinal_t _current_length_page_first_ordinal = 0;
-    ordinal_t _length_sum_in_cur_page = 0;
 };
 
 } // namespace segment_v2
diff --git a/be/src/olap/rowset/segment_v2/parsed_page.h 
b/be/src/olap/rowset/segment_v2/parsed_page.h
index 232c347c38..f3b6ed7b55 100644
--- a/be/src/olap/rowset/segment_v2/parsed_page.h
+++ b/be/src/olap/rowset/segment_v2/parsed_page.h
@@ -68,7 +68,7 @@ struct ParsedPage {
         page->page_pointer = page_pointer;
         page->page_index = page_index;
 
-        page->first_array_item_ordinal = footer.first_array_item_ordinal();
+        page->next_array_item_ordinal = footer.next_array_item_ordinal();
 
         return Status::OK();
     }
@@ -89,8 +89,9 @@ struct ParsedPage {
     ordinal_t first_ordinal = 0;
     // number of rows including nulls and not-nulls
     ordinal_t num_rows = 0;
-    // just for array type
-    ordinal_t first_array_item_ordinal = 0;
+    // record it to get the last array element's size
+    // should be none zero if setted in page
+    ordinal_t next_array_item_ordinal = 0;
 
     PagePointer page_pointer;
     uint32_t page_index = 0;
diff --git a/gensrc/proto/segment_v2.proto b/gensrc/proto/segment_v2.proto
index fe0fcd9d93..7a69286046 100644
--- a/gensrc/proto/segment_v2.proto
+++ b/gensrc/proto/segment_v2.proto
@@ -69,8 +69,8 @@ message DataPageFooterPB {
     // required: size of nullmap, 0 if the page doesn't contain NULL
     optional uint32 nullmap_size = 3;
     // only for array column
-    // Save the first array's first item's ordinal.
-    optional uint64 first_array_item_ordinal = 4;
+    // Save the offset of next page 
+    optional uint64 next_array_item_ordinal = 4;
 }
 
 message IndexPageFooterPB {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to