This is an automated email from the ASF dual-hosted git repository.
adonisling pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new f50054f547 [Enhancement](array-type) record offsets info to speed up
the seek performance (#12293)
f50054f547 is described below
commit f50054f54709ae7a649c34e801e20004f7d13770
Author: lihangyu <[email protected]>
AuthorDate: Wed Sep 14 22:41:54 2022 +0800
[Enhancement](array-type) record offsets info to speed up the seek
performance (#12293)
Store the offset rather than the length in file for the data with array
type. The new file format can improve the seek performance. Please refer to
#12246 to get the performance report.
Co-authored-by: xy720 <[email protected]>
---
be/src/olap/column_vector.cpp | 10 ++++
be/src/olap/column_vector.h | 7 +++
be/src/olap/rowset/segment_v2/column_reader.cpp | 80 ++++++++++++++++++++-----
be/src/olap/rowset/segment_v2/column_reader.h | 46 ++++----------
be/src/olap/rowset/segment_v2/column_writer.cpp | 34 +++++------
be/src/olap/rowset/segment_v2/column_writer.h | 6 +-
be/src/olap/rowset/segment_v2/parsed_page.h | 7 ++-
gensrc/proto/segment_v2.proto | 4 +-
8 files changed, 116 insertions(+), 78 deletions(-)
diff --git a/be/src/olap/column_vector.cpp b/be/src/olap/column_vector.cpp
index 36fd451a21..2c31fbd501 100644
--- a/be/src/olap/column_vector.cpp
+++ b/be/src/olap/column_vector.cpp
@@ -224,6 +224,16 @@ Status ArrayColumnVectorBatch::resize(size_t new_cap) {
return Status::OK();
}
+void ArrayColumnVectorBatch::put_item_ordinal(segment_v2::ordinal_t* ordinals,
size_t start_idx,
+ size_t size) {
+ DCHECK(size > 0);
+ size_t first_offset = *(_offsets->scalar_cell_ptr(start_idx));
+ for (size_t i = 1; i < size; ++i) {
+ segment_v2::ordinal_t first_ordinal = ordinals[0];
+ *(_offsets->scalar_cell_ptr(start_idx + i)) = first_offset +
(ordinals[i] - first_ordinal);
+ }
+}
+
void ArrayColumnVectorBatch::get_offset_by_length(size_t start_idx, size_t
size) {
DCHECK(start_idx >= 0);
DCHECK(start_idx + size < _offsets->capacity());
diff --git a/be/src/olap/column_vector.h b/be/src/olap/column_vector.h
index 28139fe7d2..33c59065f7 100644
--- a/be/src/olap/column_vector.h
+++ b/be/src/olap/column_vector.h
@@ -233,6 +233,13 @@ public:
*/
void get_offset_by_length(size_t start_idx, size_t size);
+ // From `start_idx`, put `size` ordinals to _item_offsets
+ // Ex:
+ // original _item_offsets: 0 3 5 9; ordinals to be added: 100 105 111;
size: 3; start_idx: 3
+ // --> _item_offsets: 0 3 5 9 (9 + 105 - 100) (9 + 111 - 100)
+ // _item_offsets becomes 0 3 5 9 14 20
+ void put_item_ordinal(segment_v2::ordinal_t* ordinals, size_t start_idx,
size_t size);
+
size_t get_item_size(size_t start_idx, size_t size) {
return *(_offsets->scalar_cell_ptr(start_idx + size)) -
*(_offsets->scalar_cell_ptr(start_idx));
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp
b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 31d07a1560..c9dc6e2c32 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -387,7 +387,7 @@
ArrayFileColumnIterator::ArrayFileColumnIterator(ColumnReader* reader,
ColumnIterator* item_iterator,
ColumnIterator* null_iterator)
: _array_reader(reader) {
- _length_iterator.reset(offset_reader);
+ _offset_iterator.reset(offset_reader);
_item_iterator.reset(item_iterator);
if (_array_reader->is_nullable()) {
_null_iterator.reset(null_iterator);
@@ -395,7 +395,7 @@
ArrayFileColumnIterator::ArrayFileColumnIterator(ColumnReader* reader,
}
Status ArrayFileColumnIterator::init(const ColumnIteratorOptions& opts) {
- RETURN_IF_ERROR(_length_iterator->init(opts));
+ RETURN_IF_ERROR(_offset_iterator->init(opts));
RETURN_IF_ERROR(_item_iterator->init(opts));
if (_array_reader->is_nullable()) {
RETURN_IF_ERROR(_null_iterator->init(opts));
@@ -406,22 +406,42 @@ Status ArrayFileColumnIterator::init(const
ColumnIteratorOptions& opts) {
return Status::OK();
}
+Status ArrayFileColumnIterator::_peek_one_offset(ordinal_t* offset) {
+ if (_offset_iterator->get_current_page()->has_remaining()) {
+ PageDecoder* offset_page_decoder =
_offset_iterator->get_current_page()->data_decoder;
+ ColumnBlock ordinal_block(_length_batch.get(), nullptr);
+ ColumnBlockView ordinal_view(&ordinal_block);
+ size_t i = 1;
+ RETURN_IF_ERROR(offset_page_decoder->peek_next_batch(&i,
&ordinal_view)); // not null
+ DCHECK(i == 1);
+ *offset = *reinterpret_cast<uint64_t*>(_length_batch->data());
+ } else {
+ *offset =
_offset_iterator->get_current_page()->next_array_item_ordinal;
+ }
+ return Status::OK();
+}
+
Status ArrayFileColumnIterator::next_batch(size_t* n, ColumnBlockView* dst,
bool* has_null) {
ColumnBlock* array_block = dst->column_block();
auto* array_batch =
static_cast<ArrayColumnVectorBatch*>(array_block->vector_batch());
- // 1. read n offsets
+ // 1. read n+1 offsets
+ array_batch->offsets()->resize(*n + 1);
ColumnBlock offset_block(array_batch->offsets(), nullptr);
- ColumnBlockView offset_view(&offset_block,
- dst->current_offset() + 1); //
offset应该比collection的游标多1
+ ColumnBlockView offset_view(&offset_block);
bool offset_has_null = false;
- RETURN_IF_ERROR(_length_iterator->next_batch(n, &offset_view,
&offset_has_null));
+ RETURN_IF_ERROR(_offset_iterator->next_batch(n, &offset_view,
&offset_has_null));
DCHECK(!offset_has_null);
if (*n == 0) {
return Status::OK();
}
- array_batch->get_offset_by_length(dst->current_offset(), *n);
+
+
RETURN_IF_ERROR(_peek_one_offset(reinterpret_cast<ordinal_t*>(offset_view.data())));
+
+ size_t start_offset = dst->current_offset();
+ auto* ordinals = reinterpret_cast<ordinal_t*>(offset_block.data());
+ array_batch->put_item_ordinal(ordinals, start_offset, *n + 1);
// 2. read null
if (_array_reader->is_nullable()) {
@@ -439,7 +459,7 @@ Status ArrayFileColumnIterator::next_batch(size_t* n,
ColumnBlockView* dst, bool
}
// read item
- size_t item_size = array_batch->get_item_size(dst->current_offset(), *n);
+ size_t item_size = ordinals[*n] - ordinals[0];
if (item_size >= 0) {
bool item_has_null = false;
ColumnVectorBatch* item_vector_batch = array_batch->elements();
@@ -466,6 +486,39 @@ Status ArrayFileColumnIterator::next_batch(size_t* n,
ColumnBlockView* dst, bool
return Status::OK();
}
+Status ArrayFileColumnIterator::_seek_by_offsets(ordinal_t ord) {
+ // using offsets info
+ ordinal_t offset = 0;
+ RETURN_IF_ERROR(_peek_one_offset(&offset));
+ RETURN_IF_ERROR(_item_iterator->seek_to_ordinal(offset));
+ return Status::OK();
+}
+
+Status ArrayFileColumnIterator::seek_to_ordinal(ordinal_t ord) {
+ RETURN_IF_ERROR(_offset_iterator->seek_to_ordinal(ord));
+ if (_array_reader->is_nullable()) {
+ RETURN_IF_ERROR(_null_iterator->seek_to_ordinal(ord));
+ }
+ return _seek_by_offsets(ord);
+}
+
+Status ArrayFileColumnIterator::_calculate_offsets(
+ ssize_t start, vectorized::ColumnArray::ColumnOffsets& column_offsets)
{
+ ordinal_t last_offset = 0;
+ RETURN_IF_ERROR(_peek_one_offset(&last_offset));
+
+ // calculate real offsets
+ auto& offsets_data = column_offsets.get_data();
+ ordinal_t first_offset = offsets_data[start - 1]; // -1 is valid
+ ordinal_t first_ord = offsets_data[start];
+ for (ssize_t i = start; i < offsets_data.size() - 1; ++i) {
+ offsets_data[i] = first_offset + (offsets_data[i + 1] - first_ord);
+ }
+ // last offset
+ offsets_data[offsets_data.size() - 1] = first_offset + (last_offset -
first_ord);
+ return Status::OK();
+}
+
Status ArrayFileColumnIterator::next_batch(size_t* n,
vectorized::MutableColumnPtr& dst,
bool* has_null) {
const auto* column_array =
vectorized::check_and_get_column<vectorized::ColumnArray>(
@@ -475,19 +528,16 @@ Status ArrayFileColumnIterator::next_batch(size_t* n,
vectorized::MutableColumnP
bool offsets_has_null = false;
auto column_offsets_ptr =
column_array->get_offsets_column().assume_mutable();
ssize_t start = column_offsets_ptr->size();
- RETURN_IF_ERROR(_length_iterator->next_batch(n, column_offsets_ptr,
&offsets_has_null));
+ RETURN_IF_ERROR(_offset_iterator->next_batch(n, column_offsets_ptr,
&offsets_has_null));
if (*n == 0) {
return Status::OK();
}
auto& column_offsets =
static_cast<vectorized::ColumnArray::ColumnOffsets&>(*column_offsets_ptr);
- auto& offsets_data = column_offsets.get_data();
- for (ssize_t i = start; i < offsets_data.size(); ++i) {
- offsets_data[i] += offsets_data[i - 1]; // -1 is ok
- }
-
+ RETURN_IF_ERROR(_calculate_offsets(start, column_offsets));
+ size_t num_items =
+ column_offsets.get_data().back() - column_offsets.get_data()[start
- 1]; // -1 is valid
auto column_items_ptr = column_array->get_data().assume_mutable();
- size_t num_items = offsets_data.back() - offsets_data[start - 1];
if (num_items > 0) {
size_t num_read = num_items;
bool items_has_null = false;
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h
b/be/src/olap/rowset/segment_v2/column_reader.h
index 60b8c3ae7b..92b6572e4e 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -38,6 +38,7 @@
#include "olap/tablet_schema.h"
#include "util/file_cache.h"
#include "util/once.h"
+#include "vec/columns/column_array.h" // ColumnArray
namespace doris {
@@ -381,7 +382,7 @@ public:
vectorized::MutableColumnPtr& dst) override;
Status seek_to_first() override {
- RETURN_IF_ERROR(_length_iterator->seek_to_first());
+ RETURN_IF_ERROR(_offset_iterator->seek_to_first());
RETURN_IF_ERROR(_item_iterator->seek_to_first()); // lazy???
if (_array_reader->is_nullable()) {
RETURN_IF_ERROR(_null_iterator->seek_to_first());
@@ -389,50 +390,23 @@ public:
return Status::OK();
}
- Status seek_to_ordinal(ordinal_t ord) override {
- RETURN_IF_ERROR(_length_iterator->seek_to_ordinal(ord));
- if (_array_reader->is_nullable()) {
- RETURN_IF_ERROR(_null_iterator->seek_to_ordinal(ord));
- }
-
- RETURN_IF_ERROR(_length_iterator->seek_to_page_start());
- if (_length_iterator->get_current_ordinal() == ord) {
- RETURN_IF_ERROR(_item_iterator->seek_to_ordinal(
-
_length_iterator->get_current_page()->first_array_item_ordinal));
- } else {
- ordinal_t start_offset_in_this_page =
-
_length_iterator->get_current_page()->first_array_item_ordinal;
- ColumnBlock ordinal_block(_length_batch.get(), nullptr);
- ordinal_t size_to_read = ord -
_length_iterator->get_current_ordinal();
- bool has_null = false;
- ordinal_t item_ordinal = start_offset_in_this_page;
- while (size_to_read > 0) {
- size_t this_read = _length_batch->capacity() < size_to_read
- ? _length_batch->capacity()
- : size_to_read;
- ColumnBlockView ordinal_view(&ordinal_block);
- RETURN_IF_ERROR(_length_iterator->next_batch(&this_read,
&ordinal_view, &has_null));
- auto* ordinals =
reinterpret_cast<uint64_t*>(_length_batch->data());
- for (int i = 0; i < this_read; ++i) {
- item_ordinal += ordinals[i];
- }
- size_to_read -= this_read;
- }
- RETURN_IF_ERROR(_item_iterator->seek_to_ordinal(item_ordinal));
- }
- return Status::OK();
- }
+ Status seek_to_ordinal(ordinal_t ord) override;
ordinal_t get_current_ordinal() const override {
- return _length_iterator->get_current_ordinal();
+ return _offset_iterator->get_current_ordinal();
}
private:
ColumnReader* _array_reader;
- std::unique_ptr<FileColumnIterator> _length_iterator;
+ std::unique_ptr<FileColumnIterator> _offset_iterator;
std::unique_ptr<ColumnIterator> _null_iterator;
std::unique_ptr<ColumnIterator> _item_iterator;
std::unique_ptr<ColumnVectorBatch> _length_batch;
+
+ Status _peek_one_offset(ordinal_t* offset);
+ Status _seek_by_offsets(ordinal_t ord);
+ Status _calculate_offsets(ssize_t start,
+ vectorized::ColumnArray::ColumnOffsets&
column_offsets);
};
// This iterator is used to read default value column
diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp
b/be/src/olap/rowset/segment_v2/column_writer.cpp
index ca71331165..e49f0270e3 100644
--- a/be/src/olap/rowset/segment_v2/column_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/column_writer.cpp
@@ -519,24 +519,25 @@ ArrayColumnWriter::ArrayColumnWriter(const
ColumnWriterOptions& opts, std::uniqu
: ColumnWriter(std::move(field), opts.meta->is_nullable()),
_item_writer(std::move(item_writer)),
_opts(opts) {
- _length_writer.reset(length_writer);
+ _offset_writer.reset(length_writer);
if (is_nullable()) {
_null_writer.reset(null_writer);
}
}
Status ArrayColumnWriter::init() {
- RETURN_IF_ERROR(_length_writer->init());
+ RETURN_IF_ERROR(_offset_writer->init());
if (is_nullable()) {
RETURN_IF_ERROR(_null_writer->init());
}
RETURN_IF_ERROR(_item_writer->init());
- _length_writer->register_flush_page_callback(this);
+ _offset_writer->register_flush_page_callback(this);
+
return Status::OK();
}
Status ArrayColumnWriter::put_extra_info_in_page(DataPageFooterPB* footer) {
- footer->set_first_array_item_ordinal(_current_length_page_first_ordinal);
+ footer->set_next_array_item_ordinal(_item_writer->get_next_rowid());
return Status::OK();
}
@@ -547,14 +548,12 @@ Status ArrayColumnWriter::append_data(const uint8_t**
ptr, size_t num_rows) {
while (remaining > 0) {
// TODO llj: bulk write
size_t num_written = 1;
- auto size_ptr = col_cursor->length();
- RETURN_IF_ERROR(_length_writer->append_data_in_current_page(
- reinterpret_cast<uint8_t*>(&size_ptr), &num_written));
+ ordinal_t next_item_ordinal = _item_writer->get_next_rowid();
+ RETURN_IF_ERROR(_offset_writer->append_data_in_current_page(
+ reinterpret_cast<uint8_t*>(&next_item_ordinal), &num_written));
if (num_written <
1) { // page is full, write first item offset and update current
length page's start ordinal
- RETURN_IF_ERROR(_length_writer->finish_current_page());
- _current_length_page_first_ordinal += _length_sum_in_cur_page;
- _length_sum_in_cur_page = 0;
+ RETURN_IF_ERROR(_offset_writer->finish_current_page());
} else {
// write child item.
if (_item_writer->is_nullable()) {
@@ -568,7 +567,6 @@ Status ArrayColumnWriter::append_data(const uint8_t** ptr,
size_t num_rows) {
RETURN_IF_ERROR(_item_writer->append_data(reinterpret_cast<const
uint8_t**>(&data),
col_cursor->length()));
}
- _length_sum_in_cur_page += col_cursor->length();
}
remaining -= num_written;
col_cursor += num_written;
@@ -582,13 +580,13 @@ Status ArrayColumnWriter::append_data(const uint8_t**
ptr, size_t num_rows) {
}
uint64_t ArrayColumnWriter::estimate_buffer_size() {
- return _length_writer->estimate_buffer_size() +
+ return _offset_writer->estimate_buffer_size() +
(is_nullable() ? _null_writer->estimate_buffer_size() : 0) +
_item_writer->estimate_buffer_size();
}
Status ArrayColumnWriter::finish() {
- RETURN_IF_ERROR(_length_writer->finish());
+ RETURN_IF_ERROR(_offset_writer->finish());
if (is_nullable()) {
RETURN_IF_ERROR(_null_writer->finish());
}
@@ -597,7 +595,7 @@ Status ArrayColumnWriter::finish() {
}
Status ArrayColumnWriter::write_data() {
- RETURN_IF_ERROR(_length_writer->write_data());
+ RETURN_IF_ERROR(_offset_writer->write_data());
if (is_nullable()) {
RETURN_IF_ERROR(_null_writer->write_data());
}
@@ -606,7 +604,7 @@ Status ArrayColumnWriter::write_data() {
}
Status ArrayColumnWriter::write_ordinal_index() {
- RETURN_IF_ERROR(_length_writer->write_ordinal_index());
+ RETURN_IF_ERROR(_offset_writer->write_ordinal_index());
if (is_nullable()) {
RETURN_IF_ERROR(_null_writer->write_ordinal_index());
}
@@ -618,11 +616,11 @@ Status ArrayColumnWriter::write_ordinal_index() {
Status ArrayColumnWriter::append_nulls(size_t num_rows) {
size_t num_lengths = num_rows;
- const ordinal_t zero = 0;
+ const ordinal_t offset = _item_writer->get_next_rowid();
while (num_lengths > 0) {
// TODO llj bulk write
- const auto* zero_ptr = reinterpret_cast<const uint8_t*>(&zero);
- RETURN_IF_ERROR(_length_writer->append_data(&zero_ptr, 1));
+ const auto* offset_ptr = reinterpret_cast<const uint8_t*>(&offset);
+ RETURN_IF_ERROR(_offset_writer->append_data(&offset_ptr, 1));
--num_lengths;
}
return write_null_column(num_rows, true);
diff --git a/be/src/olap/rowset/segment_v2/column_writer.h
b/be/src/olap/rowset/segment_v2/column_writer.h
index 2c3ea79781..68fef318e5 100644
--- a/be/src/olap/rowset/segment_v2/column_writer.h
+++ b/be/src/olap/rowset/segment_v2/column_writer.h
@@ -292,7 +292,7 @@ public:
}
return Status::OK();
}
- ordinal_t get_next_rowid() const override { return
_length_writer->get_next_rowid(); }
+ ordinal_t get_next_rowid() const override { return
_offset_writer->get_next_rowid(); }
private:
Status put_extra_info_in_page(DataPageFooterPB* header) override;
@@ -300,12 +300,10 @@ private:
bool has_empty_items() const { return _item_writer->get_next_rowid() == 0;
}
private:
- std::unique_ptr<ScalarColumnWriter> _length_writer;
+ std::unique_ptr<ScalarColumnWriter> _offset_writer;
std::unique_ptr<ScalarColumnWriter> _null_writer;
std::unique_ptr<ColumnWriter> _item_writer;
ColumnWriterOptions _opts;
- ordinal_t _current_length_page_first_ordinal = 0;
- ordinal_t _length_sum_in_cur_page = 0;
};
} // namespace segment_v2
diff --git a/be/src/olap/rowset/segment_v2/parsed_page.h
b/be/src/olap/rowset/segment_v2/parsed_page.h
index 232c347c38..f3b6ed7b55 100644
--- a/be/src/olap/rowset/segment_v2/parsed_page.h
+++ b/be/src/olap/rowset/segment_v2/parsed_page.h
@@ -68,7 +68,7 @@ struct ParsedPage {
page->page_pointer = page_pointer;
page->page_index = page_index;
- page->first_array_item_ordinal = footer.first_array_item_ordinal();
+ page->next_array_item_ordinal = footer.next_array_item_ordinal();
return Status::OK();
}
@@ -89,8 +89,9 @@ struct ParsedPage {
ordinal_t first_ordinal = 0;
// number of rows including nulls and not-nulls
ordinal_t num_rows = 0;
- // just for array type
- ordinal_t first_array_item_ordinal = 0;
+ // record it to get the last array element's size
+ // should be none zero if setted in page
+ ordinal_t next_array_item_ordinal = 0;
PagePointer page_pointer;
uint32_t page_index = 0;
diff --git a/gensrc/proto/segment_v2.proto b/gensrc/proto/segment_v2.proto
index fe0fcd9d93..7a69286046 100644
--- a/gensrc/proto/segment_v2.proto
+++ b/gensrc/proto/segment_v2.proto
@@ -69,8 +69,8 @@ message DataPageFooterPB {
// required: size of nullmap, 0 if the page doesn't contain NULL
optional uint32 nullmap_size = 3;
// only for array column
- // Save the first array's first item's ordinal.
- optional uint64 first_array_item_ordinal = 4;
+ // Save the offset of next page
+ optional uint64 next_array_item_ordinal = 4;
}
message IndexPageFooterPB {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]