This is an automated email from the ASF dual-hosted git repository.
lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push:
new 983cdc7b0d [feature-wip](array-type) Support loading data in
vectorized format (#10065)
983cdc7b0d is described below
commit 983cdc7b0dec2a7a2838ee364633725cd2117a31
Author: Adonis Ling <[email protected]>
AuthorDate: Wed Jun 15 14:40:28 2022 +0800
[feature-wip](array-type) Support loading data in vectorized format (#10065)
---
be/src/olap/rowset/segment_v2/column_reader.cpp | 45 +++++++++++++++++++++++--
be/src/olap/rowset/segment_v2/column_reader.h | 2 ++
be/test/runtime/array_test.cpp | 42 ++++++++++++++++++++++-
3 files changed, 86 insertions(+), 3 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 88eb1bfc3e..d68ffc66c0 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -17,7 +17,6 @@
#include "olap/rowset/segment_v2/column_reader.h"
-#include "common/logging.h"
#include "gutil/strings/substitute.h" // for Substitute
#include "olap/column_block.h" // for ColumnBlockView
#include "olap/rowset/segment_v2/binary_dict_page.h" // for
BinaryDictPageDecoder
@@ -28,8 +27,9 @@
#include "olap/rowset/segment_v2/page_pointer.h" // for PagePointer
#include "olap/types.h" // for TypeInfo
#include "util/block_compression.h"
-#include "util/coding.h" // for get_varint32
#include "util/rle_encoding.h" // for RleDecoder
+#include "vec/columns/column.h"
+#include "vec/columns/column_array.h"
#include "vec/core/types.h"
#include "vec/runtime/vdatetime_value.h" //for VecDateTime
@@ -462,6 +462,47 @@ Status ArrayFileColumnIterator::next_batch(size_t* n, ColumnBlockView* dst, bool
return Status::OK();
}
+Status ArrayFileColumnIterator::next_batch(size_t* n,
vectorized::MutableColumnPtr& dst,
+ bool* has_null) { // Vectorized read of one batch of arrays into dst. NOTE(review): has_null is never written in this body.
+ const auto* column_array =
vectorized::check_and_get_column<vectorized::ColumnArray>(
+ dst->is_nullable() ?
static_cast<vectorized::ColumnNullable&>(*dst).get_nested_column()
+ : *dst); // a nullable dst is unwrapped to its nested column first. NOTE(review): column_array is dereferenced below without a null check.
+ // Step 1: read the per-array lengths onto the end of the offsets column; `start` is the offsets size before this batch.
+ bool offsets_has_null = false;
+ auto column_offsets_ptr =
column_array->get_offsets_column().assume_mutable();
+ ssize_t start = column_offsets_ptr->size();
+ RETURN_IF_ERROR(_length_iterator->next_batch(n, column_offsets_ptr,
&offsets_has_null));
+ if (*n == 0) { // *n is read back after the length read; zero means there is nothing more to do for this call
+ return Status::OK();
+ }
+ auto& column_offsets =
+
static_cast<vectorized::ColumnArray::ColumnOffsets&>(*column_offsets_ptr);
+ auto& offsets_data = column_offsets.get_data();
+ for (ssize_t i = start; i < offsets_data.size(); ++i) {
+ offsets_data[i] += offsets_data[i - 1]; // -1 is ok: turns each new length into a running total; index is -1 when start == 0 (presumably the buffer is readable one slot before its start — TODO confirm)
+ }
+ // Step 2: the number of items in this batch is the last running total minus the total just before `start`.
+ auto column_items_ptr = column_array->get_data().assume_mutable();
+ size_t num_items = offsets_data.back() - offsets_data[start - 1];
+ if (num_items > 0) {
+ size_t num_read = num_items;
+ bool items_has_null = false;
+ RETURN_IF_ERROR(_item_iterator->next_batch(&num_read,
column_items_ptr, &items_has_null));
+ DCHECK(num_read == num_items);
+ }
+ // Step 3: for a nullable dst, also read *n null signs into its null-map column.
+ if (dst->is_nullable()) {
+ auto null_map_ptr =
+
static_cast<vectorized::ColumnNullable&>(*dst).get_null_map_column_ptr();
+ size_t num_read = *n;
+ bool null_signs_has_null = false;
+ RETURN_IF_ERROR(_null_iterator->next_batch(&num_read, null_map_ptr,
&null_signs_has_null));
+ DCHECK(num_read == *n);
+ }
+
+ return Status::OK();
+}
+
////////////////////////////////////////////////////////////////////////////////
FileColumnIterator::FileColumnIterator(ColumnReader* reader) : _reader(reader)
{}
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h
index de51b01824..47250a96cb 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -335,6 +335,8 @@ public:
Status next_batch(size_t* n, ColumnBlockView* dst, bool* has_null)
override;
+ Status next_batch(size_t* n, vectorized::MutableColumnPtr& dst, bool*
has_null) override;
+
Status seek_to_first() override {
RETURN_IF_ERROR(_length_iterator->seek_to_first());
RETURN_IF_ERROR(_item_iterator->seek_to_first()); // lazy???
diff --git a/be/test/runtime/array_test.cpp b/be/test/runtime/array_test.cpp
index 162157852d..9ef5277352 100644
--- a/be/test/runtime/array_test.cpp
+++ b/be/test/runtime/array_test.cpp
@@ -45,7 +45,10 @@
#include "testutil/desc_tbl_builder.h"
#include "util/file_utils.h"
#include "util/uid_util.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_array.h"
#include "vec/core/block.h"
+#include "vec/data_types/data_type_factory.hpp"
namespace doris {
@@ -289,10 +292,12 @@ private:
auto col = block.column_block(0);
int index = 0;
size_t rows_read = 1024;
+ size_t num_rows = 0;
do {
ColumnBlockView dst(&col);
st = iter->next_batch(&rows_read, &dst);
EXPECT_TRUE(st.ok());
+ num_rows += rows_read;
for (int i = 0; i < rows_read; ++i) {
validate(field, arrays[index++],
reinterpret_cast<const
CollectionValue*>(col.cell_ptr(i)));
@@ -301,9 +306,44 @@ private:
} while (rows_read >= 1024);
auto type_info = get_type_info(column_pb);
auto tuple_desc = get_tuple_descriptor(_object_pool,
type_info.get());
- block.set_selected_size(rows_read);
+ block.set_selected_size(num_rows);
test_convert_to_vec_block(block, tuple_desc, field, arrays);
}
+ {
+ auto type_info = get_type_info(column_pb);
+ auto tuple_desc = get_tuple_descriptor(_object_pool,
type_info.get());
+
+ auto reader = create_column_reader(path, meta, arrays.size());
+ EXPECT_NE(reader, nullptr);
+ auto rblock = create_readable_block(path);
+ EXPECT_NE(rblock, nullptr);
+ OlapReaderStatistics stats;
+ std::unique_ptr<segment_v2::ColumnIterator> iter(
+ new_iterator(rblock.get(), &stats, reader.get()));
+ EXPECT_NE(iter, nullptr);
+ auto st = iter->seek_to_first();
+ EXPECT_TRUE(st.ok()) << st.to_string();
+
+ auto data_type =
+
vectorized::DataTypeFactory::instance().create_data_type(tablet_column);
+ auto column_ptr = data_type->create_column();
+ size_t rows_read = 1024;
+ column_ptr->reserve(rows_read);
+ do {
+ bool has_null = false;
+ st = iter->next_batch(&rows_read, column_ptr, &has_null);
+ EXPECT_TRUE(st.ok());
+ vectorized::Block vblock;
+ vblock.insert({const_cast<const
vectorized::IColumn&>(*column_ptr).get_ptr(),
+ data_type, ""});
+ for (int i = 0; i < arrays.size(); ++i) {
+ auto tuple = vblock.deep_copy_tuple(*tuple_desc,
_mem_pool.get(), i, 0, false);
+ auto actual =
+
tuple->get_collection_slot(tuple_desc->slots().front()->tuple_offset());
+ validate(field, arrays[i], actual);
+ }
+ } while (rows_read >= 1024);
+ }
}
template <segment_v2::EncodingTypePB array_encoding,
segment_v2::EncodingTypePB item_encoding>
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org