This is an automated email from the ASF dual-hosted git repository.
morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new c3779a08bc9 branch-3.1: [fix](variant) fix variant column reader #54830 (#54922)
c3779a08bc9 is described below
commit c3779a08bc941455d6f342a7488052f8aa528302
Author: Sun Chenyang <[email protected]>
AuthorDate: Tue Aug 19 11:17:24 2025 +0800
branch-3.1: [fix](variant) fix variant column reader #54830 (#54922)
pick from master #54830
---
.../segment_v2/variant/variant_column_reader.cpp | 16 +-
.../variant_column_writer_reader_test.cpp | 186 ++++++++++++++++++++-
2 files changed, 198 insertions(+), 4 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp
index ee88b5065fd..ec0079edc64 100644
--- a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp
@@ -312,9 +312,19 @@ Status VariantColumnReader::new_iterator(ColumnIteratorUPtr* iterator,
_statistics->sparse_column_non_null_size.size() >=
config::variant_max_sparse_column_statistics_size;
- // For compaction operations, read flat leaves, otherwise read hierarchical data
- // Since the variant subcolumns are flattened in schema_util::get_compaction_schema
- if (opt != nullptr && is_compaction_reader_type(opt->io_ctx.reader_type)) {
+ // If the variant column has extracted columns and is a compaction reader, then read flat leaves
+ // Otherwise read hierarchical data, since the variant subcolumns are flattened in schema_util::get_compaction_schema
+ // For checksum reader, we need to read flat leaves to get the correct data if has extracted columns
+ auto need_read_flat_leaves = [](const StorageReadOptions* opts) {
+ return opts != nullptr && opts->tablet_schema != nullptr &&
+ std::ranges::any_of(
+ opts->tablet_schema->columns(),
+ [](const auto& column) { return
column->is_extracted_column(); }) &&
+ (is_compaction_reader_type(opts->io_ctx.reader_type) ||
+ opts->io_ctx.reader_type == ReaderType::READER_CHECKSUM);
+ };
+
+ if (need_read_flat_leaves(opt)) {
// original path, compaction with wide schema
return _new_iterator_with_flat_leaves(iterator, *target_col, opt,
exceeded_sparse_column_limit,
diff --git a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp
index d6841057d0d..8495e7c4e0b 100644
--- a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp
+++ b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp
@@ -532,6 +532,15 @@ TEST_F(VariantColumnWriterReaderTest, test_write_data_normal) {
paths_set_info.sparse_path_set.insert("key9");
uid_to_paths_set_info[parent_column.unique_id()] = paths_set_info;
_tablet_schema->set_path_set_info(std::move(uid_to_paths_set_info));
+ TabletColumn subcolumn_compaction;
+ subcolumn_compaction.set_name(parent_column.name_lower_case() + ".key10");
+ subcolumn_compaction.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
+ subcolumn_compaction.set_parent_unique_id(parent_column.unique_id());
+
subcolumn_compaction.set_path_info(PathInData(parent_column.name_lower_case() +
".key10"));
+ subcolumn_compaction.set_variant_max_subcolumns_count(
+ parent_column.variant_max_subcolumns_count());
+ subcolumn_compaction.set_is_nullable(true);
+ storage_read_opts.tablet_schema->append_column(subcolumn_compaction);
// 14. check compaction subcolumn reader
check_leaf_reader();
@@ -2316,4 +2325,179 @@ TEST_F(VariantColumnWriterReaderTest, test_nested_iter_nullable) {
EXPECT_TRUE(stats.bytes_read > 0);
}
-} // namespace doris
+TEST_F(VariantColumnWriterReaderTest, test_read_with_checksum) {
+ auto fill_string_column_with_test_data =
+ [&](auto& column_string, int size,
+ std::unordered_map<int, std::string>* inserted_jsonstr,
+ schema_util::PathToNoneNullValues* path_with_size) {
+ for (int i = 0; i < size; ++i) {
+ std::string jsonstr;
+ if (i % 2 == 0) {
+ jsonstr = R"({"b" : 3})";
+ (*path_with_size)["b"] += 1;
+ } else {
+ jsonstr = R"({"b" : {"c" : 5}})";
+ (*path_with_size)["b.c"] += 1;
+ }
+ inserted_jsonstr->emplace(i, jsonstr);
+ column_string->insert_data(jsonstr.c_str(),
jsonstr.size());
+ }
+ };
+
+ auto fill_object_column_with_test_data =
+ [&](auto& column_object, int size,
+ std::unordered_map<int, std::string>* inserted_jsonstr,
+ schema_util::PathToNoneNullValues* path_with_size) {
+ auto type_string =
std::make_shared<vectorized::DataTypeString>();
+ auto column = type_string->create_column();
+ auto* column_string = assert_cast<ColumnString*>(column.get());
+ fill_string_column_with_test_data(column_string, size,
inserted_jsonstr,
+ path_with_size);
+ vectorized::ParseConfig config;
+ config.enable_flatten_nested = false;
+ parse_json_to_variant(*column_object, *column_string, config);
+ };
+
+ // 1. create tablet_schema
+ TabletSchemaPB schema_pb;
+ schema_pb.set_keys_type(KeysType::DUP_KEYS);
+ SchemaUtils::construct_column(schema_pb.add_column(), 1, "VARIANT", "V1");
+ _tablet_schema = std::make_shared<TabletSchema>();
+ _tablet_schema->init_from_pb(schema_pb);
+
+ // 2. create tablet
+ TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema));
+ tablet_meta->_tablet_id = 10000;
+ _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta,
_data_dir.get());
+
+ EXPECT_TRUE(_tablet->init().ok());
+
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
+
EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());
+
+ // 3. create file_writer
+ io::FileWriterPtr file_writer;
+ auto file_path = local_segment_path(_tablet->tablet_path(), "0", 0);
+ auto st = io::global_local_filesystem()->create_file(file_path,
&file_writer);
+ EXPECT_TRUE(st.ok()) << st.msg();
+
+ // 4. create column_writer
+ SegmentFooterPB footer;
+ ColumnWriterOptions opts;
+ opts.meta = footer.add_columns();
+ opts.compression_type = CompressionTypePB::LZ4;
+ opts.file_writer = file_writer.get();
+ opts.footer = &footer;
+ RowsetWriterContext rowset_ctx;
+ rowset_ctx.write_type = DataWriteType::TYPE_DIRECT;
+ opts.rowset_ctx = &rowset_ctx;
+ opts.rowset_ctx->tablet_schema = _tablet_schema;
+ TabletColumn column = _tablet_schema->column(0);
+ _init_column_meta(opts.meta, 0, column, CompressionTypePB::LZ4);
+
+ std::unique_ptr<ColumnWriter> writer;
+ EXPECT_TRUE(ColumnWriter::create(opts, &column, file_writer.get(),
&writer).ok());
+ EXPECT_TRUE(writer->init().ok());
+ EXPECT_TRUE(assert_cast<VariantColumnWriter*>(writer.get()) != nullptr);
+
+ // 5. write data
+ auto olap_data_convertor =
std::make_unique<vectorized::OlapBlockDataConvertor>();
+ auto block = _tablet_schema->create_block();
+ auto column_object =
(*std::move(block.get_by_position(0).column)).mutate();
+ schema_util::PathToNoneNullValues path_with_size;
+ std::unordered_map<int, std::string> inserted_jsonstr;
+ fill_object_column_with_test_data(column_object, 1000, &inserted_jsonstr,
&path_with_size);
+
+ olap_data_convertor->add_column_data_convertor(column);
+ olap_data_convertor->set_source_content(&block, 0, 1000);
+ auto [result, accessor] = olap_data_convertor->convert_column_data(0);
+ EXPECT_TRUE(result.ok());
+ EXPECT_TRUE(accessor != nullptr);
+ EXPECT_TRUE(writer->append(accessor->get_nullmap(), accessor->get_data(),
1000).ok());
+ st = writer->finish();
+ EXPECT_TRUE(st.ok()) << st.msg();
+ st = writer->write_data();
+ EXPECT_TRUE(st.ok()) << st.msg();
+ st = writer->write_ordinal_index();
+ EXPECT_TRUE(st.ok()) << st.msg();
+ st = writer->write_zone_map();
+ EXPECT_TRUE(st.ok()) << st.msg();
+ EXPECT_TRUE(file_writer->close().ok());
+ footer.set_num_rows(1000);
+
+ // 6. check footer
+ EXPECT_EQ(footer.columns_size(), 4);
+ auto column_meta = footer.columns(0);
+ EXPECT_EQ(column_meta.type(), (int)FieldType::OLAP_FIELD_TYPE_VARIANT);
+
+ for (int i = 1; i < footer.columns_size() - 1; ++i) {
+ auto column_met = footer.columns(i);
+ check_column_meta(column_met, path_with_size);
+ }
+ check_sparse_column_meta(footer.columns(footer.columns_size() - 1),
path_with_size);
+
+ // 7. check variant reader
+ io::FileReaderSPtr file_reader;
+ st = io::global_local_filesystem()->open_file(file_path, &file_reader);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ ColumnReaderOptions read_opts;
+ read_opts.tablet_schema = _tablet_schema;
+ std::shared_ptr<ColumnReader> column_reader;
+ st = ColumnReader::create(read_opts, footer, 0, 1000, file_reader,
&column_reader);
+ EXPECT_TRUE(st.ok()) << st.msg();
+
+ auto* variant_column_reader =
assert_cast<VariantColumnReader*>(column_reader.get());
+ EXPECT_TRUE(variant_column_reader != nullptr);
+
+ TabletColumn parent_column = _tablet_schema->column(0);
+ StorageReadOptions storage_read_opts;
+
+ storage_read_opts.tablet_schema = _tablet_schema;
+
+ TabletColumn subcolumn;
+ subcolumn.set_name(parent_column.name_lower_case() + ".b");
+ subcolumn.set_type((FieldType)(int)footer.columns(1).type());
+ subcolumn.set_parent_unique_id(parent_column.unique_id());
+ subcolumn.set_path_info(PathInData(parent_column.name_lower_case() +
".b"));
+
subcolumn.set_variant_max_subcolumns_count(parent_column.variant_max_subcolumns_count());
+ subcolumn.set_is_nullable(true);
+ _tablet_schema->append_column(subcolumn);
+ storage_read_opts.io_ctx.reader_type = ReaderType::READER_QUERY;
+ ColumnIteratorUPtr hierarchical_it;
+
+ MockColumnReaderCache column_reader_cache(footer, file_reader,
_tablet_schema);
+ st = variant_column_reader->new_iterator(&hierarchical_it, &subcolumn,
&storage_read_opts,
+ &column_reader_cache);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ EXPECT_TRUE(dynamic_cast<HierarchicalDataIterator*>(hierarchical_it.get())
!= nullptr);
+
+ storage_read_opts.io_ctx.reader_type = ReaderType::READER_CHECKSUM;
+ ColumnIteratorUPtr it;
+ st = variant_column_reader->new_iterator(&it, &subcolumn,
&storage_read_opts,
+ &column_reader_cache);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ EXPECT_TRUE(dynamic_cast<FileColumnIterator*>(it.get()) != nullptr);
+ ColumnIteratorOptions column_iter_opts;
+ OlapReaderStatistics stats;
+ column_iter_opts.stats = &stats;
+ column_iter_opts.file_reader = file_reader.get();
+ st = it->init(column_iter_opts);
+ EXPECT_TRUE(st.ok()) << st.msg();
+
+ auto column_type = DataTypeFactory::instance().create_data_type(subcolumn,
true);
+ auto read_column = column_type->create_column();
+ size_t nrows = 1000;
+ st = it->seek_to_ordinal(0);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ st = it->next_batch(&nrows, read_column);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ EXPECT_TRUE(stats.bytes_read > 0);
+
+ for (int row = 0; row < 1000; ++row) {
+ const std::string& value = column_type->to_string(*read_column, row);
+ if (row % 2 == 0) {
+ EXPECT_EQ(value, "3");
+ }
+ }
+}
+
+} // namespace doris
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]