This is an automated email from the ASF dual-hosted git repository.

morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.1 by this push:
     new c3779a08bc9 branch-3.1: [fix](variant) fix variant column reader 
#54830 (#54922)
c3779a08bc9 is described below

commit c3779a08bc941455d6f342a7488052f8aa528302
Author: Sun Chenyang <[email protected]>
AuthorDate: Tue Aug 19 11:17:24 2025 +0800

    branch-3.1: [fix](variant) fix variant column reader #54830 (#54922)
    
    pick from master #54830
---
 .../segment_v2/variant/variant_column_reader.cpp   |  16 +-
 .../variant_column_writer_reader_test.cpp          | 186 ++++++++++++++++++++-
 2 files changed, 198 insertions(+), 4 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp 
b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp
index ee88b5065fd..ec0079edc64 100644
--- a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp
@@ -312,9 +312,19 @@ Status 
VariantColumnReader::new_iterator(ColumnIteratorUPtr* iterator,
                                         
_statistics->sparse_column_non_null_size.size() >=
                                                 
config::variant_max_sparse_column_statistics_size;
 
-    // For compaction operations, read flat leaves, otherwise read 
hierarchical data
-    // Since the variant subcolumns are flattened in 
schema_util::get_compaction_schema
-    if (opt != nullptr && is_compaction_reader_type(opt->io_ctx.reader_type)) {
+    // Read flat leaves only if the tablet schema has extracted columns and the reader is either a
+    // compaction reader (subcolumns are flattened in schema_util::get_compaction_schema) or a
+    // checksum reader (needs flat leaves for correct data); otherwise read hierarchical data.
+    auto need_read_flat_leaves = [](const StorageReadOptions* opts) {
+        return opts != nullptr && opts->tablet_schema != nullptr &&
+               std::ranges::any_of(
+                       opts->tablet_schema->columns(),
+                       [](const auto& column) { return 
column->is_extracted_column(); }) &&
+               (is_compaction_reader_type(opts->io_ctx.reader_type) ||
+                opts->io_ctx.reader_type == ReaderType::READER_CHECKSUM);
+    };
+
+    if (need_read_flat_leaves(opt)) {
         // original path, compaction with wide schema
         return _new_iterator_with_flat_leaves(iterator, *target_col, opt,
                                               exceeded_sparse_column_limit,
diff --git 
a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp 
b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp
index d6841057d0d..8495e7c4e0b 100644
--- a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp
+++ b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp
@@ -532,6 +532,15 @@ TEST_F(VariantColumnWriterReaderTest, 
test_write_data_normal) {
     paths_set_info.sparse_path_set.insert("key9");
     uid_to_paths_set_info[parent_column.unique_id()] = paths_set_info;
     _tablet_schema->set_path_set_info(std::move(uid_to_paths_set_info));
+    TabletColumn subcolumn_compaction;
+    subcolumn_compaction.set_name(parent_column.name_lower_case() + ".key10");
+    subcolumn_compaction.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
+    subcolumn_compaction.set_parent_unique_id(parent_column.unique_id());
+    
subcolumn_compaction.set_path_info(PathInData(parent_column.name_lower_case() + 
".key10"));
+    subcolumn_compaction.set_variant_max_subcolumns_count(
+            parent_column.variant_max_subcolumns_count());
+    subcolumn_compaction.set_is_nullable(true);
+    storage_read_opts.tablet_schema->append_column(subcolumn_compaction);
 
     // 14. check compaction subcolumn reader
     check_leaf_reader();
@@ -2316,4 +2325,179 @@ TEST_F(VariantColumnWriterReaderTest, 
test_nested_iter_nullable) {
     EXPECT_TRUE(stats.bytes_read > 0);
 }
 
-} // namespace doris
+TEST_F(VariantColumnWriterReaderTest, test_read_with_checksum) {
+    auto fill_string_column_with_test_data =
+            [&](auto& column_string, int size,
+                std::unordered_map<int, std::string>* inserted_jsonstr,
+                schema_util::PathToNoneNullValues* path_with_size) {
+                for (int i = 0; i < size; ++i) {
+                    std::string jsonstr;
+                    if (i % 2 == 0) {
+                        jsonstr = R"({"b" : 3})";
+                        (*path_with_size)["b"] += 1;
+                    } else {
+                        jsonstr = R"({"b" : {"c" : 5}})";
+                        (*path_with_size)["b.c"] += 1;
+                    }
+                    inserted_jsonstr->emplace(i, jsonstr);
+                    column_string->insert_data(jsonstr.c_str(), 
jsonstr.size());
+                }
+            };
+
+    auto fill_object_column_with_test_data =
+            [&](auto& column_object, int size,
+                std::unordered_map<int, std::string>* inserted_jsonstr,
+                schema_util::PathToNoneNullValues* path_with_size) {
+                auto type_string = 
std::make_shared<vectorized::DataTypeString>();
+                auto column = type_string->create_column();
+                auto* column_string = assert_cast<ColumnString*>(column.get());
+                fill_string_column_with_test_data(column_string, size, 
inserted_jsonstr,
+                                                  path_with_size);
+                vectorized::ParseConfig config;
+                config.enable_flatten_nested = false;
+                parse_json_to_variant(*column_object, *column_string, config);
+            };
+
+    // 1. create tablet_schema
+    TabletSchemaPB schema_pb;
+    schema_pb.set_keys_type(KeysType::DUP_KEYS);
+    SchemaUtils::construct_column(schema_pb.add_column(), 1, "VARIANT", "V1");
+    _tablet_schema = std::make_shared<TabletSchema>();
+    _tablet_schema->init_from_pb(schema_pb);
+
+    // 2. create tablet
+    TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema));
+    tablet_meta->_tablet_id = 10000;
+    _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta, 
_data_dir.get());
+
+    EXPECT_TRUE(_tablet->init().ok());
+    
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
+    
EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());
+
+    // 3. create file_writer
+    io::FileWriterPtr file_writer;
+    auto file_path = local_segment_path(_tablet->tablet_path(), "0", 0);
+    auto st = io::global_local_filesystem()->create_file(file_path, 
&file_writer);
+    EXPECT_TRUE(st.ok()) << st.msg();
+
+    // 4. create column_writer
+    SegmentFooterPB footer;
+    ColumnWriterOptions opts;
+    opts.meta = footer.add_columns();
+    opts.compression_type = CompressionTypePB::LZ4;
+    opts.file_writer = file_writer.get();
+    opts.footer = &footer;
+    RowsetWriterContext rowset_ctx;
+    rowset_ctx.write_type = DataWriteType::TYPE_DIRECT;
+    opts.rowset_ctx = &rowset_ctx;
+    opts.rowset_ctx->tablet_schema = _tablet_schema;
+    TabletColumn column = _tablet_schema->column(0);
+    _init_column_meta(opts.meta, 0, column, CompressionTypePB::LZ4);
+
+    std::unique_ptr<ColumnWriter> writer;
+    EXPECT_TRUE(ColumnWriter::create(opts, &column, file_writer.get(), 
&writer).ok());
+    EXPECT_TRUE(writer->init().ok());
+    EXPECT_TRUE(assert_cast<VariantColumnWriter*>(writer.get()) != nullptr);
+
+    // 5. write data
+    auto olap_data_convertor = 
std::make_unique<vectorized::OlapBlockDataConvertor>();
+    auto block = _tablet_schema->create_block();
+    auto column_object = 
(*std::move(block.get_by_position(0).column)).mutate();
+    schema_util::PathToNoneNullValues path_with_size;
+    std::unordered_map<int, std::string> inserted_jsonstr;
+    fill_object_column_with_test_data(column_object, 1000, &inserted_jsonstr, 
&path_with_size);
+
+    olap_data_convertor->add_column_data_convertor(column);
+    olap_data_convertor->set_source_content(&block, 0, 1000);
+    auto [result, accessor] = olap_data_convertor->convert_column_data(0);
+    EXPECT_TRUE(result.ok());
+    EXPECT_TRUE(accessor != nullptr);
+    EXPECT_TRUE(writer->append(accessor->get_nullmap(), accessor->get_data(), 
1000).ok());
+    st = writer->finish();
+    EXPECT_TRUE(st.ok()) << st.msg();
+    st = writer->write_data();
+    EXPECT_TRUE(st.ok()) << st.msg();
+    st = writer->write_ordinal_index();
+    EXPECT_TRUE(st.ok()) << st.msg();
+    st = writer->write_zone_map();
+    EXPECT_TRUE(st.ok()) << st.msg();
+    EXPECT_TRUE(file_writer->close().ok());
+    footer.set_num_rows(1000);
+
+    // 6. check footer
+    EXPECT_EQ(footer.columns_size(), 4);
+    auto column_meta = footer.columns(0);
+    EXPECT_EQ(column_meta.type(), (int)FieldType::OLAP_FIELD_TYPE_VARIANT);
+
+    for (int i = 1; i < footer.columns_size() - 1; ++i) {
+        auto column_met = footer.columns(i);
+        check_column_meta(column_met, path_with_size);
+    }
+    check_sparse_column_meta(footer.columns(footer.columns_size() - 1), 
path_with_size);
+
+    // 7. check variant reader
+    io::FileReaderSPtr file_reader;
+    st = io::global_local_filesystem()->open_file(file_path, &file_reader);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    ColumnReaderOptions read_opts;
+    read_opts.tablet_schema = _tablet_schema;
+    std::shared_ptr<ColumnReader> column_reader;
+    st = ColumnReader::create(read_opts, footer, 0, 1000, file_reader, 
&column_reader);
+    EXPECT_TRUE(st.ok()) << st.msg();
+
+    auto* variant_column_reader = 
assert_cast<VariantColumnReader*>(column_reader.get());
+    EXPECT_TRUE(variant_column_reader != nullptr);
+
+    TabletColumn parent_column = _tablet_schema->column(0);
+    StorageReadOptions storage_read_opts;
+
+    storage_read_opts.tablet_schema = _tablet_schema;
+
+    TabletColumn subcolumn;
+    subcolumn.set_name(parent_column.name_lower_case() + ".b");
+    subcolumn.set_type((FieldType)(int)footer.columns(1).type());
+    subcolumn.set_parent_unique_id(parent_column.unique_id());
+    subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + 
".b"));
+    
subcolumn.set_variant_max_subcolumns_count(parent_column.variant_max_subcolumns_count());
+    subcolumn.set_is_nullable(true);
+    _tablet_schema->append_column(subcolumn);
+    storage_read_opts.io_ctx.reader_type = ReaderType::READER_QUERY;
+    ColumnIteratorUPtr hierarchical_it;
+
+    MockColumnReaderCache column_reader_cache(footer, file_reader, 
_tablet_schema);
+    st = variant_column_reader->new_iterator(&hierarchical_it, &subcolumn, 
&storage_read_opts,
+                                             &column_reader_cache);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    EXPECT_TRUE(dynamic_cast<HierarchicalDataIterator*>(hierarchical_it.get()) 
!= nullptr);
+
+    storage_read_opts.io_ctx.reader_type = ReaderType::READER_CHECKSUM;
+    ColumnIteratorUPtr it;
+    st = variant_column_reader->new_iterator(&it, &subcolumn, 
&storage_read_opts,
+                                             &column_reader_cache);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    EXPECT_TRUE(dynamic_cast<FileColumnIterator*>(it.get()) != nullptr);
+    ColumnIteratorOptions column_iter_opts;
+    OlapReaderStatistics stats;
+    column_iter_opts.stats = &stats;
+    column_iter_opts.file_reader = file_reader.get();
+    st = it->init(column_iter_opts);
+    EXPECT_TRUE(st.ok()) << st.msg();
+
+    auto column_type = DataTypeFactory::instance().create_data_type(subcolumn, 
true);
+    auto read_column = column_type->create_column();
+    size_t nrows = 1000;
+    st = it->seek_to_ordinal(0);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    st = it->next_batch(&nrows, read_column);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    EXPECT_TRUE(stats.bytes_read > 0);
+
+    for (int row = 0; row < 1000; ++row) {
+        const std::string& value = column_type->to_string(*read_column, row);
+        if (row % 2 == 0) {
+            EXPECT_EQ(value, "3");
+        }
+    }
+}
+
+} // namespace doris
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to