Re: [PR] Variant master index load [doris]

via GitHub Wed, 01 Nov 2023 01:46:15 -0700


github-actions[bot] commented on code in PR #26220:
URL: https://github.com/apache/doris/pull/26220#discussion_r1378530496



##########
be/src/olap/block_column_predicate.h:
##########
@@ -72,6 +72,11 @@ class BlockColumnPredicate {
     virtual void evaluate_vec(vectorized::MutableColumns& block, uint16_t 
size, bool* flags) const {
     }
 
+    virtual bool can_do_apply_safely(PrimitiveType input_type, bool is_null) 
const {

Review Comment:
   warning: function 'can_do_apply_safely' should be marked [[nodiscard]] [modernize-use-nodiscard]
   
   ```suggestion
       [[nodiscard]] virtual bool can_do_apply_safely(PrimitiveType input_type, bool is_null) const {
   ```
   



##########
be/src/exprs/json_functions.cpp:
##########
@@ -315,4 +317,36 @@ Status 
JsonFunctions::extract_from_object(simdjson::ondemand::object& obj,
     return Status::OK();
 }
 
+std::string JsonFunctions::print_json_value(const rapidjson::Value& value) {
+    rapidjson::StringBuffer buffer;

Review Comment:
   warning: variable 'buffer' is not initialized [cppcoreguidelines-init-variables]
   
   ```suggestion
       rapidjson::StringBuffer buffer = 0;
   ```
   



##########
be/src/olap/block_column_predicate.h:
##########
@@ -125,6 +130,10 @@
         return _predicate->can_do_bloom_filter(ngram);
     }
 
+    bool can_do_apply_safely(PrimitiveType input_type, bool is_null) const 
override {

Review Comment:
   warning: function 'can_do_apply_safely' should be marked [[nodiscard]] [modernize-use-nodiscard]
   
   ```suggestion
       [[nodiscard]] bool can_do_apply_safely(PrimitiveType input_type, bool is_null) const override {
   ```
   



##########
be/src/olap/rowset/segment_v2/column_reader.h:
##########
@@ -592,6 +594,38 @@
     int32_t _segment_id = 0;
 };
 
+class VariantRootColumnIterator : public ColumnIterator {
+public:
+    VariantRootColumnIterator() = delete;
+
+    explicit VariantRootColumnIterator(FileColumnIterator* iter) { 
_inner_iter.reset(iter); }
+
+    ~VariantRootColumnIterator() override = default;
+
+    Status init(const ColumnIteratorOptions& opts) override { return 
_inner_iter->init(opts); }
+
+    Status seek_to_first() override { return _inner_iter->seek_to_first(); }
+
+    Status seek_to_ordinal(ordinal_t ord_idx) override {
+        return _inner_iter->seek_to_ordinal(ord_idx);
+    }
+
+    Status next_batch(size_t* n, vectorized::MutableColumnPtr& dst) {
+        bool has_null;
+        return next_batch(n, dst, &has_null);
+    }
+
+    Status next_batch(size_t* n, vectorized::MutableColumnPtr& dst, bool* 
has_null) override;
+
+    Status read_by_rowids(const rowid_t* rowids, const size_t count,
+                          vectorized::MutableColumnPtr& dst) override;
+
+    ordinal_t get_current_ordinal() const override { return 
_inner_iter->get_current_ordinal(); }

Review Comment:
   warning: function 'get_current_ordinal' should be marked [[nodiscard]] [modernize-use-nodiscard]
   
   ```suggestion
       [[nodiscard]] ordinal_t get_current_ordinal() const override { return _inner_iter->get_current_ordinal(); }
   ```
   



##########
be/src/olap/field.h:
##########
@@ -57,6 +58,7 @@ class Field {
     size_t index_size() const { return _index_size; }
     int32_t unique_id() const { return _unique_id; }
     const std::string& name() const { return _name; }
+    const vectorized::PathInData& path() const { return _path; }

Review Comment:
   warning: function 'path' should be marked [[nodiscard]] [modernize-use-nodiscard]
   
   ```suggestion
       [[nodiscard]] const vectorized::PathInData& path() const { return _path; }
   ```
   



##########
be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp:
##########
@@ -0,0 +1,217 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/hierarchical_data_reader.h"
+
+#include "common/status.h"
+#include "io/io_common.h"
+#include "olap/rowset/segment_v2/column_reader.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_object.h"
+#include "vec/common/assert_cast.h"
+#include "vec/common/schema_util.h"
+#include "vec/data_types/data_type.h"
+#include "vec/data_types/data_type_nullable.h"
+#include "vec/json/path_in_data.h"
+
+namespace doris {
+namespace segment_v2 {
+
+Status HierarchicalDataReader::create(std::unique_ptr<ColumnIterator>* reader,
+                                      const SubcolumnColumnReaders::Node* node,
+                                      const SubcolumnColumnReaders::Node* root,
+                                      bool output_as_raw_json) {
+    // None leave node need merge with root
+    auto* stream_iter = new HierarchicalDataReader(node->path, 
output_as_raw_json);
+    std::vector<const SubcolumnColumnReaders::Node*> leaves;

Review Comment:
   warning: variable 'leaves' is not initialized [cppcoreguidelines-init-variables]
   
   ```suggestion
       std::vector<const SubcolumnColumnReaders::Node*> leaves = 0;
   ```
   



##########
be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp:
##########
@@ -0,0 +1,217 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/hierarchical_data_reader.h"
+
+#include "common/status.h"
+#include "io/io_common.h"
+#include "olap/rowset/segment_v2/column_reader.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_object.h"
+#include "vec/common/assert_cast.h"
+#include "vec/common/schema_util.h"
+#include "vec/data_types/data_type.h"
+#include "vec/data_types/data_type_nullable.h"
+#include "vec/json/path_in_data.h"
+
+namespace doris {
+namespace segment_v2 {
+
+Status HierarchicalDataReader::create(std::unique_ptr<ColumnIterator>* reader,
+                                      const SubcolumnColumnReaders::Node* node,
+                                      const SubcolumnColumnReaders::Node* root,
+                                      bool output_as_raw_json) {
+    // None leave node need merge with root
+    auto* stream_iter = new HierarchicalDataReader(node->path, 
output_as_raw_json);
+    std::vector<const SubcolumnColumnReaders::Node*> leaves;
+    vectorized::PathsInData leaves_paths;
+    SubcolumnColumnReaders::get_leaves_of_node(node, leaves, leaves_paths);
+    for (size_t i = 0; i < leaves_paths.size(); ++i) {
+        if (leaves_paths[i] == root->path) {
+            // use set_root to share instead
+            continue;
+        }
+        stream_iter->add_stream(leaves[i]);
+    }
+    // Make sure the root node is in strem_cache, so that child can merge data 
with root
+    // Eg. {"a" : "b" : {"c" : 1}}, access the `a.b` path and merge with root 
path so that
+    // we could make sure the data could be fully merged, since some column 
may not be extracted but remains in root
+    // like {"a" : "b" : {"e" : 1.1}} in jsonb format
+    ColumnIterator* it;
+    RETURN_IF_ERROR(root->data.reader->new_iterator(&it));
+    stream_iter->set_root(std::make_unique<StreamReader>(
+            root->data.file_column_type->create_column(), 
std::unique_ptr<ColumnIterator>(it),
+            root->data.file_column_type));
+    reader->reset(stream_iter);
+    return Status::OK();
+}
+
+Status HierarchicalDataReader::init(const ColumnIteratorOptions& opts) {
+    RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) {
+        RETURN_IF_ERROR(node.data.iterator->init(opts));
+        node.data.inited = true;
+        return Status::OK();
+    }));
+    if (_root_reader && !_root_reader->inited) {
+        RETURN_IF_ERROR(_root_reader->iterator->init(opts));
+        _root_reader->inited = true;
+    }
+    return Status::OK();
+}
+
+Status HierarchicalDataReader::seek_to_first() {
+    LOG(FATAL) << "Not implemented";
+}
+
+Status HierarchicalDataReader::seek_to_ordinal(ordinal_t ord) {
+    RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) {
+        RETURN_IF_ERROR(node.data.iterator->seek_to_ordinal(ord));
+        return Status::OK();
+    }));
+    if (_root_reader) {
+        DCHECK(_root_reader->inited);
+        RETURN_IF_ERROR(_root_reader->iterator->seek_to_ordinal(ord));
+    }
+    return Status::OK();
+}
+
+Status HierarchicalDataReader::next_batch(size_t* n, 
vectorized::MutableColumnPtr& dst,
+                                          bool* has_null) {
+    return process_read(
+            [&](StreamReader& reader, const vectorized::PathInData& path,
+                const vectorized::DataTypePtr& type) {
+                CHECK(reader.inited);
+                RETURN_IF_ERROR(reader.iterator->next_batch(n, reader.column, 
has_null));
+                VLOG_DEBUG << fmt::format("{} next_batch {} rows, type={}", 
path.get_path(), *n,
+                                          type->get_name());
+                reader.rows_read += *n;
+                return Status::OK();
+            },
+            dst, *n);
+}
+
+Status HierarchicalDataReader::read_by_rowids(const rowid_t* rowids, const 
size_t count,
+                                              vectorized::MutableColumnPtr& 
dst) {
+    return process_read(
+            [&](StreamReader& reader, const vectorized::PathInData& path,
+                const vectorized::DataTypePtr& type) {
+                CHECK(reader.inited);
+                RETURN_IF_ERROR(reader.iterator->read_by_rowids(rowids, count, 
reader.column));
+                VLOG_DEBUG << fmt::format("{} read_by_rowids {} rows, 
type={}", path.get_path(),
+                                          count, type->get_name());
+                reader.rows_read += count;
+                return Status::OK();
+            },
+            dst, count);
+}
+
+Status HierarchicalDataReader::add_stream(const SubcolumnColumnReaders::Node* 
node) {
+    if (_substream_reader.find_leaf(node->path)) {
+        VLOG_DEBUG << "Already exist sub column " << node->path.get_path();
+        return Status::OK();
+    }
+    CHECK(node);
+    ColumnIterator* it;
+    RETURN_IF_ERROR(node->data.reader->new_iterator(&it));
+    std::unique_ptr<ColumnIterator> it_ptr;
+    it_ptr.reset(it);
+    StreamReader reader(node->data.file_column_type->create_column(), 
std::move(it_ptr),
+                        node->data.file_column_type);
+    bool added = _substream_reader.add(node->path, std::move(reader));
+    if (!added) {
+        return Status::InternalError("Failed to add node path {}", 
node->path.get_path());
+    }
+    VLOG_DEBUG << fmt::format("Add substream {} for {}", 
node->path.get_path(), _path.get_path());
+    return Status::OK();
+}
+
+ordinal_t HierarchicalDataReader::get_current_ordinal() const {
+    return (*_substream_reader.begin())->data.iterator->get_current_ordinal();
+}
+
+Status ExtractReader::init(const ColumnIteratorOptions& opts) {
+    if (!_root_reader->inited) {
+        RETURN_IF_ERROR(_root_reader->iterator->init(opts));
+        _root_reader->inited = true;
+    }
+    return Status::OK();
+}
+
+Status ExtractReader::seek_to_first() {
+    LOG(FATAL) << "Not implemented";
+}
+
+Status ExtractReader::seek_to_ordinal(ordinal_t ord) {
+    CHECK(_root_reader->inited);
+    return _root_reader->iterator->seek_to_ordinal(ord);
+}
+
+Status ExtractReader::extract_to(vectorized::MutableColumnPtr& dst, size_t 
nrows) {
+    DCHECK(_root_reader);
+    DCHECK(_root_reader->inited);
+    vectorized::MutableColumnPtr extracted_column;
+    const auto& root = assert_cast<const 
vectorized::ColumnObject&>(*_root_reader->column);
+    // extract root value with path, we can't modify the original root column
+    // since some other column may depend on it.
+    RETURN_IF_ERROR(root.extract_root( // trim the root name, eg. v.a.b -> a.b
+            _col.path_info().pop_front(), extracted_column));
+    if (dst->is_variant()) {
+        auto& dst_var = assert_cast<vectorized::ColumnObject&>(*dst);
+        if (dst_var.empty() || dst_var.is_null_root()) {
+            dst_var.create_root(root.get_root_type(), 
std::move(extracted_column));
+        } else {
+            vectorized::ColumnPtr cast_column;

Review Comment:
   warning: variable 'cast_column' is not initialized [cppcoreguidelines-init-variables]
   
   ```suggestion
               vectorized::ColumnPtr cast_column = 0;
   ```
   



##########
be/src/olap/rowset/segment_v2/hierarchical_data_reader.h:
##########
@@ -0,0 +1,207 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+
+#include "io/io_common.h"
+#include "olap/field.h"
+#include "olap/iterators.h"
+#include "olap/rowset/segment_v2/column_reader.h"
+#include "olap/schema.h"
+#include "olap/tablet_schema.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_object.h"
+#include "vec/columns/subcolumn_tree.h"
+#include "vec/data_types/data_type_object.h"
+#include "vec/data_types/data_type_string.h"
+#include "vec/json/path_in_data.h"
+
+namespace doris {
+namespace segment_v2 {
+
+struct StreamReader {
+    vectorized::MutableColumnPtr column;
+    std::unique_ptr<ColumnIterator> iterator;
+    std::shared_ptr<const vectorized::IDataType> type;
+    bool inited = false;
+    size_t rows_read = 0;
+    StreamReader() = default;
+    StreamReader(vectorized::MutableColumnPtr&& col, 
std::unique_ptr<ColumnIterator>&& it,
+                 std::shared_ptr<const vectorized::IDataType> t)
+            : column(std::move(col)), iterator(std::move(it)), type(t) {}
+};
+
+// path -> StreamReader
+using SubstreamReaderTree = vectorized::SubcolumnsTree<StreamReader>;
+
+// path -> SubcolumnReader
+struct SubcolumnReader {
+    std::unique_ptr<ColumnReader> reader;
+    std::shared_ptr<const vectorized::IDataType> file_column_type;
+};
+using SubcolumnColumnReaders = vectorized::SubcolumnsTree<SubcolumnReader>;
+
+// Reader for hierarchical data for variant, merge with root(sparse encoded 
columns)
+class HierarchicalDataReader : public ColumnIterator {
+public:
+    HierarchicalDataReader(const vectorized::PathInData& path, bool 
output_as_raw_json = false)
+            : _path(path), _output_as_raw_json(output_as_raw_json) {}
+
+    static Status create(std::unique_ptr<ColumnIterator>* reader,
+                         const SubcolumnColumnReaders::Node* target_node,
+                         const SubcolumnColumnReaders::Node* root, bool 
output_as_raw_json = false);
+
+    Status init(const ColumnIteratorOptions& opts) override;
+
+    Status seek_to_first() override;
+
+    Status seek_to_ordinal(ordinal_t ord) override;
+
+    Status next_batch(size_t* n, vectorized::MutableColumnPtr& dst, bool* 
has_null) override;
+
+    Status read_by_rowids(const rowid_t* rowids, const size_t count,
+                          vectorized::MutableColumnPtr& dst) override;
+
+    ordinal_t get_current_ordinal() const override;

Review Comment:
   warning: function 'get_current_ordinal' should be marked [[nodiscard]] [modernize-use-nodiscard]
   
   ```suggestion
       [[nodiscard]] ordinal_t get_current_ordinal() const override;
   ```
   



##########
be/src/olap/rowset/segment_v2/segment.cpp:
##########
@@ -287,6 +337,108 @@ Status Segment::_create_column_readers(const 
SegmentFooterPB& footer) {
                                              _file_reader, &reader));
         _column_readers.emplace(column.unique_id(), std::move(reader));
     }
+
+    // init by column path
+    for (uint32_t ordinal = 0; ordinal < _tablet_schema->num_columns(); 
++ordinal) {
+        auto& column = _tablet_schema->column(ordinal);
+        auto iter = column_path_to_footer_ordinal.find(column.path_info());
+        if (iter == column_path_to_footer_ordinal.end()) {
+            continue;
+        }
+        ColumnReaderOptions opts;
+        opts.kept_in_memory = _tablet_schema->is_in_memory();
+        std::unique_ptr<ColumnReader> reader;
+        RETURN_IF_ERROR(ColumnReader::create(opts, 
footer.columns(iter->second), footer.num_rows(),
+                                             _file_reader, &reader));
+        _sub_column_tree.add(
+                iter->first,
+                SubcolumnReader {std::move(reader),
+                                 
get_data_type_from_column_meta(footer.columns(iter->second))});
+    }
+    return Status::OK();
+}
+
+static Status new_default_iterator(const TabletColumn& tablet_column,
+                                   std::unique_ptr<ColumnIterator>* iter) {
+    if (!tablet_column.has_default_value() && !tablet_column.is_nullable()) {
+        return Status::InternalError("invalid nonexistent column without 
default value.");
+    }
+    auto type_info = get_type_info(&tablet_column);
+    std::unique_ptr<DefaultValueColumnIterator> default_value_iter(new 
DefaultValueColumnIterator(
+            tablet_column.has_default_value(), tablet_column.default_value(),
+            tablet_column.is_nullable(), std::move(type_info), 
tablet_column.precision(),
+            tablet_column.frac()));
+    ColumnIteratorOptions iter_opts;
+
+    RETURN_IF_ERROR(default_value_iter->init(iter_opts));
+    *iter = std::move(default_value_iter);
+    return Status::OK();
+}
+
+Status Segment::new_iterator_with_path(const TabletColumn& tablet_column,
+                                       std::unique_ptr<ColumnIterator>* iter,
+                                       StorageReadOptions* opt) {
+    vectorized::PathInData 
root_path({tablet_column.path_info().get_parts()[0]});
+    auto root = _sub_column_tree.find_leaf(root_path);
+    auto node = _sub_column_tree.find_exact(tablet_column.path_info());
+    if (opt->io_ctx.reader_type == ReaderType::READER_ALTER_TABLE) {
+        CHECK(tablet_column.is_variant_type());
+        if (node == nullptr) {
+            // No such variant column in this segment, get a default one
+            RETURN_IF_ERROR(new_default_iterator(tablet_column, iter));
+            return Status::OK();
+        }
+        bool output_as_raw_json = true;
+        // Alter table operation should read the whole variant column, since 
it does not aware of
+        // subcolumns of variant during processing rewriting rowsets.
+        // This is slow, since it needs to read all sub columns and merge them 
into a single column
+        RETURN_IF_ERROR(HierarchicalDataReader::create(iter, node, root, 
output_as_raw_json));
+        return Status::OK();
+    }
+
+    if (opt->io_ctx.reader_type != ReaderType::READER_QUERY) {
+        // Could be compaction ..etc and read flat leaves nodes data
+        auto node = _sub_column_tree.find_leaf(tablet_column.path_info());
+        if (!node) {
+            RETURN_IF_ERROR(new_default_iterator(tablet_column, iter));
+            return Status::OK();
+        }
+        ColumnIterator* it;

Review Comment:
   warning: variable 'it' is not initialized [cppcoreguidelines-init-variables]
   
   ```suggestion
           ColumnIterator* it = nullptr;
   ```
   



##########
be/src/olap/block_column_predicate.h:
##########
@@ -199,6 +208,15 @@
         return true;
     }
 
+    bool can_do_apply_safely(PrimitiveType input_type, bool is_null) const 
override {

Review Comment:
   warning: function 'can_do_apply_safely' should be marked [[nodiscard]] [modernize-use-nodiscard]
   
   ```suggestion
       [[nodiscard]] bool can_do_apply_safely(PrimitiveType input_type, bool is_null) const override {
   ```
   



##########
be/src/olap/rowset/segment_v2/segment.cpp:
##########
@@ -287,6 +337,108 @@
                                              _file_reader, &reader));
         _column_readers.emplace(column.unique_id(), std::move(reader));
     }
+
+    // init by column path
+    for (uint32_t ordinal = 0; ordinal < _tablet_schema->num_columns(); 
++ordinal) {
+        auto& column = _tablet_schema->column(ordinal);
+        auto iter = column_path_to_footer_ordinal.find(column.path_info());
+        if (iter == column_path_to_footer_ordinal.end()) {
+            continue;
+        }
+        ColumnReaderOptions opts;
+        opts.kept_in_memory = _tablet_schema->is_in_memory();
+        std::unique_ptr<ColumnReader> reader;
+        RETURN_IF_ERROR(ColumnReader::create(opts, 
footer.columns(iter->second), footer.num_rows(),
+                                             _file_reader, &reader));
+        _sub_column_tree.add(
+                iter->first,
+                SubcolumnReader {std::move(reader),
+                                 
get_data_type_from_column_meta(footer.columns(iter->second))});
+    }
+    return Status::OK();
+}
+
+static Status new_default_iterator(const TabletColumn& tablet_column,
+                                   std::unique_ptr<ColumnIterator>* iter) {
+    if (!tablet_column.has_default_value() && !tablet_column.is_nullable()) {
+        return Status::InternalError("invalid nonexistent column without 
default value.");
+    }
+    auto type_info = get_type_info(&tablet_column);
+    std::unique_ptr<DefaultValueColumnIterator> default_value_iter(new 
DefaultValueColumnIterator(
+            tablet_column.has_default_value(), tablet_column.default_value(),
+            tablet_column.is_nullable(), std::move(type_info), 
tablet_column.precision(),
+            tablet_column.frac()));
+    ColumnIteratorOptions iter_opts;
+
+    RETURN_IF_ERROR(default_value_iter->init(iter_opts));
+    *iter = std::move(default_value_iter);
+    return Status::OK();
+}
+
+Status Segment::new_iterator_with_path(const TabletColumn& tablet_column,
+                                       std::unique_ptr<ColumnIterator>* iter,
+                                       StorageReadOptions* opt) {
+    vectorized::PathInData 
root_path({tablet_column.path_info().get_parts()[0]});
+    auto root = _sub_column_tree.find_leaf(root_path);
+    auto node = _sub_column_tree.find_exact(tablet_column.path_info());
+    if (opt->io_ctx.reader_type == ReaderType::READER_ALTER_TABLE) {
+        CHECK(tablet_column.is_variant_type());
+        if (node == nullptr) {
+            // No such variant column in this segment, get a default one
+            RETURN_IF_ERROR(new_default_iterator(tablet_column, iter));
+            return Status::OK();
+        }
+        bool output_as_raw_json = true;
+        // Alter table operation should read the whole variant column, since 
it does not aware of
+        // subcolumns of variant during processing rewriting rowsets.
+        // This is slow, since it needs to read all sub columns and merge them 
into a single column
+        RETURN_IF_ERROR(HierarchicalDataReader::create(iter, node, root, 
output_as_raw_json));
+        return Status::OK();
+    }
+
+    if (opt->io_ctx.reader_type != ReaderType::READER_QUERY) {
+        // Could be compaction ..etc and read flat leaves nodes data
+        auto node = _sub_column_tree.find_leaf(tablet_column.path_info());
+        if (!node) {
+            RETURN_IF_ERROR(new_default_iterator(tablet_column, iter));
+            return Status::OK();
+        }
+        ColumnIterator* it;
+        RETURN_IF_ERROR(node->data.reader->new_iterator(&it));
+        iter->reset(it);
+        return Status::OK();
+    }
+
+    // Init iterators with extra path info.
+    // TODO If this segment does not contain any data correspond to the 
relatate path,
+    // then we could optimize to generate a default iterator
+    // This file doest not contain this column, so only read from sparse column
+    // to avoid read amplification
+    if (node != nullptr && node->is_scalar() && node->children.empty()) {
+        // Direct read extracted columns
+        const auto* node = 
_sub_column_tree.find_leaf(tablet_column.path_info());
+        ColumnIterator* it;

Review Comment:
   warning: variable 'it' is not initialized [cppcoreguidelines-init-variables]
   
   ```suggestion
           ColumnIterator* it = nullptr;
   ```
   



##########
be/src/olap/rowset/segment_v2/column_reader.cpp:
##########
@@ -1434,5 +1454,47 @@ void 
DefaultValueColumnIterator::_insert_many_default(vectorized::MutableColumnP
     }
 }
 
+Status VariantRootColumnIterator::next_batch(size_t* n, 
vectorized::MutableColumnPtr& dst,

Review Comment:
   warning: pointer parameter 'n' can be pointer to const [readability-non-const-parameter]
   
   ```suggestion
   Status VariantRootColumnIterator::next_batch(const size_t* n, vectorized::MutableColumnPtr& dst,
   ```
   



##########
be/src/olap/in_list_predicate.h:
##########
@@ -221,13 +225,13 @@ class InListPredicateBase : public ColumnPredicate {
         return Status::OK();
     }
 
-    Status evaluate(const Schema& schema, InvertedIndexIterator* iterator, 
uint32_t num_rows,
+    Status evaluate(const vectorized::NameAndTypePair& name_with_type,
+                    InvertedIndexIterator* iterator, uint32_t num_rows,
                     roaring::Roaring* result) const override {
         if (iterator == nullptr) {
             return Status::OK();
         }
-        auto column_desc = schema.column(_column_id);
-        std::string column_name = column_desc->name();
+        std::string column_name = name_with_type.first;

Review Comment:
   warning: variable 'column_name' is not initialized [cppcoreguidelines-init-variables]
   
   ```suggestion
           std::string column_name = 0 = name_with_type.first;
   ```
   



##########
be/src/olap/rowset/segment_v2/column_reader.h:
##########
@@ -592,6 +594,38 @@ class RowIdColumnIterator : public ColumnIterator {
     int32_t _segment_id = 0;
 };
 
+class VariantRootColumnIterator : public ColumnIterator {
+public:
+    VariantRootColumnIterator() = delete;
+
+    explicit VariantRootColumnIterator(FileColumnIterator* iter) { 
_inner_iter.reset(iter); }
+
+    ~VariantRootColumnIterator() override = default;
+
+    Status init(const ColumnIteratorOptions& opts) override { return 
_inner_iter->init(opts); }
+
+    Status seek_to_first() override { return _inner_iter->seek_to_first(); }
+
+    Status seek_to_ordinal(ordinal_t ord_idx) override {
+        return _inner_iter->seek_to_ordinal(ord_idx);
+    }
+
+    Status next_batch(size_t* n, vectorized::MutableColumnPtr& dst) {
+        bool has_null;

Review Comment:
   warning: variable 'has_null' is not initialized [cppcoreguidelines-init-variables]
   
   ```suggestion
           bool has_null = false;
   ```
   



##########
be/src/olap/comparison_predicate.h:
##########
@@ -74,13 +78,13 @@ class ComparisonPredicateBase : public ColumnPredicate {
                                bitmap);
     }
 
-    Status evaluate(const Schema& schema, InvertedIndexIterator* iterator, 
uint32_t num_rows,
+    Status evaluate(const vectorized::NameAndTypePair& name_with_type,
+                    InvertedIndexIterator* iterator, uint32_t num_rows,
                     roaring::Roaring* bitmap) const override {
         if (iterator == nullptr) {
             return Status::OK();
         }
-        auto column_desc = schema.column(_column_id);
-        std::string column_name = column_desc->name();
+        std::string column_name = name_with_type.first;

Review Comment:
   warning: variable 'column_name' is not initialized [cppcoreguidelines-init-variables]
   
   ```suggestion
           std::string column_name = 0 = name_with_type.first;
   ```
   



##########
be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp:
##########
@@ -0,0 +1,217 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/hierarchical_data_reader.h"
+
+#include "common/status.h"
+#include "io/io_common.h"
+#include "olap/rowset/segment_v2/column_reader.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_object.h"
+#include "vec/common/assert_cast.h"
+#include "vec/common/schema_util.h"
+#include "vec/data_types/data_type.h"
+#include "vec/data_types/data_type_nullable.h"
+#include "vec/json/path_in_data.h"
+
+namespace doris {
+namespace segment_v2 {
+
+Status HierarchicalDataReader::create(std::unique_ptr<ColumnIterator>* reader,
+                                      const SubcolumnColumnReaders::Node* node,
+                                      const SubcolumnColumnReaders::Node* root,
+                                      bool output_as_raw_json) {
+    // None leave node need merge with root
+    auto* stream_iter = new HierarchicalDataReader(node->path, 
output_as_raw_json);
+    std::vector<const SubcolumnColumnReaders::Node*> leaves;
+    vectorized::PathsInData leaves_paths;

Review Comment:
   warning: variable 'leaves_paths' is not initialized [cppcoreguidelines-init-variables]
   
   ```suggestion
       vectorized::PathsInData leaves_paths = 0;
   ```
   



##########
be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp:
##########
@@ -0,0 +1,217 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/hierarchical_data_reader.h"
+
+#include "common/status.h"
+#include "io/io_common.h"
+#include "olap/rowset/segment_v2/column_reader.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_object.h"
+#include "vec/common/assert_cast.h"
+#include "vec/common/schema_util.h"
+#include "vec/data_types/data_type.h"
+#include "vec/data_types/data_type_nullable.h"
+#include "vec/json/path_in_data.h"
+
+namespace doris {
+namespace segment_v2 {
+
+Status HierarchicalDataReader::create(std::unique_ptr<ColumnIterator>* reader,
+                                      const SubcolumnColumnReaders::Node* node,
+                                      const SubcolumnColumnReaders::Node* root,
+                                      bool output_as_raw_json) {
+    // None leave node need merge with root
+    auto* stream_iter = new HierarchicalDataReader(node->path, 
output_as_raw_json);
+    std::vector<const SubcolumnColumnReaders::Node*> leaves;
+    vectorized::PathsInData leaves_paths;
+    SubcolumnColumnReaders::get_leaves_of_node(node, leaves, leaves_paths);
+    for (size_t i = 0; i < leaves_paths.size(); ++i) {
+        if (leaves_paths[i] == root->path) {
+            // use set_root to share instead
+            continue;
+        }
+        stream_iter->add_stream(leaves[i]);
+    }
+    // Make sure the root node is in strem_cache, so that child can merge data 
with root
+    // Eg. {"a" : "b" : {"c" : 1}}, access the `a.b` path and merge with root 
path so that
+    // we could make sure the data could be fully merged, since some column 
may not be extracted but remains in root
+    // like {"a" : "b" : {"e" : 1.1}} in jsonb format
+    ColumnIterator* it;
+    RETURN_IF_ERROR(root->data.reader->new_iterator(&it));
+    stream_iter->set_root(std::make_unique<StreamReader>(
+            root->data.file_column_type->create_column(), 
std::unique_ptr<ColumnIterator>(it),
+            root->data.file_column_type));
+    reader->reset(stream_iter);
+    return Status::OK();
+}
+
+Status HierarchicalDataReader::init(const ColumnIteratorOptions& opts) {
+    RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) {
+        RETURN_IF_ERROR(node.data.iterator->init(opts));
+        node.data.inited = true;
+        return Status::OK();
+    }));
+    if (_root_reader && !_root_reader->inited) {
+        RETURN_IF_ERROR(_root_reader->iterator->init(opts));
+        _root_reader->inited = true;
+    }
+    return Status::OK();
+}
+
+Status HierarchicalDataReader::seek_to_first() {
+    LOG(FATAL) << "Not implemented";
+}
+
+Status HierarchicalDataReader::seek_to_ordinal(ordinal_t ord) {
+    RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) {
+        RETURN_IF_ERROR(node.data.iterator->seek_to_ordinal(ord));
+        return Status::OK();
+    }));
+    if (_root_reader) {
+        DCHECK(_root_reader->inited);
+        RETURN_IF_ERROR(_root_reader->iterator->seek_to_ordinal(ord));
+    }
+    return Status::OK();
+}
+
+Status HierarchicalDataReader::next_batch(size_t* n, 
vectorized::MutableColumnPtr& dst,
+                                          bool* has_null) {
+    return process_read(
+            [&](StreamReader& reader, const vectorized::PathInData& path,
+                const vectorized::DataTypePtr& type) {
+                CHECK(reader.inited);
+                RETURN_IF_ERROR(reader.iterator->next_batch(n, reader.column, 
has_null));
+                VLOG_DEBUG << fmt::format("{} next_batch {} rows, type={}", 
path.get_path(), *n,
+                                          type->get_name());
+                reader.rows_read += *n;
+                return Status::OK();
+            },
+            dst, *n);
+}
+
+Status HierarchicalDataReader::read_by_rowids(const rowid_t* rowids, const 
size_t count,
+                                              vectorized::MutableColumnPtr& 
dst) {
+    return process_read(
+            [&](StreamReader& reader, const vectorized::PathInData& path,
+                const vectorized::DataTypePtr& type) {
+                CHECK(reader.inited);
+                RETURN_IF_ERROR(reader.iterator->read_by_rowids(rowids, count, 
reader.column));
+                VLOG_DEBUG << fmt::format("{} read_by_rowids {} rows, 
type={}", path.get_path(),
+                                          count, type->get_name());
+                reader.rows_read += count;
+                return Status::OK();
+            },
+            dst, count);
+}
+
+Status HierarchicalDataReader::add_stream(const SubcolumnColumnReaders::Node* 
node) {
+    if (_substream_reader.find_leaf(node->path)) {
+        VLOG_DEBUG << "Already exist sub column " << node->path.get_path();
+        return Status::OK();
+    }
+    CHECK(node);
+    ColumnIterator* it;

Review Comment:
   warning: variable 'it' is not initialized [cppcoreguidelines-init-variables]
   
   ```suggestion
       ColumnIterator* it = nullptr;
   ```
   



##########
be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp:
##########
@@ -0,0 +1,217 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/hierarchical_data_reader.h"
+
+#include "common/status.h"
+#include "io/io_common.h"
+#include "olap/rowset/segment_v2/column_reader.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_object.h"
+#include "vec/common/assert_cast.h"
+#include "vec/common/schema_util.h"
+#include "vec/data_types/data_type.h"
+#include "vec/data_types/data_type_nullable.h"
+#include "vec/json/path_in_data.h"
+
+namespace doris {
+namespace segment_v2 {
+
+Status HierarchicalDataReader::create(std::unique_ptr<ColumnIterator>* reader,
+                                      const SubcolumnColumnReaders::Node* node,
+                                      const SubcolumnColumnReaders::Node* root,
+                                      bool output_as_raw_json) {
+    // None leave node need merge with root
+    auto* stream_iter = new HierarchicalDataReader(node->path, 
output_as_raw_json);
+    std::vector<const SubcolumnColumnReaders::Node*> leaves;
+    vectorized::PathsInData leaves_paths;
+    SubcolumnColumnReaders::get_leaves_of_node(node, leaves, leaves_paths);
+    for (size_t i = 0; i < leaves_paths.size(); ++i) {
+        if (leaves_paths[i] == root->path) {
+            // use set_root to share instead
+            continue;
+        }
+        stream_iter->add_stream(leaves[i]);
+    }
+    // Make sure the root node is in strem_cache, so that child can merge data 
with root
+    // Eg. {"a" : "b" : {"c" : 1}}, access the `a.b` path and merge with root 
path so that
+    // we could make sure the data could be fully merged, since some column 
may not be extracted but remains in root
+    // like {"a" : "b" : {"e" : 1.1}} in jsonb format
+    ColumnIterator* it;
+    RETURN_IF_ERROR(root->data.reader->new_iterator(&it));
+    stream_iter->set_root(std::make_unique<StreamReader>(
+            root->data.file_column_type->create_column(), 
std::unique_ptr<ColumnIterator>(it),
+            root->data.file_column_type));
+    reader->reset(stream_iter);
+    return Status::OK();
+}
+
+Status HierarchicalDataReader::init(const ColumnIteratorOptions& opts) {
+    RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) {
+        RETURN_IF_ERROR(node.data.iterator->init(opts));
+        node.data.inited = true;
+        return Status::OK();
+    }));
+    if (_root_reader && !_root_reader->inited) {
+        RETURN_IF_ERROR(_root_reader->iterator->init(opts));
+        _root_reader->inited = true;
+    }
+    return Status::OK();
+}
+
+Status HierarchicalDataReader::seek_to_first() {
+    LOG(FATAL) << "Not implemented";
+}
+
+Status HierarchicalDataReader::seek_to_ordinal(ordinal_t ord) {
+    RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) {
+        RETURN_IF_ERROR(node.data.iterator->seek_to_ordinal(ord));
+        return Status::OK();
+    }));
+    if (_root_reader) {
+        DCHECK(_root_reader->inited);
+        RETURN_IF_ERROR(_root_reader->iterator->seek_to_ordinal(ord));
+    }
+    return Status::OK();
+}
+
+Status HierarchicalDataReader::next_batch(size_t* n, 
vectorized::MutableColumnPtr& dst,
+                                          bool* has_null) {
+    return process_read(
+            [&](StreamReader& reader, const vectorized::PathInData& path,
+                const vectorized::DataTypePtr& type) {
+                CHECK(reader.inited);
+                RETURN_IF_ERROR(reader.iterator->next_batch(n, reader.column, 
has_null));
+                VLOG_DEBUG << fmt::format("{} next_batch {} rows, type={}", 
path.get_path(), *n,
+                                          type->get_name());
+                reader.rows_read += *n;
+                return Status::OK();
+            },
+            dst, *n);
+}
+
+Status HierarchicalDataReader::read_by_rowids(const rowid_t* rowids, const 
size_t count,
+                                              vectorized::MutableColumnPtr& 
dst) {
+    return process_read(
+            [&](StreamReader& reader, const vectorized::PathInData& path,
+                const vectorized::DataTypePtr& type) {
+                CHECK(reader.inited);
+                RETURN_IF_ERROR(reader.iterator->read_by_rowids(rowids, count, 
reader.column));
+                VLOG_DEBUG << fmt::format("{} read_by_rowids {} rows, 
type={}", path.get_path(),
+                                          count, type->get_name());
+                reader.rows_read += count;
+                return Status::OK();
+            },
+            dst, count);
+}
+
+Status HierarchicalDataReader::add_stream(const SubcolumnColumnReaders::Node* 
node) {
+    if (_substream_reader.find_leaf(node->path)) {
+        VLOG_DEBUG << "Already exist sub column " << node->path.get_path();
+        return Status::OK();
+    }
+    CHECK(node);
+    ColumnIterator* it;
+    RETURN_IF_ERROR(node->data.reader->new_iterator(&it));
+    std::unique_ptr<ColumnIterator> it_ptr;

Review Comment:
   warning: variable 'it_ptr' is not initialized 
[cppcoreguidelines-init-variables]
   
   ```suggestion
       std::unique_ptr<ColumnIterator> it_ptr = nullptr;
   ```
   



##########
be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp:
##########
@@ -0,0 +1,217 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/hierarchical_data_reader.h"
+
+#include "common/status.h"
+#include "io/io_common.h"
+#include "olap/rowset/segment_v2/column_reader.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_object.h"
+#include "vec/common/assert_cast.h"
+#include "vec/common/schema_util.h"
+#include "vec/data_types/data_type.h"
+#include "vec/data_types/data_type_nullable.h"
+#include "vec/json/path_in_data.h"
+
+namespace doris {
+namespace segment_v2 {
+
+Status HierarchicalDataReader::create(std::unique_ptr<ColumnIterator>* reader,
+                                      const SubcolumnColumnReaders::Node* node,
+                                      const SubcolumnColumnReaders::Node* root,
+                                      bool output_as_raw_json) {
+    // None leave node need merge with root
+    auto* stream_iter = new HierarchicalDataReader(node->path, 
output_as_raw_json);
+    std::vector<const SubcolumnColumnReaders::Node*> leaves;
+    vectorized::PathsInData leaves_paths;
+    SubcolumnColumnReaders::get_leaves_of_node(node, leaves, leaves_paths);
+    for (size_t i = 0; i < leaves_paths.size(); ++i) {
+        if (leaves_paths[i] == root->path) {
+            // use set_root to share instead
+            continue;
+        }
+        stream_iter->add_stream(leaves[i]);
+    }
+    // Make sure the root node is in strem_cache, so that child can merge data 
with root
+    // Eg. {"a" : "b" : {"c" : 1}}, access the `a.b` path and merge with root 
path so that
+    // we could make sure the data could be fully merged, since some column 
may not be extracted but remains in root
+    // like {"a" : "b" : {"e" : 1.1}} in jsonb format
+    ColumnIterator* it;
+    RETURN_IF_ERROR(root->data.reader->new_iterator(&it));
+    stream_iter->set_root(std::make_unique<StreamReader>(
+            root->data.file_column_type->create_column(), 
std::unique_ptr<ColumnIterator>(it),
+            root->data.file_column_type));
+    reader->reset(stream_iter);
+    return Status::OK();
+}
+
+Status HierarchicalDataReader::init(const ColumnIteratorOptions& opts) {
+    RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) {
+        RETURN_IF_ERROR(node.data.iterator->init(opts));
+        node.data.inited = true;
+        return Status::OK();
+    }));
+    if (_root_reader && !_root_reader->inited) {
+        RETURN_IF_ERROR(_root_reader->iterator->init(opts));
+        _root_reader->inited = true;
+    }
+    return Status::OK();
+}
+
+Status HierarchicalDataReader::seek_to_first() {
+    LOG(FATAL) << "Not implemented";
+}
+
+Status HierarchicalDataReader::seek_to_ordinal(ordinal_t ord) {
+    RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) {
+        RETURN_IF_ERROR(node.data.iterator->seek_to_ordinal(ord));
+        return Status::OK();
+    }));
+    if (_root_reader) {
+        DCHECK(_root_reader->inited);
+        RETURN_IF_ERROR(_root_reader->iterator->seek_to_ordinal(ord));
+    }
+    return Status::OK();
+}
+
+Status HierarchicalDataReader::next_batch(size_t* n, 
vectorized::MutableColumnPtr& dst,
+                                          bool* has_null) {
+    return process_read(
+            [&](StreamReader& reader, const vectorized::PathInData& path,
+                const vectorized::DataTypePtr& type) {
+                CHECK(reader.inited);
+                RETURN_IF_ERROR(reader.iterator->next_batch(n, reader.column, 
has_null));
+                VLOG_DEBUG << fmt::format("{} next_batch {} rows, type={}", 
path.get_path(), *n,
+                                          type->get_name());
+                reader.rows_read += *n;
+                return Status::OK();
+            },
+            dst, *n);
+}
+
+Status HierarchicalDataReader::read_by_rowids(const rowid_t* rowids, const 
size_t count,
+                                              vectorized::MutableColumnPtr& 
dst) {
+    return process_read(
+            [&](StreamReader& reader, const vectorized::PathInData& path,
+                const vectorized::DataTypePtr& type) {
+                CHECK(reader.inited);
+                RETURN_IF_ERROR(reader.iterator->read_by_rowids(rowids, count, 
reader.column));
+                VLOG_DEBUG << fmt::format("{} read_by_rowids {} rows, 
type={}", path.get_path(),
+                                          count, type->get_name());
+                reader.rows_read += count;
+                return Status::OK();
+            },
+            dst, count);
+}
+
+Status HierarchicalDataReader::add_stream(const SubcolumnColumnReaders::Node* 
node) {
+    if (_substream_reader.find_leaf(node->path)) {
+        VLOG_DEBUG << "Already exist sub column " << node->path.get_path();
+        return Status::OK();
+    }
+    CHECK(node);
+    ColumnIterator* it;
+    RETURN_IF_ERROR(node->data.reader->new_iterator(&it));
+    std::unique_ptr<ColumnIterator> it_ptr;
+    it_ptr.reset(it);
+    StreamReader reader(node->data.file_column_type->create_column(), 
std::move(it_ptr),
+                        node->data.file_column_type);
+    bool added = _substream_reader.add(node->path, std::move(reader));

Review Comment:
   warning: variable 'added' is not initialized 
[cppcoreguidelines-init-variables]
   
   ```suggestion
       bool added = _substream_reader.add(node->path, std::move(reader));
   ```
   



##########
be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp:
##########
@@ -0,0 +1,217 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/hierarchical_data_reader.h"
+
+#include "common/status.h"
+#include "io/io_common.h"
+#include "olap/rowset/segment_v2/column_reader.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_object.h"
+#include "vec/common/assert_cast.h"
+#include "vec/common/schema_util.h"
+#include "vec/data_types/data_type.h"
+#include "vec/data_types/data_type_nullable.h"
+#include "vec/json/path_in_data.h"
+
+namespace doris {
+namespace segment_v2 {
+
+Status HierarchicalDataReader::create(std::unique_ptr<ColumnIterator>* reader,
+                                      const SubcolumnColumnReaders::Node* node,
+                                      const SubcolumnColumnReaders::Node* root,
+                                      bool output_as_raw_json) {
+    // None leave node need merge with root
+    auto* stream_iter = new HierarchicalDataReader(node->path, 
output_as_raw_json);
+    std::vector<const SubcolumnColumnReaders::Node*> leaves;
+    vectorized::PathsInData leaves_paths;
+    SubcolumnColumnReaders::get_leaves_of_node(node, leaves, leaves_paths);
+    for (size_t i = 0; i < leaves_paths.size(); ++i) {
+        if (leaves_paths[i] == root->path) {
+            // use set_root to share instead
+            continue;
+        }
+        stream_iter->add_stream(leaves[i]);
+    }
+    // Make sure the root node is in strem_cache, so that child can merge data 
with root
+    // Eg. {"a" : "b" : {"c" : 1}}, access the `a.b` path and merge with root 
path so that
+    // we could make sure the data could be fully merged, since some column 
may not be extracted but remains in root
+    // like {"a" : "b" : {"e" : 1.1}} in jsonb format
+    ColumnIterator* it;

Review Comment:
   warning: variable 'it' is not initialized [cppcoreguidelines-init-variables]
   
   ```suggestion
       ColumnIterator* it = nullptr;
   ```
   



##########
be/src/olap/rowset/segment_v2/segment.cpp:
##########
@@ -287,6 +337,108 @@
                                              _file_reader, &reader));
         _column_readers.emplace(column.unique_id(), std::move(reader));
     }
+
+    // init by column path
+    for (uint32_t ordinal = 0; ordinal < _tablet_schema->num_columns(); 
++ordinal) {
+        auto& column = _tablet_schema->column(ordinal);
+        auto iter = column_path_to_footer_ordinal.find(column.path_info());
+        if (iter == column_path_to_footer_ordinal.end()) {
+            continue;
+        }
+        ColumnReaderOptions opts;
+        opts.kept_in_memory = _tablet_schema->is_in_memory();
+        std::unique_ptr<ColumnReader> reader;
+        RETURN_IF_ERROR(ColumnReader::create(opts, 
footer.columns(iter->second), footer.num_rows(),
+                                             _file_reader, &reader));
+        _sub_column_tree.add(
+                iter->first,
+                SubcolumnReader {std::move(reader),
+                                 
get_data_type_from_column_meta(footer.columns(iter->second))});
+    }
+    return Status::OK();
+}
+
+static Status new_default_iterator(const TabletColumn& tablet_column,
+                                   std::unique_ptr<ColumnIterator>* iter) {
+    if (!tablet_column.has_default_value() && !tablet_column.is_nullable()) {
+        return Status::InternalError("invalid nonexistent column without 
default value.");
+    }
+    auto type_info = get_type_info(&tablet_column);
+    std::unique_ptr<DefaultValueColumnIterator> default_value_iter(new 
DefaultValueColumnIterator(
+            tablet_column.has_default_value(), tablet_column.default_value(),
+            tablet_column.is_nullable(), std::move(type_info), 
tablet_column.precision(),
+            tablet_column.frac()));
+    ColumnIteratorOptions iter_opts;
+
+    RETURN_IF_ERROR(default_value_iter->init(iter_opts));
+    *iter = std::move(default_value_iter);
+    return Status::OK();
+}
+
+Status Segment::new_iterator_with_path(const TabletColumn& tablet_column,
+                                       std::unique_ptr<ColumnIterator>* iter,
+                                       StorageReadOptions* opt) {
+    vectorized::PathInData 
root_path({tablet_column.path_info().get_parts()[0]});
+    auto root = _sub_column_tree.find_leaf(root_path);
+    auto node = _sub_column_tree.find_exact(tablet_column.path_info());
+    if (opt->io_ctx.reader_type == ReaderType::READER_ALTER_TABLE) {
+        CHECK(tablet_column.is_variant_type());
+        if (node == nullptr) {
+            // No such variant column in this segment, get a default one
+            RETURN_IF_ERROR(new_default_iterator(tablet_column, iter));
+            return Status::OK();
+        }
+        bool output_as_raw_json = true;
+        // Alter table operation should read the whole variant column, since 
it does not aware of
+        // subcolumns of variant during processing rewriting rowsets.
+        // This is slow, since it needs to read all sub columns and merge them 
into a single column
+        RETURN_IF_ERROR(HierarchicalDataReader::create(iter, node, root, 
output_as_raw_json));
+        return Status::OK();
+    }
+
+    if (opt->io_ctx.reader_type != ReaderType::READER_QUERY) {
+        // Could be compaction ..etc and read flat leaves nodes data
+        auto node = _sub_column_tree.find_leaf(tablet_column.path_info());
+        if (!node) {
+            RETURN_IF_ERROR(new_default_iterator(tablet_column, iter));
+            return Status::OK();
+        }
+        ColumnIterator* it;
+        RETURN_IF_ERROR(node->data.reader->new_iterator(&it));
+        iter->reset(it);
+        return Status::OK();
+    }
+
+    // Init iterators with extra path info.
+    // TODO If this segment does not contain any data correspond to the 
relatate path,
+    // then we could optimize to generate a default iterator
+    // This file doest not contain this column, so only read from sparse column
+    // to avoid read amplification
+    if (node != nullptr && node->is_scalar() && node->children.empty()) {
+        // Direct read extracted columns
+        const auto* node = 
_sub_column_tree.find_leaf(tablet_column.path_info());

Review Comment:
   warning: variable 'node' is not initialized 
[cppcoreguidelines-init-variables]
   
   ```suggestion
           const auto* node = _sub_column_tree.find_leaf(tablet_column.path_info());
   ```
   



##########
be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp:
##########
@@ -0,0 +1,217 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/hierarchical_data_reader.h"
+
+#include "common/status.h"
+#include "io/io_common.h"
+#include "olap/rowset/segment_v2/column_reader.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_object.h"
+#include "vec/common/assert_cast.h"
+#include "vec/common/schema_util.h"
+#include "vec/data_types/data_type.h"
+#include "vec/data_types/data_type_nullable.h"
+#include "vec/json/path_in_data.h"
+
+namespace doris {
+namespace segment_v2 {
+
+Status HierarchicalDataReader::create(std::unique_ptr<ColumnIterator>* reader,
+                                      const SubcolumnColumnReaders::Node* node,
+                                      const SubcolumnColumnReaders::Node* root,
+                                      bool output_as_raw_json) {
+    // None leave node need merge with root
+    auto* stream_iter = new HierarchicalDataReader(node->path, 
output_as_raw_json);
+    std::vector<const SubcolumnColumnReaders::Node*> leaves;
+    vectorized::PathsInData leaves_paths;
+    SubcolumnColumnReaders::get_leaves_of_node(node, leaves, leaves_paths);
+    for (size_t i = 0; i < leaves_paths.size(); ++i) {
+        if (leaves_paths[i] == root->path) {
+            // use set_root to share instead
+            continue;
+        }
+        stream_iter->add_stream(leaves[i]);
+    }
+    // Make sure the root node is in strem_cache, so that child can merge data 
with root
+    // Eg. {"a" : "b" : {"c" : 1}}, access the `a.b` path and merge with root 
path so that
+    // we could make sure the data could be fully merged, since some column 
may not be extracted but remains in root
+    // like {"a" : "b" : {"e" : 1.1}} in jsonb format
+    ColumnIterator* it;
+    RETURN_IF_ERROR(root->data.reader->new_iterator(&it));
+    stream_iter->set_root(std::make_unique<StreamReader>(
+            root->data.file_column_type->create_column(), 
std::unique_ptr<ColumnIterator>(it),
+            root->data.file_column_type));
+    reader->reset(stream_iter);
+    return Status::OK();
+}
+
+Status HierarchicalDataReader::init(const ColumnIteratorOptions& opts) {
+    RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) {
+        RETURN_IF_ERROR(node.data.iterator->init(opts));
+        node.data.inited = true;
+        return Status::OK();
+    }));
+    if (_root_reader && !_root_reader->inited) {
+        RETURN_IF_ERROR(_root_reader->iterator->init(opts));
+        _root_reader->inited = true;
+    }
+    return Status::OK();
+}
+
+Status HierarchicalDataReader::seek_to_first() {
+    LOG(FATAL) << "Not implemented";
+}
+
+Status HierarchicalDataReader::seek_to_ordinal(ordinal_t ord) {
+    RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) {
+        RETURN_IF_ERROR(node.data.iterator->seek_to_ordinal(ord));
+        return Status::OK();
+    }));
+    if (_root_reader) {
+        DCHECK(_root_reader->inited);
+        RETURN_IF_ERROR(_root_reader->iterator->seek_to_ordinal(ord));
+    }
+    return Status::OK();
+}
+
+Status HierarchicalDataReader::next_batch(size_t* n, 
vectorized::MutableColumnPtr& dst,
+                                          bool* has_null) {
+    return process_read(
+            [&](StreamReader& reader, const vectorized::PathInData& path,
+                const vectorized::DataTypePtr& type) {
+                CHECK(reader.inited);
+                RETURN_IF_ERROR(reader.iterator->next_batch(n, reader.column, 
has_null));
+                VLOG_DEBUG << fmt::format("{} next_batch {} rows, type={}", 
path.get_path(), *n,
+                                          type->get_name());
+                reader.rows_read += *n;
+                return Status::OK();
+            },
+            dst, *n);
+}
+
+Status HierarchicalDataReader::read_by_rowids(const rowid_t* rowids, const 
size_t count,
+                                              vectorized::MutableColumnPtr& 
dst) {
+    return process_read(
+            [&](StreamReader& reader, const vectorized::PathInData& path,
+                const vectorized::DataTypePtr& type) {
+                CHECK(reader.inited);
+                RETURN_IF_ERROR(reader.iterator->read_by_rowids(rowids, count, 
reader.column));
+                VLOG_DEBUG << fmt::format("{} read_by_rowids {} rows, 
type={}", path.get_path(),
+                                          count, type->get_name());
+                reader.rows_read += count;
+                return Status::OK();
+            },
+            dst, count);
+}
+
+Status HierarchicalDataReader::add_stream(const SubcolumnColumnReaders::Node* 
node) {
+    if (_substream_reader.find_leaf(node->path)) {
+        VLOG_DEBUG << "Already exist sub column " << node->path.get_path();
+        return Status::OK();
+    }
+    CHECK(node);
+    ColumnIterator* it;
+    RETURN_IF_ERROR(node->data.reader->new_iterator(&it));
+    std::unique_ptr<ColumnIterator> it_ptr;
+    it_ptr.reset(it);
+    StreamReader reader(node->data.file_column_type->create_column(), 
std::move(it_ptr),
+                        node->data.file_column_type);
+    bool added = _substream_reader.add(node->path, std::move(reader));
+    if (!added) {
+        return Status::InternalError("Failed to add node path {}", 
node->path.get_path());
+    }
+    VLOG_DEBUG << fmt::format("Add substream {} for {}", 
node->path.get_path(), _path.get_path());
+    return Status::OK();
+}
+
+ordinal_t HierarchicalDataReader::get_current_ordinal() const {
+    return (*_substream_reader.begin())->data.iterator->get_current_ordinal();
+}
+
+Status ExtractReader::init(const ColumnIteratorOptions& opts) {
+    if (!_root_reader->inited) {
+        RETURN_IF_ERROR(_root_reader->iterator->init(opts));
+        _root_reader->inited = true;
+    }
+    return Status::OK();
+}
+
+Status ExtractReader::seek_to_first() {
+    LOG(FATAL) << "Not implemented";
+}
+
+Status ExtractReader::seek_to_ordinal(ordinal_t ord) {
+    CHECK(_root_reader->inited);
+    return _root_reader->iterator->seek_to_ordinal(ord);
+}
+
+Status ExtractReader::extract_to(vectorized::MutableColumnPtr& dst, size_t 
nrows) {
+    DCHECK(_root_reader);
+    DCHECK(_root_reader->inited);
+    vectorized::MutableColumnPtr extracted_column;

Review Comment:
   warning: variable 'extracted_column' is not initialized 
[cppcoreguidelines-init-variables]
   
   ```suggestion
       vectorized::MutableColumnPtr extracted_column {};
   ```
   



##########
be/src/olap/rowset/segment_v2/hierarchical_data_reader.h:
##########
@@ -0,0 +1,207 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+
+#include "io/io_common.h"
+#include "olap/field.h"
+#include "olap/iterators.h"
+#include "olap/rowset/segment_v2/column_reader.h"
+#include "olap/schema.h"
+#include "olap/tablet_schema.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_object.h"
+#include "vec/columns/subcolumn_tree.h"
+#include "vec/data_types/data_type_object.h"
+#include "vec/data_types/data_type_string.h"
+#include "vec/json/path_in_data.h"
+
+namespace doris {
+namespace segment_v2 {
+
+// Bundles everything needed to read one column stream: the in-memory column
+// being filled, the iterator that reads it from the file, and its data type.
+struct StreamReader {
+    // Column that read data is materialised into.
+    vectorized::MutableColumnPtr column;
+    // Iterator over the on-disk column; owned by this reader.
+    std::unique_ptr<ColumnIterator> iterator;
+    // Data type describing `column`.
+    std::shared_ptr<const vectorized::IDataType> type;
+    // Set once `iterator` has been initialised, so init is not repeated.
+    bool inited = false;
+    size_t rows_read = 0;
+    StreamReader() = default;
+    // `t` is a by-value sink parameter, so it is moved into place instead of
+    // being copied (avoids an extra shared_ptr refcount increment/decrement).
+    StreamReader(vectorized::MutableColumnPtr&& col, std::unique_ptr<ColumnIterator>&& it,
+                 std::shared_ptr<const vectorized::IDataType> t)
+            : column(std::move(col)), iterator(std::move(it)), type(std::move(t)) {}
+};
+
+// path -> StreamReader: subcolumn tree whose node payload is a StreamReader
+using SubstreamReaderTree = vectorized::SubcolumnsTree<StreamReader>;
+
+// path -> SubcolumnReader: per-path payload stored in the SubcolumnColumnReaders tree
+struct SubcolumnReader {
+    std::unique_ptr<ColumnReader> reader; // column reader for this path; new_iterator() is called on it to obtain iterators
+    std::shared_ptr<const vectorized::IDataType> file_column_type; // data type of the column; create_column() on it yields the in-memory column
+};
+using SubcolumnColumnReaders = vectorized::SubcolumnsTree<SubcolumnReader>; // subcolumn tree whose node payload is a SubcolumnReader
+
+// Reader for hierarchical data for variant, merge with root(sparse encoded columns).
+//
+// Holds one StreamReader for the root column plus a tree of StreamReaders for the
+// sub columns registered through add_stream(). process_read() reads all of them
+// and merges the result into the destination ColumnObject.
+class HierarchicalDataReader : public ColumnIterator {
+public:
+    // `path` is the variant path served by this reader. When `output_as_raw_json`
+    // is true, process_read() emits every row as a serialized string stored as the
+    // root of the destination variant instead of copying sub columns.
+    HierarchicalDataReader(const vectorized::PathInData& path, bool output_as_raw_json = false)
+            : _path(path), _output_as_raw_json(output_as_raw_json) {}
+
+    static Status create(std::unique_ptr<ColumnIterator>* reader,
+                         const SubcolumnColumnReaders::Node* target_node,
+                         const SubcolumnColumnReaders::Node* root, bool output_as_raw_json = false);
+
+    Status init(const ColumnIteratorOptions& opts) override;
+
+    Status seek_to_first() override;
+
+    Status seek_to_ordinal(ordinal_t ord) override;
+
+    Status next_batch(size_t* n, vectorized::MutableColumnPtr& dst, bool* has_null) override;
+
+    Status read_by_rowids(const rowid_t* rowids, const size_t count,
+                          vectorized::MutableColumnPtr& dst) override;
+
+    ordinal_t get_current_ordinal() const override;
+
+    Status add_stream(const SubcolumnColumnReaders::Node* node);
+
+    // Takes ownership of the reader used for the root column.
+    void set_root(std::unique_ptr<StreamReader>&& root) { _root_reader = std::move(root); }
+
+private:
+    SubstreamReaderTree _substream_reader;
+    std::unique_ptr<StreamReader> _root_reader;
+    size_t _rows_read = 0;
+    vectorized::PathInData _path;
+    bool _output_as_raw_json = false;
+
+    // Invokes `node_func` on every node of `_substream_reader`, stopping at and
+    // returning the first non-OK Status.
+    template <typename NodeFunction>
+    Status tranverse(NodeFunction&& node_func) {
+        for (auto& entry : _substream_reader) {
+            RETURN_IF_ERROR(node_func(*entry));
+        }
+        return Status::OK();
+    }
+
+    // Reads `nrows` rows from the root and from every substream, merges them, and
+    // stores the result in `dst` (which must be a ColumnObject).
+    //
+    // `read_func` is called as read_func(stream_reader, path, type) once for the
+    // root (with an empty path) and once per substream; it must return a Status.
+    // Any error from `read_func`, or a duplicated sub column path, aborts the read
+    // and is returned to the caller.
+    template <typename ReadFunction>
+    Status process_read(ReadFunction&& read_func, vectorized::MutableColumnPtr& dst, size_t nrows) {
+        auto& variant = assert_cast<vectorized::ColumnObject&>(*dst);
+
+        // read data
+        // read root first if it is not read before
+        RETURN_IF_ERROR(read_func(*_root_reader, {}, _root_reader->type));
+
+        // read container columns
+        RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) {
+            RETURN_IF_ERROR(read_func(node.data, node.path, node.data.type));
+            return Status::OK();
+        }));
+
+        // build variant as container
+        // NOTE(review): the meaning of the (true, false) arguments is defined by
+        // ColumnObject::create and is not visible here.
+        auto container = vectorized::ColumnObject::create(true, false);
+        auto& container_variant = assert_cast<vectorized::ColumnObject&>(*container);
+
+        // add root first; the root column is only attached when the requested
+        // path consists of a single part
+        if (_path.get_parts().size() == 1) {
+            auto& root_var = assert_cast<vectorized::ColumnObject&>(*_root_reader->column);
+            auto column = root_var.get_root();
+            auto type = root_var.get_root_type();
+            container_variant.add_sub_column({}, std::move(column), type);
+        }
+
+        // attach every substream column under its path with the first part removed
+        RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) {
+            vectorized::MutableColumnPtr column = node.data.column->get_ptr();
+            bool add = container_variant.add_sub_column(node.path.pop_front(), std::move(column),
+                                                        node.data.type);
+            if (!add) {
+                return Status::InternalError("Duplicated {}, type {}", node.path.get_path(),
+                                             node.data.type->get_name());
+            }
+            return Status::OK();
+        }));
+
+        if (_output_as_raw_json) {
+            // Serialize each merged row to a string and make that string column
+            // the root of the (required to be empty) destination variant.
+            auto col_to = vectorized::ColumnString::create();
+            col_to->reserve(nrows * 2);
+            vectorized::VectorBufferWriter write_buffer(*col_to.get());
+            auto type = std::make_shared<vectorized::DataTypeObject>();
+            for (size_t i = 0; i < nrows; ++i) {
+                type->to_string(container_variant, i, write_buffer);
+                write_buffer.commit();
+            }
+            CHECK(variant.empty());
+            variant.create_root(std::make_shared<vectorized::DataTypeString>(), std::move(col_to));
+        } else {
+            // TODO select v:b -> v.b / v.b.c but v.d maybe in v
+            // copy container variant to dst variant, todo avoid copy
+            variant.insert_range_from(container_variant, 0, nrows);
+        }
+
+        variant.set_num_rows(nrows);
+        _rows_read += nrows;
+        variant.finalize();
+#ifndef NDEBUG
+        variant.check_consistency();
+#endif
+        // clear data in nodes so the next batch starts from empty buffers; the
+        // Status is propagated like the other traversals instead of being dropped
+        RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) {
+            node.data.column->clear();
+            return Status::OK();
+        }));
+        container->clear();
+        // same cast as used for the root column above
+        assert_cast<vectorized::ColumnObject&>(*_root_reader->column).clear_subcolumns_data();
+        return Status::OK();
+    }
+};
+
+// Extract from root column of variant, since root column of variant
+// encodes sparse columns that are not materialized
+class ExtractReader : public ColumnIterator {
+public:
+    ExtractReader(const TabletColumn& col, std::unique_ptr<StreamReader>&& 
root_reader)
+            : _col(col), _root_reader(std::move(root_reader)) {}
+
+    Status init(const ColumnIteratorOptions& opts) override;
+
+    Status seek_to_first() override;
+
+    Status seek_to_ordinal(ordinal_t ord) override;
+
+    Status next_batch(size_t* n, vectorized::MutableColumnPtr& dst, bool* 
has_null) override;
+
+    Status read_by_rowids(const rowid_t* rowids, const size_t count,
+                          vectorized::MutableColumnPtr& dst) override;
+
+    ordinal_t get_current_ordinal() const override;

Review Comment:
   warning: function 'get_current_ordinal' should be marked [[nodiscard]] 
[modernize-use-nodiscard]
   
   ```suggestion
       [[nodiscard]] ordinal_t get_current_ordinal() const override;
   ```
   



##########
be/src/olap/rowset/segment_v2/segment.cpp:
##########
@@ -298,36 +450,53 @@
 // but in the old schema column b's cid == 2
 // but they are not the same column
 Status Segment::new_column_iterator(const TabletColumn& tablet_column,
-                                    std::unique_ptr<ColumnIterator>* iter) {
+                                    std::unique_ptr<ColumnIterator>* iter,
+                                    StorageReadOptions* opt) {
+    // init column iterator by path info
+    if (!tablet_column.path_info().empty() || tablet_column.is_variant_type()) 
{
+        return new_iterator_with_path(tablet_column, iter, opt);
+    }
+    // init default iterator
     if (_column_readers.count(tablet_column.unique_id()) < 1) {
-        if (!tablet_column.has_default_value() && 
!tablet_column.is_nullable()) {
-            return Status::InternalError("invalid nonexistent column without 
default value.");
-        }
-        auto type_info = get_type_info(&tablet_column);
-        std::unique_ptr<DefaultValueColumnIterator> default_value_iter(
-                new 
DefaultValueColumnIterator(tablet_column.has_default_value(),
-                                               tablet_column.default_value(),
-                                               tablet_column.is_nullable(), 
std::move(type_info),
-                                               tablet_column.precision(), 
tablet_column.frac()));
-        ColumnIteratorOptions iter_opts;
-
-        RETURN_IF_ERROR(default_value_iter->init(iter_opts));
-        *iter = std::move(default_value_iter);
+        RETURN_IF_ERROR(new_default_iterator(tablet_column, iter));
         return Status::OK();
     }
+    // init iterator by unique id
     ColumnIterator* it;
     
RETURN_IF_ERROR(_column_readers.at(tablet_column.unique_id())->new_iterator(&it));
     iter->reset(it);
     return Status::OK();
 }
 
+Status Segment::new_column_iterator(int32_t unique_id, 
std::unique_ptr<ColumnIterator>* iter) {
+    ColumnIterator* it;

Review Comment:
   warning: variable 'it' is not initialized [cppcoreguidelines-init-variables]
   
   ```suggestion
       ColumnIterator* it = nullptr;
   ```
   



##########
be/src/olap/rowset/segment_v2/segment.cpp:
##########
@@ -287,6 +337,108 @@
                                              _file_reader, &reader));
         _column_readers.emplace(column.unique_id(), std::move(reader));
     }
+
+    // init by column path
+    for (uint32_t ordinal = 0; ordinal < _tablet_schema->num_columns(); 
++ordinal) {
+        auto& column = _tablet_schema->column(ordinal);
+        auto iter = column_path_to_footer_ordinal.find(column.path_info());
+        if (iter == column_path_to_footer_ordinal.end()) {
+            continue;
+        }
+        ColumnReaderOptions opts;
+        opts.kept_in_memory = _tablet_schema->is_in_memory();
+        std::unique_ptr<ColumnReader> reader;
+        RETURN_IF_ERROR(ColumnReader::create(opts, 
footer.columns(iter->second), footer.num_rows(),
+                                             _file_reader, &reader));
+        _sub_column_tree.add(
+                iter->first,
+                SubcolumnReader {std::move(reader),
+                                 
get_data_type_from_column_meta(footer.columns(iter->second))});
+    }
+    return Status::OK();
+}
+
+// Creates an iterator that yields the column's default value, for use when the
+// segment holds no data for `tablet_column`.
+//
+// Returns InternalError when the column has neither a default value nor is
+// nullable; otherwise initialises a DefaultValueColumnIterator with default
+// ColumnIteratorOptions and stores it in `*iter`.
+static Status new_default_iterator(const TabletColumn& tablet_column,
+                                   std::unique_ptr<ColumnIterator>* iter) {
+    if (!tablet_column.has_default_value() && !tablet_column.is_nullable()) {
+        return Status::InternalError("invalid nonexistent column without default value.");
+    }
+    auto type_info = get_type_info(&tablet_column);
+    // make_unique instead of a raw `new` handed to unique_ptr.
+    auto default_value_iter = std::make_unique<DefaultValueColumnIterator>(
+            tablet_column.has_default_value(), tablet_column.default_value(),
+            tablet_column.is_nullable(), std::move(type_info), tablet_column.precision(),
+            tablet_column.frac());
+    ColumnIteratorOptions iter_opts;
+
+    RETURN_IF_ERROR(default_value_iter->init(iter_opts));
+    *iter = std::move(default_value_iter);
+    return Status::OK();
+}
+
+Status Segment::new_iterator_with_path(const TabletColumn& tablet_column,
+                                       std::unique_ptr<ColumnIterator>* iter,
+                                       StorageReadOptions* opt) {
+    vectorized::PathInData 
root_path({tablet_column.path_info().get_parts()[0]});
+    auto root = _sub_column_tree.find_leaf(root_path);
+    auto node = _sub_column_tree.find_exact(tablet_column.path_info());
+    if (opt->io_ctx.reader_type == ReaderType::READER_ALTER_TABLE) {
+        CHECK(tablet_column.is_variant_type());
+        if (node == nullptr) {
+            // No such variant column in this segment, get a default one
+            RETURN_IF_ERROR(new_default_iterator(tablet_column, iter));
+            return Status::OK();
+        }
+        bool output_as_raw_json = true;
+        // Alter table operation should read the whole variant column, since 
it does not aware of
+        // subcolumns of variant during processing rewriting rowsets.
+        // This is slow, since it needs to read all sub columns and merge them 
into a single column
+        RETURN_IF_ERROR(HierarchicalDataReader::create(iter, node, root, 
output_as_raw_json));
+        return Status::OK();
+    }
+
+    if (opt->io_ctx.reader_type != ReaderType::READER_QUERY) {
+        // Could be compaction ..etc and read flat leaves nodes data
+        auto node = _sub_column_tree.find_leaf(tablet_column.path_info());
+        if (!node) {
+            RETURN_IF_ERROR(new_default_iterator(tablet_column, iter));
+            return Status::OK();
+        }
+        ColumnIterator* it;
+        RETURN_IF_ERROR(node->data.reader->new_iterator(&it));
+        iter->reset(it);
+        return Status::OK();
+    }
+
+    // Init iterators with extra path info.
+    // TODO If this segment does not contain any data correspond to the 
relatate path,
+    // then we could optimize to generate a default iterator
+    // This file doest not contain this column, so only read from sparse column
+    // to avoid read amplification
+    if (node != nullptr && node->is_scalar() && node->children.empty()) {
+        // Direct read extracted columns
+        const auto* node = 
_sub_column_tree.find_leaf(tablet_column.path_info());
+        ColumnIterator* it;
+        RETURN_IF_ERROR(node->data.reader->new_iterator(&it));
+        iter->reset(it);
+    } else if (node != nullptr && !node->children.empty()) {
+        // Create reader with hirachical data
+        RETURN_IF_ERROR(HierarchicalDataReader::create(iter, node, root));
+    } else {
+        // If file only exist column `v.a` and `v` but target path is `v.b`, 
read only read and parse root column
+        if (root == nullptr) {
+            // No such variant column in this segment, get a default one
+            RETURN_IF_ERROR(new_default_iterator(tablet_column, iter));
+            return Status::OK();
+        }
+        ColumnIterator* it;

Review Comment:
   warning: variable 'it' is not initialized [cppcoreguidelines-init-variables]
   
   ```suggestion
           ColumnIterator* it = nullptr;
   ```
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Variant master index load [doris]

Reply via email to