Copilot commented on code in PR #60511:
URL: https://github.com/apache/doris/pull/60511#discussion_r2765103976


##########
be/test/olap/variant_doc_mode_compaction_test.cpp:
##########
@@ -0,0 +1,469 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gen_cpp/Types_types.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <stdint.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "common/config.h"
+#include "common/status.h"
+#include "io/fs/local_file_system.h"
+#include "olap/cumulative_compaction.h"
+#include "olap/data_dir.h"
+#include "olap/olap_define.h"
+#include "olap/olap_common.h"
+#include "olap/options.h"
+#include "olap/rowset/beta_rowset.h"
+#include "olap/rowset/rowset.h"
+#include "olap/rowset/rowset_factory.h"
+#include "olap/rowset/rowset_reader.h"
+#include "olap/rowset/rowset_reader_context.h"
+#include "olap/rowset/rowset_writer.h"
+#include "olap/rowset/rowset_writer_context.h"
+#include "olap/rowset/segment_v2/index_writer.h"
+#include "olap/schema.h"
+#include "olap/storage_engine.h"
+#include "olap/tablet.h"
+#include "olap/tablet_meta.h"
+#include "olap/tablet_schema.h"
+#include "runtime/exec_env.h"
+#include "testutil/variant_util.h"
+#include "vec/columns/column_string.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_variant.h"
+#include "vec/core/block.h"
+#include "vec/core/field.h"
+#include "vec/data_types/data_type_string.h"
+
+namespace doris {
+namespace vectorized {
+using namespace ErrorCode;
+
+static constexpr uint32_t MAX_PATH_LEN = 1024;
+static StorageEngine* engine_ref = nullptr;
+static constexpr int32_t kRowsPerSegment = 100000;
+
+class VariantDocModeCompactionTest : public ::testing::Test {
+protected:
+    void SetUp() override {
+        char buffer[MAX_PATH_LEN];
+        ASSERT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr);
+        absolute_dir = std::string(buffer) + std::string(kTestDir);
+        Status st;
+        bool exists = false;
+        ASSERT_TRUE(io::global_local_filesystem()->exists(absolute_dir, 
&exists).ok());
+        if (!exists) {
+            st = io::global_local_filesystem()->create_directory(absolute_dir);
+            ASSERT_TRUE(st.ok()) << st.to_string();
+        }
+        ASSERT_TRUE(
+                io::global_local_filesystem()->create_directory(absolute_dir + 
"/tablet_path").ok());
+        cache_dir = std::string(buffer) + 
"/ut_dir/variant_doc_mode_compaction_test_cache";
+        
ASSERT_TRUE(io::global_local_filesystem()->create_directory(cache_dir).ok());
+
+        
ASSERT_TRUE(io::global_local_filesystem()->delete_directory(std::string(tmp_dir)).ok());
+        
ASSERT_TRUE(io::global_local_filesystem()->create_directory(std::string(tmp_dir)).ok());
+        std::vector<StorePath> paths;
+        paths.emplace_back(std::string(tmp_dir), 1024000000);
+        auto tmp_file_dirs = std::make_unique<segment_v2::TmpFileDirs>(paths);
+        st = tmp_file_dirs->init();
+        ASSERT_TRUE(st.ok()) << st.to_json();
+        ExecEnv::GetInstance()->set_tmp_file_dir(std::move(tmp_file_dirs));
+
+        EngineOptions options;
+        auto engine = std::make_unique<StorageEngine>(options);
+        engine_ref = engine.get();
+        _data_dir = std::make_unique<DataDir>(*engine_ref, absolute_dir);
+        ASSERT_TRUE(_data_dir->init(true).ok());
+        ExecEnv::GetInstance()->set_storage_engine(std::move(engine));
+
+        config::enable_ordered_data_compaction = false;
+        config::segments_key_bounds_truncation_threshold = -1;
+        config::enable_vertical_compact_variant_subcolumns = true;
+        config::enable_compaction_checksum = false;
+        config::enable_stacktrace = true;
+    }

Review Comment:
   This fixture mutates multiple global `config::` flags. Since `doris_be_test` 
runs all unit tests in one binary, these values will leak into other tests 
unless restored. Consider saving previous values in `SetUp()` and restoring 
them in `TearDown()`.



##########
be/test/olap/variant_doc_mode_compaction_test.cpp:
##########
@@ -0,0 +1,469 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gen_cpp/Types_types.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <stdint.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "common/config.h"
+#include "common/status.h"
+#include "io/fs/local_file_system.h"
+#include "olap/cumulative_compaction.h"
+#include "olap/data_dir.h"
+#include "olap/olap_define.h"
+#include "olap/olap_common.h"
+#include "olap/options.h"
+#include "olap/rowset/beta_rowset.h"
+#include "olap/rowset/rowset.h"
+#include "olap/rowset/rowset_factory.h"
+#include "olap/rowset/rowset_reader.h"
+#include "olap/rowset/rowset_reader_context.h"
+#include "olap/rowset/rowset_writer.h"
+#include "olap/rowset/rowset_writer_context.h"
+#include "olap/rowset/segment_v2/index_writer.h"
+#include "olap/schema.h"
+#include "olap/storage_engine.h"
+#include "olap/tablet.h"
+#include "olap/tablet_meta.h"
+#include "olap/tablet_schema.h"
+#include "runtime/exec_env.h"
+#include "testutil/variant_util.h"
+#include "vec/columns/column_string.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_variant.h"
+#include "vec/core/block.h"
+#include "vec/core/field.h"
+#include "vec/data_types/data_type_string.h"
+
+namespace doris {
+namespace vectorized {
+using namespace ErrorCode;
+
+static constexpr uint32_t MAX_PATH_LEN = 1024;
+static StorageEngine* engine_ref = nullptr;
+static constexpr int32_t kRowsPerSegment = 100000;
+

Review Comment:
   The test builds 10 segments of 100,000 rows each (1,000,000 rows total) and 
generates fairly large random JSON per row. This is likely to be too slow/heavy 
for a unit test binary. Consider reducing `kRowsPerSegment` and the 
materialization threshold proportionally, or gating this as a 
performance/benchmark-style test (e.g., `DISABLED_` or env-guarded).



##########
be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp:
##########
@@ -1176,8 +1207,8 @@ Status VariantDocCompactWriter::finalize() {
     const auto& column_offsets = 
variant_column->serialized_doc_value_column_offsets();
     std::map<StringRef, uint32_t> column_stats;
     for (int64_t i = 0; i < num_rows; ++i) {
-        size_t start = column_offsets[i - 1];
-        size_t end = column_offsets[i];
+        const size_t start = column_offsets[i - 1];

Review Comment:
   In the `for (i = 0; i < num_rows; ++i)` loop, `column_offsets[i - 1]` is 
used when `i == 0`, which underflows and can read invalid memory. Initialize 
`start` to 0 for the first row (e.g., `start = (i == 0) ? 0 : column_offsets[i 
- 1]`).
   ```suggestion
           const size_t start = (i == 0) ? 0 : column_offsets[i - 1];
   ```



##########
be/test/olap/variant_doc_mode_compaction_test.cpp:
##########
@@ -0,0 +1,469 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gen_cpp/Types_types.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <stdint.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "common/config.h"
+#include "common/status.h"
+#include "io/fs/local_file_system.h"
+#include "olap/cumulative_compaction.h"
+#include "olap/data_dir.h"
+#include "olap/olap_define.h"
+#include "olap/olap_common.h"
+#include "olap/options.h"
+#include "olap/rowset/beta_rowset.h"
+#include "olap/rowset/rowset.h"
+#include "olap/rowset/rowset_factory.h"
+#include "olap/rowset/rowset_reader.h"
+#include "olap/rowset/rowset_reader_context.h"
+#include "olap/rowset/rowset_writer.h"
+#include "olap/rowset/rowset_writer_context.h"
+#include "olap/rowset/segment_v2/index_writer.h"
+#include "olap/schema.h"
+#include "olap/storage_engine.h"
+#include "olap/tablet.h"
+#include "olap/tablet_meta.h"
+#include "olap/tablet_schema.h"
+#include "runtime/exec_env.h"
+#include "testutil/variant_util.h"
+#include "vec/columns/column_string.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_variant.h"
+#include "vec/core/block.h"
+#include "vec/core/field.h"
+#include "vec/data_types/data_type_string.h"
+
+namespace doris {
+namespace vectorized {
+using namespace ErrorCode;
+
+static constexpr uint32_t MAX_PATH_LEN = 1024;
+static StorageEngine* engine_ref = nullptr;
+static constexpr int32_t kRowsPerSegment = 100000;
+
+class VariantDocModeCompactionTest : public ::testing::Test {
+protected:
+    void SetUp() override {
+        char buffer[MAX_PATH_LEN];
+        ASSERT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr);
+        absolute_dir = std::string(buffer) + std::string(kTestDir);
+        Status st;
+        bool exists = false;
+        ASSERT_TRUE(io::global_local_filesystem()->exists(absolute_dir, 
&exists).ok());
+        if (!exists) {
+            st = io::global_local_filesystem()->create_directory(absolute_dir);
+            ASSERT_TRUE(st.ok()) << st.to_string();
+        }
+        ASSERT_TRUE(
+                io::global_local_filesystem()->create_directory(absolute_dir + 
"/tablet_path").ok());
+        cache_dir = std::string(buffer) + 
"/ut_dir/variant_doc_mode_compaction_test_cache";
+        
ASSERT_TRUE(io::global_local_filesystem()->create_directory(cache_dir).ok());
+
+        
ASSERT_TRUE(io::global_local_filesystem()->delete_directory(std::string(tmp_dir)).ok());
+        
ASSERT_TRUE(io::global_local_filesystem()->create_directory(std::string(tmp_dir)).ok());
+        std::vector<StorePath> paths;
+        paths.emplace_back(std::string(tmp_dir), 1024000000);
+        auto tmp_file_dirs = std::make_unique<segment_v2::TmpFileDirs>(paths);
+        st = tmp_file_dirs->init();
+        ASSERT_TRUE(st.ok()) << st.to_json();
+        ExecEnv::GetInstance()->set_tmp_file_dir(std::move(tmp_file_dirs));
+
+        EngineOptions options;
+        auto engine = std::make_unique<StorageEngine>(options);
+        engine_ref = engine.get();
+        _data_dir = std::make_unique<DataDir>(*engine_ref, absolute_dir);
+        ASSERT_TRUE(_data_dir->init(true).ok());
+        ExecEnv::GetInstance()->set_storage_engine(std::move(engine));
+
+        config::enable_ordered_data_compaction = false;
+        config::segments_key_bounds_truncation_threshold = -1;
+        config::enable_vertical_compact_variant_subcolumns = true;
+        config::enable_compaction_checksum = false;
+        config::enable_stacktrace = true;
+    }
+
+    void TearDown() override {
+        // 
ASSERT_TRUE(io::global_local_filesystem()->delete_directory(absolute_dir).ok());
+        engine_ref = nullptr;
+        ExecEnv::GetInstance()->set_storage_engine(nullptr);
+    }

Review Comment:
   `TearDown()` does not clean `absolute_dir`, `cache_dir`, or `tmp_dir` (and 
`SetUp()` reuses existing dirs). This makes the test non-hermetic and can leave 
large artifacts under `ut_dir`, affecting subsequent local/CI runs. Prefer 
deleting these directories in `TearDown()` and/or recreating them from a clean 
state in `SetUp()`.



##########
be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp:
##########
@@ -335,19 +342,38 @@ Status UnifiedSparseColumnWriter::append_bucket_sparse(
         auto& m = assert_cast<vectorized::ColumnMap&>(*tmp_maps[b]);
         m.get_offsets().reserve(num_rows);
     }
-    for (ssize_t row = 0; row < static_cast<ssize_t>(num_rows); ++row) {
-        size_t start = offsets[row - 1];
-        size_t end = offsets[row];
+    std::vector<vectorized::ColumnString*> bucket_keys(bucket_num);
+    std::vector<vectorized::ColumnString*> bucket_values(bucket_num);
+    std::vector<vectorized::ColumnArray::Offsets64*> 
bucket_offsets(bucket_num);
+    for (int b = 0; b < bucket_num; ++b) {
+        auto& m = assert_cast<vectorized::ColumnMap&>(*tmp_maps[b]);
+        bucket_keys[b] = &assert_cast<vectorized::ColumnString&>(m.get_keys());
+        bucket_values[b] = 
&assert_cast<vectorized::ColumnString&>(m.get_values());
+        bucket_offsets[b] = &m.get_offsets();
+    }
+    for (size_t row = 0; row < num_rows; ++row) {
+        const size_t start = offsets[row - 1];

Review Comment:
   `offsets[row - 1]` is accessed when `row == 0`, which underflows and results 
in an out-of-bounds read. Use the same pattern as 
`VariantDocWriter::append_data` below (start = 0 for the first row).
   ```suggestion
           const size_t start = (row == 0 ? 0 : offsets[row - 1]);
   ```



##########
be/test/olap/variant_doc_mode_compaction_test.cpp:
##########
@@ -0,0 +1,469 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gen_cpp/Types_types.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <stdint.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "common/config.h"
+#include "common/status.h"
+#include "io/fs/local_file_system.h"
+#include "olap/cumulative_compaction.h"
+#include "olap/data_dir.h"
+#include "olap/olap_define.h"
+#include "olap/olap_common.h"
+#include "olap/options.h"
+#include "olap/rowset/beta_rowset.h"
+#include "olap/rowset/rowset.h"
+#include "olap/rowset/rowset_factory.h"
+#include "olap/rowset/rowset_reader.h"
+#include "olap/rowset/rowset_reader_context.h"
+#include "olap/rowset/rowset_writer.h"
+#include "olap/rowset/rowset_writer_context.h"
+#include "olap/rowset/segment_v2/index_writer.h"
+#include "olap/schema.h"
+#include "olap/storage_engine.h"
+#include "olap/tablet.h"
+#include "olap/tablet_meta.h"
+#include "olap/tablet_schema.h"
+#include "runtime/exec_env.h"
+#include "testutil/variant_util.h"
+#include "vec/columns/column_string.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_variant.h"
+#include "vec/core/block.h"
+#include "vec/core/field.h"
+#include "vec/data_types/data_type_string.h"
+
+namespace doris {
+namespace vectorized {
+using namespace ErrorCode;
+
+static constexpr uint32_t MAX_PATH_LEN = 1024;
+static StorageEngine* engine_ref = nullptr;
+static constexpr int32_t kRowsPerSegment = 100000;
+
+class VariantDocModeCompactionTest : public ::testing::Test {
+protected:
+    void SetUp() override {
+        char buffer[MAX_PATH_LEN];
+        ASSERT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr);
+        absolute_dir = std::string(buffer) + std::string(kTestDir);
+        Status st;
+        bool exists = false;
+        ASSERT_TRUE(io::global_local_filesystem()->exists(absolute_dir, 
&exists).ok());
+        if (!exists) {
+            st = io::global_local_filesystem()->create_directory(absolute_dir);
+            ASSERT_TRUE(st.ok()) << st.to_string();
+        }
+        ASSERT_TRUE(
+                io::global_local_filesystem()->create_directory(absolute_dir + 
"/tablet_path").ok());
+        cache_dir = std::string(buffer) + 
"/ut_dir/variant_doc_mode_compaction_test_cache";
+        
ASSERT_TRUE(io::global_local_filesystem()->create_directory(cache_dir).ok());
+
+        
ASSERT_TRUE(io::global_local_filesystem()->delete_directory(std::string(tmp_dir)).ok());
+        
ASSERT_TRUE(io::global_local_filesystem()->create_directory(std::string(tmp_dir)).ok());
+        std::vector<StorePath> paths;
+        paths.emplace_back(std::string(tmp_dir), 1024000000);
+        auto tmp_file_dirs = std::make_unique<segment_v2::TmpFileDirs>(paths);
+        st = tmp_file_dirs->init();
+        ASSERT_TRUE(st.ok()) << st.to_json();
+        ExecEnv::GetInstance()->set_tmp_file_dir(std::move(tmp_file_dirs));
+
+        EngineOptions options;
+        auto engine = std::make_unique<StorageEngine>(options);
+        engine_ref = engine.get();
+        _data_dir = std::make_unique<DataDir>(*engine_ref, absolute_dir);
+        ASSERT_TRUE(_data_dir->init(true).ok());
+        ExecEnv::GetInstance()->set_storage_engine(std::move(engine));
+
+        config::enable_ordered_data_compaction = false;
+        config::segments_key_bounds_truncation_threshold = -1;
+        config::enable_vertical_compact_variant_subcolumns = true;
+        config::enable_compaction_checksum = false;
+        config::enable_stacktrace = true;
+    }
+
+    void TearDown() override {
+        // 
ASSERT_TRUE(io::global_local_filesystem()->delete_directory(absolute_dir).ok());
+        engine_ref = nullptr;
+        ExecEnv::GetInstance()->set_storage_engine(nullptr);
+    }
+
+    TabletSchemaSPtr create_schema() {
+        TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>();
+        TabletSchemaPB tablet_schema_pb;
+        tablet_schema_pb.set_keys_type(KeysType::DUP_KEYS);
+        tablet_schema_pb.set_num_short_key_columns(1);
+        tablet_schema_pb.set_num_rows_per_row_block(1024);
+        tablet_schema_pb.set_compress_kind(COMPRESS_NONE);
+        tablet_schema_pb.set_next_column_unique_id(3);
+
+        ColumnPB* column_1 = tablet_schema_pb.add_column();
+        column_1->set_unique_id(1);
+        column_1->set_name("c1");
+        column_1->set_type("INT");
+        column_1->set_is_key(true);
+        column_1->set_length(4);
+        column_1->set_index_length(4);
+        column_1->set_is_nullable(false);
+        column_1->set_is_bf_column(false);
+
+        ColumnPB* column_2 = tablet_schema_pb.add_column();
+        column_2->set_unique_id(2);
+        column_2->set_name("v");
+        column_2->set_type("VARIANT");
+        column_2->set_is_key(false);
+        column_2->set_is_nullable(false);
+        column_2->set_variant_max_subcolumns_count(0);
+        column_2->set_variant_max_sparse_column_statistics_size(10000);
+        column_2->set_variant_sparse_hash_shard_count(128);
+        column_2->set_variant_enable_doc_mode(true);
+        column_2->set_variant_doc_materialization_min_rows(kRowsPerSegment);
+
+        tablet_schema->init_from_pb(tablet_schema_pb);
+        return tablet_schema;
+    }
+
+    TabletSharedPtr create_tablet(const TabletSchema& tablet_schema) {
+        std::vector<TColumn> cols;
+        std::unordered_map<uint32_t, uint32_t> col_ordinal_to_unique_id;
+        for (int i = 0; i < tablet_schema.num_columns(); ++i) {
+            const TabletColumn& column = tablet_schema.column(i);
+            TColumn col;
+            if (column.type() == FieldType::OLAP_FIELD_TYPE_VARIANT) {
+                col.column_type.type = TPrimitiveType::VARIANT;
+            } else {
+                col.column_type.type = TPrimitiveType::INT;
+            }
+            col.__set_column_name(column.name());
+            col.__set_is_key(column.is_key());
+            cols.push_back(col);
+            col_ordinal_to_unique_id[i] = column.unique_id();
+        }
+
+        TTabletSchema t_tablet_schema;
+        
t_tablet_schema.__set_short_key_column_count(tablet_schema.num_short_key_columns());
+        t_tablet_schema.__set_schema_hash(3333);
+        t_tablet_schema.__set_keys_type(TKeysType::DUP_KEYS);
+        t_tablet_schema.__set_storage_type(TStorageType::COLUMN);
+        t_tablet_schema.__set_columns(cols);
+
+        TabletMetaSharedPtr tablet_meta(
+                new TabletMeta(2, 2, 2, 2, 2, 2, t_tablet_schema, 2, 
col_ordinal_to_unique_id,
+                               UniqueId(1, 2), TTabletType::TABLET_TYPE_DISK,
+                               TCompressionType::LZ4F, 0, false));
+
+        TabletSharedPtr tablet(new Tablet(*engine_ref, tablet_meta, 
_data_dir.get()));
+        static_cast<void>(tablet->init());
+        return tablet;
+    }
+
+    void create_rowset_writer_context(TabletSchemaSPtr tablet_schema, const 
std::string& rowset_dir,
+                                      int64_t version, RowsetWriterContext* 
rowset_writer_context) {
+        RowsetId rowset_id;
+        rowset_id.init(version + 1000);
+        rowset_writer_context->rowset_id = rowset_id;
+        rowset_writer_context->rowset_type = BETA_ROWSET;
+        rowset_writer_context->data_dir = _data_dir.get();
+        rowset_writer_context->rowset_state = VISIBLE;
+        rowset_writer_context->tablet_schema = tablet_schema;
+        rowset_writer_context->tablet_path = rowset_dir;
+        rowset_writer_context->version = Version(version, version);
+        rowset_writer_context->segments_overlap = NONOVERLAPPING;
+        rowset_writer_context->max_rows_per_segment = kRowsPerSegment;
+    }
+
+    std::string generate_random_json(const std::vector<std::string>& key_pool, 
std::mt19937_64& rng,
+                                    uint32_t row_seed) {
+        std::uniform_int_distribution<int> kv_count_dist(10, 50);
+        std::uniform_int_distribution<int> key_idx_dist(0, 
static_cast<int>(key_pool.size() - 1));
+        int kv_count = kv_count_dist(rng);
+
+        std::array<int, 50> selected {};
+        int selected_size = 0;
+        while (selected_size < kv_count) {
+            int idx = key_idx_dist(rng);
+            bool dup = false;
+            for (int i = 0; i < selected_size; ++i) {
+                if (selected[i] == idx) {
+                    dup = true;
+                    break;
+                }
+            }
+            if (!dup) {
+                selected[selected_size++] = idx;
+            }
+        }
+
+        std::string json;
+        json.reserve(static_cast<size_t>(kv_count) * 20 + 2);
+        json.push_back('{');
+        for (int i = 0; i < kv_count; ++i) {
+            const auto& key = key_pool[selected[i]];
+            json.push_back('"');
+            json.append(key);
+            json.append("\":");
+            json.append(std::to_string(static_cast<uint32_t>(selected[i]) ^ 
row_seed));
+            if (i + 1 < kv_count) {
+                json.push_back(',');
+            }
+        }
+        json.push_back('}');
+        return json;
+    }
+
+    RowsetSharedPtr create_rowset_with_variant(TabletSchemaSPtr tablet_schema, 
TabletSharedPtr tablet,
+                                               int64_t version, int64_t 
start_key,
+                                               const std::vector<std::string>& 
key_pool,
+                                               std::mt19937_64& rng, int64_t* 
import_elapsed_ms) {
+        RowsetWriterContext writer_context;
+        create_rowset_writer_context(tablet_schema, tablet->tablet_path(), 
version, &writer_context);
+
+        auto res = RowsetFactory::create_rowset_writer(*engine_ref, 
writer_context, true);
+        EXPECT_TRUE(res.has_value()) << res.error();
+        auto rowset_writer = std::move(res).value();
+
+        vectorized::Block block = tablet_schema->create_block();
+        auto columns = block.mutate_columns();
+        auto* variant_col = assert_cast<ColumnVariant*>(columns[1].get());
+        auto raw_json_column = ColumnString::create();
+        raw_json_column->reserve(kRowsPerSegment);
+        for (uint32_t i = 0; i < kRowsPerSegment; ++i) {
+            int32_t c1 = static_cast<int32_t>(start_key + i);
+            columns[0]->insert_data(reinterpret_cast<const char*>(&c1), 
sizeof(c1));
+
+            std::string json = generate_random_json(key_pool, rng, 
static_cast<uint32_t>(c1));
+            raw_json_column->insert_data(json.data(), json.size());
+        }
+
+        
variant_col->create_root(make_nullable(std::make_shared<DataTypeString>()),
+                                 std::move(raw_json_column));
+
+        auto import_start = std::chrono::steady_clock::now();
+        auto s = rowset_writer->add_block(&block);
+        EXPECT_TRUE(s.ok()) << s.to_json();
+        s = rowset_writer->flush();
+        EXPECT_TRUE(s.ok()) << s.to_json();
+
+        RowsetSharedPtr rowset;
+        EXPECT_EQ(Status::OK(), rowset_writer->build(rowset));
+        auto import_end = std::chrono::steady_clock::now();
+        if (import_elapsed_ms != nullptr) {
+            *import_elapsed_ms =
+                    
std::chrono::duration_cast<std::chrono::milliseconds>(import_end - import_start)
+                            .count();
+        }
+        EXPECT_EQ(1, rowset->rowset_meta()->num_segments());
+        EXPECT_EQ(kRowsPerSegment, rowset->rowset_meta()->num_rows());
+        const auto& fs = io::global_local_filesystem();
+        RowsetId rowset_id_cached;
+        rowset_id_cached.init(version + 1000);
+        auto seg_path = local_segment_path(tablet->tablet_path(), 
rowset_id_cached.to_string(), 0);
+        auto cache_seg_path = local_segment_path(cache_dir, 
rowset_id_cached.to_string(), 0);
+        bool cache_exists = false;
+        static_cast<void>(fs->exists(cache_seg_path, &cache_exists));
+        if (!cache_exists) {
+            static_cast<void>(fs->link_file(seg_path, cache_seg_path));

Review Comment:
   Filesystem calls here ignore returned `Status` (both `exists` and 
`link_file`). If either fails, the test will silently proceed with an 
unexpected state and fail later in a harder-to-debug way. Please check the 
returned status and assert/return on failure.
   ```suggestion
           Status cache_exist_st = fs->exists(cache_seg_path, &cache_exists);
           ASSERT_TRUE(cache_exist_st.ok()) << cache_exist_st.to_json();
           if (!cache_exists) {
               Status link_st = fs->link_file(seg_path, cache_seg_path);
               ASSERT_TRUE(link_st.ok()) << link_st.to_json();
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to