Copilot commented on code in PR #60511: URL: https://github.com/apache/doris/pull/60511#discussion_r2777247273
########## be/test/olap/variant_doc_mode_compaction_test.cpp: ########## @@ -0,0 +1,472 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gen_cpp/Types_types.h> +#include <glog/logging.h> +#include <gtest/gtest.h> +#include <stdint.h> +#include <unistd.h> + +#include <algorithm> +#include <array> +#include <chrono> +#include <iostream> +#include <memory> +#include <random> +#include <string> +#include <unordered_map> +#include <unordered_set> +#include <utility> +#include <vector> + +#include "common/config.h" +#include "common/status.h" +#include "io/fs/local_file_system.h" +#include "olap/cumulative_compaction.h" +#include "olap/data_dir.h" +#include "olap/olap_common.h" +#include "olap/olap_define.h" +#include "olap/options.h" +#include "olap/rowset/beta_rowset.h" +#include "olap/rowset/rowset.h" +#include "olap/rowset/rowset_factory.h" +#include "olap/rowset/rowset_reader.h" +#include "olap/rowset/rowset_reader_context.h" +#include "olap/rowset/rowset_writer.h" +#include "olap/rowset/rowset_writer_context.h" +#include "olap/rowset/segment_v2/index_writer.h" +#include "olap/schema.h" +#include "olap/storage_engine.h" +#include "olap/tablet.h" +#include "olap/tablet_meta.h" +#include "olap/tablet_schema.h" +#include "runtime/exec_env.h" +#include "testutil/variant_util.h" +#include "vec/columns/column.h" +#include "vec/columns/column_string.h" +#include "vec/columns/column_variant.h" +#include "vec/core/block.h" +#include "vec/core/field.h" +#include "vec/data_types/data_type_string.h" + +namespace doris { +namespace vectorized { +using namespace ErrorCode; + +static constexpr uint32_t MAX_PATH_LEN = 1024; +static StorageEngine* engine_ref = nullptr; +static constexpr int32_t kRowsPerSegment = 10000; + +class VariantDocModeCompactionTest : public ::testing::Test { +protected: + void SetUp() override { + char buffer[MAX_PATH_LEN]; + ASSERT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr); + absolute_dir = std::string(buffer) + std::string(kTestDir); + Status st; + bool exists = false; + ASSERT_TRUE(io::global_local_filesystem()->exists(absolute_dir, &exists).ok()); + if (!exists) { + st = io::global_local_filesystem()->create_directory(absolute_dir); + ASSERT_TRUE(st.ok()) << st.to_string(); + } + ASSERT_TRUE(io::global_local_filesystem() + ->create_directory(absolute_dir + "/tablet_path") + .ok()); + cache_dir = std::string(buffer) + "/ut_dir/variant_doc_mode_compaction_test_cache"; + ASSERT_TRUE(io::global_local_filesystem()->create_directory(cache_dir).ok()); + + ASSERT_TRUE(io::global_local_filesystem()->delete_directory(std::string(tmp_dir)).ok()); + ASSERT_TRUE(io::global_local_filesystem()->create_directory(std::string(tmp_dir)).ok()); + std::vector<StorePath> paths; + paths.emplace_back(std::string(tmp_dir), 1024000000); + auto tmp_file_dirs = std::make_unique<segment_v2::TmpFileDirs>(paths); + st = tmp_file_dirs->init(); + ASSERT_TRUE(st.ok()) << st.to_json(); + ExecEnv::GetInstance()->set_tmp_file_dir(std::move(tmp_file_dirs)); + + EngineOptions options; + auto engine = std::make_unique<StorageEngine>(options); + engine_ref = engine.get(); + _data_dir = std::make_unique<DataDir>(*engine_ref, absolute_dir); + ASSERT_TRUE(_data_dir->init(true).ok()); + ExecEnv::GetInstance()->set_storage_engine(std::move(engine)); + + config::enable_ordered_data_compaction = false; + config::segments_key_bounds_truncation_threshold = -1; + config::enable_vertical_compact_variant_subcolumns = true; + config::enable_compaction_checksum = false; + config::enable_stacktrace = true; + } + + void TearDown() override { + // ASSERT_TRUE(io::global_local_filesystem()->delete_directory(absolute_dir).ok()); + engine_ref = nullptr; + ExecEnv::GetInstance()->set_storage_engine(nullptr); + } + + TabletSchemaSPtr create_schema() { + TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>(); + TabletSchemaPB tablet_schema_pb; + tablet_schema_pb.set_keys_type(KeysType::DUP_KEYS); + tablet_schema_pb.set_num_short_key_columns(1); + tablet_schema_pb.set_num_rows_per_row_block(1024); + tablet_schema_pb.set_compress_kind(COMPRESS_NONE); + tablet_schema_pb.set_next_column_unique_id(3); + + ColumnPB* column_1 = tablet_schema_pb.add_column(); + column_1->set_unique_id(1); + column_1->set_name("c1"); + column_1->set_type("INT"); + column_1->set_is_key(true); + column_1->set_length(4); + column_1->set_index_length(4); + column_1->set_is_nullable(false); + column_1->set_is_bf_column(false); + + ColumnPB* column_2 = tablet_schema_pb.add_column(); + column_2->set_unique_id(2); + column_2->set_name("v"); + column_2->set_type("VARIANT"); + column_2->set_is_key(false); + column_2->set_is_nullable(false); + column_2->set_variant_max_subcolumns_count(0); + column_2->set_variant_max_sparse_column_statistics_size(10000); + column_2->set_variant_sparse_hash_shard_count(32); + column_2->set_variant_enable_doc_mode(true); + column_2->set_variant_doc_materialization_min_rows(kRowsPerSegment); + + tablet_schema->init_from_pb(tablet_schema_pb); + return tablet_schema; + } + + TabletSharedPtr create_tablet(const TabletSchema& tablet_schema) { + std::vector<TColumn> cols; + std::unordered_map<uint32_t, uint32_t> col_ordinal_to_unique_id; + for (int i = 0; i < tablet_schema.num_columns(); ++i) { + const TabletColumn& column = tablet_schema.column(i); + TColumn col; + if (column.type() == FieldType::OLAP_FIELD_TYPE_VARIANT) { + col.column_type.type = TPrimitiveType::VARIANT; + } else { + col.column_type.type = TPrimitiveType::INT; + } + col.__set_column_name(column.name()); + col.__set_is_key(column.is_key()); + cols.push_back(col); + col_ordinal_to_unique_id[i] = column.unique_id(); + } + + TTabletSchema t_tablet_schema; + t_tablet_schema.__set_short_key_column_count(tablet_schema.num_short_key_columns()); + t_tablet_schema.__set_schema_hash(3333); + t_tablet_schema.__set_keys_type(TKeysType::DUP_KEYS); + t_tablet_schema.__set_storage_type(TStorageType::COLUMN); + t_tablet_schema.__set_columns(cols); + + TabletMetaSharedPtr tablet_meta(new TabletMeta( + 2, 2, 2, 2, 2, 2, t_tablet_schema, 2, col_ordinal_to_unique_id, UniqueId(1, 2), + TTabletType::TABLET_TYPE_DISK, TCompressionType::LZ4F, 0, false)); + + TabletSharedPtr tablet(new Tablet(*engine_ref, tablet_meta, _data_dir.get())); + static_cast<void>(tablet->init()); Review Comment: `tablet->init()` return value is ignored; if initialization fails, later steps will operate on a partially initialized tablet and the failure will be hard to diagnose. Please assert/expect `tablet->init().ok()` (or propagate `Status`). ```suggestion ASSERT_TRUE(tablet->init().ok()); ``` ########## be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp: ########## @@ -422,24 +668,45 @@ Status VariantDocWriter::append_data(const TabletColumn* parent_column, auto& map_col = assert_cast<vectorized::ColumnMap&>(*tmp_maps[b]); map_col.get_offsets().reserve(num_rows); } + std::vector<vectorized::ColumnString*> bucket_keys(_bucket_num); + std::vector<vectorized::ColumnString*> bucket_values(_bucket_num); + std::vector<vectorized::ColumnArray::Offsets64*> bucket_offsets(_bucket_num); + for (int b = 0; b < _bucket_num; ++b) { + auto& m = assert_cast<vectorized::ColumnMap&>(*tmp_maps[b]); + bucket_keys[b] = &assert_cast<vectorized::ColumnString&>(m.get_keys()); + bucket_values[b] = &assert_cast<vectorized::ColumnString&>(m.get_values()); + bucket_offsets[b] = &m.get_offsets(); + } - std::vector<std::unordered_map<StringRef, uint32_t>> bucket_path_counts(_bucket_num); + std::vector<phmap::flat_hash_map<StringRef, uint32_t, StringRefHash>> bucket_path_counts( + _bucket_num); + const size_t limit = parent_column->variant_max_sparse_column_statistics_size(); + for (int b = 0; b < _bucket_num; ++b) { + bucket_path_counts[b].reserve(std::min<size_t>(limit, 1024)); + } + phmap::flat_hash_map<StringRef, uint32_t, StringRefHash> path_to_bucket; + path_to_bucket.reserve(std::min<size_t>(paths_col->size(), 8192)); for (size_t row = 0; row < num_rows; ++row) { - const ssize_t srow = static_cast<ssize_t>(row); - size_t start = offsets[srow - 1]; - size_t end = offsets[srow]; + const size_t start = offsets[row - 1]; Review Comment: `offsets[row - 1]` with `row` as `size_t` relies on unsigned underflow + implementation-defined conversion to reach the padded `-1` element. Please use an explicit `row == 0 ? 0 : offsets[row - 1]` to avoid fragile behavior (especially under sanitizers). ```suggestion const size_t start = (row == 0 ? 0 : offsets[row - 1]); ``` ########## be/test/olap/rowset/segment_v2/column_reader_writer_test.cpp: ########## @@ -436,6 +436,51 @@ TEST_F(ColumnReaderWriterTest, test_array_type) { delete[] array_is_null; } +TEST_F(ColumnReaderWriterTest, test_array_append_nulls) { + ColumnMetaPB meta; + TabletColumn list_column(OLAP_FIELD_AGGREGATION_NONE, FieldType::OLAP_FIELD_TYPE_ARRAY); + TabletColumn item_column(OLAP_FIELD_AGGREGATION_NONE, FieldType::OLAP_FIELD_TYPE_TINYINT, true); + list_column.add_sub_column(item_column); + + std::string fname = TEST_DIR + "/array_append_nulls"; + auto fs = io::global_local_filesystem(); + io::FileWriterPtr file_writer; + Status st = fs->create_file(fname, &file_writer); + ASSERT_TRUE(st.ok()) << st; + + ColumnWriterOptions writer_opts; + writer_opts.meta = &meta; + writer_opts.meta->set_column_id(0); + writer_opts.meta->set_unique_id(0); + writer_opts.meta->set_type(FieldType::OLAP_FIELD_TYPE_ARRAY); + writer_opts.meta->set_length(0); + writer_opts.meta->set_encoding(BIT_SHUFFLE); + writer_opts.meta->set_compression(segment_v2::CompressionTypePB::LZ4F); + writer_opts.meta->set_is_nullable(true); + + ColumnMetaPB* child_meta = meta.add_children_columns(); + child_meta->set_column_id(1); + child_meta->set_unique_id(1); + child_meta->set_type(FieldType::OLAP_FIELD_TYPE_TINYINT); + child_meta->set_length(0); + child_meta->set_encoding(BIT_SHUFFLE); + child_meta->set_compression(segment_v2::CompressionTypePB::LZ4F); + child_meta->set_is_nullable(true); + + std::unique_ptr<ColumnWriter> writer; + ColumnWriter::create(writer_opts, &list_column, file_writer.get(), &writer); Review Comment: The return `Status` from `ColumnWriter::create(...)` is ignored; if creation fails, `writer` may be null and `writer->init()` will crash. Please capture and assert the returned status (and/or `ASSERT_NE(writer, nullptr)`). ```suggestion st = ColumnWriter::create(writer_opts, &list_column, file_writer.get(), &writer); ASSERT_TRUE(st.ok()) << st; ASSERT_NE(writer, nullptr); ``` ########## be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp: ########## @@ -159,6 +161,236 @@ Status convert_and_write_column(vectorized::OlapBlockDataConvertor* converter, return Status::OK(); } +namespace { +// Per-path sparse materialization result from a VARIANT doc-value column. +// - subcolumn stores the decoded values for rows listed in rowids (dense, no gaps). +// - rowids records which input rows have a non-null value for this path. +// - non_null_count is used to build variant statistics quickly. +struct SubcolumnSparseData { + vectorized::ColumnVariant::Subcolumn subcolumn {0, true, false}; + std::vector<uint32_t> rowids; + uint32_t non_null_count = 0; +}; + +using DocSparseSubcolumns = phmap::flat_hash_map<StringRef, SubcolumnSparseData, StringRefHash>; +using DocValuePathStats = phmap::flat_hash_map<StringRef, uint32_t, StringRefHash>; + +struct SubcolumnWriteEntry { + std::string_view path; + vectorized::ColumnVariant::Subcolumn* subcolumn = nullptr; + // nullptr means dense materialization; otherwise sparse row ids for this path. + const std::vector<uint32_t>* rowids = nullptr; +}; + +struct SubcolumnWritePlan { + using DenseSubcolumns = + phmap::flat_hash_map<std::string_view, vectorized::ColumnVariant::Subcolumn>; + + // Owns materialized subcolumns and a flattened iteration view for finalize(). + DocSparseSubcolumns sparse_subcolumns; + DenseSubcolumns dense_subcolumns; + std::vector<SubcolumnWriteEntry> entries; + DocValuePathStats stats; +}; + +// Build per-path non-null counts from the serialized doc-value representation. +void build_doc_value_stats(const vectorized::ColumnVariant& variant, DocValuePathStats* stats) { + auto [column_key, column_value] = variant.get_doc_value_data_paths_and_values(); + (void)column_value; + const auto& column_offsets = variant.serialized_doc_value_column_offsets(); + const size_t num_rows = column_offsets.size(); + + stats->clear(); + stats->reserve(column_key->size()); + for (size_t row = 0; row < num_rows; ++row) { + const size_t start = column_offsets[row - 1]; + const size_t end = column_offsets[row]; + for (size_t j = start; j < end; ++j) { + const StringRef path = column_key->get_data_at(j); + auto it = stats->try_emplace(path, 0).first; + ++it->second; + } + } +} + +// Materialize sparse subcolumns for each path and build per-path non-null counts. +// For each row, we decode only present (path, value) pairs and append them to the +// corresponding subcolumn, while recording the row id to allow gap filling later. +void build_sparse_subcolumns_and_stats(const vectorized::ColumnVariant& variant, + DocSparseSubcolumns* subcolumns, DocValuePathStats* stats) { + auto [column_key, column_value] = variant.get_doc_value_data_paths_and_values(); + const auto& column_offsets = variant.serialized_doc_value_column_offsets(); + const size_t num_rows = column_offsets.size(); + + subcolumns->clear(); + stats->clear(); + subcolumns->reserve(column_key->size()); + + for (size_t row = 0; row < num_rows; ++row) { + const size_t start = column_offsets[row - 1]; + const size_t end = column_offsets[row]; + for (size_t i = start; i < end; ++i) { Review Comment: Same `size_t row - 1` underflow/implementation-defined conversion issue as above. When `row == 0`, compute `start` explicitly as 0 instead of relying on unsigned underflow to become `-1` after conversion. ########## be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp: ########## @@ -159,6 +161,236 @@ Status convert_and_write_column(vectorized::OlapBlockDataConvertor* converter, return Status::OK(); } +namespace { +// Per-path sparse materialization result from a VARIANT doc-value column. +// - subcolumn stores the decoded values for rows listed in rowids (dense, no gaps). +// - rowids records which input rows have a non-null value for this path. +// - non_null_count is used to build variant statistics quickly. +struct SubcolumnSparseData { + vectorized::ColumnVariant::Subcolumn subcolumn {0, true, false}; + std::vector<uint32_t> rowids; + uint32_t non_null_count = 0; +}; + +using DocSparseSubcolumns = phmap::flat_hash_map<StringRef, SubcolumnSparseData, StringRefHash>; +using DocValuePathStats = phmap::flat_hash_map<StringRef, uint32_t, StringRefHash>; + +struct SubcolumnWriteEntry { + std::string_view path; + vectorized::ColumnVariant::Subcolumn* subcolumn = nullptr; + // nullptr means dense materialization; otherwise sparse row ids for this path. + const std::vector<uint32_t>* rowids = nullptr; +}; + +struct SubcolumnWritePlan { + using DenseSubcolumns = + phmap::flat_hash_map<std::string_view, vectorized::ColumnVariant::Subcolumn>; + + // Owns materialized subcolumns and a flattened iteration view for finalize(). + DocSparseSubcolumns sparse_subcolumns; + DenseSubcolumns dense_subcolumns; + std::vector<SubcolumnWriteEntry> entries; + DocValuePathStats stats; +}; + +// Build per-path non-null counts from the serialized doc-value representation. +void build_doc_value_stats(const vectorized::ColumnVariant& variant, DocValuePathStats* stats) { + auto [column_key, column_value] = variant.get_doc_value_data_paths_and_values(); + (void)column_value; + const auto& column_offsets = variant.serialized_doc_value_column_offsets(); + const size_t num_rows = column_offsets.size(); + + stats->clear(); + stats->reserve(column_key->size()); + for (size_t row = 0; row < num_rows; ++row) { + const size_t start = column_offsets[row - 1]; + const size_t end = column_offsets[row]; + for (size_t j = start; j < end; ++j) { Review Comment: `row` is `size_t`, so `row - 1` underflows when `row == 0`. The subsequent implicit conversion to `ssize_t` for `PaddedPODArray::operator[]` is implementation-defined; it usually becomes `-1`, but this is fragile and can break under sanitizers/other platforms. Please mirror the safer pattern used elsewhere in this PR: `start = (row == 0) ? 0 : column_offsets[row - 1]`. ########## be/src/olap/rowset/segment_v2/variant/variant_column_writer_impl.cpp: ########## @@ -335,19 +575,38 @@ Status UnifiedSparseColumnWriter::append_bucket_sparse( auto& m = assert_cast<vectorized::ColumnMap&>(*tmp_maps[b]); m.get_offsets().reserve(num_rows); } - for (ssize_t row = 0; row < static_cast<ssize_t>(num_rows); ++row) { - size_t start = offsets[row - 1]; - size_t end = offsets[row]; + std::vector<vectorized::ColumnString*> bucket_keys(bucket_num); + std::vector<vectorized::ColumnString*> bucket_values(bucket_num); + std::vector<vectorized::ColumnArray::Offsets64*> bucket_offsets(bucket_num); + for (int b = 0; b < bucket_num; ++b) { + auto& m = assert_cast<vectorized::ColumnMap&>(*tmp_maps[b]); + bucket_keys[b] = &assert_cast<vectorized::ColumnString&>(m.get_keys()); + bucket_values[b] = &assert_cast<vectorized::ColumnString&>(m.get_values()); + bucket_offsets[b] = &m.get_offsets(); + } + for (size_t row = 0; row < num_rows; ++row) { + const size_t start = offsets[row - 1]; Review Comment: Same `size_t row - 1` underflow / implementation-defined conversion pattern as in the new doc-value helpers: `offsets[row - 1]` relies on unsigned underflow becoming `-1` after conversion to `ssize_t` for `PaddedPODArray::operator[]`. Please make this explicit with `start = (row == 0) ? 0 : offsets[row - 1]` for clarity and portability. ```suggestion const size_t start = (row == 0) ? 0 : offsets[row - 1]; ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
