This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
commit e8aa5ee7d5681c677632060b9478d99128b83814 Author: lihangyu <[email protected]> AuthorDate: Fri Mar 8 13:41:24 2024 +0800 [Improve](Variant) support bloom filter for variant subcolumns (#31347) * [Improve](Variant) support bloom filter for variant subcolumns * rebase --- .../rowset/segment_v2/vertical_segment_writer.cpp | 4 +- be/src/olap/tablet_schema.cpp | 4 ++ be/src/olap/tablet_schema.h | 2 + be/src/vec/common/schema_util.cpp | 22 ++++++++--- .../data/variant_p0/with_index/bloom_filter.out | 9 +++++ .../suites/variant_github_events_p0/load.groovy | 2 +- .../variant_p0/with_index/bloom_filter.groovy | 44 ++++++++++++++++++++++ 7 files changed, 79 insertions(+), 8 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp index a5bea89b835..e755691f291 100644 --- a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp @@ -169,6 +169,9 @@ Status VerticalSegmentWriter::_create_column_writer(uint32_t cid, const TabletCo (column.is_extracted_column() && column.is_array_type())) { // variant and jsonb type skip write index opts.indexes.clear(); + opts.need_zone_map = false; + opts.need_bloom_filter = false; + opts.need_bitmap_index = false; } for (auto index : opts.indexes) { if (!skip_inverted_index && index && index->index_type() == IndexType::INVERTED) { @@ -194,7 +197,6 @@ Status VerticalSegmentWriter::_create_column_writer(uint32_t cid, const TabletCo CHECK_FIELD_TYPE(JSONB, "jsonb") CHECK_FIELD_TYPE(AGG_STATE, "agg_state") CHECK_FIELD_TYPE(MAP, "map") - CHECK_FIELD_TYPE(VARIANT, "variant") CHECK_FIELD_TYPE(OBJECT, "object") CHECK_FIELD_TYPE(HLL, "hll") CHECK_FIELD_TYPE(QUANTILE_STATE, "quantile_state") diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp index 804b4fc427a..6f097c96641 100644 --- a/be/src/olap/tablet_schema.cpp +++ b/be/src/olap/tablet_schema.cpp @@ -1192,6 +1192,10 @@ TabletColumn& TabletSchema::mutable_column_by_uid(int32_t col_unique_id) { return *_cols.at(_field_id_to_index.at(col_unique_id)); } +TabletColumn& TabletSchema::mutable_column(size_t ordinal) { + return *_cols.at(ordinal); +} + void TabletSchema::update_indexes_from_thrift(const std::vector<doris::TOlapTableIndex>& tindexes) { std::vector<TabletIndex> indexes; for (auto& tindex : tindexes) { diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h index 847f74038b9..5c36122ffc2 100644 --- a/be/src/olap/tablet_schema.h +++ b/be/src/olap/tablet_schema.h @@ -158,6 +158,7 @@ public: }; int32_t parent_unique_id() const { return _parent_col_unique_id; } void set_parent_unique_id(int32_t col_unique_id) { _parent_col_unique_id = col_unique_id; } + void set_is_bf_column(bool is_bf_column) { _is_bf_column = is_bf_column; } std::shared_ptr<const vectorized::IDataType> get_vec_type() const; void append_sparse_column(TabletColumn column); @@ -292,6 +293,7 @@ public: Status have_column(const std::string& field_name) const; const TabletColumn& column_by_uid(int32_t col_unique_id) const; TabletColumn& mutable_column_by_uid(int32_t col_unique_id); + TabletColumn& mutable_column(size_t ordinal); void replace_column(size_t pos, TabletColumn new_col); const std::vector<TabletColumnPtr>& columns() const; size_t num_columns() const { return _num_columns; } diff --git a/be/src/vec/common/schema_util.cpp b/be/src/vec/common/schema_util.cpp index 98148fa55bc..29167734d96 100644 --- a/be/src/vec/common/schema_util.cpp +++ b/be/src/vec/common/schema_util.cpp @@ -259,6 +259,7 @@ void update_least_schema_internal(const std::map<PathInData, DataTypes>& subcolu std::set<PathInData>* path_set = nullptr) { PathsInData tuple_paths; DataTypes tuple_types; + CHECK(common_schema.use_count() == 1); // Get the least common type for all paths. for (const auto& [key, subtypes] : subcolumns_types) { assert(!subtypes.empty()); @@ -379,22 +380,30 @@ void inherit_tablet_index(TabletSchemaSPtr& schema) { } // Add index meta if extracted column is missing index meta - for (const auto& col : schema->columns()) { - if (!col->is_extracted_column()) { + for (size_t i = 0; i < schema->num_columns(); ++i) { + TabletColumn& col = schema->mutable_column(i); + if (!col.is_extracted_column()) { continue; } - auto it = variants_index_meta.find(col->parent_unique_id()); + if (col.type() != FieldType::OLAP_FIELD_TYPE_TINYINT && + col.type() != FieldType::OLAP_FIELD_TYPE_ARRAY && + col.type() != FieldType::OLAP_FIELD_TYPE_DOUBLE && + col.type() != FieldType::OLAP_FIELD_TYPE_FLOAT) { + // above types are not supported in bf + col.set_is_bf_column(schema->column(col.parent_unique_id()).is_bf_column()); + } + auto it = variants_index_meta.find(col.parent_unique_id()); // variant has no index meta, ignore if (it == variants_index_meta.end()) { continue; } - auto index_meta = schema->get_inverted_index(*col); + auto index_meta = schema->get_inverted_index(col); // add index meta TabletIndex index_info = it->second; - index_info.set_escaped_escaped_index_suffix_path(col->path_info_ptr()->get_path()); + index_info.set_escaped_escaped_index_suffix_path(col.path_info_ptr()->get_path()); if (index_meta != nullptr) { // already exist - schema->update_index(*col, index_info); + schema->update_index(col, index_info); } else { schema->append_index(index_info); } @@ -599,6 +608,7 @@ static void _append_column(const TabletColumn& parent_variant, // If column already exist in original tablet schema, then we pick common type // and cast column to common type, and modify tablet column to common type, // otherwise it's a new column + CHECK(to_append.use_count() == 1); const std::string& column_name = parent_variant.name_lower_case() + "." + subcolumn->path.get_path(); const vectorized::DataTypePtr& final_data_type_from_object = diff --git a/regression-test/data/variant_p0/with_index/bloom_filter.out b/regression-test/data/variant_p0/with_index/bloom_filter.out new file mode 100644 index 00000000000..4eaab9d140b --- /dev/null +++ b/regression-test/data/variant_p0/with_index/bloom_filter.out @@ -0,0 +1,9 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +2 {"a":789111} +3 {"a":789111} + +-- !sql -- +2 {"b":"yyyyyyy"} +2 {"b":"yyyyyyy"} + diff --git a/regression-test/suites/variant_github_events_p0/load.groovy b/regression-test/suites/variant_github_events_p0/load.groovy index 4bde4400f44..e7485807c60 100644 --- a/regression-test/suites/variant_github_events_p0/load.groovy +++ b/regression-test/suites/variant_github_events_p0/load.groovy @@ -65,7 +65,7 @@ suite("regression_test_variant_github_events_p0", "nonConcurrent"){ ) DUPLICATE KEY(`k`) DISTRIBUTED BY HASH(k) BUCKETS 4 - properties("replication_num" = "1", "disable_auto_compaction" = "false"); + properties("replication_num" = "1", "disable_auto_compaction" = "false", "bloom_filter_columns" = "v"); """ set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1") // 2015 diff --git a/regression-test/suites/variant_p0/with_index/bloom_filter.groovy b/regression-test/suites/variant_p0/with_index/bloom_filter.groovy new file mode 100644 index 00000000000..b7f08609b87 --- /dev/null +++ b/regression-test/suites/variant_p0/with_index/bloom_filter.groovy @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("regression_test_variant_with_bf", ""){ + def table_name = "var_with_bloom_filter" + sql "DROP TABLE IF EXISTS var_with_bloom_filter" + sql """ + CREATE TABLE IF NOT EXISTS var_with_bloom_filter ( + k bigint, + v variant + ) + DUPLICATE KEY(`k`) + DISTRIBUTED BY HASH(k) BUCKETS 1 + properties("replication_num" = "1", "bloom_filter_columns" = "v"); + """ + sql """insert into ${table_name} values (1, '{"a" : 123456}')""" + sql """insert into ${table_name} values (2, '{"a" : 789111}')""" + sql """insert into ${table_name} values (3, '{"a" : 789111}')""" + + sql """insert into ${table_name} values (1, '{"b" : "xxxxxxx"}')""" + sql """insert into ${table_name} values (2, '{"b" : "yyyyyyy"}')""" + sql """insert into ${table_name} values (3, '{"b" : "zzzzzzz"}')""" + + sql """insert into ${table_name} values (1, '{"b" : "xxxxxxx"}')""" + sql """insert into ${table_name} values (2, '{"b" : "yyyyyyy"}')""" + sql """insert into ${table_name} values (3, '{"b" : "zzzzzzz"}')""" + + qt_sql "select * from var_with_bloom_filter where cast(v['a'] as int) = 789111" + qt_sql "select * from var_with_bloom_filter where cast(v['b'] as text) = 'yyyyyyy' "; +} \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
