This is an automated email from the ASF dual-hosted git repository. zhangchen pushed a commit to branch branch-1.2-lts in repository https://gitbox.apache.org/repos/asf/doris.git
commit 8ea3dfcd101ec60eee963a80a59180796ebfca1f Author: Xin Liao <[email protected]> AuthorDate: Wed May 31 09:49:15 2023 +0800 [Enhancement](merge-on-write) optimize bloom filter for primary key index (#20182) --- be/src/olap/primary_key_index.cpp | 8 ++- .../segment_v2/bloom_filter_index_writer.cpp | 59 +++++++++++++++++++++- .../rowset/segment_v2/bloom_filter_index_writer.h | 39 ++++++++++++++ 3 files changed, 103 insertions(+), 3 deletions(-) diff --git a/be/src/olap/primary_key_index.cpp b/be/src/olap/primary_key_index.cpp index 7c2c5fe16a..6d5dc0edd2 100644 --- a/be/src/olap/primary_key_index.cpp +++ b/be/src/olap/primary_key_index.cpp @@ -19,6 +19,9 @@ #include "common/config.h" #include "io/fs/file_reader.h" +#include "olap/olap_common.h" +#include "olap/rowset/segment_v2/bloom_filter_index_reader.h" +#include "olap/rowset/segment_v2/bloom_filter_index_writer.h" #include "olap/rowset/segment_v2/encoding_info.h" namespace doris { @@ -36,8 +39,9 @@ Status PrimaryKeyIndexBuilder::init() { new segment_v2::IndexedColumnWriter(options, type_info, _file_writer)); RETURN_IF_ERROR(_primary_key_index_builder->init()); - return segment_v2::BloomFilterIndexWriter::create(segment_v2::BloomFilterOptions(), type_info, - &_bloom_filter_index_builder); + _bloom_filter_index_builder.reset(new segment_v2::PrimaryKeyBloomFilterIndexWriterImpl( + segment_v2::BloomFilterOptions(), type_info)); + return Status::OK(); } Status PrimaryKeyIndexBuilder::add_item(const Slice& key) { diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp index 5542a6068d..c8f8cc8c50 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp @@ -25,7 +25,6 @@ #include "olap/rowset/segment_v2/encoding_info.h" #include "olap/rowset/segment_v2/indexed_column_writer.h" #include "olap/types.h" -#include "runtime/mem_pool.h" #include "util/faststring.h" #include "util/slice.h" @@ -170,6 +169,64 @@ private: } // namespace +void PrimaryKeyBloomFilterIndexWriterImpl::add_values(const void* values, size_t count) { + const Slice* v = (const Slice*)values; + for (int i = 0; i < count; ++i) { + Slice new_value; + _type_info->deep_copy(&new_value, v, &_pool); + _values.push_back(new_value); + ++v; + } +} + +Status PrimaryKeyBloomFilterIndexWriterImpl::flush() { + std::unique_ptr<BloomFilter> bf; + RETURN_IF_ERROR(BloomFilter::create(BLOCK_BLOOM_FILTER, &bf)); + RETURN_IF_ERROR(bf->init(_values.size(), _bf_options.fpp, _bf_options.strategy)); + bf->set_has_null(_has_null); + for (auto& v : _values) { + Slice* s = (Slice*)&v; + bf->add_bytes(s->data, s->size); + } + _bf_buffer_size += bf->size(); + _bfs.push_back(std::move(bf)); + _values.clear(); + _has_null = false; + return Status::OK(); +} + +Status PrimaryKeyBloomFilterIndexWriterImpl::finish(io::FileWriter* file_writer, + ColumnIndexMetaPB* index_meta) { + if (_values.size() > 0) { + RETURN_IF_ERROR(flush()); + } + index_meta->set_type(BLOOM_FILTER_INDEX); + BloomFilterIndexPB* meta = index_meta->mutable_bloom_filter_index(); + meta->set_hash_strategy(_bf_options.strategy); + meta->set_algorithm(BLOCK_BLOOM_FILTER); + + // write bloom filters + const auto* bf_type_info = get_scalar_type_info<FieldType::OLAP_FIELD_TYPE_VARCHAR>(); + IndexedColumnWriterOptions options; + options.write_ordinal_index = true; + options.write_value_index = false; + options.encoding = PLAIN_ENCODING; + IndexedColumnWriter bf_writer(options, bf_type_info, file_writer); + RETURN_IF_ERROR(bf_writer.init()); + for (auto& bf : _bfs) { + Slice data(bf->data(), bf->size()); + bf_writer.add(&data); + } + RETURN_IF_ERROR(bf_writer.finish(meta->mutable_bloom_filter())); + return Status::OK(); +} + +uint64_t PrimaryKeyBloomFilterIndexWriterImpl::size() { + uint64_t total_size = _bf_buffer_size; + total_size += _pool.total_allocated_bytes(); + return total_size; +} + // TODO currently we don't support bloom filter index for tinyint/hll/float/double Status BloomFilterIndexWriter::create(const BloomFilterOptions& bf_options, const TypeInfo* type_info, diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h index 8b9a945e1a..52df34ec68 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h @@ -23,6 +23,10 @@ #include "common/status.h" #include "gen_cpp/segment_v2.pb.h" #include "gutil/macros.h" +#include "olap/rowset/segment_v2/bloom_filter.h" +#include "runtime/mem_pool.h" +#include "util/slice.h" +#include "vec/common/arena.h" namespace doris { @@ -58,5 +62,40 @@ private: DISALLOW_COPY_AND_ASSIGN(BloomFilterIndexWriter); }; +// For unique key with merge on write, the data for each segment is deduplicated. +// Bloom filter doesn't need to use `set` for deduplication like +// `BloomFilterIndexWriterImpl`, so vector can be used to accelerate. +class PrimaryKeyBloomFilterIndexWriterImpl : public BloomFilterIndexWriter { +public: + explicit PrimaryKeyBloomFilterIndexWriterImpl(const BloomFilterOptions& bf_options, + const TypeInfo* type_info) + : _bf_options(bf_options), + _type_info(type_info), + _has_null(false), + _bf_buffer_size(0) {} + + ~PrimaryKeyBloomFilterIndexWriterImpl() override = default; + + void add_values(const void* values, size_t count) override; + + void add_nulls(uint32_t count) override { _has_null = true; } + + Status flush() override; + + Status finish(io::FileWriter* file_writer, ColumnIndexMetaPB* index_meta) override; + + uint64_t size() override; + +private: + BloomFilterOptions _bf_options; + const TypeInfo* _type_info; + MemPool _pool; + bool _has_null; + uint64_t _bf_buffer_size; + // distinct values + std::vector<Slice> _values; + std::vector<std::unique_ptr<BloomFilter>> _bfs; +}; + } // namespace segment_v2 } // namespace doris --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
