hanahmily commented on code in PR #665: URL: https://github.com/apache/skywalking-banyandb/pull/665#discussion_r2095878886
########## pkg/filter/bloom_filter.go: ########## @@ -0,0 +1,145 @@ +// Licensed to Apache Software Foundation (ASF) under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Apache Software Foundation (ASF) licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Package filter defines and implements data structures and interfaces for skipping index. +package filter + +import ( + "errors" + "hash" + "hash/fnv" + "math" + + "github.com/bits-and-blooms/bitset" +) + +// FalsePositiveRate is the default false positive rate for Bloom filter. +const FalsePositiveRate = 0.01 + +// BloomFilter is a probabilistic data structure designed to test whether an element is a member of a set. +type BloomFilter struct { + hashFunc hash.Hash64 + bits *bitset.BitSet + m uint64 + k uint +} + +// NewBloomFilter creates a new Bloom filter with the number of items n and false positive rate p. +func NewBloomFilter(n uint64, p float64) (*BloomFilter, error) { + if p <= 0 || p >= 1 { + return nil, errors.New("invalid false positive rate") + } + if n <= 0 { + return nil, errors.New("invalid number of items") + } + m := calculateM(n, p) + k := calculateK(n, m) + return &BloomFilter{ + m: m, + k: k, + bits: bitset.New(uint(m)), + hashFunc: fnv.New64a(), + }, nil +} + +func calculateM(n uint64, p float64) uint64 { + val := math.Ceil(-(float64(n) * math.Log(p)) / (math.Ln2 * math.Ln2)) + return uint64(math.Max(float64(val), 1)) +} + +func calculateK(n uint64, m uint64) uint { + val := math.Ceil((float64(m) / float64(n)) * math.Ln2) + return uint(math.Max(float64(val), 1)) +} + +// Add adds an item to the Bloom filter. +func (bf *BloomFilter) Add(item []byte) error { + hashes, err := bf.getHashes(item) + if err != nil { + return err + } + for _, h := range hashes { + bf.bits.Set(uint(h % bf.m)) + } + return nil +} + +// MightContain checks if an item might be in the Bloom filter. +func (bf *BloomFilter) MightContain(item []byte) (bool, error) { + hashes, err := bf.getHashes(item) + if err != nil { + return false, err + } + for _, h := range hashes { + if !bf.bits.Test(uint(h % bf.m)) { + return false, nil + } + } + return true, nil +} + +func (bf *BloomFilter) getHashes(item []byte) ([]uint64, error) { + bf.hashFunc.Reset() + _, err := bf.hashFunc.Write(item) + if err != nil { + return nil, err + } + h1 := bf.hashFunc.Sum64() + + h2 := h1 >> 32 + + hashes := make([]uint64, bf.k) + for i := uint(0); i < bf.k; i++ { + hashes[i] = h1 + uint64(i)*h2 + } + return hashes, nil +} + +// M returns the size of the Bloom filter. +func (bf *BloomFilter) M() uint64 { + return bf.m +} + +// K returns the number of hash functions. +func (bf *BloomFilter) K() uint { + return bf.k +} + +// Bits returns the underlying bitset. +func (bf *BloomFilter) Bits() *bitset.BitSet { + return bf.bits +} + +// SetM sets the size of the Bloom filter. +func (bf *BloomFilter) SetM(m uint64) { + bf.m = m +} + +// SetK sets the number of hash functions. +func (bf *BloomFilter) SetK(k uint) { + bf.k = k +} + +// SetBits sets the underlying bitset. +func (bf *BloomFilter) SetBits(bits *bitset.BitSet) { + bf.bits = bits +} + +// SetHashFunc sets the hash function used by the Bloom filter. +func (bf *BloomFilter) SetHashFunc(hashFunc hash.Hash64) { + bf.hashFunc = hashFunc +} Review Comment: We don't need these functions. ########## pkg/filter/bloom_filter.go: ########## @@ -0,0 +1,145 @@ +// Licensed to Apache Software Foundation (ASF) under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Apache Software Foundation (ASF) licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Package filter defines and implements data structures and interfaces for skipping index. +package filter + +import ( + "errors" + "hash" + "hash/fnv" + "math" + + "github.com/bits-and-blooms/bitset" +) + +// FalsePositiveRate is the default false positive rate for Bloom filter. +const FalsePositiveRate = 0.01 + +// BloomFilter is a probabilistic data structure designed to test whether an element is a member of a set. +type BloomFilter struct { + hashFunc hash.Hash64 + bits *bitset.BitSet + m uint64 + k uint +} + +// NewBloomFilter creates a new Bloom filter with the number of items n and false positive rate p. +func NewBloomFilter(n uint64, p float64) (*BloomFilter, error) { + if p <= 0 || p >= 1 { + return nil, errors.New("invalid false positive rate") + } + if n <= 0 { + return nil, errors.New("invalid number of items") + } + m := calculateM(n, p) + k := calculateK(n, m) Review Comment: The modern bloom filter typically uses fixed values for bits per item (b) and the number of hash functions (k) to derive the probability of a false positive (p). The relationship can be expressed with the following formula: ``` p ≈ (1 - e^(-k/b))^k ``` If we want the probability (p) to equal 1%, suitable values for b and k would be 10 and 7, respectively. Consequently, the total number of bits (m) can be calculated as m = b* n . This approach allows us to avoid the CPU overhead associated with floating-point calculations. ########## pkg/filter/bloom_filter.go: ########## @@ -0,0 +1,145 @@ +// Licensed to Apache Software Foundation (ASF) under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Apache Software Foundation (ASF) licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Package filter defines and implements data structures and interfaces for skipping index. +package filter + +import ( + "errors" + "hash" + "hash/fnv" + "math" + + "github.com/bits-and-blooms/bitset" +) + +// FalsePositiveRate is the default false positive rate for Bloom filter. +const FalsePositiveRate = 0.01 + +// BloomFilter is a probabilistic data structure designed to test whether an element is a member of a set. +type BloomFilter struct { + hashFunc hash.Hash64 + bits *bitset.BitSet Review Comment: Can you use []uint64 instead of BitSet? ########## pkg/encoding/bloom_filter.go: ########## Review Comment: Please move this file into stream package. ########## pkg/encoding/bloom_filter.go: ########## @@ -0,0 +1,100 @@ +// Licensed to Apache Software Foundation (ASF) under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Apache Software Foundation (ASF) licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package encoding + +import ( + "bytes" + "encoding/binary" + "fmt" + "hash/fnv" + + "github.com/bits-and-blooms/bitset" + + "github.com/apache/skywalking-banyandb/pkg/filter" +) + +// BloomFilterToBytes encodes a Bloom filter to bytes. +func BloomFilterToBytes(bf *filter.BloomFilter) ([]byte, error) { + var buf bytes.Buffer Review Comment: Accept the Buf from outside instead of allocating a new one. ########## pkg/filter/bloom_filter.go: ########## @@ -0,0 +1,145 @@ +// Licensed to Apache Software Foundation (ASF) under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Apache Software Foundation (ASF) licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Package filter defines and implements data structures and interfaces for skipping index. +package filter + +import ( + "errors" + "hash" + "hash/fnv" + "math" + + "github.com/bits-and-blooms/bitset" +) + +// FalsePositiveRate is the default false positive rate for Bloom filter. +const FalsePositiveRate = 0.01 + +// BloomFilter is a probabilistic data structure designed to test whether an element is a member of a set. +type BloomFilter struct { + hashFunc hash.Hash64 + bits *bitset.BitSet + m uint64 + k uint +} + +// NewBloomFilter creates a new Bloom filter with the number of items n and false positive rate p. +func NewBloomFilter(n uint64, p float64) (*BloomFilter, error) { + if p <= 0 || p >= 1 { + return nil, errors.New("invalid false positive rate") + } + if n <= 0 { + return nil, errors.New("invalid number of items") + } + m := calculateM(n, p) + k := calculateK(n, m) + return &BloomFilter{ + m: m, + k: k, + bits: bitset.New(uint(m)), + hashFunc: fnv.New64a(), Review Comment: [xxHash](github.com/cespare/xxhash/v2) is significantly faster than FNV (often 2-5x speedup) while maintaining excellent distribution qualities. and you should add a benchmark to verify the performance of "Add" and "MightContain" ########## banyand/stream/tag.go: ########## @@ -68,6 +73,22 @@ func (t *tag) mustWriteTo(tm *tagMetadata, tagWriter *writer) { } tm.offset = tagWriter.bytesWritten tagWriter.MustWrite(bb.Buf) + + if t.filter != nil { + bb.Reset() + var err error + bb.Buf, err = encoding.BloomFilterToBytes(t.filter) Review Comment: ```suggestion bb.Buf, err = encoding.BloomFilterToBytes(bb.Buf[:0], t.filter) ``` You should pass bb.Buf into this function to avoid object allocation. ########## pkg/filter/bloom_filter.go: ########## @@ -0,0 +1,145 @@ +// Licensed to Apache Software Foundation (ASF) under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Apache Software Foundation (ASF) licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Package filter defines and implements data structures and interfaces for skipping index. +package filter + +import ( + "errors" + "hash" + "hash/fnv" + "math" + + "github.com/bits-and-blooms/bitset" +) + +// FalsePositiveRate is the default false positive rate for Bloom filter. +const FalsePositiveRate = 0.01 + +// BloomFilter is a probabilistic data structure designed to test whether an element is a member of a set. +type BloomFilter struct { Review Comment: You should add a pool to cache BloomFilter in stream package. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: notifications-unsubscr...@skywalking.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org