hanahmily commented on code in PR #665:
URL: 
https://github.com/apache/skywalking-banyandb/pull/665#discussion_r2095878886


##########
pkg/filter/bloom_filter.go:
##########
@@ -0,0 +1,145 @@
+// Licensed to Apache Software Foundation (ASF) under one or more contributor
+// license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright
+// ownership. Apache Software Foundation (ASF) licenses this file to you under
+// the Apache License, Version 2.0 (the "License"); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Package filter defines and implements data structures and interfaces for 
skipping index.
+package filter
+
+import (
+       "errors"
+       "hash"
+       "hash/fnv"
+       "math"
+
+       "github.com/bits-and-blooms/bitset"
+)
+
+// FalsePositiveRate is the default false positive rate for Bloom filter.
+const FalsePositiveRate = 0.01
+
+// BloomFilter is a probabilistic data structure designed to test whether an 
element is a member of a set.
+type BloomFilter struct {
+       hashFunc hash.Hash64
+       bits     *bitset.BitSet
+       m        uint64
+       k        uint
+}
+
+// NewBloomFilter creates a new Bloom filter with the number of items n and 
false positive rate p.
+func NewBloomFilter(n uint64, p float64) (*BloomFilter, error) {
+       if p <= 0 || p >= 1 {
+               return nil, errors.New("invalid false positive rate")
+       }
+       if n <= 0 {
+               return nil, errors.New("invalid number of items")
+       }
+       m := calculateM(n, p)
+       k := calculateK(n, m)
+       return &BloomFilter{
+               m:        m,
+               k:        k,
+               bits:     bitset.New(uint(m)),
+               hashFunc: fnv.New64a(),
+       }, nil
+}
+
+func calculateM(n uint64, p float64) uint64 {
+       val := math.Ceil(-(float64(n) * math.Log(p)) / (math.Ln2 * math.Ln2))
+       return uint64(math.Max(float64(val), 1))
+}
+
+func calculateK(n uint64, m uint64) uint {
+       val := math.Ceil((float64(m) / float64(n)) * math.Ln2)
+       return uint(math.Max(float64(val), 1))
+}
+
+// Add adds an item to the Bloom filter.
+func (bf *BloomFilter) Add(item []byte) error {
+       hashes, err := bf.getHashes(item)
+       if err != nil {
+               return err
+       }
+       for _, h := range hashes {
+               bf.bits.Set(uint(h % bf.m))
+       }
+       return nil
+}
+
+// MightContain checks if an item might be in the Bloom filter.
+func (bf *BloomFilter) MightContain(item []byte) (bool, error) {
+       hashes, err := bf.getHashes(item)
+       if err != nil {
+               return false, err
+       }
+       for _, h := range hashes {
+               if !bf.bits.Test(uint(h % bf.m)) {
+                       return false, nil
+               }
+       }
+       return true, nil
+}
+
+func (bf *BloomFilter) getHashes(item []byte) ([]uint64, error) {
+       bf.hashFunc.Reset()
+       _, err := bf.hashFunc.Write(item)
+       if err != nil {
+               return nil, err
+       }
+       h1 := bf.hashFunc.Sum64()
+
+       h2 := h1 >> 32
+
+       hashes := make([]uint64, bf.k)
+       for i := uint(0); i < bf.k; i++ {
+               hashes[i] = h1 + uint64(i)*h2
+       }
+       return hashes, nil
+}
+
+// M returns the size of the Bloom filter.
+func (bf *BloomFilter) M() uint64 {
+       return bf.m
+}
+
+// K returns the number of hash functions.
+func (bf *BloomFilter) K() uint {
+       return bf.k
+}
+
+// Bits returns the underlying bitset.
+func (bf *BloomFilter) Bits() *bitset.BitSet {
+       return bf.bits
+}
+
+// SetM sets the size of the Bloom filter.
+func (bf *BloomFilter) SetM(m uint64) {
+       bf.m = m
+}
+
+// SetK sets the number of hash functions.
+func (bf *BloomFilter) SetK(k uint) {
+       bf.k = k
+}
+
+// SetBits sets the underlying bitset.
+func (bf *BloomFilter) SetBits(bits *bitset.BitSet) {
+       bf.bits = bits
+}
+
+// SetHashFunc sets the hash function used by the Bloom filter.
+func (bf *BloomFilter) SetHashFunc(hashFunc hash.Hash64) {
+       bf.hashFunc = hashFunc
+}

Review Comment:
   We don't need these functions.



##########
pkg/filter/bloom_filter.go:
##########
@@ -0,0 +1,145 @@
+// Licensed to Apache Software Foundation (ASF) under one or more contributor
+// license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright
+// ownership. Apache Software Foundation (ASF) licenses this file to you under
+// the Apache License, Version 2.0 (the "License"); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Package filter defines and implements data structures and interfaces for 
skipping index.
+package filter
+
+import (
+       "errors"
+       "hash"
+       "hash/fnv"
+       "math"
+
+       "github.com/bits-and-blooms/bitset"
+)
+
+// FalsePositiveRate is the default false positive rate for Bloom filter.
+const FalsePositiveRate = 0.01
+
+// BloomFilter is a probabilistic data structure designed to test whether an 
element is a member of a set.
+type BloomFilter struct {
+       hashFunc hash.Hash64
+       bits     *bitset.BitSet
+       m        uint64
+       k        uint
+}
+
+// NewBloomFilter creates a new Bloom filter with the number of items n and 
false positive rate p.
+func NewBloomFilter(n uint64, p float64) (*BloomFilter, error) {
+       if p <= 0 || p >= 1 {
+               return nil, errors.New("invalid false positive rate")
+       }
+       if n <= 0 {
+               return nil, errors.New("invalid number of items")
+       }
+       m := calculateM(n, p)
+       k := calculateK(n, m)

Review Comment:
   The modern bloom filter typically uses fixed values for bits per item (b) 
and the number of hash functions (k) to derive the probability of a false 
positive (p). The relationship can be expressed with the following formula:
   
   ```
   p ≈ (1 - e^(-k/b))^k
   ```
   
   If we want the probability (p) to equal 1%, suitable values for b and k 
would be 10 and 7, respectively. Consequently, the total number of bits (m) can 
be calculated as m = b* n . This approach allows us to avoid the CPU overhead 
associated with floating-point calculations.



##########
pkg/filter/bloom_filter.go:
##########
@@ -0,0 +1,145 @@
+// Licensed to Apache Software Foundation (ASF) under one or more contributor
+// license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright
+// ownership. Apache Software Foundation (ASF) licenses this file to you under
+// the Apache License, Version 2.0 (the "License"); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Package filter defines and implements data structures and interfaces for 
skipping index.
+package filter
+
+import (
+       "errors"
+       "hash"
+       "hash/fnv"
+       "math"
+
+       "github.com/bits-and-blooms/bitset"
+)
+
+// FalsePositiveRate is the default false positive rate for Bloom filter.
+const FalsePositiveRate = 0.01
+
+// BloomFilter is a probabilistic data structure designed to test whether an 
element is a member of a set.
+type BloomFilter struct {
+       hashFunc hash.Hash64
+       bits     *bitset.BitSet

Review Comment:
   Can you use []uint64 instead of BitSet?



##########
pkg/encoding/bloom_filter.go:
##########


Review Comment:
   Please move this file into stream package.



##########
pkg/encoding/bloom_filter.go:
##########
@@ -0,0 +1,100 @@
+// Licensed to Apache Software Foundation (ASF) under one or more contributor
+// license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright
+// ownership. Apache Software Foundation (ASF) licenses this file to you under
+// the Apache License, Version 2.0 (the "License"); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package encoding
+
+import (
+       "bytes"
+       "encoding/binary"
+       "fmt"
+       "hash/fnv"
+
+       "github.com/bits-and-blooms/bitset"
+
+       "github.com/apache/skywalking-banyandb/pkg/filter"
+)
+
+// BloomFilterToBytes encodes a Bloom filter to bytes.
+func BloomFilterToBytes(bf *filter.BloomFilter) ([]byte, error) {
+       var buf bytes.Buffer

Review Comment:
   Accept the Buf from outside instead of allocating a new one.



##########
pkg/filter/bloom_filter.go:
##########
@@ -0,0 +1,145 @@
+// Licensed to Apache Software Foundation (ASF) under one or more contributor
+// license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright
+// ownership. Apache Software Foundation (ASF) licenses this file to you under
+// the Apache License, Version 2.0 (the "License"); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Package filter defines and implements data structures and interfaces for 
skipping index.
+package filter
+
+import (
+       "errors"
+       "hash"
+       "hash/fnv"
+       "math"
+
+       "github.com/bits-and-blooms/bitset"
+)
+
+// FalsePositiveRate is the default false positive rate for Bloom filter.
+const FalsePositiveRate = 0.01
+
+// BloomFilter is a probabilistic data structure designed to test whether an 
element is a member of a set.
+type BloomFilter struct {
+       hashFunc hash.Hash64
+       bits     *bitset.BitSet
+       m        uint64
+       k        uint
+}
+
+// NewBloomFilter creates a new Bloom filter with the number of items n and 
false positive rate p.
+func NewBloomFilter(n uint64, p float64) (*BloomFilter, error) {
+       if p <= 0 || p >= 1 {
+               return nil, errors.New("invalid false positive rate")
+       }
+       if n <= 0 {
+               return nil, errors.New("invalid number of items")
+       }
+       m := calculateM(n, p)
+       k := calculateK(n, m)
+       return &BloomFilter{
+               m:        m,
+               k:        k,
+               bits:     bitset.New(uint(m)),
+               hashFunc: fnv.New64a(),

Review Comment:
   [xxHash](github.com/cespare/xxhash/v2) is significantly faster than FNV 
(often 2-5x speedup) while maintaining excellent distribution qualities. 
   
   and you should add a benchmark to verify the performance of "Add" and 
"MightContain"
   



##########
banyand/stream/tag.go:
##########
@@ -68,6 +73,22 @@ func (t *tag) mustWriteTo(tm *tagMetadata, tagWriter 
*writer) {
        }
        tm.offset = tagWriter.bytesWritten
        tagWriter.MustWrite(bb.Buf)
+
+       if t.filter != nil {
+               bb.Reset()
+               var err error
+               bb.Buf, err = encoding.BloomFilterToBytes(t.filter)

Review Comment:
   ```suggestion
                bb.Buf, err = encoding.BloomFilterToBytes(bb.Buf[:0], t.filter)
   ```
   
   You should pass bb.Buf into this function to avoid object allocation. 



##########
pkg/filter/bloom_filter.go:
##########
@@ -0,0 +1,145 @@
+// Licensed to Apache Software Foundation (ASF) under one or more contributor
+// license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright
+// ownership. Apache Software Foundation (ASF) licenses this file to you under
+// the Apache License, Version 2.0 (the "License"); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Package filter defines and implements data structures and interfaces for 
skipping index.
+package filter
+
+import (
+       "errors"
+       "hash"
+       "hash/fnv"
+       "math"
+
+       "github.com/bits-and-blooms/bitset"
+)
+
+// FalsePositiveRate is the default false positive rate for Bloom filter.
+const FalsePositiveRate = 0.01
+
+// BloomFilter is a probabilistic data structure designed to test whether an 
element is a member of a set.
+type BloomFilter struct {

Review Comment:
   You should add a pool to cache BloomFilter in stream package.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscr...@skywalking.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to