(skywalking-banyandb) branch main updated: Enhance DictionaryFilter with map-based lookups for O(1) performance. Add benchmarks for various dictionary sizes and types, including string and int64 arrays. Refactor value handling and extraction methods for improved efficiency. (#872)

hanahmily Sat, 29 Nov 2025 06:02:53 -0800

This is an automated email from the ASF dual-hosted git repository.

hanahmily pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/skywalking-banyandb.git



The following commit(s) were added to refs/heads/main by this push:
     new 393630a9 Enhance DictionaryFilter with map-based lookups for O(1) 
performance. Add benchmarks for various dictionary sizes and types, including 
string and int64 arrays. Refactor value handling and extraction methods for 
improved efficiency. (#872)
393630a9 is described below

commit 393630a94ceb6f8a53c58b303910ff5105a2da49
Author: Gao Hongtao <[email protected]>
AuthorDate: Sat Nov 29 22:02:40 2025 +0800

    Enhance DictionaryFilter with map-based lookups for O(1) performance. Add 
benchmarks for various dictionary sizes and types, including string and int64 
arrays. Refactor value handling and extraction methods for improved efficiency. 
(#872)
---
 pkg/filter/dictionary_filter.go      |  97 ++++++++++--------
 pkg/filter/dictionary_filter_test.go | 185 +++++++++++++++++++++++++++++++++++
 2 files changed, 240 insertions(+), 42 deletions(-)

diff --git a/pkg/filter/dictionary_filter.go b/pkg/filter/dictionary_filter.go
index 56543fb6..8df2db1c 100644
--- a/pkg/filter/dictionary_filter.go
+++ b/pkg/filter/dictionary_filter.go
@@ -18,51 +18,61 @@
 package filter
 
 import (
-       "bytes"
-
+       "github.com/apache/skywalking-banyandb/pkg/convert"
        pbv1 "github.com/apache/skywalking-banyandb/pkg/pb/v1"
 )
 
 // DictionaryFilter is a filter implementation backed by a dictionary.
+// It uses a map-based lookup for O(1) performance instead of O(n) linear 
search.
 type DictionaryFilter struct {
+       valueSet  map[string]struct{}
        values    [][]byte
        valueType pbv1.ValueType
 }
 
 // NewDictionaryFilter creates a new dictionary filter with the given values.
 func NewDictionaryFilter(values [][]byte) *DictionaryFilter {
-       return &DictionaryFilter{
+       df := &DictionaryFilter{
                values: values,
        }
+       df.buildValueSet()
+       return df
 }
 
 // MightContain checks if an item is in the dictionary.
+// For non-array types, it uses O(1) map lookup instead of O(n) linear search.
 func (df *DictionaryFilter) MightContain(item []byte) bool {
        if df.valueType == pbv1.ValueTypeStrArr || df.valueType == 
pbv1.ValueTypeInt64Arr {
-               for _, serializedArray := range df.values {
-                       if containElement(serializedArray, item, df.valueType) {
-                               return true
-                       }
+               // For array types, check if the item exists in the 
pre-computed element set
+               if df.valueSet != nil {
+                       _, exists := df.valueSet[convert.BytesToString(item)]
+                       return exists
                }
                return false
        }
 
-       for _, v := range df.values {
-               if bytes.Equal(v, item) {
-                       return true
-               }
+       // For non-array types, use O(1) map lookup
+       if df.valueSet != nil {
+               _, exists := df.valueSet[convert.BytesToString(item)]
+               return exists
        }
        return false
 }
 
-// SetValues sets the dictionary values.
+// SetValues sets the dictionary values and builds the lookup set.
 func (df *DictionaryFilter) SetValues(values [][]byte) {
        df.values = values
+       df.buildValueSet()
 }
 
 // SetValueType sets the value type for the dictionary filter.
+// For array types, it rebuilds the lookup set by extracting elements from 
serialized arrays.
 func (df *DictionaryFilter) SetValueType(valueType pbv1.ValueType) {
        df.valueType = valueType
+       // Rebuild the set for array types since elements need to be extracted
+       if valueType == pbv1.ValueTypeStrArr || valueType == 
pbv1.ValueTypeInt64Arr {
+               df.buildValueSet()
+       }
 }
 
 // Reset resets the dictionary filter.
@@ -72,33 +82,43 @@ func (df *DictionaryFilter) Reset() {
        }
        df.values = df.values[:0]
        df.valueType = pbv1.ValueTypeUnknown
+       clear(df.valueSet)
 }
 
-func containElement(serializedArray []byte, element []byte, valueType 
pbv1.ValueType) bool {
-       if len(serializedArray) == 0 {
-               return false
+// buildValueSet builds a map-based lookup set from the dictionary values.
+// For non-array types, values are added directly.
+// For array types, elements are extracted from serialized arrays.
+func (df *DictionaryFilter) buildValueSet() {
+       if df.valueSet == nil {
+               df.valueSet = make(map[string]struct{}, len(df.values))
        }
-       if valueType == pbv1.ValueTypeInt64Arr {
-               if len(element) != 8 {
-                       return false
-               }
-               for i := 0; i < len(serializedArray); i += 8 {
-                       if i+8 > len(serializedArray) {
-                               break
-                       }
-                       if bytes.Equal(serializedArray[i:i+8], element) {
-                               return true
+
+       if df.valueType == pbv1.ValueTypeInt64Arr {
+               // Extract int64 elements from serialized arrays
+               for _, serializedArray := range df.values {
+                       for i := 0; i+8 <= len(serializedArray); i += 8 {
+                               
df.valueSet[convert.BytesToString(serializedArray[i:i+8])] = struct{}{}
                        }
                }
-               return false
+               return
        }
-       if valueType == pbv1.ValueTypeStrArr {
-               return containString(serializedArray, element)
+
+       if df.valueType == pbv1.ValueTypeStrArr {
+               // Extract string elements from serialized arrays
+               for _, serializedArray := range df.values {
+                       extractStrings(serializedArray, df.valueSet)
+               }
+               return
+       }
+
+       // For non-array types, add values directly
+       for _, v := range df.values {
+               df.valueSet[convert.BytesToString(v)] = struct{}{}
        }
-       return false
 }
 
-func containString(serializedArray, element []byte) bool {
+// extractStrings extracts string elements from a serialized string array and 
adds them to the set.
+func extractStrings(serializedArray []byte, set map[string]struct{}) {
        const (
                entityDelimiter = '|'
                escape          = '\\'
@@ -108,13 +128,9 @@ func containString(serializedArray, element []byte) bool {
        var buf []byte
        for len(src) > 0 {
                buf = buf[:0]
-               if len(src) == 0 {
-                       break
-               }
                if src[0] == entityDelimiter {
-                       if len(element) == 0 {
-                               return true
-                       }
+                       // Empty string element
+                       set[""] = struct{}{}
                        src = src[1:]
                        continue
                }
@@ -122,23 +138,20 @@ func containString(serializedArray, element []byte) bool {
                        switch {
                        case src[0] == escape:
                                if len(src) < 2 {
-                                       return false
+                                       return
                                }
                                src = src[1:]
                                buf = append(buf, src[0])
                        case src[0] == entityDelimiter:
                                src = src[1:]
-                               if bytes.Equal(buf, element) {
-                                       return true
-                               }
+                               set[string(buf)] = struct{}{}
                                goto nextElement
                        default:
                                buf = append(buf, src[0])
                        }
                        src = src[1:]
                }
-               return false
+               return
        nextElement:
        }
-       return false
 }
diff --git a/pkg/filter/dictionary_filter_test.go 
b/pkg/filter/dictionary_filter_test.go
index 594479fb..af2a4667 100644
--- a/pkg/filter/dictionary_filter_test.go
+++ b/pkg/filter/dictionary_filter_test.go
@@ -19,6 +19,7 @@ package filter
 
 import (
        stdbytes "bytes"
+       "fmt"
        "testing"
 
        "github.com/stretchr/testify/assert"
@@ -130,3 +131,187 @@ func TestDictionaryFilterStrArr(t *testing.T) {
                assert.False(df.MightContain(items[i]))
        }
 }
+
+// generateDictionaryValues creates test data with n unique values.
+func generateDictionaryValues(n int) [][]byte {
+       values := make([][]byte, n)
+       for i := 0; i < n; i++ {
+               values[i] = []byte(fmt.Sprintf("value_%d", i))
+       }
+       return values
+}
+
+// BenchmarkDictionaryFilterMightContain_Small benchmarks with 16 values 
(small dictionary).
+func BenchmarkDictionaryFilterMightContain_Small(b *testing.B) {
+       values := generateDictionaryValues(16)
+       df := NewDictionaryFilter(values)
+
+       // Test item that exists (middle of list)
+       existingItem := values[8]
+       // Test item that doesn't exist
+       nonExistingItem := []byte("not_found")
+
+       b.Run("Existing", func(b *testing.B) {
+               b.ResetTimer()
+               for i := 0; i < b.N; i++ {
+                       df.MightContain(existingItem)
+               }
+       })
+
+       b.Run("NonExisting", func(b *testing.B) {
+               b.ResetTimer()
+               for i := 0; i < b.N; i++ {
+                       df.MightContain(nonExistingItem)
+               }
+       })
+}
+
+// BenchmarkDictionaryFilterMightContain_Medium benchmarks with 128 values.
+func BenchmarkDictionaryFilterMightContain_Medium(b *testing.B) {
+       values := generateDictionaryValues(128)
+       df := NewDictionaryFilter(values)
+
+       // Test item that exists (middle of list)
+       existingItem := values[64]
+       // Test item that doesn't exist
+       nonExistingItem := []byte("not_found")
+
+       b.Run("Existing", func(b *testing.B) {
+               b.ResetTimer()
+               for i := 0; i < b.N; i++ {
+                       df.MightContain(existingItem)
+               }
+       })
+
+       b.Run("NonExisting", func(b *testing.B) {
+               b.ResetTimer()
+               for i := 0; i < b.N; i++ {
+                       df.MightContain(nonExistingItem)
+               }
+       })
+}
+
+// BenchmarkDictionaryFilterMightContain_Max benchmarks with 256 values (max 
dictionary size).
+func BenchmarkDictionaryFilterMightContain_Max(b *testing.B) {
+       values := generateDictionaryValues(256)
+       df := NewDictionaryFilter(values)
+
+       // Test item at the end (worst case for linear search)
+       lastItem := values[255]
+       // Test item that doesn't exist
+       nonExistingItem := []byte("not_found")
+
+       b.Run("ExistingLast", func(b *testing.B) {
+               b.ResetTimer()
+               for i := 0; i < b.N; i++ {
+                       df.MightContain(lastItem)
+               }
+       })
+
+       b.Run("NonExisting", func(b *testing.B) {
+               b.ResetTimer()
+               for i := 0; i < b.N; i++ {
+                       df.MightContain(nonExistingItem)
+               }
+       })
+}
+
+// BenchmarkDictionaryFilterMightContain_Int64Arr benchmarks array type 
lookups.
+func BenchmarkDictionaryFilterMightContain_Int64Arr(b *testing.B) {
+       // Create multiple int64 arrays
+       arrays := make([][]byte, 10)
+       for i := 0; i < 10; i++ {
+               arr := make([]byte, 0, 24)
+               for j := 0; j < 3; j++ {
+                       arr = append(arr, convert.Int64ToBytes(int64(i*3+j))...)
+               }
+               arrays[i] = arr
+       }
+
+       df := NewDictionaryFilter(arrays)
+       df.SetValueType(pbv1.ValueTypeInt64Arr)
+
+       existingItem := convert.Int64ToBytes(15) // exists in arrays[5]
+       nonExistingItem := convert.Int64ToBytes(999)
+
+       b.Run("Existing", func(b *testing.B) {
+               b.ResetTimer()
+               for i := 0; i < b.N; i++ {
+                       df.MightContain(existingItem)
+               }
+       })
+
+       b.Run("NonExisting", func(b *testing.B) {
+               b.ResetTimer()
+               for i := 0; i < b.N; i++ {
+                       df.MightContain(nonExistingItem)
+               }
+       })
+}
+
+// BenchmarkDictionaryFilterMightContain_StrArr benchmarks string array type 
lookups.
+func BenchmarkDictionaryFilterMightContain_StrArr(b *testing.B) {
+       // Create multiple string arrays
+       arrays := make([][]byte, 10)
+       for i := 0; i < 10; i++ {
+               dst := make([]byte, 0)
+               for j := 0; j < 3; j++ {
+                       dst = marshalVarArray(dst, 
[]byte(fmt.Sprintf("element_%d_%d", i, j)))
+               }
+               arrays[i] = dst
+       }
+
+       df := NewDictionaryFilter(arrays)
+       df.SetValueType(pbv1.ValueTypeStrArr)
+
+       existingItem := []byte("element_5_1")
+       nonExistingItem := []byte("not_found")
+
+       b.Run("Existing", func(b *testing.B) {
+               b.ResetTimer()
+               for i := 0; i < b.N; i++ {
+                       df.MightContain(existingItem)
+               }
+       })
+
+       b.Run("NonExisting", func(b *testing.B) {
+               b.ResetTimer()
+               for i := 0; i < b.N; i++ {
+                       df.MightContain(nonExistingItem)
+               }
+       })
+}
+
+// linearSearchMightContain is the old O(n) implementation for comparison.
+func linearSearchMightContain(values [][]byte, item []byte) bool {
+       for _, v := range values {
+               if stdbytes.Equal(v, item) {
+                       return true
+               }
+       }
+       return false
+}
+
+// BenchmarkLinearSearch benchmarks the old linear search implementation for 
comparison.
+func BenchmarkLinearSearch_Max(b *testing.B) {
+       values := generateDictionaryValues(256)
+
+       lastItem := values[255]
+       nonExistingItem := []byte("not_found")
+
+       var result bool
+       b.Run("ExistingLast", func(b *testing.B) {
+               b.ResetTimer()
+               for i := 0; i < b.N; i++ {
+                       result = linearSearchMightContain(values, lastItem)
+               }
+       })
+
+       b.Run("NonExisting", func(b *testing.B) {
+               b.ResetTimer()
+               for i := 0; i < b.N; i++ {
+                       result = linearSearchMightContain(values, 
nonExistingItem)
+               }
+       })
+       _ = result // Prevent compiler optimization
+}

(skywalking-banyandb) branch main updated: Enhance DictionaryFilter with map-based lookups for O(1) performance. Add benchmarks for various dictionary sizes and types, including string and int64 arrays. Refactor value handling and extraction methods for improved efficiency. (#872)

Reply via email to