This is an automated email from the ASF dual-hosted git repository.
hanahmily pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/skywalking-banyandb.git
The following commit(s) were added to refs/heads/main by this push:
new 393630a9 Enhance DictionaryFilter with map-based lookups for O(1)
performance. Add benchmarks for various dictionary sizes and types, including
string and int64 arrays. Refactor value handling and extraction methods for
improved efficiency. (#872)
393630a9 is described below
commit 393630a94ceb6f8a53c58b303910ff5105a2da49
Author: Gao Hongtao <[email protected]>
AuthorDate: Sat Nov 29 22:02:40 2025 +0800
Enhance DictionaryFilter with map-based lookups for O(1) performance. Add
benchmarks for various dictionary sizes and types, including string and int64
arrays. Refactor value handling and extraction methods for improved efficiency.
(#872)
---
pkg/filter/dictionary_filter.go | 97 ++++++++++--------
pkg/filter/dictionary_filter_test.go | 185 +++++++++++++++++++++++++++++++++++
2 files changed, 240 insertions(+), 42 deletions(-)
diff --git a/pkg/filter/dictionary_filter.go b/pkg/filter/dictionary_filter.go
index 56543fb6..8df2db1c 100644
--- a/pkg/filter/dictionary_filter.go
+++ b/pkg/filter/dictionary_filter.go
@@ -18,51 +18,61 @@
package filter
import (
- "bytes"
-
+ "github.com/apache/skywalking-banyandb/pkg/convert"
pbv1 "github.com/apache/skywalking-banyandb/pkg/pb/v1"
)
// DictionaryFilter is a filter implementation backed by a dictionary.
+// It uses a map-based lookup for O(1) performance instead of O(n) linear
search.
type DictionaryFilter struct {
+ valueSet map[string]struct{}
values [][]byte
valueType pbv1.ValueType
}
// NewDictionaryFilter creates a new dictionary filter with the given values.
func NewDictionaryFilter(values [][]byte) *DictionaryFilter {
- return &DictionaryFilter{
+ df := &DictionaryFilter{
values: values,
}
+ df.buildValueSet()
+ return df
}
// MightContain checks if an item is in the dictionary.
+// For non-array types, it uses O(1) map lookup instead of O(n) linear search.
func (df *DictionaryFilter) MightContain(item []byte) bool {
if df.valueType == pbv1.ValueTypeStrArr || df.valueType ==
pbv1.ValueTypeInt64Arr {
- for _, serializedArray := range df.values {
- if containElement(serializedArray, item, df.valueType) {
- return true
- }
+ // For array types, check if the item exists in the
pre-computed element set
+ if df.valueSet != nil {
+ _, exists := df.valueSet[convert.BytesToString(item)]
+ return exists
}
return false
}
- for _, v := range df.values {
- if bytes.Equal(v, item) {
- return true
- }
+ // For non-array types, use O(1) map lookup
+ if df.valueSet != nil {
+ _, exists := df.valueSet[convert.BytesToString(item)]
+ return exists
}
return false
}
-// SetValues sets the dictionary values.
+// SetValues sets the dictionary values and builds the lookup set.
func (df *DictionaryFilter) SetValues(values [][]byte) {
df.values = values
+ df.buildValueSet()
}
// SetValueType sets the value type for the dictionary filter.
+// For array types, it rebuilds the lookup set by extracting elements from
serialized arrays.
func (df *DictionaryFilter) SetValueType(valueType pbv1.ValueType) {
df.valueType = valueType
+ // Rebuild the set for array types since elements need to be extracted
+ if valueType == pbv1.ValueTypeStrArr || valueType ==
pbv1.ValueTypeInt64Arr {
+ df.buildValueSet()
+ }
}
// Reset resets the dictionary filter.
@@ -72,33 +82,43 @@ func (df *DictionaryFilter) Reset() {
}
df.values = df.values[:0]
df.valueType = pbv1.ValueTypeUnknown
+ clear(df.valueSet)
}
-func containElement(serializedArray []byte, element []byte, valueType
pbv1.ValueType) bool {
- if len(serializedArray) == 0 {
- return false
+// buildValueSet builds a map-based lookup set from the dictionary values.
+// For non-array types, values are added directly.
+// For array types, elements are extracted from serialized arrays.
+func (df *DictionaryFilter) buildValueSet() {
+ if df.valueSet == nil {
+ df.valueSet = make(map[string]struct{}, len(df.values))
}
- if valueType == pbv1.ValueTypeInt64Arr {
- if len(element) != 8 {
- return false
- }
- for i := 0; i < len(serializedArray); i += 8 {
- if i+8 > len(serializedArray) {
- break
- }
- if bytes.Equal(serializedArray[i:i+8], element) {
- return true
+
+ if df.valueType == pbv1.ValueTypeInt64Arr {
+ // Extract int64 elements from serialized arrays
+ for _, serializedArray := range df.values {
+ for i := 0; i+8 <= len(serializedArray); i += 8 {
+
df.valueSet[convert.BytesToString(serializedArray[i:i+8])] = struct{}{}
}
}
- return false
+ return
}
- if valueType == pbv1.ValueTypeStrArr {
- return containString(serializedArray, element)
+
+ if df.valueType == pbv1.ValueTypeStrArr {
+ // Extract string elements from serialized arrays
+ for _, serializedArray := range df.values {
+ extractStrings(serializedArray, df.valueSet)
+ }
+ return
+ }
+
+ // For non-array types, add values directly
+ for _, v := range df.values {
+ df.valueSet[convert.BytesToString(v)] = struct{}{}
}
- return false
}
-func containString(serializedArray, element []byte) bool {
+// extractStrings extracts string elements from a serialized string array and
adds them to the set.
+func extractStrings(serializedArray []byte, set map[string]struct{}) {
const (
entityDelimiter = '|'
escape = '\\'
@@ -108,13 +128,9 @@ func containString(serializedArray, element []byte) bool {
var buf []byte
for len(src) > 0 {
buf = buf[:0]
- if len(src) == 0 {
- break
- }
if src[0] == entityDelimiter {
- if len(element) == 0 {
- return true
- }
+ // Empty string element
+ set[""] = struct{}{}
src = src[1:]
continue
}
@@ -122,23 +138,20 @@ func containString(serializedArray, element []byte) bool {
switch {
case src[0] == escape:
if len(src) < 2 {
- return false
+ return
}
src = src[1:]
buf = append(buf, src[0])
case src[0] == entityDelimiter:
src = src[1:]
- if bytes.Equal(buf, element) {
- return true
- }
+ set[string(buf)] = struct{}{}
goto nextElement
default:
buf = append(buf, src[0])
}
src = src[1:]
}
- return false
+ return
nextElement:
}
- return false
}
diff --git a/pkg/filter/dictionary_filter_test.go
b/pkg/filter/dictionary_filter_test.go
index 594479fb..af2a4667 100644
--- a/pkg/filter/dictionary_filter_test.go
+++ b/pkg/filter/dictionary_filter_test.go
@@ -19,6 +19,7 @@ package filter
import (
stdbytes "bytes"
+ "fmt"
"testing"
"github.com/stretchr/testify/assert"
@@ -130,3 +131,187 @@ func TestDictionaryFilterStrArr(t *testing.T) {
assert.False(df.MightContain(items[i]))
}
}
+
+// generateDictionaryValues creates test data with n unique values.
+func generateDictionaryValues(n int) [][]byte {
+ values := make([][]byte, n)
+ for i := 0; i < n; i++ {
+ values[i] = []byte(fmt.Sprintf("value_%d", i))
+ }
+ return values
+}
+
+// BenchmarkDictionaryFilterMightContain_Small benchmarks with 16 values
(small dictionary).
+func BenchmarkDictionaryFilterMightContain_Small(b *testing.B) {
+ values := generateDictionaryValues(16)
+ df := NewDictionaryFilter(values)
+
+ // Test item that exists (middle of list)
+ existingItem := values[8]
+ // Test item that doesn't exist
+ nonExistingItem := []byte("not_found")
+
+ b.Run("Existing", func(b *testing.B) {
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ df.MightContain(existingItem)
+ }
+ })
+
+ b.Run("NonExisting", func(b *testing.B) {
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ df.MightContain(nonExistingItem)
+ }
+ })
+}
+
+// BenchmarkDictionaryFilterMightContain_Medium benchmarks with 128 values.
+func BenchmarkDictionaryFilterMightContain_Medium(b *testing.B) {
+ values := generateDictionaryValues(128)
+ df := NewDictionaryFilter(values)
+
+ // Test item that exists (middle of list)
+ existingItem := values[64]
+ // Test item that doesn't exist
+ nonExistingItem := []byte("not_found")
+
+ b.Run("Existing", func(b *testing.B) {
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ df.MightContain(existingItem)
+ }
+ })
+
+ b.Run("NonExisting", func(b *testing.B) {
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ df.MightContain(nonExistingItem)
+ }
+ })
+}
+
+// BenchmarkDictionaryFilterMightContain_Max benchmarks with 256 values (max
dictionary size).
+func BenchmarkDictionaryFilterMightContain_Max(b *testing.B) {
+ values := generateDictionaryValues(256)
+ df := NewDictionaryFilter(values)
+
+ // Test item at the end (worst case for linear search)
+ lastItem := values[255]
+ // Test item that doesn't exist
+ nonExistingItem := []byte("not_found")
+
+ b.Run("ExistingLast", func(b *testing.B) {
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ df.MightContain(lastItem)
+ }
+ })
+
+ b.Run("NonExisting", func(b *testing.B) {
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ df.MightContain(nonExistingItem)
+ }
+ })
+}
+
+// BenchmarkDictionaryFilterMightContain_Int64Arr benchmarks array type
lookups.
+func BenchmarkDictionaryFilterMightContain_Int64Arr(b *testing.B) {
+ // Create multiple int64 arrays
+ arrays := make([][]byte, 10)
+ for i := 0; i < 10; i++ {
+ arr := make([]byte, 0, 24)
+ for j := 0; j < 3; j++ {
+ arr = append(arr, convert.Int64ToBytes(int64(i*3+j))...)
+ }
+ arrays[i] = arr
+ }
+
+ df := NewDictionaryFilter(arrays)
+ df.SetValueType(pbv1.ValueTypeInt64Arr)
+
+ existingItem := convert.Int64ToBytes(15) // exists in arrays[5]
+ nonExistingItem := convert.Int64ToBytes(999)
+
+ b.Run("Existing", func(b *testing.B) {
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ df.MightContain(existingItem)
+ }
+ })
+
+ b.Run("NonExisting", func(b *testing.B) {
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ df.MightContain(nonExistingItem)
+ }
+ })
+}
+
+// BenchmarkDictionaryFilterMightContain_StrArr benchmarks string array type
lookups.
+func BenchmarkDictionaryFilterMightContain_StrArr(b *testing.B) {
+ // Create multiple string arrays
+ arrays := make([][]byte, 10)
+ for i := 0; i < 10; i++ {
+ dst := make([]byte, 0)
+ for j := 0; j < 3; j++ {
+ dst = marshalVarArray(dst,
[]byte(fmt.Sprintf("element_%d_%d", i, j)))
+ }
+ arrays[i] = dst
+ }
+
+ df := NewDictionaryFilter(arrays)
+ df.SetValueType(pbv1.ValueTypeStrArr)
+
+ existingItem := []byte("element_5_1")
+ nonExistingItem := []byte("not_found")
+
+ b.Run("Existing", func(b *testing.B) {
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ df.MightContain(existingItem)
+ }
+ })
+
+ b.Run("NonExisting", func(b *testing.B) {
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ df.MightContain(nonExistingItem)
+ }
+ })
+}
+
+// linearSearchMightContain is the old O(n) implementation for comparison.
+func linearSearchMightContain(values [][]byte, item []byte) bool {
+ for _, v := range values {
+ if stdbytes.Equal(v, item) {
+ return true
+ }
+ }
+ return false
+}
+
+// BenchmarkLinearSearch benchmarks the old linear search implementation for
comparison.
+func BenchmarkLinearSearch_Max(b *testing.B) {
+ values := generateDictionaryValues(256)
+
+ lastItem := values[255]
+ nonExistingItem := []byte("not_found")
+
+ var result bool
+ b.Run("ExistingLast", func(b *testing.B) {
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ result = linearSearchMightContain(values, lastItem)
+ }
+ })
+
+ b.Run("NonExisting", func(b *testing.B) {
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ result = linearSearchMightContain(values,
nonExistingItem)
+ }
+ })
+ _ = result // Prevent compiler optimization
+}