Re: [PR] Enhance tag filtering functionality by adding ContainsAll method [skywalking-banyandb]

via GitHub Tue, 16 Dec 2025 16:49:26 -0800


Copilot commented on code in PR #898:
URL: 
https://github.com/apache/skywalking-banyandb/pull/898#discussion_r2625216158



##########
banyand/internal/sidx/element.go:
##########
@@ -60,24 +59,17 @@ func unmarshalTag(dest [][]byte, src []byte, valueType 
pbv1.ValueType) ([][]byte
                return dest, nil
        }
        if valueType == pbv1.ValueTypeStrArr {
-               bb := bigValuePool.Get()
-               if bb == nil {
-                       bb = &bytes.Buffer{}
-               }
-               defer func() {
-                       bb.Buf = bb.Buf[:0]
-                       bigValuePool.Put(bb)
-               }()
-               var err error
-               for len(src) > 0 {
-                       bb.Buf, src, err = 
encoding.UnmarshalVarArray(bb.Buf[:0], src)
+               var (
+                       end  int
+                       next int
+                       err  error
+               )
+               for idx := 0; idx < len(src); idx = next {
+                       end, next, err = encoding.UnmarshalVarArray(src, idx)
                        if err != nil {
                                return nil, err
                        }
-                       // Make a copy since bb.Buf will be reused
-                       valueCopy := make([]byte, len(bb.Buf))
-                       copy(valueCopy, bb.Buf)
-                       dest = append(dest, valueCopy)
+                       dest = append(dest, src[idx:end])

Review Comment:
   The `UnmarshalVarArray` function decodes in-place and returns slices that 
reference the original `src` buffer. The code directly appends `src[idx:end]` 
to the `dest` slice without making a copy. This creates an aliasing issue: if 
the caller later modifies or reuses the `src` buffer, the returned slices will 
be affected. Consider making a copy of the decoded value before appending it to 
`dest`.



##########
pkg/filter/dictionary_filter_test.go:
##########
@@ -315,3 +541,220 @@ func BenchmarkLinearSearch_Max(b *testing.B) {
        })
        _ = result // Prevent compiler optimization
 }
+
+// generateInt64QueryItems creates query items for int64 array (subset of 
array elements).
+func generateInt64QueryItems(arrayLen int, percentage int) [][]byte {
+       queryLen := (arrayLen * percentage) / 100
+       if queryLen == 0 {
+               queryLen = 1
+       }
+       items := make([][]byte, queryLen)
+       for i := 0; i < queryLen; i++ {
+               items[i] = convert.Int64ToBytes(int64(i))
+       }
+       return items
+}
+
+// generateStrQueryItems creates query items for string array (subset of array 
elements).
+func generateStrQueryItems(arrayLen int, percentage int) [][]byte {
+       queryLen := (arrayLen * percentage) / 100
+       if queryLen == 0 {
+               queryLen = 1
+       }
+       items := make([][]byte, queryLen)
+       for i := 0; i < queryLen; i++ {
+               items[i] = []byte(fmt.Sprintf("element_%d", i))
+       }
+       return items
+}
+
+// generateInt64QueryItemsNonExisting creates query items that don't exist in 
the array.
+func generateInt64QueryItemsNonExisting(arrayLen int, percentage int) [][]byte 
{
+       queryLen := (arrayLen * percentage) / 100
+       if queryLen == 0 {
+               queryLen = 1
+       }
+       items := make([][]byte, queryLen)
+       // Use values that are definitely not in the array (starting from 
arrayLen)
+       for i := 0; i < queryLen; i++ {
+               items[i] = convert.Int64ToBytes(int64(arrayLen + i))
+       }
+       return items
+}
+
+// generateStrQueryItemsNonExisting creates query items that don't exist in 
the array.
+func generateStrQueryItemsNonExisting(arrayLen int, percentage int) [][]byte {
+       queryLen := (arrayLen * percentage) / 100
+       if queryLen == 0 {
+               queryLen = 1
+       }
+       items := make([][]byte, queryLen)
+       // Use values that are definitely not in the array
+       for i := 0; i < queryLen; i++ {
+               items[i] = []byte(fmt.Sprintf("nonexistent_%d", arrayLen+i))
+       }
+       return items
+}
+
+// benchmarkDictionaryFilterContainsAll is a helper function that benchmarks 
ContainsAll
+// for array types with given array count, array length, and query percentage.
+// arrayCount is the number of arrays in the dictionary (small=16, medium=128, 
max=256).
+// arrayLen is the number of elements in each array.
+func benchmarkDictionaryFilterContainsAll(b *testing.B, valueType 
pbv1.ValueType, arrayCount int, arrayLen int, percentage int, existing bool) {
+       var arrays [][]byte
+       var queryItems [][]byte
+
+       switch valueType {
+       case pbv1.ValueTypeInt64Arr:
+               arrays = make([][]byte, arrayCount)
+               for i := 0; i < arrayCount; i++ {
+                       // Create arrays with different starting values to 
ensure variety
+                       arr := make([]byte, 0, arrayLen*8)
+                       for j := 0; j < arrayLen; j++ {
+                               arr = append(arr, 
convert.Int64ToBytes(int64(i*arrayLen+j))...)
+                       }
+                       arrays[i] = arr
+               }
+               // Query items should match elements from the first array 
(index 0)
+               if existing {
+                       queryItems = generateInt64QueryItems(arrayLen, 
percentage)
+               } else {
+                       queryItems = 
generateInt64QueryItemsNonExisting(arrayLen, percentage)
+               }
+       case pbv1.ValueTypeStrArr:
+               arrays = make([][]byte, arrayCount)
+               for i := 0; i < arrayCount; i++ {
+                       // Create arrays with different starting values to 
ensure variety
+                       arr := make([]byte, 0)
+                       for j := 0; j < arrayLen; j++ {
+                               arr = marshalVarArray(arr, 
[]byte(fmt.Sprintf("element_%d_%d", i, j)))
+                       }
+                       arrays[i] = arr
+               }
+               // Query items should match elements from the first array 
(index 0)
+               if existing {
+                       queryItems = generateStrQueryItems(arrayLen, percentage)

Review Comment:
   The benchmark for string arrays has a mismatch between the stored data and 
query pattern. The stored arrays use the pattern `"element_%d_%d"` (line 630), 
but the query items use the pattern `"element_%d"` (line 566). This means the 
"Existing" benchmark cases will never actually find matches, making them 
equivalent to "NonExisting" cases and not testing the intended scenario.



##########
pkg/encoding/array.go:
##########
@@ -0,0 +1,92 @@
+// Licensed to Apache Software Foundation (ASF) under one or more contributor
+// license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright
+// ownership. Apache Software Foundation (ASF) licenses this file to you under
+// the Apache License, Version 2.0 (the "License"); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package encoding
+
+import (
+       "bytes"
+       "errors"
+)
+
+const (
+       // EntityDelimiter is the delimiter for entities in a variable-length 
array.
+       EntityDelimiter = '|'
+       // Escape is the escape character for entities in a variable-length 
array.
+       Escape = '\\'
+)
+
+// MarshalVarArray marshals a byte slice into a variable-length array format.
+// It escapes delimiter and escape characters within the source slice.
+func MarshalVarArray(dest, src []byte) []byte {
+       if bytes.IndexByte(src, EntityDelimiter) < 0 && bytes.IndexByte(src, 
Escape) < 0 {
+               dest = append(dest, src...)
+               dest = append(dest, EntityDelimiter)
+               return dest
+       }
+       for _, b := range src {
+               if b == EntityDelimiter || b == Escape {
+                       dest = append(dest, Escape)
+               }
+               dest = append(dest, b)
+       }
+       dest = append(dest, EntityDelimiter)
+       return dest
+}
+
+// UnmarshalVarArray unmarshals a variable-length array from src starting at 
idx.
+// It decodes in-place and returns:
+//   - end: the index of the first byte after the decoded value (exclusive)
+//   - next: the index of the next element (the byte after the delimiter)
+//
+// The caller can iterate without creating subslices by tracking indices:
+//
+//     for idx < len(src) {
+//         end, next, err := UnmarshalVarArray(src, idx)
+//         // use src[idx:end]
+//         idx = next
+//     }

Review Comment:
   The documentation for `UnmarshalVarArray` should explicitly warn that the 
function modifies the `src` buffer in-place during decoding. While the comment 
mentions "decode in-place", it's not clear enough that the buffer will be 
mutated. This is a critical detail since callers need to know they must copy 
the result if they need to preserve the original buffer or use the decoded 
values after subsequent unmarshal operations on the same buffer.



##########
pkg/filter/dictionary_filter.go:
##########
@@ -18,61 +18,74 @@
 package filter
 
 import (
-       "github.com/apache/skywalking-banyandb/pkg/convert"
+       "bytes"
+
+       "github.com/apache/skywalking-banyandb/pkg/encoding"
        pbv1 "github.com/apache/skywalking-banyandb/pkg/pb/v1"
 )
 
 // DictionaryFilter is a filter implementation backed by a dictionary.
-// It uses a map-based lookup for O(1) performance instead of O(n) linear 
search.
+// For non-array types: uses linear iteration through values.
+// For array types: uses iterative approach, extracting and sorting on-the-fly.
 type DictionaryFilter struct {
-       valueSet  map[string]struct{}
+       // Original serialized values
        values    [][]byte
        valueType pbv1.ValueType
 }
 
-// NewDictionaryFilter creates a new dictionary filter with the given values.
-func NewDictionaryFilter(values [][]byte) *DictionaryFilter {
-       df := &DictionaryFilter{
-               values: values,
+// MightContain checks if an item is in the dictionary.
+// For non-array types: linear iteration through values.
+// For array types: checks if the single item exists as an element in any 
stored array.
+func (df *DictionaryFilter) MightContain(item []byte) bool {
+       if df.valueType == pbv1.ValueTypeStrArr || df.valueType == 
pbv1.ValueTypeInt64Arr {
+               return false
+       }
+
+       for _, v := range df.values {
+               if bytes.Equal(v, item) {
+                       return true
+               }
        }
-       df.buildValueSet()
-       return df
+       return false

Review Comment:
   The implementation changed from O(1) map-based lookup to O(n) linear 
iteration for non-array types (lines 44-48). This represents a performance 
regression, especially for larger dictionaries. Dictionary filters can contain 
up to 256 values according to the benchmark tests. While the PR description 
mentions this change is for "optimized checks," the linear search is 
objectively slower than the previous map-based approach for non-array types. 
Consider whether the map-based lookup could be retained for non-array types 
while using the new iterative approach only for array types.



##########
pkg/filter/dictionary_filter.go:
##########
@@ -18,61 +18,74 @@
 package filter
 
 import (
-       "github.com/apache/skywalking-banyandb/pkg/convert"
+       "bytes"
+
+       "github.com/apache/skywalking-banyandb/pkg/encoding"
        pbv1 "github.com/apache/skywalking-banyandb/pkg/pb/v1"
 )
 
 // DictionaryFilter is a filter implementation backed by a dictionary.
-// It uses a map-based lookup for O(1) performance instead of O(n) linear 
search.
+// For non-array types: uses linear iteration through values.
+// For array types: uses iterative approach, extracting and sorting on-the-fly.
 type DictionaryFilter struct {
-       valueSet  map[string]struct{}
+       // Original serialized values
        values    [][]byte
        valueType pbv1.ValueType
 }
 
-// NewDictionaryFilter creates a new dictionary filter with the given values.
-func NewDictionaryFilter(values [][]byte) *DictionaryFilter {
-       df := &DictionaryFilter{
-               values: values,
+// MightContain checks if an item is in the dictionary.
+// For non-array types: linear iteration through values.
+// For array types: checks if the single item exists as an element in any 
stored array.
+func (df *DictionaryFilter) MightContain(item []byte) bool {
+       if df.valueType == pbv1.ValueTypeStrArr || df.valueType == 
pbv1.ValueTypeInt64Arr {
+               return false
+       }
+
+       for _, v := range df.values {
+               if bytes.Equal(v, item) {
+                       return true
+               }
        }
-       df.buildValueSet()
-       return df
+       return false
 }
 
-// MightContain checks if an item is in the dictionary.
-// For non-array types, it uses O(1) map lookup instead of O(n) linear search.
-func (df *DictionaryFilter) MightContain(item []byte) bool {
+// ContainsAll checks if all items are present in the dictionary.
+// For non-array types: checks if ANY of the items are present (OR semantics).
+// For array types: checks if ALL items form a subset of any stored array (AND 
semantics).
+func (df *DictionaryFilter) ContainsAll(items [][]byte) bool {
+       if len(items) == 0 {
+               return true
+       }
+
        if df.valueType == pbv1.ValueTypeStrArr || df.valueType == 
pbv1.ValueTypeInt64Arr {
-               // For array types, check if the item exists in the 
pre-computed element set
-               if df.valueSet != nil {
-                       _, exists := df.valueSet[convert.BytesToString(item)]
-                       return exists
+               for _, serializedArray := range df.values {
+                       if df.extractElements(serializedArray, items) {
+                               return true
+                       }
                }
                return false
        }
 
-       // For non-array types, use O(1) map lookup
-       if df.valueSet != nil {
-               _, exists := df.valueSet[convert.BytesToString(item)]
-               return exists
+       // For non-array types: a single value can only match one item
+       // If multiple items are requested, return false immediately
+       if len(items) != 1 {
+               return false
+       }
+
+       // Check if the single item exists
+       item := items[0]
+       for _, v := range df.values {
+               if bytes.Equal(v, item) {
+                       return true
+               }
        }
        return false
 }

Review Comment:
   The method name `ContainsAll` is misleading for non-array types. The name 
suggests AND semantics (all items must be present), but the implementation 
returns true if ANY single item is present (OR semantics). For non-array types, 
if `len(items) != 1`, it returns false, and if `len(items) == 1`, it checks if 
that one item exists. This is essentially "ContainsAny" behavior limited to 
single items. Consider renaming this method or adjusting the semantics to match 
the name. Note that the interface documentation at line 45 in tag_filter_op.go 
does describe this behavior, but the method name itself is still confusing.



##########
pkg/filter/dictionary_filter.go:
##########
@@ -82,76 +95,55 @@ func (df *DictionaryFilter) Reset() {
        }
        df.values = df.values[:0]
        df.valueType = pbv1.ValueTypeUnknown
-       clear(df.valueSet)
 }
 
-// buildValueSet builds a map-based lookup set from the dictionary values.
-// For non-array types, values are added directly.
-// For array types, elements are extracted from serialized arrays.
-func (df *DictionaryFilter) buildValueSet() {
-       if df.valueSet == nil {
-               df.valueSet = make(map[string]struct{}, len(df.values))
+// extractElements checks if all query values form a subset of the serialized 
array.
+// Returns true if all query values exist in the array (subset check).
+func (df *DictionaryFilter) extractElements(serializedArray []byte, values 
[][]byte) bool {
+       if len(values) == 0 {
+               return true
        }
 
        if df.valueType == pbv1.ValueTypeInt64Arr {
-               // Extract int64 elements from serialized arrays
-               for _, serializedArray := range df.values {
+               // For each query value, check if it exists in the array
+               for _, v := range values {
+                       found := false
                        for i := 0; i+8 <= len(serializedArray); i += 8 {
-                               
df.valueSet[convert.BytesToString(serializedArray[i:i+8])] = struct{}{}
+                               if bytes.Equal(v, serializedArray[i:i+8]) {
+                                       found = true
+                                       break
+                               }
+                       }
+                       if !found {
+                               return false
                        }
                }
-               return
+               return true
        }
 
        if df.valueType == pbv1.ValueTypeStrArr {
-               // Extract string elements from serialized arrays
-               for _, serializedArray := range df.values {
-                       extractStrings(serializedArray, df.valueSet)
-               }
-               return
-       }
-
-       // For non-array types, add values directly
-       for _, v := range df.values {
-               df.valueSet[convert.BytesToString(v)] = struct{}{}
-       }
-}
-
-// extractStrings extracts string elements from a serialized string array and 
adds them to the set.
-func extractStrings(serializedArray []byte, set map[string]struct{}) {
-       const (
-               entityDelimiter = '|'
-               escape          = '\\'
-       )
-
-       src := serializedArray
-       var buf []byte
-       for len(src) > 0 {
-               buf = buf[:0]
-               if src[0] == entityDelimiter {
-                       // Empty string element
-                       set[""] = struct{}{}
-                       src = src[1:]
-                       continue
-               }
-               for len(src) > 0 {
-                       switch {
-                       case src[0] == escape:
-                               if len(src) < 2 {
-                                       return
+               // For each query value, check if it exists in the array
+               // UnmarshalVarArray modifies the source in-place for decoding
+               // This approach has zero allocations and early-exits on match
+               for _, v := range values {
+                       found := false
+                       for idx := 0; idx < len(serializedArray); {
+                               end, next, err := 
encoding.UnmarshalVarArray(serializedArray, idx)
+                               if err != nil {
+                                       return false
+                               }
+                               if bytes.Equal(v, serializedArray[idx:end]) {
+                                       found = true
+                                       break
                                }
-                               src = src[1:]
-                               buf = append(buf, src[0])
-                       case src[0] == entityDelimiter:
-                               src = src[1:]
-                               set[string(buf)] = struct{}{}
-                               goto nextElement
-                       default:
-                               buf = append(buf, src[0])
+                               idx = next
+                       }
+                       if !found {
+                               return false
                        }
-                       src = src[1:]
                }
-               return
-       nextElement:
+               return true

Review Comment:
   The `UnmarshalVarArray` function modifies the `serializedArray` buffer 
in-place during decoding (see lines 81, 87 in array.go). This means that 
iterating through the same `serializedArray` multiple times (once per query 
value) will result in corrupted data on subsequent iterations. The first 
iteration will overwrite escaped characters in the buffer, breaking subsequent 
comparisons. Consider using a copy of the serialized array for each query value 
check, or restructure the algorithm to avoid multiple passes.



##########
banyand/trace/query.go:
##########
@@ -561,15 +561,17 @@ func mustDecodeTagValueAndArray(valueType pbv1.ValueType, 
value []byte, valueArr
                        }
                        return strArrTagValue(values)
                }
-               bb := bigValuePool.Generate()
-               defer bigValuePool.Release(bb)
-               var err error
-               for len(value) > 0 {
-                       bb.Buf, value, err = 
encoding.UnmarshalVarArray(bb.Buf[:0], value)
+               var (
+                       end  int
+                       next int
+                       err  error
+               )
+               for idx := 0; idx < len(value); idx = next {
+                       end, next, err = encoding.UnmarshalVarArray(value, idx)
                        if err != nil {
                                logger.Panicf("UnmarshalVarArray failed: %v", 
err)
                        }
-                       values = append(values, string(bb.Buf))
+                       values = append(values, string(value[idx:end]))

Review Comment:
   The `UnmarshalVarArray` function decodes in-place and returns slices that 
reference the original `value` buffer. The code directly appends 
`value[idx:end]` to the `values` slice without making a copy. This creates an 
aliasing issue: if the caller later modifies or reuses the `value` buffer, the 
returned string values will be affected. Consider making a copy of the decoded 
value before converting it to a string.



##########
pkg/filter/dictionary_filter_test.go:
##########
@@ -315,3 +541,220 @@ func BenchmarkLinearSearch_Max(b *testing.B) {
        })
        _ = result // Prevent compiler optimization
 }
+
+// generateInt64QueryItems creates query items for int64 array (subset of 
array elements).
+func generateInt64QueryItems(arrayLen int, percentage int) [][]byte {
+       queryLen := (arrayLen * percentage) / 100
+       if queryLen == 0 {
+               queryLen = 1
+       }
+       items := make([][]byte, queryLen)
+       for i := 0; i < queryLen; i++ {
+               items[i] = convert.Int64ToBytes(int64(i))
+       }
+       return items
+}
+
+// generateStrQueryItems creates query items for string array (subset of array 
elements).
+func generateStrQueryItems(arrayLen int, percentage int) [][]byte {
+       queryLen := (arrayLen * percentage) / 100
+       if queryLen == 0 {
+               queryLen = 1
+       }
+       items := make([][]byte, queryLen)
+       for i := 0; i < queryLen; i++ {
+               items[i] = []byte(fmt.Sprintf("element_%d", i))
+       }
+       return items
+}
+
+// generateInt64QueryItemsNonExisting creates query items that don't exist in 
the array.
+func generateInt64QueryItemsNonExisting(arrayLen int, percentage int) [][]byte 
{
+       queryLen := (arrayLen * percentage) / 100
+       if queryLen == 0 {
+               queryLen = 1
+       }
+       items := make([][]byte, queryLen)
+       // Use values that are definitely not in the array (starting from 
arrayLen)
+       for i := 0; i < queryLen; i++ {
+               items[i] = convert.Int64ToBytes(int64(arrayLen + i))
+       }
+       return items
+}
+
+// generateStrQueryItemsNonExisting creates query items that don't exist in 
the array.
+func generateStrQueryItemsNonExisting(arrayLen int, percentage int) [][]byte {
+       queryLen := (arrayLen * percentage) / 100
+       if queryLen == 0 {
+               queryLen = 1
+       }
+       items := make([][]byte, queryLen)
+       // Use values that are definitely not in the array
+       for i := 0; i < queryLen; i++ {
+               items[i] = []byte(fmt.Sprintf("nonexistent_%d", arrayLen+i))
+       }
+       return items
+}
+
+// benchmarkDictionaryFilterContainsAll is a helper function that benchmarks 
ContainsAll
+// for array types with given array count, array length, and query percentage.
+// arrayCount is the number of arrays in the dictionary (small=16, medium=128, 
max=256).
+// arrayLen is the number of elements in each array.
+func benchmarkDictionaryFilterContainsAll(b *testing.B, valueType 
pbv1.ValueType, arrayCount int, arrayLen int, percentage int, existing bool) {
+       var arrays [][]byte
+       var queryItems [][]byte
+
+       switch valueType {
+       case pbv1.ValueTypeInt64Arr:
+               arrays = make([][]byte, arrayCount)
+               for i := 0; i < arrayCount; i++ {
+                       // Create arrays with different starting values to 
ensure variety
+                       arr := make([]byte, 0, arrayLen*8)
+                       for j := 0; j < arrayLen; j++ {
+                               arr = append(arr, 
convert.Int64ToBytes(int64(i*arrayLen+j))...)
+                       }
+                       arrays[i] = arr
+               }
+               // Query items should match elements from the first array 
(index 0)
+               if existing {
+                       queryItems = generateInt64QueryItems(arrayLen, 
percentage)

Review Comment:
   The benchmark for int64 arrays has a mismatch between the stored data and 
query values. The stored arrays use values starting from `i*arrayLen+j` (line 
614), which for the first array (i=0) produces values 0 through arrayLen-1. 
However, `generateInt64QueryItems` also generates values 0 through queryLen-1 
(line 553). While this appears to work for the first array, the comment at line 
618 is misleading since the query generation doesn't explicitly target the 
first array - it just happens to overlap. The code should be more explicit 
about this relationship to avoid confusion.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Enhance tag filtering functionality by adding ContainsAll method [skywalking-banyandb]

Reply via email to