This is an automated email from the ASF dual-hosted git repository. placave pushed a commit to branch split-serde-comparator-hasher in repository https://gitbox.apache.org/repos/asf/datasketches-go.git
commit 68fd4a4298ee50c0ba107e17ef8a31a987273104 Author: Pierre Lacave <[email protected]> AuthorDate: Sat Mar 23 00:02:31 2024 +0100 Split Serde/Comparator/Hasher and add reverse KLL order test --- ...y_of_doubles_serde.go => item_sketch_double.go} | 33 ++-- ...array_of_longs_serde.go => item_sketch_long.go} | 33 ++-- ...y_of_strings_serde.go => item_sketch_string.go} | 32 ++-- common/types.go | 13 +- examples/frequency_example_test.go | 2 +- examples/kll_example_test.go | 5 +- frequencies/items_sketch.go | 31 ++-- frequencies/items_sketch_test.go | 73 +++++--- frequencies/reverse_purge_item_hash_map.go | 12 +- frequencies/serde_compat_test.go | 7 +- frequencies/sketch_serialization_test.go | 24 +-- internal/generic_inequality_search.go | 46 ++--- kll/items_sketch.go | 122 +++++++------ kll/items_sketch_iterator.go | 2 +- kll/items_sketch_sorted_view.go | 61 ++++--- kll/items_sketch_test.go | 196 +++++++++++++++------ kll/items_sketch_validate.go | 16 +- kll/items_sletch_serialization_test.go | 28 +-- kll/utils.go | 4 +- static-analysis.datadog.yml | 3 + 20 files changed, 444 insertions(+), 299 deletions(-) diff --git a/common/array_of_doubles_serde.go b/common/item_sketch_double.go similarity index 70% rename from common/array_of_doubles_serde.go rename to common/item_sketch_double.go index 84d5872..d28e90f 100644 --- a/common/array_of_doubles_serde.go +++ b/common/item_sketch_double.go @@ -23,40 +23,43 @@ import ( "math" ) -type ArrayOfDoublesSerDe struct { +type ItemSketchDoubleComparator struct { + ReverseOrder bool +} +type ItemSketchDoubleHasher struct { scratch [8]byte } +type ItemSketchDoubleSerDe struct{} -func (f ArrayOfDoublesSerDe) Identity() float64 { - return 0 +func (f ItemSketchDoubleComparator) CompareFn() CompareFn[float64] { + return func(a float64, b float64) bool { + if f.ReverseOrder { + return a > b + } + return a < b + } } -func (f ArrayOfDoublesSerDe) Hash(item float64) uint64 { +func (f ItemSketchDoubleHasher) Hash(item float64) uint64 { binary.LittleEndian.PutUint64(f.scratch[:], math.Float64bits(item)) return murmur3.SeedSum64(_DEFAULT_SERDE_HASH_SEED, f.scratch[:]) } -func (f ArrayOfDoublesSerDe) LessFn() LessFn[float64] { - return func(a float64, b float64) bool { - return a < b - } -} - -func (f ArrayOfDoublesSerDe) SizeOf(item float64) int { +func (f ItemSketchDoubleSerDe) SizeOf(item float64) int { return 8 } -func (f ArrayOfDoublesSerDe) SizeOfMany(mem []byte, offsetBytes int, numItems int) (int, error) { +func (f ItemSketchDoubleSerDe) SizeOfMany(mem []byte, offsetBytes int, numItems int) (int, error) { return numItems * 8, nil } -func (f ArrayOfDoublesSerDe) SerializeOneToSlice(item float64) []byte { +func (f ItemSketchDoubleSerDe) SerializeOneToSlice(item float64) []byte { bytes := make([]byte, 8) binary.LittleEndian.PutUint64(bytes, math.Float64bits(item)) return bytes } -func (f ArrayOfDoublesSerDe) SerializeManyToSlice(item []float64) []byte { +func (f ItemSketchDoubleSerDe) SerializeManyToSlice(item []float64) []byte { if len(item) == 0 { return []byte{} } @@ -69,7 +72,7 @@ func (f ArrayOfDoublesSerDe) SerializeManyToSlice(item []float64) []byte { return bytes } -func (f ArrayOfDoublesSerDe) DeserializeManyFromSlice(mem []byte, offsetBytes int, numItems int) ([]float64, error) { +func (f ItemSketchDoubleSerDe) DeserializeManyFromSlice(mem []byte, offsetBytes int, numItems int) ([]float64, error) { if numItems == 0 { return []float64{}, nil } diff --git a/common/array_of_longs_serde.go b/common/item_sketch_long.go similarity index 69% rename from common/array_of_longs_serde.go rename to common/item_sketch_long.go index b829cdf..e6a8495 100644 --- a/common/array_of_longs_serde.go +++ b/common/item_sketch_long.go @@ -22,40 +22,43 @@ import ( "github.com/twmb/murmur3" ) -type ArrayOfLongsSerDe struct { +type ItemSketchLongComparator struct { + ReverseOrder bool +} +type ItemSketchLongHasher struct { scratch [8]byte } +type ItemSketchLongSerDe struct{} -func (f ArrayOfLongsSerDe) Identity() int64 { - return 0 +func (f ItemSketchLongComparator) CompareFn() CompareFn[int64] { + return func(a, b int64) bool { + if f.ReverseOrder { + return a > b + } + return a < b + } } -func (f ArrayOfLongsSerDe) Hash(item int64) uint64 { +func (f ItemSketchLongHasher) Hash(item int64) uint64 { binary.LittleEndian.PutUint64(f.scratch[:], uint64(item)) return murmur3.SeedSum64(_DEFAULT_SERDE_HASH_SEED, f.scratch[:]) } -func (f ArrayOfLongsSerDe) LessFn() LessFn[int64] { - return func(a int64, b int64) bool { - return a < b - } -} - -func (f ArrayOfLongsSerDe) SizeOf(item int64) int { +func (f ItemSketchLongSerDe) SizeOf(item int64) int { return 8 } -func (f ArrayOfLongsSerDe) SizeOfMany(mem []byte, offsetBytes int, numItems int) (int, error) { +func (f ItemSketchLongSerDe) SizeOfMany(mem []byte, offsetBytes int, numItems int) (int, error) { return numItems * 8, nil } -func (f ArrayOfLongsSerDe) SerializeOneToSlice(item int64) []byte { +func (f ItemSketchLongSerDe) SerializeOneToSlice(item int64) []byte { bytes := make([]byte, 8) binary.LittleEndian.PutUint64(bytes, uint64(item)) return bytes } -func (f ArrayOfLongsSerDe) SerializeManyToSlice(item []int64) []byte { +func (f ItemSketchLongSerDe) SerializeManyToSlice(item []int64) []byte { if len(item) == 0 { return []byte{} } @@ -68,7 +71,7 @@ func (f ArrayOfLongsSerDe) SerializeManyToSlice(item []int64) []byte { return bytes } -func (f ArrayOfLongsSerDe) DeserializeManyFromSlice(mem []byte, offsetBytes int, numItems int) ([]int64, error) { +func (f ItemSketchLongSerDe) DeserializeManyFromSlice(mem []byte, offsetBytes int, numItems int) ([]int64, error) { if numItems == 0 { return []int64{}, nil } diff --git a/common/array_of_strings_serde.go b/common/item_sketch_string.go similarity index 79% rename from common/array_of_strings_serde.go rename to common/item_sketch_string.go index a153f28..a6c9599 100644 --- a/common/array_of_strings_serde.go +++ b/common/item_sketch_string.go @@ -25,32 +25,34 @@ import ( "github.com/twmb/murmur3" ) -type ArrayOfStringsSerDe struct { +type ItemSketchStringComparator struct { + ReverseOrder bool } +type ItemSketchStringHasher struct{} +type ItemSketchStringSerDe struct{} -func (f ArrayOfStringsSerDe) Identity() string { - return "" +func (f ItemSketchStringComparator) CompareFn() CompareFn[string] { + return func(a, b string) bool { + if f.ReverseOrder { + return a > b + } + return a < b + } } -func (f ArrayOfStringsSerDe) Hash(item string) uint64 { +func (f ItemSketchStringHasher) Hash(item string) uint64 { datum := unsafe.Slice(unsafe.StringData(item), len(item)) return murmur3.SeedSum64(_DEFAULT_SERDE_HASH_SEED, datum[:]) } -func (f ArrayOfStringsSerDe) LessFn() LessFn[string] { - return func(a string, b string) bool { - return a < b - } -} - -func (f ArrayOfStringsSerDe) SizeOf(item string) int { +func (f ItemSketchStringSerDe) SizeOf(item string) int { if len(item) == 0 { return int(unsafe.Sizeof(uint32(0))) } return len(item) + int(unsafe.Sizeof(uint32(0))) } -func (f ArrayOfStringsSerDe) SizeOfMany(mem []byte, offsetBytes int, numItems int) (int, error) { +func (f ItemSketchStringSerDe) SizeOfMany(mem []byte, offsetBytes int, numItems int) (int, error) { if numItems <= 0 { return 0, nil } @@ -71,7 +73,7 @@ func (f ArrayOfStringsSerDe) SizeOfMany(mem []byte, offsetBytes int, numItems in return offset - offsetBytes, nil } -func (f ArrayOfStringsSerDe) SerializeOneToSlice(item string) []byte { +func (f ItemSketchStringSerDe) SerializeOneToSlice(item string) []byte { if len(item) == 0 { return []byte{} } @@ -82,7 +84,7 @@ func (f ArrayOfStringsSerDe) SerializeOneToSlice(item string) []byte { return bytesOut } -func (f ArrayOfStringsSerDe) SerializeManyToSlice(item []string) []byte { +func (f ItemSketchStringSerDe) SerializeManyToSlice(item []string) []byte { if len(item) == 0 { return []byte{} } @@ -105,7 +107,7 @@ func (f ArrayOfStringsSerDe) SerializeManyToSlice(item []string) []byte { return bytesOut } -func (f ArrayOfStringsSerDe) DeserializeManyFromSlice(mem []byte, offsetBytes int, numItems int) ([]string, error) { +func (f ItemSketchStringSerDe) DeserializeManyFromSlice(mem []byte, offsetBytes int, numItems int) ([]string, error) { if numItems <= 0 { return []string{}, nil } diff --git a/common/types.go b/common/types.go index cedc993..28df795 100644 --- a/common/types.go +++ b/common/types.go @@ -17,12 +17,17 @@ package common -type LessFn[C comparable] func(C, C) bool +type CompareFn[C comparable] func(C, C) bool -type ItemSketchOp[C comparable] interface { - Identity() C +type ItemSketchComparator[C comparable] interface { + CompareFn() CompareFn[C] +} + +type ItemSketchHasher[C comparable] interface { Hash(item C) uint64 - LessFn() LessFn[C] +} + +type ItemSketchSerde[C comparable] interface { SizeOf(item C) int SizeOfMany(mem []byte, offsetBytes int, numItems int) (int, error) SerializeManyToSlice(items []C) []byte diff --git a/examples/frequency_example_test.go b/examples/frequency_example_test.go index aae3b65..70362e3 100644 --- a/examples/frequency_example_test.go +++ b/examples/frequency_example_test.go @@ -26,7 +26,7 @@ import ( func TestFrequencyItemsSketch(t *testing.T) { // Create a new sketch with a maximum of 16 items - sketch, err := frequencies.NewFrequencyItemsSketchWithMaxMapSize[string](16, common.ArrayOfStringsSerDe{}) + sketch, err := frequencies.NewFrequencyItemsSketchWithMaxMapSize[string](16, common.ItemSketchStringHasher{}, common.ItemSketchStringSerDe{}) assert.NoError(t, err) // Update the sketch with some items diff --git a/examples/kll_example_test.go b/examples/kll_example_test.go index 80dcb00..272649c 100644 --- a/examples/kll_example_test.go +++ b/examples/kll_example_test.go @@ -26,8 +26,11 @@ import ( ) func TestKllItemsSketch(t *testing.T) { + // Create a comparison function for strings (or use common.ItemSketchStringComparator{}) + comparator := common.ItemSketchStringComparator{} + // Create a new KLL sketch - sketch, err := kll.NewKllItemsSketchWithDefault[string](common.ArrayOfStringsSerDe{}) + sketch, err := kll.NewKllItemsSketchWithDefault[string](comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) // Update the sketch with 1000 items diff --git a/frequencies/items_sketch.go b/frequencies/items_sketch.go index 2f912f0..511c041 100644 --- a/frequencies/items_sketch.go +++ b/frequencies/items_sketch.go @@ -61,10 +61,10 @@ type ItemsSketch[C comparable] struct { // Both the ultimate accuracy and size of this sketch are functions of lgMaxMapSize. // - lgCurMapSize, log2 of the starting (current) physical size of the internal hashFn // map managed by this sketch. -func NewFrequencyItemsSketch[C comparable](lgMaxMapSize int, lgCurMapSize int, operations common.ItemSketchOp[C]) (*ItemsSketch[C], error) { +func NewFrequencyItemsSketch[C comparable](lgMaxMapSize int, lgCurMapSize int, hasher common.ItemSketchHasher[C], serde common.ItemSketchSerde[C]) (*ItemsSketch[C], error) { lgMaxMapSz := max(lgMaxMapSize, _LG_MIN_MAP_SIZE) lgCurMapSz := max(lgCurMapSize, _LG_MIN_MAP_SIZE) - hashMap, err := newReversePurgeItemHashMap[C](1<<lgCurMapSz, operations) + hashMap, err := newReversePurgeItemHashMap[C](1<<lgCurMapSz, hasher, serde) if err != nil { return nil, err } @@ -89,12 +89,12 @@ func NewFrequencyItemsSketch[C comparable](lgMaxMapSize int, lgCurMapSize int, o // sketch and must be a power of 2. The maximum capacity of this internal hash map is // 0.75 times * maxMapSize. Both the ultimate accuracy and size of this sketch are // functions of maxMapSize. -func NewFrequencyItemsSketchWithMaxMapSize[C comparable](maxMapSize int, operations common.ItemSketchOp[C]) (*ItemsSketch[C], error) { +func NewFrequencyItemsSketchWithMaxMapSize[C comparable](maxMapSize int, hasher common.ItemSketchHasher[C], serde common.ItemSketchSerde[C]) (*ItemsSketch[C], error) { maxMapSz, err := internal.ExactLog2(maxMapSize) if err != nil { return nil, err } - return NewFrequencyItemsSketch[C](maxMapSz, _LG_MIN_MAP_SIZE, operations) + return NewFrequencyItemsSketch[C](maxMapSz, _LG_MIN_MAP_SIZE, hasher, serde) } // NewFrequencyItemsSketchFromSlice constructs a new ItemsSketch with the given maxMapSize and the @@ -104,7 +104,11 @@ func NewFrequencyItemsSketchWithMaxMapSize[C comparable](maxMapSize int, operati // sketch and must be a power of 2. The maximum capacity of this internal hash map is // 0.75 times * maxMapSize. Both the ultimate accuracy and size of this sketch are a // function of maxMapSize. -func NewFrequencyItemsSketchFromSlice[C comparable](slc []byte, operations common.ItemSketchOp[C]) (*ItemsSketch[C], error) { +func NewFrequencyItemsSketchFromSlice[C comparable](slc []byte, hasher common.ItemSketchHasher[C], serde common.ItemSketchSerde[C]) (*ItemsSketch[C], error) { + if serde == nil { + return nil, errors.New("no SerDe provided") + } + pre0, err := checkPreambleSize(slc) //make sure preamble will fit maxPreLongs := internal.FamilyEnum.Frequency.MaxPreLongs @@ -132,7 +136,7 @@ func NewFrequencyItemsSketchFromSlice[C comparable](slc []byte, operations commo return nil, fmt.Errorf("(preLongs == 1) ^ empty == true") } if empty { - return NewFrequencyItemsSketchWithMaxMapSize[C](1<<_LG_MIN_MAP_SIZE, operations) + return NewFrequencyItemsSketchWithMaxMapSize[C](1<<_LG_MIN_MAP_SIZE, hasher, serde) } // Get full preamble preArr := make([]int64, preLongs) @@ -140,7 +144,7 @@ func NewFrequencyItemsSketchFromSlice[C comparable](slc []byte, operations commo preArr[j] = int64(binary.LittleEndian.Uint64(slc[j<<3:])) } - fis, err := NewFrequencyItemsSketch[C](int(lgMaxMapSize), int(lgCurMapSize), operations) + fis, err := NewFrequencyItemsSketch[C](lgMaxMapSize, lgCurMapSize, hasher, serde) if err != nil { return nil, err } @@ -161,7 +165,7 @@ func NewFrequencyItemsSketchFromSlice[C comparable](slc []byte, operations commo } // Get itemArray itemsOffset := preBytes + (8 * activeItems) - itemArray, err := operations.DeserializeManyFromSlice(slc[itemsOffset:], 0, activeItems) + itemArray, err := serde.DeserializeManyFromSlice(slc[itemsOffset:], 0, activeItems) if err != nil { return nil, err } @@ -391,7 +395,10 @@ func (i *ItemsSketch[C]) ToString() (string, error) { } // ToSlice returns a slice representation of this sketch -func (i *ItemsSketch[C]) ToSlice() []byte { +func (i *ItemsSketch[C]) ToSlice() ([]byte, error) { + if i.hashMap.serde == nil { + return nil, errors.New("no SerDe provided") + } preLongs := 0 outBytes := 0 empty := i.IsEmpty() @@ -402,7 +409,7 @@ func (i *ItemsSketch[C]) ToSlice() []byte { outBytes = 8 } else { preLongs = internal.FamilyEnum.Frequency.MaxPreLongs - bytes = i.hashMap.operations.SerializeManyToSlice(i.hashMap.getActiveKeys()) + bytes = i.hashMap.serde.SerializeManyToSlice(i.hashMap.getActiveKeys()) outBytes = ((preLongs + activeItems) << 3) + len(bytes) } @@ -437,12 +444,12 @@ func (i *ItemsSketch[C]) ToSlice() []byte { } copy(outArr[preBytes+(activeItems<<3):], bytes) } - return outArr + return outArr, nil } // Reset resets this sketch to a virgin state. func (i *ItemsSketch[C]) Reset() error { - hashMap, err := newReversePurgeItemHashMap[C](1<<_LG_MIN_MAP_SIZE, i.hashMap.operations) + hashMap, err := newReversePurgeItemHashMap[C](1<<_LG_MIN_MAP_SIZE, i.hashMap.hasher, i.hashMap.serde) if err != nil { return err } diff --git a/frequencies/items_sketch_test.go b/frequencies/items_sketch_test.go index 6eba843..24407e2 100644 --- a/frequencies/items_sketch_test.go +++ b/frequencies/items_sketch_test.go @@ -26,8 +26,8 @@ import ( ) func TestEmpty(t *testing.T) { - h := common.ArrayOfStringsSerDe{} - sketch, err := NewFrequencyItemsSketchWithMaxMapSize[string](1<<_LG_MIN_MAP_SIZE, h) + h := common.ItemSketchStringHasher{} + sketch, err := NewFrequencyItemsSketchWithMaxMapSize[string](1<<_LG_MIN_MAP_SIZE, h, nil) assert.NoError(t, err) assert.True(t, sketch.IsEmpty()) assert.Equal(t, sketch.GetNumActiveItems(), 0) @@ -41,7 +41,7 @@ func TestEmpty(t *testing.T) { } func TestOneItem(t *testing.T) { - sketch, err := NewFrequencyItemsSketchWithMaxMapSize[string](1<<_LG_MIN_MAP_SIZE, common.ArrayOfStringsSerDe{}) + sketch, err := NewFrequencyItemsSketchWithMaxMapSize[string](1<<_LG_MIN_MAP_SIZE, common.ItemSketchStringHasher{}, nil) assert.NoError(t, err) err = sketch.Update("a") assert.NoError(t, err) @@ -57,7 +57,7 @@ func TestOneItem(t *testing.T) { } func TestSeveralItem(t *testing.T) { - sketch, err := NewFrequencyItemsSketchWithMaxMapSize[string](1<<_LG_MIN_MAP_SIZE, common.ArrayOfStringsSerDe{}) + sketch, err := NewFrequencyItemsSketchWithMaxMapSize[string](1<<_LG_MIN_MAP_SIZE, common.ItemSketchStringHasher{}, nil) assert.NoError(t, err) err = sketch.Update("a") assert.NoError(t, err) @@ -106,7 +106,7 @@ func TestSeveralItem(t *testing.T) { } func TestEstimationMode(t *testing.T) { - sketch, err := NewFrequencyItemsSketchWithMaxMapSize[int64](1<<_LG_MIN_MAP_SIZE, common.ArrayOfLongsSerDe{}) + sketch, err := NewFrequencyItemsSketchWithMaxMapSize[int64](1<<_LG_MIN_MAP_SIZE, common.ItemSketchLongHasher{}, nil) assert.NoError(t, err) err = sketch.UpdateMany(1, 10) assert.NoError(t, err) @@ -165,11 +165,22 @@ func TestEstimationMode(t *testing.T) { } } +func TestSerializeStringDeserializeNoSerde(t *testing.T) { + sketch1, err := NewFrequencyItemsSketchWithMaxMapSize[string](1<<_LG_MIN_MAP_SIZE, common.ItemSketchStringHasher{}, nil) + assert.NoError(t, err) + _, err = sketch1.ToSlice() + assert.Error(t, err) + + _, err = NewFrequencyItemsSketchFromSlice[string](nil, common.ItemSketchStringHasher{}, nil) + assert.Error(t, err) +} + func TestSerializeStringDeserializeEmpty(t *testing.T) { - sketch1, err := NewFrequencyItemsSketchWithMaxMapSize[string](1<<_LG_MIN_MAP_SIZE, common.ArrayOfStringsSerDe{}) + sketch1, err := NewFrequencyItemsSketchWithMaxMapSize[string](1<<_LG_MIN_MAP_SIZE, common.ItemSketchStringHasher{}, common.ItemSketchStringSerDe{}) assert.NoError(t, err) - bytes := sketch1.ToSlice() - sketch2, err := NewFrequencyItemsSketchFromSlice[string](bytes, common.ArrayOfStringsSerDe{}) + bytes, err := sketch1.ToSlice() + assert.NoError(t, err) + sketch2, err := NewFrequencyItemsSketchFromSlice[string](bytes, common.ItemSketchStringHasher{}, common.ItemSketchStringSerDe{}) assert.NoError(t, err) assert.True(t, sketch2.IsEmpty()) assert.Equal(t, sketch2.GetNumActiveItems(), 0) @@ -177,7 +188,7 @@ func TestSerializeStringDeserializeEmpty(t *testing.T) { } func TestSerializeDeserializeUtf8Strings(t *testing.T) { - sketch1, err := NewFrequencyItemsSketchWithMaxMapSize[string](1<<_LG_MIN_MAP_SIZE, common.ArrayOfStringsSerDe{}) + sketch1, err := NewFrequencyItemsSketchWithMaxMapSize[string](1<<_LG_MIN_MAP_SIZE, common.ItemSketchStringHasher{}, common.ItemSketchStringSerDe{}) assert.NoError(t, err) err = sketch1.Update("aaaaaaaaaaaaaaaaaaaaaaaaaaaaa") assert.NoError(t, err) @@ -188,8 +199,9 @@ func TestSerializeDeserializeUtf8Strings(t *testing.T) { err = sketch1.Update("ddddddddddddddddddddddddddddd") assert.NoError(t, err) - bytes := sketch1.ToSlice() - sketch2, err := NewFrequencyItemsSketchFromSlice[string](bytes, common.ArrayOfStringsSerDe{}) + bytes, err := sketch1.ToSlice() + assert.NoError(t, err) + sketch2, err := NewFrequencyItemsSketchFromSlice[string](bytes, common.ItemSketchStringHasher{}, common.ItemSketchStringSerDe{}) assert.NoError(t, err) err = sketch2.Update("bbbbbbbbbbbbbbbbbbbbbbbbbbbbb") assert.NoError(t, err) @@ -216,14 +228,15 @@ func TestSerializeDeserializeUtf8Strings(t *testing.T) { } func TestSerializeDeserializeLong(t *testing.T) { - sketch1, err := NewFrequencyItemsSketchWithMaxMapSize[int64](1<<_LG_MIN_MAP_SIZE, common.ArrayOfLongsSerDe{}) + sketch1, err := NewFrequencyItemsSketchWithMaxMapSize[int64](1<<_LG_MIN_MAP_SIZE, common.ItemSketchLongHasher{}, common.ItemSketchLongSerDe{}) sketch1.Update(1) sketch1.Update(2) sketch1.Update(3) sketch1.Update(4) - bytes := sketch1.ToSlice() - sketch2, err := NewFrequencyItemsSketchFromSlice[int64](bytes, common.ArrayOfLongsSerDe{}) + bytes, err := sketch1.ToSlice() + assert.NoError(t, err) + sketch2, err := NewFrequencyItemsSketchFromSlice[int64](bytes, common.ItemSketchLongHasher{}, common.ItemSketchLongSerDe{}) sketch2.Update(2) sketch2.Update(3) sketch2.Update(2) @@ -246,7 +259,7 @@ func TestSerializeDeserializeLong(t *testing.T) { } func TestResize(t *testing.T) { - sketch1, err := NewFrequencyItemsSketchWithMaxMapSize[string](2<<_LG_MIN_MAP_SIZE, common.ArrayOfStringsSerDe{}) + sketch1, err := NewFrequencyItemsSketchWithMaxMapSize[string](2<<_LG_MIN_MAP_SIZE, common.ItemSketchStringHasher{}, nil) for i := 0; i < 32; i++ { err = sketch1.UpdateMany(strconv.Itoa(i), int64(i*i)) assert.NoError(t, err) @@ -254,7 +267,7 @@ func TestResize(t *testing.T) { } func TestMergeExact(t *testing.T) { - sketch1, err := NewFrequencyItemsSketchWithMaxMapSize[string](1<<_LG_MIN_MAP_SIZE, common.ArrayOfStringsSerDe{}) + sketch1, err := NewFrequencyItemsSketchWithMaxMapSize[string](1<<_LG_MIN_MAP_SIZE, common.ItemSketchStringHasher{}, nil) assert.NoError(t, err) err = sketch1.Update("a") assert.NoError(t, err) @@ -265,7 +278,7 @@ func TestMergeExact(t *testing.T) { err = sketch1.Update("d") assert.NoError(t, err) - sketch2, err := NewFrequencyItemsSketchWithMaxMapSize[string](1<<_LG_MIN_MAP_SIZE, common.ArrayOfStringsSerDe{}) + sketch2, err := NewFrequencyItemsSketchWithMaxMapSize[string](1<<_LG_MIN_MAP_SIZE, common.ItemSketchStringHasher{}, nil) assert.NoError(t, err) err = sketch2.Update("b") assert.NoError(t, err) @@ -294,20 +307,20 @@ func TestMergeExact(t *testing.T) { } func TestNullMapReturns(t *testing.T) { - map1, err := newReversePurgeItemHashMap[int64](1<<_LG_MIN_MAP_SIZE, common.ArrayOfLongsSerDe{}) + map1, err := newReversePurgeItemHashMap[int64](1<<_LG_MIN_MAP_SIZE, common.ItemSketchLongHasher{}, nil) assert.NoError(t, err) assert.Nil(t, map1.getActiveKeys()) assert.Nil(t, map1.getActiveValues()) } func TestMisc(t *testing.T) { - sk1, err := NewFrequencyItemsSketchWithMaxMapSize[int64](1<<_LG_MIN_MAP_SIZE, common.ArrayOfLongsSerDe{}) + sk1, err := NewFrequencyItemsSketchWithMaxMapSize[int64](1<<_LG_MIN_MAP_SIZE, common.ItemSketchLongHasher{}, nil) assert.NoError(t, err) assert.Equal(t, sk1.GetCurrentMapCapacity(), 6) est, err := sk1.GetEstimate(1) assert.NoError(t, err) assert.Equal(t, est, int64(0)) - sk2, err := NewFrequencyItemsSketchWithMaxMapSize[int64](8, common.ArrayOfLongsSerDe{}) + sk2, err := NewFrequencyItemsSketchWithMaxMapSize[int64](8, common.ItemSketchLongHasher{}, nil) assert.NoError(t, err) _, err = sk1.Merge(sk2) assert.NoError(t, err) @@ -328,14 +341,14 @@ func TestMisc(t *testing.T) { } func TestToString(t *testing.T) { - sk, err := NewFrequencyItemsSketchWithMaxMapSize[int64](1<<_LG_MIN_MAP_SIZE, common.ArrayOfLongsSerDe{}) + sk, err := NewFrequencyItemsSketchWithMaxMapSize[int64](1<<_LG_MIN_MAP_SIZE, common.ItemSketchLongHasher{}, nil) assert.NoError(t, err) err = sk.Update(1) t.Log(sk.ToString()) } func TestFrequentItems1(t *testing.T) { - fis, err := NewFrequencyItemsSketchWithMaxMapSize[int64](1<<_LG_MIN_MAP_SIZE, common.ArrayOfLongsSerDe{}) + fis, err := NewFrequencyItemsSketchWithMaxMapSize[int64](1<<_LG_MIN_MAP_SIZE, common.ItemSketchLongHasher{}, nil) assert.NoError(t, err) fis.Update(1) rows, err := fis.GetFrequentItems(ErrorTypeEnum.NoFalsePositives) @@ -352,17 +365,18 @@ func TestFrequentItems1(t *testing.T) { } func TestUpdateExceptions(t *testing.T) { - sk1, err := NewFrequencyItemsSketchWithMaxMapSize[int64](1<<_LG_MIN_MAP_SIZE, common.ArrayOfLongsSerDe{}) + sk1, err := NewFrequencyItemsSketchWithMaxMapSize[int64](1<<_LG_MIN_MAP_SIZE, common.ItemSketchLongHasher{}, nil) assert.NoError(t, err) err = sk1.UpdateMany(1, -1) assert.Error(t, err) } func TestMemExceptions(t *testing.T) { - sk1, err := NewFrequencyItemsSketchWithMaxMapSize[int64](1<<_LG_MIN_MAP_SIZE, common.ArrayOfLongsSerDe{}) + sk1, err := NewFrequencyItemsSketchWithMaxMapSize[int64](1<<_LG_MIN_MAP_SIZE, common.ItemSketchLongHasher{}, common.ItemSketchLongSerDe{}) assert.NoError(t, err) sk1.Update(1) - bytes := sk1.ToSlice() + bytes, err := sk1.ToSlice() + assert.NoError(t, err) pre0 := binary.LittleEndian.Uint64(bytes) //Now start corrupting tryBadMem(t, bytes, _PREAMBLE_LONGS_BYTE, 2) //Corrupt @@ -379,7 +393,7 @@ func TestMemExceptions(t *testing.T) { } func TestOneItemUtf8(t *testing.T) { - sketch1, err := NewFrequencyItemsSketchWithMaxMapSize[string](1<<_LG_MIN_MAP_SIZE, common.ArrayOfStringsSerDe{}) + sketch1, err := NewFrequencyItemsSketchWithMaxMapSize[string](1<<_LG_MIN_MAP_SIZE, common.ItemSketchStringHasher{}, common.ItemSketchStringSerDe{}) assert.NoError(t, err) err = sketch1.Update("\u5fb5") assert.NoError(t, err) @@ -390,8 +404,9 @@ func TestOneItemUtf8(t *testing.T) { assert.NoError(t, err) assert.Equal(t, est, int64(1)) - bytes := sketch1.ToSlice() - sketch2, err := NewFrequencyItemsSketchFromSlice[string](bytes, common.ArrayOfStringsSerDe{}) + bytes, err := sketch1.ToSlice() + assert.NoError(t, err) + sketch2, err := NewFrequencyItemsSketchFromSlice[string](bytes, common.ItemSketchStringHasher{}, common.ItemSketchStringSerDe{}) assert.NoError(t, err) assert.False(t, sketch2.IsEmpty()) assert.Equal(t, sketch2.GetNumActiveItems(), 1) @@ -418,7 +433,7 @@ func TestItemGetAprioriError(t *testing.T) { } func BenchmarkItemSketch(b *testing.B) { - sketch, err := NewFrequencyItemsSketch[int64](128, 8, common.ArrayOfLongsSerDe{}) + sketch, err := NewFrequencyItemsSketch[int64](128, 8, common.ItemSketchLongHasher{}, nil) assert.NoError(b, err) for i := 0; i < b.N; i++ { sketch.Update(int64(i)) diff --git a/frequencies/reverse_purge_item_hash_map.go b/frequencies/reverse_purge_item_hash_map.go index 8453da9..8d6dad6 100644 --- a/frequencies/reverse_purge_item_hash_map.go +++ b/frequencies/reverse_purge_item_hash_map.go @@ -32,7 +32,8 @@ type reversePurgeItemHashMap[C comparable] struct { values []int64 states []int16 numActive int - operations common.ItemSketchOp[C] + hasher common.ItemSketchHasher[C] + serde common.ItemSketchSerde[C] } type iteratorItemHashMap[C comparable] struct { @@ -58,7 +59,7 @@ const ( // - mapSize, This determines the number of cells in the arrays underlying the // HashMap implementation and must be a power of 2. // The hashFn table will be expected to store reversePurgeItemHashMapLoadFactor * mapSize (key, value) pairs. -func newReversePurgeItemHashMap[C comparable](mapSize int, operations common.ItemSketchOp[C]) (*reversePurgeItemHashMap[C], error) { +func newReversePurgeItemHashMap[C comparable](mapSize int, hasher common.ItemSketchHasher[C], serde common.ItemSketchSerde[C]) (*reversePurgeItemHashMap[C], error) { lgLength, err := internal.ExactLog2(mapSize) if err != nil { return nil, err @@ -70,7 +71,8 @@ func newReversePurgeItemHashMap[C comparable](mapSize int, operations common.Ite make([]int64, mapSize), make([]int16, mapSize), 0, - operations, + hasher, + serde, }, nil } @@ -103,7 +105,7 @@ func (r *reversePurgeItemHashMap[C]) getCapacity() int { func (r *reversePurgeItemHashMap[C]) adjustOrPutValue(key C, adjustAmount int64) error { var ( arrayMask = len(r.keys) - 1 - probe = r.operations.Hash(key) & uint64(arrayMask) + probe = r.hasher.Hash(key) & uint64(arrayMask) drift = 1 ) @@ -279,7 +281,7 @@ func (r *reversePurgeItemHashMap[C]) iterator() *iteratorItemHashMap[C] { func (r *reversePurgeItemHashMap[C]) hashProbe(key C) int { arrayMask := uint64(len(r.keys) - 1) - probe := r.operations.Hash(key) & arrayMask + probe := r.hasher.Hash(key) & arrayMask for r.states[probe] > 0 && r.keys[probe] != key { probe = (probe + 1) & arrayMask } diff --git a/frequencies/serde_compat_test.go b/frequencies/serde_compat_test.go index f32b635..a0db94e 100644 --- a/frequencies/serde_compat_test.go +++ b/frequencies/serde_compat_test.go @@ -24,14 +24,15 @@ import ( ) func TestItemsToLongs(t *testing.T) { - sketch1, err := NewFrequencyItemsSketchWithMaxMapSize[int64](8, common.ArrayOfLongsSerDe{}) + sketch1, err := NewFrequencyItemsSketchWithMaxMapSize[int64](8, common.ItemSketchLongHasher{}, common.ItemSketchLongSerDe{}) assert.NoError(t, err) sketch1.Update(1) sketch1.Update(2) sketch1.Update(3) sketch1.Update(4) - bytes := sketch1.ToSlice() + bytes, err := sketch1.ToSlice() + assert.NoError(t, err) sketch2, err := NewLongsSketchFromSlice(bytes) assert.NoError(t, err) sketch2.Update(2) @@ -64,7 +65,7 @@ func TestLongToItems(t *testing.T) { sketch1.Update(4) bytes := sketch1.ToSlice() - sketch2, err := NewFrequencyItemsSketchFromSlice[int64](bytes, common.ArrayOfLongsSerDe{}) + sketch2, err := NewFrequencyItemsSketchFromSlice[int64](bytes, common.ItemSketchLongHasher{}, common.ItemSketchLongSerDe{}) assert.NoError(t, err) sketch2.Update(2) sketch2.Update(3) diff --git a/frequencies/sketch_serialization_test.go b/frequencies/sketch_serialization_test.go index 3967670..12062af 100644 --- a/frequencies/sketch_serialization_test.go +++ b/frequencies/sketch_serialization_test.go @@ -66,7 +66,7 @@ func TestGenerateGoBinariesForCompatibilityTestingLongsSketch(t *testing.T) { t.Run("String Frequency", func(t *testing.T) { nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} for _, n := range nArr { - sk, err := NewFrequencyItemsSketchWithMaxMapSize[string](64, common.ArrayOfStringsSerDe{}) + sk, err := NewFrequencyItemsSketchWithMaxMapSize[string](64, common.ItemSketchStringHasher{}, common.ItemSketchStringSerDe{}) assert.NoError(t, err) for i := 1; i <= n; i++ { err = sk.Update(strconv.Itoa(i)) @@ -85,7 +85,7 @@ func TestGenerateGoBinariesForCompatibilityTestingLongsSketch(t *testing.T) { err = os.MkdirAll(internal.GoPath, os.ModePerm) assert.NoError(t, err) - slc := sk.ToSlice() + slc, err := sk.ToSlice() err = os.WriteFile(fmt.Sprintf("%s/frequent_string_n%d_go.sk", internal.GoPath, n), slc, 0644) if err != nil { t.Errorf("err != nil") @@ -94,7 +94,7 @@ func TestGenerateGoBinariesForCompatibilityTestingLongsSketch(t *testing.T) { }) t.Run("String ut8", func(t *testing.T) { - sk, err := NewFrequencyItemsSketchWithMaxMapSize[string](64, common.ArrayOfStringsSerDe{}) + sk, err := NewFrequencyItemsSketchWithMaxMapSize[string](64, common.ItemSketchStringHasher{}, common.ItemSketchStringSerDe{}) assert.NoError(t, err) assert.NoError(t, sk.UpdateMany("абвгд", 1)) @@ -108,7 +108,7 @@ func TestGenerateGoBinariesForCompatibilityTestingLongsSketch(t *testing.T) { err = os.MkdirAll(internal.GoPath, os.ModePerm) assert.NoError(t, err) - slc := sk.ToSlice() + slc, err := sk.ToSlice() err = os.WriteFile(fmt.Sprintf("%s/frequent_string_utf8_go.sk", internal.GoPath), slc, 0644) if err != nil { t.Errorf("err != nil") @@ -116,7 +116,7 @@ func TestGenerateGoBinariesForCompatibilityTestingLongsSketch(t *testing.T) { }) t.Run("String ascii", func(t *testing.T) { - sk, err := NewFrequencyItemsSketchWithMaxMapSize[string](64, common.ArrayOfStringsSerDe{}) + sk, err := NewFrequencyItemsSketchWithMaxMapSize[string](64, common.ItemSketchStringHasher{}, common.ItemSketchStringSerDe{}) assert.NoError(t, err) assert.NoError(t, sk.UpdateMany("aaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1)) @@ -127,7 +127,7 @@ func TestGenerateGoBinariesForCompatibilityTestingLongsSketch(t *testing.T) { err = os.MkdirAll(internal.GoPath, os.ModePerm) assert.NoError(t, err) - slc := sk.ToSlice() + slc, err := sk.ToSlice() err = os.WriteFile(fmt.Sprintf("%s/frequent_string_ascii_go.sk", internal.GoPath), slc, 0644) if err != nil { t.Errorf("err != nil") @@ -166,7 +166,7 @@ func TestJavaCompat(t *testing.T) { for _, n := range nArr { bytes, err := os.ReadFile(fmt.Sprintf("%s/frequent_string_n%d_java.sk", internal.JavaPath, n)) assert.NoError(t, err) - sketch, err := NewFrequencyItemsSketchFromSlice[string](bytes, common.ArrayOfStringsSerDe{}) + sketch, err := NewFrequencyItemsSketchFromSlice[string](bytes, common.ItemSketchStringHasher{}, common.ItemSketchStringSerDe{}) if err != nil { return } @@ -188,7 +188,7 @@ func TestJavaCompat(t *testing.T) { t.Run("String utf8", func(t *testing.T) { bytes, err := os.ReadFile(fmt.Sprintf("%s/frequent_string_utf8_java.sk", internal.JavaPath)) assert.NoError(t, err) - sketch, err := NewFrequencyItemsSketchFromSlice[string](bytes, common.ArrayOfStringsSerDe{}) + sketch, err := NewFrequencyItemsSketchFromSlice[string](bytes, common.ItemSketchStringHasher{}, common.ItemSketchStringSerDe{}) if err != nil { return } @@ -221,7 +221,7 @@ func TestJavaCompat(t *testing.T) { t.Run("String ascii", func(t *testing.T) { bytes, err := os.ReadFile(fmt.Sprintf("%s/frequent_string_ascii_java.sk", internal.JavaPath)) assert.NoError(t, err) - sketch, err := NewFrequencyItemsSketchFromSlice[string](bytes, common.ArrayOfStringsSerDe{}) + sketch, err := NewFrequencyItemsSketchFromSlice[string](bytes, common.ItemSketchStringHasher{}, common.ItemSketchStringSerDe{}) if err != nil { return } @@ -273,7 +273,7 @@ func TestCppCompat(t *testing.T) { for _, n := range nArr { bytes, err := os.ReadFile(fmt.Sprintf("%s/frequent_string_n%d_cpp.sk", internal.CppPath, n)) assert.NoError(t, err) - sketch, err := NewFrequencyItemsSketchFromSlice[string](bytes, common.ArrayOfStringsSerDe{}) + sketch, err := NewFrequencyItemsSketchFromSlice[string](bytes, common.ItemSketchStringHasher{}, common.ItemSketchStringSerDe{}) if err != nil { return } @@ -295,7 +295,7 @@ func TestCppCompat(t *testing.T) { t.Run("String utf8", func(t *testing.T) { bytes, err := os.ReadFile(fmt.Sprintf("%s/frequent_string_utf8_cpp.sk", internal.CppPath)) assert.NoError(t, err) - sketch, err := NewFrequencyItemsSketchFromSlice[string](bytes, common.ArrayOfStringsSerDe{}) + sketch, err := NewFrequencyItemsSketchFromSlice[string](bytes, common.ItemSketchStringHasher{}, common.ItemSketchStringSerDe{}) if err != nil { return } @@ -328,7 +328,7 @@ func TestCppCompat(t *testing.T) { t.Run("String ascii", func(t *testing.T) { bytes, err := os.ReadFile(fmt.Sprintf("%s/frequent_string_ascii_cpp.sk", internal.CppPath)) assert.NoError(t, err) - sketch, err := NewFrequencyItemsSketchFromSlice[string](bytes, common.ArrayOfStringsSerDe{}) + sketch, err := NewFrequencyItemsSketchFromSlice[string](bytes, common.ItemSketchStringHasher{}, common.ItemSketchStringSerDe{}) if err != nil { return } diff --git a/internal/generic_inequality_search.go b/internal/generic_inequality_search.go index a9a0379..b43f453 100644 --- a/internal/generic_inequality_search.go +++ b/internal/generic_inequality_search.go @@ -30,7 +30,7 @@ const ( InequalityGT ) -func FindWithInequality[C comparable](arr []C, low int, high int, v C, crit Inequality, lessFn common.LessFn[C]) int { +func FindWithInequality[C comparable](arr []C, low int, high int, v C, crit Inequality, comparator common.CompareFn[C]) int { if len(arr) == 0 { return -1 } @@ -38,35 +38,35 @@ func FindWithInequality[C comparable](arr []C, low int, high int, v C, crit Ineq hi := high for lo <= hi { if hi-lo <= 1 { - return resolve(arr, lo, hi, v, crit, lessFn) + return resolve(arr, lo, hi, v, crit, comparator) } mid := lo + (hi-lo)/2 - ret := compare(arr, mid, mid+1, v, crit, lessFn) + ret := compare(arr, mid, mid+1, v, crit, comparator) if ret == -1 { hi = mid } else if ret == 1 { lo = mid + 1 } else { - return getIndex(arr, mid, mid+1, v, crit, lessFn) + return getIndex(arr, mid, mid+1, v, crit, comparator) } } return -1 } -func resolve[C comparable](arr []C, lo int, hi int, v C, crit Inequality, lessFn common.LessFn[C]) int { +func resolve[C comparable](arr []C, lo int, hi int, v C, crit Inequality, compareFn common.CompareFn[C]) int { result := 0 switch crit { case InequalityLT: if lo == hi { - if lessFn(v, arr[hi]) == false && v != arr[hi] { + if compareFn(v, arr[hi]) == false && v != arr[hi] { result = lo } else { result = -1 } } else { - if lessFn(v, arr[hi]) == false && v != arr[hi] { + if compareFn(v, arr[hi]) == false && v != arr[hi] { result = hi - } else if lessFn(v, arr[lo]) == false && v != arr[lo] { + } else if compareFn(v, arr[lo]) == false && v != arr[lo] { result = lo } else { result = -1 @@ -74,15 +74,15 @@ func resolve[C comparable](arr []C, lo int, hi int, v C, crit Inequality, lessFn } case InequalityLE: if lo == hi { - if lessFn(v, arr[lo]) == false { + if compareFn(v, arr[lo]) == false { result = lo } else { result = -1 } } else { - if lessFn(v, arr[hi]) == false { + if compareFn(v, arr[hi]) == false { result = hi - } else if lessFn(v, arr[lo]) == false { + } else if compareFn(v, arr[lo]) == false { result = lo } else { result = -1 @@ -91,15 +91,15 @@ func resolve[C comparable](arr []C, lo int, hi int, v C, crit Inequality, lessFn case InequalityGE: if lo == hi { - if lessFn(v, arr[lo]) || v == arr[lo] { + if compareFn(v, arr[lo]) || v == arr[lo] { result = lo } else { result = -1 } } else { - if lessFn(v, arr[lo]) || v == arr[lo] { + if compareFn(v, arr[lo]) || v == arr[lo] { result = lo - } else if lessFn(v, arr[hi]) || v == arr[hi] { + } else if compareFn(v, arr[hi]) || v == arr[hi] { result = hi } else { result = -1 @@ -107,15 +107,15 @@ func resolve[C comparable](arr []C, lo int, hi int, v C, crit Inequality, lessFn } case InequalityGT: if lo == hi { - if lessFn(v, arr[lo]) { + if compareFn(v, arr[lo]) { result = lo } else { result = -1 } } else { - if lessFn(v, arr[lo]) { + if compareFn(v, arr[lo]) { result = lo - } else if lessFn(v, arr[hi]) { + } else if compareFn(v, arr[hi]) { result = hi } else { result = -1 @@ -128,21 +128,21 @@ func resolve[C comparable](arr []C, lo int, hi int, v C, crit Inequality, lessFn return result } -func compare[C comparable](arr []C, a int, b int, v C, crit Inequality, lessFn common.LessFn[C]) int { +func compare[C comparable](arr []C, a int, b int, v C, crit Inequality, compareFn common.CompareFn[C]) int { result := 0 switch crit { case InequalityLT, InequalityGE: - if lessFn(v, arr[a]) || arr[a] == v { + if compareFn(v, arr[a]) || arr[a] == v { result = -1 - } else if lessFn(arr[b], v) { + } else if compareFn(arr[b], v) { result = 1 } else { result = 0 } case InequalityLE, InequalityGT: - if lessFn(v, arr[a]) { + if compareFn(v, arr[a]) { result = -1 - } else if lessFn(arr[b], v) || arr[b] == v { + } else if compareFn(arr[b], v) || arr[b] == v { result = 1 } else { result = 0 @@ -153,7 +153,7 @@ func compare[C comparable](arr []C, a int, b int, v C, crit Inequality, lessFn c return result } -func getIndex[C comparable](arr []C, a int, b int, v C, crit Inequality, lessFn common.LessFn[C]) int { +func getIndex[C comparable](arr []C, a int, b int, v C, crit Inequality, compareFn common.CompareFn[C]) int { result := 0 switch crit { case InequalityLT, InequalityLE: diff --git a/kll/items_sketch.go b/kll/items_sketch.go index 3cce1b9..8182a7b 100644 --- a/kll/items_sketch.go +++ b/kll/items_sketch.go @@ -50,7 +50,8 @@ type ItemsSketch[C comparable] struct { minItem *C maxItem *C sortedView *ItemsSketchSortedView[C] - itemsSketchOp common.ItemSketchOp[C] + serde common.ItemSketchSerde[C] + compareFn common.CompareFn[C] // Force deterministic offset for test, so that we can compare results across implementation. deterministicOffsetForTest bool @@ -79,31 +80,42 @@ var ( // NewKllItemsSketch create a new ItemsSketch with the given k and m. // The default k = 200 results in a normalized rank error of about 1.65%. // Larger K will have smaller error but the sketch will be larger (and slower). -func NewKllItemsSketch[C comparable](k uint16, m uint8, itemsSketchOp common.ItemSketchOp[C]) (*ItemsSketch[C], error) { +func NewKllItemsSketch[C comparable](k uint16, m uint8, compareFn common.CompareFn[C], serde common.ItemSketchSerde[C]) (*ItemsSketch[C], error) { if k < _MIN_K || k > _MAX_K { return nil, fmt.Errorf("k must be >= %d and <= %d: %d", _MIN_K, _MAX_K, k) } + if compareFn == nil { + return nil, fmt.Errorf("no compare function provided") + } return &ItemsSketch[C]{ - k: k, - m: m, - minK: k, - numLevels: uint8(1), - levels: []uint32{uint32(k), uint32(k)}, - items: make([]C, k), - itemsSketchOp: itemsSketchOp, + k: k, + m: m, + minK: k, + numLevels: uint8(1), + levels: []uint32{uint32(k), uint32(k)}, + items: make([]C, k), + serde: serde, + compareFn: compareFn, }, nil } // NewKllItemsSketchWithDefault create a new ItemsSketch with default k and m. // The default k = 200 results in a normalized rank error of about 1.65%. -func NewKllItemsSketchWithDefault[C comparable](itemsSketchOp common.ItemSketchOp[C]) (*ItemsSketch[C], error) { - return NewKllItemsSketch[C](_DEFAULT_K, _DEFAULT_M, itemsSketchOp) +func NewKllItemsSketchWithDefault[C comparable](compareFn common.CompareFn[C], serde common.ItemSketchSerde[C]) (*ItemsSketch[C], error) { + return NewKllItemsSketch[C](_DEFAULT_K, _DEFAULT_M, compareFn, serde) } // NewKllItemsSketchFromSlice create a new ItemsSketch from the given byte slice (serialized sketch). -func NewKllItemsSketchFromSlice[C comparable](sl []byte, itemsSketchOp common.ItemSketchOp[C]) (*ItemsSketch[C], error) { +func NewKllItemsSketchFromSlice[C comparable](sl []byte, compareFn common.CompareFn[C], serde common.ItemSketchSerde[C]) (*ItemsSketch[C], error) { + if serde == nil { + return nil, fmt.Errorf("no SerDe provided") + } + + if compareFn == nil { + return nil, fmt.Errorf("no compare function provided") + } - memVal, err := newItemsSketchMemoryValidate(sl, itemsSketchOp) + memVal, err := newItemsSketchMemoryValidate(sl, serde) if err != nil { return nil, err } @@ -127,7 +139,7 @@ func NewKllItemsSketchFromSlice[C comparable](sl []byte, itemsSketchOp common.It items = make([]C, k) case _COMPACT_SINGLE: offset := _N_LONG_ADR - deserItems, err := itemsSketchOp.DeserializeManyFromSlice(sl, offset, 1) + deserItems, err := serde.DeserializeManyFromSlice(sl, offset, 1) if err != nil { return nil, err } @@ -137,20 +149,20 @@ func NewKllItemsSketchFromSlice[C comparable](sl []byte, itemsSketchOp common.It items[k-1] = deserItems[0] case _COMPACT_FULL: offset := int(_DATA_START_ADR + memVal.numLevels*4) - deserMinItems, err := itemsSketchOp.DeserializeManyFromSlice(sl, offset, 1) + deserMinItems, err := serde.DeserializeManyFromSlice(sl, offset, 1) minItem = &deserMinItems[0] if err != nil { return nil, err } - offset += itemsSketchOp.SizeOf(*minItem) - deserMaxItems, err := itemsSketchOp.DeserializeManyFromSlice(sl, offset, 1) + offset += serde.SizeOf(*minItem) + deserMaxItems, err := serde.DeserializeManyFromSlice(sl, offset, 1) maxItem = &deserMaxItems[0] if err != nil { return nil, err } - offset += itemsSketchOp.SizeOf(*maxItem) + offset += serde.SizeOf(*maxItem) numRetained := levelsArr[memVal.numLevels] - levelsArr[0] - deseRetItems, err := itemsSketchOp.DeserializeManyFromSlice(sl, offset, int(numRetained)) + deseRetItems, err := serde.DeserializeManyFromSlice(sl, offset, int(numRetained)) if err != nil { return nil, err } @@ -170,7 +182,8 @@ func NewKllItemsSketchFromSlice[C comparable](sl []byte, itemsSketchOp common.It items: items, minItem: minItem, maxItem: maxItem, - itemsSketchOp: itemsSketchOp, + serde: serde, + compareFn: compareFn, }, nil } @@ -197,7 +210,7 @@ func (s *ItemsSketch[C]) GetNumRetained() uint32 { // GetMinItem returns the minimum item of the stream. This may be distinct from the smallest item retained by the sketch algorithm. func (s *ItemsSketch[C]) GetMinItem() (C, error) { if s.IsEmpty() { - return s.itemsSketchOp.Identity(), fmt.Errorf("operation is undefined for an empty sketch") + return *new(C), fmt.Errorf("operation is undefined for an empty sketch") } return *s.minItem, nil } @@ -205,7 +218,7 @@ func (s *ItemsSketch[C]) GetMinItem() (C, error) { // GetMaxItem returns the maximum item of the stream. This may be distinct from the largest item retained by the sketch algorithm. func (s *ItemsSketch[C]) GetMaxItem() (C, error) { if s.IsEmpty() { - return s.itemsSketchOp.Identity(), fmt.Errorf("operation is undefined for an empty sketch") + return *new(C), fmt.Errorf("operation is undefined for an empty sketch") } return *s.maxItem, nil } @@ -265,14 +278,14 @@ func (s *ItemsSketch[C]) GetRanks(item []C, inclusive bool) ([]float64, error) { // If EXCLUSIVE, the given rank includes all quantiles < the quantile directly corresponding to the given rank. func (s *ItemsSketch[C]) GetQuantile(rank float64, inclusive bool) (C, error) { if s.IsEmpty() { - return s.itemsSketchOp.Identity(), fmt.Errorf("operation is undefined for an empty sketch") + return *new(C), fmt.Errorf("operation is undefined for an empty sketch") } if rank < 0.0 || rank > 1.0 { - return s.itemsSketchOp.Identity(), fmt.Errorf("normalized rank cannot be less than zero or greater than 1.0: %f", rank) + return *new(C), fmt.Errorf("normalized rank cannot be less than zero or greater than 1.0: %f", rank) } err := s.setupSortedView() if err != nil { - return s.itemsSketchOp.Identity(), err + return *new(C), err } return s.sortedView.GetQuantile(rank, inclusive) } @@ -430,7 +443,7 @@ func (s *ItemsSketch[C]) GetSortedView() (*ItemsSketchSortedView[C], error) { // Update this sketch with the given item. func (s *ItemsSketch[C]) Update(item C) { - s.updateItem(item, s.itemsSketchOp.LessFn()) + s.updateItem(item, s.compareFn) s.sortedView = nil } @@ -457,6 +470,10 @@ func (s *ItemsSketch[C]) Reset() { // ToSlice returns the serialized byte array of this sketch. func (s *ItemsSketch[C]) ToSlice() ([]byte, error) { + if s.serde == nil { + return nil, fmt.Errorf("no SerDe provided") + } + srcN := s.n var tgtStructure = _COMPACT_FULL if srcN == 0 { @@ -531,6 +548,9 @@ func (s *ItemsSketch[C]) ToSlice() ([]byte, error) { // GetSerializedSizeBytes Returns the current number of bytes this Sketch would require if serialized in compact form. func (s *ItemsSketch[C]) GetSerializedSizeBytes() (int, error) { + if s.serde == nil { + return 0, fmt.Errorf("no SerDe provided") + } return s.currentSerializedSizeBytes() } @@ -595,12 +615,12 @@ func (s *ItemsSketch[C]) getLevelsArrSizeBytes(structure sketchStructure) int { } func (s *ItemsSketch[C]) getMinMaxSizeBytes() int { - return s.itemsSketchOp.SizeOf(*s.minItem) + s.itemsSketchOp.SizeOf(*s.maxItem) + return s.serde.SizeOf(*s.minItem) + s.serde.SizeOf(*s.maxItem) } func (s *ItemsSketch[C]) getMinMaxByteArr() []byte { - minBytes := s.itemsSketchOp.SerializeOneToSlice(*s.minItem) - maxBytes := s.itemsSketchOp.SerializeOneToSlice(*s.maxItem) + minBytes := s.serde.SerializeOneToSlice(*s.minItem) + maxBytes := s.serde.SerializeOneToSlice(*s.maxItem) minMaxBytes := make([]byte, len(minBytes)+len(maxBytes)) copy(minMaxBytes, minBytes) copy(minMaxBytes[len(minBytes):], maxBytes) @@ -612,7 +632,7 @@ func (s *ItemsSketch[C]) getSingleItemSizeBytes() (int, error) { if err != nil { return 0, err } - return s.itemsSketchOp.SizeOf(v), nil + return s.serde.SizeOf(v), nil } func (s *ItemsSketch[C]) getSingleItemByteArr() ([]byte, error) { @@ -620,12 +640,12 @@ func (s *ItemsSketch[C]) getSingleItemByteArr() ([]byte, error) { if err != nil { return nil, err } - return s.itemsSketchOp.SerializeOneToSlice(v), nil + return s.serde.SerializeOneToSlice(v), nil } func (s *ItemsSketch[C]) getSingleItem() (C, error) { if s.n != 1 { - return s.itemsSketchOp.Identity(), fmt.Errorf("sketch must have exactly one item") + return *new(C), fmt.Errorf("sketch must have exactly one item") } return s.items[s.k-1], nil } @@ -639,7 +659,7 @@ func (s *ItemsSketch[C]) getRetainedItemsArray() []C { func (s *ItemsSketch[C]) getRetainedItemsByteArr() []byte { retArr := s.getRetainedItemsArray() - return s.itemsSketchOp.SerializeManyToSlice(retArr) + return s.serde.SerializeManyToSlice(retArr) } func (s *ItemsSketch[C]) getRetainedItemsSizeBytes() int { @@ -657,7 +677,7 @@ func (s *ItemsSketch[C]) setupSortedView() error { return nil } -func (s *ItemsSketch[C]) updateItem(item C, lessFn common.LessFn[C]) { +func (s *ItemsSketch[C]) updateItem(item C, compareFn common.CompareFn[C]) { if internal.IsNil(item) { return } @@ -665,10 +685,10 @@ func (s *ItemsSketch[C]) updateItem(item C, lessFn common.LessFn[C]) { s.minItem = &item s.maxItem = &item } else { - if lessFn(item, *s.minItem) { + if compareFn(item, *s.minItem) { s.minItem = &item } - if lessFn(*s.maxItem, item) { + if compareFn(*s.maxItem, item) { s.maxItem = &item } } @@ -713,7 +733,7 @@ func (s *ItemsSketch[C]) mergeItemsSketch(other *ItemsSketch[C]) { // MERGE: update this sketch with level0 items from the other sketch otherItemsArr = other.GetTotalItemsArray() for i := otherLevelsArr[0]; i < otherLevelsArr[1]; i++ { - s.updateItem(otherItemsArr[i], s.itemsSketchOp.LessFn()) + s.updateItem(otherItemsArr[i], s.compareFn) } // After the level 0 update, we capture the intermediate state of levels and items arrays... @@ -738,10 +758,10 @@ func (s *ItemsSketch[C]) mergeItemsSketch(other *ItemsSketch[C]) { populateItemWorkArrays(workbuf, worklevels, provisionalNumLevels, myCurNumLevels, myCurLevelsArr, myCurItemsArr, - otherNumLevels, otherLevelsArr, otherItemsArr, s.itemsSketchOp.LessFn()) + otherNumLevels, otherLevelsArr, otherItemsArr, s.compareFn) // notice that workbuf is being used as both the input and output - result := generalItemsCompress(s.k, s.m, provisionalNumLevels, workbuf, worklevels, workbuf, outlevels, s.isLevelZeroSorted, s.itemsSketchOp.LessFn(), s.deterministicOffsetForTest) + result := generalItemsCompress(s.k, s.m, provisionalNumLevels, workbuf, worklevels, workbuf, outlevels, s.isLevelZeroSorted, s.compareFn, s.deterministicOffsetForTest) targetItemCount := result[1] //was finalCapacity. Max size given k, m, numLevels curItemCount := result[2] //was finalPop @@ -798,14 +818,13 @@ func (s *ItemsSketch[C]) mergeItemsSketch(other *ItemsSketch[C]) { s.minItem = other.minItem s.maxItem = other.maxItem } else { - less := s.itemsSketchOp.LessFn() - if less(myMin, *other.minItem) { + if s.compareFn(myMin, *other.minItem) { s.minItem = &myMin } else { s.minItem = other.minItem } - if less(*other.maxItem, myMax) { + if s.compareFn(*other.maxItem, myMax) { s.maxItem = &myMax } else { s.maxItem = other.maxItem @@ -842,10 +861,9 @@ func (s *ItemsSketch[C]) compressWhileUpdatingSketch() { //the following is specific to generic Items myItemsArr := s.GetTotalItemsArray() if level == 0 { // level zero might not be sorted, so we must sort it if we wish to compact it - lessFn := s.itemsSketchOp.LessFn() tmpSlice := myItemsArr[adjBeg : adjBeg+adjPop] sort.Slice(tmpSlice, func(a, b int) bool { - return lessFn(tmpSlice[a], tmpSlice[b]) + return s.compareFn(tmpSlice[a], tmpSlice[b]) }) } if popAbove == 0 { @@ -855,7 +873,7 @@ func (s *ItemsSketch[C]) compressWhileUpdatingSketch() { mergeSortedItemsArrays( myItemsArr, adjBeg, halfAdjPop, myItemsArr, rawEnd, popAbove, - myItemsArr, adjBeg+halfAdjPop, s.itemsSketchOp.LessFn()) + myItemsArr, adjBeg+halfAdjPop, s.compareFn) } newIndex := myLevelsArr[level+1] - halfAdjPop // adjust boundaries of the level above s.levels[level+1] = newIndex @@ -1013,7 +1031,7 @@ func randomlyHalveDownItems[C comparable](buf []C, start uint32, length uint32, func mergeSortedItemsArrays[C comparable](bufA []C, startA uint32, lenA uint32, bufB []C, startB uint32, lenB uint32, - bufC []C, startC uint32, lessFn common.LessFn[C]) { + bufC []C, startC uint32, compareFn common.CompareFn[C]) { lenC := lenA + lenB limA := startA + lenA limB := startB + lenB @@ -1029,7 +1047,7 @@ func mergeSortedItemsArrays[C comparable](bufA []C, startA uint32, lenA uint32, } else if b == limB { bufC[c] = bufA[a] a++ - } else if lessFn(bufA[a], bufB[b]) { + } else if compareFn(bufA[a], bufB[b]) { bufC[c] = bufA[a] a++ } else { @@ -1042,7 +1060,7 @@ func mergeSortedItemsArrays[C comparable](bufA []C, startA uint32, lenA uint32, func populateItemWorkArrays[C comparable](workbuf []C, worklevels []uint32, provisionalNumLevels uint8, myCurNumLevels uint8, myCurLevelsArr []uint32, myCurItemsArr []C, otherNumLevels uint8, otherLevelsArr []uint32, otherItemsArr []C, - lessFn common.LessFn[C]) { + compareFn common.CompareFn[C]) { worklevels[0] = 0 // Note: the level zero data from "other" was already inserted into "self" @@ -1069,7 +1087,7 @@ func populateItemWorkArrays[C comparable](workbuf []C, worklevels []uint32, prov mergeSortedItemsArrays( myCurItemsArr, myCurLevelsArr[lvl], selfPop, otherItemsArr, otherLevelsArr[lvl], otherPop, - workbuf, worklevels[lvl], lessFn) + workbuf, worklevels[lvl], compareFn) } } } @@ -1083,7 +1101,7 @@ func generalItemsCompress[C comparable]( outBuf []C, outLevels []uint32, isLevelZeroSorted bool, - lessFn common.LessFn[C], + compareFn common.CompareFn[C], deterministicOffsetForTest bool, ) []uint32 { numLevels := numLevelsIn @@ -1137,7 +1155,7 @@ func generalItemsCompress[C comparable]( if (curLevel == 0) && !isLevelZeroSorted { tmpSlice := inBuf[adjBeg : adjBeg+adjPop] sort.Slice(tmpSlice, func(a, b int) bool { - return lessFn(tmpSlice[a], tmpSlice[b]) + return compareFn(tmpSlice[a], tmpSlice[b]) }) } @@ -1148,7 +1166,7 @@ func generalItemsCompress[C comparable]( mergeSortedItemsArrays( inBuf, adjBeg, halfAdjPop, inBuf, rawLim, popAbove, - inBuf, adjBeg+halfAdjPop, lessFn) + inBuf, adjBeg+halfAdjPop, compareFn) } // track the fact that we just eliminated some data diff --git a/kll/items_sketch_iterator.go b/kll/items_sketch_iterator.go index 985eb35..39484c3 100644 --- a/kll/items_sketch_iterator.go +++ b/kll/items_sketch_iterator.go @@ -27,7 +27,7 @@ type ItemsSketchIterator[C comparable] struct { level int weight int64 isInitialized bool - itemsSketchOp common.ItemSketchOp[C] + itemsSketchOp common.ItemSketchSerde[C] } func newItemsSketchIterator[C comparable]( diff --git a/kll/items_sketch_sorted_view.go b/kll/items_sketch_sorted_view.go index a90d62c..7b118fd 100644 --- a/kll/items_sketch_sorted_view.go +++ b/kll/items_sketch_sorted_view.go @@ -25,12 +25,12 @@ import ( ) type ItemsSketchSortedView[C comparable] struct { - quantiles []C - cumWeights []int64 - totalN uint64 - maxItem C - minItem C - itemsSketchOp common.ItemSketchOp[C] + quantiles []C + cumWeights []int64 + totalN uint64 + maxItem C + minItem C + compareFn common.CompareFn[C] } func newItemsSketchSortedView[C comparable](sketch *ItemsSketch[C]) (*ItemsSketchSortedView[C], error) { @@ -55,21 +55,21 @@ func newItemsSketchSortedView[C comparable](sketch *ItemsSketch[C]) (*ItemsSketc } if !sketch.isLevelZeroSorted { subSlice := srcQuantiles[srcLevels[0]:srcLevels[1]] - lessFn := sketch.itemsSketchOp.LessFn() + compareFn := sketch.compareFn sort.Slice(subSlice, func(a, b int) bool { - return lessFn(subSlice[a], subSlice[b]) + return compareFn(subSlice[a], subSlice[b]) }) } numQuantiles := srcLevels[srcNumLevels] - srcLevels[0] - quantiles, cumWeights := populateFromSketch(srcQuantiles, srcLevels, srcNumLevels, numQuantiles, sketch.itemsSketchOp) + quantiles, cumWeights := populateFromSketch(srcQuantiles, srcLevels, srcNumLevels, numQuantiles, sketch.compareFn) return &ItemsSketchSortedView[C]{ - quantiles: quantiles, - cumWeights: cumWeights, - totalN: totalN, - maxItem: maxItem, - minItem: minItem, - itemsSketchOp: sketch.itemsSketchOp, + quantiles: quantiles, + cumWeights: cumWeights, + totalN: totalN, + maxItem: maxItem, + minItem: minItem, + compareFn: sketch.compareFn, }, nil } @@ -82,7 +82,7 @@ func (s *ItemsSketchSortedView[C]) GetRank(item C, inclusive bool) (float64, err if inclusive { crit = internal.InequalityLE } - index := internal.FindWithInequality(s.quantiles, 0, length-1, item, crit, s.itemsSketchOp.LessFn()) + index := internal.FindWithInequality(s.quantiles, 0, length-1, item, crit, s.compareFn) if index == -1 { return 0, nil //EXCLUSIVE (LT) case: quantile <= minQuantile; INCLUSIVE (LE) case: quantile < minQuantile } @@ -91,11 +91,11 @@ func (s *ItemsSketchSortedView[C]) GetRank(item C, inclusive bool) (float64, err func (s *ItemsSketchSortedView[C]) GetQuantile(rank float64, inclusive bool) (C, error) { if s.totalN == 0 { - return s.itemsSketchOp.Identity(), errors.New("empty sketch") + return *new(C), errors.New("empty sketch") } err := checkNormalizedRankBounds(rank) if err != nil { - return s.itemsSketchOp.Identity(), err + return *new(C), err } index := s.getQuantileIndex(rank, inclusive) return s.quantiles[index], nil @@ -105,7 +105,7 @@ func (s *ItemsSketchSortedView[C]) GetPMF(splitPoints []C, inclusive bool) ([]fl if s.totalN == 0 { return nil, errors.New("empty sketch") } - err := checkItems(splitPoints, s.itemsSketchOp.LessFn()) + err := checkItems(splitPoints, s.compareFn) if err != nil { return nil, err } @@ -124,7 +124,7 @@ func (s *ItemsSketchSortedView[C]) GetCDF(splitPoints []C, inclusive bool) ([]fl if s.totalN == 0 { return nil, errors.New("empty sketch") } - err := checkItems(splitPoints, s.itemsSketchOp.LessFn()) + err := checkItems(splitPoints, s.compareFn) if err != nil { return nil, err } @@ -182,7 +182,7 @@ func (s *ItemsSketchSortedView[C]) GetPartitionBoundaries(numEquallySized int, i return newItemsSketchPartitionBoundaries[C](s.totalN, evSpQuantiles, evSpNatRanks, evSpNormRanks, s.maxItem, s.minItem, inclusive) } -func populateFromSketch[C comparable](srcQuantiles []C, levels []uint32, numLevels uint8, numQuantiles uint32, itemsSketchOp common.ItemSketchOp[C]) ([]C, []int64) { +func populateFromSketch[C comparable](srcQuantiles []C, levels []uint32, numLevels uint8, numQuantiles uint32, compareFn common.CompareFn[C]) ([]C, []int64) { quantiles := make([]C, numQuantiles) cumWeights := make([]int64, numQuantiles) myLevels := make([]uint32, numLevels+1) @@ -208,12 +208,12 @@ func populateFromSketch[C comparable](srcQuantiles []C, levels []uint32, numLeve weight *= 2 } numLevels = dstLevel - blockyTandemMergeSort(quantiles, cumWeights, myLevels, numLevels, itemsSketchOp) //create unit weights + blockyTandemMergeSort(quantiles, cumWeights, myLevels, numLevels, compareFn) //create unit weights convertToCumulative(cumWeights) return quantiles, cumWeights } -func blockyTandemMergeSort[C comparable](quantiles []C, weights []int64, levels []uint32, numLevels uint8, itemsSketchOp common.ItemSketchOp[C]) { +func blockyTandemMergeSort[C comparable](quantiles []C, weights []int64, levels []uint32, numLevels uint8, compareFn common.CompareFn[C]) { if numLevels == 1 { return } @@ -224,10 +224,10 @@ func blockyTandemMergeSort[C comparable](quantiles []C, weights []int64, levels weightsTmp := make([]int64, len(weights)) copy(weightsTmp, weights) // don't need the extra one here - blockyTandemMergeSortRecursion(quantilesTmp, weightsTmp, quantiles, weights, levels, 0, numLevels, itemsSketchOp) + blockyTandemMergeSortRecursion(quantilesTmp, weightsTmp, quantiles, weights, levels, 0, numLevels, compareFn) } -func blockyTandemMergeSortRecursion[C comparable](quantilesSrc []C, weightsSrc []int64, quantilesDst []C, weightsDst []int64, levels []uint32, startingLevel uint8, numLevels uint8, itemsSketchOp common.ItemSketchOp[C]) { +func blockyTandemMergeSortRecursion[C comparable](quantilesSrc []C, weightsSrc []int64, quantilesDst []C, weightsDst []int64, levels []uint32, startingLevel uint8, numLevels uint8, compareFn common.CompareFn[C]) { if numLevels == 1 { return } @@ -236,12 +236,12 @@ func blockyTandemMergeSortRecursion[C comparable](quantilesSrc []C, weightsSrc [ startingLevel1 := startingLevel startingLevel2 := startingLevel + numLevels1 // swap roles of src and dst - blockyTandemMergeSortRecursion(quantilesDst, weightsDst, quantilesSrc, weightsSrc, levels, startingLevel1, numLevels1, itemsSketchOp) - blockyTandemMergeSortRecursion(quantilesDst, weightsDst, quantilesSrc, weightsSrc, levels, startingLevel2, numLevels2, itemsSketchOp) - tandemMerge(quantilesSrc, weightsSrc, quantilesDst, weightsDst, levels, startingLevel1, numLevels1, startingLevel2, numLevels2, itemsSketchOp) + blockyTandemMergeSortRecursion(quantilesDst, weightsDst, quantilesSrc, weightsSrc, levels, startingLevel1, numLevels1, compareFn) + blockyTandemMergeSortRecursion(quantilesDst, weightsDst, quantilesSrc, weightsSrc, levels, startingLevel2, numLevels2, compareFn) + tandemMerge(quantilesSrc, weightsSrc, quantilesDst, weightsDst, levels, startingLevel1, numLevels1, startingLevel2, numLevels2, compareFn) } -func tandemMerge[C comparable](quantilesSrc []C, weightsSrc []int64, quantilesDst []C, weightsDst []int64, levels []uint32, startingLevel1 uint8, numLevels1 uint8, startingLevel2 uint8, numLevels2 uint8, itemsSketchOp common.ItemSketchOp[C]) { +func tandemMerge[C comparable](quantilesSrc []C, weightsSrc []int64, quantilesDst []C, weightsDst []int64, levels []uint32, startingLevel1 uint8, numLevels1 uint8, startingLevel2 uint8, numLevels2 uint8, compareFn common.CompareFn[C]) { fromIndex1 := levels[startingLevel1] toIndex1 := levels[startingLevel1+numLevels1] // exclusive fromIndex2 := levels[startingLevel2] @@ -250,9 +250,8 @@ func tandemMerge[C comparable](quantilesSrc []C, weightsSrc []int64, quantilesDs iSrc2 := fromIndex2 iDst := fromIndex1 - lessFn := itemsSketchOp.LessFn() for iSrc1 < toIndex1 && iSrc2 < toIndex2 { - if lessFn(quantilesSrc[iSrc1], quantilesSrc[iSrc2]) || quantilesSrc[iSrc1] == quantilesSrc[iSrc2] { + if compareFn(quantilesSrc[iSrc1], quantilesSrc[iSrc2]) || quantilesSrc[iSrc1] == quantilesSrc[iSrc2] { quantilesDst[iDst] = quantilesSrc[iSrc1] weightsDst[iDst] = weightsSrc[iSrc1] iSrc1++ diff --git a/kll/items_sketch_test.go b/kll/items_sketch_test.go index a0f6d23..fc65df4 100644 --- a/kll/items_sketch_test.go +++ b/kll/items_sketch_test.go @@ -22,6 +22,9 @@ import ( "github.com/apache/datasketches-go/common" "github.com/stretchr/testify/assert" "math" + "math/rand" + "strconv" + "strings" "testing" ) @@ -32,16 +35,18 @@ const ( ) func TestItemsSketch_KLimits(t *testing.T) { - _, err := NewKllItemsSketch[string](_MIN_K, _DEFAULT_M, common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + _, err := NewKllItemsSketch[string](_MIN_K, _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) - _, err = NewKllItemsSketch[string](uint16(_MAX_K), _DEFAULT_M, common.ArrayOfStringsSerDe{}) + _, err = NewKllItemsSketch[string](uint16(_MAX_K), _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) - _, err = NewKllItemsSketch[string](_MIN_K-1, _DEFAULT_M, common.ArrayOfStringsSerDe{}) + _, err = NewKllItemsSketch[string](_MIN_K-1, _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.Error(t, err) } func TestItemsSketch_Empty(t *testing.T) { - sketch, err := NewKllItemsSketch[string](200, _DEFAULT_M, common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sketch, err := NewKllItemsSketch[string](200, _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) assert.True(t, sketch.IsEmpty()) assert.False(t, sketch.IsEstimationMode()) @@ -63,7 +68,8 @@ func TestItemsSketch_Empty(t *testing.T) { } func TestItemsSketch_BadQuantile(t *testing.T) { - sketch, err := NewKllItemsSketch[string](200, _DEFAULT_M, common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sketch, err := NewKllItemsSketch[string](200, _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) sketch.Update("") // has to be non-empty to reach the check _, err = sketch.GetQuantile(-1, true) @@ -71,7 +77,8 @@ func TestItemsSketch_BadQuantile(t *testing.T) { } func TestItemsSketch_OneValue(t *testing.T) { - sketch, err := NewKllItemsSketch[string](200, _DEFAULT_M, common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sketch, err := NewKllItemsSketch[string](200, _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) sketch.Update("A") assert.False(t, sketch.IsEmpty()) @@ -100,8 +107,9 @@ func TestItemsSketch_OneValue(t *testing.T) { } func TestItemsSketch_TenValues(t *testing.T) { + comparator := common.ItemSketchStringComparator{} tenStr := []string{"A", "B", "C", "D", "E", "F", "G", "H", "I", "J"} - sketch, err := NewKllItemsSketch[string](20, _DEFAULT_M, common.ArrayOfStringsSerDe{}) + sketch, err := NewKllItemsSketch[string](20, _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) strLen := len(tenStr) dblStrLen := float64(strLen) @@ -176,7 +184,8 @@ func TestItemsSketch_TenValues(t *testing.T) { } func TestItemsSketch_ManyValuesEstimationMode(T *testing.T) { - sketch, err := NewKllItemsSketchWithDefault[string](common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sketch, err := NewKllItemsSketchWithDefault[string](comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(T, err) n := 1_000_000 digits := numDigits(n) @@ -236,7 +245,8 @@ func TestItemsSketch_ManyValuesEstimationMode(T *testing.T) { } func TestItemsSketch_GetRankGetCdfGetPmfConsistency(t *testing.T) { - sketch, err := NewKllItemsSketchWithDefault[string](common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sketch, err := NewKllItemsSketchWithDefault[string](comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) n := 1000 digits := numDigits(n) @@ -283,9 +293,10 @@ func TestItemsSketch_GetRankGetCdfGetPmfConsistency(t *testing.T) { } func TestItemsSketch_Merge(t *testing.T) { - sketch1, err := NewKllItemsSketchWithDefault[string](common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sketch1, err := NewKllItemsSketchWithDefault[string](comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) - sketch2, err := NewKllItemsSketchWithDefault[string](common.ArrayOfStringsSerDe{}) + sketch2, err := NewKllItemsSketchWithDefault[string](comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) n := 10000 digits := numDigits(2 * n) @@ -326,9 +337,10 @@ func TestItemsSketch_Merge(t *testing.T) { } func TestItemsSketch_MergeLowerK(t *testing.T) { - sketch1, err := NewKllItemsSketchWithDefault[string](common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sketch1, err := NewKllItemsSketchWithDefault[string](comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) - sketch2, err := NewKllItemsSketch[string](_DEFAULT_K/2, _DEFAULT_M, common.ArrayOfStringsSerDe{}) + sketch2, err := NewKllItemsSketch[string](_DEFAULT_K/2, _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) n := 10000 digits := numDigits(2 * n) @@ -374,9 +386,10 @@ func TestItemsSketch_MergeLowerK(t *testing.T) { } func TestItemsSketch_MergeEmptyLowerK(t *testing.T) { - sketch1, err := NewKllItemsSketchWithDefault[string](common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sketch1, err := NewKllItemsSketchWithDefault[string](comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) - sketch2, err := NewKllItemsSketch[string](_DEFAULT_K/2, _DEFAULT_M, common.ArrayOfStringsSerDe{}) + sketch2, err := NewKllItemsSketch[string](_DEFAULT_K/2, _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) n := 10000 digits := numDigits(n) @@ -435,9 +448,10 @@ func TestItemsSketch_MergeEmptyLowerK(t *testing.T) { } func TestItemsSketch_MergeExactModeLowerK(t *testing.T) { - sketch1, err := NewKllItemsSketchWithDefault[string](common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sketch1, err := NewKllItemsSketchWithDefault[string](comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) - sketch2, err := NewKllItemsSketch[string](_DEFAULT_K/2, _DEFAULT_M, common.ArrayOfStringsSerDe{}) + sketch2, err := NewKllItemsSketch[string](_DEFAULT_K/2, _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) n := 10000 digits := numDigits(n) @@ -453,9 +467,10 @@ func TestItemsSketch_MergeExactModeLowerK(t *testing.T) { } func TestItemsSketch_MergeMinMinValueFromOther(t *testing.T) { - sketch1, err := NewKllItemsSketchWithDefault[string](common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sketch1, err := NewKllItemsSketchWithDefault[string](comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) - sketch2, err := NewKllItemsSketchWithDefault[string](common.ArrayOfStringsSerDe{}) + sketch2, err := NewKllItemsSketchWithDefault[string](comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) sketch1.Update(intToFixedLengthString(1, 1)) sketch2.Update(intToFixedLengthString(2, 1)) @@ -466,9 +481,10 @@ func TestItemsSketch_MergeMinMinValueFromOther(t *testing.T) { } func TestItemsSketch_MergeMinAndMaxFromOther(t *testing.T) { - sketch1, err := NewKllItemsSketchWithDefault[string](common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sketch1, err := NewKllItemsSketchWithDefault[string](comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) - sketch2, err := NewKllItemsSketchWithDefault[string](common.ArrayOfStringsSerDe{}) + sketch2, err := NewKllItemsSketchWithDefault[string](comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) n := 1_000_000 digits := numDigits(n) @@ -485,12 +501,14 @@ func TestItemsSketch_MergeMinAndMaxFromOther(t *testing.T) { } func TestItemsSketch_KTooSmall(t *testing.T) { - _, err := NewKllItemsSketch[string](_MIN_K-1, _DEFAULT_M, common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + _, err := NewKllItemsSketch[string](_MIN_K-1, _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.Error(t, err) } func TestItemsSketch_MinK(t *testing.T) { - sketch, err := NewKllItemsSketch[string](uint16(8), _DEFAULT_M, common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sketch, err := NewKllItemsSketch[string](uint16(8), _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) n := 1000 digits := numDigits(n) @@ -507,7 +525,8 @@ func TestItemsSketch_MinK(t *testing.T) { } func TestItemsSketch_MaxK(t *testing.T) { - sketch, err := NewKllItemsSketch[string](uint16(_MAX_K), _DEFAULT_M, common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sketch, err := NewKllItemsSketch[string](uint16(_MAX_K), _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) n := 1000 digits := numDigits(n) @@ -524,7 +543,8 @@ func TestItemsSketch_MaxK(t *testing.T) { } func TestItemsSketch_OutOfOrderSplitPoints(t *testing.T) { - sketch, err := NewKllItemsSketchWithDefault[string](common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sketch, err := NewKllItemsSketchWithDefault[string](comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) s0 := intToFixedLengthString(0, 1) s1 := intToFixedLengthString(1, 1) @@ -534,7 +554,8 @@ func TestItemsSketch_OutOfOrderSplitPoints(t *testing.T) { } func TestItemsSketch_DuplicateSplitPoints(t *testing.T) { - sketch, err := NewKllItemsSketchWithDefault[string](common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sketch, err := NewKllItemsSketchWithDefault[string](comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) sketch.Update("A") sketch.Update("B") @@ -549,7 +570,8 @@ func TestItemsSketch_DuplicateSplitPoints(t *testing.T) { } func TestItemsSketch_CheckReset(t *testing.T) { - sketch, err := NewKllItemsSketch[string](20, _DEFAULT_M, common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sketch, err := NewKllItemsSketch[string](20, _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) n := 100 digits := numDigits(n) @@ -576,7 +598,8 @@ func TestItemsSketch_CheckReset(t *testing.T) { } func TestItemsSketch_SortedView(t *testing.T) { - sketch, err := NewKllItemsSketch[string](20, _DEFAULT_M, common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sketch, err := NewKllItemsSketch[string](20, _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) sketch.Update("A") sketch.Update("AB") @@ -604,12 +627,13 @@ func TestItemsSketch_SortedView(t *testing.T) { } func TestItemsSketch_CDF_PDF(t *testing.T) { + comparator := common.ItemSketchStringComparator{} cdfI := []float64{.25, .50, .75, 1.0, 1.0} cdfE := []float64{0.0, .25, .50, .75, 1.0} pmfI := []float64{.25, .25, .25, .25, 0.0} pmfE := []float64{0.0, .25, .25, .25, .25} toll := 1e-10 - sketch, err := NewKllItemsSketch[string](20, _DEFAULT_M, common.ArrayOfStringsSerDe{}) + sketch, err := NewKllItemsSketch[string](20, _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) strIn := []string{"A", "AB", "ABC", "ABCD"} for i := 0; i < len(strIn); i++ { @@ -646,17 +670,18 @@ func TestItemsSketch_CDF_PDF(t *testing.T) { } func TestItemsSketch_DeserializeEmpty(t *testing.T) { - sk1, err := NewKllItemsSketch[string](20, _DEFAULT_M, common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sk1, err := NewKllItemsSketch[string](20, _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) mem, err := sk1.ToSlice() assert.NoError(t, err) assert.NotNil(t, mem) - memVal, err := newItemsSketchMemoryValidate[string](mem, common.ArrayOfStringsSerDe{}) + memVal, err := newItemsSketchMemoryValidate[string](mem, common.ItemSketchStringSerDe{}) assert.NoError(t, err) assert.Equal(t, memVal.sketchStructure, _COMPACT_EMPTY) assert.Equal(t, len(mem), 8) - sk2, err := NewKllItemsSketchFromSlice[string](mem, common.ArrayOfStringsSerDe{}) + sk2, err := NewKllItemsSketchFromSlice[string](mem, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) assert.Equal(t, sk2.GetN(), uint64(0)) _, err = sk2.GetMinItem() @@ -666,16 +691,17 @@ func TestItemsSketch_DeserializeEmpty(t *testing.T) { } func TestItemsSketch_DeserializeSingleItem(t *testing.T) { - sk1, err := NewKllItemsSketch[string](20, _DEFAULT_M, common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sk1, err := NewKllItemsSketch[string](20, _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) sk1.Update("A") mem, err := sk1.ToSlice() assert.NoError(t, err) assert.NotNil(t, mem) - memVal, err := newItemsSketchMemoryValidate[string](mem, common.ArrayOfStringsSerDe{}) + memVal, err := newItemsSketchMemoryValidate[string](mem, common.ItemSketchStringSerDe{}) assert.NoError(t, err) assert.Equal(t, memVal.sketchStructure, _COMPACT_SINGLE) - sk2, err := NewKllItemsSketchFromSlice[string](mem, common.ArrayOfStringsSerDe{}) + sk2, err := NewKllItemsSketchFromSlice[string](mem, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) assert.Equal(t, sk2.GetN(), uint64(1)) minV, err := sk2.GetMinItem() @@ -687,7 +713,8 @@ func TestItemsSketch_DeserializeSingleItem(t *testing.T) { } func TestItemsSketch_FewItems(t *testing.T) { - sk1, err := NewKllItemsSketch[string](20, _DEFAULT_M, common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sk1, err := NewKllItemsSketch[string](20, _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) sk1.Update("A") sk1.Update("AB") @@ -695,14 +722,15 @@ func TestItemsSketch_FewItems(t *testing.T) { mem, err := sk1.ToSlice() assert.NoError(t, err) assert.NotNil(t, mem) - memVal, err := newItemsSketchMemoryValidate[string](mem, common.ArrayOfStringsSerDe{}) + memVal, err := newItemsSketchMemoryValidate[string](mem, common.ItemSketchStringSerDe{}) assert.NoError(t, err) assert.Equal(t, memVal.sketchStructure, _COMPACT_FULL) assert.Equal(t, len(mem), memVal.sketchBytes) } func TestItemsSketch_ManyItems(t *testing.T) { - sk1, err := NewKllItemsSketch[string](20, _DEFAULT_M, common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sk1, err := NewKllItemsSketch[string](20, _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) n := 109 digits := numDigits(n) @@ -712,14 +740,15 @@ func TestItemsSketch_ManyItems(t *testing.T) { mem, err := sk1.ToSlice() assert.NoError(t, err) assert.NotNil(t, mem) - memVal, err := newItemsSketchMemoryValidate[string](mem, common.ArrayOfStringsSerDe{}) + memVal, err := newItemsSketchMemoryValidate[string](mem, common.ItemSketchStringSerDe{}) assert.NoError(t, err) assert.Equal(t, memVal.sketchStructure, _COMPACT_FULL) assert.Equal(t, len(mem), memVal.sketchBytes) } func TestItemsSketch_SortedViewAfterReset(t *testing.T) { - sk, err := NewKllItemsSketch[string](20, _DEFAULT_M, common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sk, err := NewKllItemsSketch[string](20, _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) sk.Update("1") sv, err := sk.GetSortedView() @@ -733,12 +762,13 @@ func TestItemsSketch_SortedViewAfterReset(t *testing.T) { } func TestItemsSketch_SerializeDeserializeEmpty(t *testing.T) { - sk1, err := NewKllItemsSketch[string](20, _DEFAULT_M, common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sk1, err := NewKllItemsSketch[string](20, _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) mem, err := sk1.ToSlice() assert.NoError(t, err) assert.NotNil(t, mem) - sk2, err := NewKllItemsSketchFromSlice[string](mem, common.ArrayOfStringsSerDe{}) + sk2, err := NewKllItemsSketchFromSlice[string](mem, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) s, err := sk1.GetSerializedSizeBytes() assert.NoError(t, err) @@ -762,13 +792,14 @@ func TestItemsSketch_SerializeDeserializeEmpty(t *testing.T) { } func TestItemsSketch_SerializeDeserializeOneValue(t *testing.T) { - sk1, err := NewKllItemsSketch[string](20, _DEFAULT_M, common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sk1, err := NewKllItemsSketch[string](20, _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) sk1.Update(" 1") mem, err := sk1.ToSlice() assert.NoError(t, err) assert.NotNil(t, mem) - sk2, err := NewKllItemsSketchFromSlice[string](mem, common.ArrayOfStringsSerDe{}) + sk2, err := NewKllItemsSketchFromSlice[string](mem, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) s1SizeBytes, err := sk1.GetSerializedSizeBytes() assert.Equal(t, len(mem), s1SizeBytes) @@ -793,7 +824,8 @@ func TestItemsSketch_SerializeDeserializeOneValue(t *testing.T) { } func TestItemsSketch_SerializeDeserializeMultipleValue(t *testing.T) { - sk1, err := NewKllItemsSketchWithDefault[string](common.ArrayOfStringsSerDe{}) + comparator := common.ItemSketchStringComparator{} + sk1, err := NewKllItemsSketchWithDefault[string](comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) n := 1000 for i := 0; i < n; i++ { @@ -808,7 +840,7 @@ func TestItemsSketch_SerializeDeserializeMultipleValue(t *testing.T) { mem, err := sk1.ToSlice() assert.NoError(t, err) assert.NotNil(t, mem) - sk2, err := NewKllItemsSketchFromSlice[string](mem, common.ArrayOfStringsSerDe{}) + sk2, err := NewKllItemsSketchFromSlice[string](mem, comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) s1, err := sk2.GetSerializedSizeBytes() assert.NoError(t, err) @@ -832,10 +864,10 @@ func TestItemsSketch_SerializeDeserializeMultipleValue(t *testing.T) { func TestSerializeDeserializeString(t *testing.T) { nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} - serde := common.ArrayOfStringsSerDe{} + comparator := common.ItemSketchStringComparator{} for _, n := range nArr { digits := numDigits(n) - sk, err := NewKllItemsSketchWithDefault[string](serde) + sk, err := NewKllItemsSketchWithDefault[string](comparator.CompareFn(), common.ItemSketchStringSerDe{}) assert.NoError(t, err) for i := 1; i <= n; i++ { sk.Update(intToFixedLengthString(i, digits)) @@ -843,7 +875,7 @@ func TestSerializeDeserializeString(t *testing.T) { slc, err := sk.ToSlice() assert.NoError(t, err) - sketch, err := NewKllItemsSketchFromSlice[string](slc, serde) + sketch, err := NewKllItemsSketchFromSlice[string](slc, comparator.CompareFn(), common.ItemSketchStringSerDe{}) if err != nil { return } @@ -872,11 +904,11 @@ func TestSerializeDeserializeString(t *testing.T) { weight := int64(0) it := sketch.GetIterator() - lessFn := serde.LessFn() + compareFn := comparator.CompareFn() for it.Next() { qut := it.GetQuantile() - assert.True(t, lessFn(minV, qut) || minV == qut, fmt.Sprintf("min: \"%v\" \"%v\"", minV, qut)) - assert.True(t, !lessFn(maxV, qut) || maxV == qut, fmt.Sprintf("max: \"%v\" \"%v\"", maxV, qut)) + assert.True(t, compareFn(minV, qut) || minV == qut, fmt.Sprintf("min: \"%v\" \"%v\"", minV, qut)) + assert.True(t, !compareFn(maxV, qut) || maxV == qut, fmt.Sprintf("max: \"%v\" \"%v\"", maxV, qut)) weight += it.GetWeight() } assert.Equal(t, weight, int64(n)) @@ -886,9 +918,9 @@ func TestSerializeDeserializeString(t *testing.T) { func TestSerializeDeserializeFloat(t *testing.T) { nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} - serde := common.ArrayOfDoublesSerDe{} + comparator := common.ItemSketchDoubleComparator{} for _, n := range nArr { - sk, err := NewKllItemsSketchWithDefault[float64](serde) + sk, err := NewKllItemsSketchWithDefault[float64](comparator.CompareFn(), common.ItemSketchDoubleSerDe{}) assert.NoError(t, err) for i := 1; i <= n; i++ { sk.Update(float64(i)) @@ -896,7 +928,7 @@ func TestSerializeDeserializeFloat(t *testing.T) { slc, err := sk.ToSlice() assert.NoError(t, err) - sketch, err := NewKllItemsSketchFromSlice[float64](slc, serde) + sketch, err := NewKllItemsSketchFromSlice[float64](slc, comparator.CompareFn(), common.ItemSketchDoubleSerDe{}) if err != nil { return } @@ -925,14 +957,62 @@ func TestSerializeDeserializeFloat(t *testing.T) { weight := int64(0) it := sketch.GetIterator() - lessFn := serde.LessFn() + compareFn := comparator.CompareFn() for it.Next() { qut := it.GetQuantile() - assert.True(t, lessFn(minV, qut) || minV == qut, fmt.Sprintf("min: \"%v\" \"%v\"", minV, qut)) - assert.True(t, !lessFn(maxV, qut) || maxV == qut, fmt.Sprintf("max: \"%v\" \"%v\"", maxV, qut)) + assert.True(t, compareFn(minV, qut) || minV == qut, fmt.Sprintf("min: \"%v\" \"%v\"", minV, qut)) + assert.True(t, !compareFn(maxV, qut) || maxV == qut, fmt.Sprintf("max: \"%v\" \"%v\"", maxV, qut)) weight += it.GetWeight() } assert.Equal(t, weight, int64(n)) } } } + +func TestSerializeStringDeserializeNoSerde(t *testing.T) { + comparator := common.ItemSketchStringComparator{} + sketch1, err := NewKllItemsSketchWithDefault[string](comparator.CompareFn(), nil) + assert.NoError(t, err) + _, err = sketch1.ToSlice() + assert.Error(t, err) + + _, err = NewKllItemsSketchFromSlice[string](nil, comparator.CompareFn(), nil) + assert.Error(t, err) +} + +func TestNoCompare(t *testing.T) { + _, err := NewKllItemsSketchWithDefault[string](nil, nil) + assert.Error(t, err) +} + +// There is no guarantee that L0 is sorted after a merge. +// The issue is, during a merge, L0 must be sorted prior to a compaction to a higher level. +// Otherwise the higher levels would not be sorted properly. +func TestL0SortDuringMerge(t *testing.T) { + comparator := common.ItemSketchStringComparator{ + ReverseOrder: true, + } + sk1, err := NewKllItemsSketch[string](8, _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) + assert.NoError(t, err) + sk2, err := NewKllItemsSketch[string](8, _DEFAULT_M, comparator.CompareFn(), common.ItemSketchStringSerDe{}) + assert.NoError(t, err) + n := 26 //don't change this + for i := 1; i <= n; i++ { + j := rand.Intn(n) + 1 + sk1.Update(intToFixedLengthString(j, 3)) + sk2.Update(intToFixedLengthString(j+100, 3)) + } + sk1.Merge(sk2) + //println(sk1.String(true, true)) //L1 and above should be sorted in reverse. Ignore L0. + lvl1size := sk1.levels[2] - sk1.levels[1] + itr := sk1.GetIterator() + itr.Next() + prev, _ := strconv.Atoi(strings.TrimSpace(itr.GetQuantile())) + for i := uint32(1); i < lvl1size; i++ { + if itr.Next() { + v, _ := strconv.Atoi(strings.TrimSpace(itr.GetQuantile())) + assert.True(t, v <= prev) + prev = v + } + } +} diff --git a/kll/items_sketch_validate.go b/kll/items_sketch_validate.go index 5837bd7..fd93a87 100644 --- a/kll/items_sketch_validate.go +++ b/kll/items_sketch_validate.go @@ -26,7 +26,7 @@ import ( type itemsSketchMemoryValidate[C comparable] struct { srcMem []byte - itemSketchOp common.ItemSketchOp[C] + serde common.ItemSketchSerde[C] sketchStructure sketchStructure // first 8 bytes of preamble @@ -55,7 +55,7 @@ type itemsSketchMemoryValidate[C comparable] struct { typeBytes int //always 0 for generic } -func newItemsSketchMemoryValidate[C comparable](srcMem []byte, itemSketchOp common.ItemSketchOp[C]) (*itemsSketchMemoryValidate[C], error) { +func newItemsSketchMemoryValidate[C comparable](srcMem []byte, serde common.ItemSketchSerde[C]) (*itemsSketchMemoryValidate[C], error) { capa := cap(srcMem) if capa < 8 { return nil, fmt.Errorf("Memory too small: %d", capa) @@ -84,7 +84,7 @@ func newItemsSketchMemoryValidate[C comparable](srcMem []byte, itemSketchOp comm typeBytes := 0 vlid := &itemsSketchMemoryValidate[C]{ srcMem: srcMem, - itemSketchOp: itemSketchOp, + serde: serde, sketchStructure: sketchStructure, preInts: preInts, serVer: serVer, @@ -116,7 +116,7 @@ func (vlid *itemsSketchMemoryValidate[C]) validate() error { } capacityItems := computeTotalItemCapacity(uint16(vlid.k), uint8(vlid.m), uint8(vlid.numLevels)) vlid.levelsArr[vlid.numLevels] = capacityItems //load the last one - sb, err := computeSketchBytes(vlid.srcMem, vlid.levelsArr, vlid.typeBytes, vlid.itemSketchOp) + sb, err := computeSketchBytes(vlid.srcMem, vlid.levelsArr, vlid.typeBytes, vlid.serde) if err != nil { return err } @@ -139,7 +139,7 @@ func (vlid *itemsSketchMemoryValidate[C]) validate() error { vlid.minK = uint16(vlid.k) vlid.numLevels = 1 //assumed vlid.levelsArr = []uint32{uint32(vlid.k) - 1, uint32(vlid.k)} - v, err := vlid.itemSketchOp.SizeOfMany(vlid.srcMem, _DATA_START_ADR_SINGLE_ITEM, 1) + v, err := vlid.serde.SizeOfMany(vlid.srcMem, _DATA_START_ADR_SINGLE_ITEM, 1) if err != nil { return err } @@ -150,20 +150,20 @@ func (vlid *itemsSketchMemoryValidate[C]) validate() error { return nil } -func computeSketchBytes[C comparable](srcMem []byte, levelsArr []uint32, typeBytes int, itemSketchOp common.ItemSketchOp[C]) (int, error) { +func computeSketchBytes[C comparable](srcMem []byte, levelsArr []uint32, typeBytes int, serde common.ItemSketchSerde[C]) (int, error) { numLevels := len(levelsArr) - 1 retainedItems := levelsArr[numLevels] - levelsArr[0] levelsLen := len(levelsArr) - 1 numItems := retainedItems offsetBytes := _DATA_START_ADR + levelsLen*4 if typeBytes == 1 { - v, err := itemSketchOp.SizeOfMany(srcMem, offsetBytes, int(numItems)) + v, err := serde.SizeOfMany(srcMem, offsetBytes, int(numItems)) if err != nil { return 0, err } offsetBytes += v + 2 //2 for min & max } else { - v, err := itemSketchOp.SizeOfMany(srcMem, offsetBytes, int(numItems)+2) //2 for min & max + v, err := serde.SizeOfMany(srcMem, offsetBytes, int(numItems)+2) //2 for min & max if err != nil { return 0, err } diff --git a/kll/items_sletch_serialization_test.go b/kll/items_sletch_serialization_test.go index 83db088..02e3cb8 100644 --- a/kll/items_sletch_serialization_test.go +++ b/kll/items_sletch_serialization_test.go @@ -34,9 +34,10 @@ func TestGenerateGoFiles(t *testing.T) { os.Mkdir(internal.GoPath, 0755) nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} + comparatorString := common.ItemSketchStringComparator{} for _, n := range nArr { digits := numDigits(n) - sk, err := NewKllItemsSketchWithDefault[string](common.ArrayOfStringsSerDe{}) + sk, err := NewKllItemsSketchWithDefault[string](comparatorString.CompareFn(), common.ItemSketchStringSerDe{}) sk.deterministicOffsetForTest = true assert.NoError(t, err) for i := 1; i <= n; i++ { @@ -48,8 +49,9 @@ func TestGenerateGoFiles(t *testing.T) { assert.NoError(t, err) } + comparatorDouble := common.ItemSketchDoubleComparator{} for _, n := range nArr { - sk, err := NewKllItemsSketchWithDefault[float64](common.ArrayOfDoublesSerDe{}) + sk, err := NewKllItemsSketchWithDefault[float64](comparatorDouble.CompareFn(), common.ItemSketchDoubleSerDe{}) sk.deterministicOffsetForTest = true assert.NoError(t, err) for i := 1; i <= n; i++ { @@ -65,12 +67,13 @@ func TestGenerateGoFiles(t *testing.T) { func TestJavaCompat(t *testing.T) { t.Run("Java KLL String", func(t *testing.T) { nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} - serde := common.ArrayOfStringsSerDe{} + serde := common.ItemSketchStringSerDe{} + comparatorString := common.ItemSketchStringComparator{} for _, n := range nArr { digits := numDigits(n) bytes, err := os.ReadFile(fmt.Sprintf("%s/kll_string_n%d_java.sk", internal.JavaPath, n)) assert.NoError(t, err) - sketch, err := NewKllItemsSketchFromSlice[string](bytes, serde) + sketch, err := NewKllItemsSketchFromSlice[string](bytes, comparatorString.CompareFn(), serde) if err != nil { return } @@ -99,11 +102,11 @@ func TestJavaCompat(t *testing.T) { weight := int64(0) it := sketch.GetIterator() - lessFn := serde.LessFn() + compareFn := comparatorString.CompareFn() for it.Next() { qut := it.GetQuantile() - assert.True(t, lessFn(minV, qut) || minV == qut, fmt.Sprintf("min: \"%v\" \"%v\"", minV, qut)) - assert.True(t, !lessFn(maxV, qut) || maxV == qut, fmt.Sprintf("max: \"%v\" \"%v\"", maxV, qut)) + assert.True(t, compareFn(minV, qut) || minV == qut, fmt.Sprintf("min: \"%v\" \"%v\"", minV, qut)) + assert.True(t, !compareFn(maxV, qut) || maxV == qut, fmt.Sprintf("max: \"%v\" \"%v\"", maxV, qut)) weight += it.GetWeight() } assert.Equal(t, weight, int64(n)) @@ -113,11 +116,12 @@ func TestJavaCompat(t *testing.T) { t.Run("Java KLL Double", func(t *testing.T) { nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} - serde := common.ArrayOfDoublesSerDe{} + serde := common.ItemSketchDoubleSerDe{} + comparatorDouble := common.ItemSketchDoubleComparator{} for _, n := range nArr { bytes, err := os.ReadFile(fmt.Sprintf("%s/kll_double_n%d_java.sk", internal.JavaPath, n)) assert.NoError(t, err) - sketch, err := NewKllItemsSketchFromSlice[float64](bytes, serde) + sketch, err := NewKllItemsSketchFromSlice[float64](bytes, comparatorDouble.CompareFn(), serde) if err != nil { return } @@ -146,11 +150,11 @@ func TestJavaCompat(t *testing.T) { weight := int64(0) it := sketch.GetIterator() - lessFn := serde.LessFn() + compareFn := comparatorDouble.CompareFn() for it.Next() { qut := it.GetQuantile() - assert.True(t, lessFn(minV, qut) || minV == qut, fmt.Sprintf("min: \"%v\" \"%v\"", minV, qut)) - assert.True(t, !lessFn(maxV, qut) || maxV == qut, fmt.Sprintf("max: \"%v\" \"%v\"", maxV, qut)) + assert.True(t, compareFn(minV, qut) || minV == qut, fmt.Sprintf("min: \"%v\" \"%v\"", minV, qut)) + assert.True(t, !compareFn(maxV, qut) || maxV == qut, fmt.Sprintf("max: \"%v\" \"%v\"", maxV, qut)) weight += it.GetWeight() } assert.Equal(t, weight, int64(n)) diff --git a/kll/utils.go b/kll/utils.go index f6a0ce4..a1e23e6 100644 --- a/kll/utils.go +++ b/kll/utils.go @@ -76,13 +76,13 @@ func checkNormalizedRankBounds(rank float64) error { return nil } -func checkItems[C comparable](items []C, lessFn common.LessFn[C]) error { +func checkItems[C comparable](items []C, compareFn common.CompareFn[C]) error { if len(items) == 1 && internal.IsNil(items[0]) { return errors.New("items must be unique, monotonically increasing and not nil") } for i := 0; i < len(items)-1; i++ { - if !internal.IsNil(items[i]) && !internal.IsNil(items[i+1]) && lessFn(items[i], items[i+1]) { + if !internal.IsNil(items[i]) && !internal.IsNil(items[i+1]) && compareFn(items[i], items[i+1]) { continue } return errors.New("items must be unique, monotonically increasing and not nil") diff --git a/static-analysis.datadog.yml b/static-analysis.datadog.yml new file mode 100644 index 0000000..0b86a20 --- /dev/null +++ b/static-analysis.datadog.yml @@ -0,0 +1,3 @@ +rulesets: + - go-best-practices + - go-security --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
