This is an automated email from the ASF dual-hosted git repository. alsay pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/datasketches-go.git
commit badf2c50636f8223a9e457cb53be4ac07f6ba2c5 Author: Pierre Lacave <[email protected]> AuthorDate: Tue Dec 19 22:35:44 2023 +0100 Continue implementation of Frequency longSketch + QuickSelect --- common/family.go | 3 +- frequencies/long_sketch.go | 16 ++++++++ frequencies/long_sketch_test.go | 10 +++++ frequencies/preable_utils.go | 11 +++++ frequencies/reverse_purge_long_hash_map.go | 5 +-- thetacommon/quick_select.go | 65 ++++++++++++++++++++++++++++++ 6 files changed, 106 insertions(+), 4 deletions(-) diff --git a/common/family.go b/common/family.go index 7daa5dc..957934f 100644 --- a/common/family.go +++ b/common/family.go @@ -18,5 +18,6 @@ package common const ( - FamilyHllId = 7 + FamilyHllId = 7 + FamilyFrequencyId = 10 ) diff --git a/frequencies/long_sketch.go b/frequencies/long_sketch.go index b7c1ef8..75dcc3a 100644 --- a/frequencies/long_sketch.go +++ b/frequencies/long_sketch.go @@ -20,6 +20,7 @@ package frequencies import ( "fmt" "github.com/apache/datasketches-go/common" + "strings" ) type LongSketch struct { @@ -132,3 +133,18 @@ func (s *LongSketch) Update(item int64, count int64) error { } return nil } + +func (s *LongSketch) serializeToString() string { + var sb strings.Builder + //start the string with parameters of the sketch + serVer := serVer //0 + famID := common.FamilyFrequencyId + lgMaxMapSz := s.lgMaxMapSize + flags := 0 + if s.hashMap.numActive == 0 { + flags = emptyFlagMask + } + fmt.Fprintf(&sb, "%d,%d,%d,%d,%d,%d,", serVer, famID, lgMaxMapSz, flags, s.streamWeight, s.offset) + sb.WriteString(s.hashMap.serializeToString()) //numActive, curMaplen, key[i], value[i], ... + return sb.String() +} diff --git a/frequencies/long_sketch_test.go b/frequencies/long_sketch_test.go index 56e3568..03a0c69 100644 --- a/frequencies/long_sketch_test.go +++ b/frequencies/long_sketch_test.go @@ -32,4 +32,14 @@ func TestFrequentItsemsStringSerialTest(t *testing.T) { sketch.Update(15, 3443) sketch.Update(1000001, 1010230) sketch.Update(1000002, 1010230) + + s := sketch.serializeToString() + assert.Equal(t, "1,10,5,0,2024103,0,4,32,10,200,15,3443,1000001,1010230,1000002,1010230,", s) + /* + LongsSketch new_sketch0 = LongsSketch.getInstance(string0); + String new_string0 = new_sketch0.serializeToString(); + Assert.assertTrue(string0.equals(new_string0)); + Assert.assertTrue(new_sketch0.getMaximumMapCapacity() == sketch.getMaximumMapCapacity()); + Assert.assertTrue(new_sketch0.getCurrentMapCapacity() == sketch.getCurrentMapCapacity()); + */ } diff --git a/frequencies/preable_utils.go b/frequencies/preable_utils.go new file mode 100644 index 0000000..f04aefc --- /dev/null +++ b/frequencies/preable_utils.go @@ -0,0 +1,11 @@ +package frequencies + +const ( + + // emptyFlagMask flag bit masks + // due to a mistake different bits were used in C++ and Java to indicate empty sketch + // therefore both are set and checked for compatibility with historical binary format + emptyFlagMask = 5 + + serVer = 1 +) diff --git a/frequencies/reverse_purge_long_hash_map.go b/frequencies/reverse_purge_long_hash_map.go index 99b9a29..28df67d 100644 --- a/frequencies/reverse_purge_long_hash_map.go +++ b/frequencies/reverse_purge_long_hash_map.go @@ -20,6 +20,7 @@ package frequencies import ( "fmt" "github.com/apache/datasketches-go/common" + "github.com/apache/datasketches-go/thetacommon" "math/bits" "strconv" "strings" @@ -135,9 +136,7 @@ func (r *reversePurgeLongHashMap) purge(sampleSize int) int64 { i++ } - // TODO implement quickSelect - //val := quickSelect(samples, 0, numSamples-1, limit/2) - val := int64(0) + val := thetacommon.QuickSelect(samples, 0, numSamples-1, limit/2) r.adjustAllValuesBy(-1 * val) r.keepOnlyPositiveCounts() return val diff --git a/thetacommon/quick_select.go b/thetacommon/quick_select.go new file mode 100644 index 0000000..47da86a --- /dev/null +++ b/thetacommon/quick_select.go @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package thetacommon + +func QuickSelect(arr []int64, lo int, hi int, pivot int) int64 { + for hi > 0 { + j := partition(arr, lo, hi) + if j == pivot { + return arr[pivot] + } + if j > pivot { + hi = j - 1 + } else { + lo = j + 1 + } + } + return arr[pivot] +} + +func partition(arr []int64, lo int, hi int) int { + i := lo // left scan index + j := hi + 1 // right scan index + v := arr[lo] //partitioning item value + for { + // Scan right, scan left, check for scan complete, and exchange + for arr[i] < v { + i++ + if i == hi { + break + } + } + for v < arr[j] { + j-- + if j == lo { + break + } + } + if i >= j { + break + } + x := arr[i] + arr[i] = arr[j] + arr[j] = x + } + // put v=arr[j] into position with a[lo .. j-1] <= a[j] <= a[j+1 .. hi] + x := arr[lo] + arr[lo] = arr[j] + arr[j] = x + return j +} --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
