This is an automated email from the ASF dual-hosted git repository. placave pushed a commit to branch go-hll-merge-speed-profile in repository https://gitbox.apache.org/repos/asf/datasketches-characterization.git
commit cb87b12fbfd6977d0a6fd9e64d770b825437fd92 Author: Pierre Lacave <[email protected]> AuthorDate: Mon Apr 29 19:40:59 2024 +0200 [Go] Add go merge speed profile for HLL and cleanup --- go/configs.go | 74 ----------- go/distinct_count_accuracy_profile.go | 135 ++++++++++++++++--- go/distinct_count_bounds_accuracy_profile.go | 190 --------------------------- go/distinct_count_merge_accuracy_profile.go | 6 +- go/distinct_count_merge_speed_profile.go | 122 +++++++++++++++++ go/distinct_count_utils.go | 24 ++++ go/hll_sketch_accuracy_runner.go | 64 --------- go/hll_sketch_bounds_accuracy_runner.go | 100 -------------- go/main.go | 39 +++++- go/main_test.go | 8 +- 10 files changed, 304 insertions(+), 458 deletions(-) diff --git a/go/configs.go b/go/configs.go deleted file mode 100644 index 1e2cbac..0000000 --- a/go/configs.go +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package main - -type distinctCountJobConfigType struct { - lgK int // lgK of distinct count sketch - - lgMinU int // The starting # of uniques that is printed at the end. - lgMaxU int // How high the # uniques go - UPPO int // The horizontal x-resolution of trials points - - lgMinT int // prints intermediate results starting w/ this lgMinT - lgMaxT int // The max trials - TPPO int // how often intermediate results are printed - - lgQK int // size of quantiles sketch - interData bool // intermediate data - - numTrials int - numSketches int - distinctKeysPerSketch int -} - -var ( - distinctCountJobConfig = distinctCountJobConfigType{ - lgK: 4, - - lgMinU: 0, - lgMaxU: 20, - UPPO: 16, - - lgMinT: 8, - lgMaxT: 20, - TPPO: 1, - - lgQK: 12, - interData: true, - } - distinctCountBoundsJobConfig = distinctCountJobConfigType{ - lgK: 4, - - lgMinU: 0, - lgMaxU: 20, - UPPO: 16, - - lgMinT: 10, - lgMaxT: 20, - TPPO: 1, - - lgQK: 12, - interData: true, - } - distinctCountMergeJobConfig = distinctCountJobConfigType{ - lgK: 12, - numTrials: 100, - numSketches: 8192, - distinctKeysPerSketch: 32768, - } -) diff --git a/go/distinct_count_accuracy_profile.go b/go/distinct_count_accuracy_profile.go index a696fee..0ecc81e 100644 --- a/go/distinct_count_accuracy_profile.go +++ b/go/distinct_count_accuracy_profile.go @@ -18,6 +18,9 @@ package main import ( "fmt" + "github.com/apache/datasketches-go/common" + "github.com/apache/datasketches-go/hll" + "github.com/apache/datasketches-go/kll" "math" "strings" "time" @@ -25,20 +28,43 @@ import ( type DistinctCountAccuracyProfile struct { config distinctCountJobConfigType - runner DistinctCountAccuracyProfileRunner + sketch hll.HllSketch stats []baseAccuracyStats startTime int64 } -func NewDistinctCountAccuracyProfile(config distinctCountJobConfigType, runner DistinctCountAccuracyProfileRunner) *DistinctCountAccuracyProfile { +type accuracyStats struct { + qsk *kll.ItemsSketch[float64] + sumLB3 float64 + sumLB2 float64 + sumLB1 float64 + sumUB3 float64 + sumUB2 float64 + sumUB1 float64 + sumEst float64 + sumRelErr float64 + sumSqRelErr float64 + trueValue uint64 +} + +func NewDistinctCountAccuracyProfile(config distinctCountJobConfigType, tgtType hll.TgtHllType) *DistinctCountAccuracyProfile { + sketch, _ := hll.NewHllSketch(config.lgK, tgtType) return &DistinctCountAccuracyProfile{ config: config, - runner: runner, - stats: buildLog2AccuracyStatsArray(config.lgMinU, config.lgMaxU, config.UPPO, config.lgQK), + sketch: sketch, + stats: buildLog2AccuracyStatsArray(config.lgMinU, config.lgMaxU, config.uppo, config.lgQK), startTime: time.Now().UnixMilli(), } } +func newAccuracyStats(k int, trueValue uint64) *accuracyStats { + qsk, _ := kll.NewKllItemsSketch[float64](uint16(k), 8, common.ArrayOfDoublesSerDe{}) + return &accuracyStats{ + qsk: qsk, + trueValue: trueValue, + } +} + func (d *DistinctCountAccuracyProfile) run() { minT := 1 << d.config.lgMinT maxT := 1 << d.config.lgMaxT @@ -53,11 +79,11 @@ func (d *DistinctCountAccuracyProfile) run() { if lastTpt == 0 { nextT = minT } else { - nextT = int(pwr2SeriesNext(d.config.TPPO, uint64(lastTpt))) + nextT = int(pwr2SeriesNext(d.config.tppo, uint64(lastTpt))) } delta := nextT - lastTpt for i := 0; i < delta; i++ { - vIn = d.runner.runTrial(d.stats, vIn) + vIn = d.runTrial(vIn) } lastTpt = nextT sb := &strings.Builder{} @@ -99,13 +125,19 @@ func (d *DistinctCountAccuracyProfile) process(cumTrials int, sb *strings.Builde q := d.stats[pt].(*accuracyStats) trueUniques := q.trueValue - meanEst := q.sumEst / float64(cumTrials) meanRelErr := q.sumRelErr / float64(cumTrials) meanSqErr := q.sumSqRelErr / float64(cumTrials) normMeanSqErr := meanSqErr / (float64(trueUniques) * float64(trueUniques)) rmsRelErr := math.Sqrt(normMeanSqErr) - q.rmse = rmsRelErr + + relLb3 := q.sumLB3/float64(cumTrials)/float64(trueUniques) - 1.0 + relLb2 := q.sumLB2/float64(cumTrials)/float64(trueUniques) - 1.0 + relLb1 := q.sumLB1/float64(cumTrials)/float64(trueUniques) - 1.0 + + relUb1 := q.sumUB1/float64(cumTrials)/float64(trueUniques) - 1.0 + relUb2 := q.sumUB2/float64(cumTrials)/float64(trueUniques) - 1.0 + relUb3 := q.sumUB3/float64(cumTrials)/float64(trueUniques) - 1.0 sb.WriteString(fmt.Sprintf("%d", trueUniques)) sb.WriteString("\t") @@ -122,17 +154,28 @@ func (d *DistinctCountAccuracyProfile) process(cumTrials int, sb *strings.Builde sb.WriteString(fmt.Sprintf("%d", cumTrials)) sb.WriteString("\t") - quants, _ := q.qsk.GetQuantiles(GAUSSIANS_4SD, true) + // Quantiles + quants, _ := q.qsk.GetQuantiles(GAUSSIANS_3SD, true) for i := 0; i < len(quants); i++ { - sb.WriteString(fmt.Sprintf("%e", float64(quants[i])/(float64(trueUniques))-1.0)) + sb.WriteString(fmt.Sprintf("%e", quants[i]/float64(trueUniques)-1.0)) sb.WriteString("\t") } - sb.WriteString(fmt.Sprintf("%d", 0)) + // Bound averages + sb.WriteString(fmt.Sprintf("%e", relLb3)) + sb.WriteString("\t") + sb.WriteString(fmt.Sprintf("%e", relLb2)) + sb.WriteString("\t") + sb.WriteString(fmt.Sprintf("%e", relLb1)) sb.WriteString("\t") - sb.WriteString(fmt.Sprintf("%d", 0)) + sb.WriteString(fmt.Sprintf("%e", relUb1)) + sb.WriteString("\t") + sb.WriteString(fmt.Sprintf("%e", relUb2)) + sb.WriteString("\t") + sb.WriteString(fmt.Sprintf("%e", relUb3)) sb.WriteString("\n") + } } @@ -149,8 +192,6 @@ func (d *DistinctCountAccuracyProfile) setHeader(sb *strings.Builder) string { sb.WriteString("\t") sb.WriteString("Min") sb.WriteString("\t") - sb.WriteString("Q(.0000317)") - sb.WriteString("\t") sb.WriteString("Q(.00135)") sb.WriteString("\t") sb.WriteString("Q(.02275)") @@ -165,17 +206,73 @@ func (d *DistinctCountAccuracyProfile) setHeader(sb *strings.Builder) string { sb.WriteString("\t") sb.WriteString("Q(.99865)") sb.WriteString("\t") - sb.WriteString("Q(.9999683)") - sb.WriteString("\t") sb.WriteString("Max") sb.WriteString("\t") - sb.WriteString("Bytes") + sb.WriteString("avgLB3") + sb.WriteString("\t") + sb.WriteString("avgLB2") sb.WriteString("\t") - sb.WriteString("ReMerit") - sb.WriteString("\n") + sb.WriteString("avgLB1") + sb.WriteString("\t") + sb.WriteString("avgUB1") + sb.WriteString("\t") + sb.WriteString("avgUB2") + sb.WriteString("\t") + sb.WriteString("avgUB3") + sb.WriteString("\t") + sb.WriteString("Max") return sb.String() } +func (d *DistinctCountAccuracyProfile) runTrial(key uint64) uint64 { + d.sketch.Reset() + + lastUniques := uint64(0) + for _, ostat := range d.stats { + stat := ostat.(*accuracyStats) + delta := stat.trueValue - lastUniques + for u := uint64(0); u < delta; u++ { + key++ + d.sketch.UpdateUInt64(key) + } + lastUniques += delta + est, _ := d.sketch.GetEstimate() + lb3, _ := d.sketch.GetLowerBound(3) + lb2, _ := d.sketch.GetLowerBound(2) + lb1, _ := d.sketch.GetLowerBound(1) + + ub1, _ := d.sketch.GetUpperBound(1) + ub2, _ := d.sketch.GetUpperBound(2) + ub3, _ := d.sketch.GetUpperBound(3) + + stat.update(est, lb3, lb2, lb1, ub1, ub2, ub3) + } + + return key +} + +func (a *accuracyStats) update( + est float64, + lb3 float64, + lb2 float64, + lb1 float64, + ub1 float64, + ub2 float64, + ub3 float64, +) { + a.qsk.Update(est) + a.sumLB3 += lb3 + a.sumLB2 += lb2 + a.sumLB1 += lb1 + a.sumUB1 += ub1 + a.sumUB2 += ub2 + a.sumUB3 += ub3 + a.sumEst += est + a.sumRelErr += est/float64(a.trueValue) - 1.0 + erro := est - float64(a.trueValue) + a.sumSqRelErr += erro * erro +} + func buildLog2AccuracyStatsArray(lgMin, lgMax, ppo, lgQK int) []baseAccuracyStats { qLen := countPoints(lgMin, lgMax, ppo) qArr := make([]baseAccuracyStats, qLen) diff --git a/go/distinct_count_bounds_accuracy_profile.go b/go/distinct_count_bounds_accuracy_profile.go deleted file mode 100644 index 80c072f..0000000 --- a/go/distinct_count_bounds_accuracy_profile.go +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package main - -import ( - "fmt" - "strings" - "time" -) - -type DistinctCountBoundsAccuracyProfile struct { - config distinctCountJobConfigType - runner DistinctCountAccuracyProfileRunner - stats []baseAccuracyStats - startTime int64 -} - -func NewDistinctCountBoundsAccuracyProfile(config distinctCountJobConfigType, runner DistinctCountAccuracyProfileRunner) *DistinctCountBoundsAccuracyProfile { - return &DistinctCountBoundsAccuracyProfile{ - config: config, - runner: runner, - stats: buildLog2BoundsAccuracyStatsArray(config.lgMinU, config.lgMaxU, config.UPPO, config.lgQK), - startTime: time.Now().UnixMilli(), - } -} - -func (d *DistinctCountBoundsAccuracyProfile) run() { - minT := 1 << d.config.lgMinT - maxT := 1 << d.config.lgMaxT - maxU := 1 << d.config.lgMaxU - - vIn := uint64(0) - - // This will generate a table of data for each intermediate Trials point - lastTpt := 0 - for lastTpt < maxT { - nextT := lastTpt - if lastTpt == 0 { - nextT = minT - } else { - nextT = int(pwr2SeriesNext(d.config.TPPO, uint64(lastTpt))) - } - delta := nextT - lastTpt - for i := 0; i < delta; i++ { - vIn = d.runner.runTrial(d.stats, vIn) - } - lastTpt = nextT - sb := &strings.Builder{} - if nextT < maxT { - if d.config.interData { - sb.Reset() - d.setHeader(sb) - d.process(lastTpt, sb) - fmt.Println(sb.String()) - } - } else { - sb.Reset() - d.setHeader(sb) - d.process(lastTpt, sb) - fmt.Println(sb.String()) - } - - fmt.Printf("Config: : %+v\n", d.config) - fmt.Printf("Cum Trials : %d\n", lastTpt) - fmt.Printf("Cum Updates : %d\n", vIn) - currentTime_mS := time.Now().UnixMilli() - cumTime_mS := currentTime_mS - d.startTime - fmt.Printf("Cum Time : %s\n", time.Duration(cumTime_mS*1000*1000)) - timePerTrial_mS := float64(cumTime_mS) / float64(lastTpt) - avgUpdateTime_ns := timePerTrial_mS * 1e6 / float64(maxU) - fmt.Printf("Time Per Trial, mSec: %f\n", timePerTrial_mS) - fmt.Printf("Avg Update Time, nSec: %f\n", avgUpdateTime_ns) - fmt.Printf("Date Time : %s\n", time.Now().Format(time.RFC3339)) - timeToComplete_mS := int64(timePerTrial_mS * float64(maxT-lastTpt)) - fmt.Printf("Est Time to Complete: %s\n", time.Duration(timeToComplete_mS*1000*1000)) - fmt.Printf("Est Time at Completion: %s\n", time.Now().Add(time.Duration(timeToComplete_mS*1000*1000)).Format(time.RFC3339)) - fmt.Println("") - } -} - -func (d *DistinctCountBoundsAccuracyProfile) process(cumTrials int, sb *strings.Builder) { - points := len(d.stats) - for pt := 0; pt < points; pt++ { - q := d.stats[pt].(*boundsAccuracyStats) - - trueUniques := q.trueValue - relLb3 := q.sumLB3/float64(cumTrials)/float64(trueUniques) - 1.0 - relLb2 := q.sumLB2/float64(cumTrials)/float64(trueUniques) - 1.0 - relLb1 := q.sumLB1/float64(cumTrials)/float64(trueUniques) - 1.0 - - relLUb1 := q.sumUB1/float64(cumTrials)/float64(trueUniques) - 1.0 - relLUb2 := q.sumUB2/float64(cumTrials)/float64(trueUniques) - 1.0 - relLUb3 := q.sumUB3/float64(cumTrials)/float64(trueUniques) - 1.0 - - // OUTPUT - sb.WriteString(fmt.Sprintf("%d", trueUniques)) - sb.WriteString("\t") - // TRIALS - sb.WriteString(fmt.Sprintf("%d", cumTrials)) - sb.WriteString("\t") - - // Quantiles - quants, _ := q.qsk.GetQuantiles(GAUSSIANS_3SD, true) - for i := 0; i < len(quants); i++ { - sb.WriteString(fmt.Sprintf("%e", quants[i]/float64(trueUniques)-1.0)) - sb.WriteString("\t") - } - - // Bound averages - sb.WriteString(fmt.Sprintf("%e", relLb3)) - sb.WriteString("\t") - sb.WriteString(fmt.Sprintf("%e", relLb2)) - sb.WriteString("\t") - sb.WriteString(fmt.Sprintf("%e", relLb1)) - sb.WriteString("\t") - - sb.WriteString(fmt.Sprintf("%e", relLUb1)) - sb.WriteString("\t") - sb.WriteString(fmt.Sprintf("%e", relLUb2)) - sb.WriteString("\t") - sb.WriteString(fmt.Sprintf("%e", relLUb3)) - sb.WriteString("\n") - - } -} - -func (d *DistinctCountBoundsAccuracyProfile) setHeader(sb *strings.Builder) string { - sb.WriteString("InU") - sb.WriteString("\t") - sb.WriteString("Trials") - sb.WriteString("\t") - sb.WriteString("Min") - sb.WriteString("\t") - sb.WriteString("\t") - sb.WriteString("Q(.00135)") - sb.WriteString("\t") - sb.WriteString("Q(.02275)") - sb.WriteString("\t") - sb.WriteString("Q(.15866)") - sb.WriteString("\t") - sb.WriteString("Q(.5)") - sb.WriteString("\t") - sb.WriteString("Q(.84134)") - sb.WriteString("\t") - sb.WriteString("Q(.97725)") - sb.WriteString("\t") - sb.WriteString("Q(.99865)") - sb.WriteString("\t") - sb.WriteString("\t") - sb.WriteString("Max") - sb.WriteString("\t") - sb.WriteString("avgLB3") - sb.WriteString("\t") - sb.WriteString("avgLB2") - sb.WriteString("\t") - sb.WriteString("avgLB1") - sb.WriteString("\t") - sb.WriteString("avgUB1") - sb.WriteString("\t") - sb.WriteString("avgUB2") - sb.WriteString("\t") - sb.WriteString("avgUB3") - sb.WriteString("\n") - return sb.String() -} - -func buildLog2BoundsAccuracyStatsArray(lgMin, lgMax, ppo, lgQK int) []baseAccuracyStats { - qLen := countPoints(lgMin, lgMax, ppo) - qArr := make([]baseAccuracyStats, qLen) - p := uint64(1) << lgMin - for i := 0; i < qLen; i++ { - qArr[i] = newBoundsAccuracyStats(1<<lgQK, p) - p = pwr2SeriesNext(ppo, p) - } - return qArr -} diff --git a/go/distinct_count_merge_accuracy_profile.go b/go/distinct_count_merge_accuracy_profile.go index dffce1d..6d5ed88 100644 --- a/go/distinct_count_merge_accuracy_profile.go +++ b/go/distinct_count_merge_accuracy_profile.go @@ -27,11 +27,13 @@ import ( type DistinctCountMergeAccuracyProfile struct { config distinctCountJobConfigType startTime int64 + tgtType hll.TgtHllType } -func NewDistinctCountMergeAccuracyProfile(config distinctCountJobConfigType) *DistinctCountMergeAccuracyProfile { +func NewDistinctCountMergeAccuracyProfile(config distinctCountJobConfigType, tgtType hll.TgtHllType) *DistinctCountMergeAccuracyProfile { return &DistinctCountMergeAccuracyProfile{ config: config, + tgtType: tgtType, startTime: time.Now().UnixMilli(), } } @@ -49,7 +51,7 @@ func (d *DistinctCountMergeAccuracyProfile) run() { union, _ := hll.NewUnion(d.config.lgK) for s := 0; s < d.config.numSketches; s++ { - sk, _ := hll.NewHllSketch(d.config.lgK, hll.TgtHllTypeHll8) + sk, _ := hll.NewHllSketch(d.config.lgK, d.tgtType) for k := 0; k < d.config.distinctKeysPerSketch; k++ { sk.UpdateInt64(key) key += 1 diff --git a/go/distinct_count_merge_speed_profile.go b/go/distinct_count_merge_speed_profile.go new file mode 100644 index 0000000..333cfc6 --- /dev/null +++ b/go/distinct_count_merge_speed_profile.go @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package main + +import ( + "fmt" + "github.com/apache/datasketches-go/hll" + "strings" + "time" +) + +type DistinctCountMergeSpeedProfile struct { + config distinctCountJobConfigType + union hll.Union + source hll.HllSketch + startTime int64 +} + +type mergeSpeedStats struct { + mergeTime_nS float64 + totalTime_nS float64 +} + +func NewDistinctCountMergeSpeedProfile(config distinctCountJobConfigType, tgtType hll.TgtHllType) *DistinctCountMergeSpeedProfile { + union, _ := hll.NewUnion(21) + return &DistinctCountMergeSpeedProfile{ + config: config, + union: union, + startTime: time.Now().UnixMilli(), + } +} + +func (d *DistinctCountMergeSpeedProfile) run() { + sb := &strings.Builder{} + d.setHeader(sb) + fmt.Println(sb.String()) + + stats := &mergeSpeedStats{} + vIn := int64(0) + for lgK := d.config.minLgK; lgK <= d.config.maxLgK; lgK++ { + var ( + lgT = d.config.maxLgK - lgK + d.config.lgMinT + trials = 1 << lgT + sumMergeTime_nS = 0.0 + sumTotalTime_nS = 0.0 + ) + sb.Reset() + vIn = d.resetMerge(lgK, vIn) + for t := 0; t < trials; t++ { + d.runTrial(stats, lgK, d.config.lgDeltaU) + sumMergeTime_nS += stats.mergeTime_nS + sumTotalTime_nS += stats.totalTime_nS + } + stats.mergeTime_nS = sumMergeTime_nS / float64(trials) + stats.totalTime_nS = sumTotalTime_nS / float64(trials) + d.process(stats, lgK, lgT, sb) + fmt.Println(sb.String()) + } +} + +func (d *DistinctCountMergeSpeedProfile) setHeader(sb *strings.Builder) string { + sb.WriteString("LgK") + sb.WriteString("\t") + sb.WriteString("LgT") + sb.WriteString("\t") + sb.WriteString("Merge_nS") + sb.WriteString("\t") + sb.WriteString("Total_nS") + sb.WriteString("\t") + sb.WriteString("PerSlot_nS") + return sb.String() +} + +func (d *DistinctCountMergeSpeedProfile) runTrial(stats *mergeSpeedStats, lgK int, lgDeltaU int) { + start := uint64(0) + mergeTime_nS := uint64(0) + + start = uint64(time.Now().UnixNano()) + d.union.UpdateSketch(d.source) + mergeTime_nS = uint64(time.Now().UnixNano()) - start + + stats.mergeTime_nS = float64(mergeTime_nS) + stats.totalTime_nS = float64(mergeTime_nS) +} +func (d *DistinctCountMergeSpeedProfile) process(stats *mergeSpeedStats, lgK int, lgT int, sb *strings.Builder) string { + sb.WriteString(fmt.Sprintf("%d", lgK)) + sb.WriteString("\t") + sb.WriteString(fmt.Sprintf("%d", lgT)) + sb.WriteString("\t") + sb.WriteString(fmt.Sprintf("%e", stats.mergeTime_nS)) + sb.WriteString("\t") + sb.WriteString(fmt.Sprintf("%e", stats.totalTime_nS)) + sb.WriteString("\t") + sb.WriteString(fmt.Sprintf("%e", stats.totalTime_nS/float64(uint64(1<<lgK)))) + return sb.String() +} + +func (d *DistinctCountMergeSpeedProfile) resetMerge(lgK int, vIn int64) int64 { + d.union, _ = hll.NewUnion(lgK) + d.source, _ = hll.NewHllSketch(lgK, hll.TgtHllTypeDefault) + U := 2 << lgK + for i := 0; i < U; i++ { + vIn++ + d.union.UpdateInt64(vIn) + d.source.UpdateInt64(vIn) + } + return vIn +} diff --git a/go/distinct_count_utils.go b/go/distinct_count_utils.go index 5093457..0d3ec4e 100644 --- a/go/distinct_count_utils.go +++ b/go/distinct_count_utils.go @@ -23,6 +23,30 @@ type DistinctCountAccuracyProfileRunner interface { runTrial(stats []baseAccuracyStats, key uint64) uint64 } +type distinctCountJobConfigType struct { + lgK int // lgK of distinct count sketch + + lgMinU int // The starting # of uniques that is printed at the end. + lgMaxU int // How high the # uniques go + uppo int // The horizontal x-resolution of trials points + + lgMinT int // prints intermediate results starting w/ this lgMinT + lgMaxT int // The max trials + tppo int // how often intermediate results are printed + + minLgK int // X-axis LgK Profile + maxLgK int // X-axis LgK Profile + + lgDeltaU int + + lgQK int // size of quantiles sketch + interData bool // intermediate data + + numTrials int + numSketches int + distinctKeysPerSketch int +} + const ( M4SD = 0.0000316712418331 //minus 4 StdDev M3SD = 0.0013498980316301 //minus 3 StdDev diff --git a/go/hll_sketch_accuracy_runner.go b/go/hll_sketch_accuracy_runner.go index 5dcf374..d732b5d 100644 --- a/go/hll_sketch_accuracy_runner.go +++ b/go/hll_sketch_accuracy_runner.go @@ -17,67 +17,3 @@ package main -import ( - "github.com/apache/datasketches-go/common" - "github.com/apache/datasketches-go/hll" - "github.com/apache/datasketches-go/kll" -) - -// HllSketchAccuracyRunner is A Runner for HLL tracking accuracyStats -type HllSketchAccuracyRunner struct { - sketch hll.HllSketch -} - -type accuracyStats struct { - qsk *kll.ItemsSketch[float64] - sumEst float64 - sumRelErr float64 - sumSqRelErr float64 - rmse float64 - trueValue uint64 - uniques int - bytes int -} - -func newAccuracyStats(k int, trueValue uint64) *accuracyStats { - qsk, _ := kll.NewKllItemsSketch[float64](uint16(k), 8, common.ArrayOfDoublesSerDe{}) - return &accuracyStats{ - qsk: qsk, - trueValue: trueValue, - uniques: int(trueValue), - } -} - -func (a *accuracyStats) update(est float64) { - a.qsk.Update(est) - a.sumEst += est - a.sumRelErr += est/float64(a.trueValue) - 1.0 - erro := est - float64(a.trueValue) - a.sumSqRelErr += erro * erro -} - -func NewHllSketchAccuracyRunner(lgK int, tgtType hll.TgtHllType) *HllSketchAccuracyRunner { - sketch, _ := hll.NewHllSketch(lgK, tgtType) - return &HllSketchAccuracyRunner{ - sketch: sketch, - } -} - -func (h *HllSketchAccuracyRunner) runTrial(stats []baseAccuracyStats, key uint64) uint64 { - h.sketch.Reset() - - lastUniques := uint64(0) - for _, ostat := range stats { - stat := ostat.(*accuracyStats) - delta := stat.trueValue - lastUniques - for u := uint64(0); u < delta; u++ { - h.sketch.UpdateUInt64(key) - key++ - } - lastUniques += delta - est, _ := h.sketch.GetEstimate() - stat.update(est) - } - - return key -} diff --git a/go/hll_sketch_bounds_accuracy_runner.go b/go/hll_sketch_bounds_accuracy_runner.go deleted file mode 100644 index 4cf7760..0000000 --- a/go/hll_sketch_bounds_accuracy_runner.go +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package main - -import ( - "github.com/apache/datasketches-go/common" - "github.com/apache/datasketches-go/hll" - "github.com/apache/datasketches-go/kll" -) - -// HllSketchBoundsAccuracyRunner is A Runner for HLL tracking boundsAccuracyStats -type HllSketchBoundsAccuracyRunner struct { - sketch hll.HllSketch -} - -type boundsAccuracyStats struct { - qsk *kll.ItemsSketch[float64] - sumLB3 float64 - sumLB2 float64 - sumLB1 float64 - sumUB3 float64 - sumUB2 float64 - sumUB1 float64 - trueValue uint64 -} - -func newBoundsAccuracyStats(k int, trueValue uint64) *boundsAccuracyStats { - qsk, _ := kll.NewKllItemsSketch[float64](uint16(k), 8, common.ArrayOfDoublesSerDe{}) - return &boundsAccuracyStats{ - qsk: qsk, - trueValue: trueValue, - } -} - -func (a *boundsAccuracyStats) update( - est float64, - lb3 float64, - lb2 float64, - lb1 float64, - ub1 float64, - ub2 float64, - ub3 float64, -) { - a.qsk.Update(est) - a.sumLB3 += lb3 - a.sumLB2 += lb2 - a.sumLB1 += lb1 - a.sumUB1 += ub1 - a.sumUB2 += ub2 - a.sumUB3 += ub3 -} - -func NewHllSketchBoundsAccuracyRunner(lgK int, tgtType hll.TgtHllType) *HllSketchBoundsAccuracyRunner { - sketch, _ := hll.NewHllSketch(lgK, tgtType) - return &HllSketchBoundsAccuracyRunner{ - sketch: sketch, - } -} - -func (h *HllSketchBoundsAccuracyRunner) runTrial(stats []baseAccuracyStats, key uint64) uint64 { - h.sketch.Reset() - - lastUniques := uint64(0) - for _, ostat := range stats { - stat := ostat.(*boundsAccuracyStats) - delta := stat.trueValue - lastUniques - for u := uint64(0); u < delta; u++ { - key++ - h.sketch.UpdateUInt64(key) - } - lastUniques += delta - est, _ := h.sketch.GetEstimate() - lb3, _ := h.sketch.GetLowerBound(3) - lb2, _ := h.sketch.GetLowerBound(2) - lb1, _ := h.sketch.GetLowerBound(1) - - ub1, _ := h.sketch.GetUpperBound(1) - ub2, _ := h.sketch.GetUpperBound(2) - ub3, _ := h.sketch.GetUpperBound(3) - - stat.update(est, lb3, lb2, lb1, ub1, ub2, ub3) - } - - return key -} diff --git a/go/main.go b/go/main.go index beb1604..8aebea3 100644 --- a/go/main.go +++ b/go/main.go @@ -26,15 +26,40 @@ import ( var ( jobs = map[string]JobProfile{ "distinct_count_accuracy_profile": NewDistinctCountAccuracyProfile( - distinctCountJobConfig, - NewHllSketchAccuracyRunner(distinctCountJobConfig.lgK, hll.TgtHllTypeHll8 /* tgtType */), - ), - "distinct_count_bound_accuracy_profile": NewDistinctCountBoundsAccuracyProfile( - distinctCountBoundsJobConfig, - NewHllSketchBoundsAccuracyRunner(distinctCountBoundsJobConfig.lgK, hll.TgtHllTypeHll8 /* tgtType */), + distinctCountJobConfigType{ + lgK: 11, + + lgMinU: 0, + lgMaxU: 20, + uppo: 16, + + lgMinT: 8, + lgMaxT: 20, + tppo: 1, + + lgQK: 12, + interData: true, + }, + hll.TgtHllTypeHll8, ), "distinct_count_merge_accuracy_profile": NewDistinctCountMergeAccuracyProfile( - distinctCountMergeJobConfig, + distinctCountJobConfigType{ + lgK: 12, + numTrials: 100, + numSketches: 8192, + distinctKeysPerSketch: 32768, + }, + hll.TgtHllTypeHll8, + ), + "distinct_count_merge_speed_profile": NewDistinctCountMergeSpeedProfile( + distinctCountJobConfigType{ + minLgK: 10, + maxLgK: 21, + lgMinT: 11, + lgMaxT: 11, + lgDeltaU: 2, + }, + hll.TgtHllTypeHll8, ), } ) diff --git a/go/main_test.go b/go/main_test.go index f7fd6cd..b07b55d 100644 --- a/go/main_test.go +++ b/go/main_test.go @@ -25,6 +25,10 @@ func TestHllSketchAccuracyRunner(t *testing.T) { jobs["distinct_count_accuracy_profile"].run() } -func TestHllSketchBoundsAccuracyRunner(t *testing.T) { - jobs["distinct_count_bound_accuracy_profile"].run() +func TestHllSketchMergeAccuracyRunner(t *testing.T) { + jobs["distinct_count_merge_accuracy_profile"].run() +} + +func TestHllSketchMergeSpeedRunner(t *testing.T) { + jobs["distinct_count_merge_speed_profile"].run() } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
