This is an automated email from the ASF dual-hosted git repository. placave pushed a commit to branch hll-bound-accuracy-go in repository https://gitbox.apache.org/repos/asf/datasketches-characterization.git
commit b952ab781a9a0ad83ac06a5dcd6d7a4927cbfcc5 Author: Pierre Lacave <[email protected]> AuthorDate: Mon Apr 15 12:25:06 2024 +0200 Add HLL accuracy bounds and merge accuracy characterization profile --- go/configs.go | 16 ++- go/distinct_count_accuracy_profile.go | 81 +++-------- ...o => distinct_count_bounds_accuracy_profile.go} | 159 ++++++++------------- go/distinct_count_merge_accuracy_profile.go | 73 ++++++++++ go/{main.go => distinct_count_utils.go} | 48 +++---- ...cy_profile.go => hll_sketch_accuracy_runner.go} | 36 ++++- go/hll_sketch_bounds_accuracy_runner.go | 100 +++++++++++++ go/main.go | 15 +- 8 files changed, 322 insertions(+), 206 deletions(-) diff --git a/go/configs.go b/go/configs.go index 0be92bf..e5d4b17 100644 --- a/go/configs.go +++ b/go/configs.go @@ -17,9 +17,9 @@ package main -import "github.com/apache/datasketches-go/hll" - type distinctCountJobConfigType struct { + lgK int // lgK of distinct count sketch + lgMinU int // The starting # of uniques that is printed at the end. lgMaxU int // How high the # uniques go UPPO int // The horizontal x-resolution of trials points @@ -31,7 +31,9 @@ type distinctCountJobConfigType struct { lgQK int // size of quantiles sketch interData bool // intermediate data - runner DistinctCountAccuracyProfileRunner + numTrials int + numSketches int + distinctKeysPerSketch int } var ( @@ -46,7 +48,11 @@ var ( lgQK: 12, interData: true, - - runner: NewHllSketchAccuracyRunner(4 /* lgK */, hll.TgtHllTypeHll8 /* tgtType */), + } + distinctCountMergeJobConfig = distinctCountJobConfigType{ + lgK: 12, + numTrials: 100, + numSketches: 8192, + distinctKeysPerSketch: 32768, } ) diff --git a/go/distinct_count_accuracy_profile.go b/go/distinct_count_accuracy_profile.go index 25cf8a7..a696fee 100644 --- a/go/distinct_count_accuracy_profile.go +++ b/go/distinct_count_accuracy_profile.go @@ -18,72 +18,22 @@ package main import ( "fmt" - "github.com/apache/datasketches-go/common" - "github.com/apache/datasketches-go/kll" "math" "strings" "time" ) -const ( - M4SD = 0.0000316712418331 //minus 4 StdDev - M3SD = 0.0013498980316301 //minus 3 StdDev - M2SD = 0.0227501319481792 //minus 2 StdDev - M1SD = 0.1586552539314570 //minus 1 StdDev - MED = 0.5 //median - P1SD = 0.8413447460685430 //plus 1 StdDev - P2SD = 0.9772498680518210 //plus 2 StdDev - P3SD = 0.9986501019683700 //plus 3 StdDev - P4SD = 0.9999683287581670 //plus 4 StdDev -) - -var ( - GAUSSIANS_4SD = []float64{0.0, M4SD, M3SD, M2SD, M1SD, MED, P1SD, P2SD, P3SD, P4SD, 1.0} -) - -type DistinctCountAccuracyProfileRunner interface { - runTrial(stats []*accuracyStats, key uint64) uint64 -} - -type accuracyStats struct { - qsk *kll.ItemsSketch[float64] - sumEst float64 - sumRelErr float64 - sumSqRelErr float64 - rmse float64 - trueValue uint64 - uniques int - bytes int -} - -func newAccuracyStats(k int, trueValue uint64) *accuracyStats { - qsk, _ := kll.NewKllItemsSketch[float64](uint16(k), 8, common.ArrayOfDoublesSerDe{}) - return &accuracyStats{ - qsk: qsk, - trueValue: trueValue, - uniques: int(trueValue), - } -} - -func (a *accuracyStats) update(est float64) { - a.qsk.Update(est) - a.sumEst += est - a.sumRelErr += est/float64(a.trueValue) - 1.0 - erro := est - float64(a.trueValue) - a.sumSqRelErr += erro * erro -} - type DistinctCountAccuracyProfile struct { config distinctCountJobConfigType runner DistinctCountAccuracyProfileRunner - stats []*accuracyStats + stats []baseAccuracyStats startTime int64 } -func NewDistinctCountAccuracyProfile(config distinctCountJobConfigType) *DistinctCountAccuracyProfile { +func NewDistinctCountAccuracyProfile(config distinctCountJobConfigType, runner DistinctCountAccuracyProfileRunner) *DistinctCountAccuracyProfile { return &DistinctCountAccuracyProfile{ config: config, - runner: config.runner, + runner: runner, stats: buildLog2AccuracyStatsArray(config.lgMinU, config.lgMaxU, config.UPPO, config.lgQK), startTime: time.Now().UnixMilli(), } @@ -113,13 +63,15 @@ func (d *DistinctCountAccuracyProfile) run() { sb := &strings.Builder{} if nextT < maxT { if d.config.interData { - fmt.Println(getHeader()) - process(d.stats, lastTpt, sb) + sb.Reset() + d.setHeader(sb) + d.process(lastTpt, sb) fmt.Println(sb.String()) } } else { - fmt.Println(getHeader()) - process(d.stats, lastTpt, sb) + sb.Reset() + d.setHeader(sb) + d.process(lastTpt, sb) fmt.Println(sb.String()) } @@ -141,11 +93,10 @@ func (d *DistinctCountAccuracyProfile) run() { } } -func process(qArr []*accuracyStats, cumTrials int, sb *strings.Builder) { - points := len(qArr) - sb.Reset() +func (d *DistinctCountAccuracyProfile) process(cumTrials int, sb *strings.Builder) { + points := len(d.stats) for pt := 0; pt < points; pt++ { - q := qArr[pt] + q := d.stats[pt].(*accuracyStats) trueUniques := q.trueValue @@ -185,8 +136,7 @@ func process(qArr []*accuracyStats, cumTrials int, sb *strings.Builder) { } } -func getHeader() string { - sb := &strings.Builder{} +func (d *DistinctCountAccuracyProfile) setHeader(sb *strings.Builder) string { sb.WriteString("TrueU") sb.WriteString("\t") sb.WriteString("MeanEst") @@ -222,12 +172,13 @@ func getHeader() string { sb.WriteString("Bytes") sb.WriteString("\t") sb.WriteString("ReMerit") + sb.WriteString("\n") return sb.String() } -func buildLog2AccuracyStatsArray(lgMin, lgMax, ppo, lgQK int) []*accuracyStats { +func buildLog2AccuracyStatsArray(lgMin, lgMax, ppo, lgQK int) []baseAccuracyStats { qLen := countPoints(lgMin, lgMax, ppo) - qArr := make([]*accuracyStats, qLen) + qArr := make([]baseAccuracyStats, qLen) p := uint64(1) << lgMin for i := 0; i < qLen; i++ { qArr[i] = newAccuracyStats(1<<lgQK, p) diff --git a/go/distinct_count_accuracy_profile.go b/go/distinct_count_bounds_accuracy_profile.go similarity index 55% copy from go/distinct_count_accuracy_profile.go copy to go/distinct_count_bounds_accuracy_profile.go index 25cf8a7..e5a4d2b 100644 --- a/go/distinct_count_accuracy_profile.go +++ b/go/distinct_count_bounds_accuracy_profile.go @@ -18,78 +18,27 @@ package main import ( "fmt" - "github.com/apache/datasketches-go/common" - "github.com/apache/datasketches-go/kll" - "math" "strings" "time" ) -const ( - M4SD = 0.0000316712418331 //minus 4 StdDev - M3SD = 0.0013498980316301 //minus 3 StdDev - M2SD = 0.0227501319481792 //minus 2 StdDev - M1SD = 0.1586552539314570 //minus 1 StdDev - MED = 0.5 //median - P1SD = 0.8413447460685430 //plus 1 StdDev - P2SD = 0.9772498680518210 //plus 2 StdDev - P3SD = 0.9986501019683700 //plus 3 StdDev - P4SD = 0.9999683287581670 //plus 4 StdDev -) - -var ( - GAUSSIANS_4SD = []float64{0.0, M4SD, M3SD, M2SD, M1SD, MED, P1SD, P2SD, P3SD, P4SD, 1.0} -) - -type DistinctCountAccuracyProfileRunner interface { - runTrial(stats []*accuracyStats, key uint64) uint64 -} - -type accuracyStats struct { - qsk *kll.ItemsSketch[float64] - sumEst float64 - sumRelErr float64 - sumSqRelErr float64 - rmse float64 - trueValue uint64 - uniques int - bytes int -} - -func newAccuracyStats(k int, trueValue uint64) *accuracyStats { - qsk, _ := kll.NewKllItemsSketch[float64](uint16(k), 8, common.ArrayOfDoublesSerDe{}) - return &accuracyStats{ - qsk: qsk, - trueValue: trueValue, - uniques: int(trueValue), - } -} - -func (a *accuracyStats) update(est float64) { - a.qsk.Update(est) - a.sumEst += est - a.sumRelErr += est/float64(a.trueValue) - 1.0 - erro := est - float64(a.trueValue) - a.sumSqRelErr += erro * erro -} - -type DistinctCountAccuracyProfile struct { +type DistinctCountBoundsAccuracyProfile struct { config distinctCountJobConfigType runner DistinctCountAccuracyProfileRunner - stats []*accuracyStats + stats []baseAccuracyStats startTime int64 } -func NewDistinctCountAccuracyProfile(config distinctCountJobConfigType) *DistinctCountAccuracyProfile { - return &DistinctCountAccuracyProfile{ +func NewDistinctCountBoundsAccuracyProfile(config distinctCountJobConfigType, runner DistinctCountAccuracyProfileRunner) *DistinctCountBoundsAccuracyProfile { + return &DistinctCountBoundsAccuracyProfile{ config: config, - runner: config.runner, - stats: buildLog2AccuracyStatsArray(config.lgMinU, config.lgMaxU, config.UPPO, config.lgQK), + runner: runner, + stats: buildLog2BoundsAccuracyStatsArray(config.lgMinU, config.lgMaxU, config.UPPO, config.lgQK), startTime: time.Now().UnixMilli(), } } -func (d *DistinctCountAccuracyProfile) run() { +func (d *DistinctCountBoundsAccuracyProfile) run() { minT := 1 << d.config.lgMinT maxT := 1 << d.config.lgMaxT maxU := 1 << d.config.lgMaxU @@ -113,13 +62,15 @@ func (d *DistinctCountAccuracyProfile) run() { sb := &strings.Builder{} if nextT < maxT { if d.config.interData { - fmt.Println(getHeader()) - process(d.stats, lastTpt, sb) + sb.Reset() + d.setHeader(sb) + d.process(lastTpt, sb) fmt.Println(sb.String()) } } else { - fmt.Println(getHeader()) - process(d.stats, lastTpt, sb) + sb.Reset() + d.setHeader(sb) + d.process(lastTpt, sb) fmt.Println(sb.String()) } @@ -141,65 +92,59 @@ func (d *DistinctCountAccuracyProfile) run() { } } -func process(qArr []*accuracyStats, cumTrials int, sb *strings.Builder) { - points := len(qArr) - sb.Reset() +func (d *DistinctCountBoundsAccuracyProfile) process(cumTrials int, sb *strings.Builder) { + points := len(d.stats) for pt := 0; pt < points; pt++ { - q := qArr[pt] + q := d.stats[pt].(*boundsAccuracyStats) trueUniques := q.trueValue + relLb3 := q.sumLB3/float64(cumTrials)/float64(trueUniques) - 1.0 + relLb2 := q.sumLB2/float64(cumTrials)/float64(trueUniques) - 1.0 + relLb1 := q.sumLB1/float64(cumTrials)/float64(trueUniques) - 1.0 - meanEst := q.sumEst / float64(cumTrials) - meanRelErr := q.sumRelErr / float64(cumTrials) - meanSqErr := q.sumSqRelErr / float64(cumTrials) - normMeanSqErr := meanSqErr / (float64(trueUniques) * float64(trueUniques)) - rmsRelErr := math.Sqrt(normMeanSqErr) - q.rmse = rmsRelErr + relLUb1 := q.sumUB1/float64(cumTrials)/float64(trueUniques) - 1.0 + relLUb2 := q.sumUB2/float64(cumTrials)/float64(trueUniques) - 1.0 + relLUb3 := q.sumUB3/float64(cumTrials)/float64(trueUniques) - 1.0 + // OUTPUT sb.WriteString(fmt.Sprintf("%d", trueUniques)) sb.WriteString("\t") - - sb.WriteString(fmt.Sprintf("%e", meanEst)) - sb.WriteString("\t") - - sb.WriteString(fmt.Sprintf("%e", meanRelErr)) - sb.WriteString("\t") - - sb.WriteString(fmt.Sprintf("%e", rmsRelErr)) - sb.WriteString("\t") - + // TRIALS sb.WriteString(fmt.Sprintf("%d", cumTrials)) sb.WriteString("\t") - quants, _ := q.qsk.GetQuantiles(GAUSSIANS_4SD, true) + // Quantiles + quants, _ := q.qsk.GetQuantiles(GAUSSIANS_3SD, true) for i := 0; i < len(quants); i++ { - sb.WriteString(fmt.Sprintf("%e", float64(quants[i])/(float64(trueUniques))-1.0)) + sb.WriteString(fmt.Sprintf("%f", quants[i]/float64(trueUniques)-1.0)) sb.WriteString("\t") } - sb.WriteString(fmt.Sprintf("%d", 0)) + // Bound averages + sb.WriteString(fmt.Sprintf("%f", relLb3)) + sb.WriteString("\t") + sb.WriteString(fmt.Sprintf("%f", relLb2)) + sb.WriteString("\t") + sb.WriteString(fmt.Sprintf("%f", relLb1)) + sb.WriteString("\t") + + sb.WriteString(fmt.Sprintf("%f", relLUb1)) + sb.WriteString("\t") + sb.WriteString(fmt.Sprintf("%f", relLUb2)) + sb.WriteString("\t") + sb.WriteString(fmt.Sprintf("%f", relLUb3)) sb.WriteString("\t") - sb.WriteString(fmt.Sprintf("%d", 0)) - sb.WriteString("\n") } } -func getHeader() string { - sb := &strings.Builder{} - sb.WriteString("TrueU") - sb.WriteString("\t") - sb.WriteString("MeanEst") - sb.WriteString("\t") - sb.WriteString("MeanRelErr") - sb.WriteString("\t") - sb.WriteString("RMS_RE") +func (d *DistinctCountBoundsAccuracyProfile) setHeader(sb *strings.Builder) string { + sb.WriteString("InU") sb.WriteString("\t") sb.WriteString("Trials") sb.WriteString("\t") sb.WriteString("Min") sb.WriteString("\t") - sb.WriteString("Q(.0000317)") sb.WriteString("\t") sb.WriteString("Q(.00135)") sb.WriteString("\t") @@ -215,22 +160,30 @@ func getHeader() string { sb.WriteString("\t") sb.WriteString("Q(.99865)") sb.WriteString("\t") - sb.WriteString("Q(.9999683)") sb.WriteString("\t") sb.WriteString("Max") sb.WriteString("\t") - sb.WriteString("Bytes") + sb.WriteString("avgLB3") + sb.WriteString("\t") + sb.WriteString("avgLB2") + sb.WriteString("\t") + sb.WriteString("avgLB1") + sb.WriteString("\t") + sb.WriteString("avgUB1") + sb.WriteString("\t") + sb.WriteString("avgUB2") sb.WriteString("\t") - sb.WriteString("ReMerit") + sb.WriteString("avgUB3") + sb.WriteString("\n") return sb.String() } -func buildLog2AccuracyStatsArray(lgMin, lgMax, ppo, lgQK int) []*accuracyStats { +func buildLog2BoundsAccuracyStatsArray(lgMin, lgMax, ppo, lgQK int) []baseAccuracyStats { qLen := countPoints(lgMin, lgMax, ppo) - qArr := make([]*accuracyStats, qLen) + qArr := make([]baseAccuracyStats, qLen) p := uint64(1) << lgMin for i := 0; i < qLen; i++ { - qArr[i] = newAccuracyStats(1<<lgQK, p) + qArr[i] = newBoundsAccuracyStats(1<<lgQK, p) p = pwr2SeriesNext(ppo, p) } return qArr diff --git a/go/distinct_count_merge_accuracy_profile.go b/go/distinct_count_merge_accuracy_profile.go new file mode 100644 index 0000000..dffce1d --- /dev/null +++ b/go/distinct_count_merge_accuracy_profile.go @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package main + +import ( + "fmt" + "github.com/apache/datasketches-go/hll" + "math" + "math/rand/v2" + "time" +) + +type DistinctCountMergeAccuracyProfile struct { + config distinctCountJobConfigType + startTime int64 +} + +func NewDistinctCountMergeAccuracyProfile(config distinctCountJobConfigType) *DistinctCountMergeAccuracyProfile { + return &DistinctCountMergeAccuracyProfile{ + config: config, + startTime: time.Now().UnixMilli(), + } +} + +func (d *DistinctCountMergeAccuracyProfile) run() { + key := rand.Int64() + trueCount := d.config.numSketches * d.config.distinctKeysPerSketch + + var ( + sumEstimates float64 + sumOfSquaredDeviationsFromTrueCount float64 + ) + + for t := 0; t < d.config.numTrials; t++ { + union, _ := hll.NewUnion(d.config.lgK) + + for s := 0; s < d.config.numSketches; s++ { + sk, _ := hll.NewHllSketch(d.config.lgK, hll.TgtHllTypeHll8) + for k := 0; k < d.config.distinctKeysPerSketch; k++ { + sk.UpdateInt64(key) + key += 1 + } + union.UpdateSketch(sk) + } + skRes, _ := union.GetResult(hll.TgtHllTypeDefault) + estimatedCount, _ := skRes.GetEstimate() + sumEstimates += estimatedCount + sumOfSquaredDeviationsFromTrueCount += (estimatedCount - float64(trueCount)) * (estimatedCount - float64(trueCount)) + } + + meanEstimate := sumEstimates / float64(d.config.numTrials) + meanRelativeError := meanEstimate/float64(trueCount) - 1 + relativeStandardError := math.Sqrt(sumOfSquaredDeviationsFromTrueCount/float64(d.config.numTrials)) / float64(trueCount) + + fmt.Println(fmt.Sprintf("True count: %d", trueCount)) + fmt.Println(fmt.Sprintf("Mean Estimate: %f", meanEstimate)) + fmt.Println(fmt.Sprintf("Mean Relative Error: %f", meanRelativeError)) + fmt.Println(fmt.Sprintf("Relative Standard Error: %f", relativeStandardError)) +} diff --git a/go/main.go b/go/distinct_count_utils.go similarity index 54% copy from go/main.go copy to go/distinct_count_utils.go index 1ac185f..5093457 100644 --- a/go/main.go +++ b/go/distinct_count_utils.go @@ -14,38 +14,28 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package main -import ( - "fmt" - "os" -) - -var ( - jobs = map[string]JobProfile{ - "distinct_count_accuracy_profile": NewDistinctCountAccuracyProfile(distinctCountJobConfig), - } -) - -func usage() { - fmt.Println("Usage: go run main.go <job>") - fmt.Println("Available jobs:") - for job := range jobs { - fmt.Println(job) - } - os.Exit(1) +type baseAccuracyStats interface { } -func main() { - if len(os.Args) < 2 || os.Args[1] == "-h" || os.Args[1] == "--help" { - usage() - } +type DistinctCountAccuracyProfileRunner interface { + runTrial(stats []baseAccuracyStats, key uint64) uint64 +} - job, ok := jobs[os.Args[1]] - if !ok { - usage() - } +const ( + M4SD = 0.0000316712418331 //minus 4 StdDev + M3SD = 0.0013498980316301 //minus 3 StdDev + M2SD = 0.0227501319481792 //minus 2 StdDev + M1SD = 0.1586552539314570 //minus 1 StdDev + MED = 0.5 //median + P1SD = 0.8413447460685430 //plus 1 StdDev + P2SD = 0.9772498680518210 //plus 2 StdDev + P3SD = 0.9986501019683700 //plus 3 StdDev + P4SD = 0.9999683287581670 //plus 4 StdDev +) - job.run() -} +var ( + GAUSSIANS_4SD = []float64{0.0, M4SD, M3SD, M2SD, M1SD, MED, P1SD, P2SD, P3SD, P4SD, 1.0} + GAUSSIANS_3SD = []float64{0.0, M3SD, M2SD, M1SD, MED, P1SD, P2SD, P3SD, 1.0} +) diff --git a/go/hll_sketch_accuracy_profile.go b/go/hll_sketch_accuracy_runner.go similarity index 58% rename from go/hll_sketch_accuracy_profile.go rename to go/hll_sketch_accuracy_runner.go index 8d667c9..5dcf374 100644 --- a/go/hll_sketch_accuracy_profile.go +++ b/go/hll_sketch_accuracy_runner.go @@ -18,13 +18,44 @@ package main import ( + "github.com/apache/datasketches-go/common" "github.com/apache/datasketches-go/hll" + "github.com/apache/datasketches-go/kll" ) +// HllSketchAccuracyRunner is A Runner for HLL tracking accuracyStats type HllSketchAccuracyRunner struct { sketch hll.HllSketch } +type accuracyStats struct { + qsk *kll.ItemsSketch[float64] + sumEst float64 + sumRelErr float64 + sumSqRelErr float64 + rmse float64 + trueValue uint64 + uniques int + bytes int +} + +func newAccuracyStats(k int, trueValue uint64) *accuracyStats { + qsk, _ := kll.NewKllItemsSketch[float64](uint16(k), 8, common.ArrayOfDoublesSerDe{}) + return &accuracyStats{ + qsk: qsk, + trueValue: trueValue, + uniques: int(trueValue), + } +} + +func (a *accuracyStats) update(est float64) { + a.qsk.Update(est) + a.sumEst += est + a.sumRelErr += est/float64(a.trueValue) - 1.0 + erro := est - float64(a.trueValue) + a.sumSqRelErr += erro * erro +} + func NewHllSketchAccuracyRunner(lgK int, tgtType hll.TgtHllType) *HllSketchAccuracyRunner { sketch, _ := hll.NewHllSketch(lgK, tgtType) return &HllSketchAccuracyRunner{ @@ -32,11 +63,12 @@ func NewHllSketchAccuracyRunner(lgK int, tgtType hll.TgtHllType) *HllSketchAccur } } -func (h *HllSketchAccuracyRunner) runTrial(stats []*accuracyStats, key uint64) uint64 { +func (h *HllSketchAccuracyRunner) runTrial(stats []baseAccuracyStats, key uint64) uint64 { h.sketch.Reset() lastUniques := uint64(0) - for _, stat := range stats { + for _, ostat := range stats { + stat := ostat.(*accuracyStats) delta := stat.trueValue - lastUniques for u := uint64(0); u < delta; u++ { h.sketch.UpdateUInt64(key) diff --git a/go/hll_sketch_bounds_accuracy_runner.go b/go/hll_sketch_bounds_accuracy_runner.go new file mode 100644 index 0000000..f62faf8 --- /dev/null +++ b/go/hll_sketch_bounds_accuracy_runner.go @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main + +import ( + "github.com/apache/datasketches-go/common" + "github.com/apache/datasketches-go/hll" + "github.com/apache/datasketches-go/kll" +) + +// HllSketchBoundsAccuracyRunner is A Runner for HLL tracking boundsAccuracyStats +type HllSketchBoundsAccuracyRunner struct { + sketch hll.HllSketch +} + +type boundsAccuracyStats struct { + qsk *kll.ItemsSketch[float64] + sumLB3 float64 + sumLB2 float64 + sumLB1 float64 + sumUB3 float64 + sumUB2 float64 + sumUB1 float64 + trueValue uint64 +} + +func newBoundsAccuracyStats(k int, trueValue uint64) *boundsAccuracyStats { + qsk, _ := kll.NewKllItemsSketch[float64](uint16(k), 8, common.ArrayOfDoublesSerDe{}) + return &boundsAccuracyStats{ + qsk: qsk, + trueValue: trueValue, + } +} + +func (a *boundsAccuracyStats) update( + est float64, + lb3 float64, + lb2 float64, + lb1 float64, + ub1 float64, + ub2 float64, + ub3 float64, +) { + a.qsk.Update(est) + a.sumLB3 += lb3 + a.sumLB2 += lb2 + a.sumLB1 += lb1 + a.sumUB1 += ub1 + a.sumUB2 += ub2 + a.sumUB3 += ub3 +} + +func NewHllSketchBoundsAccuracyRunner(lgK int, tgtType hll.TgtHllType) *HllSketchBoundsAccuracyRunner { + sketch, _ := hll.NewHllSketch(lgK, tgtType) + return &HllSketchBoundsAccuracyRunner{ + sketch: sketch, + } +} + +func (h *HllSketchBoundsAccuracyRunner) runTrial(stats []baseAccuracyStats, key uint64) uint64 { + h.sketch.Reset() + + lastUniques := uint64(0) + for _, ostat := range stats { + stat := ostat.(*boundsAccuracyStats) + delta := stat.trueValue - lastUniques + for u := uint64(0); u < delta; u++ { + h.sketch.UpdateUInt64(key) + key++ + } + lastUniques += delta + est, _ := h.sketch.GetEstimate() + lb3, _ := h.sketch.GetLowerBound(3) + lb2, _ := h.sketch.GetLowerBound(2) + lb1, _ := h.sketch.GetLowerBound(1) + + ub1, _ := h.sketch.GetUpperBound(1) + ub2, _ := h.sketch.GetUpperBound(2) + ub3, _ := h.sketch.GetUpperBound(3) + + stat.update(est, lb3, lb2, lb1, ub1, ub2, ub3) + } + + return key +} diff --git a/go/main.go b/go/main.go index 1ac185f..eeda1ea 100644 --- a/go/main.go +++ b/go/main.go @@ -19,12 +19,23 @@ package main import ( "fmt" + "github.com/apache/datasketches-go/hll" "os" ) var ( jobs = map[string]JobProfile{ - "distinct_count_accuracy_profile": NewDistinctCountAccuracyProfile(distinctCountJobConfig), + "distinct_count_accuracy_profile": NewDistinctCountAccuracyProfile( + distinctCountJobConfig, + NewHllSketchAccuracyRunner(distinctCountJobConfig.lgK, hll.TgtHllTypeHll8 /* tgtType */), + ), + "distinct_count_bound_accuracy_profile": NewDistinctCountBoundsAccuracyProfile( + distinctCountJobConfig, + NewHllSketchBoundsAccuracyRunner(distinctCountJobConfig.lgK, hll.TgtHllTypeHll8 /* tgtType */), + ), + "distinct_count_merge_accuracy_profile": NewDistinctCountMergeAccuracyProfile( + distinctCountMergeJobConfig, + ), } ) @@ -32,7 +43,7 @@ func usage() { fmt.Println("Usage: go run main.go <job>") fmt.Println("Available jobs:") for job := range jobs { - fmt.Println(job) + fmt.Println(fmt.Sprintf("\t%s", job)) } os.Exit(1) } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
