This is an automated email from the ASF dual-hosted git repository.

placave pushed a commit to branch go-hll-merge-speed-profile
in repository https://gitbox.apache.org/repos/asf/datasketches-characterization.git

commit cb87b12fbfd6977d0a6fd9e64d770b825437fd92
Author: Pierre Lacave <[email protected]>
AuthorDate: Mon Apr 29 19:40:59 2024 +0200

    [Go] Add go merge speed profile for HLL and cleanup
---
 go/configs.go                                |  74 -----------
 go/distinct_count_accuracy_profile.go        | 135 ++++++++++++++++---
 go/distinct_count_bounds_accuracy_profile.go | 190 ---------------------------
 go/distinct_count_merge_accuracy_profile.go  |   6 +-
 go/distinct_count_merge_speed_profile.go     | 122 +++++++++++++++++
 go/distinct_count_utils.go                   |  24 ++++
 go/hll_sketch_accuracy_runner.go             |  64 ---------
 go/hll_sketch_bounds_accuracy_runner.go      | 100 --------------
 go/main.go                                   |  39 +++++-
 go/main_test.go                              |   8 +-
 10 files changed, 304 insertions(+), 458 deletions(-)

diff --git a/go/configs.go b/go/configs.go
deleted file mode 100644
index 1e2cbac..0000000
--- a/go/configs.go
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package main
-
-type distinctCountJobConfigType struct {
-       lgK int // lgK of distinct count sketch
-
-       lgMinU int // The starting # of uniques that is printed at the end.
-       lgMaxU int // How high the # uniques go
-       UPPO   int // The horizontal x-resolution of trials points
-
-       lgMinT int // prints intermediate results starting w/ this lgMinT
-       lgMaxT int // The max trials
-       TPPO   int // how often intermediate results are printed
-
-       lgQK      int  // size of quantiles sketch
-       interData bool // intermediate data
-
-       numTrials             int
-       numSketches           int
-       distinctKeysPerSketch int
-}
-
-var (
-       distinctCountJobConfig = distinctCountJobConfigType{
-               lgK: 4,
-
-               lgMinU: 0,
-               lgMaxU: 20,
-               UPPO:   16,
-
-               lgMinT: 8,
-               lgMaxT: 20,
-               TPPO:   1,
-
-               lgQK:      12,
-               interData: true,
-       }
-       distinctCountBoundsJobConfig = distinctCountJobConfigType{
-               lgK: 4,
-
-               lgMinU: 0,
-               lgMaxU: 20,
-               UPPO:   16,
-
-               lgMinT: 10,
-               lgMaxT: 20,
-               TPPO:   1,
-
-               lgQK:      12,
-               interData: true,
-       }
-       distinctCountMergeJobConfig = distinctCountJobConfigType{
-               lgK:                   12,
-               numTrials:             100,
-               numSketches:           8192,
-               distinctKeysPerSketch: 32768,
-       }
-)
diff --git a/go/distinct_count_accuracy_profile.go b/go/distinct_count_accuracy_profile.go
index a696fee..0ecc81e 100644
--- a/go/distinct_count_accuracy_profile.go
+++ b/go/distinct_count_accuracy_profile.go
@@ -18,6 +18,9 @@ package main
 
 import (
        "fmt"
+       "github.com/apache/datasketches-go/common"
+       "github.com/apache/datasketches-go/hll"
+       "github.com/apache/datasketches-go/kll"
        "math"
        "strings"
        "time"
@@ -25,20 +28,43 @@ import (
 
 type DistinctCountAccuracyProfile struct {
        config    distinctCountJobConfigType
-       runner    DistinctCountAccuracyProfileRunner
+       sketch    hll.HllSketch
        stats     []baseAccuracyStats
        startTime int64
 }
 
-func NewDistinctCountAccuracyProfile(config distinctCountJobConfigType, runner 
DistinctCountAccuracyProfileRunner) *DistinctCountAccuracyProfile {
+type accuracyStats struct {
+       qsk         *kll.ItemsSketch[float64]
+       sumLB3      float64
+       sumLB2      float64
+       sumLB1      float64
+       sumUB3      float64
+       sumUB2      float64
+       sumUB1      float64
+       sumEst      float64
+       sumRelErr   float64
+       sumSqRelErr float64
+       trueValue   uint64
+}
+
+func NewDistinctCountAccuracyProfile(config distinctCountJobConfigType, tgtType hll.TgtHllType) *DistinctCountAccuracyProfile {
+       sketch, _ := hll.NewHllSketch(config.lgK, tgtType)
        return &DistinctCountAccuracyProfile{
                config:    config,
-               runner:    runner,
-               stats:     buildLog2AccuracyStatsArray(config.lgMinU, 
config.lgMaxU, config.UPPO, config.lgQK),
+               sketch:    sketch,
+               stats:     buildLog2AccuracyStatsArray(config.lgMinU, config.lgMaxU, config.uppo, config.lgQK),
                startTime: time.Now().UnixMilli(),
        }
 }
 
+func newAccuracyStats(k int, trueValue uint64) *accuracyStats {
+       qsk, _ := kll.NewKllItemsSketch[float64](uint16(k), 8, common.ArrayOfDoublesSerDe{})
+       return &accuracyStats{
+               qsk:       qsk,
+               trueValue: trueValue,
+       }
+}
+
 func (d *DistinctCountAccuracyProfile) run() {
        minT := 1 << d.config.lgMinT
        maxT := 1 << d.config.lgMaxT
@@ -53,11 +79,11 @@ func (d *DistinctCountAccuracyProfile) run() {
                if lastTpt == 0 {
                        nextT = minT
                } else {
-                       nextT = int(pwr2SeriesNext(d.config.TPPO, 
uint64(lastTpt)))
+                       nextT = int(pwr2SeriesNext(d.config.tppo, uint64(lastTpt)))
                }
                delta := nextT - lastTpt
                for i := 0; i < delta; i++ {
-                       vIn = d.runner.runTrial(d.stats, vIn)
+                       vIn = d.runTrial(vIn)
                }
                lastTpt = nextT
                sb := &strings.Builder{}
@@ -99,13 +125,19 @@ func (d *DistinctCountAccuracyProfile) process(cumTrials int, sb *strings.Builde
                q := d.stats[pt].(*accuracyStats)
 
                trueUniques := q.trueValue
-
                meanEst := q.sumEst / float64(cumTrials)
                meanRelErr := q.sumRelErr / float64(cumTrials)
                meanSqErr := q.sumSqRelErr / float64(cumTrials)
                normMeanSqErr := meanSqErr / (float64(trueUniques) * 
float64(trueUniques))
                rmsRelErr := math.Sqrt(normMeanSqErr)
-               q.rmse = rmsRelErr
+
+               relLb3 := q.sumLB3/float64(cumTrials)/float64(trueUniques) - 1.0
+               relLb2 := q.sumLB2/float64(cumTrials)/float64(trueUniques) - 1.0
+               relLb1 := q.sumLB1/float64(cumTrials)/float64(trueUniques) - 1.0
+
+               relUb1 := q.sumUB1/float64(cumTrials)/float64(trueUniques) - 1.0
+               relUb2 := q.sumUB2/float64(cumTrials)/float64(trueUniques) - 1.0
+               relUb3 := q.sumUB3/float64(cumTrials)/float64(trueUniques) - 1.0
 
                sb.WriteString(fmt.Sprintf("%d", trueUniques))
                sb.WriteString("\t")
@@ -122,17 +154,28 @@ func (d *DistinctCountAccuracyProfile) process(cumTrials int, sb *strings.Builde
                sb.WriteString(fmt.Sprintf("%d", cumTrials))
                sb.WriteString("\t")
 
-               quants, _ := q.qsk.GetQuantiles(GAUSSIANS_4SD, true)
+               // Quantiles
+               quants, _ := q.qsk.GetQuantiles(GAUSSIANS_3SD, true)
                for i := 0; i < len(quants); i++ {
-                       sb.WriteString(fmt.Sprintf("%e", 
float64(quants[i])/(float64(trueUniques))-1.0))
+                       sb.WriteString(fmt.Sprintf("%e", quants[i]/float64(trueUniques)-1.0))
                        sb.WriteString("\t")
                }
 
-               sb.WriteString(fmt.Sprintf("%d", 0))
+               // Bound averages
+               sb.WriteString(fmt.Sprintf("%e", relLb3))
+               sb.WriteString("\t")
+               sb.WriteString(fmt.Sprintf("%e", relLb2))
+               sb.WriteString("\t")
+               sb.WriteString(fmt.Sprintf("%e", relLb1))
                sb.WriteString("\t")
-               sb.WriteString(fmt.Sprintf("%d", 0))
 
+               sb.WriteString(fmt.Sprintf("%e", relUb1))
+               sb.WriteString("\t")
+               sb.WriteString(fmt.Sprintf("%e", relUb2))
+               sb.WriteString("\t")
+               sb.WriteString(fmt.Sprintf("%e", relUb3))
                sb.WriteString("\n")
+
        }
 }
 
@@ -149,8 +192,6 @@ func (d *DistinctCountAccuracyProfile) setHeader(sb *strings.Builder) string {
        sb.WriteString("\t")
        sb.WriteString("Min")
        sb.WriteString("\t")
-       sb.WriteString("Q(.0000317)")
-       sb.WriteString("\t")
        sb.WriteString("Q(.00135)")
        sb.WriteString("\t")
        sb.WriteString("Q(.02275)")
@@ -165,17 +206,73 @@ func (d *DistinctCountAccuracyProfile) setHeader(sb *strings.Builder) string {
        sb.WriteString("\t")
        sb.WriteString("Q(.99865)")
        sb.WriteString("\t")
-       sb.WriteString("Q(.9999683)")
-       sb.WriteString("\t")
        sb.WriteString("Max")
        sb.WriteString("\t")
-       sb.WriteString("Bytes")
+       sb.WriteString("avgLB3")
+       sb.WriteString("\t")
+       sb.WriteString("avgLB2")
        sb.WriteString("\t")
-       sb.WriteString("ReMerit")
-       sb.WriteString("\n")
+       sb.WriteString("avgLB1")
+       sb.WriteString("\t")
+       sb.WriteString("avgUB1")
+       sb.WriteString("\t")
+       sb.WriteString("avgUB2")
+       sb.WriteString("\t")
+       sb.WriteString("avgUB3")
+       sb.WriteString("\t")
+       sb.WriteString("Max")
        return sb.String()
 }
 
+func (d *DistinctCountAccuracyProfile) runTrial(key uint64) uint64 {
+       d.sketch.Reset()
+
+       lastUniques := uint64(0)
+       for _, ostat := range d.stats {
+               stat := ostat.(*accuracyStats)
+               delta := stat.trueValue - lastUniques
+               for u := uint64(0); u < delta; u++ {
+                       key++
+                       d.sketch.UpdateUInt64(key)
+               }
+               lastUniques += delta
+               est, _ := d.sketch.GetEstimate()
+               lb3, _ := d.sketch.GetLowerBound(3)
+               lb2, _ := d.sketch.GetLowerBound(2)
+               lb1, _ := d.sketch.GetLowerBound(1)
+
+               ub1, _ := d.sketch.GetUpperBound(1)
+               ub2, _ := d.sketch.GetUpperBound(2)
+               ub3, _ := d.sketch.GetUpperBound(3)
+
+               stat.update(est, lb3, lb2, lb1, ub1, ub2, ub3)
+       }
+
+       return key
+}
+
+func (a *accuracyStats) update(
+       est float64,
+       lb3 float64,
+       lb2 float64,
+       lb1 float64,
+       ub1 float64,
+       ub2 float64,
+       ub3 float64,
+) {
+       a.qsk.Update(est)
+       a.sumLB3 += lb3
+       a.sumLB2 += lb2
+       a.sumLB1 += lb1
+       a.sumUB1 += ub1
+       a.sumUB2 += ub2
+       a.sumUB3 += ub3
+       a.sumEst += est
+       a.sumRelErr += est/float64(a.trueValue) - 1.0
+       erro := est - float64(a.trueValue)
+       a.sumSqRelErr += erro * erro
+}
+
 func buildLog2AccuracyStatsArray(lgMin, lgMax, ppo, lgQK int) 
[]baseAccuracyStats {
        qLen := countPoints(lgMin, lgMax, ppo)
        qArr := make([]baseAccuracyStats, qLen)
diff --git a/go/distinct_count_bounds_accuracy_profile.go b/go/distinct_count_bounds_accuracy_profile.go
deleted file mode 100644
index 80c072f..0000000
--- a/go/distinct_count_bounds_accuracy_profile.go
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package main
-
-import (
-       "fmt"
-       "strings"
-       "time"
-)
-
-type DistinctCountBoundsAccuracyProfile struct {
-       config    distinctCountJobConfigType
-       runner    DistinctCountAccuracyProfileRunner
-       stats     []baseAccuracyStats
-       startTime int64
-}
-
-func NewDistinctCountBoundsAccuracyProfile(config distinctCountJobConfigType, 
runner DistinctCountAccuracyProfileRunner) *DistinctCountBoundsAccuracyProfile {
-       return &DistinctCountBoundsAccuracyProfile{
-               config:    config,
-               runner:    runner,
-               stats:     buildLog2BoundsAccuracyStatsArray(config.lgMinU, 
config.lgMaxU, config.UPPO, config.lgQK),
-               startTime: time.Now().UnixMilli(),
-       }
-}
-
-func (d *DistinctCountBoundsAccuracyProfile) run() {
-       minT := 1 << d.config.lgMinT
-       maxT := 1 << d.config.lgMaxT
-       maxU := 1 << d.config.lgMaxU
-
-       vIn := uint64(0)
-
-       // This will generate a table of data for each intermediate Trials point
-       lastTpt := 0
-       for lastTpt < maxT {
-               nextT := lastTpt
-               if lastTpt == 0 {
-                       nextT = minT
-               } else {
-                       nextT = int(pwr2SeriesNext(d.config.TPPO, 
uint64(lastTpt)))
-               }
-               delta := nextT - lastTpt
-               for i := 0; i < delta; i++ {
-                       vIn = d.runner.runTrial(d.stats, vIn)
-               }
-               lastTpt = nextT
-               sb := &strings.Builder{}
-               if nextT < maxT {
-                       if d.config.interData {
-                               sb.Reset()
-                               d.setHeader(sb)
-                               d.process(lastTpt, sb)
-                               fmt.Println(sb.String())
-                       }
-               } else {
-                       sb.Reset()
-                       d.setHeader(sb)
-                       d.process(lastTpt, sb)
-                       fmt.Println(sb.String())
-               }
-
-               fmt.Printf("Config:             : %+v\n", d.config)
-               fmt.Printf("Cum Trials          : %d\n", lastTpt)
-               fmt.Printf("Cum Updates         : %d\n", vIn)
-               currentTime_mS := time.Now().UnixMilli()
-               cumTime_mS := currentTime_mS - d.startTime
-               fmt.Printf("Cum Time            : %s\n", 
time.Duration(cumTime_mS*1000*1000))
-               timePerTrial_mS := float64(cumTime_mS) / float64(lastTpt)
-               avgUpdateTime_ns := timePerTrial_mS * 1e6 / float64(maxU)
-               fmt.Printf("Time Per Trial, mSec: %f\n", timePerTrial_mS)
-               fmt.Printf("Avg Update Time, nSec: %f\n", avgUpdateTime_ns)
-               fmt.Printf("Date Time           : %s\n", 
time.Now().Format(time.RFC3339))
-               timeToComplete_mS := int64(timePerTrial_mS * 
float64(maxT-lastTpt))
-               fmt.Printf("Est Time to Complete: %s\n", 
time.Duration(timeToComplete_mS*1000*1000))
-               fmt.Printf("Est Time at Completion: %s\n", 
time.Now().Add(time.Duration(timeToComplete_mS*1000*1000)).Format(time.RFC3339))
-               fmt.Println("")
-       }
-}
-
-func (d *DistinctCountBoundsAccuracyProfile) process(cumTrials int, sb 
*strings.Builder) {
-       points := len(d.stats)
-       for pt := 0; pt < points; pt++ {
-               q := d.stats[pt].(*boundsAccuracyStats)
-
-               trueUniques := q.trueValue
-               relLb3 := q.sumLB3/float64(cumTrials)/float64(trueUniques) - 1.0
-               relLb2 := q.sumLB2/float64(cumTrials)/float64(trueUniques) - 1.0
-               relLb1 := q.sumLB1/float64(cumTrials)/float64(trueUniques) - 1.0
-
-               relLUb1 := q.sumUB1/float64(cumTrials)/float64(trueUniques) - 
1.0
-               relLUb2 := q.sumUB2/float64(cumTrials)/float64(trueUniques) - 
1.0
-               relLUb3 := q.sumUB3/float64(cumTrials)/float64(trueUniques) - 
1.0
-
-               // OUTPUT
-               sb.WriteString(fmt.Sprintf("%d", trueUniques))
-               sb.WriteString("\t")
-               // TRIALS
-               sb.WriteString(fmt.Sprintf("%d", cumTrials))
-               sb.WriteString("\t")
-
-               // Quantiles
-               quants, _ := q.qsk.GetQuantiles(GAUSSIANS_3SD, true)
-               for i := 0; i < len(quants); i++ {
-                       sb.WriteString(fmt.Sprintf("%e", 
quants[i]/float64(trueUniques)-1.0))
-                       sb.WriteString("\t")
-               }
-
-               // Bound averages
-               sb.WriteString(fmt.Sprintf("%e", relLb3))
-               sb.WriteString("\t")
-               sb.WriteString(fmt.Sprintf("%e", relLb2))
-               sb.WriteString("\t")
-               sb.WriteString(fmt.Sprintf("%e", relLb1))
-               sb.WriteString("\t")
-
-               sb.WriteString(fmt.Sprintf("%e", relLUb1))
-               sb.WriteString("\t")
-               sb.WriteString(fmt.Sprintf("%e", relLUb2))
-               sb.WriteString("\t")
-               sb.WriteString(fmt.Sprintf("%e", relLUb3))
-               sb.WriteString("\n")
-
-       }
-}
-
-func (d *DistinctCountBoundsAccuracyProfile) setHeader(sb *strings.Builder) 
string {
-       sb.WriteString("InU")
-       sb.WriteString("\t")
-       sb.WriteString("Trials")
-       sb.WriteString("\t")
-       sb.WriteString("Min")
-       sb.WriteString("\t")
-       sb.WriteString("\t")
-       sb.WriteString("Q(.00135)")
-       sb.WriteString("\t")
-       sb.WriteString("Q(.02275)")
-       sb.WriteString("\t")
-       sb.WriteString("Q(.15866)")
-       sb.WriteString("\t")
-       sb.WriteString("Q(.5)")
-       sb.WriteString("\t")
-       sb.WriteString("Q(.84134)")
-       sb.WriteString("\t")
-       sb.WriteString("Q(.97725)")
-       sb.WriteString("\t")
-       sb.WriteString("Q(.99865)")
-       sb.WriteString("\t")
-       sb.WriteString("\t")
-       sb.WriteString("Max")
-       sb.WriteString("\t")
-       sb.WriteString("avgLB3")
-       sb.WriteString("\t")
-       sb.WriteString("avgLB2")
-       sb.WriteString("\t")
-       sb.WriteString("avgLB1")
-       sb.WriteString("\t")
-       sb.WriteString("avgUB1")
-       sb.WriteString("\t")
-       sb.WriteString("avgUB2")
-       sb.WriteString("\t")
-       sb.WriteString("avgUB3")
-       sb.WriteString("\n")
-       return sb.String()
-}
-
-func buildLog2BoundsAccuracyStatsArray(lgMin, lgMax, ppo, lgQK int) 
[]baseAccuracyStats {
-       qLen := countPoints(lgMin, lgMax, ppo)
-       qArr := make([]baseAccuracyStats, qLen)
-       p := uint64(1) << lgMin
-       for i := 0; i < qLen; i++ {
-               qArr[i] = newBoundsAccuracyStats(1<<lgQK, p)
-               p = pwr2SeriesNext(ppo, p)
-       }
-       return qArr
-}
diff --git a/go/distinct_count_merge_accuracy_profile.go b/go/distinct_count_merge_accuracy_profile.go
index dffce1d..6d5ed88 100644
--- a/go/distinct_count_merge_accuracy_profile.go
+++ b/go/distinct_count_merge_accuracy_profile.go
@@ -27,11 +27,13 @@ import (
 type DistinctCountMergeAccuracyProfile struct {
        config    distinctCountJobConfigType
        startTime int64
+       tgtType   hll.TgtHllType
 }
 
-func NewDistinctCountMergeAccuracyProfile(config distinctCountJobConfigType) 
*DistinctCountMergeAccuracyProfile {
+func NewDistinctCountMergeAccuracyProfile(config distinctCountJobConfigType, tgtType hll.TgtHllType) *DistinctCountMergeAccuracyProfile {
        return &DistinctCountMergeAccuracyProfile{
                config:    config,
+               tgtType:   tgtType,
                startTime: time.Now().UnixMilli(),
        }
 }
@@ -49,7 +51,7 @@ func (d *DistinctCountMergeAccuracyProfile) run() {
                union, _ := hll.NewUnion(d.config.lgK)
 
                for s := 0; s < d.config.numSketches; s++ {
-                       sk, _ := hll.NewHllSketch(d.config.lgK, 
hll.TgtHllTypeHll8)
+                       sk, _ := hll.NewHllSketch(d.config.lgK, d.tgtType)
                        for k := 0; k < d.config.distinctKeysPerSketch; k++ {
                                sk.UpdateInt64(key)
                                key += 1
diff --git a/go/distinct_count_merge_speed_profile.go b/go/distinct_count_merge_speed_profile.go
new file mode 100644
index 0000000..333cfc6
--- /dev/null
+++ b/go/distinct_count_merge_speed_profile.go
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package main
+
+import (
+       "fmt"
+       "github.com/apache/datasketches-go/hll"
+       "strings"
+       "time"
+)
+
+type DistinctCountMergeSpeedProfile struct {
+       config    distinctCountJobConfigType
+       union     hll.Union
+       source    hll.HllSketch
+       startTime int64
+}
+
+type mergeSpeedStats struct {
+       mergeTime_nS float64
+       totalTime_nS float64
+}
+
+func NewDistinctCountMergeSpeedProfile(config distinctCountJobConfigType, tgtType hll.TgtHllType) *DistinctCountMergeSpeedProfile {
+       union, _ := hll.NewUnion(21)
+       return &DistinctCountMergeSpeedProfile{
+               config:    config,
+               union:     union,
+               startTime: time.Now().UnixMilli(),
+       }
+}
+
+func (d *DistinctCountMergeSpeedProfile) run() {
+       sb := &strings.Builder{}
+       d.setHeader(sb)
+       fmt.Println(sb.String())
+
+       stats := &mergeSpeedStats{}
+       vIn := int64(0)
+       for lgK := d.config.minLgK; lgK <= d.config.maxLgK; lgK++ {
+               var (
+                       lgT             = d.config.maxLgK - lgK + d.config.lgMinT
+                       trials          = 1 << lgT
+                       sumMergeTime_nS = 0.0
+                       sumTotalTime_nS = 0.0
+               )
+               sb.Reset()
+               vIn = d.resetMerge(lgK, vIn)
+               for t := 0; t < trials; t++ {
+                       d.runTrial(stats, lgK, d.config.lgDeltaU)
+                       sumMergeTime_nS += stats.mergeTime_nS
+                       sumTotalTime_nS += stats.totalTime_nS
+               }
+               stats.mergeTime_nS = sumMergeTime_nS / float64(trials)
+               stats.totalTime_nS = sumTotalTime_nS / float64(trials)
+               d.process(stats, lgK, lgT, sb)
+               fmt.Println(sb.String())
+       }
+}
+
+func (d *DistinctCountMergeSpeedProfile) setHeader(sb *strings.Builder) string {
+       sb.WriteString("LgK")
+       sb.WriteString("\t")
+       sb.WriteString("LgT")
+       sb.WriteString("\t")
+       sb.WriteString("Merge_nS")
+       sb.WriteString("\t")
+       sb.WriteString("Total_nS")
+       sb.WriteString("\t")
+       sb.WriteString("PerSlot_nS")
+       return sb.String()
+}
+
+func (d *DistinctCountMergeSpeedProfile) runTrial(stats *mergeSpeedStats, lgK int, lgDeltaU int) {
+       start := uint64(0)
+       mergeTime_nS := uint64(0)
+
+       start = uint64(time.Now().UnixNano())
+       d.union.UpdateSketch(d.source)
+       mergeTime_nS = uint64(time.Now().UnixNano()) - start
+
+       stats.mergeTime_nS = float64(mergeTime_nS)
+       stats.totalTime_nS = float64(mergeTime_nS)
+}
+func (d *DistinctCountMergeSpeedProfile) process(stats *mergeSpeedStats, lgK int, lgT int, sb *strings.Builder) string {
+       sb.WriteString(fmt.Sprintf("%d", lgK))
+       sb.WriteString("\t")
+       sb.WriteString(fmt.Sprintf("%d", lgT))
+       sb.WriteString("\t")
+       sb.WriteString(fmt.Sprintf("%e", stats.mergeTime_nS))
+       sb.WriteString("\t")
+       sb.WriteString(fmt.Sprintf("%e", stats.totalTime_nS))
+       sb.WriteString("\t")
+       sb.WriteString(fmt.Sprintf("%e", stats.totalTime_nS/float64(uint64(1<<lgK))))
+       return sb.String()
+}
+
+func (d *DistinctCountMergeSpeedProfile) resetMerge(lgK int, vIn int64) int64 {
+       d.union, _ = hll.NewUnion(lgK)
+       d.source, _ = hll.NewHllSketch(lgK, hll.TgtHllTypeDefault)
+       U := 2 << lgK
+       for i := 0; i < U; i++ {
+               vIn++
+               d.union.UpdateInt64(vIn)
+               d.source.UpdateInt64(vIn)
+       }
+       return vIn
+}
diff --git a/go/distinct_count_utils.go b/go/distinct_count_utils.go
index 5093457..0d3ec4e 100644
--- a/go/distinct_count_utils.go
+++ b/go/distinct_count_utils.go
@@ -23,6 +23,30 @@ type DistinctCountAccuracyProfileRunner interface {
        runTrial(stats []baseAccuracyStats, key uint64) uint64
 }
 
+type distinctCountJobConfigType struct {
+       lgK int // lgK of distinct count sketch
+
+       lgMinU int // The starting # of uniques that is printed at the end.
+       lgMaxU int // How high the # uniques go
+       uppo   int // The horizontal x-resolution of trials points
+
+       lgMinT int // prints intermediate results starting w/ this lgMinT
+       lgMaxT int // The max trials
+       tppo   int // how often intermediate results are printed
+
+       minLgK int // X-axis LgK Profile
+       maxLgK int // X-axis LgK Profile
+
+       lgDeltaU int
+
+       lgQK      int  // size of quantiles sketch
+       interData bool // intermediate data
+
+       numTrials             int
+       numSketches           int
+       distinctKeysPerSketch int
+}
+
 const (
        M4SD = 0.0000316712418331 //minus 4 StdDev
        M3SD = 0.0013498980316301 //minus 3 StdDev
diff --git a/go/hll_sketch_accuracy_runner.go b/go/hll_sketch_accuracy_runner.go
index 5dcf374..d732b5d 100644
--- a/go/hll_sketch_accuracy_runner.go
+++ b/go/hll_sketch_accuracy_runner.go
@@ -17,67 +17,3 @@
 
 package main
 
-import (
-       "github.com/apache/datasketches-go/common"
-       "github.com/apache/datasketches-go/hll"
-       "github.com/apache/datasketches-go/kll"
-)
-
-// HllSketchAccuracyRunner is A Runner for HLL tracking accuracyStats
-type HllSketchAccuracyRunner struct {
-       sketch hll.HllSketch
-}
-
-type accuracyStats struct {
-       qsk         *kll.ItemsSketch[float64]
-       sumEst      float64
-       sumRelErr   float64
-       sumSqRelErr float64
-       rmse        float64
-       trueValue   uint64
-       uniques     int
-       bytes       int
-}
-
-func newAccuracyStats(k int, trueValue uint64) *accuracyStats {
-       qsk, _ := kll.NewKllItemsSketch[float64](uint16(k), 8, 
common.ArrayOfDoublesSerDe{})
-       return &accuracyStats{
-               qsk:       qsk,
-               trueValue: trueValue,
-               uniques:   int(trueValue),
-       }
-}
-
-func (a *accuracyStats) update(est float64) {
-       a.qsk.Update(est)
-       a.sumEst += est
-       a.sumRelErr += est/float64(a.trueValue) - 1.0
-       erro := est - float64(a.trueValue)
-       a.sumSqRelErr += erro * erro
-}
-
-func NewHllSketchAccuracyRunner(lgK int, tgtType hll.TgtHllType) 
*HllSketchAccuracyRunner {
-       sketch, _ := hll.NewHllSketch(lgK, tgtType)
-       return &HllSketchAccuracyRunner{
-               sketch: sketch,
-       }
-}
-
-func (h *HllSketchAccuracyRunner) runTrial(stats []baseAccuracyStats, key 
uint64) uint64 {
-       h.sketch.Reset()
-
-       lastUniques := uint64(0)
-       for _, ostat := range stats {
-               stat := ostat.(*accuracyStats)
-               delta := stat.trueValue - lastUniques
-               for u := uint64(0); u < delta; u++ {
-                       h.sketch.UpdateUInt64(key)
-                       key++
-               }
-               lastUniques += delta
-               est, _ := h.sketch.GetEstimate()
-               stat.update(est)
-       }
-
-       return key
-}
diff --git a/go/hll_sketch_bounds_accuracy_runner.go b/go/hll_sketch_bounds_accuracy_runner.go
deleted file mode 100644
index 4cf7760..0000000
--- a/go/hll_sketch_bounds_accuracy_runner.go
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package main
-
-import (
-       "github.com/apache/datasketches-go/common"
-       "github.com/apache/datasketches-go/hll"
-       "github.com/apache/datasketches-go/kll"
-)
-
-// HllSketchBoundsAccuracyRunner is A Runner for HLL tracking 
boundsAccuracyStats
-type HllSketchBoundsAccuracyRunner struct {
-       sketch hll.HllSketch
-}
-
-type boundsAccuracyStats struct {
-       qsk       *kll.ItemsSketch[float64]
-       sumLB3    float64
-       sumLB2    float64
-       sumLB1    float64
-       sumUB3    float64
-       sumUB2    float64
-       sumUB1    float64
-       trueValue uint64
-}
-
-func newBoundsAccuracyStats(k int, trueValue uint64) *boundsAccuracyStats {
-       qsk, _ := kll.NewKllItemsSketch[float64](uint16(k), 8, 
common.ArrayOfDoublesSerDe{})
-       return &boundsAccuracyStats{
-               qsk:       qsk,
-               trueValue: trueValue,
-       }
-}
-
-func (a *boundsAccuracyStats) update(
-       est float64,
-       lb3 float64,
-       lb2 float64,
-       lb1 float64,
-       ub1 float64,
-       ub2 float64,
-       ub3 float64,
-) {
-       a.qsk.Update(est)
-       a.sumLB3 += lb3
-       a.sumLB2 += lb2
-       a.sumLB1 += lb1
-       a.sumUB1 += ub1
-       a.sumUB2 += ub2
-       a.sumUB3 += ub3
-}
-
-func NewHllSketchBoundsAccuracyRunner(lgK int, tgtType hll.TgtHllType) 
*HllSketchBoundsAccuracyRunner {
-       sketch, _ := hll.NewHllSketch(lgK, tgtType)
-       return &HllSketchBoundsAccuracyRunner{
-               sketch: sketch,
-       }
-}
-
-func (h *HllSketchBoundsAccuracyRunner) runTrial(stats []baseAccuracyStats, 
key uint64) uint64 {
-       h.sketch.Reset()
-
-       lastUniques := uint64(0)
-       for _, ostat := range stats {
-               stat := ostat.(*boundsAccuracyStats)
-               delta := stat.trueValue - lastUniques
-               for u := uint64(0); u < delta; u++ {
-                       key++
-                       h.sketch.UpdateUInt64(key)
-               }
-               lastUniques += delta
-               est, _ := h.sketch.GetEstimate()
-               lb3, _ := h.sketch.GetLowerBound(3)
-               lb2, _ := h.sketch.GetLowerBound(2)
-               lb1, _ := h.sketch.GetLowerBound(1)
-
-               ub1, _ := h.sketch.GetUpperBound(1)
-               ub2, _ := h.sketch.GetUpperBound(2)
-               ub3, _ := h.sketch.GetUpperBound(3)
-
-               stat.update(est, lb3, lb2, lb1, ub1, ub2, ub3)
-       }
-
-       return key
-}
diff --git a/go/main.go b/go/main.go
index beb1604..8aebea3 100644
--- a/go/main.go
+++ b/go/main.go
@@ -26,15 +26,40 @@ import (
 var (
        jobs = map[string]JobProfile{
                "distinct_count_accuracy_profile": 
NewDistinctCountAccuracyProfile(
-                       distinctCountJobConfig,
-                       NewHllSketchAccuracyRunner(distinctCountJobConfig.lgK, 
hll.TgtHllTypeHll8 /* tgtType */),
-               ),
-               "distinct_count_bound_accuracy_profile": 
NewDistinctCountBoundsAccuracyProfile(
-                       distinctCountBoundsJobConfig,
-                       
NewHllSketchBoundsAccuracyRunner(distinctCountBoundsJobConfig.lgK, 
hll.TgtHllTypeHll8 /* tgtType */),
+                       distinctCountJobConfigType{
+                               lgK: 11,
+
+                               lgMinU: 0,
+                               lgMaxU: 20,
+                               uppo:   16,
+
+                               lgMinT: 8,
+                               lgMaxT: 20,
+                               tppo:   1,
+
+                               lgQK:      12,
+                               interData: true,
+                       },
+                       hll.TgtHllTypeHll8,
                ),
                "distinct_count_merge_accuracy_profile": NewDistinctCountMergeAccuracyProfile(
-                       distinctCountMergeJobConfig,
+                       distinctCountJobConfigType{
+                               lgK:                   12,
+                               numTrials:             100,
+                               numSketches:           8192,
+                               distinctKeysPerSketch: 32768,
+                       },
+                       hll.TgtHllTypeHll8,
+               ),
+               "distinct_count_merge_speed_profile": NewDistinctCountMergeSpeedProfile(
+                       distinctCountJobConfigType{
+                               minLgK:   10,
+                               maxLgK:   21,
+                               lgMinT:   11,
+                               lgMaxT:   11,
+                               lgDeltaU: 2,
+                       },
+                       hll.TgtHllTypeHll8,
                ),
        }
 )
diff --git a/go/main_test.go b/go/main_test.go
index f7fd6cd..b07b55d 100644
--- a/go/main_test.go
+++ b/go/main_test.go
@@ -25,6 +25,10 @@ func TestHllSketchAccuracyRunner(t *testing.T) {
        jobs["distinct_count_accuracy_profile"].run()
 }
 
-func TestHllSketchBoundsAccuracyRunner(t *testing.T) {
-       jobs["distinct_count_bound_accuracy_profile"].run()
+func TestHllSketchMergeAccuracyRunner(t *testing.T) {
+       jobs["distinct_count_merge_accuracy_profile"].run()
+}
+
+func TestHllSketchMergeSpeedRunner(t *testing.T) {
+       jobs["distinct_count_merge_speed_profile"].run()
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to