This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch hll
in repository https://gitbox.apache.org/repos/asf/datasketches-bigquery.git
The following commit(s) were added to refs/heads/hll by this push:
new 3aa5e38 HLL sketch functions and tests
3aa5e38 is described below
commit 3aa5e387c6eb74d661be7cc54cb27bd5cb9c26b7
Author: AlexanderSaydakov <[email protected]>
AuthorDate: Wed Aug 21 14:55:52 2024 -0700
HLL sketch functions and tests
---
hll/README.md | 35 +++++++
hll/{ => sqxl}/hll_sketch_agg_string.sqlx | 2 +-
.../hll_sketch_agg_union.sqlx} | 106 +++++++++------------
hll/{ => sqxl}/hll_sketch_get_estimate.sqlx | 0
hll/sqxl/hll_sketch_scalar_union.sqlx | 62 ++++++++++++
hll/{ => sqxl}/hll_sketch_to_string.sqlx | 0
hll/test/hll_sketch_test.sql | 23 +++++
7 files changed, 164 insertions(+), 64 deletions(-)
diff --git a/hll/README.md b/hll/README.md
new file mode 100644
index 0000000..daf02ba
--- /dev/null
+++ b/hll/README.md
@@ -0,0 +1,35 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# Apache DataSketches functions for Google Cloud BigQuery
+
+Functions to support HLL sketch and union operations.
+
+## Example
+
+ select t.hll_sketch_get_estimate(
+ t.hll_sketch_scalar_union(
+ (select t.hll_sketch_agg_string(str, struct<int, string>(10, "HLL_8")) from unnest(["a", "b", "c"]) as str),
+ (select t.hll_sketch_agg_string(str, struct<int, string>(10, "HLL_8")) from unnest(["c", "d", "e"]) as str),
+ 10,
+ "HLL_8"
+ )
+ );
+
+ result: 5
diff --git a/hll/hll_sketch_agg_string.sqlx b/hll/sqxl/hll_sketch_agg_string.sqlx
similarity index 99%
copy from hll/hll_sketch_agg_string.sqlx
copy to hll/sqxl/hll_sketch_agg_string.sqlx
index c08d5cf..4855324 100644
--- a/hll/hll_sketch_agg_string.sqlx
+++ b/hll/sqxl/hll_sketch_agg_string.sqlx
@@ -27,7 +27,7 @@ OPTIONS (
description = '''Creates a sketch that represents the cardinality of the given STRING column.
Param str: the STRING column of identifiers.
Param lg_k: the sketch accuracy/size parameter as an integer in the range [4, 26].
-Param tgt_type: The HLL type to use, if or when the sketch reaches that state
+Param tgt_type: The HLL type to use, if or when the sketch reaches that state.
Returns: an HLL Sketch, as bytes, from which the cardinality can be obtained.
For more details: https://datasketches.apache.org/docs/HLL/HLL.html'''
) AS R"""
diff --git a/hll/hll_sketch_agg_string.sqlx b/hll/sqxl/hll_sketch_agg_union.sqlx
similarity index 60%
rename from hll/hll_sketch_agg_string.sqlx
rename to hll/sqxl/hll_sketch_agg_union.sqlx
index c08d5cf..2f089ba 100644
--- a/hll/hll_sketch_agg_string.sqlx
+++ b/hll/sqxl/hll_sketch_agg_union.sqlx
@@ -19,16 +19,16 @@
config { hasOutput: true }
-CREATE OR REPLACE AGGREGATE FUNCTION ${self()}(str STRING, params STRUCT<lg_k BYTEINT, tgt_type STRING> NOT AGGREGATE)
+CREATE OR REPLACE AGGREGATE FUNCTION ${self()}(sketch BYTES, params STRUCT<lg_k BYTEINT, tgt_type STRING> NOT AGGREGATE)
RETURNS BYTES
LANGUAGE js
OPTIONS (
library=["gs://$GCS_BUCKET/hll_sketch.mjs"],
- description = '''Creates a sketch that represents the cardinality of the given STRING column.
-Param str: the STRING column of identifiers.
+ description = '''Creates a sketch that represents the union of the given column of sketches.
+Param sketch: the column of sketches. Each as bytes.
Param lg_k: the sketch accuracy/size parameter as an integer in the range [4, 26].
-Param tgt_type: The HLL type to use, if or when the sketch reaches that state
-Returns: an HLL Sketch, as bytes, from which the cardinality can be obtained.
+Param tgt_type: The HLL type to use, if or when the sketch reaches that state.
+Returns: an HLL Sketch, as bytes, from which the union cardinality can be obtained.
For more details: https://datasketches.apache.org/docs/HLL/HLL.html'''
) AS R"""
import ModuleFactory from "gs://$GCS_BUCKET/hll_sketch.mjs";
@@ -36,19 +36,22 @@ var Module = await ModuleFactory();
const default_lg_k = Number(12);
const default_tgt_type = Module.TargetHllType.HLL_4;
-function destroyState(state) {
- if (state.sketch) {
- state.sketch.delete();
- state.sketch = null;
- }
- if (state.union) {
- state.union.delete();
- state.union = null;
+// ensures we have a union
+// if there is a serialized sketch, add it to the union and destroy it
+function ensureUnion(state) {
+ try {
+ if (state.union == null) {
+ state.union = new Module.hll_union(state.lg_k);
+ }
+ if (state.serialized != null) {
+ state.union.updateWithBytes(state.serialized);
+ state.serialized = null;
+ }
+ } catch (e) {
+ throw new Error(Module.getExceptionMessage(e));
}
- state.serialized = null;
}
-// UDAF interface
export function initialState(params) {
try {
var state = {
@@ -65,55 +68,38 @@ export function initialState(params) {
} else {
throw new Error("unrecognized HLL type " + params.tgt_type);
}
- state.sketch = new Module.hll_sketch(state.lg_k, state.tgt_type);
+ state.union = new Module.hll_union(state.lg_k);
return state;
} catch (e) {
throw new Error(Module.getExceptionMessage(e));
}
}
-export function aggregate(state, str) {
- try {
- if (state.sketch == null) { // for transition deserialize-aggregate
- state.sketch = new Module.hll_sketch(state.lg_k, state.tgt_type);
+export function aggregate(state, sketch) {
+ if (sketch != null) {
+ ensureUnion(state);
+ try {
+ state.union.updateWithBytes(sketch);
+ } catch (e) {
+ throw new Error(Module.getExceptionMessage(e));
}
- state.sketch.updateString(str);
- } catch (e) {
- throw new Error(Module.getExceptionMessage(e));
}
}
export function serialize(state) {
- if (state.sketch == null) return state; // for transition deserialize-serialize
+ if (state.union == null) return state; // for transition deserialize-serialize
try {
- try {
- // for prior transition deserialize-aggregate
- // merge aggregated and serialized state
- if (state.sketch != null && state.serialized != null) {
- var u = null;
- try {
- u = new Module.hll_union(state.lg_k);
- u.updateWithSketch(state.sketch);
- u.updateWithBytes(state.serialized);
- state.serialized = u.getResultAsUint8Array(state.tgt_type);
- } finally {
- if (u != null) u.delete();
- }
- } else if (state.sketch != null) {
- state.serialized = state.sketch.serializeAsUint8Array();
- } else if (state.union != null) {
- state.serialized = state.union.getResultAsUint8Array(state.tgt_type);
- }
- return {
- lg_k: state.lg_k,
- tgt_type: state.tgt_type,
- serialized: state.serialized
- };
- } catch (e) {
- throw new Error(Module.getExceptionMessage(e));
- }
+ ensureUnion(state);
+ return {
+ lg_k: state.lg_k,
+ tgt_type: state.tgt_type,
+ serialized: state.union.getResultAsUint8Array(state.tgt_type)
+ };
+ } catch (e) {
+ throw new Error(Module.getExceptionMessage(e));
} finally {
- destroyState(state);
+ state.union.delete();
+ state.union = null;
}
}
@@ -122,24 +108,18 @@ export function deserialize(state) {
}
export function merge(state, other_state) {
- try {
- if (state.union == null) {
- state.union = new Module.hll_union(state.lg_k);
- }
- if (state.serialized) {
- state.union.updateWithBytes(state.serialized);
- state.serialized = null;
- }
- if (other_state.serialized) {
+ ensureUnion(state);
+ if (other_state.serialized != null) {
+ try {
state.union.updateWithBytes(other_state.serialized);
other_state.serialized = null;
+ } catch (e) {
+ throw new Error(Module.getExceptionMessage(e));
}
- } catch (e) {
- throw new Error(Module.getExceptionMessage(e));
}
}
export function finalize(state) {
- return serialize(state).serialized
+ return serialize(state).serialized;
}
""";
diff --git a/hll/hll_sketch_get_estimate.sqlx b/hll/sqxl/hll_sketch_get_estimate.sqlx
similarity index 100%
rename from hll/hll_sketch_get_estimate.sqlx
rename to hll/sqxl/hll_sketch_get_estimate.sqlx
diff --git a/hll/sqxl/hll_sketch_scalar_union.sqlx b/hll/sqxl/hll_sketch_scalar_union.sqlx
new file mode 100644
index 0000000..446b343
--- /dev/null
+++ b/hll/sqxl/hll_sketch_scalar_union.sqlx
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+config { hasOutput: true }
+
+CREATE OR REPLACE FUNCTION ${self()}(sketchA BYTES, sketchB BYTES, lg_k BYTEINT, tgt_type STRING)
+RETURNS BYTES
+LANGUAGE js
+OPTIONS (
+ library=["gs://$GCS_BUCKET/hll_sketch.js"],
+ description = '''Computes a sketch that represents the union of the two given sketches.
+Param sketchA: the first sketch as bytes.
+Param sketchB: the second sketch as bytes.
+Param lg_k: the sketch accuracy/size parameter as an integer in the range [4, 26].
+Param tgt_type: The HLL type to use, if or when the sketch reaches that state.
+Returns an HLL Sketch, as bytes, from which the union cardinality can be obtained.
+For more details: https://datasketches.apache.org/docs/HLL/HLL.html'''
+) AS R"""
+const default_lg_k = 12;
+const default_tgt_type = Module.TargetHllType.HLL_4;
+try {
+ var type;
+ if (tgt_type == null) {
+ type = default_tgt_type;
+ } else if (tgt_type == "HLL_4") {
+ type = Module.TargetHllType.HLL_4;
+ } else if (tgt_type == "HLL_6") {
+ type = Module.TargetHllType.HLL_6;
+ } else if (tgt_type == "HLL_8") {
+ type = Module.TargetHllType.HLL_8;
+ } else {
+ throw new Error("unrecognized HLL type " + tgt_type);
+ }
+ var union = null;
+ try {
+ union = new Module.hll_union(lg_k ? lg_k : default_lg_k);
+ union.updateWithB64(sketchA)
+ union.updateWithB64(sketchB)
+ return union.getResultB64(type);
+ } finally {
+ if (union != null) union.delete();
+ }
+} catch (e) {
+ throw new Error(Module.getExceptionMessage(e));
+}
+""";
diff --git a/hll/hll_sketch_to_string.sqlx b/hll/sqxl/hll_sketch_to_string.sqlx
similarity index 100%
rename from hll/hll_sketch_to_string.sqlx
rename to hll/sqxl/hll_sketch_to_string.sqlx
diff --git a/hll/test/hll_sketch_test.sql b/hll/test/hll_sketch_test.sql
new file mode 100644
index 0000000..29e3533
--- /dev/null
+++ b/hll/test/hll_sketch_test.sql
@@ -0,0 +1,23 @@
+select t.hll_sketch_get_estimate(t.hll_sketch_agg_string(s, struct<int, string>(null, null))) from unnest(["a", "b", "c"]) as s;
+
+select t.hll_sketch_get_estimate(
+ t.hll_sketch_scalar_union(
+ (select t.hll_sketch_agg_string(str, struct<int, string>(10, "HLL_8")) from unnest(["a", "b", "c"]) as str),
+ (select t.hll_sketch_agg_string(str, struct<int, string>(10, "HLL_8")) from unnest(["c", "d", "e"]) as str),
+ 10,
+ "HLL_8"
+ )
+);
+
+create or replace table t.hll_sketch(sketch bytes);
+
+insert into t.hll_sketch
+(select t.hll_sketch_agg_string(cast(value as string), struct<int, string>(null, null)) from unnest(GENERATE_ARRAY(1, 10000, 1)) as value);
+insert into t.hll_sketch
+(select t.hll_sketch_agg_string(cast(value as string), struct<int, string>(null, null)) from unnest(GENERATE_ARRAY(100000, 110000, 1)) as value);
+
+select t.hll_sketch_to_string(
+ t.hll_sketch_agg_union(sketch, struct<int, string>(null, null))
+) from t.hll_sketch;
+
+drop table t.hll_sketch;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]