This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch tuple_sketch_int64
in repository https://gitbox.apache.org/repos/asf/datasketches-bigquery.git
The following commit(s) were added to refs/heads/tuple_sketch_int64 by this
push:
new 6ae5568 tuple sketch build from int64
6ae5568 is described below
commit 6ae55683fd995e3c9240eb0ea7810abe7796ca14
Author: AlexanderSaydakov <[email protected]>
AuthorDate: Fri Sep 13 17:45:16 2024 -0700
tuple sketch build from int64
---
tuple/sqlx/tuple_sketch_int64_agg_int64.sqlx | 36 ++++++
...ple_sketch_int64_agg_int64_lgk_seed_p_mode.sqlx | 138 +++++++++++++++++++++
tuple/test/tuple_sketch_int_test.sql | 26 ++--
tuple/tuple_sketch_int64.cpp | 3 +
4 files changed, 190 insertions(+), 13 deletions(-)
diff --git a/tuple/sqlx/tuple_sketch_int64_agg_int64.sqlx
b/tuple/sqlx/tuple_sketch_int64_agg_int64.sqlx
new file mode 100644
index 0000000..bef0e00
--- /dev/null
+++ b/tuple/sqlx/tuple_sketch_int64_agg_int64.sqlx
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+config { hasOutput: true }
+
+-- Convenience overload of tuple_sketch_int64_agg_int64_lgk_seed_p_mode:
+-- forwards key and value and passes NULL for lg_k, seed, p and mode, so the
+-- full-signature function applies its defaults for all four settings.
+CREATE OR REPLACE AGGREGATE FUNCTION ${self()}(key INT64, value INT64)
+RETURNS BYTES
+OPTIONS (
+  description = '''Creates a sketch that represents the cardinality of the given INT64 column
+with an additional INT64 value associated with each key.
+Multiple values for the same key are aggregated using the default mode; use
+tuple_sketch_int64_agg_int64_lgk_seed_p_mode to select SUM, MIN, MAX or ONE.
+Param key: the INT64 column of identifiers.
+Param value: the INT64 value associated with the key.
+Returns: a Compact Tuple Sketch, as bytes, which can be queried for results
+or used in other sketch operations.
+For more details: https://datasketches.apache.org/docs/Tuple/TupleOverview.html'''
+) AS (
+  $BQ_DATASET.tuple_sketch_int64_agg_int64_lgk_seed_p_mode(key, value, STRUCT<BYTEINT, INT64, FLOAT64, STRING>(NULL, NULL, NULL, NULL))
+);
diff --git a/tuple/sqlx/tuple_sketch_int64_agg_int64_lgk_seed_p_mode.sqlx
b/tuple/sqlx/tuple_sketch_int64_agg_int64_lgk_seed_p_mode.sqlx
new file mode 100644
index 0000000..af375d7
--- /dev/null
+++ b/tuple/sqlx/tuple_sketch_int64_agg_int64_lgk_seed_p_mode.sqlx
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+config { hasOutput: true }
+
+CREATE OR REPLACE AGGREGATE FUNCTION ${self()}(key INT64, value INT64, params
STRUCT<lg_k BYTEINT, seed INT64, p FLOAT64, mode STRING> NOT AGGREGATE)
+RETURNS BYTES
+LANGUAGE js
+OPTIONS (
+ library=["gs://$GCS_BUCKET/tuple_sketch_int64.mjs"],
+ description = '''Creates a sketch that represents the cardinality of the
given INT64 column
+with an additional INT64 value associated with each key.
+Multiple values for the same key are aggregated using one of the selectable
+operations: SUM, MIN, MAX, ONE.
+Param key: the INT64 column of identifiers.
+Param value: the INT64 value associated with the key.
+Param lg_k: the sketch accuracy/size parameter as an integer in the range [4,
26].
+Param seed: the seed to be used by the underlying hash function.
+Param p: sampling probability (initial theta). The default is 1, so the sketch
retains
+all entries until it reaches the limit, at which point it goes into the
estimation mode
+and reduces the effective sampling probability (theta) as necessary.
+Param mode: aggregate values using SUM, MIN, MAX, ONE (constant 1).
+Returns: a Compact Tuple Sketch, as bytes, which can be queried for results
+or used in other sketch operations.
+For more details:
https://datasketches.apache.org/docs/Tuple/TupleOverview.html'''
+) AS R"""
+import ModuleFactory from "gs://$GCS_BUCKET/tuple_sketch_int64.mjs";
+var Module = await ModuleFactory();
+const default_lg_k = Number(Module.DEFAULT_LG_K);
+const default_seed = BigInt(Module.DEFAULT_SEED);
+const default_p = 1.0;
+
+function destroyState(state) {
+ if (state.sketch) {
+ state.sketch.delete();
+ state.sketch = null;
+ }
+ if (state.union) {
+ state.union.delete();
+ state.union = null;
+ }
+ state.serialized = null;
+}
+
+// UDAF interface
+export function initialState(params) {
+ var state = {
+ lg_k: params.lg_k == null ? default_lg_k : Number(params.lg_k),
+ seed: params.seed == null ? default_seed : BigInt(params.seed),
+ p: params.p == null ? default_p : params.p,
+ mode: params.mode == null ? "" : params.mode
+ };
+ state.sketch = new Module.update_tuple_sketch_int64(state.lg_k, state.seed,
state.p, state.mode);
+ return state;
+}
+
+export function aggregate(state, key, value) {
+ if (state.sketch == null) {
+ state.sketch = new Module.update_tuple_sketch_int64(state.lg_k,
state.seed, state.p, state.mode);
+ }
+ state.sketch.updateInt64(key, value);
+}
+
+export function serialize(state) {
+ if (state.sketch == null) return state; // for transition
deserialize-serialize
+ try {
+ // for prior transition deserialize-aggregate
+ // merge aggregated and serialized state
+ if (state.sketch != null && state.serialized != null) {
+ var u = null;
+ try {
+ u = new Module.tuple_union_int64(state.lg_k, state.seed, state.mode);
+ u.updateWithUpdateSketch(state.sketch);
+ u.updateWithBytes(state.serialized, state.seed);
+ state.serialized = u.getResultAsUint8Array();
+ } finally {
+ if (u != null) u.delete();
+ }
+ } else if (state.sketch != null) {
+ state.serialized = state.sketch.serializeAsUint8Array();
+ } else if (state.union != null) {
+ state.serialized = state.union.getResultAsUint8Array();
+ }
+ return {
+ lg_k: state.lg_k,
+ seed: state.seed,
+ mode: state.mode,
+ serialized: state.serialized
+ };
+ } catch (e) {
+ throw new Error(Module.getExceptionMessage(e));
+ } finally {
+ destroyState(state);
+ }
+}
+
+export function deserialize(state) {
+ return state;
+}
+
+export function merge(state, other_state) {
+ try {
+ if (!state.union) {
+ state.union = new Module.tuple_union_int64(state.lg_k, state.seed,
state.mode);
+ }
+ if (state.serialized) {
+ state.union.updateWithBytes(state.serialized, state.seed);
+ state.serialized = null;
+ }
+ if (other_state.serialized) {
+ state.union.updateWithBytes(other_state.serialized, state.seed);
+ other_state.serialized = null;
+ }
+ } catch (e) {
+ throw new Error(Module.getExceptionMessage(e));
+ }
+}
+
+export function finalize(state) {
+ return serialize(state).serialized
+}
+""";
diff --git a/tuple/test/tuple_sketch_int_test.sql
b/tuple/test/tuple_sketch_int_test.sql
index f30836b..a1ffd87 100644
--- a/tuple/test/tuple_sketch_int_test.sql
+++ b/tuple/test/tuple_sketch_int_test.sql
@@ -38,24 +38,24 @@ select $BQ_DATASET.tuple_sketch_int64_to_string(
drop table $BQ_DATASET.tuple_sketch;
# expected 5
-select $BQ_DATASET.tuple_sketch_int64_get_estimate_seed(
- $BQ_DATASET.tuple_sketch_int64_union_lgk_seed_mode(
- (select $BQ_DATASET.tuple_sketch_int64_agg_string(str, 1) from
unnest(["a", "b", "c"]) as str),
- (select $BQ_DATASET.tuple_sketch_int64_agg_string(str, 1) from
unnest(["c", "d", "e"]) as str),
- 10,
- null,
- "MIN"
- ),
- null
+select $BQ_DATASET.tuple_sketch_int64_get_estimate(
+ $BQ_DATASET.tuple_sketch_int64_union(
+ (select $BQ_DATASET.tuple_sketch_int64_agg_int64(key, 1) from unnest([1,
2, 3]) as key),
+ (select $BQ_DATASET.tuple_sketch_int64_agg_int64(key, 1) from unnest([3,
4, 5]) as key)
+ )
);
+# full signatures
# expected 5
select $BQ_DATASET.tuple_sketch_int64_get_estimate_seed(
- $BQ_DATASET.tuple_sketch_int64_union(
- (select $BQ_DATASET.tuple_sketch_int64_agg_string(str, 1) from
unnest(["a", "b", "c"]) as str),
- (select $BQ_DATASET.tuple_sketch_int64_agg_string(str, 1) from
unnest(["c", "d", "e"]) as str)
+ $BQ_DATASET.tuple_sketch_int64_union_lgk_seed_mode(
+ (select $BQ_DATASET.tuple_sketch_int64_agg_int64_lgk_seed_p_mode(key, 1,
STRUCT<BYTEINT, INT64, FLOAT64, STRING>(10, 111, 0.999, "MIN")) from unnest([1,
2, 3]) as key),
+ (select $BQ_DATASET.tuple_sketch_int64_agg_int64_lgk_seed_p_mode(key, 1,
STRUCT<BYTEINT, INT64, FLOAT64, STRING>(10, 111, 0.999, "MIN")) from unnest([3,
4, 5]) as key),
+ 10,
+ 111,
+ "MIN"
),
- null
+ 111
);
# expected 1
diff --git a/tuple/tuple_sketch_int64.cpp b/tuple/tuple_sketch_int64.cpp
index 0c59ade..169e121 100644
--- a/tuple/tuple_sketch_int64.cpp
+++ b/tuple/tuple_sketch_int64.cpp
@@ -100,6 +100,9 @@ EMSCRIPTEN_BINDINGS(tuple_sketch_int64) {
.function("updateString",
emscripten::optional_override([](update_tuple_sketch_int64& self, const
std::string& key, Update value) {
self.update(key, value);
}))
+ .function("updateInt64",
emscripten::optional_override([](update_tuple_sketch_int64& self, uint64_t key,
Update value) {
+ self.update(key, value);
+ }))
.function("serializeAsUint8Array", emscripten::optional_override([](const
update_tuple_sketch_int64& self) {
auto bytes = self.compact().serialize();
return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(),
bytes.data()));
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]