This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch tuple_sketch_int64
in repository https://gitbox.apache.org/repos/asf/datasketches-bigquery.git


The following commit(s) were added to refs/heads/tuple_sketch_int64 by this 
push:
     new 6ae5568  tuple sketch build from int64
6ae5568 is described below

commit 6ae55683fd995e3c9240eb0ea7810abe7796ca14
Author: AlexanderSaydakov <[email protected]>
AuthorDate: Fri Sep 13 17:45:16 2024 -0700

    tuple sketch build from int64
---
 tuple/sqlx/tuple_sketch_int64_agg_int64.sqlx       |  36 ++++++
 ...ple_sketch_int64_agg_int64_lgk_seed_p_mode.sqlx | 138 +++++++++++++++++++++
 tuple/test/tuple_sketch_int_test.sql               |  26 ++--
 tuple/tuple_sketch_int64.cpp                       |   3 +
 4 files changed, 190 insertions(+), 13 deletions(-)

diff --git a/tuple/sqlx/tuple_sketch_int64_agg_int64.sqlx 
b/tuple/sqlx/tuple_sketch_int64_agg_int64.sqlx
new file mode 100644
index 0000000..bef0e00
--- /dev/null
+++ b/tuple/sqlx/tuple_sketch_int64_agg_int64.sqlx
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+config { hasOutput: true }
+
+CREATE OR REPLACE AGGREGATE FUNCTION ${self()}(key INT64, value INT64)
+RETURNS BYTES
+OPTIONS (
+  description = '''Creates a sketch that represents the cardinality of the given INT64 column
+with an additional INT64 value associated with each key.
+Multiple values for the same key are aggregated using the default mode.
+This overload uses the default lg_k, seed and sampling probability. To set them,
+or to select the aggregation mode (SUM, MIN, MAX, ONE), use
+tuple_sketch_int64_agg_int64_lgk_seed_p_mode.
+Param key: the INT64 column of identifiers.
+Param value: the INT64 value associated with the key.
+Returns: a Compact Tuple Sketch, as bytes, which can be queried for results
+or used in other sketch operations.
+For more details: https://datasketches.apache.org/docs/Tuple/TupleOverview.html'''
+) AS (
+  -- Convenience overload: an all-NULL params struct makes the full-signature function fall back to its defaults.
+  $BQ_DATASET.tuple_sketch_int64_agg_int64_lgk_seed_p_mode(key, value, STRUCT<BYTEINT, INT64, FLOAT64, STRING>(NULL, NULL, NULL, NULL))
+);
diff --git a/tuple/sqlx/tuple_sketch_int64_agg_int64_lgk_seed_p_mode.sqlx 
b/tuple/sqlx/tuple_sketch_int64_agg_int64_lgk_seed_p_mode.sqlx
new file mode 100644
index 0000000..af375d7
--- /dev/null
+++ b/tuple/sqlx/tuple_sketch_int64_agg_int64_lgk_seed_p_mode.sqlx
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+config { hasOutput: true }
+
+CREATE OR REPLACE AGGREGATE FUNCTION ${self()}(key INT64, value INT64, params 
STRUCT<lg_k BYTEINT, seed INT64, p FLOAT64, mode STRING> NOT AGGREGATE)
+RETURNS BYTES 
+LANGUAGE js
+OPTIONS (
+  library=["gs://$GCS_BUCKET/tuple_sketch_int64.mjs"],
+  description = '''Creates a sketch that represents the cardinality of the 
given INT64 column
+with an additional INT64 value associated with each key.
+Multiple values for the same key are aggregated using one of the selectable
+operations: SUM, MIN, MAX, ONE. 
+Param key: the INT64 column of identifiers.
+Param value: the INT64 value associated with the key.
+Param lg_k: the sketch accuracy/size parameter as an integer in the range [4, 
26].
+Param seed: the seed to be used by the underlying hash function.
+Param p: sampling probability (initial theta). The default is 1, so the sketch 
retains
+all entries until it reaches the limit, at which point it goes into the 
estimation mode
+and reduces the effective sampling probability (theta) as necessary.
+Param mode: aggregate values using SUM, MIN, MAX, ONE (constant 1).
+Returns: a Compact Tuple Sketch, as bytes, which can be queried for results
+or used in other sketch operations.
+For more details: 
https://datasketches.apache.org/docs/Tuple/TupleOverview.html'''
+) AS R"""
+import ModuleFactory from "gs://$GCS_BUCKET/tuple_sketch_int64.mjs";
+var Module = await ModuleFactory(); // instantiated once at load; every UDAF callback below uses this instance
+const default_lg_k = Number(Module.DEFAULT_LG_K); // used by initialState() when params.lg_k is NULL
+const default_seed = BigInt(Module.DEFAULT_SEED); // used by initialState() when params.seed is NULL
+const default_p = 1.0; // used by initialState() when params.p is NULL
+
+function destroyState(state) {
+  if (state.sketch) {
+    state.sketch.delete();
+    state.sketch = null;
+  }
+  if (state.union) {
+    state.union.delete();
+    state.union = null;
+  }
+  state.serialized = null;
+}
+
+// UDAF interface
+export function initialState(params) {
+  var state = {
+    lg_k: params.lg_k == null ? default_lg_k : Number(params.lg_k),
+    seed: params.seed == null ? default_seed : BigInt(params.seed),
+    p: params.p == null ? default_p : params.p,
+    mode: params.mode == null ? "" : params.mode
+  };
+  state.sketch = new Module.update_tuple_sketch_int64(state.lg_k, state.seed, 
state.p, state.mode);
+  return state;
+}
+
+export function aggregate(state, key, value) {
+  if (state.sketch == null) {
+    state.sketch = new Module.update_tuple_sketch_int64(state.lg_k, 
state.seed, state.p, state.mode);
+  }
+  state.sketch.updateInt64(key, value);
+}
+
+export function serialize(state) {
+  if (state.sketch == null) return state; // for transition 
deserialize-serialize
+  try {
+    // for prior transition deserialize-aggregate
+    // merge aggregated and serialized state
+    if (state.sketch != null && state.serialized != null) {
+      var u = null;
+      try {
+        u = new Module.tuple_union_int64(state.lg_k, state.seed, state.mode);
+        u.updateWithUpdateSketch(state.sketch);
+        u.updateWithBytes(state.serialized, state.seed);
+        state.serialized = u.getResultAsUint8Array();
+      } finally {
+        if (u != null) u.delete();
+      }
+    } else if (state.sketch != null) {
+      state.serialized = state.sketch.serializeAsUint8Array();
+    } else if (state.union != null) {
+      state.serialized = state.union.getResultAsUint8Array();
+    }
+    return {
+      lg_k: state.lg_k,
+      seed: state.seed,
+      mode: state.mode,
+      serialized: state.serialized
+    };
+  } catch (e) {
+    throw new Error(Module.getExceptionMessage(e));
+  } finally {
+    destroyState(state);
+  }
+}
+
+export function deserialize(state) {
+  return state; // pass-through: nothing is rebuilt here; aggregate() and merge() create the sketch/union on demand
+}
+
+export function merge(state, other_state) {
+  try {
+    if (!state.union) {
+      state.union = new Module.tuple_union_int64(state.lg_k, state.seed, 
state.mode);
+    }
+    if (state.serialized) {
+      state.union.updateWithBytes(state.serialized, state.seed);
+      state.serialized = null;
+    }
+    if (other_state.serialized) {
+      state.union.updateWithBytes(other_state.serialized, state.seed);
+      other_state.serialized = null;
+    }
+  } catch (e) {
+    throw new Error(Module.getExceptionMessage(e));
+  }
+}
+
+export function finalize(state) {
+  return serialize(state).serialized
+}
+""";
diff --git a/tuple/test/tuple_sketch_int_test.sql 
b/tuple/test/tuple_sketch_int_test.sql
index f30836b..a1ffd87 100644
--- a/tuple/test/tuple_sketch_int_test.sql
+++ b/tuple/test/tuple_sketch_int_test.sql
@@ -38,24 +38,24 @@ select $BQ_DATASET.tuple_sketch_int64_to_string(
 drop table $BQ_DATASET.tuple_sketch;
 
 # expected 5
-select $BQ_DATASET.tuple_sketch_int64_get_estimate_seed(
-  $BQ_DATASET.tuple_sketch_int64_union_lgk_seed_mode(
-    (select $BQ_DATASET.tuple_sketch_int64_agg_string(str, 1) from 
unnest(["a", "b", "c"]) as str),
-    (select $BQ_DATASET.tuple_sketch_int64_agg_string(str, 1) from 
unnest(["c", "d", "e"]) as str),
-    10,
-    null,
-    "MIN"
-  ),
-  null
+select $BQ_DATASET.tuple_sketch_int64_get_estimate(
+  $BQ_DATASET.tuple_sketch_int64_union(
+    (select $BQ_DATASET.tuple_sketch_int64_agg_int64(key, 1) from unnest([1, 
2, 3]) as key),
+    (select $BQ_DATASET.tuple_sketch_int64_agg_int64(key, 1) from unnest([3, 
4, 5]) as key)
+  )
 );
 
+# full signatures
 # expected 5
 select $BQ_DATASET.tuple_sketch_int64_get_estimate_seed(
-  $BQ_DATASET.tuple_sketch_int64_union(
-    (select $BQ_DATASET.tuple_sketch_int64_agg_string(str, 1) from 
unnest(["a", "b", "c"]) as str),
-    (select $BQ_DATASET.tuple_sketch_int64_agg_string(str, 1) from 
unnest(["c", "d", "e"]) as str)
+  $BQ_DATASET.tuple_sketch_int64_union_lgk_seed_mode(
+    (select $BQ_DATASET.tuple_sketch_int64_agg_int64_lgk_seed_p_mode(key, 1, 
STRUCT<BYTEINT, INT64, FLOAT64, STRING>(10, 111, 0.999, "MIN")) from unnest([1, 
2, 3]) as key),
+    (select $BQ_DATASET.tuple_sketch_int64_agg_int64_lgk_seed_p_mode(key, 1, 
STRUCT<BYTEINT, INT64, FLOAT64, STRING>(10, 111, 0.999, "MIN")) from unnest([3, 
4, 5]) as key),
+    10,
+    111,
+    "MIN"
   ),
-  null
+  111
 );
 
 # expected 1
diff --git a/tuple/tuple_sketch_int64.cpp b/tuple/tuple_sketch_int64.cpp
index 0c59ade..169e121 100644
--- a/tuple/tuple_sketch_int64.cpp
+++ b/tuple/tuple_sketch_int64.cpp
@@ -100,6 +100,9 @@ EMSCRIPTEN_BINDINGS(tuple_sketch_int64) {
     .function("updateString", 
emscripten::optional_override([](update_tuple_sketch_int64& self, const 
std::string& key, Update value) {
       self.update(key, value);
     }))
+    .function("updateInt64", 
emscripten::optional_override([](update_tuple_sketch_int64& self, uint64_t key, 
Update value) {
+      self.update(key, value);
+    }))
     .function("serializeAsUint8Array", emscripten::optional_override([](const 
update_tuple_sketch_int64& self) {
       auto bytes = self.compact().serialize();
       return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), 
bytes.data()));


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to