This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch hll
in repository https://gitbox.apache.org/repos/asf/datasketches-bigquery.git


The following commit(s) were added to refs/heads/hll by this push:
     new 3aa5e38  HLL sketch functions and tests
3aa5e38 is described below

commit 3aa5e387c6eb74d661be7cc54cb27bd5cb9c26b7
Author: AlexanderSaydakov <[email protected]>
AuthorDate: Wed Aug 21 14:55:52 2024 -0700

    HLL sketch functions and tests
---
 hll/README.md                                      |  35 +++++++
 hll/{ => sqxl}/hll_sketch_agg_string.sqlx          |   2 +-
 .../hll_sketch_agg_union.sqlx}                     | 106 +++++++++------------
 hll/{ => sqxl}/hll_sketch_get_estimate.sqlx        |   0
 hll/sqxl/hll_sketch_scalar_union.sqlx              |  62 ++++++++++++
 hll/{ => sqxl}/hll_sketch_to_string.sqlx           |   0
 hll/test/hll_sketch_test.sql                       |  23 +++++
 7 files changed, 164 insertions(+), 64 deletions(-)

diff --git a/hll/README.md b/hll/README.md
new file mode 100644
index 0000000..daf02ba
--- /dev/null
+++ b/hll/README.md
@@ -0,0 +1,35 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
+
+# Apache DataSketches functions for Google Cloud BigQuery 
+
+Functions to support HLL sketch and union operations.
+
+## Example
+
+       select t.hll_sketch_get_estimate(
+         t.hll_sketch_scalar_union(
+           (select t.hll_sketch_agg_string(str, struct<int, string>(10, "HLL_8")) from unnest(["a", "b", "c"]) as str),
+           (select t.hll_sketch_agg_string(str, struct<int, string>(10, "HLL_8")) from unnest(["c", "d", "e"]) as str),
+           10,
+           "HLL_8"
+         )
+       );
+       
+       result: 5
diff --git a/hll/hll_sketch_agg_string.sqlx b/hll/sqxl/hll_sketch_agg_string.sqlx
similarity index 99%
copy from hll/hll_sketch_agg_string.sqlx
copy to hll/sqxl/hll_sketch_agg_string.sqlx
index c08d5cf..4855324 100644
--- a/hll/hll_sketch_agg_string.sqlx
+++ b/hll/sqxl/hll_sketch_agg_string.sqlx
@@ -27,7 +27,7 @@ OPTIONS (
   description = '''Creates a sketch that represents the cardinality of the given STRING column.
 Param str: the STRING column of identifiers.
 Param lg_k: the sketch accuracy/size parameter as an integer in the range [4, 26].
-Param tgt_type: The HLL type to use, if or when the sketch reaches that state
+Param tgt_type: The HLL type to use, if or when the sketch reaches that state.
 Returns: an HLL Sketch, as bytes, from which the cardinality can be obtained.
 For more details: https://datasketches.apache.org/docs/HLL/HLL.html'''
 ) AS R"""
diff --git a/hll/hll_sketch_agg_string.sqlx b/hll/sqxl/hll_sketch_agg_union.sqlx
similarity index 60%
rename from hll/hll_sketch_agg_string.sqlx
rename to hll/sqxl/hll_sketch_agg_union.sqlx
index c08d5cf..2f089ba 100644
--- a/hll/hll_sketch_agg_string.sqlx
+++ b/hll/sqxl/hll_sketch_agg_union.sqlx
@@ -19,16 +19,16 @@
 
 config { hasOutput: true }
 
-CREATE OR REPLACE AGGREGATE FUNCTION ${self()}(str STRING, params STRUCT<lg_k BYTEINT, tgt_type STRING> NOT AGGREGATE)
+CREATE OR REPLACE AGGREGATE FUNCTION ${self()}(sketch BYTES, params STRUCT<lg_k BYTEINT, tgt_type STRING> NOT AGGREGATE)
 RETURNS BYTES
 LANGUAGE js
 OPTIONS (
   library=["gs://$GCS_BUCKET/hll_sketch.mjs"],
-  description = '''Creates a sketch that represents the cardinality of the given STRING column.
-Param str: the STRING column of identifiers.
+  description = '''Creates a sketch that represents the union of the given column of sketches.
+Param sketch: the column of sketches. Each as bytes.
 Param lg_k: the sketch accuracy/size parameter as an integer in the range [4, 26].
-Param tgt_type: The HLL type to use, if or when the sketch reaches that state
-Returns: an HLL Sketch, as bytes, from which the cardinality can be obtained.
+Param tgt_type: The HLL type to use, if or when the sketch reaches that state.
+Returns: an HLL Sketch, as bytes, from which the union cardinality can be obtained.
 For more details: https://datasketches.apache.org/docs/HLL/HLL.html'''
 ) AS R"""
 import ModuleFactory from "gs://$GCS_BUCKET/hll_sketch.mjs";
@@ -36,19 +36,22 @@ var Module = await ModuleFactory();
 const default_lg_k = Number(12);
 const default_tgt_type = Module.TargetHllType.HLL_4;
 
-function destroyState(state) {
-  if (state.sketch) {
-    state.sketch.delete();
-    state.sketch = null;
-  }
-  if (state.union) {
-    state.union.delete();
-    state.union = null;
+// ensures we have a union
+// if there is a serialized sketch, add it to the union and destroy it
+function ensureUnion(state) {
+  try {
+    if (state.union == null) {
+      state.union = new Module.hll_union(state.lg_k);
+    }
+    if (state.serialized != null) {
+      state.union.updateWithBytes(state.serialized);
+      state.serialized = null;
+    }
+  } catch (e) {
+    throw new Error(Module.getExceptionMessage(e));
   }
-  state.serialized = null;
 }
 
-// UDAF interface
 export function initialState(params) {
   try {
     var state = {
@@ -65,55 +68,38 @@ export function initialState(params) {
     } else {
       throw new Error("unrecognized HLL type " + params.tgt_type);
     }
-    state.sketch = new Module.hll_sketch(state.lg_k, state.tgt_type);
+    state.union = new Module.hll_union(state.lg_k);
     return state;
   } catch (e) {
     throw new Error(Module.getExceptionMessage(e));
   }
 }
 
-export function aggregate(state, str) {
-  try {
-    if (state.sketch == null) { // for transition deserialize-aggregate
-      state.sketch = new Module.hll_sketch(state.lg_k, state.tgt_type);
+export function aggregate(state, sketch) {
+  if (sketch != null) {
+    ensureUnion(state);
+    try {
+      state.union.updateWithBytes(sketch);
+    } catch (e) {
+      throw new Error(Module.getExceptionMessage(e));
     }
-    state.sketch.updateString(str);
-  } catch (e) {
-    throw new Error(Module.getExceptionMessage(e));
   }
 }
 
 export function serialize(state) {
-  if (state.sketch == null) return state; // for transition deserialize-serialize
+  if (state.union == null) return state; // for transition deserialize-serialize
   try {
-    try {
-      // for prior transition deserialize-aggregate
-      // merge aggregated and serialized state
-      if (state.sketch != null && state.serialized != null) {
-        var u = null;
-        try {
-          u = new Module.hll_union(state.lg_k);
-          u.updateWithSketch(state.sketch);
-          u.updateWithBytes(state.serialized);
-          state.serialized = u.getResultAsUint8Array(state.tgt_type);
-        } finally {
-          if (u != null) u.delete();
-        }
-      } else if (state.sketch != null) {
-        state.serialized = state.sketch.serializeAsUint8Array();
-      } else if (state.union != null) {
-        state.serialized = state.union.getResultAsUint8Array(state.tgt_type);
-      }
-      return {
-        lg_k: state.lg_k,
-        tgt_type: state.tgt_type,
-        serialized: state.serialized
-      };
-    } catch (e) {
-      throw new Error(Module.getExceptionMessage(e));
-    }
+    ensureUnion(state);
+    return {
+      lg_k: state.lg_k,
+      tgt_type: state.tgt_type,
+      serialized: state.union.getResultAsUint8Array(state.tgt_type)
+    };
+  } catch (e) {
+    throw new Error(Module.getExceptionMessage(e));
   } finally {
-    destroyState(state);
+    state.union.delete();
+    state.union = null;
   }
 }
 
@@ -122,24 +108,18 @@ export function deserialize(state) {
 }
 
 export function merge(state, other_state) {
-  try {
-    if (state.union == null) {
-      state.union = new Module.hll_union(state.lg_k);
-    }
-    if (state.serialized) {
-      state.union.updateWithBytes(state.serialized);
-      state.serialized = null;
-    }
-    if (other_state.serialized) {
+  ensureUnion(state);
+  if (other_state.serialized != null) {
+    try {
       state.union.updateWithBytes(other_state.serialized);
       other_state.serialized = null;
+    } catch (e) {
+      throw new Error(Module.getExceptionMessage(e));
     }
-  } catch (e) {
-    throw new Error(Module.getExceptionMessage(e));
   }
 }
 
 export function finalize(state) {
-  return serialize(state).serialized
+  return serialize(state).serialized;
 }
 """;
diff --git a/hll/hll_sketch_get_estimate.sqlx b/hll/sqxl/hll_sketch_get_estimate.sqlx
similarity index 100%
rename from hll/hll_sketch_get_estimate.sqlx
rename to hll/sqxl/hll_sketch_get_estimate.sqlx
diff --git a/hll/sqxl/hll_sketch_scalar_union.sqlx b/hll/sqxl/hll_sketch_scalar_union.sqlx
new file mode 100644
index 0000000..446b343
--- /dev/null
+++ b/hll/sqxl/hll_sketch_scalar_union.sqlx
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+config { hasOutput: true }
+
+CREATE OR REPLACE FUNCTION ${self()}(sketchA BYTES, sketchB BYTES, lg_k BYTEINT, tgt_type STRING)
+RETURNS BYTES
+LANGUAGE js
+OPTIONS (
+  library=["gs://$GCS_BUCKET/hll_sketch.js"],
+  description = '''Computes a sketch that represents the union of the two given sketches.
+Param sketchA: the first sketch as bytes.
+Param sketchB: the second sketch as bytes.
+Param lg_k: the sketch accuracy/size parameter as an integer in the range [4, 26].
+Param tgt_type: The HLL type to use, if or when the sketch reaches that state.
+Returns an HLL Sketch, as bytes, from which the union cardinality can be obtained.
+For more details: https://datasketches.apache.org/docs/HLL/HLL.html'''
+) AS R"""
+const default_lg_k = 12;
+const default_tgt_type = Module.TargetHllType.HLL_4;
+try {
+  var type;
+  if (tgt_type == null) {
+    type = default_tgt_type;
+  } else if (tgt_type == "HLL_4") {
+    type = Module.TargetHllType.HLL_4;
+  } else if (tgt_type == "HLL_6") {
+    type = Module.TargetHllType.HLL_6;
+  } else if (tgt_type == "HLL_8") {
+    type = Module.TargetHllType.HLL_8;
+  } else {
+    throw new Error("unrecognized HLL type " + tgt_type);
+  }
+  var union = null;
+  try {
+    union = new Module.hll_union(lg_k ? lg_k : default_lg_k);
+    union.updateWithB64(sketchA)
+    union.updateWithB64(sketchB)
+    return union.getResultB64(type);
+  } finally {
+    if (union != null) union.delete();
+  }
+} catch (e) {
+  throw new Error(Module.getExceptionMessage(e));
+}
+""";
diff --git a/hll/hll_sketch_to_string.sqlx b/hll/sqxl/hll_sketch_to_string.sqlx
similarity index 100%
rename from hll/hll_sketch_to_string.sqlx
rename to hll/sqxl/hll_sketch_to_string.sqlx
diff --git a/hll/test/hll_sketch_test.sql b/hll/test/hll_sketch_test.sql
new file mode 100644
index 0000000..29e3533
--- /dev/null
+++ b/hll/test/hll_sketch_test.sql
@@ -0,0 +1,23 @@
+select t.hll_sketch_get_estimate(t.hll_sketch_agg_string(s, struct<int, string>(null, null))) from unnest(["a", "b", "c"]) as s;
+
+select t.hll_sketch_get_estimate(
+  t.hll_sketch_scalar_union(
+    (select t.hll_sketch_agg_string(str, struct<int, string>(10, "HLL_8")) from unnest(["a", "b", "c"]) as str),
+    (select t.hll_sketch_agg_string(str, struct<int, string>(10, "HLL_8")) from unnest(["c", "d", "e"]) as str),
+    10,
+    "HLL_8"
+  )
+);
+
+create or replace table t.hll_sketch(sketch bytes);
+
+insert into t.hll_sketch
+(select t.hll_sketch_agg_string(cast(value as string), struct<int, string>(null, null)) from unnest(GENERATE_ARRAY(1, 10000, 1)) as value);
+insert into t.hll_sketch
+(select t.hll_sketch_agg_string(cast(value as string), struct<int, string>(null, null)) from unnest(GENERATE_ARRAY(100000, 110000, 1)) as value);
+
+select t.hll_sketch_to_string(
+  t.hll_sketch_agg_union(sketch, struct<int, string>(null, null))
+) from t.hll_sketch;
+
+drop table t.hll_sketch;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to