This is an automated email from the ASF dual-hosted git repository. alsay pushed a commit to branch kll in repository https://gitbox.apache.org/repos/asf/datasketches-bigquery.git
commit f7ea68336c035ee953765e8a0f196ec777d86afc Author: AlexanderSaydakov <[email protected]> AuthorDate: Tue Aug 13 15:23:24 2024 -0700 KLL sketch --- Makefile | 5 +- kll_sketch.cpp | 65 +++++++++++++++++++ kll_sketch_float_build.sqlx | 101 +++++++++++++++++++++++++++++ kll_sketch_float_get_quantile.sqlx | 41 ++++++++++++ kll_sketch_float_get_rank.sqlx | 41 ++++++++++++ kll_sketch_float_merge.sqlx | 101 +++++++++++++++++++++++++++++ Makefile => kll_sketch_float_to_tring.sqlx | 49 ++++++-------- 7 files changed, 374 insertions(+), 29 deletions(-) diff --git a/Makefile b/Makefile index bd6f65f..991417c 100644 --- a/Makefile +++ b/Makefile @@ -19,6 +19,7 @@ EMCC=emcc EMCFLAGS=-Idatasketches-cpp/common/include \ -Idatasketches-cpp/theta/include \ -Idatasketches-cpp/cpc/include \ + -Idatasketches-cpp/kll/include \ --no-entry \ -sWASM_BIGINT=1 \ -sEXPORTED_FUNCTIONS=[_malloc,_free] \ @@ -27,7 +28,9 @@ EMCFLAGS=-Idatasketches-cpp/common/include \ -O3 \ --bind -all: theta_sketch.mjs theta_sketch.js theta_sketch.wasm cpc_sketch.mjs cpc_sketch.js cpc_sketch.wasm +all: theta_sketch.mjs theta_sketch.js theta_sketch.wasm \ + cpc_sketch.mjs cpc_sketch.js cpc_sketch.wasm \ + kll_sketch.mjs kll_sketch.js kll_sketch.wasm %.mjs: %.cpp $(EMCC) $< $(EMCFLAGS) -sSINGLE_FILE=1 -o $@ diff --git a/kll_sketch.cpp b/kll_sketch.cpp new file mode 100644 index 0000000..6bb74e0 --- /dev/null +++ b/kll_sketch.cpp @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <strstream>
+#include <emscripten/bind.h>
+
+#include <kll_sketch.hpp>
+
+#include "base64.hpp"
+
+using kll_sketch_float = datasketches::kll_sketch<float>;
+
+const emscripten::val Uint8Array = emscripten::val::global("Uint8Array"); // JS global Uint8Array constructor, used by serializeAsUint8Array below
+
+EMSCRIPTEN_BINDINGS(kll_sketch_float) { // embind registrations: JS sees Module.getExceptionMessage and Module.kll_sketch_float
+  emscripten::function("getExceptionMessage", emscripten::optional_override([](intptr_t ptr) { // ptr is treated as the address of a std::exception
+    return std::string(reinterpret_cast<std::exception*>(ptr)->what()); // copy the what() text into a string for JS
+  }));
+
+  emscripten::class_<kll_sketch_float>("kll_sketch_float") // JS-visible class name
+    .constructor(emscripten::optional_override([](uint16_t k) { // new Module.kll_sketch_float(k)
+      return new kll_sketch_float(k); // heap-allocated; the sqlx callers release it with delete()
+    }))
+    .function("isEmpty", &kll_sketch_float::is_empty)
+    .function("update", emscripten::optional_override([](kll_sketch_float& self, float value) { // add one float value to the sketch
+      self.update(value);
+    }))
+    .function("mergeBytes", emscripten::optional_override([](kll_sketch_float& self, const std::string& bytes) { // bytes: a serialized sketch
+      self.merge(kll_sketch_float::deserialize(bytes.data(), bytes.size())); // deserialize a temporary and merge it into self
+    }), emscripten::allow_raw_pointers())
+    .function("mergeBuffer", emscripten::optional_override([](kll_sketch_float& self, intptr_t bytes, size_t size) { // like mergeBytes, but reads size bytes at the raw address bytes
+      self.merge(kll_sketch_float::deserialize(reinterpret_cast<void*>(bytes), size));
+    }))
+    .function("serializeAsUint8Array", emscripten::optional_override([](const kll_sketch_float& self) { // returns the serialized sketch as a JS Uint8Array
+      auto bytes = self.serialize();
+      return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), bytes.data())); // NOTE(review): presumably copies the view, since bytes is destroyed on return - confirm
+    }))
+    .class_function("deserializeFromB64", emscripten::optional_override([](const std::string& b64) { // static factory: base64 text -> new sketch
+      std::vector<char> bytes(b64_dec_len(b64.data(), b64.size())); // decode buffer sized by b64_dec_len
+      b64_decode(b64.data(), b64.size(), bytes.data());
+      return new kll_sketch_float(kll_sketch_float::deserialize(bytes.data(), bytes.size())); // heap-allocated; the sqlx callers release it with delete()
+    }), emscripten::allow_raw_pointers())
+    .function("getRank", &kll_sketch_float::get_rank) // direct binding of get_rank
+    .function("getQuantile", &kll_sketch_float::get_quantile) // direct binding of get_quantile
+    .function("toString", emscripten::optional_override([](const kll_sketch_float& self) { // text returned by to_string()
+      return std::string(self.to_string());
+    }))
+    ;
+}
diff --git a/kll_sketch_float_build.sqlx b/kll_sketch_float_build.sqlx
new file mode 100644
index 0000000..45acde9
--- /dev/null
+++ b/kll_sketch_float_build.sqlx
@@ -0,0 +1,101 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+CREATE OR REPLACE AGGREGATE FUNCTION `$BQ_PROJECT.$BQ_DATASET`.kll_sketch_float_build(value FLOAT64, k INT NOT AGGREGATE)
+RETURNS BYTES
+LANGUAGE js
+OPTIONS (
+  library=["gs://$GCS_BUCKET/kll_sketch.mjs"],
+  description = '''Creates a sketch that represents the distribution of the given column.
+Param value: the column of values.
+Param k: the sketch accuracy/size parameter as an integer in the range [8, 65535].
+Returns a KLL Sketch, as bytes.
+For more details: https://datasketches.apache.org/docs/KLL/KLLSketch.html''' +) AS R""" +import ModuleFactory from "gs://$GCS_BUCKET/kll_sketch.mjs"; +var Module = await ModuleFactory(); +const default_k = Number(200); + +// UDAF interface +export function initialState(k) { + try { + var state = { + k: k == null ? default_k : Number(k), + }; + state.sketch = new Module.kll_sketch_float(state.k); + return state; + } catch (e) { + throw new Error(Module.getExceptionMessage(e)); + } +} + +export function aggregate(state, value) { + try { + if (state.sketch == null) { // for transition deserialize-aggregate + state.sketch = new Module.kll_sketch_float(state.k); + } + state.sketch.update(value); + } catch (e) { + throw new Error(Module.getExceptionMessage(e)); + } +} + +export function serialize(state) { + if (state.sketch == null) return state; // for transition deserialize-serialize + try { + // for prior transition deserialize-aggregate + // merge aggregated and serialized state + if (state.sketch != null && state.serialized != null) { + sketch.mergeBytes(state.serialized); + } + return { + k: state.k, + serialized: state.sketch.serializeAsUint8Array() + }; + } catch (e) { + throw new Error(Module.getExceptionMessage(e)); + } finally { + state.sketch.delete(); + } +} + +export function deserialize(serialized) { + return serialized; +} + +export function merge(state, other_state) { + try { + if (state.sketch == null) { + state.sketch = new Module.kll_sketch_float(state.k); + } + if (state.serialized != null) { + state.sketch.mergeBytes(state.serialized); + state.serialized = null; + } + if (other_state.serialized != null) { + state.sketch.mergeBytes(other_state.serialized); + other_state.serialized = null; + } + } catch (e) { + throw new Error(Module.getExceptionMessage(e)); + } +} + +export function finalize(state) { + return serialize(state).serialized; +} +"""; diff --git a/kll_sketch_float_get_quantile.sqlx b/kll_sketch_float_get_quantile.sqlx new file mode 
100644 index 0000000..04c0a2c --- /dev/null +++ b/kll_sketch_float_get_quantile.sqlx @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +CREATE OR REPLACE FUNCTION `$BQ_PROJECT.$BQ_DATASET`.kll_sketch_float_get_quantile(sketch BYTES, rank FLOAT64, inclusive BOOL) +RETURNS FLOAT64 +LANGUAGE js +OPTIONS ( + library=["gs://$GCS_BUCKET/kll_sketch.js"], + description = '''Returns a value from the sketch that is the best approximation to a value from the original stream with the given rank. +Param sketch: the given sketch in serialized form. +Param rank: rank of a value in the hypothetical sorted stream. +Param inclusive: if true, the given rank is considered inclusive (includes weight of a value) +Returns an approximate quantile associated with the given rank. 
+For more details: https://datasketches.apache.org/docs/KLL/KLLSketch.html''' +) AS R""" +try { + var sketch = Module.kll_sketch_float.deserializeFromB64(sketch); + try { + if (sketch.isEmpty()) return null; + return sketch.getRank(rank, inclusive); + } finally { + sketch.delete(); + } +} catch (e) { + throw new Error(Module.getExceptionMessage(e)); +} +"""; diff --git a/kll_sketch_float_get_rank.sqlx b/kll_sketch_float_get_rank.sqlx new file mode 100644 index 0000000..796717e --- /dev/null +++ b/kll_sketch_float_get_rank.sqlx @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +CREATE OR REPLACE FUNCTION `$BQ_PROJECT.$BQ_DATASET`.kll_sketch_float_get_rank(sketch BYTES, value FLOAT64, inclusive BOOL) +RETURNS FLOAT64 +LANGUAGE js +OPTIONS ( + library=["gs://$GCS_BUCKET/kll_sketch.js"], + description = '''Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive. +Param sketch: the given sketch in serialized form. +Param value: value to be ranked. +Param inclusive: if true the weight of the given value is included into the rank. +Returns an approximate rank of the given value. 
+For more details: https://datasketches.apache.org/docs/KLL/KLLSketch.html''' +) AS R""" +try { + var sketch = Module.kll_sketch_float.deserializeFromB64(sketch); + try { + if (sketch.isEmpty()) return null; + return sketch.getQuantile(value, inclusive); + } finally { + sketch.delete(); + } +} catch (e) { + throw new Error(Module.getExceptionMessage(e)); +} +"""; diff --git a/kll_sketch_float_merge.sqlx b/kll_sketch_float_merge.sqlx new file mode 100644 index 0000000..78f4da7 --- /dev/null +++ b/kll_sketch_float_merge.sqlx @@ -0,0 +1,101 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +CREATE OR REPLACE AGGREGATE FUNCTION `$BQ_PROJECT.$BQ_DATASET`.kll_sketch_float_merge(sketch BYTES, k INT NOT AGGREGATE) +RETURNS BYTES +LANGUAGE js +OPTIONS ( + library=["gs://$GCS_BUCKET/kll_sketch.mjs"], + description = '''Merges sketches from the given column. +Param sketch: the column of values. +Param k: the sketch accuracy/size parameter as an integer in the range [8, 65535]. +Returns a serialized KLL sketch as bytes. 
+For more details: https://datasketches.apache.org/docs/KLL/KLLSketch.html''' +) AS R""" +import ModuleFactory from "gs://$GCS_BUCKET/kll_sketch.mjs"; +var Module = await ModuleFactory(); +const default_k = Number(200); + +// UDAF interface +export function initialState(k) { + try { + var state = { + k: k == null ? default_k : Number(k), + }; + state.sketch = new Module.kll_sketch_float(state.k); + return state; + } catch (e) { + throw new Error(Module.getExceptionMessage(e)); + } +} + +export function aggregate(state, sketch) { + try { + if (state.sketch == null) { // for transition deserialize-aggregate + state.sketch = new Module.kll_sketch_float(state.k); + } + state.sketch.mergeBytes(sketch); + } catch (e) { + throw new Error(Module.getExceptionMessage(e)); + } +} + +export function serialize(state) { + if (state.sketch == null) return state; // for transition deserialize-serialize + try { + // for prior transition deserialize-aggregate + // merge aggregated and serialized state + if (state.sketch != null && state.serialized != null) { + sketch.mergeBytes(state.serialized); + } + return { + k: state.k, + serialized: state.sketch.serializeAsUint8Array() + }; + } catch (e) { + throw new Error(Module.getExceptionMessage(e)); + } finally { + state.sketch.delete(); + } +} + +export function deserialize(serialized) { + return serialized; +} + +export function merge(state, other_state) { + try { + if (state.sketch == null) { + state.sketch = new Module.kll_sketch_float(state.k); + } + if (state.serialized != null) { + state.sketch.mergeBytes(state.serialized); + state.serialized = null; + } + if (other_state.serialized != null) { + state.sketch.mergeBytes(other_state.serialized); + other_state.serialized = null; + } + } catch (e) { + throw new Error(Module.getExceptionMessage(e)); + } +} + +export function finalize(state) { + return serialize(state).serialized; +} +"""; diff --git a/Makefile b/kll_sketch_float_to_tring.sqlx similarity index 53% copy from Makefile 
copy to kll_sketch_float_to_tring.sqlx
index bd6f65f..d3c3384 100644
--- a/Makefile
+++ b/kll_sketch_float_to_tring.sqlx
@@ -15,31 +15,24 @@
 # specific language governing permissions and limitations
 # under the License.
 
-EMCC=emcc
-EMCFLAGS=-Idatasketches-cpp/common/include \
-	-Idatasketches-cpp/theta/include \
-	-Idatasketches-cpp/cpc/include \
-	--no-entry \
-	-sWASM_BIGINT=1 \
-	-sEXPORTED_FUNCTIONS=[_malloc,_free] \
-	-sENVIRONMENT=shell \
-	-sTOTAL_MEMORY=1024MB \
-	-O3 \
-	--bind
-
-all: theta_sketch.mjs theta_sketch.js theta_sketch.wasm cpc_sketch.mjs cpc_sketch.js cpc_sketch.wasm
-
-%.mjs: %.cpp
-	$(EMCC) $< $(EMCFLAGS) -sSINGLE_FILE=1 -o $@
-
-# this rule creates a non-es6 loadable library
-%.js: %.cpp
-	$(EMCC) $< $(EMCFLAGS) -sSINGLE_FILE=1 -o $@
-
-%.wasm: %.cpp
-	$(EMCC) $< $(EMCFLAGS) -sSTANDALONE_WASM=1 -o $@
-
-clean:
-	$(RM) *.mjs *.js *.wasm
-
-.PHONY: clean
+CREATE OR REPLACE FUNCTION `$BQ_PROJECT.$BQ_DATASET`.kll_sketch_float_to_string(base64 BYTES)
+RETURNS STRING
+LANGUAGE js
+OPTIONS (
+  library=["gs://$GCS_BUCKET/kll_sketch.js"],
+  description = '''Returns a summary string that represents the state of the given sketch.
+Param base64 the given sketch as base64 encoded bytes.
+Returns a string that represents the state of the given sketch.
+For more details: https://datasketches.apache.org/docs/KLL/KLLSketch.html'''
+) AS R"""
+try { // outer try: rewraps any failure as a JS Error built from Module.getExceptionMessage
+  var sketch = Module.kll_sketch_float.deserializeFromB64(base64); // rebuild the sketch object from the base64 argument
+  try {
+    return sketch.toString(); // summary text produced by the sketch itself
+  } finally { // always release the sketch object
+    sketch.delete();
+  }
+} catch (e) {
+  throw new Error(Module.getExceptionMessage(e));
+}
+""";
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
