This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 87aeb2ad78e2106f1d8df84d4d84975c7cde5b5a
Author: Gabor Kaszab <gaborkas...@cloudera.com>
AuthorDate: Thu Jul 30 09:41:00 2020 +0200

    IMPALA-9963: Implement ds_kll_n() function

    This function receives a serialized Apache DataSketches KLL sketch and
    returns how many input values were fed into this sketch.

    Change-Id: I166e87a468e68e888ac15fca7429ac2552dbb781
    Reviewed-on: http://gerrit.cloudera.org:8080/16259
    Reviewed-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
    Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
---
 be/src/exprs/datasketches-common.h | 2 +-
 be/src/exprs/datasketches-functions-ir.cc | 11 +++++++
 be/src/exprs/datasketches-functions.h | 5 +++
 common/function-registry/impala_functions.py | 2 ++
 .../queries/QueryTest/datasketches-kll.test | 37 ++++++++++++++++++++++
 5 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/be/src/exprs/datasketches-common.h b/be/src/exprs/datasketches-common.h
index 7560692..37a6458 100644
--- a/be/src/exprs/datasketches-common.h
+++ b/be/src/exprs/datasketches-common.h
@@ -37,7 +37,7 @@ const int DS_SKETCH_CONFIG = 12;
 /// Logs a common error message saying that sketch deserialization failed.
 void LogSketchDeserializationError(FunctionContext* ctx);
 
-/// Receives a serialized DataSketches sketch (either Hll or KLL) in 
+/// Receives a serialized DataSketches sketch (either Hll or KLL) in
 /// 'serialized_sketch', deserializes it and puts the deserialized sketch into 'sketch'.
 /// The outgoing 'sketch' will hold the same configs as 'serialized_sketch' regardless of
 /// what was provided when it was constructed before this function call.
Returns false if diff --git a/be/src/exprs/datasketches-functions-ir.cc b/be/src/exprs/datasketches-functions-ir.cc index d2898bc..b76cbe9 100644 --- a/be/src/exprs/datasketches-functions-ir.cc +++ b/be/src/exprs/datasketches-functions-ir.cc @@ -59,5 +59,16 @@ FloatVal DataSketchesFunctions::DsKllQuantile(FunctionContext* ctx, } } +BigIntVal DataSketchesFunctions::DsKllN(FunctionContext* ctx, + const StringVal& serialized_sketch) { + if (serialized_sketch.is_null || serialized_sketch.len == 0) return BigIntVal::null(); + datasketches::kll_sketch<float> sketch; + if (!DeserializeDsSketch(serialized_sketch, &sketch)) { + LogSketchDeserializationError(ctx); + return BigIntVal::null(); + } + return sketch.get_n(); +} + } diff --git a/be/src/exprs/datasketches-functions.h b/be/src/exprs/datasketches-functions.h index 143fd69..bd6b76c 100644 --- a/be/src/exprs/datasketches-functions.h +++ b/be/src/exprs/datasketches-functions.h @@ -42,6 +42,11 @@ public: /// of [0,1]. Otherwise this function returns error. static FloatVal DsKllQuantile(FunctionContext* ctx, const StringVal& serialized_sketch, const DoubleVal& rank); + + /// 'serialized_sketch' is expected as a serialized Apache DataSketches KLL sketch. If + /// it is not, then the query fails. + /// Returns the number of input values fed to 'serialized_sketch'. 
+ static BigIntVal DsKllN(FunctionContext* ctx, const StringVal& serialized_sketch); }; } diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py index 8398785..fbed357 100644 --- a/common/function-registry/impala_functions.py +++ b/common/function-registry/impala_functions.py @@ -935,6 +935,8 @@ visible_functions = [ '_ZN6impala21DataSketchesFunctions13DsHllEstimateEPN10impala_udf15FunctionContextERKNS1_9StringValE'], [['ds_kll_quantile'], 'FLOAT', ['STRING', 'DOUBLE'], '_ZN6impala21DataSketchesFunctions13DsKllQuantileEPN10impala_udf15FunctionContextERKNS1_9StringValERKNS1_9DoubleValE'], + [['ds_kll_n'], 'BIGINT', ['STRING'], + '_ZN6impala21DataSketchesFunctions6DsKllNEPN10impala_udf15FunctionContextERKNS1_9StringValE'], ] invisible_functions = [ diff --git a/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test b/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test index b7b734b..ee240bf 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test +++ b/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test @@ -144,3 +144,40 @@ FLOAT,FLOAT,FLOAT,FLOAT,FLOAT,FLOAT ---- RESULTS 100.1999969482422,25000.099609375,50.90000152587891,NULL,50.5,NULL ==== +---- QUERY +# Check that ds_kll_n() returns null for an empty sketch. +select ds_kll_n(ds_kll_sketch(cast(f2 as float))) from functional_parquet.emptytable; +---- RESULTS +NULL +---- TYPES +BIGINT +==== +---- QUERY +# Check that ds_kll_n() returns null for a null input. +select ds_kll_n(c) from functional_parquet.nulltable; +---- RESULTS +NULL +---- TYPES +BIGINT +==== +---- QUERY +# Check that ds_kll_n() returns error for strings that are not serialized sketches. 
+select ds_kll_n(date_string_col) from functional_parquet.alltypestiny;
+---- CATCH
+UDF ERROR: Unable to deserialize sketch
+====
+---- QUERY
+select ds_kll_n(float_sketch) from sketch_store where year=2009 and month=1;
+---- RESULTS
+25
+---- TYPES
+BIGINT
+====
+---- QUERY
+# Check that ds_kll_n() works on sketches created by Hive.
+select ds_kll_n(f) from kll_sketches_from_hive;
+---- RESULTS
+6
+---- TYPES
+BIGINT
+====