This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 87aeb2ad78e2106f1d8df84d4d84975c7cde5b5a
Author: Gabor Kaszab <gaborkas...@cloudera.com>
AuthorDate: Thu Jul 30 09:41:00 2020 +0200

    IMPALA-9963: Implement ds_kll_n() function
    
    This function receives a serialized Apache DataSketches KLL sketch
    and returns how many input values were fed into this sketch.
    
    Change-Id: I166e87a468e68e888ac15fca7429ac2552dbb781
    Reviewed-on: http://gerrit.cloudera.org:8080/16259
    Reviewed-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
    Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
---
 be/src/exprs/datasketches-common.h                 |  2 +-
 be/src/exprs/datasketches-functions-ir.cc          | 11 +++++++
 be/src/exprs/datasketches-functions.h              |  5 +++
 common/function-registry/impala_functions.py       |  2 ++
 .../queries/QueryTest/datasketches-kll.test        | 37 ++++++++++++++++++++++
 5 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/be/src/exprs/datasketches-common.h b/be/src/exprs/datasketches-common.h
index 7560692..37a6458 100644
--- a/be/src/exprs/datasketches-common.h
+++ b/be/src/exprs/datasketches-common.h
@@ -37,7 +37,7 @@ const int DS_SKETCH_CONFIG = 12;
 /// Logs a common error message saying that sketch deserialization failed.
 void LogSketchDeserializationError(FunctionContext* ctx);
 
-/// Receives a serialized DataSketches sketch  (either Hll or KLL) in
+/// Receives a serialized DataSketches sketch (either Hll or KLL) in
 /// 'serialized_sketch', deserializes it and puts the deserialized sketch into 'sketch'.
 /// The outgoing 'sketch' will hold the same configs as 'serialized_sketch' regardless of
 /// what was provided when it was constructed before this function call. Returns false if
diff --git a/be/src/exprs/datasketches-functions-ir.cc b/be/src/exprs/datasketches-functions-ir.cc
index d2898bc..b76cbe9 100644
--- a/be/src/exprs/datasketches-functions-ir.cc
+++ b/be/src/exprs/datasketches-functions-ir.cc
@@ -59,5 +59,16 @@ FloatVal DataSketchesFunctions::DsKllQuantile(FunctionContext* ctx,
   }
 }
 
+BigIntVal DataSketchesFunctions::DsKllN(FunctionContext* ctx,
+    const StringVal& serialized_sketch) {
+  if (serialized_sketch.is_null || serialized_sketch.len == 0) return BigIntVal::null();
+  datasketches::kll_sketch<float> sketch;  // Populated by DeserializeDsSketch() below.
+  if (!DeserializeDsSketch(serialized_sketch, &sketch)) {
+    LogSketchDeserializationError(ctx);  // Logs the common deserialization-failure message.
+    return BigIntVal::null();
+  }
+  return sketch.get_n();  // Implicitly converted to the BigIntVal return type.
+}
+
 }
 
diff --git a/be/src/exprs/datasketches-functions.h b/be/src/exprs/datasketches-functions.h
index 143fd69..bd6b76c 100644
--- a/be/src/exprs/datasketches-functions.h
+++ b/be/src/exprs/datasketches-functions.h
@@ -42,6 +42,11 @@ public:
   /// of [0,1]. Otherwise this function returns error.
   static FloatVal DsKllQuantile(FunctionContext* ctx, const StringVal& serialized_sketch,
       const DoubleVal& rank);
+
+  /// 'serialized_sketch' is expected as a serialized Apache DataSketches KLL sketch. If
+  /// it is not, then the query fails. Returns NULL for a NULL or empty
+  /// 'serialized_sketch', otherwise the number of input values fed to the sketch.
+  static BigIntVal DsKllN(FunctionContext* ctx, const StringVal& serialized_sketch);
 };
 
 }
diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py
index 8398785..fbed357 100644
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -935,6 +935,8 @@ visible_functions = [
       '_ZN6impala21DataSketchesFunctions13DsHllEstimateEPN10impala_udf15FunctionContextERKNS1_9StringValE'],
   [['ds_kll_quantile'], 'FLOAT', ['STRING', 'DOUBLE'],
       '_ZN6impala21DataSketchesFunctions13DsKllQuantileEPN10impala_udf15FunctionContextERKNS1_9StringValERKNS1_9DoubleValE'],
+  [['ds_kll_n'], 'BIGINT', ['STRING'],
+      '_ZN6impala21DataSketchesFunctions6DsKllNEPN10impala_udf15FunctionContextERKNS1_9StringValE'],
 ]
 
 invisible_functions = [
diff --git a/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test b/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test
index b7b734b..ee240bf 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test
@@ -144,3 +144,40 @@ FLOAT,FLOAT,FLOAT,FLOAT,FLOAT,FLOAT
 ---- RESULTS
 100.1999969482422,25000.099609375,50.90000152587891,NULL,50.5,NULL
 ====
+---- QUERY
+# Check that ds_kll_n() returns null for an empty sketch.
+select ds_kll_n(ds_kll_sketch(cast(f2 as float))) from functional_parquet.emptytable;
+---- RESULTS
+NULL
+---- TYPES
+BIGINT
+====
+---- QUERY
+# Check that ds_kll_n() returns null for a null input.
+select ds_kll_n(c) from functional_parquet.nulltable;
+---- RESULTS
+NULL
+---- TYPES
+BIGINT
+====
+---- QUERY
+# Check that ds_kll_n() returns error for strings that are not serialized sketches.
+select ds_kll_n(date_string_col) from functional_parquet.alltypestiny;
+---- CATCH
+UDF ERROR: Unable to deserialize sketch
+====
+---- QUERY
+select ds_kll_n(float_sketch) from sketch_store where year=2009 and month=1;
+---- RESULTS
+25
+---- TYPES
+BIGINT
+====
+---- QUERY
+# Check that ds_kll_n() works on sketches created by Hive.
+select ds_kll_n(f) from kll_sketches_from_hive;
+---- RESULTS
+6
+---- TYPES
+BIGINT
+====

Reply via email to