This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push: new 21918ef IMPALA-9942: DataSketches HLL shouldn't take empty strings as distinct values 21918ef is described below commit 21918ef18b166021577770cb55b70bb2ccad0213 Author: Adam Tamas <ta...@cloudera.com> AuthorDate: Tue Jul 21 13:53:03 2020 +0200 IMPALA-9942: DataSketches HLL shouldn't take empty strings as distinct values In Hive, empty strings don't count as separate values when querying count(distinct) estimates using the Apache DataSketches HLL algorithm on strings and varchars. For compatibility's sake, Impala should not count them either. Tests: -added extra tests for hll with empty strings Change-Id: Ie7648217bbe2f66b817788f131c062f349b1e9ad Reviewed-on: http://gerrit.cloudera.org:8080/16226 Reviewed-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> --- be/src/exprs/aggregate-functions-ir.cc | 2 +- .../queries/QueryTest/datasketches-hll.test | 31 +++++++++++++++++++--- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/be/src/exprs/aggregate-functions-ir.cc b/be/src/exprs/aggregate-functions-ir.cc index 09629b8..e3db0cc 100644 --- a/be/src/exprs/aggregate-functions-ir.cc +++ b/be/src/exprs/aggregate-functions-ir.cc @@ -1666,7 +1666,7 @@ void AggregateFunctions::DsHllUpdate(FunctionContext* ctx, const T& src, template <> void AggregateFunctions::DsHllUpdate( FunctionContext* ctx, const StringVal& src, StringVal* dst) { - if (src.is_null) return; + if (src.is_null || src.len == 0) return; DCHECK(!dst->is_null); DCHECK_EQ(dst->len, sizeof(datasketches::hll_sketch)); datasketches::hll_sketch* sketch_ptr = diff --git a/testdata/workloads/functional-query/queries/QueryTest/datasketches-hll.test b/testdata/workloads/functional-query/queries/QueryTest/datasketches-hll.test index bb192ca..b55e85f 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/datasketches-hll.test +++ 
b/testdata/workloads/functional-query/queries/QueryTest/datasketches-hll.test @@ -141,12 +141,12 @@ select ds_hll_estimate(date_string_col) from functional_parquet.alltypestiny; UDF ERROR: Unable to deserialize sketch. ==== ---- QUERY -# Check that ds_hll_estimate returns null for null inputs. -select ds_hll_estimate(c) from functional_parquet.nulltable; +# Check that ds_hll_estimate returns null for null and empty string inputs. +select ds_hll_estimate(b), ds_hll_estimate(c) from functional_parquet.nulltable; ---- RESULTS -NULL +NULL,NULL ---- TYPES -BIGINT +BIGINT,BIGINT ==== ---- QUERY # Check that sketches made by Hive can be read and used for estimating by Impala. @@ -226,3 +226,26 @@ BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT ---- RESULTS 5,7,6,5,6,8,6,6,NULL ==== +---- QUERY +# IMPALA-9942: DataSketches HLL shouldn't take empty strings as distinct values +create table empty_string (s string, v varchar(1), c char(1)); +insert into empty_string values + ("", cast("" as varchar(1)), cast("" as char(1))), + ("a", cast("a" as varchar(1)), cast("a" as char(1))), + ("", cast("" as varchar(1)), cast("" as char(1))), + ("b", cast("b" as varchar(1)), cast("b" as char(1))), + ("b", cast("b" as varchar(1)), cast("b" as char(1))); +# Check if HLL works with empty strings. +select + ds_hll_estimate(ds_hll_sketch(s)), + ds_hll_estimate(ds_hll_sketch(v)), + ds_hll_estimate(ds_hll_sketch(c)), + ds_hll_sketch_and_estimate(s), + ds_hll_sketch_and_estimate(v), + ds_hll_sketch_and_estimate(c) +from empty_string +---- RESULTS +2,2,3,2,2,3 +---- TYPES +BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT +====