This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new 21918ef  IMPALA-9942: DataSketches HLL shouldn't take empty strings as distinct values
21918ef is described below

commit 21918ef18b166021577770cb55b70bb2ccad0213
Author: Adam Tamas <ta...@cloudera.com>
AuthorDate: Tue Jul 21 13:53:03 2020 +0200

    IMPALA-9942: DataSketches HLL shouldn't take empty strings as distinct values
    
    In Hive, empty strings don't count as separate values when querying
    count(distinct) estimates using the Apache DataSketches HLL algorithm
    on strings and varchars.
    For compatibility's sake, Impala should not count them either.
    
    Tests:
    - Added extra tests for HLL with empty strings
    
    Change-Id: Ie7648217bbe2f66b817788f131c062f349b1e9ad
    Reviewed-on: http://gerrit.cloudera.org:8080/16226
    Reviewed-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
    Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
---
 be/src/exprs/aggregate-functions-ir.cc             |  2 +-
 .../queries/QueryTest/datasketches-hll.test        | 31 +++++++++++++++++++---
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/be/src/exprs/aggregate-functions-ir.cc 
b/be/src/exprs/aggregate-functions-ir.cc
index 09629b8..e3db0cc 100644
--- a/be/src/exprs/aggregate-functions-ir.cc
+++ b/be/src/exprs/aggregate-functions-ir.cc
@@ -1666,7 +1666,7 @@ void AggregateFunctions::DsHllUpdate(FunctionContext* 
ctx, const T& src,
 template <>
 void AggregateFunctions::DsHllUpdate(
     FunctionContext* ctx, const StringVal& src, StringVal* dst) {
-  if (src.is_null) return;
+  if (src.is_null || src.len == 0) return;
   DCHECK(!dst->is_null);
   DCHECK_EQ(dst->len, sizeof(datasketches::hll_sketch));
   datasketches::hll_sketch* sketch_ptr =
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/datasketches-hll.test 
b/testdata/workloads/functional-query/queries/QueryTest/datasketches-hll.test
index bb192ca..b55e85f 100644
--- 
a/testdata/workloads/functional-query/queries/QueryTest/datasketches-hll.test
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/datasketches-hll.test
@@ -141,12 +141,12 @@ select ds_hll_estimate(date_string_col) from 
functional_parquet.alltypestiny;
 UDF ERROR: Unable to deserialize sketch.
 ====
 ---- QUERY
-# Check that ds_hll_estimate returns null for null inputs.
-select ds_hll_estimate(c) from functional_parquet.nulltable;
+# Check that ds_hll_estimate returns null for null and empty string inputs.
+select ds_hll_estimate(b), ds_hll_estimate(c) from 
functional_parquet.nulltable;
 ---- RESULTS
-NULL
+NULL,NULL
 ---- TYPES
-BIGINT
+BIGINT,BIGINT
 ====
 ---- QUERY
 # Check that sketches made by Hive can be read and used for estimating by 
Impala.
@@ -226,3 +226,26 @@ 
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
 ---- RESULTS
 5,7,6,5,6,8,6,6,NULL
 ====
+---- QUERY
+# IMPALA-9942: DataSketches HLL shouldn't take empty strings as distinct values
+create table empty_string (s string, v varchar(1), c char(1));
+insert into empty_string values
+    ("", cast("" as varchar(1)), cast("" as char(1))),
+    ("a", cast("a" as varchar(1)), cast("a" as char(1))),
+    ("", cast("" as varchar(1)), cast("" as char(1))),
+    ("b", cast("b" as varchar(1)), cast("b" as char(1))),
+    ("b", cast("b" as varchar(1)), cast("b" as char(1)));
+# Check if HLL works with empty strings.
+select
+    ds_hll_estimate(ds_hll_sketch(s)),
+    ds_hll_estimate(ds_hll_sketch(v)),
+    ds_hll_estimate(ds_hll_sketch(c)),
+    ds_hll_sketch_and_estimate(s),
+    ds_hll_sketch_and_estimate(v),
+    ds_hll_sketch_and_estimate(c)
+from empty_string
+---- RESULTS
+2,2,3,2,2,3
+---- TYPES
+BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
+====

Reply via email to