This is an automated email from the ASF dual-hosted git repository.

jihoonson pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/druid.git


The following commit(s) were added to refs/heads/master by this push:
     new a1ea658  Introducing a new config to ignore nulls while computing 
String Cardinality (#12345)
a1ea658 is described below

commit a1ea6581156bf415fe991cebf74182b24d5b2994
Author: somu-imply <[email protected]>
AuthorDate: Tue Mar 29 14:31:36 2022 -0700

    Introducing a new config to ignore nulls while computing String Cardinality 
(#12345)
    
    * Counting nulls in String cardinality with a config
    
    * Adding tests for the new config
    
    * Wrapping the vectorize part to allow backward compatibility
    
    * Adding different tests, cleaning the code and putting the check at the 
proper position, handling hasRow() and hasValue() changes
    
    * Updating testcase and code
    
    * Adding null handling test to improve coverage
    
    * Checkstyle fix
    
    * Adding 1 more change in docs
    
    * Making docs clearer
---
 .../apache/druid/common/config/NullHandling.java   |  20 ++-
 .../common/config/NullValueHandlingConfig.java     |  28 ++++-
 .../druid/common/config/NullHandlingTest.java      |  11 ++
 docs/configuration/index.md                        |   2 +-
 ...ardinalityAggregatorColumnSelectorStrategy.java |   3 +-
 ...ingleValueStringCardinalityVectorProcessor.java |   2 +-
 .../cardinality/CardinalityAggregatorTest.java     | 140 +++++++++++++++++++--
 website/.spelling                                  |   1 +
 8 files changed, 193 insertions(+), 14 deletions(-)

diff --git 
a/core/src/main/java/org/apache/druid/common/config/NullHandling.java 
b/core/src/main/java/org/apache/druid/common/config/NullHandling.java
index bd0f0ee..f7d3146 100644
--- a/core/src/main/java/org/apache/druid/common/config/NullHandling.java
+++ b/core/src/main/java/org/apache/druid/common/config/NullHandling.java
@@ -58,7 +58,13 @@ public class NullHandling
   @VisibleForTesting
   public static void initializeForTests()
   {
-    INSTANCE = new NullValueHandlingConfig(null);
+    INSTANCE = new NullValueHandlingConfig(null, null);
+  }
+
+  @VisibleForTesting
+  public static void initializeForTestsWithValues(Boolean useDefForNull, 
Boolean ignoreNullForString)
+  {
+    INSTANCE = new NullValueHandlingConfig(useDefForNull, ignoreNullForString);
   }
 
   /**
@@ -73,6 +79,18 @@ public class NullHandling
     return INSTANCE.isUseDefaultValuesForNull();
   }
 
+  /**
+   * whether nulls should be counted during String cardinality
+   */
+  public static boolean ignoreNullsForStringCardinality()
+  {
+    // this should only be null in a unit test context, in production this 
will be injected by the null handling module
+    if (INSTANCE == null) {
+      throw new IllegalStateException("NullHandling module not initialized, 
call NullHandling.initializeForTests()");
+    }
+    return INSTANCE.isIgnoreNullsForStringCardinality();
+  }
+
   public static boolean sqlCompatible()
   {
     return !replaceWithDefault();
diff --git 
a/core/src/main/java/org/apache/druid/common/config/NullValueHandlingConfig.java
 
b/core/src/main/java/org/apache/druid/common/config/NullValueHandlingConfig.java
index c733f84..fbdc852 100644
--- 
a/core/src/main/java/org/apache/druid/common/config/NullValueHandlingConfig.java
+++ 
b/core/src/main/java/org/apache/druid/common/config/NullValueHandlingConfig.java
@@ -26,13 +26,22 @@ public class NullValueHandlingConfig
 {
   public static final String NULL_HANDLING_CONFIG_STRING = 
"druid.generic.useDefaultValueForNull";
 
+  //added to preserve backward compatibility
+  //and not count nulls during cardinality aggrgation over strings
+
+  public static final String NULL_HANDLING_DURING_STRING_CARDINALITY = 
"druid.generic.ignoreNullsForStringCardinality";
+
   @JsonProperty("useDefaultValueForNull")
   private final boolean useDefaultValuesForNull;
 
+  @JsonProperty("ignoreNullsForStringCardinality")
+  private final boolean ignoreNullsForStringCardinality;
+
 
   @JsonCreator
   public NullValueHandlingConfig(
-      @JsonProperty("useDefaultValueForNull") Boolean useDefaultValuesForNull
+      @JsonProperty("useDefaultValueForNull") Boolean useDefaultValuesForNull,
+      @JsonProperty("ignoreNullsForStringCardinality") Boolean 
ignoreNullsForStringCardinality
   )
   {
     if (useDefaultValuesForNull == null) {
@@ -40,6 +49,23 @@ public class NullValueHandlingConfig
     } else {
       this.useDefaultValuesForNull = useDefaultValuesForNull;
     }
+    if (ignoreNullsForStringCardinality == null) {
+      this.ignoreNullsForStringCardinality = 
Boolean.valueOf(System.getProperty(
+          NULL_HANDLING_DURING_STRING_CARDINALITY,
+          "false"
+      ));
+    } else {
+      if (this.useDefaultValuesForNull) {
+        this.ignoreNullsForStringCardinality = ignoreNullsForStringCardinality;
+      } else {
+        this.ignoreNullsForStringCardinality = false;
+      }
+    }
+  }
+
+  public boolean isIgnoreNullsForStringCardinality()
+  {
+    return ignoreNullsForStringCardinality;
   }
 
   public boolean isUseDefaultValuesForNull()
diff --git 
a/core/src/test/java/org/apache/druid/common/config/NullHandlingTest.java 
b/core/src/test/java/org/apache/druid/common/config/NullHandlingTest.java
index 36e3f36..1419c61 100644
--- a/core/src/test/java/org/apache/druid/common/config/NullHandlingTest.java
+++ b/core/src/test/java/org/apache/druid/common/config/NullHandlingTest.java
@@ -90,4 +90,15 @@ public class NullHandlingTest
   {
     Assert.assertNull(NullHandling.defaultValueForClass(Object.class));
   }
+
+  @Test
+  public void test_ignoreNullsStrings()
+  {
+    NullHandling.initializeForTestsWithValues(false, true);
+    Assert.assertFalse(NullHandling.ignoreNullsForStringCardinality());
+
+    NullHandling.initializeForTestsWithValues(true, false);
+    Assert.assertFalse(NullHandling.ignoreNullsForStringCardinality());
+
+  }
 }
diff --git a/docs/configuration/index.md b/docs/configuration/index.md
index bc4d10c..6383778 100644
--- a/docs/configuration/index.md
+++ b/docs/configuration/index.md
@@ -761,7 +761,7 @@ Prior to version 0.13.0, Druid string columns treated `''` 
and `null` values as
 |Property|Description|Default|
 |---|---|---|
 |`druid.generic.useDefaultValueForNull`|When set to `true`, `null` values will 
be stored as `''` for string columns and `0` for numeric columns. Set to 
`false` to store and query data in SQL compatible mode.|`true`|
-
+|`druid.generic.ignoreNullsForStringCardinality`|When set to `true`, `null` 
values will be ignored for the built-in cardinality aggregator over string 
columns. Set to `false` to include `null` values while estimating cardinality 
of only string columns using the built-in cardinality aggregator. This setting 
takes effect only when `druid.generic.useDefaultValueForNull` is set to `true` 
and is ignored in SQL compatibility mode. Additionally, empty strings 
(equivalent to null) are not counte [...]
 This mode does have a storage size and query performance cost, see [segment 
documentation](../design/segments.md#sql-compatible-null-handling) for more 
details.
 
 ### HTTP Client
diff --git 
a/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/types/StringCardinalityAggregatorColumnSelectorStrategy.java
 
b/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/types/StringCardinalityAggregatorColumnSelectorStrategy.java
index 9d49e30..cfa68d0 100644
--- 
a/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/types/StringCardinalityAggregatorColumnSelectorStrategy.java
+++ 
b/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/types/StringCardinalityAggregatorColumnSelectorStrategy.java
@@ -39,7 +39,8 @@ public class 
StringCardinalityAggregatorColumnSelectorStrategy implements Cardin
     // SQL standard spec does not count null values,
     // Skip counting null values when we are not replacing null with default 
value.
     // A special value for null in case null handling is configured to use 
empty string for null.
-    if (NullHandling.replaceWithDefault() || s != null) {
+    // check if nulls are to be ignored
+    if ((NullHandling.replaceWithDefault() && 
!NullHandling.ignoreNullsForStringCardinality()) || s != null) {
       
collector.add(CardinalityAggregator.HASH_FUNCTION.hashUnencodedChars(nullToSpecial(s)).asBytes());
     }
   }
diff --git 
a/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/vector/SingleValueStringCardinalityVectorProcessor.java
 
b/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/vector/SingleValueStringCardinalityVectorProcessor.java
index 080ce04..e9173d8 100644
--- 
a/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/vector/SingleValueStringCardinalityVectorProcessor.java
+++ 
b/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/vector/SingleValueStringCardinalityVectorProcessor.java
@@ -51,6 +51,7 @@ public class SingleValueStringCardinalityVectorProcessor 
implements CardinalityV
 
       final HyperLogLogCollector collector = 
HyperLogLogCollector.makeCollector(buf);
 
+
       for (int i = startRow; i < endRow; i++) {
         final String value = selector.lookupName(vector[i]);
         
StringCardinalityAggregatorColumnSelectorStrategy.addStringToCollector(collector,
 value);
@@ -74,7 +75,6 @@ public class SingleValueStringCardinalityVectorProcessor 
implements CardinalityV
 
       for (int i = 0; i < numRows; i++) {
         final String s = selector.lookupName(vector[rows != null ? rows[i] : 
i]);
-
         if (NullHandling.replaceWithDefault() || s != null) {
           final int position = positions[i] + positionOffset;
           buf.limit(position + 
HyperLogLogCollector.getLatestNumBytesForDenseStorage());
diff --git 
a/processing/src/test/java/org/apache/druid/query/aggregation/cardinality/CardinalityAggregatorTest.java
 
b/processing/src/test/java/org/apache/druid/query/aggregation/cardinality/CardinalityAggregatorTest.java
index 65631c5..5d87586 100644
--- 
a/processing/src/test/java/org/apache/druid/query/aggregation/cardinality/CardinalityAggregatorTest.java
+++ 
b/processing/src/test/java/org/apache/druid/query/aggregation/cardinality/CardinalityAggregatorTest.java
@@ -381,12 +381,12 @@ public class CardinalityAggregatorTest
   @Test
   public void testAggregateRows()
   {
+    NullHandling.initializeForTestsWithValues(null, null);
     CardinalityAggregator agg = new CardinalityAggregator(
         dimInfoList,
         true
     );
 
-
     for (int i = 0; i < VALUES1.size(); ++i) {
       aggregate(selectorList, agg);
     }
@@ -397,6 +397,7 @@ public class CardinalityAggregatorTest
   @Test
   public void testAggregateValues()
   {
+    NullHandling.initializeForTestsWithValues(null, null);
     CardinalityAggregator agg = new CardinalityAggregator(
         dimInfoList,
         false
@@ -405,13 +406,22 @@ public class CardinalityAggregatorTest
     for (int i = 0; i < VALUES1.size(); ++i) {
       aggregate(selectorList, agg);
     }
-    Assert.assertEquals(NullHandling.replaceWithDefault() ? 7.0 : 6.0, 
(Double) valueAggregatorFactory.finalizeComputation(agg.get()), 0.05);
-    Assert.assertEquals(NullHandling.replaceWithDefault() ? 7L : 6L, 
rowAggregatorFactoryRounded.finalizeComputation(agg.get()));
+    Assert.assertEquals(
+        NullHandling.replaceWithDefault() ? 7.0 : 6.0,
+        (Double) valueAggregatorFactory.finalizeComputation(agg.get()),
+        0.05
+    );
+    Assert.assertEquals(
+        NullHandling.replaceWithDefault() ? 7L : 6L,
+        rowAggregatorFactoryRounded.finalizeComputation(agg.get())
+    );
+
   }
 
   @Test
   public void testBufferAggregateRows()
   {
+    NullHandling.initializeForTestsWithValues(null, null);
     CardinalityBufferAggregator agg = new CardinalityBufferAggregator(
         dimInfoList.toArray(new ColumnSelectorPlus[0]),
         true
@@ -434,6 +444,7 @@ public class CardinalityAggregatorTest
   @Test
   public void testBufferAggregateValues()
   {
+    NullHandling.initializeForTestsWithValues(null, null);
     CardinalityBufferAggregator agg = new CardinalityBufferAggregator(
         dimInfoList.toArray(new ColumnSelectorPlus[0]),
         false
@@ -449,13 +460,21 @@ public class CardinalityAggregatorTest
     for (int i = 0; i < VALUES1.size(); ++i) {
       bufferAggregate(selectorList, agg, buf, pos);
     }
-    Assert.assertEquals(NullHandling.replaceWithDefault() ? 7.0 : 6.0, 
(Double) valueAggregatorFactory.finalizeComputation(agg.get(buf, pos)), 0.05);
-    Assert.assertEquals(NullHandling.replaceWithDefault() ? 7L : 6L, 
rowAggregatorFactoryRounded.finalizeComputation(agg.get(buf, pos)));
+    Assert.assertEquals(
+        NullHandling.replaceWithDefault() ? 7.0 : 6.0,
+        (Double) valueAggregatorFactory.finalizeComputation(agg.get(buf, pos)),
+        0.05
+    );
+    Assert.assertEquals(
+        NullHandling.replaceWithDefault() ? 7L : 6L,
+        rowAggregatorFactoryRounded.finalizeComputation(agg.get(buf, pos))
+    );
   }
 
   @Test
   public void testCombineRows()
   {
+    NullHandling.initializeForTestsWithValues(null, null);
     List<DimensionSelector> selector1 = Collections.singletonList(dim1);
     List<DimensionSelector> selector2 = Collections.singletonList(dim2);
     List<ColumnSelectorPlus<CardinalityAggregatorColumnSelectorStrategy>> 
dimInfo1 = Collections.singletonList(
@@ -501,6 +520,7 @@ public class CardinalityAggregatorTest
   @Test
   public void testCombineValues()
   {
+    NullHandling.initializeForTestsWithValues(null, null);
     List<DimensionSelector> selector1 = Collections.singletonList(dim1);
     List<DimensionSelector> selector2 = Collections.singletonList(dim2);
 
@@ -528,10 +548,16 @@ public class CardinalityAggregatorTest
     for (int i = 0; i < VALUES2.size(); ++i) {
       aggregate(selector2, agg2);
     }
-
-    Assert.assertEquals(NullHandling.replaceWithDefault() ? 4.0 : 3.0, 
(Double) valueAggregatorFactory.finalizeComputation(agg1.get()), 0.05);
-    Assert.assertEquals(NullHandling.replaceWithDefault() ? 7.0 : 6.0, 
(Double) valueAggregatorFactory.finalizeComputation(agg2.get()), 0.05);
-
+    Assert.assertEquals(
+        NullHandling.replaceWithDefault() ? 4.0 : 3.0,
+        (Double) valueAggregatorFactory.finalizeComputation(agg1.get()),
+        0.05
+    );
+    Assert.assertEquals(
+        NullHandling.replaceWithDefault() ? 7.0 : 6.0,
+        (Double) valueAggregatorFactory.finalizeComputation(agg2.get()),
+        0.05
+    );
     Assert.assertEquals(
         NullHandling.replaceWithDefault() ? 7.0 : 6.0,
         (Double) rowAggregatorFactory.finalizeComputation(
@@ -547,6 +573,7 @@ public class CardinalityAggregatorTest
   @Test
   public void testAggregateRowsWithExtraction()
   {
+    NullHandling.initializeForTestsWithValues(null, null);
     CardinalityAggregator agg = new CardinalityAggregator(
         dimInfoListWithExtraction,
         true
@@ -569,6 +596,7 @@ public class CardinalityAggregatorTest
   @Test
   public void testAggregateValuesWithExtraction()
   {
+    NullHandling.initializeForTestsWithValues(null, null);
     CardinalityAggregator agg = new CardinalityAggregator(
         dimInfoListWithExtraction,
         false
@@ -635,4 +663,98 @@ public class CardinalityAggregatorTest
         objectMapper.readValue(objectMapper.writeValueAsString(factory2), 
AggregatorFactory.class)
     );
   }
+
+  //ignoreNullsForStringCardinality tests
+  @Test
+  public void testAggregateRowsIgnoreNulls()
+  {
+    NullHandling.initializeForTestsWithValues(null, true);
+    CardinalityAggregator agg = new CardinalityAggregator(
+        dimInfoList,
+        true
+    );
+
+    for (int i = 0; i < VALUES1.size(); ++i) {
+      aggregate(selectorList, agg);
+    }
+    Assert.assertEquals(9.0, (Double) 
rowAggregatorFactory.finalizeComputation(agg.get()), 0.05);
+    Assert.assertEquals(9L, 
rowAggregatorFactoryRounded.finalizeComputation(agg.get()));
+  }
+
+  @Test
+  public void testAggregateValuesIgnoreNulls()
+  {
+    NullHandling.initializeForTestsWithValues(null, true);
+    CardinalityAggregator agg = new CardinalityAggregator(
+        dimInfoList,
+        false
+    );
+
+    for (int i = 0; i < VALUES1.size(); ++i) {
+      aggregate(selectorList, agg);
+    }
+    //setting is not applied when druid.generic.useDefaultValueForNull=false
+    Assert.assertEquals(
+        NullHandling.replaceWithDefault() ? 6.0 : 6.0,
+        (Double) valueAggregatorFactory.finalizeComputation(agg.get()),
+        0.05
+    );
+    Assert.assertEquals(
+        NullHandling.replaceWithDefault() ? 6L : 6L,
+        rowAggregatorFactoryRounded.finalizeComputation(agg.get())
+    );
+  }
+
+  @Test
+  public void testCombineValuesIgnoreNulls()
+  {
+    NullHandling.initializeForTestsWithValues(null, true);
+    List<DimensionSelector> selector1 = Collections.singletonList(dim1);
+    List<DimensionSelector> selector2 = Collections.singletonList(dim2);
+
+    List<ColumnSelectorPlus<CardinalityAggregatorColumnSelectorStrategy>> 
dimInfo1 = Collections.singletonList(
+        new ColumnSelectorPlus<>(
+            dimSpec1.getDimension(),
+            dimSpec1.getOutputName(),
+            new StringCardinalityAggregatorColumnSelectorStrategy(), dim1
+        )
+    );
+    List<ColumnSelectorPlus<CardinalityAggregatorColumnSelectorStrategy>> 
dimInfo2 = Collections.singletonList(
+        new ColumnSelectorPlus<>(
+            dimSpec1.getDimension(),
+            dimSpec1.getOutputName(),
+            new StringCardinalityAggregatorColumnSelectorStrategy(), dim2
+        )
+    );
+
+    CardinalityAggregator agg1 = new CardinalityAggregator(dimInfo1, false);
+    CardinalityAggregator agg2 = new CardinalityAggregator(dimInfo2, false);
+
+    for (int i = 0; i < VALUES1.size(); ++i) {
+      aggregate(selector1, agg1);
+    }
+    for (int i = 0; i < VALUES2.size(); ++i) {
+      aggregate(selector2, agg2);
+    }
+    Assert.assertEquals(
+        NullHandling.replaceWithDefault() ? 3.0 : 3.0,
+        (Double) valueAggregatorFactory.finalizeComputation(agg1.get()),
+        0.05
+    );
+    Assert.assertEquals(
+        NullHandling.replaceWithDefault() ? 6.0 : 6.0,
+        (Double) valueAggregatorFactory.finalizeComputation(agg2.get()),
+        0.05
+    );
+    Assert.assertEquals(
+        NullHandling.replaceWithDefault() ? 6.0 : 6.0,
+        (Double) rowAggregatorFactory.finalizeComputation(
+            rowAggregatorFactory.combine(
+                agg1.get(),
+                agg2.get()
+            )
+        ),
+        0.05
+    );
+  }
 }
diff --git a/website/.spelling b/website/.spelling
index b627fb1..ade85e5 100644
--- a/website/.spelling
+++ b/website/.spelling
@@ -694,6 +694,7 @@ doubleMeanNoNulls
 doubleMin
 doubleSum
 druid.generic.useDefaultValueForNull
+druid.generic.ignoreNullsForStringCardinality
 limitSpec
 longMax
 longAny

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to