This is an automated email from the ASF dual-hosted git repository.
jihoonson pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/druid.git
The following commit(s) were added to refs/heads/master by this push:
new a1ea658 Introducing a new config to ignore nulls while computing
String Cardinality (#12345)
a1ea658 is described below
commit a1ea6581156bf415fe991cebf74182b24d5b2994
Author: somu-imply <[email protected]>
AuthorDate: Tue Mar 29 14:31:36 2022 -0700
Introducing a new config to ignore nulls while computing String Cardinality
(#12345)
* Counting nulls in String cardinality with a config
* Adding tests for the new config
* Wrapping the vectorize part to allow backward compatibility
* Adding different tests, cleaning the code and putting the check at the
proper position, handling hasRow() and hasValue() changes
* Updating testcase and code
* Adding null handling test to improve coverage
* Checkstyle fix
* Adding 1 more change in docs
* Making docs clearer
---
.../apache/druid/common/config/NullHandling.java | 20 ++-
.../common/config/NullValueHandlingConfig.java | 28 ++++-
.../druid/common/config/NullHandlingTest.java | 11 ++
docs/configuration/index.md | 2 +-
...ardinalityAggregatorColumnSelectorStrategy.java | 3 +-
...ingleValueStringCardinalityVectorProcessor.java | 2 +-
.../cardinality/CardinalityAggregatorTest.java | 140 +++++++++++++++++++--
website/.spelling | 1 +
8 files changed, 193 insertions(+), 14 deletions(-)
diff --git
a/core/src/main/java/org/apache/druid/common/config/NullHandling.java
b/core/src/main/java/org/apache/druid/common/config/NullHandling.java
index bd0f0ee..f7d3146 100644
--- a/core/src/main/java/org/apache/druid/common/config/NullHandling.java
+++ b/core/src/main/java/org/apache/druid/common/config/NullHandling.java
@@ -58,7 +58,13 @@ public class NullHandling
@VisibleForTesting
public static void initializeForTests()
{
- INSTANCE = new NullValueHandlingConfig(null);
+ INSTANCE = new NullValueHandlingConfig(null, null);
+ }
+
+ @VisibleForTesting
+ public static void initializeForTestsWithValues(Boolean useDefForNull,
Boolean ignoreNullForString)
+ {
+ INSTANCE = new NullValueHandlingConfig(useDefForNull, ignoreNullForString);
}
/**
@@ -73,6 +79,18 @@ public class NullHandling
return INSTANCE.isUseDefaultValuesForNull();
}
+ /**
+ * whether nulls should be counted during String cardinality
+ */
+ public static boolean ignoreNullsForStringCardinality()
+ {
+ // this should only be null in a unit test context, in production this
will be injected by the null handling module
+ if (INSTANCE == null) {
+ throw new IllegalStateException("NullHandling module not initialized,
call NullHandling.initializeForTests()");
+ }
+ return INSTANCE.isIgnoreNullsForStringCardinality();
+ }
+
public static boolean sqlCompatible()
{
return !replaceWithDefault();
diff --git
a/core/src/main/java/org/apache/druid/common/config/NullValueHandlingConfig.java
b/core/src/main/java/org/apache/druid/common/config/NullValueHandlingConfig.java
index c733f84..fbdc852 100644
---
a/core/src/main/java/org/apache/druid/common/config/NullValueHandlingConfig.java
+++
b/core/src/main/java/org/apache/druid/common/config/NullValueHandlingConfig.java
@@ -26,13 +26,22 @@ public class NullValueHandlingConfig
{
public static final String NULL_HANDLING_CONFIG_STRING =
"druid.generic.useDefaultValueForNull";
+ //added to preserve backward compatibility
+ //and not count nulls during cardinality aggrgation over strings
+
+ public static final String NULL_HANDLING_DURING_STRING_CARDINALITY =
"druid.generic.ignoreNullsForStringCardinality";
+
@JsonProperty("useDefaultValueForNull")
private final boolean useDefaultValuesForNull;
+ @JsonProperty("ignoreNullsForStringCardinality")
+ private final boolean ignoreNullsForStringCardinality;
+
@JsonCreator
public NullValueHandlingConfig(
- @JsonProperty("useDefaultValueForNull") Boolean useDefaultValuesForNull
+ @JsonProperty("useDefaultValueForNull") Boolean useDefaultValuesForNull,
+ @JsonProperty("ignoreNullsForStringCardinality") Boolean
ignoreNullsForStringCardinality
)
{
if (useDefaultValuesForNull == null) {
@@ -40,6 +49,23 @@ public class NullValueHandlingConfig
} else {
this.useDefaultValuesForNull = useDefaultValuesForNull;
}
+ if (ignoreNullsForStringCardinality == null) {
+ this.ignoreNullsForStringCardinality =
Boolean.valueOf(System.getProperty(
+ NULL_HANDLING_DURING_STRING_CARDINALITY,
+ "false"
+ ));
+ } else {
+ if (this.useDefaultValuesForNull) {
+ this.ignoreNullsForStringCardinality = ignoreNullsForStringCardinality;
+ } else {
+ this.ignoreNullsForStringCardinality = false;
+ }
+ }
+ }
+
+ public boolean isIgnoreNullsForStringCardinality()
+ {
+ return ignoreNullsForStringCardinality;
}
public boolean isUseDefaultValuesForNull()
diff --git
a/core/src/test/java/org/apache/druid/common/config/NullHandlingTest.java
b/core/src/test/java/org/apache/druid/common/config/NullHandlingTest.java
index 36e3f36..1419c61 100644
--- a/core/src/test/java/org/apache/druid/common/config/NullHandlingTest.java
+++ b/core/src/test/java/org/apache/druid/common/config/NullHandlingTest.java
@@ -90,4 +90,15 @@ public class NullHandlingTest
{
Assert.assertNull(NullHandling.defaultValueForClass(Object.class));
}
+
+ @Test
+ public void test_ignoreNullsStrings()
+ {
+ NullHandling.initializeForTestsWithValues(false, true);
+ Assert.assertFalse(NullHandling.ignoreNullsForStringCardinality());
+
+ NullHandling.initializeForTestsWithValues(true, false);
+ Assert.assertFalse(NullHandling.ignoreNullsForStringCardinality());
+
+ }
}
diff --git a/docs/configuration/index.md b/docs/configuration/index.md
index bc4d10c..6383778 100644
--- a/docs/configuration/index.md
+++ b/docs/configuration/index.md
@@ -761,7 +761,7 @@ Prior to version 0.13.0, Druid string columns treated `''`
and `null` values as
|Property|Description|Default|
|---|---|---|
|`druid.generic.useDefaultValueForNull`|When set to `true`, `null` values will
be stored as `''` for string columns and `0` for numeric columns. Set to
`false` to store and query data in SQL compatible mode.|`true`|
-
+|`druid.generic.ignoreNullsForStringCardinality`|When set to `true`, `null`
values will be ignored for the built-in cardinality aggregator over string
columns. Set to `false` to include `null` values while estimating cardinality
of only string columns using the built-in cardinality aggregator. This setting
takes effect only when `druid.generic.useDefaultValueForNull` is set to `true`
and is ignored in SQL compatibility mode. Additionally, empty strings
(equivalent to null) are not counte [...]
This mode does have a storage size and query performance cost, see [segment
documentation](../design/segments.md#sql-compatible-null-handling) for more
details.
### HTTP Client
diff --git
a/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/types/StringCardinalityAggregatorColumnSelectorStrategy.java
b/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/types/StringCardinalityAggregatorColumnSelectorStrategy.java
index 9d49e30..cfa68d0 100644
---
a/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/types/StringCardinalityAggregatorColumnSelectorStrategy.java
+++
b/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/types/StringCardinalityAggregatorColumnSelectorStrategy.java
@@ -39,7 +39,8 @@ public class
StringCardinalityAggregatorColumnSelectorStrategy implements Cardin
// SQL standard spec does not count null values,
// Skip counting null values when we are not replacing null with default
value.
// A special value for null in case null handling is configured to use
empty string for null.
- if (NullHandling.replaceWithDefault() || s != null) {
+ // check if nulls are to be ignored
+ if ((NullHandling.replaceWithDefault() &&
!NullHandling.ignoreNullsForStringCardinality()) || s != null) {
collector.add(CardinalityAggregator.HASH_FUNCTION.hashUnencodedChars(nullToSpecial(s)).asBytes());
}
}
diff --git
a/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/vector/SingleValueStringCardinalityVectorProcessor.java
b/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/vector/SingleValueStringCardinalityVectorProcessor.java
index 080ce04..e9173d8 100644
---
a/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/vector/SingleValueStringCardinalityVectorProcessor.java
+++
b/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/vector/SingleValueStringCardinalityVectorProcessor.java
@@ -51,6 +51,7 @@ public class SingleValueStringCardinalityVectorProcessor
implements CardinalityV
final HyperLogLogCollector collector =
HyperLogLogCollector.makeCollector(buf);
+
for (int i = startRow; i < endRow; i++) {
final String value = selector.lookupName(vector[i]);
StringCardinalityAggregatorColumnSelectorStrategy.addStringToCollector(collector,
value);
@@ -74,7 +75,6 @@ public class SingleValueStringCardinalityVectorProcessor
implements CardinalityV
for (int i = 0; i < numRows; i++) {
final String s = selector.lookupName(vector[rows != null ? rows[i] :
i]);
-
if (NullHandling.replaceWithDefault() || s != null) {
final int position = positions[i] + positionOffset;
buf.limit(position +
HyperLogLogCollector.getLatestNumBytesForDenseStorage());
diff --git
a/processing/src/test/java/org/apache/druid/query/aggregation/cardinality/CardinalityAggregatorTest.java
b/processing/src/test/java/org/apache/druid/query/aggregation/cardinality/CardinalityAggregatorTest.java
index 65631c5..5d87586 100644
---
a/processing/src/test/java/org/apache/druid/query/aggregation/cardinality/CardinalityAggregatorTest.java
+++
b/processing/src/test/java/org/apache/druid/query/aggregation/cardinality/CardinalityAggregatorTest.java
@@ -381,12 +381,12 @@ public class CardinalityAggregatorTest
@Test
public void testAggregateRows()
{
+ NullHandling.initializeForTestsWithValues(null, null);
CardinalityAggregator agg = new CardinalityAggregator(
dimInfoList,
true
);
-
for (int i = 0; i < VALUES1.size(); ++i) {
aggregate(selectorList, agg);
}
@@ -397,6 +397,7 @@ public class CardinalityAggregatorTest
@Test
public void testAggregateValues()
{
+ NullHandling.initializeForTestsWithValues(null, null);
CardinalityAggregator agg = new CardinalityAggregator(
dimInfoList,
false
@@ -405,13 +406,22 @@ public class CardinalityAggregatorTest
for (int i = 0; i < VALUES1.size(); ++i) {
aggregate(selectorList, agg);
}
- Assert.assertEquals(NullHandling.replaceWithDefault() ? 7.0 : 6.0,
(Double) valueAggregatorFactory.finalizeComputation(agg.get()), 0.05);
- Assert.assertEquals(NullHandling.replaceWithDefault() ? 7L : 6L,
rowAggregatorFactoryRounded.finalizeComputation(agg.get()));
+ Assert.assertEquals(
+ NullHandling.replaceWithDefault() ? 7.0 : 6.0,
+ (Double) valueAggregatorFactory.finalizeComputation(agg.get()),
+ 0.05
+ );
+ Assert.assertEquals(
+ NullHandling.replaceWithDefault() ? 7L : 6L,
+ rowAggregatorFactoryRounded.finalizeComputation(agg.get())
+ );
+
}
@Test
public void testBufferAggregateRows()
{
+ NullHandling.initializeForTestsWithValues(null, null);
CardinalityBufferAggregator agg = new CardinalityBufferAggregator(
dimInfoList.toArray(new ColumnSelectorPlus[0]),
true
@@ -434,6 +444,7 @@ public class CardinalityAggregatorTest
@Test
public void testBufferAggregateValues()
{
+ NullHandling.initializeForTestsWithValues(null, null);
CardinalityBufferAggregator agg = new CardinalityBufferAggregator(
dimInfoList.toArray(new ColumnSelectorPlus[0]),
false
@@ -449,13 +460,21 @@ public class CardinalityAggregatorTest
for (int i = 0; i < VALUES1.size(); ++i) {
bufferAggregate(selectorList, agg, buf, pos);
}
- Assert.assertEquals(NullHandling.replaceWithDefault() ? 7.0 : 6.0,
(Double) valueAggregatorFactory.finalizeComputation(agg.get(buf, pos)), 0.05);
- Assert.assertEquals(NullHandling.replaceWithDefault() ? 7L : 6L,
rowAggregatorFactoryRounded.finalizeComputation(agg.get(buf, pos)));
+ Assert.assertEquals(
+ NullHandling.replaceWithDefault() ? 7.0 : 6.0,
+ (Double) valueAggregatorFactory.finalizeComputation(agg.get(buf, pos)),
+ 0.05
+ );
+ Assert.assertEquals(
+ NullHandling.replaceWithDefault() ? 7L : 6L,
+ rowAggregatorFactoryRounded.finalizeComputation(agg.get(buf, pos))
+ );
}
@Test
public void testCombineRows()
{
+ NullHandling.initializeForTestsWithValues(null, null);
List<DimensionSelector> selector1 = Collections.singletonList(dim1);
List<DimensionSelector> selector2 = Collections.singletonList(dim2);
List<ColumnSelectorPlus<CardinalityAggregatorColumnSelectorStrategy>>
dimInfo1 = Collections.singletonList(
@@ -501,6 +520,7 @@ public class CardinalityAggregatorTest
@Test
public void testCombineValues()
{
+ NullHandling.initializeForTestsWithValues(null, null);
List<DimensionSelector> selector1 = Collections.singletonList(dim1);
List<DimensionSelector> selector2 = Collections.singletonList(dim2);
@@ -528,10 +548,16 @@ public class CardinalityAggregatorTest
for (int i = 0; i < VALUES2.size(); ++i) {
aggregate(selector2, agg2);
}
-
- Assert.assertEquals(NullHandling.replaceWithDefault() ? 4.0 : 3.0,
(Double) valueAggregatorFactory.finalizeComputation(agg1.get()), 0.05);
- Assert.assertEquals(NullHandling.replaceWithDefault() ? 7.0 : 6.0,
(Double) valueAggregatorFactory.finalizeComputation(agg2.get()), 0.05);
-
+ Assert.assertEquals(
+ NullHandling.replaceWithDefault() ? 4.0 : 3.0,
+ (Double) valueAggregatorFactory.finalizeComputation(agg1.get()),
+ 0.05
+ );
+ Assert.assertEquals(
+ NullHandling.replaceWithDefault() ? 7.0 : 6.0,
+ (Double) valueAggregatorFactory.finalizeComputation(agg2.get()),
+ 0.05
+ );
Assert.assertEquals(
NullHandling.replaceWithDefault() ? 7.0 : 6.0,
(Double) rowAggregatorFactory.finalizeComputation(
@@ -547,6 +573,7 @@ public class CardinalityAggregatorTest
@Test
public void testAggregateRowsWithExtraction()
{
+ NullHandling.initializeForTestsWithValues(null, null);
CardinalityAggregator agg = new CardinalityAggregator(
dimInfoListWithExtraction,
true
@@ -569,6 +596,7 @@ public class CardinalityAggregatorTest
@Test
public void testAggregateValuesWithExtraction()
{
+ NullHandling.initializeForTestsWithValues(null, null);
CardinalityAggregator agg = new CardinalityAggregator(
dimInfoListWithExtraction,
false
@@ -635,4 +663,98 @@ public class CardinalityAggregatorTest
objectMapper.readValue(objectMapper.writeValueAsString(factory2),
AggregatorFactory.class)
);
}
+
+ //ignoreNullsForStringCardinality tests
+ @Test
+ public void testAggregateRowsIgnoreNulls()
+ {
+ NullHandling.initializeForTestsWithValues(null, true);
+ CardinalityAggregator agg = new CardinalityAggregator(
+ dimInfoList,
+ true
+ );
+
+ for (int i = 0; i < VALUES1.size(); ++i) {
+ aggregate(selectorList, agg);
+ }
+ Assert.assertEquals(9.0, (Double)
rowAggregatorFactory.finalizeComputation(agg.get()), 0.05);
+ Assert.assertEquals(9L,
rowAggregatorFactoryRounded.finalizeComputation(agg.get()));
+ }
+
+ @Test
+ public void testAggregateValuesIgnoreNulls()
+ {
+ NullHandling.initializeForTestsWithValues(null, true);
+ CardinalityAggregator agg = new CardinalityAggregator(
+ dimInfoList,
+ false
+ );
+
+ for (int i = 0; i < VALUES1.size(); ++i) {
+ aggregate(selectorList, agg);
+ }
+ //setting is not applied when druid.generic.useDefaultValueForNull=false
+ Assert.assertEquals(
+ NullHandling.replaceWithDefault() ? 6.0 : 6.0,
+ (Double) valueAggregatorFactory.finalizeComputation(agg.get()),
+ 0.05
+ );
+ Assert.assertEquals(
+ NullHandling.replaceWithDefault() ? 6L : 6L,
+ rowAggregatorFactoryRounded.finalizeComputation(agg.get())
+ );
+ }
+
+ @Test
+ public void testCombineValuesIgnoreNulls()
+ {
+ NullHandling.initializeForTestsWithValues(null, true);
+ List<DimensionSelector> selector1 = Collections.singletonList(dim1);
+ List<DimensionSelector> selector2 = Collections.singletonList(dim2);
+
+ List<ColumnSelectorPlus<CardinalityAggregatorColumnSelectorStrategy>>
dimInfo1 = Collections.singletonList(
+ new ColumnSelectorPlus<>(
+ dimSpec1.getDimension(),
+ dimSpec1.getOutputName(),
+ new StringCardinalityAggregatorColumnSelectorStrategy(), dim1
+ )
+ );
+ List<ColumnSelectorPlus<CardinalityAggregatorColumnSelectorStrategy>>
dimInfo2 = Collections.singletonList(
+ new ColumnSelectorPlus<>(
+ dimSpec1.getDimension(),
+ dimSpec1.getOutputName(),
+ new StringCardinalityAggregatorColumnSelectorStrategy(), dim2
+ )
+ );
+
+ CardinalityAggregator agg1 = new CardinalityAggregator(dimInfo1, false);
+ CardinalityAggregator agg2 = new CardinalityAggregator(dimInfo2, false);
+
+ for (int i = 0; i < VALUES1.size(); ++i) {
+ aggregate(selector1, agg1);
+ }
+ for (int i = 0; i < VALUES2.size(); ++i) {
+ aggregate(selector2, agg2);
+ }
+ Assert.assertEquals(
+ NullHandling.replaceWithDefault() ? 3.0 : 3.0,
+ (Double) valueAggregatorFactory.finalizeComputation(agg1.get()),
+ 0.05
+ );
+ Assert.assertEquals(
+ NullHandling.replaceWithDefault() ? 6.0 : 6.0,
+ (Double) valueAggregatorFactory.finalizeComputation(agg2.get()),
+ 0.05
+ );
+ Assert.assertEquals(
+ NullHandling.replaceWithDefault() ? 6.0 : 6.0,
+ (Double) rowAggregatorFactory.finalizeComputation(
+ rowAggregatorFactory.combine(
+ agg1.get(),
+ agg2.get()
+ )
+ ),
+ 0.05
+ );
+ }
}
diff --git a/website/.spelling b/website/.spelling
index b627fb1..ade85e5 100644
--- a/website/.spelling
+++ b/website/.spelling
@@ -694,6 +694,7 @@ doubleMeanNoNulls
doubleMin
doubleSum
druid.generic.useDefaultValueForNull
+druid.generic.ignoreNullsForStringCardinality
limitSpec
longMax
longAny
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]