[druid] branch master updated: adjust topn heap operation when string is dictionary encoded, but not uniquely (#12291)

gian Tue, 08 Mar 2022 14:32:51 -0800

This is an automated email from the ASF dual-hosted git repository.

gian pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/druid.git



The following commit(s) were added to refs/heads/master by this push:
     new dae53ae  adjust topn heap operation when string is dictionary encoded, 
but not uniquely (#12291)
dae53ae is described below

commit dae53ae36a357919698cb8bb3e90934ee64b9fcf
Author: Clint Wylie <[email protected]>
AuthorDate: Tue Mar 8 14:32:40 2022 -0800

    adjust topn heap operation when string is dictionary encoded, but not 
uniquely (#12291)
    
    * add topn heap optimization when string is dictionary encoded, but not 
uniquely
    
    * use array instead
    
    * is same
    
    * fix javadoc
    
    * fix
    
    * Update StringTopNColumnAggregatesProcessor.java
---
 .../types/StringTopNColumnAggregatesProcessor.java | 34 +++++++++++++---------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git 
a/processing/src/main/java/org/apache/druid/query/topn/types/StringTopNColumnAggregatesProcessor.java
 
b/processing/src/main/java/org/apache/druid/query/topn/types/StringTopNColumnAggregatesProcessor.java
index 36f3800..d10221b 100644
--- 
a/processing/src/main/java/org/apache/druid/query/topn/types/StringTopNColumnAggregatesProcessor.java
+++ 
b/processing/src/main/java/org/apache/druid/query/topn/types/StringTopNColumnAggregatesProcessor.java
@@ -52,9 +52,8 @@ public class StringTopNColumnAggregatesProcessor implements 
TopNColumnAggregates
   @Override
   public int getCardinality(DimensionSelector selector)
   {
-    // only report the underlying selector cardinality if the column the 
selector is for is dictionary encoded, and
-    // the dictionary values are unique, that is they have a 1:1 mapping 
between dictionaryId and column value
-    if 
(capabilities.isDictionaryEncoded().and(capabilities.areDictionaryValuesUnique()).isTrue())
 {
+    // only report the underlying selector cardinality if the column the 
selector is for is dictionary encoded
+    if (capabilities.isDictionaryEncoded().isTrue()) {
       return selector.getValueCardinality();
     }
     return DimensionDictionarySelector.CARDINALITY_UNKNOWN;
@@ -117,17 +116,11 @@ public class StringTopNColumnAggregatesProcessor 
implements TopNColumnAggregates
   )
   {
     final boolean notUnknown = selector.getValueCardinality() != 
DimensionDictionarySelector.CARDINALITY_UNKNOWN;
-    final boolean unique = 
capabilities.isDictionaryEncoded().and(capabilities.areDictionaryValuesUnique()).isTrue();
-    // we must know cardinality to use array based aggregation
-    // we check for uniquely dictionary encoded values because non-unique 
(meaning dictionary ids do not have a 1:1
-    // relation with values) negates many of the benefits of array aggregation:
-    // - if different dictionary ids map to the same value but dictionary ids 
are unique to that value (*:1), then
-    //   array aggregation will be correct but will still have to potentially 
perform many map lookups and lose the
-    //   performance benefit array aggregation is trying to provide
-    // - in cases where the same dictionary ids map to different values (1:* 
or *:*), results can be entirely
-    //   incorrect since an aggregator for a different value might be chosen 
from the array based on the re-used
-    //   dictionary id
-    if (notUnknown && unique) {
+    final boolean hasDictionary = capabilities.isDictionaryEncoded().isTrue();
+    // we must know cardinality to use array based aggregation. in cases where 
the same dictionary ids map to different
+    // values (1:* or *:*), results can be entirely incorrect since an 
aggregator for a different value might be
+    // chosen from the array based on the re-used dictionary id
+    if (notUnknown && hasDictionary) {
       return scanAndAggregateWithCardinalityKnown(query, cursor, selector, 
rowSelector);
     } else {
       return scanAndAggregateWithCardinalityUnknown(query, cursor, selector);
@@ -140,6 +133,11 @@ public class StringTopNColumnAggregatesProcessor 
implements TopNColumnAggregates
     this.aggregatesStore = new HashMap<>();
   }
 
+  /**
+   * scan and aggregate when column is dictionary encoded and value 
cardinality is known up front, so values are
+   * aggregated into an array position specified by the dictionaryid, which if 
not already present, are translated
+   * into the key and fetched (or created if they key hasn't been encountered) 
from the {@link #aggregatesStore}
+   */
   private long scanAndAggregateWithCardinalityKnown(
       TopNQuery query,
       Cursor cursor,
@@ -172,6 +170,14 @@ public class StringTopNColumnAggregatesProcessor 
implements TopNColumnAggregates
     return processedRows;
   }
 
+  /**
+   * this method is to allow scan and aggregate when values are not dictionary 
encoded
+   * (e.g. {@link DimensionSelector#nameLookupPossibleInAdvance()} is false 
and/or when
+   * {@link ColumnCapabilities#isDictionaryEncoded()} is false). This mode 
also uses hash table aggregation, storing
+   * results in {@link #aggregatesStore}, and must call {@link 
DimensionSelector#lookupName(int)} for every row which
+   * is processed and cannot cache lookups, or use the dictionary id in any 
way other than to lookup the current row
+   * value.
+   */
   private long scanAndAggregateWithCardinalityUnknown(
       TopNQuery query,
       Cursor cursor,

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[druid] branch master updated: adjust topn heap operation when string is dictionary encoded, but not uniquely (#12291)

Reply via email to