This is an automated email from the ASF dual-hosted git repository.
gian pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/druid.git
The following commit(s) were added to refs/heads/master by this push:
new dae53ae adjust topn heap operation when string is dictionary encoded,
but not uniquely (#12291)
dae53ae is described below
commit dae53ae36a357919698cb8bb3e90934ee64b9fcf
Author: Clint Wylie <[email protected]>
AuthorDate: Tue Mar 8 14:32:40 2022 -0800
adjust topn heap operation when string is dictionary encoded, but not
uniquely (#12291)
* add topn heap optimization when string is dictionary encoded, but not
uniquely
* use array instead
* is same
* fix javadoc
* fix
* Update StringTopNColumnAggregatesProcessor.java
---
.../types/StringTopNColumnAggregatesProcessor.java | 34 +++++++++++++---------
1 file changed, 20 insertions(+), 14 deletions(-)
diff --git
a/processing/src/main/java/org/apache/druid/query/topn/types/StringTopNColumnAggregatesProcessor.java
b/processing/src/main/java/org/apache/druid/query/topn/types/StringTopNColumnAggregatesProcessor.java
index 36f3800..d10221b 100644
---
a/processing/src/main/java/org/apache/druid/query/topn/types/StringTopNColumnAggregatesProcessor.java
+++
b/processing/src/main/java/org/apache/druid/query/topn/types/StringTopNColumnAggregatesProcessor.java
@@ -52,9 +52,8 @@ public class StringTopNColumnAggregatesProcessor implements
TopNColumnAggregates
@Override
public int getCardinality(DimensionSelector selector)
{
- // only report the underlying selector cardinality if the column the
selector is for is dictionary encoded, and
- // the dictionary values are unique, that is they have a 1:1 mapping
between dictionaryId and column value
- if
(capabilities.isDictionaryEncoded().and(capabilities.areDictionaryValuesUnique()).isTrue())
{
+ // only report the underlying selector cardinality if the column the
selector is for is dictionary encoded
+ if (capabilities.isDictionaryEncoded().isTrue()) {
return selector.getValueCardinality();
}
return DimensionDictionarySelector.CARDINALITY_UNKNOWN;
@@ -117,17 +116,11 @@ public class StringTopNColumnAggregatesProcessor
implements TopNColumnAggregates
)
{
final boolean notUnknown = selector.getValueCardinality() !=
DimensionDictionarySelector.CARDINALITY_UNKNOWN;
- final boolean unique =
capabilities.isDictionaryEncoded().and(capabilities.areDictionaryValuesUnique()).isTrue();
- // we must know cardinality to use array based aggregation
- // we check for uniquely dictionary encoded values because non-unique
(meaning dictionary ids do not have a 1:1
- // relation with values) negates many of the benefits of array aggregation:
- // - if different dictionary ids map to the same value but dictionary ids
are unique to that value (*:1), then
- // array aggregation will be correct but will still have to potentially
perform many map lookups and lose the
- // performance benefit array aggregation is trying to provide
- // - in cases where the same dictionary ids map to different values (1:*
or *:*), results can be entirely
- // incorrect since an aggregator for a different value might be chosen
from the array based on the re-used
- // dictionary id
- if (notUnknown && unique) {
+ final boolean hasDictionary = capabilities.isDictionaryEncoded().isTrue();
+ // we must know cardinality to use array based aggregation. in cases where
the same dictionary ids map to different
+ // values (1:* or *:*), results can be entirely incorrect since an
aggregator for a different value might be
+ // chosen from the array based on the re-used dictionary id
+ if (notUnknown && hasDictionary) {
return scanAndAggregateWithCardinalityKnown(query, cursor, selector,
rowSelector);
} else {
return scanAndAggregateWithCardinalityUnknown(query, cursor, selector);
@@ -140,6 +133,11 @@ public class StringTopNColumnAggregatesProcessor
implements TopNColumnAggregates
this.aggregatesStore = new HashMap<>();
}
+ /**
+ * scan and aggregate when column is dictionary encoded and value
cardinality is known up front, so values are
+ * aggregated into an array position specified by the dictionaryid, which if
not already present, are translated
+ * into the key and fetched (or created if they key hasn't been encountered)
from the {@link #aggregatesStore}
+ */
private long scanAndAggregateWithCardinalityKnown(
TopNQuery query,
Cursor cursor,
@@ -172,6 +170,14 @@ public class StringTopNColumnAggregatesProcessor
implements TopNColumnAggregates
return processedRows;
}
+ /**
+ * this method is to allow scan and aggregate when values are not dictionary
encoded
+ * (e.g. {@link DimensionSelector#nameLookupPossibleInAdvance()} is false
and/or when
+ * {@link ColumnCapabilities#isDictionaryEncoded()} is false). This mode
also uses hash table aggregation, storing
+ * results in {@link #aggregatesStore}, and must call {@link
DimensionSelector#lookupName(int)} for every row which
+ * is processed and cannot cache lookups, or use the dictionary id in any
way other than to lookup the current row
+ * value.
+ */
private long scanAndAggregateWithCardinalityUnknown(
TopNQuery query,
Cursor cursor,
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]