This is an automated email from the ASF dual-hosted git repository. cwylie pushed a commit to branch 0.15.1-incubating in repository https://gitbox.apache.org/repos/asf/incubator-druid.git
commit 21d3600e61ccbb052e1618d1ac4f6be53c6687dd Author: Gian Merlino <[email protected]> AuthorDate: Fri Jul 5 13:17:05 2019 -0700 Fix ExpressionVirtualColumn capabilities; fix groupBy's improper uses of StorageAdapter#getColumnCapabilities. (#8013) * GroupBy: Fix improper uses of StorageAdapter#getColumnCapabilities. 1) A usage in "isArrayAggregateApplicable" that would potentially incorrectly use array-based aggregation on a virtual column that shadows a real column. 2) A usage in "process" that would potentially use the more expensive multi-value aggregation path on a singly-valued virtual column. (No correctness issue, but a performance issue.) * Add addl javadoc. * ExpressionVirtualColumn: Set multi-value flag. --- .../epinephelinae/GroupByQueryEngineV2.java | 55 +++++++++++++++------- .../org/apache/druid/segment/StorageAdapter.java | 5 ++ .../segment/virtual/ExpressionVirtualColumn.java | 5 +- 3 files changed, 48 insertions(+), 17 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java index e3c609e..7594b90 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java @@ -48,11 +48,13 @@ import org.apache.druid.query.groupby.epinephelinae.column.LongGroupByColumnSele import org.apache.druid.query.groupby.epinephelinae.column.NullableValueGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.epinephelinae.column.StringGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.strategy.GroupByStrategyV2; +import org.apache.druid.segment.ColumnSelectorFactory; import org.apache.druid.segment.ColumnValueSelector; import org.apache.druid.segment.Cursor; import org.apache.druid.segment.DimensionHandlerUtils; import org.apache.druid.segment.DimensionSelector; import org.apache.druid.segment.StorageAdapter; +import org.apache.druid.segment.VirtualColumns; import org.apache.druid.segment.column.ColumnCapabilities; import org.apache.druid.segment.column.ValueType; import org.apache.druid.segment.data.IndexedInts; @@ -115,14 +117,6 @@ public class GroupByQueryEngineV2 null ); - final boolean allSingleValueDims = query - .getDimensions() - .stream() - .allMatch(dimension -> { - final ColumnCapabilities columnCapabilities = storageAdapter.getColumnCapabilities(dimension.getDimension()); - return columnCapabilities != null && !columnCapabilities.hasMultipleValues(); - }); - final ResourceHolder<ByteBuffer> bufferHolder = intermediateResultsBufferPool.take(); final String fudgeTimestampString = NullHandling.emptyToNullIfNeeded( @@ -140,18 +134,38 @@ public class GroupByQueryEngineV2 @Override public GroupByEngineIterator make() { + final ColumnSelectorFactory columnSelectorFactory = cursor.getColumnSelectorFactory(); + final boolean allSingleValueDims = query + .getDimensions() + .stream() + .allMatch(dimension -> { + final ColumnCapabilities columnCapabilities = columnSelectorFactory.getColumnCapabilities( + dimension.getDimension() + ); + return columnCapabilities != null && !columnCapabilities.hasMultipleValues(); + }); + ColumnSelectorPlus<GroupByColumnSelectorStrategy>[] selectorPlus = DimensionHandlerUtils .createColumnSelectorPluses( STRATEGY_FACTORY, query.getDimensions(), - cursor.getColumnSelectorFactory() + columnSelectorFactory ); GroupByColumnSelectorPlus[] dims = createGroupBySelectorPlus(selectorPlus); final ByteBuffer buffer = bufferHolder.get(); - // Check array-based aggregation is applicable - if (isArrayAggregateApplicable(querySpecificConfig, query, dims, storageAdapter, buffer)) { + // Check if array-based aggregation is applicable + final boolean useArrayAggregation = isArrayAggregateApplicable( + querySpecificConfig, + query, + dims, + storageAdapter, + query.getVirtualColumns(), + buffer + ); + + if (useArrayAggregation) { return new ArrayAggregateIterator( query, querySpecificConfig, @@ -191,6 +205,7 @@ public class GroupByQueryEngineV2 GroupByQuery query, GroupByColumnSelectorPlus[] dims, StorageAdapter storageAdapter, + VirtualColumns virtualColumns, ByteBuffer buffer ) { @@ -206,11 +221,19 @@ public class GroupByQueryEngineV2 columnCapabilities = null; cardinality = 1; } else if (dims.length == 1) { + // Only real columns can use array-based aggregation, since virtual columns cannot currently report their + // cardinality. We need to check if a virtual column exists with the same name, since virtual columns can shadow + // real columns, and we might miss that since we're going directly to the StorageAdapter (which only knows about + // real columns). + if (virtualColumns.exists(dims[0].getName())) { + return false; + } + columnCapabilities = storageAdapter.getColumnCapabilities(dims[0].getName()); cardinality = storageAdapter.getDimensionCardinality(dims[0].getName()); } else { - columnCapabilities = null; - cardinality = -1; // ArrayAggregateIterator is not available + // Cannot use array-based aggregation with more than one dimension. + return false; } // Choose array-based aggregation if the grouping key is a single string dimension of a @@ -225,11 +248,11 @@ public class GroupByQueryEngineV2 aggregatorFactories ); - // Check that all keys and aggregated values can be contained the buffer + // Check that all keys and aggregated values can be contained in the buffer return requiredBufferCapacity <= buffer.capacity(); + } else { + return false; } - - return false; } private static class GroupByStrategyFactory implements ColumnSelectorStrategyFactory<GroupByColumnSelectorStrategy> diff --git a/processing/src/main/java/org/apache/druid/segment/StorageAdapter.java b/processing/src/main/java/org/apache/druid/segment/StorageAdapter.java index 3319d09..1dfc00e 100644 --- a/processing/src/main/java/org/apache/druid/segment/StorageAdapter.java +++ b/processing/src/main/java/org/apache/druid/segment/StorageAdapter.java @@ -56,6 +56,11 @@ public interface StorageAdapter extends CursorFactory * the column does exist but the capabilities are unknown. The latter is possible with dynamically discovered * columns. * + * Note that StorageAdapters are representations of "real" segments, so they are not aware of any virtual columns + * that may be involved in a query. In general, query engines should instead use the method + * {@link ColumnSelectorFactory#getColumnCapabilities(String)}, which returns capabilities for virtual columns as + * well. + * * @param column column name * * @return capabilities, or null diff --git a/processing/src/main/java/org/apache/druid/segment/virtual/ExpressionVirtualColumn.java b/processing/src/main/java/org/apache/druid/segment/virtual/ExpressionVirtualColumn.java index 45a25c4..f8988d6 100644 --- a/processing/src/main/java/org/apache/druid/segment/virtual/ExpressionVirtualColumn.java +++ b/processing/src/main/java/org/apache/druid/segment/virtual/ExpressionVirtualColumn.java @@ -103,7 +103,10 @@ public class ExpressionVirtualColumn implements VirtualColumn @Override public ColumnCapabilities capabilities(String columnName) { - return new ColumnCapabilitiesImpl().setType(outputType); + // Note: Ideally we would only "setHasMultipleValues(true)" if the expression in question could potentially return + // multiple values. However, we don't currently have a good way of determining this, so to be safe we always + // set the flag. + return new ColumnCapabilitiesImpl().setType(outputType).setHasMultipleValues(true); } @Override --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
