This is an automated email from the ASF dual-hosted git repository. sorabh pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/drill.git
commit 99707a3e0938c15d57ccfd8dbc7c15ae4877067f Author: Venkata Jyothsna Donapati <[email protected]> AuthorDate: Tue Apr 23 12:05:49 2019 -0700 DRILL-7199: Optimize population of metadata for non-interesting columns closes #1771 --- .../base/AbstractGroupScanWithMetadata.java | 48 +++++++++++--- .../base/SimpleFileTableMetadataProvider.java | 6 ++ .../store/parquet/AbstractParquetGroupScan.java | 3 + .../store/parquet/BaseParquetMetadataProvider.java | 13 +++- .../exec/store/parquet/FilterEvaluatorUtils.java | 8 +++ .../store/parquet/ParquetTableMetadataUtils.java | 29 +++++---- .../exec/physical/base/TableMetadataProvider.java | 7 ++ .../metastore/NonInterestingColumnsMetadata.java | 74 ++++++++++++++++++++++ 8 files changed, 165 insertions(+), 23 deletions(-) diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/AbstractGroupScanWithMetadata.java b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/AbstractGroupScanWithMetadata.java index a547fb8..3be82df 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/AbstractGroupScanWithMetadata.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/AbstractGroupScanWithMetadata.java @@ -51,6 +51,7 @@ import org.apache.drill.metastore.ColumnStatistics; import org.apache.drill.metastore.ColumnStatisticsKind; import org.apache.drill.metastore.FileMetadata; import org.apache.drill.metastore.LocationProvider; +import org.apache.drill.metastore.NonInterestingColumnsMetadata; import org.apache.drill.metastore.PartitionMetadata; import org.apache.drill.metastore.TableMetadata; import org.apache.drill.metastore.TableStatisticsKind; @@ -86,6 +87,7 @@ public abstract class AbstractGroupScanWithMetadata extends AbstractFileGroupSca // partition metadata info: mixed partition values for all partition keys in the same list protected List<PartitionMetadata> partitions; + protected NonInterestingColumnsMetadata nonInterestingColumnsMetadata; protected List<SchemaPath> partitionColumns; protected LogicalExpression filter; protected List<SchemaPath> columns; @@ -115,7 +117,7 @@ public abstract class AbstractGroupScanWithMetadata extends AbstractFileGroupSca this.partitionColumns = that.partitionColumns; this.partitions = that.partitions; this.files = that.files; - + this.nonInterestingColumnsMetadata = that.nonInterestingColumnsMetadata; this.fileSet = that.fileSet == null ? null : new HashSet<>(that.fileSet); } @@ -151,18 +153,29 @@ public abstract class AbstractGroupScanWithMetadata extends AbstractFileGroupSca */ @Override public long getColumnValueCount(SchemaPath column) { - long tableRowCount = (long) TableStatisticsKind.ROW_COUNT.getValue(getTableMetadata()); + long tableRowCount, colNulls; + Long nulls; ColumnStatistics columnStats = getTableMetadata().getColumnStatistics(column); - long colNulls; + ColumnStatistics nonInterestingColStats = null; + if (columnStats == null) { + nonInterestingColStats = getNonInterestingColumnsMetadata().getColumnStatistics(column); + } + if (columnStats != null) { - Long nulls = (Long) columnStats.getStatistic(ColumnStatisticsKind.NULLS_COUNT); - colNulls = nulls != null ? nulls : Statistic.NO_COLUMN_STATS; + tableRowCount = (long) TableStatisticsKind.ROW_COUNT.getValue(getTableMetadata()); + } else if (nonInterestingColStats != null) { + tableRowCount = (long) TableStatisticsKind.ROW_COUNT.getValue(getNonInterestingColumnsMetadata()); } else { - return 0; + return 0; // returns 0 if the column doesn't exist in the table. } + + columnStats = columnStats != null ? columnStats : nonInterestingColStats; + nulls = (Long) columnStats.getStatistic(ColumnStatisticsKind.NULLS_COUNT); + colNulls = nulls != null ? nulls : Statistic.NO_COLUMN_STATS; + return Statistic.NO_COLUMN_STATS == tableRowCount - || Statistic.NO_COLUMN_STATS == colNulls - ? Statistic.NO_COLUMN_STATS : tableRowCount - colNulls; + || Statistic.NO_COLUMN_STATS == colNulls + ? Statistic.NO_COLUMN_STATS : tableRowCount - colNulls; } @Override @@ -266,6 +279,7 @@ public abstract class AbstractGroupScanWithMetadata extends AbstractFileGroupSca filteredMetadata.withTable(getTableMetadata()) .withPartitions(getNextOrEmpty(getPartitionsMetadata())) .withFiles(filesMap) + .withNonInterestingColumns(getNonInterestingColumnsMetadata()) .withMatching(false); } @@ -387,6 +401,7 @@ public abstract class AbstractGroupScanWithMetadata extends AbstractFileGroupSca .withTable(getTableMetadata()) .withPartitions(getPartitionsMetadata()) .withFiles(filesMap) + .withNonInterestingColumns(getNonInterestingColumnsMetadata()) .withMatching(matchAllMetadata) .build(); } @@ -520,6 +535,14 @@ public abstract class AbstractGroupScanWithMetadata extends AbstractFileGroupSca return partitions; } + @JsonIgnore + public NonInterestingColumnsMetadata getNonInterestingColumnsMetadata() { + if (nonInterestingColumnsMetadata == null) { + nonInterestingColumnsMetadata = metadataProvider.getNonInterestingColumnsMeta(); + } + return nonInterestingColumnsMetadata; + } + /** * This class is responsible for filtering different metadata levels. */ @@ -531,6 +554,7 @@ public abstract class AbstractGroupScanWithMetadata extends AbstractFileGroupSca protected TableMetadata tableMetadata; protected List<PartitionMetadata> partitions = Collections.emptyList(); protected Map<Path, FileMetadata> files = Collections.emptyMap(); + protected NonInterestingColumnsMetadata nonInterestingColumnsMetadata; // for the case when filtering is possible for partitions, but files count exceeds // PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD, new group scan with at least filtered partitions @@ -558,6 +582,11 @@ public abstract class AbstractGroupScanWithMetadata extends AbstractFileGroupSca return this; } + public GroupScanWithMetadataFilterer withNonInterestingColumns(NonInterestingColumnsMetadata nonInterestingColumns) { + this.nonInterestingColumnsMetadata = nonInterestingColumns; + return this; + } + public GroupScanWithMetadataFilterer withFiles(Map<Path, FileMetadata> files) { this.files = files; return this; @@ -729,6 +758,9 @@ public abstract class AbstractGroupScanWithMetadata extends AbstractFileGroupSca locationProvider.getLocation(), source.supportsFileImplicitColumns()); } + if (source.getNonInterestingColumnsMetadata() != null) { + columnsStatistics.putAll(source.getNonInterestingColumnsMetadata().getColumnsStatistics()); + } RowsMatch match = FilterEvaluatorUtils.matches(filterPredicate, columnsStatistics, (long) metadata.getStatistic(TableStatisticsKind.ROW_COUNT), metadata.getSchema(), schemaPathsInExpr); diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/SimpleFileTableMetadataProvider.java b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/SimpleFileTableMetadataProvider.java index ecf45af..15ecdd9 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/SimpleFileTableMetadataProvider.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/SimpleFileTableMetadataProvider.java @@ -26,6 +26,7 @@ import org.apache.drill.metastore.ColumnStatistics; import org.apache.drill.metastore.ColumnStatisticsImpl; import org.apache.drill.metastore.FileMetadata; import org.apache.drill.metastore.FileTableMetadata; +import org.apache.drill.metastore.NonInterestingColumnsMetadata; import org.apache.drill.metastore.PartitionMetadata; import org.apache.drill.metastore.TableMetadata; import org.apache.hadoop.fs.Path; @@ -86,6 +87,11 @@ public class SimpleFileTableMetadataProvider implements TableMetadataProvider { return null; } + @Override + public NonInterestingColumnsMetadata getNonInterestingColumnsMeta() { + return null; + } + public static class Builder implements SimpleFileTableMetadataProviderBuilder { private String tableName; private Path location; diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetGroupScan.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetGroupScan.java index 34cf354..d5a752f 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetGroupScan.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetGroupScan.java @@ -290,6 +290,7 @@ public abstract class AbstractParquetGroupScan extends AbstractGroupScanWithMeta builder.withRowGroups(rowGroupsMap) .withTable(getTableMetadata()) .withPartitions(getNextOrEmpty(getPartitionsMetadata())) + .withNonInterestingColumns(getNonInterestingColumnsMetadata()) .withFiles(filesMap) .withMatching(false); } @@ -363,6 +364,7 @@ public abstract class AbstractParquetGroupScan extends AbstractGroupScanWithMeta .withTable(getTableMetadata()) .withPartitions(getPartitionsMetadata()) .withFiles(qualifiedFiles) + .withNonInterestingColumns(getNonInterestingColumnsMetadata()) .withMatching(matchAllMetadata) .build(); } @@ -500,6 +502,7 @@ public abstract class AbstractParquetGroupScan extends AbstractGroupScanWithMeta newScan.files = files; newScan.rowGroups = rowGroups; newScan.matchAllMetadata = matchAllMetadata; + newScan.nonInterestingColumnsMetadata = nonInterestingColumnsMetadata; // since builder is used when pruning happens, entries and fileSet should be expanded if (!newScan.getFilesMetadata().isEmpty()) { newScan.entries = newScan.getFilesMetadata().keySet().stream() diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/BaseParquetMetadataProvider.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/BaseParquetMetadataProvider.java index 1877356..42b22ec 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/BaseParquetMetadataProvider.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/BaseParquetMetadataProvider.java @@ -24,6 +24,7 @@ import org.apache.drill.exec.record.metadata.MetadataUtils; import org.apache.drill.exec.record.metadata.TupleMetadata; import org.apache.drill.metastore.BaseMetadata; import org.apache.drill.metastore.ColumnStatisticsImpl; +import org.apache.drill.metastore.NonInterestingColumnsMetadata; import org.apache.drill.metastore.StatisticsKind; import org.apache.drill.metastore.TableMetadata; import org.apache.drill.metastore.TableStatisticsKind; @@ -88,6 +89,7 @@ public abstract class BaseParquetMetadataProvider implements ParquetMetadataProv private TableMetadata tableMetadata; private List<PartitionMetadata> partitions; private Map<Path, FileMetadata> files; + private NonInterestingColumnsMetadata nonInterestingColumnsMetadata; // whether metadata for row groups should be collected to create files, partitions and table metadata private final boolean collectMetadata = false; @@ -160,6 +162,7 @@ public abstract class BaseParquetMetadataProvider implements ParquetMetadataProv TableMetadata tableMetadata = getTableMetadata(); getPartitionsMetadata(); getRowGroupsMeta(); + getNonInterestingColumnsMeta(); this.tableMetadata = ParquetTableMetadataUtils.updateRowCount(tableMetadata, getRowGroupsMeta()); parquetTableMetadata = null; } @@ -178,10 +181,19 @@ public abstract class BaseParquetMetadataProvider implements ParquetMetadataProv getFilesMetadata(); getPartitionsMetadata(); getRowGroupsMeta(); + getNonInterestingColumnsMeta(); parquetTableMetadata = null; } @Override + public NonInterestingColumnsMetadata getNonInterestingColumnsMeta() { + if (nonInterestingColumnsMetadata == null) { + nonInterestingColumnsMetadata = ParquetTableMetadataUtils.getNonInterestingColumnsMeta(parquetTableMetadata); + } + return nonInterestingColumnsMetadata; + } + + @Override @SuppressWarnings("unchecked") public TableMetadata getTableMetadata() { if (tableMetadata == null) { @@ -235,7 +247,6 @@ public abstract class BaseParquetMetadataProvider implements ParquetMetadataProv new ColumnStatisticsImpl(DrillStatsTable.getEstimatedColumnStats(statsTable, column), ParquetTableMetadataUtils.getNaturalNullsFirstComparator())); } - columnsStatistics.putAll(ParquetTableMetadataUtils.populateNonInterestingColumnsStats(columnsStatistics.keySet(), parquetTableMetadata)); } tableMetadata = new FileTableMetadata(tableName, tableLocation, schema, columnsStatistics, tableStatistics, -1L, "", partitionKeys); diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/FilterEvaluatorUtils.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/FilterEvaluatorUtils.java index 708d4e4..ab8f88f 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/FilterEvaluatorUtils.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/FilterEvaluatorUtils.java @@ -21,6 +21,7 @@ import org.apache.drill.exec.record.metadata.ColumnMetadata; import org.apache.drill.exec.record.metadata.SchemaPathUtils; import org.apache.drill.exec.record.metadata.TupleMetadata; import org.apache.drill.exec.store.parquet.metadata.MetadataBase; +import org.apache.drill.metastore.NonInterestingColumnsMetadata; import org.apache.drill.metastore.RowGroupMetadata; import org.apache.drill.metastore.TableStatisticsKind; import org.apache.drill.exec.expr.FilterBuilder; @@ -62,7 +63,14 @@ public class FilterEvaluatorUtils { expr.<Set<SchemaPath>, Void, RuntimeException>accept(new FieldReferenceFinder(), null)); RowGroupMetadata rowGroupMetadata = new ArrayList<>(ParquetTableMetadataUtils.getRowGroupsMetadata(footer).values()).get(rowGroupIndex); + NonInterestingColumnsMetadata nonInterestingColumnsMetadata = ParquetTableMetadataUtils.getNonInterestingColumnsMeta(footer); Map<SchemaPath, ColumnStatistics> columnsStatistics = rowGroupMetadata.getColumnsStatistics(); + + // Add column statistics of non-interesting columns if there are any + if (nonInterestingColumnsMetadata != null) { + columnsStatistics.putAll(nonInterestingColumnsMetadata.getColumnsStatistics()); + } + columnsStatistics = ParquetTableMetadataUtils.addImplicitColumnsStatistics(columnsStatistics, schemaPathsInExpr, Collections.emptyList(), options, rowGroupMetadata.getLocation(), true); diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetTableMetadataUtils.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetTableMetadataUtils.java index 8df6585..fb0ea88 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetTableMetadataUtils.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetTableMetadataUtils.java @@ -35,6 +35,7 @@ import org.apache.drill.metastore.ColumnStatistics; import org.apache.drill.metastore.ColumnStatisticsImpl; import org.apache.drill.metastore.ColumnStatisticsKind; import org.apache.drill.metastore.FileMetadata; +import org.apache.drill.metastore.NonInterestingColumnsMetadata; import org.apache.drill.metastore.PartitionMetadata; import org.apache.drill.metastore.RowGroupMetadata; import org.apache.drill.metastore.StatisticsKind; @@ -195,7 +196,6 @@ public class ParquetTableMetadataUtils { } columnsStatistics.put(column, new ColumnStatisticsImpl(statisticsMap, statisticsList.iterator().next().getValueComparator())); } - columnsStatistics.putAll(populateNonInterestingColumnsStats(columnsStatistics.keySet(), parquetTableMetadata)); return columnsStatistics; } @@ -287,27 +287,27 @@ public class ParquetTableMetadataUtils { statistics.put(ColumnStatisticsKind.NULLS_COUNT, nulls); columnsStatistics.put(colPath, new ColumnStatisticsImpl(statistics, comparator)); } - columnsStatistics.putAll(populateNonInterestingColumnsStats(columnsStatistics.keySet(), tableMetadata)); return columnsStatistics; } /** - * Populates the non-interesting column's statistics - * @param schemaPaths columns paths which should be ignored + * Returns the non-interesting column's metadata * @param parquetTableMetadata the source of column metadata for non-interesting column's statistics - * @return returns non-interesting column statistics map + * @return returns non-interesting columns metadata */ - @SuppressWarnings("unchecked") - public static Map<SchemaPath, ColumnStatistics> populateNonInterestingColumnsStats( - Set<SchemaPath> schemaPaths, MetadataBase.ParquetTableMetadataBase parquetTableMetadata) { + public static NonInterestingColumnsMetadata getNonInterestingColumnsMeta(MetadataBase.ParquetTableMetadataBase parquetTableMetadata) { Map<SchemaPath, ColumnStatistics> columnsStatistics = new HashMap<>(); if (parquetTableMetadata instanceof Metadata_V4.ParquetTableMetadata_v4) { - ConcurrentHashMap<Metadata_V4.ColumnTypeMetadata_v4.Key, Metadata_V4.ColumnTypeMetadata_v4 > columnTypeInfoMap = - ((Metadata_V4.ParquetTableMetadata_v4) parquetTableMetadata).getColumnTypeInfoMap(); - if ( columnTypeInfoMap == null ) { return columnsStatistics; } // in some cases for runtime pruning + ConcurrentHashMap<Metadata_V4.ColumnTypeMetadata_v4.Key, Metadata_V4.ColumnTypeMetadata_v4> columnTypeInfoMap = + ((Metadata_V4.ParquetTableMetadata_v4) parquetTableMetadata).getColumnTypeInfoMap(); + + if (columnTypeInfoMap == null) { + return new NonInterestingColumnsMetadata(columnsStatistics); + } // in some cases for runtime pruning + for (Metadata_V4.ColumnTypeMetadata_v4 columnTypeMetadata : columnTypeInfoMap.values()) { - SchemaPath schemaPath = SchemaPath.getCompoundPath(columnTypeMetadata.name); - if (!schemaPaths.contains(schemaPath)) { + if (!columnTypeMetadata.isInteresting) { + SchemaPath schemaPath = SchemaPath.getCompoundPath(columnTypeMetadata.name); Map<StatisticsKind, Object> statistics = new HashMap<>(); statistics.put(ColumnStatisticsKind.NULLS_COUNT, Statistic.NO_COLUMN_STATS); PrimitiveType.PrimitiveTypeName primitiveType = columnTypeMetadata.primitiveType; @@ -316,8 +316,9 @@ public class ParquetTableMetadataUtils { columnsStatistics.put(schemaPath, new ColumnStatisticsImpl<>(statistics, comparator)); } } + return new NonInterestingColumnsMetadata(columnsStatistics); } - return columnsStatistics; + return new NonInterestingColumnsMetadata(columnsStatistics); } /** diff --git a/metastore/file-metadata/src/main/java/org/apache/drill/exec/physical/base/TableMetadataProvider.java b/metastore/file-metadata/src/main/java/org/apache/drill/exec/physical/base/TableMetadataProvider.java index 2607892..f0228d4 100644 --- a/metastore/file-metadata/src/main/java/org/apache/drill/exec/physical/base/TableMetadataProvider.java +++ b/metastore/file-metadata/src/main/java/org/apache/drill/exec/physical/base/TableMetadataProvider.java @@ -19,6 +19,7 @@ package org.apache.drill.exec.physical.base; import org.apache.drill.common.expression.SchemaPath; import org.apache.drill.metastore.FileMetadata; +import org.apache.drill.metastore.NonInterestingColumnsMetadata; import org.apache.drill.metastore.PartitionMetadata; import org.apache.drill.metastore.TableMetadata; import org.apache.hadoop.fs.Path; @@ -85,4 +86,10 @@ public interface TableMetadataProvider { * @return list of {@link FileMetadata} instances which belongs to specified partitions */ List<FileMetadata> getFilesForPartition(PartitionMetadata partition); + + /** + * Returns {@link NonInterestingColumnsMetadata} instance which provides metadata for non-interesting columns. + * @return {@link NonInterestingColumnsMetadata} instance + */ + NonInterestingColumnsMetadata getNonInterestingColumnsMeta(); } diff --git a/metastore/metastore-api/src/main/java/org/apache/drill/metastore/NonInterestingColumnsMetadata.java b/metastore/metastore-api/src/main/java/org/apache/drill/metastore/NonInterestingColumnsMetadata.java new file mode 100644 index 0000000..56e6bd1 --- /dev/null +++ b/metastore/metastore-api/src/main/java/org/apache/drill/metastore/NonInterestingColumnsMetadata.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.drill.metastore; + +import org.apache.drill.common.expression.SchemaPath; +import org.apache.drill.exec.record.metadata.ColumnMetadata; +import org.apache.drill.exec.record.metadata.TupleMetadata; +import java.util.Map; + +/** + * Represents a metadata for the non-interesting columns. Since the refresh command doesn't store the non-interesting + * columns stats in the cache file, there is a need to mark column statistics of non-interesting as unknown to + * differentiate the non-interesting columns from non-existent columns. Since the sole purpose of this class is to store + * column statistics for non-interesting columns, some methods like getSchema, getStatistic, getColumn are not applicable + * to NonInterestingColumnsMetadata. + */ +public class NonInterestingColumnsMetadata implements BaseMetadata { + private final Map<SchemaPath, ColumnStatistics> columnsStatistics; + + public NonInterestingColumnsMetadata( + Map<SchemaPath, ColumnStatistics> columnsStatistics) { + this.columnsStatistics = columnsStatistics; + } + + @Override + public Map<SchemaPath, ColumnStatistics> getColumnsStatistics() { + return columnsStatistics; + } + + @Override + public ColumnStatistics getColumnStatistics(SchemaPath columnName) { + return columnsStatistics.get(columnName); + } + + @Override + public TupleMetadata getSchema() { + return null; + } + + @Override + public Object getStatistic(StatisticsKind statisticsKind) { + return null; + } + + @Override + public boolean containsExactStatistics(StatisticsKind statisticsKind) { + return false; + } + + @Override + public Object getStatisticsForColumn(SchemaPath columnName, StatisticsKind statisticsKind) { + return columnsStatistics.get(columnName).getStatistic(statisticsKind); + } + + @Override + public ColumnMetadata getColumn(SchemaPath name) { + return null; + } +}
