This is an automated email from the ASF dual-hosted git repository. agozhiy pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/drill.git
commit 7951e4ca6ec2948df5abdf23a3206be9f53f0c25 Author: Volodymyr Vysotskyi <[email protected]> AuthorDate: Fri Feb 14 17:51:52 2020 +0200 DRILL-7565: ANALYZE TABLE ... REFRESH METADATA does not work for empty Parquet files - Fixed ConvertMetadataAggregateToDirectScanRule rule to distinguish array columns correctly and proceed using other parquet metadata if such columns are found. - Added new implicit column which signalizes whether the empty result is obtained during collecting metadata and helps to distinguish real data results from metadata results. - Updated scan to return row with metadata if the above implicit column is present. - Added unit tests for checking the correctness of both optional and required columns from empty files. closes #1985 --- .../java/org/apache/drill/exec/ExecConstants.java | 21 +- .../drill/exec/metastore/ColumnNamesOptions.java | 19 +- .../metastore/analyze/AnalyzeFileInfoProvider.java | 1 + .../analyze/MetadataAggregateContext.java | 24 +- .../apache/drill/exec/physical/impl/ScanBatch.java | 16 ++ .../impl/metadata/MetadataAggregateHelper.java | 40 ++- .../impl/metadata/MetadataControllerBatch.java | 2 +- .../ConvertMetadataAggregateToDirectScanRule.java | 43 +-- .../sql/handlers/MetastoreAnalyzeTableHandler.java | 16 +- .../exec/server/options/SystemOptionManager.java | 1 + .../apache/drill/exec/store/ColumnExplorer.java | 7 +- .../store/parquet/BaseParquetMetadataProvider.java | 2 +- .../java-exec/src/main/resources/drill-module.conf | 1 + .../drill/exec/sql/TestMetastoreCommands.java | 310 +++++++++++++++++++++ 14 files changed, 436 insertions(+), 67 deletions(-) diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/ExecConstants.java b/exec/java-exec/src/main/java/org/apache/drill/exec/ExecConstants.java index 5938ca1..1276384 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/ExecConstants.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/ExecConstants.java @@ -487,31 +487,36 @@ public final class ExecConstants { public static final OptionValidator IMPLICIT_SUFFIX_COLUMN_LABEL_VALIDATOR = new StringValidator(IMPLICIT_SUFFIX_COLUMN_LABEL, new OptionDescription("Available as of Drill 1.10. Sets the implicit column name for the suffix column.")); public static final String IMPLICIT_FQN_COLUMN_LABEL = "drill.exec.storage.implicit.fqn.column.label"; - public static final OptionValidator IMPLICIT_FQN_COLUMN_LABEL_VALIDATOR = new StringValidator(IMPLICIT_FQN_COLUMN_LABEL, + public static final StringValidator IMPLICIT_FQN_COLUMN_LABEL_VALIDATOR = new StringValidator(IMPLICIT_FQN_COLUMN_LABEL, new OptionDescription("Available as of Drill 1.10. Sets the implicit column name for the fqn column.")); public static final String IMPLICIT_FILEPATH_COLUMN_LABEL = "drill.exec.storage.implicit.filepath.column.label"; - public static final OptionValidator IMPLICIT_FILEPATH_COLUMN_LABEL_VALIDATOR = new StringValidator(IMPLICIT_FILEPATH_COLUMN_LABEL, + public static final StringValidator IMPLICIT_FILEPATH_COLUMN_LABEL_VALIDATOR = new StringValidator(IMPLICIT_FILEPATH_COLUMN_LABEL, new OptionDescription("Available as of Drill 1.10. Sets the implicit column name for the filepath column.")); public static final String IMPLICIT_ROW_GROUP_INDEX_COLUMN_LABEL = "drill.exec.storage.implicit.row_group_index.column.label"; - public static final OptionValidator IMPLICIT_ROW_GROUP_INDEX_COLUMN_LABEL_VALIDATOR = new StringValidator(IMPLICIT_ROW_GROUP_INDEX_COLUMN_LABEL, + public static final StringValidator IMPLICIT_ROW_GROUP_INDEX_COLUMN_LABEL_VALIDATOR = new StringValidator(IMPLICIT_ROW_GROUP_INDEX_COLUMN_LABEL, new OptionDescription("Available as of Drill 1.17. Sets the implicit column name for the row group index (rgi) column. " + "For internal usage when producing Metastore analyze.")); public static final String IMPLICIT_ROW_GROUP_START_COLUMN_LABEL = "drill.exec.storage.implicit.row_group_start.column.label"; - public static final OptionValidator IMPLICIT_ROW_GROUP_START_COLUMN_LABEL_VALIDATOR = new StringValidator(IMPLICIT_ROW_GROUP_START_COLUMN_LABEL, + public static final StringValidator IMPLICIT_ROW_GROUP_START_COLUMN_LABEL_VALIDATOR = new StringValidator(IMPLICIT_ROW_GROUP_START_COLUMN_LABEL, new OptionDescription("Available as of Drill 1.17. Sets the implicit column name for the row group start (rgs) column. " + "For internal usage when producing Metastore analyze.")); public static final String IMPLICIT_ROW_GROUP_LENGTH_COLUMN_LABEL = "drill.exec.storage.implicit.row_group_length.column.label"; - public static final OptionValidator IMPLICIT_ROW_GROUP_LENGTH_COLUMN_LABEL_VALIDATOR = new StringValidator(IMPLICIT_ROW_GROUP_LENGTH_COLUMN_LABEL, + public static final StringValidator IMPLICIT_ROW_GROUP_LENGTH_COLUMN_LABEL_VALIDATOR = new StringValidator(IMPLICIT_ROW_GROUP_LENGTH_COLUMN_LABEL, new OptionDescription("Available as of Drill 1.17. Sets the implicit column name for the row group length (rgl) column. " + "For internal usage when producing Metastore analyze.")); public static final String IMPLICIT_LAST_MODIFIED_TIME_COLUMN_LABEL = "drill.exec.storage.implicit.last_modified_time.column.label"; - public static final OptionValidator IMPLICIT_LAST_MODIFIED_TIME_COLUMN_LABEL_VALIDATOR = new StringValidator(IMPLICIT_LAST_MODIFIED_TIME_COLUMN_LABEL, + public static final StringValidator IMPLICIT_LAST_MODIFIED_TIME_COLUMN_LABEL_VALIDATOR = new StringValidator(IMPLICIT_LAST_MODIFIED_TIME_COLUMN_LABEL, new OptionDescription("Available as of Drill 1.17. Sets the implicit column name for the lastModifiedTime column. " + "For internal usage when producing Metastore analyze.")); + public static final String IMPLICIT_PROJECT_METADATA_COLUMN_LABEL = "drill.exec.storage.implicit.project_metadata.column.label"; + public static final StringValidator IMPLICIT_PROJECT_METADATA_COLUMN_LABEL_VALIDATOR = new StringValidator(IMPLICIT_PROJECT_METADATA_COLUMN_LABEL, + new OptionDescription("Available as of Drill 1.18. Sets the implicit column name for the $project_metadata$ column. " + + "For internal usage when producing Metastore analyze.")); + public static final String JSON_READ_NUMBERS_AS_DOUBLE = "store.json.read_numbers_as_double"; public static final BooleanValidator JSON_READ_NUMBERS_AS_DOUBLE_VALIDATOR = new BooleanValidator(JSON_READ_NUMBERS_AS_DOUBLE, new OptionDescription("Reads numbers with or without a decimal point as DOUBLE. Prevents schema change errors.")); @@ -1109,14 +1114,14 @@ public final class ExecConstants { */ public static final String METASTORE_USE_SCHEMA_METADATA = "metastore.metadata.use_schema"; public static final BooleanValidator METASTORE_USE_SCHEMA_METADATA_VALIDATOR = new BooleanValidator(METASTORE_USE_SCHEMA_METADATA, - new OptionDescription("Enables schema usage, stored to the Metastore. Default is false. (Drill 1.17+)")); + new OptionDescription("Enables schema usage, stored to the Metastore. Default is true. (Drill 1.17+)")); /** * Option for enabling statistics usage, stored in the Metastore, at the planning stage. */ public static final String METASTORE_USE_STATISTICS_METADATA = "metastore.metadata.use_statistics"; public static final BooleanValidator METASTORE_USE_STATISTICS_METADATA_VALIDATOR = new BooleanValidator(METASTORE_USE_STATISTICS_METADATA, - new OptionDescription("Enables statistics usage, stored in the Metastore, at the planning stage. Default is false. (Drill 1.17+)")); + new OptionDescription("Enables statistics usage, stored in the Metastore, at the planning stage. Default is true. (Drill 1.17+)")); /** * Option for collecting schema and / or column statistics for every table after CTAS and CTTAS execution. diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/metastore/ColumnNamesOptions.java b/exec/java-exec/src/main/java/org/apache/drill/exec/metastore/ColumnNamesOptions.java index 0b9faca..7a23c12 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/metastore/ColumnNamesOptions.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/metastore/ColumnNamesOptions.java @@ -32,14 +32,16 @@ public class ColumnNamesOptions { private final String rowGroupStart; private final String rowGroupLength; private final String lastModifiedTime; + private final String projectMetadataColumn; public ColumnNamesOptions(OptionManager optionManager) { - this.fullyQualifiedName = optionManager.getOption(ExecConstants.IMPLICIT_FQN_COLUMN_LABEL).string_val; - this.partitionColumnNameLabel = optionManager.getOption(ExecConstants.FILESYSTEM_PARTITION_COLUMN_LABEL).string_val; - this.rowGroupIndex = optionManager.getOption(ExecConstants.IMPLICIT_ROW_GROUP_INDEX_COLUMN_LABEL).string_val; - this.rowGroupStart = optionManager.getOption(ExecConstants.IMPLICIT_ROW_GROUP_START_COLUMN_LABEL).string_val; - this.rowGroupLength = optionManager.getOption(ExecConstants.IMPLICIT_ROW_GROUP_LENGTH_COLUMN_LABEL).string_val; - this.lastModifiedTime = optionManager.getOption(ExecConstants.IMPLICIT_LAST_MODIFIED_TIME_COLUMN_LABEL).string_val; + this.fullyQualifiedName = optionManager.getOption(ExecConstants.IMPLICIT_FQN_COLUMN_LABEL_VALIDATOR); + this.partitionColumnNameLabel = optionManager.getOption(ExecConstants.FILESYSTEM_PARTITION_COLUMN_LABEL_VALIDATOR); + this.rowGroupIndex = optionManager.getOption(ExecConstants.IMPLICIT_ROW_GROUP_INDEX_COLUMN_LABEL_VALIDATOR); + this.rowGroupStart = optionManager.getOption(ExecConstants.IMPLICIT_ROW_GROUP_START_COLUMN_LABEL_VALIDATOR); + this.rowGroupLength = optionManager.getOption(ExecConstants.IMPLICIT_ROW_GROUP_LENGTH_COLUMN_LABEL_VALIDATOR); + this.lastModifiedTime = optionManager.getOption(ExecConstants.IMPLICIT_LAST_MODIFIED_TIME_COLUMN_LABEL_VALIDATOR); + this.projectMetadataColumn = optionManager.getOption(ExecConstants.IMPLICIT_PROJECT_METADATA_COLUMN_LABEL_VALIDATOR); } public String partitionColumnNameLabel() { @@ -66,6 +68,10 @@ public class ColumnNamesOptions { return lastModifiedTime; } + public String projectMetadataColumn() { + return projectMetadataColumn; + } + @Override public String toString() { return new StringJoiner(", ", ColumnNamesOptions.class.getSimpleName() + "[", "]") @@ -75,6 +81,7 @@ public class ColumnNamesOptions { .add("rowGroupStart='" + rowGroupStart + "'") .add("rowGroupLength='" + rowGroupLength + "'") .add("lastModifiedTime='" + lastModifiedTime + "'") + .add("projectMetadataColumn='" + projectMetadataColumn + "'") .toString(); } } diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/metastore/analyze/AnalyzeFileInfoProvider.java b/exec/java-exec/src/main/java/org/apache/drill/exec/metastore/analyze/AnalyzeFileInfoProvider.java index a4bf0ad..1d2f2db 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/metastore/analyze/AnalyzeFileInfoProvider.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/metastore/analyze/AnalyzeFileInfoProvider.java @@ -64,6 +64,7 @@ public abstract class AnalyzeFileInfoProvider implements AnalyzeInfoProvider { List<SchemaPath> projectionList = new ArrayList<>(getSegmentColumns(table, columnNamesOptions)); projectionList.add(SchemaPath.getSimplePath(columnNamesOptions.fullyQualifiedName())); projectionList.add(SchemaPath.getSimplePath(columnNamesOptions.lastModifiedTime())); + projectionList.add(SchemaPath.getSimplePath(columnNamesOptions.projectMetadataColumn())); return Collections.unmodifiableList(projectionList); } diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/metastore/analyze/MetadataAggregateContext.java b/exec/java-exec/src/main/java/org/apache/drill/exec/metastore/analyze/MetadataAggregateContext.java index 9108345..99db025 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/metastore/analyze/MetadataAggregateContext.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/metastore/analyze/MetadataAggregateContext.java @@ -35,7 +35,11 @@ import java.util.StringJoiner; public class MetadataAggregateContext { private final List<NamedExpression> groupByExpressions; private final List<SchemaPath> interestingColumns; - private final List<SchemaPath> excludedColumns; + + /** + * List of columns which do not belong to table schema, but used to pass some metadata information like file location, row group index, etc. + */ + private final List<SchemaPath> metadataColumns; private final boolean createNewAggregations; private final MetadataType metadataLevel; @@ -43,7 +47,7 @@ public class MetadataAggregateContext { this.groupByExpressions = builder.groupByExpressions; this.interestingColumns = builder.interestingColumns; this.createNewAggregations = builder.createNewAggregations; - this.excludedColumns = builder.excludedColumns; + this.metadataColumns = builder.metadataColumns; this.metadataLevel = builder.metadataLevel; } @@ -63,8 +67,8 @@ public class MetadataAggregateContext { } @JsonProperty - public List<SchemaPath> excludedColumns() { - return excludedColumns; + public List<SchemaPath> metadataColumns() { + return metadataColumns; } @JsonProperty @@ -78,7 +82,7 @@ public class MetadataAggregateContext { .add("groupByExpressions=" + groupByExpressions) .add("interestingColumns=" + interestingColumns) .add("createNewAggregations=" + createNewAggregations) - .add("excludedColumns=" + excludedColumns) + .add("excludedColumns=" + metadataColumns) .toString(); } @@ -91,7 +95,7 @@ public class MetadataAggregateContext { .groupByExpressions(groupByExpressions) .interestingColumns(interestingColumns) .createNewAggregations(createNewAggregations) - .excludedColumns(excludedColumns) + .metadataColumns(metadataColumns) .metadataLevel(metadataLevel); } @@ -101,7 +105,7 @@ public class MetadataAggregateContext { private List<SchemaPath> interestingColumns; private Boolean createNewAggregations; private MetadataType metadataLevel; - private List<SchemaPath> excludedColumns; + private List<SchemaPath> metadataColumns; public MetadataAggregateContextBuilder groupByExpressions(List<NamedExpression> groupByExpressions) { this.groupByExpressions = groupByExpressions; @@ -123,15 +127,15 @@ public class MetadataAggregateContext { return this; } - public MetadataAggregateContextBuilder excludedColumns(List<SchemaPath> excludedColumns) { - this.excludedColumns = excludedColumns; + public MetadataAggregateContextBuilder metadataColumns(List<SchemaPath> metadataColumns) { + this.metadataColumns = metadataColumns; return this; } public MetadataAggregateContext build() { Objects.requireNonNull(groupByExpressions, "groupByExpressions were not set"); Objects.requireNonNull(createNewAggregations, "createNewAggregations was not set"); - Objects.requireNonNull(excludedColumns, "excludedColumns were not set"); + Objects.requireNonNull(metadataColumns, "metadataColumns were not set"); Objects.requireNonNull(metadataLevel, "metadataLevel was not set"); return new MetadataAggregateContext(this); } diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/ScanBatch.java b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/ScanBatch.java index e159ec6..d1d1c48 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/ScanBatch.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/ScanBatch.java @@ -29,6 +29,7 @@ import org.apache.drill.common.expression.SchemaPath; import org.apache.drill.common.map.CaseInsensitiveMap; import org.apache.drill.common.types.TypeProtos.MinorType; import org.apache.drill.common.types.Types; +import org.apache.drill.exec.ExecConstants; import org.apache.drill.exec.exception.OutOfMemoryException; import org.apache.drill.exec.exception.SchemaChangeException; import org.apache.drill.exec.expr.TypeHelper; @@ -234,6 +235,21 @@ public class ScanBatch implements CloseableRecordBatch { logger.trace("currentReader.next return recordCount={}", recordCount); Preconditions.checkArgument(recordCount >= 0, "recordCount from RecordReader.next() should not be negative"); boolean isNewSchema = mutator.isNewSchema(); + // If scan is done for collecting metadata, additional implicit column `$project_metadata$` + // will be projected to handle the case when scan may return empty results (scan on empty file or row group). + // Scan will return single row for the case when empty file or row group is present with correct + // values of other implicit columns (like `fqn`, `rgi`), so this metadata will be stored to the Metastore. + if (implicitValues != null) { + String projectMetadataColumn = context.getOptions().getOption(ExecConstants.IMPLICIT_PROJECT_METADATA_COLUMN_LABEL_VALIDATOR); + if (recordCount > 0) { + // Sets the implicit value to false to signal that some results were returned and there is no need for creating an additional record. + implicitValues.replace(projectMetadataColumn, Boolean.FALSE.toString()); + } else if (Boolean.parseBoolean(implicitValues.get(projectMetadataColumn))) { + recordCount++; + // Sets implicit value to null to avoid affecting resulting count value. + implicitValues.put(projectMetadataColumn, null); + } + } populateImplicitVectors(); mutator.container.setValueCount(recordCount); oContext.getStats().batchReceived(0, recordCount, isNewSchema); diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/metadata/MetadataAggregateHelper.java b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/metadata/MetadataAggregateHelper.java index 6f00dea..9122de1 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/metadata/MetadataAggregateHelper.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/metadata/MetadataAggregateHelper.java @@ -22,7 +22,9 @@ import org.apache.calcite.sql.type.SqlTypeName; import org.apache.drill.common.expression.ExpressionPosition; import org.apache.drill.common.expression.FieldReference; import org.apache.drill.common.expression.FunctionCall; +import org.apache.drill.common.expression.IfExpression; import org.apache.drill.common.expression.LogicalExpression; +import org.apache.drill.common.expression.NullExpression; import org.apache.drill.common.expression.SchemaPath; import org.apache.drill.common.expression.ValueExpressions; import org.apache.drill.common.logical.data.NamedExpression; @@ -55,6 +57,7 @@ public class MetadataAggregateHelper { private final ColumnNamesOptions columnNamesOptions; private final BatchSchema schema; private final AggPrelBase.OperatorPhase phase; + private final List<SchemaPath> excludedColumns; public MetadataAggregateHelper(MetadataAggregateContext context, ColumnNamesOptions columnNamesOptions, BatchSchema schema, AggPrelBase.OperatorPhase phase) { @@ -63,6 +66,8 @@ public class MetadataAggregateHelper { this.schema = schema; this.phase = phase; this.valueExpressions = new ArrayList<>(); + this.excludedColumns = new ArrayList<>(context.metadataColumns()); + excludedColumns.add(SchemaPath.getSimplePath(columnNamesOptions.projectMetadataColumn())); createAggregatorInternal(); } @@ -71,8 +76,6 @@ public class MetadataAggregateHelper { } private void createAggregatorInternal() { - List<SchemaPath> excludedColumns = context.excludedColumns(); - // Iterates through input expressions and adds aggregate calls for table fields // to collect required statistics (MIN, MAX, COUNT, etc.) or aggregate calls to merge incoming metadata getUnflattenedFileds(Lists.newArrayList(schema), null) @@ -117,16 +120,16 @@ public class MetadataAggregateHelper { } } - for (SchemaPath excludedColumn : excludedColumns) { - if (excludedColumn.equals(SchemaPath.getSimplePath(columnNamesOptions.rowGroupStart())) - || excludedColumn.equals(SchemaPath.getSimplePath(columnNamesOptions.rowGroupLength()))) { - LogicalExpression lastModifiedTime = new FunctionCall("any_value", + for (SchemaPath metadataColumns : context.metadataColumns()) { + if (metadataColumns.equals(SchemaPath.getSimplePath(columnNamesOptions.rowGroupStart())) + || metadataColumns.equals(SchemaPath.getSimplePath(columnNamesOptions.rowGroupLength()))) { + LogicalExpression anyValueCall = new FunctionCall("any_value", Collections.singletonList( - FieldReference.getWithQuotedRef(excludedColumn.getRootSegmentPath())), + FieldReference.getWithQuotedRef(metadataColumns.getRootSegmentPath())), ExpressionPosition.UNKNOWN); - valueExpressions.add(new NamedExpression(lastModifiedTime, - FieldReference.getWithQuotedRef(excludedColumn.getRootSegmentPath()))); + valueExpressions.add(new NamedExpression(anyValueCall, + FieldReference.getWithQuotedRef(metadataColumns.getRootSegmentPath()))); } } @@ -207,9 +210,8 @@ public class MetadataAggregateHelper { */ private void addCollectListCall(List<LogicalExpression> fieldList) { ArrayList<LogicalExpression> collectListArguments = new ArrayList<>(fieldList); - List<SchemaPath> excludedColumns = context.excludedColumns(); // populate columns which weren't included in the schema, but should be collected to the COLLECTED_MAP_FIELD - for (SchemaPath logicalExpressions : excludedColumns) { + for (SchemaPath logicalExpressions : context.metadataColumns()) { // adds string literal with field name to the list collectListArguments.add(ValueExpressions.getChar(logicalExpressions.getRootSegmentPath(), DrillRelDataTypeSystem.DRILL_REL_DATATYPE_SYSTEM.getDefaultPrecision(SqlTypeName.VARCHAR))); @@ -254,7 +256,7 @@ public class MetadataAggregateHelper { private void addMetadataAggregateCalls() { AnalyzeColumnUtils.META_STATISTICS_FUNCTIONS.forEach((statisticsKind, sqlKind) -> { LogicalExpression call = new FunctionCall(sqlKind.name(), - Collections.singletonList(ValueExpressions.getBigInt(1)), ExpressionPosition.UNKNOWN); + Collections.singletonList(FieldReference.getWithQuotedRef(columnNamesOptions.projectMetadataColumn())), ExpressionPosition.UNKNOWN); valueExpressions.add( new NamedExpression(call, FieldReference.getWithQuotedRef(AnalyzeColumnUtils.getMetadataStatisticsFieldName(statisticsKind)))); @@ -275,7 +277,6 @@ public class MetadataAggregateHelper { for (MaterializedField field : fields) { // statistics collecting is not supported for array types if (field.getType().getMode() != TypeProtos.DataMode.REPEATED) { - List<SchemaPath> excludedColumns = context.excludedColumns(); // excludedColumns are applied for root fields only if (parentFields != null || !excludedColumns.contains(SchemaPath.getSimplePath(field.getName()))) { List<String> currentPath; @@ -313,8 +314,19 @@ public class MetadataAggregateHelper { if (interestingColumns == null || interestingColumns.contains(fieldRef)) { // collect statistics for all or only interesting columns if they are specified AnalyzeColumnUtils.COLUMN_STATISTICS_FUNCTIONS.forEach((statisticsKind, sqlKind) -> { + // constructs "case when is not null projectMetadataColumn then column1 else null end" call + // to avoid using default values for required columns when data for empty result is obtained + LogicalExpression caseExpr = IfExpression.newBuilder() + .setIfCondition(new IfExpression.IfCondition( + new FunctionCall( + "isnotnull", + Collections.singletonList(FieldReference.getWithQuotedRef(columnNamesOptions.projectMetadataColumn())), + ExpressionPosition.UNKNOWN), fieldRef)) + .setElse(NullExpression.INSTANCE) + .build(); + LogicalExpression call = new FunctionCall(sqlKind.name(), - Collections.singletonList(fieldRef), ExpressionPosition.UNKNOWN); + Collections.singletonList(caseExpr), ExpressionPosition.UNKNOWN); valueExpressions.add( new NamedExpression(call, FieldReference.getWithQuotedRef(AnalyzeColumnUtils.getColumnStatisticsFieldName(fieldName, statisticsKind)))); diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/metadata/MetadataControllerBatch.java b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/metadata/MetadataControllerBatch.java index 7314961..8ee3beb 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/metadata/MetadataControllerBatch.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/metadata/MetadataControllerBatch.java @@ -586,9 +586,9 @@ public class MetadataControllerBatch extends AbstractBinaryRecordBatch<MetadataC Multimap<String, StatisticsHolder<?>> columnStatistics = ArrayListMultimap.create(); Map<String, TypeProtos.MinorType> columnTypes = new HashMap<>(); for (ColumnMetadata column : columnMetadata) { - String fieldName = AnalyzeColumnUtils.getColumnName(column.name()); if (AnalyzeColumnUtils.isColumnStatisticsField(column.name())) { + String fieldName = AnalyzeColumnUtils.getColumnName(column.name()); StatisticsKind<?> statisticsKind = AnalyzeColumnUtils.getStatisticsKind(column.name()); columnStatistics.put(fieldName, new StatisticsHolder<>(getConvertedColumnValue(reader.column(column.name())), statisticsKind)); diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/logical/ConvertMetadataAggregateToDirectScanRule.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/logical/ConvertMetadataAggregateToDirectScanRule.java index 80a463b..f9b266e 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/logical/ConvertMetadataAggregateToDirectScanRule.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/logical/ConvertMetadataAggregateToDirectScanRule.java @@ -39,6 +39,7 @@ import org.apache.drill.exec.store.ColumnExplorer.ImplicitFileColumns; import org.apache.drill.exec.store.dfs.DrillFileSystem; import org.apache.drill.exec.store.dfs.FormatSelection; import org.apache.drill.exec.store.direct.DirectGroupScan; +import org.apache.drill.exec.store.parquet.BaseParquetMetadataProvider; import org.apache.drill.exec.store.parquet.ParquetGroupScan; import org.apache.drill.exec.store.pojo.DynamicPojoRecordReader; import org.apache.drill.exec.util.ImpersonationUtil; @@ -50,7 +51,6 @@ import org.apache.drill.metastore.statistics.ColumnStatisticsKind; import org.apache.drill.metastore.statistics.ExactStatisticsConstants; import org.apache.drill.metastore.statistics.StatisticsKind; import org.apache.drill.metastore.statistics.TableStatisticsKind; -import org.apache.drill.metastore.util.SchemaPathUtils; import org.apache.drill.shaded.guava.com.google.common.collect.HashBasedTable; import org.apache.drill.shaded.guava.com.google.common.collect.Multimap; import org.apache.drill.shaded.guava.com.google.common.collect.Table; @@ -94,11 +94,11 @@ import java.util.stream.IntStream; * </pre> */ public class ConvertMetadataAggregateToDirectScanRule extends RelOptRule { + private static final Logger logger = LoggerFactory.getLogger(ConvertMetadataAggregateToDirectScanRule.class); + public static final ConvertMetadataAggregateToDirectScanRule INSTANCE = new ConvertMetadataAggregateToDirectScanRule(); - private static final Logger logger = LoggerFactory.getLogger(ConvertMetadataAggregateToDirectScanRule.class); - public ConvertMetadataAggregateToDirectScanRule() { super( RelOptHelper.some(MetadataAggRel.class, RelOptHelper.any(DrillScanRel.class)), @@ -210,7 +210,7 @@ public class ConvertMetadataAggregateToDirectScanRule extends RelOptRule { // do not gather statistics for array columns as it is not supported by Metastore if (containsArrayColumn(rowGroupMetadata.getSchema(), schemaPath)) { - return null; + continue; } if (IsPredicate.isNullOrEmpty(columnStatistics)) { @@ -232,6 +232,8 @@ public class ConvertMetadataAggregateToDirectScanRule extends RelOptRule { columnStatisticsFieldName, statsValue.getClass()); recordsTable.put(columnStatisticsFieldName, rowIndex, statsValue); + } else { + recordsTable.put(columnStatisticsFieldName, rowIndex, BaseParquetMetadataProvider.NULL_VALUE); } } } @@ -244,6 +246,8 @@ public class ConvertMetadataAggregateToDirectScanRule extends RelOptRule { if (statisticsValue != null) { schema.putIfAbsent(metadataStatisticsFieldName, statisticsValue.getClass()); recordsTable.put(metadataStatisticsFieldName, rowIndex, statisticsValue); + } else { + recordsTable.put(metadataStatisticsFieldName, rowIndex, BaseParquetMetadataProvider.NULL_VALUE); } } @@ -258,15 +262,19 @@ public class ConvertMetadataAggregateToDirectScanRule extends RelOptRule { // DynamicPojoRecordReader requires LinkedHashMap with fields order // which corresponds to the value position in record list. - LinkedHashMap<String, Class<?>> orderedSchema = recordsTable.rowKeySet().stream() - .collect(Collectors.toMap( - Function.identity(), - column -> schema.getOrDefault(column, Integer.class), - (o, n) -> n, - LinkedHashMap::new)); + LinkedHashMap<String, Class<?>> orderedSchema = new LinkedHashMap<>(); + for (String s : recordsTable.rowKeySet()) { + Class<?> clazz = schema.get(s); + if (clazz != null) { + orderedSchema.put(s, clazz); + } else { + return null; + } + } IntFunction<List<Object>> collectRecord = currentIndex -> orderedSchema.keySet().stream() .map(column -> recordsTable.get(column, currentIndex)) + .map(value -> value != BaseParquetMetadataProvider.NULL_VALUE ? value : null) .collect(Collectors.toList()); List<List<Object>> records = IntStream.range(0, rowIndex) @@ -288,12 +296,11 @@ public class ConvertMetadataAggregateToDirectScanRule extends RelOptRule { * @return {@code true} if any segment in the schema path is an array, {@code false} otherwise */ private static boolean containsArrayColumn(TupleMetadata schema, SchemaPath schemaPath) { - ColumnMetadata columnMetadata = SchemaPathUtils.getColumnMetadata(schemaPath, schema); PathSegment currentPath = schemaPath.getRootSegment(); - ColumnMetadata currentColumn = columnMetadata; - do { - if (currentColumn.isArray()) { - return false; + ColumnMetadata columnMetadata = schema.metadata(currentPath.getNameSegment().getPath()); + while (columnMetadata != null) { + if (columnMetadata.isArray()) { + return true; } else if (columnMetadata.isMap()) { currentPath = currentPath.getChild(); columnMetadata = columnMetadata.tupleSchema().metadata(currentPath.getNameSegment().getPath()); @@ -301,9 +308,9 @@ public class ConvertMetadataAggregateToDirectScanRule extends RelOptRule { currentPath = currentPath.getChild(); columnMetadata = ((DictColumnMetadata) columnMetadata).valueColumnMetadata(); } else { - return true; + return false; } - } while (columnMetadata != null); - return true; + } + return false; } } diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/sql/handlers/MetastoreAnalyzeTableHandler.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/sql/handlers/MetastoreAnalyzeTableHandler.java index 36eae41..2be8dfc 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/sql/handlers/MetastoreAnalyzeTableHandler.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/sql/handlers/MetastoreAnalyzeTableHandler.java @@ -406,13 +406,13 @@ public class MetastoreAnalyzeTableHandler extends DefaultSqlHandler { SchemaPath lastModifiedTimeField = SchemaPath.getSimplePath(config.getContext().getOptions().getString(ExecConstants.IMPLICIT_LAST_MODIFIED_TIME_COLUMN_LABEL)); - List<SchemaPath> excludedColumns = Arrays.asList(locationField, lastModifiedTimeField); + List<SchemaPath> metadataColumns = Arrays.asList(locationField, lastModifiedTimeField); MetadataAggregateContext aggregateContext = MetadataAggregateContext.builder() .groupByExpressions(Collections.emptyList()) .interestingColumns(statisticsColumns) .createNewAggregations(createNewAggregations) - .excludedColumns(excludedColumns) + .metadataColumns(metadataColumns) .metadataLevel(MetadataType.TABLE) .build(); @@ -433,7 +433,7 @@ public class MetastoreAnalyzeTableHandler extends DefaultSqlHandler { SchemaPath lastModifiedTimeField = SchemaPath.getSimplePath(config.getContext().getOptions().getString(ExecConstants.IMPLICIT_LAST_MODIFIED_TIME_COLUMN_LABEL)); - List<SchemaPath> excludedColumns = Arrays.asList(lastModifiedTimeField, locationField); + List<SchemaPath> metadataColumns = Arrays.asList(lastModifiedTimeField, locationField); List<NamedExpression> groupByExpressions = new ArrayList<>(segmentExpressions); @@ -441,7 +441,7 @@ public class MetastoreAnalyzeTableHandler extends DefaultSqlHandler { .groupByExpressions(groupByExpressions.subList(0, segmentLevel)) .interestingColumns(statisticsColumns) .createNewAggregations(createNewAggregations) - .excludedColumns(excludedColumns) + .metadataColumns(metadataColumns) .metadataLevel(MetadataType.SEGMENT) .build(); @@ -461,7 +461,7 @@ public class MetastoreAnalyzeTableHandler extends DefaultSqlHandler { SchemaPath lastModifiedTimeField = SchemaPath.getSimplePath(config.getContext().getOptions().getString(ExecConstants.IMPLICIT_LAST_MODIFIED_TIME_COLUMN_LABEL)); - List<SchemaPath> excludedColumns = Arrays.asList(lastModifiedTimeField, locationField); + List<SchemaPath> metadataColumns = Arrays.asList(lastModifiedTimeField, locationField); NamedExpression locationExpression = new NamedExpression(locationField, FieldReference.getWithQuotedRef(MetastoreAnalyzeConstants.LOCATION_FIELD)); @@ -472,7 +472,7 @@ public class MetastoreAnalyzeTableHandler extends DefaultSqlHandler { .groupByExpressions(fileGroupByExpressions) .interestingColumns(statisticsColumns) .createNewAggregations(createNewAggregations) - .excludedColumns(excludedColumns) + .metadataColumns(metadataColumns) .metadataLevel(MetadataType.FILE) .build(); @@ -505,13 +505,13 @@ public class MetastoreAnalyzeTableHandler extends DefaultSqlHandler { SchemaPath rowGroupLengthField = SchemaPath.getSimplePath(config.getContext().getOptions().getString(ExecConstants.IMPLICIT_ROW_GROUP_LENGTH_COLUMN_LABEL)); - List<SchemaPath> excludedColumns = Arrays.asList(lastModifiedTimeField, locationField, rgiField, rowGroupStartField, rowGroupLengthField); + List<SchemaPath> metadataColumns = Arrays.asList(lastModifiedTimeField, locationField, rgiField, rowGroupStartField, rowGroupLengthField); MetadataAggregateContext aggregateContext = MetadataAggregateContext.builder() .groupByExpressions(rowGroupGroupByExpressions) .interestingColumns(statisticsColumns) .createNewAggregations(createNewAggregations) - .excludedColumns(excludedColumns) + .metadataColumns(metadataColumns) .metadataLevel(MetadataType.ROW_GROUP) .build(); diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/server/options/SystemOptionManager.java b/exec/java-exec/src/main/java/org/apache/drill/exec/server/options/SystemOptionManager.java index e3ed2f6..430e0a1 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/server/options/SystemOptionManager.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/server/options/SystemOptionManager.java @@ -267,6 +267,7 @@ public class SystemOptionManager extends BaseOptionManager implements AutoClosea new OptionDefinition(ExecConstants.IMPLICIT_ROW_GROUP_START_COLUMN_LABEL_VALIDATOR), new OptionDefinition(ExecConstants.IMPLICIT_ROW_GROUP_LENGTH_COLUMN_LABEL_VALIDATOR), new OptionDefinition(ExecConstants.IMPLICIT_LAST_MODIFIED_TIME_COLUMN_LABEL_VALIDATOR), + new OptionDefinition(ExecConstants.IMPLICIT_PROJECT_METADATA_COLUMN_LABEL_VALIDATOR), new OptionDefinition(ExecConstants.CODE_GEN_EXP_IN_METHOD_SIZE_VALIDATOR), new OptionDefinition(ExecConstants.CREATE_PREPARE_STATEMENT_TIMEOUT_MILLIS_VALIDATOR), new OptionDefinition(ExecConstants.DYNAMIC_UDF_SUPPORT_ENABLED_VALIDATOR, new OptionMetaData(OptionValue.AccessibleScopes.SYSTEM, true, false)), diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/ColumnExplorer.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/ColumnExplorer.java index d4c45ae..ecb12f3 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/ColumnExplorer.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/ColumnExplorer.java @@ -311,6 +311,9 @@ public class ColumnExplorer { case ROW_GROUP_LENGTH: implicitValues.put(key, String.valueOf(length)); break; + case PROJECT_METADATA: + implicitValues.put(key, Boolean.TRUE.toString()); + break; case LAST_MODIFIED_TIME: try { implicitValues.put(key, String.valueOf(fs.getFileStatus(filePath).getModificationTime())); @@ -509,7 +512,9 @@ public class ColumnExplorer { ROW_GROUP_START(ExecConstants.IMPLICIT_ROW_GROUP_START_COLUMN_LABEL), - ROW_GROUP_LENGTH(ExecConstants.IMPLICIT_ROW_GROUP_LENGTH_COLUMN_LABEL); + ROW_GROUP_LENGTH(ExecConstants.IMPLICIT_ROW_GROUP_LENGTH_COLUMN_LABEL), + + PROJECT_METADATA(ExecConstants.IMPLICIT_PROJECT_METADATA_COLUMN_LABEL); private final String name; diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/BaseParquetMetadataProvider.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/BaseParquetMetadataProvider.java index b535e0f..f2e177d 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/BaseParquetMetadataProvider.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/BaseParquetMetadataProvider.java @@ -77,7 +77,7 @@ public abstract class BaseParquetMetadataProvider implements ParquetMetadataProv /** * {@link HashBasedTable} cannot contain nulls, used this object to represent null values. */ - static final Object NULL_VALUE = new Object(); + public static final Object NULL_VALUE = new Object(); protected final List<ReadEntryWithPath> entries; protected final ParquetReaderConfig readerConfig; diff --git a/exec/java-exec/src/main/resources/drill-module.conf b/exec/java-exec/src/main/resources/drill-module.conf index 7230426..b365047 100644 --- a/exec/java-exec/src/main/resources/drill-module.conf +++ b/exec/java-exec/src/main/resources/drill-module.conf @@ -483,6 +483,7 @@ drill.exec.options: { drill.exec.storage.implicit.row_group_start.column.label: "rgs", drill.exec.storage.implicit.row_group_length.column.label: "rgl", drill.exec.storage.implicit.last_modified_time.column.label: "lmt", + drill.exec.storage.implicit.project_metadata.column.label: "$project_metadata$", drill.exec.testing.controls: "{}", drill.exec.memory.operator.output_batch_size : 16777216, # 16 MB drill.exec.memory.operator.output_batch_size_avail_mem_factor : 0.1, diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/sql/TestMetastoreCommands.java b/exec/java-exec/src/test/java/org/apache/drill/exec/sql/TestMetastoreCommands.java index 51d36f2..394adca 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/exec/sql/TestMetastoreCommands.java +++ b/exec/java-exec/src/test/java/org/apache/drill/exec/sql/TestMetastoreCommands.java @@ -3160,6 +3160,316 @@ public class TestMetastoreCommands extends ClusterTest { } } + @Test + public void testAnalyzeEmptyNullableParquetTable() throws Exception { + File table = dirTestWatcher.copyResourceToRoot(Paths.get("parquet", "empty", "simple", "empty_simple.parquet")); + + String tableName = "parquet/empty/simple/empty_simple.parquet"; + + TableInfo tableInfo = getTableInfo(tableName, "default"); + + TupleMetadata schema = new SchemaBuilder() + .addNullable("id", TypeProtos.MinorType.BIGINT) + .addNullable("name", TypeProtos.MinorType.VARCHAR) + .build(); + + Map<SchemaPath, ColumnStatistics<?>> columnStatistics = ImmutableMap.<SchemaPath, ColumnStatistics<?>>builder() + .put(SchemaPath.getSimplePath("name"), + getColumnStatistics(null, null, 0L, TypeProtos.MinorType.VARCHAR)) + .put(SchemaPath.getSimplePath("id"), + getColumnStatistics(null, null, 0L, TypeProtos.MinorType.BIGINT)) + .build(); + + BaseTableMetadata expectedTableMetadata = BaseTableMetadata.builder() + .tableInfo(tableInfo) + .metadataInfo(TABLE_META_INFO) + .schema(schema) + .location(new Path(table.toURI().getPath())) + .columnsStatistics(columnStatistics) + .metadataStatistics(Arrays.asList(new StatisticsHolder<>(0L, TableStatisticsKind.ROW_COUNT), + new StatisticsHolder<>(MetadataType.ALL, TableStatisticsKind.ANALYZE_METADATA_LEVEL))) + .partitionKeys(Collections.emptyMap()) + .lastModifiedTime(getMaxLastModified(table)) + .build(); + + try { + testBuilder() + .sqlQuery("ANALYZE TABLE dfs.`%s` REFRESH METADATA", tableName) + .unOrdered() + .baselineColumns("ok", "summary") + .baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.default.%s]", tableName)) + .go(); + + MetastoreTableInfo metastoreTableInfo = cluster.drillbit().getContext() + .getMetastoreRegistry() + .get() + .tables() + .basicRequests() + .metastoreTableInfo(tableInfo); + + assertTrue("table metadata wasn't found", metastoreTableInfo.isExists()); + + BaseTableMetadata tableMetadata = cluster.drillbit().getContext() + .getMetastoreRegistry() + .get() + .tables() + .basicRequests() + .tableMetadata(tableInfo); + + assertEquals(expectedTableMetadata, tableMetadata); + + List<FileMetadata> filesMetadata = cluster.drillbit().getContext() + .getMetastoreRegistry() + .get() + .tables() + .basicRequests() + .filesMetadata(tableInfo, null, null); + + assertEquals(1, filesMetadata.size()); + + List<RowGroupMetadata> rowGroupsMetadata = cluster.drillbit().getContext() + .getMetastoreRegistry() + .get() + .tables() + .basicRequests() + .rowGroupsMetadata(tableInfo, (String) null, null); + + assertEquals(1, rowGroupsMetadata.size()); + } finally { + run("analyze table dfs.`%s` drop metadata if exists", tableName); + } + } + + @Test + public void testAnalyzeEmptyRequiredParquetTable() throws Exception { + String tableName = "analyze_empty_simple_required"; + + run("create table dfs.tmp.%s as select 1 as id, 'a' as name from (values(1)) where 1 = 2", tableName); + + File table = new File(dirTestWatcher.getDfsTestTmpDir(), tableName); + + TableInfo tableInfo = getTableInfo(tableName, "tmp"); + + TupleMetadata schema = new SchemaBuilder() + .add("id", TypeProtos.MinorType.INT) + .add("name", TypeProtos.MinorType.VARCHAR) + .build(); + + Map<SchemaPath, ColumnStatistics<?>> columnStatistics = ImmutableMap.<SchemaPath, ColumnStatistics<?>>builder() + .put(SchemaPath.getSimplePath("name"), + getColumnStatistics(null, null, 0L, TypeProtos.MinorType.VARCHAR)) + .put(SchemaPath.getSimplePath("id"), + getColumnStatistics(null, null, 0L, TypeProtos.MinorType.INT)) + .build(); + + BaseTableMetadata expectedTableMetadata = BaseTableMetadata.builder() + .tableInfo(tableInfo) + .metadataInfo(TABLE_META_INFO) + .schema(schema) + .location(new Path(table.toURI().getPath())) + .columnsStatistics(columnStatistics) + .metadataStatistics(Arrays.asList(new StatisticsHolder<>(0L, TableStatisticsKind.ROW_COUNT), + new StatisticsHolder<>(MetadataType.ALL, TableStatisticsKind.ANALYZE_METADATA_LEVEL))) + .partitionKeys(Collections.emptyMap()) + .lastModifiedTime(getMaxLastModified(table)) + .build(); + + try { + testBuilder() + .sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA", tableName) + .unOrdered() + .baselineColumns("ok", "summary") + .baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName)) + .go(); + + MetastoreTableInfo metastoreTableInfo = cluster.drillbit().getContext() + .getMetastoreRegistry() + .get() + .tables() + .basicRequests() + .metastoreTableInfo(tableInfo); + + assertTrue("table metadata wasn't found", metastoreTableInfo.isExists()); + + BaseTableMetadata tableMetadata = cluster.drillbit().getContext() + .getMetastoreRegistry() + .get() + .tables() + .basicRequests() + .tableMetadata(tableInfo); + + assertEquals(expectedTableMetadata, tableMetadata); + + List<FileMetadata> filesMetadata = cluster.drillbit().getContext() + .getMetastoreRegistry() + .get() + .tables() + .basicRequests() + .filesMetadata(tableInfo, null, null); + + assertEquals(1, filesMetadata.size()); + + List<RowGroupMetadata> rowGroupsMetadata = cluster.drillbit().getContext() + .getMetastoreRegistry() + .get() + .tables() + .basicRequests() + .rowGroupsMetadata(tableInfo, (String) null, null); + + assertEquals(1, rowGroupsMetadata.size()); + } finally { + run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName); + run("drop table if exists dfs.tmp.`%s`", tableName); + } + } + + @Test + public void testAnalyzeNonEmptyTableWithEmptyFile() throws Exception { + String tableName = "parquet_with_empty_file"; + + File table = dirTestWatcher.copyResourceToTestTmp(Paths.get("parquet", "empty", "simple"), Paths.get(tableName)); + + TableInfo tableInfo = getTableInfo(tableName, "tmp"); + + TupleMetadata schema = new SchemaBuilder() + .addNullable("id", TypeProtos.MinorType.BIGINT) + .addNullable("name", TypeProtos.MinorType.VARCHAR) + .build(); + + Map<SchemaPath, ColumnStatistics<?>> columnStatistics = ImmutableMap.<SchemaPath, ColumnStatistics<?>>builder() + .put(SchemaPath.getSimplePath("name"), + getColumnStatistics("Tom", "Tom", 1L, TypeProtos.MinorType.VARCHAR)) + .put(SchemaPath.getSimplePath("id"), + getColumnStatistics(2L, 2L, 1L, TypeProtos.MinorType.BIGINT)) + .build(); + + BaseTableMetadata expectedTableMetadata = BaseTableMetadata.builder() + .tableInfo(tableInfo) + .metadataInfo(TABLE_META_INFO) + .schema(schema) + .location(new Path(table.toURI().getPath())) + .columnsStatistics(columnStatistics) + .metadataStatistics(Arrays.asList(new StatisticsHolder<>(1L, TableStatisticsKind.ROW_COUNT), + new StatisticsHolder<>(MetadataType.ALL, TableStatisticsKind.ANALYZE_METADATA_LEVEL))) + .partitionKeys(Collections.emptyMap()) + .lastModifiedTime(getMaxLastModified(table)) + .build(); + + try { + testBuilder() + .sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA", tableName) + .unOrdered() + .baselineColumns("ok", "summary") + .baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName)) + .go(); + + MetastoreTableInfo metastoreTableInfo = cluster.drillbit().getContext() + .getMetastoreRegistry() + .get() + .tables() + .basicRequests() + .metastoreTableInfo(tableInfo); + + assertTrue("table metadata wasn't found", metastoreTableInfo.isExists()); + + BaseTableMetadata tableMetadata = cluster.drillbit().getContext() + .getMetastoreRegistry() + .get() + .tables() + .basicRequests() + .tableMetadata(tableInfo); + + assertEquals(expectedTableMetadata, tableMetadata); + + List<FileMetadata> filesMetadata = cluster.drillbit().getContext() + .getMetastoreRegistry() + .get() + .tables() + .basicRequests() + .filesMetadata(tableInfo, null, null); + + assertEquals(2, filesMetadata.size()); + + List<RowGroupMetadata> rowGroupsMetadata = cluster.drillbit().getContext() + .getMetastoreRegistry() + .get() + .tables() + .basicRequests() + .rowGroupsMetadata(tableInfo, (String) null, null); + + assertEquals(2, rowGroupsMetadata.size()); + } finally { + run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName); + } + } + + @Test + public void testSelectEmptyRequiredParquetTable() throws Exception { + String tableName = "empty_simple_required"; + + run("create table dfs.tmp.%s as select 1 as id, 'a' as name from (values(1)) where 1 = 2", tableName); + + try { + testBuilder() + .sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA", tableName) + .unOrdered() + .baselineColumns("ok", "summary") + .baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName)) + .go(); + + String query = "select * from dfs.tmp.`%s`"; + + queryBuilder() + .sql(query, tableName) + .planMatcher() + .include("usedMetastore=true") + .match(); + + testBuilder() + .sqlQuery(query, tableName) + .unOrdered() + .baselineColumns("id", "name") + .expectsEmptyResultSet() + .go(); + } finally { + run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName); + run("drop table if exists dfs.tmp.`%s`", tableName); + } + } + + @Test + public void testSelectNonEmptyTableWithEmptyFile() throws Exception { + String tableName = "parquet_with_empty_file"; + + dirTestWatcher.copyResourceToRoot(Paths.get("parquet", "empty", "simple"), Paths.get(tableName)); + + try { + testBuilder() + .sqlQuery("ANALYZE TABLE dfs.`%s` REFRESH METADATA", tableName) + .unOrdered() + .baselineColumns("ok", "summary") + .baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.default.%s]", tableName)) + .go(); + + String query = "select * from dfs.`%s`"; + + queryBuilder() + .sql(query, tableName) + .planMatcher() + .include("usedMetastore=true") + .match(); + + testBuilder() + .sqlQuery(query, tableName) + .unOrdered() + .baselineColumns("id", "name") + .baselineValues(2L, "Tom") + .go(); + } finally { + run("analyze table dfs.`%s` drop metadata if exists", tableName); + } + } + private static <T> ColumnStatistics<T> getColumnStatistics(T minValue, T maxValue, long rowCount, TypeProtos.MinorType minorType) { return new ColumnStatistics<>(
