Github user arina-ielchiieva commented on a diff in the pull request: https://github.com/apache/drill/pull/1214#discussion_r183981479 --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScanStatistics.java --- @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.drill.exec.store.parquet; + +import org.apache.drill.common.expression.SchemaPath; +import org.apache.drill.common.types.TypeProtos; +import org.apache.drill.exec.physical.base.GroupScan; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.apache.drill.exec.store.parquet.metadata.MetadataBase.ColumnMetadata; +import static org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetTableMetadataBase; +import static org.apache.drill.exec.store.parquet.metadata.Metadata_V3.ColumnTypeMetadata_v3; +import static org.apache.drill.exec.store.parquet.metadata.Metadata_V3.ParquetTableMetadata_v3; + +/** + * Holds common statistics about data in parquet group scan, + * including information about total row count, columns counts, partition columns. + */ +public class ParquetGroupScanStatistics { + + // map from file names to maps of column name to partition value mappings + private Map<String, Map<SchemaPath, Object>> partitionValueMap; + // only for partition columns : value is unique for each partition + private Map<SchemaPath, TypeProtos.MajorType> partitionColTypeMap; + // total number of non-null value for each column in parquet files + private Map<SchemaPath, Long> columnValueCounts; + // total number of rows (obtained from parquet footer) + private long rowCount; + + + public ParquetGroupScanStatistics(List<RowGroupInfo> rowGroupInfos, ParquetTableMetadataBase parquetTableMetadata) { + collect(rowGroupInfos, parquetTableMetadata); + } + + public ParquetGroupScanStatistics(ParquetGroupScanStatistics that) { + this.partitionValueMap = new HashMap<>(that.partitionValueMap); + this.partitionColTypeMap = new HashMap<>(that.partitionColTypeMap); + this.columnValueCounts = new HashMap<>(that.columnValueCounts); + this.rowCount = that.rowCount; + } + + public long getColumnValueCount(SchemaPath column) { + return columnValueCounts.containsKey(column) ? columnValueCounts.get(column) : 0; + } + + public List<SchemaPath> getPartitionColumns() { + return new ArrayList<>(partitionColTypeMap.keySet()); + } + + public TypeProtos.MajorType getTypeForColumn(SchemaPath schemaPath) { + return partitionColTypeMap.get(schemaPath); + } + + public long getRowCount() { + return rowCount; + } + + public Object getPartitionValue(String path, SchemaPath column) { + return partitionValueMap.get(path).get(column); + } + + public void collect(List<RowGroupInfo> rowGroupInfos, ParquetTableMetadataBase parquetTableMetadata) { + resetHolders(); + boolean first = true; + for (RowGroupInfo rowGroup : rowGroupInfos) { + long rowCount = rowGroup.getRowCount(); + for (ColumnMetadata column : rowGroup.getColumns()) { + SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getName()); + Long previousCount = columnValueCounts.get(schemaPath); + if (previousCount != null) { + if (previousCount != GroupScan.NO_COLUMN_STATS) { + if (column.getNulls() != null) { --- End diff -- Changed.
---