Github user arina-ielchiieva commented on a diff in the pull request:

    https://github.com/apache/drill/pull/1214#discussion_r183981479
  
    --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScanStatistics.java ---
    @@ -0,0 +1,217 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership.  The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + * http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +package org.apache.drill.exec.store.parquet;
    +
    +import org.apache.drill.common.expression.SchemaPath;
    +import org.apache.drill.common.types.TypeProtos;
    +import org.apache.drill.exec.physical.base.GroupScan;
    +import org.apache.parquet.schema.OriginalType;
    +import org.apache.parquet.schema.PrimitiveType;
    +
    +import java.util.ArrayList;
    +import java.util.HashMap;
    +import java.util.List;
    +import java.util.Map;
    +
    +import static org.apache.drill.exec.store.parquet.metadata.MetadataBase.ColumnMetadata;
    +import static org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetTableMetadataBase;
    +import static org.apache.drill.exec.store.parquet.metadata.Metadata_V3.ColumnTypeMetadata_v3;
    +import static org.apache.drill.exec.store.parquet.metadata.Metadata_V3.ParquetTableMetadata_v3;
    +
    +/**
    + * Holds common statistics about data in parquet group scan,
    + * including information about total row count, columns counts, partition columns.
    + */
    +public class ParquetGroupScanStatistics {
    +
    +  // map from file names to maps of column name to partition value mappings
    +  private Map<String, Map<SchemaPath, Object>> partitionValueMap;
    +  // only for partition columns : value is unique for each partition
    +  private Map<SchemaPath, TypeProtos.MajorType> partitionColTypeMap;
    +  // total number of non-null value for each column in parquet files
    +  private Map<SchemaPath, Long> columnValueCounts;
    +  // total number of rows (obtained from parquet footer)
    +  private long rowCount;
    +
    +
    +  public ParquetGroupScanStatistics(List<RowGroupInfo> rowGroupInfos, ParquetTableMetadataBase parquetTableMetadata) {
    +    collect(rowGroupInfos, parquetTableMetadata);
    +  }
    +
    +  public ParquetGroupScanStatistics(ParquetGroupScanStatistics that) {
    +    this.partitionValueMap = new HashMap<>(that.partitionValueMap);
    +    this.partitionColTypeMap = new HashMap<>(that.partitionColTypeMap);
    +    this.columnValueCounts = new HashMap<>(that.columnValueCounts);
    +    this.rowCount = that.rowCount;
    +  }
    +
    +  public long getColumnValueCount(SchemaPath column) {
    +    return columnValueCounts.containsKey(column) ? columnValueCounts.get(column) : 0;
    +  }
    +
    +  public List<SchemaPath> getPartitionColumns() {
    +    return new ArrayList<>(partitionColTypeMap.keySet());
    +  }
    +
    +  public TypeProtos.MajorType getTypeForColumn(SchemaPath schemaPath) {
    +    return partitionColTypeMap.get(schemaPath);
    +  }
    +
    +  public long getRowCount() {
    +    return rowCount;
    +  }
    +
    +  public Object getPartitionValue(String path, SchemaPath column) {
    +    return partitionValueMap.get(path).get(column);
    +  }
    +
    +  public void collect(List<RowGroupInfo> rowGroupInfos, ParquetTableMetadataBase parquetTableMetadata) {
    +    resetHolders();
    +    boolean first = true;
    +    for (RowGroupInfo rowGroup : rowGroupInfos) {
    +      long rowCount = rowGroup.getRowCount();
    +      for (ColumnMetadata column : rowGroup.getColumns()) {
    +        SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getName());
    +        Long previousCount = columnValueCounts.get(schemaPath);
    +        if (previousCount != null) {
    +          if (previousCount != GroupScan.NO_COLUMN_STATS) {
    +            if (column.getNulls() != null) {
    --- End diff --
    
    Changed.


---

Reply via email to