[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #710: Parquet changes for vectorized reads

GitBox Fri, 27 Dec 2019 10:46:07 -0800

rdblue commented on a change in pull request #710: Parquet changes for 
vectorized reads
URL: https://github.com/apache/incubator-iceberg/pull/710#discussion_r361723761


 ##########
 File path: parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java
 ##########
 @@ -59,122 +57,15 @@ public ParquetReader(InputFile input, Schema 
expectedSchema, ParquetReadOptions
     this.caseSensitive = caseSensitive;
   }
 
-  private static class ReadConf<T> {
-    private final ParquetFileReader reader;
-    private final InputFile file;
-    private final ParquetReadOptions options;
-    private final MessageType projection;
-    private final ParquetValueReader<T> model;
-    private final List<BlockMetaData> rowGroups;
-    private final boolean[] shouldSkip;
-    private final long totalValues;
-    private final boolean reuseContainers;
-
-    @SuppressWarnings("unchecked")
-    ReadConf(InputFile file, ParquetReadOptions options, Schema 
expectedSchema, Expression filter,
-             Function<MessageType, ParquetValueReader<?>> readerFunc, boolean 
reuseContainers,
-             boolean caseSensitive) {
-      this.file = file;
-      this.options = options;
-      this.reader = newReader(file, options);
-
-      MessageType fileSchema = reader.getFileMetaData().getSchema();
-
-      boolean hasIds = ParquetSchemaUtil.hasIds(fileSchema);
-      MessageType typeWithIds = hasIds ? fileSchema : 
ParquetSchemaUtil.addFallbackIds(fileSchema);
-
-      this.projection = hasIds ?
-          ParquetSchemaUtil.pruneColumns(fileSchema, expectedSchema) :
-          ParquetSchemaUtil.pruneColumnsFallback(fileSchema, expectedSchema);
-      this.model = (ParquetValueReader<T>) readerFunc.apply(typeWithIds);
-      this.rowGroups = reader.getRowGroups();
-      this.shouldSkip = new boolean[rowGroups.size()];
-
-      ParquetMetricsRowGroupFilter statsFilter = null;
-      ParquetDictionaryRowGroupFilter dictFilter = null;
-      if (filter != null) {
-        statsFilter = new ParquetMetricsRowGroupFilter(expectedSchema, filter, 
caseSensitive);
-        dictFilter = new ParquetDictionaryRowGroupFilter(expectedSchema, 
filter, caseSensitive);
-      }
-
-      long computedTotalValues = 0L;
-      for (int i = 0; i < shouldSkip.length; i += 1) {
-        BlockMetaData rowGroup = rowGroups.get(i);
-        boolean shouldRead = filter == null || (
-            statsFilter.shouldRead(typeWithIds, rowGroup) &&
-            dictFilter.shouldRead(typeWithIds, rowGroup, 
reader.getDictionaryReader(rowGroup)));
-        this.shouldSkip[i] = !shouldRead;
-        if (shouldRead) {
-          computedTotalValues += rowGroup.getRowCount();
-        }
-      }
-
-      this.totalValues = computedTotalValues;
-      this.reuseContainers = reuseContainers;
-    }
-
-    ReadConf(ReadConf<T> toCopy) {
-      this.reader = null;
-      this.file = toCopy.file;
-      this.options = toCopy.options;
-      this.projection = toCopy.projection;
-      this.model = toCopy.model;
-      this.rowGroups = toCopy.rowGroups;
-      this.shouldSkip = toCopy.shouldSkip;
-      this.totalValues = toCopy.totalValues;
-      this.reuseContainers = toCopy.reuseContainers;
-    }
-
-    ParquetFileReader reader() {
-      if (reader != null) {
-        reader.setRequestedSchema(projection);
-        return reader;
-      }
-
-      ParquetFileReader newReader = newReader(file, options);
-      newReader.setRequestedSchema(projection);
-      return newReader;
-    }
-
-    ParquetValueReader<T> model() {
-      return model;
-    }
-
-    boolean[] shouldSkip() {
-      return shouldSkip;
-    }
-
-    long totalValues() {
-      return totalValues;
-    }
-
-    boolean reuseContainers() {
-      return reuseContainers;
-    }
-
-    ReadConf<T> copy() {
-      return new ReadConf<>(this);
-    }
-
-    private static ParquetFileReader newReader(InputFile file, 
ParquetReadOptions options) {
-      try {
-        return ParquetFileReader.open(ParquetIO.file(file), options);
-      } catch (IOException e) {
-        throw new RuntimeIOException(e, "Failed to open Parquet file: %s", 
file.location());
-      }
-    }
-  }
-
   private ReadConf<T> conf = null;
 
   private ReadConf<T> init() {
     if (conf == null) {
-      ReadConf<T> readConf = new ReadConf<>(
-          input, options, expectedSchema, filter, readerFunc, reuseContainers, 
caseSensitive);
+      ReadConf<T> readConf = new ReadConf<T>(
 
 Review comment:
   Why was it necessary to add the type variable here? Java should be able to 
infer it.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #710: Parquet changes for vectorized reads

Reply via email to