rdblue commented on a change in pull request #710: Parquet changes for vectorized reads URL: https://github.com/apache/incubator-iceberg/pull/710#discussion_r361723761
########## File path: parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java ########## @@ -59,122 +57,15 @@ public ParquetReader(InputFile input, Schema expectedSchema, ParquetReadOptions this.caseSensitive = caseSensitive; } - private static class ReadConf<T> { - private final ParquetFileReader reader; - private final InputFile file; - private final ParquetReadOptions options; - private final MessageType projection; - private final ParquetValueReader<T> model; - private final List<BlockMetaData> rowGroups; - private final boolean[] shouldSkip; - private final long totalValues; - private final boolean reuseContainers; - - @SuppressWarnings("unchecked") - ReadConf(InputFile file, ParquetReadOptions options, Schema expectedSchema, Expression filter, - Function<MessageType, ParquetValueReader<?>> readerFunc, boolean reuseContainers, - boolean caseSensitive) { - this.file = file; - this.options = options; - this.reader = newReader(file, options); - - MessageType fileSchema = reader.getFileMetaData().getSchema(); - - boolean hasIds = ParquetSchemaUtil.hasIds(fileSchema); - MessageType typeWithIds = hasIds ? fileSchema : ParquetSchemaUtil.addFallbackIds(fileSchema); - - this.projection = hasIds ? - ParquetSchemaUtil.pruneColumns(fileSchema, expectedSchema) : - ParquetSchemaUtil.pruneColumnsFallback(fileSchema, expectedSchema); - this.model = (ParquetValueReader<T>) readerFunc.apply(typeWithIds); - this.rowGroups = reader.getRowGroups(); - this.shouldSkip = new boolean[rowGroups.size()]; - - ParquetMetricsRowGroupFilter statsFilter = null; - ParquetDictionaryRowGroupFilter dictFilter = null; - if (filter != null) { - statsFilter = new ParquetMetricsRowGroupFilter(expectedSchema, filter, caseSensitive); - dictFilter = new ParquetDictionaryRowGroupFilter(expectedSchema, filter, caseSensitive); - } - - long computedTotalValues = 0L; - for (int i = 0; i < shouldSkip.length; i += 1) { - BlockMetaData rowGroup = rowGroups.get(i); - boolean shouldRead = filter == null || ( - statsFilter.shouldRead(typeWithIds, rowGroup) && - dictFilter.shouldRead(typeWithIds, rowGroup, reader.getDictionaryReader(rowGroup))); - this.shouldSkip[i] = !shouldRead; - if (shouldRead) { - computedTotalValues += rowGroup.getRowCount(); - } - } - - this.totalValues = computedTotalValues; - this.reuseContainers = reuseContainers; - } - - ReadConf(ReadConf<T> toCopy) { - this.reader = null; - this.file = toCopy.file; - this.options = toCopy.options; - this.projection = toCopy.projection; - this.model = toCopy.model; - this.rowGroups = toCopy.rowGroups; - this.shouldSkip = toCopy.shouldSkip; - this.totalValues = toCopy.totalValues; - this.reuseContainers = toCopy.reuseContainers; - } - - ParquetFileReader reader() { - if (reader != null) { - reader.setRequestedSchema(projection); - return reader; - } - - ParquetFileReader newReader = newReader(file, options); - newReader.setRequestedSchema(projection); - return newReader; - } - - ParquetValueReader<T> model() { - return model; - } - - boolean[] shouldSkip() { - return shouldSkip; - } - - long totalValues() { - return totalValues; - } - - boolean reuseContainers() { - return reuseContainers; - } - - ReadConf<T> copy() { - return new ReadConf<>(this); - } - - private static ParquetFileReader newReader(InputFile file, ParquetReadOptions options) { - try { - return ParquetFileReader.open(ParquetIO.file(file), options); - } catch (IOException e) { - throw new RuntimeIOException(e, "Failed to open Parquet file: %s", file.location()); - } - } - } - private ReadConf<T> conf = null; private ReadConf<T> init() { if (conf == null) { - ReadConf<T> readConf = new ReadConf<>( - input, options, expectedSchema, filter, readerFunc, reuseContainers, caseSensitive); + ReadConf<T> readConf = new ReadConf<T>( Review comment: Why was it necessary to add the type variable here? Java should be able to infer it. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org