This is an automated email from the ASF dual-hosted git repository.
blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new a132372 Parquet: Avoid extra footer read when position column is not needed (#1716)
a132372 is described below
commit a132372d59d212f45c6cfe532c271fcb5a1878a7
Author: Chen, Junjie <[email protected]>
AuthorDate: Sun Nov 8 06:46:28 2020 +0800
Parquet: Avoid extra footer read when position column is not needed (#1716)
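Before this change, ReadConf unconditionally re-opened the Parquet footer to map each row group's starting byte offset to the position of its first row, even when the reader never projected the _pos metadata column. The patch below skips that second footer read unless the expected schema actually contains MetadataColumns.ROW_POSITION. As a minimal, self-contained sketch of the guard (the projection schema and class name here are hypothetical, not part of the patch):

    import org.apache.iceberg.MetadataColumns;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.types.Types;

    public class RowPositionGuardSketch {
      public static void main(String[] args) {
        // Hypothetical projection that reads only a data column, not _pos.
        Schema projection = new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()));

        // The check this commit adds: the offset map is only worth building
        // when the projection includes the _pos metadata column.
        boolean needsRowPositions =
            projection.findField(MetadataColumns.ROW_POSITION.fieldId()) != null;

        System.out.println("needs row positions: " + needsRowPositions); // false
      }
    }

_pos is Iceberg's reserved metadata column for a row's ordinal position in its data file; when it is not projected, the start positions are never consulted, so the array can safely be filled with placeholder zeros.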
---
.../src/main/java/org/apache/iceberg/parquet/ReadConf.java | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java b/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java
index cb816da..78deb21 100644
--- a/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java
+++ b/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java
@@ -27,6 +27,7 @@ import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
+import org.apache.iceberg.MetadataColumns;
import org.apache.iceberg.Schema;
import org.apache.iceberg.exceptions.RuntimeIOException;
import org.apache.iceberg.expressions.Expression;
@@ -87,10 +88,10 @@ class ReadConf<T> {
this.rowGroups = reader.getRowGroups();
this.shouldSkip = new boolean[rowGroups.size()];
+ this.startRowPositions = new long[rowGroups.size()];
// Fetch all row groups starting positions to compute the row offsets of the filtered row groups
- Map<Long, Long> offsetToStartPos = generateOffsetToStartPos();
- this.startRowPositions = new long[rowGroups.size()];
+ Map<Long, Long> offsetToStartPos = generateOffsetToStartPos(expectedSchema);
ParquetMetricsRowGroupFilter statsFilter = null;
ParquetDictionaryRowGroupFilter dictFilter = null;
@@ -102,7 +103,7 @@ class ReadConf<T> {
long computedTotalValues = 0L;
for (int i = 0; i < shouldSkip.length; i += 1) {
BlockMetaData rowGroup = rowGroups.get(i);
- startRowPositions[i] = offsetToStartPos.get(rowGroup.getStartingPos());
+ startRowPositions[i] = offsetToStartPos == null ? 0 : offsetToStartPos.get(rowGroup.getStartingPos());
boolean shouldRead = filter == null || (
statsFilter.shouldRead(typeWithIds, rowGroup) &&
dictFilter.shouldRead(typeWithIds, rowGroup, reader.getDictionaryReader(rowGroup)));
@@ -166,7 +167,11 @@ class ReadConf<T> {
return shouldSkip;
}
- private Map<Long, Long> generateOffsetToStartPos() {
+ private Map<Long, Long> generateOffsetToStartPos(Schema schema) {
+ if (schema.findField(MetadataColumns.ROW_POSITION.fieldId()) == null) {
+ return null;
+ }
+
try (ParquetFileReader fileReader = newReader(file, ParquetReadOptions.builder().build())) {
Map<Long, Long> offsetToStartPos = new HashMap<>();
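The remainder of generateOffsetToStartPos (truncated above) is the work the new null return avoids: a second open of the file that walks every row group in the footer. A minimal sketch of that scan, assuming Parquet's standard ParquetFileReader and BlockMetaData API (the helper name is hypothetical):

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.parquet.hadoop.ParquetFileReader;
    import org.apache.parquet.hadoop.metadata.BlockMetaData;

    class OffsetScanSketch {
      // Maps each row group's starting byte offset to the position of its
      // first row, by accumulating row counts across the file's row groups.
      static Map<Long, Long> offsetsToStartPositions(ParquetFileReader fileReader) {
        Map<Long, Long> offsetToStartPos = new HashMap<>();
        long curRowCount = 0;
        for (BlockMetaData block : fileReader.getRowGroups()) {
          offsetToStartPos.put(block.getStartingPos(), curRowCount);
          curRowCount += block.getRowCount();
        }
        return offsetToStartPos;
      }
    }

Returning null rather than an empty map lets the row-group loop distinguish "positions not needed" from "no row groups", which is why startRowPositions[i] falls back to 0 when offsetToStartPos is null.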